From 554fd8c5195424bdbcabf5de30fdc183aba391bd Mon Sep 17 00:00:00 2001 From: upstream source tree Date: Sun, 15 Mar 2015 20:14:05 -0400 Subject: obtained gcc-4.6.4.tar.bz2 from upstream website; verified gcc-4.6.4.tar.bz2.sig; imported gcc-4.6.4 source tree from verified upstream tarball. downloading a git-generated archive based on the 'upstream' tag should provide you with a source tree that is binary identical to the one extracted from the above tarball. if you have obtained the source via the command 'git clone', however, do note that line-endings of files in your working directory might differ from line-endings of the respective files in the upstream repository. --- libgo/go/html/escape.go | 224 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 libgo/go/html/escape.go (limited to 'libgo/go/html/escape.go') diff --git a/libgo/go/html/escape.go b/libgo/go/html/escape.go new file mode 100644 index 000000000..2799f6908 --- /dev/null +++ b/libgo/go/html/escape.go @@ -0,0 +1,224 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bytes" + "strings" + "utf8" +) + +// These replacements permit compatibility with old numeric entities that +// assumed Windows-1252 encoding. +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference +var replacementTable = [...]int{ + '\u20AC', // First entry is what 0x80 should be replaced with. + '\u0081', + '\u201A', + '\u0192', + '\u201E', + '\u2026', + '\u2020', + '\u2021', + '\u02C6', + '\u2030', + '\u0160', + '\u2039', + '\u0152', + '\u008D', + '\u017D', + '\u008F', + '\u0090', + '\u2018', + '\u2019', + '\u201C', + '\u201D', + '\u2022', + '\u2013', + '\u2014', + '\u02DC', + '\u2122', + '\u0161', + '\u203A', + '\u0153', + '\u009D', + '\u017E', + '\u0178', // Last entry is 0x9F. + // 0x00->'\uFFFD' is handled programmatically. + // 0x0D->'\u000D' is a no-op. +} + +// unescapeEntity reads an entity like "<" from b[src:] and writes the +// corresponding "<" to b[dst:], returning the incremented dst and src cursors. +// Precondition: b[src] == '&' && dst <= src. +func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference + + // i starts at 1 because we already know that s[0] == '&'. + i, s := 1, b[src:] + + if len(s) <= 1 { + b[dst] = b[src] + return dst + 1, src + 1 + } + + if s[i] == '#' { + if len(s) <= 3 { // We need to have at least "&#.". + b[dst] = b[src] + return dst + 1, src + 1 + } + i++ + c := s[i] + hex := false + if c == 'x' || c == 'X' { + hex = true + i++ + } + + x := 0 + for i < len(s) { + c = s[i] + i++ + if hex { + if '0' <= c && c <= '9' { + x = 16*x + int(c) - '0' + continue + } else if 'a' <= c && c <= 'f' { + x = 16*x + int(c) - 'a' + 10 + continue + } else if 'A' <= c && c <= 'F' { + x = 16*x + int(c) - 'A' + 10 + continue + } + } else if '0' <= c && c <= '9' { + x = 10*x + int(c) - '0' + continue + } + if c != ';' { + i-- + } + break + } + + if i <= 3 { // No characters matched. + b[dst] = b[src] + return dst + 1, src + 1 + } + + if 0x80 <= x && x <= 0x9F { + // Replace characters from Windows-1252 with UTF-8 equivalents. + x = replacementTable[x-0x80] + } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { + // Replace invalid characters with the replacement character. + x = '\uFFFD' + } + + return dst + utf8.EncodeRune(b[dst:], x), src + i + } + + // Consume the maximum number of characters possible, with the + // consumed characters matching one of the named references. + + // TODO(nigeltao): unescape("¬it;") should be "¬it;" + for i < len(s) { + c := s[i] + i++ + // Lower-cased characters are more common in entities, so we check for them first. + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + continue + } + if c != ';' { + i-- + } + break + } + + entityName := string(s[1:i]) + if x := entity[entityName]; x != 0 { + return dst + utf8.EncodeRune(b[dst:], x), src + i + } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity. + dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) + return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i + } + + dst1, src1 = dst+i, src+i + copy(b[dst:dst1], b[src:src1]) + return dst1, src1 +} + +// unescape unescapes b's entities in-place, so that "a<b" becomes "a"` + +func escape(buf *bytes.Buffer, s string) { + i := strings.IndexAny(s, escapedChars) + for i != -1 { + buf.WriteString(s[0:i]) + var esc string + switch s[i] { + case '&': + esc = "&" + case '\'': + esc = "'" + case '<': + esc = "<" + case '>': + esc = ">" + case '"': + esc = """ + default: + panic("unrecognized escape character") + } + s = s[i+1:] + buf.WriteString(esc) + i = strings.IndexAny(s, escapedChars) + } + buf.WriteString(s) +} + +// EscapeString escapes special characters like "<" to become "<". It +// escapes only five such characters: amp, apos, lt, gt and quot. +// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't +// always true. +func EscapeString(s string) string { + if strings.IndexAny(s, escapedChars) == -1 { + return s + } + buf := bytes.NewBuffer(nil) + escape(buf, s) + return buf.String() +} + +// UnescapeString unescapes entities like "<" to become "<". It unescapes a +// larger range of entities than EscapeString escapes. For example, "á" +// unescapes to "á", as does "á" and "&xE1;". +// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't +// always true. +func UnescapeString(s string) string { + for _, c := range s { + if c == '&' { + return string(unescape([]byte(s))) + } + } + return s +} -- cgit v1.2.3