From 554fd8c5195424bdbcabf5de30fdc183aba391bd Mon Sep 17 00:00:00 2001 From: upstream source tree Date: Sun, 15 Mar 2015 20:14:05 -0400 Subject: obtained gcc-4.6.4.tar.bz2 from upstream website; verified gcc-4.6.4.tar.bz2.sig; imported gcc-4.6.4 source tree from verified upstream tarball. downloading a git-generated archive based on the 'upstream' tag should provide you with a source tree that is binary identical to the one extracted from the above tarball. if you have obtained the source via the command 'git clone', however, do note that line-endings of files in your working directory might differ from line-endings of the respective files in the upstream repository. --- libjava/classpath/gnu/java/nio/charset/UTF_8.java | 311 ++++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 libjava/classpath/gnu/java/nio/charset/UTF_8.java (limited to 'libjava/classpath/gnu/java/nio/charset/UTF_8.java') diff --git a/libjava/classpath/gnu/java/nio/charset/UTF_8.java b/libjava/classpath/gnu/java/nio/charset/UTF_8.java new file mode 100644 index 000000000..0423efe51 --- /dev/null +++ b/libjava/classpath/gnu/java/nio/charset/UTF_8.java @@ -0,0 +1,311 @@ +/* UTF_8.java -- + Copyright (C) 2002, 2004, 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.java.nio.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +/** + * UTF-8 charset. + * + *

UTF-8 references: + *

+ * + * @author Jesse Rosenstock + */ +final class UTF_8 extends Charset +{ + UTF_8 () + { + super ("UTF-8", new String[] { + /* These names are provided by + * http://oss.software.ibm.com/cgi-bin/icu/convexp?s=ALL + */ + "ibm-1208", "ibm-1209", "ibm-5304", "ibm-5305", + "windows-65001", "cp1208", + // see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html + "UTF8" + }); + } + + public boolean contains (Charset cs) + { + return cs instanceof US_ASCII || cs instanceof ISO_8859_1 + || cs instanceof UTF_8 || cs instanceof UTF_16BE + || cs instanceof UTF_16LE || cs instanceof UTF_16; + } + + public CharsetDecoder newDecoder () + { + return new Decoder (this); + } + + public CharsetEncoder newEncoder () + { + return new Encoder (this); + } + + private static final class Decoder extends CharsetDecoder + { + // Package-private to avoid a trampoline constructor. + Decoder (Charset cs) + { + super (cs, 1f, 1f); + } + + protected CoderResult decodeLoop (ByteBuffer in, CharBuffer out) + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + int inPos = in.position(); + try + { + while (in.hasRemaining ()) + { + char c; + byte b1 = in.get (); + int highNibble = ((b1 & 0xFF) >> 4) & 0xF; + switch (highNibble) + { + case 0: case 1: case 2: case 3: + case 4: case 5: case 6: case 7: + if (out.remaining () < 1) + return CoderResult.OVERFLOW; + out.put ((char) b1); + inPos++; + break; + + case 0xC: case 0xD: + byte b2; + if (in.remaining () < 1) + return CoderResult.UNDERFLOW; + if (out.remaining () < 1) + return CoderResult.OVERFLOW; + if (!isContinuation (b2 = in.get ())) + return CoderResult.malformedForLength (1); + c = (char) (((b1 & 0x1F) << 6) | (b2 & 0x3F)); + // check that we had the shortest encoding + if (c <= 0x7F) + return CoderResult.malformedForLength (2); + out.put (c); + inPos += 2; + break; + + case 0xE: + byte b3; + if (in.remaining () < 2) + return CoderResult.UNDERFLOW; + if (out.remaining () < 1) + return CoderResult.OVERFLOW; + if (!isContinuation (b2 = in.get ())) + return CoderResult.malformedForLength (1); + if (!isContinuation (b3 = in.get ())) + return CoderResult.malformedForLength (1); + c = (char) (((b1 & 0x0F) << 12) + | ((b2 & 0x3F) << 6) + | (b3 & 0x3F)); + // check that we had the shortest encoding + if (c <= 0x7FF) + return CoderResult.malformedForLength (3); + out.put (c); + inPos += 3; + break; + + case 0xF: + byte b4; + if (in.remaining () < 3) + return CoderResult.UNDERFLOW; + if((b1&0x0F) > 4) + return CoderResult.malformedForLength (4); + if (out.remaining () < 2) + return CoderResult.OVERFLOW; + if (!isContinuation (b2 = in.get ())) + return CoderResult.malformedForLength (3); + if (!isContinuation (b3 = in.get ())) + return CoderResult.malformedForLength (2); + if (!isContinuation (b4 = in.get ())) + return CoderResult.malformedForLength (1); + int n = (((b1 & 0x3) << 18) + | ((b2 & 0x3F) << 12) + | ((b3 & 0x3F) << 6) + | (b4 & 0x3F)) - 0x10000; + char c1 = (char)(0xD800 | (n & 0xFFC00)>>10); + char c2 = (char)(0xDC00 | (n & 0x003FF)); + out.put (c1); + out.put (c2); + inPos += 4; + break; + + default: + return CoderResult.malformedForLength (1); + } + } + + return CoderResult.UNDERFLOW; + } + finally + { + // In case we did a get(), then encountered an error, reset the + // position to before the error. If there was no error, this + // will benignly reset the position to the value it already has. + in.position (inPos); + } + } + + private static boolean isContinuation (byte b) + { + return (b & 0xC0) == 0x80; + } + } + + private static final class Encoder extends CharsetEncoder + { + // Package-private to avoid a trampoline constructor. + Encoder (Charset cs) + { + // According to + // http://www-106.ibm.com/developerworks/unicode/library/utfencodingforms/index.html + // On average, English takes slightly over one unit per code point. + // Most Latin-script languages take about 1.1 bytes. Greek, Russian, + // Arabic and Hebrew take about 1.7 bytes, and most others (including + // Japanese, Chinese, Korean and Hindi) take about 3 bytes. + // We assume we will be dealing with latin scripts, and use 1.1 + // for averageBytesPerChar. + super (cs, 1.1f, 4.0f); + } + + protected CoderResult encodeLoop (CharBuffer in, ByteBuffer out) + { + int inPos = in.position(); + try + { + // TODO: Optimize this in the case in.hasArray() / out.hasArray() + while (in.hasRemaining ()) + { + int remaining = out.remaining (); + char c = in.get (); + + // UCS-4 range (hex.) UTF-8 octet sequence (binary) + // 0000 0000-0000 007F 0xxxxxxx + // 0000 0080-0000 07FF 110xxxxx 10xxxxxx + // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + + // Scalar Value UTF-16 byte 1 byte 2 byte 3 byte 4 + // 0000 0000 0xxx xxxx 0000 0000 0xxx xxxx 0xxx xxxx + // 0000 0yyy yyxx xxxx 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx + // zzzz yyyy yyxx xxxx zzzz yyyy yyxx xxxx 1110 zzzz 10yy yyyy 10xx xxxx + // u uuuu zzzz yyyy yyxx xxxx 1101 10ww wwzz zzyy 1111 0uuu 10uu zzzz 10yy yyyy 10xx xxxx + // + 1101 11yy yyxx xxxx + // Note: uuuuu = wwww + 1 + if (c <= 0x7F) + { + if (remaining < 1) + return CoderResult.OVERFLOW; + out.put ((byte) c); + inPos++; + } + else if (c <= 0x7FF) + { + if (remaining < 2) + return CoderResult.OVERFLOW; + out.put ((byte) (0xC0 | (c >> 6))); + out.put ((byte) (0x80 | (c & 0x3F))); + inPos++; + } + else if (0xD800 <= c && c <= 0xDFFF) + { + if (remaining < 4) + return CoderResult.OVERFLOW; + + // we got a low surrogate without a preciding high one + if (c > 0xDBFF) + return CoderResult.malformedForLength (1); + + // high surrogates + if (!in.hasRemaining ()) + return CoderResult.UNDERFLOW; + + char d = in.get (); + + // make sure d is a low surrogate + if (d < 0xDC00 || d > 0xDFFF) + return CoderResult.malformedForLength (1); + + // make the 32 bit value + // int value2 = (c - 0xD800) * 0x400 + (d - 0xDC00) + 0x10000; + int value = (((c & 0x3FF) << 10) | (d & 0x3FF)) + 0x10000; + // assert value == value2; + out.put ((byte) (0xF0 | ((value >> 18) & 0x07))); + out.put ((byte) (0x80 | ((value >> 12) & 0x3F))); + out.put ((byte) (0x80 | ((value >> 6) & 0x3F))); + out.put ((byte) (0x80 | ((value ) & 0x3F))); + inPos += 2; + } + else + { + if (remaining < 3) + return CoderResult.OVERFLOW; + + out.put ((byte) (0xE0 | (c >> 12))); + out.put ((byte) (0x80 | ((c >> 6) & 0x3F))); + out.put ((byte) (0x80 | (c & 0x3F))); + inPos++; + } + } + + return CoderResult.UNDERFLOW; + } + finally + { + // In case we did a get(), then encountered an error, reset the + // position to before the error. If there was no error, this + // will benignly reset the position to the value it already has. + in.position (inPos); + } + } + } +} -- cgit v1.2.3