libjava/gnu/gcj/convert/Input_UTF8.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114

/* Copyright (C) 1999, 2000  Free Software Foundation

   This file is part of libgcj.

This software is copyrighted work licensed under the terms of the
Libgcj License.  Please consult the file "LIBGCJ_LICENSE" for
details.  */

package gnu.gcj.convert;

/**
 * Convert UTF8 to Unicode.
 * @author Per Bothner <bothner@cygnus.com>
 * @date March 1999.
 */

public class Input_UTF8 extends BytesToUnicode
{
  public String getName() { return "UTF8"; }

  int partial = 0;
  int partial_bytes_expected = 0;
  //int suggogate_second = -1;

  public int read (char[] outbuffer, int outpos, int count)
  {
    int origpos = outpos;
    for (;;)
      {
	if (outpos - origpos >= count)
	  break;
	if (inpos >= inlength)
	  break;
	int b = inbuffer[inpos++];
	if (b >= 0)
	  outbuffer[outpos++] = (char) b;
	else
	  {
	    if ((b & 0xC0) == 0x80) // Continuation byte
	      {
		partial = (partial << 6) | (b & 0x3F);
		--partial_bytes_expected;
		if (partial_bytes_expected == 1)
		  {
		    if (partial > (0xFFFF>>6))
		      {
			// The next continuation byte will cause the result
			// to exceed 0xFFFF, so we must use a surrogate pair.
			// The "Unicode scalar value" (see D28 in section 3.7
			// of the Unicode Standard 2.0) is defined as:
			// value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
			// where (hi, lo) is the Unicode surrogate pair.
			// After reading the first three bytes, we have:
			// partial == (value >> 6).
			// Substituting and simplifying, we get:
			// partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
			// The definition lo>=0xDC00 && lo<=0xDFFF implies
			// that (lo-0xDC00)>>6 is in the range 0..15.
			// Hence we can solve for `hi' and we can emit
			// the high-surrogate without waiting for the
			// final byte:
			outbuffer[outpos++]
			  = (char) (0xD800 + ((partial - 0x400) >> 4));

			// Now we want to set it up so that when we read
			// the final byte on the next iteration, we will
			// get the low-surrogate without special handling.
			// I.e. we want:
			// lo == (next_partial << 6) | (next & 0x3F)
			// where next is the next input byte and next_partial
			// is the value of partial at the end of this
			// iteration.  This implies:  next_partial == lo >> 6.
			// We can simplify the previous:
			// partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400,
			// to: partial == (hi-0xD800)*0x10+(lo>>6)+0x90.
			// Inserting the values of hi and next_partial,
			// and simplifying, we get:  partial ==
			// ( (partial-0x400)&~0xF) + next_partial + 0x90.
			// Solving for next_partial, we get:
			// next_partial = partial+0x400-0x90-(partial&~0xF):
			// or: next_partial = (partial&0xF) + 0x370.  Hence:
			partial = (partial & 0xF) + 0x370;
		      }
		  }
		else if (partial_bytes_expected == 0)
		  {
		    outbuffer[outpos++] = (char) partial;
		    partial = 0;
		    partial_bytes_expected = 0;
		  }
	      }
	    else // prefix byte
	      {
		if ((b & 0xE0) == 0xC0)
		  {
		    partial = b & 0x1F;
		    partial_bytes_expected = 1;
		  }
		else if ((b & 0xF0) == 0xE0)
		  {
		    partial = b & 0xF;
		    partial_bytes_expected = 2;
		  }
		else
		  {
		    partial = b & 7;
		    partial_bytes_expected = 3;
		  }
	      }
	  }
      }
    return outpos - origpos;
  }
}