From 554fd8c5195424bdbcabf5de30fdc183aba391bd Mon Sep 17 00:00:00 2001 From: upstream source tree Date: Sun, 15 Mar 2015 20:14:05 -0400 Subject: obtained gcc-4.6.4.tar.bz2 from upstream website; verified gcc-4.6.4.tar.bz2.sig; imported gcc-4.6.4 source tree from verified upstream tarball. downloading a git-generated archive based on the 'upstream' tag should provide you with a source tree that is binary identical to the one extracted from the above tarball. if you have obtained the source via the command 'git clone', however, do note that line-endings of files in your working directory might differ from line-endings of the respective files in the upstream repository. --- libjava/classpath/gnu/xml/aelfred2/XmlParser.java | 5831 +++++++++++++++++++++ 1 file changed, 5831 insertions(+) create mode 100644 libjava/classpath/gnu/xml/aelfred2/XmlParser.java (limited to 'libjava/classpath/gnu/xml/aelfred2/XmlParser.java') diff --git a/libjava/classpath/gnu/xml/aelfred2/XmlParser.java b/libjava/classpath/gnu/xml/aelfred2/XmlParser.java new file mode 100644 index 000000000..813593d93 --- /dev/null +++ b/libjava/classpath/gnu/xml/aelfred2/XmlParser.java @@ -0,0 +1,5831 @@ +/* XmlParser.java -- + Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. + +Partly derived from code which carried the following notice: + + Copyright (c) 1997, 1998 by Microstar Software Ltd. + + AElfred is free for both commercial and non-commercial use and + redistribution, provided that Microstar's copyright and disclaimer are + retained intact. You are free to modify AElfred for your own use and + to redistribute AElfred with your modifications, provided that the + modifications are clearly documented. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + merchantability or fitness for a particular purpose. Please use it AT + YOUR OWN RISK. +*/ + +package gnu.xml.aelfred2; + +import gnu.java.security.action.GetPropertyAction; + +import java.io.BufferedInputStream; +import java.io.CharConversionException; +import java.io.EOFException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.IOException; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.net.URLConnection; +import java.security.AccessController; + +import java.util.Iterator; +import java.util.HashMap; +import java.util.LinkedList; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + + +/** + * Parse XML documents and return parse events through call-backs. + * Use the SAXDriver class as your entry point, as all + * internal parser interfaces are subject to change. + * + * @author Written by David Megginson <dmeggins@microstar.com> + * (version 1.2a with bugfixes) + * @author Updated by David Brownell <dbrownell@users.sourceforge.net> + * @see SAXDriver + */ +final class XmlParser +{ + + // avoid slow per-character readCh() + private final static boolean USE_CHEATS = true; + + //////////////////////////////////////////////////////////////////////// + // Constants. + //////////////////////////////////////////////////////////////////////// + + // + // Constants for element content type. + // + + /** + * Constant: an element has not been declared. + * @see #getElementContentType + */ + public final static int CONTENT_UNDECLARED = 0; + + /** + * Constant: the element has a content model of ANY. + * @see #getElementContentType + */ + public final static int CONTENT_ANY = 1; + + /** + * Constant: the element has declared content of EMPTY. + * @see #getElementContentType + */ + public final static int CONTENT_EMPTY = 2; + + /** + * Constant: the element has mixed content. + * @see #getElementContentType + */ + public final static int CONTENT_MIXED = 3; + + /** + * Constant: the element has element content. + * @see #getElementContentType + */ + public final static int CONTENT_ELEMENTS = 4; + + + // + // Constants for the entity type. + // + + /** + * Constant: the entity has not been declared. + * @see #getEntityType + */ + public final static int ENTITY_UNDECLARED = 0; + + /** + * Constant: the entity is internal. + * @see #getEntityType + */ + public final static int ENTITY_INTERNAL = 1; + + /** + * Constant: the entity is external, non-parsable data. + * @see #getEntityType + */ + public final static int ENTITY_NDATA = 2; + + /** + * Constant: the entity is external XML data. + * @see #getEntityType + */ + public final static int ENTITY_TEXT = 3; + + // + // Attribute type constants are interned literal strings. + // + + // + // Constants for supported encodings. "external" is just a flag. + // + private final static int ENCODING_EXTERNAL = 0; + private final static int ENCODING_UTF_8 = 1; + private final static int ENCODING_ISO_8859_1 = 2; + private final static int ENCODING_UCS_2_12 = 3; + private final static int ENCODING_UCS_2_21 = 4; + private final static int ENCODING_UCS_4_1234 = 5; + private final static int ENCODING_UCS_4_4321 = 6; + private final static int ENCODING_UCS_4_2143 = 7; + private final static int ENCODING_UCS_4_3412 = 8; + private final static int ENCODING_ASCII = 9; + + // + // Constants for attribute default value. + // + + /** + * Constant: the attribute is not declared. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30; + + /** + * Constant: the attribute has a literal default value specified. + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + */ + public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31; + + /** + * Constant: the attribute was declared #IMPLIED. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32; + + /** + * Constant: the attribute was declared #REQUIRED. + * @see #getAttributeDefaultValueType + */ + public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33; + + /** + * Constant: the attribute was declared #FIXED. + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + */ + public final static int ATTRIBUTE_DEFAULT_FIXED = 34; + + // + // Constants for input. + // + private final static int INPUT_NONE = 0; + private final static int INPUT_INTERNAL = 1; + private final static int INPUT_STREAM = 3; + private final static int INPUT_READER = 5; + + // + // Flags for reading literals. + // + // expand general entity refs (attribute values in dtd and content) + private final static int LIT_ENTITY_REF = 2; + // normalize this value (space chars) (attributes, public ids) + private final static int LIT_NORMALIZE = 4; + // literal is an attribute value + private final static int LIT_ATTRIBUTE = 8; + // don't expand parameter entities + private final static int LIT_DISABLE_PE = 16; + // don't expand [or parse] character refs + private final static int LIT_DISABLE_CREF = 32; + // don't parse general entity refs + private final static int LIT_DISABLE_EREF = 64; + // literal is a public ID value + private final static int LIT_PUBID = 256; + + // + // Flags affecting PE handling in DTDs (if expandPE is true). + // PEs expand with space padding, except inside literals. + // + private final static int CONTEXT_NORMAL = 0; + private final static int CONTEXT_LITERAL = 1; + + // Emit warnings for relative URIs with no base URI. + static boolean uriWarnings; + static + { + String key = "gnu.xml.aelfred2.XmlParser.uriWarnings"; + GetPropertyAction a = new GetPropertyAction(key); + uriWarnings = "true".equals(AccessController.doPrivileged(a)); + } + + // + // The current XML handler interface. + // + private SAXDriver handler; + + // + // I/O information. + // + private Reader reader; // current reader + private InputStream is; // current input stream + private int line; // current line number + private int column; // current column number + private int sourceType; // type of input source + private LinkedList inputStack; // stack of input soruces + private URLConnection externalEntity; // current external entity + private int encoding; // current character encoding + private int currentByteCount; // bytes read from current source + private InputSource scratch; // temporary + + // + // Buffers for decoded but unparsed character input. + // + private char[] readBuffer; + private int readBufferPos; + private int readBufferLength; + private int readBufferOverflow; // overflow from last data chunk. + + // + // Buffer for undecoded raw byte input. + // + private final static int READ_BUFFER_MAX = 16384; + private byte[] rawReadBuffer; + + + // + // Buffer for attribute values, char refs, DTD stuff. + // + private static int DATA_BUFFER_INITIAL = 4096; + private char[] dataBuffer; + private int dataBufferPos; + + // + // Buffer for parsed names. + // + private static int NAME_BUFFER_INITIAL = 1024; + private char[] nameBuffer; + private int nameBufferPos; + + // + // Save any standalone flag + // + private boolean docIsStandalone; + + // + // Hashtables for DTD information on elements, entities, and notations. + // Populated until we start ignoring decls (because of skipping a PE) + // + private HashMap elementInfo; + private HashMap entityInfo; + private HashMap notationInfo; + private boolean skippedPE; + + // + // Element type currently in force. + // + private String currentElement; + private int currentElementContent; + + // + // Stack of entity names, to detect recursion. + // + private LinkedList entityStack; + + // + // PE expansion is enabled in most chunks of the DTD, not all. + // When it's enabled, literals are treated differently. + // + private boolean inLiteral; + private boolean expandPE; + private boolean peIsError; + + // + // can't report entity expansion inside two constructs: + // - attribute expansions (internal entities only) + // - markup declarations (parameter entities only) + // + private boolean doReport; + + // + // Symbol table, for caching interned names. + // + // These show up wherever XML names or nmtokens are used: naming elements, + // attributes, PIs, notations, entities, and enumerated attribute values. + // + // NOTE: This hashtable doesn't grow. The default size is intended to be + // rather large for most documents. Example: one snapshot of the DocBook + // XML 4.1 DTD used only about 350 such names. As a rule, only pathological + // documents (ones that don't reuse names) should ever see much collision. + // + // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing. + // "2039" keeps the hash table size at about two memory pages on typical + // 32 bit hardware. + // + private final static int SYMBOL_TABLE_LENGTH = 2039; + + private Object[][] symbolTable; + + // + // Hash table of attributes found in current start tag. + // + private String[] tagAttributes; + private int tagAttributePos; + + // + // Utility flag: have we noticed a CR while reading the last + // data chunk? If so, we will have to go back and normalise + // CR or CR/LF line ends. + // + private boolean sawCR; + + // + // Utility flag: are we in CDATA? If so, whitespace isn't ignorable. + // + private boolean inCDATA; + + // + // Xml version. + // + private static final int XML_10 = 0; + private static final int XML_11 = 1; + private int xmlVersion = XML_10; + + ////////////////////////////////////////////////////////////////////// + // Constructors. + //////////////////////////////////////////////////////////////////////// + + /** + * Construct a new parser with no associated handler. + * @see #setHandler + * @see #parse + */ + // package private + XmlParser() + { + } + + /** + * Set the handler that will receive parsing events. + * @param handler The handler to receive callback events. + * @see #parse + */ + // package private + void setHandler(SAXDriver handler) + { + this.handler = handler; + } + + /** + * Parse an XML document from the character stream, byte stream, or URI + * that you provide (in that order of preference). Any URI that you + * supply will become the base URI for resolving relative URI, and may + * be used to acquire a reader or byte stream. + * + *

Only one thread at a time may use this parser; since it is + * private to this package, post-parse cleanup is done by the caller, + * which MUST NOT REUSE the parser (just null it). + * + * @param systemId Absolute URI of the document; should never be null, + * but may be so iff a reader or a stream is provided. + * @param publicId The public identifier of the document, or null. + * @param reader A character stream; must be null if stream isn't. + * @param stream A byte input stream; must be null if reader isn't. + * @param encoding The suggested encoding, or null if unknown. + * @exception java.lang.Exception Basically SAXException or IOException + */ + // package private + void doParse(String systemId, String publicId, Reader reader, + InputStream stream, String encoding) + throws Exception + { + if (handler == null) + { + throw new IllegalStateException("no callback handler"); + } + + initializeVariables(); + + // predeclare the built-in entities here (replacement texts) + // we don't need to intern(), since we're guaranteed literals + // are always (globally) interned. + setInternalEntity("amp", "&"); + setInternalEntity("lt", "<"); + setInternalEntity("gt", ">"); + setInternalEntity("apos", "'"); + setInternalEntity("quot", """); + + try + { + // pushURL first to ensure locator is correct in startDocument + // ... it might report an IO or encoding exception. + handler.startDocument(); + pushURL(false, "[document]", + // default baseURI: null + new ExternalIdentifiers(publicId, systemId, null), + reader, stream, encoding, false); + + parseDocument(); + } + catch (EOFException e) + { + //empty input + error("empty document, with no root element."); + } + finally + { + if (reader != null) + { + try + { + reader.close(); + } + catch (IOException e) + { + /* ignore */ + } + } + if (stream != null) + { + try + { + stream.close(); + } + catch (IOException e) + { + /* ignore */ + } + } + if (is != null) + { + try + { + is.close(); + } + catch (IOException e) + { + /* ignore */ + } + } + scratch = null; + } + } + + ////////////////////////////////////////////////////////////////////// + // Error reporting. + ////////////////////////////////////////////////////////////////////// + + /** + * Report an error. + * @param message The error message. + * @param textFound The text that caused the error (or null). + * @see SAXDriver#error + * @see #line + */ + private void error(String message, String textFound, String textExpected) + throws SAXException + { + if (textFound != null) + { + message = message + " (found \"" + textFound + "\")"; + } + if (textExpected != null) + { + message = message + " (expected \"" + textExpected + "\")"; + } + handler.fatal(message); + + // "can't happen" + throw new SAXException(message); + } + + /** + * Report a serious error. + * @param message The error message. + * @param textFound The text that caused the error (or null). + */ + private void error(String message, char textFound, String textExpected) + throws SAXException + { + error(message, Character.toString(textFound), textExpected); + } + + /** + * Report typical case fatal errors. + */ + private void error(String message) + throws SAXException + { + handler.fatal(message); + } + + ////////////////////////////////////////////////////////////////////// + // Major syntactic productions. + ////////////////////////////////////////////////////////////////////// + + /** + * Parse an XML document. + *

+   * [1] document ::= prolog element Misc*
+   * 
+ *

This is the top-level parsing function for a single XML + * document. As a minimum, a well-formed document must have + * a document element, and a valid document must have a prolog + * (one with doctype) as well. + */ + private void parseDocument() + throws Exception + { + try + { // added by MHK + boolean sawDTD = parseProlog(); + require('<'); + parseElement(!sawDTD); + } + catch (EOFException ee) + { // added by MHK + error("premature end of file", "[EOF]", null); + } + + try + { + parseMisc(); //skip all white, PIs, and comments + char c = readCh(); //if this doesn't throw an exception... + error("unexpected characters after document end", c, null); + } + catch (EOFException e) + { + return; + } + } + + static final char[] startDelimComment = { '<', '!', '-', '-' }; + static final char[] endDelimComment = { '-', '-' }; + + /** + * Skip a comment. + *

+   * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
+   * 
+ *

(The <!-- has already been read.) + */ + private void parseComment() + throws Exception + { + char c; + boolean saved = expandPE; + + expandPE = false; + parseUntil(endDelimComment); + require('>'); + expandPE = saved; + handler.comment(dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + + static final char[] startDelimPI = { '<', '?' }; + static final char[] endDelimPI = { '?', '>' }; + + /** + * Parse a processing instruction and do a call-back. + *

+   * [16] PI ::= '<?' PITarget
+   *    (S (Char* - (Char* '?>' Char*)))?
+   *    '?>'
+   * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
+   * 
+ *

(The <? has already been read.) + */ + private void parsePI() + throws SAXException, IOException + { + String name; + boolean saved = expandPE; + + expandPE = false; + name = readNmtoken(true); + //NE08 + if (name.indexOf(':') >= 0) + { + error("Illegal character(':') in processing instruction name ", + name, null); + } + if ("xml".equalsIgnoreCase(name)) + { + error("Illegal processing instruction target", name, null); + } + if (!tryRead(endDelimPI)) + { + requireWhitespace(); + parseUntil(endDelimPI); + } + expandPE = saved; + handler.processingInstruction(name, dataBufferToString()); + } + + static final char[] endDelimCDATA = { ']', ']', '>' }; + + private boolean isDirtyCurrentElement; + + /** + * Parse a CDATA section. + *

+   * [18] CDSect ::= CDStart CData CDEnd
+   * [19] CDStart ::= '<![CDATA['
+   * [20] CData ::= (Char* - (Char* ']]>' Char*))
+   * [21] CDEnd ::= ']]>'
+   * 
+ *

(The '<![CDATA[' has already been read.) + */ + private void parseCDSect() + throws Exception + { + parseUntil(endDelimCDATA); + dataBufferFlush(); + } + + /** + * Parse the prolog of an XML document. + *

+   * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
+   * 
+ *

We do not look for the XML declaration here, because it was + * handled by pushURL (). + * @see pushURL + * @return true if a DTD was read. + */ + private boolean parseProlog() + throws Exception + { + parseMisc(); + + if (tryRead(" + * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' + * [24] VersionInfo ::= S 'version' Eq + * ("'" VersionNum "'" | '"' VersionNum '"' ) + * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* + * [32] SDDecl ::= S 'standalone' Eq + * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) + * [80] EncodingDecl ::= S 'encoding' Eq + * ( "'" EncName "'" | "'" EncName "'" ) + * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* + * + *

(The <?xml and whitespace have already been read.) + * @return the encoding in the declaration, uppercased; or null + * @see #parseTextDecl + * @see #setupDecoding + */ + private String parseXMLDecl(boolean ignoreEncoding) + throws SAXException, IOException + { + String version; + String encodingName = null; + String standalone = null; + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + String inputEncoding = null; + + switch (this.encoding) + { + case ENCODING_EXTERNAL: + case ENCODING_UTF_8: + inputEncoding = "UTF-8"; + break; + case ENCODING_ISO_8859_1: + inputEncoding = "ISO-8859-1"; + break; + case ENCODING_UCS_2_12: + inputEncoding = "UTF-16BE"; + break; + case ENCODING_UCS_2_21: + inputEncoding = "UTF-16LE"; + break; + } + + // Read the version. + require("version"); + parseEq(); + checkLegalVersion(version = readLiteral(flags)); + if (!version.equals("1.0")) + { + if (version.equals("1.1")) + { + handler.warn("expected XML version 1.0, not: " + version); + xmlVersion = XML_11; + } + else + { + error("illegal XML version", version, "1.0 or 1.1"); + } + } + else + { + xmlVersion = XML_10; + } + // Try reading an encoding declaration. + boolean white = tryWhitespace(); + + if (tryRead("encoding")) + { + if (!white) + { + error("whitespace required before 'encoding='"); + } + parseEq(); + encodingName = readLiteral(flags); + if (!ignoreEncoding) + { + setupDecoding(encodingName); + } + } + + // Try reading a standalone declaration + if (encodingName != null) + { + white = tryWhitespace(); + } + if (tryRead("standalone")) + { + if (!white) + { + error("whitespace required before 'standalone='"); + } + parseEq(); + standalone = readLiteral(flags); + if ("yes".equals(standalone)) + { + docIsStandalone = true; + } + else if (!"no".equals(standalone)) + { + error("standalone flag must be 'yes' or 'no'"); + } + } + + skipWhitespace(); + require("?>"); + + if (inputEncoding == null) + { + inputEncoding = encodingName; + } + return encodingName; + } + + /** + * Parse a text declaration. + *

+   * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
+   * [80] EncodingDecl ::= S 'encoding' Eq
+   *    ( '"' EncName '"' | "'" EncName "'" )
+   * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
+   * 
+ *

(The <?xml' and whitespace have already been read.) + * @return the encoding in the declaration, uppercased; or null + * @see #parseXMLDecl + * @see #setupDecoding + */ + private String parseTextDecl(boolean ignoreEncoding) + throws SAXException, IOException + { + String encodingName = null; + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + + // Read an optional version. + if (tryRead ("version")) + { + String version; + parseEq(); + checkLegalVersion(version = readLiteral(flags)); + + if (version.equals("1.1")) + { + if (xmlVersion == XML_10) + { + error("external subset has later version number.", "1.0", + version); + } + handler.warn("expected XML version 1.0, not: " + version); + xmlVersion = XML_11; + } + else if (!version.equals("1.0")) + { + error("illegal XML version", version, "1.0 or 1.1"); + } + requireWhitespace(); + } + + // Read the encoding. + require("encoding"); + parseEq(); + encodingName = readLiteral(flags); + if (!ignoreEncoding) + { + setupDecoding(encodingName); + } + skipWhitespace(); + require("?>"); + + return encodingName; + } + + /** + * Sets up internal state so that we can decode an entity using the + * specified encoding. This is used when we start to read an entity + * and we have been given knowledge of its encoding before we start to + * read any data (e.g. from a SAX input source or from a MIME type). + * + *

It is also used after autodetection, at which point only very + * limited adjustments to the encoding may be used (switching between + * related builtin decoders). + * + * @param encodingName The name of the encoding specified by the user. + * @exception IOException if the encoding isn't supported either + * internally to this parser, or by the hosting JVM. + * @see #parseXMLDecl + * @see #parseTextDecl + */ + private void setupDecoding(String encodingName) + throws SAXException, IOException + { + encodingName = encodingName.toUpperCase(); + + // ENCODING_EXTERNAL indicates an encoding that wasn't + // autodetected ... we can use builtin decoders, or + // ones from the JVM (InputStreamReader). + + // Otherwise we can only tweak what was autodetected, and + // only for single byte (ASCII derived) builtin encodings. + + // ASCII-derived encodings + if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) + { + if (encodingName.equals("ISO-8859-1") + || encodingName.equals("8859_1") + || encodingName.equals("ISO8859_1")) + { + encoding = ENCODING_ISO_8859_1; + return; + } + else if (encodingName.equals("US-ASCII") + || encodingName.equals("ASCII")) + { + encoding = ENCODING_ASCII; + return; + } + else if (encodingName.equals("UTF-8") + || encodingName.equals("UTF8")) + { + encoding = ENCODING_UTF_8; + return; + } + else if (encoding != ENCODING_EXTERNAL) + { + // used to start with a new reader ... + throw new UnsupportedEncodingException(encodingName); + } + // else fallthrough ... + // it's ASCII-ish and something other than a builtin + } + + // Unicode and such + if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) + { + if (!(encodingName.equals("ISO-10646-UCS-2") + || encodingName.equals("UTF-16") + || encodingName.equals("UTF-16BE") + || encodingName.equals("UTF-16LE"))) + { + error("unsupported Unicode encoding", encodingName, "UTF-16"); + } + return; + } + + // four byte encodings + if (encoding == ENCODING_UCS_4_1234 + || encoding == ENCODING_UCS_4_4321 + || encoding == ENCODING_UCS_4_2143 + || encoding == ENCODING_UCS_4_3412) + { + // Strictly: "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists + if (!encodingName.equals("ISO-10646-UCS-4")) + { + error("unsupported 32-bit encoding", encodingName, + "ISO-10646-UCS-4"); + } + return; + } + + // assert encoding == ENCODING_EXTERNAL + // if (encoding != ENCODING_EXTERNAL) + // throw new RuntimeException ("encoding = " + encoding); + + if (encodingName.equals("UTF-16BE")) + { + encoding = ENCODING_UCS_2_12; + return; + } + if (encodingName.equals("UTF-16LE")) + { + encoding = ENCODING_UCS_2_21; + return; + } + + // We couldn't use the builtin decoders at all. But we can try to + // create a reader, since we haven't messed up buffering. Tweak + // the encoding name if necessary. + + if (encodingName.equals("UTF-16") + || encodingName.equals("ISO-10646-UCS-2")) + { + encodingName = "Unicode"; + } + // Ignoring all the EBCDIC aliases here + + reader = new InputStreamReader(is, encodingName); + sourceType = INPUT_READER; + } + + /** + * Parse miscellaneous markup outside the document element and DOCTYPE + * declaration. + *

+   * [27] Misc ::= Comment | PI | S
+   * 
+ */ + private void parseMisc() + throws Exception + { + while (true) + { + skipWhitespace(); + if (tryRead(startDelimPI)) + { + parsePI(); + } + else if (tryRead(startDelimComment)) + { + parseComment(); + } + else + { + return; + } + } + } + + /** + * Parse a document type declaration. + *
+   * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
+   *    ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
+   * 
+ *

(The <!DOCTYPE has already been read.) + */ + private void parseDoctypedecl() + throws Exception + { + String rootName; + ExternalIdentifiers ids; + + // Read the document type name. + requireWhitespace(); + rootName = readNmtoken(true); + + // Read the External subset's IDs + skipWhitespace(); + ids = readExternalIds(false, true); + + // report (a) declaration of name, (b) lexical info (ids) + handler.doctypeDecl(rootName, ids.publicId, ids.systemId); + + // Internal subset is parsed first, if present + skipWhitespace(); + if (tryRead('[')) + { + + // loop until the subset ends + while (true) + { + doReport = expandPE = true; + skipWhitespace(); + doReport = expandPE = false; + if (tryRead(']')) + { + break; // end of subset + } + else + { + // WFC, PEs in internal subset (only between decls) + peIsError = expandPE = true; + parseMarkupdecl(); + peIsError = expandPE = false; + } + } + } + skipWhitespace(); + require('>'); + + // Read the external subset, if any + InputSource subset; + + if (ids.systemId == null) + { + subset = handler.getExternalSubset(rootName, + handler.getSystemId()); + } + else + { + subset = null; + } + if (ids.systemId != null || subset != null) + { + pushString(null, ">"); + + // NOTE: [dtd] is so we say what SAX2 expects, + // though it's misleading (subset, not entire dtd) + if (ids.systemId != null) + { + pushURL(true, "[dtd]", ids, null, null, null, true); + } + else + { + handler.warn("modifying document by adding external subset"); + pushURL(true, "[dtd]", + new ExternalIdentifiers(subset.getPublicId(), + subset.getSystemId(), + null), + subset.getCharacterStream(), + subset.getByteStream(), + subset.getEncoding(), + false); + } + + // Loop until we end up back at '>' + while (true) + { + doReport = expandPE = true; + skipWhitespace(); + doReport = expandPE = false; + if (tryRead('>')) + { + break; + } + else + { + expandPE = true; + parseMarkupdecl(); + expandPE = false; + } + } + + // the ">" string isn't popped yet + if (inputStack.size() != 1) + { + error("external subset has unmatched '>'"); + } + } + + // done dtd + handler.endDoctype(); + expandPE = false; + doReport = true; + } + + /** + * Parse a markup declaration in the internal or external DTD subset. + *

+   * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
+   *    | NotationDecl | PI | Comment
+   * [30] extSubsetDecl ::= (markupdecl | conditionalSect
+   *    | PEReference | S) *
+   * 
+ *

Reading toplevel PE references is handled as a lexical issue + * by the caller, as is whitespace. + */ + private void parseMarkupdecl() + throws Exception + { + char[] saved = null; + boolean savedPE = expandPE; + + // prevent "<%foo;" and ensures saved entity is right + require('<'); + unread('<'); + expandPE = false; + + if (tryRead(" 0) + { + parseConditionalSect(saved); + } + else + { + error("conditional sections illegal in internal subset"); + } + } + else + { + error("expected markup declaration"); + } + + // VC: Proper Decl/PE Nesting + if (readBuffer != saved) + { + handler.verror("Illegal Declaration/PE nesting"); + } + } + + /** + * Parse an element, with its tags. + *

+   * [39] element ::= EmptyElementTag | STag content ETag
+   * [40] STag ::= '<' Name (S Attribute)* S? '>'
+   * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>'
+   * 
+ *

(The '<' has already been read.) + *

NOTE: this method actually chains onto parseContent (), if necessary, + * and parseContent () will take care of calling parseETag (). + */ + private void parseElement(boolean maybeGetSubset) + throws Exception + { + String gi; + char c; + int oldElementContent = currentElementContent; + String oldElement = currentElement; + ElementDecl element; + + // This is the (global) counter for the + // array of specified attributes. + tagAttributePos = 0; + + // Read the element type name. + gi = readNmtoken(true); + + // If we saw no DTD, and this is the document root element, + // let the application modify the input stream by providing one. + if (maybeGetSubset) + { + InputSource subset = handler.getExternalSubset(gi, + handler.getSystemId()); + if (subset != null) + { + String publicId = subset.getPublicId(); + String systemId = subset.getSystemId(); + + handler.warn("modifying document by adding DTD"); + handler.doctypeDecl(gi, publicId, systemId); + pushString(null, ">"); + + // NOTE: [dtd] is so we say what SAX2 expects, + // though it's misleading (subset, not entire dtd) + pushURL(true, "[dtd]", + new ExternalIdentifiers(publicId, systemId, null), + subset.getCharacterStream(), + subset.getByteStream(), + subset.getEncoding(), + false); + + // Loop until we end up back at '>' + while (true) + { + doReport = expandPE = true; + skipWhitespace(); + doReport = expandPE = false; + if (tryRead('>')) + { + break; + } + else + { + expandPE = true; + parseMarkupdecl(); + expandPE = false; + } + } + + // the ">" string isn't popped yet + if (inputStack.size() != 1) + { + error("external subset has unmatched '>'"); + } + + handler.endDoctype(); + } + } + + // Determine the current content type. + currentElement = gi; + element = (ElementDecl) elementInfo.get(gi); + currentElementContent = getContentType(element, CONTENT_ANY); + + // Read the attributes, if any. + // After this loop, "c" is the closing delimiter. + boolean white = tryWhitespace(); + c = readCh(); + while (c != '/' && c != '>') + { + unread(c); + if (!white) + { + error("need whitespace between attributes"); + } + parseAttribute(gi); + white = tryWhitespace(); + c = readCh(); + } + + // Supply any defaulted attributes. + Iterator atts = declaredAttributes(element); + if (atts != null) + { + String aname; +loop: + while (atts.hasNext()) + { + aname = (String) atts.next(); + // See if it was specified. + for (int i = 0; i < tagAttributePos; i++) + { + if (tagAttributes[i] == aname) + { + continue loop; + } + } + // ... or has a default + String value = getAttributeDefaultValue(gi, aname); + + if (value == null) + { + continue; + } + handler.attribute(aname, value, false); + } + } + + // Figure out if this is a start tag + // or an empty element, and dispatch an + // event accordingly. + switch (c) + { + case '>': + handler.startElement(gi); + parseContent(); + break; + case '/': + require('>'); + handler.startElement(gi); + handler.endElement(gi); + break; + } + + // Restore the previous state. + currentElement = oldElement; + currentElementContent = oldElementContent; + } + + /** + * Parse an attribute assignment. + *

+   * [41] Attribute ::= Name Eq AttValue
+   * 
+ * @param name The name of the attribute's element. + * @see SAXDriver#attribute + */ + private void parseAttribute(String name) + throws Exception + { + String aname; + String type; + String value; + int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; + + // Read the attribute name. + aname = readNmtoken(true); + type = getAttributeType(name, aname); + + // Parse '=' + parseEq(); + + // Read the value, normalizing whitespace + // unless it is CDATA. + if (handler.stringInterning) + { + if (type == "CDATA" || type == null) + { + value = readLiteral(flags); + } + else + { + value = readLiteral(flags | LIT_NORMALIZE); + } + } + else + { + if (type == null || type.equals("CDATA")) + { + value = readLiteral(flags); + } + else + { + value = readLiteral(flags | LIT_NORMALIZE); + } + } + + // WFC: no duplicate attributes + for (int i = 0; i < tagAttributePos; i++) + { + if (aname.equals(tagAttributes [i])) + { + error("duplicate attribute", aname, null); + } + } + + // Inform the handler about the + // attribute. + handler.attribute(aname, value, true); + dataBufferPos = 0; + + // Note that the attribute has been + // specified. + if (tagAttributePos == tagAttributes.length) + { + String newAttrib[] = new String[tagAttributes.length * 2]; + System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); + tagAttributes = newAttrib; + } + tagAttributes[tagAttributePos++] = aname; + } + + /** + * Parse an equals sign surrounded by optional whitespace. + *
+   * [25] Eq ::= S? '=' S?
+   * 
+ */ + private void parseEq() + throws SAXException, IOException + { + skipWhitespace(); + require('='); + skipWhitespace(); + } + + /** + * Parse an end tag. + *
+   * [42] ETag ::= ''
+   * 
+ *

NOTE: parseContent () chains to here, we already read the + * "</". + */ + private void parseETag() + throws Exception + { + require(currentElement); + skipWhitespace(); + require('>'); + handler.endElement(currentElement); + // not re-reporting any SAXException re bogus end tags, + // even though that diagnostic might be clearer ... + } + + /** + * Parse the content of an element. + *

+   * [43] content ::= (element | CharData | Reference
+   *    | CDSect | PI | Comment)*
+   * [67] Reference ::= EntityRef | CharRef
+   * 
+ *

NOTE: consumes ETtag. + */ + private void parseContent() + throws Exception + { + char c; + + while (true) + { + // consume characters (or ignorable whitspace) until delimiter + parseCharData(); + + // Handle delimiters + c = readCh(); + switch (c) + { + case '&': // Found "&" + c = readCh(); + if (c == '#') + { + parseCharRef(); + } + else + { + unread(c); + parseEntityRef(true); + } + isDirtyCurrentElement = true; + break; + + case '<': // Found "<" + dataBufferFlush(); + c = readCh(); + switch (c) + { + case '!': // Found " + * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' + * + *

NOTE: the '<!ELEMENT' has already been read. + */ + private void parseElementDecl() + throws Exception + { + String name; + + requireWhitespace(); + // Read the element type name. + name = readNmtoken(true); + + requireWhitespace(); + // Read the content model. + parseContentspec(name); + + skipWhitespace(); + require('>'); + } + + /** + * Content specification. + *

+   * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
+   * 
+ */ + private void parseContentspec(String name) + throws Exception + { + // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ... + if (tryRead("EMPTY")) + { + setElement(name, CONTENT_EMPTY, null, null); + if (!skippedPE) + { + handler.getDeclHandler().elementDecl(name, "EMPTY"); + } + return; + } + else if (tryRead("ANY")) + { + setElement(name, CONTENT_ANY, null, null); + if (!skippedPE) + { + handler.getDeclHandler().elementDecl(name, "ANY"); + } + return; + } + else + { + String model; + char[] saved; + + require('('); + saved = readBuffer; + dataBufferAppend('('); + skipWhitespace(); + if (tryRead("#PCDATA")) + { + dataBufferAppend("#PCDATA"); + parseMixed(saved); + model = dataBufferToString(); + setElement(name, CONTENT_MIXED, model, null); + } + else + { + parseElements(saved); + model = dataBufferToString(); + setElement(name, CONTENT_ELEMENTS, model, null); + } + if (!skippedPE) + { + handler.getDeclHandler().elementDecl(name, model); + } + } + } + + /** + * Parse an element-content model. + *
+   * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
+   * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
+   * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
+   * 
+ * + *

NOTE: the opening '(' and S have already been read. + * + * @param saved Buffer for entity that should have the terminal ')' + */ + private void parseElements(char[] saved) + throws Exception + { + char c; + char sep; + + // Parse the first content particle + skipWhitespace(); + parseCp(); + + // Check for end or for a separator. + skipWhitespace(); + c = readCh(); + switch (c) + { + case ')': + // VC: Proper Group/PE Nesting + if (readBuffer != saved) + { + handler.verror("Illegal Group/PE nesting"); + } + + dataBufferAppend(')'); + c = readCh(); + switch (c) + { + case '*': + case '+': + case '?': + dataBufferAppend(c); + break; + default: + unread(c); + } + return; + case ',': // Register the separator. + case '|': + sep = c; + dataBufferAppend(c); + break; + default: + error("bad separator in content model", c, null); + return; + } + + // Parse the rest of the content model. + while (true) + { + skipWhitespace(); + parseCp(); + skipWhitespace(); + c = readCh(); + if (c == ')') + { + // VC: Proper Group/PE Nesting + if (readBuffer != saved) + { + handler.verror("Illegal Group/PE nesting"); + } + + dataBufferAppend(')'); + break; + } + else if (c != sep) + { + error("bad separator in content model", c, null); + return; + } + else + { + dataBufferAppend(c); + } + } + + // Check for the occurrence indicator. + c = readCh(); + switch (c) + { + case '?': + case '*': + case '+': + dataBufferAppend(c); + return; + default: + unread(c); + return; + } + } + + /** + * Parse a content particle. + *

+   * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
+   * 
+ */ + private void parseCp() + throws Exception + { + if (tryRead('(')) + { + dataBufferAppend('('); + parseElements(readBuffer); + } + else + { + dataBufferAppend(readNmtoken(true)); + char c = readCh(); + switch (c) + { + case '?': + case '*': + case '+': + dataBufferAppend(c); + break; + default: + unread(c); + break; + } + } + } + + /** + * Parse mixed content. + *
+   * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
+   *        | '(' S? ('#PCDATA') S? ')'
+   * 
+ * + * @param saved Buffer for entity that should have the terminal ')' + */ + private void parseMixed(char[] saved) + throws Exception + { + // Check for PCDATA alone. + skipWhitespace(); + if (tryRead(')')) + { + // VC: Proper Group/PE Nesting + if (readBuffer != saved) + { + handler.verror("Illegal Group/PE nesting"); + } + + dataBufferAppend(")*"); + tryRead('*'); + return; + } + + // Parse mixed content. + skipWhitespace(); + while (!tryRead(")")) + { + require('|'); + dataBufferAppend('|'); + skipWhitespace(); + dataBufferAppend(readNmtoken(true)); + skipWhitespace(); + } + + // VC: Proper Group/PE Nesting + if (readBuffer != saved) + { + handler.verror("Illegal Group/PE nesting"); + } + + require('*'); + dataBufferAppend(")*"); + } + + /** + * Parse an attribute list declaration. + *
+   * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
+   * 
+ *

NOTE: the '<!ATTLIST' has already been read. + */ + private void parseAttlistDecl() + throws Exception + { + String elementName; + + requireWhitespace(); + elementName = readNmtoken(true); + boolean white = tryWhitespace(); + while (!tryRead('>')) + { + if (!white) + { + error("whitespace required before attribute definition"); + } + parseAttDef(elementName); + white = tryWhitespace(); + } + } + + /** + * Parse a single attribute definition. + *

+   * [53] AttDef ::= S Name S AttType S DefaultDecl
+   * 
+ */ + private void parseAttDef(String elementName) + throws Exception + { + String name; + String type; + String enumer = null; + + // Read the attribute name. + name = readNmtoken(true); + + // Read the attribute type. + requireWhitespace(); + type = readAttType(); + + // Get the string of enumerated values if necessary. + if (handler.stringInterning) + { + if ("ENUMERATION" == type || "NOTATION" == type) + { + enumer = dataBufferToString(); + } + } + else + { + if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) + { + enumer = dataBufferToString(); + } + } + + // Read the default value. + requireWhitespace(); + parseDefault(elementName, name, type, enumer); + } + + /** + * Parse the attribute type. + *
+   * [54] AttType ::= StringType | TokenizedType | EnumeratedType
+   * [55] StringType ::= 'CDATA'
+   * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
+   *    | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
+   * [57] EnumeratedType ::= NotationType | Enumeration
+   * 
+ */ + private String readAttType() + throws Exception + { + if (tryRead('(')) + { + parseEnumeration(false); + return "ENUMERATION"; + } + else + { + String typeString = readNmtoken(true); + if (handler.stringInterning) + { + if ("NOTATION" == typeString) + { + parseNotationType(); + return typeString; + } + else if ("CDATA" == typeString + || "ID" == typeString + || "IDREF" == typeString + || "IDREFS" == typeString + || "ENTITY" == typeString + || "ENTITIES" == typeString + || "NMTOKEN" == typeString + || "NMTOKENS" == typeString) + { + return typeString; + } + } + else + { + if ("NOTATION".equals(typeString)) + { + parseNotationType(); + return typeString; + } + else if ("CDATA".equals(typeString) + || "ID".equals(typeString) + || "IDREF".equals(typeString) + || "IDREFS".equals(typeString) + || "ENTITY".equals(typeString) + || "ENTITIES".equals(typeString) + || "NMTOKEN".equals(typeString) + || "NMTOKENS".equals(typeString)) + { + return typeString; + } + } + error("illegal attribute type", typeString, null); + return null; + } + } + + /** + * Parse an enumeration. + *
+   * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
+   * 
+ *

NOTE: the '(' has already been read. + */ + private void parseEnumeration(boolean isNames) + throws Exception + { + dataBufferAppend('('); + + // Read the first token. + skipWhitespace(); + dataBufferAppend(readNmtoken(isNames)); + // Read the remaining tokens. + skipWhitespace(); + while (!tryRead(')')) + { + require('|'); + dataBufferAppend('|'); + skipWhitespace(); + dataBufferAppend(readNmtoken (isNames)); + skipWhitespace(); + } + dataBufferAppend(')'); + } + + /** + * Parse a notation type for an attribute. + *

+   * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
+   *    (S? '|' S? name)* S? ')'
+   * 
+ *

NOTE: the 'NOTATION' has already been read + */ + private void parseNotationType() + throws Exception + { + requireWhitespace(); + require('('); + + parseEnumeration(true); + } + + /** + * Parse the default value for an attribute. + *

+   * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
+   *    | (('#FIXED' S)? AttValue)
+   * 
+ */ + private void parseDefault(String elementName, String name, + String type, String enumer) + throws Exception + { + int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; + String value = null; + int flags = LIT_ATTRIBUTE; + boolean saved = expandPE; + String defaultType = null; + + // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace + // chars to spaces (doesn't matter when that's done if it doesn't + // interfere with char refs expanding to whitespace). + + if (!skippedPE) + { + flags |= LIT_ENTITY_REF; + if (handler.stringInterning) + { + if ("CDATA" != type) + { + flags |= LIT_NORMALIZE; + } + } + else + { + if (!"CDATA".equals(type)) + { + flags |= LIT_NORMALIZE; + } + } + } + + expandPE = false; + if (tryRead('#')) + { + if (tryRead("FIXED")) + { + defaultType = "#FIXED"; + valueType = ATTRIBUTE_DEFAULT_FIXED; + requireWhitespace(); + value = readLiteral(flags); + } + else if (tryRead("REQUIRED")) + { + defaultType = "#REQUIRED"; + valueType = ATTRIBUTE_DEFAULT_REQUIRED; + } + else if (tryRead("IMPLIED")) + { + defaultType = "#IMPLIED"; + valueType = ATTRIBUTE_DEFAULT_IMPLIED; + } + else + { + error("illegal keyword for attribute default value"); + } + } + else + { + value = readLiteral(flags); + } + expandPE = saved; + setAttribute(elementName, name, type, enumer, value, valueType); + if (handler.stringInterning) + { + if ("ENUMERATION" == type) + { + type = enumer; + } + else if ("NOTATION" == type) + { + type = "NOTATION " + enumer; + } + } + else + { + if ("ENUMERATION".equals(type)) + { + type = enumer; + } + else if ("NOTATION".equals(type)) + { + type = "NOTATION " + enumer; + } + } + if (!skippedPE) + { + handler.getDeclHandler().attributeDecl(elementName, name, type, + defaultType, value); + } + } + + /** + * Parse a conditional section. + *
+   * [61] conditionalSect ::= includeSect || ignoreSect
+   * [62] includeSect ::= '<![' S? 'INCLUDE' S? '['
+   *    extSubsetDecl ']]>'
+   * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '['
+   *    ignoreSectContents* ']]>'
+   * [64] ignoreSectContents ::= Ignore
+   *    ('<![' ignoreSectContents* ']]>' Ignore )*
+   * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* )
+   * 
+ *

NOTE: the '>![' has already been read. + */ + private void parseConditionalSect(char[] saved) + throws Exception + { + skipWhitespace(); + if (tryRead("INCLUDE")) + { + skipWhitespace(); + require('['); + // VC: Proper Conditional Section/PE Nesting + if (readBuffer != saved) + { + handler.verror("Illegal Conditional Section/PE nesting"); + } + skipWhitespace(); + while (!tryRead("]]>")) + { + parseMarkupdecl(); + skipWhitespace(); + } + } + else if (tryRead("IGNORE")) + { + skipWhitespace(); + require('['); + // VC: Proper Conditional Section/PE Nesting + if (readBuffer != saved) + { + handler.verror("Illegal Conditional Section/PE nesting"); + } + int nesting = 1; + char c; + expandPE = false; + for (int nest = 1; nest > 0; ) + { + c = readCh(); + switch (c) + { + case '<': + if (tryRead("![")) + { + nest++; + } + break; + case ']': + if (tryRead("]>")) + { + nest--; + } + } + } + expandPE = true; + } + else + { + error("conditional section must begin with INCLUDE or IGNORE"); + } + } + + private void parseCharRef() + throws SAXException, IOException + { + parseCharRef(true /* do flushDataBuffer by default */); + } + + /** + * Try to read a character reference without consuming data from buffer. + *

+   * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
+   * 
+ *

NOTE: the '&#' has already been read. + */ + private void tryReadCharRef() + throws SAXException, IOException + { + int value = 0; + char c; + + if (tryRead('x')) + { +loop1: + while (true) + { + c = readCh(); + if (c == ';') + { + break loop1; + } + else + { + int n = Character.digit(c, 16); + if (n == -1) + { + error("illegal character in character reference", c, null); + break loop1; + } + value *= 16; + value += n; + } + } + } + else + { +loop2: + while (true) + { + c = readCh(); + if (c == ';') + { + break loop2; + } + else + { + int n = Character.digit(c, 10); + if (n == -1) + { + error("illegal character in character reference", c, null); + break loop2; + } + value *= 10; + value += n; + } + } + } + + // check for character refs being legal XML + if ((value < 0x0020 + && ! (value == '\n' || value == '\t' || value == '\r')) + || (value >= 0xD800 && value <= 0xDFFF) + || value == 0xFFFE || value == 0xFFFF + || value > 0x0010ffff) + { + error("illegal XML character reference U+" + + Integer.toHexString(value)); + } + + // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz + // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: + if (value > 0x0010ffff) + { + // too big for surrogate + error("character reference " + value + " is too large for UTF-16", + Integer.toString(value), null); + } + + } + + /** + * Read and interpret a character reference. + *

+   * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
+   * 
+ *

NOTE: the '&#' has already been read. + */ + private void parseCharRef(boolean doFlush) + throws SAXException, IOException + { + int value = 0; + char c; + + if (tryRead('x')) + { +loop1: + while (true) + { + c = readCh(); + if (c == ';') + { + break loop1; + } + else + { + int n = Character.digit(c, 16); + if (n == -1) + { + error("illegal character in character reference", c, null); + break loop1; + } + value *= 16; + value += n; + } + } + } + else + { +loop2: + while (true) + { + c = readCh(); + if (c == ';') + { + break loop2; + } + else + { + int n = Character.digit(c, 10); + if (n == -1) + { + error("illegal character in character reference", c, null); + break loop2; + } + value *= 10; + value += c - '0'; + } + } + } + + // check for character refs being legal XML + if ((value < 0x0020 + && ! (value == '\n' || value == '\t' || value == '\r')) + || (value >= 0xD800 && value <= 0xDFFF) + || value == 0xFFFE || value == 0xFFFF + || value > 0x0010ffff) + { + error("illegal XML character reference U+" + + Integer.toHexString(value)); + } + + // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz + // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: + if (value <= 0x0000ffff) + { + // no surrogates needed + dataBufferAppend((char) value); + } + else if (value <= 0x0010ffff) + { + value -= 0x10000; + // > 16 bits, surrogate needed + dataBufferAppend((char) (0xd800 | (value >> 10))); + dataBufferAppend((char) (0xdc00 | (value & 0x0003ff))); + } + else + { + // too big for surrogate + error("character reference " + value + " is too large for UTF-16", + Integer.toString(value), null); + } + if (doFlush) + { + dataBufferFlush(); + } + } + + /** + * Parse and expand an entity reference. + *

+   * [68] EntityRef ::= '&' Name ';'
+   * 
+ *

NOTE: the '&' has already been read. + * @param externalAllowed External entities are allowed here. + */ + private void parseEntityRef(boolean externalAllowed) + throws SAXException, IOException + { + String name; + + name = readNmtoken(true); + require(';'); + switch (getEntityType(name)) + { + case ENTITY_UNDECLARED: + // NOTE: XML REC describes amazingly convoluted handling for + // this case. Nothing as meaningful as being a WFness error + // unless the processor might _legitimately_ not have seen a + // declaration ... which is what this implements. + String message; + + message = "reference to undeclared general entity " + name; + if (skippedPE && !docIsStandalone) + { + handler.verror(message); + // we don't know this entity, and it might be external... + if (externalAllowed) + { + handler.skippedEntity(name); + } + } + else + { + error(message); + } + break; + case ENTITY_INTERNAL: + pushString(name, getEntityValue(name)); + + //workaround for possible input pop before marking + //the buffer reading position + char t = readCh(); + unread(t); + int bufferPosMark = readBufferPos; + + int end = readBufferPos + getEntityValue(name).length(); + for (int k = readBufferPos; k < end; k++) + { + t = readCh(); + if (t == '&') + { + t = readCh(); + if (t == '#') + { + //try to match a character ref + tryReadCharRef(); + + //everything has been read + if (readBufferPos >= end) + { + break; + } + k = readBufferPos; + continue; + } + else if (Character.isLetter(t)) + { + //looks like an entity ref + unread(t); + readNmtoken(true); + require(';'); + + //everything has been read + if (readBufferPos >= end) + { + break; + } + k = readBufferPos; + continue; + } + error(" malformed entity reference"); + } + + } + readBufferPos = bufferPosMark; + break; + case ENTITY_TEXT: + if (externalAllowed) + { + pushURL(false, name, getEntityIds(name), + null, null, null, true); + } + else + { + error("reference to external entity in attribute value.", + name, null); + } + break; + case ENTITY_NDATA: + if (externalAllowed) + { + error("unparsed entity reference in content", name, null); + } + else + { + error("reference to external entity in attribute value.", + name, null); + } + break; + default: + throw new RuntimeException(); + } + } + + /** + * Parse and expand a parameter entity reference. + *

+   * [69] PEReference ::= '%' Name ';'
+   * 
+ *

NOTE: the '%' has already been read. + */ + private void parsePEReference() + throws SAXException, IOException + { + String name; + + name = "%" + readNmtoken(true); + require(';'); + switch (getEntityType(name)) + { + case ENTITY_UNDECLARED: + // VC: Entity Declared + handler.verror("reference to undeclared parameter entity " + name); + + // we should disable handling of all subsequent declarations + // unless this is a standalone document (info discarded) + break; + case ENTITY_INTERNAL: + if (inLiteral) + { + pushString(name, getEntityValue(name)); + } + else + { + pushString(name, ' ' + getEntityValue(name) + ' '); + } + break; + case ENTITY_TEXT: + if (!inLiteral) + { + pushString(null, " "); + } + pushURL(true, name, getEntityIds(name), null, null, null, true); + if (!inLiteral) + { + pushString(null, " "); + } + break; + } + } + + /** + * Parse an entity declaration. + *

+   * [70] EntityDecl ::= GEDecl | PEDecl
+   * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
+   * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
+   * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
+   * [74] PEDef ::= EntityValue | ExternalID
+   * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
+   *       | 'PUBLIC' S PubidLiteral S SystemLiteral
+   * [76] NDataDecl ::= S 'NDATA' S Name
+   * 
+ *

NOTE: the '<!ENTITY' has already been read. + */ + private void parseEntityDecl() + throws Exception + { + boolean peFlag = false; + int flags = 0; + + // Check for a parameter entity. + expandPE = false; + requireWhitespace(); + if (tryRead('%')) + { + peFlag = true; + requireWhitespace(); + } + expandPE = true; + + // Read the entity name, and prepend + // '%' if necessary. + String name = readNmtoken(true); + //NE08 + if (name.indexOf(':') >= 0) + { + error("Illegal character(':') in entity name ", name, null); + } + if (peFlag) + { + name = "%" + name; + } + + // Read the entity value. + requireWhitespace(); + char c = readCh(); + unread (c); + if (c == '"' || c == '\'') + { + // Internal entity ... replacement text has expanded refs + // to characters and PEs, but not to general entities + String value = readLiteral(flags); + setInternalEntity(name, value); + } + else + { + // Read the external IDs + ExternalIdentifiers ids = readExternalIds(false, false); + + // Check for NDATA declaration. + boolean white = tryWhitespace(); + if (!peFlag && tryRead("NDATA")) + { + if (!white) + { + error("whitespace required before NDATA"); + } + requireWhitespace(); + String notationName = readNmtoken(true); + if (!skippedPE) + { + setExternalEntity(name, ENTITY_NDATA, ids, notationName); + handler.unparsedEntityDecl(name, ids.publicId, ids.systemId, + ids.baseUri, notationName); + } + } + else if (!skippedPE) + { + setExternalEntity(name, ENTITY_TEXT, ids, null); + handler.getDeclHandler() + .externalEntityDecl(name, ids.publicId, + handler.resolveURIs() + // FIXME: ASSUMES not skipped + // "false" forces error on bad URI + ? handler.absolutize(ids.baseUri, + ids.systemId, + false) + : ids.systemId); + } + } + + // Finish the declaration. + skipWhitespace(); + require('>'); + } + + /** + * Parse a notation declaration. + *

+   * [82] NotationDecl ::= '<!NOTATION' S Name S
+   *    (ExternalID | PublicID) S? '>'
+   * [83] PublicID ::= 'PUBLIC' S PubidLiteral
+   * 
+ *

NOTE: the '<!NOTATION' has already been read. + */ + private void parseNotationDecl() + throws Exception + { + String nname; + ExternalIdentifiers ids; + + requireWhitespace(); + nname = readNmtoken(true); + //NE08 + if (nname.indexOf(':') >= 0) + { + error("Illegal character(':') in notation name ", nname, null); + } + requireWhitespace(); + + // Read the external identifiers. + ids = readExternalIds(true, false); + + // Register the notation. + setNotation(nname, ids); + + skipWhitespace(); + require('>'); + } + + /** + * Parse character data. + *

+   * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+   * 
+ */ + private void parseCharData() + throws Exception + { + char c; + int state = 0; + boolean pureWhite = false; + + // assert (dataBufferPos == 0); + + // are we expecting pure whitespace? it might be dirty... + if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement) + { + pureWhite = true; + } + + // always report right out of readBuffer + // to minimize (pointless) buffer copies + while (true) + { + int lineAugment = 0; + int columnAugment = 0; + int i; + +loop: + for (i = readBufferPos; i < readBufferLength; i++) + { + switch (c = readBuffer[i]) + { + case '\n': + lineAugment++; + columnAugment = 0; + // pureWhite unmodified + break; + case '\r': // should not happen!! + case '\t': + case ' ': + // pureWhite unmodified + columnAugment++; + break; + case '&': + case '<': + columnAugment++; + // pureWhite unmodified + // CLEAN end of text sequence + state = 1; + break loop; + case ']': + // that's not a whitespace char, and + // can not terminate pure whitespace either + pureWhite = false; + if ((i + 2) < readBufferLength) + { + if (readBuffer [i + 1] == ']' + && readBuffer [i + 2] == '>') + { + // ERROR end of text sequence + state = 2; + break loop; + } + } + else + { + // FIXME missing two end-of-buffer cases + } + columnAugment++; + break; + default: + if ((c < 0x0020 || c > 0xFFFD) + || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) + && xmlVersion == XML_11)) + { + error("illegal XML character U+" + + Integer.toHexString(c)); + } + // that's not a whitespace char + pureWhite = false; + columnAugment++; + } + } + + // report text thus far + if (lineAugment > 0) + { + line += lineAugment; + column = columnAugment; + } + else + { + column += columnAugment; + } + + // report characters/whitspace + int length = i - readBufferPos; + + if (length != 0) + { + if (pureWhite) + { + handler.ignorableWhitespace(readBuffer, + readBufferPos, length); + } + else + { + handler.charData(readBuffer, readBufferPos, length); + } + readBufferPos = i; + } + + if (state != 0) + { + break; + } + + // fill next buffer from this entity, or + // pop stack and continue with previous entity + unread(readCh()); + } + if (!pureWhite) + { + isDirtyCurrentElement = true; + } + // finish, maybe with error + if (state != 1) // finish, no error + { + error("character data may not contain ']]>'"); + } + } + + ////////////////////////////////////////////////////////////////////// + // High-level reading and scanning methods. + ////////////////////////////////////////////////////////////////////// + + /** + * Require whitespace characters. + */ + private void requireWhitespace() + throws SAXException, IOException + { + char c = readCh(); + if (isWhitespace(c)) + { + skipWhitespace(); + } + else + { + error("whitespace required", c, null); + } + } + + /** + * Skip whitespace characters. + *
+   * [3] S ::= (#x20 | #x9 | #xd | #xa)+
+   * 
+ */ + private void skipWhitespace() + throws SAXException, IOException + { + // Start with a little cheat. Most of + // the time, the white space will fall + // within the current read buffer; if + // not, then fall through. + if (USE_CHEATS) + { + int lineAugment = 0; + int columnAugment = 0; + +loop: + for (int i = readBufferPos; i < readBufferLength; i++) + { + switch (readBuffer[i]) + { + case ' ': + case '\t': + case '\r': + columnAugment++; + break; + case '\n': + lineAugment++; + columnAugment = 0; + break; + case '%': + if (expandPE) + { + break loop; + } + // else fall through... + default: + readBufferPos = i; + if (lineAugment > 0) + { + line += lineAugment; + column = columnAugment; + } + else + { + column += columnAugment; + } + return; + } + } + } + + // OK, do it the slow way. + char c = readCh (); + while (isWhitespace(c)) + { + c = readCh(); + } + unread(c); + } + + /** + * Read a name or (when parsing an enumeration) name token. + *
+   * [5] Name ::= (Letter | '_' | ':') (NameChar)*
+   * [7] Nmtoken ::= (NameChar)+
+   * 
+ */ + private String readNmtoken(boolean isName) + throws SAXException, IOException + { + char c; + + if (USE_CHEATS) + { +loop: + for (int i = readBufferPos; i < readBufferLength; i++) + { + c = readBuffer[i]; + switch (c) + { + case '%': + if (expandPE) + { + break loop; + } + // else fall through... + + // What may legitimately come AFTER a name/nmtoken? + case '<': case '>': case '&': + case ',': case '|': case '*': case '+': case '?': + case ')': + case '=': + case '\'': case '"': + case '[': + case ' ': case '\t': case '\r': case '\n': + case ';': + case '/': + int start = readBufferPos; + if (i == start) + { + error("name expected", readBuffer[i], null); + } + readBufferPos = i; + return intern(readBuffer, start, i - start); + + default: + // FIXME ... per IBM's OASIS test submission, these: + // ? U+06dd + // Combining U+309B + //these switches are kind of ugly but at least we won't + //have to go over the whole lits for each char + if (isName && i == readBufferPos) + { + char c2 = (char) (c & 0x00f0); + switch (c & 0xff00) + { + //starting with 01 + case 0x0100: + switch (c2) + { + case 0x0030: + if (c == 0x0132 || c == 0x0133 || c == 0x013f) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + case 0x0040: + if (c == 0x0140 || c == 0x0149) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + case 0x00c0: + if (c == 0x01c4 || c == 0x01cc) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + case 0x00f0: + if (c == 0x01f1 || c == 0x01f3) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + case 0x00b0: + if (c == 0x01f1 || c == 0x01f3) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + default: + if (c == 0x017f) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + } + + break; + //starting with 11 + case 0x1100: + switch (c2) + { + case 0x0000: + if (c == 0x1104 || c == 0x1108 || + c == 0x110a || c == 0x110d) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + case 0x0030: + if (c == 0x113b || c == 0x113f) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + case 0x0040: + if (c == 0x1141 || c == 0x114d + || c == 0x114f ) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + case 0x0050: + if (c == 0x1151 || c == 0x1156) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + case 0x0060: + if (c == 0x1162 || c == 0x1164 + || c == 0x1166 || c == 0x116b + || c == 0x116f) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + case 0x00b0: + if (c == 0x11b6 || c == 0x11b9 + || c == 0x11bb || c == 0x116f) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + break; + default: + if (c == 0x1174 || c == 0x119f + || c == 0x11ac || c == 0x11c3 + || c == 0x11f1) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + } + break; + default: + if (c == 0x0e46 || c == 0x1011 + || c == 0x212f || c == 0x0587 + || c == 0x0230 ) + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + } + } + // punt on exact tests from Appendix A; approximate + // them using the Unicode ID start/part rules + if (i == readBufferPos && isName) + { + if (!Character.isUnicodeIdentifierStart(c) + && c != ':' && c != '_') + { + error("Not a name start character, U+" + + Integer.toHexString(c)); + } + } + else if (!Character.isUnicodeIdentifierPart(c) + && c != '-' && c != ':' && c != '_' && c != '.' + && !isExtender(c)) + { + error("Not a name character, U+" + + Integer.toHexString(c)); + } + } + } + } + + nameBufferPos = 0; + + // Read the first character. + while (true) + { + c = readCh(); + switch (c) + { + case '%': + case '<': case '>': case '&': + case ',': case '|': case '*': case '+': case '?': + case ')': + case '=': + case '\'': case '"': + case '[': + case ' ': case '\t': case '\n': case '\r': + case ';': + case '/': + unread(c); + if (nameBufferPos == 0) + { + error ("name expected"); + } + // punt on exact tests from Appendix A, but approximate them + if (isName + && !Character.isUnicodeIdentifierStart(nameBuffer[0]) + && ":_".indexOf(nameBuffer[0]) == -1) + { + error("Not a name start character, U+" + + Integer.toHexString(nameBuffer[0])); + } + String s = intern(nameBuffer, 0, nameBufferPos); + nameBufferPos = 0; + return s; + default: + // punt on exact tests from Appendix A, but approximate them + + if ((nameBufferPos != 0 || !isName) + && !Character.isUnicodeIdentifierPart(c) + && ":-_.".indexOf(c) == -1 + && !isExtender(c)) + { + error("Not a name character, U+" + + Integer.toHexString(c)); + } + if (nameBufferPos >= nameBuffer.length) + { + nameBuffer = + (char[]) extendArray(nameBuffer, + nameBuffer.length, nameBufferPos); + } + nameBuffer[nameBufferPos++] = c; + } + } + } + + private static boolean isExtender(char c) + { + // [88] Extender ::= ... + return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 + || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005 + || (c >= 0x3031 && c <= 0x3035) + || (c >= 0x309d && c <= 0x309e) + || (c >= 0x30fc && c <= 0x30fe); + } + + /** + * Read a literal. With matching single or double quotes as + * delimiters (and not embedded!) this is used to parse: + *
+   *  [9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ...
+   *  [10] AttValue ::= ... ([^<&] | Reference)* ...
+   *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
+   *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
+   * 
+ * as well as the quoted strings in XML and text declarations + * (for version, encoding, and standalone) which have their + * own constraints. + */ + private String readLiteral(int flags) + throws SAXException, IOException + { + char delim, c; + int startLine = line; + boolean saved = expandPE; + boolean savedReport = doReport; + + // Find the first delimiter. + delim = readCh(); + if (delim != '"' && delim != '\'') + { + error("expected '\"' or \"'\"", delim, null); + return null; + } + inLiteral = true; + if ((flags & LIT_DISABLE_PE) != 0) + { + expandPE = false; + } + doReport = false; + + // Each level of input source has its own buffer; remember + // ours, so we won't read the ending delimiter from any + // other input source, regardless of entity processing. + char[] ourBuf = readBuffer; + + // Read the literal. + try + { + c = readCh(); + boolean ampRead = false; +loop: + while (! (c == delim && readBuffer == ourBuf)) + { + switch (c) + { + // attributes and public ids are normalized + // in almost the same ways + case '\n': + case '\r': + if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) + { + c = ' '; + } + break; + case '\t': + if ((flags & LIT_ATTRIBUTE) != 0) + { + c = ' '; + } + break; + case '&': + c = readCh(); + // Char refs are expanded immediately, except for + // all the cases where it's deferred. + if (c == '#') + { + if ((flags & LIT_DISABLE_CREF) != 0) + { + dataBufferAppend('&'); + break; + } + parseCharRef(false /* Do not do flushDataBuffer */); + + // exotic WFness risk: this is an entity literal, + // dataBuffer [dataBufferPos - 1] == '&', and + // following chars are a _partial_ entity/char ref + + // It looks like an entity ref ... + } + else + { + unread(c); + // Expand it? + if ((flags & LIT_ENTITY_REF) > 0) + { + parseEntityRef(false); + if (String.valueOf(readBuffer).equals("&")) + { + ampRead = true; + } + //Is it just data? + } + else if ((flags & LIT_DISABLE_EREF) != 0) + { + dataBufferAppend('&'); + + // OK, it will be an entity ref -- expanded later. + } + else + { + String name = readNmtoken(true); + require(';'); + dataBufferAppend('&'); + dataBufferAppend(name); + dataBufferAppend(';'); + } + } + c = readCh(); + continue loop; + + case '<': + // and why? Perhaps so "&foo;" expands the same + // inside and outside an attribute? + if ((flags & LIT_ATTRIBUTE) != 0) + { + error("attribute values may not contain '<'"); + } + break; + + // We don't worry about case '%' and PE refs, readCh does. + + default: + break; + } + dataBufferAppend(c); + c = readCh(); + } + } + catch (EOFException e) + { + error("end of input while looking for delimiter (started on line " + + startLine + ')', null, Character.toString(delim)); + } + inLiteral = false; + expandPE = saved; + doReport = savedReport; + + // Normalise whitespace if necessary. + if ((flags & LIT_NORMALIZE) > 0) + { + dataBufferNormalize(); + } + + // Return the value. + return dataBufferToString(); + } + + /** + * Try reading external identifiers. + * A system identifier is not required for notations. + * @param inNotation Are we parsing a notation decl? + * @param isSubset Parsing external subset decl (may be omitted)? + * @return A three-member String array containing the identifiers, + * or nulls. Order: public, system, baseURI. + */ + private ExternalIdentifiers readExternalIds(boolean inNotation, + boolean isSubset) + throws Exception + { + char c; + ExternalIdentifiers ids = new ExternalIdentifiers(); + int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; + + if (tryRead("PUBLIC")) + { + requireWhitespace(); + ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags); + if (inNotation) + { + skipWhitespace(); + c = readCh(); + unread(c); + if (c == '"' || c == '\'') + { + ids.systemId = readLiteral(flags); + } + } + else + { + requireWhitespace(); + ids.systemId = readLiteral(flags); + } + + for (int i = 0; i < ids.publicId.length(); i++) + { + c = ids.publicId.charAt(i); + if (c >= 'a' && c <= 'z') + { + continue; + } + if (c >= 'A' && c <= 'Z') + { + continue; + } + if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1) + { + continue; + } + error("illegal PUBLIC id character U+" + + Integer.toHexString(c)); + } + } + else if (tryRead("SYSTEM")) + { + requireWhitespace(); + ids.systemId = readLiteral(flags); + } + else if (!isSubset) + { + error("missing SYSTEM or PUBLIC keyword"); + } + + if (ids.systemId != null) + { + if (ids.systemId.indexOf('#') != -1) + { + handler.verror("SYSTEM id has a URI fragment: " + ids.systemId); + } + ids.baseUri = handler.getSystemId(); + if (ids.baseUri == null && uriWarnings) + { + handler.warn("No base URI; hope URI is absolute: " + + ids.systemId); + } + } + + return ids; + } + + /** + * Test if a character is whitespace. + *
+   * [3] S ::= (#x20 | #x9 | #xd | #xa)+
+   * 
+ * @param c The character to test. + * @return true if the character is whitespace. + */ + private final boolean isWhitespace(char c) + { + if (c > 0x20) + { + return false; + } + if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) + { + return true; + } + return false; // illegal ... + } + + ////////////////////////////////////////////////////////////////////// + // Utility routines. + ////////////////////////////////////////////////////////////////////// + + /** + * Add a character to the data buffer. + */ + private void dataBufferAppend(char c) + { + // Expand buffer if necessary. + if (dataBufferPos >= dataBuffer.length) + { + dataBuffer = (char[]) extendArray(dataBuffer, + dataBuffer.length, dataBufferPos); + } + dataBuffer[dataBufferPos++] = c; + } + + /** + * Add a string to the data buffer. + */ + private void dataBufferAppend(String s) + { + dataBufferAppend(s.toCharArray(), 0, s.length()); + } + + /** + * Append (part of) a character array to the data buffer. + */ + private void dataBufferAppend(char[] ch, int start, int length) + { + dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, + dataBufferPos + length); + + System.arraycopy(ch, start, dataBuffer, dataBufferPos, length); + dataBufferPos += length; + } + + /** + * Normalise space characters in the data buffer. + */ + private void dataBufferNormalize() + { + int i = 0; + int j = 0; + int end = dataBufferPos; + + // Skip spaces at the start. + while (j < end && dataBuffer[j] == ' ') + { + j++; + } + + // Skip whitespace at the end. + while (end > j && dataBuffer[end - 1] == ' ') + { + end --; + } + + // Start copying to the left. + while (j < end) + { + + char c = dataBuffer[j++]; + + // Normalise all other spaces to + // a single space. + if (c == ' ') + { + while (j < end && dataBuffer[j++] == ' ') + { + continue; + } + dataBuffer[i++] = ' '; + dataBuffer[i++] = dataBuffer[j - 1]; + } + else + { + dataBuffer[i++] = c; + } + } + + // The new length is <= the old one. + dataBufferPos = i; + } + + /** + * Convert the data buffer to a string. + */ + private String dataBufferToString() + { + String s = new String(dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + return s; + } + + /** + * Flush the contents of the data buffer to the handler, as + * appropriate, and reset the buffer for new input. + */ + private void dataBufferFlush() + throws SAXException + { + if (currentElementContent == CONTENT_ELEMENTS + && dataBufferPos > 0 + && !inCDATA) + { + // We can't just trust the buffer to be whitespace, there + // are (error) cases when it isn't + for (int i = 0; i < dataBufferPos; i++) + { + if (!isWhitespace(dataBuffer[i])) + { + handler.charData(dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + } + if (dataBufferPos > 0) + { + handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + } + else if (dataBufferPos > 0) + { + handler.charData(dataBuffer, 0, dataBufferPos); + dataBufferPos = 0; + } + } + + /** + * Require a string to appear, or throw an exception. + *

Precondition: Entity expansion is not required. + *

Precondition: data buffer has no characters that + * will get sent to the application. + */ + private void require(String delim) + throws SAXException, IOException + { + int length = delim.length(); + char[] ch; + + if (length < dataBuffer.length) + { + ch = dataBuffer; + delim.getChars(0, length, ch, 0); + } + else + { + ch = delim.toCharArray(); + } + + if (USE_CHEATS && length <= (readBufferLength - readBufferPos)) + { + int offset = readBufferPos; + + for (int i = 0; i < length; i++, offset++) + { + if (ch[i] != readBuffer[offset]) + { + error ("required string", null, delim); + } + } + readBufferPos = offset; + + } + else + { + for (int i = 0; i < length; i++) + { + require(ch[i]); + } + } + } + + /** + * Require a character to appear, or throw an exception. + */ + private void require(char delim) + throws SAXException, IOException + { + char c = readCh(); + + if (c != delim) + { + error("required character", c, Character.toString(delim)); + } + } + + /** + * Create an interned string from a character array. + * Ælfred uses this method to create an interned version + * of all names and name tokens, so that it can test equality + * with == instead of String.equals (). + * + *

This is much more efficient than constructing a non-interned + * string first, and then interning it. + * + * @param ch an array of characters for building the string. + * @param start the starting position in the array. + * @param length the number of characters to place in the string. + * @return an interned string. + * @see #intern (String) + * @see java.lang.String#intern + */ + public String intern(char[] ch, int start, int length) + { + int index = 0; + int hash = 0; + Object[] bucket; + + // Generate a hash code. This is a widely used string hash, + // often attributed to Brian Kernighan. + for (int i = start; i < start + length; i++) + { + hash = 31 * hash + ch[i]; + } + hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; + + // Get the bucket -- consists of {array,String} pairs + if ((bucket = symbolTable[hash]) == null) + { + // first string in this bucket + bucket = new Object[8]; + + // Search for a matching tuple, and + // return the string if we find one. + } + else + { + while (index < bucket.length) + { + char[] chFound = (char[]) bucket[index]; + + // Stop when we hit an empty entry. + if (chFound == null) + { + break; + } + + // If they're the same length, check for a match. + if (chFound.length == length) + { + for (int i = 0; i < chFound.length; i++) + { + // continue search on failure + if (ch[start + i] != chFound[i]) + { + break; + } + else if (i == length - 1) + { + // That's it, we have a match! + return (String) bucket[index + 1]; + } + } + } + index += 2; + } + // Not found -- we'll have to add it. + + // Do we have to grow the bucket? + bucket = (Object[]) extendArray(bucket, bucket.length, index); + } + symbolTable[hash] = bucket; + + // OK, add it to the end of the bucket -- "local" interning. + // Intern "globally" to let applications share interning benefits. + // That is, "!=" and "==" work on our strings, not just equals(). + String s = new String(ch, start, length).intern(); + bucket[index] = s.toCharArray(); + bucket[index + 1] = s; + return s; + } + + /** + * Ensure the capacity of an array, allocating a new one if + * necessary. Usually extends only for name hash collisions. + */ + private Object extendArray(Object array, int currentSize, int requiredSize) + { + if (requiredSize < currentSize) + { + return array; + } + else + { + Object newArray = null; + int newSize = currentSize * 2; + + if (newSize <= requiredSize) + { + newSize = requiredSize + 1; + } + + if (array instanceof char[]) + { + newArray = new char[newSize]; + } + else if (array instanceof Object[]) + { + newArray = new Object[newSize]; + } + else + { + throw new RuntimeException(); + } + + System.arraycopy(array, 0, newArray, 0, currentSize); + return newArray; + } + } + + ////////////////////////////////////////////////////////////////////// + // XML query routines. + ////////////////////////////////////////////////////////////////////// + + boolean isStandalone() + { + return docIsStandalone; + } + + // + // Elements + // + + private int getContentType(ElementDecl element, int defaultType) + { + int retval; + + if (element == null) + { + return defaultType; + } + retval = element.contentType; + if (retval == CONTENT_UNDECLARED) + { + retval = defaultType; + } + return retval; + } + + /** + * Look up the content type of an element. + * @param name The element type name. + * @return An integer constant representing the content type. + * @see #CONTENT_UNDECLARED + * @see #CONTENT_ANY + * @see #CONTENT_EMPTY + * @see #CONTENT_MIXED + * @see #CONTENT_ELEMENTS + */ + public int getElementContentType(String name) + { + ElementDecl element = (ElementDecl) elementInfo.get(name); + return getContentType(element, CONTENT_UNDECLARED); + } + + /** + * Register an element. + * Array format: + * [0] element type name + * [1] content model (mixed, elements only) + * [2] attribute hash table + */ + private void setElement(String name, int contentType, + String contentModel, HashMap attributes) + throws SAXException + { + if (skippedPE) + { + return; + } + + ElementDecl element = (ElementDecl) elementInfo.get(name); + + // first or for this type? + if (element == null) + { + element = new ElementDecl(); + element.contentType = contentType; + element.contentModel = contentModel; + element.attributes = attributes; + elementInfo.put(name, element); + return; + } + + // declaration? + if (contentType != CONTENT_UNDECLARED) + { + // ... following an associated + if (element.contentType == CONTENT_UNDECLARED) + { + element.contentType = contentType; + element.contentModel = contentModel; + } + else + { + // VC: Unique Element Type Declaration + handler.verror("multiple declarations for element type: " + + name); + } + } + + // first , before ? + else if (attributes != null) + { + element.attributes = attributes; + } + } + + /** + * Look up the attribute hash table for an element. + * The hash table is the second item in the element array. + */ + private HashMap getElementAttributes(String name) + { + ElementDecl element = (ElementDecl) elementInfo.get(name); + return (element == null) ? null : element.attributes; + } + + // + // Attributes + // + + /** + * Get the declared attributes for an element type. + * @param elname The name of the element type. + * @return An iterator over all the attributes declared for + * a specific element type. The results will be valid only + * after the DTD (if any) has been parsed. + * @see #getAttributeType + * @see #getAttributeEnumeration + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + * @see #getAttributeExpandedValue + */ + private Iterator declaredAttributes(ElementDecl element) + { + HashMap attlist; + + if (element == null) + { + return null; + } + if ((attlist = element.attributes) == null) + { + return null; + } + return attlist.keySet().iterator(); + } + + /** + * Get the declared attributes for an element type. + * @param elname The name of the element type. + * @return An iterator over all the attributes declared for + * a specific element type. The results will be valid only + * after the DTD (if any) has been parsed. + * @see #getAttributeType + * @see #getAttributeEnumeration + * @see #getAttributeDefaultValueType + * @see #getAttributeDefaultValue + * @see #getAttributeExpandedValue + */ + public Iterator declaredAttributes(String elname) + { + return declaredAttributes((ElementDecl) elementInfo.get(elname)); + } + + /** + * Retrieve the declared type of an attribute. + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return An interend string denoting the type, or null + * indicating an undeclared attribute. + */ + public String getAttributeType(String name, String aname) + { + AttributeDecl attribute = getAttribute(name, aname); + return (attribute == null) ? null : attribute.type; + } + + /** + * Retrieve the allowed values for an enumerated attribute type. + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return A string containing the token list. + */ + public String getAttributeEnumeration(String name, String aname) + { + AttributeDecl attribute = getAttribute(name, aname); + // assert: attribute.enumeration is "ENUMERATION" or "NOTATION" + return (attribute == null) ? null : attribute.enumeration; + } + + /** + * Retrieve the default value of a declared attribute. + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return The default value, or null if the attribute was + * #IMPLIED or simply undeclared and unspecified. + * @see #getAttributeExpandedValue + */ + public String getAttributeDefaultValue(String name, String aname) + { + AttributeDecl attribute = getAttribute(name, aname); + return (attribute == null) ? null : attribute.value; + } + + /* + +// FIXME: Leaving this in, until W3C finally resolves the confusion +// between parts of the XML 2nd REC about when entity declararations +// are guaranteed to be known. Current code matches what section 5.1 +// (conformance) describes, but some readings of the self-contradicting +// text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that +// attribute expansion/normalization must be deferred in some cases +// (just TRY to identify them!). + + * Retrieve the expanded value of a declared attribute. + *

General entities (and char refs) will be expanded (once). + * @param name The name of the associated element. + * @param aname The name of the attribute. + * @return The expanded default value, or null if the attribute was + * #IMPLIED or simply undeclared + * @see #getAttributeDefaultValue + public String getAttributeExpandedValue (String name, String aname) + throws Exception + { + AttributeDecl attribute = getAttribute (name, aname); + + if (attribute == null) { + return null; + } else if (attribute.defaultValue == null && attribute.value != null) { + // we MUST use the same buf for both quotes else the literal + // can't be properly terminated + char buf [] = new char [1]; + int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; + String type = getAttributeType (name, aname); + + if (type != "CDATA" && type != null) + flags |= LIT_NORMALIZE; + buf [0] = '"'; + pushCharArray (null, buf, 0, 1); + pushString (null, attribute.value); + pushCharArray (null, buf, 0, 1); + attribute.defaultValue = readLiteral (flags); + } + return attribute.defaultValue; + } + */ + + /** + * Retrieve the default value mode of a declared attribute. + * @see #ATTRIBUTE_DEFAULT_SPECIFIED + * @see #ATTRIBUTE_DEFAULT_IMPLIED + * @see #ATTRIBUTE_DEFAULT_REQUIRED + * @see #ATTRIBUTE_DEFAULT_FIXED + */ + public int getAttributeDefaultValueType(String name, String aname) + { + AttributeDecl attribute = getAttribute(name, aname); + return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED : + attribute.valueType; + } + + /** + * Register an attribute declaration for later retrieval. + * Format: + * - String type + * - String default value + * - int value type + * - enumeration + * - processed default value + */ + private void setAttribute(String elName, String name, String type, + String enumeration, String value, int valueType) + throws Exception + { + HashMap attlist; + + if (skippedPE) + { + return; + } + + // Create a new hashtable if necessary. + attlist = getElementAttributes(elName); + if (attlist == null) + { + attlist = new HashMap(); + } + + // ignore multiple attribute declarations! + if (attlist.get(name) != null) + { + // warn ... + return; + } + else + { + AttributeDecl attribute = new AttributeDecl(); + attribute.type = type; + attribute.value = value; + attribute.valueType = valueType; + attribute.enumeration = enumeration; + attlist.put(name, attribute); + + // save; but don't overwrite any existing + setElement(elName, CONTENT_UNDECLARED, null, attlist); + } + } + + /** + * Retrieve the attribute declaration for the given element name and name. + */ + private AttributeDecl getAttribute(String elName, String name) + { + HashMap attlist = getElementAttributes(elName); + return (attlist == null) ? null : (AttributeDecl) attlist.get(name); + } + + // + // Entities + // + + /** + * Find the type of an entity. + * @returns An integer constant representing the entity type. + * @see #ENTITY_UNDECLARED + * @see #ENTITY_INTERNAL + * @see #ENTITY_NDATA + * @see #ENTITY_TEXT + */ + public int getEntityType(String ename) + { + EntityInfo entity = (EntityInfo) entityInfo.get(ename); + return (entity == null) ? ENTITY_UNDECLARED : entity.type; + } + + /** + * Return an external entity's identifiers. + * @param ename The name of the external entity. + * @return The entity's public identifier, system identifier, and base URI. + * Null if the entity was not declared as an external entity. + * @see #getEntityType + */ + public ExternalIdentifiers getEntityIds(String ename) + { + EntityInfo entity = (EntityInfo) entityInfo.get(ename); + return (entity == null) ? null : entity.ids; + } + + /** + * Return an internal entity's replacement text. + * @param ename The name of the internal entity. + * @return The entity's replacement text, or null if + * the entity was not declared as an internal entity. + * @see #getEntityType + */ + public String getEntityValue(String ename) + { + EntityInfo entity = (EntityInfo) entityInfo.get(ename); + return (entity == null) ? null : entity.value; + } + + /** + * Register an entity declaration for later retrieval. + */ + private void setInternalEntity(String eName, String value) + throws SAXException + { + if (skippedPE) + { + return; + } + + if (entityInfo.get(eName) == null) + { + EntityInfo entity = new EntityInfo(); + entity.type = ENTITY_INTERNAL; + entity.value = value; + entityInfo.put(eName, entity); + } + if (handler.stringInterning) + { + if ("lt" == eName || "gt" == eName || "quot" == eName + || "apos" == eName || "amp" == eName) + { + return; + } + } + else + { + if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName) + || "apos".equals(eName) || "amp".equals(eName)) + { + return; + } + } + handler.getDeclHandler().internalEntityDecl(eName, value); + } + + /** + * Register an external entity declaration for later retrieval. + */ + private void setExternalEntity(String eName, int eClass, + ExternalIdentifiers ids, String nName) + { + if (entityInfo.get(eName) == null) + { + EntityInfo entity = new EntityInfo(); + entity.type = eClass; + entity.ids = ids; + entity.notationName = nName; + entityInfo.put(eName, entity); + } + } + + // + // Notations. + // + + /** + * Report a notation declaration, checking for duplicates. + */ + private void setNotation(String nname, ExternalIdentifiers ids) + throws SAXException + { + if (skippedPE) + { + return; + } + + handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri); + if (notationInfo.get(nname) == null) + { + notationInfo.put(nname, nname); + } + else + { + // VC: Unique Notation Name + handler.verror("Duplicate notation name decl: " + nname); + } + } + + // + // Location. + // + + /** + * Return the current line number. + */ + public int getLineNumber() + { + return line; + } + + /** + * Return the current column number. + */ + public int getColumnNumber() + { + return column; + } + + ////////////////////////////////////////////////////////////////////// + // High-level I/O. + ////////////////////////////////////////////////////////////////////// + + /** + * Read a single character from the readBuffer. + *

The readDataChunk () method maintains the buffer. + *

If we hit the end of an entity, try to pop the stack and + * keep going. + *

(This approach doesn't really enforce XML's rules about + * entity boundaries, but this is not currently a validating + * parser). + *

This routine also attempts to keep track of the current + * position in external entities, but it's not entirely accurate. + * @return The next available input character. + * @see #unread (char) + * @see #readDataChunk + * @see #readBuffer + * @see #line + * @return The next character from the current input source. + */ + private char readCh() + throws SAXException, IOException + { + // As long as there's nothing in the + // read buffer, try reading more data + // (for an external entity) or popping + // the entity stack (for either). + while (readBufferPos >= readBufferLength) + { + switch (sourceType) + { + case INPUT_READER: + case INPUT_STREAM: + readDataChunk(); + while (readBufferLength < 1) + { + popInput(); + if (readBufferLength < 1) + { + readDataChunk(); + } + } + break; + + default: + + popInput(); + break; + } + } + + char c = readBuffer[readBufferPos++]; + + if (c == '\n') + { + line++; + column = 0; + } + else + { + if (c == '<') + { + /* the most common return to parseContent () ... NOP */ + } + else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) + || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) + && xmlVersion == XML_11)) + { + error("illegal XML character U+" + Integer.toHexString(c)); + } + + // If we're in the DTD and in a context where PEs get expanded, + // do so ... 1/14/2000 errata identify those contexts. There + // are also spots in the internal subset where PE refs are fatal + // errors, hence yet another flag. + else if (c == '%' && expandPE) + { + if (peIsError) + { + error("PE reference within decl in internal subset."); + } + parsePEReference(); + return readCh(); + } + column++; + } + + return c; + } + + /** + * Push a single character back onto the current input stream. + *

This method usually pushes the character back onto + * the readBuffer. + *

I don't think that this would ever be called with + * readBufferPos = 0, because the methods always reads a character + * before unreading it, but just in case, I've added a boundary + * condition. + * @param c The character to push back. + * @see #readCh + * @see #unread (char[]) + * @see #readBuffer + */ + private void unread(char c) + throws SAXException + { + // Normal condition. + if (c == '\n') + { + line--; + column = -1; + } + if (readBufferPos > 0) + { + readBuffer[--readBufferPos] = c; + } + else + { + pushString(null, Character.toString(c)); + } + } + + /** + * Push a char array back onto the current input stream. + *

NOTE: you must never push back characters that you + * haven't actually read: use pushString () instead. + * @see #readCh + * @see #unread (char) + * @see #readBuffer + * @see #pushString + */ + private void unread(char[] ch, int length) + throws SAXException + { + for (int i = 0; i < length; i++) + { + if (ch[i] == '\n') + { + line--; + column = -1; + } + } + if (length < readBufferPos) + { + readBufferPos -= length; + } + else + { + pushCharArray(null, ch, 0, length); + } + } + + /** + * Push, or skip, a new external input source. + * The source will be some kind of parsed entity, such as a PE + * (including the external DTD subset) or content for the body. + * + * @param url The java.net.URL object for the entity. + * @see SAXDriver#resolveEntity + * @see #pushString + * @see #sourceType + * @see #pushInput + * @see #detectEncoding + * @see #sourceType + * @see #readBuffer + */ + private void pushURL(boolean isPE, + String ename, + ExternalIdentifiers ids, + Reader reader, + InputStream stream, + String encoding, + boolean doResolve) + throws SAXException, IOException + { + boolean ignoreEncoding; + String systemId; + InputSource source; + + if (!isPE) + { + dataBufferFlush(); + } + + scratch.setPublicId(ids.publicId); + scratch.setSystemId(ids.systemId); + + // See if we should skip or substitute the entity. + // If we're not skipping, resolving reports startEntity() + // and updates the (handler's) stack of URIs. + if (doResolve) + { + // assert (stream == null && reader == null && encoding == null) + source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri); + if (source == null) + { + handler.warn("skipping entity: " + ename); + handler.skippedEntity(ename); + if (isPE) + { + skippedPE = true; + } + return; + } + + // we might be using alternate IDs/encoding + systemId = source.getSystemId(); + // The following warning and setting systemId was deleted bcause + // the application has the option of not setting systemId + // provided that it has set the characte/byte stream. + /* + if (systemId == null) { + handler.warn ("missing system ID, using " + ids.systemId); + systemId = ids.systemId; + } + */ + } + else + { + // "[document]", or "[dtd]" via getExternalSubset() + scratch.setCharacterStream(reader); + scratch.setByteStream(stream); + scratch.setEncoding(encoding); + source = scratch; + systemId = ids.systemId; + if (handler.stringInterning) + { + handler.startExternalEntity(ename, systemId, + "[document]" == ename); + } + else + { + handler.startExternalEntity(ename, systemId, + "[document]".equals(ename)); + } + } + + // we may have been given I/O streams directly + if (source.getCharacterStream() != null) + { + if (source.getByteStream() != null) + error("InputSource has two streams!"); + reader = source.getCharacterStream(); + } + else if (source.getByteStream() != null) + { + encoding = source.getEncoding(); + if (encoding == null) + { + stream = source.getByteStream(); + } + else + { + try + { + reader = new InputStreamReader(source.getByteStream(), + encoding); + } + catch (IOException e) + { + stream = source.getByteStream(); + } + } + } + else if (systemId == null) + { + error("InputSource has no URI!"); + } + scratch.setCharacterStream(null); + scratch.setByteStream(null); + scratch.setEncoding(null); + + // Push the existing status. + pushInput(ename); + + // Create a new read buffer. + // (Note the four-character margin) + readBuffer = new char[READ_BUFFER_MAX + 4]; + readBufferPos = 0; + readBufferLength = 0; + readBufferOverflow = -1; + is = null; + line = 1; + column = 0; + currentByteCount = 0; + + // If there's an explicit character stream, just + // ignore encoding declarations. + if (reader != null) + { + sourceType = INPUT_READER; + this.reader = reader; + tryEncodingDecl(true); + return; + } + + // Else we handle the conversion, and need to ensure + // it's done right. + sourceType = INPUT_STREAM; + if (stream != null) + { + is = stream; + } + else + { + // We have to open our own stream to the URL. + URL url = new URL(systemId); + + externalEntity = url.openConnection(); + externalEntity.connect(); + is = externalEntity.getInputStream(); + } + + // If we get to here, there must be + // an InputStream available. + if (!is.markSupported()) + { + is = new BufferedInputStream(is); + } + + // Get any external encoding label. + if (encoding == null && externalEntity != null) + { + // External labels can be untrustworthy; filesystems in + // particular often have the wrong default for content + // that wasn't locally originated. Those we autodetect. + if (!"file".equals(externalEntity.getURL().getProtocol())) + { + int temp; + + // application/xml;charset=something;otherAttr=... + // ... with many variants on 'something' + encoding = externalEntity.getContentType(); + + // MHK code (fix for Saxon 5.5.1/007): + // protect against encoding==null + if (encoding == null) + { + temp = -1; + } + else + { + temp = encoding.indexOf("charset"); + } + + // RFC 2376 sez MIME text defaults to ASCII, but since the + // JDK will create a MIME type out of thin air, we always + // autodetect when there's no explicit charset attribute. + if (temp < 0) + { + encoding = null; // autodetect + } + else + { + // only this one attribute + if ((temp = encoding.indexOf(';')) > 0) + { + encoding = encoding.substring(0, temp); + } + + if ((temp = encoding.indexOf('=', temp + 7)) > 0) + { + encoding = encoding.substring(temp + 1); + + // attributes can have comment fields (RFC 822) + if ((temp = encoding.indexOf('(')) > 0) + { + encoding = encoding.substring(0, temp); + } + // ... and values may be quoted + if ((temp = encoding.indexOf('"')) > 0) + { + encoding = + encoding.substring(temp + 1, + encoding.indexOf('"', temp + 2)); + } + encoding = encoding.trim(); + } + else + { + handler.warn("ignoring illegal MIME attribute: " + + encoding); + encoding = null; + } + } + } + } + + // if we got an external encoding label, use it ... + if (encoding != null) + { + this.encoding = ENCODING_EXTERNAL; + setupDecoding(encoding); + ignoreEncoding = true; + + // ... else autodetect from first bytes. + } + else + { + detectEncoding(); + ignoreEncoding = false; + } + + // Read any XML or text declaration. + // If we autodetected, it may tell us the "real" encoding. + try + { + tryEncodingDecl(ignoreEncoding); + } + catch (UnsupportedEncodingException x) + { + encoding = x.getMessage(); + + // if we don't handle the declared encoding, + // try letting a JVM InputStreamReader do it + try + { + if (sourceType != INPUT_STREAM) + { + throw x; + } + + is.reset(); + readBufferPos = 0; + readBufferLength = 0; + readBufferOverflow = -1; + line = 1; + currentByteCount = column = 0; + + sourceType = INPUT_READER; + this.reader = new InputStreamReader(is, encoding); + is = null; + + tryEncodingDecl(true); + + } + catch (IOException e) + { + error("unsupported text encoding", + encoding, + null); + } + } + } + + /** + * Check for an encoding declaration. This is the second part of the + * XML encoding autodetection algorithm, relying on detectEncoding to + * get to the point that this part can read any encoding declaration + * in the document (using only US-ASCII characters). + * + *

Because this part starts to fill parser buffers with this data, + * it's tricky to setup a reader so that Java's built-in decoders can be + * used for the character encodings that aren't built in to this parser + * (such as EUC-JP, KOI8-R, Big5, etc). + * + * @return any encoding in the declaration, uppercased; or null + * @see detectEncoding + */ + private String tryEncodingDecl(boolean ignoreEncoding) + throws SAXException, IOException + { + // Read the XML/text declaration. + if (tryRead(" 0) + { + return parseTextDecl(ignoreEncoding); + } + else + { + return parseXMLDecl(ignoreEncoding); + } + } + else + { + // or similar + unread('l'); + unread('m'); + unread('x'); + unread('?'); + unread('<'); + } + } + return null; + } + + /** + * Attempt to detect the encoding of an entity. + *

The trick here (as suggested in the XML standard) is that + * any entity not in UTF-8, or in UCS-2 with a byte-order mark, + * must begin with an XML declaration or an encoding + * declaration; we simply have to look for "<?xml" in various + * encodings. + *

This method has no way to distinguish among 8-bit encodings. + * Instead, it sets up for UTF-8, then (possibly) revises its assumption + * later in setupDecoding (). Any ASCII-derived 8-bit encoding + * should work, but most will be rejected later by setupDecoding (). + * @see #tryEncoding (byte[], byte, byte, byte, byte) + * @see #tryEncoding (byte[], byte, byte) + * @see #setupDecoding + */ + private void detectEncoding() + throws SAXException, IOException + { + byte[] signature = new byte[4]; + + // Read the first four bytes for + // autodetection. + is.mark(4); + is.read(signature); + is.reset(); + + // + // FIRST: four byte encodings (who uses these?) + // + if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, + (byte) 0x00, (byte) 0x3c)) + { + // UCS-4 must begin with "Utility routine for detectEncoding (). + *

Always looks for some part of "Looks for a UCS-2 byte-order mark. + *

Utility routine for detectEncoding (). + * @param sig The first four bytes read. + * @param b1 The first byte of the signature + * @param b2 The second byte of the signature + * @see #detectEncoding + */ + private static boolean tryEncoding(byte[] sig, byte b1, byte b2) + { + return ((sig[0] == b1) && (sig[1] == b2)); + } + + /** + * This method pushes a string back onto input. + *

It is useful either as the expansion of an internal entity, + * or for backtracking during the parse. + *

Call pushCharArray () to do the actual work. + * @param s The string to push back onto input. + * @see #pushCharArray + */ + private void pushString(String ename, String s) + throws SAXException + { + char[] ch = s.toCharArray(); + pushCharArray(ename, ch, 0, ch.length); + } + + /** + * Push a new internal input source. + *

This method is useful for expanding an internal entity, + * or for unreading a string of characters. It creates a new + * readBuffer containing the characters in the array, instead + * of characters converted from an input byte stream. + * @param ch The char array to push. + * @see #pushString + * @see #pushURL + * @see #readBuffer + * @see #sourceType + * @see #pushInput + */ + private void pushCharArray(String ename, char[] ch, int start, int length) + throws SAXException + { + // Push the existing status + pushInput(ename); + if (ename != null && doReport) + { + dataBufferFlush(); + handler.startInternalEntity(ename); + } + sourceType = INPUT_INTERNAL; + readBuffer = ch; + readBufferPos = start; + readBufferLength = length; + readBufferOverflow = -1; + } + + /** + * Save the current input source onto the stack. + *

This method saves all of the global variables associated with + * the current input source, so that they can be restored when a new + * input source has finished. It also tests for entity recursion. + *

The method saves the following global variables onto a stack + * using a fixed-length array: + *

    + *
  1. sourceType + *
  2. externalEntity + *
  3. readBuffer + *
  4. readBufferPos + *
  5. readBufferLength + *
  6. line + *
  7. encoding + *
+ * @param ename The name of the entity (if any) causing the new input. + * @see #popInput + * @see #sourceType + * @see #externalEntity + * @see #readBuffer + * @see #readBufferPos + * @see #readBufferLength + * @see #line + * @see #encoding + */ + private void pushInput(String ename) + throws SAXException + { + // Check for entity recursion. + if (ename != null) + { + Iterator entities = entityStack.iterator(); + while (entities.hasNext()) + { + String e = (String) entities.next(); + if (e != null && e == ename) + { + error("recursive reference to entity", ename, null); + } + } + } + entityStack.addLast(ename); + + // Don't bother if there is no current input. + if (sourceType == INPUT_NONE) + { + return; + } + + // Set up a snapshot of the current + // input source. + Input input = new Input(); + + input.sourceType = sourceType; + input.externalEntity = externalEntity; + input.readBuffer = readBuffer; + input.readBufferPos = readBufferPos; + input.readBufferLength = readBufferLength; + input.line = line; + input.encoding = encoding; + input.readBufferOverflow = readBufferOverflow; + input.is = is; + input.currentByteCount = currentByteCount; + input.column = column; + input.reader = reader; + + // Push it onto the stack. + inputStack.addLast(input); + } + + /** + * Restore a previous input source. + *

This method restores all of the global variables associated with + * the current input source. + * @exception java.io.EOFException + * If there are no more entries on the input stack. + * @see #pushInput + * @see #sourceType + * @see #externalEntity + * @see #readBuffer + * @see #readBufferPos + * @see #readBufferLength + * @see #line + * @see #encoding + */ + private void popInput() + throws SAXException, IOException + { + String ename = (String) entityStack.removeLast(); + + if (ename != null && doReport) + { + dataBufferFlush(); + } + switch (sourceType) + { + case INPUT_STREAM: + handler.endExternalEntity(ename); + is.close(); + break; + case INPUT_READER: + handler.endExternalEntity(ename); + reader.close(); + break; + case INPUT_INTERNAL: + if (ename != null && doReport) + { + handler.endInternalEntity(ename); + } + break; + } + + // Throw an EOFException if there + // is nothing else to pop. + if (inputStack.isEmpty()) + { + throw new EOFException("no more input"); + } + + Input input = (Input) inputStack.removeLast(); + + sourceType = input.sourceType; + externalEntity = input.externalEntity; + readBuffer = input.readBuffer; + readBufferPos = input.readBufferPos; + readBufferLength = input.readBufferLength; + line = input.line; + encoding = input.encoding; + readBufferOverflow = input.readBufferOverflow; + is = input.is; + currentByteCount = input.currentByteCount; + column = input.column; + reader = input.reader; + } + + /** + * Return true if we can read the expected character. + *

Note that the character will be removed from the input stream + * on success, but will be put back on failure. Do not attempt to + * read the character again if the method succeeds. + * @param delim The character that should appear next. For a + * insensitive match, you must supply this in upper-case. + * @return true if the character was successfully read, or false if + * it was not. + * @see #tryRead (String) + */ + private boolean tryRead(char delim) + throws SAXException, IOException + { + char c; + + // Read the character + c = readCh(); + + // Test for a match, and push the character + // back if the match fails. + if (c == delim) + { + return true; + } + else + { + unread(c); + return false; + } + } + + /** + * Return true if we can read the expected string. + *

This is simply a convenience method. + *

Note that the string will be removed from the input stream + * on success, but will be put back on failure. Do not attempt to + * read the string again if the method succeeds. + *

This method will push back a character rather than an + * array whenever possible (probably the majority of cases). + * @param delim The string that should appear next. + * @return true if the string was successfully read, or false if + * it was not. + * @see #tryRead (char) + */ + private boolean tryRead(String delim) + throws SAXException, IOException + { + return tryRead(delim.toCharArray()); + } + + private boolean tryRead(char[] ch) + throws SAXException, IOException + { + char c; + + // Compare the input, character- + // by character. + + for (int i = 0; i < ch.length; i++) + { + c = readCh(); + if (c != ch[i]) + { + unread(c); + if (i != 0) + { + unread(ch, i); + } + return false; + } + } + return true; + } + + /** + * Return true if we can read some whitespace. + *

This is simply a convenience method. + *

This method will push back a character rather than an + * array whenever possible (probably the majority of cases). + * @return true if whitespace was found. + */ + private boolean tryWhitespace() + throws SAXException, IOException + { + char c; + c = readCh(); + if (isWhitespace(c)) + { + skipWhitespace(); + return true; + } + else + { + unread(c); + return false; + } + } + + /** + * Read all data until we find the specified string. + * This is useful for scanning CDATA sections and PIs. + *

This is inefficient right now, since it calls tryRead () + * for every character. + * @param delim The string delimiter + * @see #tryRead (String, boolean) + * @see #readCh + */ + private void parseUntil(String delim) + throws SAXException, IOException + { + parseUntil(delim.toCharArray()); + } + + private void parseUntil(char[] delim) + throws SAXException, IOException + { + char c; + int startLine = line; + + try + { + while (!tryRead(delim)) + { + c = readCh(); + dataBufferAppend(c); + } + } + catch (EOFException e) + { + error("end of input while looking for delimiter " + + "(started on line " + startLine + + ')', null, new String(delim)); + } + } + + ////////////////////////////////////////////////////////////////////// + // Low-level I/O. + ////////////////////////////////////////////////////////////////////// + + /** + * Prefetch US-ASCII XML/text decl from input stream into read buffer. + * Doesn't buffer more than absolutely needed, so that when an encoding + * decl says we need to create an InputStreamReader, we can discard our + * buffer and reset(). Caller knows the first chars of the decl exist + * in the input stream. + */ + private void prefetchASCIIEncodingDecl() + throws SAXException, IOException + { + int ch; + readBufferPos = readBufferLength = 0; + + is.mark(readBuffer.length); + while (true) + { + ch = is.read(); + readBuffer[readBufferLength++] = (char) ch; + switch (ch) + { + case (int) '>': + return; + case -1: + error("file ends before end of XML or encoding declaration.", + null, "?>"); + } + if (readBuffer.length == readBufferLength) + { + error("unfinished XML or encoding declaration"); + } + } + } + + /** + * Read a chunk of data from an external input source. + *

This is simply a front-end that fills the rawReadBuffer + * with bytes, then calls the appropriate encoding handler. + * @see #encoding + * @see #rawReadBuffer + * @see #readBuffer + * @see #filterCR + * @see #copyUtf8ReadBuffer + * @see #copyIso8859_1ReadBuffer + * @see #copyUcs_2ReadBuffer + * @see #copyUcs_4ReadBuffer + */ + private void readDataChunk() + throws SAXException, IOException + { + int count; + + // See if we have any overflow (filterCR sets for CR at end) + if (readBufferOverflow > -1) + { + readBuffer[0] = (char) readBufferOverflow; + readBufferOverflow = -1; + readBufferPos = 1; + sawCR = true; + } + else + { + readBufferPos = 0; + sawCR = false; + } + + // input from a character stream. + if (sourceType == INPUT_READER) + { + count = reader.read(readBuffer, + readBufferPos, READ_BUFFER_MAX - readBufferPos); + if (count < 0) + { + readBufferLength = readBufferPos; + } + else + { + readBufferLength = readBufferPos + count; + } + if (readBufferLength > 0) + { + filterCR(count >= 0); + } + sawCR = false; + return; + } + + // Read as many bytes as possible into the raw buffer. + count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX); + + // Dispatch to an encoding-specific reader method to populate + // the readBuffer. In most parser speed profiles, these routines + // show up at the top of the CPU usage chart. + if (count > 0) + { + switch (encoding) + { + // one byte builtins + case ENCODING_ASCII: + copyIso8859_1ReadBuffer(count, (char) 0x0080); + break; + case ENCODING_UTF_8: + copyUtf8ReadBuffer(count); + break; + case ENCODING_ISO_8859_1: + copyIso8859_1ReadBuffer(count, (char) 0); + break; + + // two byte builtins + case ENCODING_UCS_2_12: + copyUcs2ReadBuffer(count, 8, 0); + break; + case ENCODING_UCS_2_21: + copyUcs2ReadBuffer(count, 0, 8); + break; + + // four byte builtins + case ENCODING_UCS_4_1234: + copyUcs4ReadBuffer(count, 24, 16, 8, 0); + break; + case ENCODING_UCS_4_4321: + copyUcs4ReadBuffer(count, 0, 8, 16, 24); + break; + case ENCODING_UCS_4_2143: + copyUcs4ReadBuffer(count, 16, 24, 0, 8); + break; + case ENCODING_UCS_4_3412: + copyUcs4ReadBuffer(count, 8, 0, 24, 16); + break; + } + } + else + { + readBufferLength = readBufferPos; + } + + readBufferPos = 0; + + // Filter out all carriage returns if we've seen any + // (including any saved from a previous read) + if (sawCR) + { + filterCR(count >= 0); + sawCR = false; + + // must actively report EOF, lest some CRs get lost. + if (readBufferLength == 0 && count >= 0) + { + readDataChunk(); + } + } + + if (count > 0) + { + currentByteCount += count; + } + } + + /** + * Filter carriage returns in the read buffer. + * CRLF becomes LF; CR becomes LF. + * @param moreData true iff more data might come from the same source + * @see #readDataChunk + * @see #readBuffer + * @see #readBufferOverflow + */ + private void filterCR(boolean moreData) + { + int i, j; + + readBufferOverflow = -1; + +loop: + for (i = j = readBufferPos; j < readBufferLength; i++, j++) + { + switch (readBuffer[j]) + { + case '\r': + if (j == readBufferLength - 1) + { + if (moreData) + { + readBufferOverflow = '\r'; + readBufferLength--; + } + else // CR at end of buffer + { + readBuffer[i++] = '\n'; + } + break loop; + } + else if (readBuffer[j + 1] == '\n') + { + j++; + } + readBuffer[i] = '\n'; + break; + + case '\n': + default: + readBuffer[i] = readBuffer[j]; + break; + } + } + readBufferLength = i; + } + + /** + * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. + *

When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + *

Note that as of Unicode 3.1, good practice became a requirement, + * so that each Unicode character has exactly one UTF-8 representation. + * @param count The number of bytes to convert. + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + * @see #getNextUtf8Byte + */ + private void copyUtf8ReadBuffer(int count) + throws SAXException, IOException + { + int i = 0; + int j = readBufferPos; + int b1; + char c = 0; + + /* + // check once, so the runtime won't (if it's smart enough) + if (count < 0 || count > rawReadBuffer.length) + throw new ArrayIndexOutOfBoundsException (Integer.toString (count)); + */ + + while (i < count) + { + b1 = rawReadBuffer[i++]; + + // Determine whether we are dealing + // with a one-, two-, three-, or four- + // byte sequence. + if (b1 < 0) + { + if ((b1 & 0xe0) == 0xc0) + { + // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx + c = (char) (((b1 & 0x1f) << 6) + | getNextUtf8Byte(i++, count)); + if (c < 0x0080) + { + encodingError("Illegal two byte UTF-8 sequence", + c, 0); + } + + //Sec 2.11 + // [1] the two-character sequence #xD #xA + // [2] the two-character sequence #xD #x85 + if ((c == 0x0085 || c == 0x000a) && sawCR) + { + continue; + } + + // Sec 2.11 + // [3] the single character #x85 + + if (c == 0x0085 && xmlVersion == XML_11) + { + readBuffer[j++] = '\r'; + } + } + else if ((b1 & 0xf0) == 0xe0) + { + // 3-byte sequence: + // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx + // most CJKV characters + c = (char) (((b1 & 0x0f) << 12) | + (getNextUtf8Byte(i++, count) << 6) | + getNextUtf8Byte(i++, count)); + //sec 2.11 + //[4] the single character #x2028 + if (c == 0x2028 && xmlVersion == XML_11) + { + readBuffer[j++] = '\r'; + sawCR = true; + continue; + } + if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff)) + { + encodingError("Illegal three byte UTF-8 sequence", + c, 0); + } + } + else if ((b1 & 0xf8) == 0xf0) + { + // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx + // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx + // (uuuuu = wwww + 1) + // "Surrogate Pairs" ... from the "Astral Planes" + // Unicode 3.1 assigned the first characters there + int iso646 = b1 & 07; + iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); + iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); + iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); + + if (iso646 <= 0xffff) + { + encodingError("Illegal four byte UTF-8 sequence", + iso646, 0); + } + else + { + if (iso646 > 0x0010ffff) + { + encodingError("UTF-8 value out of range for Unicode", + iso646, 0); + } + iso646 -= 0x010000; + readBuffer[j++] = (char) (0xd800 | (iso646 >> 10)); + readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff)); + continue; + } + } + else + { + // The five and six byte encodings aren't supported; + // they exceed the Unicode (and XML) range. + encodingError("unsupported five or six byte UTF-8 sequence", + 0xff & b1, i); + // NOTREACHED + c = 0; + } + } + else + { + // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx + // (US-ASCII character, "common" case, one branch to here) + c = (char) b1; + } + readBuffer[j++] = c; + if (c == '\r') + { + sawCR = true; + } + } + // How many characters have we read? + readBufferLength = j; + } + + /** + * Return the next byte value in a UTF-8 sequence. + * If it is not possible to get a byte from the current + * entity, throw an exception. + * @param pos The current position in the rawReadBuffer. + * @param count The number of bytes in the rawReadBuffer + * @return The significant six bits of a non-initial byte in + * a UTF-8 sequence. + * @exception EOFException If the sequence is incomplete. + */ + private int getNextUtf8Byte(int pos, int count) + throws SAXException, IOException + { + int val; + + // Take a character from the buffer + // or from the actual input stream. + if (pos < count) + { + val = rawReadBuffer[pos]; + } + else + { + val = is.read(); + if (val == -1) + { + encodingError("unfinished multi-byte UTF-8 sequence at EOF", + -1, pos); + } + } + + // Check for the correct bits at the start. + if ((val & 0xc0) != 0x80) + { + encodingError("bad continuation of multi-byte UTF-8 sequence", + val, pos + 1); + } + + // Return the significant bits. + return (val & 0x3f); + } + + /** + * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into + * UTF-16 characters. + * + *

When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + * + * @param count The number of bytes to convert. + * @param mask For ASCII conversion, 0x7f; else, 0xff. + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + */ + private void copyIso8859_1ReadBuffer(int count, char mask) + throws IOException + { + int i, j; + for (i = 0, j = readBufferPos; i < count; i++, j++) + { + char c = (char) (rawReadBuffer[i] & 0xff); + if ((c & mask) != 0) + { + throw new CharConversionException("non-ASCII character U+" + + Integer.toHexString(c)); + } + if (c == 0x0085 && xmlVersion == XML_11) + { + c = '\r'; + } + readBuffer[j] = c; + if (c == '\r') + { + sawCR = true; + } + } + readBufferLength = j; + } + + /** + * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters + * (as used in Java string manipulation). + * + *

When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + * @param count The number of bytes to convert. + * @param shift1 The number of bits to shift byte 1. + * @param shift2 The number of bits to shift byte 2 + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + */ + private void copyUcs2ReadBuffer(int count, int shift1, int shift2) + throws SAXException + { + int j = readBufferPos; + + if (count > 0 && (count % 2) != 0) + { + encodingError("odd number of bytes in UCS-2 encoding", -1, count); + } + // The loops are faster with less internal brancing; hence two + if (shift1 == 0) + { // "UTF-16-LE" + for (int i = 0; i < count; i += 2) + { + char c = (char) (rawReadBuffer[i + 1] << 8); + c |= 0xff & rawReadBuffer[i]; + readBuffer[j++] = c; + if (c == '\r') + { + sawCR = true; + } + } + } + else + { // "UTF-16-BE" + for (int i = 0; i < count; i += 2) + { + char c = (char) (rawReadBuffer[i] << 8); + c |= 0xff & rawReadBuffer[i + 1]; + readBuffer[j++] = c; + if (c == '\r') + { + sawCR = true; + } + } + } + readBufferLength = j; + } + + /** + * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. + * + *

When readDataChunk () calls this method, the raw bytes are in + * rawReadBuffer, and the final characters will appear in + * readBuffer. + *

Java has Unicode chars, and this routine uses surrogate pairs + * for ISO-10646 values between 0x00010000 and 0x000fffff. An + * exception is thrown if the ISO-10646 character has no Unicode + * representation. + * + * @param count The number of bytes to convert. + * @param shift1 The number of bits to shift byte 1. + * @param shift2 The number of bits to shift byte 2 + * @param shift3 The number of bits to shift byte 2 + * @param shift4 The number of bits to shift byte 2 + * @see #readDataChunk + * @see #rawReadBuffer + * @see #readBuffer + */ + private void copyUcs4ReadBuffer(int count, int shift1, int shift2, + int shift3, int shift4) + throws SAXException + { + int j = readBufferPos; + + if (count > 0 && (count % 4) != 0) + { + encodingError("number of bytes in UCS-4 encoding " + + "not divisible by 4", + -1, count); + } + for (int i = 0; i < count; i += 4) + { + int value = (((rawReadBuffer [i] & 0xff) << shift1) | + ((rawReadBuffer [i + 1] & 0xff) << shift2) | + ((rawReadBuffer [i + 2] & 0xff) << shift3) | + ((rawReadBuffer [i + 3] & 0xff) << shift4)); + if (value < 0x0000ffff) + { + readBuffer [j++] = (char) value; + if (value == (int) '\r') + { + sawCR = true; + } + } + else if (value < 0x0010ffff) + { + value -= 0x010000; + readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff)); + readBuffer[j++] = (char) (0xdc | (value & 0x03ff)); + } + else + { + encodingError("UCS-4 value out of range for Unicode", + value, i); + } + } + readBufferLength = j; + } + + /** + * Report a character encoding error. + */ + private void encodingError(String message, int value, int offset) + throws SAXException + { + if (value != -1) + { + message = message + " (character code: 0x" + + Integer.toHexString(value) + ')'; + error(message); + } + } + + ////////////////////////////////////////////////////////////////////// + // Local Variables. + ////////////////////////////////////////////////////////////////////// + + /** + * Re-initialize the variables for each parse. + */ + private void initializeVariables() + { + // First line + line = 1; + column = 0; + + // Set up the buffers for data and names + dataBufferPos = 0; + dataBuffer = new char[DATA_BUFFER_INITIAL]; + nameBufferPos = 0; + nameBuffer = new char[NAME_BUFFER_INITIAL]; + + // Set up the DTD hash tables + elementInfo = new HashMap(); + entityInfo = new HashMap(); + notationInfo = new HashMap(); + skippedPE = false; + + // Set up the variables for the current + // element context. + currentElement = null; + currentElementContent = CONTENT_UNDECLARED; + + // Set up the input variables + sourceType = INPUT_NONE; + inputStack = new LinkedList(); + entityStack = new LinkedList(); + externalEntity = null; + tagAttributePos = 0; + tagAttributes = new String[100]; + rawReadBuffer = new byte[READ_BUFFER_MAX]; + readBufferOverflow = -1; + + scratch = new InputSource(); + + inLiteral = false; + expandPE = false; + peIsError = false; + + doReport = false; + + inCDATA = false; + + symbolTable = new Object[SYMBOL_TABLE_LENGTH][]; + } + + static class ExternalIdentifiers + { + + String publicId; + String systemId; + String baseUri; + + ExternalIdentifiers() + { + } + + ExternalIdentifiers(String publicId, String systemId, String baseUri) + { + this.publicId = publicId; + this.systemId = systemId; + this.baseUri = baseUri; + } + + } + + static class EntityInfo + { + + int type; + ExternalIdentifiers ids; + String value; + String notationName; + + } + + static class AttributeDecl + { + + String type; + String value; + int valueType; + String enumeration; + String defaultValue; + + } + + static class ElementDecl + { + + int contentType; + String contentModel; + HashMap attributes; + + } + + static class Input + { + + int sourceType; + URLConnection externalEntity; + char[] readBuffer; + int readBufferPos; + int readBufferLength; + int line; + int encoding; + int readBufferOverflow; + InputStream is; + int currentByteCount; + int column; + Reader reader; + + } + +} -- cgit v1.2.3