/* XMLParser.java --
Copyright (C) 2005 Free Software Foundation, Inc.
This file is part of GNU Classpath.
GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.
Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.
As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version.
Partly derived from code which carried the following notice:
Copyright (c) 1997, 1998 by Microstar Software Ltd.
AElfred is free for both commercial and non-commercial use and
redistribution, provided that Microstar's copyright and disclaimer are
retained intact. You are free to modify AElfred for your own use and
to redistribute AElfred with your modifications, provided that the
modifications are clearly documented.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
merchantability or fitness for a particular purpose. Please use it AT
YOUR OWN RISK.
*/
package gnu.xml.stream;
import gnu.java.lang.CPStringBuilder;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.namespace.QName;
import javax.xml.stream.Location;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLReporter;
import javax.xml.stream.XMLResolver;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import gnu.java.net.CRLFInputStream;
import gnu.classpath.debug.TeeInputStream;
import gnu.classpath.debug.TeeReader;
/**
* An XML parser.
* This parser supports the following additional StAX properties:
* <table>
* <tr><th>Property</th><th>Type</th><th>Description</th></tr>
* <tr><td>gnu.xml.stream.stringInterning</td>
* <td>Boolean</td>
* <td>Indicates whether markup strings will be interned</td></tr>
* <tr><td>gnu.xml.stream.xmlBase</td>
* <td>Boolean</td>
* <td>Indicates whether XML Base processing will be performed</td></tr>
* <tr><td>gnu.xml.stream.baseURI</td>
* <td>String</td>
* <td>Returns the base URI of the current event</td></tr>
* </table>
*
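* @see <a href="http://www.w3.org/TR/REC-xml/">http://www.w3.org/TR/REC-xml/</a>
* @see <a href="http://www.w3.org/TR/xml11/">http://www.w3.org/TR/xml11/</a>
* @see <a href="http://www.w3.org/TR/REC-xml-names">http://www.w3.org/TR/REC-xml-names</a>
* @see <a href="http://www.w3.org/TR/xml-names11">http://www.w3.org/TR/xml-names11</a>
* @see <a href="http://www.w3.org/TR/xmlbase/">http://www.w3.org/TR/xmlbase/</a>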
*
* @author Chris Burdess
*/
public class XMLParser
implements XMLStreamReader, NamespaceContext
{
// -- parser state machine states --
// These are the values stored in the `state` field, which is initialised
// to INIT. They are private, so they are only meaningful inside this class.
private static final int INIT = 0; // start state (initial value of `state`)
private static final int PROLOG = 1; // in prolog
private static final int CONTENT = 2; // in content
private static final int EMPTY_ELEMENT = 3; // empty element state
private static final int MISC = 4; // in Misc (markup after the root element)
// -- parameters for parsing literals --
// Every value is a distinct power of two, so these look like flags meant to
// be OR-ed together into a single int argument.
// NOTE(review): confirm against the literal-reading code, which is not
// visible from here. The per-flag meanings are suggested only by the names.
// The values 1 and 128 are not used by any constant in this group.
private final static int LIT_ENTITY_REF = 2;
private final static int LIT_NORMALIZE = 4;
private final static int LIT_ATTRIBUTE = 8;
private final static int LIT_DISABLE_PE = 16;
private final static int LIT_DISABLE_CREF = 32;
private final static int LIT_DISABLE_EREF = 64;
private final static int LIT_PUBID = 256;
// -- types of attribute values --
// Declared without an access modifier, so these are visible to the whole
// gnu.xml.stream package rather than to this class only.
// The names presumably mirror the DTD attribute default declarations
// (#IMPLIED, #REQUIRED, #FIXED, an explicit default, or no declaration at
// all) -- TODO confirm against the code that assigns them.
final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
final static int ATTRIBUTE_DEFAULT_FIXED = 34;
// -- additional event types --
// Event codes defined by this parser on top of the standard event types.
// The `extendedEventTypes` field controls whether they are reported.
// Declared without an access modifier, so they are package-visible.
final static int START_ENTITY = 50;
final static int END_ENTITY = 51;
/**
* The current input.
* Declared without an initialiser, so it is null until assigned elsewhere.
*/
private Input input;
/**
* Stack of inputs representing XML general entities.
* The input representing the XML input stream or reader is always the
* first element in this stack.
* This is a raw (untyped) list; it presumably holds Input instances --
* confirm at the sites that add to it.
*/
private LinkedList inputStack = new LinkedList();
/**
* Stack of start-entity events to be reported.
* This is a raw (untyped) list; the element type cannot be told from the
* declaration.
*/
private LinkedList startEntityStack = new LinkedList();
/**
* Stack of end-entity events to be reported.
* This is a raw (untyped) list; the element type cannot be told from the
* declaration.
*/
private LinkedList endEntityStack = new LinkedList();
/**
* Current parser state within the main state machine.
* Takes one of the state constants INIT, PROLOG, CONTENT, EMPTY_ELEMENT
* or MISC, and starts out as INIT.
*/
private int state = INIT;
/**
* The (type of the) current event, stored as an int code.
* Presumably this can also hold the parser-specific START_ENTITY and
* END_ENTITY codes as well as the standard ones -- confirm where it is set.
*/
private int event;
/**
* The element name stack. The first element in this stack will be the
* root element, i.e. the innermost open element is at the end of the list.
* Note that this is the opposite orientation to the namespace and base-URI
* stacks declared after it.
*/
private LinkedList stack = new LinkedList();
/**
* Stack of namespace contexts. These are maps specifying prefix-to-URI
* mappings. The first element in this stack is the most recent namespace
* context (i.e. the other way around from the element name stack).
*/
private LinkedList namespaces = new LinkedList();
/**
* The base-URI stack. This holds the base URI context for each element.
* The first element in this stack is the most recent context (i.e. the
* other way around from the element name stack).
*/
private LinkedList bases = new LinkedList();
/**
* The list of attributes for the current element, in the order defined in
* the XML stream.
* This is a raw (untyped) list; the element type cannot be told from the
* declaration.
*/
private ArrayList attrs = new ArrayList();
/**
* Buffer for text and character data.
*/
private StringBuffer buf = new StringBuffer();
/**
* Buffer for NMTOKEN strings (markup).
*/
private StringBuffer nmtokenBuf = new StringBuffer();
/**
* Buffer for string literals. (e.g. attribute values)
*/
private StringBuffer literalBuf = new StringBuffer();
/**
* Temporary Unicode character buffer used during character data reads.
* It has a fixed capacity of 1024 ints; presumably each slot holds one
* Unicode code point, which would explain int rather than char -- confirm
* against the reading code.
*/
private int[] tmpBuf = new int[1024];
/**
* The element content model for the current element.
* Declared without an initialiser, so it is null until assigned elsewhere.
*/
private ContentModel currentContentModel;
/**
* The validation stack. This holds lists of the elements seen for each
* element, in order to determine whether the names and order of these
* elements match the content model for the element. The last entry in
* this stack represents the current element.
* Unlike the other stacks in this class it is not created at declaration,
* so it is null until assigned elsewhere -- presumably only when the
* parser is validating; confirm in the constructor.
*/
private LinkedList validationStack;
/**
* These sets contain the IDs and the IDREFs seen in the document, to
* ensure that IDs are unique and that each IDREF refers to an ID in the
* document.
* Neither set is created at declaration, so both are null until assigned
* elsewhere.
*/
private HashSet ids, idrefs;
/**
* The target and data associated with the current processing instruction
* event.
*/
private String piTarget, piData;
/**
* The XML version declared in the XML declaration.
*/
private String xmlVersion;
/**
* The encoding declared in the XML declaration.
*/
private String xmlEncoding;
/**
* The standalone value declared in the XML declaration.
* Held as a Boolean object rather than a primitive, so it can be null;
* presumably null means that no standalone value was declared -- confirm
* where it is assigned.
*/
private Boolean xmlStandalone;
/**
* The document type definition.
* Declared without an access modifier, so other classes in this package
* can read and write it directly.
*/
Doctype doctype;
/**
* State variables for determining parameter-entity expansion.
* These are the only two flags in this group that are not final, so they
* can change while parsing.
*/
private boolean expandPE, peIsError;
/**
* Whether this is a validating parser.
* Final with no initialiser, so it is fixed at construction time, as are
* all of the configuration flags that follow.
*/
private final boolean validating;
/**
* Whether strings representing markup will be interned.
*/
private final boolean stringInterning;
/**
* If true, CDATA sections will be merged with adjacent text nodes into a
* single event.
*/
private final boolean coalescing;
/**
* Whether to replace general entity references with their replacement
* text automatically during parsing.
* Otherwise entity-reference events will be issued.
*/
private final boolean replaceERefs;
/**
* Whether to support external entities.
*/
private final boolean externalEntities;
/**
* Whether to support DTDs.
*/
private final boolean supportDTD;
/**
* Whether to support XML namespaces. If true, namespace information will
* be available. Otherwise namespaces will simply be reported as ordinary
* attributes.
*/
private final boolean namespaceAware;
/**
* Whether to support XML Base. If true, URIs specified in xml:base
* attributes will be honoured when resolving external entities.
*/
private final boolean baseAware;
/**
* Whether to report extended event types (START_ENTITY and END_ENTITY)
* in addition to the standard event types. Used by the SAX parser.
*/
private final boolean extendedEventTypes;
/**
* The reporter to receive parsing warnings.
* Final and declared without an access modifier: it is fixed at
* construction time and readable by other classes in this package.
*/
final XMLReporter reporter;
/**
* Callback interface for resolving external entities.
* Final and declared without an access modifier: it is fixed at
* construction time and readable by other classes in this package.
*/
final XMLResolver resolver;
// -- Constants for testing the next kind of markup event --
/**
* Text that opens a start tag.
*/
private static final String TEST_START_ELEMENT = "<";
/**
* Text that opens an end tag. An XML end tag begins with "&lt;/"
* (XML 1.0 production [42], ETag). The empty string carries no
* information, so it cannot tell an end tag apart from any other input.
*/
private static final String TEST_END_ELEMENT = "</";
private static final String TEST_COMMENT = "