From 554fd8c5195424bdbcabf5de30fdc183aba391bd Mon Sep 17 00:00:00 2001 From: upstream source tree Date: Sun, 15 Mar 2015 20:14:05 -0400 Subject: obtained gcc-4.6.4.tar.bz2 from upstream website; verified gcc-4.6.4.tar.bz2.sig; imported gcc-4.6.4 source tree from verified upstream tarball. downloading a git-generated archive based on the 'upstream' tag should provide you with a source tree that is binary identical to the one extracted from the above tarball. if you have obtained the source via the command 'git clone', however, do note that line-endings of files in your working directory might differ from line-endings of the respective files in the upstream repository. --- libjava/classpath/gnu/xml/util/XMLWriter.java | 1931 +++++++++++++++++++++++++ 1 file changed, 1931 insertions(+) create mode 100644 libjava/classpath/gnu/xml/util/XMLWriter.java (limited to 'libjava/classpath/gnu/xml/util/XMLWriter.java') diff --git a/libjava/classpath/gnu/xml/util/XMLWriter.java b/libjava/classpath/gnu/xml/util/XMLWriter.java new file mode 100644 index 000000000..a371debff --- /dev/null +++ b/libjava/classpath/gnu/xml/util/XMLWriter.java @@ -0,0 +1,1931 @@ +/* XMLWriter.java -- + Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. 
+ +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.xml.util; + +import gnu.java.lang.CPStringBuilder; + +import java.io.BufferedWriter; +import java.io.CharConversionException; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.Stack; + +import org.xml.sax.*; +import org.xml.sax.ext.*; +import org.xml.sax.helpers.*; + + +/** + * This class is a SAX handler which writes all its input as a well formed + * XML or XHTML document. If driven using SAX2 events, this output may + * include a recreated document type declaration, subject to limitations + * of SAX (no internal subset exposed) or DOM (the important declarations, + * with their documentation, are discarded). + * + *

By default, text is generated "as-is", but some optional modes + * are supported. Pretty-printing is supported, to make life easier + * for people reading the output. XHTML (1.0) output can be made + * particularly pretty; all the built-in character entities are known. + * Canonical XML can also be generated, assuming the input is properly + * formed. + * + *


+ * + *

Some of the methods on this class are intended for applications to + * use directly, rather than as pure SAX2 event callbacks. Some of those + * methods access the JavaBeans properties (used to tweak output formats, + * for example canonicalization and pretty printing). Subclasses + * are expected to add new behaviors, not to modify current behavior, so + * many such methods are final.

+ * + *

The write*() methods may be slightly simpler for some + * applications to use than direct callbacks. For example, they support + * a simple policy for encoding data items as the content of a single element. + * + *

To reuse an XMLWriter you must provide it with a new Writer, since + * this handler closes the writer it was given as part of its endDocument() + * handling. (XML documents have an end of input, and the way to encode + * that on a stream is to close it.)

+ * + *
+ * + *

Note that any relative URIs in the source document, as found in + * entity and notation declarations, ought to have been fully resolved by + * the parser providing events to this handler. This means that the + * output text should only have fully resolved URIs, which may not be + * the desired behavior in cases where later binding is desired.

+ * + *

Note that due to SAX2 defaults, you may need to manually + * ensure that the input events are XML-conformant with respect to namespace + * prefixes and declarations. {@link gnu.xml.pipeline.NSFilter} is + * one solution to this problem, in the context of processing pipelines. + * Something as simple as connecting this handler to a parser might not + * generate the correct output. Another workaround is to ensure that the + * namespace-prefixes feature is always set to true, if you're + * hooking this directly up to some XMLReader implementation. + * + * @see gnu.xml.pipeline.TextConsumer + * + * @author David Brownell + * + * @deprecated Please use the javax.xml.stream APIs instead + */ +public class XMLWriter + implements ContentHandler, LexicalHandler, DTDHandler, DeclHandler +{ + // text prints/escapes differently depending on context + // CTX_ENTITY ... entity literal value + // CTX_ATTRIBUTE ... attribute literal value + // CTX_CONTENT ... content of an element + // CTX_UNPARSED ... CDATA, comment, PI, names, etc + // CTX_NAME ... name or nmtoken, no escapes possible + private static final int CTX_ENTITY = 1; + private static final int CTX_ATTRIBUTE = 2; + private static final int CTX_CONTENT = 3; + private static final int CTX_UNPARSED = 4; + private static final int CTX_NAME = 5; + +// FIXME: names (element, attribute, PI, notation, etc) are not +// currently written out with range checks (escapeChars). +// In non-XHTML, some names can't be directly written; panic! + + private static String sysEOL; + + static { + try { + sysEOL = System.getProperty ("line.separator", "\n"); + + // don't use the system's EOL if it's illegal XML. 
+ if (!isLineEnd (sysEOL)) + sysEOL = "\n"; + + } catch (SecurityException e) { + sysEOL = "\n"; + } + } + + private static boolean isLineEnd (String eol) + { + return "\n".equals (eol) + || "\r".equals (eol) + || "\r\n".equals (eol); + } + + private Writer out; + private boolean inCDATA; + private int elementNestLevel; + private String eol = sysEOL; + + private short dangerMask; + private CPStringBuilder stringBuf; + private Locator locator; + private ErrorHandler errHandler; + + private boolean expandingEntities = false; + private int entityNestLevel; + private boolean xhtml; + private boolean startedDoctype; + private String encoding; + + private boolean canonical; + private boolean inDoctype; + private boolean inEpilogue; + + // pretty printing controls + private boolean prettyPrinting; + private int column; + private boolean noWrap; + private Stack space = new Stack (); + + // this is not a hard'n'fast rule -- longer lines are OK, + // but are to be avoided. Here, prettyprinting is more to + // show structure "cleanly" than to be precise about it. + // better to have ragged layout than one line 24Kb long. + private static final int lineLength = 75; + + + /** + * Constructs this handler with System.out used to write SAX events + * using the UTF-8 encoding. Avoid using this except when you know + * it's safe to close System.out at the end of the document. + */ + public XMLWriter () throws IOException + { this (System.out); } + + /** + * Constructs a handler which writes all input to the output stream + * in the UTF-8 encoding, and closes it when endDocument is called. + * (Yes it's annoying that this throws an exception -- but there's + * really no way around it, since it's barely possible a JDK may + * exist somewhere that doesn't know how to emit UTF-8.) 
+ */ + public XMLWriter (OutputStream out) throws IOException + { + this (new OutputStreamWriter (out, "UTF8")); + } + + /** + * Constructs a handler which writes all input to the writer, and then + * closes the writer when the document ends. If an XML declaration is + * written onto the output, and this class can determine the name of + * the character encoding for this writer, that encoding name will be + * included in the XML declaration. + * + *

See the description of the constructor which takes an encoding + * name for important information about selection of encodings. + * + * @param writer XML text is written to this writer. + */ + public XMLWriter (Writer writer) + { + this (writer, null); + } + + /** + * Constructs a handler which writes all input to the writer, and then + * closes the writer when the document ends. If an XML declaration is + * written onto the output, this class will use the specified encoding + * name in that declaration. If no encoding name is specified, no + * encoding name will be declared unless this class can otherwise + * determine the name of the character encoding for this writer. + * + *

At this time, only the UTF-8 ("UTF8") and UTF-16 ("Unicode") + * output encodings are fully lossless with respect to XML data. If you + * use any other encoding you risk having your data be silently mangled + * on output, as the standard Java character encoding subsystem silently + * maps non-encodable characters to a question mark ("?") and will not + * report such errors to applications. + * + *

For a few other encodings the risk can be reduced. If the writer is + * a java.io.OutputStreamWriter, and uses either the ISO-8859-1 ("8859_1", + * "ISO8859_1", etc) or US-ASCII ("ASCII") encodings, content which + * can't be encoded in those encodings will be written safely. Where + * relevant, the XHTML entity names will be used; otherwise, numeric + * character references will be emitted. + * + *

However, there remain a number of cases where substituting such + * entity or character references is not an option. Such references are + * not usable within a DTD, comment, PI, or CDATA section. Neither may + * they be used when element, attribute, entity, or notation names have + * the problematic characters. + * + * @param writer XML text is written to this writer. + * @param encoding if non-null, and an XML declaration is written, + * this is the name that will be used for the character encoding. + */ + public XMLWriter (Writer writer, String encoding) + { + setWriter (writer, encoding); + } + + private void setEncoding (String encoding) + { + if (encoding == null && out instanceof OutputStreamWriter) + encoding = ((OutputStreamWriter)out).getEncoding (); + + if (encoding != null) { + encoding = encoding.toUpperCase (); + + // Use official encoding names where we know them, + // avoiding the Java-only names. When using common + // encodings where we can easily tell if characters + // are out of range, we'll escape out-of-range + // characters using character refs for safety. + + // I _think_ these are all the main synonyms for these! + if ("UTF8".equals (encoding)) { + encoding = "UTF-8"; + } else if ("US-ASCII".equals (encoding) + || "ASCII".equals (encoding)) { + dangerMask = (short) 0xff80; + encoding = "US-ASCII"; + } else if ("ISO-8859-1".equals (encoding) + || "8859_1".equals (encoding) + || "ISO8859_1".equals (encoding)) { + dangerMask = (short) 0xff00; + encoding = "ISO-8859-1"; + } else if ("UNICODE".equals (encoding) + || "UNICODE-BIG".equals (encoding) + || "UNICODE-LITTLE".equals (encoding)) { + encoding = "UTF-16"; + + // TODO: UTF-16BE, UTF-16LE ... no BOM; what + // release of JDK supports those Unicode names? + } + + if (dangerMask != 0) + stringBuf = new CPStringBuilder (); + } + + this.encoding = encoding; + } + + + /** + * Resets the handler to write a new text document. + * + * @param writer XML text is written to this writer. 
+ * @param encoding if non-null, and an XML declaration is written, + * this is the name that will be used for the character encoding. + * + * @exception IllegalStateException if the current + * document hasn't yet ended (with {@link #endDocument}) + */ + final public void setWriter (Writer writer, String encoding) + { + if (out != null) + throw new IllegalStateException ( + "can't change stream in mid course"); + out = writer; + if (out != null) + setEncoding (encoding); + if (!(out instanceof BufferedWriter)) + out = new BufferedWriter (out); + space.push ("default"); + } + + /** + * Assigns the line ending style to be used on output. + * @param eolString null to use the system default; else + * "\n", "\r", or "\r\n". + */ + final public void setEOL (String eolString) + { + if (eolString == null) + eol = sysEOL; + else if (!isLineEnd (eolString)) + eol = eolString; + else + throw new IllegalArgumentException (eolString); + } + + /** + * Assigns the error handler to be used to present most fatal + * errors. + */ + public void setErrorHandler (ErrorHandler handler) + { + errHandler = handler; + } + + /** + * Used internally and by subclasses, this encapsulates the logic + * involved in reporting fatal errors. It uses locator information + * for good diagnostics, if available, and gives the application's + * ErrorHandler the opportunity to handle the error before throwing + * an exception. + */ + protected void fatal (String message, Exception e) + throws SAXException + { + SAXParseException x; + + if (locator == null) + x = new SAXParseException (message, null, null, -1, -1, e); + else + x = new SAXParseException (message, locator, e); + if (errHandler != null) + errHandler.fatalError (x); + throw x; + } + + + // JavaBeans properties + + /** + * Controls whether the output should attempt to follow the "transitional" + * XHTML rules so that it meets the "HTML Compatibility Guidelines" + * appendix in the XHTML specification. 
A "transitional" Document Type + * Declaration (DTD) is placed near the beginning of the output document, + * instead of whatever DTD would otherwise have been placed there, and + * XHTML empty elements are printed specially. When writing text in + * US-ASCII or ISO-8859-1 encodings, the predefined XHTML internal + * entity names are used (in preference to character references) when + * writing content characters which can't be expressed in those encodings. + * + *

When this option is enabled, it is the caller's responsibility + * to ensure that the input is otherwise valid as XHTML. Things to + * be careful of in all cases, as described in the appendix referenced + * above, include:

+ * + *

Additionally, some of the oldest browsers have additional + * quirks, to address with guidelines such as:

+ * + *

Also, some characteristics of the resulting output may be + * a function of whether the document is later given a MIME + * content type of text/html rather than one indicating + * XML (application/xml or text/xml). Worse, + * some browsers ignore MIME content types and prefer to rely URI + * name suffixes -- so an "index.xml" could always be XML, never + * XHTML, no matter its MIME type. + */ + final public void setXhtml (boolean value) + { + if (locator != null) + throw new IllegalStateException ("started parsing"); + xhtml = value; + if (xhtml) + canonical = false; + } + + /** + * Returns true if the output attempts to echo the input following + * "transitional" XHTML rules and matching the "HTML Compatibility + * Guidelines" so that an HTML version 3 browser can read the output + * as HTML; returns false (the default) othewise. + */ + final public boolean isXhtml () + { + return xhtml; + } + + /** + * Controls whether the output text contains references to + * entities (the default), or instead contains the expanded + * values of those entities. + */ + final public void setExpandingEntities (boolean value) + { + if (locator != null) + throw new IllegalStateException ("started parsing"); + expandingEntities = value; + if (!expandingEntities) + canonical = false; + } + + /** + * Returns true if the output will have no entity references; + * returns false (the default) otherwise. + */ + final public boolean isExpandingEntities () + { + return expandingEntities; + } + + /** + * Controls pretty-printing, which by default is not enabled + * (and currently is most useful for XHTML output). + * Pretty printing enables structural indentation, sorting of attributes + * by name, line wrapping, and potentially other mechanisms for making + * output more or less readable. + * + *

At this writing, structural indentation and line wrapping are + * enabled when pretty printing is enabled and the xml:space + * attribute has the value default (its other legal value is + * preserve, as defined in the XML specification). The three + * XHTML element types which use another value are recognized by their + * names (namespaces are ignored). + * + *

Also, for the record, the "pretty" aspect of printing here + * is more to provide basic structure on outputs that would otherwise + * risk being a single long line of text. For now, expect the + * structure to be ragged ... unless you'd like to submit a patch + * to make this be more strictly formatted! + * + * @exception IllegalStateException thrown if this method is invoked + * after output has begun. + */ + final public void setPrettyPrinting (boolean value) + { + if (locator != null) + throw new IllegalStateException ("started parsing"); + prettyPrinting = value; + if (prettyPrinting) + canonical = false; + } + + /** + * Returns value of flag controlling pretty printing. + */ + final public boolean isPrettyPrinting () + { + return prettyPrinting; + } + + + /** + * Sets the output style to be canonicalized. Input events must + * meet requirements that are slightly more stringent than the + * basic well-formedness ones, and include:

+ * + *

Note that fragments of XML documents, as specified by an XPath + * node set, may be canonicalized. In such cases, elements may need + * some fixup (for xml:* attributes and application-specific + * context). + * + * @exception IllegalArgumentException if the output encoding + * is anything other than UTF-8. + */ + final public void setCanonical (boolean value) + { + if (value && !"UTF-8".equals (encoding)) + throw new IllegalArgumentException ("encoding != UTF-8"); + canonical = value; + if (canonical) { + prettyPrinting = xhtml = false; + expandingEntities = true; + eol = "\n"; + } + } + + + /** + * Returns value of flag controlling canonical output. + */ + final public boolean isCanonical () + { + return canonical; + } + + + /** + * Flushes the output stream. When this handler is used in long lived + * pipelines, it can be important to flush buffered state, for example + * so that it can reach the disk as part of a state checkpoint. + */ + final public void flush () + throws IOException + { + if (out != null) + out.flush (); + } + + + // convenience routines + +// FIXME: probably want a subclass that holds a lot of these... +// and maybe more! + + /** + * Writes the string as if characters() had been called on the contents + * of the string. This is particularly useful when applications act as + * producers and write data directly to event consumers. + */ + final public void write (String data) + throws SAXException + { + char buf [] = data.toCharArray (); + characters (buf, 0, buf.length); + } + + + /** + * Writes an element that has content consisting of a single string. 
+ * @see #writeEmptyElement + * @see #startElement + */ + public void writeElement ( + String uri, + String localName, + String qName, + Attributes atts, + String content + ) throws SAXException + { + if (content == null || content.length () == 0) { + writeEmptyElement (uri, localName, qName, atts); + return; + } + startElement (uri, localName, qName, atts); + char chars [] = content.toCharArray (); + characters (chars, 0, chars.length); + endElement (uri, localName, qName); + } + + + /** + * Writes an element that has content consisting of a single integer, + * encoded as a decimal string. + * @see #writeEmptyElement + * @see #startElement + */ + public void writeElement ( + String uri, + String localName, + String qName, + Attributes atts, + int content + ) throws SAXException + { + writeElement (uri, localName, qName, atts, Integer.toString (content)); + } + + + // SAX1 ContentHandler + /** SAX1: provides parser status information */ + final public void setDocumentLocator (Locator l) + { + locator = l; + } + + + // URL for dtd that validates against all normal HTML constructs + private static final String xhtmlFullDTD = + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"; + + + /** + * SAX1: indicates the beginning of a document parse. + * If you're writing (well formed) fragments of XML, neither + * this nor endDocument should be called. + */ + // NOT final + public void startDocument () + throws SAXException + { + try { + if (out == null) + throw new IllegalStateException ( + "null Writer given to XMLWriter"); + + // Not all parsers provide the locator we want; this also + // flags whether events are being sent to this object yet. + // We could only have this one call if we only printed whole + // documents ... but we also print fragments, so most of the + // callbacks here replicate this test. 
+ + if (locator == null) + locator = new LocatorImpl (); + + // Unless the data is in US-ASCII or we're canonicalizing, write + // the XML declaration if we know the encoding. US-ASCII won't + // normally get mangled by web server confusion about the + // character encodings used. Plus, it's an easy way to + // ensure we can write ASCII that's unlikely to confuse + // elderly HTML parsers. + + if (!canonical + && dangerMask != (short) 0xff80 + && encoding != null) { + rawWrite (""); + newline (); + } + + if (xhtml) { + + rawWrite (""); + newline (); + newline (); + + // fake the rest of the handler into ignoring + // everything until the root element, so any + // XHTML DTD comments, PIs, etc are ignored + startedDoctype = true; + } + + entityNestLevel = 0; + + } catch (IOException e) { + fatal ("can't write", e); + } + } + + /** + * SAX1: indicates the completion of a parse. + * Note that all complete SAX event streams make this call, even + * if an error is reported during a parse. + */ + // NOT final + public void endDocument () + throws SAXException + { + try { + if (!canonical) { + newline (); + newline (); + } + out.close (); + out = null; + locator = null; + } catch (IOException e) { + fatal ("can't write", e); + } + } + + // XHTML elements declared as EMPTY print differently + final private static boolean isEmptyElementTag (String tag) + { + switch (tag.charAt (0)) { + case 'a': return "area".equals (tag); + case 'b': return "base".equals (tag) + || "basefont".equals (tag) + || "br".equals (tag); + case 'c': return "col".equals (tag); + case 'f': return "frame".equals (tag); + case 'h': return "hr".equals (tag); + case 'i': return "img".equals (tag) + || "input".equals (tag) + || "isindex".equals (tag); + case 'l': return "link".equals (tag); + case 'm': return "meta".equals (tag); + case 'p': return "param".equals (tag); + } + return false; + } + + private static boolean indentBefore (String tag) + { + // basically indent before block content + // and 
within structure like tables, lists + switch (tag.charAt (0)) { + case 'a': return "applet".equals (tag); + case 'b': return "body".equals (tag) + || "blockquote".equals (tag); + case 'c': return "center".equals (tag); + case 'f': return "frame".equals (tag) + || "frameset".equals (tag); + case 'h': return "head".equals (tag); + case 'm': return "meta".equals (tag); + case 'o': return "object".equals (tag); + case 'p': return "param".equals (tag) + || "pre".equals (tag); + case 's': return "style".equals (tag); + case 't': return "title".equals (tag) + || "td".equals (tag) + || "th".equals (tag); + } + // ... but not inline elements like "em", "b", "font" + return false; + } + + private static boolean spaceBefore (String tag) + { + // blank line AND INDENT before certain structural content + switch (tag.charAt (0)) { + case 'h': return "h1".equals (tag) + || "h2".equals (tag) + || "h3".equals (tag) + || "h4".equals (tag) + || "h5".equals (tag) + || "h6".equals (tag) + || "hr".equals (tag); + case 'l': return "li".equals (tag); + case 'o': return "ol".equals (tag); + case 'p': return "p".equals (tag); + case 't': return "table".equals (tag) + || "tr".equals (tag); + case 'u': return "ul".equals (tag); + } + return false; + } + + // XHTML DTDs say these three have xml:space="preserve" + private static boolean spacePreserve (String tag) + { + return "pre".equals (tag) + || "style".equals (tag) + || "script".equals (tag); + } + + /** + * SAX2: ignored. + */ + final public void startPrefixMapping (String prefix, String uri) + {} + + /** + * SAX2: ignored. + */ + final public void endPrefixMapping (String prefix) + {} + + private void writeStartTag ( + String name, + Attributes atts, + boolean isEmpty + ) throws SAXException, IOException + { + rawWrite ('<'); + rawWrite (name); + + // write out attributes ... sorting is particularly useful + // with output that's been heavily defaulted. 
+ if (atts != null && atts.getLength () != 0) { + + // Set up to write, with optional sorting + int indices [] = new int [atts.getLength ()]; + + for (int i= 0; i < indices.length; i++) + indices [i] = i; + + // optionally sort + +// FIXME: canon xml demands xmlns nodes go first, +// and sorting by URI first (empty first) then localname +// it should maybe use a different sort + + if (canonical || prettyPrinting) { + + // insertion sort by attribute name + for (int i = 1; i < indices.length; i++) { + int n = indices [i], j; + String s = atts.getQName (n); + + for (j = i - 1; j >= 0; j--) { + if (s.compareTo (atts.getQName (indices [j])) + >= 0) + break; + indices [j + 1] = indices [j]; + } + indices [j + 1] = n; + } + } + + // write, sorted or no + for (int i= 0; i < indices.length; i++) { + String s = atts.getQName (indices [i]); + + if (s == null || "".equals (s)) + throw new IllegalArgumentException ("no XML name"); + rawWrite (" "); + rawWrite (s); + rawWrite ("="); + writeQuotedValue (atts.getValue (indices [i]), + CTX_ATTRIBUTE); + } + } + if (isEmpty) + rawWrite (" /"); + rawWrite ('>'); + } + + /** + * SAX2: indicates the start of an element. + * When XHTML is in use, avoid attribute values with + * line breaks or multiple whitespace characters, since + * not all user agents handle them correctly. 
+ */ + final public void startElement ( + String uri, + String localName, + String qName, + Attributes atts + ) throws SAXException + { + startedDoctype = false; + + if (locator == null) + locator = new LocatorImpl (); + + if (qName == null || "".equals (qName)) + throw new IllegalArgumentException ("no XML name"); + + try { + if (entityNestLevel != 0) + return; + if (prettyPrinting) { + String whitespace = null; + + if (xhtml && spacePreserve (qName)) + whitespace = "preserve"; + else if (atts != null) + whitespace = atts.getValue ("xml:space"); + if (whitespace == null) + whitespace = (String) space.peek (); + space.push (whitespace); + + if ("default".equals (whitespace)) { + if (xhtml) { + if (spaceBefore (qName)) { + newline (); + doIndent (); + } else if (indentBefore (qName)) + doIndent (); + // else it's inlined, modulo line length + // FIXME: incrementing element nest level + // for inlined elements causes ugliness + } else + doIndent (); + } + } + elementNestLevel++; + writeStartTag (qName, atts, xhtml && isEmptyElementTag (qName)); + + if (xhtml) { +// FIXME: if this is an XHTML "pre" element, turn +// off automatic wrapping. + } + + } catch (IOException e) { + fatal ("can't write", e); + } + } + + /** + * Writes an empty element. 
+ * @see #startElement + */ + public void writeEmptyElement ( + String uri, + String localName, + String qName, + Attributes atts + ) throws SAXException + { + if (canonical) { + startElement (uri, localName, qName, atts); + endElement (uri, localName, qName); + } else { + try { + writeStartTag (qName, atts, true); + } catch (IOException e) { + fatal ("can't write", e); + } + } + } + + + /** SAX2: indicates the end of an element */ + final public void endElement (String uri, String localName, String qName) + throws SAXException + { + if (qName == null || "".equals (qName)) + throw new IllegalArgumentException ("no XML name"); + + try { + elementNestLevel--; + if (entityNestLevel != 0) + return; + if (xhtml && isEmptyElementTag (qName)) + return; + rawWrite ("'); + + if (prettyPrinting) { + if (!space.empty ()) + space.pop (); + else + fatal ("stack discipline", null); + } + if (elementNestLevel == 0) + inEpilogue = true; + + } catch (IOException e) { + fatal ("can't write", e); + } + } + + /** SAX1: reports content characters */ + final public void characters (char ch [], int start, int length) + throws SAXException + { + if (locator == null) + locator = new LocatorImpl (); + + try { + if (entityNestLevel != 0) + return; + if (inCDATA) { + escapeChars (ch, start, length, CTX_UNPARSED); + } else { + escapeChars (ch, start, length, CTX_CONTENT); + } + } catch (IOException e) { + fatal ("can't write", e); + } + } + + /** SAX1: reports ignorable whitespace */ + final public void ignorableWhitespace (char ch [], int start, int length) + throws SAXException + { + if (locator == null) + locator = new LocatorImpl (); + + try { + if (entityNestLevel != 0) + return; + // don't forget to map NL to CRLF, CR, etc + escapeChars (ch, start, length, CTX_CONTENT); + } catch (IOException e) { + fatal ("can't write", e); + } + } + + /** + * SAX1: reports a PI. 
+ * This doesn't check for illegal target names, such as "xml" or "XML", + * or namespace-incompatible ones like "big:dog"; the caller is + * responsible for ensuring those names are legal. + */ + final public void processingInstruction (String target, String data) + throws SAXException + { + if (locator == null) + locator = new LocatorImpl (); + + // don't print internal subset for XHTML + if (xhtml && startedDoctype) + return; + + // ancient HTML browsers might render these ... their loss. + // to prevent: "if (xhtml) return;". + + try { + if (entityNestLevel != 0) + return; + if (canonical && inEpilogue) + newline (); + rawWrite (""); + if (elementNestLevel == 0 && !(canonical && inEpilogue)) + newline (); + } catch (IOException e) { + fatal ("can't write", e); + } + } + + /** SAX1: indicates a non-expanded entity reference */ + public void skippedEntity (String name) + throws SAXException + { + try { + rawWrite ("&"); + rawWrite (name); + rawWrite (";"); + } catch (IOException e) { + fatal ("can't write", e); + } + } + + // SAX2 LexicalHandler + + /** SAX2: called before parsing CDATA characters */ + final public void startCDATA () + throws SAXException + { + if (locator == null) + locator = new LocatorImpl (); + + if (canonical) + return; + + try { + inCDATA = true; + if (entityNestLevel == 0) + rawWrite ("SAX2: called after parsing CDATA characters */ + final public void endCDATA () + throws SAXException + { + if (canonical) + return; + + try { + inCDATA = false; + if (entityNestLevel == 0) + rawWrite ("]]>"); + } catch (IOException e) { + fatal ("can't write", e); + } + } + + /** + * SAX2: called when the doctype is partially parsed + * Note that this, like other doctype related calls, is ignored + * when XHTML is in use. 
+ */ + final public void startDTD (String name, String publicId, String systemId) + throws SAXException + { + if (locator == null) + locator = new LocatorImpl (); + if (xhtml) + return; + try { + inDoctype = startedDoctype = true; + if (canonical) + return; + rawWrite ("SAX2: called after the doctype is parsed */ + final public void endDTD () + throws SAXException + { + inDoctype = false; + if (canonical || xhtml) + return; + try { + rawWrite ("]>"); + newline (); + } catch (IOException e) { + fatal ("can't write", e); + } + } + + /** + * SAX2: called before parsing a general entity in content + */ + final public void startEntity (String name) + throws SAXException + { + try { + boolean writeEOL = true; + + // Predefined XHTML entities (for characters) will get + // mapped back later. + if (xhtml || expandingEntities) + return; + + entityNestLevel++; + if (name.equals ("[dtd]")) + return; + if (entityNestLevel != 1) + return; + if (!name.startsWith ("%")) { + writeEOL = false; + rawWrite ('&'); + } + rawWrite (name); + rawWrite (';'); + if (writeEOL) + newline (); + } catch (IOException e) { + fatal ("can't write", e); + } + } + + /** + * SAX2: called after parsing a general entity in content + */ + final public void endEntity (String name) + throws SAXException + { + if (xhtml || expandingEntities) + return; + entityNestLevel--; + } + + /** + * SAX2: called when comments are parsed. + * When XHTML is used, the old HTML tradition of using comments + * to for inline CSS, or for JavaScript code is discouraged. + * This is because XML processors are encouraged to discard, on + * the grounds that comments are for users (and perhaps text + * editors) not programs. 
* Instead, use external scripts
 */
public final void comment (char[] ch, int start, int length)
throws SAXException
{
    if (locator == null) {
        locator = new LocatorImpl ();
    }

    // don't print internal subset for XHTML
    if (xhtml && startedDoctype) {
        return;
    }
    // don't print comment in doctype for canon xml
    if (canonical && inDoctype) {
        return;
    }

    try {
        if (prettyPrinting && space.empty ()) {
            fatal ("stack discipline", null);
        }

        boolean indent = prettyPrinting && "default".equals (space.peek ());
        boolean canonicalEpilogue = canonical && inEpilogue;

        if (entityNestLevel != 0) {
            return;
        }
        if (indent) {
            doIndent ();
        }
        if (canonicalEpilogue) {
            newline ();
        }
        rawWrite ("<!--");
        escapeChars (ch, start, length, CTX_UNPARSED);
        rawWrite ("-->");
        if (indent) {
            doIndent ();
        }
        if (elementNestLevel == 0 && !canonicalEpilogue) {
            newline ();
        }
    } catch (IOException e) {
        fatal ("can't write", e);
    }
}

// SAX1 DTDHandler

/**
 * SAX1: called on notation declarations.
 * Written only after startDTD() has been seen, outside of unexpanded
 * entities, and never in XHTML mode.
 *
 * @param name notation name
 * @param publicId public identifier, or null
 * @param systemId system identifier, or null
 * @throws SAXException reported through fatal() when writing fails
 */
public final void notationDecl (String name,
    String publicId, String systemId)
throws SAXException
{
    if (xhtml) {
        return;
    }
    try {
        // At this time, only SAX2 callbacks start these.
        if (!startedDoctype) {
            return;
        }
        if (entityNestLevel != 0) {
            return;
        }
        rawWrite ("<!NOTATION " + name + " ");
        if (publicId != null) {
            rawWrite ("PUBLIC \"" + publicId + '"');
            if (systemId != null) {
                rawWrite (" \"" + systemId + '"');
            }
        } else {
            rawWrite ("SYSTEM ");
            if (systemId != null) {
                rawWrite ('"' + systemId + '"');
            }
        }
        rawWrite (">");
        newline ();
    } catch (IOException e) {
        fatal ("can't write", e);
    }
}

/**
 * SAX1: called on unparsed entity declarations.
 * Written only after startDTD() has been seen, outside of unexpanded
 * entities, and never in XHTML mode.
 *
 * @param name entity name
 * @param publicId public identifier, or null
 * @param systemId system identifier
 * @param notationName name of the associated notation
 * @throws SAXException reported through fatal() when writing fails
 */
public final void unparsedEntityDecl (String name,
    String publicId, String systemId,
    String notationName)
throws SAXException
{
    if (xhtml) {
        return;
    }
    try {
        // At this time, only SAX2 callbacks start these.
        if (!startedDoctype) {
            // FIXME: write to temporary buffer, and make the start
            // of the root element write these declarations.
            return;
        }
        if (entityNestLevel != 0) {
            return;
        }
        rawWrite ("<!ENTITY " + name + " ");
        if (publicId != null) {
            rawWrite ("PUBLIC \"" + publicId + "\" ");
        } else {
            rawWrite ("SYSTEM ");
        }
        rawWrite ('"' + systemId + '"');
        rawWrite (" NDATA " + notationName + ">");
        newline ();
    } catch (IOException e) {
        fatal ("can't write", e);
    }
}

// SAX2 DeclHandler

/**
 * SAX2: called on attribute declarations.
 * Written only after startDTD() has been seen, outside of unexpanded
 * entities, and never in XHTML mode.
 *
 * @param eName name of the element the attribute belongs to
 * @param aName attribute name
 * @param type attribute type, written as given
 * @param mode default mode, or null to omit it
 * @param value default value, or null to omit it
 * @throws SAXException reported through fatal() when writing fails
 */
public final void attributeDecl (String eName, String aName,
    String type, String mode, String value)
throws SAXException
{
    if (xhtml) {
        return;
    }
    try {
        // At this time, only SAX2 callbacks start these.
        if (!startedDoctype) {
            return;
        }
        if (entityNestLevel != 0) {
            return;
        }
        rawWrite ("<!ATTLIST " + eName + ' ' + aName + ' ');
        rawWrite (type);
        rawWrite (' ');
        if (mode != null) {
            rawWrite (mode + ' ');
        }
        if (value != null) {
            writeQuotedValue (value, CTX_ATTRIBUTE);
        }
        rawWrite ('>');
        newline ();
    } catch (IOException e) {
        fatal ("can't write", e);
    }
}

/**
 * SAX2: called on element declarations.
 * Written only after startDTD() has been seen, outside of unexpanded
 * entities, and never in XHTML mode.
 *
 * @param name element name
 * @param model content model, written as given
 * @throws SAXException reported through fatal() when writing fails
 */
public final void elementDecl (String name, String model)
throws SAXException
{
    if (xhtml) {
        return;
    }
    try {
        // At this time, only SAX2 callbacks start these.
        if (!startedDoctype) {
            return;
        }
        if (entityNestLevel != 0) {
            return;
        }
        rawWrite ("<!ELEMENT ");
        rawWrite (name);
        rawWrite (' ');
        rawWrite (model);
        rawWrite ('>');
        newline ();
    } catch (IOException e) {
        fatal ("can't write", e);
    }
}

/**
 * SAX2: called on external entity declarations.
 * Written only after startDTD() has been seen, outside of unexpanded
 * entities, and never in XHTML mode.
 *
 * @param name entity name; a leading "%" marks a parameter entity
 * @param publicId public identifier, or null
 * @param systemId system identifier
 * @throws SAXException reported through fatal() when writing fails
 */
public final void externalEntityDecl (
    String name,
    String publicId,
    String systemId)
throws SAXException
{
    if (xhtml) {
        return;
    }
    try {
        // At this time, only SAX2 callbacks start these.
        if (!startedDoctype) {
            return;
        }
        if (entityNestLevel != 0) {
            return;
        }
        rawWrite ("<!ENTITY ");
        writeDeclaredEntityName (name);
        if (publicId != null) {
            rawWrite (" PUBLIC \"" + publicId + "\" ");
        } else {
            rawWrite (" SYSTEM ");
        }
        rawWrite ('"' + systemId + "\">");
        newline ();
    } catch (IOException e) {
        fatal ("can't write", e);
    }
}

/**
 * SAX2: called on internal entity declarations.
 * Written only after startDTD() has been seen, outside of unexpanded
 * entities, and never in XHTML mode.
 *
 * @param name entity name; a leading "%" marks a parameter entity
 * @param value replacement text, written quoted in CTX_ENTITY context
 * @throws SAXException reported through fatal() when writing fails
 */
public final void internalEntityDecl (String name, String value)
throws SAXException
{
    if (xhtml) {
        return;
    }
    try {
        // At this time, only SAX2 callbacks start these.
        if (!startedDoctype) {
            return;
        }
        if (entityNestLevel != 0) {
            return;
        }
        rawWrite ("<!ENTITY ");
        writeDeclaredEntityName (name);
        rawWrite (' ');
        writeQuotedValue (value, CTX_ENTITY);
        rawWrite ('>');
        newline ();
    } catch (IOException e) {
        fatal ("can't write", e);
    }
}

// Writes the name part of an entity declaration: "% name" when the
// reported name starts with "%", the plain name otherwise.
private void writeDeclaredEntityName (String name)
throws SAXException, IOException
{
    if (name.startsWith ("%")) {
        rawWrite ("% ");
        rawWrite (name.substring (1));
    } else {
        rawWrite (name);
    }
}

// Writes value between double quotes, escaped for the given CTX_*
// context.  Line wrapping is switched off while the value is written
// and switched back on even if writing fails part way through.
private void writeQuotedValue (String value, int code)
throws SAXException, IOException
{
    char[] chars = value.toCharArray ();

    // we can't add line breaks to attribute/entity/... values
    noWrap = true;
    try {
        rawWrite ('"');
        escapeChars (chars, 0, chars.length, code);
        rawWrite ('"');
    } finally {
        noWrap = false;
    }
}

// From "HTMLlat1x.ent" ... names of entities for ISO-8859-1
// (Latin/1) characters, all codes:  160-255 (0xA0-0xFF).
// Codes 128-159 have no assigned values.
// Indexed by (character code - 160).
private static final String[] HTMLlat1x = {
    // 160
    "nbsp", "iexcl", "cent", "pound", "curren",
    "yen", "brvbar", "sect", "uml", "copy",

    // 170
    "ordf", "laquo", "not", "shy", "reg",
    "macr", "deg", "plusmn", "sup2", "sup3",

    // 180
    "acute", "micro", "para", "middot", "cedil",
    "sup1", "ordm", "raquo", "frac14", "frac12",

    // 190
    "frac34", "iquest", "Agrave", "Aacute", "Acirc",
    "Atilde", "Auml", "Aring", "AElig", "Ccedil",

    // 200
    "Egrave", "Eacute", "Ecirc", "Euml", "Igrave",
    "Iacute", "Icirc", "Iuml", "ETH", "Ntilde",

    // 210
    "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml",
    "times", "Oslash", "Ugrave", "Uacute", "Ucirc",

    // 220
    "Uuml", "Yacute", "THORN", "szlig", "agrave",
    "aacute", "acirc", "atilde", "auml", "aring",

    // 230
    "aelig", "ccedil", "egrave", "eacute", "ecirc",
    "euml", "igrave", "iacute", "icirc", "iuml",

    // 240
    "eth", "ntilde", "ograve", "oacute", "ocirc",
    "otilde", "ouml", "divide", "oslash", "ugrave",

    // 250
    "uacute", "ucirc", "uuml", "yacute", "thorn",
    "yuml"
};

// From "HTMLsymbolx.ent" ... some of the symbols that
// we can conveniently handle.  Entities for the Greek
// alphabet (upper and lower cases) are compact.
+ private static final String HTMLsymbolx_GR [] = { + // 913 + "Alpha", "Beta", "Gamma", "Delta", "Epsilon", + "Zeta", "Eta", "Theta", "Iota", "Kappa", + + // 923 + "Lambda", "Mu", "Nu", "Xi", "Omicron", + "Pi", "Rho", null, "Sigma", "Tau", + + // 933 + "Upsilon", "Phi", "Chi", "Psi", "Omega" + }; + + private static final String HTMLsymbolx_gr [] = { + // 945 + "alpha", "beta", "gamma", "delta", "epsilon", + "zeta", "eta", "theta", "iota", "kappa", + + // 955 + "lambda", "mu", "nu", "xi", "omicron", + "pi", "rho", "sigmaf", "sigma", "tau", + + // 965 + "upsilon", "phi", "chi", "psi", "omega" + }; + + + // General routine to write text and substitute predefined + // entities (XML, and a special case for XHTML) as needed. + private void escapeChars (char buf [], int off, int len, int code) + throws SAXException, IOException + { + int first = 0; + + if (off < 0) { + off = 0; + len = buf.length; + } + for (int i = 0; i < len; i++) { + String esc; + char c = buf [off + i]; + + switch (c) { + // Note that CTX_ATTRIBUTE isn't explicitly tested here; + // all syntax delimiters are escaped in CTX_ATTRIBUTE, + // otherwise it's similar to CTX_CONTENT + + // ampersand flags entity references; entity replacement + // text has unexpanded references, other text doesn't. 
+ case '&': + if (code == CTX_ENTITY || code == CTX_UNPARSED) + continue; + esc = "amp"; + break; + + // attributes and text may NOT have literal '<', but + // entities may have markup constructs + case '<': + if (code == CTX_ENTITY || code == CTX_UNPARSED) + continue; + esc = "lt"; + break; + + // as above re markup constructs; but otherwise + // except when canonicalizing, this is for consistency + case '>': + if (code == CTX_ENTITY || code == CTX_UNPARSED) + continue; + esc = "gt"; + break; + case '\'': + if (code == CTX_CONTENT || code == CTX_UNPARSED) + continue; + if (canonical) + continue; + esc = "apos"; + break; + + // needed when printing quoted attribute/entity values + case '"': + if (code == CTX_CONTENT || code == CTX_UNPARSED) + continue; + esc = "quot"; + break; + + // make line ends work per host OS convention + case '\n': + esc = eol; + break; + + // + // No other characters NEED special treatment ... except + // for encoding-specific issues, like whether the character + // can really be represented in that encoding. + // + default: + // + // There are characters we can never write safely; getting + // them is an error. + // + // (a) They're never legal in XML ... detected by range + // checks, and (eventually) by remerging surrogate + // pairs on output. (Easy error for apps to prevent.) + // + // (b) This encoding can't represent them, and we + // can't make reference substitution (e.g. inside + // CDATA sections, names, PI data, etc). (Hard for + // apps to prevent, except by using UTF-8 or UTF-16 + // as their output encoding.) + // + // We know a very little bit about what characters + // the US-ASCII and ISO-8859-1 encodings support. For + // other encodings we can't detect the second type of + // error at all. (Never an issue for UTF-8 or UTF-16.) 
+ // + +// FIXME: CR in CDATA is an error; in text, turn to a char ref + +// FIXME: CR/LF/TAB in attributes should become char refs + + if ((c > 0xfffd) + || ((c < 0x0020) && !((c == 0x0009) + || (c == 0x000A) || (c == 0x000D))) + || (((c & dangerMask) != 0) + && (code == CTX_UNPARSED))) { + + // if case (b) in CDATA, we might end the section, + // write a reference, then restart ... possible + // in one DOM L3 draft. + + throw new CharConversionException ( + "Illegal or non-writable character: U+" + + Integer.toHexString (c)); + } + + // + // If the output encoding represents the character + // directly, let it do so! Else we'll escape it. + // + if ((c & dangerMask) == 0) + continue; + esc = null; + + // Avoid numeric refs where symbolic ones exist, as + // symbolic ones make more sense to humans reading! + if (xhtml) { + // all the HTMLlat1x.ent entities + // (all the "ISO-8859-1" characters) + if (c >= 160 && c <= 255) + esc = HTMLlat1x [c - 160]; + + // not quite half the HTMLsymbolx.ent entities + else if (c >= 913 && c <= 937) + esc = HTMLsymbolx_GR [c - 913]; + else if (c >= 945 && c <= 969) + esc = HTMLsymbolx_gr [c - 945]; + + else switch (c) { + // all of the HTMLspecialx.ent entities + case 338: esc = "OElig"; break; + case 339: esc = "oelig"; break; + case 352: esc = "Scaron"; break; + case 353: esc = "scaron"; break; + case 376: esc = "Yuml"; break; + case 710: esc = "circ"; break; + case 732: esc = "tilde"; break; + case 8194: esc = "ensp"; break; + case 8195: esc = "emsp"; break; + case 8201: esc = "thinsp"; break; + case 8204: esc = "zwnj"; break; + case 8205: esc = "zwj"; break; + case 8206: esc = "lrm"; break; + case 8207: esc = "rlm"; break; + case 8211: esc = "ndash"; break; + case 8212: esc = "mdash"; break; + case 8216: esc = "lsquo"; break; + case 8217: esc = "rsquo"; break; + case 8218: esc = "sbquo"; break; + case 8220: esc = "ldquo"; break; + case 8221: esc = "rdquo"; break; + case 8222: esc = "bdquo"; break; + case 8224: esc = "dagger"; 
break; + case 8225: esc = "Dagger"; break; + case 8240: esc = "permil"; break; + case 8249: esc = "lsaquo"; break; + case 8250: esc = "rsaquo"; break; + case 8364: esc = "euro"; break; + + // the other HTMLsymbox.ent entities + case 402: esc = "fnof"; break; + case 977: esc = "thetasym"; break; + case 978: esc = "upsih"; break; + case 982: esc = "piv"; break; + case 8226: esc = "bull"; break; + case 8230: esc = "hellip"; break; + case 8242: esc = "prime"; break; + case 8243: esc = "Prime"; break; + case 8254: esc = "oline"; break; + case 8260: esc = "frasl"; break; + case 8472: esc = "weierp"; break; + case 8465: esc = "image"; break; + case 8476: esc = "real"; break; + case 8482: esc = "trade"; break; + case 8501: esc = "alefsym"; break; + case 8592: esc = "larr"; break; + case 8593: esc = "uarr"; break; + case 8594: esc = "rarr"; break; + case 8595: esc = "darr"; break; + case 8596: esc = "harr"; break; + case 8629: esc = "crarr"; break; + case 8656: esc = "lArr"; break; + case 8657: esc = "uArr"; break; + case 8658: esc = "rArr"; break; + case 8659: esc = "dArr"; break; + case 8660: esc = "hArr"; break; + case 8704: esc = "forall"; break; + case 8706: esc = "part"; break; + case 8707: esc = "exist"; break; + case 8709: esc = "empty"; break; + case 8711: esc = "nabla"; break; + case 8712: esc = "isin"; break; + case 8713: esc = "notin"; break; + case 8715: esc = "ni"; break; + case 8719: esc = "prod"; break; + case 8721: esc = "sum"; break; + case 8722: esc = "minus"; break; + case 8727: esc = "lowast"; break; + case 8730: esc = "radic"; break; + case 8733: esc = "prop"; break; + case 8734: esc = "infin"; break; + case 8736: esc = "ang"; break; + case 8743: esc = "and"; break; + case 8744: esc = "or"; break; + case 8745: esc = "cap"; break; + case 8746: esc = "cup"; break; + case 8747: esc = "int"; break; + case 8756: esc = "there4"; break; + case 8764: esc = "sim"; break; + case 8773: esc = "cong"; break; + case 8776: esc = "asymp"; break; + case 8800: esc = 
"ne"; break; + case 8801: esc = "equiv"; break; + case 8804: esc = "le"; break; + case 8805: esc = "ge"; break; + case 8834: esc = "sub"; break; + case 8835: esc = "sup"; break; + case 8836: esc = "nsub"; break; + case 8838: esc = "sube"; break; + case 8839: esc = "supe"; break; + case 8853: esc = "oplus"; break; + case 8855: esc = "otimes"; break; + case 8869: esc = "perp"; break; + case 8901: esc = "sdot"; break; + case 8968: esc = "lceil"; break; + case 8969: esc = "rceil"; break; + case 8970: esc = "lfloor"; break; + case 8971: esc = "rfloor"; break; + case 9001: esc = "lang"; break; + case 9002: esc = "rang"; break; + case 9674: esc = "loz"; break; + case 9824: esc = "spades"; break; + case 9827: esc = "clubs"; break; + case 9829: esc = "hearts"; break; + case 9830: esc = "diams"; break; + } + } + + // else escape with numeric char refs + if (esc == null) { + stringBuf.setLength (0); + stringBuf.append ("#x"); + stringBuf.append (Integer.toHexString (c).toUpperCase ()); + esc = stringBuf.toString (); + + // FIXME: We don't write surrogate pairs correctly. + // They should work as one ref per character, since + // each pair is one character. For reading back into + // Unicode, it matters beginning in Unicode 3.1 ... 
+ } + break; + } + if (i != first) + rawWrite (buf, off + first, i - first); + first = i + 1; + if (esc == eol) + newline (); + else { + rawWrite ('&'); + rawWrite (esc); + rawWrite (';'); + } + } + if (first < len) + rawWrite (buf, off + first, len - first); + } + + + + private void newline () + throws SAXException, IOException + { + out.write (eol); + column = 0; + } + + private void doIndent () + throws SAXException, IOException + { + int space = elementNestLevel * 2; + + newline (); + column = space; + // track tabs only at line starts + while (space > 8) { + out.write ("\t"); + space -= 8; + } + while (space > 0) { + out.write (" "); + space -= 2; + } + } + + private void rawWrite (char c) + throws IOException + { + out.write (c); + column++; + } + + private void rawWrite (String s) + throws SAXException, IOException + { + if (prettyPrinting && "default".equals (space.peek ())) { + char data [] = s.toCharArray (); + rawWrite (data, 0, data.length); + } else { + out.write (s); + column += s.length (); + } + } + + // NOTE: if xhtml, the REC gives some rules about whitespace + // which we could follow ... notably, many places where conformant + // agents "must" consolidate/normalize whitespace. Line ends can + // be removed there, etc. This may not be the right place to do + // such mappings though. + + // Line buffering may help clarify algorithms and improve results. + + // It's likely xml:space needs more attention. + + private void rawWrite (char buf [], int offset, int length) + throws SAXException, IOException + { + boolean wrap; + + if (prettyPrinting && space.empty ()) + fatal ("stack discipline", null); + + wrap = prettyPrinting && "default".equals (space.peek ()); + if (!wrap) { + out.write (buf, offset, length); + column += length; + return; + } + + // we're pretty printing and want to fill lines out only + // to the desired line length. 
+ while (length > 0) { + int target = lineLength - column; + boolean wrote = false; + + // Do we even have a problem? + if (target > length || noWrap) { + out.write (buf, offset, length); + column += length; + return; + } + + // break the line at a space character, trying to fill + // as much of the line as possible. + char c; + + for (int i = target - 1; i >= 0; i--) { + if ((c = buf [offset + i]) == ' ' || c == '\t') { + i++; + out.write (buf, offset, i); + doIndent (); + offset += i; + length -= i; + wrote = true; + break; + } + } + if (wrote) + continue; + + // no space character permitting break before target + // line length is filled. So, take the next one. + if (target < 0) + target = 0; + for (int i = target; i < length; i++) + if ((c = buf [offset + i]) == ' ' || c == '\t') { + i++; + out.write (buf, offset, i); + doIndent (); + offset += i; + length -= i; + wrote = true; + break; + } + if (wrote) + continue; + + // no such luck. + out.write (buf, offset, length); + column += length; + break; + } + } +} -- cgit v1.2.3