/* XMLWriter.java -- Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. This file is part of GNU Classpath. GNU Classpath is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. GNU Classpath is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GNU Classpath; see the file COPYING. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Linking this library statically or dynamically with other modules is making a combined work based on this library. Thus, the terms and conditions of the GNU General Public License cover the whole combination. As a special exception, the copyright holders of this library give you permission to link this library with independent modules to produce an executable, regardless of the license terms of these independent modules, and to copy and distribute the resulting executable under terms of your choice, provided that you also meet, for each linked independent module, the terms and conditions of the license of that module. An independent module is a module which is not derived from or based on this library. If you modify this library, you may extend this exception to your version of the library, but you are not obligated to do so. If you do not wish to do so, delete this exception statement from your version. 
*/ package gnu.xml.util; import gnu.java.lang.CPStringBuilder; import java.io.BufferedWriter; import java.io.CharConversionException; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.Stack; import org.xml.sax.*; import org.xml.sax.ext.*; import org.xml.sax.helpers.*; /** * This class is a SAX handler which writes all its input as a well formed * XML or XHTML document. If driven using SAX2 events, this output may * include a recreated document type declaration, subject to limitations * of SAX (no internal subset exposed) or DOM (the important declarations, * with their documentation, are discarded). * *

By default, text is generated "as-is", but some optional modes * are supported. Pretty-printing is supported, to make life easier * for people reading the output. XHTML (1.0) output can be made * particularly pretty; all the built-in character entities are known. * Canonical XML can also be generated, assuming the input is properly * formed. * *

* *

Some of the methods on this class are intended for applications to * use directly, rather than as pure SAX2 event callbacks. Some of those * methods access the JavaBeans properties (used to tweak output formats, * for example canonicalization and pretty printing). Subclasses * are expected to add new behaviors, not to modify current behavior, so * many such methods are final.

* *

The write*() methods may be slightly simpler for some * applications to use than direct callbacks. For example, they support * a simple policy for encoding data items as the content of a single element. * *

To reuse an XMLWriter you must provide it with a new Writer, since * this handler closes the writer it was given as part of its endDocument() * handling. (XML documents have an end of input, and the way to encode * that on a stream is to close it.)

* *

Note that any relative URIs in the source document, as found in * entity and notation declarations, ought to have been fully resolved by * the parser providing events to this handler. This means that the * output text should only have fully resolved URIs, which may not be * the desired behavior in cases where later binding is desired.

* *

Note that due to SAX2 defaults, you may need to manually * ensure that the input events are XML-conformant with respect to namespace * prefixes and declarations. {@link gnu.xml.pipeline.NSFilter} is * one solution to this problem, in the context of processing pipelines. * Something as simple as connecting this handler to a parser might not * generate the correct output. Another workaround is to ensure that the * namespace-prefixes feature is always set to true, if you're * hooking this directly up to some XMLReader implementation. * * @see gnu.xml.pipeline.TextConsumer * * @author David Brownell * * @deprecated Please use the javax.xml.stream APIs instead */ public class XMLWriter implements ContentHandler, LexicalHandler, DTDHandler, DeclHandler { // text prints/escapes differently depending on context // CTX_ENTITY ... entity literal value // CTX_ATTRIBUTE ... attribute literal value // CTX_CONTENT ... content of an element // CTX_UNPARSED ... CDATA, comment, PI, names, etc // CTX_NAME ... name or nmtoken, no escapes possible private static final int CTX_ENTITY = 1; private static final int CTX_ATTRIBUTE = 2; private static final int CTX_CONTENT = 3; private static final int CTX_UNPARSED = 4; private static final int CTX_NAME = 5; // FIXME: names (element, attribute, PI, notation, etc) are not // currently written out with range checks (escapeChars). // In non-XHTML, some names can't be directly written; panic! private static String sysEOL; static { try { sysEOL = System.getProperty ("line.separator", "\n"); // don't use the system's EOL if it's illegal XML. 
if (!isLineEnd (sysEOL)) sysEOL = "\n"; } catch (SecurityException e) { sysEOL = "\n"; } } private static boolean isLineEnd (String eol) { return "\n".equals (eol) || "\r".equals (eol) || "\r\n".equals (eol); } private Writer out; private boolean inCDATA; private int elementNestLevel; private String eol = sysEOL; private short dangerMask; private CPStringBuilder stringBuf; private Locator locator; private ErrorHandler errHandler; private boolean expandingEntities = false; private int entityNestLevel; private boolean xhtml; private boolean startedDoctype; private String encoding; private boolean canonical; private boolean inDoctype; private boolean inEpilogue; // pretty printing controls private boolean prettyPrinting; private int column; private boolean noWrap; private Stack space = new Stack (); // this is not a hard'n'fast rule -- longer lines are OK, // but are to be avoided. Here, prettyprinting is more to // show structure "cleanly" than to be precise about it. // better to have ragged layout than one line 24Kb long. private static final int lineLength = 75; /** * Constructs this handler with System.out used to write SAX events * using the UTF-8 encoding. Avoid using this except when you know * it's safe to close System.out at the end of the document. */ public XMLWriter () throws IOException { this (System.out); } /** * Constructs a handler which writes all input to the output stream * in the UTF-8 encoding, and closes it when endDocument is called. * (Yes it's annoying that this throws an exception -- but there's * really no way around it, since it's barely possible a JDK may * exist somewhere that doesn't know how to emit UTF-8.) */ public XMLWriter (OutputStream out) throws IOException { this (new OutputStreamWriter (out, "UTF8")); } /** * Constructs a handler which writes all input to the writer, and then * closes the writer when the document ends. 
If an XML declaration is * written onto the output, and this class can determine the name of * the character encoding for this writer, that encoding name will be * included in the XML declaration. * *

See the description of the constructor which takes an encoding
 * name for important information about selection of encodings.
 *
 * @param writer XML text is written to this writer.
 */
public XMLWriter (Writer writer) { this (writer, null); }

/**
 * Constructs a handler which writes all input to the writer, and then
 * closes the writer when the document ends.  If an XML declaration is
 * written onto the output, this class will use the specified encoding
 * name in that declaration.  If no encoding name is specified, no
 * encoding name will be declared unless this class can otherwise
 * determine the name of the character encoding for this writer.
 *
 *

At this time, only the UTF-8 ("UTF8") and UTF-16 ("Unicode") * output encodings are fully lossless with respect to XML data. If you * use any other encoding you risk having your data be silently mangled * on output, as the standard Java character encoding subsystem silently * maps non-encodable characters to a question mark ("?") and will not * report such errors to applications. * *

For a few other encodings the risk can be reduced. If the writer is * a java.io.OutputStreamWriter, and uses either the ISO-8859-1 ("8859_1", * "ISO8859_1", etc) or US-ASCII ("ASCII") encodings, content which * can't be encoded in those encodings will be written safely. Where * relevant, the XHTML entity names will be used; otherwise, numeric * character references will be emitted. * *

However, there remain a number of cases where substituting such * entity or character references is not an option. Such references are * not usable within a DTD, comment, PI, or CDATA section. Neither may * they be used when element, attribute, entity, or notation names have * the problematic characters. * * @param writer XML text is written to this writer. * @param encoding if non-null, and an XML declaration is written, * this is the name that will be used for the character encoding. */ public XMLWriter (Writer writer, String encoding) { setWriter (writer, encoding); } private void setEncoding (String encoding) { if (encoding == null && out instanceof OutputStreamWriter) encoding = ((OutputStreamWriter)out).getEncoding (); if (encoding != null) { encoding = encoding.toUpperCase (); // Use official encoding names where we know them, // avoiding the Java-only names. When using common // encodings where we can easily tell if characters // are out of range, we'll escape out-of-range // characters using character refs for safety. // I _think_ these are all the main synonyms for these! if ("UTF8".equals (encoding)) { encoding = "UTF-8"; } else if ("US-ASCII".equals (encoding) || "ASCII".equals (encoding)) { dangerMask = (short) 0xff80; encoding = "US-ASCII"; } else if ("ISO-8859-1".equals (encoding) || "8859_1".equals (encoding) || "ISO8859_1".equals (encoding)) { dangerMask = (short) 0xff00; encoding = "ISO-8859-1"; } else if ("UNICODE".equals (encoding) || "UNICODE-BIG".equals (encoding) || "UNICODE-LITTLE".equals (encoding)) { encoding = "UTF-16"; // TODO: UTF-16BE, UTF-16LE ... no BOM; what // release of JDK supports those Unicode names? } if (dangerMask != 0) stringBuf = new CPStringBuilder (); } this.encoding = encoding; } /** * Resets the handler to write a new text document. * * @param writer XML text is written to this writer. * @param encoding if non-null, and an XML declaration is written, * this is the name that will be used for the character encoding. 
* * @exception IllegalStateException if the current * document hasn't yet ended (with {@link #endDocument}) */ final public void setWriter (Writer writer, String encoding) { if (out != null) throw new IllegalStateException ( "can't change stream in mid course"); out = writer; if (out != null) setEncoding (encoding); if (!(out instanceof BufferedWriter)) out = new BufferedWriter (out); space.push ("default"); } /** * Assigns the line ending style to be used on output. * @param eolString null to use the system default; else * "\n", "\r", or "\r\n". */ final public void setEOL (String eolString) { if (eolString == null) eol = sysEOL; else if (!isLineEnd (eolString)) eol = eolString; else throw new IllegalArgumentException (eolString); } /** * Assigns the error handler to be used to present most fatal * errors. */ public void setErrorHandler (ErrorHandler handler) { errHandler = handler; } /** * Used internally and by subclasses, this encapsulates the logic * involved in reporting fatal errors. It uses locator information * for good diagnostics, if available, and gives the application's * ErrorHandler the opportunity to handle the error before throwing * an exception. */ protected void fatal (String message, Exception e) throws SAXException { SAXParseException x; if (locator == null) x = new SAXParseException (message, null, null, -1, -1, e); else x = new SAXParseException (message, locator, e); if (errHandler != null) errHandler.fatalError (x); throw x; } // JavaBeans properties /** * Controls whether the output should attempt to follow the "transitional" * XHTML rules so that it meets the "HTML Compatibility Guidelines" * appendix in the XHTML specification. A "transitional" Document Type * Declaration (DTD) is placed near the beginning of the output document, * instead of whatever DTD would otherwise have been placed there, and * XHTML empty elements are printed specially. 
When writing text in * US-ASCII or ISO-8859-1 encodings, the predefined XHTML internal * entity names are used (in preference to character references) when * writing content characters which can't be expressed in those encodings. * *

When this option is enabled, it is the caller's responsibility * to ensure that the input is otherwise valid as XHTML. Things to * be careful of in all cases, as described in the appendix referenced * above, include:

Element and attribute names must be in lower case, both * in the document and in any CSS style sheet. *
All XML constructs must be valid as defined by the XHTML * "transitional" DTD (including all familiar constructs, * even deprecated ones). *
The root element must be "html". *
Elements that must be empty (such as <br>) * must have no content. *
Use both lang and xml:lang attributes * when specifying language. *
Similarly, use both id and name attributes * when defining elements that may be referred to through * URI fragment identifiers ... and make sure that the * value is a legal NMTOKEN, since not all such HTML 4.0 * identifiers are valid in XML. *
Be careful with character encodings; make sure you provide * a <meta http-equiv="Content-type" * content="text/xml;charset=..." /> element in * the HTML "head" element, naming the same encoding * used to create this handler. Also, if that encoding * is anything other than US-ASCII, make sure that if * the document is given a MIME content type, it has * a charset=... attribute with that encoding. *

* *

Additionally, some of the oldest browsers have additional * quirks, to address with guidelines such as:

Processing instructions may be rendered, so avoid them. * (Similarly for an XML declaration.) *
Embedded style sheets and scripts should not contain XML * markup delimiters: &, <, and ]]> are trouble. *
Attribute values should not have line breaks or multiple * consecutive white space characters. *
Use no more than one of the deprecated (transitional) * <isindex> elements. *
Some boolean attributes (such as compact, checked, * disabled, readonly, selected, and more) confuse * some browsers, since they only understand minimized * versions which are illegal in XML. *

* *

Also, some characteristics of the resulting output may be
 * a function of whether the document is later given a MIME
 * content type of text/html rather than one indicating
 * XML (application/xml or text/xml).  Worse,
 * some browsers ignore MIME content types and prefer to rely on URI
 * name suffixes -- so an "index.xml" could always be XML, never
 * XHTML, no matter its MIME type.
 *
 * @exception IllegalStateException if events have already started
 *  arriving at this handler
 */
    final public void setXhtml (boolean value)
    {
        if (locator != null) {
            throw new IllegalStateException ("started parsing");
        }
        xhtml = value;
        // XHTML output and canonical output exclude each other
        canonical = canonical && !value;
    }

    /**
     * Returns true if the output attempts to echo the input following
     * "transitional" XHTML rules and matching the "HTML Compatibility
     * Guidelines" so that an HTML version 3 browser can read the output
     * as HTML; returns false (the default) otherwise.
     */
    final public boolean isXhtml ()
    {
        return xhtml;
    }

    /**
     * Controls whether the output text contains references to
     * entities (the default), or instead contains the expanded
     * values of those entities.
     *
     * @exception IllegalStateException if events have already started
     *  arriving at this handler
     */
    final public void setExpandingEntities (boolean value)
    {
        if (locator != null) {
            throw new IllegalStateException ("started parsing");
        }
        expandingEntities = value;
        // canonical output needs every entity expanded
        canonical = canonical && value;
    }

    /**
     * Returns true if the output will have no entity references;
     * returns false (the default) otherwise.
     */
    final public boolean isExpandingEntities ()
    {
        return expandingEntities;
    }

    /**
     * Controls pretty-printing, which by default is not enabled
     * (and currently is most useful for XHTML output).
     * Pretty printing enables structural indentation, sorting of attributes
     * by name, line wrapping, and potentially other mechanisms for making
     * output more or less readable.
     *
     *

At this writing, structural indentation and line wrapping are * enabled when pretty printing is enabled and the xml:space * attribute has the value default (its other legal value is * preserve, as defined in the XML specification). The three * XHTML element types which use another value are recognized by their * names (namespaces are ignored). * *

Also, for the record, the "pretty" aspect of printing here
 * is more to provide basic structure on outputs that would otherwise
 * risk being a single long line of text.  For now, expect the
 * structure to be ragged ... unless you'd like to submit a patch
 * to make this be more strictly formatted!
 *
 * @exception IllegalStateException thrown if this method is invoked
 *  after output has begun.
 */
    final public void setPrettyPrinting (boolean value)
    {
        if (locator != null) {
            throw new IllegalStateException ("started parsing");
        }
        prettyPrinting = value;
        // pretty printing and canonical output exclude each other
        canonical = canonical && !value;
    }

    /**
     * Returns value of flag controlling pretty printing.
     */
    final public boolean isPrettyPrinting ()
    {
        return prettyPrinting;
    }

    /**
     * Sets the output style to be canonicalized.  Input events must
     * meet requirements that are slightly more stringent than the
     * basic well-formedness ones, and include:

Namespace prefixes must not have been changed from those * in the original document. (This may only be ensured by setting * the SAX2 XMLReader namespace-prefixes feature flag; * by default, it is cleared.) * *
Redundant namespace declaration attributes have been * removed. (If an ancestor element defines a namespace prefix * and that declaration hasn't been overridden, an element must * not redeclare it.) * *
If comments are not to be included in the canonical output, * they must first be removed from the input event stream; this * is Canonical XML with comments by default. * *
If the input character encoding was not UCS-based, the * character data must have been normalized using Unicode * Normalization Form C. (UTF-8 and UTF-16 are UCS-based.) * *
Attribute values must have been normalized, as is done * by any conformant XML processor which processes all external * parameter entities. * *
Similarly, attribute value defaulting has been performed. * *

* *

Note that fragments of XML documents, as specified by an XPath * node set, may be canonicalized. In such cases, elements may need * some fixup (for xml:* attributes and application-specific * context). * * @exception IllegalArgumentException if the output encoding * is anything other than UTF-8. */ final public void setCanonical (boolean value) { if (value && !"UTF-8".equals (encoding)) throw new IllegalArgumentException ("encoding != UTF-8"); canonical = value; if (canonical) { prettyPrinting = xhtml = false; expandingEntities = true; eol = "\n"; } } /** * Returns value of flag controlling canonical output. */ final public boolean isCanonical () { return canonical; } /** * Flushes the output stream. When this handler is used in long lived * pipelines, it can be important to flush buffered state, for example * so that it can reach the disk as part of a state checkpoint. */ final public void flush () throws IOException { if (out != null) out.flush (); } // convenience routines // FIXME: probably want a subclass that holds a lot of these... // and maybe more! /** * Writes the string as if characters() had been called on the contents * of the string. This is particularly useful when applications act as * producers and write data directly to event consumers. */ final public void write (String data) throws SAXException { char buf [] = data.toCharArray (); characters (buf, 0, buf.length); } /** * Writes an element that has content consisting of a single string. 
* @see #writeEmptyElement * @see #startElement */ public void writeElement ( String uri, String localName, String qName, Attributes atts, String content ) throws SAXException { if (content == null || content.length () == 0) { writeEmptyElement (uri, localName, qName, atts); return; } startElement (uri, localName, qName, atts); char chars [] = content.toCharArray (); characters (chars, 0, chars.length); endElement (uri, localName, qName); } /** * Writes an element that has content consisting of a single integer, * encoded as a decimal string. * @see #writeEmptyElement * @see #startElement */ public void writeElement ( String uri, String localName, String qName, Attributes atts, int content ) throws SAXException { writeElement (uri, localName, qName, atts, Integer.toString (content)); } // SAX1 ContentHandler /** SAX1: provides parser status information */ final public void setDocumentLocator (Locator l) { locator = l; } // URL for dtd that validates against all normal HTML constructs private static final String xhtmlFullDTD = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"; /** * SAX1: indicates the beginning of a document parse. * If you're writing (well formed) fragments of XML, neither * this nor endDocument should be called. */ // NOT final public void startDocument () throws SAXException { try { if (out == null) throw new IllegalStateException ( "null Writer given to XMLWriter"); // Not all parsers provide the locator we want; this also // flags whether events are being sent to this object yet. // We could only have this one call if we only printed whole // documents ... but we also print fragments, so most of the // callbacks here replicate this test. if (locator == null) locator = new LocatorImpl (); // Unless the data is in US-ASCII or we're canonicalizing, write // the XML declaration if we know the encoding. US-ASCII won't // normally get mangled by web server confusion about the // character encodings used. 
Plus, it's an easy way to // ensure we can write ASCII that's unlikely to confuse // elderly HTML parsers. if (!canonical && dangerMask != (short) 0xff80 && encoding != null) { rawWrite (""); newline (); } if (xhtml) { rawWrite (""); newline (); newline (); // fake the rest of the handler into ignoring // everything until the root element, so any // XHTML DTD comments, PIs, etc are ignored startedDoctype = true; } entityNestLevel = 0; } catch (IOException e) { fatal ("can't write", e); } } /** * SAX1: indicates the completion of a parse. * Note that all complete SAX event streams make this call, even * if an error is reported during a parse. */ // NOT final public void endDocument () throws SAXException { try { if (!canonical) { newline (); newline (); } out.close (); out = null; locator = null; } catch (IOException e) { fatal ("can't write", e); } } // XHTML elements declared as EMPTY print differently final private static boolean isEmptyElementTag (String tag) { switch (tag.charAt (0)) { case 'a': return "area".equals (tag); case 'b': return "base".equals (tag) || "basefont".equals (tag) || "br".equals (tag); case 'c': return "col".equals (tag); case 'f': return "frame".equals (tag); case 'h': return "hr".equals (tag); case 'i': return "img".equals (tag) || "input".equals (tag) || "isindex".equals (tag); case 'l': return "link".equals (tag); case 'm': return "meta".equals (tag); case 'p': return "param".equals (tag); } return false; } private static boolean indentBefore (String tag) { // basically indent before block content // and within structure like tables, lists switch (tag.charAt (0)) { case 'a': return "applet".equals (tag); case 'b': return "body".equals (tag) || "blockquote".equals (tag); case 'c': return "center".equals (tag); case 'f': return "frame".equals (tag) || "frameset".equals (tag); case 'h': return "head".equals (tag); case 'm': return "meta".equals (tag); case 'o': return "object".equals (tag); case 'p': return "param".equals (tag) || 
"pre".equals (tag); case 's': return "style".equals (tag); case 't': return "title".equals (tag) || "td".equals (tag) || "th".equals (tag); } // ... but not inline elements like "em", "b", "font" return false; } private static boolean spaceBefore (String tag) { // blank line AND INDENT before certain structural content switch (tag.charAt (0)) { case 'h': return "h1".equals (tag) || "h2".equals (tag) || "h3".equals (tag) || "h4".equals (tag) || "h5".equals (tag) || "h6".equals (tag) || "hr".equals (tag); case 'l': return "li".equals (tag); case 'o': return "ol".equals (tag); case 'p': return "p".equals (tag); case 't': return "table".equals (tag) || "tr".equals (tag); case 'u': return "ul".equals (tag); } return false; } // XHTML DTDs say these three have xml:space="preserve" private static boolean spacePreserve (String tag) { return "pre".equals (tag) || "style".equals (tag) || "script".equals (tag); } /** * SAX2: ignored. */ final public void startPrefixMapping (String prefix, String uri) {} /** * SAX2: ignored. */ final public void endPrefixMapping (String prefix) {} private void writeStartTag ( String name, Attributes atts, boolean isEmpty ) throws SAXException, IOException { rawWrite ('<'); rawWrite (name); // write out attributes ... sorting is particularly useful // with output that's been heavily defaulted. 
if (atts != null && atts.getLength () != 0) { // Set up to write, with optional sorting int indices [] = new int [atts.getLength ()]; for (int i= 0; i < indices.length; i++) indices [i] = i; // optionally sort // FIXME: canon xml demands xmlns nodes go first, // and sorting by URI first (empty first) then localname // it should maybe use a different sort if (canonical || prettyPrinting) { // insertion sort by attribute name for (int i = 1; i < indices.length; i++) { int n = indices [i], j; String s = atts.getQName (n); for (j = i - 1; j >= 0; j--) { if (s.compareTo (atts.getQName (indices [j])) >= 0) break; indices [j + 1] = indices [j]; } indices [j + 1] = n; } } // write, sorted or no for (int i= 0; i < indices.length; i++) { String s = atts.getQName (indices [i]); if (s == null || "".equals (s)) throw new IllegalArgumentException ("no XML name"); rawWrite (" "); rawWrite (s); rawWrite ("="); writeQuotedValue (atts.getValue (indices [i]), CTX_ATTRIBUTE); } } if (isEmpty) rawWrite (" /"); rawWrite ('>'); } /** * SAX2: indicates the start of an element. * When XHTML is in use, avoid attribute values with * line breaks or multiple whitespace characters, since * not all user agents handle them correctly. 
*/ final public void startElement ( String uri, String localName, String qName, Attributes atts ) throws SAXException { startedDoctype = false; if (locator == null) locator = new LocatorImpl (); if (qName == null || "".equals (qName)) throw new IllegalArgumentException ("no XML name"); try { if (entityNestLevel != 0) return; if (prettyPrinting) { String whitespace = null; if (xhtml && spacePreserve (qName)) whitespace = "preserve"; else if (atts != null) whitespace = atts.getValue ("xml:space"); if (whitespace == null) whitespace = (String) space.peek (); space.push (whitespace); if ("default".equals (whitespace)) { if (xhtml) { if (spaceBefore (qName)) { newline (); doIndent (); } else if (indentBefore (qName)) doIndent (); // else it's inlined, modulo line length // FIXME: incrementing element nest level // for inlined elements causes ugliness } else doIndent (); } } elementNestLevel++; writeStartTag (qName, atts, xhtml && isEmptyElementTag (qName)); if (xhtml) { // FIXME: if this is an XHTML "pre" element, turn // off automatic wrapping. } } catch (IOException e) { fatal ("can't write", e); } } /** * Writes an empty element. 
* @see #startElement */ public void writeEmptyElement ( String uri, String localName, String qName, Attributes atts ) throws SAXException { if (canonical) { startElement (uri, localName, qName, atts); endElement (uri, localName, qName); } else { try { writeStartTag (qName, atts, true); } catch (IOException e) { fatal ("can't write", e); } } } /** SAX2: indicates the end of an element */ final public void endElement (String uri, String localName, String qName) throws SAXException { if (qName == null || "".equals (qName)) throw new IllegalArgumentException ("no XML name"); try { elementNestLevel--; if (entityNestLevel != 0) return; if (xhtml && isEmptyElementTag (qName)) return; rawWrite ("'); if (prettyPrinting) { if (!space.empty ()) space.pop (); else fatal ("stack discipline", null); } if (elementNestLevel == 0) inEpilogue = true; } catch (IOException e) { fatal ("can't write", e); } } /** SAX1: reports content characters */ final public void characters (char ch [], int start, int length) throws SAXException { if (locator == null) locator = new LocatorImpl (); try { if (entityNestLevel != 0) return; if (inCDATA) { escapeChars (ch, start, length, CTX_UNPARSED); } else { escapeChars (ch, start, length, CTX_CONTENT); } } catch (IOException e) { fatal ("can't write", e); } } /** SAX1: reports ignorable whitespace */ final public void ignorableWhitespace (char ch [], int start, int length) throws SAXException { if (locator == null) locator = new LocatorImpl (); try { if (entityNestLevel != 0) return; // don't forget to map NL to CRLF, CR, etc escapeChars (ch, start, length, CTX_CONTENT); } catch (IOException e) { fatal ("can't write", e); } } /** * SAX1: reports a PI. * This doesn't check for illegal target names, such as "xml" or "XML", * or namespace-incompatible ones like "big:dog"; the caller is * responsible for ensuring those names are legal. 
*/ final public void processingInstruction (String target, String data) throws SAXException { if (locator == null) locator = new LocatorImpl (); // don't print internal subset for XHTML if (xhtml && startedDoctype) return; // ancient HTML browsers might render these ... their loss. // to prevent: "if (xhtml) return;". try { if (entityNestLevel != 0) return; if (canonical && inEpilogue) newline (); rawWrite (""); if (elementNestLevel == 0 && !(canonical && inEpilogue)) newline (); } catch (IOException e) { fatal ("can't write", e); } } /** SAX1: indicates a non-expanded entity reference */ public void skippedEntity (String name) throws SAXException { try { rawWrite ("&"); rawWrite (name); rawWrite (";"); } catch (IOException e) { fatal ("can't write", e); } } // SAX2 LexicalHandler /** SAX2: called before parsing CDATA characters */ final public void startCDATA () throws SAXException { if (locator == null) locator = new LocatorImpl (); if (canonical) return; try { inCDATA = true; if (entityNestLevel == 0) rawWrite ("SAX2: called after parsing CDATA characters */ final public void endCDATA () throws SAXException { if (canonical) return; try { inCDATA = false; if (entityNestLevel == 0) rawWrite ("]]>"); } catch (IOException e) { fatal ("can't write", e); } } /** * SAX2: called when the doctype is partially parsed * Note that this, like other doctype related calls, is ignored * when XHTML is in use. 
*/ final public void startDTD (String name, String publicId, String systemId) throws SAXException { if (locator == null) locator = new LocatorImpl (); if (xhtml) return; try { inDoctype = startedDoctype = true; if (canonical) return; rawWrite ("SAX2: called after the doctype is parsed */ final public void endDTD () throws SAXException { inDoctype = false; if (canonical || xhtml) return; try { rawWrite ("]>"); newline (); } catch (IOException e) { fatal ("can't write", e); } } /** * SAX2: called before parsing a general entity in content */ final public void startEntity (String name) throws SAXException { try { boolean writeEOL = true; // Predefined XHTML entities (for characters) will get // mapped back later. if (xhtml || expandingEntities) return; entityNestLevel++; if (name.equals ("[dtd]")) return; if (entityNestLevel != 1) return; if (!name.startsWith ("%")) { writeEOL = false; rawWrite ('&'); } rawWrite (name); rawWrite (';'); if (writeEOL) newline (); } catch (IOException e) { fatal ("can't write", e); } } /** * SAX2: called after parsing a general entity in content */ final public void endEntity (String name) throws SAXException { if (xhtml || expandingEntities) return; entityNestLevel--; } /** * SAX2: called when comments are parsed. * When XHTML is used, the old HTML tradition of using comments * to for inline CSS, or for JavaScript code is discouraged. * This is because XML processors are encouraged to discard, on * the grounds that comments are for users (and perhaps text * editors) not programs. 
Instead, use external scripts */ final public void comment (char ch [], int start, int length) throws SAXException { if (locator == null) locator = new LocatorImpl (); // don't print internal subset for XHTML if (xhtml && startedDoctype) return; // don't print comment in doctype for canon xml if (canonical && inDoctype) return; try { boolean indent; if (prettyPrinting && space.empty ()) fatal ("stack discipline", null); indent = prettyPrinting && "default".equals (space.peek ()); if (entityNestLevel != 0) return; if (indent) doIndent (); if (canonical && inEpilogue) newline (); rawWrite (""); if (indent) doIndent (); if (elementNestLevel == 0 && !(canonical && inEpilogue)) newline (); } catch (IOException e) { fatal ("can't write", e); } } // SAX1 DTDHandler /** SAX1: called on notation declarations */ final public void notationDecl (String name, String publicId, String systemId) throws SAXException { if (xhtml) return; try { // At this time, only SAX2 callbacks start these. if (!startedDoctype) return; if (entityNestLevel != 0) return; rawWrite (""); newline (); } catch (IOException e) { fatal ("can't write", e); } } /** SAX1: called on unparsed entity declarations */ final public void unparsedEntityDecl (String name, String publicId, String systemId, String notationName) throws SAXException { if (xhtml) return; try { // At this time, only SAX2 callbacks start these. if (!startedDoctype) { // FIXME: write to temporary buffer, and make the start // of the root element write these declarations. return; } if (entityNestLevel != 0) return; rawWrite (""); newline (); } catch (IOException e) { fatal ("can't write", e); } } // SAX2 DeclHandler /** SAX2: called on attribute declarations */ final public void attributeDecl (String eName, String aName, String type, String mode, String value) throws SAXException { if (xhtml) return; try { // At this time, only SAX2 callbacks start these. 
if (!startedDoctype) return; if (entityNestLevel != 0) return; rawWrite ("'); newline (); } catch (IOException e) { fatal ("can't write", e); } } /** SAX2: called on element declarations */ final public void elementDecl (String name, String model) throws SAXException { if (xhtml) return; try { // At this time, only SAX2 callbacks start these. if (!startedDoctype) return; if (entityNestLevel != 0) return; rawWrite ("'); newline (); } catch (IOException e) { fatal ("can't write", e); } } /** SAX2: called on external entity declarations */ final public void externalEntityDecl ( String name, String publicId, String systemId) throws SAXException { if (xhtml) return; try { // At this time, only SAX2 callbacks start these. if (!startedDoctype) return; if (entityNestLevel != 0) return; rawWrite (""); newline (); } catch (IOException e) { fatal ("can't write", e); } } /** SAX2: called on internal entity declarations */ final public void internalEntityDecl (String name, String value) throws SAXException { if (xhtml) return; try { // At this time, only SAX2 callbacks start these. if (!startedDoctype) return; if (entityNestLevel != 0) return; rawWrite ("'); newline (); } catch (IOException e) { fatal ("can't write", e); } } private void writeQuotedValue (String value, int code) throws SAXException, IOException { char buf [] = value.toCharArray (); int off = 0, len = buf.length; // we can't add line breaks to attribute/entity/... values noWrap = true; rawWrite ('"'); escapeChars (buf, off, len, code); rawWrite ('"'); noWrap = false; } // From "HTMLlat1x.ent" ... names of entities for ISO-8859-1 // (Latin/1) characters, all codes: 160-255 (0xA0-0xFF). // Codes 128-159 have no assigned values. 
private static final String HTMLlat1x [] = { // 160 "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect", "uml", "copy", // 170 "ordf", "laquo", "not", "shy", "reg", "macr", "deg", "plusmn", "sup2", "sup3", // 180 "acute", "micro", "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", // 190 "frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", // 200 "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH", "Ntilde", // 210 "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", // 220 "Uuml", "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", "atilde", "auml", "aring", // 230 "aelig", "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml", // 240 "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide", "oslash", "ugrave", // 250 "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml" }; // From "HTMLsymbolx.ent" ... some of the symbols that // we can conveniently handle. Entities for the Greek. // alphabet (upper and lower cases) are compact. private static final String HTMLsymbolx_GR [] = { // 913 "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", "Iota", "Kappa", // 923 "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", null, "Sigma", "Tau", // 933 "Upsilon", "Phi", "Chi", "Psi", "Omega" }; private static final String HTMLsymbolx_gr [] = { // 945 "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", // 955 "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigmaf", "sigma", "tau", // 965 "upsilon", "phi", "chi", "psi", "omega" }; // General routine to write text and substitute predefined // entities (XML, and a special case for XHTML) as needed. 
private void escapeChars (char buf [], int off, int len, int code) throws SAXException, IOException { int first = 0; if (off < 0) { off = 0; len = buf.length; } for (int i = 0; i < len; i++) { String esc; char c = buf [off + i]; switch (c) { // Note that CTX_ATTRIBUTE isn't explicitly tested here; // all syntax delimiters are escaped in CTX_ATTRIBUTE, // otherwise it's similar to CTX_CONTENT // ampersand flags entity references; entity replacement // text has unexpanded references, other text doesn't. case '&': if (code == CTX_ENTITY || code == CTX_UNPARSED) continue; esc = "amp"; break; // attributes and text may NOT have literal '<', but // entities may have markup constructs case '<': if (code == CTX_ENTITY || code == CTX_UNPARSED) continue; esc = "lt"; break; // as above re markup constructs; but otherwise // except when canonicalizing, this is for consistency case '>': if (code == CTX_ENTITY || code == CTX_UNPARSED) continue; esc = "gt"; break; case '\'': if (code == CTX_CONTENT || code == CTX_UNPARSED) continue; if (canonical) continue; esc = "apos"; break; // needed when printing quoted attribute/entity values case '"': if (code == CTX_CONTENT || code == CTX_UNPARSED) continue; esc = "quot"; break; // make line ends work per host OS convention case '\n': esc = eol; break; // // No other characters NEED special treatment ... except // for encoding-specific issues, like whether the character // can really be represented in that encoding. // default: // // There are characters we can never write safely; getting // them is an error. // // (a) They're never legal in XML ... detected by range // checks, and (eventually) by remerging surrogate // pairs on output. (Easy error for apps to prevent.) // // (b) This encoding can't represent them, and we // can't make reference substitution (e.g. inside // CDATA sections, names, PI data, etc). (Hard for // apps to prevent, except by using UTF-8 or UTF-16 // as their output encoding.) 
// // We know a very little bit about what characters // the US-ASCII and ISO-8859-1 encodings support. For // other encodings we can't detect the second type of // error at all. (Never an issue for UTF-8 or UTF-16.) // // FIXME: CR in CDATA is an error; in text, turn to a char ref // FIXME: CR/LF/TAB in attributes should become char refs if ((c > 0xfffd) || ((c < 0x0020) && !((c == 0x0009) || (c == 0x000A) || (c == 0x000D))) || (((c & dangerMask) != 0) && (code == CTX_UNPARSED))) { // if case (b) in CDATA, we might end the section, // write a reference, then restart ... possible // in one DOM L3 draft. throw new CharConversionException ( "Illegal or non-writable character: U+" + Integer.toHexString (c)); } // // If the output encoding represents the character // directly, let it do so! Else we'll escape it. // if ((c & dangerMask) == 0) continue; esc = null; // Avoid numeric refs where symbolic ones exist, as // symbolic ones make more sense to humans reading! if (xhtml) { // all the HTMLlat1x.ent entities // (all the "ISO-8859-1" characters) if (c >= 160 && c <= 255) esc = HTMLlat1x [c - 160]; // not quite half the HTMLsymbolx.ent entities else if (c >= 913 && c <= 937) esc = HTMLsymbolx_GR [c - 913]; else if (c >= 945 && c <= 969) esc = HTMLsymbolx_gr [c - 945]; else switch (c) { // all of the HTMLspecialx.ent entities case 338: esc = "OElig"; break; case 339: esc = "oelig"; break; case 352: esc = "Scaron"; break; case 353: esc = "scaron"; break; case 376: esc = "Yuml"; break; case 710: esc = "circ"; break; case 732: esc = "tilde"; break; case 8194: esc = "ensp"; break; case 8195: esc = "emsp"; break; case 8201: esc = "thinsp"; break; case 8204: esc = "zwnj"; break; case 8205: esc = "zwj"; break; case 8206: esc = "lrm"; break; case 8207: esc = "rlm"; break; case 8211: esc = "ndash"; break; case 8212: esc = "mdash"; break; case 8216: esc = "lsquo"; break; case 8217: esc = "rsquo"; break; case 8218: esc = "sbquo"; break; case 8220: esc = "ldquo"; break; case 8221: 
esc = "rdquo"; break; case 8222: esc = "bdquo"; break; case 8224: esc = "dagger"; break; case 8225: esc = "Dagger"; break; case 8240: esc = "permil"; break; case 8249: esc = "lsaquo"; break; case 8250: esc = "rsaquo"; break; case 8364: esc = "euro"; break; // the other HTMLsymbox.ent entities case 402: esc = "fnof"; break; case 977: esc = "thetasym"; break; case 978: esc = "upsih"; break; case 982: esc = "piv"; break; case 8226: esc = "bull"; break; case 8230: esc = "hellip"; break; case 8242: esc = "prime"; break; case 8243: esc = "Prime"; break; case 8254: esc = "oline"; break; case 8260: esc = "frasl"; break; case 8472: esc = "weierp"; break; case 8465: esc = "image"; break; case 8476: esc = "real"; break; case 8482: esc = "trade"; break; case 8501: esc = "alefsym"; break; case 8592: esc = "larr"; break; case 8593: esc = "uarr"; break; case 8594: esc = "rarr"; break; case 8595: esc = "darr"; break; case 8596: esc = "harr"; break; case 8629: esc = "crarr"; break; case 8656: esc = "lArr"; break; case 8657: esc = "uArr"; break; case 8658: esc = "rArr"; break; case 8659: esc = "dArr"; break; case 8660: esc = "hArr"; break; case 8704: esc = "forall"; break; case 8706: esc = "part"; break; case 8707: esc = "exist"; break; case 8709: esc = "empty"; break; case 8711: esc = "nabla"; break; case 8712: esc = "isin"; break; case 8713: esc = "notin"; break; case 8715: esc = "ni"; break; case 8719: esc = "prod"; break; case 8721: esc = "sum"; break; case 8722: esc = "minus"; break; case 8727: esc = "lowast"; break; case 8730: esc = "radic"; break; case 8733: esc = "prop"; break; case 8734: esc = "infin"; break; case 8736: esc = "ang"; break; case 8743: esc = "and"; break; case 8744: esc = "or"; break; case 8745: esc = "cap"; break; case 8746: esc = "cup"; break; case 8747: esc = "int"; break; case 8756: esc = "there4"; break; case 8764: esc = "sim"; break; case 8773: esc = "cong"; break; case 8776: esc = "asymp"; break; case 8800: esc = "ne"; break; case 8801: esc = "equiv"; 
break; case 8804: esc = "le"; break; case 8805: esc = "ge"; break; case 8834: esc = "sub"; break; case 8835: esc = "sup"; break; case 8836: esc = "nsub"; break; case 8838: esc = "sube"; break; case 8839: esc = "supe"; break; case 8853: esc = "oplus"; break; case 8855: esc = "otimes"; break; case 8869: esc = "perp"; break; case 8901: esc = "sdot"; break; case 8968: esc = "lceil"; break; case 8969: esc = "rceil"; break; case 8970: esc = "lfloor"; break; case 8971: esc = "rfloor"; break; case 9001: esc = "lang"; break; case 9002: esc = "rang"; break; case 9674: esc = "loz"; break; case 9824: esc = "spades"; break; case 9827: esc = "clubs"; break; case 9829: esc = "hearts"; break; case 9830: esc = "diams"; break; } } // else escape with numeric char refs if (esc == null) { stringBuf.setLength (0); stringBuf.append ("#x"); stringBuf.append (Integer.toHexString (c).toUpperCase ()); esc = stringBuf.toString (); // FIXME: We don't write surrogate pairs correctly. // They should work as one ref per character, since // each pair is one character. For reading back into // Unicode, it matters beginning in Unicode 3.1 ... 
} break; } if (i != first) rawWrite (buf, off + first, i - first); first = i + 1; if (esc == eol) newline (); else { rawWrite ('&'); rawWrite (esc); rawWrite (';'); } } if (first < len) rawWrite (buf, off + first, len - first); } private void newline () throws SAXException, IOException { out.write (eol); column = 0; } private void doIndent () throws SAXException, IOException { int space = elementNestLevel * 2; newline (); column = space; // track tabs only at line starts while (space > 8) { out.write ("\t"); space -= 8; } while (space > 0) { out.write (" "); space -= 2; } } private void rawWrite (char c) throws IOException { out.write (c); column++; } private void rawWrite (String s) throws SAXException, IOException { if (prettyPrinting && "default".equals (space.peek ())) { char data [] = s.toCharArray (); rawWrite (data, 0, data.length); } else { out.write (s); column += s.length (); } } // NOTE: if xhtml, the REC gives some rules about whitespace // which we could follow ... notably, many places where conformant // agents "must" consolidate/normalize whitespace. Line ends can // be removed there, etc. This may not be the right place to do // such mappings though. // Line buffering may help clarify algorithms and improve results. // It's likely xml:space needs more attention. private void rawWrite (char buf [], int offset, int length) throws SAXException, IOException { boolean wrap; if (prettyPrinting && space.empty ()) fatal ("stack discipline", null); wrap = prettyPrinting && "default".equals (space.peek ()); if (!wrap) { out.write (buf, offset, length); column += length; return; } // we're pretty printing and want to fill lines out only // to the desired line length. while (length > 0) { int target = lineLength - column; boolean wrote = false; // Do we even have a problem? if (target > length || noWrap) { out.write (buf, offset, length); column += length; return; } // break the line at a space character, trying to fill // as much of the line as possible. 
char c; for (int i = target - 1; i >= 0; i--) { if ((c = buf [offset + i]) == ' ' || c == '\t') { i++; out.write (buf, offset, i); doIndent (); offset += i; length -= i; wrote = true; break; } } if (wrote) continue; // no space character permitting break before target // line length is filled. So, take the next one. if (target < 0) target = 0; for (int i = target; i < length; i++) if ((c = buf [offset + i]) == ' ' || c == '\t') { i++; out.write (buf, offset, i); doIndent (); offset += i; length -= i; wrote = true; break; } if (wrote) continue; // no such luck. out.write (buf, offset, length); column += length; break; } } }