diff options
Diffstat (limited to 'libjava/classpath/gnu/javax/swing/text/html/parser/support/Parser.java')
-rw-r--r-- | libjava/classpath/gnu/javax/swing/text/html/parser/support/Parser.java | 1532 |
1 files changed, 1532 insertions, 0 deletions
diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/Parser.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/Parser.java new file mode 100644 index 000000000..cdefb75c8 --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/Parser.java @@ -0,0 +1,1532 @@ +/* Parser.java -- HTML parser. + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support; + +import gnu.java.lang.CPStringBuilder; + +import gnu.javax.swing.text.html.parser.htmlAttributeSet; +import gnu.javax.swing.text.html.parser.htmlValidator; +import gnu.javax.swing.text.html.parser.support.low.Constants; +import gnu.javax.swing.text.html.parser.support.low.ParseException; +import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer; +import gnu.javax.swing.text.html.parser.support.low.Token; +import gnu.javax.swing.text.html.parser.support.low.node; +import gnu.javax.swing.text.html.parser.support.low.pattern; + +import java.io.IOException; +import java.io.Reader; + +import java.util.Comparator; +import java.util.Set; +import java.util.TreeSet; +import java.util.Vector; + +import javax.swing.text.ChangedCharSetException; +import javax.swing.text.SimpleAttributeSet; +import javax.swing.text.html.HTML; +import javax.swing.text.html.parser.AttributeList; +import javax.swing.text.html.parser.DTD; +import javax.swing.text.html.parser.DTDConstants; +import javax.swing.text.html.parser.Element; +import javax.swing.text.html.parser.Entity; +import javax.swing.text.html.parser.TagElement; + +/** + * <p>A simple error-tolerant HTML parser that uses a DTD document + * to access data on the possible tokens, arguments and syntax.</p> + * <p> The parser reads an HTML content from a Reader and calls various + * notifying methods (which should be overridden in a subclass) + * when tags or data are encountered.</p> + * <p>Some HTML elements need no opening or closing tags. The + * task of this parser is to invoke the tag handling methods also when + * the tags are not explicitly specified and must be supposed using + * information, stored in the DTD. + * For example, parsing the document + * <p><table><tr><td>a<td>b<td>c</tr> <br> + * will invoke exactly the handling methods exactly in the same order + * (and with the same parameters) as if parsing the document: <br> + * <em><html><head></head><body><table>< + * tbody></em><tr><td>a<em></td></em><td>b<em> + * </td></em><td>c<em></td></tr></em>< + * <em>/tbody></table></body></html></em></p> + * (supposed tags are given in italics). The parser also supports + * obsolete elements of HTML syntax.<p> + * </p> + * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) + */ +public class Parser + extends ReaderTokenizer + implements DTDConstants +{ + /** + * The current html tag. + */ + public Token hTag = new Token(); + + /** + * The document template description that will be used to parse the documents. + */ + protected DTD dtd; + + /** + * The value of this field determines whether or not the Parser will be + * strict in enforcing SGML compatibility. The default value is false, + * stating that the parser should do everything to parse and get at least + * some information even from the incorrectly written HTML input. + */ + protected boolean strict; + + /** + * This fields has positive values in preformatted tags. + */ + protected int preformatted = 0; + + /** + * The set of the document tags. This field is used for supporting + * markFirstTime(). + */ + private Set documentTags = + new TreeSet(new Comparator() + { + public int compare(Object a, Object b) + { + return ((String) a).compareToIgnoreCase((String) b); + } + } + ); + + /** + * The buffer to collect the incremental output like text or coment. + */ + private final StringBuffer buffer = new StringBuffer(); + + /** + * The buffer to store the document title. + */ + private final StringBuffer title = new StringBuffer(); + + /** + * The current token. + */ + private Token t; + + /** + * True means that the 'title' tag of this document has + * already been handled. + */ + private boolean titleHandled; + + /** + * True means that the 'title' tag is currently open and all + * text is also added to the title buffer. + */ + private boolean titleOpen; + + /** + * The attributes of the current HTML element. + * Package-private to avoid an accessor method. + */ + htmlAttributeSet attributes = + htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET; + + /** + * The validator, controlling the forcible closing of the tags that + * (in accordance to dtd) are not allowed in the current context. + */ + private htmlValidator validator; + + /** + * Provides the default values for parameters in the case when these + * values are defined in the DTD. + */ + private parameterDefaulter defaulter; + + /** + * The text pre-processor for handling line ends and tabs. + */ + private textPreProcessor textProcessor = new textPreProcessor(); + + /** + * Creates a new Parser that uses the given + * {@link javax.swing.text.html.parser.DTD }. The only standard way + * to get an instance of DTD is to construct it manually, filling in + * all required fields. + * @param a_dtd The DTD to use. The parser behaviour after passing null + * as an argument is not documented and may vary between implementations. + */ + public Parser(DTD a_dtd) + { + if (a_dtd == null) + dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance(); + else + dtd = a_dtd; + + defaulter = new parameterDefaulter(dtd); + + validator = + new htmlValidator(dtd) + { + /** + * Handles the error message. This method must be overridden to pass + * the message where required. + * @param msg The message text. + */ + protected void s_error(String msg) + { + error(msg); + } + + /** + * The method is called when the tag validator decides to close the + * tag on its own initiative. After reaching the end of stream, + * The tag validator closes all unclosed elements that are required + * to have the end (closing) tag. + * + * @param tElement The tag being fictionally (forcibly) closed. + */ + protected void handleSupposedEndTag(Element tElement) + { + // The tag is cloned as the original tElement is the + // element from the starting tag - may be accidently used + // somewhere else. + TagElement tag = makeTag(tElement, true); + _handleEndTag_remaining(tag); + } + + /** + * The method is called when the the tag validator decides to open + * the new tag on its own initiative. The tags, opened in this + * way, are HTML, HEAD and BODY. The attribute set is temporary + * assigned to the empty one, the previous value is + * restored before return. + * + * @param tElement The tag being fictionally (forcibly) closed. + */ + protected void handleSupposedStartTag(Element tElement) + { + TagElement tag = makeTag(tElement, true); + htmlAttributeSet were = attributes; + attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET; + _handleStartTag(tag); + attributes = were; + } + }; + } + + /** + * Get the attributes of the current tag. + * @return The attribute set, representing the attributes of the current tag. + */ + public SimpleAttributeSet getAttributes() + { + return new SimpleAttributeSet(attributes); + } + + /** + * Invokes the error handler. The default method in this implementation + * delegates the call to handleError, also providing the current line. + */ + public void error(String msg) + { + error(msg, getTokenAhead()); + } + + public void error(String msg, Token atToken) + { + if (atToken != null) + handleError(atToken.where.beginLine, + msg + ": line " + atToken.where.beginLine + + ", absolute pos " + atToken.where.startPosition + ); + else + handleError(0, msg); + } + + /** + * Invokes the error handler. The default method in this implementation + * delegates the call to error (parm1+": '"+parm2+"'"). + */ + public void error(String msg, String invalid) + { + error(msg + ": '" + invalid + "'"); + } + + /** + * Invokes the error handler. The default method in this implementation + * delegates the call to error (parm1+" "+ parm2+" "+ parm3). + */ + public void error(String parm1, String parm2, String parm3) + { + error(parm1 + " " + parm2 + " " + parm3); + } + + /** + * Invokes the error handler. The default method in this implementation + * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4). + */ + public void error(String parm1, String parm2, String parm3, String parm4) + { + error(parm1 + " " + parm2 + " " + parm3 + " " + parm4); + } + + public void flushAttributes() + { + } + + /** + * Parse the HTML text, calling various methods in response to the + * occurence of the corresponding HTML constructions. + * @param reader The reader to read the source HTML from. + * @throws IOException If the reader throws one. + */ + public synchronized void parse(Reader reader) + throws IOException + { + reset(reader); + restart(); + try + { + parseDocument(); + validator.closeAll(); + } + catch (ParseException ex) + { + if (ex != null) + { + error("Unable to continue parsing the document", ex.getMessage()); + + Throwable cause = ex.getCause(); + if (cause instanceof IOException) + throw (IOException) cause; + } + } + } + + /** + * Parses DTD markup declaration. Currently returns null without action. + * @return null. + * @throws IOException + */ + public String parseDTDMarkup() + throws IOException + { + return null; + } + + /** + * Parse SGML insertion ( <! ... > ). When the + * the SGML insertion is found, this method is called, passing + * SGML in the string buffer as a parameter. The default method + * returns false without action and can be overridden to + * implement user - defined SGML support. + * <p> + * If you need more information about SGML insertions in HTML documents, + * the author suggests to read SGML tutorial on + * {@link http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html}. + * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>, + * Oxford University Press, 688 p, ISBN: 0198537379. + * </p> + * @param strBuff + * @return true if this is a valid DTD markup declaration. + * @throws IOException + */ + public boolean parseMarkupDeclarations(StringBuffer strBuff) + throws IOException + { + return false; + } + + /** + * Get the first line of the last parsed token. + */ + protected int getCurrentLine() + { + return hTag.where.beginLine; + } + + /** + * Read parseable character data, add to buffer. + * @param clearBuffer If true, buffer if filled by CDATA section, + * otherwise the section is appended to the existing content of the + * buffer. + * + * @throws ParseException + */ + protected void CDATA(boolean clearBuffer) + throws ParseException + { + Token start = hTag = getTokenAhead(); + + if (clearBuffer) + buffer.setLength(0); + + // Handle expected EOF. + if (start.kind == EOF) + return; + + read: + while (true) + { + t = getTokenAhead(); + if (t.kind == EOF) + { + error("unexpected eof", t); + break read; + } + else if (t.kind == BEGIN) + break read; + else if (t.kind == Constants.ENTITY) + { + resolveAndAppendEntity(t); + getNextToken(); + } + else + { + append(t); + getNextToken(); + } + } + hTag = new Token(start, getTokenAhead(0)); + if (buffer.length() != 0) + _handleText(); + } + + /** + * Process Comment. This method skips till --> without + * taking SGML constructs into consideration. The supported SGML + * constructs are handled separately. + */ + protected void Comment() + throws ParseException + { + buffer.setLength(0); + + Token start = hTag = mustBe(BEGIN); + optional(WS); + mustBe(EXCLAMATION); + optional(WS); + mustBe(DOUBLE_DASH); + + Token t; + Token last; + + comment: + while (true) + { + t = getTokenAhead(); + if (t.kind == EOF) + { + handleEOFInComment(); + last = t; + break comment; + } + else if (COMMENT_END.matches(this)) + { + mustBe(DOUBLE_DASH); + optional(WS); + last = mustBe(END); + break comment; + } + else if (COMMENT_TRIPLEDASH_END.matches(this)) + { + mustBe(DOUBLE_DASH); + t = mustBe(NUMTOKEN); + if (t.getImage().equals("-")) + { + append(t); + last = mustBe(END); + break comment; + } + else + { + buffer.append("--"); + append(t); + t = getTokenAhead(); + } + } + else + /* The lllll-- can match as NUMTOKEN */ + if ((t.getImage().endsWith("--")) && + ( + getTokenAhead(1).kind == END || + (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END) + ) + ) + { + buffer.append(t.getImage().substring(0, t.getImage().length() - 2)); + + /* Skip the closing > that we have already checked. */ + last = mustBe(t.kind); + break comment; + } + else + append(t); + mustBe(t.kind); + } + hTag = new Token(start, last); + + // Consume any whitespace immediately following a comment. + optional(WS); + handleComment(); + } + + /** + * Read a script. The text, returned without any changes, + * is terminated only by the closing tag SCRIPT. + */ + protected void Script() + throws ParseException + { + Token name; + + Token start = hTag = mustBe(BEGIN); + optional(WS); + + name = mustBe(SCRIPT); + + optional(WS); + + restOfTag(false, name, start); + + buffer.setLength(0); + + while (!SCRIPT_CLOSE.matches(this)) + { + append(getNextToken()); + } + + consume(SCRIPT_CLOSE); + + _handleText(); + + endTag(false); + _handleEndTag(makeTagElement(name.getImage(), false)); + } + + /** + * Process SGML insertion that is not a comment. + */ + protected void Sgml() + throws ParseException + { + if (COMMENT_OPEN.matches(this)) + Comment(); + else // skip till ">" + { + Token start = hTag = mustBe(BEGIN); + optional(WS); + mustBe(EXCLAMATION); + + buffer.setLength(0); + read: + while (true) + { + t = getNextToken(); + if (t.kind == Constants.ENTITY) + { + resolveAndAppendEntity(t); + } + else if (t.kind == EOF) + { + error("unexpected eof", t); + break read; + } + else if (t.kind == END) + break read; + else + append(t); + } + + try + { + parseMarkupDeclarations(buffer); + } + catch (IOException ex) + { + error("Unable to parse SGML insertion: '" + buffer + "'", + new Token(start, t) + ); + } + } + // Consume any whitespace that follows the Sgml insertion. + optional(WS); + } + + /** + * Read a style definition. The text, returned without any changes, + * is terminated only by the closing tag STYLE. + */ + protected void Style() + throws ParseException + { + Token name; + + Token start = hTag = mustBe(BEGIN); + optional(WS); + + name = mustBe(STYLE); + + optional(WS); + + restOfTag(false, name, start); + + buffer.setLength(0); + + while (!STYLE_CLOSE.matches(this)) + { + append(getNextToken()); + } + + consume(STYLE_CLOSE); + + _handleText(); + + endTag(false); + _handleEndTag(makeTagElement(name.getImage(), false)); + } + + /** + * Read a html tag. + */ + protected void Tag() + throws ParseException + { + mark(true); + + boolean closing = false; + Token name; + Token start = hTag = mustBe(BEGIN); + + optional(WS); + name = getNextToken(); + optional(WS); + + if (name.kind == SLASH) + { + closing = true; + name = getNextToken(); + } + + restOfTag(closing, name, start); + } + + /** + * A hook, for operations, preceeding call to handleText. + * Handle text in a string buffer. + * In non - preformatted mode, all line breaks immediately following the + * start tag and immediately before an end tag is discarded, + * \r, \n and \t are replaced by spaces, multiple space are replaced + * by the single one and the result is moved into array, + * passing it to handleText(). + */ + protected void _handleText() + { + char[] text; + + if (preformatted > 0) + text = textProcessor.preprocessPreformatted(buffer); + else + text = textProcessor.preprocess(buffer); + + if (text != null && text.length > 0 + // According to the specs we need to discard whitespace immediately + // before a closing tag. + && (text.length > 1 || text[0] != ' ' || ! TAG_CLOSE.matches(this))) + { + TagElement pcdata = new TagElement(dtd.getElement("#pcdata")); + attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET; + _handleEmptyTag(pcdata); + + handleText(text); + if (titleOpen) + title.append(text); + } + } + + /** + * Add the image of this token to the buffer. + * @param t A token to append. + */ + protected final void append(Token t) + { + if (t.kind != EOF) + t.appendTo(buffer); + } + + /** + * Consume pattern that must match. + * @param p A pattern to consume. + */ + protected final void consume(pattern p) + { + node n; + for (int i = 0; i < p.nodes.length; i++) + { + n = p.nodes [ i ]; + if (n.optional) + optional(n.kind); + else + mustBe(n.kind); + } + } + + /** + * The method is called when the HTML end (closing) tag is found or if + * the parser concludes that the one should be present in the + * current position. The method is called immediatly + * before calling the handleEndTag(). + * @param omitted True if the tag is no actually present in the document, + * but is supposed by the parser (like </html> at the end of the + * document). + */ + protected void endTag(boolean omitted) + { + } + + /** + * Handle HTML comment. The default method returns without action. + * @param comment + */ + protected void handleComment(char[] comment) + { + } + + /** + * This is additionally called in when the HTML content terminates + * without closing the HTML comment. This can only happen if the + * HTML document contains errors (for example, the closing --;gt is + * missing. + */ + protected void handleEOFInComment() + { + error("Unclosed comment"); + } + + /** + * Handle the tag with no content, like <br>. The method is + * called for the elements that, in accordance with the current DTD, + * has an empty content. + * @param tag The tag being handled. + * @throws javax.swing.text.ChangedCharSetException + */ + protected void handleEmptyTag(TagElement tag) + throws javax.swing.text.ChangedCharSetException + { + } + + /** + * The method is called when the HTML closing tag ((like </table>) + * is found or if the parser concludes that the one should be present + * in the current position. + * @param tag The tag + */ + protected void handleEndTag(TagElement tag) + { + } + + /* Handle error that has occured in the given line. */ + protected void handleError(int line, String message) + { + } + + /** + * The method is called when the HTML opening tag ((like <table>) + * is found or if the parser concludes that the one should be present + * in the current position. + * @param tag The tag + */ + protected void handleStartTag(TagElement tag) + { + } + + /** + * Handle the text section. + * <p> For non-preformatted section, the parser replaces + * \t, \r and \n by spaces and then multiple spaces + * by a single space. Additionaly, all whitespace around + * tags is discarded. + * </p> + * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves + * all tabs and spaces, but removes <b>one</b> bounding \r, \n or \r\n, + * if it is present. Additionally, it replaces each occurence of \r or \r\n + * by a single \n.</p> + * + * @param text A section text. + */ + protected void handleText(char[] text) + { + } + + /** + * Handle HTML <title> tag. This method is invoked when + * both title starting and closing tags are already behind. + * The passed argument contains the concatenation of all + * title text sections. + * @param title The title text. + */ + protected void handleTitle(char[] title) + { + } + + /** + * Constructs the tag from the given element. In this implementation, + * this is defined, but never called. + * @return the tag + */ + protected TagElement makeTag(Element element) + { + return makeTag(element, false); + } + + /** + * Constructs the tag from the given element. + * @param the tag base {@link javax.swing.text.html.parser.Element} + * @param isSupposed true if the tag is not actually present in the + * html input, but the parser supposes that it should to occur in + * the current location. + * @return the tag + */ + protected TagElement makeTag(Element element, boolean isSupposed) + { + return new TagElement(element, isSupposed); + } + + /** + * This is called when the tag, representing the given element, + * occurs first time in the document. + * @param element + */ + protected void markFirstTime(Element element) + { + } + + /** + * Consume the token that was checked before and hence MUST be present. + * @param kind The kind of token to consume. + */ + protected Token mustBe(int kind) + { + if (getTokenAhead().kind == kind) + return getNextToken(); + else + { + String ei = ""; + if (kind < 1000) + ei = " ('" + (char) kind + "') "; + throw new AssertionError("The token of kind " + kind + ei + + " MUST be here," + ); + } + } + + /** + * Handle attribute without value. The default method uses + * the only allowed attribute value from DTD. + * If the attribute is unknown or allows several values, + * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with + * this value is added to the attribute set. + * @param element The name of element. + * @param attribute The name of attribute without value. + */ + protected void noValueAttribute(String element, String attribute) + { + Object value = HTML.NULL_ATTRIBUTE_VALUE; + + Element e = dtd.elementHash.get(element.toLowerCase()); + if (e != null) + { + AttributeList attr = e.getAttribute(attribute); + if (attr != null) + { + Vector values = attr.values; + if (values != null && values.size() == 1) + value = values.get(0); + } + } + attributes.addAttribute(attribute, value); + } + + /** + * Consume the optional token, if present. + * @param kind The kind of token to consume. + */ + protected Token optional(int kind) + { + if (getTokenAhead().kind == kind) + return getNextToken(); + else + return null; + } + + /** Parse the html document. */ + protected void parseDocument() + throws ParseException + { + // Read up any initial whitespace. + optional(WS); + while (getTokenAhead().kind != EOF) + { + advanced = false; + if (TAG.matches(this)) + Tag(); + else if (COMMENT_OPEN.matches(this)) + Comment(); + else if (STYLE_OPEN.matches(this)) + Style(); + else if (SCRIPT_OPEN.matches(this)) + Script(); + else if (SGML.matches(this)) + Sgml(); + else + CDATA(true); + + // Surely HTML error, treat as a text. + if (!advanced) + { + Token wrong = getNextToken(); + error("unexpected '" + wrong.getImage() + "'", wrong); + buffer.setLength(0); + buffer.append(wrong.getImage()); + _handleText(); + } + } + } + + /** + * Read the element attributes, adding them into attribute set. + * @param element The element name (needed to access attribute + * information in dtd). + */ + protected void readAttributes(String element) + { + Token name; + Token value; + Token next; + String attrValue; + + attributes = new htmlAttributeSet(); + + optional(WS); + + attributeReading: + while (getTokenAhead().kind == NUMTOKEN) + { + name = getNextToken(); + optional(WS); + + next = getTokenAhead(); + if (next.kind == EQ) + { + mustBe(EQ); + optional(WS); + + next = getNextToken(); + + switch (next.kind) + { + case QUOT: + + // read "quoted" attribute. + buffer.setLength(0); + readTillTokenE(QUOT); + attrValue = buffer.toString(); + break; + + case AP: + + // read 'quoted' attribute. + buffer.setLength(0); + readTillTokenE(AP); + attrValue = buffer.toString(); + break; + + // read unquoted attribute. + case NUMTOKEN: + value = next; + optional(WS); + + // Check maybe the opening quote is missing. + next = getTokenAhead(); + if (bQUOTING.get(next.kind)) + { + hTag = next; + error("The value without opening quote is closed with '" + + next.getImage() + "'"); + attrValue = value.getImage(); + } + else if (next.kind == SLASH || next.kind == OTHER) + // The slash and other characters (like %) in this context is + // treated as the ordinary + // character, not as a token. The character may be part of + // the unquoted URL. + { + CPStringBuilder image = new CPStringBuilder(value.getImage()); + while (next.kind == NUMTOKEN || next.kind == SLASH + || next.kind == OTHER) + { + image.append(getNextToken().getImage()); + next = getTokenAhead(); + } + attrValue = image.toString(); + } + else + attrValue = value.getImage(); + break; + + case SLASH: + value = next; + optional(WS); + + // Check maybe the opening quote is missing. + next = getTokenAhead(); + if (bQUOTING.get(next.kind)) + { + hTag = next; + error("The value without opening quote is closed with '" + + next.getImage() + "'"); + attrValue = value.getImage(); + } + else if (next.kind == NUMTOKEN || next.kind == SLASH) + // The slash in this context is treated as the ordinary + // character, not as a token. The slash may be part of + // the unquoted URL. + { + CPStringBuilder image = new CPStringBuilder(value.getImage()); + while (next.kind == NUMTOKEN || next.kind == SLASH) + { + image.append(getNextToken().getImage()); + next = getTokenAhead(); + } + attrValue = image.toString(); + } + else + attrValue = value.getImage(); + break; + default: + break attributeReading; + } + attributes.addAttribute(name.getImage(), attrValue); + optional(WS); + } + else + // The '=' is missing: attribute without value. + { + noValueAttribute(element, name.getImage()); + } + } + } + + /** + * Return string, corresponding the given named entity. The name is passed + * with the preceeding &, but without the ending semicolon. + */ + protected String resolveNamedEntity(final String a_tag) + { + // Discard & + if (!a_tag.startsWith("&")) + throw new AssertionError("Named entity " + a_tag + + " must start witn '&'." + ); + + String tag = a_tag.substring(1); + + try + { + Entity entity = dtd.getEntity(tag); + if (entity != null) + return entity.getString(); + + entity = dtd.getEntity(tag.toLowerCase()); + + if (entity != null) + { + error("The name of this entity should be in lowercase", a_tag); + return entity.getString(); + } + } + catch (IndexOutOfBoundsException ibx) + { + /* The error will be reported. */ + } + + error("Unknown named entity", a_tag); + return a_tag; + } + + /** + * Return char, corresponding the given numeric entity. + * The name is passed with the preceeding &#, but without + * the ending semicolon. + */ + protected char resolveNumericEntity(final String a_tag) + { + // Discard &# + if (!a_tag.startsWith("&#")) + throw new AssertionError("Numeric entity " + a_tag + + " must start witn '&#'." + ); + + String tag = a_tag.substring(2); + + try + { + // Determine the encoding type: + char cx = tag.charAt(0); + if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn; + + return (char) Integer.parseInt(tag.substring(1), 16); + + return (char) Integer.parseInt(tag); + } + + /* The error will be reported. */ + catch (NumberFormatException nex) + { + } + catch (IndexOutOfBoundsException ix) + { + } + + error("Invalid numeric entity", a_tag); + return '?'; + } + + /** + * Reset all fields into the intial default state, preparing the + * parset for parsing the next document. + */ + protected void restart() + { + documentTags.clear(); + titleHandled = false; + titleOpen = false; + buffer.setLength(0); + title.setLength(0); + validator.restart(); + } + + /** + * The method is called when the HTML opening tag ((like <table>) + * is found or if the parser concludes that the one should be present + * in the current position. The method is called immediately before + * calling the handleStartTag. + * @param tag The tag + */ + protected void startTag(TagElement tag) + throws ChangedCharSetException + { + } + + /** + * Handle a complete element, when the tag content is already present in the + * buffer and both starting and heading tags behind. This is called + * in the case when the tag text must not be parsed for the nested + * elements (elements STYLE and SCRIPT). + */ + private void _handleCompleteElement(TagElement tag) + { + _handleStartTag(tag); + + // Suppress inclusion of the SCRIPT ans STYLE texts into the title. + HTML.Tag h = tag.getHTMLTag(); + if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE) + { + boolean tmp = titleOpen; + titleOpen = false; + _handleText(); + titleOpen = tmp; + } + else + _handleText(); + + _handleEndTag(tag); + } + + /** + * A hooks for operations, preceeding call to handleEmptyTag(). + * Handle the tag with no content, like <br>. As no any + * nested tags are expected, the tag validator is not involved. + * @param tag The tag being handled. + */ + private void _handleEmptyTag(TagElement tag) + { + try + { + validator.validateTag(tag, attributes); + handleEmptyTag(tag); + HTML.Tag h = tag.getHTMLTag(); + // When a block tag is closed, consume whitespace that follows after + // it. + // For some unknown reason a FRAME tag is not treated as block element. + // However in this case it should be treated as such. + if (isBlock(h)) + optional(WS); + } + catch (ChangedCharSetException ex) + { + error("Changed charset exception:", ex.getMessage()); + } + } + + /** + * A hooks for operations, preceeding call to handleEndTag(). + * The method is called when the HTML closing tag + * is found. Calls handleTitle after closing the 'title' tag. + * @param tag The tag + */ + private void _handleEndTag(TagElement tag) + { + if (validator.closeTag(tag)) + _handleEndTag_remaining(tag); + } + + /** + * Actions that are also required if the closing action was + * initiated by the tag validator. + * Package-private to avoid an accessor method. + */ + void _handleEndTag_remaining(TagElement tag) + { + HTML.Tag h = tag.getHTMLTag(); + + handleEndTag(tag); + endTag(tag.fictional()); + + if (h.isPreformatted()) + preformatted--; + if (preformatted < 0) + preformatted = 0; + + // When a block tag is closed, consume whitespace that follows after + // it. + if (isBlock(h)) + optional(WS); + + if (h == HTML.Tag.TITLE) + { + titleOpen = false; + titleHandled = true; + + char[] a = new char[ title.length() ]; + title.getChars(0, a.length, a, 0); + handleTitle(a); + } + } + + /** + * A hooks for operations, preceeding call to handleStartTag(). + * The method is called when the HTML opening tag ((like <table>) + * is found. + * Package-private to avoid an accessor method. + * @param tag The tag + */ + void _handleStartTag(TagElement tag) + { + validator.openTag(tag, attributes); + startingTag(tag); + handleStartTag(tag); + + HTML.Tag h = tag.getHTMLTag(); + + if (isBlock(h)) + optional(WS); + + if (h.isPreformatted()) + preformatted++; + + if (h == HTML.Tag.TITLE) + { + if (titleHandled) + error("Repetetive <TITLE> tag"); + titleOpen = true; + titleHandled = false; + } + } + + /** + * Resume parsing after heavy errors in HTML tag structure. + * @throws ParseException + */ + private void forciblyCloseTheTag() + throws ParseException + { + int closeAt = 0; + buffer.setLength(0); + + ahead: + for (int i = 1; i < 100; i++) + { + t = getTokenAhead(i - 1); + if (t.kind == EOF || t.kind == BEGIN) + break ahead; + if (t.kind == END) + { + /* Closing '>' found. */ + closeAt = i; + break ahead; + } + } + if (closeAt > 0) + { + buffer.append("Ignoring '"); + for (int i = 1; i <= closeAt; i++) + { + t = getNextToken(); + append(t); + } + buffer.append('\''); + error(buffer.toString()); + } + } + + /** + * Handle comment in string buffer. You can avoid allocating a char + * array each time by processing your comment directly here. + */ + private void handleComment() + { + char[] a = new char[ buffer.length() ]; + buffer.getChars(0, a.length, a, 0); + handleComment(a); + } + + private TagElement makeTagElement(String name, boolean isSupposed) + { + Element e = dtd.elementHash.get(name.toLowerCase()); + if (e == null) + { + error("Unknown tag <" + name + ">"); + e = dtd.getElement(name); + e.name = name.toUpperCase(); + e.index = -1; + } + + if (!documentTags.contains(e.name)) + { + markFirstTime(e); + documentTags.add(e.name); + } + + return makeTag(e, isSupposed); + } + + /** + * Read till the given token, resolving entities. Consume the given + * token without adding it to buffer. + * @param till The token to read till + * @throws ParseException + */ + private void readTillTokenE(int till) + throws ParseException + { + buffer.setLength(0); + read: + while (true) + { + t = getNextToken(); + if (t.kind == Constants.ENTITY) + { + resolveAndAppendEntity(t); + } + else if (t.kind == EOF) + { + error("unexpected eof", t); + break read; + } + else if (t.kind == till) + break read; + else if (t.kind == WS) + { + // Processing whitespace in accordance with CDATA rules: + String s = t.getImage(); + char c; + for (int i = 0; i < s.length(); i++) + { + c = s.charAt(i); + if (c == '\r') + buffer.append(' '); // CR replaced by space + else if (c == '\n') + { /* LF ignored */ } + else if (c == '\t') + buffer.append(' '); // Tab replaced by space + else + buffer.append(c); + } + } + else + append(t); + } + } + + /** + * Resolve the entity and append it to the end of buffer. + * @param entity + */ + private void resolveAndAppendEntity(Token entity) + { + switch (entity.category) + { + case ENTITY_NAMED : + buffer.append(resolveNamedEntity(entity.getImage())); + break; + + case ENTITY_NUMERIC : + buffer.append(resolveNumericEntity(entity.getImage())); + break; + + default : + throw new AssertionError("Invalid entity category " + + entity.category + ); + } + } + + /** + * Handle the remaining of HTML tags. This is a common end for + * TAG, SCRIPT and STYLE. + * @param closing True for closing tags ( </TAG> ). + * @param name Name of element + * @param start Token where element has started + * @throws ParseException + */ + private void restOfTag(boolean closing, Token name, Token start) + throws ParseException + { + boolean end = false; + Token next; + + optional(WS); + + readAttributes(name.getImage()); + + optional(WS); + + next = getTokenAhead(); + if (next.kind == END) + { + mustBe(END); + end = true; + } + + hTag = new Token(start, next); + + if (!end) + { + // The tag body contains errors. If additionally the tag + // name is not valid, this construction is treated as text. + if (dtd.elementHash.get(name.getImage().toLowerCase()) == null && + backupMode + ) + { + error("Errors in tag body and unknown tag name. " + + "Treating the tag as a text." + ); + reset(); + + hTag = mustBe(BEGIN); + buffer.setLength(0); + buffer.append(hTag.getImage()); + CDATA(false); + return; + } + else + { + error("Forcibly closing invalid parameter list"); + forciblyCloseTheTag(); + } + } + + if (closing) + { + endTag(false); + _handleEndTag(makeTagElement(name.getImage(), false)); + } + else + { + TagElement te = makeTagElement(name.getImage(), false); + if (te.getElement().type == DTDConstants.EMPTY) + _handleEmptyTag(te); + else + { + // According to the specs we need to consume whitespace following + // immediately after a opening tag. + optional(WS); + _handleStartTag(te); + } + } + } + + /** + * This should fire additional actions in response to the + * ChangedCharSetException. The current implementation + * does nothing. + * @param tag + */ + private void startingTag(TagElement tag) + { + try + { + startTag(tag); + } + catch (ChangedCharSetException cax) + { + error("Invalid change of charset"); + } + } + + private void ws_error() + { + error("Whitespace here is not permitted"); + } + + /** + * Returns true when the specified tag should be considered a block tag + * wrt whitespace handling. We need this special handling, since there + * are a couple of tags that we must treat as block tags but which aren't + * officially block tags. + * + * @param tag the tag to check + * @return true when the specified tag should be considered a block tag + * wrt whitespace handling + */ + private boolean isBlock(HTML.Tag tag) + { + return tag.isBlock() || tag == HTML.Tag.STYLE || tag == HTML.Tag.FRAME; + } +} |