diff options
Diffstat (limited to 'libjava/classpath/gnu/javax/swing/text/html/parser/support')
15 files changed, 3705 insertions, 0 deletions
diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/Parser.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/Parser.java new file mode 100644 index 000000000..cdefb75c8 --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/Parser.java @@ -0,0 +1,1532 @@ +/* Parser.java -- HTML parser. + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. 
If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support; + +import gnu.java.lang.CPStringBuilder; + +import gnu.javax.swing.text.html.parser.htmlAttributeSet; +import gnu.javax.swing.text.html.parser.htmlValidator; +import gnu.javax.swing.text.html.parser.support.low.Constants; +import gnu.javax.swing.text.html.parser.support.low.ParseException; +import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer; +import gnu.javax.swing.text.html.parser.support.low.Token; +import gnu.javax.swing.text.html.parser.support.low.node; +import gnu.javax.swing.text.html.parser.support.low.pattern; + +import java.io.IOException; +import java.io.Reader; + +import java.util.Comparator; +import java.util.Set; +import java.util.TreeSet; +import java.util.Vector; + +import javax.swing.text.ChangedCharSetException; +import javax.swing.text.SimpleAttributeSet; +import javax.swing.text.html.HTML; +import javax.swing.text.html.parser.AttributeList; +import javax.swing.text.html.parser.DTD; +import javax.swing.text.html.parser.DTDConstants; +import javax.swing.text.html.parser.Element; +import javax.swing.text.html.parser.Entity; +import javax.swing.text.html.parser.TagElement; + +/** + * <p>A simple error-tolerant HTML parser that uses a DTD document + * to access data on the possible tokens, arguments and syntax.</p> + * <p> The parser reads an HTML content from a Reader and calls various + * notifying methods (which should be overridden in a subclass) + * when tags or data are encountered.</p> + * <p>Some HTML elements need no opening or closing tags. The + * task of this parser is to invoke the tag handling methods also when + * the tags are not explicitly specified and must be supposed using + * information, stored in the DTD. 
* For example, parsing the document
 * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
 * will invoke the handling methods in exactly the same order
 * (and with the same parameters) as if parsing the document: <br>
 * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
 * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
 * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
 * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
 * (supposed tags are given in italics). The parser also supports
 * obsolete elements of HTML syntax.<p>
 * </p>
 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
 */
public class Parser
  extends ReaderTokenizer
  implements DTDConstants
{
  /**
   * The current html tag.
   */
  public Token hTag = new Token();

  /**
   * The document template description that will be used to parse the
   * documents.
   */
  protected DTD dtd;

  /**
   * Determines whether or not the Parser will be strict in enforcing
   * SGML compatibility. The default value is false, stating that the
   * parser should do everything to parse and get at least some
   * information even from incorrectly written HTML input.
   */
  protected boolean strict;

  /**
   * This field has a positive value inside preformatted tags.
   */
  protected int preformatted = 0;

  /**
   * The names of the tags already seen in the current document, compared
   * ignoring case. This field is used for supporting markFirstTime().
   * String.CASE_INSENSITIVE_ORDER orders strings with compareToIgnoreCase,
   * so no hand-written comparator is needed.
   */
  private Set<String> documentTags =
    new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);

  /**
   * The buffer to collect the incremental output like text or comment.
   */
  private final StringBuffer buffer = new StringBuffer();

  /**
   * The buffer to store the document title.
   */
  private final StringBuffer title = new StringBuffer();

  /**
   * The current token.
   */
  private Token t;

  /**
   * True means that the 'title' tag of this document has
   * already been handled.
+ */ + private boolean titleHandled; + + /** + * True means that the 'title' tag is currently open and all + * text is also added to the title buffer. + */ + private boolean titleOpen; + + /** + * The attributes of the current HTML element. + * Package-private to avoid an accessor method. + */ + htmlAttributeSet attributes = + htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET; + + /** + * The validator, controlling the forcible closing of the tags that + * (in accordance to dtd) are not allowed in the current context. + */ + private htmlValidator validator; + + /** + * Provides the default values for parameters in the case when these + * values are defined in the DTD. + */ + private parameterDefaulter defaulter; + + /** + * The text pre-processor for handling line ends and tabs. + */ + private textPreProcessor textProcessor = new textPreProcessor(); + + /** + * Creates a new Parser that uses the given + * {@link javax.swing.text.html.parser.DTD }. The only standard way + * to get an instance of DTD is to construct it manually, filling in + * all required fields. + * @param a_dtd The DTD to use. The parser behaviour after passing null + * as an argument is not documented and may vary between implementations. + */ + public Parser(DTD a_dtd) + { + if (a_dtd == null) + dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance(); + else + dtd = a_dtd; + + defaulter = new parameterDefaulter(dtd); + + validator = + new htmlValidator(dtd) + { + /** + * Handles the error message. This method must be overridden to pass + * the message where required. + * @param msg The message text. + */ + protected void s_error(String msg) + { + error(msg); + } + + /** + * The method is called when the tag validator decides to close the + * tag on its own initiative. After reaching the end of stream, + * The tag validator closes all unclosed elements that are required + * to have the end (closing) tag. + * + * @param tElement The tag being fictionally (forcibly) closed. 
+ */ + protected void handleSupposedEndTag(Element tElement) + { + // The tag is cloned as the original tElement is the + // element from the starting tag - may be accidently used + // somewhere else. + TagElement tag = makeTag(tElement, true); + _handleEndTag_remaining(tag); + } + + /** + * The method is called when the the tag validator decides to open + * the new tag on its own initiative. The tags, opened in this + * way, are HTML, HEAD and BODY. The attribute set is temporary + * assigned to the empty one, the previous value is + * restored before return. + * + * @param tElement The tag being fictionally (forcibly) closed. + */ + protected void handleSupposedStartTag(Element tElement) + { + TagElement tag = makeTag(tElement, true); + htmlAttributeSet were = attributes; + attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET; + _handleStartTag(tag); + attributes = were; + } + }; + } + + /** + * Get the attributes of the current tag. + * @return The attribute set, representing the attributes of the current tag. + */ + public SimpleAttributeSet getAttributes() + { + return new SimpleAttributeSet(attributes); + } + + /** + * Invokes the error handler. The default method in this implementation + * delegates the call to handleError, also providing the current line. + */ + public void error(String msg) + { + error(msg, getTokenAhead()); + } + + public void error(String msg, Token atToken) + { + if (atToken != null) + handleError(atToken.where.beginLine, + msg + ": line " + atToken.where.beginLine + + ", absolute pos " + atToken.where.startPosition + ); + else + handleError(0, msg); + } + + /** + * Invokes the error handler. The default method in this implementation + * delegates the call to error (parm1+": '"+parm2+"'"). + */ + public void error(String msg, String invalid) + { + error(msg + ": '" + invalid + "'"); + } + + /** + * Invokes the error handler. The default method in this implementation + * delegates the call to error (parm1+" "+ parm2+" "+ parm3). 
+ */ + public void error(String parm1, String parm2, String parm3) + { + error(parm1 + " " + parm2 + " " + parm3); + } + + /** + * Invokes the error handler. The default method in this implementation + * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4). + */ + public void error(String parm1, String parm2, String parm3, String parm4) + { + error(parm1 + " " + parm2 + " " + parm3 + " " + parm4); + } + + public void flushAttributes() + { + } + + /** + * Parse the HTML text, calling various methods in response to the + * occurence of the corresponding HTML constructions. + * @param reader The reader to read the source HTML from. + * @throws IOException If the reader throws one. + */ + public synchronized void parse(Reader reader) + throws IOException + { + reset(reader); + restart(); + try + { + parseDocument(); + validator.closeAll(); + } + catch (ParseException ex) + { + if (ex != null) + { + error("Unable to continue parsing the document", ex.getMessage()); + + Throwable cause = ex.getCause(); + if (cause instanceof IOException) + throw (IOException) cause; + } + } + } + + /** + * Parses DTD markup declaration. Currently returns null without action. + * @return null. + * @throws IOException + */ + public String parseDTDMarkup() + throws IOException + { + return null; + } + + /** + * Parse SGML insertion ( <! ... > ). When the + * the SGML insertion is found, this method is called, passing + * SGML in the string buffer as a parameter. The default method + * returns false without action and can be overridden to + * implement user - defined SGML support. + * <p> + * If you need more information about SGML insertions in HTML documents, + * the author suggests to read SGML tutorial on + * {@link http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html}. + * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>, + * Oxford University Press, 688 p, ISBN: 0198537379. 
+ * </p> + * @param strBuff + * @return true if this is a valid DTD markup declaration. + * @throws IOException + */ + public boolean parseMarkupDeclarations(StringBuffer strBuff) + throws IOException + { + return false; + } + + /** + * Get the first line of the last parsed token. + */ + protected int getCurrentLine() + { + return hTag.where.beginLine; + } + + /** + * Read parseable character data, add to buffer. + * @param clearBuffer If true, buffer if filled by CDATA section, + * otherwise the section is appended to the existing content of the + * buffer. + * + * @throws ParseException + */ + protected void CDATA(boolean clearBuffer) + throws ParseException + { + Token start = hTag = getTokenAhead(); + + if (clearBuffer) + buffer.setLength(0); + + // Handle expected EOF. + if (start.kind == EOF) + return; + + read: + while (true) + { + t = getTokenAhead(); + if (t.kind == EOF) + { + error("unexpected eof", t); + break read; + } + else if (t.kind == BEGIN) + break read; + else if (t.kind == Constants.ENTITY) + { + resolveAndAppendEntity(t); + getNextToken(); + } + else + { + append(t); + getNextToken(); + } + } + hTag = new Token(start, getTokenAhead(0)); + if (buffer.length() != 0) + _handleText(); + } + + /** + * Process Comment. This method skips till --> without + * taking SGML constructs into consideration. The supported SGML + * constructs are handled separately. 
+ */ + protected void Comment() + throws ParseException + { + buffer.setLength(0); + + Token start = hTag = mustBe(BEGIN); + optional(WS); + mustBe(EXCLAMATION); + optional(WS); + mustBe(DOUBLE_DASH); + + Token t; + Token last; + + comment: + while (true) + { + t = getTokenAhead(); + if (t.kind == EOF) + { + handleEOFInComment(); + last = t; + break comment; + } + else if (COMMENT_END.matches(this)) + { + mustBe(DOUBLE_DASH); + optional(WS); + last = mustBe(END); + break comment; + } + else if (COMMENT_TRIPLEDASH_END.matches(this)) + { + mustBe(DOUBLE_DASH); + t = mustBe(NUMTOKEN); + if (t.getImage().equals("-")) + { + append(t); + last = mustBe(END); + break comment; + } + else + { + buffer.append("--"); + append(t); + t = getTokenAhead(); + } + } + else + /* The lllll-- can match as NUMTOKEN */ + if ((t.getImage().endsWith("--")) && + ( + getTokenAhead(1).kind == END || + (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END) + ) + ) + { + buffer.append(t.getImage().substring(0, t.getImage().length() - 2)); + + /* Skip the closing > that we have already checked. */ + last = mustBe(t.kind); + break comment; + } + else + append(t); + mustBe(t.kind); + } + hTag = new Token(start, last); + + // Consume any whitespace immediately following a comment. + optional(WS); + handleComment(); + } + + /** + * Read a script. The text, returned without any changes, + * is terminated only by the closing tag SCRIPT. + */ + protected void Script() + throws ParseException + { + Token name; + + Token start = hTag = mustBe(BEGIN); + optional(WS); + + name = mustBe(SCRIPT); + + optional(WS); + + restOfTag(false, name, start); + + buffer.setLength(0); + + while (!SCRIPT_CLOSE.matches(this)) + { + append(getNextToken()); + } + + consume(SCRIPT_CLOSE); + + _handleText(); + + endTag(false); + _handleEndTag(makeTagElement(name.getImage(), false)); + } + + /** + * Process SGML insertion that is not a comment. 
+ */ + protected void Sgml() + throws ParseException + { + if (COMMENT_OPEN.matches(this)) + Comment(); + else // skip till ">" + { + Token start = hTag = mustBe(BEGIN); + optional(WS); + mustBe(EXCLAMATION); + + buffer.setLength(0); + read: + while (true) + { + t = getNextToken(); + if (t.kind == Constants.ENTITY) + { + resolveAndAppendEntity(t); + } + else if (t.kind == EOF) + { + error("unexpected eof", t); + break read; + } + else if (t.kind == END) + break read; + else + append(t); + } + + try + { + parseMarkupDeclarations(buffer); + } + catch (IOException ex) + { + error("Unable to parse SGML insertion: '" + buffer + "'", + new Token(start, t) + ); + } + } + // Consume any whitespace that follows the Sgml insertion. + optional(WS); + } + + /** + * Read a style definition. The text, returned without any changes, + * is terminated only by the closing tag STYLE. + */ + protected void Style() + throws ParseException + { + Token name; + + Token start = hTag = mustBe(BEGIN); + optional(WS); + + name = mustBe(STYLE); + + optional(WS); + + restOfTag(false, name, start); + + buffer.setLength(0); + + while (!STYLE_CLOSE.matches(this)) + { + append(getNextToken()); + } + + consume(STYLE_CLOSE); + + _handleText(); + + endTag(false); + _handleEndTag(makeTagElement(name.getImage(), false)); + } + + /** + * Read a html tag. + */ + protected void Tag() + throws ParseException + { + mark(true); + + boolean closing = false; + Token name; + Token start = hTag = mustBe(BEGIN); + + optional(WS); + name = getNextToken(); + optional(WS); + + if (name.kind == SLASH) + { + closing = true; + name = getNextToken(); + } + + restOfTag(closing, name, start); + } + + /** + * A hook, for operations, preceeding call to handleText. + * Handle text in a string buffer. 
+ * In non - preformatted mode, all line breaks immediately following the + * start tag and immediately before an end tag is discarded, + * \r, \n and \t are replaced by spaces, multiple space are replaced + * by the single one and the result is moved into array, + * passing it to handleText(). + */ + protected void _handleText() + { + char[] text; + + if (preformatted > 0) + text = textProcessor.preprocessPreformatted(buffer); + else + text = textProcessor.preprocess(buffer); + + if (text != null && text.length > 0 + // According to the specs we need to discard whitespace immediately + // before a closing tag. + && (text.length > 1 || text[0] != ' ' || ! TAG_CLOSE.matches(this))) + { + TagElement pcdata = new TagElement(dtd.getElement("#pcdata")); + attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET; + _handleEmptyTag(pcdata); + + handleText(text); + if (titleOpen) + title.append(text); + } + } + + /** + * Add the image of this token to the buffer. + * @param t A token to append. + */ + protected final void append(Token t) + { + if (t.kind != EOF) + t.appendTo(buffer); + } + + /** + * Consume pattern that must match. + * @param p A pattern to consume. + */ + protected final void consume(pattern p) + { + node n; + for (int i = 0; i < p.nodes.length; i++) + { + n = p.nodes [ i ]; + if (n.optional) + optional(n.kind); + else + mustBe(n.kind); + } + } + + /** + * The method is called when the HTML end (closing) tag is found or if + * the parser concludes that the one should be present in the + * current position. The method is called immediatly + * before calling the handleEndTag(). + * @param omitted True if the tag is no actually present in the document, + * but is supposed by the parser (like </html> at the end of the + * document). + */ + protected void endTag(boolean omitted) + { + } + + /** + * Handle HTML comment. The default method returns without action. 
* @param comment The text of the comment.
   */
  protected void handleComment(char[] comment)
  {
  }

  /**
   * This is additionally called when the HTML content terminates
   * without closing the HTML comment. This can only happen if the
   * HTML document contains errors (for example, the closing --&gt; is
   * missing). The default method reports the error "Unclosed comment".
   */
  protected void handleEOFInComment()
  {
    error("Unclosed comment");
  }

  /**
   * Handle the tag with no content, like &lt;br&gt;. The method is
   * called for the elements that, in accordance with the current DTD,
   * have an empty content. The default method returns without action.
   * @param tag The tag being handled.
   * @throws javax.swing.text.ChangedCharSetException
   */
  protected void handleEmptyTag(TagElement tag)
    throws ChangedCharSetException
  {
  }

  /**
   * The method is called when the HTML closing tag (like &lt;/table&gt;)
   * is found or if the parser concludes that the one should be present
   * in the current position. The default method returns without action.
   * @param tag The tag
   */
  protected void handleEndTag(TagElement tag)
  {
  }

  /**
   * Handle an error that has occurred in the given line. The default
   * method returns without action.
   * @param line The line number passed by the caller.
   * @param message The error message.
   */
  protected void handleError(int line, String message)
  {
  }

  /**
   * The method is called when the HTML opening tag (like &lt;table&gt;)
   * is found or if the parser concludes that the one should be present
   * in the current position. The default method returns without action.
   * @param tag The tag
   */
  protected void handleStartTag(TagElement tag)
  {
  }

  /**
   * Handle the text section.
   * <p> For non-preformatted section, the parser replaces
   * \t, \r and \n by spaces and then multiple spaces
   * by a single space. Additionally, all whitespace around
   * tags is discarded.
   * </p>
   * <p> For pre-formatted text (inside TEXTAREA and PRE), the parser
   * preserves all tabs and spaces, but removes <b>one</b> bounding \r, \n
   * or \r\n, if it is present. Additionally, it replaces each occurrence
   * of \r or \r\n by a single \n.</p>
   *
   * @param text A section text.
+ */ + protected void handleText(char[] text) + { + } + + /** + * Handle HTML <title> tag. This method is invoked when + * both title starting and closing tags are already behind. + * The passed argument contains the concatenation of all + * title text sections. + * @param title The title text. + */ + protected void handleTitle(char[] title) + { + } + + /** + * Constructs the tag from the given element. In this implementation, + * this is defined, but never called. + * @return the tag + */ + protected TagElement makeTag(Element element) + { + return makeTag(element, false); + } + + /** + * Constructs the tag from the given element. + * @param the tag base {@link javax.swing.text.html.parser.Element} + * @param isSupposed true if the tag is not actually present in the + * html input, but the parser supposes that it should to occur in + * the current location. + * @return the tag + */ + protected TagElement makeTag(Element element, boolean isSupposed) + { + return new TagElement(element, isSupposed); + } + + /** + * This is called when the tag, representing the given element, + * occurs first time in the document. + * @param element + */ + protected void markFirstTime(Element element) + { + } + + /** + * Consume the token that was checked before and hence MUST be present. + * @param kind The kind of token to consume. + */ + protected Token mustBe(int kind) + { + if (getTokenAhead().kind == kind) + return getNextToken(); + else + { + String ei = ""; + if (kind < 1000) + ei = " ('" + (char) kind + "') "; + throw new AssertionError("The token of kind " + kind + ei + + " MUST be here," + ); + } + } + + /** + * Handle attribute without value. The default method uses + * the only allowed attribute value from DTD. + * If the attribute is unknown or allows several values, + * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with + * this value is added to the attribute set. + * @param element The name of element. + * @param attribute The name of attribute without value. 
+ */ + protected void noValueAttribute(String element, String attribute) + { + Object value = HTML.NULL_ATTRIBUTE_VALUE; + + Element e = dtd.elementHash.get(element.toLowerCase()); + if (e != null) + { + AttributeList attr = e.getAttribute(attribute); + if (attr != null) + { + Vector values = attr.values; + if (values != null && values.size() == 1) + value = values.get(0); + } + } + attributes.addAttribute(attribute, value); + } + + /** + * Consume the optional token, if present. + * @param kind The kind of token to consume. + */ + protected Token optional(int kind) + { + if (getTokenAhead().kind == kind) + return getNextToken(); + else + return null; + } + + /** Parse the html document. */ + protected void parseDocument() + throws ParseException + { + // Read up any initial whitespace. + optional(WS); + while (getTokenAhead().kind != EOF) + { + advanced = false; + if (TAG.matches(this)) + Tag(); + else if (COMMENT_OPEN.matches(this)) + Comment(); + else if (STYLE_OPEN.matches(this)) + Style(); + else if (SCRIPT_OPEN.matches(this)) + Script(); + else if (SGML.matches(this)) + Sgml(); + else + CDATA(true); + + // Surely HTML error, treat as a text. + if (!advanced) + { + Token wrong = getNextToken(); + error("unexpected '" + wrong.getImage() + "'", wrong); + buffer.setLength(0); + buffer.append(wrong.getImage()); + _handleText(); + } + } + } + + /** + * Read the element attributes, adding them into attribute set. + * @param element The element name (needed to access attribute + * information in dtd). + */ + protected void readAttributes(String element) + { + Token name; + Token value; + Token next; + String attrValue; + + attributes = new htmlAttributeSet(); + + optional(WS); + + attributeReading: + while (getTokenAhead().kind == NUMTOKEN) + { + name = getNextToken(); + optional(WS); + + next = getTokenAhead(); + if (next.kind == EQ) + { + mustBe(EQ); + optional(WS); + + next = getNextToken(); + + switch (next.kind) + { + case QUOT: + + // read "quoted" attribute. 
+ buffer.setLength(0); + readTillTokenE(QUOT); + attrValue = buffer.toString(); + break; + + case AP: + + // read 'quoted' attribute. + buffer.setLength(0); + readTillTokenE(AP); + attrValue = buffer.toString(); + break; + + // read unquoted attribute. + case NUMTOKEN: + value = next; + optional(WS); + + // Check maybe the opening quote is missing. + next = getTokenAhead(); + if (bQUOTING.get(next.kind)) + { + hTag = next; + error("The value without opening quote is closed with '" + + next.getImage() + "'"); + attrValue = value.getImage(); + } + else if (next.kind == SLASH || next.kind == OTHER) + // The slash and other characters (like %) in this context is + // treated as the ordinary + // character, not as a token. The character may be part of + // the unquoted URL. + { + CPStringBuilder image = new CPStringBuilder(value.getImage()); + while (next.kind == NUMTOKEN || next.kind == SLASH + || next.kind == OTHER) + { + image.append(getNextToken().getImage()); + next = getTokenAhead(); + } + attrValue = image.toString(); + } + else + attrValue = value.getImage(); + break; + + case SLASH: + value = next; + optional(WS); + + // Check maybe the opening quote is missing. + next = getTokenAhead(); + if (bQUOTING.get(next.kind)) + { + hTag = next; + error("The value without opening quote is closed with '" + + next.getImage() + "'"); + attrValue = value.getImage(); + } + else if (next.kind == NUMTOKEN || next.kind == SLASH) + // The slash in this context is treated as the ordinary + // character, not as a token. The slash may be part of + // the unquoted URL. 
+ { + CPStringBuilder image = new CPStringBuilder(value.getImage()); + while (next.kind == NUMTOKEN || next.kind == SLASH) + { + image.append(getNextToken().getImage()); + next = getTokenAhead(); + } + attrValue = image.toString(); + } + else + attrValue = value.getImage(); + break; + default: + break attributeReading; + } + attributes.addAttribute(name.getImage(), attrValue); + optional(WS); + } + else + // The '=' is missing: attribute without value. + { + noValueAttribute(element, name.getImage()); + } + } + } + + /** + * Return string, corresponding the given named entity. The name is passed + * with the preceeding &, but without the ending semicolon. + */ + protected String resolveNamedEntity(final String a_tag) + { + // Discard & + if (!a_tag.startsWith("&")) + throw new AssertionError("Named entity " + a_tag + + " must start witn '&'." + ); + + String tag = a_tag.substring(1); + + try + { + Entity entity = dtd.getEntity(tag); + if (entity != null) + return entity.getString(); + + entity = dtd.getEntity(tag.toLowerCase()); + + if (entity != null) + { + error("The name of this entity should be in lowercase", a_tag); + return entity.getString(); + } + } + catch (IndexOutOfBoundsException ibx) + { + /* The error will be reported. */ + } + + error("Unknown named entity", a_tag); + return a_tag; + } + + /** + * Return char, corresponding the given numeric entity. + * The name is passed with the preceeding &#, but without + * the ending semicolon. + */ + protected char resolveNumericEntity(final String a_tag) + { + // Discard &# + if (!a_tag.startsWith("&#")) + throw new AssertionError("Numeric entity " + a_tag + + " must start witn '&#'." + ); + + String tag = a_tag.substring(2); + + try + { + // Determine the encoding type: + char cx = tag.charAt(0); + if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn; + + return (char) Integer.parseInt(tag.substring(1), 16); + + return (char) Integer.parseInt(tag); + } + + /* The error will be reported. 
*/ + catch (NumberFormatException nex) + { + } + catch (IndexOutOfBoundsException ix) + { + } + + error("Invalid numeric entity", a_tag); + return '?'; + } + + /** + * Reset all fields into the intial default state, preparing the + * parset for parsing the next document. + */ + protected void restart() + { + documentTags.clear(); + titleHandled = false; + titleOpen = false; + buffer.setLength(0); + title.setLength(0); + validator.restart(); + } + + /** + * The method is called when the HTML opening tag ((like <table>) + * is found or if the parser concludes that the one should be present + * in the current position. The method is called immediately before + * calling the handleStartTag. + * @param tag The tag + */ + protected void startTag(TagElement tag) + throws ChangedCharSetException + { + } + + /** + * Handle a complete element, when the tag content is already present in the + * buffer and both starting and heading tags behind. This is called + * in the case when the tag text must not be parsed for the nested + * elements (elements STYLE and SCRIPT). + */ + private void _handleCompleteElement(TagElement tag) + { + _handleStartTag(tag); + + // Suppress inclusion of the SCRIPT ans STYLE texts into the title. + HTML.Tag h = tag.getHTMLTag(); + if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE) + { + boolean tmp = titleOpen; + titleOpen = false; + _handleText(); + titleOpen = tmp; + } + else + _handleText(); + + _handleEndTag(tag); + } + + /** + * A hooks for operations, preceeding call to handleEmptyTag(). + * Handle the tag with no content, like <br>. As no any + * nested tags are expected, the tag validator is not involved. + * @param tag The tag being handled. + */ + private void _handleEmptyTag(TagElement tag) + { + try + { + validator.validateTag(tag, attributes); + handleEmptyTag(tag); + HTML.Tag h = tag.getHTMLTag(); + // When a block tag is closed, consume whitespace that follows after + // it. 
+ // For some unknown reason a FRAME tag is not treated as block element. + // However in this case it should be treated as such. + if (isBlock(h)) + optional(WS); + } + catch (ChangedCharSetException ex) + { + error("Changed charset exception:", ex.getMessage()); + } + } + + /** + * A hooks for operations, preceeding call to handleEndTag(). + * The method is called when the HTML closing tag + * is found. Calls handleTitle after closing the 'title' tag. + * @param tag The tag + */ + private void _handleEndTag(TagElement tag) + { + if (validator.closeTag(tag)) + _handleEndTag_remaining(tag); + } + + /** + * Actions that are also required if the closing action was + * initiated by the tag validator. + * Package-private to avoid an accessor method. + */ + void _handleEndTag_remaining(TagElement tag) + { + HTML.Tag h = tag.getHTMLTag(); + + handleEndTag(tag); + endTag(tag.fictional()); + + if (h.isPreformatted()) + preformatted--; + if (preformatted < 0) + preformatted = 0; + + // When a block tag is closed, consume whitespace that follows after + // it. + if (isBlock(h)) + optional(WS); + + if (h == HTML.Tag.TITLE) + { + titleOpen = false; + titleHandled = true; + + char[] a = new char[ title.length() ]; + title.getChars(0, a.length, a, 0); + handleTitle(a); + } + } + + /** + * A hooks for operations, preceeding call to handleStartTag(). + * The method is called when the HTML opening tag ((like <table>) + * is found. + * Package-private to avoid an accessor method. + * @param tag The tag + */ + void _handleStartTag(TagElement tag) + { + validator.openTag(tag, attributes); + startingTag(tag); + handleStartTag(tag); + + HTML.Tag h = tag.getHTMLTag(); + + if (isBlock(h)) + optional(WS); + + if (h.isPreformatted()) + preformatted++; + + if (h == HTML.Tag.TITLE) + { + if (titleHandled) + error("Repetetive <TITLE> tag"); + titleOpen = true; + titleHandled = false; + } + } + + /** + * Resume parsing after heavy errors in HTML tag structure. 
+ * @throws ParseException + */ + private void forciblyCloseTheTag() + throws ParseException + { + int closeAt = 0; + buffer.setLength(0); + + ahead: + for (int i = 1; i < 100; i++) + { + t = getTokenAhead(i - 1); + if (t.kind == EOF || t.kind == BEGIN) + break ahead; + if (t.kind == END) + { + /* Closing '>' found. */ + closeAt = i; + break ahead; + } + } + if (closeAt > 0) + { + buffer.append("Ignoring '"); + for (int i = 1; i <= closeAt; i++) + { + t = getNextToken(); + append(t); + } + buffer.append('\''); + error(buffer.toString()); + } + } + + /** + * Handle comment in string buffer. You can avoid allocating a char + * array each time by processing your comment directly here. + */ + private void handleComment() + { + char[] a = new char[ buffer.length() ]; + buffer.getChars(0, a.length, a, 0); + handleComment(a); + } + + private TagElement makeTagElement(String name, boolean isSupposed) + { + Element e = dtd.elementHash.get(name.toLowerCase()); + if (e == null) + { + error("Unknown tag <" + name + ">"); + e = dtd.getElement(name); + e.name = name.toUpperCase(); + e.index = -1; + } + + if (!documentTags.contains(e.name)) + { + markFirstTime(e); + documentTags.add(e.name); + } + + return makeTag(e, isSupposed); + } + + /** + * Read till the given token, resolving entities. Consume the given + * token without adding it to buffer. 
+ * @param till The token to read till + * @throws ParseException + */ + private void readTillTokenE(int till) + throws ParseException + { + buffer.setLength(0); + read: + while (true) + { + t = getNextToken(); + if (t.kind == Constants.ENTITY) + { + resolveAndAppendEntity(t); + } + else if (t.kind == EOF) + { + error("unexpected eof", t); + break read; + } + else if (t.kind == till) + break read; + else if (t.kind == WS) + { + // Processing whitespace in accordance with CDATA rules: + String s = t.getImage(); + char c; + for (int i = 0; i < s.length(); i++) + { + c = s.charAt(i); + if (c == '\r') + buffer.append(' '); // CR replaced by space + else if (c == '\n') + { /* LF ignored */ } + else if (c == '\t') + buffer.append(' '); // Tab replaced by space + else + buffer.append(c); + } + } + else + append(t); + } + } + + /** + * Resolve the entity and append it to the end of buffer. + * @param entity + */ + private void resolveAndAppendEntity(Token entity) + { + switch (entity.category) + { + case ENTITY_NAMED : + buffer.append(resolveNamedEntity(entity.getImage())); + break; + + case ENTITY_NUMERIC : + buffer.append(resolveNumericEntity(entity.getImage())); + break; + + default : + throw new AssertionError("Invalid entity category " + + entity.category + ); + } + } + + /** + * Handle the remaining of HTML tags. This is a common end for + * TAG, SCRIPT and STYLE. + * @param closing True for closing tags ( </TAG> ). + * @param name Name of element + * @param start Token where element has started + * @throws ParseException + */ + private void restOfTag(boolean closing, Token name, Token start) + throws ParseException + { + boolean end = false; + Token next; + + optional(WS); + + readAttributes(name.getImage()); + + optional(WS); + + next = getTokenAhead(); + if (next.kind == END) + { + mustBe(END); + end = true; + } + + hTag = new Token(start, next); + + if (!end) + { + // The tag body contains errors. 
If additionally the tag + // name is not valid, this construction is treated as text. + if (dtd.elementHash.get(name.getImage().toLowerCase()) == null && + backupMode + ) + { + error("Errors in tag body and unknown tag name. " + + "Treating the tag as a text." + ); + reset(); + + hTag = mustBe(BEGIN); + buffer.setLength(0); + buffer.append(hTag.getImage()); + CDATA(false); + return; + } + else + { + error("Forcibly closing invalid parameter list"); + forciblyCloseTheTag(); + } + } + + if (closing) + { + endTag(false); + _handleEndTag(makeTagElement(name.getImage(), false)); + } + else + { + TagElement te = makeTagElement(name.getImage(), false); + if (te.getElement().type == DTDConstants.EMPTY) + _handleEmptyTag(te); + else + { + // According to the specs we need to consume whitespace following + // immediately after a opening tag. + optional(WS); + _handleStartTag(te); + } + } + } + + /** + * This should fire additional actions in response to the + * ChangedCharSetException. The current implementation + * does nothing. + * @param tag + */ + private void startingTag(TagElement tag) + { + try + { + startTag(tag); + } + catch (ChangedCharSetException cax) + { + error("Invalid change of charset"); + } + } + + private void ws_error() + { + error("Whitespace here is not permitted"); + } + + /** + * Returns true when the specified tag should be considered a block tag + * wrt whitespace handling. We need this special handling, since there + * are a couple of tags that we must treat as block tags but which aren't + * officially block tags. 
+ * + * @param tag the tag to check + * @return true when the specified tag should be considered a block tag + * wrt whitespace handling + */ + private boolean isBlock(HTML.Tag tag) + { + return tag.isBlock() || tag == HTML.Tag.STYLE || tag == HTML.Tag.FRAME; + } +} diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/gnuStringIntMapper.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/gnuStringIntMapper.java new file mode 100644 index 000000000..9cdf810dd --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/gnuStringIntMapper.java @@ -0,0 +1,112 @@ +/* gnuStringIntMapper.java -- + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. 
/**
 * A helper class, mapping between strings and their unique integer
 * identifiers. Both lookup tables are built lazily, on the first call of
 * either <code>get</code> method, by invoking {@link #create()}.
 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
 */
public abstract class gnuStringIntMapper
{
  /**
   * Maps argument integer values from DTDConstants into their string
   * names. Initialized on demand.
   */
  private Map<Integer, String> is_Map;

  /**
   * Maps argument string names into their integer values from DTDConstants.
   * Initialized on demand.
   */
  private Map<String, Integer> si_Map;

  /**
   * Get string from id or null if no such id is present in the mapper.
   * @param id the integer identifier to look up
   * @return the name registered for the id, or null if there is none
   */
  public final String get(int id)
  {
    if (is_Map == null)
      createTheMap();

    return is_Map.get(Integer.valueOf(id));
  }

  /**
   * Get id from string or 0 if no such string is present in the mapper.
   * A null argument is treated as an unknown string and also yields 0.
   * @param id the name to look up
   * @return the identifier registered for the name, or 0 if there is none
   */
  public final int get(String id)
  {
    if (si_Map == null)
      createTheMap();

    // The name table is a TreeMap, which rejects null keys with a
    // NullPointerException; a null name can never have been registered,
    // so report it as "not present".
    if (id == null)
      return 0;

    Integer i = si_Map.get(id);

    return i != null ? i.intValue() : 0;
  }

  /**
   * Create the mapping table for this mapper by adding the required
   * String/int pairs. The method is invoked only once for each instance,
   * after the first invocation of any form of the <code>get</code>
   * method. Use <code>add</code> to create a map for a concrete instance.
   */
  protected abstract void create();

  /**
   * Add an id/string pair to this mapper. This is called from
   * the method <code>create</code> only; the tables do not exist before
   * the mapper has started its lazy initialization.
   * @param name the string name
   * @param id the integer identifier paired with the name
   */
  protected void add(String name, int id)
  {
    Integer i = Integer.valueOf(id);
    si_Map.put(name, i);
    is_Map.put(i, name);
  }

  /**
   * Allocate both lookup tables and let the subclass populate them.
   */
  private void createTheMap()
  {
    is_Map = new HashMap<Integer, String>();
    si_Map = new TreeMap<String, Integer>();
    create();
  }
}
+ +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support.low; + +/** + * A string buffer that additionally holds line and absolute postion + * information. + * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) + */ +public class Buffer +{ + public static int INITIAL_SIZE = 2048; + + /** + * True if the \n symbol has been seen. + */ + public boolean n_seen; + + /** + * True if the \r symbol has been seen. + */ + public boolean r_seen; + char[] chr = new char[ INITIAL_SIZE ]; + int[] line = new int[ INITIAL_SIZE ]; + int[] position = new int[ INITIAL_SIZE ]; + + /** + * Current line. + */ + int current_line = 0; + + /** + * Point to the next free position. + */ + int length; + + public Buffer() + { + } + + public Buffer(String content) + { + for (int i = 0; i < content.length(); i++) + { + append(content.charAt(i), i); + } + } + + /** + * Get the characters into array. + * @param srcBegin From, inclusive + * @param srcEnd To, exclusive. + * @param dst Into + * @param dstBegin Offset. + */ + public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin) + { + System.arraycopy(chr, srcBegin, dst, dstBegin, (srcEnd - srcBegin)); + } + + /** + * Return the sequence, used to separate lines in the document. 
+ * @return one of \n, \r or \r\n. + */ + public String getEndOfLineSequence() + { + if (r_seen && n_seen) + return "\r\n"; + else if (r_seen) + return "\r"; + else + + // This also is returned for single-line document. + return "\n"; + } + + /** + * Truncate. + * @param n The length to truncate till. + */ + public void setLength(int n) + { + length = n; + } + + /** + * Get location information for the given region. + * @param from Region start, inclusive. + * @param to Region end, exclusive. + * @return The location, covering the region. + */ + public Location getLocation(int from, int to) + { + Location l = new Location(); + l.beginLine = line [ from ]; + l.endLine = line [ to - 1 ]; + + l.startPosition = position [ from ]; + l.endPosition = position [ to - 1 ] + 1; + + return l; + } + + /** + * Add the character. + * @param c The character. + * @param pos The character position in the stream (the line number + * is handled internally in the buffer). + */ + public void append(char c, int pos) + { + if (length >= chr.length) + expand(); + chr [ length ] = c; + position [ length ] = pos; + + if (c == '\n') + { + if (!r_seen) + current_line++; + n_seen = true; + } + else if (c == '\r') + { + current_line++; + r_seen = true; + } + + line [ length ] = current_line; + + length++; + } + + /** + * Return char at the given positon. + */ + public char charAt(int i) + { + return chr [ i ]; + } + + /** + * Delete the range + * @param from Start position, inclusive. + * @param to End position, exclusive. + */ + public void delete(int from, int to) + { + int len = to - from; + if (len < 1) + throw new AssertionError("Deleting " + from + " till " + to); + + int tail = length - to; + + System.arraycopy(chr, to, chr, from, tail); + System.arraycopy(position, to, position, from, tail); + System.arraycopy(line, to, line, from, tail); + length = length - len; + } + + /** + * Double the buffer size. 
+ */ + public void expand() + { + int nSize = 2 * chr.length; + + char[] nchr = new char[ nSize ]; + int[] nposition = new int[ nSize ]; + int[] nline = new int[ nSize ]; + + System.arraycopy(chr, 0, nchr, 0, chr.length); + System.arraycopy(position, 0, nposition, 0, position.length); + System.arraycopy(line, 0, nline, 0, line.length); + + chr = nchr; + position = nposition; + line = nline; + } + + /** + * Return length of the occupied part of the buffer. + */ + public int length() + { + return length; + } + + /** + * Prepare for parsing the new document. + */ + public void reset() + { + setLength(0); + r_seen = n_seen = false; + } + + public String toString() + { + return new String(chr, 0, length); + } +} diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/Constants.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/Constants.java new file mode 100644 index 000000000..5416582ad --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/Constants.java @@ -0,0 +1,433 @@ +/* Constants.java -- + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. 
Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support.low; + +import java.util.BitSet; + +/** + * The parser constants and operations, directly related to the parser + * constants. + * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) + */ +public class Constants +{ + /* Single character tokens are reflected into they ASCII codes. */ + + /** + * Start of HTML token. + */ + public static final int BEGIN = '<'; + + /** + * End of HTML token. + */ + public static final int END = '>'; + + /** + * Exclamation (indicates SGML or comment). + */ + public static final int EXCLAMATION = '!'; + + /** + * Slash (indicates closing tag). + */ + public static final int SLASH = '/'; + + /** + * Equals sign. + */ + public static final int EQ = '='; + + /** + * Quoting sign. + */ + public static final int AP = '\''; + + /** + * Quoting sign. + */ + public static final int QUOT = '"'; + + /* The numbers of other tokens start outside the ascii space. */ + /* String tokens */ + + /** + * Double dash (--) + */ + public static final int DOUBLE_DASH = 1000; + + /** + * The STYLE tag (needs special handling). 
+ */ + public static final int STYLE = 1001; + + /** + * The SCRIPT tag (needs special handling). + */ + public static final int SCRIPT = 1002; + + /* Pattern tokens */ + + /** + * HTML whitespace. + */ + public static final int WS = 1003; + + /** + * Named or numeric entity, + */ + public static final int ENTITY = 1004; + + /** + * Sequence of valid name characters (can start from digit). + */ + public static final int NUMTOKEN = 1005; + + /* Complex tokens */ + + /** + * Comment opening sequence. + */ + public static final pattern COMMENT_OPEN = + new pattern(new node[] + { + new node(BEGIN), new node(WS, true), new node(EXCLAMATION), + new node(WS, true), new node(DOUBLE_DASH), + } + ); + + /** + * Comment closing sequence + */ + public static final pattern COMMENT_END = + new pattern(new node[] + { + new node(DOUBLE_DASH), new node(WS, true), new node(END) + } + ); + + /** + * Special case ---> (also is treated as end of comment). + */ + public static final pattern COMMENT_TRIPLEDASH_END = + new pattern(new node[] + { + new node(DOUBLE_DASH), new node(NUMTOKEN), new node(END) + } + ); + + /** + * STYLE element heading pattern. + */ + public static final pattern STYLE_OPEN = + new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(STYLE) }); + + /** + * SCRIPT element heading pattern. + */ + public static final pattern SCRIPT_OPEN = + new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(SCRIPT) }); + + /** + * SGML element heading pattern. + */ + public static final pattern SGML = + new pattern(new node[] + { + new node(BEGIN), new node(WS, true), new node(EXCLAMATION) + } + ); + + /** + * SCRIPT element closing pattern. + */ + public static final pattern SCRIPT_CLOSE = + new pattern(new node[] + { + new node(BEGIN), new node(WS, true), new node(SLASH), + new node(WS, true), new node(SCRIPT), new node(WS, true), + new node(END) + } + ); + + /** + * STYLE element closing pattern. 
+ */ + public static final pattern STYLE_CLOSE = + new pattern(new node[] + { + new node(BEGIN), new node(WS, true), new node(SLASH), + new node(WS, true), new node(STYLE), new node(WS, true), + new node(END) + } + ); + + /** + * Ordinary HTML tag heading pattern. + */ + public static final pattern TAG = + new pattern(new node[] + { + new node(BEGIN), new node(WS, true), new node(SLASH, true), + new node(WS, true), new node(NUMTOKEN) + } + ); + + /** + * Ordinary HTML tag closing pattern. + */ + public static final pattern TAG_CLOSE = + new pattern(new node[] + { + new node(BEGIN), new node(WS, true), new node(SLASH), + new node(WS, true), new node(NUMTOKEN) + } + ); + + /* Special tokens */ + + /** + * All other tokens. + */ + public static final int OTHER = 1999; + + /** + * The UNICODE "end of text" control code + */ + static final char ETX = 3; + + /** + * End of file. + */ + public static final int EOF = ETX; + + /* Character categories */ + + /** + * All single char tokens. + */ + public static final BitSet bSINGLE_CHAR_TOKEN = new BitSet(); + + /** + * Non letters and non numbers, allowed in HTML names. + */ + public static final BitSet bSPECIAL = new BitSet(); + + /** + * All letters, used in HTML names. + */ + public static final BitSet bLETTER = new BitSet(); + + /** + * Digits. + */ + public static final BitSet bDIGIT = new BitSet(); + + /** + * Both line breaks. + */ + public static final BitSet bLINEBREAK = new BitSet(); + + /** + * All whitespace. + */ + public static final BitSet bWHITESPACE = new BitSet(); + + /** + * Both quoting characters. + */ + public static final BitSet bQUOTING = new BitSet(); + + /** + * Valid name characters. + */ + public static final BitSet bNAME = new BitSet(); + + /* Entity subcategories */ + + /** + * Named entity. + */ + public static final int ENTITY_NAMED = 1; + + /** + * Numeric entity. 
+ */ + public static final int ENTITY_NUMERIC = 2; + + static + { + bQUOTING.set(AP); + bQUOTING.set(QUOT); + + bSINGLE_CHAR_TOKEN.set(BEGIN); + bSINGLE_CHAR_TOKEN.set(END); + bSINGLE_CHAR_TOKEN.set(EXCLAMATION); + bSINGLE_CHAR_TOKEN.set(SLASH); + bSINGLE_CHAR_TOKEN.set(EQ); + bSINGLE_CHAR_TOKEN.set(EOF); + + bSINGLE_CHAR_TOKEN.or(bQUOTING); + + bLINEBREAK.set('\r'); + bLINEBREAK.set('\n'); + + bWHITESPACE.set(' '); + bWHITESPACE.set('\t'); + bWHITESPACE.set(0xC); + bWHITESPACE.or(bLINEBREAK); + + for (char i = '0'; i <= '9'; i++) + { + bDIGIT.set(i); + } + + for (char i = 'a'; i <= 'z'; i++) + { + bLETTER.set(i); + } + + for (char i = 'A'; i <= 'Z'; i++) + { + bLETTER.set(i); + } + + bSPECIAL.set('-'); + bSPECIAL.set('_'); + bSPECIAL.set(':'); + bSPECIAL.set('.'); + + bNAME.or(bLETTER); + bNAME.or(bDIGIT); + bNAME.or(bSPECIAL); + } + + /** + * Verifies if one of the tokens matches the end of string + * buffer. The last character in the string buffer is the + * "future character", some tokens needs to verify it the + * token does not continue "towards the future". If the token + * matches, it matches till "pre-last" character in the buffer. 
+ * @param b + * @return + */ + public Token endMatches(Buffer b) + { + if (b.length() < 2) + return null; + + int p = b.length() - 2; + + if (b.length() > 2 && b.charAt(p) == '-' && b.charAt(p - 1) == '-') + return new Token(DOUBLE_DASH, "--", b.getLocation(p - 1, p + 1)); + + char last = b.charAt(p); + + if (bSINGLE_CHAR_TOKEN.get(last)) + return new Token(last, last, b.getLocation(p, p + 1)); + + char future = b.charAt(p + 1); + + // Check for numtokens, script and style: + if (bNAME.get(last) && !bNAME.get(future)) + { + // Scan the history up: + int u = p - 1; + while (u >= 0 && bNAME.get(b.charAt(u))) + u--; + u++; + + char[] token = new char[ p - u + 1 ]; + + // Found a numtoken + b.getChars(u, p + 1, token, 0); + + // Verify for the built-in tokens: + String e = new String(token); + + // found the entity reference + if (u > 0 && b.charAt(u - 1) == '&') + { + // The subsequent semicolon may be the part of the token + // as well. The semicolon must be ignored. This must be + // handled elsewhere. + return new Token(ENTITY, ENTITY_NAMED, "&" + e, + b.getLocation(u - 1, p + 1) + ); + } + + // found the numeric entity reference + if (u > 1 && b.charAt(u - 1) == '#' && b.charAt(u - 2) == '&') + { + // The subsequent semicolon may be the part of the token + // as well. The semicolon must be ignored. This must be + // handled elsewhere. 
+ return new Token(ENTITY, ENTITY_NUMERIC, "&#" + e, + b.getLocation(u - 2, p + 2) + ); + } + + Location le = b.getLocation(u, p + 1); + + if (e.equalsIgnoreCase("SCRIPT")) + return new Token(SCRIPT, e, le); + else if (e.equalsIgnoreCase("STYLE")) + return new Token(STYLE, e, le); + else + return new Token(NUMTOKEN, e, le); + } + + // Check for whitespace + if (bWHITESPACE.get(last) && !bWHITESPACE.get(future)) + { + // Scan the history up: + int u = p - 1; + while (u >= 0 && bWHITESPACE.get(b.charAt(u))) + u--; + u++; + + char[] token = new char[ p - u + 1 ]; + b.getChars(u, p + 1, token, 0); + + return new Token(WS, new String(token), b.getLocation(u, p + 1)); + } + + return null; + } +} diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/Location.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/Location.java new file mode 100644 index 000000000..8a1cde1c8 --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/Location.java @@ -0,0 +1,83 @@ +/* Location.java -- + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. 
/**
 * Describes a region of the input text by its absolute start and end
 * positions and by the lines on which it starts and ends.
 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
 */
public class Location
{
  /**
   * Number of the line on which the region begins.
   */
  public int beginLine;

  /**
   * Number of the line on which the region ends.
   */
  public int endLine;

  /**
   * Absolute position in the input stream just past the last character
   * of the region (exclusive bound).
   */
  public int endPosition;

  /**
   * Absolute position in the input stream of the first character of the
   * region (inclusive bound).
   */
  public int startPosition;

  /**
   * Creates a location with all fields left at zero.
   */
  public Location()
  {
  }

  /**
   * Creates the special location used to mark EOF: a one character wide
   * region at the given position, with both line numbers set to -1.
   * @param p The total stream length.
   */
  public Location(int p)
  {
    this.beginLine = -1;
    this.endLine = -1;
    this.startPosition = p;
    this.endPosition = p + 1;
  }
}
/**
 * An unchecked exception that the various parsing methods may throw.
 */
public class ParseException
  extends RuntimeException
{
  /**
   * Creates the exception with the given detail message and cause.
   * @param message the detail message
   * @param cause the underlying problem, if any
   */
  public ParseException(String message, Throwable cause)
  {
    super(message, cause);
  }
}
+ +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support.low; + +import java.util.Arrays; + +/** + * A token queue. + * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) + */ +public class Queue +{ + Token[] m = new Token[ 64 ]; + int a = 0; + int b = 0; + + /** + * True for the empty queue. + */ + public boolean isEmpty() + { + return size() == 0; + } + + /** + * Add this trace to the end of the queue. + */ + public void add(Token u) + { + if (a < m.length) + { + m [ a ] = u; + a++; + } + else // The end of array has been reached. + { + if (b > 0) // If some elements were deleted from the start of the queue, shift. + { + int d = b; + System.arraycopy(m, b, m, 0, a - b); + b = b - d; + a = a - d; + m [ a ] = u; + a++; + } + else // Enlarge the queue, doubling the size. + { + int n = m.length * 2; + Token[] nm = new Token[ 2 * n ]; + System.arraycopy(m, 0, nm, 0, m.length); + Arrays.fill(m, null); + + nm [ a ] = u; + m = nm; + a++; + } + } + } + + /** + * Clear the queue. + */ + public void clear() + { + a = b = 0; + Arrays.fill(m, null); + } + + /** + * Read the value ahead. 0 is the value that will be returned with + * the following next. This method does not remove values from the + * queue. 
To test if there is enough tokens in the queue, size() must + * be checked before calling this method. + */ + public Token get(int ahead) + { + int p = b + ahead; + if (p < a) + return m [ p ]; + else + throw new ArrayIndexOutOfBoundsException("Not enough tokens"); + } + + /** + * Read the oldest value from the queue and remove this value from + * the queue. + */ + public Token next() + { + if (a == b) + throw new ArrayIndexOutOfBoundsException("queue empty"); + + Token r = m [ b ]; + m [ b ] = null; + b++; + return r; + } + + /** + * Size of the queue. + */ + public int size() + { + return a - b; + } +} diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/ReaderTokenizer.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/ReaderTokenizer.java new file mode 100644 index 000000000..45ac181b3 --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/ReaderTokenizer.java @@ -0,0 +1,373 @@ +/* ReaderTokenizer.java -- splits the input char sequence int tokens. + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. 
Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support.low; + +import java.io.IOException; +import java.io.Reader; + +/** + * Reader splits the input char sequence into tokens. + * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) + */ +public class ReaderTokenizer + extends Constants +{ + /** + * This is set to true each time the getNextToken is called. + * Used in preventing loops when all patterns refuse to accept + * the invalid input. + */ + protected boolean advanced; + + /** + * If true, the returned tokens are also placed in the backup + * queue. + */ + protected boolean backupMode; + + /** + * The buffer to read document into. + */ + Buffer buffer = new Buffer(); + + /** + * The queue for supporting mark(). + */ + Queue backup = new Queue(); + + /** + * The queue of found tokens. + */ + Queue queue = new Queue(); + + /** + * The reader to read the document from. + */ + Reader reader; + + /** + * Array of char tokens + */ + char[] charTokens; + + /** + * Array of string tokens. + */ + String[] stringTokens; + + /** + * The current reader position. + */ + int readerPosition = -1; + + /** + * Creates a new ReaderTokenizer. The reset(...) 
method must be + * subsequently called to set the reader. + */ + public ReaderTokenizer() + { + } + + /** + * Return the sequence, used to separate lines in the document. + * @return one of \n, \r or \r\n. + */ + public String getEndOfLineSequence() + { + return buffer.getEndOfLineSequence(); + } + + /** + * Get the next token. + * @return + */ + public Token getNextToken() + { + Token rt; + advanced = true; + try + { + if (queue.isEmpty()) + read(1); + + if (!queue.isEmpty()) + rt = queue.next(); + else + rt = new Token(EOF, new Location(readerPosition)); + } + catch (IOException ex) + { + throw new ParseException("IO Exception", ex); + } + if (backupMode) + backup.add(rt); + return rt; + } + + /** + * Get a token, lying the given number of tokens + * ahead. getToken(0) will return the same token, + * what would be returned by getNextToken(). + * getToken(..) does change the current position + * in the input stream. If the end of stream is + * reached, the EOF token is always returned. + */ + public Token getTokenAhead(int ahead) + { + try + { + read(ahead - queue.size() + 1); + return queue.size() >= ahead ? queue.get(ahead) : eofToken(); + } + catch (IOException ex) + { + throw new ParseException("IO Exception", ex); + } + } + + /** + * Get a token, bein immediatley ahead. + * If the end of stream is + * reached, the EOF token is always returned. + * The method is equivalent calling getTokenAhead(0). + */ + public Token getTokenAhead() + { + try + { + if (queue.isEmpty()) + read(1); + if (!queue.isEmpty()) + return queue.get(0); + else + return eofToken(); + } + catch (IOException ex) + { + throw new ParseException("IO Exception", ex); + } + } + + /** + * Invokes the error handler. + */ + public void error(String msg, Token at) + { + System.out.println(msg); + } + + /** + * Turns the backup mode on or off. + * It is possible to return where the mark(true) was last called + * by calling reset(). 
+ * @param mode True if it is required to save tokens, making + * returning to the current point possible. + */ + public void mark(boolean mode) + { + backup.clear(); + backupMode = mode; + } + + /** + * Prepare for new parsing from the given stream. + * @param a_reader A reader to parse from. + */ + public void reset(Reader a_reader) + { + reader = a_reader; + readerPosition = -1; + buffer.reset(); + queue.clear(); + } + + /** + * Reset the internal cursor to the position where the mark() + * was last time called. Switches the backup mode off. + */ + public void reset() + { + if (!backupMode) + throw new AssertionError("Call mark(true) before using reset()!"); + backupMode = false; + + // That is now in the queue, will be appended to the end of backup. + while (!queue.isEmpty()) + backup.add(queue.next()); + + Queue t = queue; + queue = backup; + backup = t; + backup.clear(); + } + + /** + * Read the given number of the tokens. Add the needed number of EOF + * tokens if there are no more data in the stream. + * @param numberOfTokens The number of additional tokens to read. 
+ */ + void read(int numberOfTokens) + throws IOException + { + if (numberOfTokens <= 0) + return; + + for (int i = 0; i < numberOfTokens; i++) + readToken(); + } + + /** + * Read next token from the reader, add it to the queue + */ + void readToken() + throws IOException + { + Token t; + int ch; + + enlarging: + while (true) + { + t = tokenMatches(); + if (t != null) + break enlarging; + else + { + ch = reader.read(); + readerPosition++; + if (ch == ETX) + ch = ' '; + if (ch < 0) + { + if (buffer.length() == 0) + { + queue.add(eofToken()); + return; + } + else + { + if (buffer.charAt(buffer.length() - 1) != ETX) + buffer.append(ETX, readerPosition++); + else + { + // Discard terminating ETX + buffer.setLength(buffer.length() - 1); + if (buffer.length() > 0) + { + t = new Token(OTHER, buffer.toString(), + buffer.getLocation(0, buffer.length()) + ); + queue.add(t); + buffer.setLength(0); + } + return; + } + } + } + else + buffer.append((char) ch, readerPosition); + } + } + } + + /** + * Check if the end of buffer matches one of the tokens. If it does, + * return this token and remove the token sequence from the end of + * buffer. + * @return The matching token. + */ + Token tokenMatches() + { + Token rt = endMatches(buffer); + if (rt != null) // Remove the matched image + { + // Consume future character if it was an entity and the future + // character is semicolon. + if (rt.kind == ENTITY) + { + if (buffer.charAt(buffer.length() - 1) == ';') + buffer.setLength(buffer.length() - rt.getImage().length() - 1); + else + { + error("Missing closing semicolon for entity '" + rt.getImage() + + "'", rt + ); + consumeBuffer(rt); + } + } + else + { + consumeBuffer(rt); + } + } + + // If the buffer is not empty, some sequence does not match any tokens. + // Add it to the queue as "OTHER". 
+ if (rt != null) + { + if (buffer.length() > 1) + { + String rest = buffer.toString(); + rest = rest.substring(0, rest.length() - 1); + + Token other = + new Token(OTHER, rest, buffer.getLocation(0, buffer.length)); + queue.add(other); + consumeBuffer(other); + } + queue.add(rt); + } + return rt; + } + + private void consumeBuffer(Token rt) + { + buffer.delete(buffer.length() - rt.getImage().length() - 1, + buffer.length() - 1 + ); + } + + /** + * Create EOF token. + */ + private Token eofToken() + { + return new Token(EOF, "#", new Location(readerPosition)); + } +} diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/Token.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/Token.java new file mode 100644 index 000000000..d91adf47a --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/Token.java @@ -0,0 +1,169 @@ +/* Token.java -- + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. 
+ +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support.low; + +/** + * A token. + * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) + */ +public class Token +{ + /** + * The place of this token in the document. + */ + public Location where; + + /** + * The additional category of token. + */ + public int category; + + /** + * An integer that describes the kind of this token. + */ + public int kind; + + /** + * The string image of the token, null if the char image must be used. + */ + private String stringImage; + + /** + * The char image of the token. + */ + private char charImage; + + /** + * Creates a new token with fields, initialized to the default values. + */ + public Token() + { + } + + /** + * Creates a new token of the given kind. + */ + public Token(int _kind, Location _where) + { + kind = _kind; + where = _where; + } + + /** + * Creates a new token of the given kind and given single char image. + */ + public Token(int _kind, char _image, Location _where) + { + kind = _kind; + charImage = _image; + where = _where; + } + + /** + * Creates a new token of the given kind and given string image. 
+ */ + public Token(int _kind, String _image, Location _where) + { + kind = _kind; + stringImage = _image; + where = _where; + } + + /** + * Creates a new token of the given kind, category and given string image. + */ + public Token(int _kind, int _category, String _image, Location _where) + { + kind = _kind; + category = _category; + stringImage = _image; + where = _where; + } + + /** + * Creates a new token, where location fields are set as for token, + * spanning over two provided tokens and any tokens between them. + * The image field is initialized to null, the kind field is set to -1. + */ + public Token(Token fromInclusive, Token toInclusive) + { + where = new Location(); + where.beginLine = fromInclusive.where.beginLine; + where.startPosition = fromInclusive.where.startPosition; + + where.endLine = toInclusive.where.endLine; + where.endPosition = toInclusive.where.endPosition; + } + + public String getImage() + { + if (kind == 3) + return "#"; + if (stringImage == null) + { + if (charImage == 0) + return null; + stringImage = new String(new char[] { charImage }); + } + return stringImage; + } + + /** + * Append the token image to the given string buffer. + * This may be more effective that buffer.append(this.getImage()). + * @param buffer A buffer to append. + */ + public void appendTo(StringBuffer buffer) + { + if (charImage == 0) + buffer.append(getImage()); + else + buffer.append(charImage); + } + + /** + * Returns the string image or, if null, the bounding positions. + */ + public String toString() + { + return getImage() != null ? kind + "'" + getImage() + : "<line " + where.beginLine + ", abs pos " + where.startPosition + + ".." 
+ where.endPosition + ">"; + } +} diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/node.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/node.java new file mode 100644 index 000000000..b54ed86a3 --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/node.java @@ -0,0 +1,78 @@ +/* node.java -- + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. 
/**
 * A text level content model node. The only required unary operations
 * here are "appears" and "optionally appears" ('?').
 * <p>@author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)</p>
 */
public class node
{
  /**
   * True for node that is optional for the given position.
   */
  public boolean optional;

  /**
   * The kind of the token to match.
   */
  public int kind;

  /**
   * Creates the new node for matching a given kind of the token.
   *
   * @param kind The kind of the token to match.
   * @param _optional True if the token may be absent at this position,
   * false if it must appear exactly one time.
   */
  public node(int kind, boolean _optional)
  {
    this.kind = kind;
    optional = _optional;
  }

  /**
   * Creates the node, indicating that token must match exactly one time.
   *
   * @param kind The kind of token to match.
   */
  public node(int kind)
  {
    this(kind, false);
  }
}
+ +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. 
--> + +<html> +<head><title>GNU Classpath - gnu.javax.swing.text.html.parser.support.low</title></head> + +<body> +<p>This package contains classes that are directly used to process +the text input: adapted stream tokenizer, specialized buffer and text-level content models .</p> +@author Audrius Meskauskas, Lithuania +</body> +</html> diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/pattern.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/pattern.java new file mode 100644 index 000000000..0fe03fdbe --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/low/pattern.java @@ -0,0 +1,105 @@ +/* pattern.java -- + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. 
+ +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support.low; + + +/** + * The simple pattern, consisting from the sequence of tokens that + * may have the unary modifier '?'. Choices and grouping + * are not required here. + * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) + */ +public class pattern +{ + /** + * The nodes of this pattern. + */ + public final node[] nodes; + + /** + * Create a pattern, containing the given list of nodes. + * @param a_nodes + */ + public pattern(node[] a_nodes) + { + nodes = a_nodes; + } + + /** + * Checks if the pattern can match the tokens in this + * tokenizer. Does not change the state of tokenizer. + * @param stream The tokenizer to read data from + * @return True if the pattern sequence matches the + * beginning of the tokenizer content. 
+ */ + public boolean matches(ReaderTokenizer stream) + { + try + { + int pt = 0; + int pn = 0; + Token t; + node n; + + while (pn < nodes.length) + { + n = nodes [ pn ]; + t = stream.getTokenAhead(pt); + + if (t.kind == n.kind) + { + pn++; + pt++; + } + else + { + if (!n.optional) + return false; + else + pn++; + } + } + return true; + } + catch (Exception ex) + { + throw new ParseException("Exception", ex); + } + } +} diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/package.html b/libjava/classpath/gnu/javax/swing/text/html/parser/support/package.html new file mode 100644 index 000000000..97c6439b3 --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/package.html @@ -0,0 +1,47 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> +<!-- package.html - describes classes in javax.swing.text.html.parser package. + Copyright (C) 2002 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. 
+ +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. --> + +<html> +<head><title>GNU Classpath - gnu.javax.swing.text.html.parser.support</title></head> + +<body> +<p>This package provides various specialised classes, needed by HTML parser. +</p> +@author Audrius Meskauskas, Lithuania +</body> +</html> diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/parameterDefaulter.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/parameterDefaulter.java new file mode 100644 index 000000000..43c07572a --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/parameterDefaulter.java @@ -0,0 +1,106 @@ +/* parameterDefaulter.java -- + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support; + +import gnu.javax.swing.text.html.parser.htmlAttributeSet; + +import java.util.Hashtable; + +import javax.swing.text.html.parser.AttributeList; +import javax.swing.text.html.parser.DTD; +import javax.swing.text.html.parser.Element; + +/** + * Returns an attribute set, containing default + * parameters for the given element. Caches sets of default + * parameters. + * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) + */ +public class parameterDefaulter +{ + public final DTD dtd; + Hashtable sets = new Hashtable(); + + /** + * Create a parameterDefaulter that looks for the default attribute + * values in the given DTD. 
+ * @param a_dtd + */ + public parameterDefaulter(DTD a_dtd) + { + dtd = a_dtd; + } + + /** + * Get the default parameter set for the given element. + * @param element The element name (case insensitive). + * @return the default attrbute set. + */ + public htmlAttributeSet getDefaultParameters(String element) + { + String key = element.toLowerCase(); + htmlAttributeSet atts = (htmlAttributeSet) sets.get(key); + + if (atts == null) + { + htmlAttributeSet set = new htmlAttributeSet(); + Element e = dtd.elementHash.get(element.toLowerCase()); + + if (e != null) + { + AttributeList a = e.getAttributes(); + + while (a != null) + { + if (a.value != null) + set.addAttribute(a.name, a.value); + a = a.next; + } + } + + if (set.getAttributeCount() > 0) + sets.put(key, set); + else + sets.put(key, htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET); + + atts = set; + } + return atts; + } +} diff --git a/libjava/classpath/gnu/javax/swing/text/html/parser/support/textPreProcessor.java b/libjava/classpath/gnu/javax/swing/text/html/parser/support/textPreProcessor.java new file mode 100644 index 000000000..22c44be4f --- /dev/null +++ b/libjava/classpath/gnu/javax/swing/text/html/parser/support/textPreProcessor.java @@ -0,0 +1,189 @@ +/* textPreProcessor.java -- + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. 
If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.javax.swing.text.html.parser.support; + +import gnu.javax.swing.text.html.parser.support.low.Constants; + +/** + * Pre - processes text in text parts of the html document. + * + * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) + */ +public class textPreProcessor +{ + /** + * Pre - process non-preformatted text. \t, \r and \n mutate into spaces, then + * multiple spaces mutate into single one, all whitespace around tags is + * consumed. The content of the passed buffer is destroyed. + * + * @param a_text A text to pre-process. 
+ */ + public char[] preprocess(StringBuffer a_text) + { + if (a_text.length() == 0) + return null; + + char[] text = toCharArray(a_text); + + int a = 0; + int b = text.length - 1; + + // Remove leading/trailing whitespace, leaving at most one character + int len = text.length; + while (a + 1 < len && Constants.bWHITESPACE.get(text[a]) + && Constants.bWHITESPACE.get(text[a + 1])) + a++; + + while (b > a && Constants.bWHITESPACE.get(text[b]) + && Constants.bWHITESPACE.get(text[b - 1])) + b--; + + a_text.setLength(0); + + boolean spacesWere = false; + boolean spaceNow; + char c; + + chars: for (int i = a; i <= b; i++) + { + c = text[i]; + spaceNow = Constants.bWHITESPACE.get(c); + if (spacesWere && spaceNow) + continue chars; + if (spaceNow) + a_text.append(' '); + else + a_text.append(c); + spacesWere = spaceNow; + } + + if (a_text.length() == text.length) + { + a_text.getChars(0, a_text.length(), text, 0); + return text; + } + else + return toCharArray(a_text); + } + + /** + * Pre - process pre-formatted text. + * Heading/closing spaces and tabs preserved. + * ONE bounding \r, \n or \r\n is removed. + * \r or \r\n mutate into \n. Tabs are + * preserved. + * The content of the passed buffer is destroyed. 
+ * @param a_text + * @return + */ + public char[] preprocessPreformatted(StringBuffer a_text) + { + if (a_text.length() == 0) + return null; + + char[] text = toCharArray(a_text); + + int a = 0; + int n = text.length - 1; + int b = n; + + if (text [ 0 ] == '\n') + a++; + else + { + if (text [ 0 ] == '\r') + { + a++; + if (text.length > 1 && text [ 1 ] == '\n') + a++; + } + } + + if (text [ n ] == '\r') + b--; + else + { + if (text [ n ] == '\n') + { + b--; + if (n > 0 && text [ n - 1 ] == '\r') + b--; + } + } + + a_text.setLength(0); + + if (a > b) + return null; + + char c; + + for (int i = a; i <= b; i++) + { + c = text [ i ]; + if (c == '\r') + { + if (i == b || text [ i + 1 ] != '\n') + a_text.append('\n'); + } + else + a_text.append(c); + } + + if (a_text.length() == text.length) + { + a_text.getChars(0, a_text.length(), text, 0); + return text; + } + else + return toCharArray(a_text); + } + + /** + * Return array of chars, present in the given buffer. + * @param a_text The buffer + * @return + */ + private static char[] toCharArray(StringBuffer a_text) + { + char[] text = new char[ a_text.length() ]; + a_text.getChars(0, text.length, text, 0); + return text; + } +} |