diff options
author | upstream source tree <ports@midipix.org> | 2015-03-15 20:14:05 -0400 |
---|---|---|
committer | upstream source tree <ports@midipix.org> | 2015-03-15 20:14:05 -0400 |
commit | 554fd8c5195424bdbcabf5de30fdc183aba391bd (patch) | |
tree | 976dc5ab7fddf506dadce60ae936f43f58787092 /libjava/classpath/gnu/xml/dom/html2/DomHTMLParser.java | |
download | cbb-gcc-4.6.4-15d2061ac0796199866debe9ac87130894b0cdd3.tar.bz2 cbb-gcc-4.6.4-15d2061ac0796199866debe9ac87130894b0cdd3.tar.xz |
obtained gcc-4.6.4.tar.bz2 from upstream website;upstream
verified gcc-4.6.4.tar.bz2.sig;
imported gcc-4.6.4 source tree from verified upstream tarball.
downloading a git-generated archive based on the 'upstream' tag
should provide you with a source tree that is binary identical
to the one extracted from the above tarball.
if you have obtained the source via the command 'git clone',
however, do note that line-endings of files in your working
directory might differ from line-endings of the respective
files in the upstream repository.
Diffstat (limited to 'libjava/classpath/gnu/xml/dom/html2/DomHTMLParser.java')
-rw-r--r-- | libjava/classpath/gnu/xml/dom/html2/DomHTMLParser.java | 264 |
1 files changed, 264 insertions, 0 deletions
diff --git a/libjava/classpath/gnu/xml/dom/html2/DomHTMLParser.java b/libjava/classpath/gnu/xml/dom/html2/DomHTMLParser.java new file mode 100644 index 000000000..88656d2d3 --- /dev/null +++ b/libjava/classpath/gnu/xml/dom/html2/DomHTMLParser.java @@ -0,0 +1,264 @@ +/* DomHTMLParser.java -- + Copyright (C) 2005 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. 
If you do not wish to do so, delete this
exception statement from your version. */


package gnu.xml.dom.html2;

import java.io.IOException;
import java.io.Reader;

import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;

import javax.swing.text.AttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.parser.DTD;
import javax.swing.text.html.parser.TagElement;

import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.html2.HTMLDocument;

/**
 * This parser reads HTML from the given stream and stores it into an
 * {@link HTMLDocument}. Each HTML tag becomes a {@link Node}.
 * The tag attributes become the node attributes. The text inside
 * an HTML tag is inserted as one or several text nodes. The nested
 * HTML tags are inserted as child nodes.
 *
 * In a strict tree structure, closing a tag means closing all
 * tags nested inside it. To work around this, this parser closes the
 * nested tags and immediately reopens them after the closed tag.
 * In this way, <code>&lt;b&gt;&lt;i&gt;c&lt;/b&gt;d</code>
 * is parsed as
 * <code>&lt;b&gt;&lt;i&gt;c&lt;/i&gt;&lt;/b&gt;&lt;i&gt;d</code>.
 *
 * @author Audrius Meskauskas (AudriusA@Bioinformatics.org)
 */
public class DomHTMLParser
  extends gnu.javax.swing.text.html.parser.support.Parser
{
  /**
   * The target where the HTML document will be inserted.
   */
  protected DomHTMLDocument document;

  /**
   * The subsequently created new nodes will be inserted as the
   * children of this cursor.
   */
  protected Node cursor;

  /**
   * Create a parser using the given DTD.
   *
   * @param dtd the DTD (for example,
   * {@link gnu.javax.swing.text.html.parser.HTML_401F}).
   */
  public DomHTMLParser(DTD dtd)
  {
    super(dtd);
  }

  /**
   * Parse an SGML insertion ( &lt;! ... &gt; ).
   * Currently it is just treated as a comment: the buffer content is
   * appended under the cursor as a comment node.
   *
   * @param strBuff the text of the markup declaration.
   * @return always <code>false</code>.
   * @throws java.io.IOException declared for the overridden contract;
   * not thrown by this implementation itself.
   */
  public boolean parseMarkupDeclarations(StringBuffer strBuff)
                                  throws java.io.IOException
  {
    Node c = document.createComment(strBuff.toString());
    cursor.appendChild(c);
    return false;
  }

  /**
   * Read the document, present in the given stream, and
   * return the corresponding {@link HTMLDocument}.
   *
   * @param input a stream to read from.
   * @return a document, reflecting the structure of the provided HTML
   * text.
   *
   * @throws IOException if the reader throws one, or if any other
   * exception occurs while parsing (that exception is then available
   * as the cause).
   */
  public HTMLDocument parseDocument(Reader input)
                              throws IOException
  {
    try
      {
        DomHTMLDocument result = new DomHTMLDocument();
        result.setCheckWellformedness(false);
        result.setCheckingCharacters(false);

        document = result;
        cursor = result;

        parse(input);

        return result;
      }
    catch (Exception ex)
      {
        // A genuine I/O failure is passed through untouched.
        if (ex instanceof IOException)
          throw (IOException) ex;

        // Anything else is reported as an IOException, as before, but
        // the original exception is kept as the cause rather than
        // being printed and discarded.
        IOException wrapped =
          new IOException("Exception: " + ex.getMessage());
        wrapped.initCause(ex);
        throw wrapped;
      }
    finally
      {
        // Do not keep the document reachable through the parser,
        // whether parsing succeeded or failed.
        document = null;
      }
  }

  /**
   * Create a new element node carrying the attributes of the tag that
   * is currently being handled, including the defaults that are stored
   * in the resolving parents of the attribute set.
   *
   * @param name the name of node, case insensitive.
   * @return the created node.
   */
  protected Node createNode(String name)
  {
    // Locale-independent lowering: the default locale must not alter
    // tag names (for example the Turkish dotless i).
    Node new_node = document.createElement(name.toLowerCase(Locale.ENGLISH));
    NamedNodeMap natts = new_node.getAttributes();

    AttributeSet hatts = getAttributes();
    while (hatts != null)
      {
        // The names must be re-read for every set in the chain;
        // otherwise the parent sets are never looked at.
        Enumeration names = hatts.getAttributeNames();
        while (names.hasMoreElements())
          {
            Object key = names.nextElement();

            // The link to the resolving parent is bookkeeping, not an
            // HTML attribute.
            if (key == AttributeSet.ResolveAttribute)
              continue;

            String attributeName = key.toString();

            // An explicitly given value wins over a default that comes
            // from a parent set.
            if (natts.getNamedItem(attributeName) != null)
              continue;

            Node attribute = document.createAttribute(attributeName);
            attribute.setNodeValue(hatts.getAttribute(key).toString());
            natts.setNamedItem(attribute);
          }

        // The default values are stored in a parent node.
        hatts = hatts.getResolveParent();
      }

    return new_node;
  }

  /**
   * Handle comment by inserting the comment node.
   * @param text the comment text.
   */
  protected void handleComment(char[] text)
  {
    Node c = document.createComment(new String(text));
    cursor.appendChild(c);
  }

  /**
   * Handle the tag with no content. The <code>#pcdata</code>
   * pseudo tag is ignored.
   * @param tag the tag to handle.
   */
  protected void handleEmptyTag(TagElement tag)
  {
    String name = tag.getHTMLTag().toString();

    if (name.equalsIgnoreCase("#pcdata"))
      return;

    Node c = createNode(name);
    cursor.appendChild(c);
  }

  /**
   * Close the given tag. Close and reopen all nested tags.
   * An end tag that matches no open element is ignored.
   * @param tag the tag to close.
   */
  protected void handleEndTag(TagElement tag)
  {
    String name = tag.getHTMLTag().toString();
    String nname = cursor.getNodeName();

    // Closing the current tag.
    if (nname != null && nname.equalsIgnoreCase(name))
      {
        cursor = cursor.getParentNode();
        return;
      }

    // Remember the opened nodes between the cursor and the element
    // being closed, outermost first.
    LinkedList open = new LinkedList();
    Node close = cursor;
    while (close != null && ! name.equalsIgnoreCase(close.getNodeName()))
      {
        if (close != document)
          open.addFirst(close);
        close = close.getParentNode();
      }

    // No open element carries this name: nothing to close, and the
    // insertion point stays where it is.
    if (close == null)
      return;

    cursor = close.getParentNode();

    // Reopen the nested elements after the closed one. Shallow copies
    // are inserted: appending the original nodes would move them, with
    // their already parsed content, out of the closed element.
    Iterator iter = open.iterator();
    while (iter.hasNext())
      {
        Node copy = ((Node) iter.next()).cloneNode(false);
        cursor.appendChild(copy);
        cursor = copy;
      }
  }

  /**
   * Handle the start tag by inserting the HTML element and making it
   * the new insertion point.
   * @param tag the tag to handle.
   */
  protected void handleStartTag(TagElement tag)
  {
    HTML.Tag h = tag.getHTMLTag();
    Node c = createNode(h.toString());
    cursor.appendChild(c);
    cursor = c;
  }

  /**
   * Handle text by inserting the text node.
   * @param text the text to insert.
   */
  protected void handleText(char[] text)
  {
    Node c = document.createTextNode(text, 0, text.length);
    cursor.appendChild(c);
  }
}