1 files changed, 242 insertions, 0 deletions
diff --git a/libjava/classpath/gnu/xml/pipeline/LinkFilter.java b/libjava/classpath/gnu/xml/pipeline/LinkFilter.java
new file mode 100644
index 000000000..e11a5eca6
--- /dev/null
+++ b/libjava/classpath/gnu/xml/pipeline/LinkFilter.java
@@ -0,0 +1,242 @@
+/* LinkFilter.java --
+   Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
+
+This file is part of GNU Classpath.
+
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING.  If not, write to the
+Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301 USA.
+
+Linking this library statically or dynamically with other modules is
+making a combined work based on this library.  Thus, the terms and
+conditions of the GNU General Public License cover the whole
+combination.
+
+As a special exception, the copyright holders of this library give you
+permission to link this library with independent modules to produce an
+executable, regardless of the license terms of these independent
+modules, and to copy and distribute the resulting executable under
+terms of your choice, provided that you also meet, for each linked
+independent module, the terms and conditions of the license of that
+module.  An independent module is a module which is not derived from
+or based on this library.  If you modify this library, you may extend
+this exception to your version of the library, but you are not
+obligated to do so.  If you do not wish to do so, delete this
+exception statement from your version. */
+
+package gnu.xml.pipeline;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Enumeration;
+import java.util.Vector;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Pipeline filter to remember XHTML links found in a document,
+ * so they can later be crawled.  Fragments are not counted, and duplicates
+ * are ignored.  Callers are responsible for filtering out URLs they aren't
+ * interested in.  Events are passed through unmodified.
+ *
+ * <p> Input MUST include a setDocumentLocator() call, as it's used to
+ * resolve relative links in the absence of a "base" element.  Input MUST
+ * also include namespace identifiers, since it is the XHTML namespace
+ * identifier which is used to identify the relevant elements.
+ *
+ * <p><em>FIXME:</em> handle xml:base attribute ... in association with
+ * a stack of base URIs.  Similarly, recognize/support XLink data.
+ *
+ * @author David Brownell
+ */
+public class LinkFilter extends EventFilter
+{
+    // for storing URIs
+    private Vector              vector = new Vector ();
+
+        // struct for "full" link record (tbd)
+        // these for troubleshooting original source:
+        //      original uri
+        //      uri as resolved (base, relative, etc)
+        //      URI of originating doc
+        //      line #
+        //      original element + attrs (img src, desc, etc)
+
+        // XLink model of the link ... for inter-site pairups ?
+
+    private String              baseURI;
+
+    private boolean             siteRestricted = false;
+
+    //
+    // XXX leverage blacklist info (like robots.txt)
+    //
+    // XXX constructor w/param ... pipeline for sending link data
+    // probably XHTML --> XLink, providing info as sketched above
+    //
+
+
+    /**
+     * Constructs a new event filter, which collects links in private data
+     * structure for later enumeration.
+     */
+        // constructor used by PipelineFactory
+    public LinkFilter ()
+    {
+        super.setContentHandler (this);
+    }
+
+
+    /**
+     * Constructs a new event filter, which collects links in private data
+     * structure for later enumeration and passes all events, unmodified,
+     * to the next consumer.
+     */
+        // constructor used by PipelineFactory
+    public LinkFilter (EventConsumer next)
+    {
+        super (next);
+        super.setContentHandler (this);
+    }
+
+
+    /**
+     * Returns an enumeration of the links found since the filter
+     * was constructed, or since removeAllLinks() was called.
+     *
+     * @return enumeration of strings.
+     */
+    public Enumeration getLinks ()
+    {
+        return vector.elements ();
+    }
+
+    /**
+     * Removes records about all links reported to the event
+     * stream, as if the filter were newly created.
+     */
+    public void removeAllLinks ()
+    {
+        vector = new Vector ();
+    }
+
+
+    /**
+     * Collects URIs for (X)HTML content from elements which hold them.
+     */
+    public void startElement (
+        String          uri,
+        String          localName,
+        String          qName,
+        Attributes      atts
+    ) throws SAXException
+    {
+        String  link;
+
+        // Recognize XHTML links.
+        if ("http://www.w3.org/1999/xhtml".equals (uri)) {
+
+            if ("a".equals (localName) || "base".equals (localName)
+                    || "area".equals (localName))
+                link = atts.getValue ("href");
+            else if ("iframe".equals (localName) || "frame".equals (localName))
+                link = atts.getValue ("src");
+            else if ("blockquote".equals (localName) || "q".equals (localName)
+                    || "ins".equals (localName) || "del".equals (localName))
+                link = atts.getValue ("cite");
+            else
+                link = null;
+            link = maybeAddLink (link);
+
+            // "base" modifies designated baseURI
+            if ("base".equals (localName) && link != null)
+                baseURI = link;
+
+            if ("iframe".equals (localName) || "img".equals (localName))
+                maybeAddLink (atts.getValue ("longdesc"));
+        }
+
+        super.startElement (uri, localName, qName, atts);
+    }
+
+    private String maybeAddLink (String link)
+    {
+        int             index;
+
+        // ignore empty links and fragments inside docs
+        if (link == null)
+            return null;
+        if ((index = link.indexOf ("#")) >= 0)
+            link = link.substring (0, index);
+        if (link.equals (""))
+            return null;
+
+        try {
+            // get the real URI
+            URL         base = new URL ((baseURI != null)
+                                    ? baseURI
+                                    : getDocumentLocator ().getSystemId ());
+            URL         url = new URL (base, link);
+
+            link = url.toString ();
+
+            // ignore duplicates
+            if (vector.contains (link))
+                return link;
+
+            // other than what "base" does, stick to original site:
+            if (siteRestricted) {
+                // don't switch protocols
+                if (!base.getProtocol ().equals (url.getProtocol ()))
+                    return link;
+                // don't switch servers
+                if (base.getHost () != null
+                        && !base.getHost ().equals (url.getHost ()))
+                    return link;
+            }
+
+            vector.addElement (link);
+
+            return link;
+
+        } catch (IOException e) {
+            // bad URLs we don't want
+        }
+        return null;
+    }
+
+    /**
+     * Reports an error if no Locator has been made available.
+     */
+    public void startDocument ()
+    throws SAXException
+    {
+        if (getDocumentLocator () == null)
+            throw new SAXException ("no Locator!");
+    }
+
+    /**
+     * Forgets about any base URI information that may be recorded.
+     * Applications will often want to call removeAllLinks(), likely
+     * after examining the links which were reported.
+     */
+    public void endDocument ()
+    throws SAXException
+    {
+        baseURI = null;
+        super.endDocument ();
+    }
+}