File | Doc | Category | Size | Date | Package
IndexLinks.java | API Doc | Example | 6463 | Mon Feb 23 21:27:48 GMT 2004 | com.develop.ss

IndexLinks.java

package com.develop.ss;

import junit.framework.Test;
import junit.framework.TestSuite;
import junit.framework.TestResult;
import junit.textui.TestRunner;
import com.meterware.httpunit.WebConversation;
import com.meterware.httpunit.WebLink;

import java.util.HashSet;
import java.util.Set;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.ArrayList;
import java.net.MalformedURLException;
import java.net.URL;
import java.io.FileOutputStream;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.io.IOException;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;

/**
 * Coordinates a link crawl whose results are written to a Lucene index.
 *
 * <p>The class owns the shared {@link WebConversation}, the {@link IndexWriter},
 * and the bookkeeping sets that decide which links are followed, avoided or
 * ignored. The per-page work is delegated to {@code IndexLink.checkLink()},
 * which is defined outside this file.
 *
 * <p>Nothing here is synchronized; the class is written for single-threaded use.
 */
public class IndexLinks {
  /** Conversation handed to every {@code IndexLink} created by this crawler. */
  WebConversation conversation = new WebConversation();
  /** External-form URL strings that have already been considered for following. */
  Set linksAlreadyFollowed = new HashSet();
  /** Human-readable "Ignoring ... from ..." messages for links outside the follow prefixes. */
  Set linksNotFollowed = new HashSet();
  /**
   * {@link URL} prefixes a link must match to be followed.
   * NOTE(review): {@code URL.equals}/{@code hashCode} may resolve host names, so
   * storing URLs in a hash set can block on DNS.
   */
  Set linkPrefixesToFollow = new HashSet();
  /** {@link URL} prefixes that veto following a link that otherwise matches. */
  HashSet linkPrefixesToAvoid = new HashSet();
  /** Every {@code IndexLink} that has been passed to {@link #addLink(IndexLink)}. */
  HashSet linksToIndex = new HashSet();
  /** Stored by the constructor; not read anywhere in this class as shown. */
  private String linksNotFollowedOutputFileName;
  private int maxLinks = Integer.MAX_VALUE;
  private IndexWriter writer;
  private String initialLink;
  private static final String[] EMPTY_STRINGARRAY = new String[0];

  // constructors

  /**
   * Creates a crawler that writes to a new index.
   *
   * @param indexPath path handed to the {@link IndexWriter}; the writer is opened
   *        with its create flag set to {@code true}
   * @param maxLinks upper bound on the number of distinct links considered
   * @param skippedLinksOutputFileName file name remembered for the skipped-links report
   * @throws IOException if the index writer cannot be created
   */
  public IndexLinks(String indexPath, int maxLinks, String skippedLinksOutputFileName) throws IOException {
    this.maxLinks = maxLinks;
    this.linksNotFollowedOutputFileName = skippedLinksOutputFileName;
    writer = new IndexWriter(indexPath, new StandardAnalyzer(), true);
  }

  // properties

  /** @return the link most recently passed to {@link #setInitialLink(String)}, or {@code null} */
  public String getInitialLink() {
    return initialLink;
  }

  /**
   * Registers the starting link as a follow prefix and immediately hands it to
   * {@link #addLink(IndexLink)}.
   *
   * <p>NOTE(review): the initial link is not recorded in
   * {@code linksAlreadyFollowed}, so a page linking back to it will cause it to
   * be considered again - confirm whether that is intended.
   *
   * @param initialLink absolute URL to start from
   * @throws Error if {@code initialLink} is {@code null} or empty (kept as
   *         {@code Error} for compatibility with existing callers)
   * @throws MalformedURLException if {@code initialLink} is not a valid URL
   */
  public void setInitialLink(String initialLink) throws MalformedURLException {
    if ((initialLink == null) || (initialLink.length() == 0)) {
      throw new Error("Must specify a non-null initialLink");
    }
    linkPrefixesToFollow.add(new URL(initialLink));
    this.initialLink = initialLink;
    IndexLink.log.info("Initial link is " + initialLink);
    addLink(new IndexLink(initialLink, conversation, this));
  }

  /** @return the index writer created by the constructor */
  public IndexWriter getWriter() {
    return writer;
  }

  /**
   * Adds URL prefixes whose links should be followed.
   *
   * @param prefixesToFollow absolute URL strings
   * @throws MalformedURLException if any entry is not a valid URL; entries before
   *         the bad one have already been added
   */
  public void setFollowPrefixes(String[] prefixesToFollow) throws MalformedURLException {
    for (int i = 0; i < prefixesToFollow.length; i++) {
      String s = prefixesToFollow[i];
      IndexLink.log.info("Following links prefixed with " + s);
      linkPrefixesToFollow.add(new URL(s));
    }
  }

  /**
   * Adds URL prefixes whose links must not be followed.
   *
   * @param prefixesToAvoid absolute URL strings
   * @throws MalformedURLException if any entry is not a valid URL; entries before
   *         the bad one have already been added
   */
  public void setAvoidPrefixes(String[] prefixesToAvoid) throws MalformedURLException {
    for (int i = 0; i < prefixesToAvoid.length; i++) {
      String s = prefixesToAvoid[i];
      IndexLink.log.info("Avoid links prefixed with " + s);
      linkPrefixesToAvoid.add(new URL(s));
    }
  }

  // methods

  /**
   * Reads whitespace-separated follow prefixes from the system property
   * {@code com.develop.ss.FollowLinks}. Does nothing when the property is unset or blank.
   *
   * @throws MalformedURLException if a listed prefix is not a valid URL
   */
  public void initFollowPrefixesFromSystemProperties() throws MalformedURLException {
    String[] prefixes = readPrefixesFromSystemProperty("com.develop.ss.FollowLinks");
    if (prefixes.length != 0) {
      setFollowPrefixes(prefixes);
    }
  }

  /**
   * Reads whitespace-separated avoid prefixes from the system property
   * {@code com.develop.ss.AvoidLinks}. Does nothing when the property is unset or blank.
   *
   * @throws MalformedURLException if a listed prefix is not a valid URL
   */
  public void initAvoidPrefixesFromSystemProperties() throws MalformedURLException {
    String[] prefixes = readPrefixesFromSystemProperty("com.develop.ss.AvoidLinks");
    if (prefixes.length != 0) {
      setAvoidPrefixes(prefixes);
    }
  }

  /**
   * Splits a system property into non-empty tokens.
   *
   * <p>The value is trimmed and split on runs of whitespace so that leading,
   * trailing or doubled separators never produce an empty token (an empty token
   * would later fail in {@code new URL("")}).
   *
   * @return the tokens, or an empty array when the property is unset or blank
   */
  private static String[] readPrefixesFromSystemProperty(String propertyName) {
    String raw = System.getProperty(propertyName);
    if (raw == null) {
      return EMPTY_STRINGARRAY;
    }
    String trimmed = raw.trim();
    if (trimmed.length() == 0) {
      return EMPTY_STRINGARRAY;
    }
    return trimmed.split("\\s+");
  }

  /**
   * Writes each element of a set on its own line to the named file.
   * Not called from within this class as shown.
   *
   * @param s path of the file to (over)write
   * @param skippedPaths elements to print, in the set's iteration order
   * @throws IOException if the file cannot be opened or a write fails
   */
  private void writeStringSet(String s, Set skippedPaths) throws IOException {
    PrintStream ps = new PrintStream(new FileOutputStream(s));
    try {
      Iterator it = skippedPaths.iterator();
      while (it.hasNext()) {
        ps.println(it.next());
      }
      // PrintStream never throws on write; checkError() flushes and reports failures.
      if (ps.checkError()) {
        throw new IOException("Failed writing " + s);
      }
    } finally {
      // Closing the PrintStream also closes the underlying FileOutputStream.
      ps.close();
    }
  }

  /** @return {@code true} if {@code newLink} matches any registered follow prefix */
  boolean shouldFollowLink(URL newLink) {
    for (Iterator iterator = linkPrefixesToFollow.iterator(); iterator.hasNext();) {
      URL u = (URL) iterator.next();
      if (matchesDownToPathPrefix(u, newLink)) {
        return true;
      }
    }
    return false;
  }

  /** @return {@code true} if {@code newLink} matches any registered avoid prefix */
  boolean shouldNotFollowLink(URL newLink) {
    for (Iterator iterator = linkPrefixesToAvoid.iterator(); iterator.hasNext();) {
      URL u = (URL) iterator.next();
      if (matchesDownToPathPrefix(u, newLink)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Tests whether two URLs share host, port and protocol and the candidate's
   * path starts with the base's path.
   *
   * <p>Ports are compared as written: {@code URL.getPort()} returns -1 when no
   * port is given, so an explicit default port does not match an omitted one.
   * The path test is a plain string prefix test.
   */
  private boolean matchesDownToPathPrefix(URL matchBase, URL newLink) {
    return matchBase.getHost().equals(newLink.getHost()) &&
       matchBase.getPort() == newLink.getPort() &&
       matchBase.getProtocol().equals(newLink.getProtocol()) &&
       newLink.getPath().startsWith(matchBase.getPath());
  }

  /**
   * Decides what to do with a link discovered on a page.
   *
   * <p>Links outside every follow prefix are recorded as ignored. Links inside a
   * follow prefix are de-duplicated by external form; a new one is either logged
   * as not followed (it matches an avoid prefix) or wrapped in an
   * {@code IndexLink} and passed to {@link #addLink(IndexLink)}.
   *
   * @param linkFrom description of the page the link was found on, used in log messages
   * @param newLink the discovered link
   * @throws MalformedURLException if the link's request URL cannot be built
   * @throws Error when accepting the link would exceed {@code maxLinks}; being an
   *         {@code Error}, it passes through the {@code catch (Exception)} blocks in
   *         this class
   */
  void considerNewLink(String linkFrom, WebLink newLink) throws MalformedURLException {
    URL url = newLink.getRequest().getURL();
    if (shouldFollowLink(url)) {
      if (linksAlreadyFollowed.add(url.toExternalForm())) {
        if (linksAlreadyFollowed.size() > maxLinks) {
          // Undo the add so the set never reports more than maxLinks entries.
          linksAlreadyFollowed.remove(url.toExternalForm());
          throw new Error("Max links exceeded " + maxLinks);
        }
        if (shouldNotFollowLink(url)) {
          IndexLink.log.info("Not following " + url.toExternalForm() + " from " + linkFrom);
        } else {
          IndexLink.log.info("Following " + url.toExternalForm() + " from " + linkFrom);
          addLink(new IndexLink(url.toString(), conversation, this));
        }
      }
    } else {
      ignoreLink(url, linkFrom);
    }
  }

  /** Records and logs (at FINE) that a link was outside every follow prefix. */
  private void ignoreLink(URL url, String linkFrom) {
    String status = "Ignoring " + url.toExternalForm() + " from " + linkFrom;
    linksNotFollowed.add(status);
    IndexLink.log.fine(status);
  }

  /**
   * Calls {@code checkLink()} on every link collected so far.
   *
   * <p>The set is copied first so that each link present at the start is visited
   * exactly once, even if {@code checkLink()} adds further links to
   * {@code linksToIndex} while running (presumably via
   * {@link #considerNewLink(String, WebLink)} - confirm against {@code IndexLink}).
   * A failure on one link is logged and does not stop the others.
   *
   * <p>NOTE(review): {@link #addLink(IndexLink)} already calls {@code checkLink()}
   * once per link, so this is a second pass - confirm that is intended.
   */
  public void execute()
  {
      Object[] snapshot = linksToIndex.toArray();
      for (int i = 0; i < snapshot.length; i++)
      {
        IndexLink link = (IndexLink) snapshot[i];
        try
        {
            link.checkLink();
        }
        catch (Exception ex)
        {
            IndexLink.log.warning("Failed to check link " + link + ": " + ex);
        }
      }
  }

  /**
   * Checks a link and then records it in {@code linksToIndex}.
   * The link is recorded even when the check fails; the failure is logged.
   *
   * @param link the link to check and remember
   */
  public void addLink(IndexLink link)
  {
      try
      {
        link.checkLink();
      }
      catch (Exception ex)
      {
        IndexLink.log.warning("Failed to check link " + link + ": " + ex);
      }
      linksToIndex.add(link);
  }

  /**
   * Adds a document to the index. Failures are logged rather than propagated so
   * that one bad document does not abort the crawl.
   *
   * @param d the document to add
   */
  public void addToIndex(Document d)
  {
      try
      {
        writer.addDocument(d);
      }
      catch (Exception ex)
      {
        IndexLink.log.warning("Failed to add document to index: " + ex);
      }
  }

}