package com.develop.ss;
import junit.framework.Test;
import junit.framework.TestSuite;
import junit.framework.TestResult;
import junit.textui.TestRunner;
import com.meterware.httpunit.WebConversation;
import com.meterware.httpunit.WebLink;
import java.util.HashSet;
import java.util.Set;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.ArrayList;
import java.net.MalformedURLException;
import java.net.URL;
import java.io.FileOutputStream;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.io.IOException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
/**
 * Coordinates a link crawl that feeds a Lucene index.
 *
 * <p>The object keeps track of which links have already been seen, decides
 * whether a newly discovered link should be followed (based on "follow" and
 * "avoid" prefix URLs), and owns the {@link IndexWriter} that documents are
 * added to via {@link #addToIndex(Document)}.
 *
 * <p>This class never closes the {@link IndexWriter} it creates; callers can
 * reach it through {@link #getWriter()}.
 *
 * <p>NOTE(review): the follow/avoid prefix sets store {@link URL} objects in
 * hash sets. {@code URL.equals}/{@code hashCode} may perform host name
 * resolution; storing strings would avoid that, but the fields are
 * package-visible so their contents are left unchanged here.
 */
public class IndexLinks {
    /** HttpUnit conversation handed to every {@link IndexLink} created here. */
    WebConversation conversation = new WebConversation();
    /** External-form strings of every link accepted by the follow prefixes. */
    Set linksAlreadyFollowed = new HashSet();
    /** Human-readable "Ignoring ..." messages for links outside the follow prefixes. */
    Set linksNotFollowed = new HashSet();
    /** {@link URL} prefixes a link must match to be followed. */
    Set linkPrefixesToFollow = new HashSet();
    /** {@link URL} prefixes that veto following an otherwise accepted link. */
    HashSet linkPrefixesToAvoid = new HashSet();
    /** Every {@link IndexLink} passed to {@link #addLink(IndexLink)}. */
    HashSet linksToIndex = new HashSet();

    // NOTE(review): stored by the constructor but never read in this class;
    // writeStringSet() looks like the intended consumer -- confirm.
    private String linksNotFollowedOutputFileName;
    private int maxLinks = Integer.MAX_VALUE;
    private final IndexWriter writer;
    private String initialLink;
    private static final String[] EMPTY_STRINGARRAY = new String[0];

    // constructors

    /**
     * Creates the crawler state and opens an {@link IndexWriter} on
     * {@code indexPath} using a {@link StandardAnalyzer}.
     *
     * @param indexPath location handed to the {@link IndexWriter} (third
     *        constructor argument is {@code true} -- presumably "create a new
     *        index"; confirm against the Lucene version in use)
     * @param maxLinks maximum number of distinct followed links before
     *        {@link #considerNewLink(String, WebLink)} aborts
     * @param skippedLinksOutputFileName file name remembered for the list of
     *        links that were not followed
     * @throws IOException if the index writer cannot be opened
     */
    public IndexLinks(String indexPath, int maxLinks, String skippedLinksOutputFileName) throws IOException {
        this.maxLinks = maxLinks;
        this.linksNotFollowedOutputFileName = skippedLinksOutputFileName;
        writer = new IndexWriter(indexPath, new StandardAnalyzer(), true);
    }

    // properties

    /** @return the link last passed to {@link #setInitialLink(String)}, or {@code null} */
    public String getInitialLink() {
        return initialLink;
    }

    /**
     * Registers the starting link: it becomes a follow prefix, is remembered
     * as the initial link, and is immediately wrapped in an {@link IndexLink}
     * and passed to {@link #addLink(IndexLink)}.
     *
     * @param initialLink non-empty URL string
     * @throws MalformedURLException if {@code initialLink} is not a valid URL
     * @throws Error if {@code initialLink} is {@code null} or empty (kept as
     *         {@code Error} so existing callers observe the same type)
     */
    public void setInitialLink(String initialLink) throws MalformedURLException {
        if ((initialLink == null) || (initialLink.length() == 0)) {
            throw new Error("Must specify a non-null initialLink");
        }
        linkPrefixesToFollow.add(new URL(initialLink));
        this.initialLink = initialLink;
        IndexLink.log.info("Initial link is " + initialLink);
        addLink(new IndexLink(initialLink, conversation, this));
    }

    /** @return the index writer opened by the constructor */
    public IndexWriter getWriter() {
        return writer;
    }

    /**
     * Adds each given string, parsed as a {@link URL}, to the follow prefixes.
     * Prefixes preceding a malformed entry remain added.
     *
     * @param prefixesToFollow URL strings; must not be {@code null}
     * @throws MalformedURLException if an entry is not a valid URL
     */
    public void setFollowPrefixes(String[] prefixesToFollow) throws MalformedURLException {
        for (int i = 0; i < prefixesToFollow.length; i++) {
            String prefix = prefixesToFollow[i];
            IndexLink.log.info("Following links prefixed with " + prefix);
            linkPrefixesToFollow.add(new URL(prefix));
        }
    }

    /**
     * Adds each given string, parsed as a {@link URL}, to the avoid prefixes.
     * Prefixes preceding a malformed entry remain added.
     *
     * @param prefixesToAvoid URL strings; must not be {@code null}
     * @throws MalformedURLException if an entry is not a valid URL
     */
    public void setAvoidPrefixes(String[] prefixesToAvoid) throws MalformedURLException {
        for (int i = 0; i < prefixesToAvoid.length; i++) {
            String prefix = prefixesToAvoid[i];
            IndexLink.log.info("Avoid links prefixed with " + prefix);
            linkPrefixesToAvoid.add(new URL(prefix));
        }
    }

    // methods

    /**
     * Reads the whitespace-separated system property
     * {@code com.develop.ss.FollowLinks} and registers its entries as follow
     * prefixes. Does nothing when the property is unset or blank.
     *
     * @throws MalformedURLException if an entry is not a valid URL
     */
    public void initFollowPrefixesFromSystemProperties() throws MalformedURLException {
        String[] prefixes = splitPrefixes(System.getProperty("com.develop.ss.FollowLinks"));
        if (prefixes.length != 0) {
            setFollowPrefixes(prefixes);
        }
    }

    /**
     * Reads the whitespace-separated system property
     * {@code com.develop.ss.AvoidLinks} and registers its entries as avoid
     * prefixes. Does nothing when the property is unset or blank.
     *
     * @throws MalformedURLException if an entry is not a valid URL
     */
    public void initAvoidPrefixesFromSystemProperties() throws MalformedURLException {
        String[] prefixes = splitPrefixes(System.getProperty("com.develop.ss.AvoidLinks"));
        if (prefixes.length != 0) {
            setAvoidPrefixes(prefixes);
        }
    }

    /**
     * Splits a property value into prefix tokens. Leading/trailing whitespace
     * is dropped and runs of whitespace count as one separator, so stray
     * spaces never yield an empty token (which would fail URL parsing).
     */
    private static String[] splitPrefixes(String propertyValue) {
        if (propertyValue == null) {
            return EMPTY_STRINGARRAY;
        }
        String trimmed = propertyValue.trim();
        if (trimmed.length() == 0) {
            return EMPTY_STRINGARRAY;
        }
        return trimmed.split("\\s+");
    }

    /**
     * Writes every element of {@code skippedPaths} to the file {@code s}, one
     * per line.
     *
     * <p>NOTE(review): nothing in this class calls this method.
     *
     * @param s output file name
     * @param skippedPaths elements to print (via their string form)
     * @throws IOException if the file cannot be opened or a write fails
     */
    private void writeStringSet(String s, Set skippedPaths) throws IOException {
        PrintStream ps = new PrintStream(new FileOutputStream(s));
        try {
            for (Iterator it = skippedPaths.iterator(); it.hasNext();) {
                ps.println(it.next());
            }
            // PrintStream never throws on write; checkError() flushes and
            // reports whether any write failed so the failure is not lost.
            if (ps.checkError()) {
                throw new IOException("Failed writing to " + s);
            }
        } finally {
            // Closing the PrintStream also closes the wrapped file stream.
            ps.close();
        }
    }

    /** @return {@code true} if {@code newLink} matches any follow prefix */
    boolean shouldFollowLink(URL newLink) {
        return matchesAnyPrefix(linkPrefixesToFollow, newLink);
    }

    /** @return {@code true} if {@code newLink} matches any avoid prefix */
    boolean shouldNotFollowLink(URL newLink) {
        return matchesAnyPrefix(linkPrefixesToAvoid, newLink);
    }

    /** Returns whether {@code link} matches at least one {@link URL} in {@code prefixes}. */
    private boolean matchesAnyPrefix(Set prefixes, URL link) {
        for (Iterator it = prefixes.iterator(); it.hasNext();) {
            URL prefix = (URL) it.next();
            if (matchesDownToPathPrefix(prefix, link)) {
                return true;
            }
        }
        return false;
    }

    /**
     * A link matches a base when host, port and protocol are equal and the
     * link's path starts with the base's path.
     *
     * <p>NOTE(review): host comparison is case-sensitive and an explicit
     * default port does not match an omitted one; left as-is to keep the
     * crawl scope unchanged.
     */
    private boolean matchesDownToPathPrefix(URL matchBase, URL newLink) {
        return matchBase.getHost().equals(newLink.getHost())
                && matchBase.getPort() == newLink.getPort()
                && matchBase.getProtocol().equals(newLink.getProtocol())
                && newLink.getPath().startsWith(matchBase.getPath());
    }

    /**
     * Decides what to do with a link discovered on a page.
     *
     * <p>Links outside the follow prefixes are recorded as ignored. Links
     * inside them are recorded as seen (once); a first-time link is then
     * either skipped because it matches an avoid prefix, or wrapped in an
     * {@link IndexLink} and passed to {@link #addLink(IndexLink)}.
     *
     * @param linkFrom description of the referring page; used only in log
     *        and status messages
     * @param newLink the discovered link
     * @throws MalformedURLException if the link's URL cannot be built
     * @throws Error when recording the link would exceed {@code maxLinks}
     */
    void considerNewLink(String linkFrom, WebLink newLink) throws MalformedURLException {
        URL url = newLink.getRequest().getURL();
        if (!shouldFollowLink(url)) {
            ignoreLink(url, linkFrom);
            return;
        }

        String externalForm = url.toExternalForm();
        if (!linksAlreadyFollowed.add(externalForm)) {
            return; // seen before; nothing more to do
        }

        if (linksAlreadyFollowed.size() > maxLinks) {
            linksAlreadyFollowed.remove(externalForm);
            // Deliberately an Error: addLink() catches Exception, so an
            // Exception subtype here would be swallowed instead of aborting.
            throw new Error("Max links exceeded " + maxLinks);
        }

        if (shouldNotFollowLink(url)) {
            IndexLink.log.info("Not following " + externalForm + " from " + linkFrom);
        } else {
            IndexLink.log.info("Following " + externalForm + " from " + linkFrom);
            addLink(new IndexLink(externalForm, conversation, this));
        }
    }

    /** Records and logs (at fine level) that {@code url} was not followed. */
    private void ignoreLink(URL url, String linkFrom) {
        String status = "Ignoring " + url.toExternalForm() + " from " + linkFrom;
        linksNotFollowed.add(status);
        IndexLink.log.fine(status);
    }

    /**
     * Calls {@code checkLink()} on every link currently held in
     * {@code linksToIndex}. A failure on one link is logged and does not stop
     * the remaining links from being checked.
     */
    public void execute() {
        // Snapshot once: IndexLink instances hold a reference to this object
        // and addLink() is public, so the set may grow while links are being
        // checked. Anything added that way is already checked by addLink().
        Object[] links = linksToIndex.toArray();
        for (int i = 0; i < links.length; i++) {
            try {
                ((IndexLink) links[i]).checkLink();
            } catch (Exception ex) {
                IndexLink.log.warning("Failed to check link " + links[i] + ": " + ex);
            }
        }
    }

    /**
     * Checks the link immediately and then remembers it in
     * {@code linksToIndex}. The link is remembered even when the check fails
     * with an exception; the failure is logged.
     *
     * @param link the link to check and record
     */
    public void addLink(IndexLink link) {
        try {
            link.checkLink();
        } catch (Exception ex) {
            IndexLink.log.warning("Failed to check link " + link + ": " + ex);
        }
        linksToIndex.add(link);
    }

    /**
     * Adds a document to the index. Failures are logged rather than
     * propagated, so one bad document does not abort the caller.
     *
     * @param d the document to add
     */
    public void addToIndex(Document d) {
        try {
            writer.addDocument(d);
        } catch (Exception ex) {
            IndexLink.log.warning("Failed to add document to index: " + ex);
        }
    }
}
|