Syns2Index

public class Syns2Index extends Object

Convert the Prolog file wn_s.pl from the WordNet Prolog download
into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
This has been tested with WordNet 2.0.
The index has fields named "word" ({@link #F_WORD})
and "syn" ({@link #F_SYN}).
The source word (such as 'big') can be looked up in the
"word" field, and if present the matching document will contain one "syn"
field for every synonym. Note that a document may therefore hold multiple
fields with the same name, since a word generally has multiple synonyms.
That is not a problem for Lucene: just use {@link org.apache.lucene.document.Document#getValues}.
While the WordNet file distinguishes groups of synonyms with
related meanings, we do not preserve that distinction here.
Building the index can take about 4 minutes on a "fast" system, and the resulting index takes up almost 3 MB.
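For example, synonyms can be looked up with a sketch like the following (this assumes the Lucene 1.x/2.x-era API used by the code below; the index path is hypothetical):

IndexSearcher searcher = new IndexSearcher("/path/to/syn_index"); // hypothetical path
Hits hits = searcher.search(new TermQuery(new Term(Syns2Index.F_WORD, "big")));
if (hits.length() > 0)
{
    // one stored "syn" field per synonym of "big"
    String[] syns = hits.doc(0).getValues(Syns2Index.F_SYN);
    for (int i = 0; i < syns.length; i++)
        System.out.println(syns[i]);
}
searcher.close();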
Fields Summary

| Modifier and Type | Field |
|---|---|
| private static final PrintStream | o |
| private static final PrintStream | err |
| public static final String | F_SYN |
| public static final String | F_WORD |
| private static final Analyzer | ana |
Methods Summary
private static void index(java.lang.String indexDir, java.util.Map word2Nums, java.util.Map num2Words) throws java.io.IOException
Forms a Lucene index based on the 2 maps.
int row = 0;
int mod = 1;
// overwrite the index if it already exists
IndexWriter writer = new IndexWriter(indexDir, ana, true);
writer.setUseCompoundFile(true); // keep the number of index files (and open file handles) small
// blindly up these parameters for speed
writer.setMergeFactor( writer.getMergeFactor() * 2);
writer.setMaxBufferedDocs( writer.getMaxBufferedDocs() * 2);
Iterator i1 = word2Nums.keySet().iterator();
while (i1.hasNext()) // for each word
{
String g = (String) i1.next();
Document doc = new Document();
int n = index(word2Nums, num2Words, g, doc);
if (n > 0)
{
doc.add( new Field( F_WORD, g, Field.Store.YES, Field.Index.UN_TOKENIZED));
if ((++row % mod) == 0)
{
o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc);
mod *= 2;
}
writer.addDocument(doc);
} // else degenerate
}
o.println( "Optimizing..");
writer.optimize();
writer.close();
private static int index(java.util.Map word2Nums, java.util.Map num2Words, java.lang.String g, org.apache.lucene.document.Document doc)
Given the 2 maps, fills a document for 1 word.
List keys = (List) word2Nums.get(g); // get list of key#'s
Iterator i2 = keys.iterator();
Set already = new TreeSet(); // keep them sorted
// pass 1: fill up 'already' with all words
while (i2.hasNext()) // for each key#
{
already.addAll((List) num2Words.get(i2.next())); // get list of words
}
int num = 0;
already.remove(g); // of course a word is its own syn
Iterator it = already.iterator();
while (it.hasNext())
{
String cur = (String) it.next();
// don't store things like 'pit bull' -> 'american pit bull'
if (!isDecent(cur))
{
continue;
}
num++;
doc.add( new Field( F_SYN, cur, Field.Store.YES, Field.Index.NO));
}
return num;
private static boolean isDecent(java.lang.String s)
Checks whether a word contains only alphabetic characters, one character at a time.
int len = s.length();
for (int i = 0; i < len; i++)
{
if (!Character.isLetter(s.charAt(i)))
{
return false;
}
}
return true;
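For instance, under this check (a few illustrative inputs):

// isDecent("big")      -> true   (letters only)
// isDecent("pit bull") -> false  (contains a space)
// isDecent("3-D")      -> false  (digit and hyphen are not letters)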
public static void main(java.lang.String[] args) throws java.io.IOException
Takes the Prolog file name and index directory as arguments.
// get command line arguments
String prologFilename = null; // name of file "wn_s.pl"
String indexDir = null;
if (args.length == 2)
{
prologFilename = args[0];
indexDir = args[1];
}
else
{
usage();
System.exit(1);
}
// ensure that the prolog file is readable
if (! (new File(prologFilename)).canRead())
{
err.println("Error: cannot read Prolog file: " + prologFilename);
System.exit(1);
}
// exit if the target index directory already exists
if ((new File(indexDir)).isDirectory())
{
err.println("Error: index directory already exists: " + indexDir);
err.println("Please specify a name of a non-existent directory");
System.exit(1);
}
o.println("Opening Prolog file " + prologFilename);
final FileInputStream fis = new FileInputStream(prologFilename);
final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
String line;
// maps a word to all the "groups" it's in
final Map word2Nums = new TreeMap();
// maps a group to all the words in it
final Map num2Words = new TreeMap();
// number of rejected words
int ndecent = 0;
// status output
int mod = 1;
int row = 1;
// parse prolog file
o.println( "[1/2] Parsing " + prologFilename);
while ((line = br.readLine()) != null)
{
// occasional progress
if ((++row) % mod == 0) // periodically print out line we read in
{
mod *= 2;
o.println("\t" + row + " " + line + " " + word2Nums.size()
+ " " + num2Words.size() + " ndecent=" + ndecent);
}
// syntax check
if (! line.startsWith("s("))
{
err.println("OUCH: " + line);
System.exit(1);
}
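// An example of the s/6 lines this loop expects, per the WordNet Prolog
// documentation (the synset number here is illustrative):
//   s(100001740,1,'entity',n,1,11).
// i.e. s(synset_id, w_num, 'word', ss_type, sense_number, tag_count)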
// parse line
line = line.substring(2);
int comma = line.indexOf(',');
String num = line.substring(0, comma);
int q1 = line.indexOf('\'');
line = line.substring(q1 + 1);
int q2 = line.indexOf('\'');
String word = line.substring(0, q2).toLowerCase();
// make sure it is a normal word
if (! isDecent(word))
{
ndecent++;
continue; // skip words containing non-letters (e.g. multiword entries)
}
// 1/2: word2Nums map
// append to entry or add new one
List lis =(List) word2Nums.get(word);
if (lis == null)
{
lis = new LinkedList();
lis.add(num);
word2Nums.put(word, lis);
}
else
lis.add(num);
// 2/2: num2Words map
lis = (List) num2Words.get(num);
if (lis == null)
{
lis = new LinkedList();
lis.add(word);
num2Words.put(num, lis);
}
else
lis.add(word);
}
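// Illustration: after reading the two made-up lines
//   s(100,1,'big',a,1,1).   and   s(100,2,'large',a,1,1).
// the maps would hold:
//   word2Nums = { "big" -> ["100"], "large" -> ["100"] }
//   num2Words = { "100" -> ["big", "large"] }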
// close the streams
fis.close();
br.close();
// create the index
o.println( "[2/2] Building index to store synonyms, " +
" map sizes are " + word2Nums.size() + " and " + num2Words.size());
index(indexDir, word2Nums, num2Words);
private static void usage()
Prints a usage message.
o.println("\n\n" +
"java org.apache.lucene.wordnet.Syn2Index <prolog file> <index dir>\n\n");
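The class can also be driven programmatically rather than from the command line; a minimal sketch, assuming wn_s.pl has been unpacked locally (both paths are hypothetical):

// hypothetical paths; main() validates both and exits on error
Syns2Index.main(new String[] { "/tmp/wn_s.pl", "/tmp/syn_index" });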