Syns2Index

public class Syns2Index extends Object

Convert the Prolog file wn_s.pl from the WordNet Prolog download
into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
This has been tested with WordNet 2.0.
The index has fields named "word" ({@link #F_WORD})
and "syn" ({@link #F_SYN}).
The source word (such as 'big') can be looked up in the
"word" field, and if present the matching document will contain one "syn"
field for every synonym. Note that a document may therefore hold multiple
fields with the same name, since a word generally has multiple synonyms.
That is not a problem for Lucene: just use {@link org.apache.lucene.document.Document#getValues}.
While the WordNet file distinguishes groups of synonyms with
related meanings, we do not preserve that distinction here.
Building the index can take about 4 minutes on a "fast" system, and the resulting index takes up almost 3 MB.
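For example, synonyms can be looked up with a sketch like the following (this assumes the Lucene 1.x/2.x-era API used by the code below; the index path is hypothetical):

IndexSearcher searcher = new IndexSearcher("/path/to/syn_index"); // hypothetical path
Hits hits = searcher.search(new TermQuery(new Term(Syns2Index.F_WORD, "big")));
if (hits.length() > 0)
{
    // one stored "syn" field per synonym of "big"
    String[] syns = hits.doc(0).getValues(Syns2Index.F_SYN);
    for (int i = 0; i < syns.length; i++)
        System.out.println(syns[i]);
}
searcher.close();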
Fields Summary

| Modifier and Type | Field |
|---|---|
| private static final PrintStream | o |
| private static final PrintStream | err |
| public static final String | F_SYN |
| public static final String | F_WORD |
| private static final Analyzer | ana |
Methods Summary
private static void index(java.lang.String indexDir, java.util.Map word2Nums, java.util.Map num2Words) throws java.io.IOException
Forms a Lucene index based on the 2 maps.
int row = 0;
int mod = 1;
// overwrite the index if it already exists
IndexWriter writer = new IndexWriter(indexDir, ana, true);
writer.setUseCompoundFile(true); // keep the number of index files (and open file handles) small
// blindly up these parameters for speed
writer.setMergeFactor( writer.getMergeFactor() * 2);
writer.setMaxBufferedDocs( writer.getMaxBufferedDocs() * 2);
Iterator i1 = word2Nums.keySet().iterator();
while (i1.hasNext()) // for each word
{
String g = (String) i1.next();
Document doc = new Document();
int n = index(word2Nums, num2Words, g, doc);
if (n > 0)
{
doc.add( new Field( F_WORD, g, Field.Store.YES, Field.Index.UN_TOKENIZED));
if ((++row % mod) == 0)
{
o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc);
mod *= 2;
}
writer.addDocument(doc);
} // else degenerate
}
o.println( "Optimizing..");
writer.optimize();
writer.close();
private static int index(java.util.Map word2Nums, java.util.Map num2Words, java.lang.String g, org.apache.lucene.document.Document doc)
Given the 2 maps, fills a document for 1 word.
List keys = (List) word2Nums.get(g); // get list of key#'s
Iterator i2 = keys.iterator();
Set already = new TreeSet(); // keep them sorted
// pass 1: fill up 'already' with all words
while (i2.hasNext()) // for each key#
{
already.addAll((List) num2Words.get(i2.next())); // get list of words
}
int num = 0;
already.remove(g); // of course a word is its own syn
Iterator it = already.iterator();
while (it.hasNext())
{
String cur = (String) it.next();
// don't store things like 'pit bull' -> 'american pit bull'
if (!isDecent(cur))
{
continue;
}
num++;
doc.add( new Field( F_SYN, cur, Field.Store.YES, Field.Index.NO));
}
return num;
private static boolean isDecent(java.lang.String s)
Checks whether a word contains only alphabetic characters, one character at a time.
int len = s.length();
for (int i = 0; i < len; i++)
{
if (!Character.isLetter(s.charAt(i)))
{
return false;
}
}
return true;
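For instance, under this check (a few illustrative inputs):

// isDecent("big")      -> true   (letters only)
// isDecent("pit bull") -> false  (contains a space)
// isDecent("3-D")      -> false  (digit and hyphen are not letters)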
public static void main(java.lang.String[] args) throws java.io.IOException
Takes the Prolog file name and index directory as arguments.
// get command line arguments
String prologFilename = null; // name of file "wn_s.pl"
String indexDir = null;
if (args.length == 2)
{
prologFilename = args[0];
indexDir = args[1];
}
else
{
usage();
System.exit(1);
}
// ensure that the prolog file is readable
if (! (new File(prologFilename)).canRead())
{
err.println("Error: cannot read Prolog file: " + prologFilename);
System.exit(1);
}
// exit if the target index directory already exists
if ((new File(indexDir)).isDirectory())
{
err.println("Error: index directory already exists: " + indexDir);
err.println("Please specify a name of a non-existent directory");
System.exit(1);
}
o.println("Opening Prolog file " + prologFilename);
final FileInputStream fis = new FileInputStream(prologFilename);
final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
String line;
// maps a word to all the "groups" it's in
final Map word2Nums = new TreeMap();
// maps a group to all the words in it
final Map num2Words = new TreeMap();
// number of rejected words
int ndecent = 0;
// status output
int mod = 1;
int row = 1;
// parse prolog file
o.println( "[1/2] Parsing " + prologFilename);
while ((line = br.readLine()) != null)
{
// occasional progress
if ((++row) % mod == 0) // periodically print out line we read in
{
mod *= 2;
o.println("\t" + row + " " + line + " " + word2Nums.size()
+ " " + num2Words.size() + " ndecent=" + ndecent);
}
// syntax check
if (! line.startsWith("s("))
{
err.println("OUCH: " + line);
System.exit(1);
}
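// An example of the s/6 lines this loop expects, per the WordNet Prolog
// documentation (the synset number here is illustrative):
//   s(100001740,1,'entity',n,1,11).
// i.e. s(synset_id, w_num, 'word', ss_type, sense_number, tag_count)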
// parse line
line = line.substring(2);
int comma = line.indexOf(',');
String num = line.substring(0, comma);
int q1 = line.indexOf('\'');
line = line.substring(q1 + 1);
int q2 = line.indexOf('\'');
String word = line.substring(0, q2).toLowerCase();
// make sure it is a normal word
if (! isDecent(word))
{
ndecent++;
continue; // skip words containing non-letters (e.g. multiword entries)
}
// 1/2: word2Nums map
// append to entry or add new one
List lis =(List) word2Nums.get(word);
if (lis == null)
{
lis = new LinkedList();
lis.add(num);
word2Nums.put(word, lis);
}
else
lis.add(num);
// 2/2: num2Words map
lis = (List) num2Words.get(num);
if (lis == null)
{
lis = new LinkedList();
lis.add(word);
num2Words.put(num, lis);
}
else
lis.add(word);
}
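// Illustration: after reading the two made-up lines
//   s(100,1,'big',a,1,1).   and   s(100,2,'large',a,1,1).
// the maps would hold:
//   word2Nums = { "big" -> ["100"], "large" -> ["100"] }
//   num2Words = { "100" -> ["big", "large"] }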
// close the streams
fis.close();
br.close();
// create the index
o.println( "[2/2] Building index to store synonyms, " +
" map sizes are " + word2Nums.size() + " and " + num2Words.size());
index(indexDir, word2Nums, num2Words);
private static void usage()
Prints a usage message.
o.println("\n\n" +
"java org.apache.lucene.wordnet.Syn2Index <prolog file> <index dir>\n\n");
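The class can also be driven programmatically rather than from the command line; a minimal sketch, assuming wn_s.pl has been unpacked locally (both paths are hypothetical):

// hypothetical paths; main() validates both and exits on error
Syns2Index.main(new String[] { "/tmp/wn_s.pl", "/tmp/syn_index" });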