File: Syns2Index.java
Doc: API Doc
Category: Apache Lucene 1.9
Size: 9247
Date: Mon Feb 20 09:17:52 GMT 2006
Package: org.apache.lucene.wordnet

Syns2Index

public class Syns2Index extends Object
Convert the prolog file wn_s.pl from the WordNet prolog download into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}). This has been tested with WordNet 2.0. The index has fields named "word" ({@link #F_WORD}) and "syn" ({@link #F_SYN}).

The source word (such as 'big') can be looked up in the "word" field, and if it is present the document will contain a "syn" field for every synonym. The subtlety is that a single document can hold multiple fields with the same name, since in general a word has more than one synonym. That is not a problem for Lucene; simply use {@link org.apache.lucene.document.Document#getValues} to retrieve them all (see the lookup sketch below).

While the WordNet file distinguishes groups of synonyms with related meanings, we don't preserve that distinction here.

Building the index can take about 4 minutes on a "fast" system, and the resulting index takes up almost 3 MB.
author
Dave Spencer, dave@searchmorph.com
see
WordNet home page
see
prologdb man page
see
sample site that uses it
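For illustration, here is a minimal lookup sketch against an index built by this class, assuming Lucene 1.9's Hits-based search API. The class name SynsLookupExample, the "./syn_index" path and the word "big" are placeholders, not part of the library ({@link SynExpand#expand} is the supported way to do query expansion).

    import java.io.IOException;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.wordnet.Syns2Index;

    // Hypothetical example class, not part of the Lucene distribution.
    public class SynsLookupExample
    {
        public static void main(String[] args) throws IOException
        {
            IndexSearcher searcher = new IndexSearcher("./syn_index"); // placeholder path
            Hits hits = searcher.search(
                new TermQuery(new Term(Syns2Index.F_WORD, "big")));    // source word, lowercase
            if (hits.length() > 0) // at most one document per word
            {
                Document doc = hits.doc(0);
                // one "syn" value per synonym; only words with at least one
                // synonym are stored, so this array is never empty
                String[] syns = doc.getValues(Syns2Index.F_SYN);
                for (int i = 0; i < syns.length; i++)
                    System.out.println(syns[i]);
            }
            searcher.close();
        }
    }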

Fields Summary
private static final PrintStream o
private static final PrintStream err
public static final String F_SYN
public static final String F_WORD
private static final Analyzer ana
Constructors Summary
Methods Summary
private static void index(java.lang.String indexDir, java.util.Map word2Nums, java.util.Map num2Words)
Forms a Lucene index based on the 2 maps (a small sketch of the expected map shapes follows the method body).

param
indexDir the directory where the index should be created
param
word2Nums maps each word to the synset numbers ("groups") it belongs to
param
num2Words maps each synset number to the words it contains

        int row = 0;
        int mod = 1;

        // override the specific index if it already exists
        IndexWriter writer = new IndexWriter(indexDir, ana, true);
        writer.setUseCompoundFile(true); // why?
		// blindly up these parameters for speed
		writer.setMergeFactor( writer.getMergeFactor() * 2);
		writer.setMaxBufferedDocs( writer.getMaxBufferedDocs() * 2);
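		// (a larger maxBufferedDocs buffers more documents in RAM before a new
		// segment is written, and a larger mergeFactor merges segments less often)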
        Iterator i1 = word2Nums.keySet().iterator();
        while (i1.hasNext()) // for each word
        {
            String g = (String) i1.next();
            Document doc = new Document();

            int n = index(word2Nums, num2Words, g, doc);
            if (n > 0)
            {
				doc.add( new Field( F_WORD, g, Field.Store.YES, Field.Index.UN_TOKENIZED));
                if ((++row % mod) == 0)
                {
                    o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc);
                    mod *= 2;
                }
                writer.addDocument(doc);
            } // else degenerate
        }
		o.println( "Optimizing..");
        writer.optimize();
        writer.close();
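To make the shape of the two maps concrete, here is a minimal hedged sketch; the synset number "100" and the words are made up, and the java.util collections are assumed to be imported as in the rest of this class.

        // word2Nums : word -> list of synset numbers it appears in
        // num2Words : synset number -> list of words in that synset
        Map word2Nums = new TreeMap();
        Map num2Words = new TreeMap();

        List bigGroups = new LinkedList();
        bigGroups.add("100");                  // "big" occurs in (made-up) synset 100
        word2Nums.put("big", bigGroups);

        List group100 = new LinkedList();
        group100.add("big");
        group100.add("large");                 // synset 100 = { big, large }
        num2Words.put("100", group100);

        // index(indexDir, word2Nums, num2Words) would then write a single document
        // with word=big and one syn=large field ("big" is removed from its own
        // synonym list by the helper below).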
    
private static int index(java.util.Map word2Nums, java.util.Map num2Words, java.lang.String g, org.apache.lucene.document.Document doc)
Given the 2 maps, fills a document with the synonyms of 1 word and returns the number of "syn" fields added.

        List keys = (List) word2Nums.get(g); // get list of key#'s
        Iterator i2 = keys.iterator();

        Set already = new TreeSet(); // keep them sorted

        // pass 1: fill up 'already' with all words
        while (i2.hasNext()) // for each key#
        {
            already.addAll((List) num2Words.get(i2.next())); // get list of words
        }
        int num = 0;
        already.remove(g); // of course a word is its own syn
        Iterator it = already.iterator();
        while (it.hasNext())
        {
            String cur = (String) it.next();
            // don't store things like 'pit bull' -> 'american pit bull'
            if (!isDecent(cur))
            {
                continue;
            }
            num++;
			doc.add( new Field( F_SYN, cur, Field.Store.YES, Field.Index.NO));
        }
        return num;
    
private static boolean isDecent(java.lang.String s)
Checks to see if a word contains only alphabetic characters by checking it one character at a time.

param
s string to check
return
true if the string contains only alphabetic characters

        int len = s.length();
        for (int i = 0; i < len; i++)
        {
            if (!Character.isLetter(s.charAt(i)))
            {
                return false;
            }
        }
        return true;
    
public static void main(java.lang.String[] args)
Takes two args: the name of the prolog file (wn_s.pl) and the index directory to create.

        // get command line arguments
        String prologFilename = null; // name of file "wn_s.pl"
        String indexDir = null;
        if (args.length == 2)
        {
            prologFilename = args[0];
            indexDir = args[1];
        }
        else
        {
            usage();
            System.exit(1);
        }

        // ensure that the prolog file is readable
        if (! (new File(prologFilename)).canRead())
        {
            err.println("Error: cannot read Prolog file: " + prologFilename);
            System.exit(1);
        }
        // exit if the target index directory already exists
        if ((new File(indexDir)).isDirectory())
        {
            err.println("Error: index directory already exists: " + indexDir);
            err.println("Please specify a name of a non-existent directory");
            System.exit(1);
        }

        o.println("Opening Prolog file " + prologFilename);
        final FileInputStream fis = new FileInputStream(prologFilename);
        final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
        String line;

        // maps a word to all the "groups" it's in
        final Map word2Nums = new TreeMap();
        // maps a group to all the words in it
        final Map num2Words = new TreeMap();
        // number of rejected words
        int ndecent = 0;

        // status output
        int mod = 1;
        int row = 1;
        // parse prolog file
		o.println( "[1/2] Parsing " + prologFilename);
        while ((line = br.readLine()) != null)
        {
            // occasional progress
            if ((++row) % mod == 0) // periodically print out line we read in
            {
                mod *= 2;
                o.println("\t" + row + " " + line + " " + word2Nums.size()
                    + " " + num2Words.size() + " ndecent=" + ndecent);
            }

            // syntax check
            if (! line.startsWith("s("))
            {
                err.println("OUCH: " + line);
                System.exit(1);
            }

            // parse line
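            // (each line, after the leading "s(" is stripped, has the synset number
            // before the first comma and the word between the first pair of single
            // quotes; everything else on the line is ignored)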
            line = line.substring(2);
            int comma = line.indexOf(',');
            String num = line.substring(0, comma);
            int q1 = line.indexOf('\'');
            line = line.substring(q1 + 1);
            int q2 = line.indexOf('\'');
            String word = line.substring(0, q2).toLowerCase();

            // make sure is a normal word
            if (! isDecent(word))
            {
                ndecent++;
                continue; // don't store words w/ spaces
            }

            // 1/2: word2Nums map
            // append to entry or add new one
            List lis =(List) word2Nums.get(word);
            if (lis == null)
            {
                lis = new LinkedList();
                lis.add(num);
                word2Nums.put(word, lis);
            }
            else
                lis.add(num);

            // 2/2: num2Words map
            lis = (List) num2Words.get(num);
            if (lis == null)
            {
                lis = new LinkedList();
                lis.add(word);
                num2Words.put(num, lis);
            }
            else
                lis.add(word);
        }

        // close the streams
        fis.close();
        br.close();

        // create the index
		o.println( "[2/2] Building index to store synonyms, " +
				   " map sizes are " + word2Nums.size() + " and " + num2Words.size());
        index(indexDir, word2Nums, num2Words);
    
private static void usage()

        o.println("\n\n" +
            "java org.apache.lucene.wordnet.Syn2Index <prolog file> <index dir>\n\n");