File | Doc | Category | Size | Date | Package
SpellChecker.java | API Doc | Apache Lucene 1.9 | 11137 | Mon Feb 20 09:18:32 GMT 2006 | org.apache.lucene.search.spell

SpellChecker

public class SpellChecker extends Object

Spell Checker class (Main class)
(initially inspired by the David Spencer code).

Example Usage:

SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
// To index a field of a user index:
spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
// To index a file containing words:
spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
author
Nicolas Maisonneuve
version
1.0

Fields Summary
public static final String
F_WORD
Field name for each word in the ngram index.
Directory
spellindex
the spell index
private float
bStart
Boost value for start and end grams
private float
bEnd
private IndexReader
reader
float
min
Constructors Summary
public SpellChecker(Directory gramIndex)

        // Delegate to the setter, which records gramIndex as the spell index directory.
        setSpellIndex(gramIndex);
    
Methods Summary
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String k, java.lang.String v, float boost)
Add a clause to a boolean query.

        // Wrap the (field k, text v) term in a query, weight it with the given boost,
        // and attach it to q as an optional (SHOULD) clause.
        Query boosted=new TermQuery(new Term(k, v));
        boosted.setBoost(boost);
        BooleanClause optional=new BooleanClause(boosted, BooleanClause.Occur.SHOULD);
        q.add(optional);
    
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String k, java.lang.String v)
Add a clause to a boolean query.

        q.add(new BooleanClause(new TermQuery(new Term(k, v)), BooleanClause.Occur.SHOULD));
    
private static void addGram(java.lang.String text, org.apache.lucene.document.Document doc, int ng1, int ng2)

        // For every gram size in [ng1, ng2], add each substring of that size to doc under
        // "gram<size>"; the first one is also added under "start<size>" and the last one
        // under "end<size>".
        final int textLength=text.length();
        for (int size=ng1; size<=ng2; size++) {
            final int gramCount=textLength-size+1;
            if (gramCount<1) {
                continue; // text is shorter than this gram size: no gram, start or end field
            }
            final String gramField="gram"+size;
            String last=null;
            for (int pos=0; pos<gramCount; pos++) {
                last=text.substring(pos, pos+size);
                doc.add(new Field(gramField, last, Field.Store.YES, Field.Index.UN_TOKENIZED));
                if (pos==0) {
                    // the gram at offset 0 doubles as the word's prefix
                    doc.add(new Field("start"+size, last, Field.Store.YES, Field.Index.UN_TOKENIZED));
                }
            }
            // the loop ran at least once, so 'last' now holds the final gram (the word's suffix)
            doc.add(new Field("end"+size, last, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
    
public void clearIndex()

        // Empty the spell index.
        //
        // exist() caches an IndexReader in the 'reader' field. Release it first so that the
        // next exist() call reopens the index and sees the cleared state instead of the
        // contents from before the clear.
        if (reader!=null) {
            IndexReader stale=reader;
            reader=null;
            stale.close();
        }

        IndexReader.unlock(spellindex);
        // The third argument is the "create" flag (indexDictionary passes
        // !IndexReader.indexExists(...) for it); passing true starts a fresh index.
        IndexWriter writer=new IndexWriter(spellindex, null, true);
        writer.close();
    
private static org.apache.lucene.document.Document createDocument(java.lang.String text, int ng1, int ng2)

        // Build the index entry for one word: the word itself under F_WORD, followed by
        // its n-gram fields for gram sizes ng1..ng2 (added by addGram).
        final Document wordDoc=new Document();
        final Field original=new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED);
        wordDoc.add(original);
        addGram(text, wordDoc, ng1, ng2);
        return wordDoc;
    
public boolean exist(java.lang.String word)
Check whether the word exists in the index.

param
word String
throws
IOException
return
true iff the word exists in the index

        if (reader==null) {
            reader=IndexReader.open(spellindex);
        }
        return reader.docFreq(new Term(F_WORD, word))>0;
    
protected void finalize()

        // Release the cached index reader, if one was ever opened.
        if (reader==null) {
            return;
        }
        reader.close();
    
private static java.lang.String[] formGrams(java.lang.String text, int ng)
Form all ngrams for a given word.

param
text the word to parse
param
ng the ngram length e.g. 3
return
an array of all ngrams in the word, in order of appearance; note that duplicates are not removed

        // Collect every substring of length ng, left to right (duplicates are kept).
        // The count is clamped to 0 so that a text shorter than ng yields an empty array;
        // without the clamp a text more than one character shorter than ng (e.g. "" with
        // ng=2) asked for a negative array size and threw NegativeArraySizeException.
        final int count=Math.max(0, text.length()-ng+1);
        String[] res=new String[count];
        for (int i=0; i<count; i++) {
            res[i]=text.substring(i, i+ng);
        }
        return res;
    
private int getMax(int l)

        // Largest gram size for a word of length l:
        // 2 below five characters, 3 at exactly five, 4 above five.
        if (l<5) {
            return 2;
        }
        return (l==5)?3:4;
    
private int getMin(int l)

        // Smallest gram size for a word of length l:
        // 1 below five characters, 2 at exactly five, 3 above five.
        if (l<5) {
            return 1;
        }
        return (l==5)?2:3;
    
public void indexDictionary(org.apache.lucene.search.spell.Dictionary dict)
Index a Dictionary

param
dict the dictionary to index
throws
IOException

        // Add every word of the dictionary to the spell index, skipping words shorter than
        // three characters and words exist() already finds in the index.
        IndexReader.unlock(spellindex);
        // create the index only when none exists yet; otherwise append to it
        IndexWriter writer=new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex));
        try {
            writer.setMergeFactor(300);
            writer.setMaxBufferedDocs(150);

            Iterator iter=dict.getWordsIterator();
            while (iter.hasNext()) {
                String word=(String) iter.next();

                int len=word.length();
                if (len<3) {
                    continue; // too short we bail but "too long" is fine...
                }

                if (this.exist(word)) { // if the word already exist in the gramindex
                    continue;
                }

                // ok index the word; gram sizes depend on the word length
                Document doc=createDocument(word, getMin(len), getMax(len));
                writer.addDocument(doc);
            }
            writer.optimize();
        } finally {
            // Always release the writer, even when indexing failed part-way.
            try {
                writer.close();
            } finally {
                // Drop the reader cached by exist(): it predates the words just added.
                // It is still null when exist() was never reached (empty dictionary or
                // only short words), which used to end in a NullPointerException here.
                IndexReader stale=reader;
                reader=null;
                if (stale!=null) {
                    stale.close();
                }
            }
        }
    
public void setAccuraty(float min)
Set the accuracy 0 < min < 1; default 0.5

        // Store the minimum score; suggestSimilar skips candidates whose score is below it.
        this.min=min;
    
public void setSpellIndex(org.apache.lucene.store.Directory spellindex)


         
        // Remember the directory that holds the n-gram spell index.
        // NOTE(review): a reader already cached by exist() is not reset here, so it would
        // presumably keep pointing at the previous directory — confirm.
        this.spellindex=spellindex;
    
public java.lang.String[] suggestSimilar(java.lang.String word, int num_sug)
Suggest similar words

param
word String the word you want a spell check done on
param
num_sug int the number of suggest words
throws
IOException
return
String[]

        // Unrestricted lookup: no user index, no field, no popularity filter.
        return suggestSimilar(word, num_sug, null, null, false);
    
public java.lang.String[] suggestSimilar(java.lang.String word, int num_sug, org.apache.lucene.index.IndexReader ir, java.lang.String field, boolean morePopular)
Suggest similar words (restricted or not to a field of a user index)

param
word String the word you want a spell check done on
param
num_sug int the number of suggest words
param
ir the indexReader of the user index (can be null see field param)
param
field String the field of the user index: if field is not null, the suggested words are restricted to the words present in this field.
param
morePopular boolean return only the suggested words that are more frequent than the searched word (only in restricted mode, i.e. indexReader!=null and field!=null)
throws
IOException
return
String[] the sorted list of the suggested words, ordered by these two criteria: first, the edit distance; second (only in restricted mode), the popularity of the suggested word in the field of the user index


        // Suggest up to num_sug indexed words that resemble 'word':
        //  1. build an OR query from the word's n-grams (optionally boosting its first and
        //     last gram),
        //  2. score each hit by edit distance normalized with the shorter word length and
        //     drop hits below the accuracy threshold,
        //  3. in restricted mode (ir and field both non-null) also require the candidate to
        //     occur in 'field' of the user index and, with morePopular, to be at least as
        //     frequent there as 'word'.
        final int lengthWord=word.length();
        if (lengthWord==0) {
            // no grams can be formed from an empty word, so there is nothing to look up
            return new String[0];
        }
        final TRStringDistance sd=new TRStringDistance(word);

        final boolean restricted=ir!=null&&field!=null;

        // Frequency of the searched word in the user index (0 when not in restricted mode).
        final int freq=restricted?ir.docFreq(new Term(field, word)):0;
        // Minimum frequency a suggestion must reach; only enforced with morePopular.
        final int goalFreq=morePopular?freq:0;
        if (!morePopular&&freq>0) {
            // The word exists in the user index and no more popular alternative was asked
            // for, so the word itself is the answer. (This test used goalFreq before, which
            // is always 0 when morePopular is false, so the branch could never be taken.)
            return new String[] {word};
        }

        // Work on a local copy of the accuracy threshold: it is raised below once the queue
        // is full, and that must not change the value configured on this instance.
        float threshold=min;

        BooleanQuery query=new BooleanQuery();
        for (int ng=getMin(lengthWord); ng<=getMax(lengthWord); ng++) {

            String key="gram"+ng; // form key

            String[] grams=formGrams(word, ng); // form word into ngrams (allow dups too)

            if (grams.length==0) {
                continue; // word too short for this gram size
            }

            if (bStart>0) { // should we boost prefixes?
                add(query, "start"+ng, grams[0], bStart); // matches start of word
            }
            if (bEnd>0) { // should we boost suffixes
                add(query, "end"+ng, grams[grams.length-1], bEnd); // matches end of word
            }
            for (int i=0; i<grams.length; i++) {
                add(query, key, grams[i]);
            }
        }

        IndexSearcher searcher=new IndexSearcher(this.spellindex);
        try {
            Hits hits=searcher.search(query);
            SuggestWordQueue sugqueue=new SuggestWordQueue(num_sug);

            // examine up to ten times the requested number of hits, since the filters
            // below reject some of them
            int stop=Math.min(hits.length(), 10*num_sug);
            SuggestWord sugword=new SuggestWord();
            for (int i=0; i<stop; i++) {

                sugword.string=hits.doc(i).get(F_WORD); // get orig word

                if (sugword.string.equals(word)) {
                    continue; // don't suggest a word for itself, that would be silly
                }

                // edit distance, normalized with the min word length
                sugword.score=1.0f-((float) sd.getDistance(sugword.string)/Math.min(sugword.string.length(), lengthWord));
                if (sugword.score<threshold) {
                    continue;
                }

                if (restricted) { // use the user index
                    sugword.freq=ir.docFreq(new Term(field, sugword.string)); // freq in the index
                    // skip words that are absent from the field or rarer than required
                    if ((morePopular&&goalFreq>sugword.freq)||sugword.freq<1) {
                        continue;
                    }
                }
                sugqueue.insert(sugword);
                if (sugqueue.size()==num_sug) {
                    // queue full: later candidates must beat the current weakest entry
                    threshold=((SuggestWord) sugqueue.top()).score;
                }
                // the queue now owns the inserted instance; continue with a fresh one
                sugword=new SuggestWord();
            }

            // convert to array string, filling from the back so the last pop lands at index 0
            String[] list=new String[sugqueue.size()];
            for (int i=sugqueue.size()-1; i>=0; i--) {
                list[i]=((SuggestWord) sugqueue.pop()).string;
            }
            return list;
        } finally {
            // release the searcher even when the search or scoring throws
            searcher.close();
        }