FileDocCategorySizeDatePackage
SpellChecker.javaAPI DocApache Lucene 2.1.011027Wed Feb 14 10:46:24 GMT 2007org.apache.lucene.search.spell

SpellChecker

public class SpellChecker extends Object

Spell Checker class (Main class)
(initially inspired by the David Spencer code).

Example Usage:

SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
// To index a field of a user index:
spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
// To index a file containing words:
spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
author
Nicolas Maisonneuve
version
1.0

Fields Summary
public static final String
F_WORD
Field name for each word in the ngram index.
Directory
spellIndex
the spell index
private float
bStart
Boost value for start and end grams
private float
bEnd
private IndexReader
reader
private IndexSearcher
searcher
private float
minScore
Constructors Summary
public SpellChecker(Directory spellIndex)


       
    this.setSpellIndex(spellIndex);
  
Methods Summary
private static voidadd(org.apache.lucene.search.BooleanQuery q, java.lang.String name, java.lang.String value, float boost)
Add a clause to a boolean query.

    Query tq = new TermQuery(new Term(name, value));
    tq.setBoost(boost);
    q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
  
private static voidadd(org.apache.lucene.search.BooleanQuery q, java.lang.String name, java.lang.String value)
Add a clause to a boolean query.

    q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
  
private static voidaddGram(java.lang.String text, org.apache.lucene.document.Document doc, int ng1, int ng2)

    int len = text.length();
    for (int ng = ng1; ng <= ng2; ng++) {
      String key = "gram" + ng;
      String end = null;
      for (int i = 0; i < len - ng + 1; i++) {
        String gram = text.substring(i, i + ng);
        doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
        if (i == 0) {
          doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
        end = gram;
      }
      if (end != null) { // may not be present if len==ng1
        doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));
      }
    }
  
public voidclearIndex()

    IndexReader.unlock(spellIndex);
    IndexWriter writer = new IndexWriter(spellIndex, null, true);
    writer.close();
  
private static org.apache.lucene.document.DocumentcreateDocument(java.lang.String text, int ng1, int ng2)

    Document doc = new Document();
    doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
    addGram(text, doc, ng1, ng2);
    return doc;
  
public booleanexist(java.lang.String word)
Check whether the word exists in the index.

param
word String
throws
IOException
return
true iff the word exists in the index

    if (reader == null) {
      reader = IndexReader.open(spellIndex);
    }
    return reader.docFreq(new Term(F_WORD, word)) > 0;
  
protected voidfinalize()

    try {
      if (reader != null) {
        reader.close();
      }
    } finally {
      super.finalize();
    }
  
private static java.lang.String[]formGrams(java.lang.String text, int ng)
Form all ngrams for a given word.

param
text the word to parse
param
ng the ngram length e.g. 3
return
an array of all ngrams in the word and note that duplicates are not removed

    int len = text.length();
    String[] res = new String[len - ng + 1];
    for (int i = 0; i < len - ng + 1; i++) {
      res[i] = text.substring(i, i + ng);
    }
    return res;
  
private intgetMax(int l)

    if (l > 5) {
      return 4;
    }
    if (l == 5) {
      return 3;
    }
    return 2;
  
private intgetMin(int l)

    if (l > 5) {
      return 3;
    }
    if (l == 5) {
      return 2;
    }
    return 1;
  
public voidindexDictionary(java.util.Dictionary dict)
Index a Dictionary

param
dict the dictionary to index
throws
IOException

    IndexReader.unlock(spellIndex);
    IndexWriter writer = new IndexWriter(spellIndex, new WhitespaceAnalyzer(),
        !IndexReader.indexExists(spellIndex));
    writer.setMergeFactor(300);
    writer.setMaxBufferedDocs(150);

    Iterator iter = dict.getWordsIterator();
    while (iter.hasNext()) {
      String word = (String) iter.next();

      int len = word.length();
      if (len < 3) {
        continue; // too short we bail but "too long" is fine...
      }

      if (this.exist(word)) { // if the word already exist in the gramindex
        continue;
      }

      // ok index the word
      Document doc = createDocument(word, getMin(len), getMax(len));
      writer.addDocument(doc);
    }
    // close writer
    writer.optimize();
    writer.close();
  
public voidsetAccuracy(float min)
Sets the accuracy 0 < minScore < 1; default 0.5

    this.minScore = min;
  
public voidsetSpellIndex(org.apache.lucene.store.Directory spellIndex)

    this.spellIndex = spellIndex;
    if (!IndexReader.indexExists(spellIndex)) {
        IndexWriter writer = new IndexWriter(spellIndex, null, true);
        writer.close();
    }
    searcher = new IndexSearcher(this.spellIndex);
  
public java.lang.String[]suggestSimilar(java.lang.String word, int numSug)
Suggest similar words

param
word String the word you want a spell check done on
param
numSug int the number of suggest words
throws
IOException
return
String[]

    return this.suggestSimilar(word, numSug, null, null, false);
  
public java.lang.String[]suggestSimilar(java.lang.String word, int numSug, org.apache.lucene.index.IndexReader ir, java.lang.String field, boolean morePopular)
Suggest similar words (restricted or not to a field of a user index)

param
word String the word you want a spell check done on
param
numSug int the number of suggest words
param
ir the indexReader of the user index (can be null see field param)
param
field String the field of the user index: if field is not null, the suggested words are restricted to the words present in this field.
param
morePopular boolean return only the suggest words that are more frequent than the searched word (only if restricted mode = (indexReader!=null and field!=null)
throws
IOException
return
String[] the sorted list of the suggest words with this 2 criteria: first criteria: the edit distance, second criteria (only if restricted mode): the popularity of the suggest words in the field of the user index


    float min = this.minScore;
    final TRStringDistance sd = new TRStringDistance(word);
    final int lengthWord = word.length();

    final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0;
    // if the word exists in the real index and we don't care for word frequency, return the word itself
    if (!morePopular && goalFreq > 0) {
      return new String[] { word };
    }

    BooleanQuery query = new BooleanQuery();
    String[] grams;
    String key;

    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {

      key = "gram" + ng; // form key

      grams = formGrams(word, ng); // form word into ngrams (allow dups too)

      if (grams.length == 0) {
        continue; // hmm
      }

      if (bStart > 0) { // should we boost prefixes?
        add(query, "start" + ng, grams[0], bStart); // matches start of word

      }
      if (bEnd > 0) { // should we boost suffixes
        add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word

      }
      for (int i = 0; i < grams.length; i++) {
        add(query, key, grams[i]);
      }
    }

//    System.out.println("Q: " + query);
    Hits hits = searcher.search(query);
//    System.out.println("HITS: " + hits.length());
    SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

    // go thru more than 'maxr' matches in case the distance filter triggers
    int stop = Math.min(hits.length(), 10 * numSug);
    SuggestWord sugWord = new SuggestWord();
    for (int i = 0; i < stop; i++) {

      sugWord.string = hits.doc(i).get(F_WORD); // get orig word

      // don't suggest a word for itself, that would be silly
      if (sugWord.string.equals(word)) {
        continue;
      }

      // edit distance/normalize with the minScore word length
      sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math
          .min(sugWord.string.length(), lengthWord));
      if (sugWord.score < min) {
        continue;
      }

      if (ir != null) { // use the user index
        sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
        // don't suggest a word that is not present in the field
        if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
          continue;
        }
      }
      sugQueue.insert(sugWord);
      if (sugQueue.size() == numSug) {
        // if queue full, maintain the minScore score
        min = ((SuggestWord) sugQueue.top()).score;
      }
      sugWord = new SuggestWord();
    }

    // convert to array string
    String[] list = new String[sugQueue.size()];
    for (int i = sugQueue.size() - 1; i >= 0; i--) {
      list[i] = ((SuggestWord) sugQueue.pop()).string;
    }

    return list;