File: SpellChecker.java
Doc Category: API Doc
Project: Apache Lucene 2.2.0
Size: 13019
Date: Sat Jun 16 22:21:02 BST 2007
Package: org.apache.lucene.search.spell

SpellChecker

public class SpellChecker extends Object

Spell Checker class (main class),
initially inspired by the David Spencer code.

Example Usage:

SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
// To index a field of a user index:
spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
// To index a file containing words:
spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
author
Nicolas Maisonneuve
version
1.0

Fields Summary
public static final String
F_WORD
Field name for each word in the ngram index.
Directory
spellIndex
the spell index
private float
bStart
Boost value for start and end grams
private float
bEnd
private IndexReader
reader
private IndexSearcher
searcher
private float
minScore
Constructors Summary
public SpellChecker(Directory spellIndex)
Use the given directory as a spell checker index. The directory is created if it doesn't exist yet.

param
spellIndex
throws
IOException

    this.setSpellIndex(spellIndex);
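
A minimal construction sketch (the in-memory RAMDirectory is used here only for illustration; any org.apache.lucene.store.Directory, such as an FSDirectory dedicated to the spell index, can be passed instead):

Directory spellIndexDirectory = new RAMDirectory();
SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
// the constructor delegates to setSpellIndex(), which creates an empty
// index in the directory if none exists yet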
  
Methods Summary
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String name, java.lang.String value, float boost)
Add a clause to a boolean query.

    Query tq = new TermQuery(new Term(name, value));
    tq.setBoost(boost);
    q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
  
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String name, java.lang.String value)
Add a clause to a boolean query.

    q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
  
private static void addGram(java.lang.String text, org.apache.lucene.document.Document doc, int ng1, int ng2)
Add all n-grams of the text (for gram sizes ng1 through ng2), plus start/end gram fields, to the document.

    int len = text.length();
    for (int ng = ng1; ng <= ng2; ng++) {
      String key = "gram" + ng;
      String end = null;
      for (int i = 0; i < len - ng + 1; i++) {
        String gram = text.substring(i, i + ng);
        doc.add(new Field(key, gram, Field.Store.NO, Field.Index.UN_TOKENIZED));
        if (i == 0) {
          doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.UN_TOKENIZED));
        }
        end = gram;
      }
      if (end != null) { // may not be present if len==ng1
        doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.UN_TOKENIZED));
      }
    }
  
public void clearIndex()
Removes all terms from the spell check index.

throws
IOException

    IndexReader.unlock(spellIndex);
    IndexWriter writer = new IndexWriter(spellIndex, null, true);
    writer.close();
  
private static org.apache.lucene.document.Document createDocument(java.lang.String text, int ng1, int ng2)
Create a document for the given word: the original word is stored, and its n-grams (sizes ng1 through ng2) are added as indexed fields.

    Document doc = new Document();
    doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
    addGram(text, doc, ng1, ng2);
    return doc;
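
As a worked example, for the word "lucene" (length 6, so 3- and 4-grams are chosen by getMin/getMax below), the created document contains fields along these lines:

F_WORD: lucene   (the original word, stored)
gram3:  luc, uce, cen, ene
start3: luc
end3:   ene
gram4:  luce, ucen, cene
start4: luce
end4:   cene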
  
public boolean exist(java.lang.String word)
Check whether the word exists in the index.

param
word
throws
IOException
return
true iff the word exists in the index

    if (reader == null) {
      reader = IndexReader.open(spellIndex);
    }
    return reader.docFreq(new Term(F_WORD, word)) > 0;
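
A small usage sketch (the query string is an arbitrary example):

String query = "recieve";
if (!spellchecker.exist(query)) {
  // the word is unknown to the spell index, so ask for alternatives
  String[] suggestions = spellchecker.suggestSimilar(query, 5);
}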
  
protected void finalize()
Closes the internal IndexReader.

    try {
      if (reader != null) {
        reader.close();
      }
    } finally {
      super.finalize();
    }
  
private static java.lang.String[] formGrams(java.lang.String text, int ng)
Form all ngrams for a given word.

param
text the word to parse
param
ng the ngram length e.g. 3
return
an array of all ngrams in the word; note that duplicates are not removed

    int len = text.length();
    String[] res = new String[len - ng + 1];
    for (int i = 0; i < len - ng + 1; i++) {
      res[i] = text.substring(i, i + ng);
    }
    return res;
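
For example, since duplicates are kept:

formGrams("banana", 3)  -->  { "ban", "ana", "nan", "ana" }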
  
private int getMax(int l)
Return the largest n-gram size to use for a word of length l.

    if (l > 5) {
      return 4;
    }
    if (l == 5) {
      return 3;
    }
    return 2;
  
private int getMin(int l)
Return the smallest n-gram size to use for a word of length l.

    if (l > 5) {
      return 3;
    }
    if (l == 5) {
      return 2;
    }
    return 1;
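
Taken together, getMin and getMax select the n-gram sizes from the word length:

word length 3 or 4  ->  1- and 2-grams
word length 5       ->  2- and 3-grams
word length over 5  ->  3- and 4-grams

(Words shorter than 3 characters are skipped by indexDictionary.)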
  
public void indexDictionary(org.apache.lucene.search.spell.Dictionary dict)
Index a Dictionary.

param
dict the dictionary to index
throws
IOException

    IndexReader.unlock(spellIndex);
    IndexWriter writer = new IndexWriter(spellIndex, new WhitespaceAnalyzer(),
        !IndexReader.indexExists(spellIndex));
    writer.setMergeFactor(300);
    writer.setMaxBufferedDocs(150);

    Iterator iter = dict.getWordsIterator();
    while (iter.hasNext()) {
      String word = (String) iter.next();

      int len = word.length();
      if (len < 3) {
        continue; // too short we bail but "too long" is fine...
      }

      if (this.exist(word)) { // if the word already exist in the gramindex
        continue;
      }

      // ok index the word
      Document doc = createDocument(word, getMin(len), getMax(len));
      writer.addDocument(doc);
    }
    // close writer
    writer.optimize();
    writer.close();
    // close reader so it will be re-opened (and see the new content) when exist()
    // is called the next time:
    if (reader != null) {
      reader.close();
      reader = null;
    }
    // also re-open the spell index to see our own changes when the next suggestion
    // is fetched:
    searcher.close();
    searcher = new IndexSearcher(this.spellIndex);
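
A usage sketch covering both dictionary flavours (the file name, the "title" field and userIndexDirectory are hypothetical placeholders):

// words from a plain text file (one word per line)
spellchecker.indexDictionary(new PlainTextDictionary(new File("dictionary.txt")));

// terms taken from a field of an existing user index
IndexReader userReader = IndexReader.open(userIndexDirectory);
spellchecker.indexDictionary(new LuceneDictionary(userReader, "title"));
userReader.close();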
  
public void setAccuracy(float minScore)
Sets the accuracy, 0 < minScore < 1; default 0.5.

    this.minScore = minScore;
  
public void setSpellIndex(org.apache.lucene.store.Directory spellIndex)
Use a different index as the spell checker index or re-open the existing index if spellIndex is the same value as given in the constructor.

param
spellIndex
throws
IOException

    this.spellIndex = spellIndex;
    if (!IndexReader.indexExists(spellIndex)) {
        IndexWriter writer = new IndexWriter(spellIndex, null, true);
        writer.close();
    }
    // close the old searcher, if there was one
    if (searcher != null) {
      searcher.close();
    }
    searcher = new IndexSearcher(this.spellIndex);
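
A sketch of switching to a different spell index at runtime (the new directory here is just an in-memory example):

Directory newSpellIndex = new RAMDirectory();
spellchecker.setSpellIndex(newSpellIndex);
// subsequent suggestSimilar() calls search the new (initially empty) index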
  
public java.lang.String[] suggestSimilar(java.lang.String word, int numSug)
Suggest similar words.

As the Lucene similarity used to fetch the most relevant n-grammed terms is not the same as the edit distance strategy used to pick the best matching spell-checked word from the hits Lucene found, one usually has to request more suggestions than actually needed (i.e. a larger numSug) in order to get the true best match.

In other words, if numSug == 1, don't count on that suggestion being the best one. You should set this value to at least 5 to get a good suggestion.

param
word the word you want a spell check done on
param
numSug the number of suggested words
throws
IOException
return
String[] the sorted list of suggested words (best match first)

    return this.suggestSimilar(word, numSug, null, null, false);
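
Following the note above, request several suggestions and take the first one, which is the best ranked:

String[] suggestions = spellchecker.suggestSimilar("recieve", 5);
if (suggestions.length > 0) {
  String best = suggestions[0]; // results are sorted best match first
}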
  
public java.lang.String[] suggestSimilar(java.lang.String word, int numSug, org.apache.lucene.index.IndexReader ir, java.lang.String field, boolean morePopular)
Suggest similar words (optionally restricted to a field of an index).

As the Lucene similarity used to fetch the most relevant n-grammed terms is not the same as the edit distance strategy used to pick the best matching spell-checked word from the hits Lucene found, one usually has to request more suggestions than actually needed (i.e. a larger numSug) in order to get the true best match.

In other words, if numSug == 1, don't count on that suggestion being the best one. You should set this value to at least 5 to get a good suggestion.

param
word the word you want a spell check done on
param
numSug the number of suggested words
param
ir the indexReader of the user index (can be null see field param)
param
field the field of the user index: if field is not null, the suggested words are restricted to the words present in this field.
param
morePopular return only suggested words that are more frequent than the searched word (applies only in restricted mode, i.e. when indexReader != null and field != null)
throws
IOException
return
String[] the sorted list of suggested words, ranked by two criteria: first the edit distance, then (in restricted mode only) the popularity of the suggested word in the field of the user index


    float min = this.minScore;
    final TRStringDistance sd = new TRStringDistance(word);
    final int lengthWord = word.length();

    final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0;
    // if the word exists in the real index and we don't care for word frequency, return the word itself
    if (!morePopular && goalFreq > 0) {
      return new String[] { word };
    }

    BooleanQuery query = new BooleanQuery();
    String[] grams;
    String key;

    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {

      key = "gram" + ng; // form key

      grams = formGrams(word, ng); // form word into ngrams (allow dups too)

      if (grams.length == 0) {
        continue; // no grams of this length: the word is shorter than ng
      }

      if (bStart > 0) { // should we boost prefixes?
        add(query, "start" + ng, grams[0], bStart); // matches start of word

      }
      if (bEnd > 0) { // should we boost suffixes
        add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word

      }
      for (int i = 0; i < grams.length; i++) {
        add(query, key, grams[i]);
      }
    }

//    System.out.println("Q: " + query);
    Hits hits = searcher.search(query);
//    System.out.println("HITS: " + hits.length());
    SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

    // go through more than numSug matches in case the distance filter rejects some
    int stop = Math.min(hits.length(), 10 * numSug);
    SuggestWord sugWord = new SuggestWord();
    for (int i = 0; i < stop; i++) {

      sugWord.string = hits.doc(i).get(F_WORD); // get orig word

      // don't suggest a word for itself, that would be silly
      if (sugWord.string.equals(word)) {
        continue;
      }

      // edit distance, normalized by the min of the two word lengths
      sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math
          .min(sugWord.string.length(), lengthWord));
      if (sugWord.score < min) {
        continue;
      }

      if (ir != null) { // use the user index
        sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
        // don't suggest a word that is not present in the field
        if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
          continue;
        }
      }
      sugQueue.insert(sugWord);
      if (sugQueue.size() == numSug) {
        // if the queue is full, use its lowest score as the new threshold
        min = ((SuggestWord) sugQueue.top()).score;
      }
      sugWord = new SuggestWord();
    }

    // convert the queue into a String array
    String[] list = new String[sugQueue.size()];
    for (int i = sugQueue.size() - 1; i >= 0; i--) {
      list[i] = ((SuggestWord) sugQueue.pop()).string;
    }

    return list;
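
A restricted-mode sketch (userIndexDirectory and the "contents" field are hypothetical): suggestions are limited to terms that occur in the given field, and morePopular = true keeps only words more frequent than the query word:

IndexReader userReader = IndexReader.open(userIndexDirectory);
String[] suggestions =
    spellchecker.suggestSimilar("recieve", 5, userReader, "contents", true);
userReader.close();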