SpellChecker.java (Apache Lucene 2.1.0)

File	Doc	Category	Size	Date	Package
SpellChecker.java	API Doc	Apache Lucene 2.1.0	11027	Wed Feb 14 10:46:24 GMT 2007	org.apache.lucene.search.spell
SpellChecker.java

package org.apache.lucene.search.spell;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import java.util.*;

/**
 * <p>
 *   Spell Checker class  (Main class) <br/>
 *  (initially inspired by the David Spencer code).
 * </p>
 *
 * <p>Example Usage:
 * 
 * <pre>
 *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
 *  // To index a field of a user index:
 *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
 *  // To index a file containing words:
 *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
 *  String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
 * </pre>
 * 
 * @author Nicolas Maisonneuve
 * @version 1.0
 */
public class SpellChecker {

  /**
   * Field name for each word in the ngram index.
   */
  public static final String F_WORD = "word";

  /**
   * the spell index
   */
  Directory spellIndex;

  /**
   * Boost value for start and end grams
   */
  private float bStart = 2.0f;
  private float bEnd = 1.0f;

  private IndexReader reader;
  private IndexSearcher searcher;

  // minimum score for hits generated by the spell checker query
  private float minScore = 0.5f;

  public SpellChecker(Directory spellIndex) throws IOException {
    this.setSpellIndex(spellIndex);
  }

  public void setSpellIndex(Directory spellIndex) throws IOException {
    this.spellIndex = spellIndex;
    if (!IndexReader.indexExists(spellIndex)) {
        IndexWriter writer = new IndexWriter(spellIndex, null, true);
        writer.close();
    }
    searcher = new IndexSearcher(this.spellIndex);
  }

  /**
   * Sets the accuracy 0 < minScore < 1; default 0.5
   */
  public void setAccuracy(float min) {
    this.minScore = min;
  }

  /**
   * Suggest similar words
   * @param word String the word you want a spell check done on
   * @param numSug int the number of suggest words
   * @throws IOException
   * @return String[]
   */
  public String[] suggestSimilar(String word, int numSug) throws IOException {
    return this.suggestSimilar(word, numSug, null, null, false);
  }

  /**
   * Suggest similar words (restricted or not to a field of a user index)
   * @param word String the word you want a spell check done on
   * @param numSug int the number of suggest words
   * @param ir the indexReader of the user index (can be null see field param)
   * @param field String the field of the user index: if field is not null, the suggested
   * words are restricted to the words present in this field.
   * @param morePopular boolean return only the suggest words that are more frequent than the searched word
   * (only if restricted mode = (indexReader!=null and field!=null)
   * @throws IOException
   * @return String[] the sorted list of the suggest words with this 2 criteria:
   * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
   * of the suggest words in the field of the user index
   */
  public String[] suggestSimilar(String word, int numSug, IndexReader ir,
      String field, boolean morePopular) throws IOException {

    float min = this.minScore;
    final TRStringDistance sd = new TRStringDistance(word);
    final int lengthWord = word.length();

    final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0;
    // if the word exists in the real index and we don't care for word frequency, return the word itself
    if (!morePopular && goalFreq > 0) {
      return new String[] { word };
    }

    BooleanQuery query = new BooleanQuery();
    String[] grams;
    String key;

    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {

      key = "gram" + ng; // form key

      grams = formGrams(word, ng); // form word into ngrams (allow dups too)

      if (grams.length == 0) {
        continue; // hmm
      }

      if (bStart > 0) { // should we boost prefixes?
        add(query, "start" + ng, grams[0], bStart); // matches start of word

      }
      if (bEnd > 0) { // should we boost suffixes
        add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word

      }
      for (int i = 0; i < grams.length; i++) {
        add(query, key, grams[i]);
      }
    }

//    System.out.println("Q: " + query);
    Hits hits = searcher.search(query);
//    System.out.println("HITS: " + hits.length());
    SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

    // go thru more than 'maxr' matches in case the distance filter triggers
    int stop = Math.min(hits.length(), 10 * numSug);
    SuggestWord sugWord = new SuggestWord();
    for (int i = 0; i < stop; i++) {

      sugWord.string = hits.doc(i).get(F_WORD); // get orig word

      // don't suggest a word for itself, that would be silly
      if (sugWord.string.equals(word)) {
        continue;
      }

      // edit distance/normalize with the minScore word length
      sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math
          .min(sugWord.string.length(), lengthWord));
      if (sugWord.score < min) {
        continue;
      }

      if (ir != null) { // use the user index
        sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
        // don't suggest a word that is not present in the field
        if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
          continue;
        }
      }
      sugQueue.insert(sugWord);
      if (sugQueue.size() == numSug) {
        // if queue full, maintain the minScore score
        min = ((SuggestWord) sugQueue.top()).score;
      }
      sugWord = new SuggestWord();
    }

    // convert to array string
    String[] list = new String[sugQueue.size()];
    for (int i = sugQueue.size() - 1; i >= 0; i--) {
      list[i] = ((SuggestWord) sugQueue.pop()).string;
    }

    return list;
  }

  /**
   * Add a clause to a boolean query.
   */
  private static void add(BooleanQuery q, String name, String value, float boost) {
    Query tq = new TermQuery(new Term(name, value));
    tq.setBoost(boost);
    q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
  }

  /**
   * Add a clause to a boolean query.
   */
  private static void add(BooleanQuery q, String name, String value) {
    q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
  }

  /**
   * Form all ngrams for a given word.
   * @param text the word to parse
   * @param ng the ngram length e.g. 3
   * @return an array of all ngrams in the word and note that duplicates are not removed
   */
  private static String[] formGrams(String text, int ng) {
    int len = text.length();
    String[] res = new String[len - ng + 1];
    for (int i = 0; i < len - ng + 1; i++) {
      res[i] = text.substring(i, i + ng);
    }
    return res;
  }

  public void clearIndex() throws IOException {
    IndexReader.unlock(spellIndex);
    IndexWriter writer = new IndexWriter(spellIndex, null, true);
    writer.close();
  }

  /**
   * Check whether the word exists in the index.
   * @param word String
   * @throws IOException
   * @return true iff the word exists in the index
   */
  public boolean exist(String word) throws IOException {
    if (reader == null) {
      reader = IndexReader.open(spellIndex);
    }
    return reader.docFreq(new Term(F_WORD, word)) > 0;
  }

  /**
   * Index a Dictionary
   * @param dict the dictionary to index
   * @throws IOException
   */
  public void indexDictionary(Dictionary dict) throws IOException {
    IndexReader.unlock(spellIndex);
    IndexWriter writer = new IndexWriter(spellIndex, new WhitespaceAnalyzer(),
        !IndexReader.indexExists(spellIndex));
    writer.setMergeFactor(300);
    writer.setMaxBufferedDocs(150);

    Iterator iter = dict.getWordsIterator();
    while (iter.hasNext()) {
      String word = (String) iter.next();

      int len = word.length();
      if (len < 3) {
        continue; // too short we bail but "too long" is fine...
      }

      if (this.exist(word)) { // if the word already exist in the gramindex
        continue;
      }

      // ok index the word
      Document doc = createDocument(word, getMin(len), getMax(len));
      writer.addDocument(doc);
    }
    // close writer
    writer.optimize();
    writer.close();
  }

  private int getMin(int l) {
    if (l > 5) {
      return 3;
    }
    if (l == 5) {
      return 2;
    }
    return 1;
  }

  private int getMax(int l) {
    if (l > 5) {
      return 4;
    }
    if (l == 5) {
      return 3;
    }
    return 2;
  }

  private static Document createDocument(String text, int ng1, int ng2) {
    Document doc = new Document();
    doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
    addGram(text, doc, ng1, ng2);
    return doc;
  }

  private static void addGram(String text, Document doc, int ng1, int ng2) {
    int len = text.length();
    for (int ng = ng1; ng <= ng2; ng++) {
      String key = "gram" + ng;
      String end = null;
      for (int i = 0; i < len - ng + 1; i++) {
        String gram = text.substring(i, i + ng);
        doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
        if (i == 0) {
          doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
        end = gram;
      }
      if (end != null) { // may not be present if len==ng1
        doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));
      }
    }
  }

  protected void finalize() throws Throwable {
    try {
      if (reader != null) {
        reader.close();
      }
    } finally {
      super.finalize();
    }
  }
}