File Doc Category Size Date Package
SpellChecker.java API Doc Apache Lucene 2.1.0 11027 Wed Feb 14 10:46:24 GMT 2007 org.apache.lucene.search.spell

SpellChecker

java.lang.Object

public class SpellChecker extends Object

Spell Checker class (Main class)
(initially inspired by the David Spencer code).

Example Usage:

SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
// To index a field of a user index:
spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
// To index a file containing words:
spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);

author: Nicolas Maisonneuve
version: 1.0

Fields Summary
public static final String
F_WORD
Field name for each word in the ngram index.
Directory
spellIndex
the spell index
private float
bStart
Boost value for start and end grams
private float
bEnd
private IndexReader
reader
private IndexSearcher
searcher
private float
minScore
Constructors Summary
public SpellChecker(Directory spellIndex)
this.setSpellIndex(spellIndex);
Methods Summary
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String name, java.lang.String value, float boost)
Add a clause to a boolean query.
Query tq = new TermQuery(new Term(name, value)); tq.setBoost(boost); q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String name, java.lang.String value)
Add a clause to a boolean query.
q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
private static void addGram(java.lang.String text, org.apache.lucene.document.Document doc, int ng1, int ng2)
int len = text.length(); for (int ng = ng1; ng <= ng2; ng++) { String key = "gram" + ng; String end = null; for (int i = 0; i < len - ng + 1; i++) { String gram = text.substring(i, i + ng); doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); if (i == 0) { doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); } end = gram; } if (end != null) { // may not be present if len==ng1 doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED)); } }
public void clearIndex()
IndexReader.unlock(spellIndex); IndexWriter writer = new IndexWriter(spellIndex, null, true); writer.close();
private static org.apache.lucene.document.Document createDocument(java.lang.String text, int ng1, int ng2)
Document doc = new Document(); doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term addGram(text, doc, ng1, ng2); return doc;
public boolean exist(java.lang.String word)
Check whether the word exists in the index.
param
word String
throws
IOException
return
true iff the word exists in the index
if (reader == null) { reader = IndexReader.open(spellIndex); } return reader.docFreq(new Term(F_WORD, word)) > 0;
protected void finalize()
try { if (reader != null) { reader.close(); } } finally { super.finalize(); }
private static java.lang.String[] formGrams(java.lang.String text, int ng)
Form all ngrams for a given word.
param
text the word to parse
param
ng the ngram length e.g. 3
return
an array of all ngrams in the word and note that duplicates are not removed
int len = text.length(); String[] res = new String[len - ng + 1]; for (int i = 0; i < len - ng + 1; i++) { res[i] = text.substring(i, i + ng); } return res;
private int getMax(int l)
if (l > 5) { return 4; } if (l == 5) { return 3; } return 2;
private int getMin(int l)
if (l > 5) { return 3; } if (l == 5) { return 2; } return 1;
public void indexDictionary(java.util.Dictionary dict)
Index a Dictionary
param
dict the dictionary to index
throws
IOException
IndexReader.unlock(spellIndex); IndexWriter writer = new IndexWriter(spellIndex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellIndex)); writer.setMergeFactor(300); writer.setMaxBufferedDocs(150); Iterator iter = dict.getWordsIterator(); while (iter.hasNext()) { String word = (String) iter.next(); int len = word.length(); if (len < 3) { continue; // too short we bail but "too long" is fine... } if (this.exist(word)) { // if the word already exist in the gramindex continue; } // ok index the word Document doc = createDocument(word, getMin(len), getMax(len)); writer.addDocument(doc); } // close writer writer.optimize(); writer.close();
public void setAccuracy(float min)
Sets the accuracy 0 < minScore < 1; default 0.5
this.minScore = min;
public void setSpellIndex(org.apache.lucene.store.Directory spellIndex)
this.spellIndex = spellIndex; if (!IndexReader.indexExists(spellIndex)) { IndexWriter writer = new IndexWriter(spellIndex, null, true); writer.close(); } searcher = new IndexSearcher(this.spellIndex);
public java.lang.String[] suggestSimilar(java.lang.String word, int numSug)
Suggest similar words
param
word String the word you want a spell check done on
param
numSug int the number of suggest words
throws
IOException
return
String[]
return this.suggestSimilar(word, numSug, null, null, false);
public java.lang.String[] suggestSimilar(java.lang.String word, int numSug, org.apache.lucene.index.IndexReader ir, java.lang.String field, boolean morePopular)
Suggest similar words (restricted or not to a field of a user index)
param
word String the word you want a spell check done on
param
numSug int the number of suggest words
param
ir the indexReader of the user index (can be null see field param)
param
field String the field of the user index: if field is not null, the suggested words are restricted to the words present in this field.
param
morePopular boolean return only the suggest words that are more frequent than the searched word (only if restricted mode = (indexReader!=null and field!=null)
throws
IOException
return
String[] the sorted list of the suggest words with this 2 criteria: first criteria: the edit distance, second criteria (only if restricted mode): the popularity of the suggest words in the field of the user index
float min = this.minScore; final TRStringDistance sd = new TRStringDistance(word); final int lengthWord = word.length(); final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0; // if the word exists in the real index and we don't care for word frequency, return the word itself if (!morePopular && goalFreq > 0) { return new String[] { word }; } BooleanQuery query = new BooleanQuery(); String[] grams; String key; for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) { key = "gram" + ng; // form key grams = formGrams(word, ng); // form word into ngrams (allow dups too) if (grams.length == 0) { continue; // hmm } if (bStart > 0) { // should we boost prefixes? add(query, "start" + ng, grams[0], bStart); // matches start of word } if (bEnd > 0) { // should we boost suffixes add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word } for (int i = 0; i < grams.length; i++) { add(query, key, grams[i]); } } // System.out.println("Q: " + query); Hits hits = searcher.search(query); // System.out.println("HITS: " + hits.length()); SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); // go thru more than 'maxr' matches in case the distance filter triggers int stop = Math.min(hits.length(), 10 * numSug); SuggestWord sugWord = new SuggestWord(); for (int i = 0; i < stop; i++) { sugWord.string = hits.doc(i).get(F_WORD); // get orig word // don't suggest a word for itself, that would be silly if (sugWord.string.equals(word)) { continue; } // edit distance/normalize with the minScore word length sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math .min(sugWord.string.length(), lengthWord)); if (sugWord.score < min) { continue; } if (ir != null) { // use the user index sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index // don't suggest a word that is not present in the field if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) { continue; } } sugQueue.insert(sugWord); if (sugQueue.size() == numSug) { // if queue full, maintain the minScore score min = ((SuggestWord) sugQueue.top()).score; } sugWord = new SuggestWord(); } // convert to array string String[] list = new String[sugQueue.size()]; for (int i = sugQueue.size() - 1; i >= 0; i--) { list[i] = ((SuggestWord) sugQueue.pop()).string; } return list;