File Doc Category Size Date Package
SpellChecker.java API Doc Apache Lucene 1.9 11137 Mon Feb 20 09:18:32 GMT 2006 org.apache.lucene.search.spell

SpellChecker

java.lang.Object

public class SpellChecker extends Object

Spell checker class (the main class of this package).
Initially inspired by David Spencer's code.

Example Usage:

SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
// To index a field of a user index:
spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
// To index a file containing words:
spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);

author: Nicolas Maisonneuve
version: 1.0

Fields Summary
public static final String
F_WORD
Field name for each word in the ngram index.
Directory
spellindex
the spell index
private float
bStart
Boost value for start and end grams
private float
bEnd
private IndexReader
reader
float
min
Constructors Summary
/**
 * Creates a spell checker backed by the given n-gram index directory.
 *
 * @param gramIndex the directory handed to {@code setSpellIndex}
 */
public SpellChecker(Directory gramIndex) {
  setSpellIndex(gramIndex);
}
Methods Summary
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String k, java.lang.String v, float boost)
Add a clause to a boolean query.
Query tq=new TermQuery(new Term(k, v)); tq.setBoost(boost); q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String k, java.lang.String v)
Add a clause to a boolean query.
q.add(new BooleanClause(new TermQuery(new Term(k, v)), BooleanClause.Occur.SHOULD));
private static void addGram(java.lang.String text, org.apache.lucene.document.Document doc, int ng1, int ng2)
int len=text.length(); for (int ng=ng1; ng<=ng2; ng++) { String key="gram"+ng; String end=null; for (int i=0; i<len-ng+1; i++) { String gram=text.substring(i, i+ng); doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); if (i==0) { doc.add(new Field("start"+ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); } end=gram; } if (end!=null) { // may not be present if len==ng1 doc.add(new Field("end"+ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED)); } }
public void clearIndex()
IndexReader.unlock(spellindex); IndexWriter writer=new IndexWriter(spellindex, null, true); writer.close();
private static org.apache.lucene.document.Document createDocument(java.lang.String text, int ng1, int ng2)
Document doc=new Document(); doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term addGram(text, doc, ng1, ng2); return doc;
public boolean exist(java.lang.String word)
Check whether the word exists in the index.
param
word String
throws
IOException
return
true iff the word exists in the index
if (reader==null) { reader=IndexReader.open(spellindex); } return reader.docFreq(new Term(F_WORD, word))>0;
protected void finalize()
if (reader!=null) { reader.close(); }
private static java.lang.String[] formGrams(java.lang.String text, int ng)
Form all ngrams for a given word.
param
text the word to parse
param
ng the ngram length e.g. 3
return
an array of all ngrams in the word and note that duplicates are not removed
int len=text.length(); String[] res=new String[len-ng+1]; for (int i=0; i<len-ng+1; i++) { res[i]=text.substring(i, i+ng); } return res;
private int getMax(int l)
if (l>5) { return 4; } if (l==5) { return 3; } return 2;
private int getMin(int l)
if (l>5) { return 3; } if (l==5) { return 2; } return 1;
public void indexDictionary(java.util.Dictionary dict)
Index a Dictionary
param
dict the dictionary to index
throws
IOException
IndexReader.unlock(spellindex); IndexWriter writer=new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex)); writer.setMergeFactor(300); writer.setMaxBufferedDocs(150); Iterator iter=dict.getWordsIterator(); while (iter.hasNext()) { String word=(String) iter.next(); int len=word.length(); if (len<3) { continue; // too short we bail but "too long" is fine... } if (this.exist(word)) { // if the word already exist in the gramindex continue; } // ok index the word Document doc=createDocument(word, getMin(len), getMax(len)); writer.addDocument(doc); } // close writer writer.optimize(); writer.close(); // close reader reader.close(); reader=null;
public void setAccuraty(float min)
Set the accuracy 0 < min < 1; default 0.5
this.min=min;
public void setSpellIndex(org.apache.lucene.store.Directory spellindex)
this.spellindex=spellindex;
public java.lang.String[] suggestSimilar(java.lang.String word, int num_sug)
Suggest similar words
param
word String the word you want a spell check done on
param
num_sug int the number of suggest words
throws
IOException
return
String[]
return this.suggestSimilar(word, num_sug, null, null, false);
public java.lang.String[] suggestSimilar(java.lang.String word, int num_sug, org.apache.lucene.index.IndexReader ir, java.lang.String field, boolean morePopular)
Suggest similar words (restricted or not to a field of a user index)
param
word String the word you want a spell check done on
param
num_sug int the number of suggest words
param
ir the indexReader of the user index (can be null see field param)
param
field String the field of the user index: if field is not null, the suggested words are restricted to the words present in this field.
param
morePopular boolean return only the suggest words that are more frequent than the searched word (only if restricted mode = (indexReader!=null and field!=null)
throws
IOException
return
String[] the sorted list of the suggest words with this 2 criteria: first criteria: the edit distance, second criteria (only if restricted mode): the popularity of the suggest words in the field of the user index
final TRStringDistance sd=new TRStringDistance(word); final int lengthWord=word.length(); final int goalFreq=(morePopular&&ir!=null)?ir.docFreq(new Term(field, word)):0; if (!morePopular&&goalFreq>0) { return new String[] { word}; // return the word if it exist in the index and i don't want a more popular word } BooleanQuery query=new BooleanQuery(); String[] grams; String key; for (int ng=getMin(lengthWord); ng<=getMax(lengthWord); ng++) { key="gram"+ng; // form key grams=formGrams(word, ng); // form word into ngrams (allow dups too) if (grams.length==0) { continue; // hmm } if (bStart>0) { // should we boost prefixes? add(query, "start"+ng, grams[0], bStart); // matches start of word } if (bEnd>0) { // should we boost suffixes add(query, "end"+ng, grams[grams.length-1], bEnd); // matches end of word } for (int i=0; i<grams.length; i++) { add(query, key, grams[i]); } } IndexSearcher searcher=new IndexSearcher(this.spellindex); Hits hits=searcher.search(query); SuggestWordQueue sugqueue=new SuggestWordQueue(num_sug); int stop=Math.min(hits.length(), 10*num_sug); // go thru more than 'maxr' matches in case the distance filter triggers SuggestWord sugword=new SuggestWord(); for (int i=0; i<stop; i++) { sugword.string=hits.doc(i).get(F_WORD); // get orig word) if (sugword.string.equals(word)) { continue; // don't suggest a word for itself, that would be silly } //edit distance/normalize with the min word length sugword.score=1.0f-((float) sd.getDistance(sugword.string)/Math.min(sugword.string.length(), lengthWord)); if (sugword.score<min) { continue; } if (ir!=null) { // use the user index sugword.freq=ir.docFreq(new Term(field, sugword.string)); // freq in the index if ((morePopular&&goalFreq>sugword.freq)||sugword.freq<1) { // don't suggest a word that is not present in the field continue; } } sugqueue.insert(sugword); if (sugqueue.size()==num_sug) { //if queue full , maintain the min score min=((SuggestWord) sugqueue.top()).score; } sugword=new SuggestWord(); } // 
convert to array string String[] list=new String[sugqueue.size()]; for (int i=sugqueue.size()-1; i>=0; i--) { list[i]=((SuggestWord) sugqueue.pop()).string; } searcher.close(); return list;