SpellChecker
public class SpellChecker extends Object
Spell Checker class (Main class)
(initially inspired by the David Spencer code).
Example Usage:
SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
// To index a field of a user index:
spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
// To index a file containing words:
spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
|
Fields Summary |
---|
public static final String | F_WORD: Field name for each word in the ngram index. | Directory | spellindex: the spell index | private float | bStart: Boost value for start and end grams | private float | bEnd | private IndexReader | reader | float | min |
Constructors Summary |
---|
public SpellChecker(Directory gramIndex)
// Record the directory holding the spell (n-gram) index by going through the setter.
setSpellIndex(gramIndex);
|
Methods Summary |
---|
private static void | add(org.apache.lucene.search.BooleanQuery q, java.lang.String k, java.lang.String v, float boost)Add a clause to a boolean query.
// Append an optional (SHOULD) clause matching value v in field k,
// weighting that clause with the supplied boost.
final TermQuery boosted = new TermQuery(new Term(k, v));
boosted.setBoost(boost);
final BooleanClause clause = new BooleanClause(boosted, BooleanClause.Occur.SHOULD);
q.add(clause);
| private static void | add(org.apache.lucene.search.BooleanQuery q, java.lang.String k, java.lang.String v)Add a clause to a boolean query.
q.add(new BooleanClause(new TermQuery(new Term(k, v)), BooleanClause.Occur.SHOULD));
| private static void | addGram(java.lang.String text, org.apache.lucene.document.Document doc, int ng1, int ng2)
// For every n-gram size in [ng1, ng2], add to doc:
//   - one "gram<size>" field per n-gram of text,
//   - a "start<size>" field holding the first n-gram,
//   - an "end<size>" field holding the last n-gram.
// All fields are stored and left untokenized.
final int length = text.length();
for (int size = ng1; size <= ng2; size++) {
    final String gramField = "gram" + size;
    final int lastStart = length - size; // negative when text is shorter than size
    String lastGram = null;
    for (int pos = 0; pos <= lastStart; pos++) {
        lastGram = text.substring(pos, pos + size);
        doc.add(new Field(gramField, lastGram, Field.Store.YES, Field.Index.UN_TOKENIZED));
        if (pos == 0) {
            // the very first gram doubles as the word prefix
            doc.add(new Field("start" + size, lastGram, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
    }
    // lastGram stays null when the text is too short to yield any gram of this size
    if (lastGram != null) {
        doc.add(new Field("end" + size, lastGram, Field.Store.YES, Field.Index.UN_TOKENIZED));
    }
}
| public void | clearIndex()
// Forcibly release any lock held on the spell index, then open a writer with
// the create flag set to true and close it straight away, without adding
// any document.
IndexReader.unlock(spellindex);
new IndexWriter(spellindex, null, true).close();
| private static org.apache.lucene.document.Document | createDocument(java.lang.String text, int ng1, int ng2)
// Build the index document for one dictionary word.
final Document document = new Document();
// Keep the original term, stored and untokenized, under F_WORD.
final Field wordField = new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED);
document.add(wordField);
// Append the gram/start/end fields for every n-gram size in [ng1, ng2].
addGram(text, document, ng1, ng2);
return document;
| public boolean | exist(java.lang.String word)Check whether the word exists in the index.
if (reader==null) {
reader=IndexReader.open(spellindex);
}
return reader.docFreq(new Term(F_WORD, word))>0;
| protected void | finalize()
// Close the cached index reader, if one was ever opened.
if (reader == null) {
    return;
}
reader.close();
| private static java.lang.String[] | formGrams(java.lang.String text, int ng)Form all ngrams for a given word.
// Form all n-grams of size ng for the given word, left to right (duplicates
// are kept). When the word is shorter than ng an empty array is returned;
// the count is clamped at zero so that ng > text.length() + 1 no longer
// fails with NegativeArraySizeException.
final int count = Math.max(0, text.length() - ng + 1);
final String[] grams = new String[count];
for (int i = 0; i < count; i++) {
    grams[i] = text.substring(i, i + ng);
}
return grams;
| private int | getMax(int l)
if (l>5) {
return 4;
}
if (l==5) {
return 3;
}
return 2;
| private int | getMin(int l)
if (l>5) {
return 3;
}
if (l==5) {
return 2;
}
return 1;
| public void | indexDictionary(java.util.Dictionary dict)Index a Dictionary
// Index every word of the dictionary into the spell index.
// Words shorter than three characters and words already present in the
// index (see exist()) are skipped; every other word becomes one document
// built by createDocument() with the gram sizes given by getMin()/getMax().
//
// The writer is always closed, even when indexing fails part-way, and the
// cached reader is closed only if exist() actually opened one: with an empty
// dictionary (or one holding only short words) the reader is still null and
// an unconditional reader.close() would throw NullPointerException.
IndexReader.unlock(spellindex);
// Create the index only when none exists yet; otherwise append to it.
IndexWriter writer = new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex));
try {
    // writer tuning values carried over unchanged
    writer.setMergeFactor(300);
    writer.setMaxBufferedDocs(150);

    Iterator iter = dict.getWordsIterator();
    while (iter.hasNext()) {
        String word = (String) iter.next();

        int len = word.length();
        if (len < 3) {
            continue; // too short we bail but "too long" is fine...
        }

        if (this.exist(word)) { // the word is already in the gram index
            continue;
        }

        // ok index the word
        Document doc = createDocument(word, getMin(len), getMax(len));
        writer.addDocument(doc);
    }
    writer.optimize();
} finally {
    try {
        writer.close();
    } finally {
        // Drop the cached reader so the next exist() call reopens the index
        // and sees the words that were just added.
        if (reader != null) {
            IndexReader stale = reader;
            reader = null;
            stale.close();
        }
    }
}
| public void | setAccuraty(float min)Set the accuracy 0 < min < 1; default 0.5
// Store the minimum score a candidate must reach to be suggested;
// suggestSimilar skips candidates whose score is below this value.
this.min = min;
| public void | setSpellIndex(org.apache.lucene.store.Directory spellindex)
// Remember the directory that the spell index is read from and written to.
this.spellindex = spellindex;
| public java.lang.String[] | suggestSimilar(java.lang.String word, int num_sug)Suggest similar words
// Shorthand for the full overload: no user index, no field restriction,
// and no "more popular" requirement.
return suggestSimilar(word, num_sug, null, null, false);
| public java.lang.String[] | suggestSimilar(java.lang.String word, int num_sug, org.apache.lucene.index.IndexReader ir, java.lang.String field, boolean morePopular)Suggest similar words (restricted or not to a field of a user index)
// Suggest up to num_sug words similar to `word`.
//   word        - the word to find alternatives for
//   num_sug     - maximum number of suggestions returned
//   ir          - optional reader on a user index (may be null)
//   field       - field of the user index the suggestions must occur in (may be null)
//   morePopular - when true, only suggest words whose frequency in `field`
//                 is at least that of `word`
// Returns the retained suggestions; the array is filled back-to-front from
// successive pop() calls on the suggestion queue.
//
// Fixes relative to the previous version:
//   * the accuracy threshold is tracked in a local variable, so a call no
//     longer overwrites the `min` field configured through setAccuraty();
//   * the "word already exists" shortcut now looks at the word's real
//     frequency; before, goalFreq was forced to 0 whenever morePopular was
//     false, so the shortcut could never trigger;
//   * the user index is consulted only when both ir and field are non-null;
//   * gram sizes longer than the word are skipped before forming grams;
//   * the searcher is closed even when the search or scoring throws.
final TRStringDistance sd = new TRStringDistance(word);
final int lengthWord = word.length();

final boolean useUserIndex = ir != null && field != null;
final int freq = useUserIndex ? ir.docFreq(new Term(field, word)) : 0;
final int goalFreq = morePopular ? freq : 0;
if (!morePopular && freq > 0) {
    // the word exists in the user index and no more popular word is wanted
    return new String[] { word };
}

// Per-call threshold: starts at the configured accuracy and rises once the queue is full.
float minScore = min;

BooleanQuery query = new BooleanQuery();
final int maxGram = getMax(lengthWord);
for (int ng = getMin(lengthWord); ng <= maxGram; ng++) {
    if (ng > lengthWord) {
        continue; // the word is too short to contain a gram of this size
    }

    String key = "gram" + ng; // form key
    String[] grams = formGrams(word, ng); // form word into ngrams (allow dups too)

    if (bStart > 0) { // should we boost prefixes?
        add(query, "start" + ng, grams[0], bStart); // matches start of word
    }
    if (bEnd > 0) { // should we boost suffixes?
        add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
    }
    for (int i = 0; i < grams.length; i++) {
        add(query, key, grams[i]);
    }
}

IndexSearcher searcher = new IndexSearcher(this.spellindex);
try {
    Hits hits = searcher.search(query);
    SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);

    // look at more hits than requested in case the distance filter rejects some
    int stop = Math.min(hits.length(), 10 * num_sug);
    SuggestWord sugword = new SuggestWord();
    for (int i = 0; i < stop; i++) {
        sugword.string = hits.doc(i).get(F_WORD); // get orig word

        if (sugword.string.equals(word)) {
            continue; // don't suggest a word for itself, that would be silly
        }

        // edit distance, normalized by the shorter of the two word lengths
        sugword.score = 1.0f - ((float) sd.getDistance(sugword.string) / Math.min(sugword.string.length(), lengthWord));
        if (sugword.score < minScore) {
            continue;
        }

        if (useUserIndex) {
            sugword.freq = ir.docFreq(new Term(field, sugword.string)); // freq in the user index
            // skip words absent from the field, or rarer than the input when popularity matters
            if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1) {
                continue;
            }
        }

        sugqueue.insert(sugword);
        if (sugqueue.size() == num_sug) {
            // queue full: raise the bar to the score at the top of the queue
            minScore = ((SuggestWord) sugqueue.top()).score;
        }
        // the instance just inserted is now owned by the queue; use a fresh one
        sugword = new SuggestWord();
    }

    // convert to array string
    String[] list = new String[sugqueue.size()];
    for (int i = sugqueue.size() - 1; i >= 0; i--) {
        list[i] = ((SuggestWord) sugqueue.pop()).string;
    }
    return list;
} finally {
    searcher.close();
}
|
|