SpellChecker
public class SpellChecker extends Object
Spell Checker class (main class), initially inspired by the David Spencer code.
Example Usage:
SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
// To index a field of a user index:
spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
// To index a file containing words:
spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
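For context, a self-contained sketch of the workflow above, assuming the Lucene 2.x contrib spellchecker API (org.apache.lucene.search.spell); the index directory path and word-list file name are placeholders:

import java.io.File;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SpellCheckerDemo {
    public static void main(String[] args) throws Exception {
        // Placeholder path; adjust to your environment.
        Directory spellIndexDirectory = FSDirectory.getDirectory("/tmp/spellindex");
        SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);

        // Build the n-gram spell index from a plain word list (one word per line).
        spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));

        // Ask for several suggestions; the first is not guaranteed to be the best match.
        String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
        for (String suggestion : suggestions) {
            System.out.println(suggestion);
        }
    }
}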
Fields Summary
public static final String F_WORD : Field name for each word in the ngram index.
Directory spellIndex : the spell index
private float bStart : Boost value for start and end grams
private float bEnd
private IndexReader reader
private IndexSearcher searcher
private float minScore
Constructors Summary
public SpellChecker(Directory spellIndex)
Use the given directory as a spell checker index. The directory is created if it doesn't exist yet.
this.setSpellIndex(spellIndex);
Methods Summary
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String name, java.lang.String value, float boost)
Add a clause to a boolean query.
Query tq = new TermQuery(new Term(name, value));
tq.setBoost(boost);
q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
private static void add(org.apache.lucene.search.BooleanQuery q, java.lang.String name, java.lang.String value)
Add a clause to a boolean query.
q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
private static void addGram(java.lang.String text, org.apache.lucene.document.Document doc, int ng1, int ng2)
Add to the document all character n-grams of the word, of sizes ng1 through ng2, plus start and end grams.
int len = text.length();
for (int ng = ng1; ng <= ng2; ng++) {
String key = "gram" + ng;
String end = null;
for (int i = 0; i < len - ng + 1; i++) {
String gram = text.substring(i, i + ng);
doc.add(new Field(key, gram, Field.Store.NO, Field.Index.UN_TOKENIZED));
if (i == 0) {
doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.UN_TOKENIZED));
}
end = gram;
}
if (end != null) { // may not be present if len==ng1
doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.UN_TOKENIZED));
}
}
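To make the gram layout concrete, here is a small standalone sketch (a hypothetical demo class, not part of SpellChecker) that prints the field names and values addGram would add for a sample 7-letter word, which getMin/getMax map to gram sizes 3 and 4:

public class GramDemo {
    // Mirrors addGram's loop, but prints field name/value pairs instead of adding fields.
    static void printGrams(String text, int ng1, int ng2) {
        int len = text.length();
        for (int ng = ng1; ng <= ng2; ng++) {
            for (int i = 0; i + ng <= len; i++) {
                String gram = text.substring(i, i + ng);
                System.out.println("gram" + ng + " = " + gram);
                if (i == 0) {
                    System.out.println("start" + ng + " = " + gram); // prefix gram
                }
                if (i + ng == len) {
                    System.out.println("end" + ng + " = " + gram);   // suffix gram
                }
            }
        }
    }

    public static void main(String[] args) {
        printGrams("lettuce", 3, 4);
    }
}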
public void clearIndex()
Removes all terms from the spell check index.
IndexReader.unlock(spellIndex);
IndexWriter writer = new IndexWriter(spellIndex, null, true);
writer.close();
private static org.apache.lucene.document.Document createDocument(java.lang.String text, int ng1, int ng2)
Create a document holding the original word and its n-grams.
Document doc = new Document();
doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
addGram(text, doc, ng1, ng2);
return doc;
public boolean exist(java.lang.String word)
Check whether the word exists in the index.
if (reader == null) {
reader = IndexReader.open(spellIndex);
}
return reader.docFreq(new Term(F_WORD, word)) > 0;
protected void finalize()
Closes the internal IndexReader.
try {
if (reader != null) {
reader.close();
}
} finally {
super.finalize();
}
private static java.lang.String[] formGrams(java.lang.String text, int ng)
Form all ngrams for a given word.
int len = text.length();
String[] res = new String[len - ng + 1];
for (int i = 0; i < len - ng + 1; i++) {
res[i] = text.substring(i, i + ng);
}
return res;
private int getMax(int l)
Return the largest n-gram size used for a word of length l.
if (l > 5) {
return 4;
}
if (l == 5) {
return 3;
}
return 2;
private int getMin(int l)
Return the smallest n-gram size used for a word of length l.
if (l > 5) {
return 3;
}
if (l == 5) {
return 2;
}
return 1;
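Taken together, getMin and getMax select the n-gram size range from the word length. A quick sketch of the mapping (the demo class is hypothetical):

public class GramSizeDemo {
    public static void main(String[] args) {
        // Mirrors getMin/getMax: length 3-4 -> grams 1..2, length 5 -> 2..3, length 6+ -> 3..4.
        for (int len = 3; len <= 7; len++) {
            int min = (len > 5) ? 3 : (len == 5) ? 2 : 1; // same rule as getMin
            int max = (len > 5) ? 4 : (len == 5) ? 3 : 2; // same rule as getMax
            System.out.println("word length " + len + ": gram sizes " + min + ".." + max);
        }
    }
}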
public void indexDictionary(org.apache.lucene.search.spell.Dictionary dict)
Index a Dictionary.
IndexReader.unlock(spellIndex);
IndexWriter writer = new IndexWriter(spellIndex, new WhitespaceAnalyzer(),
!IndexReader.indexExists(spellIndex));
writer.setMergeFactor(300);
writer.setMaxBufferedDocs(150);
Iterator iter = dict.getWordsIterator();
while (iter.hasNext()) {
String word = (String) iter.next();
int len = word.length();
if (len < 3) {
continue; // too short we bail but "too long" is fine...
}
if (this.exist(word)) { // if the word already exists in the gram index
continue;
}
// ok index the word
Document doc = createDocument(word, getMin(len), getMax(len));
writer.addDocument(doc);
}
// close writer
writer.optimize();
writer.close();
// close reader so it will be re-opened (and see the new content) when exist()
// is called the next time:
if (reader != null) {
reader.close();
reader = null;
}
// also re-open the spell index to see our own changes when the next suggestion
// is fetched:
searcher.close();
searcher = new IndexSearcher(this.spellIndex);
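A usage sketch for indexDictionary, assuming the contrib LuceneDictionary and PlainTextDictionary implementations shown in the class example; the index locations, field name, and file name are placeholders:

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IndexDictionaryDemo {
    public static void main(String[] args) throws Exception {
        Directory spellIndexDirectory = FSDirectory.getDirectory("/tmp/spellindex"); // placeholder path
        SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);

        // Feed the spell index from a field of an existing user index (placeholder index and field).
        IndexReader userReader = IndexReader.open(FSDirectory.getDirectory("/tmp/userindex"));
        spellchecker.indexDictionary(new LuceneDictionary(userReader, "contents"));

        // Feed it from a plain word list, one word per line (placeholder file).
        spellchecker.indexDictionary(new PlainTextDictionary(new File("words.txt")));

        userReader.close();
    }
}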
public void setAccuracy(float minScore)
Sets the accuracy (0 < minScore < 1); the default is 0.5.
this.minScore = minScore;
public void setSpellIndex(org.apache.lucene.store.Directory spellIndex)
Use a different index as the spell checker index, or re-open the existing index if spellIndex is the same value as given in the constructor.
this.spellIndex = spellIndex;
if (!IndexReader.indexExists(spellIndex)) {
IndexWriter writer = new IndexWriter(spellIndex, null, true);
writer.close();
}
// close the old searcher, if there was one
if (searcher != null) {
searcher.close();
}
searcher = new IndexSearcher(this.spellIndex);
public java.lang.String[] suggestSimilar(java.lang.String word, int numSug)
Suggest similar words.
As the Lucene similarity that is used to fetch the most relevant n-grammed terms
is not the same as the edit distance strategy used to calculate the best
matching spell-checked word from the hits that Lucene found, one usually has
to request several suggestions (a larger numSug) in order to get the true best match.
I.e. if numSug == 1, don't count on that suggestion being the best one.
Thus, you should set this value to at least 5 for a good suggestion.
return this.suggestSimilar(word, numSug, null, null, false);
public java.lang.String[] suggestSimilar(java.lang.String word, int numSug, org.apache.lucene.index.IndexReader ir, java.lang.String field, boolean morePopular)
Suggest similar words (optionally restricted to a field of an index).
As the Lucene similarity that is used to fetch the most relevant n-grammed terms
is not the same as the edit distance strategy used to calculate the best
matching spell-checked word from the hits that Lucene found, one usually has
to request several suggestions (a larger numSug) in order to get the true best match.
I.e. if numSug == 1, don't count on that suggestion being the best one.
Thus, you should set this value to at least 5 for a good suggestion.
float min = this.minScore;
final TRStringDistance sd = new TRStringDistance(word);
final int lengthWord = word.length();
final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0;
// if the word exists in the real index and we don't care for word frequency, return the word itself
if (!morePopular && goalFreq > 0) {
return new String[] { word };
}
BooleanQuery query = new BooleanQuery();
String[] grams;
String key;
for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
key = "gram" + ng; // form key
grams = formGrams(word, ng); // form word into ngrams (allow dups too)
if (grams.length == 0) {
continue; // hmm
}
if (bStart > 0) { // should we boost prefixes?
add(query, "start" + ng, grams[0], bStart); // matches start of word
}
if (bEnd > 0) { // should we boost suffixes
add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
}
for (int i = 0; i < grams.length; i++) {
add(query, key, grams[i]);
}
}
// System.out.println("Q: " + query);
Hits hits = searcher.search(query);
// System.out.println("HITS: " + hits.length());
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
// go through up to 10 * numSug matches in case the distance filter triggers
int stop = Math.min(hits.length(), 10 * numSug);
SuggestWord sugWord = new SuggestWord();
for (int i = 0; i < stop; i++) {
sugWord.string = hits.doc(i).get(F_WORD); // get orig word
// don't suggest a word for itself, that would be silly
if (sugWord.string.equals(word)) {
continue;
}
// edit distance, normalized by the shorter of the two word lengths
sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math
.min(sugWord.string.length(), lengthWord));
if (sugWord.score < min) {
continue;
}
if (ir != null) { // use the user index
sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
// don't suggest a word that is not present in the field
if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
continue;
}
}
sugQueue.insert(sugWord);
if (sugQueue.size() == numSug) {
// if queue full, maintain the min score
min = ((SuggestWord) sugQueue.top()).score;
}
sugWord = new SuggestWord();
}
// convert to array of strings
String[] list = new String[sugQueue.size()];
for (int i = sugQueue.size() - 1; i >= 0; i--) {
list[i] = ((SuggestWord) sugQueue.pop()).string;
}
return list;
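A usage sketch for the field-restricted overload with morePopular enabled (the index paths and field name are placeholders):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SuggestDemo {
    public static void main(String[] args) throws Exception {
        Directory spellIndexDirectory = FSDirectory.getDirectory("/tmp/spellindex"); // placeholder path
        SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);

        // Restrict suggestions to words present in the "contents" field of a user index
        // (placeholder index and field), preferring more frequent words.
        IndexReader userReader = IndexReader.open(FSDirectory.getDirectory("/tmp/userindex"));
        String[] suggestions = spellchecker.suggestSimilar("misspelt", 5, userReader, "contents", true);
        for (String suggestion : suggestions) {
            System.out.println(suggestion);
        }
        userReader.close();
    }
}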