FuzzyLikeThisQuerypublic class FuzzyLikeThisQuery extends Query Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
of fuzzy scoring factors.
This generally produces good results for queries where users may provide details in a number of
fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
a fast query.
For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
term) and this is factored into the variant's boost. If the source query term does not exist in the
index the average IDF of the variants is used. |
Fields Summary |
---|
static Similarity | sim | Query | rewrittenQuery | ArrayList | fieldVals | Analyzer | analyzer | ScoreTermQueue | q | int | MAX_VARIANTS_PER_TERM | boolean | ignoreTF |
Constructors Summary |
---|
public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
q=new ScoreTermQueue(maxNumTerms);
this.analyzer=analyzer;
|
Methods Summary |
---|
public void | addTerms(java.lang.String queryString, java.lang.String fieldName, float minSimilarity, int prefixLength)Adds user input for "fuzzification"
fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
| private void | addTerms(org.apache.lucene.index.IndexReader reader, org.apache.lucene.search.FuzzyLikeThisQuery$FieldVals f)
if(f.queryString==null) return;
TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
Token token=ts.next();
int corpusNumDocs=reader.numDocs();
Term internSavingTemplateTerm =new Term(f.fieldName,""); //optimization to avoid constructing new Term() objects
HashSet processedTerms=new HashSet();
while(token!=null)
{
if(!processedTerms.contains(token.termText()))
{
processedTerms.add(token.termText());
ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore=0;
Term startTerm=internSavingTemplateTerm.createTerm(token.termText());
FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
TermEnum origEnum = reader.terms(startTerm);
int df=0;
if(startTerm.equals(origEnum.term()))
{
df=origEnum.docFreq(); //store the df so all variants use same idf
}
int numVariants=0;
int totalVariantDocFreqs=0;
do
{
Term possibleMatch=fe.term();
if(possibleMatch!=null)
{
numVariants++;
totalVariantDocFreqs+=fe.docFreq();
float score=fe.difference();
if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);
variantsQ.insert(st);
minScore = ((ScoreTerm)variantsQ.top()).score; // maintain minScore
}
}
}
while(fe.next());
if(numVariants==0)
{
//no variants to rank here
break;
}
int avgDf=totalVariantDocFreqs/numVariants;
if(df==0)//no direct match we can use as df for all variants
{
df=avgDf; //use avg df of all variants
}
// take the top variants (scored by edit distance) and reset the score
// to include an IDF factor then add to the global queue for ranking overall top query terms
int size = variantsQ.size();
for(int i = 0; i < size; i++)
{
ScoreTerm st = (ScoreTerm) variantsQ.pop();
st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
q.insert(st);
}
}
token=ts.next();
}
| public boolean | isIgnoreTF()
return ignoreTF;
| public org.apache.lucene.search.Query | rewrite(org.apache.lucene.index.IndexReader reader)
if(rewrittenQuery!=null)
{
return rewrittenQuery;
}
//load up the list of possible terms
for (Iterator iter = fieldVals.iterator(); iter.hasNext();)
{
FieldVals f = (FieldVals) iter.next();
addTerms(reader,f);
}
//clear the list of fields
fieldVals.clear();
BooleanQuery bq=new BooleanQuery();
//create BooleanQueries to hold the variants for each token/field pair and ensure it
// has no coord factor
//Step 1: sort the termqueries by term/field
HashMap variantQueries=new HashMap();
int size = q.size();
for(int i = 0; i < size; i++)
{
ScoreTerm st = (ScoreTerm) q.pop();
ArrayList l=(ArrayList) variantQueries.get(st.fuzziedSourceTerm);
if(l==null)
{
l=new ArrayList();
variantQueries.put(st.fuzziedSourceTerm,l);
}
l.add(st);
}
//Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
for (Iterator iter = variantQueries.values().iterator(); iter.hasNext();)
{
ArrayList variants = (ArrayList) iter.next();
if(variants.size()==1)
{
//optimize where only one selected variant
ScoreTerm st=(ScoreTerm) variants.get(0);
TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
tq.setBoost(st.score); // set the boost to a mix of IDF and score
bq.add(tq, BooleanClause.Occur.SHOULD);
}
else
{
BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
for (Iterator iterator2 = variants.iterator(); iterator2
.hasNext();)
{
ScoreTerm st = (ScoreTerm) iterator2.next();
TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF); // found a match
tq.setBoost(st.score); // set the boost using the ScoreTerm's score
termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query
}
bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query
}
}
//TODO possible alternative step 3 - organize above booleans into a new layer of field-based
// booleans with a minimum-should-match of NumFields-1?
bq.setBoost(getBoost());
this.rewrittenQuery=bq;
return bq;
| public void | setIgnoreTF(boolean ignoreTF)
this.ignoreTF = ignoreTF;
| public java.lang.String | toString(java.lang.String field)
return null;
|
|