FileDocCategorySizeDatePackage
FuzzyTermEnum.javaAPI DocApache Lucene 1.4.36330Fri Oct 01 10:18:08 BST 2004org.apache.lucene.search

FuzzyTermEnum

public final class FuzzyTermEnum extends FilteredTermEnum
Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term.

Term enumerations are always ordered by Term.compareTo(). Each term in the enumeration is greater than all that precede it.

Fields Summary
double
distance
boolean
endEnum
Term
searchTerm
String
field
String
text
int
textlen
String
prefix
int
prefixLength
float
minimumSimilarity
double
scale_factor
private int[]
e
This static array saves us from the time required to create a new array everytime editDistance is called.
Constructors Summary
public FuzzyTermEnum(IndexReader reader, Term term)
Empty prefix and minSimilarity of 0.5f are used.

param
reader
param
term
throws
IOException
see
#FuzzyTermEnum(IndexReader, Term, float, int)

    
    
                             
           
      this(reader, term, FuzzyQuery.defaultMinSimilarity, 0);
    
public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity)
This is the standard FuzzyTermEnum with an empty prefix.

param
reader
param
term
param
minSimilarity
throws
IOException
see
#FuzzyTermEnum(IndexReader, Term, float, int)

      this(reader, term, minSimilarity, 0);
    
public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength)
Constructor for enumeration of all terms from specified reader which share a prefix of length prefixLength with term and which have a fuzzy similarity > minSimilarity.

param
reader Delivers terms.
param
term Pattern term.
param
minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
param
prefixLength Length of required common prefix. Default value is 0.
throws
IOException

        super();
        minimumSimilarity = minSimilarity;
        scale_factor = 1.0f / (1.0f - minimumSimilarity);
        searchTerm = term;
        field = searchTerm.field();
        text = searchTerm.text();
        textlen = text.length();
        if(prefixLength > 0 && prefixLength < textlen){
            this.prefixLength = prefixLength;
            prefix = text.substring(0, prefixLength);
            text = text.substring(prefixLength);
            textlen = text.length();
        }
        setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
    
Methods Summary
public voidclose()

      super.close();
      searchTerm = null;
      field = null;
      text = null;
  
protected final floatdifference()

        return (float)((distance - minimumSimilarity) * scale_factor);
    
private final inteditDistance(java.lang.String s, java.lang.String t, int n, int m)
Levenshtein distance also known as edit distance is a measure of similiarity between two strings where the distance is measured as the number of character deletions, insertions or substitutions required to transform one string to the other string.

This method takes in four parameters; two strings and their respective lengths to compute the Levenshtein distance between the two strings. The result is returned as an integer.

    
                                                                                                            
               
        if (e.length <= n || e[0].length <= m) {
            e = new int[Math.max(e.length, n+1)][Math.max(e[0].length, m+1)];
        }
        int d[][] = e; // matrix
        int i; // iterates through s
        int j; // iterates through t
        char s_i; // ith character of s
        
        if (n == 0) return m;
        if (m == 0) return n;
        
        // init matrix d
        for (i = 0; i <= n; i++) d[i][0] = i;
        for (j = 0; j <= m; j++) d[0][j] = j;
        
        // start computing edit distance
        for (i = 1; i <= n; i++) {
            s_i = s.charAt(i - 1);
            for (j = 1; j <= m; j++) {
                if (s_i != t.charAt(j-1))
                    d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
                else d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
            }
        }
        
        // we got the result!
        return d[n][m];
    
public final booleanendEnum()

        return endEnum;
    
private static final intmin(int a, int b, int c)
Finds and returns the smallest of three integers

        int t = (a < b) ? a : b;
        return (t < c) ? t : c;
    
protected final booleantermCompare(org.apache.lucene.index.Term term)
The termCompare method in FuzzyTermEnum uses Levenshtein distance to calculate the distance between the given term and the comparing term.

        String termText = term.text();
        if (field == term.field() && termText.startsWith(prefix)) {
            String target = termText.substring(prefixLength);
            int targetlen = target.length();
            int dist = editDistance(text, target, textlen, targetlen);
            distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
            return (distance > minimumSimilarity);
        }
        endEnum = true;
        return false;