File | Doc | Category | Size | Date | Package
NGramTokenizer.java | API Doc | Apache Lucene 2.1.0 | 3305 | Wed Feb 14 10:46:28 GMT 2007 | org.apache.lucene.analysis.ngram

NGramTokenizer

public class NGramTokenizer extends Tokenizer
Tokenizes the input into n-grams of the given size(s).
author
Otis Gospodnetic

Fields Summary
public static final int
DEFAULT_MIN_NGRAM_SIZE
public static final int
DEFAULT_MAX_NGRAM_SIZE
private int
minGram
private int
maxGram
private int
gramSize
private int
pos
private int
inLen
private String
inStr
private boolean
started
Constructors Summary
public NGramTokenizer(Reader input, int minGram, int maxGram)
Creates NGramTokenizer with given min and max n-grams.

param
input Reader holding the input to be tokenized
param
minGram the smallest n-gram to generate
param
maxGram the largest n-gram to generate


                                        
           
        super(input);
        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram) {
            throw new IllegalArgumentException("minGram must not be greater than maxGram");
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
    
/**
 * Creates NGramTokenizer with default min and max n-grams.
 *
 * <p>Delegates to the three-argument constructor, passing
 * DEFAULT_MIN_NGRAM_SIZE and DEFAULT_MAX_NGRAM_SIZE as the gram bounds.
 *
 * @param input Reader holding the input to be tokenized
 */
public NGramTokenizer(Reader input) {
    this(
        input,
        NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE,
        NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
}
Methods Summary
public final org.apache.lucene.analysis.Tokennext()
Returns the next token in the stream, or null at EOS.

        if (!started) {
            started = true;
            gramSize = minGram;
            char[] chars = new char[1024];
            input.read(chars);
            inStr = new String(chars).trim();  // remove any trailing empty strings 
            inLen = inStr.length();
        }

        if (pos+gramSize > inLen) {            // if we hit the end of the string
            pos = 0;                           // reset to beginning of string
            gramSize++;                        // increase n-gram size
            if (gramSize > maxGram)            // we are done
                return null;
            if (pos+gramSize > inLen)
                return null;
        }
        String gram = inStr.substring(pos, pos+gramSize);
        int oldPos = pos;
        pos++;
        return new Token(gram, oldPos, oldPos+gramSize);