FileDocCategorySizeDatePackage
SynonymTokenFilter.javaAPI DocApache Lucene 1.94355Mon Feb 20 09:18:36 GMT 2006org.apache.lucene.index.memory

SynonymTokenFilter

public class SynonymTokenFilter extends TokenFilter
Injects additional tokens for synonyms of token terms fetched from the underlying child stream; the child stream must deliver lowercase tokens for synonyms to be found.
author
whoschek.AT.lbl.DOT.gov

Fields Summary
public static final String
SYNONYM_TOKEN_TYPE
The Token.type used to indicate a synonym to higher level filters.
private final SynonymMap
synonyms
private final int
maxSynonyms
private String[]
stack
private int
index
private Token
current
private int
todo
Constructors Summary
public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms)
Creates an instance for the given underlying stream and synonym table.

param
input the underlying child token stream
param
synonyms the map used to extract synonyms for terms
param
maxSynonyms the maximum number of synonym tokens to return per underlying token word (a value of Integer.MAX_VALUE indicates unlimited)

	
	                                                                                             	 
	       
		super(input);
		if (input == null)
			throw new IllegalArgumentException("input must not be null");
		if (synonyms == null)
			throw new IllegalArgumentException("synonyms must not be null");
		if (maxSynonyms < 0) 
			throw new IllegalArgumentException("maxSynonyms must not be negative");
		
		this.synonyms = synonyms;
		this.maxSynonyms = maxSynonyms;
	
Methods Summary
protected org.apache.lucene.analysis.TokencreateToken(java.lang.String synonym, org.apache.lucene.analysis.Token current)
Creates and returns a token for the given synonym of the current input token; Override for custom (stateless or stateful) behaviour, if desired.

param
synonym a synonym for the current token's term
param
current the current token from the underlying child stream
return
a new token, or null to indicate that the given synonym should be ignored

		Token token = new Token(
			synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
		token.setPositionIncrement(0);
		return token;
	
public org.apache.lucene.analysis.Tokennext()
Returns the next token in the stream, or null at EOS.

		Token token;
		while (todo > 0 && index < stack.length) { // pop from stack
			token = createToken(stack[index++], current);
			if (token != null) {
				todo--;
				return token;
			}
		}
		
		token = input.next();
		if (token == null) return null; // EOS; iterator exhausted
		
		stack = synonyms.getSynonyms(token.termText()); // push onto stack
		if (stack.length > maxSynonyms) randomize(stack);
		index = 0;
		current = token;
		todo = maxSynonyms;
		return token;
	
private static voidrandomize(java.lang.Object[] arr)
Randomize synonyms to later sample a subset. Uses constant random seed for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random number generator with medium statistical quality (multiplicative congruential method), producing integers in the range [Integer.MIN_VALUE, Integer.MAX_VALUE].

		int seed = 1234567; // constant
		int randomState = 4*seed + 1;
//		Random random = new Random(seed); // unnecessary overhead
		int len = arr.length;
		for (int i=0; i < len-1; i++) {
			randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
			int r = randomState % (len-i);
			if (r < 0) r = -r; // e.g. -9 % 2 == -1
//			int r = random.nextInt(len-i);
			
			// swap arr[i, i+r]
			Object tmp = arr[i];
			arr[i] = arr[i + r];
			arr[i + r] = tmp;
		}