FileDocCategorySizeDatePackage
SynonymTokenFilter.javaAPI DocApache Lucene 1.94355Mon Feb 20 09:18:36 GMT 2006org.apache.lucene.index.memory

SynonymTokenFilter.java

package org.apache.lucene.index.memory;

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/**
 * Injects additional tokens for synonyms of token terms fetched from the
 * underlying child stream; the child stream must deliver lowercase tokens
 * for synonyms to be found.
 * 
 * @author whoschek.AT.lbl.DOT.gov
 */
public class SynonymTokenFilter extends TokenFilter {
		
	/** The Token.type used to indicate a synonym to higher level filters. */
	public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";

	private final SynonymMap synonyms;
	private final int maxSynonyms;
	
	private String[] stack = null;
	private int index = 0;
	private Token current = null;
	private int todo = 0;
	
	/**
	 * Creates an instance for the given underlying stream and synonym table.
	 * 
	 * @param input
	 *            the underlying child token stream
	 * @param synonyms
	 *            the map used to extract synonyms for terms
	 * @param maxSynonyms
	 *            the maximum number of synonym tokens to return per underlying
	 *            token word (a value of Integer.MAX_VALUE indicates unlimited)
	 */
	public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
		super(input);
		if (input == null)
			throw new IllegalArgumentException("input must not be null");
		if (synonyms == null)
			throw new IllegalArgumentException("synonyms must not be null");
		if (maxSynonyms < 0) 
			throw new IllegalArgumentException("maxSynonyms must not be negative");
		
		this.synonyms = synonyms;
		this.maxSynonyms = maxSynonyms;
	}
	
	/** Returns the next token in the stream, or null at EOS. */
	public Token next() throws IOException {
		Token token;
		while (todo > 0 && index < stack.length) { // pop from stack
			token = createToken(stack[index++], current);
			if (token != null) {
				todo--;
				return token;
			}
		}
		
		token = input.next();
		if (token == null) return null; // EOS; iterator exhausted
		
		stack = synonyms.getSynonyms(token.termText()); // push onto stack
		if (stack.length > maxSynonyms) randomize(stack);
		index = 0;
		current = token;
		todo = maxSynonyms;
		return token;
	}
	
	/**
	 * Creates and returns a token for the given synonym of the current input
	 * token; Override for custom (stateless or stateful) behaviour, if desired.
	 * 
	 * @param synonym 
	 *            a synonym for the current token's term
	 * @param current
	 *            the current token from the underlying child stream
	 * @return a new token, or null to indicate that the given synonym should be
	 *         ignored
	 */
	protected Token createToken(String synonym, Token current) {
		Token token = new Token(
			synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
		token.setPositionIncrement(0);
		return token;
	}
	
	/**
	 * Randomize synonyms to later sample a subset. Uses constant random seed
	 * for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random
	 * number generator with medium statistical quality (multiplicative
	 * congruential method), producing integers in the range [Integer.MIN_VALUE,
	 * Integer.MAX_VALUE].
	 */
	private static void randomize(Object[] arr) {
		int seed = 1234567; // constant
		int randomState = 4*seed + 1;
//		Random random = new Random(seed); // unnecessary overhead
		int len = arr.length;
		for (int i=0; i < len-1; i++) {
			randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
			int r = randomState % (len-i);
			if (r < 0) r = -r; // e.g. -9 % 2 == -1
//			int r = random.nextInt(len-i);
			
			// swap arr[i, i+r]
			Object tmp = arr[i];
			arr[i] = arr[i + r];
			arr[i + r] = tmp;
		}		
	}
	
}