/*
 * Created on 28-Oct-2004
 */
package org.apache.lucene.search.highlight;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;

/**
 * Hides implementation issues associated with obtaining a TokenStream for use with
 * the highlighter - can obtain from TermFreqVectors with offsets and (optionally) positions or
 * from an Analyzer re-parsing the stored content.
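 * <p>
 * A typical usage sketch (the "contents" field name and the Highlighter/QueryScorer wiring
 * are illustrative assumptions, not requirements of this class):
 * <pre>
 *   TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId, "contents", analyzer);
 *   Highlighter highlighter = new Highlighter(new QueryScorer(query));
 *   String[] fragments = highlighter.getBestFragments(tokenStream,
 *       reader.document(docId).get("contents"), 3);
 * </pre>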
 * @author maharwood
 */
public class TokenSources
{
    /**
     * A convenience method that tries a number of approaches to getting a token stream.
     * The cost of discovering that there are no term vectors in the index is minimal (1000 invocations
     * still register 0 ms), so this "lazy" (flexible?) approach to coding is probably acceptable.
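     * <p>
     * Roughly equivalent to the following sketch of the fallback behaviour, using this method's
     * own parameters:
     * <pre>
     *   TermFreqVector tfv = reader.getTermFreqVector(docId, field);
     *   TokenStream ts = (tfv instanceof TermPositionVector)
     *       ? TokenSources.getTokenStream((TermPositionVector) tfv)
     *       : null;
     *   if (ts == null) ts = TokenSources.getTokenStream(reader, docId, field, analyzer);
     * </pre>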
     * @param reader the index holding the document
     * @param docId the id of the document to build the token stream for
     * @param field the field to build the token stream for
     * @param analyzer used to re-parse the stored content if no suitable term vector is found
     * @return null if field not stored correctly 
     * @throws IOException
     */
    public static TokenStream getAnyTokenStream(IndexReader reader,int docId, String field,Analyzer analyzer) throws IOException
    {
		TokenStream ts=null;

		TermFreqVector tfv=(TermFreqVector) reader.getTermFreqVector(docId,field);
		if(tfv!=null)
		{
		    if(tfv instanceof TermPositionVector)
		    {
		        ts=getTokenStream((TermPositionVector) tfv);
		    }
		}
		//No token info stored so fall back to analyzing raw content
		if(ts==null)
		{
		    ts=getTokenStream(reader,docId,field,analyzer);
		}
		return ts;
    }
    
    
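    /**
     * Convenience form of {@link #getTokenStream(TermPositionVector, boolean)} that assumes
     * nothing about the token position sequence (tokenPositionsGuaranteedContiguous=false).
     */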
    public static TokenStream getTokenStream(TermPositionVector tpv)
    {
        //assumes the worst case and makes no assumptions about the token position sequence (overlaps or gaps may be present)
         return getTokenStream(tpv,false);   
    }
    /**
     * Low level api.
     * Returns a token stream, or null if no offset info is available in the index.
     * This can be used to feed the highlighter with a pre-parsed token stream.
     * 
     * In my tests the speeds to recreate 1000 token streams using this method are:
     * - with TermVector offset only data stored - 420 milliseconds 
     * - with TermVector offset AND position data stored - 271 milliseconds
     *  (nb timings for TermVector with position data are based on a tokenizer with contiguous
     *  positions - no overlaps or gaps)
     * The cost of not using TermPositionVector to store
     * pre-parsed content and using an analyzer to re-parse the original content: 
     * - reanalyzing the original content - 980 milliseconds
     * 
     * The re-analyze timings will typically vary depending on -
     * 	1) The complexity of the analyzer code (timings above were using a 
     * 	   stemmer/lowercaser/stopword combo)
     *  2) The number of other fields (Lucene reads ALL fields off the disk 
     *     when accessing just one document field - this can cost dear!)
     *  3) Use of compression on field storage - could be faster because of compression (less disk IO)
     *     or slower (more CPU burn) depending on the content.
     *
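     * To use this method the field must have been indexed with term vector offsets, e.g. (a
     * sketch, assuming a "contents" field):
     * <pre>
     *   // index time
     *   doc.add(new Field("contents", text, Field.Store.YES, Field.Index.TOKENIZED,
     *       Field.TermVector.WITH_POSITIONS_OFFSETS));
     *
     *   // search time
     *   TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(docId, "contents");
     *   TokenStream tokenStream = TokenSources.getTokenStream(tpv, false);
     * </pre>
     *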
     * @param tpv a term position vector for the field, stored with offset information
     * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking
     * to eke out the last drops of performance, set to true. If in doubt, set to false.
     */
    public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous)
    {
        //an object used to iterate across an array of tokens
        class StoredTokenStream extends TokenStream
        {
            Token tokens[];
            int currentToken=0;
            StoredTokenStream(Token tokens[])
            {
                this.tokens=tokens;
            }
            public Token next()
            {
                if(currentToken>=tokens.length)
                {
                    return null;
                }
                return tokens[currentToken++];
            }            
        }        
        //code to reconstruct the original sequence of Tokens
        String[] terms=tpv.getTerms();          
        int[] freq=tpv.getTermFrequencies();
        int totalTokens=0;
        for (int t = 0; t < freq.length; t++)
        {
            totalTokens+=freq[t];
        }
        Token tokensInOriginalOrder[]=new Token[totalTokens];
        ArrayList unsortedTokens = null;
        for (int t = 0; t < freq.length; t++)
        {
            TermVectorOffsetInfo[] offsets=tpv.getOffsets(t);
            if(offsets==null)
            {
                return null;
            }
            
            int[] pos=null;
            if(tokenPositionsGuaranteedContiguous)
            {
                //try to get the token position info to speed up assembly of tokens into sorted sequence
                pos=tpv.getTermPositions(t);
            }
            if(pos==null)
            {	
                //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                if(unsortedTokens==null)
                {
                    unsortedTokens=new ArrayList();
                }
                for (int tp = 0; tp < offsets.length; tp++)
                {
                    unsortedTokens.add(new Token(terms[t],
                        offsets[tp].getStartOffset(),
                        offsets[tp].getEndOffset()));
                }
            }
            else
            {
                //We have positions stored and a guarantee that the token position information is contiguous
                
                // This may be fast BUT won't work if the Tokenizer used creates >1 token in the same position or
                // creates jumps in the position numbers - this code would fail under those circumstances
                
                //tokens stored with positions - can use this to index straight into sorted array
                for (int tp = 0; tp < pos.length; tp++)
                {
                    tokensInOriginalOrder[pos[tp]]=new Token(terms[t],
                            offsets[tp].getStartOffset(),
                            offsets[tp].getEndOffset());
                }                
            }
        }
        //If the field has been stored without position data we must perform a sort        
        if(unsortedTokens!=null)
        {
            tokensInOriginalOrder=(Token[]) unsortedTokens.toArray(new Token[unsortedTokens.size()]);
            Arrays.sort(tokensInOriginalOrder, new Comparator(){
                public int compare(Object o1, Object o2)
                {
                    Token t1=(Token) o1;
                    Token t2=(Token) o2;
                    if(t1.startOffset()>t2.startOffset())
                        return 1;
                    if(t1.startOffset()<t2.startOffset())
                        return -1;
                    return 0;
                }});
        }
        return new StoredTokenStream(tokensInOriginalOrder);
    }

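    /**
     * Returns a token stream rebuilt from the term vector of the given field. The field must
     * have been indexed with term vector positions, and offsets are also needed for a non-null
     * result (e.g. Field.TermVector.WITH_POSITIONS_OFFSETS).
     * @throws IllegalArgumentException if no term position data is stored for the field
     */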
    public static TokenStream getTokenStream(IndexReader reader,int docId, String field) throws IOException
    {
		TermFreqVector tfv=(TermFreqVector) reader.getTermFreqVector(docId,field);
		if(tfv==null)
		{
		    throw new IllegalArgumentException(field+" in doc #"+docId
		            	+" does not have any term vector data stored");
		}
	    if(tfv instanceof TermPositionVector)
	    {
	        return getTokenStream((TermPositionVector) tfv);
	    }
	    throw new IllegalArgumentException(field+" in doc #"+docId
            	+" does not have any term position data stored");
    }

    /**
     * Convenience method that re-analyzes the stored content of the field with the supplied
     * Analyzer. The field must have been stored at index time. This is typically slower than
     * rebuilding the stream from a term vector (see the timings above) but requires no term
     * vector data.
     */
    public static TokenStream getTokenStream(IndexReader reader,int docId, String field,Analyzer analyzer) throws IOException
    {
		Document doc=reader.document(docId);
		String contents=doc.get(field);
		if(contents==null)
		{
		    throw new IllegalArgumentException("Field "+field +" in document #"+docId+ " is not stored and cannot be analyzed");
		}
        return analyzer.tokenStream(field,new StringReader(contents));
    }
    
    

}