File | Doc | Category | Size | Date | Package
QueryTermExtractor.java | API Doc | Apache Lucene 2.1.0 | 6408 | Wed Feb 14 10:46:22 GMT 2007 | org.apache.lucene.search.highlight

QueryTermExtractor

public final class QueryTermExtractor extends Object
Utility class used to extract the terms used in a query, plus any weights. This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of expanded terms.

Fields Summary
Constructors Summary
Methods Summary
public static final WeightedTerm[] getIdfWeightedTerms(org.apache.lucene.search.Query query, org.apache.lucene.index.IndexReader reader, java.lang.String fieldName)
Extracts all term texts of a given Query into an array of WeightedTerms

param
query Query to extract term texts from
param
reader used to compute IDF which can be used to a) score selected fragments better b) use graded highlights, e.g. changing intensity of font color
param
fieldName the field on which Inverse Document Frequency (IDF) calculations are based
return
an array of the terms used in a query, plus their weights.

	    // Collect the query's terms for the given field; prohibited (MUST_NOT)
	    // clauses are left out because 'prohibited' is passed as false.
	    WeightedTerm[] terms = getTerms(query, false, fieldName);
	    int totalNumDocs = reader.numDocs();
	    for (int i = 0; i < terms.length; i++)
	    {
	        try
	        {
	            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
	            // IDF algorithm taken from DefaultSimilarity class:
	            //   log(numDocs / (docFreq + 1)) + 1
	            // The division is done in double so that large document counts are
	            // not rounded to float precision before dividing.
	            float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
	            // Scale the weight the term already carries by its IDF.
	            terms[i].weight *= idf;
	        }
	        catch (IOException ignored)
	        {
	            // Best effort: if the document frequency cannot be read, the term
	            // simply keeps its un-scaled weight.
	        }
	    }
	    return terms;
	
public static final WeightedTerm[] getTerms(org.apache.lucene.search.Query query)
Extracts all term texts of a given Query into an array of WeightedTerms

param
query Query to extract term texts from
return
an array of the terms used in a query, plus their weights.

		return getTerms(query,false);
	
public static final WeightedTerm[] getTerms(org.apache.lucene.search.Query query, boolean prohibited, java.lang.String fieldName)
Extracts all term texts of a given Query into an array of WeightedTerms

param
query Query to extract term texts from
param
prohibited true to extract "prohibited" terms, too
param
fieldName The fieldName used to filter query terms
return
an array of the terms used in a query, plus their weights.

		// Canonicalise the field filter with String.intern(); a null filter is
		// passed through unchanged and means terms from every field are accepted.
		String internedField = (fieldName == null) ? null : fieldName.intern();

		// Gather the weighted terms into a set, then hand them back as an array.
		HashSet collected = new HashSet();
		getTerms(query, collected, prohibited, internedField);

		WeightedTerm[] result = new WeightedTerm[collected.size()];
		collected.toArray(result);
		return result;
	
public static final WeightedTerm[] getTerms(org.apache.lucene.search.Query query, boolean prohibited)
Extracts all term texts of a given Query into an array of WeightedTerms

param
query Query to extract term texts from
param
prohibited true to extract "prohibited" terms, too
return
an array of the terms used in a query, plus their weights.

	    return getTerms(query,prohibited,null);
	
private static final void getTerms(org.apache.lucene.search.Query query, java.util.HashSet terms, boolean prohibited, java.lang.String fieldName)

       	// Adds the weighted terms of 'query' to 'terms'. Boolean and filtered
       	// queries are unwrapped by their dedicated helpers; any other query is
       	// asked for its terms directly and each term gets that query's boost.
       	try
       	{
       		if (query instanceof BooleanQuery)
       		{
       			getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited, fieldName);
       		}
       		else if (query instanceof FilteredQuery)
       		{
       			getTermsFromFilteredQuery((FilteredQuery) query, terms, prohibited, fieldName);
       		}
       		else
       		{
       			HashSet nonWeightedTerms = new HashSet();
       			query.extractTerms(nonWeightedTerms);
       			for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();)
       			{
       				Term term = (Term) iter.next();
       				// A null fieldName accepts every field. Otherwise compare by
       				// value rather than identity, so the filter does not depend on
       				// both strings having been interned.
       				if ((fieldName == null) || fieldName.equals(term.field()))
       				{
       					terms.add(new WeightedTerm(query.getBoost(), term.text()));
       				}
       			}
       		}
       	}
       	catch (UnsupportedOperationException ignore)
       	{
       		// This is non-fatal for our purposes; terms gathered before the
       		// exception was thrown stay in the set.
       	}
	
private static final void getTermsFromBooleanQuery(org.apache.lucene.search.BooleanQuery query, java.util.HashSet terms, boolean prohibited, java.lang.String fieldName)
extractTerms is currently the only query-independent means of introspecting queries but it only reveals a list of terms for that query - not the boosts each individual term in that query may or may not have. "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held in each child element. Some discussion around this topic here: http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208 Unfortunately there seemed to be limited interest in requiring all Query objects to implement something common which would allow access to child queries so what follows here are query-specific implementations for accessing embedded query elements.

		// Walk every clause of the boolean query and recurse into its sub-query.
		// MUST_NOT clauses are skipped unless prohibited terms were requested.
		BooleanClause[] clauses = query.getClauses();
		for (int idx = 0; idx < clauses.length; idx++)
		{
			BooleanClause clause = clauses[idx];
			if (!prohibited && clause.getOccur() == BooleanClause.Occur.MUST_NOT)
			{
				continue;
			}
			getTerms(clause.getQuery(), terms, prohibited, fieldName);
		}
	
private static void getTermsFromFilteredQuery(org.apache.lucene.search.FilteredQuery query, java.util.HashSet terms, boolean prohibited, java.lang.String fieldName)

		getTerms(query.getQuery(),terms,prohibited,fieldName);