FileDocCategorySizeDatePackage
Highlighter.javaAPI DocApache Lucene 2.0.015995Fri May 26 09:53:50 BST 2006org.apache.lucene.search.highlight

Highlighter

public class Highlighter extends Object
Class used to markup highlighted terms found in the best sections of a text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, {@link Encoder} and tokenizers.
author
mark@searcharea.co.uk

Fields Summary
public static final int
DEFAULT_MAX_DOC_BYTES_TO_ANALYZE
private int
maxDocBytesToAnalyze
private Formatter
formatter
private Encoder
encoder
private Fragmenter
textFragmenter
private Scorer
fragmentScorer
Constructors Summary
public Highlighter(Scorer fragmentScorer)


	  
	
		this(new SimpleHTMLFormatter(),fragmentScorer);
	
public Highlighter(Formatter formatter, Scorer fragmentScorer)

		this(formatter,new DefaultEncoder(),fragmentScorer);
	
public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)

 		this.formatter = formatter;
		this.encoder = encoder;
 		this.fragmentScorer = fragmentScorer;
 	
Methods Summary
public final java.lang.StringgetBestFragment(org.apache.lucene.analysis.Analyzer analyzer, java.lang.String fieldName, java.lang.String text)
Highlights chosen terms in a text, extracting the most relevant section. This is a convenience method that calls {@link #getBestFragment(TokenStream, String)}

param
analyzer the analyzer that will be used to split text into chunks
param
text text to highlight terms in
param
fieldName Name of field used to influence analyzer's tokenization policy
return
highlighted text fragment or null if no terms found

		TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
		return getBestFragment(tokenStream, text);
	
public final java.lang.StringgetBestFragment(org.apache.lucene.analysis.TokenStream tokenStream, java.lang.String text)
Highlights chosen terms in a text, extracting the most relevant section. The document text is analysed in chunks to record hit statistics across the document. After accumulating stats, the fragment with the highest score is returned

param
tokenStream a stream of tokens identified in the text parameter, including offset information. This is typically produced by an analyzer re-parsing a document's text. Some work may be done on retrieving TokenStreams more efficently by adding support for storing original text position data in the Lucene index but this support is not currently available (as of Lucene 1.4 rc2).
param
text text to highlight terms in
return
highlighted text fragment or null if no terms found

		String[] results = getBestFragments(tokenStream,text, 1);
		if (results.length > 0)
		{
			return results[0];
		}
		return null;
	
public final java.lang.StringgetBestFragments(org.apache.lucene.analysis.TokenStream tokenStream, java.lang.String text, int maxNumFragments, java.lang.String separator)
Highlights terms in the text , extracting the most relevant sections and concatenating the chosen fragments with a separator (typically "..."). The document text is analysed in chunks to record hit statistics across the document. After accumulating stats, the fragments with the highest scores are returned in order as "separator" delimited strings.

param
text text to highlight terms in
param
maxNumFragments the maximum number of fragments.
param
separator the separator used to intersperse the document fragments (typically "...")
return
highlighted text

		String sections[] =	getBestFragments(tokenStream,text, maxNumFragments);
		StringBuffer result = new StringBuffer();
		for (int i = 0; i < sections.length; i++)
		{
			if (i > 0)
			{
				result.append(separator);
			}
			result.append(sections[i]);
		}
		return result.toString();
	
public final java.lang.String[]getBestFragments(org.apache.lucene.analysis.Analyzer analyzer, java.lang.String text, int maxNumFragments)
Highlights chosen terms in a text, extracting the most relevant sections. This is a convenience method that calls {@link #getBestFragments(TokenStream, String, int)}

param
analyzer the analyzer that will be used to split text into chunks
param
text text to highlight terms in
param
maxNumFragments the maximum number of fragments.
deprecated
This method incorrectly hardcodes the choice of fieldname. Use the method of the same name that takes a fieldname.
return
highlighted text fragments (between 0 and maxNumFragments number of fragments)

		TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
		return getBestFragments(tokenStream, text, maxNumFragments);
	
public final java.lang.String[]getBestFragments(org.apache.lucene.analysis.Analyzer analyzer, java.lang.String fieldName, java.lang.String text, int maxNumFragments)
Highlights chosen terms in a text, extracting the most relevant sections. This is a convenience method that calls {@link #getBestFragments(TokenStream, String, int)}

param
analyzer the analyzer that will be used to split text into chunks
param
fieldName the name of the field being highlighted (used by analyzer)
param
text text to highlight terms in
param
maxNumFragments the maximum number of fragments.
return
highlighted text fragments (between 0 and maxNumFragments number of fragments)

		TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
		return getBestFragments(tokenStream, text, maxNumFragments);
	
public final java.lang.String[]getBestFragments(org.apache.lucene.analysis.TokenStream tokenStream, java.lang.String text, int maxNumFragments)
Highlights chosen terms in a text, extracting the most relevant sections. The document text is analysed in chunks to record hit statistics across the document. After accumulating stats, the fragments with the highest scores are returned as an array of strings in order of score (contiguous fragments are merged into one in their original order to improve readability)

param
text text to highlight terms in
param
maxNumFragments the maximum number of fragments.
return
highlighted text fragments (between 0 and maxNumFragments number of fragments)

		maxNumFragments = Math.max(1, maxNumFragments); //sanity check
	
		TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);

		//Get text
		ArrayList fragTexts = new ArrayList();
		for (int i = 0; i < frag.length; i++)
		{
			if ((frag[i] != null) && (frag[i].getScore() > 0))
			{
				fragTexts.add(frag[i].toString());
			}
		}
		return (String[]) fragTexts.toArray(new String[0]);
	
public final TextFragment[]getBestTextFragments(org.apache.lucene.analysis.TokenStream tokenStream, java.lang.String text, boolean mergeContiguousFragments, int maxNumFragments)
Low level api to get the most relevant (formatted) sections of the document. This method has been made public to allow visibility of score information held in TextFragment objects. Thanks to Jason Calabrese for help in redefining the interface.

param
tokenStream
param
text
param
maxNumFragments
param
mergeContiguousFragments
throws
IOException

		ArrayList docFrags = new ArrayList();
		StringBuffer newText=new StringBuffer();

		TextFragment currentFrag =	new TextFragment(newText,newText.length(), docFrags.size());
		fragmentScorer.startFragment(currentFrag);
		docFrags.add(currentFrag);
	
		FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

		try
		{
			org.apache.lucene.analysis.Token token;
			String tokenText;
			int startOffset;
			int endOffset;
			int lastEndOffset = 0;
			textFragmenter.start(text);
		
			TokenGroup tokenGroup=new TokenGroup();

			while ((token = tokenStream.next()) != null)
			{
				if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
				{
					//the current token is distinct from previous tokens - 
					// markup the cached token group info
					startOffset = tokenGroup.startOffset;
					endOffset = tokenGroup.endOffset;		
					tokenText = text.substring(startOffset, endOffset);
					String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
					//store any whitespace etc from between this and last group
					if (startOffset > lastEndOffset)
						newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
					newText.append(markedUpText);
					lastEndOffset=endOffset;
					tokenGroup.clear();

					//check if current token marks the start of a new fragment						
					if(textFragmenter.isNewFragment(token))
					{
						currentFrag.setScore(fragmentScorer.getFragmentScore());
						//record stats for a new fragment
						currentFrag.textEndPos = newText.length();
						currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
						fragmentScorer.startFragment(currentFrag);
						docFrags.add(currentFrag);
					}
				}
						
				tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
				
				if(lastEndOffset>maxDocBytesToAnalyze)
				{
					break;
				}
			}
			currentFrag.setScore(fragmentScorer.getFragmentScore());
	
			if(tokenGroup.numTokens>0)
			{
				//flush the accumulated text (same code as in above loop)
				startOffset = tokenGroup.startOffset;
				endOffset = tokenGroup.endOffset;		
				tokenText = text.substring(startOffset, endOffset);
				String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
				//store any whitespace etc from between this and last group
				if (startOffset > lastEndOffset)
					newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
				newText.append(markedUpText);
				lastEndOffset=endOffset;						
			}

			// append text after end of last token
//			if (lastEndOffset < text.length())
//				newText.append(encoder.encodeText(text.substring(lastEndOffset)));

			currentFrag.textEndPos = newText.length();

			//sort the most relevant sections of the text
			for (Iterator i = docFrags.iterator(); i.hasNext();)
			{
				currentFrag = (TextFragment) i.next();

				//If you are running with a version of Lucene before 11th Sept 03
				// you do not have PriorityQueue.insert() - so uncomment the code below					
				/*
									if (currentFrag.getScore() >= minScore)
									{
										fragQueue.put(currentFrag);
										if (fragQueue.size() > maxNumFragments)
										{ // if hit queue overfull
											fragQueue.pop(); // remove lowest in hit queue
											minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
										}
										
					
									}
				*/
				//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
				//fix to PriorityQueue. The correct method to use here is the new "insert" method
				// USE ABOVE CODE IF THIS DOES NOT COMPILE!
				fragQueue.insert(currentFrag);
			}

			//return the most relevant fragments
			TextFragment frag[] = new TextFragment[fragQueue.size()];
			for (int i = frag.length - 1; i >= 0; i--)
			{
				frag[i] = (TextFragment) fragQueue.pop();
			}
			
			//merge any contiguous fragments to improve readability
			if(mergeContiguousFragments)
			{
				mergeContiguousFragments(frag);
				ArrayList fragTexts = new ArrayList();
				for (int i = 0; i < frag.length; i++)
				{
					if ((frag[i] != null) && (frag[i].getScore() > 0))
					{
						fragTexts.add(frag[i]);
					}
				}
				frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);				
			}
			
			return frag;

		}
		finally
		{
			if (tokenStream != null)
			{
				try
				{
					tokenStream.close();
				}
				catch (Exception e)
				{
				}
			}
		}
	
public EncodergetEncoder()

        return encoder;
    
public ScorergetFragmentScorer()

return
Object used to score each text fragment

		return fragmentScorer;
	
public intgetMaxDocBytesToAnalyze()

return
the maximum number of bytes to be tokenized per doc

		return maxDocBytesToAnalyze;
	
public FragmentergetTextFragmenter()

		return textFragmenter;
	
private voidmergeContiguousFragments(TextFragment[] frag)
Improves readability of a score-sorted list of TextFragments by merging any fragments that were contiguous in the original text into one larger fragment with the correct order. This will leave a "null" in the array entry for the lesser scored fragment.

param
frag An array of document fragments in descending score

		boolean mergingStillBeingDone;
		if (frag.length > 1)
			do
			{
				mergingStillBeingDone = false; //initialise loop control flag
				//for each fragment, scan other frags looking for contiguous blocks
				for (int i = 0; i < frag.length; i++)
				{
					if (frag[i] == null)
					{
						continue;
					}
					//merge any contiguous blocks 
					for (int x = 0; x < frag.length; x++)
					{
						if (frag[x] == null)
						{
							continue;
						}
						if (frag[i] == null)
						{
							break;
						}
						TextFragment frag1 = null;
						TextFragment frag2 = null;
						int frag1Num = 0;
						int frag2Num = 0;
						int bestScoringFragNum;
						int worstScoringFragNum;
						//if blocks are contiguous....
						if (frag[i].follows(frag[x]))
						{
							frag1 = frag[x];
							frag1Num = x;
							frag2 = frag[i];
							frag2Num = i;
						}
						else
							if (frag[x].follows(frag[i]))
							{
								frag1 = frag[i];
								frag1Num = i;
								frag2 = frag[x];
								frag2Num = x;
							}
						//merging required..
						if (frag1 != null)
						{
							if (frag1.getScore() > frag2.getScore())
							{
								bestScoringFragNum = frag1Num;
								worstScoringFragNum = frag2Num;
							}
							else
							{
								bestScoringFragNum = frag2Num;
								worstScoringFragNum = frag1Num;
							}
							frag1.merge(frag2);
							frag[worstScoringFragNum] = null;
							mergingStillBeingDone = true;
							frag[bestScoringFragNum] = frag1;
						}
					}
				}
			}
			while (mergingStillBeingDone);
	
public voidsetEncoder(Encoder encoder)

        this.encoder = encoder;
    
public voidsetFragmentScorer(Scorer scorer)

param
scorer

		fragmentScorer = scorer;
	
public voidsetMaxDocBytesToAnalyze(int byteCount)

param
byteCount the maximum number of bytes to be tokenized per doc (This can improve performance with large documents)

		maxDocBytesToAnalyze = byteCount;
	
public voidsetTextFragmenter(Fragmenter fragmenter)

param
fragmenter

		textFragmenter = fragmenter;