package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
/**
* Class used to markup highlighted terms found in the best sections of a
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
* {@link Encoder} and tokenizers.
* @author mark@searcharea.co.uk
*/
public class Highlighter
{
public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024;
private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
private Formatter formatter;
private Encoder encoder;
private Fragmenter textFragmenter=new SimpleFragmenter();
private Scorer fragmentScorer=null;
public Highlighter(Scorer fragmentScorer)
{
this(new SimpleHTMLFormatter(),fragmentScorer);
}
public Highlighter(Formatter formatter, Scorer fragmentScorer)
{
this(formatter,new DefaultEncoder(),fragmentScorer);
}
public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
{
this.formatter = formatter;
this.encoder = encoder;
this.fragmentScorer = fragmentScorer;
}
/**
* Highlights chosen terms in a text, extracting the most relevant section.
* This is a convenience method that calls
* {@link #getBestFragment(TokenStream, String)}
*
* @param analyzer the analyzer that will be used to split <code>text</code>
* into chunks
* @param text text to highlight terms in
* @param fieldName Name of field used to influence analyzer's tokenization policy
*
* @return highlighted text fragment or null if no terms found
*/
public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
throws IOException
{
TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
return getBestFragment(tokenStream, text);
}
/**
* Highlights chosen terms in a text, extracting the most relevant section.
* The document text is analysed in chunks to record hit statistics
* across the document. After accumulating stats, the fragment with the highest score
* is returned
*
* @param tokenStream a stream of tokens identified in the text parameter, including offset information.
* This is typically produced by an analyzer re-parsing a document's
* text. Some work may be done on retrieving TokenStreams more efficently
* by adding support for storing original text position data in the Lucene
* index but this support is not currently available (as of Lucene 1.4 rc2).
* @param text text to highlight terms in
*
* @return highlighted text fragment or null if no terms found
*/
public final String getBestFragment(TokenStream tokenStream, String text)
throws IOException
{
String[] results = getBestFragments(tokenStream,text, 1);
if (results.length > 0)
{
return results[0];
}
return null;
}
/**
* Highlights chosen terms in a text, extracting the most relevant sections.
* This is a convenience method that calls
* {@link #getBestFragments(TokenStream, String, int)}
*
* @param analyzer the analyzer that will be used to split <code>text</code>
* into chunks
* @param text text to highlight terms in
* @param maxNumFragments the maximum number of fragments.
*
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
*/
public final String[] getBestFragments(
Analyzer analyzer,
String text,
int maxNumFragments)
throws IOException
{
TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
return getBestFragments(tokenStream, text, maxNumFragments);
}
/**
* Highlights chosen terms in a text, extracting the most relevant sections.
* The document text is analysed in chunks to record hit statistics
* across the document. After accumulating stats, the fragments with the highest scores
* are returned as an array of strings in order of score (contiguous fragments are merged into
* one in their original order to improve readability)
*
* @param text text to highlight terms in
* @param maxNumFragments the maximum number of fragments.
*
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
*/
public final String[] getBestFragments(
TokenStream tokenStream,
String text,
int maxNumFragments)
throws IOException
{
maxNumFragments = Math.max(1, maxNumFragments); //sanity check
TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
//Get text
ArrayList fragTexts = new ArrayList();
for (int i = 0; i < frag.length; i++)
{
if ((frag[i] != null) && (frag[i].getScore() > 0))
{
fragTexts.add(frag[i].toString());
}
}
return (String[]) fragTexts.toArray(new String[0]);
}
/**
* Low level api to get the most relevant (formatted) sections of the document.
* This method has been made public to allow visibility of score information held in TextFragment objects.
* Thanks to Jason Calabrese for help in redefining the interface.
* @param tokenStream
* @param text
* @param maxNumFragments
* @param mergeContiguousFragments
* @throws IOException
*/
public final TextFragment[] getBestTextFragments(
TokenStream tokenStream,
String text,
boolean mergeContiguousFragments,
int maxNumFragments)
throws IOException
{
ArrayList docFrags = new ArrayList();
StringBuffer newText=new StringBuffer();
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
try
{
org.apache.lucene.analysis.Token token;
String tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
textFragmenter.start(text);
TokenGroup tokenGroup=new TokenGroup();
while ((token = tokenStream.next()) != null)
{
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
{
//the current token is distinct from previous tokens -
// markup the cached token group info
startOffset = tokenGroup.startOffset;
endOffset = tokenGroup.endOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=endOffset;
tokenGroup.clear();
//check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment(token))
{
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
currentFrag.textEndPos = newText.length();
currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
}
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
if(lastEndOffset>maxDocBytesToAnalyze)
{
break;
}
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
if(tokenGroup.numTokens>0)
{
//flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.startOffset;
endOffset = tokenGroup.endOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=endOffset;
}
// append text after end of last token
// if (lastEndOffset < text.length())
// newText.append(encoder.encodeText(text.substring(lastEndOffset)));
currentFrag.textEndPos = newText.length();
//sort the most relevant sections of the text
for (Iterator i = docFrags.iterator(); i.hasNext();)
{
currentFrag = (TextFragment) i.next();
//If you are running with a version of Lucene before 11th Sept 03
// you do not have PriorityQueue.insert() - so uncomment the code below
/*
if (currentFrag.getScore() >= minScore)
{
fragQueue.put(currentFrag);
if (fragQueue.size() > maxNumFragments)
{ // if hit queue overfull
fragQueue.pop(); // remove lowest in hit queue
minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
}
}
*/
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
//fix to PriorityQueue. The correct method to use here is the new "insert" method
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
fragQueue.insert(currentFrag);
}
//return the most relevant fragments
TextFragment frag[] = new TextFragment[fragQueue.size()];
for (int i = frag.length - 1; i >= 0; i--)
{
frag[i] = (TextFragment) fragQueue.pop();
}
//merge any contiguous fragments to improve readability
if(mergeContiguousFragments)
{
mergeContiguousFragments(frag);
ArrayList fragTexts = new ArrayList();
for (int i = 0; i < frag.length; i++)
{
if ((frag[i] != null) && (frag[i].getScore() > 0))
{
fragTexts.add(frag[i]);
}
}
frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
}
return frag;
}
finally
{
if (tokenStream != null)
{
try
{
tokenStream.close();
}
catch (Exception e)
{
}
}
}
}
/** Improves readability of a score-sorted list of TextFragments by merging any fragments
* that were contiguous in the original text into one larger fragment with the correct order.
* This will leave a "null" in the array entry for the lesser scored fragment.
*
* @param frag An array of document fragments in descending score
*/
private void mergeContiguousFragments(TextFragment[] frag)
{
boolean mergingStillBeingDone;
if (frag.length > 1)
do
{
mergingStillBeingDone = false; //initialise loop control flag
//for each fragment, scan other frags looking for contiguous blocks
for (int i = 0; i < frag.length; i++)
{
if (frag[i] == null)
{
continue;
}
//merge any contiguous blocks
for (int x = 0; x < frag.length; x++)
{
if (frag[x] == null)
{
continue;
}
if (frag[i] == null)
{
break;
}
TextFragment frag1 = null;
TextFragment frag2 = null;
int frag1Num = 0;
int frag2Num = 0;
int bestScoringFragNum;
int worstScoringFragNum;
//if blocks are contiguous....
if (frag[i].follows(frag[x]))
{
frag1 = frag[x];
frag1Num = x;
frag2 = frag[i];
frag2Num = i;
}
else
if (frag[x].follows(frag[i]))
{
frag1 = frag[i];
frag1Num = i;
frag2 = frag[x];
frag2Num = x;
}
//merging required..
if (frag1 != null)
{
if (frag1.getScore() > frag2.getScore())
{
bestScoringFragNum = frag1Num;
worstScoringFragNum = frag2Num;
}
else
{
bestScoringFragNum = frag2Num;
worstScoringFragNum = frag1Num;
}
frag1.merge(frag2);
frag[worstScoringFragNum] = null;
mergingStillBeingDone = true;
frag[bestScoringFragNum] = frag1;
}
}
}
}
while (mergingStillBeingDone);
}
/**
* Highlights terms in the text , extracting the most relevant sections
* and concatenating the chosen fragments with a separator (typically "...").
* The document text is analysed in chunks to record hit statistics
* across the document. After accumulating stats, the fragments with the highest scores
* are returned in order as "separator" delimited strings.
*
* @param text text to highlight terms in
* @param maxNumFragments the maximum number of fragments.
* @param separator the separator used to intersperse the document fragments (typically "...")
*
* @return highlighted text
*/
public final String getBestFragments(
TokenStream tokenStream,
String text,
int maxNumFragments,
String separator)
throws IOException
{
String sections[] = getBestFragments(tokenStream,text, maxNumFragments);
StringBuffer result = new StringBuffer();
for (int i = 0; i < sections.length; i++)
{
if (i > 0)
{
result.append(separator);
}
result.append(sections[i]);
}
return result.toString();
}
/**
* @return the maximum number of bytes to be tokenized per doc
*/
public int getMaxDocBytesToAnalyze()
{
return maxDocBytesToAnalyze;
}
/**
* @param byteCount the maximum number of bytes to be tokenized per doc
* (This can improve performance with large documents)
*/
public void setMaxDocBytesToAnalyze(int byteCount)
{
maxDocBytesToAnalyze = byteCount;
}
/**
*/
public Fragmenter getTextFragmenter()
{
return textFragmenter;
}
/**
* @param fragmenter
*/
public void setTextFragmenter(Fragmenter fragmenter)
{
textFragmenter = fragmenter;
}
/**
* @return Object used to score each text fragment
*/
public Scorer getFragmentScorer()
{
return fragmentScorer;
}
/**
* @param scorer
*/
public void setFragmentScorer(Scorer scorer)
{
fragmentScorer = scorer;
}
public Encoder getEncoder()
{
return encoder;
}
public void setEncoder(Encoder encoder)
{
this.encoder = encoder;
}
}
class FragmentQueue extends PriorityQueue
{
public FragmentQueue(int size)
{
initialize(size);
}
public final boolean lessThan(Object a, Object b)
{
TextFragment fragA = (TextFragment) a;
TextFragment fragB = (TextFragment) b;
if (fragA.getScore() == fragB.getScore())
return fragA.fragNum > fragB.fragNum;
else
return fragA.getScore() < fragB.getScore();
}
}
|