public class Highlighter extends Object

Class used to markup highlighted terms found in the best sections of a
text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
{@link Encoder} and tokenizers. |
Fields Summary |
---|
public static final int | DEFAULT_MAX_DOC_BYTES_TO_ANALYZE |
private int | maxDocBytesToAnalyze |
private Formatter | formatter |
private Encoder | encoder |
private Fragmenter | textFragmenter |
private Scorer | fragmentScorer |
Constructors Summary |
---|
/**
 * Constructs a highlighter that marks up hits with the default
 * {@link SimpleHTMLFormatter} and default encoder.
 *
 * @param fragmentScorer scorer used to rate sections of the text
 */
public Highlighter(Scorer fragmentScorer)
{
    this(new SimpleHTMLFormatter(), fragmentScorer);
}
/**
 * Constructs a highlighter with an explicit formatter and the default
 * {@link DefaultEncoder}.
 *
 * @param formatter      formats the highlighted terms
 * @param fragmentScorer scorer used to rate sections of the text
 */
public Highlighter(Formatter formatter, Scorer fragmentScorer)
{
    this(formatter, new DefaultEncoder(), fragmentScorer);
}
/**
 * Fully-configured constructor.
 *
 * @param formatter      formats the highlighted terms
 * @param encoder        encodes the surrounding (non-highlighted) text,
 *                       e.g. for HTML escaping
 * @param fragmentScorer scorer used to rate sections of the text
 */
public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
{
    this.formatter = formatter;
    this.encoder = encoder;
    this.fragmentScorer = fragmentScorer;
}
Methods Summary |
---|
public final java.lang.String | getBestFragment(org.apache.lucene.analysis.Analyzer analyzer, java.lang.String fieldName, java.lang.String text)Highlights chosen terms in a text, extracting the most relevant section.
This is a convenience method that calls
{@link #getBestFragment(TokenStream, String)}
TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
return getBestFragment(tokenStream, text);
| public final java.lang.String | getBestFragment(org.apache.lucene.analysis.TokenStream tokenStream, java.lang.String text)Highlights chosen terms in a text, extracting the most relevant section.
The document text is analysed in chunks to record hit statistics
across the document. After accumulating stats, the fragment with the highest score
is returned
String[] results = getBestFragments(tokenStream,text, 1);
if (results.length > 0)
{
return results[0];
}
return null;
| public final java.lang.String | getBestFragments(org.apache.lucene.analysis.TokenStream tokenStream, java.lang.String text, int maxNumFragments, java.lang.String separator)Highlights terms in the text , extracting the most relevant sections
and concatenating the chosen fragments with a separator (typically "...").
The document text is analysed in chunks to record hit statistics
across the document. After accumulating stats, the fragments with the highest scores
are returned in order as "separator" delimited strings.
String sections[] = getBestFragments(tokenStream,text, maxNumFragments);
StringBuffer result = new StringBuffer();
for (int i = 0; i < sections.length; i++)
{
if (i > 0)
{
result.append(separator);
}
result.append(sections[i]);
}
return result.toString();
| public final java.lang.String[] | getBestFragments(org.apache.lucene.analysis.Analyzer analyzer, java.lang.String text, int maxNumFragments)Highlights chosen terms in a text, extracting the most relevant sections.
This is a convenience method that calls
{@link #getBestFragments(TokenStream, String, int)}
TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
return getBestFragments(tokenStream, text, maxNumFragments);
| public final java.lang.String[] | getBestFragments(org.apache.lucene.analysis.Analyzer analyzer, java.lang.String fieldName, java.lang.String text, int maxNumFragments)Highlights chosen terms in a text, extracting the most relevant sections.
This is a convenience method that calls
{@link #getBestFragments(TokenStream, String, int)}
TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
return getBestFragments(tokenStream, text, maxNumFragments);
| public final java.lang.String[] | getBestFragments(org.apache.lucene.analysis.TokenStream tokenStream, java.lang.String text, int maxNumFragments)Highlights chosen terms in a text, extracting the most relevant sections.
The document text is analysed in chunks to record hit statistics
across the document. After accumulating stats, the fragments with the highest scores
are returned as an array of strings in order of score (contiguous fragments are merged into
one in their original order to improve readability)
maxNumFragments = Math.max(1, maxNumFragments); //sanity check
TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
//Get text
ArrayList fragTexts = new ArrayList();
for (int i = 0; i < frag.length; i++)
{
if ((frag[i] != null) && (frag[i].getScore() > 0))
{
fragTexts.add(frag[i].toString());
}
}
return (String[]) fragTexts.toArray(new String[0]);
| public final TextFragment[] | getBestTextFragments(org.apache.lucene.analysis.TokenStream tokenStream, java.lang.String text, boolean mergeContiguousFragments, int maxNumFragments)Low level api to get the most relevant (formatted) sections of the document.
This method has been made public to allow visibility of score information held in TextFragment objects.
Thanks to Jason Calabrese for help in redefining the interface.
// newText accumulates the marked-up copy of the analyzed text; each TextFragment
// records a span of newText together with the score assigned by fragmentScorer.
ArrayList docFrags = new ArrayList();
StringBuffer newText=new StringBuffer();
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
// bounded priority queue - retains only the best maxNumFragments fragments
FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
try
{
org.apache.lucene.analysis.Token token;
String tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
textFragmenter.start(text);
// TokenGroup batches tokens whose offsets overlap (isDistinct decides when a
// new group starts) so the whole group is marked up as a single unit
TokenGroup tokenGroup=new TokenGroup();
token = tokenStream.next();
// NOTE(review): the loop limit compares a token character offset against a
// field named in "bytes" - for multi-byte encodings these differ; confirm intent
while ((token!= null)&&(token.startOffset()<maxDocBytesToAnalyze))
{
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
{
//the current token is distinct from previous tokens -
// markup the cached token group info
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=Math.max(endOffset, lastEndOffset);
tokenGroup.clear();
//check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment(token))
{
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
currentFrag.textEndPos = newText.length();
currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
}
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
// if(lastEndOffset>maxDocBytesToAnalyze)
// {
// break;
// }
token = tokenStream.next();
}
// close off the last fragment and flush any tokens still cached in the group
currentFrag.setScore(fragmentScorer.getFragmentScore());
if(tokenGroup.numTokens>0)
{
//flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=Math.max(lastEndOffset,endOffset);
}
//Test what remains of the original text beyond the point where we stopped analyzing
if (
// if there is text beyond the last token considered..
(lastEndOffset < text.length())
&&
// and that text is not too large...
(text.length()<maxDocBytesToAnalyze)
)
{
//append it to the last fragment
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
}
currentFrag.textEndPos = newText.length();
//sort the most relevant sections of the text
for (Iterator i = docFrags.iterator(); i.hasNext();)
{
currentFrag = (TextFragment) i.next();
//If you are running with a version of Lucene before 11th Sept 03
// you do not have PriorityQueue.insert() - so uncomment the code below
/*
if (currentFrag.getScore() >= minScore)
{
fragQueue.put(currentFrag);
if (fragQueue.size() > maxNumFragments)
{ // if hit queue overfull
fragQueue.pop(); // remove lowest in hit queue
minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
}
}
*/
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
//fix to PriorityQueue. The correct method to use here is the new "insert" method
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
fragQueue.insert(currentFrag);
}
//return the most relevant fragments
// pop() yields the lowest-scoring fragment first, so fill the array from the
// back to end up with descending score order
TextFragment frag[] = new TextFragment[fragQueue.size()];
for (int i = frag.length - 1; i >= 0; i--)
{
frag[i] = (TextFragment) fragQueue.pop();
}
//merge any contiguous fragments to improve readability
if(mergeContiguousFragments)
{
// merging nulls out the lesser-scored of each merged pair; drop those and
// any zero-score fragments before returning
mergeContiguousFragments(frag);
ArrayList fragTexts = new ArrayList();
for (int i = 0; i < frag.length; i++)
{
if ((frag[i] != null) && (frag[i].getScore() > 0))
{
fragTexts.add(frag[i]);
}
}
frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
}
return frag;
}
finally
{
// always release the token stream; close failures are deliberately swallowed
// (best-effort cleanup - the fragments have already been computed)
if (tokenStream != null)
{
try
{
tokenStream.close();
}
catch (Exception e)
{
// ignored: nothing useful can be done if close fails
}
}
}
| public Encoder | getEncoder()
return encoder;
| public Scorer | getFragmentScorer()
return fragmentScorer;
| public int | getMaxDocBytesToAnalyze()
return maxDocBytesToAnalyze;
| public Fragmenter | getTextFragmenter()
return textFragmenter;
| private void | mergeContiguousFragments(TextFragment[] frag)Improves readability of a score-sorted list of TextFragments by merging any fragments
that were contiguous in the original text into one larger fragment with the correct order.
This will leave a "null" in the array entry for the lesser scored fragment.
boolean mergingStillBeingDone;
if (frag.length > 1)
// repeat full passes until a scan completes without a merge - one merge can
// make two previously non-adjacent fragments contiguous
do
{
mergingStillBeingDone = false; //initialise loop control flag
//for each fragment, scan other frags looking for contiguous blocks
for (int i = 0; i < frag.length; i++)
{
if (frag[i] == null)
{
continue;
}
//merge any contiguous blocks
for (int x = 0; x < frag.length; x++)
{
if (frag[x] == null)
{
continue;
}
// frag[i] may have been nulled out by a merge in an earlier x iteration
if (frag[i] == null)
{
break;
}
// frag1/frag2: the pair in text order as established by follows()
TextFragment frag1 = null;
TextFragment frag2 = null;
int frag1Num = 0;
int frag2Num = 0;
int bestScoringFragNum;
int worstScoringFragNum;
//if blocks are contiguous....
if (frag[i].follows(frag[x]))
{
frag1 = frag[x];
frag1Num = x;
frag2 = frag[i];
frag2Num = i;
}
else
if (frag[x].follows(frag[i]))
{
frag1 = frag[i];
frag1Num = i;
frag2 = frag[x];
frag2Num = x;
}
//merging required..
if (frag1 != null)
{
if (frag1.getScore() > frag2.getScore())
{
bestScoringFragNum = frag1Num;
worstScoringFragNum = frag2Num;
}
else
{
bestScoringFragNum = frag2Num;
worstScoringFragNum = frag1Num;
}
// merged content lives in frag1; store it in the better-scoring slot so
// the caller's score-sorted ordering is preserved
frag1.merge(frag2);
frag[worstScoringFragNum] = null;
mergingStillBeingDone = true;
frag[bestScoringFragNum] = frag1;
}
}
}
}
while (mergingStillBeingDone);
| public void | setEncoder(Encoder encoder)
this.encoder = encoder;
| public void | setFragmentScorer(Scorer scorer)
fragmentScorer = scorer;
| public void | setMaxDocBytesToAnalyze(int byteCount)
maxDocBytesToAnalyze = byteCount;
| public void | setTextFragmenter(Fragmenter fragmenter)
textFragmenter = fragmenter;
|
|