HighlighterTest: public class HighlighterTest extends TestCase implements Formatter. JUnit Test for Highlighter class. |
Fields Summary |
---|
private IndexReader | reader | private static final String | FIELD_NAME | private Query | query | RAMDirectory | ramDir | public Searcher | searcher | public Hits | hits | int | numHighlights | Analyzer | analyzer | String[] | texts |
Constructors Summary |
---|
public HighlighterTest(String arg0) Constructor for HighlighterTest.
// Pass arg0 straight through to the TestCase superclass constructor.
super(arg0);
|
Methods Summary |
---|
private void | addDoc(org.apache.lucene.index.IndexWriter writer, java.lang.String text)
// Index the given text as one document with a single stored, tokenized
// FIELD_NAME field.
Document doc = new Document();
doc.add(new Field(FIELD_NAME, text, Field.Store.YES, Field.Index.TOKENIZED));
writer.addDocument(doc);
| public void | doSearching(java.lang.String queryString)
// Opens a searcher over ramDir, parses queryString against FIELD_NAME and
// stores the rewritten query and the resulting hits in the fixture fields.
searcher = new IndexSearcher(ramDir);
query = new QueryParser(FIELD_NAME, new StandardAnalyzer()).parse(queryString);
// Multi-term queries (prefix, wildcard, range, fuzzy etc.) must be rewritten
// before they are handed to the highlighter.
query = query.rewrite(reader);
final String queryDescription = query.toString(FIELD_NAME);
System.out.println("Searching for: " + queryDescription);
hits = searcher.search(query);
| void | doStandardHighlights()
// Runs the highlighter (with this test as the Formatter) over the stored text of
// every current hit and prints up to two best fragments joined by "...".
final int maxNumFragmentsRequired = 2;
final String fragmentSeparator = "...";
Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(20));
int hitIndex = 0;
while (hitIndex < hits.length())
{
    String storedText = hits.doc(hitIndex).get(FIELD_NAME);
    TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(storedText));
    String fragments = highlighter.getBestFragments(
        tokens, storedText, maxNumFragmentsRequired, fragmentSeparator);
    System.out.println("\t" + fragments);
    hitIndex++;
}
| public java.lang.String | highlightTerm(java.lang.String originalText, TokenGroup group)
if(group.getTotalScore()<=0)
{
return originalText;
}
numHighlights++; //update stats used in assertions
return "<b>" + originalText + "</b>";
| protected void | setUp()
// Builds a fresh RAM index with one document per entry of texts, opens the
// shared reader on it and resets the highlight counter.
ramDir = new RAMDirectory();
IndexWriter indexWriter = new IndexWriter(ramDir, new StandardAnalyzer(), true);
int next = 0;
while (next < texts.length)
{
    addDoc(indexWriter, texts[next]);
    next++;
}
indexWriter.optimize();
indexWriter.close();
reader = IndexReader.open(ramDir);
numHighlights = 0;
| protected void | tearDown()
// Only delegates to the superclass tearDown().
// NOTE(review): the reader opened in setUp() and the searcher opened in
// doSearching() are not closed here - confirm whether that is intentional.
super.tearDown();
| public void | testEncoding() Demonstrates creation of an XHTML compliant doc using new encoding facilities.
// Checks that a snippet produced with SimpleHTMLEncoder can be embedded in an
// XHTML page: the page is parsed with a DOM parser and the decoded text of the
// <h2> element must equal the raw input.
String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
// Run the highlighter on the raw content. The scorer gives every token a score
// of 0 (so highlightTerm leaves the text untouched) but gives the fragment a
// score of 1 so that it is selected.
Highlighter highlighter = new Highlighter(this,
    new SimpleHTMLEncoder(), new Scorer()
    {
        public void startFragment(TextFragment newFragment)
        {
        }
        public float getTokenScore(Token token)
        {
            return 0;
        }
        public float getFragmentScore()
        {
            return 1;
        }
    });
highlighter.setTextFragmenter(new SimpleFragmenter(2000));
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
    new StringReader(rawDocContent));
String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent, 1, "");
// An ugly bit of XML creation:
String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<!DOCTYPE html\n" +
    "PUBLIC \"//W3C//DTD XHTML 1.0 Transitional//EN\"\n" +
    "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
    "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n" +
    "<head>\n" +
    "<title>My Test HTML Document</title>\n" +
    "</head>\n" +
    "<body>\n" +
    "<h2>" + encodedSnippet + "</h2>\n" +
    "</body>\n" +
    "</html>";
// Now an ugly bit of XML parsing to test the snippet is encoded OK.
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
// The DOCTYPE points at an external DTD on www.w3.org. Resolve every external
// entity to an empty document so the test never depends on network access; the
// snippet only relies on the predefined XML entities, which need no DTD.
db.setEntityResolver(new org.xml.sax.EntityResolver()
{
    public org.xml.sax.InputSource resolveEntity(String publicId, String systemId)
    {
        return new org.xml.sax.InputSource(new StringReader(""));
    }
});
// Encode the bytes with the charset named in the XML declaration instead of
// the platform default charset.
org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes("UTF-8")));
Element root = doc.getDocumentElement();
NodeList nodes = root.getElementsByTagName("body");
Element body = (Element) nodes.item(0);
nodes = body.getElementsByTagName("h2");
Element h2 = (Element) nodes.item(0);
String decodedSnippet = h2.getFirstChild().getNodeValue();
assertEquals("XHTML Encoding should have worked:", rawDocContent, decodedSnippet);
| public void | testFieldSpecificHighlighting()
// Compares a QueryScorer restricted to one field with an unrestricted one, using
// a query whose terms target two different fields (the default field and
// "category").
String docMainText = "fred is one of the people";
QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
// Named so that it does not shadow the fixture's query field.
Query twoFieldQuery = parser.parse("fred category:people");
// Highlighting respects fieldnames used in query: the scorer is limited to
// FIELD_NAME, the parser's default field, so only "fred" is marked.
QueryScorer fieldSpecificScorer = new QueryScorer(twoFieldQuery, FIELD_NAME);
Highlighter fieldSpecificHighlighter =
    new Highlighter(new SimpleHTMLFormatter(), fieldSpecificScorer);
fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter());
String result = fieldSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText);
// JUnit's assertEquals takes (message, expected, actual).
assertEquals("Should match", "<B>fred</B> is one of the people", result);
// Highlighting does not respect fieldnames used in query: both terms are marked.
QueryScorer fieldInSpecificScorer = new QueryScorer(twoFieldQuery);
Highlighter fieldInSpecificHighlighter =
    new Highlighter(new SimpleHTMLFormatter(), fieldInSpecificScorer);
fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter());
result = fieldInSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText);
assertEquals("Should match", "<B>fred</B> is one of the <B>people</B>", result);
reader.close();
| public void | testGetBestFragmentsMultiTerm()
// Term plus prefix query, highlighted through the standard helper.
doSearching("John Kenn*");
doStandardHighlights();
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 5, numHighlights);
| public void | testGetBestFragmentsPhrase()
// Phrase query, highlighted through the standard helper.
doSearching("\"John Kennedy\"");
doStandardHighlights();
// Currently highlights "John" and "Kennedy" separately.
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 2, numHighlights);
| public void | testGetBestFragmentsSimpleQuery()
// Single-term query, highlighted through the standard helper.
doSearching("Kennedy");
doStandardHighlights();
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 4, numHighlights);
| public void | testGetBestFragmentsWithOr()
// Boolean OR query, highlighted through the standard helper.
doSearching("JFK OR Kennedy");
doStandardHighlights();
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 5, numHighlights);
| public void | testGetBestSingleFragment()
// Runs three different best-fragment entry points over every "Kennedy" hit;
// each pass must trigger the same number of highlightTerm callbacks.
doSearching("Kennedy");
Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(40));

// Pass 1: getBestFragment with a caller-supplied TokenStream.
for (int i = 0; i < hits.length(); i++)
{
    String text = hits.doc(i).get(FIELD_NAME);
    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
    String result = highlighter.getBestFragment(tokenStream, text);
    System.out.println("\t" + result);
}
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 4, numHighlights);

// Pass 2: getBestFragment given the analyzer and field name.
numHighlights = 0;
for (int i = 0; i < hits.length(); i++)
{
    String text = hits.doc(i).get(FIELD_NAME);
    highlighter.getBestFragment(analyzer, FIELD_NAME, text);
}
assertEquals("Failed to find correct number of highlights", 4, numHighlights);

// Pass 3: getBestFragments given the analyzer and a maximum of 10 fragments.
numHighlights = 0;
for (int i = 0; i < hits.length(); i++)
{
    String text = hits.doc(i).get(FIELD_NAME);
    highlighter.getBestFragments(analyzer, text, 10);
}
assertEquals("Failed to find correct number of highlights", 4, numHighlights);
| public void | testGetBestSingleFragmentWithWeights()
// Scores texts[0] with two explicitly weighted terms and checks that the best
// fragment follows whichever term carries the larger weight.
WeightedTerm[] wTerms = new WeightedTerm[2];
wTerms[0] = new WeightedTerm(10f, "hello");
wTerms[1] = new WeightedTerm(1f, "kennedy");
Highlighter highlighter = new Highlighter(new QueryScorer(wTerms));
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
highlighter.setTextFragmenter(new SimpleFragmenter(2));
String result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
// assertEquals prints the expected and the actual fragment on failure.
assertEquals("Failed to find best section using weighted terms",
    "<B>Hello</B>", result);
// Readjust weights so that "kennedy" now outweighs "hello".
wTerms[1].setWeight(50f);
tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
highlighter = new Highlighter(new QueryScorer(wTerms));
highlighter.setTextFragmenter(new SimpleFragmenter(2));
result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
assertEquals("Failed to find best section using weighted terms",
    "<B>kennedy</B>", result);
| public void | testGetFuzzyFragments()
// Fuzzy query, highlighted through the standard helper.
doSearching("Kinnedy~");
doStandardHighlights();
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 5, numHighlights);
| public void | testGetMidWildCardFragments()
// Wildcard query with the '*' in the middle of the term.
doSearching("K*dy");
doStandardHighlights();
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 5, numHighlights);
| public void | testGetRangeFragments()
// Range query on FIELD_NAME, highlighted through the standard helper.
doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug?needs lower case
doStandardHighlights();
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 5, numHighlights);
| public void | testGetSimpleHighlight()
// Highlights every "Kennedy" hit without configuring a text fragmenter and
// checks the number of highlightTerm callbacks.
doSearching("Kennedy");
Highlighter highlighter =
    new Highlighter(this, new QueryScorer(query));
for (int i = 0; i < hits.length(); i++)
{
    String text = hits.doc(i).get(FIELD_NAME);
    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
    String result = highlighter.getBestFragment(tokenStream, text);
    System.out.println("\t" + result);
}
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 4, numHighlights);
| public void | testGetTextFragments()
// For every "Kennedy" hit, getBestFragments (strings) and getBestTextFragments
// (TextFragment objects) must return the same number of fragments with the same
// text.
doSearching("Kennedy");
Highlighter highlighter =
    new Highlighter(this, new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(20));
for (int i = 0; i < hits.length(); i++)
{
    String text = hits.doc(i).get(FIELD_NAME);
    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
    String stringResults[] = highlighter.getBestFragments(tokenStream, text, 10);
    // A fresh token stream is created for the second pass over the same text.
    tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
    TextFragment fragmentResults[] = highlighter.getBestTextFragments(tokenStream, text, true, 10);
    // assertEquals reports both values on failure.
    assertEquals("Failed to find correct number of text Fragments",
        stringResults.length, fragmentResults.length);
    for (int j = 0; j < stringResults.length; j++)
    {
        System.out.println(fragmentResults[j]);
        assertEquals("Failed to find same text Fragments",
            stringResults[j], fragmentResults[j].toString());
    }
}
| public void | testGetWildCardFragments()
// Single-character wildcard query, highlighted through the standard helper.
doSearching("K?nnedy");
doStandardHighlights();
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 4, numHighlights);
| public void | testMaxSizeHighlight()
// With setMaxDocBytesToAnalyze(30) the highlighter is expected to stop before
// reaching any match for "meat" in texts[0], so no term may be highlighted.
doSearching("meat");
Highlighter highlighter =
    new Highlighter(this, new QueryScorer(query));
highlighter.setMaxDocBytesToAnalyze(30);
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
highlighter.getBestFragment(tokenStream, texts[0]);
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Setting MaxDocBytesToAnalyze should have prevented " +
    "us from finding matches for this record",
    0, numHighlights);
| public void | testMultiSearcher()
// Searches two single-document indexes through a MultiSearcher and highlights
// the hits with a query rewritten against each index and then combined.
//setup index 1
RAMDirectory ramDir1 = new RAMDirectory();
IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true);
Document d = new Document();
Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.TOKENIZED);
d.add(f);
writer1.addDocument(d);
writer1.optimize();
writer1.close();
IndexReader reader1 = IndexReader.open(ramDir1);
//setup index 2
RAMDirectory ramDir2 = new RAMDirectory();
IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true);
d = new Document();
f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.TOKENIZED);
d.add(f);
writer2.addDocument(d);
writer2.optimize();
writer2.close();
IndexReader reader2 = IndexReader.open(ramDir2);
IndexSearcher searchers[] = new IndexSearcher[2];
searchers[0] = new IndexSearcher(ramDir1);
searchers[1] = new IndexSearcher(ramDir2);
MultiSearcher multiSearcher = new MultiSearcher(searchers);
try
{
    QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
    query = parser.parse("multi*");
    System.out.println("Searching for: " + query.toString(FIELD_NAME));
    //at this point the multisearcher calls combine(query[])
    hits = multiSearcher.search(query);
    // Rewrite the prefix query against each index, then merge the expansions
    // into the single query used for highlighting.
    Query expandedQueries[] = new Query[2];
    expandedQueries[0] = query.rewrite(reader1);
    expandedQueries[1] = query.rewrite(reader2);
    query = query.combine(expandedQueries);
    //create an instance of the highlighter with the tags used to surround highlighted text
    Highlighter highlighter =
        new Highlighter(this, new QueryScorer(query));
    for (int i = 0; i < hits.length(); i++)
    {
        String text = hits.doc(i).get(FIELD_NAME);
        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
        String highlightedText = highlighter.getBestFragment(tokenStream, text);
        System.out.println(highlightedText);
    }
    // assertEquals reports both the expected and the actual count on failure.
    assertEquals("Failed to find correct number of highlights", 2, numHighlights);
}
finally
{
    // Release the searchers and readers opened above, even if an assertion fails.
    multiSearcher.close();
    reader1.close();
    reader2.close();
}
| public void | testNoFragments()
// None of the sample texts contains the query term, so getBestFragment is
// expected to return null for each of them.
doSearching("AnInvalidQueryWhichShouldYieldNoResults");
Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
int textIndex = 0;
while (textIndex < texts.length)
{
    final String candidate = texts[textIndex];
    TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(candidate));
    String highlighted = highlighter.getBestFragment(tokens, candidate);
    assertNull("The highlight result should be null for text with no query terms", highlighted);
    textIndex++;
}
| public void | testOverlapAnalyzer()
// Highlights text analyzed by a SynonymAnalyzer that maps "football" to
// "soccer" and "footie"; a query for "football" must mark all three words.
HashMap synonyms = new HashMap();
synonyms.put("football", "soccer,footie");
Analyzer analyzer = new SynonymAnalyzer(synonyms);
String srchkey = "football";
String s = "football-soccer in the euro 2004 footie competition";
QueryParser parser = new QueryParser("bookid", analyzer);
Query query = parser.parse(srchkey);
Highlighter highlighter = new Highlighter(new QueryScorer(query));
TokenStream tokenStream =
    analyzer.tokenStream(null, new StringReader(s));
// Get 3 best fragments and separate with a "..."
String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
// assertEquals prints the expected and the actual text on failure.
assertEquals("overlapping analyzer should handle highlights OK", expectedResult, result);
| public void | testSimpleHighlighter()
// Smoke test for the Highlighter constructor that takes only a scorer: prints
// up to two fragments per hit and makes no assertions.
doSearching("Kennedy");
final int maxNumFragmentsRequired = 2;
Highlighter highlighter = new Highlighter(new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(40));
int hitIndex = 0;
while (hitIndex < hits.length())
{
    String storedText = hits.doc(hitIndex).get(FIELD_NAME);
    TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(storedText));
    String fragments =
        highlighter.getBestFragments(tokens, storedText, maxNumFragmentsRequired, "...");
    System.out.println("\t" + fragments);
    hitIndex++;
}
// Nothing to assert here - the test only checks that no exception is thrown.
| public void | testUnRewrittenQuery()
// Shows what happens when a multi-term query is handed to the highlighter
// WITHOUT being rewritten first: the search still runs, but nothing is
// highlighted.
searcher = new IndexSearcher(ramDir);
Analyzer analyzer = new StandardAnalyzer();
QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
// NOTE(review): the lowercase "or" is presumably parsed as an ordinary term
// rather than the OR operator - confirm this is intended.
Query query = parser.parse("JF? or Kenned*");
System.out.println("Searching with primitive query");
// Deliberately skip query.rewrite(reader) here.
Hits hits = searcher.search(query);
//create an instance of the highlighter with the tags used to surround highlighted text
Highlighter highlighter =
    new Highlighter(this, new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(40));
int maxNumFragmentsRequired = 3;
for (int i = 0; i < hits.length(); i++)
{
    String text = hits.doc(i).get(FIELD_NAME);
    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
    String highlightedText = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
    System.out.println(highlightedText);
}
//We expect to have zero highlights if the query is multi-terms and is not rewritten!
// assertEquals reports both the expected and the actual count on failure.
assertEquals("Failed to find correct number of highlights", 0, numHighlights);
|
|