File | Doc | Category | Size | Date | Package
HighlighterTest.java | API Doc | Apache Lucene 1.9 | 23876 | Mon Feb 27 11:12:22 GMT 2006 | org.apache.lucene.search.highlight

HighlighterTest

public class HighlighterTest extends TestCase implements Formatter
JUnit Test for Highlighter class.
author
mark@searcharea.co.uk

Fields Summary
private IndexReader
reader
private static final String
FIELD_NAME
private Query
query
RAMDirectory
ramDir
public Searcher
searcher
public Hits
hits
int
numHighlights
Analyzer
analyzer
String[]
texts
Constructors Summary
public HighlighterTest(String arg0)
Constructor for HighlighterTest.

param
arg0


	     	 
	  
	
		super(arg0);
	
Methods Summary
private voidaddDoc(org.apache.lucene.index.IndexWriter writer, java.lang.String text)

		// Index the text as one document holding a single stored, tokenized field.
		Document doc = new Document();
		doc.add(new Field(FIELD_NAME, text, Field.Store.YES, Field.Index.TOKENIZED));
		writer.addDocument(doc);

	
public voiddoSearching(java.lang.String queryString)

		// Runs queryString against ramDir, leaving the rewritten query in "query"
		// and the matches in "hits" for the highlighting code to use.
		searcher = new IndexSearcher(ramDir);
		QueryParser queryParser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
		query = queryParser.parse(queryString);
		// Multi-term queries (prefix, wildcard, range, fuzzy etc) only work for
		// highlighting once they have been rewritten against the reader.
		query = query.rewrite(reader);
		String description = query.toString(FIELD_NAME);
		System.out.println("Searching for: " + description);
		hits = searcher.search(query);
	
voiddoStandardHighlights()

		// Highlights every current hit using this test as the Formatter (so that
		// highlightTerm() can count highlights) and prints the best two fragments
		// of each hit joined by "...".
		final int maxFragments = 2;
		final String separator = "...";
		Highlighter fragmentHighlighter = new Highlighter(this, new QueryScorer(query));
		fragmentHighlighter.setTextFragmenter(new SimpleFragmenter(20));
		for (int hit = 0; hit < hits.length(); hit++)
		{
			String storedText = hits.doc(hit).get(FIELD_NAME);
			TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(storedText));
			String fragments =
				fragmentHighlighter.getBestFragments(tokens, storedText, maxFragments, separator);
			System.out.println("\t" + fragments);
		}
	
public java.lang.StringhighlightTerm(java.lang.String originalText, TokenGroup group)

		// Token groups whose total score is zero or negative are returned untouched
		// and are not counted.
		boolean unscored = group.getTotalScore() <= 0;
		if (!unscored)
		{
			numHighlights++; //update stats used in assertions
			return "<b>" + originalText + "</b>";
		}
		return originalText;
	
protected voidsetUp()

		// Index every entry of "texts" into a new RAMDirectory.
		ramDir = new RAMDirectory();
		IndexWriter indexWriter = new IndexWriter(ramDir, new StandardAnalyzer(), true);
		int next = 0;
		while (next < texts.length)
		{
			addDoc(indexWriter, texts[next]);
			next++;
		}
		indexWriter.optimize();
		indexWriter.close();

		// Open the reader that doSearching() rewrites queries against, and reset
		// the highlight counter updated by highlightTerm().
		reader = IndexReader.open(ramDir);
		numHighlights = 0;
	
protected voidtearDown()

		// Delegates to the superclass only; this method does not itself close the
		// reader or searcher that the fixture created.
		super.tearDown();
	
public voidtestEncoding()
Demonstrates creation of an XHTML compliant doc using new encoding facilities.

throws
Exception

        String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
        // Run the highlighter on the raw content. The scorer below gives every token
        // a score of 0 (so no token is scored for highlighting) but scores a single
        // fragment for selection, so the text is passed through the encoder.
        Highlighter highlighter = new Highlighter(this,
                new SimpleHTMLEncoder(), new Scorer()
                {
                    public void startFragment(TextFragment newFragment)
                    {
                    }
                    public float getTokenScore(Token token)
                    {
                        return 0;
                    }
                    public float getFragmentScore()
                    {
                        return 1;
                    }
                });
        highlighter.setTextFragmenter(new SimpleFragmenter(2000));
        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
                new StringReader(rawDocContent));

        String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent,1,"");
        //An ugly bit of XML creation:
        String xhtml="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"+
            		"<!DOCTYPE html\n"+
            		"PUBLIC \"//W3C//DTD XHTML 1.0 Transitional//EN\"\n"+
            		"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"+
            		"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"+
            		"<head>\n"+
            		"<title>My Test HTML Document</title>\n"+
            		"</head>\n"+
            		"<body>\n"+
            		"<h2>"+encodedSnippet+"</h2>\n"+
            		"</body>\n"+
            		"</html>";
        //now an ugly bit of XML parsing to test the snippet is encoded OK
        // NOTE(review): the DOCTYPE names an external DTD; the parser may try to
        // resolve it when parsing - confirm this is acceptable for the test environment.
  		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
  		DocumentBuilder db = dbf.newDocumentBuilder();
        // Encode explicitly as UTF-8 so the bytes always agree with the encoding
        // named in the XML declaration, whatever the platform default charset is.
  		org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes("UTF-8")));
  		Element root=doc.getDocumentElement();
  		NodeList nodes=root.getElementsByTagName("body");
  		Element body=(Element) nodes.item(0);
  		nodes=body.getElementsByTagName("h2");
        Element h2=(Element) nodes.item(0);
        // Parsing the document decodes the entities again, so the text of the <h2>
        // element is expected to equal the raw, unencoded content.
        String decodedSnippet=h2.getFirstChild().getNodeValue();
        assertEquals("XHTML Encoding should have worked:", rawDocContent,decodedSnippet);
    
public voidtestFieldSpecificHighlighting()

		// Compares a QueryScorer restricted to one field name with an unrestricted
		// one, for a query that mixes the default field with a "category" clause.
		String docMainText="fred is one of the people";
		QueryParser parser=new QueryParser(FIELD_NAME,analyzer);
		Query query=parser.parse("fred category:people");

		//highlighting respects fieldnames used in query
		QueryScorer fieldSpecificScorer=new QueryScorer(query, "contents");
		Highlighter fieldSpecificHighlighter =
			new Highlighter(new SimpleHTMLFormatter(),fieldSpecificScorer);
		fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter());
		String result=fieldSpecificHighlighter.getBestFragment(analyzer,FIELD_NAME,docMainText);
		// assertEquals takes (message, expected, actual); passing them in that order
		// keeps the "expected ... but was ..." failure report the right way round.
		assertEquals("Should match","<B>fred</B> is one of the people",result);

		//highlighting does not respect fieldnames used in query
		QueryScorer fieldInSpecificScorer=new QueryScorer(query);
		Highlighter fieldInSpecificHighlighter =
			new Highlighter(new SimpleHTMLFormatter(),fieldInSpecificScorer);
		fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter());
		result=fieldInSpecificHighlighter.getBestFragment(analyzer,FIELD_NAME,docMainText);
		assertEquals("Should match","<B>fred</B> is one of the <B>people</B>",result);


		reader.close();
		
	
public voidtestGetBestFragmentsMultiTerm()

		doSearching("John Kenn*");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	
public voidtestGetBestFragmentsPhrase()

		doSearching("\"John Kennedy\"");
		doStandardHighlights();
		//Currently highlights "John" and "Kennedy" separately
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
	
public voidtestGetBestFragmentsSimpleQuery()

		doSearching("Kennedy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	
public voidtestGetBestFragmentsWithOr()

		doSearching("JFK OR Kennedy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	
public voidtestGetBestSingleFragment()

		// Exercises three ways of asking for fragments from the same highlighter;
		// each pass over the hits is expected to produce four highlights.
		doSearching("Kennedy");
		Highlighter singleFragmentHighlighter = new Highlighter(this, new QueryScorer(query));
		singleFragmentHighlighter.setTextFragmenter(new SimpleFragmenter(40));

		// Pass 1: getBestFragment with a caller-supplied TokenStream.
		for (int hit = 0; hit < hits.length(); hit++)
		{
			String storedText = hits.doc(hit).get(FIELD_NAME);
			TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(storedText));
			String fragment = singleFragmentHighlighter.getBestFragment(tokens, storedText);
			System.out.println("\t" + fragment);
		}
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);

		// Pass 2: getBestFragment given the analyzer and field name.
		numHighlights = 0;
		for (int hit = 0; hit < hits.length(); hit++)
		{
			String storedText = hits.doc(hit).get(FIELD_NAME);
			singleFragmentHighlighter.getBestFragment(analyzer, FIELD_NAME, storedText);
		}
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);

		// Pass 3: getBestFragments given the analyzer and a limit of 10 fragments.
		numHighlights = 0;
		for (int hit = 0; hit < hits.length(); hit++)
		{
			String storedText = hits.doc(hit).get(FIELD_NAME);
			singleFragmentHighlighter.getBestFragments(analyzer, storedText, 10);
		}
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);

	
public voidtestGetBestSingleFragmentWithWeights()

		// With "hello" weighted above "kennedy" the best fragment of texts[0] is
		// expected to be the highlighted "Hello".
		WeightedTerm[] weightedTerms = new WeightedTerm[] {
			new WeightedTerm(10f, "hello"),
			new WeightedTerm(1f, "kennedy")
		};
		Highlighter highlighter = new Highlighter(new QueryScorer(weightedTerms));
		TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
		highlighter.setTextFragmenter(new SimpleFragmenter(2));

		String best = highlighter.getBestFragment(tokenStream, texts[0]).trim();
		assertTrue("Failed to find best section using weighted terms. Found: [" + best + "]",
			"<B>Hello</B>".equals(best));

		// Raise the weight of "kennedy" above "hello" and rebuild the scorer and
		// highlighter: the best fragment is now expected to be "kennedy".
		weightedTerms[1].setWeight(50f);
		tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
		highlighter = new Highlighter(new QueryScorer(weightedTerms));
		highlighter.setTextFragmenter(new SimpleFragmenter(2));

		best = highlighter.getBestFragment(tokenStream, texts[0]).trim();
		assertTrue("Failed to find best section using weighted terms. Found: " + best,
			"<B>kennedy</B>".equals(best));
	
public voidtestGetFuzzyFragments()

		doSearching("Kinnedy~");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	
public voidtestGetMidWildCardFragments()

		doSearching("K*dy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	
public voidtestGetRangeFragments()

		doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug?needs lower case
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	
public voidtestGetSimpleHighlight()

		// Highlights each hit without setting a text fragmenter on the highlighter.
		doSearching("Kennedy");
		Highlighter plainHighlighter = new Highlighter(this, new QueryScorer(query));

		for (int hit = 0; hit < hits.length(); hit++)
		{
			String storedText = hits.doc(hit).get(FIELD_NAME);
			TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(storedText));
			String fragment = plainHighlighter.getBestFragment(tokens, storedText);
			System.out.println("\t" + fragment);
		}
		String failure = "Failed to find correct number of highlights " + numHighlights + " found";
		assertTrue(failure, numHighlights == 4);
	
public voidtestGetTextFragments()

		// Checks that getBestFragments (strings) and getBestTextFragments
		// (TextFragment objects) agree on both the number and the text of the
		// fragments for every hit.
		doSearching("Kennedy");
		Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(20));

		for (int hit = 0; hit < hits.length(); hit++)
		{
			String storedText = hits.doc(hit).get(FIELD_NAME);

			TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(storedText));
			String[] asStrings = highlighter.getBestFragments(tokens, storedText, 10);

			// A fresh TokenStream is created for the second call.
			tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(storedText));
			TextFragment[] asFragments = highlighter.getBestTextFragments(tokens, storedText, true, 10);

			assertTrue("Failed to find correct number of text Fragments: " +
				asFragments.length + " vs "+ asStrings.length, asFragments.length == asStrings.length);
			for (int j = 0; j < asStrings.length; j++)
			{
				TextFragment fragment = asFragments[j];
				System.out.println(fragment);
				assertTrue("Failed to find same text Fragments: " +
					fragment + " found", fragment.toString().equals(asStrings[j]));
			}
		}
	
public voidtestGetWildCardFragments()

		doSearching("K?nnedy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	
public voidtestMaxSizeHighlight()

		// Limits analysis of texts[0] via setMaxDocBytesToAnalyze(30) and expects
		// that no highlight for "meat" is produced within that limit.
		doSearching("meat");
		Highlighter limitedHighlighter = new Highlighter(this, new QueryScorer(query));
		limitedHighlighter.setMaxDocBytesToAnalyze(30);

		String firstText = texts[0];
		TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(firstText));
		limitedHighlighter.getBestFragment(tokens, firstText);

		String failure = "Setting MaxDocBytesToAnalyze should have prevented " +
			"us from finding matches for this record: " + numHighlights +
			 " found";
		assertTrue(failure, numHighlights == 0);
	
public voidtestMultiSearcher()

		// Searches two single-document indexes through a MultiSearcher and checks
		// that a prefix query, rewritten against each index and then combined,
		// highlights the one document from each index.

		//setup index 1
		RAMDirectory ramDir1 = new RAMDirectory();
		IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true);
		Document d = new Document();
		Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.TOKENIZED);
		d.add(f);
		writer1.addDocument(d);
		writer1.optimize();
		writer1.close();
		IndexReader reader1 = IndexReader.open(ramDir1);

		//setup index 2
		RAMDirectory ramDir2 = new RAMDirectory();
		IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true);
		d = new Document();
		f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.TOKENIZED);
		d.add(f);
		writer2.addDocument(d);
		writer2.optimize();
		writer2.close();
		IndexReader reader2 = IndexReader.open(ramDir2);



		IndexSearcher searchers[]=new IndexSearcher[2];
		searchers[0] = new IndexSearcher(ramDir1);
		searchers[1] = new IndexSearcher(ramDir2);
		MultiSearcher multiSearcher=new MultiSearcher(searchers);
		try
		{
			QueryParser parser=new QueryParser(FIELD_NAME, new StandardAnalyzer());
			query = parser.parse("multi*");
			System.out.println("Searching for: " + query.toString(FIELD_NAME));
			//at this point the multisearcher calls combine(query[])
			hits = multiSearcher.search(query);

			// The highlighter needs the expanded terms, so rewrite the query against
			// each index's reader and combine the results into a single query.
			Query expandedQueries[]=new Query[2];
			expandedQueries[0]=query.rewrite(reader1);
			expandedQueries[1]=query.rewrite(reader2);
			query=query.combine(expandedQueries);


			//create an instance of the highlighter with the tags used to surround highlighted text
			Highlighter highlighter =
				new Highlighter(this,new QueryScorer(query));

			for (int i = 0; i < hits.length(); i++)
			{
				String text = hits.doc(i).get(FIELD_NAME);
				TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
				String highlightedText = highlighter.getBestFragment(tokenStream,text);
				System.out.println(highlightedText);
			}
			assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
		}
		finally
		{
			// Release the searcher and the two readers opened above, even when an
			// assertion or a search call fails part-way through.
			multiSearcher.close();
			reader1.close();
			reader2.close();
		}

	
public voidtestNoFragments()

		// Runs the highlighter over every sample text (not just the hits) with a
		// query term that none of them is expected to contain.
		doSearching("AnInvalidQueryWhichShouldYieldNoResults");
		Highlighter highlighter = new Highlighter(this, new QueryScorer(query));

		int index = 0;
		while (index < texts.length)
		{
			String sampleText = texts[index];
			TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(sampleText));
			String fragment = highlighter.getBestFragment(tokens, sampleText);
			assertNull("The highlight result should be null for text with no query terms", fragment);
			index++;
		}
	
public voidtestOverlapAnalyzer()

		// Uses a SynonymAnalyzer configured with synonyms for "football" and expects
		// the term and both synonyms to be highlighted in the sample sentence.
		HashMap synonymTable = new HashMap();
		synonymTable.put("football", "soccer,footie");
		Analyzer synonymAnalyzer = new SynonymAnalyzer(synonymTable);

		String searchKey = "football";
		String sentence = "football-soccer in the euro 2004 footie competition";
		Query synonymQuery = new QueryParser("bookid", synonymAnalyzer).parse(searchKey);

		Highlighter highlighter = new Highlighter(new QueryScorer(synonymQuery));
		TokenStream tokens = synonymAnalyzer.tokenStream(null, new StringReader(sentence));
		// Get 3 best fragments and separate with a "..."
		String actual = highlighter.getBestFragments(tokens, sentence, 3, "...");
		String expected = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
		assertTrue("overlapping analyzer should handle highlights OK", expected.equals(actual));
	
public voidtestSimpleHighlighter()

		// Uses the Highlighter constructor that takes only a scorer, so this test's
		// own highlightTerm() is not the formatter here.
		final int maxFragments = 2;
		doSearching("Kennedy");
		Highlighter defaultHighlighter = new Highlighter(new QueryScorer(query));
		defaultHighlighter.setTextFragmenter(new SimpleFragmenter(40));
		for (int hit = 0; hit < hits.length(); hit++)
		{
			String storedText = hits.doc(hit).get(FIELD_NAME);
			TokenStream tokens = analyzer.tokenStream(FIELD_NAME, new StringReader(storedText));
			String fragments =
				defaultHighlighter.getBestFragments(tokens, storedText, maxFragments, "...");
			System.out.println("\t" + fragments);
		}
		//Not sure we can assert anything here - just running to check we dont throw any exceptions
	
public voidtestUnRewrittenQuery()

		// Shows what happens when a multi-term query is handed to the highlighter
		// without first being rewritten: the search runs, but nothing is highlighted.
		final int maxFragments = 3;
		searcher = new IndexSearcher(ramDir);
		Analyzer standardAnalyzer = new StandardAnalyzer();

		QueryParser queryParser = new QueryParser(FIELD_NAME, standardAnalyzer);
		Query primitiveQuery = queryParser.parse("JF? or Kenned*");
		System.out.println("Searching with primitive query");
		// Deliberately NOT calling primitiveQuery.rewrite(reader) here.
		Hits primitiveHits = searcher.search(primitiveQuery);

		//create an instance of the highlighter with the tags used to surround highlighted text
		Highlighter highlighter = new Highlighter(this, new QueryScorer(primitiveQuery));
		highlighter.setTextFragmenter(new SimpleFragmenter(40));

		for (int hit = 0; hit < primitiveHits.length(); hit++)
		{
			String storedText = primitiveHits.doc(hit).get(FIELD_NAME);
			TokenStream tokens = standardAnalyzer.tokenStream(FIELD_NAME, new StringReader(storedText));
			String highlightedText = highlighter.getBestFragments(tokens, storedText, maxFragments, "...");
			System.out.println(highlightedText);
		}
		//We expect to have zero highlights if the query is multi-terms and is not rewritten!
		String failure = "Failed to find correct number of highlights " + numHighlights + " found";
		assertTrue(failure, numHighlights == 0);