File | Doc | Category | Size | Date | Package
HTMLStrategy.java | API Doc | Apache Lucene 2.1.0 | 3800 | Wed Feb 14 10:46:04 GMT 2007 | org.apache.lucene.gdata.search.analysis

HTMLStrategy

public class HTMLStrategy extends org.apache.lucene.gdata.search.analysis.ContentStrategy
This ContentStrategy applies the path to the Indexable and retrieves the plain string content from the returned node. All of the node's text content will be cleaned of any HTML tags.
author
Simon Willnauer

Fields Summary
private static final String
REMOVE_SCRIPT
private static final String
CHAR_ENCODING
Constructors Summary
protected HTMLStrategy(org.apache.lucene.gdata.search.config.IndexSchemaField fieldConfiguration)


       
        // Pass the schema field configuration on to the ContentStrategy superclass constructor.
        super(fieldConfiguration);

    
Methods Summary
public void processIndexable(Indexable indexable)

see
org.apache.lucene.gdata.search.analysis.ContentStrategy#processIndexable(org.apache.lucene.gdata.search.analysis.Indexable)

        /*
         * Resolves the configured path against the indexable, runs the text
         * content of the selected node through an HTML parser with an element
         * removing filter, and stores the filtered text in this.content.
         *
         * Throws NotIndexableException if the path can not be applied, if no
         * node or no text content can be retrieved, or if parsing fails.
         */
        String path = this.config.getPath();
        Node node = null;
        try {
            node = indexable.applyPath(path);
        } catch (XPathExpressionException e1) {
            // keep the XPath failure as the cause so the root error is not lost
            throw new NotIndexableException("Can not apply path -- " + path, e1);
        }
        if (node == null) {
            throw new NotIndexableException("Could not retrieve content for schema field: "+this.config);
        }
        /*
         * Node#getTextContent() is specified to return null for some node
         * types (e.g. document nodes). new StringReader(null) would fail with
         * a NullPointerException, so report the node as not indexable instead.
         */
        String textContent = node.getTextContent();
        if (textContent == null) {
            throw new NotIndexableException("Could not retrieve content for schema field: "+this.config);
        }
        StringReader contentReader = new StringReader(textContent);
        /*
         * remove all elements and script parts
         */
        ElementRemover remover = new ElementRemover();
        remover.removeElement(REMOVE_SCRIPT);
        // the writer filter serializes whatever passes the remover into contentWriter
        StringWriter contentWriter = new StringWriter();
        Writer writer = new Writer(contentWriter, CHAR_ENCODING);
        // order matters: the remover has to see the events before the writer does
        XMLDocumentFilter[] filters = { remover, writer };
        XMLParserConfiguration parser = new HTMLConfiguration();
        parser.setProperty("http://cyberneko.org/html/properties/filters",
                filters);
        XMLInputSource source = new XMLInputSource(null, null, null,
                contentReader, CHAR_ENCODING);
        try {
            parser.parse(source);
        } catch (XNIException e) {
            throw new NotIndexableException("Can not parse html -- ", e);
        } catch (IOException e) {
            throw new NotIndexableException("Can not parse html -- ", e);
        }
        // everything the filter chain let through is the plain text to index
        this.content = contentWriter.toString();