// Resolve the configured path against the indexable to find the node whose
// text will be parsed as HTML.
String path = this.config.getPath();
Node node = null;
try {
    node = indexable.applyPath(path);
} catch (XPathExpressionException e1) {
    // Chain the original exception so the root cause is not lost.
    throw new NotIndexableException("Can not apply path -- " + path, e1);
}
if (node == null) {
    throw new NotIndexableException("Could not retrieve content for schema field: "+this.config);
}
// Node.getTextContent() is allowed to return null for some node types;
// report that as "not indexable" instead of letting StringReader throw a
// NullPointerException.
String nodeText = node.getTextContent();
if (nodeText == null) {
    throw new NotIndexableException("Could not retrieve content for schema field: "+this.config);
}
StringReader contentReader = new StringReader(nodeText);
/*
 * Remove all elements and script parts: the remover is given no accepted
 * elements and is told to drop REMOVE_SCRIPT explicitly.
 * NOTE(review): presumably REMOVE_SCRIPT names the script element, so its
 * content is dropped as well -- confirm against the constant's definition.
 */
ElementRemover remover = new ElementRemover();
remover.removeElement(REMOVE_SCRIPT);
// The writer filter serializes whatever passes the remover into contentWriter.
StringWriter contentWriter = new StringWriter();
Writer writer = new Writer(contentWriter, CHAR_ENCODING);
// Filter order matters: strip elements first, then write the remainder.
XMLDocumentFilter[] filters = { remover, writer, };
XMLParserConfiguration parser = new HTMLConfiguration();
parser.setProperty("http://cyberneko.org/html/properties/filters",
        filters);
XMLInputSource source = new XMLInputSource(null, null, null,
        contentReader, CHAR_ENCODING);
try {
    parser.parse(source);
} catch (XNIException e) {
    throw new NotIndexableException("Can not parse html -- ", e);
} catch (IOException e) {
    throw new NotIndexableException("Can not parse html -- ", e);
}
// The filtered output becomes this object's content.
this.content = contentWriter.toString();