package com.oreilly.jent.xml;
/**
* In general, you may use the code in this book in your programs and
* documentation. You do not need to contact us for permission unless
* you're reproducing a significant portion of the code. For example,
* writing a program that uses several chunks of code from this book does
* not require permission. Selling or distributing a CD-ROM of examples
* from O'Reilly books does require permission. Answering a question by
* citing this book and quoting example code does not require permission.
* Incorporating a significant amount of example code from this book into
* your product's documentation does require permission.
*
* We appreciate, but do not require, attribution. An attribution usually
* includes the title, author, publisher, and ISBN. For example:
*
* "Java Enterprise in a Nutshell, Third Edition,
* by Jim Farley and William Crawford
* with Prakash Malani, John G. Norman, and Justin Gehtland.
* Copyright 2006 O'Reilly Media, Inc., 0-596-10142-2."
*
* If you feel your use of code examples falls outside fair use or the
* permission given above, feel free to contact us at
* permissions@oreilly.com.
*/
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
/**
 * Condenses an (X)HTML document down to its "important" text.
 *
 * <p>The program parses {@code enterprisexml.html} into a DOM tree, removes
 * every text node whose parent element is not considered significant (see
 * {@link #keepText(Node)}), and then serializes the pruned tree to standard
 * output using an identity XSLT transform.
 */
public class DocumentCondenser {

    /** Element names (compared case-insensitively) whose text is always kept. */
    private static final String[] KEPT_ELEMENTS = {"em", "title", "b", "li", "th"};

    /**
     * Parses {@code enterprisexml.html}, prunes unimportant text and writes
     * the result to {@code System.out}.
     *
     * <p>Parser-configuration, parsing and transformation failures are
     * reported on standard error via {@code printStackTrace()} and the
     * method then returns normally.
     *
     * @param args command-line arguments (unused)
     * @throws Exception declared for compatibility; failures in the visible
     *         code paths are caught and reported rather than propagated
     */
    public static void main(String[] args) throws Exception {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        // For HTML, we don't want to validate without a DTD
        dbf.setValidating(false);
        // Do not ask the parser to drop whitespace-only text in element
        // content; unwanted text nodes are pruned by process() instead.
        dbf.setIgnoringElementContentWhitespace(false);
        dbf.setExpandEntityReferences(true);
        // Merge CDATA sections into adjacent text so they are handled as text
        dbf.setCoalescing(true);
        // Ensure that getLocalName() returns the HTML element name
        dbf.setNamespaceAware(true);

        DocumentBuilder db;
        try {
            db = dbf.newDocumentBuilder();
        } catch (ParserConfigurationException pce) {
            pce.printStackTrace();
            return;
        }

        try {
            Document html = db.parse("enterprisexml.html");
            process(html);
            // Use the XSLT Transformer (identity transform) to see the output
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer output = tf.newTransformer();
            output.transform(new DOMSource(html), new StreamResult(System.out));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Decides whether text directly under {@code parentNode} should be kept.
     *
     * <p>Text is kept when the parent is {@code <em>}, {@code <title>},
     * {@code <b>}, {@code <li>}, {@code <th>} or a heading {@code <h1>} ..
     * {@code <h6>}, or when it is a {@code <font>} element whose
     * {@code size} attribute starts with {@code "+"} (a relative size
     * increase). Element names are matched case-insensitively.
     *
     * @param parentNode parent of the text node, or {@code null} for a
     *        top-level node
     * @return {@code true} if the text should be kept, {@code false} if it
     *         should be removed
     */
    private static boolean keepText(Node parentNode) {
        if (parentNode == null) {
            return true; // top level
        }

        // getLocalName() is null for nodes created without namespace support
        // and for non-element nodes; fall back to the node name so we never
        // dereference null.
        String parentName = parentNode.getLocalName();
        if (parentName == null) {
            parentName = parentNode.getNodeName();
        }
        if (parentName == null) {
            return false;
        }

        for (int i = 0; i < KEPT_ELEMENTS.length; i++) {
            if (parentName.equalsIgnoreCase(KEPT_ELEMENTS[i])) {
                return true;
            }
        }
        if (isHeading(parentName)) {
            return true;
        }

        if (parentNode.getNodeType() == Node.ELEMENT_NODE
                && parentName.equalsIgnoreCase("font")) {
            NamedNodeMap atts = parentNode.getAttributes();
            if (atts != null) {
                Node sizeNode = atts.getNamedItem("size"); // get an attribute Node
                if (sizeNode != null) {
                    String size = sizeNode.getNodeValue();
                    return size != null && size.startsWith("+");
                }
            }
        }
        return false;
    }

    /**
     * Tells whether {@code name} is one of the heading names {@code h1}
     * through {@code h6} (either case of the leading letter).
     */
    private static boolean isHeading(String name) {
        if (name.length() != 2) {
            return false;
        }
        char first = name.charAt(0);
        char level = name.charAt(1);
        return (first == 'h' || first == 'H') && level >= '1' && level <= '6';
    }

    /**
     * Recursively walks the children of {@code node}, removing every text
     * node that {@link #keepText(Node)} rejects and descending into every
     * non-text child.
     *
     * @param node root of the subtree to prune; modified in place
     */
    private static void process(Node node) {
        Node child = node.getFirstChild();
        while (child != null) {
            // Remember the successor first: once a child is removed its
            // sibling link can no longer be used to continue the walk.
            Node next = child.getNextSibling();
            if (child.getNodeType() == Node.TEXT_NODE) {
                if (!keepText(child.getParentNode())) {
                    node.removeChild(child);
                }
            } else {
                process(child);
            }
            child = next;
        }
    }
}
|