File | Doc | Category | Size | Date | Package
DocumentCondenser.java | API Doc | Example | 4735 | Thu Dec 15 21:10:42 GMT 2005 | com.oreilly.jent.xml

DocumentCondenser.java

package com.oreilly.jent.xml;

/**
 * In general, you may use the code in this book in your programs and 
 * documentation. You do not need to contact us for permission unless 
 * you're reproducing a significant portion of the code. For example, 
 * writing a program that uses several chunks of code from this book does 
 * not require permission. Selling or distributing a CD-ROM of examples 
 * from O'Reilly books does require permission. Answering a question by 
 * citing this book and quoting example code does not require permission. 
 * Incorporating a significant amount of example code from this book into 
 * your product's documentation does require permission.
 * 
 * We appreciate, but do not require, attribution. An attribution usually 
 * includes the title, author, publisher, and ISBN. For example: 
 * 
 *   "Java Enterprise in a Nutshell, Third Edition, 
 *    by Jim Farley and William Crawford 
 *    with Prakash Malani, John G. Norman, and Justin Gehtland. 
 *    Copyright 2006 O'Reilly Media, Inc., 0-596-10142-2."
 *  
 *  If you feel your use of code examples falls outside fair use or the 
 *  permission given above, feel free to contact us at 
 *  permissions@oreilly.com.
 */

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

/**
 * Command-line example that parses a well-formed (X)HTML file into a DOM
 * tree, removes every text node whose parent element is not considered
 * "important" (see {@link #keepText(Node)}), and writes the condensed
 * document to standard output.
 */
public class DocumentCondenser {

  /** Document parsed when no input is named on the command line. */
  private static final String DEFAULT_INPUT = "enterprisexml.html";

  /**
   * Parses the input document, condenses it and prints the result.
   *
   * @param args optional; {@code args[0]} is the file name or URI handed to
   *             {@code DocumentBuilder.parse(String)}. When absent,
   *             {@code enterprisexml.html} is used.
   * @throws Exception declared for compatibility; parser and transformer
   *                   failures are caught and reported on standard error
   */
  public static void main(String[] args) throws Exception {

    String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;

    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

    // For HTML, we don't want to validate without a DTD
    dbf.setValidating(false);
    // The parser is not asked to drop whitespace-only text nodes; process()
    // removes them together with any other text whose parent is not kept.
    dbf.setIgnoringElementContentWhitespace(false);
    dbf.setExpandEntityReferences(true);
    dbf.setCoalescing(true);

    // Ensure that getLocalName() returns the HTML element name
    dbf.setNamespaceAware(true);

    DocumentBuilder db;
    try {
      db = dbf.newDocumentBuilder();
    }
    catch (ParserConfigurationException pce) {
      pce.printStackTrace();
      return;
    }

    try {
      Document html = db.parse(input);
      process(html);

      // Use the XSLT Transformer to see the output
      TransformerFactory tf = TransformerFactory.newInstance();
      Transformer output = tf.newTransformer();
      output.transform(new DOMSource(html), new StreamResult(System.out));
    }
    catch (Exception ex) {
      ex.printStackTrace();
    }
  }

  /**
   * Decides whether text directly under {@code parentNode} is kept.
   *
   * Text is kept if the parent is <em>, <title>, <b>, <li>, <th> or
   * <h1>..<h6>, or a <font> element whose size attribute starts with "+".
   * Element names are compared case-insensitively.
   *
   * @param parentNode parent of the text node; {@code null} means the text
   *                   has no parent and is kept
   * @return {@code true} if the text should stay in the document
   */
  private static boolean keepText(Node parentNode) {
    if (parentNode == null) return true; // top level

    // getLocalName() is null for nodes that are not namespace-aware
    // elements; fall back to the node name instead of dereferencing null.
    String parentName = parentNode.getLocalName();
    if (parentName == null) {
      parentName = parentNode.getNodeName();
    }
    if (parentName == null) {
      return false;
    }

    if (parentName.equalsIgnoreCase("em") ||
        parentName.equalsIgnoreCase("title") ||
        parentName.equalsIgnoreCase("b") ||
        parentName.equalsIgnoreCase("li") ||
        parentName.equalsIgnoreCase("th") ||
        isHeading(parentName)) {
      return true;
    }

    if ((parentNode.getNodeType() == Node.ELEMENT_NODE) &&
        parentName.equalsIgnoreCase("font")) {
      NamedNodeMap atts = parentNode.getAttributes();
      if (atts != null) {
        Node sizeNode = atts.getNamedItem("size"); // get an attribute Node
        if (sizeNode != null) {
          String size = sizeNode.getNodeValue();
          return (size != null) && size.startsWith("+");
        }
      }
    }
    return false;
  }

  /**
   * Returns true only for the heading names h1 through h6 (either case).
   * A plain "two characters starting with h" test would also accept "hr".
   */
  private static boolean isHeading(String name) {
    if (name.length() != 2) {
      return false;
    }
    char first = name.charAt(0);
    char level = name.charAt(1);
    return (first == 'h' || first == 'H') && (level >= '1') && (level <= '6');
  }

  /**
   * Walks the subtree below {@code node} depth-first and removes every text
   * node for which {@link #keepText(Node)} rejects the parent.
   *
   * @param node root of the subtree to condense; {@code null} is ignored
   */
  private static void process(Node node) {
    if (node == null) {
      return;
    }

    Node child = node.getFirstChild();
    while (child != null) {
      // Remember the successor first: once a child has been removed its
      // sibling links can no longer be used to continue the walk.
      Node next = child.getNextSibling();

      if (child.getNodeType() == Node.TEXT_NODE) {
        if (!keepText(node)) {
          node.removeChild(child);
        }
      }
      else {
        process(child);
      }
      child = next;
    }
  }
}