FileDocCategorySizeDatePackage
HTMLdtd.javaAPI DocApache Xerces 3.0.119621Fri Sep 14 20:33:52 BST 2007org.apache.xml.serialize

HTMLdtd.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Aug 21, 2000:
//   Fixed bug in isElement and made HTMLdtd public.
//   Contributed by Eric SCHAEFFER" <eschaeffer@posterconseil.com>

package org.apache.xml.serialize;

import org.apache.xerces.dom.DOMMessageFormatter;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.util.Hashtable;
import java.util.Locale;

/**
 * Utility class for accessing information specific to HTML documents.
 * The HTML DTD is expressed as three utility function groups. Two methods
 * allow for checking whether an element requires an open tag on printing
 * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}).
 * <P>
 * Two other methods translate character references from name to value and
 * from value to name. A small entities resource is loaded into memory the
 * first time any of these methods is called for fast and efficient access.
 *
 * @deprecated This class was deprecated in Xerces 2.9.0. It is recommended 
 * that new applications use JAXP's Transformation API for XML (TrAX) for 
 * serializing HTML. See the Xerces documentation for more information.
 * @version $Revision: 476047 $ $Date: 2006-11-16 23:27:45 -0500 (Thu, 16 Nov 2006) $
 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
 */
public final class HTMLdtd
{

    /**
     * Public identifier for HTML 4.01 (Strict) document type.
     */
    public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";

    /**
     * System identifier for HTML 4.01 (Strict) document type.
     */
    public static final String HTMLSystemId =
        "http://www.w3.org/TR/html4/strict.dtd";

    /**
     * Public identifier for XHTML 1.0 (Strict) document type.
     */
    public static final String XHTMLPublicId =
        "-//W3C//DTD XHTML 1.0 Strict//EN";

    /**
     * System identifier for XHTML 1.0 (Strict) document type.
     */
    public static final String XHTMLSystemId =
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";

    /**
     * Table of reverse character reference mapping. Character codes are held
     * as single-character strings, mapped to their reference name.
     */
    private static Hashtable        _byChar;


    /**
     * Table of entity name to value mapping. Entities are held as strings,
     * character references as <TT>Character</TT> objects.
     */
    private static Hashtable        _byName;


    private static Hashtable        _boolAttrs;


    /**
     * Holds element definitions.
     */
    private static Hashtable        _elemDefs;


    /**
     * Locates the HTML entities file that is loaded upon initialization.
     * This file is a resource loaded with the default class loader.
     */
    private static final String     ENTITIES_RESOURCE = "HTMLEntities.res";


    /**
     * Only opening tag should be printed.
     */
    private static final int ONLY_OPENING = 0x0001;

    /**
     * Element contains element content only.
     */
    private static final int ELEM_CONTENT = 0x0002;


    /**
     * Element preserve spaces.
     */
    private static final int PRESERVE     = 0x0004;


    /**
     * Optional closing tag.
     */
    private static final int OPT_CLOSING  = 0x0008;


    /**
     * Element is empty (also means only opening tag)
     */
    private static final int EMPTY        = 0x0010 | ONLY_OPENING;


    /**
     * Allowed to appear in head.
     */
    private static final int ALLOWED_HEAD = 0x0020;


    /**
     * When opened, closes P.
     */
    private static final int CLOSE_P      = 0x0040;


    /**
     * When opened, closes DD or DT.
     */
    private static final int CLOSE_DD_DT  = 0x0080;


    /**
     * When opened, closes itself.
     */
    private static final int CLOSE_SELF   = 0x0100;


    /**
     * When opened, closes another table section.
     */
    private static final int CLOSE_TABLE  = 0x0200;


    /**
     * When opened, closes TH or TD.
     */
    private static final int CLOSE_TH_TD  = 0x04000;


    /**
     * Returns true if element is declared to be empty. HTML elements are
     * defines as empty in the DTD, not by the document syntax.
     *
     * @param tagName The element tag name (upper case)
     * @return True if element is empty
     */
    public static boolean isEmptyTag( String tagName )
    {
        return isElement( tagName, EMPTY );
    }


    /**
     * Returns true if element is declared to have element content.
     * Whitespaces appearing inside element content will be ignored,
     * other text will simply report an error.
     *
     * @param tagName The element tag name (upper case)
     * @return True if element content
     */
    public static boolean isElementContent( String tagName )
    {
        return isElement( tagName, ELEM_CONTENT );
    }


    /**
     * Returns true if element's textual contents preserves spaces.
     * This only applies to PRE and TEXTAREA, all other HTML elements
     * do not preserve space.
     *
     * @param tagName The element tag name (upper case)
     * @return True if element's text content preserves spaces
     */
    public static boolean isPreserveSpace( String tagName )
    {
        return isElement( tagName, PRESERVE );
    }


    /**
     * Returns true if element's closing tag is optional and need not
     * exist. An error will not be reported for such elements if they
     * are not closed. For example, <tt>LI</tt> is most often not closed.
     *
     * @param tagName The element tag name (upper case)
     * @return True if closing tag implied
     */
    public static boolean isOptionalClosing( String tagName )
    {
        return isElement( tagName, OPT_CLOSING );
    }


    /**
     * Returns true if element's closing tag is generally not printed.
     * For example, <tt>LI</tt> should not print the closing tag.
     *
     * @param tagName The element tag name (upper case)
     * @return True if only opening tag should be printed
     */
    public static boolean isOnlyOpening( String tagName )
    {
        return isElement( tagName, ONLY_OPENING );
    }


    /**
     * Returns true if the opening of one element (<tt>tagName</tt>) implies
     * the closing of another open element (<tt>openTag</tt>). For example,
     * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
     * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
     *
     * @param tagName The newly opened element
     * @param openTag The already opened element
     * @return True if closing tag closes opening tag
     */
    public static boolean isClosing( String tagName, String openTag )
    {
        // Several elements are defined as closing the HEAD
        if ( openTag.equalsIgnoreCase( "HEAD" ) )
            return ! isElement( tagName, ALLOWED_HEAD );
        // P closes iteself
        if ( openTag.equalsIgnoreCase( "P" ) )
            return isElement( tagName, CLOSE_P );
        // DT closes DD, DD closes DT
        if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) )
            return isElement( tagName, CLOSE_DD_DT );
        // LI and OPTION close themselves
        if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) )
            return isElement( tagName, CLOSE_SELF );
        // Each of these table sections closes all the others
        if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) ||
             openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) ||
             openTag.equalsIgnoreCase( "COLGROUP" ) )
            return isElement( tagName, CLOSE_TABLE );
        // TD closes TH and TH closes TD
        if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) )
            return isElement( tagName, CLOSE_TH_TD );
        return false;
    }


    /**
     * Returns true if the specified attribute it a URI and should be
     * escaped appropriately. In HTML URIs are escaped differently
     * than normal attributes.
     *
     * @param tagName The element's tag name
     * @param attrName The attribute's name
     */
    public static boolean isURI( String tagName, String attrName )
    {
        // Stupid checks.
        return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) );
    }


    /**
     * Returns true if the specified attribute is a boolean and should be
     * printed without the value. This applies to attributes that are true
     * if they exist, such as selected (OPTION/INPUT).
     *
     * @param tagName The element's tag name
     * @param attrName The attribute's name
     */
    public static boolean isBoolean( String tagName, String attrName )
    {
        String[] attrNames;

        attrNames = (String[]) _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) );
        if ( attrNames == null )
            return false;
        for ( int i = 0 ; i < attrNames.length ; ++i )
            if ( attrNames[ i ].equalsIgnoreCase( attrName ) )
                return true;
        return false;
    }


    /**
     * Returns the value of an HTML character reference by its name. If the
     * reference is not found or was not defined as a character reference,
     * returns EOF (-1).
     *
     * @param name Name of character reference
     * @return Character code or EOF (-1)
     */
    public static int charFromName( String name )
    {
        Object    value;

        initialize();
        value = _byName.get( name );
        if ( value != null && value instanceof Integer ) {
            return ( (Integer) value ).intValue();
        }
        return -1;
    }


    /**
     * Returns the name of an HTML character reference based on its character
     * value. Only valid for entities defined from character references. If no
     * such character value was defined, return null.
     *
     * @param value Character value of entity
     * @return Entity's name or null
     */
    public static String fromChar(int value )
    {
       if (value > 0xffff)
            return null;

        String name;

        initialize();
        name = (String) _byChar.get( new Integer( value ) );
        return name;
    }


    /**
     * Initialize upon first access. Will load all the HTML character references
     * into a list that is accessible by name or character value and is optimized
     * for character substitution. This method may be called any number of times
     * but will execute only once.
     */
    private static void initialize()
    {
        InputStream     is = null;
        BufferedReader  reader = null;
        int             index;
        String          name;
        String          value;
        int             code;
        String          line;

        // Make sure not to initialize twice.
        if ( _byName != null )
            return;
        try {
            _byName = new Hashtable();
            _byChar = new Hashtable();
            is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE );
            if ( is == null ) {
            	throw new RuntimeException( 
				    DOMMessageFormatter.formatMessage(
				    DOMMessageFormatter.SERIALIZER_DOMAIN,
                    "ResourceNotFound", new Object[] {ENTITIES_RESOURCE}));
            }    
            reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
            line = reader.readLine();
            while ( line != null ) {
                if ( line.length() == 0 || line.charAt( 0 ) == '#' ) {
                    line = reader.readLine();
                    continue;
                }
                index = line.indexOf( ' ' );
                if ( index > 1 ) {
                    name = line.substring( 0, index );
                    ++index;
                    if ( index < line.length() ) {
                        value = line.substring( index );
                        index = value.indexOf( ' ' );
                        if ( index > 0 )
                            value = value.substring( 0, index );
                        code = Integer.parseInt( value );
                                        defineEntity( name, (char) code );
                    }
                }
                line = reader.readLine();
            }
            is.close();
        }  catch ( Exception except ) {
			throw new RuntimeException( 
				DOMMessageFormatter.formatMessage(
				DOMMessageFormatter.SERIALIZER_DOMAIN,
                "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()}));        	
        } finally {
            if ( is != null ) {
                try {
                    is.close();
                } catch ( Exception except ) { }
            }
        }
    }


    /**
     * Defines a new character reference. The reference's name and value are
     * supplied. Nothing happens if the character reference is already defined.
     * <P>
     * Unlike internal entities, character references are a string to single
     * character mapping. They are used to map non-ASCII characters both on
     * parsing and printing, primarily for HTML documents. '<amp;' is an
     * example of a character reference.
     *
     * @param name The entity's name
     * @param value The entity's value
     */
    private static void defineEntity( String name, char value )
    {
        if ( _byName.get( name ) == null ) {
            _byName.put( name, new Integer( value ) );
            _byChar.put( new Integer( value ), name );
        }
    }


    private static void defineElement( String name, int flags )
    {
        _elemDefs.put( name, new Integer( flags ) );
    }


    private static void defineBoolean( String tagName, String attrName )
    {
        defineBoolean( tagName, new String[] { attrName } );
    }


    private static void defineBoolean( String tagName, String[] attrNames )
    {
        _boolAttrs.put( tagName, attrNames );
    }


    private static boolean isElement( String name, int flag )
    {
        Integer flags;

        flags = (Integer) _elemDefs.get( name.toUpperCase(Locale.ENGLISH) );
        if ( flags == null ) {
            return false;
        }
        return ( ( flags.intValue() & flag ) == flag );
    }


    static
    {
        _elemDefs = new Hashtable();
        defineElement( "ADDRESS", CLOSE_P );
        defineElement( "AREA", EMPTY );
        defineElement( "BASE",  EMPTY | ALLOWED_HEAD );
        defineElement( "BASEFONT", EMPTY );
        defineElement( "BLOCKQUOTE", CLOSE_P );
        defineElement( "BODY", OPT_CLOSING );
        defineElement( "BR", EMPTY );
        defineElement( "COL", EMPTY );
        defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
        defineElement( "DIV", CLOSE_P );
        defineElement( "DL", ELEM_CONTENT | CLOSE_P );
        defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
        defineElement( "FIELDSET", CLOSE_P );
        defineElement( "FORM", CLOSE_P );
        defineElement( "FRAME", EMPTY | OPT_CLOSING );
        defineElement( "H1", CLOSE_P );
        defineElement( "H2", CLOSE_P );
        defineElement( "H3", CLOSE_P );
        defineElement( "H4", CLOSE_P );
        defineElement( "H5", CLOSE_P );
        defineElement( "H6", CLOSE_P );
        defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING );
        defineElement( "HR", EMPTY | CLOSE_P );
        defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING );
        defineElement( "IMG", EMPTY );
        defineElement( "INPUT", EMPTY );
        defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD );
        defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
        defineElement( "LINK", EMPTY | ALLOWED_HEAD );
        defineElement( "MAP", ALLOWED_HEAD );
        defineElement( "META", EMPTY | ALLOWED_HEAD );
        defineElement( "OL", ELEM_CONTENT | CLOSE_P );
        defineElement( "OPTGROUP", ELEM_CONTENT );
        defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
        defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF );
        defineElement( "PARAM", EMPTY );
        defineElement( "PRE", PRESERVE | CLOSE_P );
        defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE );
        defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE );
        defineElement( "SELECT", ELEM_CONTENT );
        defineElement( "STYLE", ALLOWED_HEAD | PRESERVE );
        defineElement( "TABLE", ELEM_CONTENT | CLOSE_P );
        defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD );
        defineElement( "TEXTAREA", PRESERVE );
        defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD );
        defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "TITLE", ALLOWED_HEAD );
        defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "UL", ELEM_CONTENT | CLOSE_P );

        _boolAttrs = new Hashtable();
        defineBoolean( "AREA", "href" );
        defineBoolean( "BUTTON", "disabled" );
        defineBoolean( "DIR", "compact" );
        defineBoolean( "DL", "compact" );
        defineBoolean( "FRAME", "noresize" );
        defineBoolean( "HR", "noshade" );
        defineBoolean( "IMAGE", "ismap" );
        defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } );
        defineBoolean( "LINK", "link" );
        defineBoolean( "MENU", "compact" );
        defineBoolean( "OBJECT", "declare" );
        defineBoolean( "OL", "compact" );
        defineBoolean( "OPTGROUP", "disabled" );
        defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } );
        defineBoolean( "SCRIPT", "defer" );
        defineBoolean( "SELECT", new String[] { "multiple", "disabled" } );
        defineBoolean( "STYLE", "disabled" );
        defineBoolean( "TD", "nowrap" );
        defineBoolean( "TH", "nowrap" );
        defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } );
        defineBoolean( "UL", "compact" );

        initialize();
    }



}