FileDocCategorySizeDatePackage
XIncludeTextReader.javaAPI DocJava SE 5 API21450Fri Aug 26 14:55:56 BST 2005com.sun.org.apache.xerces.internal.xinclude

XIncludeTextReader

public class XIncludeTextReader extends Object
This class is used for reading resources requested in <include> elements, when the parse attribute of the <include> element is "text". Using this class will open the location, detect the encoding, and discard the byte order mark, if applicable. REVISIT: Much of the code in this class is taken from XMLEntityManager. It would be nice if this code could be shared in some way. However, since XMLEntityManager is used for reading files as XML, and this needs to read files as text, there would need to be some refactoring done.
author
Michael Glavassevich, IBM
author
Peter McCracken, IBM
author
Arun Yadav, Sun Microsystems Inc.
version
$Id: XIncludeTextReader.java,v 1.10 2004/04/15 04:51:56 mrglavas Exp $
see
XIncludeHandler

Fields Summary
private Reader
fReader
private XIncludeHandler
fHandler
private XMLInputSource
fSource
private XMLErrorReporter
fErrorReporter
private String
fAccept
private String
fAcceptLanguage
Constructors Summary
public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler)
Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.

param
source The XMLInputSource to use.
param
handler The XIncludeHandler to use.

        fHandler = handler;
        fSource = source;
    
Methods Summary
public voidclose()
Closes the stream. Call this after parse(), or when there is no longer any need for this object.

throws
IOException

        if (fReader != null) {
            fReader.close();
        }
    
protected voidconsumeBOM(java.io.InputStream stream, java.lang.String encoding)
Removes the byte order mark from the stream, if it exists.

param
stream
param
encoding
throws
IOException


        byte[] b = new byte[3];
        int count = 0;
        stream.mark(3);
        if (encoding.equals("UTF-8")) {
            count = stream.read(b, 0, 3);
            if (count == 3) {
                int b0 = b[0] & 0xFF; 
                int b1 = b[1] & 0xFF;
                int b2 = b[2] & 0xFF;
                if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
                    // First three bytes are not BOM, so reset.
                    stream.reset();
                }
            }
            else {
                stream.reset();
            }
        }
        else if (encoding.startsWith("UTF-16")) {
            count = stream.read(b, 0, 2);
            if (count == 2) {
                int b0 = b[0] & 0xFF;
                int b1 = b[1] & 0xFF;
                if ((b0 != 0xFE || b1 != 0xFF) 
                    && (b0 != 0xFF || b1 != 0xFE)) {
                    // First two bytes are not BOM, so reset.
                    stream.reset();
                }
            }
            else {
                stream.reset();
            }
        }
        // We could do UTF-32, but since the getEncodingName() doesn't support that
        // we won't support it here.
        // To implement UTF-32, look for:  00 00 FE FF for big-endian
        //                             or  FF FE 00 00 for little-endian
    
protected java.lang.StringgetEncodingName(java.io.InputStream stream)
XMLEntityManager cares about endian-ness, since it creates its own optimized readers. Since we're just using generic Java readers for now, we're not caring about endian-ness. If this changes, even more code needs to be copied from XMLEntity manager. -- PJM

        final byte[] b4 = new byte[4];
        String encoding = null;

        // this has the potential to throw an exception
        // it will be fixed when we ensure the stream is rewindable (see note above)
        stream.mark(4);
        int count = stream.read(b4, 0, 4);
        stream.reset();
        if (count == 4) {
            encoding = getEncodingName(b4);
        }

        return encoding;
    
protected java.lang.StringgetEncodingName(byte[] b4)
REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager. Is there any way we can share the code, without having it implemented twice? I think we should make it public and static in XMLEntityManager. --PJM Returns the IANA encoding name that is auto-detected from the bytes specified, with the endian-ness of that encoding where appropriate.

param
b4 The first four bytes of the input.
return
the encoding name, or null if no encoding could be detected


        // UTF-16, with BOM
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return "UTF-16BE";
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return "UTF-16LE";
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return "UTF-8";
        }

        // other encodings
        int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octect order (3412)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            return "UTF-16BE";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return "UTF-16LE";
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return "CP037";
        }

        // this signals us to use the value from the encoding attribute
        return null;

    
protected java.io.ReadergetReader(com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource source)
Return the Reader for given XMLInputSource.

param
source The XMLInputSource to use.

        if (source.getCharacterStream() != null) {
            return source.getCharacterStream();
        }
        else {
            InputStream stream = null;

            String encoding = source.getEncoding();
            if (encoding == null) {
                encoding = "UTF-8";
            }
            if (source.getByteStream() != null) {
                stream = source.getByteStream();
                // Wrap the InputStream so that it is possible to rewind it.
                if (!(stream instanceof BufferedInputStream)) {
                    stream = new BufferedInputStream(stream);
                }
            }
            else {
                String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);

                URL url = new URL(expandedSystemId);
                URLConnection urlCon = url.openConnection();
				
                // If this is an HTTP connection attach any 
                // content negotation parameters to the request.
                if (urlCon instanceof HttpURLConnection) {
                    if( fAccept != null && fAccept.length() > 0) {
                        urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT, fAccept);
                    }
                    if( fAcceptLanguage != null && fAcceptLanguage.length() > 0) {
                        urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT_LANGUAGE, fAcceptLanguage);
                    }
                }
                
                // Wrap the InputStream so that it is possible to rewind it.
                stream = new BufferedInputStream(urlCon.getInputStream());
                
                // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
                String rawContentType = urlCon.getContentType();
                
                // text/xml and application/xml offer only one optional parameter
                int index = (rawContentType != null) ? rawContentType.indexOf(';") : -1;

                String contentType = null;
                String charset = null;
                if (index != -1) {
                    // this should be something like "text/xml"
                    contentType = rawContentType.substring(0, index).trim();

                    // this should be something like "charset=UTF-8", but we want to
                    // strip it down to just "UTF-8"
                    charset = rawContentType.substring(index + 1).trim();
                    if (charset.startsWith("charset=")) {
                        // 8 is the length of "charset="
                        charset = charset.substring(8).trim();
                        // strip quotes, if present
                        if ((charset.charAt(0) == '""
                            && charset.charAt(charset.length() - 1) == '"")
                            || (charset.charAt(0) == '\'"
                                && charset.charAt(charset.length() - 1)
                                    == '\'")) {
                            charset =
                                charset.substring(1, charset.length() - 1);
                        }
                    }
                    else {
                        charset = null;
                    }
                }
                else {
                    contentType = rawContentType.trim();
                }

                String detectedEncoding = null;
                /**  The encoding of such a resource is determined by:
                    1 external encoding information, if available, otherwise
                         -- the most common type of external information is the "charset" parameter of a MIME package
                    2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
                    3 the value of the encoding attribute if one exists, otherwise
                    4 UTF-8.
                 **/
                if (contentType.equals("text/xml")) {
                    if (charset != null) {
                        detectedEncoding = charset;
                    }
                    else {
                        // see RFC2376 or 3023, section 3.1
                        detectedEncoding = "US-ASCII";
                    }
                }
                else if (contentType.equals("application/xml")) {
                    if (charset != null) {
                        detectedEncoding = charset;
                    }
                    else {
                        // see RFC2376 or 3023, section 3.2
                        detectedEncoding = getEncodingName(stream);
                    }
                }
                else if (contentType.endsWith("+xml")) {
                    detectedEncoding = getEncodingName(stream);
                }

                if (detectedEncoding != null) {
                    encoding = detectedEncoding;
                }
                // else 3 or 4.
            }
            
            encoding = encoding.toUpperCase(Locale.ENGLISH);
            
            // eat the Byte Order Mark
            consumeBOM(stream, encoding);
            
            // If the document is UTF-8 or US-ASCII use 
            // the Xerces readers for these encodings. For
            // US-ASCII consult the encoding map since
            // this encoding has many aliases.
            if (encoding.equals("UTF-8")) {
                return new UTF8Reader(stream, 
                    XMLEntityManager.DEFAULT_BUFFER_SIZE, 
                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 
                    fErrorReporter.getLocale() );
            }
            
            // Try to use a Java reader.
            String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
            
            // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
            // The XIncludeHandler will report this as a ResourceError and then will
            // attempt to include a fallback if there is one.
            if (javaEncoding == null) {
            	MessageFormatter aFormatter = 
                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
            	Locale aLocale = fErrorReporter.getLocale();
            	throw new IOException( aFormatter.formatMessage( aLocale, 
            	    "EncodingDeclInvalid", 
                    new Object[] {encoding} ) );
            }
            else if (javaEncoding.equals("ASCII")) {
                return new ASCIIReader(stream,
                    XMLEntityManager.DEFAULT_BUFFER_SIZE,
                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 
                    fErrorReporter.getLocale() );
            }
            
            return new InputStreamReader(stream, javaEncoding);
        }
    
protected booleanisValid(int ch)
Returns true if the specified character is a valid XML character as per the rules of XML 1.0.

param
ch The character to check.

        return XMLChar.isValid(ch);
    
public voidparse()
Read the input stream as text, and pass the text on to the XIncludeHandler using calls to characters(). This will read all of the text it can from the resource.

throws
IOException

        // REVISIT: This method needs to be rewritten to improve performance: both
        // time and memory. We should be reading chunks and reporting chunks instead 
        // of reading characters individually and reporting all the characters in 
        // one callback. Also, currently we don't provide any locator information:
        // line number, column number, etc... so if we report an error it will appear
        // as if the invalid XML character was in the include parent. -- mrglavas
        XMLStringBuffer buffer = new XMLStringBuffer();
        fReader = getReader(fSource);
        int ch;
        while((ch = fReader.read()) != -1) {
            if (isValid(ch)) {
                buffer.append((char)ch);
            }
            else if (XMLChar.isHighSurrogate(ch)) {
            	int ch2 = fReader.read();
            	if (XMLChar.isLowSurrogate(ch2)) {

                    // convert surrogates to a supplemental character
                    int sup = XMLChar.supplemental((char)ch, (char)ch2);

                    // supplemental character must be a valid XML character
                    if (!isValid(sup)) {
                        fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                                   "InvalidCharInContent", 
                                                   new Object[] { Integer.toString(sup, 16) },
                                                   XMLErrorReporter.SEVERITY_FATAL_ERROR);
                        continue;
                    }                 
                    buffer.append((char) ch);
                    buffer.append((char) ch2);
                }
                else {
                    fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                               "InvalidCharInContent", 
                                               new Object[] { Integer.toString(ch, 16) },
                                               XMLErrorReporter.SEVERITY_FATAL_ERROR);
                }
            }
            else {
                fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                           "InvalidCharInContent", 
                                           new Object[] { Integer.toString(ch, 16) },
                                           XMLErrorReporter.SEVERITY_FATAL_ERROR);
            }
        }
        if (fHandler != null && buffer.length > 0) {
            fHandler.characters(
                buffer,
                fHandler.modifyAugmentations(null, true));
        }
    
public voidsetErrorReporter(com.sun.org.apache.xerces.internal.impl.XMLErrorReporter errorReporter)
Sets the XMLErrorReporter used for reporting errors while reading the text include.

param
errorReporter the XMLErrorReporter to be used for reporting errors.

        fErrorReporter = errorReporter;
    
public voidsetHttpProperties(java.lang.String accept, java.lang.String acceptLanguage)
Sets content negotation parameters to be attached to an HTTP request.

param
accept the Accept HTTP request property
param
acceptLanguage the Accept-Language HTTP request property

        fAccept = accept;
        fAcceptLanguage = acceptLanguage;