/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
*
* Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
*
* The contents of this file are subject to the terms of either the GNU
* General Public License Version 2 only ("GPL") or the Common Development
* and Distribution License("CDDL") (collectively, the "License"). You
* may not use this file except in compliance with the License. You can obtain
* a copy of the License at https://glassfish.dev.java.net/public/CDDL+GPL.html
* or glassfish/bootstrap/legal/LICENSE.txt. See the License for the specific
* language governing permissions and limitations under the License.
*
* When distributing the software, include this License Header Notice in each
* file and include the License file at glassfish/bootstrap/legal/LICENSE.txt.
* Sun designates this particular file as subject to the "Classpath" exception
* as provided by Sun in the GPL Version 2 section of the License file that
* accompanied this code. If applicable, add the following below the License
* Header, with the fields enclosed by brackets [] replaced by your own
* identifying information: "Portions Copyrighted [year]
* [name of copyright owner]"
*
* Contributor(s):
*
* If you wish your version of this file to be governed by only the CDDL or
* only the GPL Version 2, indicate your decision by adding "[Contributor]
* elects to include this software in this distribution under the [CDDL or GPL
* Version 2] license." If you don't indicate a single choice of license, a
* recipient has the option to distribute your version of this file under
* either the CDDL, the GPL Version 2 or to extend the choice of license to
* its licensees as provided above. However, if you add GPL Version 2 code
* and therefore, elected the GPL Version 2 license, then the option applies
* only if the new code is made subject to such option by the copyright
* holder.
*/
package com.sun.enterprise.diagnostics.report.html;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Implements HTML entity escaping and URL hex (percent) escaping.
 * Additional entities can be registered with
 * {@link #setEntity(String, char)}.
 * <p>
 * This class is used as a singleton.  If you subclass it and override the
 * encode/decode methods, use {@link #setInstance(Escape)} to install your
 * handler.
 * <p>
 * Instances are mutable (see {@link #setUseHex(boolean)} and
 * {@link #setEntity(String, char)}) and no synchronization is performed
 * anywhere in this class.
 */
public class Escape {

    /** A value to signal an undefined entity. */
    public static final int UNDEFINED = -1;

    /** The instance to use. */
    private static Escape instance = null;

    /**
     * If true, numeric character references are written in hexadecimal.
     * If false, they are written in decimal.
     */
    private boolean useHex = false;

    /**
     * The characters which are always replaced on output.  Only the keys
     * of this map are consulted (by {@link #encodeEntities(String, String)});
     * the replacement text itself is produced by
     * {@link #encodeAsEntity(char)}.
     */
    private final Map<Character, String> alwaysReplace =
            new HashMap<Character, String>();
    {
        alwaysReplace.put('&', "amp");
        alwaysReplace.put('<', "lt");
        alwaysReplace.put('>', "gt");
        alwaysReplace.put('"', "quot");
        alwaysReplace.put('\'', "#039");
        alwaysReplace.put((char) 160, "nbsp");
    }

    /**
     * Maps entity names (without ampersand and semicolon) to characters.
     * This map is populated by {@link #setEntity(String, char)} and is
     * the reverse of the character-to-entity map.
     */
    private final Map<String, Character> entityToChar =
            new HashMap<String, Character>();

    /**
     * Maps characters to entity names (without ampersand and semicolon).
     * <p>
     * The initial set was generated from the HTML 4.01 character
     * entity specification.  You can find it online at:
     * <a href="http://www.w3.org/TR/1999/REC-html401-19991224/sgml/entities.html"
     * >http://www.w3.org/TR/1999/REC-html401-19991224/sgml/entities.html</a>.
     * <p>
     * Note that this map is initialized using the
     * {@link #setEntity(String, char)} method.
     */
    private final Map<Character, String> charToEntity =
            new HashMap<Character, String>();
    {
        /*
         * The following are the basic escapes in the ISO 8859-1 range.
         */
        setEntity("nbsp", (char) 160);     // no-break space
        setEntity("iexcl", (char) 161);    // inverted exclamation mark
        setEntity("cent", (char) 162);     // cent sign
        setEntity("pound", (char) 163);    // pound sign
        setEntity("curren", (char) 164);   // currency sign
        setEntity("yen", (char) 165);      // yen sign = yuan sign
        setEntity("brvbar", (char) 166);   // broken vertical bar
        setEntity("sect", (char) 167);     // section sign
        setEntity("uml", (char) 168);      // diaeresis
        setEntity("copy", (char) 169);     // copyright sign
        setEntity("ordf", (char) 170);     // feminine ordinal indicator
        setEntity("laquo", (char) 171);    // left-pointing double angle quotation mark
        setEntity("not", (char) 172);      // not sign
        setEntity("shy", (char) 173);      // soft hyphen
        setEntity("reg", (char) 174);      // registered sign
        setEntity("macr", (char) 175);     // macron
        setEntity("deg", (char) 176);      // degree sign
        setEntity("plusmn", (char) 177);   // plus-minus sign
        setEntity("sup2", (char) 178);     // superscript two
        setEntity("sup3", (char) 179);     // superscript three
        setEntity("acute", (char) 180);    // acute accent
        setEntity("micro", (char) 181);    // micro sign
        setEntity("para", (char) 182);     // pilcrow sign
        setEntity("middot", (char) 183);   // middle dot
        setEntity("cedil", (char) 184);    // cedilla
        setEntity("sup1", (char) 185);     // superscript one
        setEntity("ordm", (char) 186);     // masculine ordinal indicator
        setEntity("raquo", (char) 187);    // right-pointing double angle quotation mark
        setEntity("frac14", (char) 188);   // fraction one quarter
        setEntity("frac12", (char) 189);   // fraction one half
        setEntity("frac34", (char) 190);   // fraction three quarters
        setEntity("iquest", (char) 191);   // inverted question mark
        setEntity("Agrave", (char) 192);   // capital A with grave
        setEntity("Aacute", (char) 193);   // capital A with acute
        setEntity("Acirc", (char) 194);    // capital A with circumflex
        setEntity("Atilde", (char) 195);   // capital A with tilde
        setEntity("Auml", (char) 196);     // capital A with diaeresis
        setEntity("Aring", (char) 197);    // capital A with ring above
        setEntity("AElig", (char) 198);    // capital ligature AE
        setEntity("Ccedil", (char) 199);   // capital C with cedilla
        setEntity("Egrave", (char) 200);   // capital E with grave
        setEntity("Eacute", (char) 201);   // capital E with acute
        setEntity("Ecirc", (char) 202);    // capital E with circumflex
        setEntity("Euml", (char) 203);     // capital E with diaeresis
        setEntity("Igrave", (char) 204);   // capital I with grave
        setEntity("Iacute", (char) 205);   // capital I with acute
        setEntity("Icirc", (char) 206);    // capital I with circumflex
        setEntity("Iuml", (char) 207);     // capital I with diaeresis
        setEntity("ETH", (char) 208);      // capital ETH
        setEntity("Ntilde", (char) 209);   // capital N with tilde
        setEntity("Ograve", (char) 210);   // capital O with grave
        setEntity("Oacute", (char) 211);   // capital O with acute
        setEntity("Ocirc", (char) 212);    // capital O with circumflex
        setEntity("Otilde", (char) 213);   // capital O with tilde
        setEntity("Ouml", (char) 214);     // capital O with diaeresis
        setEntity("times", (char) 215);    // multiplication sign
        setEntity("Oslash", (char) 216);   // capital O with stroke
        setEntity("Ugrave", (char) 217);   // capital U with grave
        setEntity("Uacute", (char) 218);   // capital U with acute
        setEntity("Ucirc", (char) 219);    // capital U with circumflex
        setEntity("Uuml", (char) 220);     // capital U with diaeresis
        setEntity("Yacute", (char) 221);   // capital Y with acute
        setEntity("THORN", (char) 222);    // capital THORN
        setEntity("szlig", (char) 223);    // small sharp s
        setEntity("agrave", (char) 224);   // small a with grave
        setEntity("aacute", (char) 225);   // small a with acute
        setEntity("acirc", (char) 226);    // small a with circumflex
        setEntity("atilde", (char) 227);   // small a with tilde
        setEntity("auml", (char) 228);     // small a with diaeresis
        setEntity("aring", (char) 229);    // small a with ring above
        setEntity("aelig", (char) 230);    // small ligature ae
        setEntity("ccedil", (char) 231);   // small c with cedilla
        setEntity("egrave", (char) 232);   // small e with grave
        setEntity("eacute", (char) 233);   // small e with acute
        setEntity("ecirc", (char) 234);    // small e with circumflex
        setEntity("euml", (char) 235);     // small e with diaeresis
        setEntity("igrave", (char) 236);   // small i with grave
        setEntity("iacute", (char) 237);   // small i with acute
        setEntity("icirc", (char) 238);    // small i with circumflex
        setEntity("iuml", (char) 239);     // small i with diaeresis
        setEntity("eth", (char) 240);      // small eth
        setEntity("ntilde", (char) 241);   // small n with tilde
        setEntity("ograve", (char) 242);   // small o with grave
        setEntity("oacute", (char) 243);   // small o with acute
        setEntity("ocirc", (char) 244);    // small o with circumflex
        setEntity("otilde", (char) 245);   // small o with tilde
        setEntity("ouml", (char) 246);     // small o with diaeresis
        setEntity("divide", (char) 247);   // division sign
        setEntity("oslash", (char) 248);   // small o with stroke
        setEntity("ugrave", (char) 249);   // small u with grave
        setEntity("uacute", (char) 250);   // small u with acute
        setEntity("ucirc", (char) 251);    // small u with circumflex
        setEntity("uuml", (char) 252);     // small u with diaeresis
        setEntity("yacute", (char) 253);   // small y with acute
        setEntity("thorn", (char) 254);    // small thorn
        setEntity("yuml", (char) 255);     // small y with diaeresis

        /*
         * The following are the greek and mathematical symbols.
         */
        setEntity("fnof", (char) 402);     // small f with hook = florin
        setEntity("Alpha", (char) 913);    // greek capital alpha
        setEntity("Beta", (char) 914);     // greek capital beta
        setEntity("Gamma", (char) 915);    // greek capital gamma
        setEntity("Delta", (char) 916);    // greek capital delta
        setEntity("Epsilon", (char) 917);  // greek capital epsilon
        setEntity("Zeta", (char) 918);     // greek capital zeta
        setEntity("Eta", (char) 919);      // greek capital eta
        setEntity("Theta", (char) 920);    // greek capital theta
        setEntity("Iota", (char) 921);     // greek capital iota
        setEntity("Kappa", (char) 922);    // greek capital kappa
        setEntity("Lambda", (char) 923);   // greek capital lambda
        setEntity("Mu", (char) 924);       // greek capital mu
        setEntity("Nu", (char) 925);       // greek capital nu
        setEntity("Xi", (char) 926);       // greek capital xi
        setEntity("Omicron", (char) 927);  // greek capital omicron
        setEntity("Pi", (char) 928);       // greek capital pi
        setEntity("Rho", (char) 929);      // greek capital rho
        setEntity("Sigma", (char) 931);    // greek capital sigma
        setEntity("Tau", (char) 932);      // greek capital tau
        setEntity("Upsilon", (char) 933);  // greek capital upsilon
        setEntity("Phi", (char) 934);      // greek capital phi
        setEntity("Chi", (char) 935);      // greek capital chi
        setEntity("Psi", (char) 936);      // greek capital psi
        setEntity("Omega", (char) 937);    // greek capital omega
        setEntity("alpha", (char) 945);    // greek small alpha
        setEntity("beta", (char) 946);     // greek small beta
        setEntity("gamma", (char) 947);    // greek small gamma
        setEntity("delta", (char) 948);    // greek small delta
        setEntity("epsilon", (char) 949);  // greek small epsilon
        setEntity("zeta", (char) 950);     // greek small zeta
        setEntity("eta", (char) 951);      // greek small eta
        setEntity("theta", (char) 952);    // greek small theta
        setEntity("iota", (char) 953);     // greek small iota
        setEntity("kappa", (char) 954);    // greek small kappa
        setEntity("lambda", (char) 955);   // greek small lambda
        setEntity("mu", (char) 956);       // greek small mu
        setEntity("nu", (char) 957);       // greek small nu
        setEntity("xi", (char) 958);       // greek small xi
        setEntity("omicron", (char) 959);  // greek small omicron
        setEntity("pi", (char) 960);       // greek small pi
        setEntity("rho", (char) 961);      // greek small rho
        setEntity("sigmaf", (char) 962);   // greek small final sigma
        setEntity("sigma", (char) 963);    // greek small sigma
        setEntity("tau", (char) 964);      // greek small tau
        setEntity("upsilon", (char) 965);  // greek small upsilon
        setEntity("phi", (char) 966);      // greek small phi
        setEntity("chi", (char) 967);      // greek small chi
        setEntity("psi", (char) 968);      // greek small psi
        setEntity("omega", (char) 969);    // greek small omega
        setEntity("thetasym", (char) 977); // greek small theta symbol
        setEntity("upsih", (char) 978);    // greek upsilon with hook symbol
        setEntity("piv", (char) 982);      // greek pi symbol
        setEntity("bull", (char) 8226);    // bullet
        setEntity("hellip", (char) 8230);  // horizontal ellipsis
        setEntity("prime", (char) 8242);   // prime = minutes = feet
        setEntity("Prime", (char) 8243);   // double prime = seconds = inches
        setEntity("oline", (char) 8254);   // overline
        setEntity("frasl", (char) 8260);   // fraction slash
        setEntity("weierp", (char) 8472);  // script capital P = Weierstrass p
        setEntity("image", (char) 8465);   // blackletter capital I = imaginary part
        setEntity("real", (char) 8476);    // blackletter capital R = real part
        setEntity("trade", (char) 8482);   // trade mark sign
        setEntity("alefsym", (char) 8501); // alef symbol
        setEntity("larr", (char) 8592);    // leftwards arrow
        setEntity("uarr", (char) 8593);    // upwards arrow
        setEntity("rarr", (char) 8594);    // rightwards arrow
        setEntity("darr", (char) 8595);    // downwards arrow
        setEntity("harr", (char) 8596);    // left right arrow
        setEntity("crarr", (char) 8629);   // carriage return arrow
        setEntity("lArr", (char) 8656);    // leftwards double arrow
        setEntity("uArr", (char) 8657);    // upwards double arrow
        setEntity("rArr", (char) 8658);    // rightwards double arrow
        setEntity("dArr", (char) 8659);    // downwards double arrow
        setEntity("hArr", (char) 8660);    // left right double arrow
        setEntity("forall", (char) 8704);  // for all
        setEntity("part", (char) 8706);    // partial differential
        setEntity("exist", (char) 8707);   // there exists
        setEntity("empty", (char) 8709);   // empty set
        setEntity("nabla", (char) 8711);   // nabla
        setEntity("isin", (char) 8712);    // element of
        setEntity("notin", (char) 8713);   // not an element of
        setEntity("ni", (char) 8715);      // contains as member
        setEntity("prod", (char) 8719);    // n-ary product
        setEntity("sum", (char) 8721);     // n-ary summation
        setEntity("minus", (char) 8722);   // minus sign
        setEntity("lowast", (char) 8727);  // asterisk operator
        setEntity("radic", (char) 8730);   // square root
        setEntity("prop", (char) 8733);    // proportional to
        setEntity("infin", (char) 8734);   // infinity
        setEntity("ang", (char) 8736);     // angle
        setEntity("and", (char) 8743);     // logical and
        setEntity("or", (char) 8744);      // logical or
        setEntity("cap", (char) 8745);     // intersection
        setEntity("cup", (char) 8746);     // union
        setEntity("int", (char) 8747);     // integral
        setEntity("there4", (char) 8756);  // therefore
        setEntity("sim", (char) 8764);     // tilde operator
        setEntity("cong", (char) 8773);    // approximately equal to
        setEntity("asymp", (char) 8776);   // almost equal to
        setEntity("ne", (char) 8800);      // not equal to
        setEntity("equiv", (char) 8801);   // identical to
        setEntity("le", (char) 8804);      // less-than or equal to
        setEntity("ge", (char) 8805);      // greater-than or equal to
        setEntity("sub", (char) 8834);     // subset of
        setEntity("sup", (char) 8835);     // superset of
        setEntity("nsub", (char) 8836);    // not a subset of
        setEntity("sube", (char) 8838);    // subset of or equal to
        setEntity("supe", (char) 8839);    // superset of or equal to
        setEntity("oplus", (char) 8853);   // circled plus
        setEntity("otimes", (char) 8855);  // circled times
        setEntity("perp", (char) 8869);    // up tack = perpendicular
        setEntity("sdot", (char) 8901);    // dot operator
        setEntity("lceil", (char) 8968);   // left ceiling
        setEntity("rceil", (char) 8969);   // right ceiling
        setEntity("lfloor", (char) 8970);  // left floor
        setEntity("rfloor", (char) 8971);  // right floor
        setEntity("lang", (char) 9001);    // left-pointing angle bracket
        setEntity("rang", (char) 9002);    // right-pointing angle bracket
        setEntity("loz", (char) 9674);     // lozenge
        setEntity("spades", (char) 9824);  // black spade suit
        setEntity("clubs", (char) 9827);   // black club suit
        setEntity("hearts", (char) 9829);  // black heart suit
        setEntity("diams", (char) 9830);   // black diamond suit

        /*
         * The following are other special symbols included in the
         * list of HTML character entities for 4.01.
         */
        setEntity("quot", (char) 34);      // quotation mark
        setEntity("amp", (char) 38);       // ampersand
        setEntity("lt", (char) 60);        // less-than sign
        setEntity("gt", (char) 62);        // greater-than sign
        setEntity("OElig", (char) 338);    // capital ligature OE
        setEntity("oelig", (char) 339);    // small ligature oe
        setEntity("Scaron", (char) 352);   // capital S with caron
        setEntity("scaron", (char) 353);   // small s with caron
        setEntity("Yuml", (char) 376);     // capital Y with diaeresis
        setEntity("circ", (char) 710);     // modifier letter circumflex accent
        setEntity("tilde", (char) 732);    // small tilde
        setEntity("ensp", (char) 8194);    // en space
        setEntity("emsp", (char) 8195);    // em space
        setEntity("thinsp", (char) 8201);  // thin space
        setEntity("zwnj", (char) 8204);    // zero width non-joiner
        setEntity("zwj", (char) 8205);     // zero width joiner
        setEntity("lrm", (char) 8206);     // left-to-right mark
        setEntity("rlm", (char) 8207);     // right-to-left mark
        setEntity("ndash", (char) 8211);   // en dash
        setEntity("mdash", (char) 8212);   // em dash
        setEntity("lsquo", (char) 8216);   // left single quotation mark
        setEntity("rsquo", (char) 8217);   // right single quotation mark
        setEntity("sbquo", (char) 8218);   // single low-9 quotation mark
        setEntity("ldquo", (char) 8220);   // left double quotation mark
        setEntity("rdquo", (char) 8221);   // right double quotation mark
        setEntity("bdquo", (char) 8222);   // double low-9 quotation mark
        setEntity("dagger", (char) 8224);  // dagger
        setEntity("Dagger", (char) 8225);  // double dagger
        setEntity("permil", (char) 8240);  // per mille sign
        setEntity("lsaquo", (char) 8249);  // single left-pointing angle quotation mark
        setEntity("rsaquo", (char) 8250);  // single right-pointing angle quotation mark
        setEntity("euro", (char) 8364);    // euro sign

        // Output-only mappings: these are written straight into the
        // character-to-entity map, so "semi" gets no reverse (decode)
        // mapping.  NOTE(review): "semi" is not in the HTML 4.01 entity
        // list; confirm whether the one-way mapping is intentional.
        charToEntity.put(';', "semi");
        charToEntity.put((char) 160, "nbsp");
    }

    /**
     * This field holds the list of non-alphanumeric characters to
     * preserve as-is in URLs.
     */
    private final String preserve = "_-!.~#()*" + ",;:$&+=" + "?/[]@";

    /**
     * Make a new escape instance.  This constructor is protected since only
     * subclasses should use it.  Do not create instances of this class
     * directly; use {@link #getInstance()} to get the correct
     * <code>Escape</code> instance to use.
     */
    protected Escape() {
        super();
    }

    /**
     * Get the escape instance to use to escape strings.  The default
     * instance is created on first use; this lazy creation is not
     * synchronized.
     * @return The instance to use.
     * @see #setInstance(Escape)
     */
    public static final Escape getInstance() {
        if (instance == null) {
            instance = new Escape();
        }
        return instance;
    }

    /**
     * Set the instance to use to escape strings.
     * @param escape The instance to use.
     * @return The instance to use.
     * @throws NullPointerException If the instance is null.
     * @see #getInstance()
     */
    public static final Escape setInstance(Escape escape) {
        if (escape == null) {
            throw new NullPointerException("Escape instance is null.");
        }
        instance = escape;
        return instance;
    }

    /**
     * Given a character, return the appropriate entity reference if
     * there is a named entity for this character.  Otherwise return a
     * numeric character reference: hexadecimal ({@code &#xN;}) if
     * {@link #setUseHex(boolean)} was set to true, else decimal
     * ({@code &#N;}, zero-padded to at least three digits).
     * @param ch The character to encode.
     * @return The encoded string, including the leading ampersand
     *         and the trailing semicolon.
     */
    public String encodeAsEntity(char ch) {
        String replacement = charToEntity.get(Character.valueOf(ch));
        if (replacement != null) {
            return "&" + replacement + ";";
        }
        if (useHex) {
            return "&#x" + Integer.toHexString(ch) + ";";
        }
        // Pad with zeros to length three (e.g. "&#039;" for an apostrophe).
        StringBuilder value = new StringBuilder(Integer.toString(ch));
        while (value.length() < 3) {
            value.insert(0, '0');
        }
        return "&#" + value + ";";
    }

    /**
     * Decode an entity or numeric character reference, and return the
     * appropriate character.  Entity names are case-sensitive.
     * <p>
     * Numeric character references may be decimal ({@code #N}) or
     * hexadecimal ({@code #xN} or {@code #XN}).  A reference to a code
     * point above U+FFFF is returned as a surrogate pair.
     * @param name An entity or numeric character reference.  It can
     *             include both the ampersand and the semicolon, or
     *             neither.
     * @return Either the character referenced (as a string), or, if the
     *         reference is not recognized, the undecorated name wrapped
     *         in an ampersand and a semicolon.
     * @throws NullPointerException If the name is null.
     */
    public String decodeAsEntity(String name) {
        if (name == null) {
            throw new NullPointerException("Entity name is null.");
        }

        // This should just be the entity name.  If the entity is
        // decorated, remove the decorations.
        if (name.startsWith("&") && name.endsWith(";")) {
            name = name.substring(1, name.length() - 1);
        }

        // A leading hash mark signals a numeric character reference
        // (ISO 10646); a following x or X signals hexadecimal
        // (HTML 4.01, section 5.3.1).
        if (name.startsWith("#")) {
            String digits = name.substring(1);
            int radix = 10;
            if (digits.startsWith("x") || digits.startsWith("X")) {
                digits = digits.substring(1);
                radix = 16;
            }
            int codePoint = parseCodePoint(digits, radix);
            if (codePoint == UNDEFINED) {
                // Not a usable number; hand the reference back unchanged.
                return "&" + name + ";";
            }
            return new String(Character.toChars(codePoint));
        }

        // Get the entity's value, if it is defined.
        Character value = entityToChar.get(name);
        if (value == null) {
            return "&" + name + ";";
        }
        return String.valueOf(value.charValue());
    }

    /**
     * Parse an unsigned number consisting only of ASCII digits in the
     * given radix.  Unlike {@link Integer#parseInt(String, int)}, sign
     * characters are rejected.
     * @param digits The digits to parse.
     * @param radix  The radix, 10 or 16.
     * @return The value, or {@link #UNDEFINED} if the text is empty,
     *         contains a non-digit, or exceeds the largest Unicode
     *         code point.
     */
    private static int parseCodePoint(String digits, int radix) {
        if (digits.length() == 0) {
            return UNDEFINED;
        }
        int value = 0;
        for (int i = 0; i < digits.length(); i++) {
            char c = digits.charAt(i);
            int digit = (c < 128) ? Character.digit(c, radix) : -1;
            if (digit < 0) {
                return UNDEFINED;
            }
            // value never exceeds MAX_CODE_POINT here, so this cannot
            // overflow an int.
            value = value * radix + digit;
            if (value > Character.MAX_CODE_POINT) {
                return UNDEFINED;
            }
        }
        return value;
    }

    /**
     * Encode a string by replacing characters with entity references,
     * or with numeric character references if there is no named entity.
     * <p>
     * The characters which will always be replaced are:
     * <ul>
     * <li>{@code &} (ampersand)</li>
     * <li>{@code <} (less-than sign)</li>
     * <li>{@code >} (greater-than sign)</li>
     * <li>{@code "} (quotation mark)</li>
     * <li>{@code '} (apostrophe)</li>
     * <li>the no-break space, U+00A0</li>
     * </ul>
     * Additionally, every character with a code of 128 or above
     * (that is, anything outside of 7-bit ASCII) is encoded.
     * @param cdata         The string to encode.
     * @param characters    Additional characters which should be encoded.
     * @return The encoded string.
     * @throws NullPointerException If either argument is null.
     * @see #encodeAsEntity(char)
     */
    public String encodeEntities(String cdata, String characters) {
        if (cdata == null) {
            throw new NullPointerException("The character data to " +
                    "encode is null.");
        }
        if (characters == null) {
            throw new NullPointerException("The list of additional " +
                    "characters to encode is null.");
        }

        StringBuilder buf = new StringBuilder(cdata.length());
        for (int i = 0; i < cdata.length(); i++) {
            char ch = cdata.charAt(i);
            if (ch >= 128 ||
                    alwaysReplace.containsKey(Character.valueOf(ch)) ||
                    characters.indexOf(ch) >= 0) {
                buf.append(encodeAsEntity(ch));
            } else {
                buf.append(ch);
            }
        }
        return buf.toString();
    }

    /**
     * Decode all entity references in the provided string.  This also
     * decodes any numeric character references of the form
     * {@code &#N;}, where N is a decimal number, or {@code &#xN;},
     * where N is a hex number.
     * <p>
     * An ampersand which is not followed by a semicolon before the next
     * ampersand (or the end of the string) is copied through unchanged.
     * References which are not recognized are also copied through
     * unchanged.
     * @param cdata         The string to decode.
     * @return The decoded string.
     * @throws NullPointerException If the string is null.
     * @see #decodeAsEntity(String)
     */
    public String decodeEntities(String cdata) {
        if (cdata == null) {
            throw new NullPointerException("The character data to " +
                    "decode is null.");
        }

        final int length = cdata.length();
        StringBuilder buf = new StringBuilder(length);
        int pos = 0;
        while (pos < length) {
            // Find the next ampersand and copy everything before it.
            int amp = cdata.indexOf('&', pos);
            if (amp < 0) {
                buf.append(cdata, pos, length);
                break;
            }
            buf.append(cdata, pos, amp);

            // Find what terminates this candidate reference: a
            // semicolon, another ampersand, or the end of the input.
            int end = amp + 1;
            while (end < length) {
                char c = cdata.charAt(end);
                if (c == ';' || c == '&') {
                    break;
                }
                end++;
            }
            if (end >= length) {
                // Unterminated; the rest is plain text.
                buf.append(cdata, amp, length);
                break;
            }
            if (cdata.charAt(end) == '&') {
                // The first ampersand was a stray one.  Keep it as-is
                // and resume scanning at the second ampersand.
                buf.append(cdata, amp, end);
                pos = end;
                continue;
            }

            // Convert the entity to a character, if possible.
            buf.append(decodeAsEntity(cdata.substring(amp + 1, end)));
            pos = end + 1;
        }
        return buf.toString();
    }

    /**
     * Specify whether to use hexadecimal character references of the
     * form {@code &#xN;}, where N is the hex character code.
     * The alternative is decimal character references of the form
     * {@code &#N;}, where N is the decimal character code.
     * @param flag      The setting.
     * @return This escape.
     * @see #encodeAsEntity(char)
     */
    public Escape setUseHex(boolean flag) {
        useHex = flag;
        return this;
    }

    /**
     * Add a new entity to this escape, or redefine an existing one.
     * The entity is registered for both encoding and decoding.
     * @param entity    The entity name.  There can be an ampersand at
     *                  the start and a semicolon at the end, but these
     *                  are optional.
     * @param value     The value of the entity, as a single character.
     * @return This escape.
     * @throws NullPointerException If the entity name is null.
     */
    public Escape setEntity(String entity, char value) {
        if (entity == null) {
            throw new NullPointerException("The entity name is null.");
        }
        if (entity.startsWith("&")) {
            entity = entity.substring(1);
        }
        if (entity.endsWith(";")) {
            entity = entity.substring(0, entity.length() - 1);
        }
        Character boxed = Character.valueOf(value);
        charToEntity.put(boxed, entity);
        entityToChar.put(entity, boxed);
        return this;
    }

    /**
     * Convert a character to a sequence of hex URL escapes.
     * <p>
     * Multibyte characters are handled in the default character encoding.
     * @param ch        The character to encode.
     * @return The hex encoding, which may consist of more than
     *         one byte, and which is performed in the default
     *         character encoding.
     */
    public String hexEncode(char ch) {
        // Some characters occupy more than one byte (multibyte).
        // To account for this, convert the character to a string
        // and then get the bytes for the string.
        byte[] bytes = String.valueOf(ch).getBytes();
        StringBuilder buf = new StringBuilder(bytes.length * 3);
        for (byte bt : bytes) {
            // Bytes are signed, so mask to get the unsigned value and
            // avoid sign extension when widening to int.
            int ibt = bt & 0xFF;
            buf.append('%');
            if (ibt < 0x10) {
                buf.append('0');
            }
            buf.append(Integer.toHexString(ibt));
        }
        return buf.toString();
    }

    /**
     * Traverse the input string, and hex encode non-alphanumeric
     * characters in the string, other than those in the provided set
     * and those in this escape's built-in set of URL punctuation.
     * Note that all non-ascii characters are encoded here.
     * @param text          The text to encode.
     * @param characters    Characters to preserve, unencoded.  A null
     *                      value is treated as the empty string.
     * @return The encoded string.
     * @throws NullPointerException If the text is null.
     */
    public String hexEncode(String text, String characters) {
        if (text == null) {
            throw new NullPointerException("The text to hex encode is null.");
        }
        String extra = (characters == null) ? "" : characters;

        StringBuilder buf = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            boolean keep =
                    (ch < 128 &&
                            (Character.isLetterOrDigit(ch) ||
                                    extra.indexOf(ch) >= 0)) ||
                    preserve.indexOf(ch) >= 0;
            if (keep) {
                buf.append(ch);
            } else {
                buf.append(hexEncode(ch));
            }
        }
        return buf.toString();
    }

    /**
     * Convert all URL hex escapes in the string to characters.  This is
     * complicated by the need to handle multibyte characters.
     * <p>
     * A percent sign which is not followed by exactly two hexadecimal
     * digits is copied through unchanged.
     * <p>
     * Multibyte characters are handled in the default character encoding.
     * @param text      The text to decode.
     * @return The decoded text.
     * @throws NullPointerException If the text is null.
     */
    public String hexDecode(String text) {
        if (text == null) {
            throw new NullPointerException("The text to hex decode is null.");
        }

        // Traverse the string and decode any hex escapes.  Everything
        // is turned into bytes and added to the byte sequence, which is
        // converted back to text at the end.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        final int length = text.length();
        int index = 0;
        while (index < length) {
            if (text.charAt(index) == '%') {
                // There must be two more characters, both hex digits.
                int value = UNDEFINED;
                if (length - index > 2) {
                    int hi = hexDigit(text.charAt(index + 1));
                    int lo = hexDigit(text.charAt(index + 2));
                    if (hi >= 0 && lo >= 0) {
                        value = (hi << 4) | lo;
                    }
                }
                if (value == UNDEFINED) {
                    // Not a valid escape; keep the percent sign.
                    writeDefaultBytes(baos, "%");
                    index++;
                } else {
                    baos.write(value);
                    index += 3;
                }
            } else {
                // Copy the whole run up to the next percent sign in one
                // go, so that surrogate pairs are encoded together.
                int next = text.indexOf('%', index);
                if (next < 0) {
                    next = length;
                }
                writeDefaultBytes(baos, text.substring(index, next));
                index = next;
            }
        }

        // Return the result, in the default encoding.
        return baos.toString();
    }

    /**
     * Get the value of an ASCII hexadecimal digit.
     * @param c     The character to examine.
     * @return The digit's value (0 to 15), or -1 if the character is
     *         not an ASCII hexadecimal digit.
     */
    private static int hexDigit(char c) {
        return (c < 128) ? Character.digit(c, 16) : -1;
    }

    /**
     * Append the bytes of the text, in the default character encoding,
     * to the stream.
     * @param out   The stream to append to.
     * @param text  The text to append.
     */
    private static void writeDefaultBytes(ByteArrayOutputStream out,
            String text) {
        byte[] bytes = text.getBytes();
        out.write(bytes, 0, bytes.length);
    }
}
|