FileDocCategorySizeDatePackage
XmlChars.javaAPI DocJava SE 5 API13683Fri Aug 26 14:54:42 BST 2005com.sun.imageio.metadata

XmlChars

public class XmlChars extends Object
Methods in this class are used to determine whether characters may appear in certain roles in XML documents. Such methods are used both to parse and to create such documents.
version
1.8
author
David Brownell

Fields Summary
Constructors Summary
private XmlChars()

 
Methods Summary
public static booleanisChar(int ucs4char)
Returns true if the argument, a UCS-4 character code, is valid in XML documents. Unicode characters fit into the low sixteen bits of a UCS-4 character, and pairs of Unicode surrogate characters can be combined to encode UCS-4 characters in documents containing only Unicode. (The char datatype in the Java Programming Language represents Unicode characters, including unpaired surrogates.)

In XML, UCS-4 characters can also be encoded by the use of character references such as �, which happens to refer to a character that is disallowed in XML documents. UCS-4 characters allowed in XML documents can be expressed with one or two Unicode characters.

param
ucs4char The 32-bit UCS-4 character being tested.

	// [2] Char ::= #x0009 | #x000A | #x000D
	//			| [#x0020-#xD7FF]
	//	... surrogates excluded!
	//			| [#xE000-#xFFFD]
	// 			| [#x10000-#x10ffff]
	return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
		|| ucs4char == 0x000A || ucs4char == 0x0009
		|| ucs4char == 0x000D
		|| (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
		|| (ucs4char >= 0x10000 && ucs4char <= 0x10ffff));
    
private static booleanisCompatibilityChar(char c)

	// the numerous comparisions here seem unavoidable,
	// but the switch can reduce the number which must
	// actually be executed.

	switch ((c >> 8) & 0x0ff) {
	  case 0x00:
	    // ISO Latin/1 has a few compatibility characters
	    return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

	  case 0x01:
	    // as do Latin Extended A and (parts of) B
	    return (c >= 0x0132 && c <= 0x0133)
		|| (c >= 0x013f && c <= 0x0140)
		|| c == 0x0149
		|| c == 0x017f
		|| (c >= 0x01c4 && c <= 0x01cc)
		|| (c >= 0x01f1 && c <= 0x01f3) ;

	  case 0x02:
		   // some spacing modifiers
	    return (c >= 0x02b0 && c <= 0x02b8)
		|| (c >= 0x02e0 && c <= 0x02e4);
	  
	  case 0x03:
	    return c == 0x037a;			// Greek

	  case 0x05:
	    return c == 0x0587;			// Armenian

	  case 0x0e:
	    return c >= 0x0edc && c <= 0x0edd;	// Laotian

	  case 0x11:
	    // big chunks of Hangul Jamo are all "compatibility"
	    return c == 0x1101
		|| c == 0x1104
		|| c == 0x1108
		|| c == 0x110a
		|| c == 0x110d
		|| (c >= 0x1113 && c <= 0x113b)
		|| c == 0x113d
		|| c == 0x113f
		|| (c >= 0x1141 && c <= 0x114b)
		|| c == 0x114d
		|| c == 0x114f
		|| (c >= 0x1151 && c <= 0x1153)
		|| (c >= 0x1156 && c <= 0x1158)
		|| c == 0x1162
		|| c == 0x1164
		|| c == 0x1166
		|| c == 0x1168
		|| (c >= 0x116a && c <= 0x116c)
		|| (c >= 0x116f && c <= 0x1171)
		|| c == 0x1174
		|| (c >= 0x1176 && c <= 0x119d)
		|| (c >= 0x119f && c <= 0x11a2)
		|| (c >= 0x11a9 && c <= 0x11aa)
		|| (c >= 0x11ac && c <= 0x11ad)
		|| (c >= 0x11b0 && c <= 0x11b6)
		|| c == 0x11b9
		|| c == 0x11bb
		|| (c >= 0x11c3 && c <= 0x11ea)
		|| (c >= 0x11ec && c <= 0x11ef)
		|| (c >= 0x11f1 && c <= 0x11f8)
		;

	  case 0x20:
	    return c == 0x207f;			// superscript

	  case 0x21:
	    return
		// various letterlike symbols
		   c == 0x2102
		|| c == 0x2107
		|| (c >= 0x210a && c <= 0x2113)
		|| c == 0x2115
		|| (c >= 0x2118 && c <= 0x211d)
		|| c == 0x2124
		|| c == 0x2128
		|| (c >= 0x212c && c <= 0x212d)
		|| (c >= 0x212f && c <= 0x2138)

		    // most Roman numerals (less 1K, 5K, 10K)
		|| (c >= 0x2160 && c <= 0x217f)
		;

	  case 0x30:
	    // some Hiragana
	    return c >= 0x309b && c <= 0x309c;

	  case 0x31:
	    // all Hangul Compatibility Jamo
	    return c >= 0x3131 && c <= 0x318e;

	  case 0xf9:
	  case 0xfa:
	  case 0xfb:
	  case 0xfc:
	  case 0xfd:
	  case 0xfe:
	  case 0xff:
	    // the whole "compatibility" area is for that purpose!
	    return true;
	
	  default:
	    // most of Unicode isn't flagged as being for compatibility
	    return false;
	}
    
private static booleanisDigit(char c)

	// [88] Digit ::= ... 

	//
	// java.lang.Character.isDigit is correct from the XML point
	// of view except that it allows "fullwidth" digits.
	//
	return Character.isDigit (c)
		&& ! ( (c >= 0xff10) && (c <= 0xff19));
    
private static booleanisExtender(char c)

	// [89] Extender ::= ... 
	return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
		|| c == 0x0640 || c == 0x0e46 || c == 0x0ec6
		|| c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
		|| (c >= 0x309d && c <= 0x309e)
		|| (c >= 0x30fc && c <= 0x30fe)
		;
    
public static booleanisLetter(char c)
Returns true if the character is an XML "letter". XML Names must start with Letters or a few other characters, but other characters in names must only satisfy the isNameChar predicate.

see
#isNameChar
see
#isNCNameChar

	// [84] Letter ::= BaseChar | Ideographic
	// [85] BaseChar ::= ... too much to repeat
	// [86] Ideographic ::= ... too much to repeat

	//
	// Optimize the typical case.
	//
	if (c >= 'a" && c <= 'z")
	    return true;
	if (c == '/")
	    return false;
	if (c >= 'A" && c <= 'Z")
	    return true;

	//
	// Since the tables are too ridiculous to use in code,
	// we're using the footnotes here to drive this test.
	//
	switch (Character.getType (c)) {
	    // app. B footnote says these are 'name start'
	    // chars' ...
	  case Character.LOWERCASE_LETTER:		// Ll
	  case Character.UPPERCASE_LETTER:		// Lu
	  case Character.OTHER_LETTER:			// Lo
	  case Character.TITLECASE_LETTER:		// Lt
	  case Character.LETTER_NUMBER:			// Nl

	    // OK, here we just have some exceptions to check...
	    return !isCompatibilityChar (c)
		    // per "5.14 of Unicode", rule out some combiners
		&& !(c >= 0x20dd && c <= 0x20e0);

	  default:
	    // check for some exceptions:  these are "alphabetic"
	    return ((c >= 0x02bb && c <=  0x02c1)
		    || c == 0x0559 || c == 0x06e5 || c == 0x06e6);
	}
    
private static booleanisLetter2(char c)

	// [84] Letter ::= BaseChar | Ideographic
	// [85] BaseChar ::= ... too much to repeat
	// [86] Ideographic ::= ... too much to repeat
	// [87] CombiningChar ::= ... too much to repeat

	//
	// Optimize the typical case.
	//
	if (c >= 'a" && c <= 'z")
	    return true;
	if (c == '>")
	    return false;
	if (c >= 'A" && c <= 'Z")
	    return true;

	//
	// Since the tables are too ridiculous to use in code,
	// we're using the footnotes here to drive this test.
	//
	switch (Character.getType (c)) {
	    // app. B footnote says these are 'name start'
	    // chars' ...
	  case Character.LOWERCASE_LETTER:		// Ll
	  case Character.UPPERCASE_LETTER:		// Lu
	  case Character.OTHER_LETTER:			// Lo
	  case Character.TITLECASE_LETTER:		// Lt
	  case Character.LETTER_NUMBER:			// Nl
	    // ... and these are name characters 'other
	    // than name start characters'
	  case Character.COMBINING_SPACING_MARK:	// Mc
	  case Character.ENCLOSING_MARK:		// Me
	  case Character.NON_SPACING_MARK:		// Mn
	  case Character.MODIFIER_LETTER:		// Lm
	  case Character.DECIMAL_DIGIT_NUMBER:		// Nd

	    // OK, here we just have some exceptions to check...
	    return !isCompatibilityChar (c)
		    // per "5.14 of Unicode", rule out some combiners
		&& !(c >= 0x20dd && c <= 0x20e0);

	  default:
		// added a character ...
	    return c == 0x0387;
	}
    
public static booleanisNCNameChar(char c)
Returns true if the character is allowed to be a non-initial character in unscoped names according to the rules of the XML Namespaces proposed recommendation. Except for precluding the colon (used to separate names from their scopes) these characters are just as allowed by the XML recommendation.

see
#isNameChar
see
#isLetter

	// [NC 5] NCNameChar ::= Letter | Digit | '.' | '_' 
	//			| CombiningChar | Extender
	return c != ':" && isNameChar (c);
    
public static booleanisNameChar(char c)
Returns true if the character is allowed to be a non-initial character in names according to the XML recommendation.

see
#isNCNameChar
see
#isLetter

	// [4] NameChar ::= Letter | Digit | '.' | '_' | ':'
	//			| CombiningChar | Extender

	if (isLetter2 (c))
	    return true;
	else if (c == '>")
	    return false;
	else if (c == '." || c == '-" || c == '_" || c == ':"
		|| isExtender (c))
	    return true;
	else
	    return false;
    
public static booleanisSpace(char c)
Returns true if the character is allowed where XML supports whitespace characters, false otherwise.

	return c == ' " || c == '\t" || c == '\n" || c == '\r";