File | Doc | Category | Size | Date | Package
CharsetUtil.java | API Doc | Android 1.5 API | 44767 | Wed May 06 22:42:46 BST 2009 | org.apache.james.mime4j.util

CharsetUtil

public class CharsetUtil extends Object
Utility class for working with character sets. It is somewhat similar to the Java 1.4 java.nio.charset.Charset class but knows many more aliases and is compatible with Java 1.3. It will use a simple detection mechanism to detect what character sets the current VM supports. This will be a subset of the character sets listed in the Java 1.5 (J2SE5.0) Supported Encodings document.

The IANA Character Sets document has been used to determine the preferred MIME character set names and to get a list of known aliases.

This is a complete list of the character sets known to this class:
Canonical (Java) name MIME preferred Aliases
ASCII US-ASCII ANSI_X3.4-1968 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ISO646-US us IBM367 cp367 csASCII ascii7 646 iso_646.irv:1983
Big5 Big5 csBig5 CN-Big5 BIG-FIVE BIGFIVE
Big5_HKSCS Big5-HKSCS big5hkscs
Big5_Solaris ?
Cp037 IBM037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037
Cp1006 ?
Cp1025 ?
Cp1026 IBM1026 csIBM1026
Cp1046 ?
Cp1047 IBM1047 IBM-1047
Cp1097 ?
Cp1098 ?
Cp1112 ?
Cp1122 ?
Cp1123 ?
Cp1124 ?
Cp1140 IBM01140 CCSID01140 CP01140 ebcdic-us-37+euro
Cp1141 IBM01141 CCSID01141 CP01141 ebcdic-de-273+euro
Cp1142 IBM01142 CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro
Cp1143 IBM01143 CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro
Cp1144 IBM01144 CCSID01144 CP01144 ebcdic-it-280+euro
Cp1145 IBM01145 CCSID01145 CP01145 ebcdic-es-284+euro
Cp1146 IBM01146 CCSID01146 CP01146 ebcdic-gb-285+euro
Cp1147 IBM01147 CCSID01147 CP01147 ebcdic-fr-297+euro
Cp1148 IBM01148 CCSID01148 CP01148 ebcdic-international-500+euro
Cp1149 IBM01149 CCSID01149 CP01149 ebcdic-is-871+euro
Cp1250 windows-1250
Cp1251 windows-1251
Cp1252 windows-1252
Cp1253 windows-1253
Cp1254 windows-1254
Cp1255 windows-1255
Cp1256 windows-1256
Cp1257 windows-1257
Cp1258 windows-1258
Cp1381 ?
Cp1383 ?
Cp273 IBM273 csIBM273
Cp277 IBM277 EBCDIC-CP-DK EBCDIC-CP-NO csIBM277
Cp278 IBM278 CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278
Cp280 IBM280 ebcdic-cp-it csIBM280
Cp284 IBM284 ebcdic-cp-es csIBM284
Cp285 IBM285 ebcdic-cp-gb csIBM285
Cp297 IBM297 ebcdic-cp-fr csIBM297
Cp33722 ?
Cp420 IBM420 ebcdic-cp-ar1 csIBM420
Cp424 IBM424 ebcdic-cp-he csIBM424
Cp437 IBM437 437 csPC8CodePage437
Cp500 IBM500 ebcdic-cp-be ebcdic-cp-ch csIBM500
Cp737 ?
Cp775 IBM775 csPC775Baltic
Cp838 IBM-Thai
Cp850 IBM850 850 csPC850Multilingual
Cp852 IBM852 852 csPCp852
Cp855 IBM855 855 csIBM855
Cp856 ?
Cp857 IBM857 857 csIBM857
Cp858 IBM00858 CCSID00858 CP00858 PC-Multilingual-850+euro
Cp860 IBM860 860 csIBM860
Cp861 IBM861 861 cp-is csIBM861
Cp862 IBM862 862 csPC862LatinHebrew
Cp863 IBM863 863 csIBM863
Cp864 IBM864 cp864 csIBM864
Cp865 IBM865 865 csIBM865
Cp866 IBM866 866 csIBM866
Cp868 IBM868 cp-ar csIBM868
Cp869 IBM869 cp-gr csIBM869
Cp870 IBM870 ebcdic-cp-roece ebcdic-cp-yu csIBM870
Cp871 IBM871 ebcdic-cp-is csIBM871
Cp875 ?
Cp918 IBM918 ebcdic-cp-ar2 csIBM918
Cp921 ?
Cp922 ?
Cp930 ?
Cp933 ?
Cp935 ?
Cp937 ?
Cp939 ?
Cp942 ?
Cp942C ?
Cp943 ?
Cp943C ?
Cp948 ?
Cp949 ?
Cp949C ?
Cp950 ?
Cp964 ?
Cp970 ?
EUC_CN GB2312 x-EUC-CN csGB2312 euccn euc-cn gb2312-80 gb2312-1980 CN-GB CN-GB-ISOIR165
EUC_JP EUC-JP csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese eucjis x-eucjp eucjp x-euc-jp
EUC_JP_LINUX ?
EUC_JP_Solaris ?
EUC_KR EUC-KR csEUCKR ksc5601 5601 ksc5601_1987 ksc_5601 ksc5601-1987 ks_c_5601-1987 euckr
EUC_TW EUC-TW x-EUC-TW cns11643 euctw
GB18030 GB18030 gb18030-2000
GBK windows-936 CP936 MS936 ms_936 x-mswin-936
ISCII91 ? x-ISCII91 iscii
ISO2022CN ISO-2022-CN
ISO2022JP ISO-2022-JP csISO2022JP JIS jis_encoding csjisencoding
ISO2022KR ISO-2022-KR csISO2022KR
ISO2022_CN_CNS ?
ISO2022_CN_GB ?
ISO8859_1 ISO-8859-1 ISO_8859-1:1987 iso-ir-100 ISO_8859-1 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 819 IBM-819 ISO8859-1 ISO_8859_1
ISO8859_13 ISO-8859-13
ISO8859_15 ISO-8859-15 ISO_8859-15 Latin-9 8859_15 csISOlatin9 IBM923 cp923 923 L9 IBM-923 ISO8859-15 LATIN9 LATIN0 csISOlatin0 ISO8859_15_FDIS
ISO8859_2 ISO-8859-2 ISO_8859-2:1987 iso-ir-101 ISO_8859-2 latin2 l2 csISOLatin2 8859_2 iso8859_2
ISO8859_3 ISO-8859-3 ISO_8859-3:1988 iso-ir-109 ISO_8859-3 latin3 l3 csISOLatin3 8859_3
ISO8859_4 ISO-8859-4 ISO_8859-4:1988 iso-ir-110 ISO_8859-4 latin4 l4 csISOLatin4 8859_4
ISO8859_5 ISO-8859-5 ISO_8859-5:1988 iso-ir-144 ISO_8859-5 cyrillic csISOLatinCyrillic 8859_5
ISO8859_6 ISO-8859-6 ISO_8859-6:1987 iso-ir-127 ISO_8859-6 ECMA-114 ASMO-708 arabic csISOLatinArabic 8859_6
ISO8859_7 ISO-8859-7 ISO_8859-7:1987 iso-ir-126 ISO_8859-7 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 sun_eu_greek
ISO8859_8 ISO-8859-8 ISO_8859-8:1988 iso-ir-138 ISO_8859-8 hebrew csISOLatinHebrew 8859_8
ISO8859_9 ISO-8859-9 ISO_8859-9:1989 iso-ir-148 ISO_8859-9 latin5 l5 csISOLatin5 8859_9
JISAutoDetect ?
JIS_C6626-1983 JIS_C6626-1983 x-JIS0208 JIS0208 csISO87JISX0208 x0208 JIS_X0208-1983 iso-ir-87
JIS_X0201 JIS_X0201 X0201 JIS0201 csHalfWidthKatakana
JIS_X0212-1990 JIS_X0212-1990 iso-ir-159 x0212 JIS0212 csISO159JISX02121990
KOI8_R KOI8-R csKOI8R koi8
MS874 windows-874 cp874
MS932 Windows-31J windows-932 csWindows31J x-ms-cp932
MS949 windows-949 windows949 ms_949 x-windows-949
MS950 windows-950 x-windows-950
MS950_HKSCS
MacArabic ?
MacCentralEurope ?
MacCroatian ?
MacCyrillic ?
MacDingbat ?
MacGreek MacGreek
MacHebrew ?
MacIceland ?
MacRoman MacRoman Macintosh MAC csMacintosh
MacRomania ?
MacSymbol ?
MacThai ?
MacTurkish ?
MacUkraine ?
SJIS Shift_JIS MS_Kanji csShiftJIS shift-jis x-sjis pck
TIS620 TIS-620
UTF-16 UTF-16 UTF_16
UTF8 UTF-8
UnicodeBig ?
UnicodeBigUnmarked UTF-16BE X-UTF-16BE UTF_16BE ISO-10646-UCS-2
UnicodeLittle ?
UnicodeLittleUnmarked UTF-16LE UTF_16LE X-UTF-16LE
x-Johab johab johab cp1361 ms1361 ksc5601-1992 ksc5601_1992
x-iso-8859-11 ?

version
$Id: CharsetUtil.java,v 1.1 2004/10/25 07:26:46 ntherning Exp $

Fields Summary
private static Log
log
private static Charset[]
JAVA_CHARSETS
private static TreeSet
decodingSupported
Contains the canonical names of character sets which can be used to decode bytes into Java chars.
private static TreeSet
encodingSupported
Contains the canonical names of character sets which can be used to encode Java chars into bytes.
private static HashMap
charsetMap
Maps character set names to Charset objects. All possible names of a charset will be mapped to the Charset.
public static final String
CRLF
carriage return - line feed sequence
public static final int
CR
US-ASCII CR, carriage return (13)
public static final int
LF
US-ASCII LF, line feed (10)
public static final int
SP
US-ASCII SP, space (32)
public static final int
HT
US-ASCII HT, horizontal-tab (9)
public static final Charset
US_ASCII
public static final Charset
ISO_8859_1
public static final Charset
UTF_8
Constructors Summary
Methods Summary
public static java.nio.charset.CharsetgetCharset(java.lang.String charsetName)

        // Name of the charset used when none is given or the given one cannot be resolved.
        final String fallbackName = "ISO-8859-1";

        // A null name means the caller has no preference, so resolve the fallback instead.
        final String requestedName = (charsetName != null) ? charsetName : fallbackName;

        try {
            return java.nio.charset.Charset.forName(requestedName);
        } catch (IllegalCharsetNameException illegalName) {
            // The requested name was rejected as illegal: log it and return the fallback charset.
            log.info("Illegal charset " + requestedName + ", fallback to " + fallbackName + ": " + illegalName);
            return java.nio.charset.Charset.forName(fallbackName);
        } catch (UnsupportedCharsetException unsupported) {
            // The requested charset is not available: log it and return the fallback charset.
            log.info("Unsupported charset " + requestedName + ", fallback to " + fallbackName + ": " + unsupported);
            return java.nio.charset.Charset.forName(fallbackName);
        }
        
    
public static booleanisDecodingSupported(java.lang.String charsetName)
Determines if the VM supports decoding (bytes to chars) the specified character set. NOTE: the given character set name may not be known to the VM even if this method returns true. Use {@link #toJavaCharset(String)} to get the canonical Java character set name.

param
charsetName the character set name.
return
true if decoding is supported, false otherwise.

        // A null name cannot identify a supported character set; answer false
        // rather than failing with a NullPointerException in toLowerCase().
        if (charsetName == null) {
            return false;
        }
        // Membership is tested against the lower-cased form of the given name.
        return decodingSupported.contains(charsetName.toLowerCase());
    
public static booleanisEncodingSupported(java.lang.String charsetName)
Determines if the VM supports encoding (chars to bytes) the specified character set. NOTE: the given character set name may not be known to the VM even if this method returns true. Use {@link #toJavaCharset(String)} to get the canonical Java character set name.

param
charsetName the character set name.
return
true if encoding is supported, false otherwise.

        // A null name cannot identify a supported character set; answer false
        // rather than failing with a NullPointerException in toLowerCase().
        if (charsetName == null) {
            return false;
        }
        // Membership is tested against the lower-cased form of the given name.
        return encodingSupported.contains(charsetName.toLowerCase());
    
public static booleanisWhitespace(char ch)
Returns true if the specified character is a whitespace character (CR, LF, SP or HT). ANDROID: COPIED FROM A NEWER VERSION OF MIME4J

param
ch character to test.
return
true if the specified character is a whitespace character, false otherwise.


                                                                       
         
        // Blank characters first (space, horizontal tab) ...
        if (ch == SP || ch == HT) {
            return true;
        }
        // ... then the two line-break characters (carriage return, line feed).
        return ch == CR || ch == LF;
    
public static booleanisWhitespace(java.lang.String s)
Returns true if the specified string consists entirely of whitespace characters. ANDROID: COPIED FROM A NEWER VERSION OF MIME4J

param
s string to test.
return
true if the specified string consists entirely of whitespace characters, false otherwise.

        if (s == null) {
            throw new IllegalArgumentException("String may not be null");
        }
        // Advance past leading whitespace; the string is all whitespace exactly
        // when the scan reaches the end (an empty string therefore yields true).
        final int end = s.length();
        int pos = 0;
        while (pos < end && isWhitespace(s.charAt(pos))) {
            pos++;
        }
        return pos == end;
    
public static java.lang.StringtoJavaCharset(java.lang.String charsetName)
Gets the canonical Java character set name for the specified character set or null if not known. This should be called before doing any conversions using the Java API. NOTE: you must use {@link #isEncodingSupported(String)} or {@link #isDecodingSupported(String)} to make sure the returned Java character set is supported by the current VM.

param
charsetName the character set name to look for.
return
the canonical Java name or null if not known.

        // A null name is treated like any other unknown name: return null
        // instead of failing with a NullPointerException in toLowerCase().
        if (charsetName == null) {
            return null;
        }
        // The map is queried with the lower-cased form of the given name.
        Charset entry = (Charset) charsetMap.get(charsetName.toLowerCase());
        // No entry means the name is not known to this class.
        return (entry != null) ? entry.canonical : null;
    
public static java.lang.StringtoMimeCharset(java.lang.String charsetName)
Gets the preferred MIME character set name for the specified character set or null if not known.

param
charsetName the character set name to look for.
return
the MIME preferred name or null if not known.

        // A null name is treated like any other unknown name: return null
        // instead of failing with a NullPointerException in toLowerCase().
        if (charsetName == null) {
            return null;
        }
        // The map is queried with the lower-cased form of the given name.
        Charset entry = (Charset) charsetMap.get(charsetName.toLowerCase());
        // No entry means the name is not known to this class.
        return (entry != null) ? entry.mime : null;