File | Doc | Category | Size | Date | Package
UTF_8_Reader.java | API Doc | phoneME MR2 API (J2ME) | 13432 | Wed May 02 18:00:02 BST 2007 | com.sun.cldc.i18n.j2me

UTF_8_Reader

public class UTF_8_Reader extends com.sun.cldc.i18n.StreamReader
Reader for UTF-8 encoded input streams.

Fields Summary
private static final int
NO_BYTE
signals that no byte is available, but not the end of stream
private static final int
RC
'replacement character' [Unicode 1.1.0]
private int[]
readAhead
Read-ahead buffer that holds the bytes of a partially received character left over from the last read. The buffer is needed in only one situation: after a number of characters (at least one) have been read, the next character is encoded by several bytes (for example 4), of which only some (for example 3) are already available in the input stream. In this case read() finishes without waiting for the remaining bytes of the character.
private static final int
READ_AHEAD_SIZE
protected int
pendingSurrogate
If non-zero, the last read code point must be represented by two surrogate code units, and the low surrogate code unit has not yet been retrieved during the last read operation.
private static final int[]
minimalValidValue
maps the number of extra bytes onto the minimal valid value that may be encoded with this number of bytes
Constructors Summary
public UTF_8_Reader()
Constructs a UTF-8 reader.


         
      
        // One slot per raw byte of a single encoded character.
        this.readAhead = new int[READ_AHEAD_SIZE];
    
Methods Summary
private int getByteOfCurrentChar(int byteOfChar, boolean allowBlockingRead)
Get one of the raw bytes for the current character. The byte first gets read into the read ahead buffer, unless it's already there.

param
byteOfChar which raw byte to get: 0 for the first, 3 for the last. The bytes must be accessed sequentially, that is, the only possible order of byteOfChar values in a series of calls is 0, 1, 2, 3.
param
allowBlockingRead false allows returning NO_BYTE if no byte is available in the input stream; true forces reading.
return
a byte value, NO_BYTE for no byte available or -1 for end of stream
exception
IOException if an I/O error occurs.

        if (readAhead[byteOfChar] != NO_BYTE) {
            return readAhead[byteOfChar];
        }

        /*
         * allowBlockingRead will be true for the first character.
         * Our read method must block until it gets one char so don't call
         * available() for the first character.
         */
        if (allowBlockingRead || in.available() > 0) {
            readAhead[byteOfChar] = in.read();
        }

        return readAhead[byteOfChar];
    
public void mark(int readAheadLimit)
Marking a read-ahead position is not supported for UTF-8 readers.

param
readAheadLimit number of characters to buffer ahead
exception
IOException is thrown, for all calls to this method because marking is not supported for UTF8 readers

        throw new IOException("mark() not supported");
    
public boolean markSupported()
Tell whether this reader supports the mark() operation. The UTF-8 implementation always returns false because it does not support mark().

return
false

        /*
         * A reader's mark() limit is expressed in characters, while a
         * UTF-8 character occupies a variable number of bytes. The limit
         * therefore cannot simply be forwarded to the underlying byte
         * InputStream, as other readers do, and this reader does not
         * support mark() at this time.
         */
        return false;
    
public java.io.Reader open(java.io.InputStream in, java.lang.String enc)

        super.open(in, enc);
        /*
         * Discard all decoder state that could be left over from a
         * previously opened stream: the partially read character in the
         * read ahead buffer and a low surrogate that was not yet
         * delivered. Without clearing pendingSurrogate, the first read()
         * on the new stream would return a char belonging to the old one.
         */
        prepareForNextChar(NO_BYTE);
        pendingSurrogate = 0;
        return this;
    
private void prepareForNextChar(int headByte)
Prepare the reader for the next character by clearing the look ahead buffer.

param
headByte value of the first byte. If the previous sequence was interrupted by a malformed byte, this byte should be moved to the head of the next sequence; otherwise NO_BYTE

        readAhead[0] = headByte;
        for (int i=1; i<READ_AHEAD_SIZE; i++) {
            readAhead[i]=NO_BYTE;
        }
    
public int read(char[] cbuf, int off, int len)
Read a block of UTF8 characters.

param
cbuf output buffer for converted characters read
param
off initial offset into the provided buffer
param
len length of characters in the buffer
return
the number of converted characters
exception
IOException is thrown if the input stream could not be read for the raw unconverted character

                                                               
              
        /*
         * Decodes UTF-8 bytes into UTF-16 code units stored in cbuf
         * starting at off. Blocks only for the first character of the
         * call; further characters are decoded only while bytes are
         * available. Malformed input is replaced with RC.
         * Returns the number of chars stored, or -1 if the end of stream
         * is reached before anything was stored.
         */
        int count = 0;
        int firstByte;
        int extraBytes;
        int currentChar = 0;
        int nextByte;

        if (len == 0) {
            return 0;
        }
        if (pendingSurrogate != 0) {
            // deliver the low surrogate that did not fit into the buffer
            // of the previous call
            cbuf[off + count] = (char)pendingSurrogate;
            count++;
            pendingSurrogate = 0;
            if (len == 1) {
                return 1;
            }
        }

        while (count < len) {
            /*
             * Byte that interrupted the current sequence and must start
             * the next one. It has to be reset for every character:
             * otherwise a byte carried over once would be put back at the
             * head of the buffer after each following character and be
             * decoded again and again.
             */
            int headByte = NO_BYTE;

            // must wait for the first character, and
            // other characters are read only if they are available
            final boolean mustBlockTillGetsAChar = (0 == count);
            firstByte = getByteOfCurrentChar(0, mustBlockTillGetsAChar);
            if (firstByte < 0) {
                if (firstByte == -1 && count == 0) {
                    // end of stream
                    return -1;
                }

                // no byte available right now, or end of stream after
                // some chars were already decoded
                return count;
            }
            /* Let's reduce amount of case-mode comparisons */
            if ((firstByte&0x80) == 0) {
                /* 7 bits: 0xxx xxxx */
                extraBytes = 0;
                currentChar = firstByte;
            } else {
                switch (firstByte >> 4) {
                case 12: case 13:
                    /* 11 bits: 110x xxxx   10xx xxxx */
                    extraBytes = 1;
                    currentChar = firstByte & 0x1F;
                    break;

                case 14:
                    /* 16 bits: 1110 xxxx  10xx xxxx  10xx xxxx */
                    extraBytes = 2;
                    currentChar = firstByte & 0x0F;
                    break;

                case 15:
                    if ((firstByte&0x08)==0) {
                        /* 21 bits: 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx */
                        extraBytes = 3;
                        currentChar = firstByte & 0x07;
                        break;
                    } // else as default

                default:
                    /* we do replace malformed character with special symbol */
                    extraBytes = 0;
                    currentChar = RC;
                }
            }

            for (int j = 1; j <= extraBytes; j++) {
                nextByte = getByteOfCurrentChar(j, mustBlockTillGetsAChar);
                if (nextByte == NO_BYTE) {
                    // done for now, comeback later for the rest of char;
                    // the bytes read so far stay in the read ahead buffer
                    return count;
                }

                if (nextByte == -1) {
                    // end of stream in the middle of char -- set 'RC'
                    currentChar = RC;
                    break;
                }

                if ((nextByte & 0xC0) != 0x80) {
                    // invalid byte - move it at head of next read sequence
                    currentChar = RC;
                    headByte = nextByte;
                    break;
                }

                // each extra byte has 6 bits more of the char
                currentChar = (currentChar << 6) + (nextByte & 0x3F);
            }

            if (currentChar < minimalValidValue[extraBytes]) {
                // the character is malformed: it should be encoded
                // with a shorter sequence of bytes
                currentChar = RC;
                cbuf[off + count] = (char)currentChar;
                count++;
            } else if (currentChar <= 0xd7ff
             // d800...dbff and dc00...dfff are high and low surrogate code
             // points, they do not represent characters
             || (0xe000 <= currentChar && currentChar <= 0xffff)) {
                cbuf[off + count] = (char)currentChar;
                count++;
            } else if (0xffff < currentChar && currentChar <= 0x10ffff) {
                // supplementary code point: split into a surrogate pair
                int highSurrogate = 0xd800 | ((currentChar-0x10000) >> 10);
                int lowSurrogate = 0xdc00 | (currentChar & 0x3ff);
                cbuf[off + count] = (char)highSurrogate;
                count++;
                if (count < len) {
                    cbuf[off + count] = (char)lowSurrogate;
                    count++;
                } else {
                    // no room left: hand the low surrogate out on the
                    // next call
                    pendingSurrogate=lowSurrogate;
                }
            } else {
                // encoded surrogate code point or value above 0x10ffff
                currentChar = RC;
                cbuf[off + count] = (char)currentChar;
                count++;
            }
            prepareForNextChar(headByte);
        }
        return count;
    
public void reset()
Resetting to a read-ahead mark is not supported for UTF-8 readers.

exception
IOException is thrown, for all calls to this method because marking is not supported for UTF8 readers

        throw new IOException("reset() not supported");
    
public int sizeOf(byte[] array, int offset, int length)
Get the size in chars of an array of bytes.

param
array Source buffer
param
offset Offset at which to start counting characters
param
length number of bytes to use for counting
return
number of characters that would be converted

        /*
         * Walks the bytes array[offset] .. array[offset + length - 1] and
         * counts the chars the sequences found there stand for, judging
         * by the lead byte and the form of the extra bytes only. The
         * encoded values themselves are not decoded.
         */
        int count = 0;
        int endOfArray;
        int extraBytes;

        for (endOfArray = offset + length; offset < endOfArray; ) {
            int oldCount = count;
            // every sequence, valid or not, yields at least one char
            count++;
            /* Reduce amount of case-mode comparisons */
            if ((array[offset]&0x80) == 0) {
                extraBytes = 0;
            } else {
                switch (((int)array[offset] & 0xff) >> 4) {
                case 12: case 13:
                    /* 11 bits: 110x xxxx   10xx xxxx */
                    extraBytes = 1;
                    break;

                case 14:
                    /* 16 bits: 1110 xxxx  10xx xxxx  10xx xxxx */
                    extraBytes = 2;
                    break;

                case 15:
                    if (((int)array[offset] & 0x08)==0) {
                        /* 21 bits: 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx */
                        // we imply that the 5 high bits are not all zeroes,
                        // so the value takes two chars (a surrogate pair)
                        extraBytes = 3;
                        count++;
                        break;
                    } // else as default

                default:
                    /*
                     * this byte will be replaced with 'RC'
                     */
                    extraBytes = 0;
                }
            }
            offset++;
            // test if extra bytes are present and in form 10xx xxxx
            while (extraBytes-- > 0) {
                if (offset >= endOfArray
                        || (((int)array[offset]) & 0xC0) != 0x80) {
                    /*
                     * Broken sequence: either cut off at the array tail
                     * or interrupted by a byte that is not an extra
                     * byte. The broken char is replaced with a single
                     * 'RC', so it counts as exactly one char even if two
                     * were assumed above. An interrupting byte is not
                     * consumed here; it starts the next sequence.
                     */
                    count = oldCount + 1;
                    break;
                }
                offset++;
            }
        }

        return count;