UTF_8_Readerpublic class UTF_8_Reader extends com.sun.cldc.i18n.StreamReader Reader for UTF-8 encoded input streams. |
Fields Summary |
---|
private static final int | NO_BYTEsignals that no byte is available, but not the end of stream | private static final int | RC'replacement character' [Unicode 1.1.0] | private int[] | readAheadread ahead buffer to hold a part of char from the last read.
The only case in which this buffer is needed is the following:
after a number of characters (at least one) have been read,
the next character is encoded by 4 bytes, of which only 3 are
already available in the input stream. In this case read()
will finish without waiting for the last byte of the character. | private static final int | READ_AHEAD_SIZE | protected int | pendingSurrogateIf non-zero, the last read code point must be represented by two
surrogate code units, and the low surrogate code unit has not yet
been retrieved during the last read operation. | private static final int[] | minimalValidValuemaps the number of extra bytes onto the minimal valid value that may
be encoded with this number of bytes |
Constructors Summary |
---|
public UTF_8_Reader()Constructs a UTF-8 reader.
// Allocate the read ahead buffer: one int slot per raw byte position,
// READ_AHEAD_SIZE slots in total.
readAhead = new int[READ_AHEAD_SIZE];
|
Methods Summary |
---|
private int | getByteOfCurrentChar(int byteOfChar, boolean allowBlockingRead)Get one of the raw bytes for the current character.
The byte first gets read into the read ahead buffer, unless
it's already there.
if (readAhead[byteOfChar] != NO_BYTE) {
return readAhead[byteOfChar];
}
/*
* allowBlockingRead will be true for the first character.
* Our read method must block until it gets one char so don't call
* available() for the first character.
*/
if (allowBlockingRead || in.available() > 0) {
readAhead[byteOfChar] = in.read();
}
return readAhead[byteOfChar];
| public void | mark(int readAheadLimit)Marking a read ahead position is not supported for UTF8
readers.
// Always rejected: this reader does not implement mark(), so every call
// fails with an IOException regardless of readAheadLimit.
throw new IOException("mark() not supported");
| public boolean | markSupported()Tell whether this reader supports the mark() operation.
The UTF-8 implementation always returns false because it does not
support mark().
/*
 * For readers, mark() is expressed in characters. Because UTF-8
 * characters are variable length, the request cannot simply be
 * forwarded to the underlying byte InputStream like other readers do.
 * So this reader does not support mark at this time.
 */
return false;
| public java.io.Reader | open(java.io.InputStream in, java.lang.String enc)
// Delegate to the superclass open() with the same stream and encoding.
super.open(in, enc);
// Start from a clean read ahead buffer: no head byte is carried over,
// so every slot is set to NO_BYTE.
prepareForNextChar(NO_BYTE);
// Return this reader itself so the caller can use it directly.
return this;
| private void | prepareForNextChar(int headByte)Prepare the reader for the next character by clearing the look
ahead buffer.
readAhead[0] = headByte;
for (int i=1; i<READ_AHEAD_SIZE; i++) {
readAhead[i]=NO_BYTE;
}
| public int | read(char[] cbuf, int off, int len)Read a block of UTF8 characters.
/*
 * Decodes UTF-8 bytes from the underlying stream into cbuf[off..off+len).
 *
 * cbuf - destination buffer
 * off  - index in cbuf at which to store the first char
 * len  - maximum number of chars to store
 *
 * Returns the number of chars stored, 0 when len is 0, or -1 when the
 * end of the stream is reached before any char could be produced.
 * Only the first character may block; later characters are decoded
 * only while their bytes are available. Malformed input is replaced
 * with RC (the replacement character).
 */
int count = 0;
int firstByte;
int extraBytes;
int currentChar = 0;
int nextByte;

if (len == 0) {
    return 0;
}

// A low surrogate left over from the previous call is delivered first.
if (pendingSurrogate != 0) {
    cbuf[off + count] = (char)pendingSurrogate;
    count++;
    pendingSurrogate = 0;
    if (len == 1) {
        return 1;
    }
}

while (count < len) {
    /*
     * The byte to carry over as the head of the next character. It is
     * scoped per character on purpose: if it outlived the iteration,
     * a byte pushed back after one malformed sequence would be put
     * back at the head of the buffer after every following character
     * and decoded again and again.
     */
    int headByte = NO_BYTE;

    // must wait for the first character, and
    // other characters are read only if they are available
    final boolean mustBlockTillGetsAChar = (0 == count);
    firstByte = getByteOfCurrentChar(0, mustBlockTillGetsAChar);
    if (firstByte < 0) {
        if (firstByte == -1 && count == 0) {
            // end of stream
            return -1;
        }
        // end of stream or nothing available: return what we have
        return count;
    }

    /* Let's reduce amount of case-mode comparisons */
    if ((firstByte & 0x80) == 0) {
        // 7 bits: 0xxx xxxx
        extraBytes = 0;
        currentChar = firstByte;
    } else {
        switch (firstByte >> 4) {
        case 12: case 13:
            /* 11 bits: 110x xxxx   10xx xxxx */
            extraBytes = 1;
            currentChar = firstByte & 0x1F;
            break;
        case 14:
            /* 16 bits: 1110 xxxx  10xx xxxx  10xx xxxx */
            extraBytes = 2;
            currentChar = firstByte & 0x0F;
            break;
        case 15:
            if ((firstByte & 0x08) == 0) {
                /* 21 bits: 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx */
                extraBytes = 3;
                currentChar = firstByte & 0x07;
                break;
            } // else as default
        default:
            /* we do replace malformed character with special symbol */
            extraBytes = 0;
            currentChar = RC;
        }
    }

    for (int j = 1; j <= extraBytes; j++) {
        nextByte = getByteOfCurrentChar(j, mustBlockTillGetsAChar);
        if (nextByte == NO_BYTE) {
            // done for now, come back later for the rest of the char;
            // the bytes read so far stay in the read ahead buffer
            return count;
        }
        if (nextByte == -1) {
            // end of stream in the middle of char -- set 'RC'
            currentChar = RC;
            break;
        }
        if ((nextByte & 0xC0) != 0x80) {
            // invalid byte - move it at head of next read sequence
            currentChar = RC;
            headByte = nextByte;
            break;
        }
        // each extra byte has 6 bits more of the char
        currentChar = (currentChar << 6) + (nextByte & 0x3F);
    }

    if (currentChar < minimalValidValue[extraBytes]) {
        // the character is malformed: it should be encoded
        // with a shorter sequence of bytes
        currentChar = RC;
        cbuf[off + count] = (char)currentChar;
        count++;
    } else if (currentChar <= 0xd7ff
            // d800...dbff and dc00...dfff are high and low surrogate code
            // points, they do not represent characters
            || (0xe000 <= currentChar && currentChar <= 0xffff)) {
        cbuf[off + count] = (char)currentChar;
        count++;
    } else if (0xffff < currentChar && currentChar <= 0x10ffff) {
        // supplementary code point: split into a surrogate pair
        int highSurrogate = 0xd800 | ((currentChar - 0x10000) >> 10);
        int lowSurrogate = 0xdc00 | (currentChar & 0x3ff);
        cbuf[off + count] = (char)highSurrogate;
        count++;
        if (count < len) {
            cbuf[off + count] = (char)lowSurrogate;
            count++;
        } else {
            // no room left: deliver the low surrogate on the next call
            pendingSurrogate = lowSurrogate;
        }
    } else {
        // a surrogate code point or a value above 0x10ffff
        currentChar = RC;
        cbuf[off + count] = (char)currentChar;
        count++;
    }

    // clear the buffer, keeping only a pushed-back byte (if any)
    prepareForNextChar(headByte);
}
return count;
| public void | reset()Resetting to a read ahead mark is not supported for UTF8 readers.
// Always rejected: there is no mark to return to because this reader
// does not implement mark(), so every call fails with an IOException.
throw new IOException("reset() not supported");
| public int | sizeOf(byte[] array, int offset, int length)Get the size in chars of an array of bytes.
/*
 * Counts the chars that the bytes array[offset..offset+length) decode to.
 *
 * array  - source bytes
 * offset - index of the first byte to examine
 * length - number of bytes to examine
 *
 * Returns the char count. A well formed 4-byte sequence counts as two
 * chars (a surrogate pair); any other sequence, including a truncated
 * or interrupted one that is replaced with 'RC', counts as one char.
 * The decoded value itself is not validated here.
 */
int count = 0;
int extraBytes;
final int endOfArray = offset + length;

while (offset < endOfArray) {
    // chars contributed by this sequence if it turns out well formed
    int charsForSequence = 1;
    final int head = array[offset] & 0xff;

    /* Reduce amount of case-mode comparisons */
    if ((head & 0x80) == 0) {
        extraBytes = 0;
    } else {
        switch (head >> 4) {
        case 12: case 13:
            /* 11 bits: 110x xxxx   10xx xxxx */
            extraBytes = 1;
            break;
        case 14:
            /* 16 bits: 1110 xxxx  10xx xxxx  10xx xxxx */
            extraBytes = 2;
            break;
        case 15:
            if ((head & 0x08) == 0) {
                /* 21 bits: 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx */
                // we imply that the 5 high bits are not all zeroes,
                // i.e. the value needs a surrogate pair
                extraBytes = 3;
                charsForSequence = 2;
                break;
            } // else as default
        default:
            /*
             * this byte will be replaced with 'RC'
             */
            extraBytes = 0;
        }
    }
    offset++;

    // test if extra bytes are present and in form 10xx xxxx
    while (extraBytes-- > 0) {
        if (offset >= endOfArray
                || (array[offset] & 0xC0) != 0x80) {
            // Broken sequence: either the array ends in the middle of
            // the char or a non-continuation byte interrupts it. The
            // whole sequence becomes a single 'RC', so it counts as one
            // char even if the lead byte announced a surrogate pair.
            // An interrupting byte is not consumed; it starts the next
            // sequence.
            charsForSequence = 1;
            break;
        }
        offset++;
    }
    count += charsForSequence;
}
return count;
|
|