UTF8Reader: public class UTF8Reader extends Reader
Fields Summary |
---|
public static final int | DEFAULT_BUFFER_SIZE: Default byte buffer size (2048). | private static final boolean | DEBUG_READ: Debug read. | protected final InputStream | fInputStream: Input stream. | protected final byte[] | fBuffer: Byte buffer. | protected int | fOffset: Offset into buffer. | private int | fSurrogate: Surrogate character. | private final org.apache.xerces.util.MessageFormatter | fFormatter | private final Locale | fLocale |
Constructors Summary |
---|
public UTF8Reader(InputStream inputStream): Constructs a UTF-8 reader from the specified input stream
using the default buffer size. Primarily for testing.
//
// Constructors
//
// Delegates to the (InputStream, int, MessageFormatter, Locale) constructor,
// supplying DEFAULT_BUFFER_SIZE, a new XMLMessageFormatter and the JVM's
// default locale.
this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
| public UTF8Reader(InputStream inputStream, org.apache.xerces.util.MessageFormatter messageFormatter, Locale locale): Constructs a UTF-8 reader from the specified input stream
using the default buffer size and the given MessageFormatter.
// Delegates to the size-based constructor, using DEFAULT_BUFFER_SIZE as the
// byte buffer size.
this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
| public UTF8Reader(InputStream inputStream, int size, org.apache.xerces.util.MessageFormatter messageFormatter, Locale locale): Constructs a UTF-8 reader from the specified input stream,
buffer size and MessageFormatter.
// Allocates a new byte buffer of the requested size and delegates to the
// buffer-based constructor.
this(inputStream, new byte[size], messageFormatter, locale);
| public UTF8Reader(InputStream inputStream, byte[] buffer, org.apache.xerces.util.MessageFormatter messageFormatter, Locale locale): Constructs a UTF-8 reader from the specified input stream,
buffer and MessageFormatter.
// Keep references to the caller-supplied collaborators. The byte buffer is
// stored as given (not copied), so this reader decodes through that array.
this.fBuffer = buffer;
this.fInputStream = inputStream;
this.fLocale = locale;
this.fFormatter = messageFormatter;
|
Methods Summary |
---|
public void | close(): Close the stream. Once a stream has been closed, further read(),
ready(), mark(), or reset() invocations will throw an IOException.
Closing a previously-closed stream, however, has no effect.
// Closing this reader only closes the wrapped input stream; no state of
// this reader itself is changed here.
fInputStream.close();
| private void | expectedByte(int position, int count): Throws an exception for expected byte.
throw new MalformedByteSequenceException(fFormatter,
fLocale,
XMLMessageFormatter.XML_DOMAIN,
"ExpectedByte",
new Object[] {Integer.toString(position), Integer.toString(count)});
| private void | invalidByte(int position, int count, int c): Throws an exception for invalid byte.
throw new MalformedByteSequenceException(fFormatter,
fLocale,
XMLMessageFormatter.XML_DOMAIN,
"InvalidByte",
new Object [] {Integer.toString(position), Integer.toString(count)});
| private void | invalidSurrogate(int uuuuu): Throws an exception for invalid surrogate bits.
throw new MalformedByteSequenceException(fFormatter,
fLocale,
XMLMessageFormatter.XML_DOMAIN,
"InvalidHighSurrogate",
new Object[] {Integer.toHexString(uuuuu)});
| public void | mark(int readAheadLimit): Mark the present position in the stream. Subsequent calls to reset()
will attempt to reposition the stream to this point. Not all
character-input streams support the mark() operation.
throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
| public boolean | markSupported(): Tell whether this stream supports the mark() operation.
// Marking is never supported; mark() on this reader always throws.
return false;
| public int | read(): Read a single character. This method will block until a character is
available, an I/O error occurs, or the end of the stream is reached.
Subclasses that intend to support efficient single-character input
should override this method.
// Decodes and returns one UTF-16 code unit, or -1 at end of stream.
// A supplementary character (4-byte UTF-8 sequence) is returned as two
// code units over two calls: the high surrogate now, and the low surrogate
// (parked in fSurrogate) on the next call.
// Throws MalformedByteSequenceException (via expectedByte / invalidByte /
// invalidSurrogate) when the byte sequence is truncated or malformed.
int c = fSurrogate;
if (fSurrogate == -1) {
    // NOTE: We use the index into the buffer if there are remaining
    //       bytes from the last block read. -Ac
    // NOTE(review): this method never updates fOffset, so bytes taken from
    //       fBuffer here stay marked as buffered; verify that is intended.
    int index = 0;

    // get first byte
    int b0 = index == fOffset
           ? fInputStream.read() : fBuffer[index++] & 0x00FF;
    if (b0 == -1) {
        return -1;
    }

    // UTF-8:   [0xxx xxxx]
    // Unicode: [0000 0000] [0xxx xxxx]
    if (b0 < 0x80) {
        c = (char)b0;
    }

    // UTF-8:   [110y yyyy] [10xx xxxx]
    // Unicode: [0000 0yyy] [yyxx xxxx]
    // (b0 & 0x1E) != 0 excludes lead bytes 0xC0/0xC1, i.e. overlong forms.
    else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
        int b1 = index == fOffset
               ? fInputStream.read() : fBuffer[index++] & 0x00FF;
        if (b1 == -1) {
            expectedByte(2, 2);
        }
        if ((b1 & 0xC0) != 0x80) {
            invalidByte(2, 2, b1);
        }
        c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
    }

    // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
    // Unicode: [zzzz yyyy] [yyxx xxxx]
    else if ((b0 & 0xF0) == 0xE0) {
        int b1 = index == fOffset
               ? fInputStream.read() : fBuffer[index++] & 0x00FF;
        if (b1 == -1) {
            expectedByte(2, 3);
        }
        // Besides requiring a continuation byte, reject 0xED followed by
        // 0xA0 or above (encoded surrogates U+D800..U+DFFF) and 0xE0
        // followed by a byte below 0xA0 (overlong form).
        if ((b1 & 0xC0) != 0x80
            || (b0 == 0xED && b1 >= 0xA0)
            || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
            invalidByte(2, 3, b1);
        }
        int b2 = index == fOffset
               ? fInputStream.read() : fBuffer[index++] & 0x00FF;
        if (b2 == -1) {
            expectedByte(3, 3);
        }
        if ((b2 & 0xC0) != 0x80) {
            invalidByte(3, 3, b2);
        }
        c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
            (b2 & 0x003F);
    }

    // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
    // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
    //          [1101 11yy] [yyxx xxxx] (low surrogate)
    //          * uuuuu = wwww + 1
    else if ((b0 & 0xF8) == 0xF0) {
        int b1 = index == fOffset
               ? fInputStream.read() : fBuffer[index++] & 0x00FF;
        if (b1 == -1) {
            expectedByte(2, 4);
        }
        // 0xF0 followed by a byte below 0x90 is an overlong form.
        if ((b1 & 0xC0) != 0x80
            || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
            // This is a 4-byte sequence, so report count 4 (as the block
            // read method does) rather than 3.
            invalidByte(2, 4, b1);
        }
        int b2 = index == fOffset
               ? fInputStream.read() : fBuffer[index++] & 0x00FF;
        if (b2 == -1) {
            expectedByte(3, 4);
        }
        if ((b2 & 0xC0) != 0x80) {
            invalidByte(3, 4, b2);
        }
        int b3 = index == fOffset
               ? fInputStream.read() : fBuffer[index++] & 0x00FF;
        if (b3 == -1) {
            expectedByte(4, 4);
        }
        if ((b3 & 0xC0) != 0x80) {
            invalidByte(4, 4, b3);
        }
        // uuuuu above 0x10 would encode a code point beyond U+10FFFF.
        int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
        if (uuuuu > 0x10) {
            invalidSurrogate(uuuuu);
        }
        int wwww = uuuuu - 1;
        int hs = 0xD800 |
                 ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
                 ((b2 >> 4) & 0x0003);
        int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
        // Return the high surrogate now; keep the low one for the next call.
        c = hs;
        fSurrogate = ls;
    }

    // error
    else {
        invalidByte(1, 1, b0);
    }
}

// use surrogate
else {
    fSurrogate = -1;
}

// return character
if (DEBUG_READ) {
    System.out.println("read(): 0x"+Integer.toHexString(c));
}
return c;
| public int | read(char[] ch, int offset, int length): Read characters into a portion of an array. This method will block
until some input is available, an I/O error occurs, or the end of the
stream is reached.
// read bytes
int out = offset;
int count = 0;
if (fOffset == 0) {
// adjust length to read
if (length > fBuffer.length) {
length = fBuffer.length;
}
// handle surrogate
if (fSurrogate != -1) {
ch[out++] = (char)fSurrogate;
fSurrogate = -1;
length--;
}
// perform read operation
count = fInputStream.read(fBuffer, 0, length);
if (count == -1) {
return -1;
}
count += out - offset;
}
// skip read; last character was in error
// NOTE: Having an offset value other than zero means that there was
// an error in the last character read. In this case, we have
// skipped the read so we don't consume any bytes past the
// error. By signalling the error on the next block read we
// allow the method to return the most valid characters that
// it can on the previous block read. -Ac
else {
count = fOffset;
fOffset = 0;
}
// convert bytes to characters
final int total = count;
int in;
byte byte1;
final byte byte0 = 0;
for (in = 0; in < total; in++) {
byte1 = fBuffer[in];
if (byte1 >= byte0) {
ch[out++] = (char)byte1;
}
else {
break;
}
}
for ( ; in < total; in++) {
byte1 = fBuffer[in];
// UTF-8: [0xxx xxxx]
// Unicode: [0000 0000] [0xxx xxxx]
if (byte1 >= byte0) {
ch[out++] = (char)byte1;
continue;
}
// UTF-8: [110y yyyy] [10xx xxxx]
// Unicode: [0000 0yyy] [yyxx xxxx]
int b0 = byte1 & 0x0FF;
if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
int b1 = -1;
if (++in < total) {
b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
if (b1 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
expectedByte(2, 2);
}
count++;
}
if ((b1 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
invalidByte(2, 2, b1);
}
int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
ch[out++] = (char)c;
count -= 1;
continue;
}
// UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
// Unicode: [zzzz yyyy] [yyxx xxxx]
if ((b0 & 0xF0) == 0xE0) {
int b1 = -1;
if (++in < total) {
b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
if (b1 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
expectedByte(2, 3);
}
count++;
}
if ((b1 & 0xC0) != 0x80
|| (b0 == 0xED && b1 >= 0xA0)
|| ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
invalidByte(2, 3, b1);
}
int b2 = -1;
if (++in < total) {
b2 = fBuffer[in] & 0x00FF;
}
else {
b2 = fInputStream.read();
if (b2 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
expectedByte(3, 3);
}
count++;
}
if ((b2 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fOffset = 3;
return out - offset;
}
invalidByte(3, 3, b2);
}
int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
(b2 & 0x003F);
ch[out++] = (char)c;
count -= 2;
continue;
}
// UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
// [1101 11yy] [yyxx xxxx] (low surrogate)
// * uuuuu = wwww + 1
if ((b0 & 0xF8) == 0xF0) {
int b1 = -1;
if (++in < total) {
b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
if (b1 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
expectedByte(2, 4);
}
count++;
}
if ((b1 & 0xC0) != 0x80
|| ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
invalidByte(2, 4, b1);
}
int b2 = -1;
if (++in < total) {
b2 = fBuffer[in] & 0x00FF;
}
else {
b2 = fInputStream.read();
if (b2 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
expectedByte(3, 4);
}
count++;
}
if ((b2 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fOffset = 3;
return out - offset;
}
invalidByte(3, 4, b2);
}
int b3 = -1;
if (++in < total) {
b3 = fBuffer[in] & 0x00FF;
}
else {
b3 = fInputStream.read();
if (b3 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fOffset = 3;
return out - offset;
}
expectedByte(4, 4);
}
count++;
}
if ((b3 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fBuffer[3] = (byte)b3;
fOffset = 4;
return out - offset;
}
invalidByte(4, 4, b2);
}
// decode bytes into surrogate characters
int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
if (uuuuu > 0x10) {
invalidSurrogate(uuuuu);
}
int wwww = uuuuu - 1;
int zzzz = b1 & 0x000F;
int yyyyyy = b2 & 0x003F;
int xxxxxx = b3 & 0x003F;
int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
// set characters
ch[out++] = (char)hs;
if ((count -= 2) <= length) {
ch[out++] = (char)ls;
}
// reached the end of the char buffer; save low surrogate for the next read
else {
fSurrogate = ls;
--count;
}
continue;
}
// error
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
invalidByte(1, 1, b0);
}
// return number of characters converted
if (DEBUG_READ) {
System.out.println("read(char[],"+offset+',"+length+"): count="+count);
}
return count;
| public boolean | ready(): Tell whether this stream is ready to be read.
// Always reports false: this reader never promises that the next read will
// not block.
return false;
| public void | reset(): Reset the stream. If the stream has been marked, then attempt to
reposition it at the mark. If the stream has not been marked, then
attempt to reset it in some way appropriate to the particular stream,
for example by repositioning it to its starting point. Not all
character-input streams support the reset() operation, and some support
reset() without supporting mark().
// Only the decoder state is cleared here: the count of bytes held over in
// fBuffer is zeroed and any saved low surrogate is discarded. The wrapped
// input stream itself is not repositioned by this code.
fOffset = 0;
fSurrogate = -1;
| public long | skip(long n): Skip characters. This method will block until some characters are
available, an I/O error occurs, or the end of the stream is reached.
// Skips up to n characters by decoding them into a scratch array and
// discarding them. Returns the number of characters actually skipped, which
// is smaller than n if the end of the stream is reached first.
// Throws IllegalArgumentException if n is negative (java.io.Reader contract).
if (n < 0) {
    // Reject a negative count up front instead of letting a negative
    // length reach read(char[], int, int).
    throw new IllegalArgumentException("skip value is negative");
}
long remaining = n;
final char[] ch = new char[fBuffer.length];
// Loop only while something is left to skip, so skip(0) performs no read
// and cannot consume a pending surrogate.
while (remaining > 0) {
    int length = ch.length < remaining ? ch.length : (int)remaining;
    int count = read(ch, 0, length);
    if (count > 0) {
        remaining -= count;
    }
    else {
        // end of stream (or nothing decoded): stop skipping
        break;
    }
}
long skipped = n - remaining;
return skipped;
|
|