HTMLScanner
public class HTMLScanner extends Object implements Scanner, Locator
This class implements a table-driven scanner for HTML, allowing for lots of
defects. It implements the Scanner interface, which accepts a Reader
object to fetch characters from and a ScanHandler object to report lexical
events to. |
Fields Summary |
---|
private static final int | S_ANAME | private static final int | S_APOS | private static final int | S_AVAL | private static final int | S_BB | private static final int | S_BBC | private static final int | S_BBCD | private static final int | S_BBCDA | private static final int | S_BBCDAT | private static final int | S_BBCDATA | private static final int | S_CDATA | private static final int | S_CDATA2 | private static final int | S_CDSECT | private static final int | S_CDSECT1 | private static final int | S_CDSECT2 | private static final int | S_COM | private static final int | S_COM2 | private static final int | S_COM3 | private static final int | S_COM4 | private static final int | S_DECL | private static final int | S_DECL2 | private static final int | S_DONE | private static final int | S_EMPTYTAG | private static final int | S_ENT | private static final int | S_EQ | private static final int | S_ETAG | private static final int | S_GI | private static final int | S_NCR | private static final int | S_PCDATA | private static final int | S_PI | private static final int | S_PITARGET | private static final int | S_QUOT | private static final int | S_STAGC | private static final int | S_TAG | private static final int | S_TAGWS | private static final int | S_XNCR | private static final int | A_ADUP | private static final int | A_ADUP_SAVE | private static final int | A_ADUP_STAGC | private static final int | A_ANAME | private static final int | A_ANAME_ADUP | private static final int | A_ANAME_ADUP_STAGC | private static final int | A_AVAL | private static final int | A_AVAL_STAGC | private static final int | A_CDATA | private static final int | A_CMNT | private static final int | A_DECL | private static final int | A_EMPTYTAG | private static final int | A_ENTITY | private static final int | A_ENTITY_START | private static final int | A_ETAG | private static final int | A_GI | private static final int | A_GI_STAGC | private static final int | A_LT | private static final 
int | A_LT_PCDATA | private static final int | A_MINUS | private static final int | A_MINUS2 | private static final int | A_MINUS3 | private static final int | A_PCDATA | private static final int | A_PI | private static final int | A_PITARGET | private static final int | A_PITARGET_PI | private static final int | A_SAVE | private static final int | A_SKIP | private static final int | A_SP | private static final int | A_STAGC | private static final int | A_UNGET | private static final int | A_UNSAVE_PCDATA | private static int[] | statetable | private static final String[] | debug_actionnames | private static final String[] | debug_statenames | private String | thePublicid | private String | theSystemid | private int | theLastLine | private int | theLastColumn | private int | theCurrentLine | private int | theCurrentColumn | int | theState | int | theNextState | char[] | theOutputBuffer | int | theSize | int[] | theWinMap |
Methods Summary |
---|
public int | getColumnNumber()
// Returns the column saved by the most recent mark() call
// (0 right after resetDocumentLocator()).
return theLastColumn;
| public int | getLineNumber()
// Returns the line saved by the most recent mark() call
// (0 right after resetDocumentLocator()).
return theLastLine;
| public java.lang.String | getPublicId()
// Returns the public identifier last supplied to resetDocumentLocator().
return thePublicid;
| public java.lang.String | getSystemId()
// Returns the system identifier last supplied to resetDocumentLocator().
return theSystemid;
| public static void | main(java.lang.String[] argv)Test procedure. Reads HTML from the standard input and writes
PYX to the standard output.
// Both streams are treated as UTF-8 regardless of the platform default.
Scanner s = new HTMLScanner();
Reader r = new InputStreamReader(System.in, "UTF-8");
Writer w = new OutputStreamWriter(System.out, "UTF-8");
try {
    PYXWriter pw = new PYXWriter(w);
    s.scan(r, pw);
}
finally {
    // Closing flushes the writer, so close it even when scanning fails:
    // whatever was produced before the failure still reaches standard output.
    w.close();
}
| private void | mark()Mark the current scan position as a "point of interest" - start of a tag,
cdata, processing instruction etc.
// Snapshot the running position; getLineNumber() and getColumnNumber()
// report these saved values rather than the live ones.
theLastLine = theCurrentLine;
theLastColumn = theCurrentColumn;
| private static java.lang.String | nicechar(int in)
// Formats a character code as readable text; its only visible use is the
// commented-out debug trace in scan().
//   - newline                -> the two-character escape \n
//   - any other code below 32 -> "0x" followed by its hex value
//                               (this also covers negative codes such as -1)
//   - everything else        -> the character wrapped in single quotes
if (in == '\n') return "\\n";
if (in < 32) return "0x"+Integer.toHexString(in);
return "'"+((char)in)+"'";
| public void | resetDocumentLocator(java.lang.String publicid, java.lang.String systemid)Reset document locator, supplying systemid and publicid.
// Remember the identifiers of the new document.
thePublicid = publicid;
theSystemid = systemid;
// Rewind both the running position and the last marked position to zero.
theCurrentColumn = 0;
theCurrentLine = 0;
theLastColumn = 0;
theLastLine = 0;
| private void | save(int ch, ScanHandler h)
// Appends ch to the output buffer. When fewer than 20 free slots remain the
// buffer is first either flushed or enlarged, depending on the scanner state.
if (theSize >= theOutputBuffer.length - 20) {
    if (theState == S_PCDATA || theState == S_CDATA) {
        // Return a buffer-sized chunk of PCDATA: character data may be
        // delivered to the handler piecemeal, so flush and start over.
        h.pcdata(theOutputBuffer, 0, theSize);
        theSize = 0;
    }
    else {
        // Grow the buffer size: other tokens are reported to the handler as
        // one range starting at offset 0, so what is buffered must be kept.
        char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
        // Only the first theSize chars hold live data; copy exactly those.
        System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize);
        theOutputBuffer = newOutputBuffer;
    }
}
theOutputBuffer[theSize++] = (char)ch;
| public void | scan(java.io.Reader r0, ScanHandler h)Scan HTML source, reporting lexical events.
theState = S_PCDATA;
PushbackReader r;
if (r0 instanceof PushbackReader) {
r = (PushbackReader)r0;
}
else if (r0 instanceof BufferedReader) {
r = new PushbackReader(r0);
}
else {
r = new PushbackReader(new BufferedReader(r0, 200));
}
int firstChar = r.read(); // Remove any leading BOM
if (firstChar != '\uFEFF") unread(r, firstChar);
while (theState != S_DONE) {
int ch = r.read();
// Process control characters
if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];
if (ch == '\r") {
ch = r.read(); // expect LF next
if (ch != '\n") {
unread(r, ch); // nope
ch = '\n";
}
}
if (ch == '\n") {
theCurrentLine++;
theCurrentColumn = 0;
}
else {
theCurrentColumn++;
}
if (!(ch >= 0x20 || ch == '\n" || ch == '\t" || ch == -1)) continue;
// Search state table
int action = 0;
for (int i = 0; i < statetable.length; i += 4) {
if (theState != statetable[i]) {
if (action != 0) break;
continue;
}
if (statetable[i+1] == 0) {
action = statetable[i+2];
theNextState = statetable[i+3];
}
else if (statetable[i+1] == ch) {
action = statetable[i+2];
theNextState = statetable[i+3];
break;
}
}
// System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
switch (action) {
case 0:
throw new Error(
"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
Integer.toString(theState));
case A_ADUP:
h.adup(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_ADUP_SAVE:
h.adup(theOutputBuffer, 0, theSize);
theSize = 0;
save(ch, h);
break;
case A_ADUP_STAGC:
h.adup(theOutputBuffer, 0, theSize);
theSize = 0;
h.stagc(theOutputBuffer, 0, theSize);
break;
case A_ANAME:
h.aname(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_ANAME_ADUP:
h.aname(theOutputBuffer, 0, theSize);
theSize = 0;
h.adup(theOutputBuffer, 0, theSize);
break;
case A_ANAME_ADUP_STAGC:
h.aname(theOutputBuffer, 0, theSize);
theSize = 0;
h.adup(theOutputBuffer, 0, theSize);
h.stagc(theOutputBuffer, 0, theSize);
break;
case A_AVAL:
h.aval(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_AVAL_STAGC:
h.aval(theOutputBuffer, 0, theSize);
theSize = 0;
h.stagc(theOutputBuffer, 0, theSize);
break;
case A_CDATA:
mark();
// suppress the final "]]" in the buffer
if (theSize > 1) theSize -= 2;
h.pcdata(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_ENTITY_START:
h.pcdata(theOutputBuffer, 0, theSize);
theSize = 0;
save(ch, h);
break;
case A_ENTITY:
mark();
char ch1 = (char)ch;
// System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
if (theState == S_ENT && ch1 == '#") {
theNextState = S_NCR;
save(ch, h);
break;
}
else if (theState == S_NCR && (ch1 == 'x" || ch1 == 'X")) {
theNextState = S_XNCR;
save(ch, h);
break;
}
else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
save(ch, h);
break;
}
else if (theState == S_NCR && Character.isDigit(ch1)) {
save(ch, h);
break;
}
else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
save(ch, h);
break;
}
// The whole entity reference has been collected
// System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
h.entity(theOutputBuffer, 1, theSize - 1);
int ent = h.getEntity();
// System.err.println("%% value = " + ent);
if (ent != 0) {
theSize = 0;
if (ent >= 0x80 && ent <= 0x9F) {
ent = theWinMap[ent-0x80];
}
if (ent < 0x20) {
// Control becomes space
ent = 0x20;
}
else if (ent >= 0xD800 && ent <= 0xDFFF) {
// Surrogates get dropped
ent = 0;
}
else if (ent <= 0xFFFF) {
// BMP character
save(ent, h);
}
else {
// Astral converted to two surrogates
ent -= 0x10000;
save((ent>>10) + 0xD800, h);
save((ent&0x3FF) + 0xDC00, h);
}
if (ch != ';") {
unread(r, ch);
theCurrentColumn--;
}
}
else {
unread(r, ch);
theCurrentColumn--;
}
theNextState = S_PCDATA;
break;
case A_ETAG:
h.etag(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_DECL:
h.decl(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_GI:
h.gi(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_GI_STAGC:
h.gi(theOutputBuffer, 0, theSize);
theSize = 0;
h.stagc(theOutputBuffer, 0, theSize);
break;
case A_LT:
mark();
save('<", h);
save(ch, h);
break;
case A_LT_PCDATA:
mark();
save('<", h);
h.pcdata(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_PCDATA:
mark();
h.pcdata(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_CMNT:
mark();
h.cmnt(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_MINUS3:
save('-", h);
save(' ", h);
break;
case A_MINUS2:
save('-", h);
save(' ", h);
// fall through into A_MINUS
case A_MINUS:
save('-", h);
save(ch, h);
break;
case A_PI:
mark();
h.pi(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_PITARGET:
h.pitarget(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_PITARGET_PI:
h.pitarget(theOutputBuffer, 0, theSize);
theSize = 0;
h.pi(theOutputBuffer, 0, theSize);
break;
case A_SAVE:
save(ch, h);
break;
case A_SKIP:
break;
case A_SP:
save(' ", h);
break;
case A_STAGC:
h.stagc(theOutputBuffer, 0, theSize);
theSize = 0;
break;
case A_EMPTYTAG:
mark();
// System.err.println("%%% Empty tag seen");
if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
theSize = 0;
h.stage(theOutputBuffer, 0, theSize);
break;
case A_UNGET:
unread(r, ch);
theCurrentColumn--;
break;
case A_UNSAVE_PCDATA:
if (theSize > 0) theSize--;
h.pcdata(theOutputBuffer, 0, theSize);
theSize = 0;
break;
default:
throw new Error("Can't process state " + action);
}
theState = theNextState;
}
h.eof(theOutputBuffer, 0, 0);
| public void | startCDATA()A callback for the ScanHandler that allows it to force
the lexer state to CDATA content (no markup is recognized except
the end of the element).
theNextState = S_CDATA;
| private void | unread(java.io.PushbackReader r, int c)
// PushbackReader itself would accept a pushed-back EOF marker (-1), which is
// a bug; filter it out here so that only real characters are returned to
// the reader.
if (c == -1) return;
r.unread(c);
|
|