File | Doc | Category | Size | Date | Package
HTMLScanner.java | API Doc | Android 1.5 API | 19570 | Wed May 06 22:41:42 BST 2009 | org.ccil.cowan.tagsoup

HTMLScanner

public class HTMLScanner extends Object implements Scanner, Locator
This class implements a table-driven scanner for HTML, allowing for lots of defects. It implements the Scanner interface, which accepts a Reader object to fetch characters from and a ScanHandler object to report lexical events to.

Fields Summary
private static final int
S_ANAME
private static final int
S_APOS
private static final int
S_AVAL
private static final int
S_BB
private static final int
S_BBC
private static final int
S_BBCD
private static final int
S_BBCDA
private static final int
S_BBCDAT
private static final int
S_BBCDATA
private static final int
S_CDATA
private static final int
S_CDATA2
private static final int
S_CDSECT
private static final int
S_CDSECT1
private static final int
S_CDSECT2
private static final int
S_COM
private static final int
S_COM2
private static final int
S_COM3
private static final int
S_COM4
private static final int
S_DECL
private static final int
S_DECL2
private static final int
S_DONE
private static final int
S_EMPTYTAG
private static final int
S_ENT
private static final int
S_EQ
private static final int
S_ETAG
private static final int
S_GI
private static final int
S_NCR
private static final int
S_PCDATA
private static final int
S_PI
private static final int
S_PITARGET
private static final int
S_QUOT
private static final int
S_STAGC
private static final int
S_TAG
private static final int
S_TAGWS
private static final int
S_XNCR
private static final int
A_ADUP
private static final int
A_ADUP_SAVE
private static final int
A_ADUP_STAGC
private static final int
A_ANAME
private static final int
A_ANAME_ADUP
private static final int
A_ANAME_ADUP_STAGC
private static final int
A_AVAL
private static final int
A_AVAL_STAGC
private static final int
A_CDATA
private static final int
A_CMNT
private static final int
A_DECL
private static final int
A_EMPTYTAG
private static final int
A_ENTITY
private static final int
A_ENTITY_START
private static final int
A_ETAG
private static final int
A_GI
private static final int
A_GI_STAGC
private static final int
A_LT
private static final int
A_LT_PCDATA
private static final int
A_MINUS
private static final int
A_MINUS2
private static final int
A_MINUS3
private static final int
A_PCDATA
private static final int
A_PI
private static final int
A_PITARGET
private static final int
A_PITARGET_PI
private static final int
A_SAVE
private static final int
A_SKIP
private static final int
A_SP
private static final int
A_STAGC
private static final int
A_UNGET
private static final int
A_UNSAVE_PCDATA
private static int[]
statetable
private static final String[]
debug_actionnames
private static final String[]
debug_statenames
private String
thePublicid
private String
theSystemid
private int
theLastLine
private int
theLastColumn
private int
theCurrentLine
private int
theCurrentColumn
int
theState
int
theNextState
char[]
theOutputBuffer
int
theSize
int[]
theWinMap
Constructors Summary
Methods Summary
public intgetColumnNumber()

		return theLastColumn;
		
public intgetLineNumber()

		return theLastLine;
		
public java.lang.StringgetPublicId()

		return thePublicid;
		
public java.lang.StringgetSystemId()

		return theSystemid;
		
public static voidmain(java.lang.String[] argv)
Test procedure. Reads HTML from the standard input and writes PYX to the standard output.

		Scanner s = new HTMLScanner();
		Reader r = new InputStreamReader(System.in, "UTF-8");
		Writer w = new OutputStreamWriter(System.out, "UTF-8");
		PYXWriter pw = new PYXWriter(w);
		s.scan(r, pw);
		w.close();
		
private voidmark()
Mark the current scan position as a "point of interest" - start of a tag, cdata, processing instruction etc.

		theLastColumn = theCurrentColumn;
		theLastLine = theCurrentLine;
		
private static java.lang.Stringnicechar(int in)

		if (in == '\n") return "\\n";
		if (in < 32) return "0x"+Integer.toHexString(in);
		return "'"+((char)in)+"'";
		
public voidresetDocumentLocator(java.lang.String publicid, java.lang.String systemid)
Reset document locator, supplying systemid and publicid.

param
systemid System id
param
publicid Public id

		thePublicid = publicid;
		theSystemid = systemid;
		theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0;
		
private voidsave(int ch, ScanHandler h)

		if (theSize >= theOutputBuffer.length - 20) {
			if (theState == S_PCDATA || theState == S_CDATA) {
				// Return a buffer-sized chunk of PCDATA
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				}
			else {
				// Grow the buffer size
				char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
                                System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1);
				theOutputBuffer = newOutputBuffer;
				}
			}
		theOutputBuffer[theSize++] = (char)ch;
		
public voidscan(java.io.Reader r0, ScanHandler h)
Scan HTML source, reporting lexical events.

param
r0 Reader that provides characters
param
h ScanHandler that accepts lexical events.

		theState = S_PCDATA;
		PushbackReader r;
		if (r0 instanceof PushbackReader) {
			r = (PushbackReader)r0;
			}
		else if (r0 instanceof BufferedReader) {
			r = new PushbackReader(r0);
			}
		else {
			r = new PushbackReader(new BufferedReader(r0, 200));
			}

		int firstChar = r.read();	// Remove any leading BOM
		if (firstChar != '\uFEFF") unread(r, firstChar);

		while (theState != S_DONE) {
			int ch = r.read();

			// Process control characters
			if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];

			if (ch == '\r") {
				ch = r.read();		// expect LF next
				if (ch != '\n") {
					unread(r, ch);	// nope
					ch = '\n";
					}
				}

			if (ch == '\n") {
				theCurrentLine++;
				theCurrentColumn = 0;
				}
			else {
				theCurrentColumn++;
				}

			if (!(ch >= 0x20 || ch == '\n" || ch == '\t" || ch == -1)) continue;

			// Search state table
			int action = 0;
			for (int i = 0; i < statetable.length; i += 4) {
				if (theState != statetable[i]) {
					if (action != 0) break;
					continue;
					}
				if (statetable[i+1] == 0) {
					action = statetable[i+2];
					theNextState = statetable[i+3];
					}
				else if (statetable[i+1] == ch) {
					action = statetable[i+2];
					theNextState = statetable[i+3];
					break;
					}
				}
//			System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
			switch (action) {
			case 0:
				throw new Error(
"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
Integer.toString(theState));
        		case A_ADUP:
				h.adup(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
        		case A_ADUP_SAVE:
				h.adup(theOutputBuffer, 0, theSize);
				theSize = 0;
				save(ch, h);
				break;
        		case A_ADUP_STAGC:
				h.adup(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.stagc(theOutputBuffer, 0, theSize);
				break;
        		case A_ANAME:
				h.aname(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
        		case A_ANAME_ADUP:
				h.aname(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.adup(theOutputBuffer, 0, theSize);
				break;
        		case A_ANAME_ADUP_STAGC:
				h.aname(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.adup(theOutputBuffer, 0, theSize);
				h.stagc(theOutputBuffer, 0, theSize);
				break;
        		case A_AVAL:
				h.aval(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
        		case A_AVAL_STAGC:
				h.aval(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.stagc(theOutputBuffer, 0, theSize);
				break;
			case A_CDATA:
				mark();
				// suppress the final "]]" in the buffer
				if (theSize > 1) theSize -= 2;
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_ENTITY_START:
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				save(ch, h);
				break;
			case A_ENTITY:
				mark();
				char ch1 = (char)ch;
//				System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
				if (theState == S_ENT && ch1 == '#") {
					theNextState = S_NCR;
					save(ch, h);
					break;
					}
				else if (theState == S_NCR && (ch1 == 'x" || ch1 == 'X")) {
					theNextState = S_XNCR;
					save(ch, h);
					break;
					}
				else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
					save(ch, h);
					break;
					}
				else if (theState == S_NCR && Character.isDigit(ch1)) {
					save(ch, h);
					break;
					}
				else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
					save(ch, h);
					break;
					}

				// The whole entity reference has been collected
//				System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
				h.entity(theOutputBuffer, 1, theSize - 1);
				int ent = h.getEntity();
//				System.err.println("%% value = " + ent);
				if (ent != 0) {
					theSize = 0;
					if (ent >= 0x80 && ent <= 0x9F) {
						ent = theWinMap[ent-0x80];
						}
					if (ent < 0x20) {
						// Control becomes space
						ent = 0x20;
						}
					else if (ent >= 0xD800 && ent <= 0xDFFF) {
						// Surrogates get dropped
						ent = 0;
						}
					else if (ent <= 0xFFFF) {
						// BMP character
						save(ent, h);
						}
					else {
						// Astral converted to two surrogates
						ent -= 0x10000;
						save((ent>>10) + 0xD800, h);
						save((ent&0x3FF) + 0xDC00, h);
						}
					if (ch != ';") {
						unread(r, ch);
						theCurrentColumn--;
						}
					}
				else {
					unread(r, ch);
					theCurrentColumn--;
					}
				theNextState = S_PCDATA;
				break;
        		case A_ETAG:
				h.etag(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
        		case A_DECL:
				h.decl(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
        		case A_GI:
				h.gi(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_GI_STAGC:
				h.gi(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.stagc(theOutputBuffer, 0, theSize);
				break;
        		case A_LT:
				mark();
				save('<", h);
				save(ch, h);
				break;
			case A_LT_PCDATA:
				mark();
				save('<", h);
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
        		case A_PCDATA:
				mark();
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_CMNT:
				mark();
				h.cmnt(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_MINUS3:
				save('-", h);
				save(' ", h);
				break;
			case A_MINUS2:
				save('-", h);
				save(' ", h);
				// fall through into A_MINUS
			case A_MINUS:
				save('-", h);
				save(ch, h);
				break;
        		case A_PI:
				mark();
				h.pi(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
        		case A_PITARGET:
				h.pitarget(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
        		case A_PITARGET_PI:
				h.pitarget(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.pi(theOutputBuffer, 0, theSize);
				break;
        		case A_SAVE:
				save(ch, h);
				break;
        		case A_SKIP:
				break;
        		case A_SP:
				save(' ", h);
				break;
        		case A_STAGC:
				h.stagc(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			case A_EMPTYTAG:
				mark();
//				System.err.println("%%% Empty tag seen");
				if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
				theSize = 0;
				h.stage(theOutputBuffer, 0, theSize);
				break;
			case A_UNGET:
				unread(r, ch);
				theCurrentColumn--;
				break;
        		case A_UNSAVE_PCDATA:
				if (theSize > 0) theSize--;
				h.pcdata(theOutputBuffer, 0, theSize);
				theSize = 0;
				break;
			default:
				throw new Error("Can't process state " + action);
				}
			theState = theNextState;
			}
		h.eof(theOutputBuffer, 0, 0);
		
public voidstartCDATA()
A callback for the ScanHandler that allows it to force the lexer state to CDATA content (no markup is recognized except the end of element.

 theNextState = S_CDATA; 
private voidunread(java.io.PushbackReader r, int c)


	// Compensate for bug in PushbackReader that allows
	// pushing back EOF.
	        
		if (c != -1) r.unread(c);