File: AbstractTokenizer.java
Doc: API Doc
Category: Example
Size: 13390
Date: Sat Jan 24 10:44:26 GMT 2004
Package: je3.classes

AbstractTokenizer

public abstract class AbstractTokenizer extends Object implements Tokenizer
This class implements all the methods of the Tokenizer interface and defines two new abstract methods, {@link #createBuffer} and {@link #fillBuffer}, which all concrete subclasses must implement. By default, instances of this class can handle tokens of up to 16*1024 characters in length.
author
David Flanagan
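
The sketch below is a minimal illustration of how a concrete subclass might look and how the fluent configuration methods are used. The class name StringTokenizerDemo and the sample input are invented for this example and are not part of the je3.classes package; the configuration methods and the EOF constant come from the Tokenizer interface that this class implements.

import je3.classes.AbstractTokenizer;
import je3.classes.Tokenizer;

public class StringTokenizerDemo extends AbstractTokenizer {
    private final String input;
    public StringTokenizerDemo(String input) { this.input = input; }

    // The whole input is known up front, so put it all in the buffer.
    protected void createBuffer(int bufferSize) {
        text = input.toCharArray();
        numChars = text.length;
    }

    // Everything was supplied by createBuffer(); there is nothing to add.
    protected boolean fillBuffer() { return false; }

    public static void main(String[] args) throws Exception {
        Tokenizer t = new StringTokenizerDemo("if (x > 10) return \"done\";")
            .skipSpaces(true)
            .tokenizeNumbers(true)
            .tokenizeWords(true)
            .quotes("\"", "\"")
            .keywords(new String[] { "if", "return" });
        // Print each token type and its text until end of input.
        for (int type = t.next(); type != Tokenizer.EOF; type = t.next())
            System.out.println(type + "\t" + t.tokenText());
    }
}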

Fields Summary
boolean skipSpaces
boolean tokenizeSpaces
boolean tokenizeNumbers
boolean tokenizeWords
boolean testquotes
Tokenizer.WordRecognizer wordRecognizer
Map keywordMap
String openquotes
String closequotes
boolean trackPosition
int maximumTokenLength
int tokenType
int tokenLine
int tokenColumn
int tokenKeyword
int line
int column
boolean eof
protected int tokenStart
The start of the current token in {@link #text}. Subclasses may need to update this field in {@link #fillBuffer}.
protected int tokenEnd
The index in {@link #text} of the first character after the current token. Subclasses may need to update this field in {@link #fillBuffer}.
protected int p
The position of the first untokenized character in {@link #text}. Subclasses may need to update this field in {@link #fillBuffer}.
protected int numChars
The number of valid characters of input text stored in {@link #text}. Subclasses must implement {@link #createBuffer} and {@link #fillBuffer} to set this value appropriately.
protected char[] text
A buffer holding the text we're parsing. Subclasses must implement {@link #createBuffer} to set this field to a character array, and {@link #fillBuffer} to refill the array.
Constructors Summary
Methods Summary
private void beginNewToken()

	ensureChars();
	if (!eof) {
	    tokenStart = p;
	    tokenColumn = column;
	    tokenLine = line;
	}
    
protected abstract void createBuffer(int bufferSize)
Create the {@link #text} buffer to use for parsing. This method may put text in the buffer, but it is not required to. In either case, it should set {@link #numChars} appropriately. This method will be called once, before tokenizing begins.

param
bufferSize the minimum size of the created array, unless the subclass knows in advance that the input text is smaller than this, in which case, the input text size may be used instead.
see
#fillBuffer
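
For a subclass that reads from a stream, createBuffer() typically just allocates the array and leaves it empty, so that the first call to fillBuffer() performs the initial read. The following is a hedged sketch only; the enclosing Reader-backed subclass and its java.io.Reader field named "in" are assumptions of the sketch, not part of AbstractTokenizer.

// Sketch only: assumes a hypothetical Reader-backed subclass with a field "in".
protected void createBuffer(int bufferSize) {
    text = new char[bufferSize];  // at least maximumTokenLength characters
    numChars = 0;                 // nothing read yet; ensureChars() will call
                                  // fillBuffer() before the first token
}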

private void ensureChars()

	if (text == null) {
	    createBuffer(maximumTokenLength);  // create text[], set numChars
	    p = tokenStart = tokenEnd = 0;     // initialize other state
	    if (trackPosition) line = column = 1;
	}
	if (!eof && p >= numChars) // Fill the text[] buffer if needed
	    eof = !fillBuffer();  

	// Make sure our class invariants hold true before we start a token
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && (p < numChars || (p == numChars && eof)) &&
	    numChars <= text.length;
    
protected abstract boolean fillBuffer()
Fill or refill the {@link #text} buffer and adjust related fields. This method will be called when the tokenizer needs more characters to tokenize. Concrete subclasses must implement this method to put characters into the {@link #text} buffer, blocking if necessary to wait for characters to become available. This method may make room in the buffer by shifting the contents down to remove any characters before tokenStart. It must preserve any characters after {@link #tokenStart} and before {@link #numChars}, however. After such a shift, it must adjust {@link #tokenStart}, {@link #tokenEnd} and {@link #p} appropriately. After the optional shift, the method should add as many new characters as possible to {@link #text} (and always at least 1) and adjust {@link #numChars} appropriately.

return
false if no more characters are available; true otherwise.
see
#createBuffer
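
A hedged sketch of fillBuffer() for the same hypothetical Reader-backed subclass used in the createBuffer() sketch above: it shifts the retained characters down, adjusts the indexes that point into the buffer, and then reads whatever characters are available. Wrapping the IOException in a RuntimeException is a simplification for the sketch only.

// Sketch only: assumes a hypothetical Reader-backed subclass with a field "in".
protected boolean fillBuffer() {
    // Make room by discarding everything before the current token and
    // shifting the retained characters to the start of the buffer.
    if (tokenStart > 0) {
        System.arraycopy(text, tokenStart, text, 0, numChars - tokenStart);
        numChars -= tokenStart;
        p -= tokenStart;
        tokenEnd -= tokenStart;
        tokenStart = 0;
    }
    try {
        // Append as many new characters as are available, blocking if
        // necessary for at least one.
        int n = in.read(text, numChars, text.length - numChars);
        if (n == -1) return false;     // no more characters available
        numChars += n;
        return true;
    } catch (java.io.IOException e) {
        throw new RuntimeException(e); // simplification for this sketch
    }
}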

public Tokenizer keywords(java.lang.String[] keywords)

	if (keywords != null) {
	    keywordMap = new HashMap(keywords.length);
	    for(int i = 0; i < keywords.length; i++) 
		keywordMap.put(keywords[i], new Integer(i));
	}
	else keywordMap = null;
	return this;
    
public Tokenizer maximumTokenLength(int size)

	if (size < 1) throw new IllegalArgumentException();
	if (text != null) throw new IllegalStateException();
	maximumTokenLength = size;
	return this;
    
public int next()

	int quoteindex;
	beginNewToken();
	if (eof) return tokenType = EOF;

	char c = text[p];

	if ((skipSpaces||tokenizeSpaces) && Character.isWhitespace(c)) {
	    tokenType = SPACE;
	    do {
		if (trackPosition) updatePosition(text[p]);
		p++;
		if (p >= numChars) eof = !fillBuffer();
	    } while(!eof && Character.isWhitespace(text[p]));

	    // If we don't return space tokens then recursively call 
	    // this method to find another token. Note that the next character
	    // is not space, so we will not get into infinite recursion
	    if (skipSpaces) return next();
	    tokenEnd = p;
	}
	else if (tokenizeNumbers && Character.isDigit(c)) {
	    tokenType = NUMBER;
	    do {
		if (trackPosition) column++;
		p++;
		if (p >= numChars) eof = !fillBuffer();
	    } while(!eof && Character.isDigit(text[p]));
	    tokenEnd = p;
	}
	else if (tokenizeWords && 
		 (wordRecognizer!=null
		      ?wordRecognizer.isWordStart(c)
		      :Character.isJavaIdentifierStart(c))) {
	    tokenType = WORD;
	    do {
		if (trackPosition) column++;
		p++;
		if (p >= numChars) eof = !fillBuffer();
	    } while(!eof &&
		    (wordRecognizer!=null
		         ?wordRecognizer.isWordPart(text[p], c)
		         :Character.isJavaIdentifierPart(text[p])));

	    if (keywordMap != null) {
		String ident = new String(text,tokenStart,p-tokenStart);
		Integer index = (Integer) keywordMap.get(ident);
		if (index != null) {
		    tokenType = KEYWORD;
		    tokenKeyword = index.intValue();
		}
	    }
	    tokenEnd = p;
	}
	else if (testquotes && (quoteindex = openquotes.indexOf(c)) != -1) {
	    // Notes: we do not recognize any escape characters.
	    // We do not include the opening or closing quote.
	    // We do not report an error on EOF or OVERFLOW.
	    if (trackPosition) column++;
	    p++;
	    // Scan until we find a matching quote, but do not include
	    // the opening or closing quote.  Set the token type to the 
	    // opening delimiter
	    char closequote = closequotes.charAt(quoteindex);
	    scan(closequote, false, false, true);
	    tokenType = c;
	    // the call to scan set tokenEnd, so we don't have to
	}
	else {
	    // Otherwise, the character itself is the token
	    if (trackPosition) updatePosition(text[p]);
	    tokenType = text[p];
	    p++;
	    tokenEnd = p;
	}
	    
	// Check the invariants before returning
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && p <= numChars && numChars <= text.length;
	return tokenType;
    
public int nextChar()

	beginNewToken();
	if (eof) return tokenType = EOF;
	tokenType = text[p];
	if (trackPosition) updatePosition(text[p]);
	tokenEnd = ++p;
	// Check the invariants before returning
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && p <= numChars && numChars <= text.length;
	return tokenType;
    
public Tokenizer quotes(java.lang.String openquotes, java.lang.String closequotes)

	if (openquotes == null || closequotes == null) 
	    throw new NullPointerException("arguments must be non-null");
	if (openquotes.length() != closequotes.length()) 
	    throw new IllegalArgumentException("argument lengths differ");
	this.openquotes = openquotes;
	this.closequotes = closequotes;
	this.testquotes = openquotes.length() > 0;
	return this;
    
public int scan(char delimiter, boolean extendCurrentToken, boolean includeDelimiter, boolean skipDelimiter)

	return scan(new char[] { delimiter }, false,
		    extendCurrentToken, includeDelimiter, skipDelimiter);
    
public int scan(java.lang.String delimiter, boolean matchall, boolean extendCurrentToken, boolean includeDelimiter, boolean skipDelimiter)

	return scan(delimiter.toCharArray(), matchall,
		    extendCurrentToken, includeDelimiter, skipDelimiter);
    
protected int scan(char[] delimiter, boolean matchall, boolean extendCurrentToken, boolean includeDelimiter, boolean skipDelimiter)

	if (matchall && !includeDelimiter && !skipDelimiter) 
	    throw new IllegalArgumentException("must include or skip " +
					  "delimiter when matchall is true");

	if (extendCurrentToken) ensureChars();
	else beginNewToken();

	tokenType = TEXT; // Even if return value differs
	if (eof) return EOF;

	int delimiterMatchIndex = 0;
	String delimString = null;
	if (!matchall && delimiter.length > 0)
	    delimString = new String(delimiter);

	while(!eof) {
	    // See if we've found the delimiter.  There are 3 cases here:
	    // 1) single-character delimiter
	    // 2) multi-char delimiter, and all must be matched sequentially
	    // 3) multi-char delimiter, must match any one of them.
	    if (delimiter.length == 1) {
		if (text[p] == delimiter[0]) break;
	    }
	    else if (matchall) {
		if (text[p] == delimiter[delimiterMatchIndex]) {
		    delimiterMatchIndex++;
		    if (delimiterMatchIndex == delimiter.length) break;
		}
		else delimiterMatchIndex = 0;
	    }
	    else {
		if (delimString.indexOf(text[p]) != -1) break;
	    }

	    if (trackPosition) updatePosition(text[p]);
	    p++;
	    if (p >= numChars) {    // Do we need more text?
		if (tokenStart > 0)     // Do we have room for more?
		    eof = !fillBuffer(); // Yes, so go get some
		else {                  // No room for more characters
		    tokenEnd = p;       // so report an overflow
		    return OVERFLOW;
		}
	    }
	}

	if (eof) {
	    tokenEnd = p;
	    return EOF;
	}

	if (includeDelimiter) {
	    if (trackPosition) updatePosition(text[p]);
	    p++;
	    tokenEnd = p;
	}
	else if (skipDelimiter) {
	    if (trackPosition) updatePosition(text[p]);
	    p++;
	    if (matchall) tokenEnd = p - delimiter.length;
	    else tokenEnd = p - 1;
	}
	else {
	    // we know the delimiter length is 1 in this case
	    tokenEnd = p;
	}

	// Check the invariants before returning
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && p <= numChars && numChars <= text.length;
	return TEXT;
    
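As a hedged usage sketch, the scan() methods can grab free-form text up to a delimiter, for example the rest of a "//" line comment. Here "t" is assumed to be an already-configured concrete Tokenizer (such as the sketches above) over input like "// a comment\nx = 1"; the commented behavior follows from the code above, where a skipped delimiter is excluded from the token.

// Sketch only: "t" is an already-configured concrete Tokenizer.
if (t.next() == '/' && t.next() == '/') {
    // Start a new token just after "//", scan up to the newline
    // (excluded from the token), and skip past the newline itself.
    t.scan('\n', false, false, true);
    String comment = t.tokenText();   // " a comment"
}
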
public Tokenizer skipSpaces(boolean skip)

	skipSpaces = skip;
	return this;
    
public int tokenColumn()

	if (trackPosition && tokenStart < numChars) return tokenColumn;
	else return 0;
    
public int tokenKeyword()

	if (tokenType == KEYWORD) return tokenKeyword;
	else return -1;
    
public int tokenLine()

	if (trackPosition && tokenStart < numChars) return tokenLine;
	else return 0;
    
public java.lang.String tokenText()

	if (text == null || tokenStart >= numChars) return null;
	return new String(text, tokenStart, tokenEnd-tokenStart);
    
public int tokenType()

 return tokenType; 
public Tokenizer tokenizeNumbers(boolean tokenize)

	tokenizeNumbers = tokenize;
	return this;
    
public Tokenizer tokenizeSpaces(boolean tokenize)

	tokenizeSpaces = tokenize;
	return this;
    
public Tokenizer tokenizeWords(boolean tokenize)

	tokenizeWords = tokenize;
	return this;
    
public Tokenizer trackPosition(boolean track)

	if (text != null) throw new IllegalStateException();
	trackPosition = track;
	return this;
    
private void updatePosition(char c)

	if (c == '\n') {
	    line++;
	    column = 1;
	}
	else column++;
    
public Tokenizer wordRecognizer(Tokenizer.WordRecognizer wordRecognizer)

	this.wordRecognizer = wordRecognizer;
	return this;