File: AbstractTokenizer.java
Doc: API Doc
Category: Example
Size: 13390
Date: Sat Jan 24 10:44:26 GMT 2004
Package: je3.classes

AbstractTokenizer

public abstract class AbstractTokenizer extends Object implements Tokenizer
This class implements all the methods of the Tokenizer interface and defines two new abstract methods, {@link #createBuffer} and {@link #fillBuffer}, which all concrete subclasses must implement. By default, instances of this class can handle tokens of up to 16*1024 characters in length.
author
David Flanagan
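
The sketch below is a minimal illustration of how a concrete subclass might look and how the fluent configuration methods are used. The class name StringTokenizerDemo and the sample input are invented for this example and are not part of the je3.classes package; the configuration methods and the EOF constant come from the Tokenizer interface that this class implements.

import je3.classes.AbstractTokenizer;
import je3.classes.Tokenizer;

public class StringTokenizerDemo extends AbstractTokenizer {
    private final String input;
    public StringTokenizerDemo(String input) { this.input = input; }

    // The whole input is known up front, so put it all in the buffer.
    protected void createBuffer(int bufferSize) {
        text = input.toCharArray();
        numChars = text.length;
    }

    // Everything was supplied by createBuffer(); there is nothing to add.
    protected boolean fillBuffer() { return false; }

    public static void main(String[] args) throws Exception {
        Tokenizer t = new StringTokenizerDemo("if (x > 10) return \"done\";")
            .skipSpaces(true)
            .tokenizeNumbers(true)
            .tokenizeWords(true)
            .quotes("\"", "\"")
            .keywords(new String[] { "if", "return" });
        // Print each token type and its text until end of input.
        for (int type = t.next(); type != Tokenizer.EOF; type = t.next())
            System.out.println(type + "\t" + t.tokenText());
    }
}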

Fields Summary
boolean skipSpaces
boolean tokenizeSpaces
boolean tokenizeNumbers
boolean tokenizeWords
boolean testquotes
Tokenizer.WordRecognizer wordRecognizer
Map keywordMap
String openquotes
String closequotes
boolean trackPosition
int maximumTokenLength
int tokenType
int tokenLine
int tokenColumn
int tokenKeyword
int line
int column
boolean eof
protected int tokenStart
The start of the current token in {@link #text}. Subclasses may need to update this field in {@link #fillBuffer}.
protected int tokenEnd
The index in {@link #text} of the first character after the current token. Subclasses may need to update this field in {@link #fillBuffer}.
protected int p
The position of the first untokenized character in {@link #text}. Subclasses may need to update this field in {@link #fillBuffer}.
protected int numChars
The number of valid characters of input text stored in {@link #text}. Subclasses must implement {@link #createBuffer} and {@link #fillBuffer} to set this value appropriately.
protected char[] text
A buffer holding the text we're parsing. Subclasses must implement {@link #createBuffer} to set this field to a character array, and {@link #fillBuffer} to refill the array.
Constructors Summary
Methods Summary
private void beginNewToken()

	ensureChars();
	if (!eof) {
	    tokenStart = p;
	    tokenColumn = column;
	    tokenLine = line;
	}
    
protected abstract void createBuffer(int bufferSize)
Create the {@link #text} buffer to use for parsing. This method may put text in the buffer, but it is not required to. In either case, it should set {@link #numChars} appropriately. This method will be called once, before tokenizing begins.

param
bufferSize the minimum size of the created array, unless the subclass knows in advance that the input text is smaller than this, in which case, the input text size may be used instead.
see
#fillBuffer
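
For a subclass that reads from a stream, createBuffer() typically just allocates the array and leaves it empty, so that the first call to fillBuffer() performs the initial read. The following is a hedged sketch only; the enclosing Reader-backed subclass and its java.io.Reader field named "in" are assumptions of the sketch, not part of AbstractTokenizer.

// Sketch only: assumes a hypothetical Reader-backed subclass with a field "in".
protected void createBuffer(int bufferSize) {
    text = new char[bufferSize];  // at least maximumTokenLength characters
    numChars = 0;                 // nothing read yet; ensureChars() will call
                                  // fillBuffer() before the first token
}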

private void ensureChars()

	if (text == null) {
	    createBuffer(maximumTokenLength);  // create text[], set numChars
	    p = tokenStart = tokenEnd = 0;     // initialize other state
	    if (trackPosition) line = column = 1;
	}
	if (!eof && p >= numChars) // Fill the text[] buffer if needed
	    eof = !fillBuffer();  

	// Make sure our class invariants hold true before we start a token
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && (p < numChars || (p == numChars && eof)) &&
	    numChars <= text.length;
    
protected abstract boolean fillBuffer()
Fill or refill the {@link #text} buffer and adjust related fields. This method will be called when the tokenizer needs more characters to tokenize. Concrete subclasses must implement this method to put characters into the {@link #text} buffer, blocking if necessary to wait for characters to become available. This method may make room in the buffer by shifting the contents down to remove any characters before tokenStart. It must preserve any characters after {@link #tokenStart} and before {@link #numChars}, however. After such a shift, it must adjust {@link #tokenStart}, {@link #tokenEnd} and {@link #p} appropriately. After the optional shift, the method should add as many new characters as possible to {@link #text} (and always at least 1) and adjust {@link #numChars} appropriately.

return
false if no more characters are available; true otherwise.
see
#createBuffer
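
A hedged sketch of fillBuffer() for the same hypothetical Reader-backed subclass used in the createBuffer() sketch above: it shifts the retained characters down, adjusts the indexes that point into the buffer, and then reads whatever characters are available. Wrapping the IOException in a RuntimeException is a simplification for the sketch only.

// Sketch only: assumes a hypothetical Reader-backed subclass with a field "in".
protected boolean fillBuffer() {
    // Make room by discarding everything before the current token and
    // shifting the retained characters to the start of the buffer.
    if (tokenStart > 0) {
        System.arraycopy(text, tokenStart, text, 0, numChars - tokenStart);
        numChars -= tokenStart;
        p -= tokenStart;
        tokenEnd -= tokenStart;
        tokenStart = 0;
    }
    try {
        // Append as many new characters as are available, blocking if
        // necessary for at least one.
        int n = in.read(text, numChars, text.length - numChars);
        if (n == -1) return false;     // no more characters available
        numChars += n;
        return true;
    } catch (java.io.IOException e) {
        throw new RuntimeException(e); // simplification for this sketch
    }
}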

public Tokenizer keywords(java.lang.String[] keywords)

	if (keywords != null) {
	    keywordMap = new HashMap(keywords.length);
	    for(int i = 0; i < keywords.length; i++) 
		keywordMap.put(keywords[i], new Integer(i));
	}
	else keywordMap = null;
	return this;
    
public Tokenizer maximumTokenLength(int size)

	if (size < 1) throw new IllegalArgumentException();
	if (text != null) throw new IllegalStateException();
	maximumTokenLength = size;
	return this;
    
public int next()

	int quoteindex;
	beginNewToken();
	if (eof) return tokenType = EOF;

	char c = text[p];

	if ((skipSpaces||tokenizeSpaces) && Character.isWhitespace(c)) {
	    tokenType = SPACE;
	    do {
		if (trackPosition) updatePosition(text[p]);
		p++;
		if (p >= numChars) eof = !fillBuffer();
	    } while(!eof && Character.isWhitespace(text[p]));

	    // If we don't return space tokens then recursively call 
	    // this method to find another token. Note that the next character
	    // is not space, so we will not get into infinite recursion
	    if (skipSpaces) return next();
	    tokenEnd = p;
	}
	else if (tokenizeNumbers && Character.isDigit(c)) {
	    tokenType = NUMBER;
	    do {
		if (trackPosition) column++;
		p++;
		if (p >= numChars) eof = !fillBuffer();
	    } while(!eof && Character.isDigit(text[p]));
	    tokenEnd = p;
	}
	else if (tokenizeWords && 
		 (wordRecognizer!=null
		      ?wordRecognizer.isWordStart(c)
		      :Character.isJavaIdentifierStart(c))) {
	    tokenType = WORD;
	    do {
		if (trackPosition) column++;
		p++;
		if (p >= numChars) eof = !fillBuffer();
	    } while(!eof &&
		    (wordRecognizer!=null
		         ?wordRecognizer.isWordPart(text[p], c)
		         :Character.isJavaIdentifierPart(text[p])));

	    if (keywordMap != null) {
		String ident = new String(text,tokenStart,p-tokenStart);
		Integer index = (Integer) keywordMap.get(ident);
		if (index != null) {
		    tokenType = KEYWORD;
		    tokenKeyword = index.intValue();
		}
	    }
	    tokenEnd = p;
	}
	else if (testquotes && (quoteindex = openquotes.indexOf(c)) != -1) {
	    // Notes: we do not recognize any escape characters.
	    // We do not include the opening or closing quote.
	    // We do not report an error on EOF or OVERFLOW.
	    if (trackPosition) column++;
	    p++;
	    // Scan until we find a matching quote, but do not include
	    // the opening or closing quote.  Set the token type to the 
	    // opening delimiter
	    char closequote = closequotes.charAt(quoteindex);
	    scan(closequote, false, false, true);
	    tokenType = c;
	    // the call to scan set tokenEnd, so we don't have to
	}
	else {
	    // Otherwise, the character itself is the token
	    if (trackPosition) updatePosition(text[p]);
	    tokenType = text[p];
	    p++;
	    tokenEnd = p;
	}
	    
	// Check the invariants before returning
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && p <= numChars && numChars <= text.length;
	return tokenType;
    
public int nextChar()

	beginNewToken();
	if (eof) return tokenType = EOF;
	tokenType = text[p];
	if (trackPosition) updatePosition(text[p]);
	tokenEnd = ++p;
	// Check the invariants before returning
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && p <= numChars && numChars <= text.length;
	return tokenType;
    
public Tokenizer quotes(java.lang.String openquotes, java.lang.String closequotes)

	if (openquotes == null || closequotes == null) 
	    throw new NullPointerException("arguments must be non-null");
	if (openquotes.length() != closequotes.length()) 
	    throw new IllegalArgumentException("argument lengths differ");
	this.openquotes = openquotes;
	this.closequotes = closequotes;
	this.testquotes = openquotes.length() > 0;
	return this;
    
public int scan(char delimiter, boolean extendCurrentToken, boolean includeDelimiter, boolean skipDelimiter)

	return scan(new char[] { delimiter }, false,
		    extendCurrentToken, includeDelimiter, skipDelimiter);
    
public int scan(java.lang.String delimiter, boolean matchall, boolean extendCurrentToken, boolean includeDelimiter, boolean skipDelimiter)

	return scan(delimiter.toCharArray(), matchall,
		    extendCurrentToken, includeDelimiter, skipDelimiter);
    
protected int scan(char[] delimiter, boolean matchall, boolean extendCurrentToken, boolean includeDelimiter, boolean skipDelimiter)

	if (matchall && !includeDelimiter && !skipDelimiter) 
	    throw new IllegalArgumentException("must include or skip " +
					  "delimiter when matchall is true");

	if (extendCurrentToken) ensureChars();
	else beginNewToken();

	tokenType = TEXT; // Even if return value differs
	if (eof) return EOF;

	int delimiterMatchIndex = 0;
	String delimString = null;
	if (!matchall && delimiter.length > 0)
	    delimString = new String(delimiter);

	while(!eof) {
	    // See if we've found the delimiter.  There are 3 cases here:
	    // 1) single-character delimiter
	    // 2) multi-char delimiter, and all must be matched sequentially
	    // 3) multi-char delimiter, must match any one of them.
	    if (delimiter.length == 1) {
		if (text[p] == delimiter[0]) break;
	    }
	    else if (matchall) {
		if (text[p] == delimiter[delimiterMatchIndex]) {
		    delimiterMatchIndex++;
		    if (delimiterMatchIndex == delimiter.length) break;
		}
		else delimiterMatchIndex = 0;
	    }
	    else {
		if (delimString.indexOf(text[p]) != -1) break;
	    }

	    if (trackPosition) updatePosition(text[p]);
	    p++;
	    if (p >= numChars) {    // Do we need more text?
		if (tokenStart > 0)     // Do we have room for more?
		    eof = !fillBuffer(); // Yes, so go get some
		else {                  // No room for more characters
		    tokenEnd = p;       // so report an overflow
		    return OVERFLOW;
		}
	    }
	}

	if (eof) {
	    tokenEnd = p;
	    return EOF;
	}

	if (includeDelimiter) {
	    if (trackPosition) updatePosition(text[p]);
	    p++;
	    tokenEnd = p;
	}
	else if (skipDelimiter) {
	    if (trackPosition) updatePosition(text[p]);
	    p++;
	    if (matchall) tokenEnd = p - delimiter.length;
	    else tokenEnd = p - 1;
	}
	else {
	    // we know the delimiter length is 1 in this case
	    tokenEnd = p;
	}

	// Check the invariants before returning
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && p <= numChars && numChars <= text.length;
	return TEXT;
    
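As a hedged usage sketch, the scan() methods can grab free-form text up to a delimiter, for example the rest of a "//" line comment. Here "t" is assumed to be an already-configured concrete Tokenizer (such as the sketches above) over input like "// a comment\nx = 1"; the commented behavior follows from the code above, where a skipped delimiter is excluded from the token.

// Sketch only: "t" is an already-configured concrete Tokenizer.
if (t.next() == '/' && t.next() == '/') {
    // Start a new token just after "//", scan up to the newline
    // (excluded from the token), and skip past the newline itself.
    t.scan('\n', false, false, true);
    String comment = t.tokenText();   // " a comment"
}
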
public Tokenizer skipSpaces(boolean skip)

	skipSpaces = skip;
	return this;
    
public int tokenColumn()

	if (trackPosition && tokenStart < numChars) return tokenColumn;
	else return 0;
    
public int tokenKeyword()

	if (tokenType == KEYWORD) return tokenKeyword;
	else return -1;
    
public int tokenLine()

	if (trackPosition && tokenStart < numChars) return tokenLine;
	else return 0;
    
public java.lang.String tokenText()

	if (text == null || tokenStart >= numChars) return null;
	return new String(text, tokenStart, tokenEnd-tokenStart);
    
public int tokenType()

 return tokenType; 
public Tokenizer tokenizeNumbers(boolean tokenize)

	tokenizeNumbers = tokenize;
	return this;
    
public Tokenizer tokenizeSpaces(boolean tokenize)

	tokenizeSpaces = tokenize;
	return this;
    
public Tokenizer tokenizeWords(boolean tokenize)

	tokenizeWords = tokenize;
	return this;
    
public Tokenizer trackPosition(boolean track)

	if (text != null) throw new IllegalStateException();
	trackPosition = track;
	return this;
    
private void updatePosition(char c)

	if (c == '\n') {
	    line++;
	    column = 1;
	}
	else column++;
    
public Tokenizer wordRecognizer(Tokenizer.WordRecognizer wordRecognizer)

	this.wordRecognizer = wordRecognizer;
	return this;