AbstractTokenizer.java

Category: API Doc Example | Size: 13390 bytes | Date: Sat Jan 24 10:44:26 GMT 2004 | Package: je3.classes

/*
 * Copyright (c) 2004 David Flanagan.  All rights reserved.
 * This code is from the book Java Examples in a Nutshell, 3rd Edition.
 * It is provided AS-IS, WITHOUT ANY WARRANTY either expressed or implied.
 * You may study, use, and modify it for any non-commercial purpose,
 * including teaching and use in open-source projects.
 * You may distribute it non-commercially as long as you retain this notice.
 * For a commercial use license, or to purchase the book, 
 * please visit http://www.davidflanagan.com/javaexamples3.
 */
package je3.classes;
import java.util.*;
import java.io.IOException;

/**
 * This class implements all the methods of the Tokenizer interface, and
 * defines two new abstract methods, {@link #createBuffer} and
 * {@link #fillBuffer} which all concrete subclasses must implement.
 * By default, instances of this class can handle tokens of up to 16*1024
 * characters in length.
 * @author David Flanagan
 */
public abstract class AbstractTokenizer implements Tokenizer {
    boolean skipSpaces;
    boolean tokenizeSpaces;
    boolean tokenizeNumbers;
    boolean tokenizeWords;
    boolean testquotes;
    Tokenizer.WordRecognizer wordRecognizer;
    Map keywordMap;
    String openquotes, closequotes;
    boolean trackPosition;

    int maximumTokenLength = 16 * 1024;

    int tokenType = BOF;
    int tokenLine = 0; 
    int tokenColumn = 0; 
    int tokenKeyword = -1;

    int line=0, column=0;  // The line and column numbers of text[p]

    // The name of this field is a little misleading. If eof is true, it
    // means that no more characters are available. But tokenType and tokenText
    // may still be valid until the next call to next(), nextChar(), or scan().
    boolean eof;           // Set to the return value of fillBuffer()

    // The following fields keep track of the tokenizer's state
    // Invariant:  tokenStart <= tokenEnd <= p <= numChars <= text.length

    /**
     * The start of the current token in {@link #text}.
     * Subclasses may need to update this field in {@link #fillBuffer}.
     */
    protected int tokenStart = 0;

    /**
     * The index in {@link #text} of the first character after the current
     * token. Subclasses may need to update this field in {@link #fillBuffer}.
     */
    protected int tokenEnd = 0;

    /**
     * The position of the first untokenized character in {@link #text}.
     * Subclasses may need to update this field in {@link #fillBuffer}.
     */
    protected int p = 0;

    /**
     * The number of valid characters of input text stored in {@link #text}.
     * Subclasses must implement {@link #createBuffer} and {@link #fillBuffer}
     * to set this value appropriately.
     */
    protected int numChars = 0;

    /**
     * A buffer holding the text we're parsing.  Subclasses must implement
     * {@link #createBuffer} to set this field to a character array, and
     * {@link #fillBuffer} to refill the array.
     */
    protected char[] text = null;

    /**
     * Create the {@link #text} buffer to use for parsing.  This method may
     * put text in the buffer, but it is not required to.  In either case, it
     * should set {@link #numChars} appropriately.  This method will be called
     * once, before tokenizing begins.
     * 
     * @param bufferSize the minimum size of the created array, unless the
     * subclass knows in advance that the input text is smaller than this, in
     * which case the input text size may be used instead.
     * @see #fillBuffer
     */
    protected abstract void createBuffer(int bufferSize);

    /**
     * Fill or refill the {@link #text} buffer and adjust related fields.
     * This method will be called when the tokenizer needs more characters to
     * tokenize. Concrete subclasses must implement this method to put
     * characters into the {@link #text} buffer, blocking if necessary to wait
     * for characters to become available.  This method may make room in the
     * buffer by shifting the contents down to remove any characters before
     * tokenStart.  It must preserve any characters after {@link #tokenStart}
     * and before {@link #numChars}, however.  After such a shift, it must
     * adjust {@link #tokenStart}, {@link #tokenEnd} and {@link #p}
     * appropriately.  After the optional shift, the method should add as many
     * new characters as possible to {@link #text} (and always at least 1) and
     * adjust {@link #numChars} appropriately.
     * 
     * @return false if no more characters are available; true otherwise.
     * @see #createBuffer
     */
    protected abstract boolean fillBuffer() throws IOException;

    public Tokenizer skipSpaces(boolean skip) {
	skipSpaces = skip;
	return this;
    }

    public Tokenizer tokenizeSpaces(boolean tokenize) {
	tokenizeSpaces = tokenize;
	return this;
    }

    public Tokenizer tokenizeNumbers(boolean tokenize) {
	tokenizeNumbers = tokenize;
	return this;
    }
    
    public Tokenizer tokenizeWords(boolean tokenize) {
	tokenizeWords = tokenize;
	return this;
    }

    public Tokenizer wordRecognizer(Tokenizer.WordRecognizer wordRecognizer) {
	this.wordRecognizer = wordRecognizer;
	return this;
    }

    public Tokenizer quotes(String openquotes, String closequotes) {
	if (openquotes == null || closequotes == null) 
	    throw new NullPointerException("arguments must be non-null");
	if (openquotes.length() != closequotes.length()) 
	    throw new IllegalArgumentException("argument lengths differ");
	this.openquotes = openquotes;
	this.closequotes = closequotes;
	this.testquotes = openquotes.length() > 0;
	return this;
    }

    public Tokenizer trackPosition(boolean track) {
	if (text != null) throw new IllegalStateException();
	trackPosition = track;
	return this;
    }

    public Tokenizer keywords(String[] keywords) {
	if (keywords != null) {
	    keywordMap = new HashMap(keywords.length);
	    for(int i = 0; i < keywords.length; i++) 
		keywordMap.put(keywords[i], new Integer(i));
	}
	else keywordMap = null;
	return this;
    }

    public Tokenizer maximumTokenLength(int size) {
	if (size < 1) throw new IllegalArgumentException();
	if (text != null) throw new IllegalStateException();
	maximumTokenLength = size;
	return this;
    }

    public int tokenType() { return tokenType; }

    public String tokenText() {
	if (text == null || tokenStart >= numChars) return null;
	return new String(text, tokenStart, tokenEnd-tokenStart);
    }

    public int tokenLine() {
	if (trackPosition && tokenStart < numChars) return tokenLine;
	else return 0;
    }

    public int tokenColumn() {
	if (trackPosition && tokenStart < numChars) return tokenColumn;
	else return 0;
    }

    public int tokenKeyword() {
	if (tokenType == KEYWORD) return tokenKeyword;
	else return -1;
    }
                                 
    public int next() throws IOException {
	int quoteindex;
	beginNewToken();
	if (eof) return tokenType = EOF;

	char c = text[p];

	if ((skipSpaces||tokenizeSpaces) && Character.isWhitespace(c)) {
	    tokenType = SPACE;
	    do {
		if (trackPosition) updatePosition(text[p]);
		p++;
		if (p >= numChars) eof = !fillBuffer();
	    } while(!eof && Character.isWhitespace(text[p]));

	    // If we are not returning space tokens, recursively call this
	    // method to find another token. Note that the next character is
	    // not whitespace, so we will not get into infinite recursion.
	    if (skipSpaces) return next();
	    tokenEnd = p;
	}
	else if (tokenizeNumbers && Character.isDigit(c)) {
	    tokenType = NUMBER;
	    do {
		if (trackPosition) column++;
		p++;
		if (p >= numChars) eof = !fillBuffer();
	    } while(!eof && Character.isDigit(text[p]));
	    tokenEnd = p;
	}
	else if (tokenizeWords && 
		 (wordRecognizer!=null
		      ?wordRecognizer.isWordStart(c)
		      :Character.isJavaIdentifierStart(c))) {
	    tokenType = WORD;
	    do {
		if (trackPosition) column++;
		p++;
		if (p >= numChars) eof = !fillBuffer();
	    } while(!eof &&
		    (wordRecognizer!=null
		         ?wordRecognizer.isWordPart(text[p], c)
		         :Character.isJavaIdentifierPart(text[p])));

	    if (keywordMap != null) {
		String ident = new String(text,tokenStart,p-tokenStart);
		Integer index = (Integer) keywordMap.get(ident);
		if (index != null) {
		    tokenType = KEYWORD;
		    tokenKeyword = index.intValue();
		}
	    }
	    tokenEnd = p;
	}
	else if (testquotes && (quoteindex = openquotes.indexOf(c)) != -1) {
	    // Notes: we do not recognize any escape characters.
	    // We do not include the opening or closing quote.
	    // We do not report an error on EOF or OVERFLOW.
	    if (trackPosition) column++;
	    p++;
	    // Scan until we find a matching quote, but do not include
	    // the opening or closing quote.  Set the token type to the 
	    // opening delimiter
	    char closequote = closequotes.charAt(quoteindex);
	    scan(closequote, false, false, true);
	    tokenType = c;
	    // the call to scan set tokenEnd, so we don't have to
	}
	else {
	    // Otherwise, the character itself is the token
	    if (trackPosition) updatePosition(text[p]);
	    tokenType = text[p];
	    p++;
	    tokenEnd = p;
	}
	    
	// Check the invariants before returning
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && p <= numChars && numChars <= text.length;
	return tokenType;
    }

    public int nextChar() throws IOException {
	beginNewToken();
	if (eof) return tokenType = EOF;
	tokenType = text[p];
	if (trackPosition) updatePosition(text[p]);
	tokenEnd = ++p;
	// Check the invariants before returning
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && p <= numChars && numChars <= text.length;
	return tokenType;
    }

    public int scan(char delimiter, boolean extendCurrentToken,
		    boolean includeDelimiter, boolean skipDelimiter)
	throws IOException 
    {
	return scan(new char[] { delimiter }, false,
		    extendCurrentToken, includeDelimiter, skipDelimiter);
    }

    public int scan(String delimiter, boolean matchall,
		    boolean extendCurrentToken,
		    boolean includeDelimiter, boolean skipDelimiter)
	throws IOException 
    {
	return scan(delimiter.toCharArray(), matchall,
		    extendCurrentToken, includeDelimiter, skipDelimiter);
    }

    protected int scan(char[] delimiter, 
		       boolean matchall, boolean extendCurrentToken,
		       boolean includeDelimiter, boolean skipDelimiter)
	throws IOException 
    {
	if (matchall && !includeDelimiter && !skipDelimiter) 
	    throw new IllegalArgumentException("must include or skip " +
					  "delimiter when matchall is true");

	if (extendCurrentToken) ensureChars();
	else beginNewToken();

	tokenType = TEXT; // Even if return value differs
	if (eof) return EOF;

	int delimiterMatchIndex = 0;
	String delimString = null;
	if (!matchall && delimiter.length > 0)
	    delimString = new String(delimiter);

	while(!eof) {
	    // See if we've found the delimiter.  There are 3 cases here:
	    // 1) single-character delimiter
	    // 2) multi-char delimiter, and all must be matched sequentially
	    // 3) multi-char delimiter, must match any one of them.
	    if (delimiter.length == 1) {
		if (text[p] == delimiter[0]) break;
	    }
	    else if (matchall) {
		if (text[p] == delimiter[delimiterMatchIndex]) {
		    delimiterMatchIndex++;
		    if (delimiterMatchIndex == delimiter.length) break;
		}
		else delimiterMatchIndex = 0;
	    }
	    else {
		if (delimString.indexOf(text[p]) != -1) break;
	    }

	    if (trackPosition) updatePosition(text[p]);
	    p++;
	    if (p >= numChars) {    // Do we need more text?
		if (tokenStart > 0)     // Do we have room for more?
		    eof = !fillBuffer(); // Yes, so go get some
		else {                  // No room for more characters
		    tokenEnd = p;       // so report an overflow
		    return OVERFLOW;
		}
	    }
	}

	if (eof) {
	    tokenEnd = p;
	    return EOF;
	}

	if (includeDelimiter) {
	    if (trackPosition) updatePosition(text[p]);
	    p++;
	    tokenEnd = p;
	}
	else if (skipDelimiter) {
	    if (trackPosition) updatePosition(text[p]);
	    p++;
	    if (matchall) tokenEnd = p - delimiter.length;
	    else tokenEnd = p - 1;
	}
	else {
	    // matchall must be false here (checked above), so exactly one
	    // delimiter character was matched; it is neither included nor
	    // skipped, and p is left pointing at it.
	    tokenEnd = p;
	}

	// Check the invariants before returning
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && p <= numChars && numChars <= text.length;
	return TEXT;
    }

    private void ensureChars() throws IOException {
	if (text == null) {
	    createBuffer(maximumTokenLength);  // create text[], set numChars
	    p = tokenStart = tokenEnd = 0;     // initialize other state
	    if (trackPosition) line = column = 1;
	}
	if (!eof && p >= numChars) // Fill the text[] buffer if needed
	    eof = !fillBuffer();  

	// Make sure our class invariants hold true before we start a token
	assert text != null && 0 <= tokenStart && tokenStart <= tokenEnd && 
	    tokenEnd <= p && (p < numChars || (p == numChars && eof)) &&
	    numChars <= text.length;
    }

    private void beginNewToken() throws IOException {
	ensureChars();
	if (!eof) {
	    tokenStart = p;
	    tokenColumn = column;
	    tokenLine = line;
	}
    }

    private void updatePosition(char c) {
	if (c == '\n') {
	    line++;
	    column = 1;
	}
	else column++;
    }
}
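
The two abstract methods above are all a concrete subclass has to supply. As a rough illustration of the createBuffer()/fillBuffer() contract described in the Javadoc, here is a minimal hypothetical subclass. The class name SimpleStringTokenizer and its main() driver are illustrative and not part of the book's code, and the sketch assumes the fluent configuration methods are declared in the Tokenizer interface, as the implementations above suggest. Because the whole input is a String already in memory, createBuffer() loads everything at once and fillBuffer() simply reports that no more characters are available:

package je3.classes;
import java.io.IOException;

public class SimpleStringTokenizer extends AbstractTokenizer {
    private final String input;

    public SimpleStringTokenizer(String input) { this.input = input; }

    // Copy the entire input into text[] and set numChars.  bufferSize is
    // ignored because the full size of the input is already known.
    protected void createBuffer(int bufferSize) {
        assert text == null;             // createBuffer() is called only once
        text = input.toCharArray();
        numChars = text.length;
    }

    // createBuffer() supplied every character, so there is never more text
    // to add; returning false tells the tokenizer it has reached EOF.
    protected boolean fillBuffer() { return false; }

    // Illustrative driver: print each token in a short string.
    public static void main(String[] args) throws IOException {
        Tokenizer t = new SimpleStringTokenizer("count = 42")
            .skipSpaces(true).tokenizeNumbers(true).tokenizeWords(true);
        while (t.next() != Tokenizer.EOF)
            System.out.println(t.tokenType() + ": " + t.tokenText());
    }
}

A subclass reading from a Reader or stream would instead allocate a bufferSize-length array in createBuffer() and have fillBuffer() shift out characters before tokenStart and then read more input, adjusting tokenStart, tokenEnd, p, and numChars as the fillBuffer() documentation describes.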