// File: ReaderTokenizer.java | Doc: API Doc | Category: Example | Size: 3988 | Date: Sat Jan 24 10:44:26 GMT 2004 | Package: je3.io

// ReaderTokenizer.java

/*
 * Copyright (c) 2004 David Flanagan.  All rights reserved.
 * This code is from the book Java Examples in a Nutshell, 3rd Edition.
 * It is provided AS-IS, WITHOUT ANY WARRANTY either expressed or implied.
 * You may study, use, and modify it for any non-commercial purpose,
 * including teaching and use in open-source projects.
 * You may distribute it non-commercially as long as you retain this notice.
 * For a commercial use license, or to purchase the book, 
 * please visit http://www.davidflanagan.com/javaexamples3.
 */
package je3.io;
import je3.classes.Tokenizer;
import je3.classes.AbstractTokenizer;
import java.io.*;

/**
 * This Tokenizer implementation extends AbstractTokenizer to tokenize a stream
 * of text read from a java.io.Reader.  It implements the createBuffer() and
 * fillBuffer() methods required by AbstractTokenizer.  See that class for
 * details on how these methods must behave.  Note that a buffer size may
 * be selected, and that this buffer size also determines the maximum token
 * length.  The nested Test class is a simple test that tokenizes a file and
 * uses the tokens to produce a copy of the file.
 **/
public class ReaderTokenizer extends AbstractTokenizer {
    /** The character stream that fillBuffer() reads input from. */
    Reader in;

    /**
     * Create a ReaderTokenizer with a default buffer size of 16K characters.
     *
     * @param in the reader to read input from; must not be null
     * @throws NullPointerException if in is null
     **/
    public ReaderTokenizer(Reader in) { this(in, 16*1024); }

    /**
     * Create a ReaderTokenizer that reads from the specified reader and uses
     * the specified buffer size.
     *
     * @param in the reader to read input from; must not be null
     * @param bufferSize the buffer size, in characters.  It is handed to
     *        maximumTokenLength(), so it is also the maximum token length.
     * @throws NullPointerException if in is null
     **/
    public ReaderTokenizer(Reader in, int bufferSize) {
        // Fail here rather than with an obscure NullPointerException the
        // first time fillBuffer() tries to read.
        if (in == null) {
            throw new NullPointerException("in");
        }
        this.in = in;  // Remember the reader to read input from
        // Tell our superclass about the selected buffer size.
        // The superclass will pass this number to createBuffer()
        maximumTokenLength(bufferSize);
    }

    /**
     * Create the buffer to tokenize.  The buffer starts out empty; it is
     * filled later by fillBuffer().
     *
     * @param bufferSize the length, in characters, of the buffer to allocate
     **/
    protected void createBuffer(int bufferSize) {
        // Make sure AbstractTokenizer only calls this method once
        assert text == null;
        this.text = new char[bufferSize];  // the new buffer
        this.numChars = 0;                 // how much text it contains
    }

    /**
     * Fill or refill the buffer.  Characters before tokenStart are discarded
     * by shifting the rest of the buffer down to index 0, and then the free
     * space at the end of the buffer is filled from the reader.
     * See AbstractTokenizer.fillBuffer() for what this method must do.
     *
     * @return false if the reader reported end of stream, true otherwise
     * @throws IOException if reading from the underlying reader fails
     **/
    protected boolean fillBuffer() throws IOException {
        // Make sure AbstractTokenizer is upholding its end of the bargain
        assert text!=null && 0 <= tokenStart && tokenStart <= tokenEnd &&
            tokenEnd <= p && p <= numChars && numChars <= text.length;

        // First, shift already tokenized characters out of the buffer
        if (tokenStart > 0) {
            // Shift array contents (arraycopy copes with overlapping ranges)
            System.arraycopy(text, tokenStart, text, 0, numChars-tokenStart);
            // And update buffer indexes so they refer to the same characters
            tokenEnd -= tokenStart;
            p -= tokenStart;
            numChars -= tokenStart;
            tokenStart = 0;
        }

        // NOTE(review): if the buffer is still full at this point (the
        // current token starts at index 0 and fills the whole array), the
        // read below is asked for zero characters and is expected to return
        // 0, so this method returns true without adding any text.  Confirm
        // that AbstractTokenizer deals with that case.

        // Now try to read more characters into the buffer
        int numread = in.read(text, numChars, text.length-numChars);
        // If there are no more characters, return false
        if (numread == -1) {
            return false;
        }
        // Otherwise, adjust the number of valid characters in the buffer
        numChars += numread;
        return true;
    }

    /**
     * This test class tokenizes a file, reporting the tokens to standard out
     * and creating a copy of the file to demonstrate that every input
     * character is accounted for (since spaces are not skipped).
     **/
    public static class Test {
        /**
         * Tokenize the file named by args[0], print one line per token to
         * standard out, and write the text of every token to a file with the
         * same name plus ".copy".
         *
         * @param args command-line arguments; args[0] is the input file name
         * @throws java.io.IOException if the input cannot be read or the
         *         copy cannot be written
         **/
        public static void main(String[] args) throws java.io.IOException {
            // Give a usable message instead of an
            // ArrayIndexOutOfBoundsException when no file is named.
            if (args.length < 1) {
                System.err.println(
                    "Usage: java je3.io.ReaderTokenizer$Test <filename>");
                System.exit(1);
            }
            String filename = args[0];
            String copyname = filename + ".copy";

            // Both streams are closed in finally clauses so that neither one
            // leaks, even if tokenizing fails part-way through.
            Reader in = new FileReader(filename);
            try {
                PrintWriter out = new PrintWriter(new FileWriter(copyname));
                try {
                    ReaderTokenizer t = new ReaderTokenizer(in);
                    t.tokenizeWords(true).tokenizeNumbers(true)
                        .tokenizeSpaces(true);
                    while(t.next() != Tokenizer.EOF) {
                        switch(t.tokenType()) {
                        case Tokenizer.EOF:
                            // Presumably never reached, since the loop
                            // stops at EOF; kept as a safeguard.
                            System.out.println("EOF"); break;
                        case Tokenizer.WORD:
                            System.out.println("WORD: " + t.tokenText());
                            break;
                        case Tokenizer.NUMBER:
                            System.out.println("NUMBER: " + t.tokenText());
                            break;
                        case Tokenizer.SPACE:
                            System.out.println("SPACE"); break;
                        default:
                            // Any other token type is printed as a character
                            System.out.println((char)t.tokenType());
                        }
                        out.print(t.tokenText());  // Copy token to the file
                    }
                    // PrintWriter never throws IOException itself, so ask it
                    // explicitly whether any write failed.
                    if (out.checkError()) {
                        throw new IOException("Error writing " + copyname);
                    }
                } finally {
                    out.close();
                }
            } finally {
                in.close();
            }
        }
    }
}