File | Doc | Category | Size | Date | Package
lexer.java | API Doc | Java SE 5 API | 17621 | Fri Aug 26 14:54:54 BST 2005 | com.sun.java_cup.internal

lexer

public class lexer extends Object
This class implements a small scanner (aka lexical analyzer or lexer) for the JavaCup specification. This scanner reads characters from standard input (System.in) and returns integers corresponding to the terminal number of the next Symbol. Once end of input is reached the EOF Symbol is returned on every subsequent call.

Symbols currently returned include:

Symbol Constant Returned Symbol Constant Returned
------ ----------------- ------ -----------------
"package" PACKAGE "import" IMPORT
"code" CODE "action" ACTION
"parser" PARSER "terminal" TERMINAL
"non" NON "init" INIT
"scan" SCAN "with" WITH
"start" START "precedence" PRECEDENCE
"left" LEFT "right" RIGHT
"nonassoc" NONASSOC "%prec" PERCENT_PREC
[ LBRACK ] RBRACK
; SEMI
, COMMA * STAR
. DOT : COLON
::= COLON_COLON_EQUALS | BAR
identifier ID {:...:} CODE_STRING
"nonterminal" NONTERMINAL
All symbol constants are defined in sym.java which is generated by JavaCup from parser.cup.

In addition to the scanner proper (called first via init() then with next_token() to get each Symbol) this class provides simple error and warning routines and keeps a count of errors and warnings that is publicly accessible.

This class is "static" (i.e., it has only static members and methods).

version
last updated: 7/3/96
author
Frank Flannery

Fields Summary
protected static int
next_char
First character of lookahead.
protected static int
next_char2
Second character of lookahead.
protected static int
next_char3
Third character of lookahead.
protected static int
next_char4
Fourth character of lookahead.
protected static final int
EOF_CHAR
EOF constant.
protected static Hashtable
keywords
Table of keywords. Keywords are initially treated as identifiers. Just before they are returned we look them up in this table to see if they match one of the keywords. The string of the name is the key here, which indexes Integer objects holding the symbol number.
protected static Hashtable
char_symbols
Table of single character symbols. For ease of implementation, we store all unambiguous single character Symbols in this table of Integer objects keyed by Integer objects with the numerical value of the appropriate char (currently Character objects have a bug which precludes their use in tables).
protected static int
current_line
Current line number for use in error messages.
protected static int
current_position
Character position in current line.
protected static int
absolute_position
Absolute character position in the input (incremented on every advance and never reset at a line break).
public static int
error_count
Count of total errors detected so far.
public static int
warning_count
Count of warnings issued so far
Constructors Summary
private lexer()
The only constructor is private, so no instances can be created.

 
Methods Summary
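protected static void advance()
Advance the scanner one character in the input stream. This shifts the lookahead window down by one (next_char2 to next_char, next_char3 to next_char2, next_char4 to next_char3) and then reads a new next_char4, unless end of input has already entered the window.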

      /* remember the character being consumed; line tracking below needs it */
      int old_char = next_char;

      /* shift the four-character lookahead window down by one.  Once EOF
         has entered the window every later slot is pinned to EOF and no
         further read from System.in is attempted. */
      next_char = next_char2;
      if (next_char == EOF_CHAR) {
        next_char2 = EOF_CHAR;
        next_char3 = EOF_CHAR;
        next_char4 = EOF_CHAR;
      } else {
        next_char2 = next_char3;
        if (next_char2 == EOF_CHAR) {
          next_char3 = EOF_CHAR;
          next_char4 = EOF_CHAR;
        } else {
          next_char3 = next_char4;
          if (next_char3 == EOF_CHAR) {
            next_char4 = EOF_CHAR;
          } else {
            /* only here is a fresh character actually read */
            next_char4 = System.in.read();
          }
        }
      }

      /* count this */
      absolute_position++;
      current_position++;

      /* the character just consumed ended a line: start counting a new one */
      if (old_char == '\n')
        {
          current_line++;
          current_position = 1;
        }
    
public static com.sun.java_cup.internal.runtime.Symbol debug_next_token()
Debugging version of next_token(). This routine calls the real scanning routine, prints a message on System.out indicating what the Symbol is, then returns it.

      /* let the real scanner produce the Symbol, then trace its terminal
         number on System.out before handing it back unchanged */
      final Symbol token = real_next_token();
      System.out.println("# next_Symbol() => " + token.sym);
      return token;
    
protected static com.sun.java_cup.internal.runtime.Symbol do_code_string()
Swallow up a code string. Code strings begin with "{:" and include all characters up to the first occurrence of ":}" (there is no way to include ":}" inside a code string). The routine returns a CODE_STRING Symbol carrying the collected text, suitable for return by the scanner.

      StringBuffer result = new StringBuffer();

      /* at this point we have lookahead of "{:" -- swallow that */
      advance(); advance();

      /* save chars until we see ":}" */
      while (!(next_char == ':' && next_char2 == '}'))
        {
          /* if we have run off the end issue a message and break out of loop */
          if (next_char == EOF_CHAR)
            {
              emit_error("Specification file ends inside a code string");
              break;
            }

          /* otherwise record the char and move on */
          result.append((char)next_char);
          advance();
        }

      /* advance past the closer and build a return Symbol.  After the EOF
         break above these two calls still run; advance() keeps the lookahead
         at EOF in that case. */
      advance(); advance();
      return new Symbol(sym.CODE_STRING, result.toString());
    
protected static com.sun.java_cup.internal.runtime.Symbol do_id()
Process an identifier. Identifiers begin with a letter or underscore, which is followed by zero or more letters, digits or underscores. This routine returns a Symbol suitable for return by the scanner: the keyword's Symbol when the text matches an entry in the keywords table, otherwise an ID Symbol carrying the text.

      StringBuffer text = new StringBuffer();

      /* next_char holds the first character of the id on entry; keep
         collecting for as long as the lookahead still fits in an id */
      do
        {
          text.append((char)next_char);
          advance();
        }
      while (id_char(next_char));

      /* see whether the collected text is one of the keywords */
      String  name = text.toString();
      Integer keyword_num = (Integer)keywords.get(name);

      /* no keyword of that name: it is an ordinary id with the text attached */
      if (keyword_num == null)
        return new Symbol(sym.ID, name);

      /* keywords are returned as bare Symbols carrying only their number */
      return new Symbol(keyword_num.intValue());
    
public static void emit_error(java.lang.String message)
Emit an error message. The message will be marked with both the current line number and the position in the line. Error messages are printed on standard error (System.err).

param
message the message to print.

      System.err.println("Error at " + current_line + "(" + current_position +
			 "): " + message);
      error_count++;
    
public static void emit_warn(java.lang.String message)
Emit a warning message. The message will be marked with both the current line number and the position in the line. Messages are printed on standard error (System.err).

param
message the message to print.

      System.err.println("Warning at " + current_line + "(" + current_position +
			 "): " + message);
      warning_count++;
    
protected static int find_single_char(int ch)
Try to look up a single character symbol, returns -1 for not found.

param
ch the character in question.

      /* the table is keyed by Integer objects holding the numeric value of
         the character, so wrap the (char-narrowed) value for the lookup */
      Object hit = char_symbols.get(new Integer((char)ch));

      /* -1 signals "not a single character symbol" */
      return (hit == null) ? -1 : ((Integer)hit).intValue();
    
protected static boolean id_char(int ch)
Determine if a character is ok for the middle of an id.

param
ch the character in question.

      return id_start_char(ch) || (ch >= '0" && ch <= '9");
    
protected static boolean id_start_char(int ch)
Determine if a character is ok to start an id.

param
ch the character in question.

      /* allow for % in identifiers.  a hack to allow my
	 %prec in.  Should eventually make lex spec for this 
	 frankf */
      return (ch >= 'a" &&  ch <= 'z") || (ch >= 'A" && ch <= 'Z") || 
	     (ch == '_");

      // later need to deal with non-8-bit chars here
    
public static void init()
Initialize the scanner. This sets up the keywords and char_symbols tables and reads the first four characters of lookahead.


  /*-----------------------------------------------------------*/
  /*--- Static Methods ----------------------------------------*/
  /*-----------------------------------------------------------*/

                            
       
    
      /* set up the keyword table: keyword text -> Integer symbol number */
      keywords.put("package",    new Integer(sym.PACKAGE));
      keywords.put("import",     new Integer(sym.IMPORT));
      keywords.put("code",       new Integer(sym.CODE));
      keywords.put("action",     new Integer(sym.ACTION));
      keywords.put("parser",     new Integer(sym.PARSER));
      keywords.put("terminal",   new Integer(sym.TERMINAL));
      keywords.put("non",        new Integer(sym.NON));
      keywords.put("nonterminal",new Integer(sym.NONTERMINAL));// [CSA]
      keywords.put("init",       new Integer(sym.INIT));
      keywords.put("scan",       new Integer(sym.SCAN));
      keywords.put("with",       new Integer(sym.WITH));
      keywords.put("start",      new Integer(sym.START));
      keywords.put("precedence", new Integer(sym.PRECEDENCE));
      keywords.put("left",       new Integer(sym.LEFT));
      keywords.put("right",      new Integer(sym.RIGHT));
      keywords.put("nonassoc",   new Integer(sym.NONASSOC));

      /* set up the table of single character symbols: Integer holding the
         character's numeric value -> Integer symbol number */
      char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
      char_symbols.put(new Integer(','), new Integer(sym.COMMA));
      char_symbols.put(new Integer('*'), new Integer(sym.STAR));
      char_symbols.put(new Integer('.'), new Integer(sym.DOT));
      char_symbols.put(new Integer('|'), new Integer(sym.BAR));
      char_symbols.put(new Integer('['), new Integer(sym.LBRACK));
      char_symbols.put(new Integer(']'), new Integer(sym.RBRACK));

      /* read four characters of lookahead; as soon as EOF is seen the
         remaining slots are filled with EOF without reading any further */
      next_char = System.in.read();
      if (next_char == EOF_CHAR) {
        next_char2 = EOF_CHAR;
        next_char3 = EOF_CHAR;
        next_char4 = EOF_CHAR;
      } else {
        next_char2 = System.in.read();
        if (next_char2 == EOF_CHAR) {
          next_char3 = EOF_CHAR;
          next_char4 = EOF_CHAR;
        } else {
          next_char3 = System.in.read();
          if (next_char3 == EOF_CHAR) {
            next_char4 = EOF_CHAR;
          } else {
            next_char4 = System.in.read();
          }
        }
      }
    
public static com.sun.java_cup.internal.runtime.Symbol next_token()
Return one Symbol. This is the main external interface to the scanner. It consumes sufficient characters to determine the next input Symbol and returns it. To help with debugging, this routine actually calls real_next_token() which does the work. If you need to debug the parser, this can be changed to call debug_next_token() which prints a debugging message before returning the Symbol.

      return real_next_token();
    
protected static com.sun.java_cup.internal.runtime.Symbol real_next_token()
The actual routine to return one Symbol. This is normally called from next_token(), but for debugging purposes can be called indirectly from debug_next_token().

      int sym_num;

      /* loop until something can be returned; white space, comments and
         unrecognized characters are consumed and the scan starts over */
      for (;;)
        {
          /* look for white space */
          if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
              next_char == '\f' ||  next_char == '\r')
            {
              /* advance past it and try the next character */
              advance();
              continue;
            }

          /* look for a single character symbol */
          sym_num = find_single_char(next_char);
          if (sym_num != -1)
            {
              /* found one -- advance past it and return a Symbol for it */
              advance();
              return new Symbol(sym_num);
            }

          /* look for : or ::= */
          if (next_char == ':')
            {
              /* if we don't have a second ':' return COLON */
              if (next_char2 != ':')
                {
                  advance();
                  return new Symbol(sym.COLON);
                }

              /* move forward and look for the '=' */
              advance();
              if (next_char2 == '=')
                {
                  advance(); advance();
                  return new Symbol(sym.COLON_COLON_EQUALS);
                }
              else
                {
                  /* return just the colon (already consumed); the second
                     ':' is still in next_char for the following call */
                  return new Symbol(sym.COLON);
                }
            }

          /* find a "%prec" string and return it.  otherwise, a '%' was found,
             which has no right being in the specification otherwise */
          if (next_char == '%') {
            advance();
            if ((next_char == 'p') && (next_char2 == 'r') && (next_char3 == 'e') &&
                (next_char4 == 'c')) {
              advance();
              advance();
              advance();
              advance();
              return new Symbol(sym.PERCENT_PREC);
            } else {
              /* the '%' itself is already consumed; scanning falls through
                 to the checks below with the character that followed it */
              emit_error("Found extraneous percent sign");
            }
          }

          /* look for a comment */
          if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
            {
              /* swallow then continue the scan */
              swallow_comment();
              continue;
            }

          /* look for start of code string */
          if (next_char == '{' && next_char2 == ':')
            return do_code_string();

          /* look for an id or keyword */
          if (id_start_char(next_char)) return do_id();

          /* look for EOF */
          if (next_char == EOF_CHAR) return new Symbol(sym.EOF);

          /* if we get here, we have an unrecognized character */
          emit_warn("Unrecognized character '" +
            new Character((char)next_char) + "'(" + next_char +
            ") -- ignored");

          /* advance past it */
          advance();
        }
    
protected static void swallow_comment()
Handle swallowing up a comment. Both old style C and new style C++ comments are handled.

      /* next_char == '/' at this point */

      /* is it a traditional comment */
      if (next_char2 == '*')
        {
          /* swallow the opener */
          advance(); advance();

          /* swallow the comment until end of comment or EOF */
          for (;;)
            {
              /* if its EOF we have an error */
              if (next_char == EOF_CHAR)
                {
                  emit_error("Specification file ends inside a comment");
                  return;
                }

              /* if we can see the closer we are done */
              if (next_char == '*' && next_char2 == '/')
                {
                  advance();
                  advance();
                  return;
                }

              /* otherwise swallow char and move on */
              advance();
            }
        }

      /* is its a new style comment */
      if (next_char2 == '/')
        {
          /* swallow the opener */
          advance(); advance();

          /* swallow to '\n', '\f', or EOF; the terminator itself is left
             in next_char for the caller to deal with */
          while (next_char != '\n' && next_char != '\f' && next_char!=EOF_CHAR)
            advance();

          return;

        }

      /* shouldn't get here, but... if we get here we have an error */
      emit_error("Malformed comment in specification -- ignored");
      advance();