TokenizerImpl.java (Glassfish v2 API)

File	Doc	Category	Size	Date	Package
TokenizerImpl.java	API Doc	Glassfish v2 API	10697	Fri May 04 22:31:06 BST 2007	com.sun.appserv.management.util.misc
TokenizerImpl.java

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 * 
 * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
 * 
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common Development
 * and Distribution License("CDDL") (collectively, the "License").  You
 * may not use this file except in compliance with the License. You can obtain
 * a copy of the License at https://glassfish.dev.java.net/public/CDDL+GPL.html
 * or glassfish/bootstrap/legal/LICENSE.txt.  See the License for the specific
 * language governing permissions and limitations under the License.
 * 
 * When distributing the software, include this License Header Notice in each
 * file and include the License file at glassfish/bootstrap/legal/LICENSE.txt.
 * Sun designates this particular file as subject to the "Classpath" exception
 * as provided by Sun in the GPL Version 2 section of the License file that
 * accompanied this code.  If applicable, add the following below the License
 * Header, with the fields enclosed by brackets [] replaced by your own
 * identifying information: "Portions Copyrighted [year]
 * [name of copyright owner]"
 * 
 * Contributor(s):
 * 
 * If you wish your version of this file to be governed by only the CDDL or
 * only the GPL Version 2, indicate your decision by adding "[Contributor]
 * elects to include this software in this distribution under the [CDDL or GPL
 * Version 2] license."  If you don't indicate a single choice of license, a
 * recipient has the option to distribute your version of this file under
 * either the CDDL, the GPL Version 2 or to extend the choice of license to
 * its licensees as provided above.  However, if you add GPL Version 2 code
 * and therefore, elected the GPL Version 2 license, then the option applies
 * only if the new code is made subject to such option by the copyright
 * holder.
 */
package com.sun.appserv.management.util.misc;

import java.text.StringCharacterIterator;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Arrays;


class IllegalEscapeSequenceException extends TokenizerException
{
    static final long serialVersionUID = 6579038898242625567L;
	public	IllegalEscapeSequenceException( String msg )	{ super( msg ); }
}

final class UnterminatedLiteralStringException extends TokenizerException
{
    static final long serialVersionUID = -1327166469948605347L;
	public	UnterminatedLiteralStringException( String msg )	{ super( msg ); }
}

final class MalformedUnicodeSequenceException extends IllegalEscapeSequenceException
{
    static final long serialVersionUID = 6604956430084180525L;
	public	MalformedUnicodeSequenceException( String msg )	{ super( msg ); }
}

/**
 */
public final class TokenizerImpl implements Tokenizer
{
	final String[]		mTokens;
	
		public
	TokenizerImpl( String	input )
		throws TokenizerException
	{
		this( input, new TokenizerParams() );
	}
	
	private static final char	QUOTE_CHAR	= '\"';
	
		public
	TokenizerImpl(
		String			input,
		TokenizerParams	params )
		throws TokenizerException
	{
		final TokenizerInternal	worker = new TokenizerInternal( input, params );
	
		List<Object>	allTokens	= worker.parseTokens( );

		if ( params.mMultipleDelimsCountAsOne )
		{
			allTokens	= removeMultipleDelims( allTokens );
		}
		
		mTokens	= interpretTokenList( allTokens );
	}
	
		final static List<Object>
	removeMultipleDelims( List<Object> list )
	{
		final List<Object>		resultList	= new ArrayList<Object>();
		
		boolean	lastWasDelim	= false;
		for( final Object value : list )
		{
			if ( value instanceof String )
			{
				resultList.add( value );
				lastWasDelim	= false;
			}
			else if ( ! lastWasDelim )
			{
				// add the delimiter
				resultList.add( value );
				lastWasDelim	= true;
			}
		}
		
		return( resultList );
	}
	
	/**
		Interpret the parsed token list, which consists of a series of strings
		and tokens.  We need to handle the special cases where the list starts
		with a delimiter and/or ends with a delimiter.  Examples:
		
		""	=> {}
		"."	=> { "", "" }
		"..."	=> { "", "", "", "" }
		"x."	=> { "x", "" }
		".x"	=> { "", "x" }
		"y.x"	=> { "y", "x" }
	 */
		static String[]
	interpretTokenList( List<Object> list )
	{
		final List<String>		resultList	= new ArrayList<String>();

		boolean	lastWasDelim	= true;

		for( final Object value : list )
		{
			if ( value instanceof String )
			{
				resultList.add( (String)value );
				lastWasDelim	= false;
			}
			else
			{
				if ( lastWasDelim )
				{
					// this one's a delimiter, and so was the last one
					// insert the implicit empty string
					resultList.add( "" );
				}
				else
				{
					lastWasDelim	= true;
				}
			}
		}
		
		// a trailing delimiter implies an empty string after it
		if ( lastWasDelim && list.size() != 0 )
		{
			resultList.add( "" );
		}
		
		return( (String[])resultList.toArray( new String[ resultList.size() ] ) );
	}
	
		public String []
	getTokens()
	{
		return( mTokens );
	}
}



final class TokenizerInternal
{
	final String					mInput;
	final TokenizerParams			mParams;
	final StringCharacterIterator	mIter;
	
	// a distinct object used to denote a delimiter
	private static final class Delim
	{
		private Delim()	{}
		public static Delim	getInstance()	{ return( new Delim() ); }
		public String	toString() { return( "<DELIM>" ); }
	}
	final static Delim	DELIM	= Delim.getInstance();
	
		
	TokenizerInternal(
		String			input,
		TokenizerParams	params )
	{
		mInput			= input;
		mParams			= params;
		mIter		= new StringCharacterIterator( input );
	}
	
		private static boolean
	isSpecialEscapeChar( char theChar )
	{
		// carriage return or newline
		return( theChar == 'n' || theChar == 'r' || theChar == 't' ||theChar == QUOTE_CHAR );
	}
	
		private boolean
	isCallerProvidedEscapableChar( char theChar )
	{
		return( mParams.mEscapableChars.indexOf( theChar ) >= 0 ||
			theChar == mParams.mEscapeChar );
	}
	
		private boolean
	isEscapableChar( char theChar )
	{
		return( isCallerProvidedEscapableChar( theChar ) || isSpecialEscapeChar( theChar ) );
	}
	
		private boolean
	isDelim( String delims, char theChar )
	{
		return( delims.indexOf( theChar ) >= 0 || theChar == mIter.DONE );
	}
	
		private static boolean
	isDigit( char theChar )
	{
		return( (theChar >= '0' && theChar <= '9') );
	}
	

		private static boolean
	isHexDigit( char theChar )
	{
		return( isDigit( theChar ) || (theChar >= 'a' && theChar <= 'f') || isUpper( theChar ) );
	}
	
		private static boolean
	isUpper( char c )
	{
		return( (c >= 'A' && c <= 'F') );
	}
	
		private boolean
	hasMoreChars()
	{
		return( mIter.current() != mIter.DONE );
	}
	
		private int
	getIndex()
	{
		return( mIter.getIndex() );
	}
	
		private char
	setIndex( int index )
	{
		return( mIter.setIndex( index ) );
	}

		private char
	nextChar()
	{
		final char	theChar	= mIter.current();
		mIter.next();
		
		return( theChar );
	}
	
	private static final char	QUOTE_CHAR	= '\"';
	private static final char	TAB_CHAR	= '\t';
	
		private char
	decodeUnicodeSequence()
		throws MalformedUnicodeSequenceException
	{
		int		value	= 0;
		
		try
		{
			for( int i = 0; i < 4; ++i )
			{
				value	= (value << 4 ) | hexValue( nextChar() );
			}
		}
		catch( Exception e )
		{
			throw new MalformedUnicodeSequenceException( "" );
		}
		
		return( (char)value );
	}
	
		private static int
	hexValue( char c )
	{
		if ( ! isHexDigit( c ) )
		{
			throw new IllegalArgumentException();
		}
		
		int	value	= 0;

		if ( isDigit( c ) )
		{
			value	= (int)c - (int)'0';
		}
		else if ( isUpper( c ) )
		{
			value	= (int)c - (int)'A';
		}
		else
		{
			value	= (int)c - (int)'a';
		}
		return( value );
	}
	
		private char
	getEscapedChar( final char inputChar )
		throws MalformedUnicodeSequenceException,IllegalEscapeSequenceException
	{
		char	outChar	= 0;
		
		if ( isCallerProvidedEscapableChar( inputChar ) )
		{
			outChar	= inputChar;
		}
		else
		{
			switch( inputChar )
			{
				default:	throw new IllegalEscapeSequenceException( "" + inputChar );
				case 'n':	outChar	= '\n';		break;
				case 'r':	outChar	= '\r';		break;
				case 't':	outChar	= '\t';		break;
				case QUOTE_CHAR:	outChar	= QUOTE_CHAR;	break;
				case 'u':	outChar	= decodeUnicodeSequence();	break;
			}
		}
		
		return( outChar );
	}
	
	
		private String
	processEscapeSequence()
	{
		// index of the character following the escape character
		String	s	= null;
		
		final char	theChar	= nextChar();
		final int	continuePos	= mIter.getIndex();
		try
		{
			s	= "" + getEscapedChar( theChar );
		}
		catch( TokenizerException e )
		{
			// emit the escape character and the character following it]
			// literally, then proceed.
			s	= mParams.mEscapeChar + "" + theChar;
			mIter.setIndex( continuePos );
		}
		
		return( s );
	}
	
		ArrayList<Object>
	parseTokens(   )
		throws UnterminatedLiteralStringException,
			MalformedUnicodeSequenceException, IllegalEscapeSequenceException
	{
		final StringBuffer	tok	= new StringBuffer();
		final ArrayList<Object>		tokens	= new ArrayList<Object>();
		boolean				insideStringLiteral	= false;
		
		/**
			Escape sequences are always processed regardless of whether we're inside a
			quoted string or not.  A quote string really only alters whether delimiters
			are treated as literal characters, or not.
		 */
		while ( hasMoreChars()  )
		{
			final char	theChar	= nextChar();
			
			if ( theChar == mParams.mEscapeChar )
			{
				if ( mParams.mEmitInvalidEscapeSequencesLiterally )
				{
					tok.append( processEscapeSequence() );
				}
				else
				{
					tok.append( getEscapedChar( nextChar() ) );
				}
			}
			else if ( theChar == Tokenizer.LITERAL_STRING_DELIM )
			{
				// special cases of "", """", """""", etc require forcing an empty string out
				// these case have no delimiter or regular characters to cause a string to
				// be emitted
				if ( insideStringLiteral && tok.length() == 0 && tokens.size() == 0)
				{
					tokens.add( "" );
				}
				
				insideStringLiteral	= ! insideStringLiteral;
			}
			else if ( insideStringLiteral )
			{
				tok.append( theChar );
			}
			else if ( isDelim( mParams.mDelimiters, theChar ) )
			{
				// we've hit a delimiter...if characters have accumulated, spit them out
				// then spit out the delimiter token.
				if ( tok.length() != 0 )
				{
					tokens.add( tok.toString() );
					tok.setLength( 0 );
				}
				tokens.add( DELIM );
			}
			else
			{
				tok.append( theChar );
			}
		}
		
		if ( tok.length() != 0 )
		{
			tokens.add( tok.toString() );
		}
		
		if ( insideStringLiteral )
		{
			throw new UnterminatedLiteralStringException( tok.toString() );
		}
		
		return( tokens );
	}
}