package com.sun.org.apache.regexp.internal;
/*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*
*/
import java.util.Vector;
/**
* RE is an efficient, lightweight regular expression evaluator/matcher class.
* Regular expressions are pattern descriptions which enable sophisticated matching of
* strings. In addition to being able to match a string against a pattern, you
* can also extract parts of the match. This is especially useful in text parsing!
* Details on the syntax of regular expression patterns are given below.
*
* <p>
*
* To compile a regular expression (RE), you can simply construct an RE matcher
* object from the string specification of the pattern, like this:
*
* <pre>
*
* RE r = new RE("a*b");
*
* </pre>
*
* <p>
*
* Once you have done this, you can call either of the RE.match methods to
* perform matching on a String. For example:
*
* <pre>
*
* boolean matched = r.match("aaaab");
*
* </pre>
*
* will cause the boolean matched to be set to true because the
* pattern "a*b" matches the string "aaaab".
*
* <p>
* If you were interested in the <i>number</i> of a's which matched the first
* part of our example expression, you could change the expression to
* "(a*)b". Then when you compiled the expression and matched it against
* something like "xaaaab", you would get results like this:
*
* <pre>
*
* RE r = new RE("(a*)b"); // Compile expression
* boolean matched = r.match("xaaaab"); // Match against "xaaaab"
*
* <br>
*
* String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
* String insideParens = r.getParen(1); // insideParens will be 'aaaa'
*
* <br>
*
* int startWholeExpr = getParenStart(0); // startWholeExpr will be index 1
* int endWholeExpr = getParenEnd(0); // endWholeExpr will be index 6
* int lenWholeExpr = getParenLength(0); // lenWholeExpr will be 5
*
* <br>
*
* int startInside = getParenStart(1); // startInside will be index 1
* int endInside = getParenEnd(1); // endInside will be index 5
* int lenInside = getParenLength(1); // lenInside will be 4
*
* </pre>
*
* You can also refer to the contents of a parenthesized expression within
* a regular expression itself. This is called a 'backreference'. The first
* backreference in a regular expression is denoted by \1, the second by \2
* and so on. So the expression:
*
* <pre>
*
* ([0-9]+)=\1
*
* </pre>
*
* will match any string of the form n=n (like 0=0 or 2=2).
*
* <p>
*
* The full regular expression syntax accepted by RE is described here:
*
* <pre>
*
* <br>
*
* <b><font face=times roman>Characters</font></b>
*
* <br>
*
* <i>unicodeChar</i> Matches any identical unicode character
* \ Used to quote a meta-character (like '*')
* \\ Matches a single '\' character
* \0nnn Matches a given octal character
* \xhh Matches a given 8-bit hexadecimal character
* \\uhhhh Matches a given 16-bit hexadecimal character
* \t Matches an ASCII tab character
* \n Matches an ASCII newline character
* \r Matches an ASCII return character
* \f Matches an ASCII form feed character
*
* <br>
*
* <b><font face=times roman>Character Classes</font></b>
*
* <br>
*
* [abc] Simple character class
* [a-zA-Z] Character class with ranges
* [^abc] Negated character class
*
* <br>
*
* <b><font face=times roman>Standard POSIX Character Classes</font></b>
*
* <br>
*
* [:alnum:] Alphanumeric characters.
* [:alpha:] Alphabetic characters.
* [:blank:] Space and tab characters.
* [:cntrl:] Control characters.
* [:digit:] Numeric characters.
* [:graph:] Characters that are printable and are also visible. (A space is printable, but not visible, while an `a' is both.)
* [:lower:] Lower-case alphabetic characters.
* [:print:] Printable characters (characters that are not control characters.)
* [:punct:] Punctuation characters (characters that are not letter, digits, control characters, or space characters).
* [:space:] Space characters (such as space, tab, and formfeed, to name a few).
* [:upper:] Upper-case alphabetic characters.
* [:xdigit:] Characters that are hexadecimal digits.
*
* <br>
*
* <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
*
* <br>
*
* [:javastart:] Start of a Java identifier
* [:javapart:] Part of a Java identifier
*
* <br>
*
* <b><font face=times roman>Predefined Classes</font></b>
*
* <br>
*
* . Matches any character other than newline
* \w Matches a "word" character (alphanumeric plus "_")
* \W Matches a non-word character
* \s Matches a whitespace character
* \S Matches a non-whitespace character
* \d Matches a digit character
* \D Matches a non-digit character
*
* <br>
*
* <b><font face=times roman>Boundary Matchers</font></b>
*
* <br>
*
* ^ Matches only at the beginning of a line
* $ Matches only at the end of a line
* \b Matches only at a word boundary
* \B Matches only at a non-word boundary
*
* <br>
*
* <b><font face=times roman>Greedy Closures</font></b>
*
* <br>
*
* A* Matches A 0 or more times (greedy)
* A+ Matches A 1 or more times (greedy)
* A? Matches A 1 or 0 times (greedy)
* A{n} Matches A exactly n times (greedy)
* A{n,} Matches A at least n times (greedy)
* A{n,m} Matches A at least n but not more than m times (greedy)
*
* <br>
*
* <b><font face=times roman>Reluctant Closures</font></b>
*
* <br>
*
* A*? Matches A 0 or more times (reluctant)
* A+? Matches A 1 or more times (reluctant)
* A?? Matches A 0 or 1 times (reluctant)
*
* <br>
*
* <b><font face=times roman>Logical Operators</font></b>
*
* <br>
*
* AB Matches A followed by B
* A|B Matches either A or B
* (A) Used for subexpression grouping
*
* <br>
*
* <b><font face=times roman>Backreferences</font></b>
*
* <br>
*
* \1 Backreference to 1st parenthesized subexpression
* \2 Backreference to 2nd parenthesized subexpression
* \3 Backreference to 3rd parenthesized subexpression
* \4 Backreference to 4th parenthesized subexpression
* \5 Backreference to 5th parenthesized subexpression
* \6 Backreference to 6th parenthesized subexpression
* \7 Backreference to 7th parenthesized subexpression
* \8 Backreference to 8th parenthesized subexpression
* \9 Backreference to 9th parenthesized subexpression
*
* <br>
*
* </pre>
*
* <p>
*
* All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that they
* match as many elements of the string as possible without causing the overall
* match to fail. If you want a closure to be reluctant (non-greedy), you can
* simply follow it with a '?'. A reluctant closure will match as few elements
* of the string as possible when finding matches. {m,n} closures don't currently
* support reluctancy.
*
* <p>
*
* RE runs programs compiled by the RECompiler class. But the RE matcher class
* does not include the actual regular expression compiler for reasons of
* efficiency. In fact, if you want to pre-compile one or more regular expressions,
* the 'recompile' class can be invoked from the command line to produce compiled
* output like this:
*
* <pre>
*
* // Pre-compiled regular expression "a*b"
* char[] re1Instructions =
* {
* 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
* 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
* 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
* 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
* 0x0000,
* };
*
* <br>
*
* REProgram re1 = new REProgram(re1Instructions);
*
* </pre>
*
* You can then construct a regular expression matcher (RE) object from the pre-compiled
* expression re1 and thus avoid the overhead of compiling the expression at runtime.
* If you require more dynamic regular expressions, you can construct a single RECompiler
* object and re-use it to compile each expression. Similarly, you can change the
* program run by a given matcher object at any time. However, RE and RECompiler are
* not threadsafe (for efficiency reasons, and because requiring thread safety in this
* class is deemed to be a rare requirement), so you will need to construct a separate
* compiler or matcher object for each thread (unless you do thread synchronization
* yourself).
*
* </pre>
* <br><p><br>
*
* <font color=red>
* <i>ISSUES:</i>
*
* <ul>
* <li>com.weusours.util.re is not currently compatible with all standard POSIX regcomp flags
* <li>com.weusours.util.re does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale issue)
* <li>com.weusours.util.re does not support nested POSIX character classes (definitely should, but not completely trivial)
* <li>com.weusours.util.re Does not support POSIX character collation concepts ([.foo.] syntax) (I18N/locale issue)
* <li>Should there be different matching styles (simple, POSIX, Perl etc?)
* <li>Should RE support character iterators (for backwards RE matching!)?
* <li>Should RE support reluctant {m,n} closures (does anyone care)?
* <li>Not *all* possibilities are considered for greediness when backreferences
* are involved (as POSIX suggests should be the case). The POSIX RE
* "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
* of acdacaa where \1 is "a". This is not the case in this RE package,
* and actually Perl doesn't go to this extent either! Until someone
* actually complains about this, I'm not sure it's worth "fixing".
* If it ever is fixed, test #137 in RETest.txt should be updated.
* </ul>
*
* </font>
*
* @see recompile
* @see RECompiler
*
* @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
* @version $Id: RE.java,v 1.6 2000/08/22 17:19:38 jon Exp $
*/
public class RE
{
/**
* Specifies normal, case-sensitive matching behaviour.
*/
public static final int MATCH_NORMAL = 0x0000;
/**
* Flag to indicate that matching should be case-independent (folded)
*/
public static final int MATCH_CASEINDEPENDENT = 0x0001;
/**
* Newlines should match as BOL/EOL (^ and $)
*/
public static final int MATCH_MULTILINE = 0x0002;
/**
* Consider all input a single body of text - newlines are matched by .
*/
public static final int MATCH_SINGLELINE = 0x0004;
/************************************************
* *
* The format of a node in a program is: *
* *
* [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
* *
* char OPCODE - instruction *
* char OPDATA - modifying data *
* char OPNEXT - next node (relative offset) *
* *
************************************************/
// Opcode Char Opdata/Operand Meaning
// ---------- ---------- --------------- --------------------------------------------------
static final char OP_END = 'E'; // end of program
static final char OP_BOL = '^'; // match only if at beginning of line
static final char OP_EOL = '$'; // match only if at end of line
static final char OP_ANY = '.'; // match any single character except newline
static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
static final char OP_BRANCH = '|'; // node match this alternative or the next one
static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
static final char OP_STAR = '*'; // node kleene closure
static final char OP_PLUS = '+'; // node positive closure
static final char OP_MAYBE = '?'; // node optional closure
static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
static final char OP_OPEN = '('; // number nth opening paren
static final char OP_CLOSE = ')'; // number nth closing paren
static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
// Escape codes
static final char E_ALNUM = 'w'; // Alphanumeric
static final char E_NALNUM = 'W'; // Non-alphanumeric
static final char E_BOUND = 'b'; // Word boundary
static final char E_NBOUND = 'B'; // Non-word boundary
static final char E_SPACE = 's'; // Whitespace
static final char E_NSPACE = 'S'; // Non-whitespace
static final char E_DIGIT = 'd'; // Digit
static final char E_NDIGIT = 'D'; // Non-digit
// Posix character classes
static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
static final char POSIX_CLASS_BLANK = 'b'; // Blanks
static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
static final char POSIX_CLASS_DIGIT = 'd'; // Digits
static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
static final char POSIX_CLASS_SPACE = 's'; // Spaces
static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
// Limits
static final int maxNode = 65536; // Maximum number of nodes in a program
static final int maxParen = 16; // Number of paren pairs (only 9 can be backrefs)
// Node layout constants
static final int offsetOpcode = 0; // Opcode offset (first character)
static final int offsetOpdata = 1; // Opdata offset (second char)
static final int offsetNext = 2; // Next index offset (third char)
static final int nodeSize = 3; // Node size (in chars)
/** Line Separator */
static final String NEWLINE = System.getProperty("line.separator");
// State of current program
REProgram program; // Compiled regular expression 'program'
CharacterIterator search; // The string being matched against
int idx; // Current index in string being searched
int matchFlags; // Match behaviour flags
// Parenthesized subexpressions
int parenCount; // Number of subexpressions matched (num open parens + 1)
int start0; // Cache of start[0]
int end0; // Cache of start[0]
int start1; // Cache of start[1]
int end1; // Cache of start[1]
int start2; // Cache of start[2]
int end2; // Cache of start[2]
int[] startn; // Lazy-alloced array of sub-expression starts
int[] endn; // Lazy-alloced array of sub-expression ends
// Backreferences
int[] startBackref; // Lazy-alloced array of backref starts
int[] endBackref; // Lazy-alloced array of backref ends
/**
* Constructs a regular expression matcher from a String by compiling it
* using a new instance of RECompiler. If you will be compiling many
* expressions, you may prefer to use a single RECompiler object instead.
* @param pattern The regular expression pattern to compile.
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
* @see RECompiler
* @see recompile
*/
public RE(String pattern) throws RESyntaxException
{
this(pattern, MATCH_NORMAL);
}
/**
* Constructs a regular expression matcher from a String by compiling it
* using a new instance of RECompiler. If you will be compiling many
* expressions, you may prefer to use a single RECompiler object instead.
* @param pattern The regular expression pattern to compile.
* @param matchFlags The matching style
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
* @see RECompiler
* @see recompile
*/
public RE(String pattern, int matchFlags) throws RESyntaxException
{
this(new RECompiler().compile(pattern));
setMatchFlags(matchFlags);
}
/**
* Construct a matcher for a pre-compiled regular expression from program
* (bytecode) data. Permits special flags to be passed in to modify matching
* behaviour.
* @param program Compiled regular expression program (see RECompiler and/or recompile)
* @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
*
* <pre>
*
* MATCH_NORMAL // Normal (case-sensitive) matching
* MATCH_CASEINDEPENDENT // Case folded comparisons
* MATCH_MULTILINE // Newline matches as BOL/EOL
*
* </pre>
*
* @see RECompiler
* @see REProgram
* @see recompile
*/
public RE(REProgram program, int matchFlags)
{
setProgram(program);
setMatchFlags(matchFlags);
}
/**
* Construct a matcher for a pre-compiled regular expression from program
* (bytecode) data.
* @param program Compiled regular expression program
* @see RECompiler
* @see recompile
*/
public RE(REProgram program)
{
this(program, MATCH_NORMAL);
}
/**
* Constructs a regular expression matcher with no initial program.
* This is likely to be an uncommon practice, but is still supported.
*/
public RE()
{
this((REProgram)null, MATCH_NORMAL);
}
/**
* Converts a 'simplified' regular expression to a full regular expression
* @param pattern The pattern to convert
* @return The full regular expression
*/
public static String simplePatternToFullRegularExpression(String pattern)
{
StringBuffer buf = new StringBuffer();
for (int i = 0; i < pattern.length(); i++)
{
char c = pattern.charAt(i);
switch (c)
{
case '*':
buf.append(".*");
break;
case '.':
case '[':
case ']':
case '\\':
case '+':
case '?':
case '{':
case '}':
case '$':
case '^':
case '|':
case '(':
case ')':
buf.append('\\');
default:
buf.append(c);
break;
}
}
return buf.toString();
}
/**
* Sets match behaviour flags which alter the way RE does matching.
* @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
*
* <pre>
*
* MATCH_NORMAL // Normal (case-sensitive) matching
* MATCH_CASEINDEPENDENT // Case folded comparisons
* MATCH_MULTILINE // Newline matches as BOL/EOL
*
* </pre>
*
*/
public void setMatchFlags(int matchFlags)
{
this.matchFlags = matchFlags;
}
/**
* Returns the current match behaviour flags.
* @return Current match behaviour flags (RE.MATCH_*).
*
* <pre>
*
* MATCH_NORMAL // Normal (case-sensitive) matching
* MATCH_CASEINDEPENDENT // Case folded comparisons
* MATCH_MULTILINE // Newline matches as BOL/EOL
*
* </pre>
*
* @see #setMatchFlags
*
*/
public int getMatchFlags()
{
return matchFlags;
}
/**
* Sets the current regular expression program used by this matcher object.
* @param program Regular expression program compiled by RECompiler.
* @see RECompiler
* @see REProgram
* @see recompile
*/
public void setProgram(REProgram program)
{
this.program = program;
}
/**
* Returns the current regular expression program in use by this matcher object.
* @return Regular expression program
* @see #setProgram
*/
public REProgram getProgram()
{
return program;
}
/**
* Returns the number of parenthesized subexpressions available after a successful match.
* @return Number of available parenthesized subexpressions
*/
public int getParenCount()
{
return parenCount;
}
/**
* Gets the contents of a parenthesized subexpression after a successful match.
* @param which Nesting level of subexpression
* @return String
*/
public String getParen(int which)
{
int start;
if (which < parenCount && (start = getParenStart(which)) >= 0)
{
return search.substring(start, getParenEnd(which));
}
return null;
}
/**
* Returns the start index of a given paren level.
* @param which Nesting level of subexpression
* @return String index
*/
public final int getParenStart(int which)
{
if (which < parenCount)
{
switch (which)
{
case 0:
return start0;
case 1:
return start1;
case 2:
return start2;
default:
if (startn == null)
{
allocParens();
}
return startn[which];
}
}
return -1;
}
/**
* Returns the end index of a given paren level.
* @param which Nesting level of subexpression
* @return String index
*/
public final int getParenEnd(int which)
{
if (which < parenCount)
{
switch (which)
{
case 0:
return end0;
case 1:
return end1;
case 2:
return end2;
default:
if (endn == null)
{
allocParens();
}
return endn[which];
}
}
return -1;
}
/**
* Returns the length of a given paren level.
* @param which Nesting level of subexpression
* @return Number of characters in the parenthesized subexpression
*/
public final int getParenLength(int which)
{
if (which < parenCount)
{
return getParenEnd(which) - getParenStart(which);
}
return -1;
}
/**
* Sets the start of a paren level
* @param which Which paren level
* @param i Index in input array
*/
protected final void setParenStart(int which, int i)
{
if (which < parenCount)
{
switch (which)
{
case 0:
start0 = i;
break;
case 1:
start1 = i;
break;
case 2:
start2 = i;
break;
default:
if (startn == null)
{
allocParens();
}
startn[which] = i;
break;
}
}
}
/**
* Sets the end of a paren level
* @param which Which paren level
* @param i Index in input array
*/
protected final void setParenEnd(int which, int i)
{
if (which < parenCount)
{
switch (which)
{
case 0:
end0 = i;
break;
case 1:
end1 = i;
break;
case 2:
end2 = i;
break;
default:
if (endn == null)
{
allocParens();
}
endn[which] = i;
break;
}
}
}
/**
* Throws an Error representing an internal error condition probably resulting
* from a bug in the regular expression compiler (or possibly data corruption).
* In practice, this should be very rare.
* @param s Error description
*/
protected void internalError(String s) throws Error
{
throw new Error("RE internal error: " + s);
}
/**
* Performs lazy allocation of subexpression arrays
*/
private final void allocParens()
{
// Allocate arrays for subexpressions
startn = new int[maxParen];
endn = new int[maxParen];
// Set sub-expression pointers to invalid values
for (int i = 0; i < maxParen; i++)
{
startn[i] = -1;
endn[i] = -1;
}
}
/**
* Try to match a string against a subset of nodes in the program
* @param firstNode Node to start at in program
* @param lastNode Last valid node (used for matching a subexpression without
* matching the rest of the program as well).
* @param idxStart Starting position in character array
* @return Final input array index if match succeeded. -1 if not.
*/
protected int matchNodes(int firstNode, int lastNode, int idxStart)
{
// Our current place in the string
int idx = idxStart;
// Loop while node is valid
int next, opcode, opdata;
int idxNew;
char[] instruction = program.instruction;
for (int node = firstNode; node < lastNode; )
{
opcode = instruction[node + offsetOpcode];
next = node + (short)instruction[node + offsetNext];
opdata = instruction[node + offsetOpdata];
switch (opcode)
{
case OP_RELUCTANTMAYBE:
{
int once = 0;
do
{
// Try to match the rest without using the reluctant subexpr
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
{
return idxNew;
}
}
while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
return -1;
}
case OP_RELUCTANTPLUS:
while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
{
// Try to match the rest without using the reluctant subexpr
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
{
return idxNew;
}
}
return -1;
case OP_RELUCTANTSTAR:
do
{
// Try to match the rest without using the reluctant subexpr
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
{
return idxNew;
}
}
while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
return -1;
case OP_OPEN:
// Match subexpression
if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
{
startBackref[opdata] = idx;
}
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
{
// Increase valid paren count
if ((opdata + 1) > parenCount)
{
parenCount = opdata + 1;
}
// Don't set paren if already set later on
if (getParenStart(opdata) == -1)
{
setParenStart(opdata, idx);
}
}
return idxNew;
case OP_CLOSE:
// Done matching subexpression
if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
{
endBackref[opdata] = idx;
}
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
{
// Increase valid paren count
if ((opdata + 1) > parenCount)
{
parenCount = opdata + 1;
}
// Don't set paren if already set later on
if (getParenEnd(opdata) == -1)
{
setParenEnd(opdata, idx);
}
}
return idxNew;
case OP_BACKREF:
{
// Get the start and end of the backref
int s = startBackref[opdata];
int e = endBackref[opdata];
// We don't know the backref yet
if (s == -1 || e == -1)
{
return -1;
}
// The backref is empty size
if (s == e)
{
break;
}
// Get the length of the backref
int l = e - s;
// If there's not enough input left, give up.
if (search.isEnd(idx + l - 1))
{
return -1;
}
// Case fold the backref?
if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
{
// Compare backref to input, case-folding as we go
for (int i = 0; i < l; i++)
{
if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(s + i)))
{
return -1;
}
}
}
else
{
// Compare backref to input
for (int i = 0; i < l; i++)
{
if (search.charAt(idx++) != search.charAt(s + i))
{
return -1;
}
}
}
}
break;
case OP_BOL:
// Fail if we're not at the start of the string
if (idx != 0)
{
// If we're multiline matching, we could still be at the start of a line
if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
{
// If not at start of line, give up
if (idx <= 0 || !isNewline(idx - 1)) {
return -1;
} else {
break;
}
}
return -1;
}
break;
case OP_EOL:
// If we're not at the end of string
if (!search.isEnd(0) && !search.isEnd(idx))
{
// If we're multi-line matching
if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
{
// Give up if we're not at the end of a line
if (! isNewline(idx)) {
return -1;
} else {
break;
}
}
return -1;
}
break;
case OP_ESCAPE:
// Which escape?
switch (opdata)
{
// Word boundary match
case E_NBOUND:
case E_BOUND:
{
char cLast = ((idx == getParenStart(0)) ? '\n' : search.charAt(idx - 1));
char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
{
return -1;
}
}
break;
// Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
case E_ALNUM:
case E_NALNUM:
case E_DIGIT:
case E_NDIGIT:
case E_SPACE:
case E_NSPACE:
// Give up if out of input
if (search.isEnd(idx))
{
return -1;
}
// Switch on escape
switch (opdata)
{
case E_ALNUM:
case E_NALNUM:
if (!(Character.isLetterOrDigit(search.charAt(idx)) == (opdata == E_ALNUM)))
{
return -1;
}
break;
case E_DIGIT:
case E_NDIGIT:
if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT)))
{
return -1;
}
break;
case E_SPACE:
case E_NSPACE:
if (!(Character.isWhitespace(search.charAt(idx)) == (opdata == E_SPACE)))
{
return -1;
}
break;
}
idx++;
break;
default:
internalError("Unrecognized escape '" + opdata + "'");
}
break;
case OP_ANY:
if((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
// Match anything
if(search.isEnd(idx))
{
return -1;
}
idx++;
break;
}
else
{
// Match anything but a newline
if (search.isEnd(idx) || search.charAt(idx++) == '\n')
{
return -1;
}
break;
}
case OP_ATOM:
{
// Match an atom value
if (search.isEnd(idx))
{
return -1;
}
// Get length of atom and starting index
int lenAtom = opdata;
int startAtom = node + nodeSize;
// Give up if not enough input remains to have a match
if (search.isEnd(lenAtom + idx - 1))
{
return -1;
}
// Match atom differently depending on casefolding flag
if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
{
for (int i = 0; i < lenAtom; i++)
{
if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(instruction[startAtom + i]))
{
return -1;
}
}
}
else
{
for (int i = 0; i < lenAtom; i++)
{
if (search.charAt(idx++) != instruction[startAtom + i])
{
return -1;
}
}
}
}
break;
case OP_POSIXCLASS:
{
// Out of input?
if (search.isEnd(idx))
{
return -1;
}
switch (opdata)
{
case POSIX_CLASS_ALNUM:
if (!Character.isLetterOrDigit(search.charAt(idx)))
{
return -1;
}
break;
case POSIX_CLASS_ALPHA:
if (!Character.isLetter(search.charAt(idx)))
{
return -1;
}
break;
case POSIX_CLASS_DIGIT:
if (!Character.isDigit(search.charAt(idx)))
{
return -1;
}
break;
case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
if (!Character.isSpaceChar(search.charAt(idx)))
{
return -1;
}
break;
case POSIX_CLASS_SPACE:
if (!Character.isWhitespace(search.charAt(idx)))
{
return -1;
}
break;
case POSIX_CLASS_CNTRL:
if (Character.getType(search.charAt(idx)) != Character.CONTROL)
{
return -1;
}
break;
case POSIX_CLASS_GRAPH: // JWL - bugbug???
switch (Character.getType(search.charAt(idx)))
{
case Character.MATH_SYMBOL:
case Character.CURRENCY_SYMBOL:
case Character.MODIFIER_SYMBOL:
case Character.OTHER_SYMBOL:
break;
default:
return -1;
}
break;
case POSIX_CLASS_LOWER:
if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
{
return -1;
}
break;
case POSIX_CLASS_UPPER:
if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
{
return -1;
}
break;
case POSIX_CLASS_PRINT:
if (Character.getType(search.charAt(idx)) == Character.CONTROL)
{
return -1;
}
break;
case POSIX_CLASS_PUNCT:
{
int type = Character.getType(search.charAt(idx));
switch(type)
{
case Character.DASH_PUNCTUATION:
case Character.START_PUNCTUATION:
case Character.END_PUNCTUATION:
case Character.CONNECTOR_PUNCTUATION:
case Character.OTHER_PUNCTUATION:
break;
default:
return -1;
}
}
break;
case POSIX_CLASS_XDIGIT: // JWL - bugbug??
{
boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
(search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
(search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
if (!isXDigit)
{
return -1;
}
}
break;
case POSIX_CLASS_JSTART:
if (!Character.isJavaIdentifierStart(search.charAt(idx)))
{
return -1;
}
break;
case POSIX_CLASS_JPART:
if (!Character.isJavaIdentifierPart(search.charAt(idx)))
{
return -1;
}
break;
default:
internalError("Bad posix class");
break;
}
// Matched.
idx++;
}
break;
case OP_ANYOF:
{
// Out of input?
if (search.isEnd(idx))
{
return -1;
}
// Get character to match against character class and maybe casefold
char c = search.charAt(idx);
boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
if (caseFold)
{
c = Character.toLowerCase(c);
}
// Loop through character class checking our match character
int idxRange = node + nodeSize;
int idxEnd = idxRange + (opdata * 2);
boolean match = false;
for (int i = idxRange; i < idxEnd; )
{
// Get start, end and match characters
char s = instruction[i++];
char e = instruction[i++];
// Fold ends of range and match character
if (caseFold)
{
s = Character.toLowerCase(s);
e = Character.toLowerCase(e);
}
// If the match character is in range, break out
if (c >= s && c <= e)
{
match = true;
break;
}
}
// Fail if we didn't match the character class
if (!match)
{
return -1;
}
idx++;
}
break;
case OP_BRANCH:
{
// Check for choices
if (instruction[next + offsetOpcode] != OP_BRANCH)
{
// If there aren't any other choices, just evaluate this branch.
node += nodeSize;
continue;
}
// Try all available branches
short nextBranch;
do
{
// Try matching the branch against the string
if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
{
return idxNew;
}
// Go to next branch (if any)
nextBranch = (short)instruction[node + offsetNext];
node += nextBranch;
}
while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
// Failed to match any branch!
return -1;
}
case OP_NOTHING:
case OP_GOTO:
// Just advance to the next node without doing anything
break;
case OP_END:
// Match has succeeded!
setParenEnd(0, idx);
return idx;
default:
// Corrupt program
internalError("Invalid opcode '" + opcode + "'");
}
// Advance to the next node in the program
node = next;
}
// We "should" never end up here
internalError("Corrupt program");
return -1;
}
/**
* Match the current regular expression program against the current
* input string, starting at index i of the input string. This method
* is only meant for internal use.
* @param i The input string index to start matching at
* @return True if the input matched the expression
*/
protected boolean matchAt(int i)
{
// Initialize start pointer, paren cache and paren count
start0 = -1;
end0 = -1;
start1 = -1;
end1 = -1;
start2 = -1;
end2 = -1;
startn = null;
endn = null;
parenCount = 1;
setParenStart(0, i);
// Allocate backref arrays (unless optimizations indicate otherwise)
if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
{
startBackref = new int[maxParen];
endBackref = new int[maxParen];
}
// Match against string
int idx;
if ((idx = matchNodes(0, maxNode, i)) != -1)
{
setParenEnd(0, idx);
return true;
}
// Didn't match
parenCount = 0;
return false;
}
/**
* Matches the current regular expression program against a character array,
* starting at a given index.
* @param search String to match against
* @param i Index to start searching at
* @return True if string matched
*/
public boolean match(String search, int i)
{
return match(new StringCharacterIterator(search), i);
}
/**
* Matches the current regular expression program against a character array,
* starting at a given index.
* @param search String to match against
* @param i Index to start searching at
* @return True if string matched
*/
public boolean match(CharacterIterator search, int i)
{
// There is no compiled program to search with!
if (program == null)
{
// This should be uncommon enough to be an error case rather
// than an exception (which would have to be handled everywhere)
internalError("No RE program to run!");
}
// Save string to search
this.search = search;
// Can we optimize the search by looking for a prefix string?
if (program.prefix == null)
{
// Unprefixed matching must try for a match at each character
for ( ;! search.isEnd(i - 1); i++)
{
// Try a match at index i
if (matchAt(i))
{
return true;
}
}
return false;
}
else
{
// Prefix-anchored matching is possible
boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
char[] prefix = program.prefix;
for ( ;! search.isEnd(i + prefix.length - 1); i++)
{
// If the first character of the prefix matches
boolean match = false;
if (caseIndependent)
match = Character.toLowerCase(search.charAt(i)) == Character.toLowerCase(prefix[0]);
else
match = search.charAt(i) == prefix[0];
if (match)
{
// Save first character position
int firstChar = i++;
int k;
for (k = 1; k < prefix.length; )
{
// If there's a mismatch of any character in the prefix, give up
if (caseIndependent)
match = Character.toLowerCase(search.charAt(i++)) == Character.toLowerCase(prefix[k++]);
else
match = search.charAt(i++) == prefix[k++];
if (!match)
{
break;
}
}
// See if the whole prefix string matched
if (k == prefix.length)
{
// We matched the full prefix at firstChar, so try it
if (matchAt(firstChar))
{
return true;
}
}
// Match failed, reset i to continue the search
i = firstChar;
}
}
return false;
}
}
/**
* Matches the current regular expression program against a String.
* @param search String to match against
* @return True if string matched
*/
public boolean match(String search)
{
return match(search, 0);
}
/**
* Splits a string into an array of strings on regular expression boundaries.
* This function works the same way as the Perl function of the same name.
* Given a regular expression of "[ab]+" and a string to split of
* "xyzzyababbayyzabbbab123", the result would be the array of Strings
* "[xyzzy, yyz, 123]".
* @param s String to split on this regular exression
* @return Array of strings
*/
public String[] split(String s)
{
// Create new vector
Vector v = new Vector();
// Start at position 0 and search the whole string
int pos = 0;
int len = s.length();
// Try a match at each position
while (pos < len && match(s, pos))
{
// Get start of match
int start = getParenStart(0);
// Get end of match
int newpos = getParenEnd(0);
// Check if no progress was made
if (newpos == pos)
{
v.addElement(s.substring(pos, start + 1));
newpos++;
}
else
{
v.addElement(s.substring(pos, start));
}
// Move to new position
pos = newpos;
}
// Push remainder if it's not empty
String remainder = s.substring(pos);
if (remainder.length() != 0)
{
v.addElement(remainder);
}
// Return vector as an array of strings
String[] ret = new String[v.size()];
v.copyInto(ret);
return ret;
}
/**
* Flag bit that indicates that subst should replace all occurrences of this
* regular expression.
*/
public static final int REPLACE_ALL = 0x0000;
/**
* Flag bit that indicates that subst should only replace the first occurrence
* of this regular expression.
*/
public static final int REPLACE_FIRSTONLY = 0x0001;
/**
* Substitutes a string for this regular expression in another string.
* This method works like the Perl function of the same name.
* Given a regular expression of "a*b", a String to substituteIn of
* "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
* resulting String returned by subst would be "-foo-garply-wacky-".
* @param substituteIn String to substitute within
* @param substitution String to substitute for all matches of this regular expression.
* @return The string substituteIn with zero or more occurrences of the current
* regular expression replaced with the substitution String (if this regular
* expression object doesn't match at any position, the original String is returned
* unchanged).
*/
public String subst(String substituteIn, String substitution)
{
return subst(substituteIn, substitution, REPLACE_ALL);
}
/**
* Substitutes a string for this regular expression in another string.
* This method works like the Perl function of the same name.
* Given a regular expression of "a*b", a String to substituteIn of
* "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
* resulting String returned by subst would be "-foo-garply-wacky-".
* @param substituteIn String to substitute within
* @param substitution String to substitute for matches of this regular expression
* @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
* flag bit is set, only the first occurrence of this regular expression is replaced.
* If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
* replaced.
* @return The string substituteIn with zero or more occurrences of the current
* regular expression replaced with the substitution String (if this regular
* expression object doesn't match at any position, the original String is returned
* unchanged).
*/
public String subst(String substituteIn, String substitution, int flags)
{
// String to return
StringBuffer ret = new StringBuffer();
// Start at position 0 and search the whole string
int pos = 0;
int len = substituteIn.length();
// Try a match at each position
while (pos < len && match(substituteIn, pos))
{
// Append string before match
ret.append(substituteIn.substring(pos, getParenStart(0)));
// Append substitution
ret.append(substitution);
// Move forward, skipping past match
int newpos = getParenEnd(0);
// We always want to make progress!
if (newpos == pos)
{
newpos++;
}
// Try new position
pos = newpos;
// Break out if we're only supposed to replace one occurrence
if ((flags & REPLACE_FIRSTONLY) != 0)
{
break;
}
}
// If there's remaining input, append it
if (pos < len)
{
ret.append(substituteIn.substring(pos));
}
// Return string buffer as string
return ret.toString();
}
/**
* Returns an array of Strings, whose toString representation matches a regular
* expression. This method works like the Perl function of the same name. Given
* a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
* aaaab], the array of Strings returned by grep would be [aab, aaaab].
* @param search Array of Objects to search
* @return Array of Objects whose toString value matches this regular expression.
*/
public String[] grep(Object[] search)
{
// Create new vector to hold return items
Vector v = new Vector();
// Traverse array of objects
for (int i = 0; i < search.length; i++)
{
// Get next object as a string
String s = search[i].toString();
// If it matches this regexp, add it to the list
if (match(s))
{
v.addElement(s);
}
}
// Return vector as an array of strings
String[] ret = new String[v.size()];
v.copyInto(ret);
return ret;
}
/** @return true if at the i-th position in the 'search' a newline ends */
private boolean isNewline(int i) {
if (i < NEWLINE.length() - 1) {
return false;
}
if (search.charAt(i) == '\n') {
return true;
}
for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
if (NEWLINE.charAt(j) != search.charAt(i)) {
return false;
}
}
return true;
}
}
|