File | Doc | Category | Size | Date | Package
RegexParser.java | API Doc | Apache Xerces 3.0.1 | 43617 | Fri Sep 14 20:33:54 BST 2007 | org.apache.xerces.impl.xpath.regex

RegexParser

public class RegexParser extends Object
A Regular Expression Parser.
xerces.internal
version
$Id: RegexParser.java 469061 2006-10-30 04:16:15Z mrglavas $

Fields Summary
static final int
T_CHAR
static final int
T_EOF
static final int
T_OR
static final int
T_STAR
static final int
T_PLUS
static final int
T_QUESTION
static final int
T_LPAREN
static final int
T_RPAREN
static final int
T_DOT
static final int
T_LBRACKET
static final int
T_BACKSOLIDUS
static final int
T_CARET
static final int
T_DOLLAR
static final int
T_LPAREN2
static final int
T_LOOKAHEAD
static final int
T_NEGATIVELOOKAHEAD
static final int
T_LOOKBEHIND
static final int
T_NEGATIVELOOKBEHIND
static final int
T_INDEPENDENT
static final int
T_SET_OPERATIONS
static final int
T_POSIX_CHARCLASS_START
static final int
T_COMMENT
static final int
T_MODIFIERS
static final int
T_CONDITION
static final int
T_XMLSCHEMA_CC_SUBTRACTION
int
offset
String
regex
int
regexlen
int
options
ResourceBundle
resources
int
chardata
int
nexttoken
protected static final int
S_NORMAL
protected static final int
S_INBRACKETS
protected static final int
S_INXBRACKETS
int
context
int
parennumber
boolean
hasBackReferences
Vector
references
Constructors Summary
public RegexParser()


      
        this.setLocale(Locale.getDefault());
    
/**
 * Creates a parser configured for the supplied locale.
 *
 * @param locale handed unchanged to {@code setLocale}
 */
public RegexParser(Locale locale) {
    super();
    this.setLocale(locale);
}
    
Methods Summary
booleancheckQuestion(int off)

        return off < this.regexlen && this.regex.charAt(off) == '?";
    
intdecodeEscaped()

        if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
        int c = this.chardata;
        switch (c) {
          case 'e":  c = 0x1b;  break; // ESCAPE U+001B
          case 'f":  c = '\f";  break; // FORM FEED U+000C
          case 'n":  c = '\n";  break; // LINE FEED U+000A
          case 'r":  c = '\r";  break; // CRRIAGE RETURN U+000D
          case 't":  c = '\t";  break; // HORIZONTAL TABULATION U+0009
          //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
          case 'x":
            this.next();
            if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
            if (this.chardata == '{") {
                int v1 = 0;
                int uv = 0;
                do {
                    this.next();
                    if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
                    if ((v1 = hexChar(this.chardata)) < 0)
                        break;
                    if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
                    uv = uv*16+v1;
                } while (true);
                if (this.chardata != '}")  throw ex("parser.descape.3", this.offset-1);
                if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
                c = uv;
            } else {
                int v1 = 0;
                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                    throw ex("parser.descape.1", this.offset-1);
                int uv = v1;
                this.next();
                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                    throw ex("parser.descape.1", this.offset-1);
                uv = uv*16+v1;
                c = uv;
            }
            break;

          case 'u":
            int v1 = 0;
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            int uv = v1;
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            uv = uv*16+v1;
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            uv = uv*16+v1;
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            uv = uv*16+v1;
            c = uv;
            break;

          case 'v":
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            uv = v1;
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            uv = uv*16+v1;
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            uv = uv*16+v1;
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            uv = uv*16+v1;
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            uv = uv*16+v1;
            this.next();
            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
                throw ex("parser.descape.1", this.offset-1);
            uv = uv*16+v1;
            if (uv > Token.UTF16_MAX)  throw ex("parser.descappe.4", this.offset-1);
            c = uv;
            break;
          case 'A":
          case 'Z":
          case 'z":
            throw ex("parser.descape.5", this.offset-2);
          default:
        }
        return c;
    
final ParseExceptionex(java.lang.String key, int loc)

        return new ParseException(this.resources.getString(key), loc);
    
TokengetTokenForShorthand(int ch)

        Token tok;
        switch (ch) {
          case 'd":
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("Nd", true) : Token.token_0to9;
            break;
          case 'D":
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("Nd", false) : Token.token_not_0to9;
            break;
          case 'w":
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("IsWord", true) : Token.token_wordchars;
            break;
          case 'W":
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
            break;
          case 's":
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("IsSpace", true) : Token.token_spaces;
            break;
          case 'S":
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
            break;

          default:
            throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
        }
        return tok;
    
private static final inthexChar(int ch)

        if (ch < '0")  return -1;
        if (ch > 'f")  return -1;
        if (ch <= '9")  return ch-'0";
        if (ch < 'A")  return -1;
        if (ch <= 'F")  return ch-'A"+10;
        if (ch < 'a")  return -1;
        return ch-'a"+10;
    
private final booleanisSet(int flag)

        return (this.options & flag) == flag;
    
final voidnext()

        if (this.offset >= this.regexlen) {
            this.chardata = -1;
            this.nexttoken = T_EOF;
            return;
        }

        int ret;
        int ch = this.regex.charAt(this.offset++);
        this.chardata = ch;

        if (this.context == S_INBRACKETS) {
            // In a character class, this.chardata has one character, that is to say,
            // a pair of surrogates is composed and stored to this.chardata.
            switch (ch) {
              case '\\":
                ret = T_BACKSOLIDUS;
                if (this.offset >= this.regexlen)
                    throw ex("parser.next.1", this.offset-1);
                this.chardata = this.regex.charAt(this.offset++);
                break;

              case '-":
                if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[") {
                    this.offset++;
                    ret = T_XMLSCHEMA_CC_SUBTRACTION;
                } else
                    ret = T_CHAR;
                break;

              case '[":
                if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':") {
                    this.offset++;
                    ret = T_POSIX_CHARCLASS_START;
                    break;
                } // Through down
              default:
                if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
                    int low = this.regex.charAt(this.offset);
                    if (REUtil.isLowSurrogate(low)) {
                        this.chardata = REUtil.composeFromSurrogates(ch, low);
                        this.offset ++;
                    }
                }
                ret = T_CHAR;
            }
            this.nexttoken = ret;
            return;
        }

        switch (ch) {
          case '|": ret = T_OR;             break;
          case '*": ret = T_STAR;           break;
          case '+": ret = T_PLUS;           break;
          case '?": ret = T_QUESTION;       break;
          case ')": ret = T_RPAREN;         break;
          case '.": ret = T_DOT;            break;
          case '[": ret = T_LBRACKET;       break;
          case '^":
              if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
                  ret = T_CHAR;
              }
              else {
                  ret = T_CARET;
              }
              break;
          case '$": 
              if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
                  ret = T_CHAR;
              }
              else {
                  ret = T_DOLLAR;
              }
              break;
          case '(":
            ret = T_LPAREN;
            if (this.offset >= this.regexlen)
                break;
            if (this.regex.charAt(this.offset) != '?")
                break;
            if (++this.offset >= this.regexlen)
                throw ex("parser.next.2", this.offset-1);
            ch = this.regex.charAt(this.offset++);
            switch (ch) {
              case ':":  ret = T_LPAREN2;            break;
              case '=":  ret = T_LOOKAHEAD;          break;
              case '!":  ret = T_NEGATIVELOOKAHEAD;  break;
              case '[":  ret = T_SET_OPERATIONS;     break;
              case '>":  ret = T_INDEPENDENT;        break;
              case '<":
                if (this.offset >= this.regexlen)
                    throw ex("parser.next.2", this.offset-3);
                ch = this.regex.charAt(this.offset++);
                if (ch == '=") {
                    ret = T_LOOKBEHIND;
                } else if (ch == '!") {
                    ret = T_NEGATIVELOOKBEHIND;
                } else
                    throw ex("parser.next.3", this.offset-3);
                break;
              case '#":
                while (this.offset < this.regexlen) {
                    ch = this.regex.charAt(this.offset++);
                    if (ch == ')")  break;
                }
                if (ch != ')")
                    throw ex("parser.next.4", this.offset-1);
                ret = T_COMMENT;
                break;
              default:
                if (ch == '-" || 'a" <= ch && ch <= 'z" || 'A" <= ch && ch <= 'Z") {// Options
                    this.offset --;
                    ret = T_MODIFIERS;
                    break;
                } else if (ch == '(") {         // conditional
                    ret = T_CONDITION;          // this.offsets points the next of '('.
                    break;
                }
                throw ex("parser.next.2", this.offset-2);
            }
            break;
            
          case '\\":
            ret = T_BACKSOLIDUS;
            if (this.offset >= this.regexlen)
                throw ex("parser.next.1", this.offset-1);
            this.chardata = this.regex.charAt(this.offset++);
            break;

          default:
            ret = T_CHAR;
        }
        this.nexttoken = ret;
    
synchronized Tokenparse(java.lang.String regex, int options)

        this.options = options;
        this.offset = 0;
        this.setContext(S_NORMAL);
        this.parennumber = 1;
        this.hasBackReferences = false;
        this.regex = regex;
        if (this.isSet(RegularExpression.EXTENDED_COMMENT))
            this.regex = REUtil.stripExtendedComment(this.regex);
        this.regexlen = this.regex.length();


        this.next();
        Token ret = this.parseRegex();
        if (this.offset != this.regexlen)
            throw ex("parser.parse.1", this.offset);
        if (this.references != null) {
            for (int i = 0;  i < this.references.size();  i ++) {
                ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
                if (this.parennumber <= position.refNumber)
                    throw ex("parser.parse.2", position.position);
            }
            this.references.removeAllElements();
        }
        return ret;
    
TokenparseAtom()
atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '(?>' regex ')' char ::= '\\' | '\' [efnrt] | bmp-code | character-1

        int ch = this.read();
        Token tok = null;
        switch (ch) {
          case T_LPAREN:        return this.processParen();
          case T_LPAREN2:       return this.processParen2(); // '(?:'
          case T_CONDITION:     return this.processCondition(); // '(?('
          case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
          case T_INDEPENDENT:   return this.processIndependent();
          case T_DOT:
            this.next();                    // Skips '.'
            tok = Token.token_dot;
            break;

            /**
             * char-class ::= '[' ( '^'? range ','?)+ ']'
             * range ::= '\d' | '\w' | '\s' | category-block | range-char
             *           | range-char '-' range-char
             * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
             * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
             */
          case T_LBRACKET:      return this.parseCharacterClass(true);
          case T_SET_OPERATIONS: return this.parseSetOperations();

          case T_BACKSOLIDUS:
            switch (this.chardata) {
              case 'd":  case 'D":
              case 'w":  case 'W":
              case 's":  case 'S":
                tok = this.getTokenForShorthand(this.chardata);
                this.next();
                return tok;

              case 'e":  case 'f":  case 'n":  case 'r":
              case 't":  case 'u":  case 'v":  case 'x":
                {
                    int ch2 = this.decodeEscaped();
                    if (ch2 < 0x10000) {
                        tok = Token.createChar(ch2);
                    } else {
                        tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
                    }
                }
                break;

              case 'c": return this.processBacksolidus_c();
              case 'C": return this.processBacksolidus_C();
              case 'i": return this.processBacksolidus_i();
              case 'I": return this.processBacksolidus_I();
              case 'g": return this.processBacksolidus_g();
              case 'X": return this.processBacksolidus_X();
              case '1":  case '2":  case '3":  case '4":
              case '5":  case '6":  case '7":  case '8":  case '9":
                return this.processBackreference();

              case 'P":
              case 'p":
                int pstart = this.offset;
                tok = processBacksolidus_pP(this.chardata);
                if (tok == null)  throw this.ex("parser.atom.5", pstart);
                break;

              default:
                tok = Token.createChar(this.chardata);
            }
            this.next();
            break;

          case T_CHAR:
            if (this.chardata == ']" || this.chardata == '{" || this.chardata == '}")
                throw this.ex("parser.atom.4", this.offset-1);
            tok = Token.createChar(this.chardata);
            int high = this.chardata;
            this.next();
            if (REUtil.isHighSurrogate(high)
                && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
                char[] sur = new char[2];
                sur[0] = (char)high;
                sur[1] = (char)this.chardata;
                tok = Token.createParen(Token.createString(new String(sur)), 0);
                this.next();
            }
            break;

          default:
            throw this.ex("parser.atom.4", this.offset-1);
        }
        return tok;
    
protected RangeTokenparseCharacterClass(boolean useNrange)
char-class ::= '[' ( '^'? range ','?)+ ']' range ::= '\d' | '\w' | '\s' | category-block | range-char | range-char '-' range-char range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]

        this.setContext(S_INBRACKETS);
        this.next();                            // '['
        boolean nrange = false;
        RangeToken base = null;
        RangeToken tok;
        if (this.read() == T_CHAR && this.chardata == '^") {
            nrange = true;
            this.next();                        // '^'
            if (useNrange) {
                tok = Token.createNRange();
            } else {
                base = Token.createRange();
                base.addRange(0, Token.UTF16_MAX);
                tok = Token.createRange();
            }
        } else {
            tok = Token.createRange();
        }
        int type;
        boolean firstloop = true;
        while ((type = this.read()) != T_EOF) {
            if (type == T_CHAR && this.chardata == ']" && !firstloop)
                break;
            firstloop = false;
            int c = this.chardata;
            boolean end = false;
            if (type == T_BACKSOLIDUS) {
                switch (c) {
                  case 'd":  case 'D":
                  case 'w":  case 'W":
                  case 's":  case 'S":
                    tok.mergeRanges(this.getTokenForShorthand(c));
                    end = true;
                    break;

                  case 'i":  case 'I":
                  case 'c":  case 'C":
                    c = this.processCIinCharacterClass(tok, c);
                    if (c < 0)  end = true;
                    break;
                    
                  case 'p":
                  case 'P":
                    int pstart = this.offset;
                    RangeToken tok2 = this.processBacksolidus_pP(c);
                    if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
                    tok.mergeRanges(tok2);
                    end = true;
                    break;

                  default:
                    c = this.decodeEscaped();
                } // \ + c
            } // backsolidus
                                                // POSIX Character class such as [:alnum:]
            else if (type == T_POSIX_CHARCLASS_START) {
                int nameend = this.regex.indexOf(':", this.offset);
                if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
                boolean positive = true;
                if (this.regex.charAt(this.offset) == '^") {
                    this.offset ++;
                    positive = false;
                }
                String name = this.regex.substring(this.offset, nameend);
                RangeToken range = Token.getRange(name, positive,
                                                  this.isSet(RegularExpression.XMLSCHEMA_MODE));
                if (range == null)  throw this.ex("parser.cc.3", this.offset);
                tok.mergeRanges(range);
                end = true;
                if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']")
                    throw this.ex("parser.cc.1", nameend);
                this.offset = nameend+2;
            }
            this.next();
            if (!end) {                         // if not shorthands...
                if (this.read() != T_CHAR || this.chardata != '-") { // Here is no '-'.
                    tok.addRange(c, c);
                } else {
                    this.next(); // Skips '-'
                    if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
                    if (type == T_CHAR && this.chardata == ']") {
                        tok.addRange(c, c);
                        tok.addRange('-", '-");
                    } else {
                        int rangeend = this.chardata;
                        if (type == T_BACKSOLIDUS)
                            rangeend = this.decodeEscaped();
                        this.next();
                        tok.addRange(c, rangeend);
                    }
                }
            }
            if (this.isSet(RegularExpression.SPECIAL_COMMA)
                && this.read() == T_CHAR && this.chardata == ',")
                this.next();
        }
        if (this.read() == T_EOF)
            throw this.ex("parser.cc.2", this.offset);
        if (!useNrange && nrange) {
            base.subtractRanges(tok);
            tok = base;
        }
        tok.sortRanges();
        tok.compactRanges();
        //tok.dumpRanges();
        /*
        if (this.isSet(RegularExpression.IGNORE_CASE))
            tok = RangeToken.createCaseInsensitiveToken(tok);
        */
        this.setContext(S_NORMAL);
        this.next();                    // Skips ']'

        return tok;
    
TokenparseFactor()
factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' | atom (('*' | '+' | '?' | minmax ) '?'? )?) | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' | '(?#' [^)]* ')' minmax ::= '{' min (',' max?)? '}' min ::= [0-9]+ max ::= [0-9]+

        
        int ch = this.read();
        Token tok;
        switch (ch) {
          case T_CARET:         return this.processCaret();
          case T_DOLLAR:        return this.processDollar();
          case T_LOOKAHEAD:     return this.processLookahead();
          case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
          case T_LOOKBEHIND:    return this.processLookbehind();
          case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();

          case T_COMMENT:
            this.next();
            return Token.createEmpty();

          case T_BACKSOLIDUS:
            switch (this.chardata) {
              case 'A": return this.processBacksolidus_A();
              case 'Z": return this.processBacksolidus_Z();
              case 'z": return this.processBacksolidus_z();
              case 'b": return this.processBacksolidus_b();
              case 'B": return this.processBacksolidus_B();
              case '<": return this.processBacksolidus_lt();
              case '>": return this.processBacksolidus_gt();
            }
                                                // through down
        }
        tok = this.parseAtom();
        ch = this.read();
        switch (ch) {
          case T_STAR:  return this.processStar(tok);
          case T_PLUS:  return this.processPlus(tok);
          case T_QUESTION: return this.processQuestion(tok);
          case T_CHAR:
            if (this.chardata == '{" && this.offset < this.regexlen) {

                int off = this.offset;          // this.offset -> next of '{'
                int min = 0, max = -1;

                if ((ch = this.regex.charAt(off++)) >= '0" && ch <= '9") {

                    min = ch -'0";
                    while (off < this.regexlen
                           && (ch = this.regex.charAt(off++)) >= '0" && ch <= '9") {
                        min = min*10 +ch-'0";
                        if (min < 0)
                            throw ex("parser.quantifier.5", this.offset);
                    }
                }
                else {
                    throw ex("parser.quantifier.1", this.offset);
                }

                max = min;
                if (ch == ',") {

                   if (off >= this.regexlen) {
                       throw ex("parser.quantifier.3", this.offset);
                   }
                   else if ((ch = this.regex.charAt(off++)) >= '0" && ch <= '9") {                       

                        max = ch -'0";       // {min,max}
                        while (off < this.regexlen
                               && (ch = this.regex.charAt(off++)) >= '0"
                               && ch <= '9") {
                            max = max*10 +ch-'0";
                            if (max < 0)
                                throw ex("parser.quantifier.5", this.offset);
                        }

                        if (min > max)
                            throw ex("parser.quantifier.4", this.offset);
                   }
                   else { // assume {min,}
                        max = -1;           
                    }
                }

               if (ch != '}")
                   throw ex("parser.quantifier.2", this.offset);

               if (this.checkQuestion(off)) {  // off -> next of '}'
                    tok = Token.createNGClosure(tok);
                    this.offset = off+1;
                } else {
                    tok = Token.createClosure(tok);
                    this.offset = off;
                }

                tok.setMin(min);
                tok.setMax(max);
                //System.err.println("CLOSURE: "+min+", "+max);
                this.next();
            }
        }
        return tok;
    
TokenparseRegex()
regex ::= term (`|` term)* term ::= factor+ factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' | atom (('*' | '+' | '?' | minmax ) '?'? )?) | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9] | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block

        Token tok = this.parseTerm();
        Token parent = null;
        while (this.read() == T_OR) {
            this.next();                    // '|'
            if (parent == null) {
                parent = Token.createUnion();
                parent.addChild(tok);
                tok = parent;
            }
            tok.addChild(this.parseTerm());
        }
        return tok;
    
protected RangeTokenparseSetOperations()
'(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'

        RangeToken tok = this.parseCharacterClass(false);
        int type;
        while ((type = this.read()) != T_RPAREN) {
            int ch = this.chardata;
            if (type == T_CHAR && (ch == '-" || ch == '&")
                || type == T_PLUS) {
                this.next();
                if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
                RangeToken t2 = this.parseCharacterClass(false);
                if (type == T_PLUS)
                    tok.mergeRanges(t2);
                else if (ch == '-")
                    tok.subtractRanges(t2);
                else if (ch == '&")
                    tok.intersectRanges(t2);
                else
                    throw new RuntimeException("ASSERT");
            } else {
                throw ex("parser.ope.2", this.offset-1);
            }
        }
        this.next();
        return tok;
    
TokenparseTerm()
term ::= factor+

        int ch = this.read();
        if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
            return Token.createEmpty();
        } else {
            Token tok = this.parseFactor();
            Token concat = null;
            while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
                if (concat == null) {
                    concat = Token.createConcat();
                    concat.addChild(tok);
                    tok = concat;
                }
                concat.addChild(this.parseFactor());
                //tok = Token.createConcat(tok, this.parseFactor());
            }
            return tok;
        }
    
TokenprocessBackreference()

        int refnum = this.chardata-'0";
        Token tok = Token.createBackReference(refnum);
        this.hasBackReferences = true;
        if (this.references == null)  this.references = new Vector();
        this.references.addElement(new ReferencePosition(refnum, this.offset-2));
        this.next();
        return tok;
    
TokenprocessBacksolidus_A()

        this.next();
        return Token.token_stringbeginning;
    
TokenprocessBacksolidus_B()

        this.next();
        return Token.token_not_wordedge;
    
TokenprocessBacksolidus_C()

        throw ex("parser.process.1", this.offset);
    
TokenprocessBacksolidus_I()

        throw ex("parser.process.1", this.offset);
    
TokenprocessBacksolidus_X()

        this.next();
        return Token.getCombiningCharacterSequence();
    
TokenprocessBacksolidus_Z()

        this.next();
        return Token.token_stringend2;
    
TokenprocessBacksolidus_b()

        this.next();
        return Token.token_wordedge;
    
TokenprocessBacksolidus_c()

        int ch2;                                // Must be in 0x0040-0x005f
        if (this.offset >= this.regexlen
            || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
            throw ex("parser.atom.1", this.offset-1);
        this.next();
        return Token.createChar(ch2-0x40);
    
TokenprocessBacksolidus_g()

        this.next();
        return Token.getGraphemePattern();
    
TokenprocessBacksolidus_gt()

        this.next();
        return Token.token_wordend;
    
TokenprocessBacksolidus_i()

        Token tok = Token.createChar('i");
        this.next();
        return tok;
    
TokenprocessBacksolidus_lt()

        this.next();
        return Token.token_wordbeginning;
    
protected RangeTokenprocessBacksolidus_pP(int c)

        // Parses a "{name}" block and returns the range that Token.getRange()
        // associates with that name.
        //
        // c: the escape letter; 'p' requests the positive range, any other
        //    value the complemented one.
        // Raises "parser.atom.2" when the next token is not a literal '{',
        // and "parser.atom.3" when no closing '}' follows.
        this.next();
        if (this.read() != T_CHAR || this.chardata != '{') {
            throw this.ex("parser.atom.2", this.offset - 1);
        }

        // The name is taken straight from the regex text, up to the next '}'.
        final boolean positive = c == 'p';
        final int namestart = this.offset;
        final int nameend = this.regex.indexOf('}', namestart);
        if (nameend < 0) {
            throw this.ex("parser.atom.3", this.offset);
        }

        final String pname = this.regex.substring(namestart, nameend);
        // Leave offset just past '}'; next() is not called from here.
        this.offset = nameend + 1;

        return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
    
TokenprocessBacksolidus_z()

        this.next();
        return Token.token_stringend;
    
intprocessCIinCharacterClass(RangeToken tok, int c)

        return this.decodeEscaped();
    
TokenprocessCaret()

        this.next();
        return Token.token_linebeginning;
    
TokenprocessCondition()

        // Parses a conditional group and returns the token built by
        // Token.createCondition(). The condition is either a single digit
        // 1-9 followed by ')' (a back-reference number) or a factor that must
        // be a look-ahead/look-behind or an anchor. It is followed by a
        // yes-pattern, an optional no-pattern (the second branch of a union),
        // and a closing ')'.
        //
        // Raises (via ex): "parser.factor.4" when fewer than two characters
        // remain, "parser.factor.1" when an expected ')' is missing,
        // "parser.factor.5" when the condition factor has any other type, and
        // "parser.factor.6" when the union does not have exactly two branches.

        // On entry this.offset points just past the '('.
        if (this.offset + 1 >= this.regexlen) {
            throw ex("parser.factor.4", this.offset);
        }

        // Parse the condition.
        int refno = -1;
        Token condition = null;
        final int ch = this.regex.charAt(this.offset);
        if ('1' <= ch && ch <= '9') {
            refno = ch - '0';
            this.hasBackReferences = true;
            // The reference list is created lazily.
            if (this.references == null) {
                this.references = new Vector();
            }
            this.references.addElement(new ReferencePosition(refno, this.offset));
            this.offset++;
            // In bounds: the length check above guarantees a character after the digit.
            if (this.regex.charAt(this.offset) != ')') {
                throw ex("parser.factor.1", this.offset);
            }
            this.offset++;
        } else {
            if (ch == '?') {
                this.offset--;                  // Step back so offset points at '('.
            }
            this.next();
            condition = this.parseFactor();
            switch (condition.type) {
              case Token.LOOKAHEAD:
              case Token.NEGATIVELOOKAHEAD:
              case Token.LOOKBEHIND:
              case Token.NEGATIVELOOKBEHIND:
                break;
              case Token.ANCHOR:
                if (this.read() != T_RPAREN) {
                    throw ex("parser.factor.1", this.offset - 1);
                }
                break;
              default:
                throw ex("parser.factor.5", this.offset);
            }
        }

        // Parse the yes/no patterns.
        // NOTE(review): the look-around processors appear to advance past
        // their own ')' already -- confirm that this next() does not drop the
        // first token of the yes-pattern in that case.
        this.next();
        Token yesPattern = this.parseRegex();
        Token noPattern = null;
        if (yesPattern.type == Token.UNION) {
            if (yesPattern.size() != 2) {
                throw ex("parser.factor.6", this.offset);
            }
            // Read child 1 before yesPattern is rebound to child 0.
            noPattern = yesPattern.getChild(1);
            yesPattern = yesPattern.getChild(0);
        }
        if (this.read() != T_RPAREN) {
            throw ex("parser.factor.1", this.offset - 1);
        }
        this.next();
        return Token.createCondition(refno, condition, yesPattern, noPattern);
    
TokenprocessDollar()

        this.next();
        return Token.token_lineend;
    
TokenprocessIndependent()

        // Advances past the group opener, parses the enclosed expression and
        // wraps it with Token.createLook(Token.INDEPENDENT, ...). The current
        // token must then be T_RPAREN, otherwise "parser.factor.1" is raised
        // at offset-1; the ')' is skipped before returning.
        next();
        final Token body = parseRegex();
        final Token group = Token.createLook(Token.INDEPENDENT, body);
        if (read() != T_RPAREN) {
            throw ex("parser.factor.1", offset - 1);
        }
        next();
        return group;
    
TokenprocessLookahead()

        // Advances past the group opener, parses the enclosed expression and
        // wraps it with Token.createLook(Token.LOOKAHEAD, ...). The current
        // token must then be T_RPAREN, otherwise "parser.factor.1" is raised
        // at offset-1; the ')' is skipped before returning.
        next();
        final Token body = parseRegex();
        final Token look = Token.createLook(Token.LOOKAHEAD, body);
        if (read() != T_RPAREN) {
            throw ex("parser.factor.1", offset - 1);
        }
        next();
        return look;
    
TokenprocessLookbehind()

        // Advances past the group opener, parses the enclosed expression and
        // wraps it with Token.createLook(Token.LOOKBEHIND, ...). The current
        // token must then be T_RPAREN, otherwise "parser.factor.1" is raised
        // at offset-1; the ')' is skipped before returning.
        next();
        final Token body = parseRegex();
        final Token look = Token.createLook(Token.LOOKBEHIND, body);
        if (read() != T_RPAREN) {
            throw ex("parser.factor.1", offset - 1);
        }
        next();
        return look;
    
TokenprocessModifiers()

        // Parses a modifier group and returns the token built by
        // Token.createModifierGroup(). On entry this.offset points just past
        // the '?'.
        //
        //     modifiers ::= [imsw]* ('-' [imsw]*)? ':'
        //
        // Option letters before '-' are OR-ed into "add", those after it into
        // "mask". A ':' terminator parses a sub-expression closed by ')';
        // a ')' terminator (such as "(?-i)") parses the expression that
        // follows without requiring a closing ')'.
        //
        // Raises (via ex): "parser.factor.2" when the regex ends inside the
        // modifier list, "parser.factor.3" when the list ends with anything
        // other than ':' or ')', and "parser.factor.1" when the ':' form is
        // not closed by ')'.
        int add = 0, mask = 0, ch = -1;
        while (this.offset < this.regexlen) {
            ch = this.regex.charAt(this.offset);
            final int v = REUtil.getOptionValue(ch);
            if (v == 0) {
                break;                          // Not an option letter: '-' or ':'?
            }
            add |= v;
            this.offset++;
        }
        if (this.offset >= this.regexlen) {
            throw ex("parser.factor.2", this.offset - 1);
        }
        if (ch == '-') {
            this.offset++;
            while (this.offset < this.regexlen) {
                ch = this.regex.charAt(this.offset);
                final int v = REUtil.getOptionValue(ch);
                if (v == 0) {
                    break;                      // Not an option letter: ':'?
                }
                mask |= v;
                this.offset++;
            }
            if (this.offset >= this.regexlen) {
                throw ex("parser.factor.2", this.offset - 1);
            }
        }
        Token tok;
        if (ch == ':') {
            this.offset++;
            this.next();
            tok = Token.createModifierGroup(this.parseRegex(), add, mask);
            if (this.read() != T_RPAREN) {
                throw ex("parser.factor.1", this.offset - 1);
            }
            this.next();
        } else if (ch == ')') {                 // such as (?-i)
            this.offset++;
            this.next();
            tok = Token.createModifierGroup(this.parseRegex(), add, mask);
        } else {
            throw ex("parser.factor.3", this.offset);
        }

        return tok;
    
TokenprocessNegativelookahead()

        // Advances past the group opener, parses the enclosed expression and
        // wraps it with Token.createLook(Token.NEGATIVELOOKAHEAD, ...). The
        // current token must then be T_RPAREN, otherwise "parser.factor.1" is
        // raised at offset-1; the ')' is skipped before returning.
        next();
        final Token body = parseRegex();
        final Token look = Token.createLook(Token.NEGATIVELOOKAHEAD, body);
        if (read() != T_RPAREN) {
            throw ex("parser.factor.1", offset - 1);
        }
        next();
        return look;
    
TokenprocessNegativelookbehind()

        // Advances past the group opener, parses the enclosed expression and
        // wraps it with Token.createLook(Token.NEGATIVELOOKBEHIND, ...). The
        // current token must then be T_RPAREN, otherwise "parser.factor.1" is
        // raised at offset-1; the ')' is skipped before returning.
        next();
        final Token body = parseRegex();
        final Token look = Token.createLook(Token.NEGATIVELOOKBEHIND, body);
        if (read() != T_RPAREN) {
            throw ex("parser.factor.1", offset - 1);
        }
        next();
        return look;
    
TokenprocessParen()

        // Parses a numbered group. The group number is taken from parennumber
        // (post-incremented) BEFORE the body is parsed, so any groups opened
        // inside the body receive later numbers. The current token must be
        // T_RPAREN after the body, otherwise "parser.factor.1" is raised at
        // offset-1; the ')' is skipped before returning.
        next();
        final int groupNumber = parennumber++;
        final Token body = parseRegex();
        final Token group = Token.createParen(body, groupNumber);
        if (read() != T_RPAREN) {
            throw ex("parser.factor.1", offset - 1);
        }
        next();
        return group;
    
TokenprocessParen2()

        // Parses a group that is always created with number 0 and does not
        // touch parennumber. The current token must be T_RPAREN after the
        // body, otherwise "parser.factor.1" is raised at offset-1; the ')' is
        // skipped before returning.
        next();
        final Token body = parseRegex();
        final Token group = Token.createParen(body, 0);
        if (read() != T_RPAREN) {
            throw ex("parser.factor.1", offset - 1);
        }
        next();
        return group;
    
TokenprocessPlus(Token tok)

        // X+ -> XX*
        this.next();
        if (this.read() == T_QUESTION) {
            this.next();
            return Token.createConcat(tok, Token.createNGClosure(tok));
        } else
            return Token.createConcat(tok, Token.createClosure(tok));
    
TokenprocessQuestion(Token tok)

        // Rewrites X? as a two-branch union of X and the empty token.
        // Without a trailing T_QUESTION the branches are (X, empty); with one,
        // the extra token is consumed and the order becomes (empty, X).
        next();
        final Token union = Token.createUnion();
        final boolean emptyFirst = read() == T_QUESTION;
        if (emptyFirst) {
            next();
        }
        final Token empty = Token.createEmpty();
        union.addChild(emptyFirst ? empty : tok);
        union.addChild(emptyFirst ? tok : empty);
        return union;
    
TokenprocessStar(Token tok)

        // Wraps tok in a closure. When the quantifier is immediately followed
        // by T_QUESTION, that token is consumed and Token.createNGClosure() is
        // used instead of Token.createClosure().
        next();
        if (read() != T_QUESTION) {
            return Token.createClosure(tok);
        }
        next();
        return Token.createNGClosure(tok);
    
final intread()

        return this.nexttoken;
    
protected final voidsetContext(int con)

        // Stores con in the context field; nothing else is touched.
        context = con;
    
public voidsetLocale(java.util.Locale locale)

        try {
            this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale);
        } catch (MissingResourceException mre) {
            throw new RuntimeException("Installation Problem???  Couldn't load messages: "
                                       +mre.getMessage());
        }