File Doc Category Size Date Package
RegexParser.java API Doc Apache Xerces 3.0.1 43617 Fri Sep 14 20:33:54 BST 2007 org.apache.xerces.impl.xpath.regex

RegexParser

java.lang.Object

public class RegexParser extends Object

A Regular Expression Parser.

xerces.internal
version: $Id: RegexParser.java 469061 2006-10-30 04:16:15Z mrglavas $

Fields Summary
static final int
T_CHAR
static final int
T_EOF
static final int
T_OR
static final int
T_STAR
static final int
T_PLUS
static final int
T_QUESTION
static final int
T_LPAREN
static final int
T_RPAREN
static final int
T_DOT
static final int
T_LBRACKET
static final int
T_BACKSOLIDUS
static final int
T_CARET
static final int
T_DOLLAR
static final int
T_LPAREN2
static final int
T_LOOKAHEAD
static final int
T_NEGATIVELOOKAHEAD
static final int
T_LOOKBEHIND
static final int
T_NEGATIVELOOKBEHIND
static final int
T_INDEPENDENT
static final int
T_SET_OPERATIONS
static final int
T_POSIX_CHARCLASS_START
static final int
T_COMMENT
static final int
T_MODIFIERS
static final int
T_CONDITION
static final int
T_XMLSCHEMA_CC_SUBTRACTION
int
offset
String
regex
int
regexlen
int
options
ResourceBundle
resources
int
chardata
int
nexttoken
protected static final int
S_NORMAL
protected static final int
S_INBRACKETS
protected static final int
S_INXBRACKETS
int
context
int
parennumber
boolean
hasBackReferences
Vector
references
Constructors Summary
/**
 * Creates a parser whose messages are loaded for the JVM's default locale.
 */
public RegexParser() {
    this(Locale.getDefault());
}
/**
 * Creates a parser whose messages are loaded for the given locale.
 *
 * @param locale locale handed to <code>setLocale</code>
 */
public RegexParser(Locale locale) {
    setLocale(locale);
}
Methods Summary
boolean checkQuestion(int off)
return off < this.regexlen && this.regex.charAt(off) == '?";
int decodeEscaped()
if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); int c = this.chardata; switch (c) { case 'e": c = 0x1b; break; // ESCAPE U+001B case 'f": c = '\f"; break; // FORM FEED U+000C case 'n": c = '\n"; break; // LINE FEED U+000A case 'r": c = '\r"; break; // CRRIAGE RETURN U+000D case 't": c = '\t"; break; // HORIZONTAL TABULATION U+0009 //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B case 'x": this.next(); if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); if (this.chardata == '{") { int v1 = 0; int uv = 0; do { this.next(); if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); if ((v1 = hexChar(this.chardata)) < 0) break; if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); uv = uv*16+v1; } while (true); if (this.chardata != '}") throw ex("parser.descape.3", this.offset-1); if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); c = uv; } else { int v1 = 0; if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); int uv = v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1; c = uv; } break; case 'u": int v1 = 0; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); int uv = v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1; c = uv; break; case 'v": this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = v1; this.next(); if (this.read() != 
T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1; if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1); c = uv; break; case 'A": case 'Z": case 'z": throw ex("parser.descape.5", this.offset-2); default: } return c;
final ParseException ex(java.lang.String key, int loc)
return new ParseException(this.resources.getString(key), loc);
Token getTokenForShorthand(int ch)
Token tok; switch (ch) { case 'd": tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("Nd", true) : Token.token_0to9; break; case 'D": tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("Nd", false) : Token.token_not_0to9; break; case 'w": tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsWord", true) : Token.token_wordchars; break; case 'W": tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsWord", false) : Token.token_not_wordchars; break; case 's": tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsSpace", true) : Token.token_spaces; break; case 'S": tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsSpace", false) : Token.token_not_spaces; break; default: throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16)); } return tok;
private static final int hexChar(int ch)
if (ch < '0") return -1; if (ch > 'f") return -1; if (ch <= '9") return ch-'0"; if (ch < 'A") return -1; if (ch <= 'F") return ch-'A"+10; if (ch < 'a") return -1; return ch-'a"+10;
private final boolean isSet(int flag)
return (this.options & flag) == flag;
final void next()
if (this.offset >= this.regexlen) { this.chardata = -1; this.nexttoken = T_EOF; return; } int ret; int ch = this.regex.charAt(this.offset++); this.chardata = ch; if (this.context == S_INBRACKETS) { // In a character class, this.chardata has one character, that is to say, // a pair of surrogates is composed and stored to this.chardata. switch (ch) { case '\\": ret = T_BACKSOLIDUS; if (this.offset >= this.regexlen) throw ex("parser.next.1", this.offset-1); this.chardata = this.regex.charAt(this.offset++); break; case '-": if (this.isSet(RegularExpression.XMLSCHEMA_MODE) && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[") { this.offset++; ret = T_XMLSCHEMA_CC_SUBTRACTION; } else ret = T_CHAR; break; case '[": if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':") { this.offset++; ret = T_POSIX_CHARCLASS_START; break; } // Through down default: if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { int low = this.regex.charAt(this.offset); if (REUtil.isLowSurrogate(low)) { this.chardata = REUtil.composeFromSurrogates(ch, low); this.offset ++; } } ret = T_CHAR; } this.nexttoken = ret; return; } switch (ch) { case '|": ret = T_OR; break; case '*": ret = T_STAR; break; case '+": ret = T_PLUS; break; case '?": ret = T_QUESTION; break; case ')": ret = T_RPAREN; break; case '.": ret = T_DOT; break; case '[": ret = T_LBRACKET; break; case '^": if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { ret = T_CHAR; } else { ret = T_CARET; } break; case '$": if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { ret = T_CHAR; } else { ret = T_DOLLAR; } break; case '(": ret = T_LPAREN; if (this.offset >= this.regexlen) break; if (this.regex.charAt(this.offset) != '?") break; if (++this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-1); ch = this.regex.charAt(this.offset++); switch (ch) { case ':": ret = T_LPAREN2; break; case '=": ret = T_LOOKAHEAD; break; case '!": ret = 
T_NEGATIVELOOKAHEAD; break; case '[": ret = T_SET_OPERATIONS; break; case '>": ret = T_INDEPENDENT; break; case '<": if (this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-3); ch = this.regex.charAt(this.offset++); if (ch == '=") { ret = T_LOOKBEHIND; } else if (ch == '!") { ret = T_NEGATIVELOOKBEHIND; } else throw ex("parser.next.3", this.offset-3); break; case '#": while (this.offset < this.regexlen) { ch = this.regex.charAt(this.offset++); if (ch == ')") break; } if (ch != ')") throw ex("parser.next.4", this.offset-1); ret = T_COMMENT; break; default: if (ch == '-" || 'a" <= ch && ch <= 'z" || 'A" <= ch && ch <= 'Z") {// Options this.offset --; ret = T_MODIFIERS; break; } else if (ch == '(") { // conditional ret = T_CONDITION; // this.offsets points the next of '('. break; } throw ex("parser.next.2", this.offset-2); } break; case '\\": ret = T_BACKSOLIDUS; if (this.offset >= this.regexlen) throw ex("parser.next.1", this.offset-1); this.chardata = this.regex.charAt(this.offset++); break; default: ret = T_CHAR; } this.nexttoken = ret;
synchronized Token parse(java.lang.String regex, int options)
this.options = options; this.offset = 0; this.setContext(S_NORMAL); this.parennumber = 1; this.hasBackReferences = false; this.regex = regex; if (this.isSet(RegularExpression.EXTENDED_COMMENT)) this.regex = REUtil.stripExtendedComment(this.regex); this.regexlen = this.regex.length(); this.next(); Token ret = this.parseRegex(); if (this.offset != this.regexlen) throw ex("parser.parse.1", this.offset); if (this.references != null) { for (int i = 0; i < this.references.size(); i ++) { ReferencePosition position = (ReferencePosition)this.references.elementAt(i); if (this.parennumber <= position.refNumber) throw ex("parser.parse.2", position.position); } this.references.removeAllElements(); } return ret;
Token parseAtom()
atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '(?>' regex ')' char ::= '\\' | '\' [efnrt] | bmp-code | character-1
int ch = this.read(); Token tok = null; switch (ch) { case T_LPAREN: return this.processParen(); case T_LPAREN2: return this.processParen2(); // '(?:' case T_CONDITION: return this.processCondition(); // '(?(' case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... ) case T_INDEPENDENT: return this.processIndependent(); case T_DOT: this.next(); // Skips '.' tok = Token.token_dot; break; /** * char-class ::= '[' ( '^'? range ','?)+ ']' * range ::= '\d' | '\w' | '\s' | category-block | range-char * | range-char '-' range-char * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] */ case T_LBRACKET: return this.parseCharacterClass(true); case T_SET_OPERATIONS: return this.parseSetOperations(); case T_BACKSOLIDUS: switch (this.chardata) { case 'd": case 'D": case 'w": case 'W": case 's": case 'S": tok = this.getTokenForShorthand(this.chardata); this.next(); return tok; case 'e": case 'f": case 'n": case 'r": case 't": case 'u": case 'v": case 'x": { int ch2 = this.decodeEscaped(); if (ch2 < 0x10000) { tok = Token.createChar(ch2); } else { tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); } } break; case 'c": return this.processBacksolidus_c(); case 'C": return this.processBacksolidus_C(); case 'i": return this.processBacksolidus_i(); case 'I": return this.processBacksolidus_I(); case 'g": return this.processBacksolidus_g(); case 'X": return this.processBacksolidus_X(); case '1": case '2": case '3": case '4": case '5": case '6": case '7": case '8": case '9": return this.processBackreference(); case 'P": case 'p": int pstart = this.offset; tok = processBacksolidus_pP(this.chardata); if (tok == null) throw this.ex("parser.atom.5", pstart); break; default: tok = Token.createChar(this.chardata); } this.next(); break; case T_CHAR: if (this.chardata == ']" || this.chardata == '{" || this.chardata == '}") throw this.ex("parser.atom.4", this.offset-1); tok = 
Token.createChar(this.chardata); int high = this.chardata; this.next(); if (REUtil.isHighSurrogate(high) && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) { char[] sur = new char[2]; sur[0] = (char)high; sur[1] = (char)this.chardata; tok = Token.createParen(Token.createString(new String(sur)), 0); this.next(); } break; default: throw this.ex("parser.atom.4", this.offset-1); } return tok;
protected RangeToken parseCharacterClass(boolean useNrange)
char-class ::= '[' ( '^'? range ','?)+ ']' range ::= '\d' | '\w' | '\s' | category-block | range-char | range-char '-' range-char range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
this.setContext(S_INBRACKETS); this.next(); // '[' boolean nrange = false; RangeToken base = null; RangeToken tok; if (this.read() == T_CHAR && this.chardata == '^") { nrange = true; this.next(); // '^' if (useNrange) { tok = Token.createNRange(); } else { base = Token.createRange(); base.addRange(0, Token.UTF16_MAX); tok = Token.createRange(); } } else { tok = Token.createRange(); } int type; boolean firstloop = true; while ((type = this.read()) != T_EOF) { if (type == T_CHAR && this.chardata == ']" && !firstloop) break; firstloop = false; int c = this.chardata; boolean end = false; if (type == T_BACKSOLIDUS) { switch (c) { case 'd": case 'D": case 'w": case 'W": case 's": case 'S": tok.mergeRanges(this.getTokenForShorthand(c)); end = true; break; case 'i": case 'I": case 'c": case 'C": c = this.processCIinCharacterClass(tok, c); if (c < 0) end = true; break; case 'p": case 'P": int pstart = this.offset; RangeToken tok2 = this.processBacksolidus_pP(c); if (tok2 == null) throw this.ex("parser.atom.5", pstart); tok.mergeRanges(tok2); end = true; break; default: c = this.decodeEscaped(); } // \ + c } // backsolidus // POSIX Character class such as [:alnum:] else if (type == T_POSIX_CHARCLASS_START) { int nameend = this.regex.indexOf(':", this.offset); if (nameend < 0) throw this.ex("parser.cc.1", this.offset); boolean positive = true; if (this.regex.charAt(this.offset) == '^") { this.offset ++; positive = false; } String name = this.regex.substring(this.offset, nameend); RangeToken range = Token.getRange(name, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE)); if (range == null) throw this.ex("parser.cc.3", this.offset); tok.mergeRanges(range); end = true; if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']") throw this.ex("parser.cc.1", nameend); this.offset = nameend+2; } this.next(); if (!end) { // if not shorthands... if (this.read() != T_CHAR || this.chardata != '-") { // Here is no '-'. 
tok.addRange(c, c); } else { this.next(); // Skips '-' if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset); if (type == T_CHAR && this.chardata == ']") { tok.addRange(c, c); tok.addRange('-", '-"); } else { int rangeend = this.chardata; if (type == T_BACKSOLIDUS) rangeend = this.decodeEscaped(); this.next(); tok.addRange(c, rangeend); } } } if (this.isSet(RegularExpression.SPECIAL_COMMA) && this.read() == T_CHAR && this.chardata == ',") this.next(); } if (this.read() == T_EOF) throw this.ex("parser.cc.2", this.offset); if (!useNrange && nrange) { base.subtractRanges(tok); tok = base; } tok.sortRanges(); tok.compactRanges(); //tok.dumpRanges(); /* if (this.isSet(RegularExpression.IGNORE_CASE)) tok = RangeToken.createCaseInsensitiveToken(tok); */ this.setContext(S_NORMAL); this.next(); // Skips ']' return tok;
Token parseFactor()
factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' | atom (('*' | '+' | '?' | minmax ) '?'? )?) | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' | '(?#' [^)]* ')' minmax ::= '{' min (',' max?)? '}' min ::= [0-9]+ max ::= [0-9]+
int ch = this.read(); Token tok; switch (ch) { case T_CARET: return this.processCaret(); case T_DOLLAR: return this.processDollar(); case T_LOOKAHEAD: return this.processLookahead(); case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); case T_LOOKBEHIND: return this.processLookbehind(); case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); case T_COMMENT: this.next(); return Token.createEmpty(); case T_BACKSOLIDUS: switch (this.chardata) { case 'A": return this.processBacksolidus_A(); case 'Z": return this.processBacksolidus_Z(); case 'z": return this.processBacksolidus_z(); case 'b": return this.processBacksolidus_b(); case 'B": return this.processBacksolidus_B(); case '<": return this.processBacksolidus_lt(); case '>": return this.processBacksolidus_gt(); } // through down } tok = this.parseAtom(); ch = this.read(); switch (ch) { case T_STAR: return this.processStar(tok); case T_PLUS: return this.processPlus(tok); case T_QUESTION: return this.processQuestion(tok); case T_CHAR: if (this.chardata == '{" && this.offset < this.regexlen) { int off = this.offset; // this.offset -> next of '{' int min = 0, max = -1; if ((ch = this.regex.charAt(off++)) >= '0" && ch <= '9") { min = ch -'0"; while (off < this.regexlen && (ch = this.regex.charAt(off++)) >= '0" && ch <= '9") { min = min*10 +ch-'0"; if (min < 0) throw ex("parser.quantifier.5", this.offset); } } else { throw ex("parser.quantifier.1", this.offset); } max = min; if (ch == ',") { if (off >= this.regexlen) { throw ex("parser.quantifier.3", this.offset); } else if ((ch = this.regex.charAt(off++)) >= '0" && ch <= '9") { max = ch -'0"; // {min,max} while (off < this.regexlen && (ch = this.regex.charAt(off++)) >= '0" && ch <= '9") { max = max*10 +ch-'0"; if (max < 0) throw ex("parser.quantifier.5", this.offset); } if (min > max) throw ex("parser.quantifier.4", this.offset); } else { // assume {min,} max = -1; } } if (ch != '}") throw ex("parser.quantifier.2", this.offset); if 
(this.checkQuestion(off)) { // off -> next of '}' tok = Token.createNGClosure(tok); this.offset = off+1; } else { tok = Token.createClosure(tok); this.offset = off; } tok.setMin(min); tok.setMax(max); //System.err.println("CLOSURE: "+min+", "+max); this.next(); } } return tok;
Token parseRegex()
regex ::= term (`|` term)* term ::= factor+ factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' | atom (('*' | '+' | '?' | minmax ) '?'? )?) | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9] | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
Token tok = this.parseTerm(); Token parent = null; while (this.read() == T_OR) { this.next(); // '|' if (parent == null) { parent = Token.createUnion(); parent.addChild(tok); tok = parent; } tok.addChild(this.parseTerm()); } return tok;
protected RangeToken parseSetOperations()
'(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
RangeToken tok = this.parseCharacterClass(false); int type; while ((type = this.read()) != T_RPAREN) { int ch = this.chardata; if (type == T_CHAR && (ch == '-" || ch == '&") || type == T_PLUS) { this.next(); if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1); RangeToken t2 = this.parseCharacterClass(false); if (type == T_PLUS) tok.mergeRanges(t2); else if (ch == '-") tok.subtractRanges(t2); else if (ch == '&") tok.intersectRanges(t2); else throw new RuntimeException("ASSERT"); } else { throw ex("parser.ope.2", this.offset-1); } } this.next(); return tok;
Token parseTerm()
term ::= factor+
int ch = this.read(); if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { return Token.createEmpty(); } else { Token tok = this.parseFactor(); Token concat = null; while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { if (concat == null) { concat = Token.createConcat(); concat.addChild(tok); tok = concat; } concat.addChild(this.parseFactor()); //tok = Token.createConcat(tok, this.parseFactor()); } return tok; }
Token processBackreference()
int refnum = this.chardata-'0"; Token tok = Token.createBackReference(refnum); this.hasBackReferences = true; if (this.references == null) this.references = new Vector(); this.references.addElement(new ReferencePosition(refnum, this.offset-2)); this.next(); return tok;
Token processBacksolidus_A()
this.next(); return Token.token_stringbeginning;
Token processBacksolidus_B()
this.next(); return Token.token_not_wordedge;
Token processBacksolidus_C()
throw ex("parser.process.1", this.offset);
Token processBacksolidus_I()
throw ex("parser.process.1", this.offset);
Token processBacksolidus_X()
this.next(); return Token.getCombiningCharacterSequence();
Token processBacksolidus_Z()
this.next(); return Token.token_stringend2;
Token processBacksolidus_b()
this.next(); return Token.token_wordedge;
Token processBacksolidus_c()
int ch2; // Must be in 0x0040-0x005f if (this.offset >= this.regexlen || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) throw ex("parser.atom.1", this.offset-1); this.next(); return Token.createChar(ch2-0x40);
Token processBacksolidus_g()
this.next(); return Token.getGraphemePattern();
Token processBacksolidus_gt()
this.next(); return Token.token_wordend;
Token processBacksolidus_i()
Token tok = Token.createChar('i"); this.next(); return tok;
Token processBacksolidus_lt()
this.next(); return Token.token_wordbeginning;
protected RangeToken processBacksolidus_pP(int c)
this.next(); if (this.read() != T_CHAR || this.chardata != '{") throw this.ex("parser.atom.2", this.offset-1); // handle category escape boolean positive = c == 'p"; int namestart = this.offset; int nameend = this.regex.indexOf('}", namestart); if (nameend < 0) throw this.ex("parser.atom.3", this.offset); String pname = this.regex.substring(namestart, nameend); this.offset = nameend+1; return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
Token processBacksolidus_z()
this.next(); return Token.token_stringend;
int processCIinCharacterClass(RangeToken tok, int c)
return this.decodeEscaped();
Token processCaret()
this.next(); return Token.token_linebeginning;
Token processCondition()
// this.offset points the next of '(' if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); // Parses a condition. int refno = -1; Token condition = null; int ch = this.regex.charAt(this.offset); if ('1" <= ch && ch <= '9") { refno = ch-'0"; this.hasBackReferences = true; if (this.references == null) this.references = new Vector(); this.references.addElement(new ReferencePosition(refno, this.offset)); this.offset ++; if (this.regex.charAt(this.offset) != ')") throw ex("parser.factor.1", this.offset); this.offset ++; } else { if (ch == '?") this.offset --; // Points '('. this.next(); condition = this.parseFactor(); switch (condition.type) { case Token.LOOKAHEAD: case Token.NEGATIVELOOKAHEAD: case Token.LOOKBEHIND: case Token.NEGATIVELOOKBEHIND: break; case Token.ANCHOR: if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); break; default: throw ex("parser.factor.5", this.offset); } } // Parses yes/no-patterns. this.next(); Token yesPattern = this.parseRegex(); Token noPattern = null; if (yesPattern.type == Token.UNION) { if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); noPattern = yesPattern.getChild(1); yesPattern = yesPattern.getChild(0); } if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); return Token.createCondition(refno, condition, yesPattern, noPattern);
Token processDollar()
this.next(); return Token.token_lineend;
Token processIndependent()
this.next(); Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // Skips ')' return tok;
Token processLookahead()
this.next(); Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok;
Token processLookbehind()
this.next(); Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok;
Token processModifiers()
// this.offset points the next of '?'. // modifiers ::= [imsw]* ('-' [imsw]*)? ':' int add = 0, mask = 0, ch = -1; while (this.offset < this.regexlen) { ch = this.regex.charAt(this.offset); int v = REUtil.getOptionValue(ch); if (v == 0) break; // '-' or ':'? add |= v; this.offset ++; } if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); if (ch == '-") { this.offset ++; while (this.offset < this.regexlen) { ch = this.regex.charAt(this.offset); int v = REUtil.getOptionValue(ch); if (v == 0) break; // ':'? mask |= v; this.offset ++; } if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); } Token tok; if (ch == ':") { this.offset ++; this.next(); tok = Token.createModifierGroup(this.parseRegex(), add, mask); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); } else if (ch == ')") { // such as (?-i) this.offset ++; this.next(); tok = Token.createModifierGroup(this.parseRegex(), add, mask); } else throw ex("parser.factor.3", this.offset); return tok;
Token processNegativelookahead()
this.next(); Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok;
Token processNegativelookbehind()
this.next(); Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok;
Token processParen()
this.next(); int p = this.parennumber++; Token tok = Token.createParen(this.parseRegex(), p); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // Skips ')' return tok;
Token processParen2()
this.next(); Token tok = Token.createParen(this.parseRegex(), 0); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // Skips ')' return tok;
Token processPlus(Token tok)
// X+ -> XX* this.next(); if (this.read() == T_QUESTION) { this.next(); return Token.createConcat(tok, Token.createNGClosure(tok)); } else return Token.createConcat(tok, Token.createClosure(tok));
Token processQuestion(Token tok)
// X? -> X| this.next(); Token par = Token.createUnion(); if (this.read() == T_QUESTION) { this.next(); par.addChild(Token.createEmpty()); par.addChild(tok); } else { par.addChild(tok); par.addChild(Token.createEmpty()); } return par;
Token processStar(Token tok)
this.next(); if (this.read() == T_QUESTION) { this.next(); return Token.createNGClosure(tok); } else return Token.createClosure(tok);
final int read()
return this.nexttoken;
protected final void setContext(int con)
this.context = con;
public void setLocale(java.util.Locale locale)
try { this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale); } catch (MissingResourceException mre) { throw new RuntimeException("Installation Problem??? Couldn't load messages: " +mre.getMessage()); }