Methods Summary |
---|
private void | accept(int ch, java.lang.String s)Match next character, signal error if failed.
int testChar = temp[cursor++];
if (has(COMMENTS))
testChar = parsePastWhitespace(testChar);
if (ch != testChar) {
error(s);
}
|
private void | addFlag()Parses inlined match flags and set them appropriately.
int ch = peek();
for (;;) {
switch (ch) {
case 'i":
flags |= CASE_INSENSITIVE;
break;
case 'm":
flags |= MULTILINE;
break;
case 's":
flags |= DOTALL;
break;
case 'd":
flags |= UNIX_LINES;
break;
case 'u":
flags |= UNICODE_CASE;
break;
case 'c":
flags |= CANON_EQ;
break;
case 'x":
flags |= COMMENTS;
break;
case '-": // subFlag then fall through
ch = next();
subFlag();
default:
return;
}
ch = next();
}
|
private void | append(int ch, int len)
if (len >= buffer.length) {
int[] tmp = new int[len+len];
System.arraycopy(buffer, 0, tmp, 0, len);
buffer = tmp;
}
buffer[len] = ch;
|
private java.util.regex.Pattern$Node | atom()Parse and add a new Single or Slice.
int first = 0;
int prev = -1;
boolean hasSupplementary = false;
int ch = peek();
for (;;) {
switch (ch) {
case '*":
case '+":
case '?":
case '{":
if (first > 1) {
cursor = prev; // Unwind one character
first--;
}
break;
case '$":
case '.":
case '^":
case '(":
case '[":
case '|":
case ')":
break;
case '\\":
ch = nextEscaped();
if (ch == 'p" || ch == 'P") { // Property
if (first > 0) { // Slice is waiting; handle it first
unread();
break;
} else { // No slice; just return the family node
if (ch == 'p" || ch == 'P") {
boolean comp = (ch == 'P");
boolean oneLetter = true;
ch = next(); // Consume { if present
if (ch != '{")
unread();
else
oneLetter = false;
return family(comp, oneLetter);
}
}
break;
}
unread();
prev = cursor;
ch = escape(false, first == 0);
if (ch >= 0) {
append(ch, first);
first++;
if (isSupplementary(ch)) {
hasSupplementary = true;
}
ch = peek();
continue;
} else if (first == 0) {
return root;
}
// Unwind meta escape sequence
cursor = prev;
break;
case 0:
if (cursor >= patternLength) {
break;
}
// Fall through
default:
prev = cursor;
append(ch, first);
first++;
if (isSupplementary(ch)) {
hasSupplementary = true;
}
ch = next();
continue;
}
break;
}
if (first == 1) {
return newSingle(buffer[0]);
} else {
return newSlice(buffer, first, hasSupplementary);
}
|
private int | c()Utility method for parsing control escape sequences.
if (cursor < patternLength) {
return read() ^ 64;
}
error("Illegal control escape sequence");
return -1;
|
private java.util.regex.Pattern$Node | clazz(boolean consume)Parse a character class, and return the node that matches it.
Consumes a ] on the way out if consume is true. Usually consume
is true except for the case of [abc&&def] where def is a separate
right hand node with "understood" brackets.
Node prev = null;
Node node = null;
BitClass bits = new BitClass(false);
boolean include = true;
boolean firstInClass = true;
int ch = next();
for (;;) {
switch (ch) {
case '^":
// Negates if first char in a class, otherwise literal
if (firstInClass) {
if (temp[cursor-1] != '[")
break;
ch = next();
include = !include;
continue;
} else {
// ^ not first in class, treat as literal
break;
}
case '[":
firstInClass = false;
node = clazz(true);
if (prev == null)
prev = node;
else
prev = new Add(prev, node);
ch = peek();
continue;
case '&":
firstInClass = false;
ch = next();
if (ch == '&") {
ch = next();
Node rightNode = null;
while (ch != ']" && ch != '&") {
if (ch == '[") {
if (rightNode == null)
rightNode = clazz(true);
else
rightNode = new Add(rightNode, clazz(true));
} else { // abc&&def
unread();
rightNode = clazz(false);
}
ch = peek();
}
if (rightNode != null)
node = rightNode;
if (prev == null) {
if (rightNode == null)
return error("Bad class syntax");
else
prev = rightNode;
} else {
prev = new Both(prev, node);
}
} else {
// treat as a literal &
unread();
break;
}
continue;
case 0:
firstInClass = false;
if (cursor >= patternLength)
return error("Unclosed character class");
break;
case ']":
firstInClass = false;
if (prev != null) {
if (consume)
next();
return prev;
}
break;
default:
firstInClass = false;
break;
}
node = range(bits);
if (include) {
if (prev == null) {
prev = node;
} else {
if (prev != node)
prev = new Add(prev, node);
}
} else {
if (prev == null) {
prev = node.dup(true); // Complement
} else {
if (prev != node)
prev = new Sub(prev, node);
}
}
ch = peek();
}
|
private java.util.regex.Pattern$Node | closure(java.util.regex.Pattern$Node prev)Processes repetition. If the next character peeked is a quantifier
then new nodes must be appended to handle the repetition.
Prev could be a single or a group, so it could be a chain of nodes.
Node atom;
int ch = peek();
switch (ch) {
case '?":
ch = next();
if (ch == '?") {
next();
return new Ques(prev, LAZY);
} else if (ch == '+") {
next();
return new Ques(prev, POSSESSIVE);
}
return new Ques(prev, GREEDY);
case '*":
ch = next();
if (ch == '?") {
next();
return new Curly(prev, 0, MAX_REPS, LAZY);
} else if (ch == '+") {
next();
return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
}
return new Curly(prev, 0, MAX_REPS, GREEDY);
case '+":
ch = next();
if (ch == '?") {
next();
return new Curly(prev, 1, MAX_REPS, LAZY);
} else if (ch == '+") {
next();
return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
}
return new Curly(prev, 1, MAX_REPS, GREEDY);
case '{":
ch = temp[cursor+1];
if (ASCII.isDigit(ch)) {
skip();
int cmin = 0;
do {
cmin = cmin * 10 + (ch - '0");
} while (ASCII.isDigit(ch = read()));
int cmax = cmin;
if (ch == ',") {
ch = read();
cmax = MAX_REPS;
if (ch != '}") {
cmax = 0;
while (ASCII.isDigit(ch)) {
cmax = cmax * 10 + (ch - '0");
ch = read();
}
}
}
if (ch != '}")
return error("Unclosed counted closure");
if (((cmin) | (cmax) | (cmax - cmin)) < 0)
return error("Illegal repetition range");
Curly curly;
ch = peek();
if (ch == '?") {
next();
curly = new Curly(prev, cmin, cmax, LAZY);
} else if (ch == '+") {
next();
curly = new Curly(prev, cmin, cmax, POSSESSIVE);
} else {
curly = new Curly(prev, cmin, cmax, GREEDY);
}
return curly;
} else {
error("Illegal repetition");
}
return prev;
default:
return prev;
}
|
public static java.util.regex.Pattern | compile(java.lang.String regex)Compiles the given regular expression into a pattern.
return new Pattern(regex, 0);
|
private void | compile()Copies regular expression to a char array and invokes the parsing
of the expression which will create the object tree.
// Handle canonical equivalences
if (has(CANON_EQ) && !has(LITERAL)) {
normalize();
} else {
normalizedPattern = pattern;
}
// Copy pattern to char array for convenience
patternLength = normalizedPattern.length();
temp = new int[patternLength + 2];
boolean hasSupplementary = false;
// Use double null characters to terminate pattern
int c, count = 0;
// Convert all chars into code points
for (int x = 0; x < patternLength; x += Character.charCount(c)) {
c = normalizedPattern.codePointAt(x);
if (isSupplementary(c)) {
hasSupplementary = true;
}
temp[count++] = c;
}
patternLength = count; // patternLength now in code points
temp[patternLength] = 0;
temp[patternLength + 1] = 0;
// Allocate all temporary objects here.
buffer = new int[32];
groupNodes = new GroupHead[10];
if (has(LITERAL)) {
// Literal pattern handling
matchRoot = newSlice(temp, patternLength, hasSupplementary);
matchRoot.next = lastAccept;
} else {
// Start recursive decedent parsing
matchRoot = expr(lastAccept);
// Check extra pattern characters
if (patternLength != cursor) {
if (peek() == ')") {
error("Unmatched closing ')'");
} else {
error("Unexpected internal error");
}
}
}
// Peephole optimization
if (matchRoot instanceof Slice) {
root = BnM.optimize(matchRoot);
if (root == matchRoot) {
root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
}
} else if (matchRoot instanceof Begin || matchRoot instanceof First) {
root = matchRoot;
} else {
root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
}
// Release temporary storage
temp = null;
buffer = null;
groupNodes = null;
patternLength = 0;
compiled = true;
|
public static java.util.regex.Pattern | compile(java.lang.String regex, int flags)Compiles the given regular expression into a pattern with the given
flags.
return new Pattern(regex, flags);
|
private java.lang.String | composeOneStep(java.lang.String input)Attempts to compose input by combining the first character
with the first combining mark following it. Returns a String
that is the composition of the leading character with its first
combining mark followed by the remaining combining marks. Returns
null if the first two characters cannot be further composed.
int len = countChars(input, 0, 2);
String firstTwoCharacters = input.substring(0, len);
String result = Normalizer.compose(firstTwoCharacters, false, 0);
if (result.equals(firstTwoCharacters))
return null;
else {
String remainder = input.substring(len);
return result + remainder;
}
|
private static final int | countChars(java.lang.CharSequence seq, int index, int lengthInCodePoints)
// optimization
if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
assert (index >= 0 && index < seq.length());
return 1;
}
int length = seq.length();
int x = index;
if (lengthInCodePoints >= 0) {
assert (index >= 0 && index < length);
for (int i = 0; x < length && i < lengthInCodePoints; i++) {
if (Character.isHighSurrogate(seq.charAt(x++))) {
if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
x++;
}
}
}
return x - index;
}
assert (index >= 0 && index <= length);
if (index == 0) {
return 0;
}
int len = -lengthInCodePoints;
for (int i = 0; x > 0 && i < len; i++) {
if (Character.isLowSurrogate(seq.charAt(--x))) {
if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
x--;
}
}
}
return index - x;
|
private static final int | countCodePoints(java.lang.CharSequence seq)
int length = seq.length();
int n = 0;
for (int i = 0; i < length; ) {
n++;
if (Character.isHighSurrogate(seq.charAt(i++))) {
if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
i++;
}
}
}
return n;
|
private java.util.regex.Pattern$Node | createGroup(boolean anonymous)Create group head and tail nodes using double return. If the group is
created with anonymous true then it is a pure group and should not
affect group counting.
int localIndex = localCount++;
int groupIndex = 0;
if (!anonymous)
groupIndex = capturingGroupCount++;
GroupHead head = new GroupHead(localIndex);
root = new GroupTail(localIndex, groupIndex);
if (!anonymous && groupIndex < 10)
groupNodes[groupIndex] = head;
return head;
|
private java.util.regex.Pattern$Node | error(java.lang.String s)Internal method used for handling all syntax errors. The pattern is
displayed with a pointer to aid in locating the syntax error.
throw new PatternSyntaxException(s, normalizedPattern,
cursor - 1);
|
private int | escape(boolean inclass, boolean create)Parses an escape sequence to determine the actual value that needs
to be matched.
If -1 is returned and create was true a new object was added to the tree
to handle the escape sequence.
If the returned value is greater than zero, it is the value that
matches the escape sequence.
int ch = skip();
switch (ch) {
case '0":
return o();
case '1":
case '2":
case '3":
case '4":
case '5":
case '6":
case '7":
case '8":
case '9":
if (inclass) break;
if (capturingGroupCount < (ch - '0"))
error("No such group yet exists at this point in the pattern");
if (create) {
root = ref((ch - '0"));
}
return -1;
case 'A":
if (inclass) break;
if (create) root = new Begin();
return -1;
case 'B":
if (inclass) break;
if (create) root = new Bound(Bound.NONE);
return -1;
case 'C":
break;
case 'D":
if (create) root = new NotCtype(ASCII.DIGIT);
return -1;
case 'E":
case 'F":
break;
case 'G":
if (inclass) break;
if (create) root = new LastMatch();
return -1;
case 'H":
case 'I":
case 'J":
case 'K":
case 'L":
case 'M":
case 'N":
case 'O":
case 'P":
break;
case 'Q":
if (create) {
// Disable metacharacters. We will return a slice
// up to the next \E
int i = cursor;
int c;
while ((c = readEscaped()) != 0) {
if (c == '\\") {
c = readEscaped();
if (c == 'E" || c == 0)
break;
else
unread();
}
}
int j = cursor-1;
if (c == 'E")
j--;
else
unread();
boolean hasSupplementary = false;
for (int x = i; x<j; x++) {
c = temp[x];
append(c, x-i);
if (isSupplementary(c)) {
hasSupplementary = true;
}
}
root = newSlice(buffer, j-i, hasSupplementary);
}
return -1;
case 'R":
break;
case 'S":
if (create) root = new NotCtype(ASCII.SPACE);
return -1;
case 'T":
case 'U":
case 'V":
break;
case 'W":
if (create) root = new NotCtype(ASCII.WORD);
return -1;
case 'X":
case 'Y":
break;
case 'Z":
if (inclass) break;
if (create) {
if (has(UNIX_LINES))
root = new UnixDollar(false);
else
root = new Dollar(false);
}
return -1;
case 'a":
return '\007";
case 'b":
if (inclass) break;
if (create) root = new Bound(Bound.BOTH);
return -1;
case 'c":
return c();
case 'd":
if (create) root = new Ctype(ASCII.DIGIT);
return -1;
case 'e":
return '\033";
case 'f":
return '\f";
case 'g":
case 'h":
case 'i":
case 'j":
case 'k":
case 'l":
case 'm":
break;
case 'n":
return '\n";
case 'o":
case 'p":
case 'q":
break;
case 'r":
return '\r";
case 's":
if (create) root = new Ctype(ASCII.SPACE);
return -1;
case 't":
return '\t";
case 'u":
return u();
case 'v":
return '\013";
case 'w":
if (create) root = new Ctype(ASCII.WORD);
return -1;
case 'x":
return x();
case 'y":
break;
case 'z":
if (inclass) break;
if (create) root = new End();
return -1;
default:
return ch;
}
error("Illegal/unsupported escape squence");
return -2;
|
private java.util.regex.Pattern$Node | expr(java.util.regex.Pattern$Node end)The expression is parsed with branch nodes added for alternations.
This may be called recursively to parse sub expressions that may
contain alternations.
Node prev = null;
for (;;) {
Node node = sequence(end);
if (prev == null) {
prev = node;
} else {
prev = new Branch(prev, node);
}
if (peek() != '|") {
return prev;
}
next();
}
|
private java.util.regex.Pattern$Node | family(boolean not, boolean singleLetter)Parses a Unicode character family and returns its representative node.
Reference to an unknown character family results in a list of supported
families in the error.
next();
String name;
if (singleLetter) {
int c = temp[cursor];
if (!Character.isSupplementaryCodePoint(c)) {
name = String.valueOf((char)c);
} else {
name = new String(temp, cursor, 1);
}
name = name.intern();
read();
} else {
int i = cursor;
mark('}");
while(read() != '}") {
}
mark('\000");
int j = cursor;
if (j > patternLength)
return error("Unclosed character family");
if (i + 1 >= j)
return error("Empty character family");
name = new String(temp, i, j-i-1).intern();
}
if (name.startsWith("In")) {
name = name.substring(2, name.length()).intern();
return retrieveFamilyNode(name, not);
}
if (name.startsWith("Is"))
name = name.substring(2, name.length()).intern();
return retrieveCategoryNode(name).dup(not);
|
private java.util.regex.Pattern$Node | familyError(java.lang.String name, java.lang.String type)
StringBuilder sb = new StringBuilder();
sb.append(type);
sb.append(name);
sb.append("}");
name = sb.toString();
return error(name);
|
private boolean | findSupplementary(int start, int end)Determines if there is any supplementary character or unpaired
surrogate in the specified range.
for (int i = start; i < end; i++) {
if (isSupplementary(temp[i]))
return true;
}
return false;
|
public int | flags()Returns this pattern's match flags.
return flags;
|
private int | getClass(int c)
return Normalizer.getClass(c);
|
private java.util.regex.Pattern$Node | group0()Parses a group and returns the head node of a set of nodes that process
the group. Sometimes a double return system is used where the tail is
returned in root.
boolean capturingGroup = false;
Node head = null;
Node tail = null;
int save = flags;
root = null;
int ch = next();
if (ch == '?") {
ch = skip();
switch (ch) {
case ':": // (?:xxx) pure group
head = createGroup(true);
tail = root;
head.next = expr(tail);
break;
case '=": // (?=xxx) and (?!xxx) lookahead
case '!":
head = createGroup(true);
tail = root;
head.next = expr(tail);
if (ch == '=") {
head = tail = new Pos(head);
} else {
head = tail = new Neg(head);
}
break;
case '>": // (?>xxx) independent group
head = createGroup(true);
tail = root;
head.next = expr(tail);
head = tail = new Ques(head, INDEPENDENT);
break;
case '<": // (?<xxx) look behind
ch = read();
int start = cursor;
head = createGroup(true);
tail = root;
head.next = expr(tail);
TreeInfo info = new TreeInfo();
head.study(info);
if (info.maxValid == false) {
return error("Look-behind group does not have "
+ "an obvious maximum length");
}
boolean hasSupplementary = findSupplementary(start, patternLength);
if (ch == '=") {
head = tail = (hasSupplementary ?
new BehindS(head, info.maxLength,
info.minLength) :
new Behind(head, info.maxLength,
info.minLength));
} else if (ch == '!") {
head = tail = (hasSupplementary ?
new NotBehindS(head, info.maxLength,
info.minLength) :
new NotBehind(head, info.maxLength,
info.minLength));
} else {
error("Unknown look-behind group");
}
break;
case '$":
case '@":
return error("Unknown group type");
default: // (?xxx:) inlined match flags
unread();
addFlag();
ch = read();
if (ch == ')") {
return null; // Inline modifier only
}
if (ch != ':") {
return error("Unknown inline modifier");
}
head = createGroup(true);
tail = root;
head.next = expr(tail);
break;
}
} else { // (xxx) a regular group
capturingGroup = true;
head = createGroup(false);
tail = root;
head.next = expr(tail);
}
accept(')", "Unclosed group");
flags = save;
// Check for quantifiers
Node node = closure(head);
if (node == head) { // No closure
root = tail;
return node; // Dual return
}
if (head == tail) { // Zero length assertion
root = node;
return node; // Dual return
}
if (node instanceof Ques) {
Ques ques = (Ques) node;
if (ques.type == POSSESSIVE) {
root = node;
return node;
}
// Dummy node to connect branch
tail.next = new Dummy();
tail = tail.next;
if (ques.type == GREEDY) {
head = new Branch(head, tail);
} else { // Reluctant quantifier
head = new Branch(tail, head);
}
root = tail;
return head;
} else if (node instanceof Curly) {
Curly curly = (Curly) node;
if (curly.type == POSSESSIVE) {
root = node;
return node;
}
// Discover if the group is deterministic
TreeInfo info = new TreeInfo();
if (head.study(info)) { // Deterministic
GroupTail temp = (GroupTail) tail;
head = root = new GroupCurly(head.next, curly.cmin,
curly.cmax, curly.type,
((GroupTail)tail).localIndex,
((GroupTail)tail).groupIndex,
capturingGroup);
return head;
} else { // Non-deterministic
int temp = ((GroupHead) head).localIndex;
Loop loop;
if (curly.type == GREEDY)
loop = new Loop(this.localCount, temp);
else // Reluctant Curly
loop = new LazyLoop(this.localCount, temp);
Prolog prolog = new Prolog(loop);
this.localCount += 1;
loop.cmin = curly.cmin;
loop.cmax = curly.cmax;
loop.body = head;
tail.next = loop;
root = loop;
return prolog; // Dual return
}
} else if (node instanceof First) {
root = node;
return node;
}
return error("Internal logic error");
|
private boolean | has(int f)Indicates whether a particular flag is set or not.
return (flags & f) > 0;
|
private static boolean | hasBaseCharacter(java.util.regex.Matcher matcher, int i, java.lang.CharSequence seq)Non spacing marks only count as word characters in bounds calculations
if they have a base character.
int start = (!matcher.transparentBounds) ?
matcher.from : 0;
for (int x=i; x >= start; x--) {
int ch = Character.codePointAt(seq, x);
if (Character.isLetterOrDigit(ch))
return true;
if (Character.getType(ch) == Character.NON_SPACING_MARK)
continue;
return false;
}
return false;
|
private boolean | isLineSeparator(int ch)Determines if character is a line separator in the current mode
if (has(UNIX_LINES)) {
return ch == '\n";
} else {
return (ch == '\n" ||
ch == '\r" ||
(ch|1) == '\u2029" ||
ch == '\u0085");
}
|
private static final boolean | isSupplementary(int ch)Determines if the specified code point is a supplementary
character or unpaired surrogate.
return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || isSurrogate(ch);
|
private static final boolean | isSurrogate(int c)Tests a surrogate value.
return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE;
|
private void | mark(int c)Mark the end of pattern with a specific character.
temp[patternLength] = c;
|
public java.util.regex.Matcher | matcher(java.lang.CharSequence input)Creates a matcher that will match the given input against this pattern.
synchronized(this) {
if (!compiled)
compile();
}
Matcher m = new Matcher(this, input);
return m;
|
public static boolean | matches(java.lang.String regex, java.lang.CharSequence input)Compiles the given regular expression and attempts to match the given
input against it.
An invocation of this convenience method of the form
Pattern.matches(regex, input);
behaves in exactly the same way as the expression
Pattern.compile(regex).matcher(input).matches()
If a pattern is to be used multiple times, compiling it once and reusing
it will be more efficient than invoking this method each time.
Pattern p = Pattern.compile(regex);
Matcher m = p.matcher(input);
return m.matches();
|
private java.util.regex.Pattern$Node | newSingle(int ch)Utility method for creating a single character matcher.
int f = flags;
if ((f & (CASE_INSENSITIVE|UNICODE_CASE)) == 0) {
return new Single(ch);
}
if ((f & UNICODE_CASE) == 0) {
return new SingleA(ch);
}
return new SingleU(ch);
|
private java.util.regex.Pattern$Node | newSlice(int[] buf, int count, boolean hasSupplementary)Utility method for creating a string slice matcher.
int[] tmp = new int[count];
int i = flags;
if ((i & (CASE_INSENSITIVE|UNICODE_CASE)) == 0) {
for (i = 0; i < count; i++) {
tmp[i] = buf[i];
}
return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
} else if ((i & UNICODE_CASE) == 0) {
for (i = 0; i < count; i++) {
tmp[i] = (char)ASCII.toLower(buf[i]);
}
return new SliceA(tmp);
} else {
for (i = 0; i < count; i++) {
int c = buf[i];
c = Character.toUpperCase(c);
c = Character.toLowerCase(c);
tmp[i] = c;
}
return new SliceU(tmp);
}
|
private int | next()Advance the cursor by one, and peek the next character.
int ch = temp[++cursor];
if (has(COMMENTS))
ch = peekPastWhitespace(ch);
return ch;
|
private int | nextEscaped()Advance the cursor by one, and peek the next character,
ignoring the COMMENTS setting
int ch = temp[++cursor];
return ch;
|
private void | normalize()The pattern is converted to normalizedD form and then a pure group
is constructed to match canonical equivalences of the characters.
boolean inCharClass = false;
int lastCodePoint = -1;
// Convert pattern into normalizedD form
normalizedPattern = Normalizer.decompose(pattern, false, 0);
patternLength = normalizedPattern.length();
// Modify pattern to match canonical equivalences
StringBuilder newPattern = new StringBuilder(patternLength);
for(int i=0; i<patternLength; ) {
int c = normalizedPattern.codePointAt(i);
StringBuilder sequenceBuffer;
if ((Character.getType(c) == Character.NON_SPACING_MARK)
&& (lastCodePoint != -1)) {
sequenceBuffer = new StringBuilder();
sequenceBuffer.appendCodePoint(lastCodePoint);
sequenceBuffer.appendCodePoint(c);
while(Character.getType(c) == Character.NON_SPACING_MARK) {
i += Character.charCount(c);
if (i >= patternLength)
break;
c = normalizedPattern.codePointAt(i);
sequenceBuffer.appendCodePoint(c);
}
String ea = produceEquivalentAlternation(
sequenceBuffer.toString());
newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
newPattern.append("(?:").append(ea).append(")");
} else if (c == '[" && lastCodePoint != '\\") {
i = normalizeCharClass(newPattern, i);
} else {
newPattern.appendCodePoint(c);
}
lastCodePoint = c;
i += Character.charCount(c);
}
normalizedPattern = newPattern.toString();
|
private int | normalizeCharClass(java.lang.StringBuilder newPattern, int i)Complete the character class being parsed and add a set
of alternations to it that will match the canonical equivalences
of the characters within the class.
StringBuilder charClass = new StringBuilder();
StringBuilder eq = null;
int lastCodePoint = -1;
String result;
i++;
charClass.append("[");
while(true) {
int c = normalizedPattern.codePointAt(i);
StringBuilder sequenceBuffer;
if (c == ']" && lastCodePoint != '\\") {
charClass.append((char)c);
break;
} else if (Character.getType(c) == Character.NON_SPACING_MARK) {
sequenceBuffer = new StringBuilder();
sequenceBuffer.appendCodePoint(lastCodePoint);
while(Character.getType(c) == Character.NON_SPACING_MARK) {
sequenceBuffer.appendCodePoint(c);
i += Character.charCount(c);
if (i >= normalizedPattern.length())
break;
c = normalizedPattern.codePointAt(i);
}
String ea = produceEquivalentAlternation(
sequenceBuffer.toString());
charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
if (eq == null)
eq = new StringBuilder();
eq.append('|");
eq.append(ea);
} else {
charClass.appendCodePoint(c);
i++;
}
if (i == normalizedPattern.length())
error("Unclosed character class");
lastCodePoint = c;
}
if (eq != null) {
result = new String("(?:"+charClass.toString()+
eq.toString()+")");
} else {
result = charClass.toString();
}
newPattern.append(result);
return i;
|
private int | o()Utility method for parsing octal escape sequences.
int n = read();
if (((n-'0")|('7"-n)) >= 0) {
int m = read();
if (((m-'0")|('7"-m)) >= 0) {
int o = read();
if ((((o-'0")|('7"-o)) >= 0) && (((n-'0")|('3"-n)) >= 0)) {
return (n - '0") * 64 + (m - '0") * 8 + (o - '0");
}
unread();
return (n - '0") * 8 + (m - '0");
}
unread();
return (n - '0");
}
error("Illegal octal escape sequence");
return -1;
|
private int | parsePastLine()xmode parse past comment to end of line.
int ch = temp[cursor++];
while (ch != 0 && !isLineSeparator(ch))
ch = temp[cursor++];
return ch;
|
private int | parsePastWhitespace(int ch)If in xmode parse past whitespace and comments.
while (ASCII.isSpace(ch) || ch == '#") {
while (ASCII.isSpace(ch))
ch = temp[cursor++];
if (ch == '#")
ch = parsePastLine();
}
return ch;
|
public java.lang.String | pattern()Returns the regular expression from which this pattern was compiled.
return pattern;
|
private int | peek()Peek the next character, and do not advance the cursor.
int ch = temp[cursor];
if (has(COMMENTS))
ch = peekPastWhitespace(ch);
return ch;
|
private int | peekPastLine()xmode peek past comment to end of line.
int ch = temp[++cursor];
while (ch != 0 && !isLineSeparator(ch))
ch = temp[++cursor];
return ch;
|
private int | peekPastWhitespace(int ch)If in xmode peek past whitespace and comments.
while (ASCII.isSpace(ch) || ch == '#") {
while (ASCII.isSpace(ch))
ch = temp[++cursor];
if (ch == '#") {
ch = peekPastLine();
}
}
return ch;
|
private static void | printObjectTree(java.util.regex.Pattern$Node node)Used to print out a subtree of the Pattern to help with debugging.
while(node != null) {
if (node instanceof Prolog) {
System.out.println(node);
printObjectTree(((Prolog)node).loop);
System.out.println("**** end contents prolog loop");
} else if (node instanceof Loop) {
System.out.println(node);
printObjectTree(((Loop)node).body);
System.out.println("**** end contents Loop body");
} else if (node instanceof Curly) {
System.out.println(node);
printObjectTree(((Curly)node).atom);
System.out.println("**** end contents Curly body");
} else if (node instanceof GroupCurly) {
System.out.println(node);
printObjectTree(((GroupCurly)node).atom);
System.out.println("**** end contents GroupCurly body");
} else if (node instanceof GroupTail) {
System.out.println(node);
System.out.println("Tail next is "+node.next);
return;
} else {
System.out.println(node);
}
node = node.next;
if (node != null)
System.out.println("->next:");
if (node == Pattern.accept) {
System.out.println("Accept Node");
node = null;
}
}
|
private java.lang.String | produceEquivalentAlternation(java.lang.String source)Given a specific sequence composed of a regular character and
combining marks that follow it, produce the alternation that will
match all canonical equivalences of that sequence.
int len = countChars(source, 0, 1);
if (source.length() == len)
// source has one character.
return new String(source);
String base = source.substring(0,len);
String combiningMarks = source.substring(len);
String[] perms = producePermutations(combiningMarks);
StringBuilder result = new StringBuilder(source);
// Add combined permutations
for(int x=0; x<perms.length; x++) {
String next = base + perms[x];
if (x>0)
result.append("|"+next);
next = composeOneStep(next);
if (next != null)
result.append("|"+produceEquivalentAlternation(next));
}
return result.toString();
|
private java.lang.String[] | producePermutations(java.lang.String input)Returns an array of strings that have all the possible
permutations of the characters in the input string.
This is used to get a list of all possible orderings
of a set of combining marks. Note that some of the permutations
are invalid because of combining class collisions, and these
possibilities must be removed because they are not canonically
equivalent.
if (input.length() == countChars(input, 0, 1))
return new String[] {input};
if (input.length() == countChars(input, 0, 2)) {
int c0 = Character.codePointAt(input, 0);
int c1 = Character.codePointAt(input, Character.charCount(c0));
if (getClass(c1) == getClass(c0)) {
return new String[] {input};
}
String[] result = new String[2];
result[0] = input;
StringBuilder sb = new StringBuilder(2);
sb.appendCodePoint(c1);
sb.appendCodePoint(c0);
result[1] = sb.toString();
return result;
}
int length = 1;
int nCodePoints = countCodePoints(input);
for(int x=1; x<nCodePoints; x++)
length = length * (x+1);
String[] temp = new String[length];
int combClass[] = new int[nCodePoints];
for(int x=0, i=0; x<nCodePoints; x++) {
int c = Character.codePointAt(input, i);
combClass[x] = getClass(c);
i += Character.charCount(c);
}
// For each char, take it out and add the permutations
// of the remaining chars
int index = 0;
int len;
// offset maintains the index in code units.
loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
len = countChars(input, offset, 1);
boolean skip = false;
for(int y=x-1; y>=0; y--) {
if (combClass[y] == combClass[x]) {
continue loop;
}
}
StringBuilder sb = new StringBuilder(input);
String otherChars = sb.delete(offset, offset+len).toString();
String[] subResult = producePermutations(otherChars);
String prefix = input.substring(offset, offset+len);
for(int y=0; y<subResult.length; y++)
temp[index++] = prefix + subResult[y];
}
String[] result = new String[index];
for (int x=0; x<index; x++)
result[x] = temp[x];
return result;
|
public static java.lang.String | quote(java.lang.String s)Returns a literal pattern String for the specified
String .
This method produces a String that can be used to
create a Pattern that would match the string
s as if it were a literal pattern. Metacharacters
or escape sequences in the input sequence will be given no special
meaning.
int slashEIndex = s.indexOf("\\E");
if (slashEIndex == -1)
return "\\Q" + s + "\\E";
StringBuilder sb = new StringBuilder(s.length() * 2);
sb.append("\\Q");
slashEIndex = 0;
int current = 0;
while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
sb.append(s.substring(current, slashEIndex));
current = slashEIndex + 2;
sb.append("\\E\\\\E\\Q");
}
sb.append(s.substring(current, s.length()));
sb.append("\\E");
return sb.toString();
|
private java.util.regex.Pattern$Node | range(java.util.regex.Pattern$BitClass bits)Parse a single character or a character range in a character class
and return its representative node.
int ch = peek();
if (ch == '\\") {
ch = nextEscaped();
if (ch == 'p" || ch == 'P") { // A property
boolean comp = (ch == 'P");
boolean oneLetter = true;
// Consume { if present
ch = next();
if (ch != '{")
unread();
else
oneLetter = false;
return family(comp, oneLetter);
} else { // ordinary escape
unread();
ch = escape(true, true);
if (ch == -1)
return root;
}
} else {
ch = single();
}
if (ch >= 0) {
if (peek() == '-") {
int endRange = temp[cursor+1];
if (endRange == '[") {
if (ch < 256)
return bits.add(ch, flags());
return newSingle(ch);
}
if (endRange != ']") {
next();
int m = single();
if (m < ch)
return error("Illegal character range");
if (has(CASE_INSENSITIVE) || has(UNICODE_CASE))
return new CIRange(ch, m);
else
return new Range(ch, m);
}
}
if (ch < 256)
return bits.add(ch, flags());
return newSingle(ch);
}
return error("Unexpected character '"+((char)ch)+"'");
|
private int | read()Read the next character, and advance the cursor by one.
int ch = temp[cursor++];
if (has(COMMENTS))
ch = parsePastWhitespace(ch);
return ch;
|
private int | readEscaped()Read the next character, and advance the cursor by one,
ignoring the COMMENTS setting
int ch = temp[cursor++];
return ch;
|
private void | readObject(java.io.ObjectInputStream s)Recompile the Pattern instance from a stream. The original pattern
string is read in and the object tree is recompiled from it.
// Read in all fields
s.defaultReadObject();
// Initialize counts
capturingGroupCount = 1;
localCount = 0;
// if length > 0, the Pattern is lazily compiled
compiled = false;
if (pattern.length() == 0) {
root = new Start(lastAccept);
matchRoot = lastAccept;
compiled = true;
}
|
private java.util.regex.Pattern$Node | ref(int refNum)Parses a backref greedily, taking as many numbers as it
can. The first digit is always treated as a backref, but
multi digit numbers are only treated as a backref if at
least that many backrefs exist at this point in the regex.
boolean done = false;
while(!done) {
int ch = peek();
switch(ch) {
case '0":
case '1":
case '2":
case '3":
case '4":
case '5":
case '6":
case '7":
case '8":
case '9":
int newRefNum = (refNum * 10) + (ch - '0");
// Add another number if it doesn't make a group
// that doesn't exist
if (capturingGroupCount - 1 < newRefNum) {
done = true;
break;
}
refNum = newRefNum;
read();
break;
default:
done = true;
break;
}
}
if (has(CASE_INSENSITIVE) || has(UNICODE_CASE))
return new CIBackRef(refNum);
else
return new BackRef(refNum);
|
private java.util.regex.Pattern$Node | retrieveCategoryNode(java.lang.String name)
Node n = (Node)categoryNames.cMap.get(name);
if (n != null)
return n;
return familyError(name, "Unknown character category {");
|
private java.util.regex.Pattern$Node | retrieveFamilyNode(java.lang.String name, boolean not)
if (name == null) {
return familyError("", "Null character family.");
}
Node n = null;
try {
Character.UnicodeBlock block = Character.UnicodeBlock.forName(name);
n = new UBlock(block, not);
} catch (IllegalArgumentException iae) {
return familyError(name, "Unknown character family {");
}
return n;
|
private java.util.regex.Pattern$Node | sequence(java.util.regex.Pattern$Node end)Parsing of sequences between alternations.
Node head = null;
Node tail = null;
Node node = null;
int i, j, ch;
LOOP:
for (;;) {
ch = peek();
switch (ch) {
case '(":
// Because group handles its own closure,
// we need to treat it differently
node = group0();
// Check for comment or flag group
if (node == null)
continue;
if (head == null)
head = node;
else
tail.next = node;
// Double return: Tail was returned in root
tail = root;
continue;
case '[":
node = clazz(true);
break;
case '\\":
ch = nextEscaped();
if (ch == 'p" || ch == 'P") {
boolean comp = (ch == 'P");
boolean oneLetter = true;
ch = next(); // Consume { if present
if (ch != '{") {
unread();
} else {
oneLetter = false;
}
node = family(comp, oneLetter);
} else {
unread();
node = atom();
}
break;
case '^":
next();
if (has(MULTILINE)) {
if (has(UNIX_LINES))
node = new UnixCaret();
else
node = new Caret();
} else {
node = new Begin();
}
break;
case '$":
next();
if (has(UNIX_LINES))
node = new UnixDollar(has(MULTILINE));
else
node = new Dollar(has(MULTILINE));
break;
case '.":
next();
if (has(DOTALL)) {
node = new All();
} else {
if (has(UNIX_LINES))
node = new UnixDot();
else {
node = new Dot();
}
}
break;
case '|":
case ')":
break LOOP;
case ']": // Now interpreting dangling ] and } as literals
case '}":
node = atom();
break;
case '?":
case '*":
case '+":
next();
return error("Dangling meta character '" + ((char)ch) + "'");
case 0:
if (cursor >= patternLength) {
break LOOP;
}
// Fall through
default:
node = atom();
break;
}
node = closure(node);
if (head == null) {
head = tail = node;
} else {
tail.next = node;
tail = node;
}
}
if (head == null) {
return end;
}
tail.next = end;
return head;
|
private int | single()
int ch = peek();
switch (ch) {
case '\\":
return escape(true, false);
default:
next();
return ch;
}
|
private int | skip()Read the character after the next one, and advance the cursor by two.
int i = cursor;
int ch = temp[i+1];
cursor = i + 2;
return ch;
|
public java.lang.String[] | split(java.lang.CharSequence input, int limit)Splits the given input sequence around matches of this pattern.
The array returned by this method contains each substring of the
input sequence that is terminated by another subsequence that matches
this pattern or is terminated by the end of the input sequence. The
substrings in the array are in the order in which they occur in the
input. If this pattern does not match any subsequence of the input then
the resulting array has just one element, namely the input sequence in
string form.
The limit parameter controls the number of times the
pattern is applied and therefore affects the length of the resulting
array. If the limit n is greater than zero then the pattern
will be applied at most n - 1 times, the array's
length will be no greater than n, and the array's last entry
will contain all input beyond the last matched delimiter. If n
is non-positive then the pattern will be applied as many times as
possible and the array can have any length. If n is zero then
the pattern will be applied as many times as possible, the array can
have any length, and trailing empty strings will be discarded.
The input "boo:and:foo", for example, yields the following
results with these parameters:
Regex |
Limit |
Result |
: |
2 |
{ "boo", "and:foo" } |
: |
5 |
{ "boo", "and", "foo" } |
: |
-2 |
{ "boo", "and", "foo" } |
o |
5 |
{ "b", "", ":and:f", "", "" } |
o |
-2 |
{ "b", "", ":and:f", "", "" } |
o |
0 |
{ "b", "", ":and:f" } |
int index = 0;
boolean matchLimited = limit > 0;
ArrayList matchList = new ArrayList();
Matcher m = matcher(input);
// Add segments before each match found
while(m.find()) {
if (!matchLimited || matchList.size() < limit - 1) {
String match = input.subSequence(index, m.start()).toString();
matchList.add(match);
index = m.end();
} else if (matchList.size() == limit - 1) { // last one
String match = input.subSequence(index,
input.length()).toString();
matchList.add(match);
index = m.end();
}
}
// If no match was found, return this
if (index == 0)
return new String[] {input.toString()};
// Add remaining segment
if (!matchLimited || matchList.size() < limit)
matchList.add(input.subSequence(index, input.length()).toString());
// Construct result
int resultSize = matchList.size();
if (limit == 0)
while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
resultSize--;
String[] result = new String[resultSize];
return (String[])matchList.subList(0, resultSize).toArray(result);
|
public java.lang.String[] | split(java.lang.CharSequence input)Splits the given input sequence around matches of this pattern.
This method works as if by invoking the two-argument {@link
#split(java.lang.CharSequence, int) split} method with the given input
sequence and a limit argument of zero. Trailing empty strings are
therefore not included in the resulting array.
The input "boo:and:foo", for example, yields the following
results with these expressions:
Regex |
Result |
: |
{ "boo", "and", "foo" } |
o |
{ "b", "", ":and:f" } |
return split(input, 0);
|
private void | subFlag()Parses the second part of inlined match flags and turns off
flags appropriately.
int ch = peek();
for (;;) {
switch (ch) {
case 'i":
flags &= ~CASE_INSENSITIVE;
break;
case 'm":
flags &= ~MULTILINE;
break;
case 's":
flags &= ~DOTALL;
break;
case 'd":
flags &= ~UNIX_LINES;
break;
case 'u":
flags &= ~UNICODE_CASE;
break;
case 'c":
flags &= ~CANON_EQ;
break;
case 'x":
flags &= ~COMMENTS;
break;
default:
return;
}
ch = next();
}
|
public java.lang.String | toString()Returns the string representation of this pattern. This
is the regular expression from which this pattern was
compiled.
return pattern;
|
private int | u()Utility method for parsing unicode escape sequences.
int n = 0;
for (int i = 0; i < 4; i++) {
int ch = read();
if (!ASCII.isHexDigit(ch)) {
error("Illegal Unicode escape sequence");
}
n = n * 16 + ASCII.toDigit(ch);
}
return n;
|
private void | unread()Unread one next character, and retreat cursor by one.
cursor--;
|
private int | x()Utility method for parsing hexadecimal escape sequences.
int n = read();
if (ASCII.isHexDigit(n)) {
int m = read();
if (ASCII.isHexDigit(m)) {
return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
}
}
error("Illegal hexadecimal escape sequence");
return -1;
|