Methods Summary |
---|
private void | RemoveQEQuoting()Preprocess any \Q...\E sequences in `temp', meta-quoting them.
See the description of `quotemeta' in perlfunc(1).
final int pLen = patternLength;
int i = 0;
while (i < pLen-1) {
if (temp[i] != '\\")
i += 1;
else if (temp[i + 1] != 'Q")
i += 2;
else
break;
}
if (i >= pLen - 1) // No \Q sequence found
return;
int j = i;
i += 2;
int[] newtemp = new int[j + 2*(pLen-i) + 2];
System.arraycopy(temp, 0, newtemp, 0, j);
boolean inQuote = true;
while (i < pLen) {
int c = temp[i++];
if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) {
newtemp[j++] = c;
} else if (c != '\\") {
if (inQuote) newtemp[j++] = '\\";
newtemp[j++] = c;
} else if (inQuote) {
if (temp[i] == 'E") {
i++;
inQuote = false;
} else {
newtemp[j++] = '\\";
newtemp[j++] = '\\";
}
} else {
if (temp[i] == 'Q") {
i++;
inQuote = true;
} else {
newtemp[j++] = c;
if (i != pLen)
newtemp[j++] = temp[i++];
}
}
}
patternLength = j;
temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
|
private void | accept(int ch, java.lang.String s)Match next character, signal error if failed.
int testChar = temp[cursor++];
if (has(COMMENTS))
testChar = parsePastWhitespace(testChar);
if (ch != testChar) {
throw error(s);
}
|
private void | addFlag()Parses inlined match flags and set them appropriately.
int ch = peek();
for (;;) {
switch (ch) {
case 'i":
flags |= CASE_INSENSITIVE;
break;
case 'm":
flags |= MULTILINE;
break;
case 's":
flags |= DOTALL;
break;
case 'd":
flags |= UNIX_LINES;
break;
case 'u":
flags |= UNICODE_CASE;
break;
case 'c":
flags |= CANON_EQ;
break;
case 'x":
flags |= COMMENTS;
break;
case '-": // subFlag then fall through
ch = next();
subFlag();
default:
return;
}
ch = next();
}
|
private void | append(int ch, int len)
if (len >= buffer.length) {
int[] tmp = new int[len+len];
System.arraycopy(buffer, 0, tmp, 0, len);
buffer = tmp;
}
buffer[len] = ch;
|
private java.util.regex.Pattern$Node | atom()Parse and add a new Single or Slice.
int first = 0;
int prev = -1;
boolean hasSupplementary = false;
int ch = peek();
for (;;) {
switch (ch) {
case '*":
case '+":
case '?":
case '{":
if (first > 1) {
cursor = prev; // Unwind one character
first--;
}
break;
case '$":
case '.":
case '^":
case '(":
case '[":
case '|":
case ')":
break;
case '\\":
ch = nextEscaped();
if (ch == 'p" || ch == 'P") { // Property
if (first > 0) { // Slice is waiting; handle it first
unread();
break;
} else { // No slice; just return the family node
boolean comp = (ch == 'P");
boolean oneLetter = true;
ch = next(); // Consume { if present
if (ch != '{")
unread();
else
oneLetter = false;
return family(oneLetter).maybeComplement(comp);
}
}
unread();
prev = cursor;
ch = escape(false, first == 0);
if (ch >= 0) {
append(ch, first);
first++;
if (isSupplementary(ch)) {
hasSupplementary = true;
}
ch = peek();
continue;
} else if (first == 0) {
return root;
}
// Unwind meta escape sequence
cursor = prev;
break;
case 0:
if (cursor >= patternLength) {
break;
}
// Fall through
default:
prev = cursor;
append(ch, first);
first++;
if (isSupplementary(ch)) {
hasSupplementary = true;
}
ch = next();
continue;
}
break;
}
if (first == 1) {
return newSingle(buffer[0]);
} else {
return newSlice(buffer, first, hasSupplementary);
}
|
private java.util.regex.Pattern$CharProperty | bitsOrSingle(java.util.regex.Pattern$BitClass bits, int ch)
/* Bits can only handle codepoints in [u+0000-u+00ff] range.
Use "single" node instead of bits when dealing with unicode
case folding for codepoints listed below.
(1)Uppercase out of range: u+00ff, u+00b5
toUpperCase(u+00ff) -> u+0178
toUpperCase(u+00b5) -> u+039c
(2)LatinSmallLetterLongS u+17f
toUpperCase(u+017f) -> u+0053
(3)LatinSmallLetterDotlessI u+131
toUpperCase(u+0131) -> u+0049
(4)LatinCapitalLetterIWithDotAbove u+0130
toLowerCase(u+0130) -> u+0069
(5)KelvinSign u+212a
toLowerCase(u+212a) ==> u+006B
(6)AngstromSign u+212b
toLowerCase(u+212b) ==> u+00e5
*/
int d;
if (ch < 256 &&
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
(ch == 0xff || ch == 0xb5 ||
ch == 0x49 || ch == 0x69 || //I and i
ch == 0x53 || ch == 0x73 || //S and s
ch == 0x4b || ch == 0x6b || //K and k
ch == 0xc5 || ch == 0xe5))) //A+ring
return bits.add(ch, flags());
return newSingle(ch);
|
private int | c()Utility method for parsing control escape sequences.
if (cursor < patternLength) {
return read() ^ 64;
}
throw error("Illegal control escape sequence");
|
private java.util.regex.Pattern$CharProperty | caseInsensitiveRangeFor(int lower, int upper)Returns node for matching characters within an explicit value
range in a case insensitive manner.
if (has(UNICODE_CASE))
return new CharProperty() {
boolean isSatisfiedBy(int ch) {
if (inRange(lower, ch, upper))
return true;
int up = Character.toUpperCase(ch);
return inRange(lower, up, upper) ||
inRange(lower, Character.toLowerCase(up), upper);}};
return new CharProperty() {
boolean isSatisfiedBy(int ch) {
return inRange(lower, ch, upper) ||
ASCII.isAscii(ch) &&
(inRange(lower, ASCII.toUpper(ch), upper) ||
inRange(lower, ASCII.toLower(ch), upper));
}};
|
private java.util.regex.Pattern$CharProperty | charPropertyNodeFor(java.lang.String name)Returns a CharProperty matching all characters in a named property.
CharProperty p = CharPropertyNames.charPropertyFor(name);
if (p == null)
throw error("Unknown character property name {" + name + "}");
return p;
|
private java.util.regex.Pattern$CharProperty | clazz(boolean consume)Parse a character class, and return the node that matches it.
Consumes a ] on the way out if consume is true. Usually consume
is true except for the case of [abc&&def] where def is a separate
right hand node with "understood" brackets.
CharProperty prev = null;
CharProperty node = null;
BitClass bits = new BitClass();
boolean include = true;
boolean firstInClass = true;
int ch = next();
for (;;) {
switch (ch) {
case '^":
// Negates if first char in a class, otherwise literal
if (firstInClass) {
if (temp[cursor-1] != '[")
break;
ch = next();
include = !include;
continue;
} else {
// ^ not first in class, treat as literal
break;
}
case '[":
firstInClass = false;
node = clazz(true);
if (prev == null)
prev = node;
else
prev = union(prev, node);
ch = peek();
continue;
case '&":
firstInClass = false;
ch = next();
if (ch == '&") {
ch = next();
CharProperty rightNode = null;
while (ch != ']" && ch != '&") {
if (ch == '[") {
if (rightNode == null)
rightNode = clazz(true);
else
rightNode = union(rightNode, clazz(true));
} else { // abc&&def
unread();
rightNode = clazz(false);
}
ch = peek();
}
if (rightNode != null)
node = rightNode;
if (prev == null) {
if (rightNode == null)
throw error("Bad class syntax");
else
prev = rightNode;
} else {
prev = intersection(prev, node);
}
} else {
// treat as a literal &
unread();
break;
}
continue;
case 0:
firstInClass = false;
if (cursor >= patternLength)
throw error("Unclosed character class");
break;
case ']":
firstInClass = false;
if (prev != null) {
if (consume)
next();
return prev;
}
break;
default:
firstInClass = false;
break;
}
node = range(bits);
if (include) {
if (prev == null) {
prev = node;
} else {
if (prev != node)
prev = union(prev, node);
}
} else {
if (prev == null) {
prev = node.complement();
} else {
if (prev != node)
prev = setDifference(prev, node);
}
}
ch = peek();
}
|
private java.util.regex.Pattern$Node | closure(java.util.regex.Pattern$Node prev)Processes repetition. If the next character peeked is a quantifier
then new nodes must be appended to handle the repetition.
Prev could be a single or a group, so it could be a chain of nodes.
Node atom;
int ch = peek();
switch (ch) {
case '?":
ch = next();
if (ch == '?") {
next();
return new Ques(prev, LAZY);
} else if (ch == '+") {
next();
return new Ques(prev, POSSESSIVE);
}
return new Ques(prev, GREEDY);
case '*":
ch = next();
if (ch == '?") {
next();
return new Curly(prev, 0, MAX_REPS, LAZY);
} else if (ch == '+") {
next();
return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
}
return new Curly(prev, 0, MAX_REPS, GREEDY);
case '+":
ch = next();
if (ch == '?") {
next();
return new Curly(prev, 1, MAX_REPS, LAZY);
} else if (ch == '+") {
next();
return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
}
return new Curly(prev, 1, MAX_REPS, GREEDY);
case '{":
ch = temp[cursor+1];
if (ASCII.isDigit(ch)) {
skip();
int cmin = 0;
do {
cmin = cmin * 10 + (ch - '0");
} while (ASCII.isDigit(ch = read()));
int cmax = cmin;
if (ch == ',") {
ch = read();
cmax = MAX_REPS;
if (ch != '}") {
cmax = 0;
while (ASCII.isDigit(ch)) {
cmax = cmax * 10 + (ch - '0");
ch = read();
}
}
}
if (ch != '}")
throw error("Unclosed counted closure");
if (((cmin) | (cmax) | (cmax - cmin)) < 0)
throw error("Illegal repetition range");
Curly curly;
ch = peek();
if (ch == '?") {
next();
curly = new Curly(prev, cmin, cmax, LAZY);
} else if (ch == '+") {
next();
curly = new Curly(prev, cmin, cmax, POSSESSIVE);
} else {
curly = new Curly(prev, cmin, cmax, GREEDY);
}
return curly;
} else {
throw error("Illegal repetition");
}
default:
return prev;
}
|
public static java.util.regex.Pattern | compile(java.lang.String regex)Compiles the given regular expression into a pattern.
return new Pattern(regex, 0);
|
public static java.util.regex.Pattern | compile(java.lang.String regex, int flags)Compiles the given regular expression into a pattern with the given
flags.
return new Pattern(regex, flags);
|
private void | compile()Copies regular expression to an int array and invokes the parsing
of the expression which will create the object tree.
// Handle canonical equivalences
if (has(CANON_EQ) && !has(LITERAL)) {
normalize();
} else {
normalizedPattern = pattern;
}
patternLength = normalizedPattern.length();
// Copy pattern to int array for convenience
// Use double zero to terminate pattern
temp = new int[patternLength + 2];
boolean hasSupplementary = false;
int c, count = 0;
// Convert all chars into code points
for (int x = 0; x < patternLength; x += Character.charCount(c)) {
c = normalizedPattern.codePointAt(x);
if (isSupplementary(c)) {
hasSupplementary = true;
}
temp[count++] = c;
}
patternLength = count; // patternLength now in code points
if (! has(LITERAL))
RemoveQEQuoting();
// Allocate all temporary objects here.
buffer = new int[32];
groupNodes = new GroupHead[10];
if (has(LITERAL)) {
// Literal pattern handling
matchRoot = newSlice(temp, patternLength, hasSupplementary);
matchRoot.next = lastAccept;
} else {
// Start recursive descent parsing
matchRoot = expr(lastAccept);
// Check extra pattern characters
if (patternLength != cursor) {
if (peek() == ')") {
throw error("Unmatched closing ')'");
} else {
throw error("Unexpected internal error");
}
}
}
// Peephole optimization
if (matchRoot instanceof Slice) {
root = BnM.optimize(matchRoot);
if (root == matchRoot) {
root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
}
} else if (matchRoot instanceof Begin || matchRoot instanceof First) {
root = matchRoot;
} else {
root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
}
// Release temporary storage
temp = null;
buffer = null;
groupNodes = null;
patternLength = 0;
compiled = true;
|
private java.lang.String | composeOneStep(java.lang.String input)Attempts to compose input by combining the first character
with the first combining mark following it. Returns a String
that is the composition of the leading character with its first
combining mark followed by the remaining combining marks. Returns
null if the first two characters cannot be further composed.
int len = countChars(input, 0, 2);
String firstTwoCharacters = input.substring(0, len);
String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
if (result.equals(firstTwoCharacters))
return null;
else {
String remainder = input.substring(len);
return result + remainder;
}
|
private static final int | countChars(java.lang.CharSequence seq, int index, int lengthInCodePoints)
// optimization
if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
assert (index >= 0 && index < seq.length());
return 1;
}
int length = seq.length();
int x = index;
if (lengthInCodePoints >= 0) {
assert (index >= 0 && index < length);
for (int i = 0; x < length && i < lengthInCodePoints; i++) {
if (Character.isHighSurrogate(seq.charAt(x++))) {
if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
x++;
}
}
}
return x - index;
}
assert (index >= 0 && index <= length);
if (index == 0) {
return 0;
}
int len = -lengthInCodePoints;
for (int i = 0; x > 0 && i < len; i++) {
if (Character.isLowSurrogate(seq.charAt(--x))) {
if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
x--;
}
}
}
return index - x;
|
private static final int | countCodePoints(java.lang.CharSequence seq)
int length = seq.length();
int n = 0;
for (int i = 0; i < length; ) {
n++;
if (Character.isHighSurrogate(seq.charAt(i++))) {
if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
i++;
}
}
}
return n;
|
private java.util.regex.Pattern$Node | createGroup(boolean anonymous)Create group head and tail nodes using double return. If the group is
created with anonymous true then it is a pure group and should not
affect group counting.
int localIndex = localCount++;
int groupIndex = 0;
if (!anonymous)
groupIndex = capturingGroupCount++;
GroupHead head = new GroupHead(localIndex);
root = new GroupTail(localIndex, groupIndex);
if (!anonymous && groupIndex < 10)
groupNodes[groupIndex] = head;
return head;
|
private java.util.regex.PatternSyntaxException | error(java.lang.String s)Internal method used for handling all syntax errors. The pattern is
displayed with a pointer to aid in locating the syntax error.
return new PatternSyntaxException(s, normalizedPattern, cursor - 1);
|
private int | escape(boolean inclass, boolean create)Parses an escape sequence to determine the actual value that needs
to be matched.
If -1 is returned and create was true a new object was added to the tree
to handle the escape sequence.
If the returned value is greater than zero, it is the value that
matches the escape sequence.
int ch = skip();
switch (ch) {
case '0":
return o();
case '1":
case '2":
case '3":
case '4":
case '5":
case '6":
case '7":
case '8":
case '9":
if (inclass) break;
if (create) {
root = ref((ch - '0"));
}
return -1;
case 'A":
if (inclass) break;
if (create) root = new Begin();
return -1;
case 'B":
if (inclass) break;
if (create) root = new Bound(Bound.NONE);
return -1;
case 'C":
break;
case 'D":
if (create) root = new Ctype(ASCII.DIGIT).complement();
return -1;
case 'E":
case 'F":
break;
case 'G":
if (inclass) break;
if (create) root = new LastMatch();
return -1;
case 'H":
case 'I":
case 'J":
case 'K":
case 'L":
case 'M":
case 'N":
case 'O":
case 'P":
case 'Q":
case 'R":
break;
case 'S":
if (create) root = new Ctype(ASCII.SPACE).complement();
return -1;
case 'T":
case 'U":
case 'V":
break;
case 'W":
if (create) root = new Ctype(ASCII.WORD).complement();
return -1;
case 'X":
case 'Y":
break;
case 'Z":
if (inclass) break;
if (create) {
if (has(UNIX_LINES))
root = new UnixDollar(false);
else
root = new Dollar(false);
}
return -1;
case 'a":
return '\007";
case 'b":
if (inclass) break;
if (create) root = new Bound(Bound.BOTH);
return -1;
case 'c":
return c();
case 'd":
if (create) root = new Ctype(ASCII.DIGIT);
return -1;
case 'e":
return '\033";
case 'f":
return '\f";
case 'g":
case 'h":
case 'i":
case 'j":
case 'k":
case 'l":
case 'm":
break;
case 'n":
return '\n";
case 'o":
case 'p":
case 'q":
break;
case 'r":
return '\r";
case 's":
if (create) root = new Ctype(ASCII.SPACE);
return -1;
case 't":
return '\t";
case 'u":
return u();
case 'v":
return '\013";
case 'w":
if (create) root = new Ctype(ASCII.WORD);
return -1;
case 'x":
return x();
case 'y":
break;
case 'z":
if (inclass) break;
if (create) root = new End();
return -1;
default:
return ch;
}
throw error("Illegal/unsupported escape sequence");
|
private java.util.regex.Pattern$Node | expr(java.util.regex.Pattern$Node end)The expression is parsed with branch nodes added for alternations.
This may be called recursively to parse sub expressions that may
contain alternations.
Node prev = null;
Node firstTail = null;
Node branchConn = null;
for (;;) {
Node node = sequence(end);
Node nodeTail = root; //double return
if (prev == null) {
prev = node;
firstTail = nodeTail;
} else {
// Branch
if (branchConn == null) {
branchConn = new BranchConn();
branchConn.next = end;
}
if (node == end) {
// if the node returned from sequence() is "end"
// we have an empty expr, set a null atom into
// the branch to indicate to go "next" directly.
node = null;
} else {
// the "tail.next" of each atom goes to branchConn
nodeTail.next = branchConn;
}
if (prev instanceof Branch) {
((Branch)prev).add(node);
} else {
if (prev == end) {
prev = null;
} else {
// replace the "end" with "branchConn" at its tail.next
// when put the "prev" into the branch as the first atom.
firstTail.next = branchConn;
}
prev = new Branch(prev, node, branchConn);
}
}
if (peek() != '|") {
return prev;
}
next();
}
|
private java.util.regex.Pattern$CharProperty | family(boolean singleLetter)Parses a Unicode character family and returns its representative node.
next();
String name;
if (singleLetter) {
int c = temp[cursor];
if (!Character.isSupplementaryCodePoint(c)) {
name = String.valueOf((char)c);
} else {
name = new String(temp, cursor, 1);
}
read();
} else {
int i = cursor;
mark('}");
while(read() != '}") {
}
mark('\000");
int j = cursor;
if (j > patternLength)
throw error("Unclosed character family");
if (i + 1 >= j)
throw error("Empty character family");
name = new String(temp, i, j-i-1);
}
if (name.startsWith("In")) {
return unicodeBlockPropertyFor(name.substring(2));
} else {
if (name.startsWith("Is"))
name = name.substring(2);
return charPropertyNodeFor(name);
}
|
private boolean | findSupplementary(int start, int end)Determines if there is any supplementary character or unpaired
surrogate in the specified range.
for (int i = start; i < end; i++) {
if (isSupplementary(temp[i]))
return true;
}
return false;
|
public int | flags()Returns this pattern's match flags.
return flags;
|
private int | getClass(int c)
return sun.text.Normalizer.getCombiningClass(c);
|
private java.util.regex.Pattern$Node | group0()Parses a group and returns the head node of a set of nodes that process
the group. Sometimes a double return system is used where the tail is
returned in root.
boolean capturingGroup = false;
Node head = null;
Node tail = null;
int save = flags;
root = null;
int ch = next();
if (ch == '?") {
ch = skip();
switch (ch) {
case ':": // (?:xxx) pure group
head = createGroup(true);
tail = root;
head.next = expr(tail);
break;
case '=": // (?=xxx) and (?!xxx) lookahead
case '!":
head = createGroup(true);
tail = root;
head.next = expr(tail);
if (ch == '=") {
head = tail = new Pos(head);
} else {
head = tail = new Neg(head);
}
break;
case '>": // (?>xxx) independent group
head = createGroup(true);
tail = root;
head.next = expr(tail);
head = tail = new Ques(head, INDEPENDENT);
break;
case '<": // (?<xxx) look behind
ch = read();
int start = cursor;
head = createGroup(true);
tail = root;
head.next = expr(tail);
tail.next = lookbehindEnd;
TreeInfo info = new TreeInfo();
head.study(info);
if (info.maxValid == false) {
throw error("Look-behind group does not have "
+ "an obvious maximum length");
}
boolean hasSupplementary = findSupplementary(start, patternLength);
if (ch == '=") {
head = tail = (hasSupplementary ?
new BehindS(head, info.maxLength,
info.minLength) :
new Behind(head, info.maxLength,
info.minLength));
} else if (ch == '!") {
head = tail = (hasSupplementary ?
new NotBehindS(head, info.maxLength,
info.minLength) :
new NotBehind(head, info.maxLength,
info.minLength));
} else {
throw error("Unknown look-behind group");
}
break;
case '$":
case '@":
throw error("Unknown group type");
default: // (?xxx:) inlined match flags
unread();
addFlag();
ch = read();
if (ch == ')") {
return null; // Inline modifier only
}
if (ch != ':") {
throw error("Unknown inline modifier");
}
head = createGroup(true);
tail = root;
head.next = expr(tail);
break;
}
} else { // (xxx) a regular group
capturingGroup = true;
head = createGroup(false);
tail = root;
head.next = expr(tail);
}
accept(')", "Unclosed group");
flags = save;
// Check for quantifiers
Node node = closure(head);
if (node == head) { // No closure
root = tail;
return node; // Dual return
}
if (head == tail) { // Zero length assertion
root = node;
return node; // Dual return
}
if (node instanceof Ques) {
Ques ques = (Ques) node;
if (ques.type == POSSESSIVE) {
root = node;
return node;
}
tail.next = new BranchConn();
tail = tail.next;
if (ques.type == GREEDY) {
head = new Branch(head, null, tail);
} else { // Reluctant quantifier
head = new Branch(null, head, tail);
}
root = tail;
return head;
} else if (node instanceof Curly) {
Curly curly = (Curly) node;
if (curly.type == POSSESSIVE) {
root = node;
return node;
}
// Discover if the group is deterministic
TreeInfo info = new TreeInfo();
if (head.study(info)) { // Deterministic
GroupTail temp = (GroupTail) tail;
head = root = new GroupCurly(head.next, curly.cmin,
curly.cmax, curly.type,
((GroupTail)tail).localIndex,
((GroupTail)tail).groupIndex,
capturingGroup);
return head;
} else { // Non-deterministic
int temp = ((GroupHead) head).localIndex;
Loop loop;
if (curly.type == GREEDY)
loop = new Loop(this.localCount, temp);
else // Reluctant Curly
loop = new LazyLoop(this.localCount, temp);
Prolog prolog = new Prolog(loop);
this.localCount += 1;
loop.cmin = curly.cmin;
loop.cmax = curly.cmax;
loop.body = head;
tail.next = loop;
root = loop;
return prolog; // Dual return
}
}
throw error("Internal logic error");
|
private boolean | has(int f)Indicates whether a particular flag is set or not.
return (flags & f) != 0;
|
private static boolean | hasBaseCharacter(java.util.regex.Matcher matcher, int i, java.lang.CharSequence seq)Non spacing marks only count as word characters in bounds calculations
if they have a base character.
int start = (!matcher.transparentBounds) ?
matcher.from : 0;
for (int x=i; x >= start; x--) {
int ch = Character.codePointAt(seq, x);
if (Character.isLetterOrDigit(ch))
return true;
if (Character.getType(ch) == Character.NON_SPACING_MARK)
continue;
return false;
}
return false;
|
private static boolean | inRange(int lower, int ch, int upper)
return lower <= ch && ch <= upper;
|
private static java.util.regex.Pattern$CharProperty | intersection(java.util.regex.Pattern$CharProperty lhs, java.util.regex.Pattern$CharProperty rhs)Returns the set intersection of two CharProperty nodes.
return new CharProperty() {
boolean isSatisfiedBy(int ch) {
return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
|
private boolean | isLineSeparator(int ch)Determines if character is a line separator in the current mode
if (has(UNIX_LINES)) {
return ch == '\n";
} else {
return (ch == '\n" ||
ch == '\r" ||
(ch|1) == '\u2029" ||
ch == '\u0085");
}
|
private static final boolean | isSupplementary(int ch)Determines if the specified code point is a supplementary
character or unpaired surrogate.
return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || isSurrogate(ch);
|
private static final boolean | isSurrogate(int c)Tests a surrogate value.
return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE;
|
private void | mark(int c)Mark the end of pattern with a specific character.
temp[patternLength] = c;
|
public java.util.regex.Matcher | matcher(java.lang.CharSequence input)Creates a matcher that will match the given input against this pattern.
if (!compiled) {
synchronized(this) {
if (!compiled)
compile();
}
}
Matcher m = new Matcher(this, input);
return m;
|
public static boolean | matches(java.lang.String regex, java.lang.CharSequence input)Compiles the given regular expression and attempts to match the given
input against it.
An invocation of this convenience method of the form
Pattern.matches(regex, input);
behaves in exactly the same way as the expression
Pattern.compile(regex).matcher(input).matches()
If a pattern is to be used multiple times, compiling it once and reusing
it will be more efficient than invoking this method each time.
Pattern p = Pattern.compile(regex);
Matcher m = p.matcher(input);
return m.matches();
|
private java.util.regex.Pattern$CharProperty | newSingle(int ch)Returns a suitably optimized, single character matcher.
if (has(CASE_INSENSITIVE)) {
int lower, upper;
if (has(UNICODE_CASE)) {
upper = Character.toUpperCase(ch);
lower = Character.toLowerCase(upper);
if (upper != lower)
return new SingleU(lower);
} else if (ASCII.isAscii(ch)) {
lower = ASCII.toLower(ch);
upper = ASCII.toUpper(ch);
if (lower != upper)
return new SingleI(lower, upper);
}
}
if (isSupplementary(ch))
return new SingleS(ch); // Match a given Unicode character
return new Single(ch); // Match a given BMP character
|
private java.util.regex.Pattern$Node | newSlice(int[] buf, int count, boolean hasSupplementary)Utility method for creating a string slice matcher.
int[] tmp = new int[count];
if (has(CASE_INSENSITIVE)) {
if (has(UNICODE_CASE)) {
for (int i = 0; i < count; i++) {
tmp[i] = Character.toLowerCase(
Character.toUpperCase(buf[i]));
}
return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
}
for (int i = 0; i < count; i++) {
tmp[i] = ASCII.toLower(buf[i]);
}
return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
}
for (int i = 0; i < count; i++) {
tmp[i] = buf[i];
}
return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
|
private int | next()Advance the cursor by one, and peek the next character.
int ch = temp[++cursor];
if (has(COMMENTS))
ch = peekPastWhitespace(ch);
return ch;
|
private int | nextEscaped()Advance the cursor by one, and peek the next character,
ignoring the COMMENTS setting
int ch = temp[++cursor];
return ch;
|
private void | normalize()The pattern is converted to normalizedD form and then a pure group
is constructed to match canonical equivalences of the characters.
boolean inCharClass = false;
int lastCodePoint = -1;
// Convert pattern into normalizedD form
normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
patternLength = normalizedPattern.length();
// Modify pattern to match canonical equivalences
StringBuilder newPattern = new StringBuilder(patternLength);
for(int i=0; i<patternLength; ) {
int c = normalizedPattern.codePointAt(i);
StringBuilder sequenceBuffer;
if ((Character.getType(c) == Character.NON_SPACING_MARK)
&& (lastCodePoint != -1)) {
sequenceBuffer = new StringBuilder();
sequenceBuffer.appendCodePoint(lastCodePoint);
sequenceBuffer.appendCodePoint(c);
while(Character.getType(c) == Character.NON_SPACING_MARK) {
i += Character.charCount(c);
if (i >= patternLength)
break;
c = normalizedPattern.codePointAt(i);
sequenceBuffer.appendCodePoint(c);
}
String ea = produceEquivalentAlternation(
sequenceBuffer.toString());
newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
newPattern.append("(?:").append(ea).append(")");
} else if (c == '[" && lastCodePoint != '\\") {
i = normalizeCharClass(newPattern, i);
} else {
newPattern.appendCodePoint(c);
}
lastCodePoint = c;
i += Character.charCount(c);
}
normalizedPattern = newPattern.toString();
|
private int | normalizeCharClass(java.lang.StringBuilder newPattern, int i)Complete the character class being parsed and add a set
of alternations to it that will match the canonical equivalences
of the characters within the class.
StringBuilder charClass = new StringBuilder();
StringBuilder eq = null;
int lastCodePoint = -1;
String result;
i++;
charClass.append("[");
while(true) {
int c = normalizedPattern.codePointAt(i);
StringBuilder sequenceBuffer;
if (c == ']" && lastCodePoint != '\\") {
charClass.append((char)c);
break;
} else if (Character.getType(c) == Character.NON_SPACING_MARK) {
sequenceBuffer = new StringBuilder();
sequenceBuffer.appendCodePoint(lastCodePoint);
while(Character.getType(c) == Character.NON_SPACING_MARK) {
sequenceBuffer.appendCodePoint(c);
i += Character.charCount(c);
if (i >= normalizedPattern.length())
break;
c = normalizedPattern.codePointAt(i);
}
String ea = produceEquivalentAlternation(
sequenceBuffer.toString());
charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
if (eq == null)
eq = new StringBuilder();
eq.append('|");
eq.append(ea);
} else {
charClass.appendCodePoint(c);
i++;
}
if (i == normalizedPattern.length())
throw error("Unclosed character class");
lastCodePoint = c;
}
if (eq != null) {
result = "(?:"+charClass.toString()+eq.toString()+")";
} else {
result = charClass.toString();
}
newPattern.append(result);
return i;
|
private int | o()Utility method for parsing octal escape sequences.
int n = read();
if (((n-'0")|('7"-n)) >= 0) {
int m = read();
if (((m-'0")|('7"-m)) >= 0) {
int o = read();
if ((((o-'0")|('7"-o)) >= 0) && (((n-'0")|('3"-n)) >= 0)) {
return (n - '0") * 64 + (m - '0") * 8 + (o - '0");
}
unread();
return (n - '0") * 8 + (m - '0");
}
unread();
return (n - '0");
}
throw error("Illegal octal escape sequence");
|
private int | parsePastLine()xmode parse past comment to end of line.
int ch = temp[cursor++];
while (ch != 0 && !isLineSeparator(ch))
ch = temp[cursor++];
return ch;
|
private int | parsePastWhitespace(int ch)If in xmode parse past whitespace and comments.
while (ASCII.isSpace(ch) || ch == '#") {
while (ASCII.isSpace(ch))
ch = temp[cursor++];
if (ch == '#")
ch = parsePastLine();
}
return ch;
|
public java.lang.String | pattern()Returns the regular expression from which this pattern was compiled.
return pattern;
|
private int | peek()Peek the next character, and do not advance the cursor.
int ch = temp[cursor];
if (has(COMMENTS))
ch = peekPastWhitespace(ch);
return ch;
|
private int | peekPastLine()xmode peek past comment to end of line.
int ch = temp[++cursor];
while (ch != 0 && !isLineSeparator(ch))
ch = temp[++cursor];
return ch;
|
private int | peekPastWhitespace(int ch)If in xmode peek past whitespace and comments.
while (ASCII.isSpace(ch) || ch == '#") {
while (ASCII.isSpace(ch))
ch = temp[++cursor];
if (ch == '#") {
ch = peekPastLine();
}
}
return ch;
|
private static void | printObjectTree(java.util.regex.Pattern$Node node)Used to print out a subtree of the Pattern to help with debugging.
while(node != null) {
if (node instanceof Prolog) {
System.out.println(node);
printObjectTree(((Prolog)node).loop);
System.out.println("**** end contents prolog loop");
} else if (node instanceof Loop) {
System.out.println(node);
printObjectTree(((Loop)node).body);
System.out.println("**** end contents Loop body");
} else if (node instanceof Curly) {
System.out.println(node);
printObjectTree(((Curly)node).atom);
System.out.println("**** end contents Curly body");
} else if (node instanceof GroupCurly) {
System.out.println(node);
printObjectTree(((GroupCurly)node).atom);
System.out.println("**** end contents GroupCurly body");
} else if (node instanceof GroupTail) {
System.out.println(node);
System.out.println("Tail next is "+node.next);
return;
} else {
System.out.println(node);
}
node = node.next;
if (node != null)
System.out.println("->next:");
if (node == Pattern.accept) {
System.out.println("Accept Node");
node = null;
}
}
|
private java.lang.String | produceEquivalentAlternation(java.lang.String source)Given a specific sequence composed of a regular character and
combining marks that follow it, produce the alternation that will
match all canonical equivalences of that sequence.
int len = countChars(source, 0, 1);
if (source.length() == len)
// source has one character.
return source;
String base = source.substring(0,len);
String combiningMarks = source.substring(len);
String[] perms = producePermutations(combiningMarks);
StringBuilder result = new StringBuilder(source);
// Add combined permutations
for(int x=0; x<perms.length; x++) {
String next = base + perms[x];
if (x>0)
result.append("|"+next);
next = composeOneStep(next);
if (next != null)
result.append("|"+produceEquivalentAlternation(next));
}
return result.toString();
|
private java.lang.String[] | producePermutations(java.lang.String input)Returns an array of strings that have all the possible
permutations of the characters in the input string.
This is used to get a list of all possible orderings
of a set of combining marks. Note that some of the permutations
are invalid because of combining class collisions, and these
possibilities must be removed because they are not canonically
equivalent.
if (input.length() == countChars(input, 0, 1))
return new String[] {input};
if (input.length() == countChars(input, 0, 2)) {
int c0 = Character.codePointAt(input, 0);
int c1 = Character.codePointAt(input, Character.charCount(c0));
if (getClass(c1) == getClass(c0)) {
return new String[] {input};
}
String[] result = new String[2];
result[0] = input;
StringBuilder sb = new StringBuilder(2);
sb.appendCodePoint(c1);
sb.appendCodePoint(c0);
result[1] = sb.toString();
return result;
}
int length = 1;
int nCodePoints = countCodePoints(input);
for(int x=1; x<nCodePoints; x++)
length = length * (x+1);
String[] temp = new String[length];
int combClass[] = new int[nCodePoints];
for(int x=0, i=0; x<nCodePoints; x++) {
int c = Character.codePointAt(input, i);
combClass[x] = getClass(c);
i += Character.charCount(c);
}
// For each char, take it out and add the permutations
// of the remaining chars
int index = 0;
int len;
// offset maintains the index in code units.
loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
len = countChars(input, offset, 1);
boolean skip = false;
for(int y=x-1; y>=0; y--) {
if (combClass[y] == combClass[x]) {
continue loop;
}
}
StringBuilder sb = new StringBuilder(input);
String otherChars = sb.delete(offset, offset+len).toString();
String[] subResult = producePermutations(otherChars);
String prefix = input.substring(offset, offset+len);
for(int y=0; y<subResult.length; y++)
temp[index++] = prefix + subResult[y];
}
String[] result = new String[index];
for (int x=0; x<index; x++)
result[x] = temp[x];
return result;
|
public static java.lang.String | quote(java.lang.String s)Returns a literal pattern String for the specified
String .
This method produces a String that can be used to
create a Pattern that would match the string
s as if it were a literal pattern. Metacharacters
or escape sequences in the input sequence will be given no special
meaning.
int slashEIndex = s.indexOf("\\E");
if (slashEIndex == -1)
return "\\Q" + s + "\\E";
StringBuilder sb = new StringBuilder(s.length() * 2);
sb.append("\\Q");
slashEIndex = 0;
int current = 0;
while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
sb.append(s.substring(current, slashEIndex));
current = slashEIndex + 2;
sb.append("\\E\\\\E\\Q");
}
sb.append(s.substring(current, s.length()));
sb.append("\\E");
return sb.toString();
|
private java.util.regex.Pattern$CharProperty | range(java.util.regex.Pattern$BitClass bits)Parse a single character or a character range in a character class
and return its representative node.
int ch = peek();
if (ch == '\\") {
ch = nextEscaped();
if (ch == 'p" || ch == 'P") { // A property
boolean comp = (ch == 'P");
boolean oneLetter = true;
// Consume { if present
ch = next();
if (ch != '{")
unread();
else
oneLetter = false;
return family(oneLetter).maybeComplement(comp);
} else { // ordinary escape
unread();
ch = escape(true, true);
if (ch == -1)
return (CharProperty) root;
}
} else {
ch = single();
}
if (ch >= 0) {
if (peek() == '-") {
int endRange = temp[cursor+1];
if (endRange == '[") {
return bitsOrSingle(bits, ch);
}
if (endRange != ']") {
next();
int m = single();
if (m < ch)
throw error("Illegal character range");
if (has(CASE_INSENSITIVE))
return caseInsensitiveRangeFor(ch, m);
else
return rangeFor(ch, m);
}
}
return bitsOrSingle(bits, ch);
}
throw error("Unexpected character '"+((char)ch)+"'");
|
private static java.util.regex.Pattern$CharProperty | rangeFor(int lower, int upper)Returns node for matching characters within an explicit value range.
return new CharProperty() {
boolean isSatisfiedBy(int ch) {
return inRange(lower, ch, upper);}};
|
private int | read()Read the next character, and advance the cursor by one.
int ch = temp[cursor++];
if (has(COMMENTS))
ch = parsePastWhitespace(ch);
return ch;
|
private int | readEscaped()Read the next character, and advance the cursor by one,
ignoring the COMMENTS setting
int ch = temp[cursor++];
return ch;
|
private void | readObject(java.io.ObjectInputStream s)Recompile the Pattern instance from a stream. The original pattern
string is read in and the object tree is recompiled from it.
// Read in all fields
s.defaultReadObject();
// Initialize counts
capturingGroupCount = 1;
localCount = 0;
// if length > 0, the Pattern is lazily compiled
compiled = false;
if (pattern.length() == 0) {
root = new Start(lastAccept);
matchRoot = lastAccept;
compiled = true;
}
|
private java.util.regex.Pattern$Node | ref(int refNum)Parses a backref greedily, taking as many numbers as it
can. The first digit is always treated as a backref, but
multi digit numbers are only treated as a backref if at
least that many backrefs exist at this point in the regex.
boolean done = false;
while(!done) {
int ch = peek();
switch(ch) {
case '0":
case '1":
case '2":
case '3":
case '4":
case '5":
case '6":
case '7":
case '8":
case '9":
int newRefNum = (refNum * 10) + (ch - '0");
// Add another number if it doesn't make a group
// that doesn't exist
if (capturingGroupCount - 1 < newRefNum) {
done = true;
break;
}
refNum = newRefNum;
read();
break;
default:
done = true;
break;
}
}
if (has(CASE_INSENSITIVE))
return new CIBackRef(refNum, has(UNICODE_CASE));
else
return new BackRef(refNum);
|
private java.util.regex.Pattern$Node | sequence(java.util.regex.Pattern$Node end)Parsing of sequences between alternations.
Node head = null;
Node tail = null;
Node node = null;
LOOP:
for (;;) {
int ch = peek();
switch (ch) {
case '(":
// Because group handles its own closure,
// we need to treat it differently
node = group0();
// Check for comment or flag group
if (node == null)
continue;
if (head == null)
head = node;
else
tail.next = node;
// Double return: Tail was returned in root
tail = root;
continue;
case '[":
node = clazz(true);
break;
case '\\":
ch = nextEscaped();
if (ch == 'p" || ch == 'P") {
boolean oneLetter = true;
boolean comp = (ch == 'P");
ch = next(); // Consume { if present
if (ch != '{") {
unread();
} else {
oneLetter = false;
}
node = family(oneLetter).maybeComplement(comp);
} else {
unread();
node = atom();
}
break;
case '^":
next();
if (has(MULTILINE)) {
if (has(UNIX_LINES))
node = new UnixCaret();
else
node = new Caret();
} else {
node = new Begin();
}
break;
case '$":
next();
if (has(UNIX_LINES))
node = new UnixDollar(has(MULTILINE));
else
node = new Dollar(has(MULTILINE));
break;
case '.":
next();
if (has(DOTALL)) {
node = new All();
} else {
if (has(UNIX_LINES))
node = new UnixDot();
else {
node = new Dot();
}
}
break;
case '|":
case ')":
break LOOP;
case ']": // Now interpreting dangling ] and } as literals
case '}":
node = atom();
break;
case '?":
case '*":
case '+":
next();
throw error("Dangling meta character '" + ((char)ch) + "'");
case 0:
if (cursor >= patternLength) {
break LOOP;
}
// Fall through
default:
node = atom();
break;
}
node = closure(node);
if (head == null) {
head = tail = node;
} else {
tail.next = node;
tail = node;
}
}
if (head == null) {
return end;
}
tail.next = end;
root = tail; //double return
return head;
|
private static java.util.regex.Pattern$CharProperty | setDifference(java.util.regex.Pattern$CharProperty lhs, java.util.regex.Pattern$CharProperty rhs)Returns the set difference of two CharProperty nodes.
return new CharProperty() {
boolean isSatisfiedBy(int ch) {
return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
|
private int | single()
int ch = peek();
switch (ch) {
case '\\":
return escape(true, false);
default:
next();
return ch;
}
|
private int | skip()Read the character after the next one, and advance the cursor by two.
int i = cursor;
int ch = temp[i+1];
cursor = i + 2;
return ch;
|
public java.lang.String[] | split(java.lang.CharSequence input, int limit)Splits the given input sequence around matches of this pattern.
The array returned by this method contains each substring of the
input sequence that is terminated by another subsequence that matches
this pattern or is terminated by the end of the input sequence. The
substrings in the array are in the order in which they occur in the
input. If this pattern does not match any subsequence of the input then
the resulting array has just one element, namely the input sequence in
string form.
The limit parameter controls the number of times the
pattern is applied and therefore affects the length of the resulting
array. If the limit n is greater than zero then the pattern
will be applied at most n - 1 times, the array's
length will be no greater than n, and the array's last entry
will contain all input beyond the last matched delimiter. If n
is non-positive then the pattern will be applied as many times as
possible and the array can have any length. If n is zero then
the pattern will be applied as many times as possible, the array can
have any length, and trailing empty strings will be discarded.
The input "boo:and:foo", for example, yields the following
results with these parameters:
Regex |
Limit |
Result |
: |
2 |
{ "boo", "and:foo" } |
: |
5 |
{ "boo", "and", "foo" } |
: |
-2 |
{ "boo", "and", "foo" } |
o |
5 |
{ "b", "", ":and:f", "", "" } |
o |
-2 |
{ "b", "", ":and:f", "", "" } |
o |
0 |
{ "b", "", ":and:f" } |
int index = 0;
boolean matchLimited = limit > 0;
ArrayList<String> matchList = new ArrayList<String>();
Matcher m = matcher(input);
// Add segments before each match found
while(m.find()) {
if (!matchLimited || matchList.size() < limit - 1) {
String match = input.subSequence(index, m.start()).toString();
matchList.add(match);
index = m.end();
} else if (matchList.size() == limit - 1) { // last one
String match = input.subSequence(index,
input.length()).toString();
matchList.add(match);
index = m.end();
}
}
// If no match was found, return this
if (index == 0)
return new String[] {input.toString()};
// Add remaining segment
if (!matchLimited || matchList.size() < limit)
matchList.add(input.subSequence(index, input.length()).toString());
// Construct result
int resultSize = matchList.size();
if (limit == 0)
while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
resultSize--;
String[] result = new String[resultSize];
return matchList.subList(0, resultSize).toArray(result);
|
public java.lang.String[] | split(java.lang.CharSequence input)Splits the given input sequence around matches of this pattern.
This method works as if by invoking the two-argument {@link
#split(java.lang.CharSequence, int) split} method with the given input
sequence and a limit argument of zero. Trailing empty strings are
therefore not included in the resulting array.
The input "boo:and:foo", for example, yields the following
results with these expressions:
Regex |
Result |
: |
{ "boo", "and", "foo" } |
o |
{ "b", "", ":and:f" } |
return split(input, 0);
|
private void | subFlag()Parses the second part of inlined match flags and turns off
flags appropriately.
int ch = peek();
for (;;) {
switch (ch) {
case 'i":
flags &= ~CASE_INSENSITIVE;
break;
case 'm":
flags &= ~MULTILINE;
break;
case 's":
flags &= ~DOTALL;
break;
case 'd":
flags &= ~UNIX_LINES;
break;
case 'u":
flags &= ~UNICODE_CASE;
break;
case 'c":
flags &= ~CANON_EQ;
break;
case 'x":
flags &= ~COMMENTS;
break;
default:
return;
}
ch = next();
}
|
public java.lang.String | toString()Returns the string representation of this pattern. This
is the regular expression from which this pattern was
compiled.
return pattern;
|
private int | u()Utility method for parsing unicode escape sequences.
int n = 0;
for (int i = 0; i < 4; i++) {
int ch = read();
if (!ASCII.isHexDigit(ch)) {
throw error("Illegal Unicode escape sequence");
}
n = n * 16 + ASCII.toDigit(ch);
}
return n;
|
private java.util.regex.Pattern$CharProperty | unicodeBlockPropertyFor(java.lang.String name)Returns a CharProperty matching all characters in a UnicodeBlock.
final Character.UnicodeBlock block;
try {
block = Character.UnicodeBlock.forName(name);
} catch (IllegalArgumentException iae) {
throw error("Unknown character block name {" + name + "}");
}
return new CharProperty() {
boolean isSatisfiedBy(int ch) {
return block == Character.UnicodeBlock.of(ch);}};
|
private static java.util.regex.Pattern$CharProperty | union(java.util.regex.Pattern$CharProperty lhs, java.util.regex.Pattern$CharProperty rhs)Returns the set union of two CharProperty nodes.
return new CharProperty() {
boolean isSatisfiedBy(int ch) {
return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
|
private void | unread()Unread one next character, and retreat cursor by one.
cursor--;
|
private int | x()Utility method for parsing hexadecimal escape sequences.
int n = read();
if (ASCII.isHexDigit(n)) {
int m = read();
if (ASCII.isHexDigit(m)) {
return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
}
}
throw error("Illegal hexadecimal escape sequence");
|