File | Doc | Category | Size | Date | Package
Matcher.java | API Doc | Android 1.5 API | 24198 | Wed May 06 22:41:04 BST 2009 | java.util.regex

Matcher.java

/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package java.util.regex;

import com.ibm.icu4jni.regex.NativeRegEx;

/**
 * Provides a means of matching regular expressions against a given input,
 * finding occurrences of regular expressions in a given input, or replacing
 * parts of a given input. A {@code Matcher} instance has an associated {@link
 * Pattern} instance and an input text. A typical use case is to
 * iteratively find all occurrences of the {@code Pattern}, until the end of
 * the input is reached, as the following example illustrates:
 * 
 * <p/>
 * 
 * <pre>
 * Pattern p = Pattern.compile("[A-Za-z]+");
 *  
 * Matcher m = p.matcher("Hello, Android!");
 * while (m.find()) {
 *     System.out.println(m.group()); // prints "Hello" and "Android"
 * }
 * </pre>
 * 
 * <p/>
 * 
 * The {@code Matcher} has a state that results from the previous operations.
 * For example, it knows whether the most recent attempt to find the
 * {@code Pattern} was successful and at which position the next attempt would
 * resume the search. Depending on the application's needs, it may become
 * necessary to explicitly {@link #reset()} this state from time to time.
 * 
 * @since Android 1.0
 */
public final class Matcher implements MatchResult {

    /**
     * Holds the pattern, that is, the compiled regular expression. Assigned
     * by {@link #usePattern(Pattern)}.
     */
    private Pattern pattern;

    /**
     * Holds the handle for the native version of the pattern. The value 0 is
     * treated as "no native handle": both {@link #usePattern(Pattern)} and
     * {@link #finalize()} only close the handle when it is non-zero.
     */
    private int nativePattern;
    
    /**
     * Holds the input text. Starts out as the empty string and is replaced by
     * a {@code String} snapshot of the sequence handed to reset.
     */
    private String input = "";

    /**
     * Holds the start of the region, or 0 if the matching should start at the
     * beginning of the text.
     */
    private int regionStart;
    
    /**
     * Holds the end of the region, or input.length() if the matching should
     * go until the end of the input.
     */
    private int regionEnd;

    /**
     * Reflects whether we just reset the matcher or whether we already
     * started some find/replace operations. {@link #find()} uses this flag to
     * choose between starting a fresh native search and continuing one.
     */
    private boolean searching;
    
    /**
     * Holds the position where the next find operation will take place. After
     * a successful match it is set to the end offset of that match.
     */
    private int findPos;
    
    /**
     * Holds the position where the next append operation will take place,
     * that is, the end of the match most recently handled by
     * {@link #appendReplacement(StringBuffer, String)}, or 0 after a reset.
     */
    private int appendPos;
    
    /**
     * Reflects whether a match has been found during the most recent find
     * operation. Accessors of match data check this flag through
     * {@link #ensureMatch()}.
     */
    private boolean matchFound;

    /**
     * Holds the offsets for the most recent match. The array has
     * {@code (groupCount + 1) * 2} entries: index {@code 2 * g} is the start
     * of group {@code g} and index {@code 2 * g + 1} is its end, with group 0
     * being the whole match. An offset of -1 marks a group without a match.
     */
    private int[] matchOffsets;

    /**
     * Reflects whether the bounds of the region are anchoring. Enabled by
     * default.
     */
    private boolean anchoringBounds = true;
    
    /**
     * Reflects whether the bounds of the region are transparent. Disabled by
     * default.
     */
    private boolean transparentBounds;

    /**
     * Builds a matcher for the given pattern and input. The pattern is
     * installed first (which allocates the native matcher), then the input is
     * applied through {@link #reset(CharSequence)}. Both can be replaced
     * later on.
     * 
     * @param pattern
     *            the pattern to use; must not be {@code null}.
     * @param input
     *            the input to use; must not be {@code null}.
     */
    Matcher(Pattern pattern, CharSequence input) {
        this.usePattern(pattern);
        this.reset(input);
    }

    /**
     * Internal work horse behind all public reset variants and
     * {@link #region(int, int)}. Installs the input (only when it differs
     * from the current one) and the region (only when it differs from the
     * current one) on the native matcher, then clears the search state so the
     * next find starts at the beginning of the region. Results of a previous
     * find get lost and the append position returns to 0.
     * 
     * @param input
     *            the input sequence.
     * @param start
     *            the start of the region.
     * @param end
     *            the end of the region.
     * 
     * @return the matcher itself.
     * @throws IllegalArgumentException
     *             if {@code input} is {@code null}, or if the region is not
     *             a valid range within the input.
     */
    private Matcher reset(CharSequence input, int start, int end) {
        if (input == null) {
            throw new IllegalArgumentException();
        }

        // NOTE(review): the JDK contract for region(int, int) specifies
        // IndexOutOfBoundsException for bad bounds; this implementation
        // throws IllegalArgumentException. Kept as is for compatibility.
        final int length = input.length();
        final boolean validRegion = start >= 0 && end >= 0
                && start <= length && end <= length && start <= end;
        if (!validRegion) {
            throw new IllegalArgumentException();
        }

        // Maybe should have a reset() here, but it makes thing worse...
        // NativeRegEx.reset(nativePattern, 0);

        final boolean sameText = input.equals(this.input);
        if (!sameText) {
            // New text: snapshot it, push it to the native side and fall back
            // to the default region covering the whole text.
            this.input = input.toString();
            NativeRegEx.setText(nativePattern, this.input);
            regionStart = 0;
            regionEnd = length;
        }

        final boolean regionChanged = regionStart != start || regionEnd != end;
        if (regionChanged) {
            regionStart = start;
            regionEnd = end;
            NativeRegEx.setRegion(nativePattern, regionStart, regionEnd);
        }

        // Forget everything about earlier searches and appends.
        searching = false;
        matchFound = false;
        findPos = regionStart;
        appendPos = 0;

        return this;
    }

    /**
     * Resets the {@code Matcher} while keeping the current input. The region
     * becomes the whole input again, results of a previous find get lost, and
     * the next attempt to find an occurrence of the {@link Pattern} starts at
     * the beginning of the input.
     * 
     * @return the {@code Matcher} itself.
     * 
     * @since Android 1.0
     */
    public Matcher reset() {
        final String current = this.input;
        return reset(current, 0, current.length());
    }

    /**
     * Installs a new input and resets the {@code Matcher}. The region becomes
     * the whole new input, results of a previous find get lost, and the next
     * attempt to find an occurrence of the {@link Pattern} starts at the
     * beginning of the input.
     * 
     * @param input
     *            the new input sequence. A {@code null} value fails with a
     *            {@code NullPointerException} because its length is read
     *            before any other check.
     * 
     * @return the {@code Matcher} itself.
     * 
     * @since Android 1.0
     */
    public Matcher reset(CharSequence input) {
        final int wholeLength = input.length();
        return reset(input, 0, wholeLength);
    }

    /**
     * Switches the {@code Matcher} to a new pattern. Any native matcher held
     * so far is closed and replaced by a clone of the new pattern's native
     * handle, onto which the current input, region and bounds settings are
     * re-applied. Results of a previous find get lost.
     * 
     * @param pattern
     *            the new {@code Pattern}.
     * 
     * @return the {@code Matcher} itself.
     * @throws IllegalArgumentException
     *             if {@code pattern} is {@code null}.
     * 
     * @since Android 1.0
     */
    public Matcher usePattern(Pattern pattern) {
        if (pattern == null) {
            throw new IllegalArgumentException();
        }

        this.pattern = pattern;

        // Release the handle of the previous pattern, if there was one.
        final int previousHandle = nativePattern;
        if (previousHandle != 0) {
            NativeRegEx.close(previousHandle);
        }
        nativePattern = NativeRegEx.clone(pattern.mNativePattern);

        // Carry the existing matcher configuration over to the new handle.
        if (input != null) {
            NativeRegEx.setText(nativePattern, input);
            NativeRegEx.setRegion(nativePattern, regionStart, regionEnd);
            NativeRegEx.useAnchoringBounds(nativePattern, anchoringBounds);
            NativeRegEx.useTransparentBounds(nativePattern, transparentBounds);
        }

        // One start/end pair per group, plus one pair for the whole match.
        final int pairCount = this.pattern.mGroupCount + 1;
        matchOffsets = new int[pairCount * 2];
        matchFound = false;
        return this;
    }

    /**
     * Returns the {@link Pattern} this matcher currently works with.
     * 
     * @return the {@code Pattern} instance.
     * 
     * @since Android 1.0
     */
    public Pattern pattern() {
        return this.pattern;
    }

    /**
     * Returns the number of capturing groups, as recorded by the current
     * {@link Pattern}. Group 0 (the whole match) is not included in this
     * count.
     * 
     * @return the number of groups.
     * 
     * @since Android 1.0
     */
    public int groupCount() {
        final Pattern current = this.pattern;
        return current.mGroupCount;
    }
    
    /**
     * Resets this matcher and restricts matching to a region of the current
     * input. Only characters inside the region are considered for a match.
     * 
     * @param start
     *            the first character of the region.
     * @param end
     *            the first character after the end of the region.
     * @return the {@code Matcher} itself.
     * @since Android 1.0
     */
    public Matcher region(int start, int end) {
        final String current = this.input;
        return reset(current, start, end);
    }

    /**
     * Returns the start of this matcher's region, that is, the index of the
     * first character that is considered for a match.
     * 
     * @return the start of the region.
     * @since Android 1.0
     */
    public int regionStart() {
        return this.regionStart;
    }

    /**
     * Returns the end of this matcher's region, that is, the index of the
     * first character that is no longer considered for a match.
     * 
     * @return the end of the region.
     * @since Android 1.0
     */
    public int regionEnd() {
        return this.regionEnd;
    }

    /**
     * Enables or disables anchoring bounds for this matcher. When anchoring
     * bounds are enabled, the start and end of the region match the '^' and
     * '$' meta-characters, otherwise not. Anchoring bounds are enabled by
     * default. The setting is remembered locally and forwarded to the native
     * matcher.
     * 
     * @param value
     *            the new value for anchoring bounds.
     * @return the {@code Matcher} itself.
     * @since Android 1.0
     */
    public Matcher useAnchoringBounds(boolean value) {
        this.anchoringBounds = value;
        NativeRegEx.useAnchoringBounds(this.nativePattern, value);
        return this;
    }
    
    /**
     * Tells whether anchoring bounds are enabled for this matcher. When they
     * are, the start and end of the region match the '^' and '$'
     * meta-characters, otherwise not. Anchoring bounds are enabled by
     * default.
     * 
     * @return true if (and only if) the {@code Matcher} uses anchoring bounds.
     * @since Android 1.0
     */
    public boolean hasAnchoringBounds() {
        return this.anchoringBounds;
    }

    /**
     * Enables or disables transparent bounds for this matcher. When
     * transparent bounds are enabled, the parts of the input outside the
     * region are subject to lookahead and lookbehind, otherwise they are not.
     * Transparent bounds are disabled by default. The setting is remembered
     * locally and forwarded to the native matcher.
     * 
     * @param value
     *            the new value for transparent bounds.
     * @return the {@code Matcher} itself.
     * @since Android 1.0
     */
    public Matcher useTransparentBounds(boolean value) {
        this.transparentBounds = value;
        NativeRegEx.useTransparentBounds(this.nativePattern, value);
        return this;
    }
    
    /**
     * Tells whether transparent bounds are enabled for this matcher. When
     * they are, the parts of the input outside the region are subject to
     * lookahead and lookbehind, otherwise they are not. Transparent bounds
     * are disabled by default.
     * 
     * @return true if (and only if) the {@code Matcher} uses transparent
     *         bounds.
     * @since Android 1.0
     */
    public boolean hasTransparentBounds() {
        return this.transparentBounds;
    }
    
    /**
     * Guards accessors of match data: returns silently when the most recent
     * match operation succeeded and fails otherwise. Invoked internally from
     * various places in the class.
     * 
     * @throws IllegalStateException
     *             if no successful match has been made.
     * 
     * @since Android 1.0
     */
    private void ensureMatch() throws IllegalStateException {
        if (matchFound) {
            return;
        }
        throw new IllegalStateException("No successful match so far");
    }
    
    /**
     * Looks for the next occurrence of the {@link Pattern} in the input. The
     * first call after a reset starts a new native search; every later call
     * asks the native matcher for the next match. On success the match
     * offsets are fetched and the find position moves to the end of the
     * match.
     * 
     * @return true if (and only if) a match has been found.
     * @since Android 1.0
     */
    public boolean find() {
        if (searching) {
            matchFound = NativeRegEx.findNext(nativePattern);
        } else {
            // First search since the last reset.
            searching = true;
            matchFound = NativeRegEx.find(nativePattern, -1);
        }

        if (!matchFound) {
            return false;
        }

        NativeRegEx.startEnd(nativePattern, matchOffsets);
        findPos = matchOffsets[1];
        return true;
    }

    /**
     * Returns the next occurrence of the {@link Pattern} in the input,
     * searching from the given character index. After a successful call,
     * {@link #find()} continues behind the match that was found here rather
     * than starting over.
     * 
     * @param start
     *            The index in the input at which the find operation is to
     *            begin. If this is less than the start of the region, it is
     *            automatically adjusted to that value. If it is at or beyond
     *            the end of the region, the method fails without searching.
     * @return true if (and only if) a match has been found.
     * @since Android 1.0
     */
    public boolean find(int start) {
        findPos = start;
        
        if (findPos < regionStart) {
            findPos = regionStart;
        } else if (findPos >= regionEnd) {
            matchFound = false;
            return false;
        }
        
        matchFound = NativeRegEx.find(nativePattern, findPos);
        if (matchFound) {
            // A search is now in progress: without this, a following find()
            // on a freshly reset matcher would take the "first search" path
            // and begin again instead of continuing after this match.
            searching = true;
            NativeRegEx.startEnd(nativePattern, matchOffsets);
            findPos = matchOffsets[1];
        }
        
        return matchFound;
    }

    /**
     * Tries to match the {@link Pattern} against the entire region (or the
     * entire input, if no region has been set). On success the match offsets
     * are fetched and the find position moves to the end of the match.
     * 
     * @return true if (and only if) the {@code Pattern} matches the entire
     *         region.
     * 
     * @since Android 1.0
     */
    public boolean matches() {
        final boolean matched = NativeRegEx.matches(nativePattern, -1);
        matchFound = matched;
        if (!matched) {
            return false;
        }

        NativeRegEx.startEnd(nativePattern, matchOffsets);
        findPos = matchOffsets[1];
        return true;
    }

    /**
     * Tries to match the {@link Pattern} starting at the beginning of the
     * region (or the beginning of the input, if no region has been set). The
     * {@code Pattern} does not have to match the whole region. On success the
     * match offsets are fetched and the find position moves to the end of the
     * match.
     * 
     * @return true if (and only if) the {@code Pattern} matches.
     * 
     * @since Android 1.0
     */
    public boolean lookingAt() {
        final boolean matched = NativeRegEx.lookingAt(nativePattern, -1);
        matchFound = matched;
        if (!matched) {
            return false;
        }

        NativeRegEx.startEnd(nativePattern, matchOffsets);
        findPos = matchOffsets[1];
        return true;
    }

    /**
     * Returns the index of the first character of the text that matched the
     * whole regular expression, i.e. the start of group 0.
     * 
     * @return the character index.
     * @throws IllegalStateException
     *             if no successful match has been made.
     * @since Android 1.0
     */
    public int start() throws IllegalStateException {
        final int wholeMatch = 0;
        return start(wholeMatch);
    }

    /**
     * Returns the index of the first character of the text that matched a
     * given group.
     * 
     * @param group
     *            the group, ranging from 0 to groupCount(), with 0
     *            representing the whole pattern.
     * @return the character index, or -1 if the group did not take part in
     *         the match.
     * @throws IllegalStateException
     *             if no successful match has been made.
     * @since Android 1.0
     */
    public int start(int group) throws IllegalStateException {
        ensureMatch();
        // Start offsets live at the even slots of the offsets array.
        final int slot = group * 2;
        return matchOffsets[slot];
    }

    /**
     * Returns the index of the first character following the text that
     * matched the whole regular expression, i.e. the end of group 0.
     * 
     * @return the character index.
     * @throws IllegalStateException
     *             if no successful match has been made.
     * @since Android 1.0
     */
    public int end() {
        final int wholeMatch = 0;
        return end(wholeMatch);
    }

    /**
     * Returns the index of the first character following the text that
     * matched a given group.
     * 
     * @param group
     *            the group, ranging from 0 to groupCount(), with 0
     *            representing the whole pattern.
     * @return the character index, or -1 if the group did not take part in
     *         the match.
     * @throws IllegalStateException
     *             if no successful match has been made.
     * @since Android 1.0
     */
    public int end(int group) {
        ensureMatch();
        // End offsets live at the odd slots of the offsets array.
        final int slot = (group * 2) + 1;
        return matchOffsets[slot];
    }

    /**
     * Returns the text that matched the whole regular expression, i.e. the
     * contents of group 0.
     * 
     * @return the text.
     * @throws IllegalStateException
     *             if no successful match has been made.
     * @since Android 1.0
     */
    public String group() {
        final int wholeMatch = 0;
        return group(wholeMatch);
    }

    /**
     * Returns the text that matched a given group of the regular expression.
     * 
     * @param group
     *            the group, ranging from 0 to groupCount(), with 0
     *            representing the whole pattern.
     * @return the text that matched the group, or {@code null} if the group
     *         did not take part in the match.
     * @throws IllegalStateException
     *             if no successful match has been made.
     * @since Android 1.0
     */
    public String group(int group) {
        ensureMatch();

        final int slot = group * 2;
        final int from = matchOffsets[slot];
        final int to = matchOffsets[slot + 1];

        // An offset of -1 marks a group that did not match anything.
        final boolean unmatched = from == -1 || to == -1;
        return unmatched ? null : input.substring(from, to);
    }

    /**
     * Indicates whether the last match hit the end of the input. The answer
     * comes straight from the native matcher.
     * 
     * @return true if (and only if) the last match hit the end of the input.
     * @since Android 1.0
     */
    public boolean hitEnd() {
        final boolean reachedEnd = NativeRegEx.hitEnd(this.nativePattern);
        return reachedEnd;
    }
    
    /**
     * Indicates whether more input might change a successful match into an
     * unsuccessful one. The answer comes straight from the native matcher.
     * 
     * @return true if (and only if) more input might change a successful match
     *         into an unsuccessful one.
     * @since Android 1.0
     */
    public boolean requireEnd() {
        final boolean endRequired = NativeRegEx.requireEnd(this.nativePattern);
        return endRequired;
    }
    
    /**
     * Captures the current match as a separate {@link MatchResult} instance
     * that is meant to stay valid when the state of this matcher changes.
     * 
     * @return the new {@code MatchResult}.
     * @throws IllegalStateException
     *             if no successful match has been made.
     * @since Android 1.0
     */
    public MatchResult toMatchResult() {
        ensureMatch();
        // NOTE(review): the live offsets array is handed over as is; the
        // promised independence relies on MatchResultImpl copying it --
        // confirm against MatchResultImpl.
        final MatchResult snapshot = new MatchResultImpl(input, matchOffsets);
        return snapshot;
    }

    /**
     * Appends a literal part of the input plus a replacement for the current
     * match to a given {@link StringBuffer}. The literal part is the input
     * between the current append position and the start of the current
     * match; afterwards the append position moves to the end of the match.
     * Together with {@link #find()} and {@link #appendTail(StringBuffer)}
     * this allows walking through the input and replacing all occurrences of
     * the {@code Pattern} with something else.
     * 
     * @param buffer
     *            the {@code StringBuffer} to append to.
     * @param replacement
     *            the replacement text; group references of the form
     *            {@code $n} are expanded.
     * @return the {@code Matcher} itself.
     * @throws IllegalStateException
     *             if no successful match has been made.
     * @since Android 1.0
     */
    public Matcher appendReplacement(StringBuffer buffer, String replacement)
            throws IllegalStateException {

        // Copy the unmatched text that precedes the current match.
        final int matchStart = start();
        final String skipped = input.substring(appendPos, matchStart);
        buffer.append(skipped);

        // Then the replacement, with group references expanded.
        appendEvaluated(buffer, replacement);
        appendPos = end();

        return this;
    }

    /**
     * Appends the (unmatched) remainder of the input to the given
     * {@link StringBuffer}, that is, everything from the current append
     * position up to the end of the input sequence. The method can be used
     * in conjunction with {@link #find()} and
     * {@link #appendReplacement(StringBuffer, String)} to walk through the
     * input and replace all matches of the {@code Pattern} with something
     * else. It does not require a previous successful match.
     * 
     * @param buffer
     *            the {@code StringBuffer} to append to.
     * @return the {@code StringBuffer}.
     * @since Android 1.0
     */
    public StringBuffer appendTail(StringBuffer buffer) {
        // Copy up to the end of the input, not just the end of the region:
        // text before the region is copied by appendReplacement, so stopping
        // at the region end would silently drop the text behind it.
        final int inputEnd = input.length();
        if (appendPos < inputEnd) {
            buffer.append(input.substring(appendPos, inputEnd));
        }
        
        return buffer;
    }

    /**
     * Internal helper method to append a given string to a given string buffer.
     * If the string contains any references to groups ({@code $} followed by
     * a single digit), these are replaced by the corresponding group's
     * contents. A group that did not take part in the match contributes
     * nothing. A backslash makes the following character literal.
     * 
     * @param buffer
     *            the string buffer.
     * @param s
     *            the string to append.
     * @throws ArrayIndexOutOfBoundsException
     *             if the string ends with an unfinished escape sequence.
     */
    private void appendEvaluated(StringBuffer buffer, String s) {
        boolean escape = false;
        boolean dollar = false;
        
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '\\' && !escape) {
                escape = true;
            } else if (c == '$' && !escape) {
                dollar = true;
            } else if (c >= '0' && c <= '9' && dollar) {
                // group() returns null for a group that did not match;
                // appending that directly would insert the text "null".
                String captured = group(c - '0');
                if (captured != null) {
                    buffer.append(captured);
                }
                dollar = false;
            } else {
                buffer.append(c);
                dollar = false;
                escape = false;
            }
        }
        
        // This seemingly stupid piece of code reproduces a JDK bug. 
        if (escape) {
            throw new ArrayIndexOutOfBoundsException(s.length());
        }
    }
    
    /**
     * Replaces all occurrences of this matcher's pattern in the input with a
     * given string. The matcher is reset first, so a previously set region
     * is discarded and the whole input is processed.
     * 
     * @param replacement
     *            the replacement text; group references of the form
     *            {@code $n} are expanded.
     * @return the modified input string.
     * @since Android 1.0
     */
    public String replaceAll(String replacement) {
        // A full reset (rather than clearing individual fields) also restores
        // the default region; otherwise only the region would be searched and
        // the text behind it would be missing from the result.
        reset();

        StringBuffer buffer = new StringBuffer(input.length());
        while (find()) {
            appendReplacement(buffer, replacement);
        }
        
        return appendTail(buffer).toString();
    }

    /**
     * Replaces the first occurrence of this matcher's pattern in the input with
     * a given string. The matcher is reset first, so a previously set region
     * is discarded and the whole input is processed.
     * 
     * @param replacement
     *            the replacement text; group references of the form
     *            {@code $n} are expanded.
     * @return the modified input string.
     * @since Android 1.0
     */
    public String replaceFirst(String replacement) {
        // A full reset (rather than clearing individual fields) also restores
        // the default region; otherwise only the region would be searched and
        // the text behind it would be missing from the result.
        reset();

        StringBuffer buffer = new StringBuffer(input.length());
        if (find()) {
            appendReplacement(buffer, replacement);
        }
        
        return appendTail(buffer).toString();
    }

    /**
     * Produces a literal replacement string from the given one by putting a
     * backslash in front of every backslash and every dollar sign, so that
     * neither is interpreted as an escape or a group reference.
     * 
     * @param s
     *            the input string.
     * @return the input string, with all backslashes and dollar signs having
     *         been escaped.
     * @since Android 1.0
     */
    public static String quoteReplacement(String s) {
        final int length = s.length();
        final StringBuilder quoted = new StringBuilder(length);

        int index = 0;
        while (index < length) {
            final char current = s.charAt(index++);
            switch (current) {
                case '\\':
                case '$':
                    // Characters with a special meaning get an escape prefix.
                    quoted.append('\\');
                    break;
                default:
                    break;
            }
            quoted.append(current);
        }

        return quoted.toString();
    }

    /**
     * Releases the native matcher, if one has been allocated, when this
     * object is garbage collected. The superclass finalizer always runs, even
     * if closing the native handle fails.
     */
    @Override
    protected void finalize() throws Throwable {
        final int handle = nativePattern;
        try {
            if (handle != 0) {
                NativeRegEx.close(handle);
            }
        } finally {
            super.finalize();
        }
    }
    
}