FileDocCategorySizeDatePackage
AbstractMessageParser.javaAPI DocAndroid 1.5 API44438Wed May 06 22:41:56 BST 2009com.google.android.util

AbstractMessageParser

public abstract class AbstractMessageParser extends Object
Logic for parsing a text message typed by the user, looking for smileys, URLs, acronyms, formatting (e.g., '*'s for bold), me commands (e.g., "/me is asleep"), and punctuation. It constructs an array that breaks the text up into its constituent pieces, which we return to the client.

Fields Summary
public static final String
musicNote
Music note that indicates user is listening to a music track.
private String
text
private int
nextChar
private int
nextClass
private ArrayList
parts
private ArrayList
tokens
private HashMap
formatStart
private boolean
parseSmilies
private boolean
parseAcronyms
private boolean
parseFormatting
private boolean
parseUrls
private boolean
parseMeText
private boolean
parseMusic
Constructors Summary
public AbstractMessageParser(String text)
Create a message parser to parse urls, formatting, acronyms, smileys, /me text and music

param
text the text to parse


                          
     
    // Delegate to the full constructor with every parsing option switched on.
    this(text, true, true, true, true, true, true);
  
public AbstractMessageParser(String text, boolean parseSmilies, boolean parseAcronyms, boolean parseFormatting, boolean parseUrls, boolean parseMusic, boolean parseMeText)
Create a message parser, specifying the kinds of text to parse

param
text the text to parse

    // The message being parsed and the read cursor into it.
    this.text = text;
    this.nextChar = 0;

    // Which kinds of content the caller wants recognised.
    this.parseSmilies = parseSmilies;
    this.parseAcronyms = parseAcronyms;
    this.parseFormatting = parseFormatting;
    this.parseUrls = parseUrls;
    this.parseMusic = parseMusic;
    this.parseMeText = parseMeText;

    // Output containers, plus the map of formatting characters that have been
    // opened but not yet closed.
    this.tokens = new ArrayList<Token>();
    this.parts = new ArrayList<Part>();
    this.formatStart = new HashMap<Character,Format>();

    // Counter behind the per-punctuation character classes handed out by
    // getCharClass(), which pre-increments it; starting at 10 keeps those
    // values clear of the fixed classes 0-4.
    this.nextClass = 10;
  
Methods Summary
private voidaddToken(com.google.android.util.AbstractMessageParser$Token token)
Adds the given token to the parsed output.

    // Append to the flat token list; buildParts() later groups these tokens
    // into parts.
    tokens.add(token);
  
private voidaddURLToken(java.lang.String url, java.lang.String text)
Adds the appropriate token for the given URL. This might be a simple link or it might be a recognized media type.

     addToken(tokenForUrl(url, text));
  
private voidbuildParts(java.lang.String meText)
Builds the parts list.

param
meText any meText parsed from the message

    for (int i = 0; i < tokens.size(); ++i) {
      Token token = tokens.get(i);
      if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) {
        parts.add(new Part());
      }
      lastPart().add(token);
    }

    // The first part inherits the meText of the line.
    if (parts.size() > 0) {
      parts.get(0).setMeText(meText);
    }
  
private intgetCharClass(int index)
Returns the class for the character at the given index.

    if ((index < 0) || (text.length() <= index)) {
      return 0;
    }

    char ch = text.charAt(index);
    if (Character.isWhitespace(ch)) {
      return 1;
    } else if (Character.isLetter(ch)) {
      return 2;
    } else if (Character.isDigit(ch)) {
      return 3;
    } else if (isPunctuation(ch)) {
      // For punctuation, we return a unique value every time so that they are
      // always different from any other character.  Punctuation should always
      // be considered a possible word break.
      return ++nextClass;
    } else {
      return 4;
    }
  
public final com.google.android.util.AbstractMessageParser$PartgetPart(int index)
Return the part at the given index.

 // Direct lookup in the parts list; the index is not range-checked here.
 return parts.get(index); 
public final intgetPartCount()
Return the number of parts.

 // Current size of the parts list.
 return parts.size(); 
public final java.util.ListgetParts()
Return the list of parts from the parsed text

 // Hands back the internal list itself, not a copy.
 return parts; 
public final java.lang.StringgetRawText()
Returns the raw text being parsed.

 // Note: parse() reassigns text when it strips a leading "/me" command, so
 // the value returned here reflects that.
 return text; 
protected abstract com.google.android.util.AbstractMessageParser$ResourcesgetResources()
Subclasses must define the schemes, domains, smileys and acronyms that are necessary for parsing

private booleanisDomainChar(char c)
Determines if this is an allowable domain character.

    return c == '-" || Character.isLetter(c) || Character.isDigit(c);
  
private static booleanisFormatChar(char ch)
Determines whether the given character is the beginning or end of a section with special formatting.

    switch (ch) {
      case '*": case '_": case '^":
        return true;

      default:
        return false;
    }
  
private static booleanisPunctuation(char ch)
Determines whether the given character is punctuation.

    // The fixed set of characters this parser treats as punctuation:
    //   . , " : ; ? ! ( )
    switch (ch) {
      case '.': case ',': case '"': case ':': case ';':
      case '?': case '!': case '(': case ')':
        return true;

      default:
        return false;
    }
  
private booleanisSmileyBreak(int index)
Determines whether the given index could be a possible smiley break.

    if (index > 0 && index < text.length()) {
      if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) {
        return true;
      }
    }

    return false;
  
private static booleanisSmileyBreak(char c1, char c2)
Returns true if c1 could be the last character of a smiley and c2 could be the first character of a different smiley, if {@link #isWordBreak} would not already recognize that this is possible.

    switch (c1) {
      /*
       * These characters can end smileys, but don't normally end words.
       */
      case '$': case '&': case '*': case '+': case '-':
      case '/': case '<': case '=': case '>': case '@':
      case '[': case '\\': case ']': case '^': case '|':
      case '}': case '~':
        switch (c2) {
          /*
           * These characters can begin smileys, but don't normally
           * begin words.
           */
          case '#': case '$': case '%': case '*': case '/':
          case '<': case '=': case '>': case '@': case '[':
          case '\\': case '^': case '~':
            return true;
        }
    }

    // Either c1 cannot end a smiley or c2 cannot start one.
    return false;
  
private booleanisURLBreak(int index)
Verifies that the character before the given index is end of line, whitespace, or punctuation.

    switch (getCharClass(index - 1)) {
      case 2:
      case 3:
      case 4:
        return false;

      case 0:
      case 1:
      default:
        return true;
    }
  
private booleanisValidDomain(java.lang.String domain)
Determines if the given string is a valid domain.

    // For hostnames, check that it ends with a known domain suffix
    if (matches(getResources().getDomainSuffixes(), reverse(domain))) {
      return true;
    }
    return false;
  
private booleanisWordBreak(int index)
Determines whether the given index could be a possible word break.

    return getCharClass(index - 1) != getCharClass(index);
  
private com.google.android.util.AbstractMessageParser$PartlastPart()
Returns the last part in the list.

 // Callers must ensure the list is non-empty; buildParts() only calls this
 // after checking parts.size() or adding a part.
 return parts.get(parts.size() - 1); 
private static com.google.android.util.AbstractMessageParser$TrieNodelongestMatch(com.google.android.util.AbstractMessageParser$TrieNode root, com.google.android.util.AbstractMessageParser p, int start)
Returns the longest substring of the given string, starting at the given index, that exists in the trie.

    // Delegate to the four-argument overload with the smiley special case
    // switched off.
    return longestMatch(root, p, start, false);
  
private static com.google.android.util.AbstractMessageParser$TrieNodelongestMatch(com.google.android.util.AbstractMessageParser$TrieNode root, com.google.android.util.AbstractMessageParser p, int start, boolean smiley)
Returns the longest substring of the given string, starting at the given index, that exists in the trie, with a special tokenizing case for smileys if specified.

    final String raw = p.getRawText();
    TrieNode longest = null;
    TrieNode node = root;
    int cursor = start;

    // Descend the trie one character at a time until the text or the trie
    // runs out.
    while (cursor < raw.length()) {
      node = node.getChild(raw.charAt(cursor));
      cursor += 1;
      if (node == null) {
        break;
      }

      // Remember this node only if it is a stored entry AND the text may be
      // split right after it: at a word break, or (smiley mode only) at a
      // smiley break.  Later hits overwrite earlier ones, so the longest wins.
      final boolean endsAtBreak = node.exists()
          && (p.isWordBreak(cursor) || (smiley && p.isSmileyBreak(cursor)));
      if (endsAtBreak) {
        longest = node;
      }
    }
    return longest;
  
private static booleanmatches(com.google.android.util.AbstractMessageParser$TrieNode root, java.lang.String str)
Determines whether the given string is in the given trie.

    // Follow the string through the trie; succeed as soon as any prefix of it
    // is a stored entry, fail as soon as the path runs out.
    TrieNode node = root;
    for (int i = 0; i < str.length(); ++i) {
      node = node.getChild(str.charAt(i));
      if (node == null) {
        return false;
      }
      if (node.exists()) {
        return true;
      }
    }
    return false;
  
public voidparse()
Parses the text string into an internal representation.

    // Look for music track (of which there would be only one and it'll be the
    // first token).  When one is found, the whole text has been consumed, so
    // the parts are built immediately with no meText.
    if (parseMusicTrack()) {
      buildParts(null);
      return;
    }

    // Look for me commands: the text must start with "/me" followed by a
    // whitespace character.  meText keeps those first four characters and
    // text is replaced by the remainder.
    String meText = null;
    if (parseMeText && text.startsWith("/me") && (text.length() > 3) &&
        Character.isWhitespace(text.charAt(3))) {
      meText = text.substring(0, 4);
      text = text.substring(4);
    }

    // Break the text into tokens.
    boolean wasSmiley = false;
    while (nextChar < text.length()) {
      // Invariant: every chunk must end at a word break, or at a smiley break
      // when the previous chunk was a smiley.
      if (!isWordBreak(nextChar)) {
        if (!wasSmiley || !isSmileyBreak(nextChar)) {
          throw new AssertionError("last chunk did not end at word break");
        }
      }

      // Try the recognisers in priority order: smiley, acronym, URL,
      // formatting; anything left over is consumed as plain text.
      if (parseSmiley()) {
        wasSmiley = true;
      } else {
        wasSmiley = false;

        if (!parseAcronym() && !parseURL() && !parseFormatting()) {
          parseText();
        }
      }
    }

    // Trim the whitespace before and after media components.
    // NOTE(review): the token BEFORE the media gets trimLeadingWhitespace()
    // and the token AFTER gets trimTrailingWhitespace(); the Html class is
    // not visible here, so confirm these names mean what this loop needs.
    for (int i = 0; i < tokens.size(); ++i) {
      if (tokens.get(i).isMedia()) {
        if ((i > 0) && (tokens.get(i - 1) instanceof Html)) {
          ((Html)tokens.get(i - 1)).trimLeadingWhitespace();
        }
        if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) {
          ((Html)tokens.get(i + 1)).trimTrailingWhitespace();
        }
      }
    }

    // Remove any empty html tokens.
    for (int i = 0; i < tokens.size(); ++i) {
      if (tokens.get(i).isHtml() &&
          (tokens.get(i).toHtml(true).length() == 0)) {
        tokens.remove(i);
        --i;  // visit this index again
      }
    }

    // Group the surviving tokens into parts; the first part receives meText.
    buildParts(meText);
  
private booleanparseAcronym()
Looks for acronyms (e.g., "lol") in the text.

    if(!parseAcronyms) {
      return false;
    }
    TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar);
    if (match == null) {
      return false;
    } else {
      addToken(new Acronym(match.getText(), match.getValue()));
      nextChar += match.getText().length();
      return true;
    }
  
private booleanparseFormatting()
Deal with formatting characters. Parsing is as follows: - Treat all contiguous strings of formatting characters as one block. (This method processes one block.) - Only a single instance of a particular format character within a block is used to determine whether to turn on/off that type of formatting; other instances simply print the character itself. - If the format is to be turned on, we use the _first_ instance; if it is to be turned off, we use the _last_ instance (by appending the format.) Example: **string** turns into *string*

    // Formatting recognition can be disabled by the caller.
    if (!parseFormatting) {
      return false;
    }

    // Find the end of the contiguous run of formatting characters.
    int endChar = nextChar;
    while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) {
      endChar += 1;
    }

    // Nothing to do if there is no run, or if the run does not end at a word
    // break.
    if ((endChar == nextChar) || !isWordBreak(endChar)) {
      return false;
    }

    // Keeps track of whether we've seen a character (in map if we've seen it)
    // and whether we should append a closing format token (if value in
    // map is TRUE).  Linked hashmap for consistent ordering.
    LinkedHashMap<Character, Boolean> seenCharacters =
        new LinkedHashMap<Character, Boolean>();

    for (int index = nextChar; index < endChar; ++index) {
      char ch = text.charAt(index);
      Character key = Character.valueOf(ch);
      if (seenCharacters.containsKey(key)) {
        // Already seen this character, just append an unmatched token, which
        // will print plaintext character
        addToken(new Format(ch, false));
      } else {
        Format start = formatStart.get(key);
        if (start != null) {
          // Match the start token, and ask an end token to be appended
          start.setMatched(true);
          formatStart.remove(key);
          seenCharacters.put(key, Boolean.TRUE);
        } else {
          // Append start token
          start = new Format(ch, true);
          formatStart.put(key, start);
          addToken(start);
          seenCharacters.put(key, Boolean.FALSE);
        }
      }
    }

    // Append any necessary end tokens.  Compare by value, not by reference:
    // identity comparison on a boxed Boolean only works by accident of which
    // instances were stored.
    for (Character key : seenCharacters.keySet()) {
      if (Boolean.TRUE.equals(seenCharacters.get(key))) {
        Format end = new Format(key.charValue(), false);
        end.setMatched(true);
        addToken(end);
      }
    }

    // The whole run of formatting characters has been consumed.
    nextChar = endChar;
    return true;
  
private booleanparseMusicTrack()
Looks for a music track (\u266B is first character, everything else is track info).


    if (parseMusic && text.startsWith(musicNote)) {
      addToken(new MusicTrack(text.substring(musicNote.length())));
      nextChar = text.length();
      return true;
    }
    return false;
  
private booleanparseSmiley()
Looks for smileys (e.g., ":)") in the text. The set of known smileys is loaded from a file into a trie at server start.

    // Smiley recognition can be disabled by the caller.
    if (!parseSmilies) {
      return false;
    }

    final TrieNode hit =
        longestMatch(getResources().getSmileys(), this, nextChar, true);
    if (hit == null) {
      return false;
    }

    // Reject a candidate that has a letter or digit (classes 2 and 3) on BOTH
    // sides.
    final int classBefore = getCharClass(nextChar - 1);
    final int classAfter = getCharClass(nextChar + hit.getText().length());
    final boolean alnumBefore = (classBefore == 2) || (classBefore == 3);
    final boolean alnumAfter = (classAfter == 2) || (classAfter == 3);
    if (alnumBefore && alnumAfter) {
      return false;
    }

    // Emit the smiley token and advance the cursor past it.
    addToken(new Smiley(hit.getText()));
    nextChar += hit.getText().length();
    return true;
  
private voidparseText()
Consumes all of the text in the next word.

    // buf accumulates the HTML-escaped form of the word; the raw form is
    // taken straight from text once the end of the word is known.
    StringBuilder buf = new StringBuilder();
    int start = nextChar;
    do {
      char ch = text.charAt(nextChar++);
      switch (ch) {
        // Escape the characters that are significant in HTML so the word is
        // displayed literally rather than interpreted as markup.
        case '<':  buf.append("&lt;"); break;
        case '>':  buf.append("&gt;"); break;
        case '&':  buf.append("&amp;"); break;
        case '"':  buf.append("&quot;"); break;
        case '\'':  buf.append("&#39;"); break;
        // Newlines become explicit line breaks.
        case '\n':  buf.append("<br>"); break;
        default:  buf.append(ch); break;
      }
    } while (!isWordBreak(nextChar));  // always consumes at least one char

    addToken(new Html(text.substring(start, nextChar), buf.toString()));
  
private booleanparseURL()
Looks for a URL in two possible forms: either a proper URL with a known scheme or a domain name optionally followed by a path, query, or fragment.

    // Make sure this is a valid place to start a URL.
    if (!parseUrls || !isURLBreak(nextChar)) {
      return false;
    }

    int start = nextChar;

    // Search for the first block of letters.
    int index = start;
    while ((index < text.length()) && isDomainChar(text.charAt(index))) {
      index += 1;
    }

    // url holds the prefix to prepend ("" when the text already carries a
    // scheme); done records that the end of the URL has already been found.
    String url = "";
    boolean done = false;

    if (index == text.length()) {
      // Neither a ':' nor a '.' follows the first block, so not a URL.
      return false;
    } else if (text.charAt(index) == ':') {
      // Make sure this is a known scheme.
      String scheme = text.substring(nextChar, index);
      if (!getResources().getSchemes().contains(scheme)) {
        return false;
      }
    } else if (text.charAt(index) == '.') {
      // Search for the end of the domain name.
      while (index < text.length()) {
        char ch = text.charAt(index);
        if ((ch != '.') && !isDomainChar(ch)) {
          break;
        } else {
          index += 1;
        }
      }

      // Make sure the domain name has a valid suffix.  Since tries look for
      // prefix matches, we reverse all the strings to get suffix comparisons.
      String domain = text.substring(nextChar, index);
      if (!isValidDomain(domain)) {
        return false;
      }

      // Search for a port.  We deal with this specially because a colon can
      // also be a punctuation character.
      if ((index + 1 < text.length()) && (text.charAt(index) == ':')) {
        char ch = text.charAt(index + 1);
        if (Character.isDigit(ch)) {
          index += 1;
          while ((index < text.length()) &&
                 Character.isDigit(text.charAt(index))) {
            index += 1;
          }
        }
      }

      // The domain name should be followed by end of line, whitespace,
      // punctuation, or a colon, slash, question, or hash character.  The
      // tricky part here is that some URL characters are also punctuation, so
      // we need to distinguish them.  Since we looked for ports above, a colon
      // is always punctuation here.  To distinguish '?' cases, we look at the
      // character that follows it.
      if (index == text.length()) {
        done = true;
      } else {
        char ch = text.charAt(index);
        if (ch == '?') {
          // If the next character is whitespace or punctuation (or missing),
          // then this question mark looks like punctuation.
          if (index + 1 == text.length()) {
            done = true;
          } else {
            char ch2 = text.charAt(index + 1);
            if (Character.isWhitespace(ch2) || isPunctuation(ch2)) {
              done = true;
            }
          }
        } else if (isPunctuation(ch)) {
          done = true;
        } else if (Character.isWhitespace(ch)) {
          done = true;
        } else if ((ch == '/') || (ch == '#')) {
          // In this case, the URL is not done.  We will search for the end of
          // it below.
        } else {
          return false;
        }
      }

      // We will assume the user meant HTTP.  (One weird case is where they
      // type a port of 443.  That could mean HTTPS, but they might also want
      // HTTP.  We'll let them specify if they don't want HTTP.)
      url = "http://";
    } else {
      return false;
    }

    // If the URL is not done, search for the end, which is just before the
    // next whitespace character.
    if (!done) {
      while ((index < text.length()) &&
             !Character.isWhitespace(text.charAt(index))) {
        index += 1;
      }
    }

    // urlText is exactly what the user typed; url is the same text with the
    // "http://" prefix added when no scheme was typed.
    String urlText = text.substring(start, index);
    url += urlText;

    // Figure out the appropriate token type.
    addURLToken(url, urlText);

    nextChar = index;
    return true;
  
protected static java.lang.Stringreverse(java.lang.String str)
Returns the reverse of the given string.

    // Fill a char array back-to-front: position i of the result takes the
    // character that sits i places from the end of the input.
    final int length = str.length();
    final char[] reversed = new char[length];
    for (int i = 0; i < length; ++i) {
      reversed[i] = str.charAt(length - 1 - i);
    }
    return new String(reversed);
  
public java.lang.StringtoHtml()
Converts the entire message into a single HTML display string.

    StringBuilder html = new StringBuilder();

    // Each part becomes its own paragraph.
    for (Part part : parts) {
      // Passed to Token.toHtml() and updated by any token that controls caps;
      // reset at the start of every part.
      boolean caps = false;

      html.append("<p>");
      for (Token token : part.getTokens()) {
        if (token.isHtml()) {
          // Tokens that know how to render themselves.
          html.append(token.toHtml(caps));
        } else {
          // NOTE(review): the raw text and URLs below are appended as-is;
          // confirm whether they are escaped upstream.
          switch (token.getType()) {
          case LINK:
            html.append("<a href=\"");
            html.append(((Link)token).getURL());
            html.append("\">");
            html.append(token.getRawText());
            html.append("</a>");
            break;

          case SMILEY:
            // TODO: link to an appropriate image
            html.append(token.getRawText());
            break;

          case ACRONYM:
            html.append(token.getRawText());
            break;

          case MUSIC:
            // TODO: include a music glyph
            html.append(((MusicTrack)token).getTrack());
            break;

          case GOOGLE_VIDEO:
            // TODO: include a Google Video icon
            html.append("<a href=\"");
            html.append(((Video)token).getURL(((Video)token).getDocID()));
            html.append("\">");
            html.append(token.getRawText());
            html.append("</a>");
            break;

          case YOUTUBE_VIDEO:
            // TODO: include a YouTube icon
            html.append("<a href=\"");
            html.append(((YouTubeVideo)token).getURL(
                ((YouTubeVideo)token).getDocID()));
            html.append("\">");
            html.append(token.getRawText());
            html.append("</a>");
            break;

          case PHOTO: {
            // TODO: include a Picasa Web icon
            html.append("<a href=\"");
            html.append(Photo.getAlbumURL(
                ((Photo)token).getUser(), ((Photo)token).getAlbum()));
            html.append("\">");
            html.append(token.getRawText());
            html.append("</a>");
            break;
          }

          case FLICKR:
            // TODO: include a Flickr icon
            // The token is only ever used as a FlickrPhoto here; the former
            // unused "(Photo) token" local was dropped because that cast
            // could fail before the link was written.
            html.append("<a href=\"");
            html.append(((FlickrPhoto)token).getUrl());
            html.append("\">");
            html.append(token.getRawText());
            html.append("</a>");
            break;

          default:
            throw new AssertionError("unknown token type: " + token.getType());
          }
        }

        // Let the token update the caps flag seen by the tokens after it.
        if (token.controlCaps()) {
          caps = token.setCaps();
        }
      }
      html.append("</p>\n");
    }

    return html.toString();
  
public static com.google.android.util.AbstractMessageParser$TokentokenForUrl(java.lang.String url, java.lang.String text)
Gets the appropriate Token for a given URL.

param
text the anchor text
param
url the url

    // No URL, no token.
    if (url == null) {
      return null;
    }

    // Try each media matcher in turn and keep the first one that recognises
    // the URL: Video, then YouTube video, then Photo, then Flickr photo.
    Token media = Video.matchURL(url, text);
    if (media == null) {
      media = YouTubeVideo.matchURL(url, text);
    }
    if (media == null) {
      media = Photo.matchURL(url, text);
    }
    if (media == null) {
      media = FlickrPhoto.matchURL(url, text);
    }

    // Not media, so it must be a regular URL.
    return (media != null) ? media : new Link(url, text);