File Doc Category Size Date Package
AbstractMessageParser.java API Doc Android 1.5 API 44438 Wed May 06 22:41:56 BST 2009 com.google.android.util

AbstractMessageParser

java.lang.Object

public abstract class AbstractMessageParser extends Object

Logic for parsing a text message typed by the user, looking for smileys, urls, acronyms, formatting (e.g., '*'s for bold), me commands (e.g., "/me is asleep"), and punctuation. It constructs an array that breaks the text up into its constituent pieces, which we return to the client.

Fields Summary
public static final String
musicNote
Music note that indicates user is listening to a music track.
private String
text
private int
nextChar
private int
nextClass
private ArrayList
parts
private ArrayList
tokens
private HashMap
formatStart
private boolean
parseSmilies
private boolean
parseAcronyms
private boolean
parseFormatting
private boolean
parseUrls
private boolean
parseMeText
private boolean
parseMusic
Constructors Summary
/**
 * Creates a message parser with every kind of parsing switched on: urls,
 * formatting, acronyms, smileys, /me text and music.
 *
 * @param text the text to parse
 */
public AbstractMessageParser(String text) {
  // Delegate to the full constructor with all six parsing flags enabled.
  this(text, true, true, true, true, true, true);
}
/**
 * Creates a message parser, specifying the kinds of text to parse.
 *
 * @param text the text to parse
 * @param parseSmilies whether smileys are recognized
 * @param parseAcronyms whether acronyms are recognized
 * @param parseFormatting whether formatting characters are recognized
 * @param parseUrls whether urls are recognized
 * @param parseMusic whether a leading music note marks a music track
 * @param parseMeText whether a leading "/me" command is recognized
 */
public AbstractMessageParser(String text, boolean parseSmilies, boolean parseAcronyms, boolean parseFormatting, boolean parseUrls, boolean parseMusic, boolean parseMeText) {
  // Feature switches consulted by the individual parse* methods.
  this.parseSmilies = parseSmilies;
  this.parseAcronyms = parseAcronyms;
  this.parseFormatting = parseFormatting;
  this.parseUrls = parseUrls;
  this.parseMusic = parseMusic;
  this.parseMeText = parseMeText;

  // Input text and the cursor into it.
  this.text = text;
  this.nextChar = 0;

  // getCharClass() returns ++nextClass for punctuation; starting at 10 keeps
  // those values clear of the fixed classes 0 through 4.
  this.nextClass = 10;

  // Output and bookkeeping containers start out empty.
  this.tokens = new ArrayList<Token>();
  this.parts = new ArrayList<Part>();
  this.formatStart = new HashMap<Character,Format>();
}
Methods Summary
private void addToken(com.google.android.util.AbstractMessageParser$Token token)
Adds the given token to the parsed output.
tokens.add(token);
private void addURLToken(java.lang.String url, java.lang.String text)
Adds the appropriate token for the given URL. This might be a simple link or it might be a recognized media type.
addToken(tokenForUrl(url, text));
private void buildParts(java.lang.String meText)
Builds the parts list.
param
meText any meText parsed from the message
for (int i = 0; i < tokens.size(); ++i) { Token token = tokens.get(i); if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) { parts.add(new Part()); } lastPart().add(token); } // The first part inherits the meText of the line. if (parts.size() > 0) { parts.get(0).setMeText(meText); }
private int getCharClass(int index)
Returns the class for the character at the given index.
if ((index < 0) || (text.length() <= index)) { return 0; } char ch = text.charAt(index); if (Character.isWhitespace(ch)) { return 1; } else if (Character.isLetter(ch)) { return 2; } else if (Character.isDigit(ch)) { return 3; } else if (isPunctuation(ch)) { // For punctuation, we return a unique value every time so that they are // always different from any other character. Punctuation should always // be considered a possible word break. return ++nextClass; } else { return 4; }
public final com.google.android.util.AbstractMessageParser$Part getPart(int index)
Return the part at the given index.
return parts.get(index);
public final int getPartCount()
Return the number of parts.
return parts.size();
public final java.util.List getParts()
Return the list of parts from the parsed text
return parts;
public final java.lang.String getRawText()
Returns the raw text being parsed.
return text;
/**
 * Returns the resources used while parsing. Subclasses must define the
 * schemes, domains, smileys and acronyms that are necessary for parsing.
 */
protected abstract com.google.android.util.AbstractMessageParser$Resources getResources()
private boolean isDomainChar(char c)
Determines if this is an allowable domain character.
return c == '-" || Character.isLetter(c) || Character.isDigit(c);
private static boolean isFormatChar(char ch)
Determines whether the given character is the beginning or end of a section with special formatting.
switch (ch) { case '*": case '_": case '^": return true; default: return false; }
private static boolean isPunctuation(char ch)
Determines whether the given character is punctuation.
switch (ch) { case '.": case ',": case '"": case ':": case ';": case '?": case '!": case '(": case ')": return true; default: return false; }
private boolean isSmileyBreak(int index)
Determines whether the given index could be a possible smiley break.
if (index > 0 && index < text.length()) { if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) { return true; } } return false;
private static boolean isSmileyBreak(char c1, char c2)
Returns true if c1 could be the last character of a smiley and c2 could be the first character of a different smiley, if {@link #isWordBreak} would not already recognize that this is possible.
switch (c1) { /* * These characters can end smileys, but don't normally end words. */ case '$": case '&": case '*": case '+": case '-": case '/": case '<": case '=": case '>": case '@": case '[": case '\\": case ']": case '^": case '|": case '}": case '~": switch (c2) { /* * These characters can begin smileys, but don't normally * begin words. */ case '#": case '$": case '%": case '*": case '/": case '<": case '=": case '>": case '@": case '[": case '\\": case '^": case '~": return true; } } return false;
private boolean isURLBreak(int index)
Verifies that the character before the given index is end of line, whitespace, or punctuation.
switch (getCharClass(index - 1)) { case 2: case 3: case 4: return false; case 0: case 1: default: return true; }
private boolean isValidDomain(java.lang.String domain)
Determines if the given string is a valid domain.
// For hostnames, check that it ends with a known domain suffix if (matches(getResources().getDomainSuffixes(), reverse(domain))) { return true; } return false;
private boolean isWordBreak(int index)
Determines whether the given index could be a possible word break.
return getCharClass(index - 1) != getCharClass(index);
private com.google.android.util.AbstractMessageParser$Part lastPart()
Returns the last part in the list.
return parts.get(parts.size() - 1);
private static com.google.android.util.AbstractMessageParser$TrieNode longestMatch(com.google.android.util.AbstractMessageParser$TrieNode root, com.google.android.util.AbstractMessageParser p, int start)
Returns the longest substring of the given string, starting at the given index, that exists in the trie.
return longestMatch(root, p, start, false);
private static com.google.android.util.AbstractMessageParser$TrieNode longestMatch(com.google.android.util.AbstractMessageParser$TrieNode root, com.google.android.util.AbstractMessageParser p, int start, boolean smiley)
Returns the longest substring of the given string, starting at the given index, that exists in the trie, with a special tokenizing case for smileys if specified.
int index = start; TrieNode bestMatch = null; while (index < p.getRawText().length()) { root = root.getChild(p.getRawText().charAt(index++)); if (root == null) { break; } else if (root.exists()) { if (p.isWordBreak(index)) { bestMatch = root; } else if (smiley && p.isSmileyBreak(index)) { bestMatch = root; } } } return bestMatch;
private static boolean matches(com.google.android.util.AbstractMessageParser$TrieNode root, java.lang.String str)
Determines whether the given string is in the given trie.
int index = 0; while (index < str.length()) { root = root.getChild(str.charAt(index++)); if (root == null) { break; } else if (root.exists()) { return true; } } return false;
public void parse()
Parses the text string into an internal representation.
// Look for music track (of which there would be only one and it'll be the // first token) if (parseMusicTrack()) { buildParts(null); return; } // Look for me commands. String meText = null; if (parseMeText && text.startsWith("/me") && (text.length() > 3) && Character.isWhitespace(text.charAt(3))) { meText = text.substring(0, 4); text = text.substring(4); } // Break the text into tokens. boolean wasSmiley = false; while (nextChar < text.length()) { if (!isWordBreak(nextChar)) { if (!wasSmiley || !isSmileyBreak(nextChar)) { throw new AssertionError("last chunk did not end at word break"); } } if (parseSmiley()) { wasSmiley = true; } else { wasSmiley = false; if (!parseAcronym() && !parseURL() && !parseFormatting()) { parseText(); } } } // Trim the whitespace before and after media components. for (int i = 0; i < tokens.size(); ++i) { if (tokens.get(i).isMedia()) { if ((i > 0) && (tokens.get(i - 1) instanceof Html)) { ((Html)tokens.get(i - 1)).trimLeadingWhitespace(); } if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) { ((Html)tokens.get(i + 1)).trimTrailingWhitespace(); } } } // Remove any empty html tokens. for (int i = 0; i < tokens.size(); ++i) { if (tokens.get(i).isHtml() && (tokens.get(i).toHtml(true).length() == 0)) { tokens.remove(i); --i; // visit this index again } } buildParts(meText);
private boolean parseAcronym()
Looks for acronyms (e.g., "lol") in the text.
if(!parseAcronyms) { return false; } TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar); if (match == null) { return false; } else { addToken(new Acronym(match.getText(), match.getValue())); nextChar += match.getText().length(); return true; }
private boolean parseFormatting()
Deal with formatting characters. Parsing is as follows: - Treat all contiguous strings of formatting characters as one block. (This method processes one block.) - Only a single instance of a particular format character within a block is used to determine whether to turn on/off that type of formatting; other instances simply print the character itself. - If the format is to be turned on, we use the _first_ instance; if it is to be turned off, we use the _last_ instance (by appending the format.) Example: **string** turns into *string*
if(!parseFormatting) { return false; } int endChar = nextChar; while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) { endChar += 1; } if ((endChar == nextChar) || !isWordBreak(endChar)) { return false; } // Keeps track of whether we've seen a character (in map if we've seen it) // and whether we should append a closing format token (if value in // map is TRUE). Linked hashmap for consistent ordering. LinkedHashMap<Character, Boolean> seenCharacters = new LinkedHashMap<Character, Boolean>(); for (int index = nextChar; index < endChar; ++index) { char ch = text.charAt(index); Character key = Character.valueOf(ch); if (seenCharacters.containsKey(key)) { // Already seen this character, just append an unmatched token, which // will print plaintext character addToken(new Format(ch, false)); } else { Format start = formatStart.get(key); if (start != null) { // Match the start token, and ask an end token to be appended start.setMatched(true); formatStart.remove(key); seenCharacters.put(key, Boolean.TRUE); } else { // Append start token start = new Format(ch, true); formatStart.put(key, start); addToken(start); seenCharacters.put(key, Boolean.FALSE); } } } // Append any necessary end tokens for (Character key : seenCharacters.keySet()) { if (seenCharacters.get(key) == Boolean.TRUE) { Format end = new Format(key.charValue(), false); end.setMatched(true); addToken(end); } } nextChar = endChar; return true;
private boolean parseMusicTrack()
Looks for a music track (\u266B is first character, everything else is track info).
if (parseMusic && text.startsWith(musicNote)) { addToken(new MusicTrack(text.substring(musicNote.length()))); nextChar = text.length(); return true; } return false;
private boolean parseSmiley()
Looks for smileys (e.g., ":)") in the text. The set of known smileys is loaded from a file into a trie at server start.
if(!parseSmilies) { return false; } TrieNode match = longestMatch(getResources().getSmileys(), this, nextChar, true); if (match == null) { return false; } else { int previousCharClass = getCharClass(nextChar - 1); int nextCharClass = getCharClass(nextChar + match.getText().length()); if ((previousCharClass == 2 || previousCharClass == 3) && (nextCharClass == 2 || nextCharClass == 3)) { return false; } addToken(new Smiley(match.getText())); nextChar += match.getText().length(); return true; }
private void parseText()
Consumes all of the text in the next word .
StringBuilder buf = new StringBuilder(); int start = nextChar; do { char ch = text.charAt(nextChar++); switch (ch) { case '<": buf.append("<"); break; case '>": buf.append(">"); break; case '&": buf.append("&"); break; case '"": buf.append("""); break; case '\'": buf.append("'"); break; case '\n": buf.append("<br>"); break; default: buf.append(ch); break; } } while (!isWordBreak(nextChar)); addToken(new Html(text.substring(start, nextChar), buf.toString()));
private boolean parseURL()
Looks for a URL in two possible forms: either a proper URL with a known scheme or a domain name optionally followed by a path, query, or query.
// Make sure this is a valid place to start a URL. if (!parseUrls || !isURLBreak(nextChar)) { return false; } int start = nextChar; // Search for the first block of letters. int index = start; while ((index < text.length()) && isDomainChar(text.charAt(index))) { index += 1; } String url = ""; boolean done = false; if (index == text.length()) { return false; } else if (text.charAt(index) == ':") { // Make sure this is a known scheme. String scheme = text.substring(nextChar, index); if (!getResources().getSchemes().contains(scheme)) { return false; } } else if (text.charAt(index) == '.") { // Search for the end of the domain name. while (index < text.length()) { char ch = text.charAt(index); if ((ch != '.") && !isDomainChar(ch)) { break; } else { index += 1; } } // Make sure the domain name has a valid suffix. Since tries look for // prefix matches, we reverse all the strings to get suffix comparisons. String domain = text.substring(nextChar, index); if (!isValidDomain(domain)) { return false; } // Search for a port. We deal with this specially because a colon can // also be a punctuation character. if ((index + 1 < text.length()) && (text.charAt(index) == ':")) { char ch = text.charAt(index + 1); if (Character.isDigit(ch)) { index += 1; while ((index < text.length()) && Character.isDigit(text.charAt(index))) { index += 1; } } } // The domain name should be followed by end of line, whitespace, // punctuation, or a colon, slash, question, or hash character. The // tricky part here is that some URL characters are also punctuation, so // we need to distinguish them. Since we looked for ports above, a colon // is always punctuation here. To distinguish '?' cases, we look at the // character that follows it. if (index == text.length()) { done = true; } else { char ch = text.charAt(index); if (ch == '?") { // If the next character is whitespace or punctuation (or missing), // then this question mark looks like punctuation. 
if (index + 1 == text.length()) { done = true; } else { char ch2 = text.charAt(index + 1); if (Character.isWhitespace(ch2) || isPunctuation(ch2)) { done = true; } } } else if (isPunctuation(ch)) { done = true; } else if (Character.isWhitespace(ch)) { done = true; } else if ((ch == '/") || (ch == '#")) { // In this case, the URL is not done. We will search for the end of // it below. } else { return false; } } // We will assume the user meant HTTP. (One weird case is where they // type a port of 443. That could mean HTTPS, but they might also want // HTTP. We'll let them specify if they don't want HTTP.) url = "http://"; } else { return false; } // If the URL is not done, search for the end, which is just before the // next whitespace character. if (!done) { while ((index < text.length()) && !Character.isWhitespace(text.charAt(index))) { index += 1; } } String urlText = text.substring(start, index); url += urlText; // Figure out the appropriate token type. addURLToken(url, urlText); nextChar = index; return true;
protected static java.lang.String reverse(java.lang.String str)
Returns the reverse of the given string.
StringBuilder buf = new StringBuilder(); for (int i = str.length() - 1; i >= 0; --i) { buf.append(str.charAt(i)); } return buf.toString();
public java.lang.String toHtml()
Converts the entire message into a single HTML display string.
StringBuilder html = new StringBuilder(); for (Part part : parts) { boolean caps = false; html.append("<p>"); for (Token token : part.getTokens()) { if (token.isHtml()) { html.append(token.toHtml(caps)); } else { switch (token.getType()) { case LINK: html.append("<a href=\""); html.append(((Link)token).getURL()); html.append("\">"); html.append(token.getRawText()); html.append("</a>"); break; case SMILEY: // TODO: link to an appropriate image html.append(token.getRawText()); break; case ACRONYM: html.append(token.getRawText()); break; case MUSIC: // TODO: include a music glyph html.append(((MusicTrack)token).getTrack()); break; case GOOGLE_VIDEO: // TODO: include a Google Video icon html.append("<a href=\""); html.append(((Video)token).getURL(((Video)token).getDocID())); html.append("\">"); html.append(token.getRawText()); html.append("</a>"); break; case YOUTUBE_VIDEO: // TODO: include a YouTube icon html.append("<a href=\""); html.append(((YouTubeVideo)token).getURL( ((YouTubeVideo)token).getDocID())); html.append("\">"); html.append(token.getRawText()); html.append("</a>"); break; case PHOTO: { // TODO: include a Picasa Web icon html.append("<a href=\""); html.append(Photo.getAlbumURL( ((Photo)token).getUser(), ((Photo)token).getAlbum())); html.append("\">"); html.append(token.getRawText()); html.append("</a>"); break; } case FLICKR: // TODO: include a Flickr icon Photo p = (Photo) token; html.append("<a href=\""); html.append(((FlickrPhoto)token).getUrl()); html.append("\">"); html.append(token.getRawText()); html.append("</a>"); break; default: throw new AssertionError("unknown token type: " + token.getType()); } } if (token.controlCaps()) { caps = token.setCaps(); } } html.append("</p>\n"); } return html.toString();
public static com.google.android.util.AbstractMessageParser$Token tokenForUrl(java.lang.String url, java.lang.String text)
Get a the appropriate Token for a given URL
param
text the anchor text
param
url the url
if(url == null) { return null; } //Look for video links Video video = Video.matchURL(url, text); if (video != null) { return video; } // Look for video links. YouTubeVideo ytVideo = YouTubeVideo.matchURL(url, text); if (ytVideo != null) { return ytVideo; } // Look for photo links. Photo photo = Photo.matchURL(url, text); if (photo != null) { return photo; } // Look for photo links. FlickrPhoto flickrPhoto = FlickrPhoto.matchURL(url, text); if (flickrPhoto != null) { return flickrPhoto; } //Not media, so must be a regular URL return new Link(url, text);