Analyzer a = new StandardAnalyzer();
// alphanumeric tokens
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
assertAnalyzesTo(a, "2B", new String[]{"2b"});
// underscores are delimiters, but not in email addresses (below)
assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
// other delimiters: "-", "/", ","
assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase" });
assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
// internal apostrophes: O'Reilly, you're, O'Reilly's
// possessives are actually removed by StandardFilter, not the tokenizer
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
assertAnalyzesTo(a, "you're", new String[]{"you're"});
assertAnalyzesTo(a, "she's", new String[]{"she"});
assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
assertAnalyzesTo(a, "don't", new String[]{"don't"});
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
// "t" and "s" were stopwords in Lucene <= 2.0, which made it impossible
// to search correctly for terms like these:
assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
// 'a' is still a stopword:
assertAnalyzesTo(a, "a-class", new String[]{"class"});
// company names
assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
// domain names
assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org" });
// email addresses, possibly with underscores, periods, etc
assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
// numbers
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
// various
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted" });
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
// acronyms have their dots stripped
assertAnalyzesTo(a, "U.S.A.", new String[]{ "usa" });
// It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
assertAnalyzesTo(a, "C++", new String[]{"c"});
assertAnalyzesTo(a, "C#", new String[]{"c"});
// Korean words
assertAnalyzesTo(a, "ì