File Doc Category Size Date Package
TestStandardAnalyzer.java API Doc Apache Lucene 2.1.0 4861 Wed Feb 14 10:46:34 GMT 2007 org.apache.lucene.analysis

TestStandardAnalyzer

java.lang.Object
- junit.framework.Assert
  - junit.framework.TestCase

public class TestStandardAnalyzer extends TestCase

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Fields Summary
Constructors Summary
Methods Summary
public void assertAnalyzesTo(org.apache.lucene.analysis.Analyzer a, java.lang.String input, java.lang.String[] expected)
TokenStream ts = a.tokenStream("dummy", new StringReader(input)); for (int i = 0; i < expected.length; i++) { Token t = ts.next(); assertNotNull(t); assertEquals(expected[i], t.termText()); } assertNull(ts.next()); ts.close();
public void testStandard()
Analyzer a = new StandardAnalyzer(); // alphanumeric tokens assertAnalyzesTo(a, "B2B", new String[]{"b2b"}); assertAnalyzesTo(a, "2B", new String[]{"2b"}); // underscores are delimiters, but not in email addresses (below) assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"}); assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"}); // other delimiters: "-", "/", "," assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase" }); assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"}); assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"}); // internal apostrophes: O'Reilly, you're, O'Reilly's // possessives are actually removed by StardardFilter, not the tokenizer assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"}); assertAnalyzesTo(a, "you're", new String[]{"you're"}); assertAnalyzesTo(a, "she's", new String[]{"she"}); assertAnalyzesTo(a, "Jim's", new String[]{"jim"}); assertAnalyzesTo(a, "don't", new String[]{"don't"}); assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"}); // t and s had been stopwords in Lucene <= 2.0, which made it impossible // to correctly search for these terms: assertAnalyzesTo(a, "s-class", new String[]{"s", "class"}); assertAnalyzesTo(a, "t-com", new String[]{"t", "com"}); // 'a' is still a stopword: assertAnalyzesTo(a, "a-class", new String[]{"class"}); // company names assertAnalyzesTo(a, "AT&T", new String[]{"at&t"}); assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"}); // domain names assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org" }); // email addresses, possibly with underscores, periods, etc assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"}); assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"}); assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"}); // floating point, serial, model numbers, ip addresses, etc. // every other segment must have at least one digit assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"}); assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"}); assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"}); assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"}); // numbers assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"}); // various assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted" }); assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"}); assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"}); assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"}); // acronyms have their dots stripped assertAnalyzesTo(a, "U.S.A.", new String[]{ "usa" }); // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens. assertAnalyzesTo(a, "C++", new String[]{"c"}); assertAnalyzesTo(a, "C#", new String[]{"c"}); // Korean words assertAnalyzesTo(a, "�