FrenchAnalyzer fa = new FrenchAnalyzer();

// Passing a null reader is expected to raise IllegalArgumentException.
boolean iaeFlag = false;
try {
    fa.tokenStream("dummy", null);
} catch (IllegalArgumentException expected) {
    // This is the outcome under test; record that it happened.
    iaeFlag = true;
}
// Expected value goes first so a failure report labels the values correctly.
assertEquals(true, iaeFlag);

// Passing a null field name is expected to raise IllegalArgumentException.
iaeFlag = false;
try {
    fa.tokenStream(null, new StringReader("dummy"));
} catch (IllegalArgumentException expected) {
    // This is the outcome under test; record that it happened.
    iaeFlag = true;
}
// Expected value goes first so a failure report labels the values correctly.
assertEquals(true, iaeFlag);
// Empty input is expected to produce no tokens.
assertAnalyzesTo(fa, "", new String[] {});

// Plain lower-case words are expected to come through unchanged.
assertAnalyzesTo(fa, "chien chat cheval",
    new String[] { "chien", "chat", "cheval" });

// Upper-case input is expected to be lower-cased.
assertAnalyzesTo(fa, "chien CHAT CHEVAL",
    new String[] { "chien", "chat", "cheval" });

// Punctuation and operator characters between words are expected to be dropped.
assertAnalyzesTo(fa, " chien ,? + = - CHAT /: > CHEVAL",
    new String[] { "chien", "chat", "cheval" });

// Trailing '+' characters are not expected to be part of the token.
assertAnalyzesTo(fa, "chien++",
    new String[] { "chien" });

// Double quotes around a word are not expected to be part of the token.
assertAnalyzesTo(fa, "mot \"entreguillemet\"",
    new String[] { "mot", "entreguillemet" });
// French-specific checks start here.

/* 1. Hyphenated compound.
   The expected output splits the name at the hyphen into two terms.
   The original author noted they would rather see it kept as one term,
   since French often uses the hyphen to compose words. */
assertAnalyzesTo(fa, "Jean-François",
    new String[] { "jean", "françois" });

// 2. Stopwords: only the three nouns are expected to remain.
assertAnalyzesTo(fa, "le la chien les aux chat du des à cheval",
    new String[] { "chien", "chat", "cheval" });
// Nouns and adjectives: each word is expected to be reduced to the form listed.
assertAnalyzesTo(fa, "lances chismes habitable chiste éléments captifs",
    new String[] { "lanc", "chism", "habit", "chist", "élément", "captif" });

// Verbs: each conjugated form is expected to be reduced to the form listed.
assertAnalyzesTo(fa, "finissions souffrirent rugissante",
    new String[] { "fin", "souffr", "rug" });

// Mixed input: alphanumerics, an apostrophe, accented letters, a long adverb
// and trailing '+' characters. "aujourd'hui" is expected to stay a single
// term, which the original author considered acceptable.
assertAnalyzesTo(fa, "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
    new String[] { "c3po", "aujourd'hui", "oeuf", "ïâöûàä", "anticonstitutionnel", "jav" });

// More mixed input: "1940-1945" is expected as one term while "1940:1945" is
// expected as two. The original author flagged this asymmetry with a question mark.
assertAnalyzesTo(fa, "33Bis 1940-1945 1940:1945 (---i+++)*",
    new String[] { "33bis", "1940-1945", "1940", "1945", "i" });