View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import java.io.IOException;
4   import java.io.StringReader;
5   
6   import org.apache.lucene.analysis.BaseTokenStreamTestCase;
7   import org.apache.lucene.analysis.TokenStream;
8   import org.junit.Test;
9   
10  public class CamelCaseCharFilterTest extends BaseTokenStreamTestCase {
11  
12      private TokenStream ezTokStream(String s) throws IOException {
13          return whitespaceMockTokenizer(new CamelCaseCharFilter(new StringReader(s)));
14      }
15  
16      @Test
17      public void testSimpleLatinCamelCase() throws IOException {
18          assertTokenStreamContents(
19              ezTokStream("testSimpleLatinCamelCase"),
20              new String[]{"test", "Simple", "Latin", "Camel", "Case"},
21              new int[]{0,  4, 10, 15, 20},  // start offsets
22              new int[]{4, 10, 15, 20, 24}); // end offsets
23      }
24  
25      @Test
26      public void testNonLatinAndMixedCamelCase() throws IOException {
27          // Latin, Armenian, Cyrillic, Coptic, Greek, Latin
28          assertTokenStreamContents(
29              ezTokStream("CamelՈւղտКамилаϪⲁⲙⲟⲩⲗΚαμήλαCamel"),
30              new String[]{"Camel", "Ուղտ", "Камила", "Ϫⲁⲙⲟⲩⲗ", "Καμήλα", "Camel"},
31              new int[]{0, 5,  9, 15, 21, 27},  // start offsets
32              new int[]{5, 9, 15, 21, 27, 32}); // end offsets
33      }
34  
35      @Test
36      public void testExtendedCharacterCamelCase() throws IOException {
37          assertTokenStreamContents(
38              ezTokStream("AaƘƙƔɣẠạĐđÀàἍἅᾈᾀԄԅԈԉԘԙAa"),
39              new String[]{"Aa", "Ƙƙ", "Ɣɣ", "Ạạ", "Đđ", "Àà", "Ἅἅ", "ᾈᾀ", "Ԅԅ", "Ԉԉ", "Ԙԙ", "Aa"},
40              new int[]{0, 2, 4, 6,  8, 10, 12, 14, 16, 18, 20, 22},  // start offsets
41              new int[]{2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24}); // end offsets
42      }
43  
44      @Test
45      public void testCombiningChars() throws IOException {
46          assertTokenStreamContents(
47              ezTokStream("Camel̄C̆ameḷ̥̐C̮amel Ax⃞Bx⃠CxाDx"),
48              new String[]{"Camel̄", "C̆ameḷ̥̐", "C̮amel", "Ax⃞", "Bx⃠", "Cxा", "Dx"},
49              new int[]{0,  6, 15, 22, 25, 28, 31, 33},  // start offsets
50              new int[]{6, 15, 21, 25, 28, 31, 33, 35}); // end offsets
51      }
52  
53      @Test
54      public void testInvisibles() throws IOException {
55          assertTokenStreamContents(
56              ezTokStream("Ex\ufe01Fx\u00adGx\u202dHx\u202aIx\u200eJx\u202c" + "Kx\u2069Lx\u034fMx\u200cNx\u200dOx\u2060Px\u200bQx"),
57                  // variation selector, soft hyphen, left-to-right override, left-
58                  // to-right embedding, left-to-right mark, pop directional formatting,
59                  // pop directional isolate, combining grapheme joiner, zero-width
60                  // non-joiner, zero-width joiner, word joiner, zero-width space
61              new String[]{"Ex\ufe01", "Fx\u00ad", "Gx\u202d", "Hx\u202a", "Ix\u200e", "Jx\u202c",
62                           "Kx\u2069", "Lx\u034f", "Mx\u200c", "Nx\u200d", "Ox\u2060", "Px\u200b",
63                           "Qx"},
64              new int[]{0, 3, 6,  9, 12, 15, 18, 21, 24, 27, 30, 33, 36},  // start offsets
65              new int[]{3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 38}); // end offsets
66      }
67  
68      @Test
69      public void testThirtyTwoBitCamelCase() throws IOException {
70          assertTokenStreamContents(
71              ezTokStream("Ax𝐀𝐱𝒜𝓍𝔸𝕩𝕬𝖝𝖠𝗑𝘼𝙭𝙰𝚡𝚪𝛟𝜞𝝋𝞒𝞿𐐃𐐫𐐅𐐭𐐆𐐮Ax"),
72              new String[]{"Ax", "𝐀𝐱", "𝒜𝓍", "𝔸𝕩", "𝕬𝖝", "𝖠𝗑", "𝘼𝙭", "𝙰𝚡",
73                           "𝚪𝛟", "𝜞𝝋", "𝞒𝞿", "𐐃𐐫", "𐐅𐐭", "𐐆𐐮", "Ax"},
74              new int[]{0, 2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54},  // start offsets
75              new int[]{2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 56}); // end offsets
76      }
77  
78      @Test
79      public void testTitleCaseCamelCase() throws IOException {
80          assertTokenStreamContents(
81              ezTokStream("LJALJxLJ LjALjxLj ljAljxlj LJLJ LJLj LJlj LjLJ LjLj Ljlj ljLJ ljLj ljlj"),
82              new String[]{"LJALJx", "LJ", "Lj", "ALjx", "Lj",  "lj", "Aljxlj", "LJLJ", "LJLj", "LJlj",
83                           "Lj",    "LJ", "Lj", "Lj",   "Ljlj", "lj", "LJ",    "lj",  "Lj",  "ljlj"},
84              new int[]{0,   4,  6,  7, 10, 12, 13, 18, 21, 24,
85                        27, 28, 30, 31, 33, 36, 37, 39, 40, 42},  // start offsets
86              new int[]{4,   5,  7, 10, 11, 13, 17, 20, 23, 26,
87                        28, 29, 31, 32, 35, 37, 38, 40, 41, 44}); // end offsets
88      }
89  
90      @Test
91      public void testRidiculousCamelCase() throws IOException {
92          assertTokenStreamContents(
93              ezTokStream("Aԉ⃞̤̆\u00adLj⃠̥̂x"),
94              new String[]{"Aԉ⃞̤̆\u00ad", "Lj⃠̥̂x"},
95              new int[]{0,  6},  // start offsets
96              new int[]{6, 11}); // end offsets
97      }
98  
99      @Test
100     public void testNonAlphabeticText() throws IOException {
101         // Katakana, Hiragana, Hanzi, Hangul, Hebrew, Arabic, Devanagari, Thai, Tamil, Bengali
102         assertTokenStreamContents(
103             ezTokStream("ウィキペディアうぃきぺでぃあ维基百科위키백과ויקיפדיהويكيبيدي" + "विकिपीडियाวิกิพีเดียவிக்கிப்பீடியாউইকিপিডিয়া"),
104             new String[]{"ウィキペディアうぃきぺでぃあ维基百科위키백과ויקיפדיהويكيبيدي" + "विकिपीडियाวิกิพีเดียவிக்கிப்பீடியாউইকিপিডিয়া"}); // no change
105     }
106 
107 }