1 package org.wikimedia.search.extra.analysis.textify;
2
3 import java.io.IOException;
4 import java.io.StringReader;
5
6 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
7 import org.apache.lucene.analysis.TokenStream;
8 import org.junit.Test;
9
10 public class CamelCaseCharFilterTest extends BaseTokenStreamTestCase {
11
12 private TokenStream ezTokStream(String s) throws IOException {
13 return whitespaceMockTokenizer(new CamelCaseCharFilter(new StringReader(s)));
14 }
15
16 @Test
17 public void testSimpleLatinCamelCase() throws IOException {
18 assertTokenStreamContents(
19 ezTokStream("testSimpleLatinCamelCase"),
20 new String[]{"test", "Simple", "Latin", "Camel", "Case"},
21 new int[]{0, 4, 10, 15, 20},
22 new int[]{4, 10, 15, 20, 24});
23 }
24
25 @Test
26 public void testNonLatinAndMixedCamelCase() throws IOException {
27
28 assertTokenStreamContents(
29 ezTokStream("CamelՈւղտКамилаϪⲁⲙⲟⲩⲗΚαμήλαCamel"),
30 new String[]{"Camel", "Ուղտ", "Камила", "Ϫⲁⲙⲟⲩⲗ", "Καμήλα", "Camel"},
31 new int[]{0, 5, 9, 15, 21, 27},
32 new int[]{5, 9, 15, 21, 27, 32});
33 }
34
35 @Test
36 public void testExtendedCharacterCamelCase() throws IOException {
37 assertTokenStreamContents(
38 ezTokStream("AaƘƙƔɣẠạĐđÀàἍἅᾈᾀԄԅԈԉԘԙAa"),
39 new String[]{"Aa", "Ƙƙ", "Ɣɣ", "Ạạ", "Đđ", "Àà", "Ἅἅ", "ᾈᾀ", "Ԅԅ", "Ԉԉ", "Ԙԙ", "Aa"},
40 new int[]{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22},
41 new int[]{2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24});
42 }
43
44 @Test
45 public void testCombiningChars() throws IOException {
46 assertTokenStreamContents(
47 ezTokStream("Camel̄C̆ameḷ̥̐C̮amel Ax⃞Bx⃠CxाDx"),
48 new String[]{"Camel̄", "C̆ameḷ̥̐", "C̮amel", "Ax⃞", "Bx⃠", "Cxा", "Dx"},
49 new int[]{0, 6, 15, 22, 25, 28, 31, 33},
50 new int[]{6, 15, 21, 25, 28, 31, 33, 35});
51 }
52
53 @Test
54 public void testInvisibles() throws IOException {
55 assertTokenStreamContents(
56 ezTokStream("Ex\ufe01Fx\u00adGx\u202dHx\u202aIx\u200eJx\u202c" + "Kx\u2069Lx\u034fMx\u200cNx\u200dOx\u2060Px\u200bQx"),
57
58
59
60
61 new String[]{"Ex\ufe01", "Fx\u00ad", "Gx\u202d", "Hx\u202a", "Ix\u200e", "Jx\u202c",
62 "Kx\u2069", "Lx\u034f", "Mx\u200c", "Nx\u200d", "Ox\u2060", "Px\u200b",
63 "Qx"},
64 new int[]{0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36},
65 new int[]{3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 38});
66 }
67
68 @Test
69 public void testThirtyTwoBitCamelCase() throws IOException {
70 assertTokenStreamContents(
71 ezTokStream("Ax𝐀𝐱𝒜𝓍𝔸𝕩𝕬𝖝𝖠𝗑𝘼𝙭𝙰𝚡𝚪𝛟𝜞𝝋𝞒𝞿𐐃𐐫𐐅𐐭𐐆𐐮Ax"),
72 new String[]{"Ax", "𝐀𝐱", "𝒜𝓍", "𝔸𝕩", "𝕬𝖝", "𝖠𝗑", "𝘼𝙭", "𝙰𝚡",
73 "𝚪𝛟", "𝜞𝝋", "𝞒𝞿", "𐐃𐐫", "𐐅𐐭", "𐐆𐐮", "Ax"},
74 new int[]{0, 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54},
75 new int[]{2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 56});
76 }
77
78 @Test
79 public void testTitleCaseCamelCase() throws IOException {
80 assertTokenStreamContents(
81 ezTokStream("LJALJxLJ LjALjxLj ljAljxlj LJLJ LJLj LJlj LjLJ LjLj Ljlj ljLJ ljLj ljlj"),
82 new String[]{"LJALJx", "LJ", "Lj", "ALjx", "Lj", "lj", "Aljxlj", "LJLJ", "LJLj", "LJlj",
83 "Lj", "LJ", "Lj", "Lj", "Ljlj", "lj", "LJ", "lj", "Lj", "ljlj"},
84 new int[]{0, 4, 6, 7, 10, 12, 13, 18, 21, 24,
85 27, 28, 30, 31, 33, 36, 37, 39, 40, 42},
86 new int[]{4, 5, 7, 10, 11, 13, 17, 20, 23, 26,
87 28, 29, 31, 32, 35, 37, 38, 40, 41, 44});
88 }
89
90 @Test
91 public void testRidiculousCamelCase() throws IOException {
92 assertTokenStreamContents(
93 ezTokStream("Aԉ⃞̤̆\u00adLj⃠̥̂x"),
94 new String[]{"Aԉ⃞̤̆\u00ad", "Lj⃠̥̂x"},
95 new int[]{0, 6},
96 new int[]{6, 11});
97 }
98
99 @Test
100 public void testNonAlphabeticText() throws IOException {
101
102 assertTokenStreamContents(
103 ezTokStream("ウィキペディアうぃきぺでぃあ维基百科위키백과ויקיפדיהويكيبيدي" + "विकिपीडियाวิกิพีเดียவிக்கிப்பீடியாউইকিপিডিয়া"),
104 new String[]{"ウィキペディアうぃきぺでぃあ维基百科위키백과ויקיפדיהويكيبيدي" + "विकिपीडियाวิกิพีเดียவிக்கிப்பீடியாউইকিপিডিয়া"});
105 }
106
107 }