1 package org.wikimedia.search.extra.analysis.textify;
2
3 import java.io.IOException;
4 import java.io.StringReader;
5
6 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
7 import org.apache.lucene.analysis.TokenStream;
8 import org.junit.Test;
9
10 public class AcronymFixerCharFilterTest extends BaseTokenStreamTestCase {
11
12 private TokenStream ezTokStream(String s) throws IOException {
13 return whitespaceMockTokenizer(new AcronymFixerCharFilter(new StringReader(s)));
14 }
15
16 @Test
17 public void testSimpleLatinAcronymFixer() throws IOException {
18 assertTokenStreamContents(
19 ezTokStream("cat a.c.r.o.n.y.m .F.i.X.e.R. T.E.S.T. dog"),
20 new String[]{"cat", "acronym", ".FiXeR.", "TEST.", "dog"},
21 new int[]{0, 4, 18, 30, 39},
22 new int[]{3, 17, 29, 38, 42});
23 }
24
25 @Test
26 public void testNonAcronymPeriods() throws IOException {
27 assertTokenStreamContents(
28 ezTokStream("example.org e.xa.m.ple.o.rg"),
29 new String[]{"example.org", "e.xa.m.ple.o.rg"},
30 new int[]{0, 12},
31 new int[]{11, 27});
32 }
33
34 @Test
35 public void testNonLatinAcronymFixer() throws IOException {
36
37 assertTokenStreamContents(
38 ezTokStream("Q.Σ.Д.অ.ऌ.ខ.ب.Z. α.κ.ρ.ω.ν.ύ.μ.ι.ο .а.к.р.о.н.и.м."),
39 new String[]{"QΣДঅऌខبZ.", "ακρωνύμιο", ".акроним."},
40 new int[]{0, 17, 35},
41 new int[]{16, 34, 50});
42
43
44 assertTokenStreamContents(
45 ezTokStream("う.ふ.ふ. ม.ป.ท. 淄.青.齊.登."),
46 new String[]{"うふふ.", "มปท.", "淄青齊登."},
47 new int[]{0, 7, 14},
48 new int[]{6, 13, 22});
49 }
50
51 @Test
52 public void testAbugidaAcronymFixer() throws IOException {
53
54 assertTokenStreamContents(
55 ezTokStream("के.ए.टि.ए. সা.সা.পূ. ಅ.ಸಂ.ಲಿ.ವ. ပ.အ.မ.ဖ."),
56 new String[]{"केएटिए.", "সাসাপূ.", "ಅಸಂಲಿವ.", "ပအမဖ."},
57 new int[]{0, 11, 21, 32},
58 new int[]{10, 20, 31, 40});
59 }
60
61 @Test
62 public void testExtendedCharacterAcronymFixer() throws IOException {
63 assertTokenStreamContents(
64 ezTokStream("A.Ƙ.Ɣ.Ạ.Đ.À.Ἅ.ᾈ.Ԅ.Ԉ.Ԙ.A."),
65 new String[]{"AƘƔẠĐÀἍᾈԄԈԘA."},
66 new int[]{0},
67 new int[]{24});
68 }
69
70 @Test
71 public void testCombiningChars() throws IOException {
72 assertTokenStreamContents(
73 ezTokStream(".X.X̄.X̆.X̣̥̐.X̮.X. A.⃞B.⃠C.ाD"),
74 new String[]{".XX̄X̆X̣̥̐X̮X.", "A⃞B⃠CाD"},
75 new int[]{0, 20},
76 new int[]{19, 30});
77 }
78
79 @Test
80 public void testInvisibles() throws IOException {
81 assertTokenStreamContents(
82 ezTokStream("E\ufe01.F\u00ad.G\u202d.H\u202a.I\u200e.J\u202c." +
83 "K\u2069.L\u034f.M\u200c.N\u200d.O\u2060.P\u200b.Q."),
84
85
86
87
88 new String[]{"E\ufe01F\u00adG\u202dH\u202aI\u200eJ\u202c" +
89 "K\u2069L\u034fM\u200cN\u200dO\u2060P\u200bQ."},
90 new int[]{0},
91 new int[]{38});
92 }
93
94 @Test
95 public void testThirtyTwoBitAcronymFixer() throws IOException {
96 assertTokenStreamContents(
97 ezTokStream("A.𝐀.𝒜.𝔸.𝕬.𝖠.𝘼.𝙰.𝚪.𝜞.𝞒.𐐃.𐐅.𐐆.A."),
98 new String[]{"A𝐀𝒜𝔸𝕬𝖠𝘼𝙰𝚪𝜞𝞒𐐃𐐅𐐆A."},
99 new int[]{0},
100 new int[]{43});
101 }
102
103 @Test
104 public void testEdgeCasesAcronymFixer() throws IOException {
105
106 assertTokenStreamContents(
107 ezTokStream(".a.c.r.o.n.y.m."),
108 new String[]{".acronym."},
109 new int[]{0},
110 new int[]{15});
111
112
113 assertTokenStreamContents(
114 ezTokStream("a.c.r.o.n.y.m"),
115 new String[]{"acronym"},
116 new int[]{0},
117 new int[]{13});
118 }
119
120 @Test
121 public void testTitleCaseAcronymFixer() throws IOException {
122 assertTokenStreamContents(
123 ezTokStream("A.LJ.Lj.lj.A."),
124 new String[]{"ALJLjljA."},
125 new int[]{0},
126 new int[]{10});
127 }
128
129 @Test
130 public void testFullwidthPeriodsAcronymFixer() throws IOException {
131 assertTokenStreamContents(
132 ezTokStream("A.c.r.o.n.y.m. A.c.r.o.n.y.m " +
133 ".X.X̄.X̆.X̣̥̐.X̮.X. A.⃞B.⃠C.ाD Q.Σ.Д.অ.ऌ.ខ. A.LJ.Lj.lj.A"),
134 new String[]{"Acronym.", "Acronym", ".XX̄X̆X̣̥̐X̮X.", "A⃞B⃠CाD", "QΣДঅऌខ.", "ALJLjljA"},
135 new int[]{0, 15, 29, 49, 60, 73},
136 new int[]{14, 28, 48, 59, 72, 82});
137 }
138
139 @Test
140 public void testRidiculousAcronymFixer() throws IOException {
141
142 assertTokenStreamContents(
143 ezTokStream("A.ԉ⃞̤̆\u00ad.Lj⃠̥̂.x. x.a̸͓̬͙̅̀.b̵͕̿́͑̾̀͂͒́͛̒̊̓.c̴̛͔͊̏̈̓̋̈͆̚ͅ."),
144 new String[]{"Aԉ⃞̤̆\u00adLj⃠̥̂x.", "xa̸͓̬͙̅̀b̵͕̿́͑̾̀͂͒́͛̒̊̓c̴̛͔͊̏̈̓̋̈͆̚ͅ."},
145 new int[]{0, 16},
146 new int[]{15, 56});
147 }
148
149 @Test
150 public void testCircleBuffCapacity() throws IOException {
151
152
153
154 assertTokenStreamContents(
155 ezTokStream("24 a.b."),
156 new String[]{"24", "ab."},
157 new int[]{0, 3},
158 new int[]{2, 29});
159
160
161
162 assertTokenStreamContents(
163 ezTokStream("25 a.b."),
164 new String[]{"25", "ab."},
165 new int[]{0, 3},
166 new int[]{2, 30});
167
168
169
170 assertTokenStreamContents(
171 ezTokStream("26 a.b."),
172 new String[]{"26", "a.b."},
173 new int[]{0, 3},
174 new int[]{2, 33});
175
176
177
178
179 assertTokenStreamContents(
180 ezTokStream("50 a.b."),
181 new String[]{"50", "a.b."},
182 new int[]{0, 3},
183 new int[]{2, 57});
184 }
185
186 @Test
187 public void testMiscNonAcronymicText() throws IOException {
188
189 assertTokenStreamContents(
190 ezTokStream("Wikipedia Википедию 维基百科 위키백과"),
191 new String[]{"Wikipedia", "Википедию", "维基百科", "위키백과"});
192
193
194 assertTokenStreamContents(
195 ezTokStream("Վիքիպեդիա ויקיפדיה Βικιπαίδεια ويكيبيدي ვიკიპედია"),
196 new String[]{"Վիքիպեդիա", "ויקיפדיה", "Βικιπαίδεια", "ويكيبيدي", "ვიკიპედია"});
197
198
199 assertTokenStreamContents(
200 ezTokStream("विकिपीडिया วิกิพีเดีย விக்கிப்பீடியா উইকিপিডিয়া ˌwɪkɪˈpiːdiə"),
201 new String[]{"विकिपीडिया", "วิกิพีเดีย", "விக்கிப்பீடியா", "উইকিপিডিয়া", "ˌwɪkɪˈpiːdiə"});
202 }
203
204 }