View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import java.io.IOException;
4   import java.io.StringReader;
5   
6   import org.apache.lucene.analysis.BaseTokenStreamTestCase;
7   import org.apache.lucene.analysis.TokenStream;
8   import org.junit.Test;
9   
10  public class AcronymFixerCharFilterTest extends BaseTokenStreamTestCase {
11  
12      private TokenStream ezTokStream(String s) throws IOException {
13          return whitespaceMockTokenizer(new AcronymFixerCharFilter(new StringReader(s)));
14      }
15  
16      @Test
17      public void testSimpleLatinAcronymFixer() throws IOException {
18          assertTokenStreamContents(
19              ezTokStream("cat a.c.r.o.n.y.m .F.i.X.e.R. T.E.S.T. dog"),
20              new String[]{"cat", "acronym", ".FiXeR.", "TEST.", "dog"},
21              new int[]{0,  4, 18, 30, 39},  // start offsets
22              new int[]{3, 17, 29, 38, 42}); // end offsets
23      }
24  
25      @Test
26      public void testNonAcronymPeriods() throws IOException {
27          assertTokenStreamContents(
28              ezTokStream("example.org e.xa.m.ple.o.rg"),
29              new String[]{"example.org", "e.xa.m.ple.o.rg"},
30              new int[]{0,  12},  // start offsets
31              new int[]{11, 27}); // end offsets
32      }
33  
34      @Test
35      public void testNonLatinAcronymFixer() throws IOException {
36          // Latin, Greek, Cyrillic, Bengali, Devenagari, Khmer, Arabic, Latin
37          assertTokenStreamContents(
38              ezTokStream("Q.Σ.Д.অ.ऌ.ខ.ب.Z. α.κ.ρ.ω.ν.ύ.μ.ι.ο .а.к.р.о.н.и.м."),
39              new String[]{"QΣДঅऌខبZ.", "ακρωνύμιο", ".акроним."},
40              new int[]{0,  17, 35},  // start offsets
41              new int[]{16, 34, 50}); // end offsets
42  
43          // Hiragana, Thai, Hanzi
44          assertTokenStreamContents(
45              ezTokStream("う.ふ.ふ. ม.ป.ท. 淄.青.齊.登."),
46              new String[]{"うふふ.", "มปท.", "淄青齊登."},
47              new int[]{0,  7, 14},  // start offsets
48              new int[]{6, 13, 22}); // end offsets
49      }
50  
51      @Test
52      public void testAbugidaAcronymFixer() throws IOException {
53          // Devanagari, Bengali, Kannada, Myanmar
54          assertTokenStreamContents(
55              ezTokStream("के.ए.टि.ए. সা.সা.পূ. ಅ.ಸಂ.ಲಿ.ವ. ပ.အ.မ.ဖ."),
56              new String[]{"केएटिए.", "সাসাপূ.", "ಅಸಂಲಿವ.", "ပအမဖ."},
57              new int[]{0,  11, 21, 32},  // start offsets
58              new int[]{10, 20, 31, 40}); // end offsets
59      }
60  
61      @Test
62      public void testExtendedCharacterAcronymFixer() throws IOException {
63          assertTokenStreamContents(
64              ezTokStream("A.Ƙ.Ɣ.Ạ.Đ.À.Ἅ.ᾈ.Ԅ.Ԉ.Ԙ.A."),
65              new String[]{"AƘƔẠĐÀἍᾈԄԈԘA."},
66              new int[]{0},   // start offsets
67              new int[]{24}); // end offsets
68      }
69  
70      @Test
71      public void testCombiningChars() throws IOException {
72          assertTokenStreamContents(
73              ezTokStream(".X.X̄.X̆.X̣̥̐.X̮.X. A.⃞B.⃠C.ाD"),
74              new String[]{".XX̄X̆X̣̥̐X̮X.", "A⃞B⃠CाD"},
75              new int[]{0,  20},  // start offsets
76              new int[]{19, 30}); // end offsets
77      }
78  
79      @Test
80      public void testInvisibles() throws IOException {
81          assertTokenStreamContents(
82              ezTokStream("E\ufe01.F\u00ad.G\u202d.H\u202a.I\u200e.J\u202c." +
83              "K\u2069.L\u034f.M\u200c.N\u200d.O\u2060.P\u200b.Q."),
84                  // variation selector, soft hyphen, left-to-right override, left-
85                  // to-right embedding, left-to-right mark, pop directional formatting,
86                  // pop directional isolate, combining grapheme joiner, zero-width
87                  // non-joiner, zero-width joiner, word joiner, zero-width space
88              new String[]{"E\ufe01F\u00adG\u202dH\u202aI\u200eJ\u202c" +
89                           "K\u2069L\u034fM\u200cN\u200dO\u2060P\u200bQ."},
90              new int[]{0},   // start offsets
91              new int[]{38}); // end offsets
92      }
93  
94      @Test
95      public void testThirtyTwoBitAcronymFixer() throws IOException {
96          assertTokenStreamContents(
97              ezTokStream("A.𝐀.𝒜.𝔸.𝕬.𝖠.𝘼.𝙰.𝚪.𝜞.𝞒.𐐃.𐐅.𐐆.A."),
98              new String[]{"A𝐀𝒜𝔸𝕬𝖠𝘼𝙰𝚪𝜞𝞒𐐃𐐅𐐆A."},
99              new int[]{0},   // start offsets
100             new int[]{43}); // end offsets
101     }
102 
103     @Test
104     public void testEdgeCasesAcronymFixer() throws IOException {
105         // string begins or ends with periods
106         assertTokenStreamContents(
107             ezTokStream(".a.c.r.o.n.y.m."),
108             new String[]{".acronym."},
109             new int[]{0},   // start offsets
110             new int[]{15}); // end offsets
111 
112         // string begins or ends WITHOUT periods
113         assertTokenStreamContents(
114             ezTokStream("a.c.r.o.n.y.m"),
115             new String[]{"acronym"},
116             new int[]{0},   // start offsets
117             new int[]{13}); // end offsets
118     }
119 
120     @Test
121     public void testTitleCaseAcronymFixer() throws IOException {
122         assertTokenStreamContents(
123             ezTokStream("A.LJ.Lj.lj.A."),
124             new String[]{"ALJLjljA."},
125             new int[]{0},   // start offsets
126             new int[]{10}); // end offsets
127     }
128 
129     @Test
130     public void testFullwidthPeriodsAcronymFixer() throws IOException {
131         assertTokenStreamContents(
132             ezTokStream("A.c.r.o.n.y.m. A.c.r.o.n.y.m " +
133             ".X.X̄.X̆.X̣̥̐.X̮.X. A.⃞B.⃠C.ाD Q.Σ.Д.অ.ऌ.ខ. A.LJ.Lj.lj.A"),
134             new String[]{"Acronym.", "Acronym", ".XX̄X̆X̣̥̐X̮X.", "A⃞B⃠CाD", "QΣДঅऌខ.", "ALJLjljA"},
135             new int[]{0,  15, 29, 49, 60, 73},  // start offsets
136             new int[]{14, 28, 48, 59, 72, 82}); // end offsets
137     }
138 
139     @Test
140     public void testRidiculousAcronymFixer() throws IOException {
141         // test very many combining marks and/or invisibles
142         assertTokenStreamContents(
143             ezTokStream("A.ԉ⃞̤̆\u00ad.Lj⃠̥̂.x. x.a̸͓̬͙̅̀.b̵͕̿́͑̾̀͂͒́͛̒̊̓.c̴̛͔͊̏̈̓̋̈͆̚ͅ."),
144             new String[]{"Aԉ⃞̤̆\u00adLj⃠̥̂x.", "xa̸͓̬͙̅̀b̵͕̿́͑̾̀͂͒́͛̒̊̓c̴̛͔͊̏̈̓̋̈͆̚ͅ."},
145             new int[]{0,  16},  // start offsets
146             new int[]{15, 56}); // end offsets
147     }
148 
149     @Test
150     public void testCircleBuffCapacity() throws IOException {
151         // test the buff capacity right at 25. 24 should be fine. 25 will fill the
152         // buffer but not overflow, and 26 should be too many.
153         // 24 == 22 soft hyphens between "a." and "b.", plus "b." -- buff not full
154         assertTokenStreamContents(
155             ezTokStream("24 a.­­­­­­­­­­­­­­­­­­­­­­b."),
156             new String[]{"24", "a­­­­­­­­­­­­­­­­­­­­­­b."},
157             new int[]{0, 3},   // start offsets
158             new int[]{2, 29}); // end offsets
159 
160         // 25 == 23 soft hyphens between "a." and "b.", plus "b." -- buff full, but
161         // works
162         assertTokenStreamContents(
163             ezTokStream("25 a.­­­­­­­­­­­­­­­­­­­­­­­b."),
164             new String[]{"25", "a­­­­­­­­­­­­­­­­­­­­­­­b."},
165             new int[]{0, 3},   // start offsets
166             new int[]{2, 30}); // end offsets
167 
168         // 26 == 24 soft hyphens between "a." and "b.", plus "b." -- buff too small,
169         // fails gracefully
170         assertTokenStreamContents(
171             ezTokStream("26 a.­­­­­­­­­­­­­­­­­­­­­­­­­­b."),
172             new String[]{"26", "a.­­­­­­­­­­­­­­­­­­­­­­­­­­b."},
173             new int[]{0, 3},   // start offsets
174             new int[]{2, 33}); // end offsets
175 
176         // test way too many combining marks and/or invisibles (>>25). 50 soft
177         // hyphens between "a." and "b." is way, way too many and acronym fixing
178         // should fail, but gracefully
179         assertTokenStreamContents(
180             ezTokStream("50 a.­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­b."),
181             new String[]{"50", "a.­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­b."},
182             new int[]{0, 3},   // start offsets
183             new int[]{2, 57}); // end offsets
184     }
185 
186     @Test
187     public void testMiscNonAcronymicText() throws IOException { // these should all be unchanged
188         // Latin, Cyrillic, Hanzi, Hangul
189         assertTokenStreamContents(
190             ezTokStream("Wikipedia Википедию 维基百科 위키백과"),
191             new String[]{"Wikipedia", "Википедию", "维基百科", "위키백과"});
192 
193         // Armenian, Hebrew, Greek, Arabic, Georgian
194         assertTokenStreamContents(
195             ezTokStream("Վիքիպեդիա ויקיפדיה Βικιπαίδεια ويكيبيدي ვიკიპედია"),
196             new String[]{"Վիքիպեդիա", "ויקיפדיה", "Βικιπαίδεια", "ويكيبيدي", "ვიკიპედია"});
197 
198         // Devanagari, Thai, Tamil, Bengali, IPA
199         assertTokenStreamContents(
200             ezTokStream("विकिपीडिया วิกิพีเดีย விக்கிப்பீடியா উইকিপিডিয়া ˌwɪkɪˈpiːdiə"),
201             new String[]{"विकिपीडिया", "วิกิพีเดีย", "விக்கிப்பீடியா", "উইকিপিডিয়া", "ˌwɪkɪˈpiːdiə"});
202     }
203 
204 }