1
2
3
4
5
6
7
8
9
10
11
12
13
14 package org.wikimedia.search.extra.analysis.textify;
15
16 import java.io.IOException;
17 import java.io.Reader;
18 import java.nio.BufferOverflowException;
19
20 import org.apache.lucene.analysis.charfilter.BaseCharFilter;
21
22 public class AcronymFixerCharFilter extends BaseCharFilter {
23
24 private enum AcroState { NON_LETTER, ONE_LETTER, MULTI_LETTER }
25 private AcroState aState = AcroState.NON_LETTER;
26
27
28 private enum BuffState { BUFF_START, ONE_LETTER, NUKE_DOT, BUFF_STOP }
29
30
31
32 private final CircleBuff buff = new CircleBuff(25);
33
34
35 private int lowSurrogate = -1;
36
37 private int outputCharCount;
38 private int cumulativeOffset;
39
40 private boolean inputEnd;
41
42 public AcronymFixerCharFilter(Reader in) {
43 super(in);
44 }
45
46 private BuffState updateBState(int type, BuffState bState) {
47 if (TextifyUtils.isLetterType(type)) {
48 bState = (bState == BuffState.BUFF_START) ? BuffState.ONE_LETTER : BuffState.BUFF_STOP;
49 } else if (TextifyUtils.isMarkOrFormatType(type)) {
50
51 } else {
52 bState = (bState == BuffState.ONE_LETTER) ? BuffState.NUKE_DOT : BuffState.BUFF_STOP;
53 }
54
55 if (buff.isFull() && bState != BuffState.NUKE_DOT) {
56 bState = BuffState.BUFF_STOP;
57 }
58 return bState;
59 }
60
61 private BuffState inspectPostPeriod() throws IOException {
62 BuffState bState = BuffState.BUFF_START;
63
64
65
66 while (bState == BuffState.BUFF_START || bState == BuffState.ONE_LETTER) {
67 int nextChar = input.read();
68 int nextType;
69
70 if (nextChar == -1) {
71
72 nextType = Character.UNASSIGNED;
73 inputEnd = true;
74 } else {
75 buff.put(nextChar);
76 nextType = Character.getType(nextChar);
77 }
78
79 if (Character.isHighSurrogate((char) nextChar)) {
80
81
82
83
84 if (buff.isFull()) {
85 bState = BuffState.BUFF_STOP;
86 break;
87 }
88 int nextLowSurr = input.read();
89
90 if (nextLowSurr == -1) {
91
92 inputEnd = true;
93 } else {
94 buff.put(nextLowSurr);
95 if (Character.isLowSurrogate((char) nextLowSurr)) {
96 nextType = Character.getType(Character.toCodePoint((char) nextChar,
97 (char) nextLowSurr));
98 }
99 }
100
101 }
102
103 bState = updateBState(nextType, bState);
104 }
105
106 return bState;
107 }
108
109 private int readBuffOrInput() throws IOException {
110 if (!buff.isEmpty()) {
111 return buff.read();
112 }
113 if (inputEnd) {
114 return -1;
115 }
116 int c = input.read();
117 if (c == -1) {
118 inputEnd = true;
119 }
120 return c;
121 }
122
123 private int getReadCharType(int c) throws IOException {
124 int type = TextifyUtils.getCustomCharType(c);
125
126 if (Character.isHighSurrogate((char) c)) {
127 lowSurrogate = readBuffOrInput();
128 if (lowSurrogate != -1 && Character.isLowSurrogate((char) lowSurrogate)) {
129 type = Character.getType(Character.toCodePoint((char) c,
130 (char) lowSurrogate));
131 }
132 }
133 return type;
134 }
135
136 @Override
137 public int read() throws IOException {
138 int c;
139
140 if (lowSurrogate != -1) {
141
142 c = lowSurrogate;
143 lowSurrogate = -1;
144 outputCharCount++;
145 return c;
146 }
147
148 c = readBuffOrInput();
149 if (c == -1) {
150 return c;
151 }
152
153 int type = getReadCharType(c);
154
155 if (TextifyUtils.isLetterType(type)) {
156 aState = (aState == AcroState.NON_LETTER) ? AcroState.ONE_LETTER : AcroState.MULTI_LETTER;
157 } else if (TextifyUtils.isPeriodlikeChar(c)) {
158 if (aState == AcroState.ONE_LETTER) {
159
160
161 BuffState bState = inspectPostPeriod();
162
163 if (bState == BuffState.NUKE_DOT) {
164
165
166 cumulativeOffset++;
167 addOffCorrectMap(outputCharCount, cumulativeOffset);
168 aState = AcroState.NON_LETTER;
169 return read();
170 }
171 }
172 aState = AcroState.NON_LETTER;
173 } else if (TextifyUtils.isMarkOrFormatType(type)) {
174
175 } else {
176 aState = AcroState.NON_LETTER;
177 }
178
179 outputCharCount++;
180 return c;
181 }
182
183 @Override
184 public int read(char[] cbuf, int offset, int len) throws IOException {
185 int charsRead = 0;
186 for (int i = offset; i < offset + len; i++) {
187 int c = read();
188 if (c == -1) {
189 break;
190 }
191 cbuf[i] = (char) c;
192 charsRead++;
193 }
194
195 return charsRead == 0 && len > 0 ? -1 : charsRead;
196 }
197
198 @Override
199 public void reset() throws IOException {
200 input.reset();
201 outputCharCount = 0;
202 cumulativeOffset = 0;
203 aState = AcroState.NON_LETTER;
204 lowSurrogate = -1;
205 inputEnd = false;
206 buff.reset();
207 }
208
209
210 private static final class CircleBuff {
211 private final int[] buffer;
212 private final int capacity;
213 private int size;
214 private int head;
215 private int tail = -1;
216
217 private CircleBuff(int capacity) {
218 this.capacity = capacity;
219 buffer = new int[capacity];
220 }
221
222
223
224 private void put(int c) throws BufferOverflowException {
225 if (size == capacity) {
226 throw new BufferOverflowException();
227 }
228 tail = (tail + 1) % capacity;
229 buffer[tail] = c;
230 size++;
231 }
232
233 private int read() {
234 if (size < 1) {
235 return -1;
236 }
237 int ret = buffer[head];
238 size--;
239 head = (head + 1) % capacity;
240 return ret;
241 }
242
243 private boolean isEmpty() {
244 return (size == 0);
245 }
246
247 private boolean isFull() {
248 return (size == capacity);
249 }
250
251 private void reset() {
252 size = 0;
253 head = 0;
254 tail = -1;
255 }
256
257 }
258
259 }