View Javadoc
1   /*
2    * The WMF licenses this file to you under the Apache License, Version
3    * 2.0 (the "License"); you may not use this file except in compliance
4    * with the License. You may obtain a copy of the License at
5    *
6    *      http://www.apache.org/licenses/LICENSE-2.0
7    *
8    * Unless required by applicable law or agreed to in writing, software
9    * distributed under the License is distributed on an "AS IS" BASIS,
10   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   * See the License for the specific language governing permissions and
12   * limitations under the License.
13   */
14  package org.wikimedia.search.extra.analysis.textify;
15  
16  import java.io.IOException;
17  import java.io.Reader;
18  import java.nio.BufferOverflowException;
19  
20  import org.apache.lucene.analysis.charfilter.BaseCharFilter;
21  
22  public class AcronymFixerCharFilter extends BaseCharFilter {
23      // possible states of current char-by-char pre-period acronym detection
24      private enum AcroState { NON_LETTER, ONE_LETTER, MULTI_LETTER }
25      private AcroState aState = AcroState.NON_LETTER;
26  
27      // possible states of post-period, buffered char-by-char acronym detection
28      private enum BuffState { BUFF_START, ONE_LETTER, NUKE_DOT, BUFF_STOP }
29  
30      // Buffer for post-period read-ahead characters. Don't expect more than 5,
31      // but allocate space for 25 anyway
32      private final CircleBuff buff = new CircleBuff(25);
33  
34      // 1-char buffer for low surrogate read-ahead
35      private int lowSurrogate = -1;
36  
37      private int outputCharCount;
38      private int cumulativeOffset;
39  
40      private boolean inputEnd;
41  
42      public AcronymFixerCharFilter(Reader in) {
43          super(in);
44      }
45  
46      private BuffState updateBState(int type, BuffState bState) {
47          if (TextifyUtils.isLetterType(type)) {
48              bState = (bState == BuffState.BUFF_START) ? BuffState.ONE_LETTER : BuffState.BUFF_STOP;
49          } else if (TextifyUtils.isMarkOrFormatType(type)) {
50              // do nothing (maintain post-period buffering state)
51          } else {
52              bState = (bState == BuffState.ONE_LETTER) ? BuffState.NUKE_DOT : BuffState.BUFF_STOP;
53          }
54  
55          if (buff.isFull() && bState != BuffState.NUKE_DOT) {
56              bState = BuffState.BUFF_STOP;
57          }
58          return bState;
59      }
60  
61      private BuffState inspectPostPeriod() throws IOException {
62          BuffState bState = BuffState.BUFF_START;
63  
64          // look ahead for post-period (letter + optional combining characters or invisibles)
65          // followed by a non-letter
66          while (bState == BuffState.BUFF_START || bState == BuffState.ONE_LETTER) {
67              int nextChar = input.read();
68              int nextType;
69  
70              if (nextChar == -1) {
71                  // end of input; mark it as a non-letter
72                  nextType = Character.UNASSIGNED;
73                  inputEnd = true;
74              } else {
75                  buff.put(nextChar);
76                  nextType = Character.getType(nextChar);
77              }
78  
79              if (Character.isHighSurrogate((char) nextChar)) {
80                  // if it's a low surrogate, it's out of order and everything is a
81                  // mess, so let it continue as a non-letter if it's a high surrogate,
82                  // we need to look for a low surrogate, and we'll have to buffer it,
83                  // so make sure we have room
84                  if (buff.isFull()) {
85                      bState = BuffState.BUFF_STOP;
86                      break;
87                  }
88                  int nextLowSurr = input.read();
89  
90                  if (nextLowSurr == -1) {
91                      // nextType remains SURROGATE
92                      inputEnd = true;
93                  } else {
94                      buff.put(nextLowSurr);
95                      if (Character.isLowSurrogate((char) nextLowSurr)) {
96                          nextType = Character.getType(Character.toCodePoint((char) nextChar,
97                              (char) nextLowSurr));
98                      }
99                  }
100 
101             }
102 
103             bState = updateBState(nextType, bState);
104         }
105 
106         return bState;
107     }
108 
109     private int readBuffOrInput() throws IOException {
110         if (!buff.isEmpty()) {
111             return buff.read();
112         }
113         if (inputEnd) {
114             return -1;
115         }
116         int c = input.read();
117         if (c == -1) {
118             inputEnd = true;
119         }
120         return c;
121     }
122 
123     private int getReadCharType(int c) throws IOException {
124         int type = TextifyUtils.getCustomCharType(c);
125 
126         if (Character.isHighSurrogate((char) c)) {
127             lowSurrogate = readBuffOrInput();
128             if (lowSurrogate != -1 && Character.isLowSurrogate((char) lowSurrogate)) {
129                 type = Character.getType(Character.toCodePoint((char) c,
130                     (char) lowSurrogate));
131             }
132         }
133         return type;
134     }
135 
136     @Override
137     public int read() throws IOException {
138         int c;
139 
140         if (lowSurrogate != -1) {
141             // catch up on stashed low surrogate
142             c = lowSurrogate;
143             lowSurrogate = -1;
144             outputCharCount++;
145             return c;
146         }
147 
148         c = readBuffOrInput();
149         if (c == -1) {
150             return c;
151         }
152 
153         int type = getReadCharType(c);
154 
155         if (TextifyUtils.isLetterType(type)) {
156             aState = (aState == AcroState.NON_LETTER) ? AcroState.ONE_LETTER : AcroState.MULTI_LETTER;
157         } else if (TextifyUtils.isPeriodlikeChar(c)) {
158             if (aState == AcroState.ONE_LETTER) {
159                 // if only one letter before period, check after period to see if it looks
160                 // like an acronym, walks like and acronym, and quacks like an acronym
161                 BuffState bState = inspectPostPeriod();
162 
163                 if (bState == BuffState.NUKE_DOT) {
164                     // Looks like an acronym, so skip this period and pull the
165                     // next character from the buffer, after updating offset map
166                     cumulativeOffset++;
167                     addOffCorrectMap(outputCharCount, cumulativeOffset);
168                     aState = AcroState.NON_LETTER;
169                     return read();
170                 }
171             }
172             aState = AcroState.NON_LETTER;
173         } else if (TextifyUtils.isMarkOrFormatType(type)) {
174             // do nothing (maintain acronym letter count state)
175         } else {
176             aState = AcroState.NON_LETTER;
177         }
178 
179         outputCharCount++;
180         return c;
181     }
182 
183     @Override
184     public int read(char[] cbuf, int offset, int len) throws IOException {
185         int charsRead = 0;
186         for (int i = offset; i < offset + len; i++) {
187             int c = read();
188             if (c == -1) {
189                 break;
190             }
191             cbuf[i] = (char) c;
192             charsRead++;
193         }
194 
195         return charsRead == 0 && len > 0 ? -1 : charsRead;
196     }
197 
198     @Override
199     public void reset() throws IOException {
200         input.reset();
201         outputCharCount = 0;
202         cumulativeOffset = 0;
203         aState = AcroState.NON_LETTER;
204         lowSurrogate = -1;
205         inputEnd = false;
206         buff.reset();
207     }
208 
209     // Small utility class for buffering post-period read-ahead characters.
210     private static final class CircleBuff {
211         private final int[] buffer;
212         private final int capacity;
213         private int size;
214         private int head;
215         private int tail = -1;
216 
217         private CircleBuff(int capacity) {
218             this.capacity = capacity;
219             buffer = new int[capacity];
220         }
221 
222         // put() throws BufferOverflowException if the buffer is full.
223         // The caller should check if the buffer is full before calling put().
224         private void put(int c) throws BufferOverflowException {
225             if (size == capacity) {
226                 throw new BufferOverflowException();
227             }
228             tail = (tail + 1) % capacity;
229             buffer[tail] = c;
230             size++;
231         }
232 
233         private int read() {
234             if (size < 1) {
235                 return -1;
236             }
237             int ret = buffer[head];
238             size--;
239             head = (head + 1) % capacity;
240             return ret;
241         }
242 
243         private boolean isEmpty() {
244             return (size == 0);
245         }
246 
247         private boolean isFull() {
248             return (size == capacity);
249         }
250 
251         private void reset() {
252             size = 0;
253             head = 0;
254             tail = -1;
255         }
256 
257     }
258 
259 }