View Javadoc
1   /*
2    * The WMF licenses this file to you under the Apache License, Version
3    * 2.0 (the "License"); you may not use this file except in compliance
4    * with the License. You may obtain a copy of the License at
5    *
6    *      http://www.apache.org/licenses/LICENSE-2.0
7    *
8    * Unless required by applicable law or agreed to in writing, software
9    * distributed under the License is distributed on an "AS IS" BASIS,
10   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   * See the License for the specific language governing permissions and
12   * limitations under the License.
13   */
14  package org.wikimedia.search.extra.analysis.textify;
15  
import static java.util.Collections.unmodifiableMap;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.charfilter.BaseCharFilter;
23  
24  public class LimitedMappingCharFilter extends BaseCharFilter {
25  
26      private final Map<Integer, Integer> oneCharMap;
27      private int outputCharCount;
28      private int cumulativeOffset;
29  
30      LimitedMappingCharFilter(Map<Integer, Integer> map, Reader in) {
31          super(in);
32  
33          for (Map.Entry<Integer, Integer> pair : map.entrySet()) {
34              if (pair.getKey() < 0 || pair.getKey() > 0xFFFF) {
35                  throw new IllegalArgumentException("mapping keys must be between 0 and 0xFFFF");
36              }
37              if (pair.getValue() < -1 || pair.getValue() > 0xFFFF) {
38                  throw new IllegalArgumentException("mapping values must be between -1 and 0xFFFF");
39              }
40          }
41  
42          oneCharMap = unmodifiableMap(map);
43      }
44  
45      @Override
46      public int read() throws IOException {
47          int c = 0;
48          boolean doneReading = false;
49  
50          while (!doneReading) {
51              c = input.read();
52              if (c == -1) {
53                  // sentinel value for end of input stream
54                  return c;
55              }
56              c = oneCharMap.getOrDefault(c, c);
57              if (c == -1) {
58                  // sentinel value for character to be deleted
59                  cumulativeOffset++;
60                  addOffCorrectMap(outputCharCount, cumulativeOffset);
61              } else {
62                  doneReading = true;
63              }
64          }
65          outputCharCount++;
66          return c;
67      }
68  
69      @Override
70      public int read(char[] cbuf, int offset, int len) throws IOException {
71          int charsRead = 0;
72          for (int i = offset; i < offset + len; i++) {
73              int c = read();
74              if (c == -1) {
75                  break;
76              }
77              cbuf[i] = (char) c;
78              charsRead++;
79          }
80  
81          return charsRead == 0 && len > 0 ? -1 : charsRead;
82      }
83  
84      @Override
85      public void reset() throws IOException {
86          input.reset();
87          outputCharCount = 0;
88          cumulativeOffset = 0;
89      }
90  
91  }