View Javadoc
1   /*
2    * The WMF licenses this file to you under the Apache License, Version
3    * 2.0 (the "License"); you may not use this file except in compliance
4    * with the License. You may obtain a copy of the License at
5    *
6    *      http://www.apache.org/licenses/LICENSE-2.0
7    *
8    * Unless required by applicable law or agreed to in writing, software
9    * distributed under the License is distributed on an "AS IS" BASIS,
10   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   * See the License for the specific language governing permissions and
12   * limitations under the License.
13   *
14   * *** Source Information ***
15   *
16   * This implementation is based on the lucene-solr PatternReplaceCharFilter
17   * (v 7.5), which is licensed from ASF under the Apache License, Version
18   * 2.0. Source code is available here:
19   *
20   *   https://github.com/apache/lucene-solr/blob/branch_7_5/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
21   *
22   * ** Additional Changes **
23   *
24   * - The generic pattern matching was replaced with Khmer-specific
25   *   syllable-reordering logic. The input pattern to be matched is fixed (it
26   *   defines a Khmer syllable, and is imported from KhmerSyllableReorderer),
27   *   and the string to be replaced is computed at runtime by calling
28   *   reorderKhmerSyllable() on the matched input syllable. All of the logic
29   *   to compute offsets is retained from the original--thanks a lot for that!
30   *
31   * - Added a Khmer-specific MappingCharFilter before the local logic to
32   *   replace or remove obsolete, deprecated, and variant characters.
33   *
34   * - Discarded some commented out code.
35   *
36   * - Updates to resolve findbugs/spotbugs/checkstyle errors.
37   *
38   */
39  package org.wikimedia.search.extra.analysis.khmer;
40  
41  import java.io.IOException;
42  import java.io.Reader;
43  import java.io.StringReader;
44  import java.util.regex.Matcher;
45  
46  import javax.annotation.Nullable;
47  
48  import org.apache.lucene.analysis.charfilter.BaseCharFilter;
49  import org.apache.lucene.analysis.charfilter.MappingCharFilter;
50  import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
51  
52  /**
53   * CharFilter that uses a regular expression (from KhmerSyllableReorderer) to
54   * match Khmer syllables and replace them with canonically reordered syllables.
55   * The pattern match will be done in each "block" in the char stream.
56   *
57   * NOTE: If the reordered syllable is a different length from the original
58   * source syllable and the field is used for highlighting, there could be some
59   * offset mismatches or mistakes. This is less likely than in the original pattern
60   * replace char filter because the first character of the syllable stays the
61   * same and is unlikely to be discarded by the rest of the analysis chain, likely
62   * providing a reasonable marker for later token offsets.
63   *
64   */
65  public class KhmerCharFilter extends BaseCharFilter {
66  
67      @Nullable private Reader transformedInput;
68  
69      // define deprecated characters to remap, for external use with MappingCharFilter.
70      static final NormalizeCharMap KHMER_NORM_MAP = initKhmerNormMap();
71      public KhmerCharFilter(Reader in) {
72          // internally, use a mapping char filter to handle replacing or removing
73          // deprecated and obsolete characters, and--most importantly--handle all
74          // the offset correction bookkeeping.
75          super(new MappingCharFilter(KHMER_NORM_MAP, in));
76      }
77  
78      @Override
79      public int read(char[] cbuf, int off, int len) throws IOException {
80          // Buffer all input on the first call.
81          if (transformedInput == null) {
82              fill();
83          }
84  
85          return transformedInput.read(cbuf, off, len);
86      }
87  
88      @Override
89      public int read() throws IOException {
90          if (transformedInput == null) {
91              fill();
92          }
93  
94          return transformedInput.read();
95      }
96  
97      // map deprecated, obsolete, and variant characters to more typical/modern varieties
98      // before doing syllable reordering.
99      private static NormalizeCharMap initKhmerNormMap() {
100         NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
101 
102         builder.add("\u17A3", "\u17A2");             // deprecated indep vowel ឣ → អ
103         builder.add("\u17A4", "\u17A2\u17B6");       // deprecated indep vowel digraph ឤ → អា
104         builder.add("\u17A8", "\u17A7\u1780");       // obsolete ligature ឨ → ឧក
105         builder.add("\u17B2", "\u17B1");             // replace ឲ as a variant of ឱ
106         builder.add("\u17B4", "");                   // delete non-visible inherent vowel (឴)
107         builder.add("\u17B5", "");                   // delete non-visible inherent vowel (឵)
108         builder.add("\u17D3", "\u17C6");             // deprecated BATHAMASAT ៓ → NIKAHIT ំ
109         builder.add("\u17D8", "\u17D4\u179B\u17D4"); // deprecated trigraph ៘ → ។ល។
110         builder.add("\u17DD", "\u17D1");             // obsolete ATTHACAN ៝ → VIRIAM ៑
111 
112         return builder.build();
113     }
114 
115     private void fill() throws IOException {
116         StringBuilder buffered = new StringBuilder();
117         char[] temp = new char[1024];
118         for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
119             buffered.append(temp, 0, cnt);
120         }
121         transformedInput = new StringReader(processPattern(buffered).toString());
122     }
123 
124     @Override
125     protected int correct(int currentOff) {
126         return Math.max(0,  super.correct(currentOff));
127     }
128 
129     /**
130      * Replace pattern in input and mark correction offsets.
131      */
132     CharSequence processPattern(CharSequence input) {
133         final Matcher m = KhmerSyllableReorderer.SYLL_PAT.matcher(input);
134 
135         // Once we get to Java 9 or higher, StringBuffer should be replaced with StringBuilder
136         final StringBuffer cumulativeOutput = new StringBuffer();
137         int cumulative = 0;
138         int lastMatchEnd = 0;
139         while (m.find()) {
140             final int groupSize = m.end() - m.start();
141             final int skippedSize = m.start() - lastMatchEnd;
142             lastMatchEnd = m.end();
143             final String replacement = KhmerSyllableReorderer.reorderKhmerSyllable(m.group());
144             assert !replacement.contains("\\") && !replacement.contains("$") :
145                     "KhmerSyllableReorderer.reorderKhmerSyllable() must not produce a string containing $ or \\";
146 
147             final int lengthBeforeReplacement = cumulativeOutput.length() + skippedSize;
148             m.appendReplacement(cumulativeOutput, replacement);
149             // Matcher doesn't tell us how many characters have been appended before the replacement.
150             // So we need to calculate it. Skipped characters have been added as part of appendReplacement.
151             final int replacementSize = cumulativeOutput.length() - lengthBeforeReplacement;
152 
153             if (groupSize != replacementSize) {
154                 if (replacementSize < groupSize) {
155                     // The replacement is smaller.
156                     // Add the 'backskip' to the next index after the replacement (this is possibly
157                     // after the end of string, but it's fine -- it just means the last character
158                     // of the replaced block doesn't reach the end of the original string.
159                     cumulative += groupSize - replacementSize;
160                     int atIndex = lengthBeforeReplacement + replacementSize;
161                     addOffCorrectMap(atIndex, cumulative);
162                 } else {
163                     // The replacement is larger. Every new index needs to point to the last
164                     // element of the original group (if any).
165                     //
166                     //   NOTE: This *shouldn't* happen because we only reorder and discard
167                     //   characters, but why throw away perfectly good code that could prevent
168                     //   an unanticipated catastrophe?
169                     for (int i = groupSize; i < replacementSize; i++) {
170                         addOffCorrectMap(lengthBeforeReplacement + i, --cumulative);
171                     }
172                 }
173             }
174         }
175 
176         // Append the remaining output, no further changes to indices.
177         m.appendTail(cumulativeOutput);
178         return cumulativeOutput;
179     }
180 }