1 /* 2 * The WMF licenses this file to you under the Apache License, Version 3 * 2.0 (the "License"); you may not use this file except in compliance 4 * with the License. You may obtain a copy of the License at 5 * 6 * http://www.apache.org/licenses/LICENSE-2.0 7 * 8 * Unless required by applicable law or agreed to in writing, software 9 * distributed under the License is distributed on an "AS IS" BASIS, 10 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 * See the License for the specific language governing permissions and 12 * limitations under the License. 13 * 14 * *** Source Information *** 15 * 16 * This implementation is based on the lucene-solr PatternReplaceCharFilter 17 * (v 7.5), which is licensed from ASF under the Apache License, Version 18 * 2.0. Source code is available here: 19 * 20 * https://github.com/apache/lucene-solr/blob/branch_7_5/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java 21 * 22 * ** Additional Changes ** 23 * 24 * - The generic pattern matching was replaced with Khmer-specific 25 * syllable-reordering logic. The input pattern to be matched is fixed (it 26 * defines a Khmer syllable, and is imported from KhmerSyllableReorderer), 27 * and the string to be replaced is computed at runtime by calling 28 * reorderKhmerSyllable() on the matched input syllable. All of the logic 29 * to compute offsets is retained from the original--thanks a lot for that! 30 * 31 * - Added a Khmer-specific MappingCharFilter before the local logic to 32 * replace or remove obsolete, deprecated, and variant characters. 33 * 34 * - Discarded some commented out code. 35 * 36 * - Updates to conform to findbugs/spotbugs/checkstyle errors. 37 * 38 */ 39 package org.wikimedia.search.extra.analysis.khmer; 40 41 import java.io.IOException; 42 import java.io.Reader; 43 import java.io.StringReader; 44 import java.util.regex.Matcher; 45 46 import javax.annotation.Nullable; 47 48 import org.apache.lucene.analysis.charfilter.BaseCharFilter; 49 import org.apache.lucene.analysis.charfilter.MappingCharFilter; 50 import org.apache.lucene.analysis.charfilter.NormalizeCharMap; 51 52 /** 53 * CharFilter that uses a regular expression (from KhmerSyllableReorderer) to 54 * match Khmer syllables and replace them with canonically reordered syllables. 55 * The pattern match will be done in each "block" in the char stream. 56 * 57 * NOTE: If the reordered syllable is a different length from the original 58 * source syllable and the field is used for highlighting, there could be some 59 * offset mismatches or mistakes. This is less likely than in the original pattern 60 * replace char filter because the first character of the syllable stays the 61 * same and is unlikely to be discarded by the rest of the analysis chain, likely 62 * providing a reasonable marker for later token offsets. 63 * 64 */ 65 public class KhmerCharFilter extends BaseCharFilter { 66 67 @Nullable private Reader transformedInput; 68 69 // define deprecated characters to remap, for external use with MappingCharFilter. 70 static final NormalizeCharMap KHMER_NORM_MAP = initKhmerNormMap(); 71 public KhmerCharFilter(Reader in) { 72 // internally, use a mapping char filter to handle replacing or removing 73 // deprecated and obsolete characters, and--most importantly--handle all 74 // the offset correction bookkeeping. 75 super(new MappingCharFilter(KHMER_NORM_MAP, in)); 76 } 77 78 @Override 79 public int read(char[] cbuf, int off, int len) throws IOException { 80 // Buffer all input on the first call. 81 if (transformedInput == null) { 82 fill(); 83 } 84 85 return transformedInput.read(cbuf, off, len); 86 } 87 88 @Override 89 public int read() throws IOException { 90 if (transformedInput == null) { 91 fill(); 92 } 93 94 return transformedInput.read(); 95 } 96 97 // map deprecated, obsolete, and variant characters to more typical/modern varieties 98 // before doing syllable reordering. 99 private static NormalizeCharMap initKhmerNormMap() { 100 NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); 101 102 builder.add("\u17A3", "\u17A2"); // deprecated indep vowel ឣ → អ 103 builder.add("\u17A4", "\u17A2\u17B6"); // deprecated indep vowel digraph ឤ → អា 104 builder.add("\u17A8", "\u17A7\u1780"); // obsolete ligature ឨ → ឧក 105 builder.add("\u17B2", "\u17B1"); // replace ឲ as a variant of ឱ 106 builder.add("\u17B4", ""); // delete non-visible inherent vowel (឴) 107 builder.add("\u17B5", ""); // delete non-visible inherent vowel (឵) 108 builder.add("\u17D3", "\u17C6"); // deprecated BATHAMASAT ៓ → NIKAHIT ំ 109 builder.add("\u17D8", "\u17D4\u179B\u17D4"); // deprecated trigraph ៘ → ។ល។ 110 builder.add("\u17DD", "\u17D1"); // obsolete ATTHACAN ៝ → VIRIAM ៑ 111 112 return builder.build(); 113 } 114 115 private void fill() throws IOException { 116 StringBuilder buffered = new StringBuilder(); 117 char[] temp = new char[1024]; 118 for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) { 119 buffered.append(temp, 0, cnt); 120 } 121 transformedInput = new StringReader(processPattern(buffered).toString()); 122 } 123 124 @Override 125 protected int correct(int currentOff) { 126 return Math.max(0, super.correct(currentOff)); 127 } 128 129 /** 130 * Replace pattern in input and mark correction offsets. 131 */ 132 CharSequence processPattern(CharSequence input) { 133 final Matcher m = KhmerSyllableReorderer.SYLL_PAT.matcher(input); 134 135 // Once we get to Java 9 or higher, StringBuffer should be replaced with StringBuilder 136 final StringBuffer cumulativeOutput = new StringBuffer(); 137 int cumulative = 0; 138 int lastMatchEnd = 0; 139 while (m.find()) { 140 final int groupSize = m.end() - m.start(); 141 final int skippedSize = m.start() - lastMatchEnd; 142 lastMatchEnd = m.end(); 143 final String replacement = KhmerSyllableReorderer.reorderKhmerSyllable(m.group()); 144 assert !replacement.contains("\\") && !replacement.contains("$") : 145 "KhmerSyllableReorderer.reorderKhmerSyllable() must not produce a string containing $ or \\"; 146 147 final int lengthBeforeReplacement = cumulativeOutput.length() + skippedSize; 148 m.appendReplacement(cumulativeOutput, replacement); 149 // Matcher doesn't tell us how many characters have been appended before the replacement. 150 // So we need to calculate it. Skipped characters have been added as part of appendReplacement. 151 final int replacementSize = cumulativeOutput.length() - lengthBeforeReplacement; 152 153 if (groupSize != replacementSize) { 154 if (replacementSize < groupSize) { 155 // The replacement is smaller. 156 // Add the 'backskip' to the next index after the replacement (this is possibly 157 // after the end of string, but it's fine -- it just means the last character 158 // of the replaced block doesn't reach the end of the original string. 159 cumulative += groupSize - replacementSize; 160 int atIndex = lengthBeforeReplacement + replacementSize; 161 addOffCorrectMap(atIndex, cumulative); 162 } else { 163 // The replacement is larger. Every new index needs to point to the last 164 // element of the original group (if any). 165 // 166 // NOTE: This *shouldn't* happen because we only reorder and discard 167 // characters, but why throw away perfectly good code that could prevent 168 // an unanticipated catastrophe? 169 for (int i = groupSize; i < replacementSize; i++) { 170 addOffCorrectMap(lengthBeforeReplacement + i, --cumulative); 171 } 172 } 173 } 174 } 175 176 // Append the remaining output, no further changes to indices. 177 m.appendTail(cumulativeOutput); 178 return cumulativeOutput; 179 } 180 }