View Javadoc
1   /*
2    * The WMF licenses this file to you under the Apache License, Version
3    * 2.0 (the "License"); you may not use this file except in compliance
4    * with the License. You may obtain a copy of the License at
5    *
6    *      http://www.apache.org/licenses/LICENSE-2.0
7    *
8    * Unless required by applicable law or agreed to in writing, software
9    * distributed under the License is distributed on an "AS IS" BASIS,
10   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   * See the License for the specific language governing permissions and
12   * limitations under the License.
13   */
14  package org.wikimedia.search.extra.analysis.textify;
15  
16  import static java.util.Collections.unmodifiableMap;
17  
18  import java.util.Arrays;
19  import java.util.HashMap;
20  import java.util.List;
21  import java.util.ListIterator;
22  import java.util.Map;
23  
24  import javax.annotation.Nullable;
25  
26  import org.apache.lucene.analysis.standard.StandardTokenizer;
27  
28  import com.google.common.collect.HashBasedTable;
29  import com.google.common.collect.Table;
30  import com.ibm.icu.lang.UScript;
31  
32  public final class TextifyUtils {
33  
34      private TextifyUtils() {}
35  
36      // reverse map of token type strings ("<NUM>") to ids (NUM == 1)
37      private static final Map<String, Integer> TOKEN_TYPE_STR2INT = initTokenTypeMappings();
38  
39      /* Types used by Lucene tokenizers are just strings. The Lucene 8.7.0 Standard Tokenizer
40       * list is here:
41       *   https://github.com/apache/lucene/blob/releases/lucene-solr/8.7.0/lucene/core/src/java/
42       *   org/apache/lucene/analysis/standard/StandardTokenizer.java#L61
43       *
44       * The default ICU tokenizer config inherits these types:
45       *   https://github.com/apache/lucene/blob/releases/lucene-solr/8.7.0/lucene/analysis/icu/
46       *   src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java#L42
47       */
48      protected static final int TOKEN_TYPE_ALPHANUM = StandardTokenizer.ALPHANUM;
49      protected static final int TOKEN_TYPE_NUM = StandardTokenizer.NUM;
50      protected static final int TOKEN_TYPE_SOUTHEAST_ASIAN = StandardTokenizer.SOUTHEAST_ASIAN;
51      protected static final int TOKEN_TYPE_IDEOGRAPHIC = StandardTokenizer.IDEOGRAPHIC;
52      protected static final int TOKEN_TYPE_HIRAGANA = StandardTokenizer.HIRAGANA;
53      protected static final int TOKEN_TYPE_KATAKANA = StandardTokenizer.KATAKANA;
54      protected static final int TOKEN_TYPE_HANGUL = StandardTokenizer.HANGUL;
55      protected static final int TOKEN_TYPE_EMOJI = StandardTokenizer.EMOJI;
56      protected static final int TOKEN_TYPE_OTHER = -1;
57      protected static final String TOKEN_TYPE_OTHER_WORD = "<OTHER>";
58  
59      protected static boolean isLetterType(int type) {
60          switch (type) {
61              case Character.LOWERCASE_LETTER:
62              case Character.UPPERCASE_LETTER:
63              case Character.TITLECASE_LETTER:
64              case Character.OTHER_LETTER:
65              case Character.MODIFIER_LETTER:
66                  return true;
67              default:
68                  return false;
69          }
70      }
71  
72      protected static boolean isMarkOrFormatType(int type) {
73          switch (type) {
74              case Character.FORMAT:
75              case Character.COMBINING_SPACING_MARK:
76              case Character.NON_SPACING_MARK:
77              case Character.ENCLOSING_MARK:
78                  return true;
79              default:
80                  return false;
81          }
82      }
83  
84      protected static boolean isPeriodlikeChar(int c) {
85          switch (c) {
86              case '.': // regular period/full stop
87              case '.': // CJK fullwidth period/full stop
88                  return true;
89              default:
90                  return false;
91          }
92      }
93  
94      /* Does the character look uppercase on the leading edge? (TitleCase characters—
95       * like Lj, Nj, or Dž—lead upper and trail lower.)
96       */
97      protected static boolean isLeadingUppercaseishType(int type) {
98          switch (type) {
99              case Character.UPPERCASE_LETTER:
100             case Character.TITLECASE_LETTER:
101                 return true;
102             default:
103                 return false;
104         }
105     }
106 
107     /* Does the character look lowercase on the trailing edge? (TitleCase characters—
108      * like Lj, Nj, or Dž—lead upper and trail lower.)
109      */
110     protected static boolean isTrailingLowercaseishType(int type) {
111         switch (type) {
112             case Character.LOWERCASE_LETTER:
113             case Character.TITLECASE_LETTER:
114                 return true;
115             default:
116                 return false;
117         }
118     }
119 
120     protected static int getCustomCharType(int c) {
121         if (c == 0x2069) { // treat POP DIRECTIONAL ISOLATE (U+2069) as formatting
122             return Character.FORMAT;
123         }
124         return Character.getType(c);
125     }
126 
127     protected static String getTokenTypeName(int typeInt) {
128         switch (typeInt) {
129             case TOKEN_TYPE_OTHER:
130                 return TOKEN_TYPE_OTHER_WORD;
131             default:
132                 return StandardTokenizer.TOKEN_TYPES[typeInt];
133         }
134     }
135 
136     protected static int getTokenType(String typeStr) {
137         return TOKEN_TYPE_STR2INT.getOrDefault(typeStr, TOKEN_TYPE_OTHER);
138     }
139 
140     private static Map<String, Integer> initTokenTypeMappings() {
141         Map<String, Integer> map = new HashMap<>();
142         for (int i = 0; i < StandardTokenizer.TOKEN_TYPES.length; i++) {
143             map.put(StandardTokenizer.TOKEN_TYPES[i], i);
144         }
145         map.put(TOKEN_TYPE_OTHER_WORD, TOKEN_TYPE_OTHER);
146         return unmodifiableMap(map);
147     }
148 
149     // Parse string arguments for allowable ICUTokenRepair scripts.
150     // These are here to prevent circular dependencies elsewhere.
151 
152     /* Single string as input */
153     protected static Table<Integer, Integer, Boolean> parseICUTokenRepairScriptList(
154             String scriptGroups) {
155         if (scriptGroups.length() == 0) {
156             return parseICUTokenRepairScriptList((List<String>) null);
157         }
158         return parseICUTokenRepairScriptList(Arrays.asList(scriptGroups.split(", *")));
159     }
160 
161     /* list of strings as input */
162     protected static Table<Integer, Integer, Boolean> parseICUTokenRepairScriptList(
163             @Nullable List<String> listOfScriptGroups) {
164         if (listOfScriptGroups == null || listOfScriptGroups.isEmpty()) {
165             return HashBasedTable.create();
166         }
167 
168         ListIterator<String> iter = listOfScriptGroups.listIterator();
169         Table<Integer, Integer, Boolean> scriptTable = HashBasedTable.create();
170 
171         while (iter.hasNext()) {
172             String[] group = iter.next().split("\\+");
173             int glen = group.length;
174             int[] groupCode = new int[glen];
175 
176             for (int i = 0; i < glen; i++) {
177                 if (isJpanScriptName(group[i])) {
178                     group[i] = "Jpan";
179                 }
180                 groupCode[i] = UScript.getCodeFromName(group[i]);
181                 if (groupCode[i] == UScript.INVALID_CODE) {
182                     throw new IllegalArgumentException("ICU Token Repair invalid argument: " +
183                         "unrecognized script " + group[i]);
184                 }
185             }
186 
187             for (int i = 0; i < glen; i++) {
188                 for (int j = i + 1; j < glen; j++) {
189                     // insert both orders into the table to make lookup faster
190                     scriptTable.put(groupCode[i], groupCode[j], Boolean.TRUE);
191                     scriptTable.put(groupCode[j], groupCode[i], Boolean.TRUE);
192                 }
193             }
194         }
195 
196         return scriptTable;
197     }
198 
199     private static boolean isJpanScriptName(String scr) {
200         // Tokens marked as "Chinese/Japanese" in explain output are internally "Jpan"
201         // both getName() and getShortName() return "Jpan". Allow "Chinese/Japanese",
202         // "Chinese", and "Japanese" as alternatives to "Jpan" in config.
203         switch (scr) {
204             case "Chinese":
205             case "Japanese":
206             case "Chinese/Japanese":
207                 return true;
208             default:
209                 return false;
210         }
211     }
212 
213 }