View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import static java.util.Collections.emptySet;
4   import static java.util.Collections.unmodifiableSet;
5   
6   import java.util.Arrays;
7   import java.util.HashSet;
8   import java.util.List;
9   import java.util.Set;
10  
11  import javax.annotation.Nullable;
12  
13  import com.google.common.collect.HashBasedTable;
14  import com.google.common.collect.ImmutableTable;
15  import com.google.common.collect.Table;
16  
17  public final class ICUTokenRepairFilterConfig {
18  
19      static final int DEFAULT_MAX_TOK_LEN = 100;
20      static final int MIN_MAX_TOK_LEN = 2;
21      static final int MAX_MAX_TOK_LEN = 5000;
22      static final Boolean DEFAULT_KEEP_CAMEL_SPLIT = Boolean.TRUE;
23      static final Boolean DEFAULT_MERGE_NUM_ONLY = Boolean.FALSE;
24  
25      private static final Boolean DEFAULT_IS_TYPE_ALLOW_LIST = Boolean.FALSE;
26      private static final Set<Integer> DEFAULT_TYPE_DENYLIST_SET =
27          unmodifiableSet(new HashSet<>(Arrays.asList(
28              TextifyUtils.TOKEN_TYPE_EMOJI,
29              TextifyUtils.TOKEN_TYPE_IDEOGRAPHIC,
30              TextifyUtils.TOKEN_TYPE_HANGUL
31          )));
32  
33      private static final Boolean DEFAULT_FILTER_SCRIPTS = Boolean.TRUE;
34      private static final List<String> DEFAULT_SCRIPT_GROUPS =
35          Arrays.asList("Armenian+Coptic+Cyrillic+Greek+Latin", "Lao+Thai", "Latin+Tifinagh",
36              "Cherokee+Latin", "Gothic+Latin", "Canadian_Aboriginal+Latin");
37  
38      protected int maxTokenLength = DEFAULT_MAX_TOK_LEN;
39      protected boolean keepCamelSplit = DEFAULT_KEEP_CAMEL_SPLIT;
40      protected boolean mergeNumOnly = DEFAULT_MERGE_NUM_ONLY;
41      protected boolean isTypeAllowList = DEFAULT_IS_TYPE_ALLOW_LIST;
42      protected Set<Integer> mergeableTypes = DEFAULT_TYPE_DENYLIST_SET;
43      protected boolean filterScriptPairs = DEFAULT_FILTER_SCRIPTS;
44      @Nullable protected Table<Integer, Integer, Boolean> mergeableScriptPairs;
45  
46      public ICUTokenRepairFilterConfig() {
47          this(
48              DEFAULT_MAX_TOK_LEN,
49              DEFAULT_KEEP_CAMEL_SPLIT,
50              DEFAULT_MERGE_NUM_ONLY,
51              DEFAULT_IS_TYPE_ALLOW_LIST,
52              DEFAULT_TYPE_DENYLIST_SET,
53              DEFAULT_FILTER_SCRIPTS,
54              TextifyUtils.parseICUTokenRepairScriptList(DEFAULT_SCRIPT_GROUPS)
55          );
56      }
57  
58      public ICUTokenRepairFilterConfig(int maxTokLen, boolean keepCamSpl, boolean mrgNumOnly,
59              boolean isAllow, Set<Integer> typeSet, boolean filterScripts,
60              @Nullable Table<Integer, Integer, Boolean> scriptPairs) {
61          setMaxTokenLength(maxTokLen);
62          setKeepCamelSplit(keepCamSpl);
63          setMergeNumOnly(mrgNumOnly);
64          setTypeLimits(isAllow, typeSet);
65          setScriptLimits(filterScripts, scriptPairs);
66      }
67  
68      public void setKeepCamelSplit(boolean keepCamSpl) {
69          keepCamelSplit = keepCamSpl;
70      }
71  
72      public void setMergeNumOnly(boolean mrgNumOnly) {
73          mergeNumOnly = mrgNumOnly;
74      }
75  
76      public void setMaxTokenLength(int maxTokLen) {
77          if (maxTokLen < MIN_MAX_TOK_LEN || maxTokLen > MAX_MAX_TOK_LEN) {
78              throw new IllegalArgumentException("ICU Token Repair invalid argument: maximum " +
79                  "token length must be between " + MIN_MAX_TOK_LEN + " and " + MAX_MAX_TOK_LEN);
80          }
81          maxTokenLength = maxTokLen;
82      }
83  
84      public void setNoTypeLimits() {
85          setTypeLimits(false, emptySet());
86      }
87  
88      public void setTypeLimits(boolean isAllow, Set<Integer> typeSet) {
89          isTypeAllowList = isAllow;
90          mergeableTypes = unmodifiableSet(typeSet);
91      }
92  
93      public void setNoScriptLimits() {
94          filterScriptPairs = false;
95          mergeableScriptPairs = null;
96      }
97  
98      public void setScriptLimits(String scriptGroups) {
99          setScriptLimits(true, TextifyUtils.parseICUTokenRepairScriptList(scriptGroups));
100     }
101 
102     /* external input should be string-based, using parseICUTokenRepairScriptList */
103     protected void setScriptLimits(boolean filterScripts,
104             @Nullable Table<Integer, Integer, Boolean> scriptPairs) {
105         filterScriptPairs = filterScripts;
106         if (filterScriptPairs) {
107             if (scriptPairs == null) {
108                 scriptPairs = HashBasedTable.create();
109             }
110             mergeableScriptPairs = ImmutableTable.copyOf(scriptPairs);
111         } else {
112             mergeableScriptPairs = null;
113         }
114     }
115 
116 }