1 package org.wikimedia.search.extra.analysis.textify;
2
3 import static java.util.Collections.emptySet;
4 import static java.util.Collections.unmodifiableSet;
5
6 import java.util.Arrays;
7 import java.util.HashSet;
8 import java.util.List;
9 import java.util.Set;
10
11 import javax.annotation.Nullable;
12
13 import com.google.common.collect.HashBasedTable;
14 import com.google.common.collect.ImmutableTable;
15 import com.google.common.collect.Table;
16
17 public final class ICUTokenRepairFilterConfig {
18
19 static final int DEFAULT_MAX_TOK_LEN = 100;
20 static final int MIN_MAX_TOK_LEN = 2;
21 static final int MAX_MAX_TOK_LEN = 5000;
22 static final Boolean DEFAULT_KEEP_CAMEL_SPLIT = Boolean.TRUE;
23 static final Boolean DEFAULT_MERGE_NUM_ONLY = Boolean.FALSE;
24
25 private static final Boolean DEFAULT_IS_TYPE_ALLOW_LIST = Boolean.FALSE;
26 private static final Set<Integer> DEFAULT_TYPE_DENYLIST_SET =
27 unmodifiableSet(new HashSet<>(Arrays.asList(
28 TextifyUtils.TOKEN_TYPE_EMOJI,
29 TextifyUtils.TOKEN_TYPE_IDEOGRAPHIC,
30 TextifyUtils.TOKEN_TYPE_HANGUL
31 )));
32
33 private static final Boolean DEFAULT_FILTER_SCRIPTS = Boolean.TRUE;
34 private static final List<String> DEFAULT_SCRIPT_GROUPS =
35 Arrays.asList("Armenian+Coptic+Cyrillic+Greek+Latin", "Lao+Thai", "Latin+Tifinagh",
36 "Cherokee+Latin", "Gothic+Latin", "Canadian_Aboriginal+Latin");
37
38 protected int maxTokenLength = DEFAULT_MAX_TOK_LEN;
39 protected boolean keepCamelSplit = DEFAULT_KEEP_CAMEL_SPLIT;
40 protected boolean mergeNumOnly = DEFAULT_MERGE_NUM_ONLY;
41 protected boolean isTypeAllowList = DEFAULT_IS_TYPE_ALLOW_LIST;
42 protected Set<Integer> mergeableTypes = DEFAULT_TYPE_DENYLIST_SET;
43 protected boolean filterScriptPairs = DEFAULT_FILTER_SCRIPTS;
44 @Nullable protected Table<Integer, Integer, Boolean> mergeableScriptPairs;
45
46 public ICUTokenRepairFilterConfig() {
47 this(
48 DEFAULT_MAX_TOK_LEN,
49 DEFAULT_KEEP_CAMEL_SPLIT,
50 DEFAULT_MERGE_NUM_ONLY,
51 DEFAULT_IS_TYPE_ALLOW_LIST,
52 DEFAULT_TYPE_DENYLIST_SET,
53 DEFAULT_FILTER_SCRIPTS,
54 TextifyUtils.parseICUTokenRepairScriptList(DEFAULT_SCRIPT_GROUPS)
55 );
56 }
57
58 public ICUTokenRepairFilterConfig(int maxTokLen, boolean keepCamSpl, boolean mrgNumOnly,
59 boolean isAllow, Set<Integer> typeSet, boolean filterScripts,
60 @Nullable Table<Integer, Integer, Boolean> scriptPairs) {
61 setMaxTokenLength(maxTokLen);
62 setKeepCamelSplit(keepCamSpl);
63 setMergeNumOnly(mrgNumOnly);
64 setTypeLimits(isAllow, typeSet);
65 setScriptLimits(filterScripts, scriptPairs);
66 }
67
68 public void setKeepCamelSplit(boolean keepCamSpl) {
69 keepCamelSplit = keepCamSpl;
70 }
71
72 public void setMergeNumOnly(boolean mrgNumOnly) {
73 mergeNumOnly = mrgNumOnly;
74 }
75
76 public void setMaxTokenLength(int maxTokLen) {
77 if (maxTokLen < MIN_MAX_TOK_LEN || maxTokLen > MAX_MAX_TOK_LEN) {
78 throw new IllegalArgumentException("ICU Token Repair invalid argument: maximum " +
79 "token length must be between " + MIN_MAX_TOK_LEN + " and " + MAX_MAX_TOK_LEN);
80 }
81 maxTokenLength = maxTokLen;
82 }
83
84 public void setNoTypeLimits() {
85 setTypeLimits(false, emptySet());
86 }
87
88 public void setTypeLimits(boolean isAllow, Set<Integer> typeSet) {
89 isTypeAllowList = isAllow;
90 mergeableTypes = unmodifiableSet(typeSet);
91 }
92
93 public void setNoScriptLimits() {
94 filterScriptPairs = false;
95 mergeableScriptPairs = null;
96 }
97
98 public void setScriptLimits(String scriptGroups) {
99 setScriptLimits(true, TextifyUtils.parseICUTokenRepairScriptList(scriptGroups));
100 }
101
102
103 protected void setScriptLimits(boolean filterScripts,
104 @Nullable Table<Integer, Integer, Boolean> scriptPairs) {
105 filterScriptPairs = filterScripts;
106 if (filterScriptPairs) {
107 if (scriptPairs == null) {
108 scriptPairs = HashBasedTable.create();
109 }
110 mergeableScriptPairs = ImmutableTable.copyOf(scriptPairs);
111 } else {
112 mergeableScriptPairs = null;
113 }
114 }
115
116 }