1 package org.wikimedia.search.extra.analysis.textify;
2
3 import static java.util.Collections.emptySet;
4 import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterConfig.DEFAULT_MAX_TOK_LEN;
5 import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterConfig.DEFAULT_KEEP_CAMEL_SPLIT;
6 import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterConfig.DEFAULT_MERGE_NUM_ONLY;
7
8 import java.util.HashSet;
9 import java.util.List;
10 import java.util.ListIterator;
11 import java.util.Locale;
12 import java.util.Set;
13
14 import javax.annotation.Nullable;
15
16 import org.apache.lucene.analysis.TokenStream;
17 import org.elasticsearch.common.settings.Settings;
18 import org.elasticsearch.common.settings.SettingsException;
19 import org.elasticsearch.env.Environment;
20 import org.elasticsearch.index.IndexSettings;
21 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
22
23 public class ICUTokenRepairFilterFactory extends AbstractTokenFilterFactory {
24
25 private static final String SETTINGS_EXCP_PREFIX = "icu_token_repair configuration error: ";
26
27 public static final String MAX_TOK_LEN_KEY = "max_token_length";
28 public static final String KEEP_CAMEL_KEY = "keep_camel_split";
29 public static final String NUM_ONLY_KEY = "merge_numbers_only";
30 public static final String ALLOW_TYPES_KEY = "allow_types";
31 public static final String DENY_TYPES_KEY = "deny_types";
32 public static final String TYPE_PRESET_KEY = "type_preset";
33 public static final String ALLOW_SCRIPTS_KEY = "allow_scripts";
34 public static final String SCRIPT_PRESET_KEY = "script_preset";
35
36 private ICUTokenRepairFilterConfig icuTokRepConfig = new ICUTokenRepairFilterConfig();
37
38 ICUTokenRepairFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
39 super(indexSettings, name, settings);
40
41 icuTokRepConfig.setMaxTokenLength(settings.getAsInt(MAX_TOK_LEN_KEY, DEFAULT_MAX_TOK_LEN));
42 icuTokRepConfig.setKeepCamelSplit(settings.getAsBoolean(KEEP_CAMEL_KEY, DEFAULT_KEEP_CAMEL_SPLIT));
43 icuTokRepConfig.setMergeNumOnly(settings.getAsBoolean(NUM_ONLY_KEY, DEFAULT_MERGE_NUM_ONLY));
44
45 parseTypeLimits(settings);
46 parseScriptLimits(settings);
47
48 }
49
50 private void parseTypeLimits(Settings settings) throws SettingsException {
51 int typesConfigSeen = 0;
52
53 if (settings.get(ALLOW_TYPES_KEY) != null) {
54 icuTokRepConfig.setTypeLimits(true, parseTypeList(settings.getAsList(ALLOW_TYPES_KEY)));
55 typesConfigSeen++;
56 }
57
58 if (settings.get(DENY_TYPES_KEY) != null) {
59 icuTokRepConfig.setTypeLimits(false, parseTypeList(settings.getAsList(DENY_TYPES_KEY)));
60 typesConfigSeen++;
61 }
62
63 String typePreset = settings.get(TYPE_PRESET_KEY);
64 if (typePreset != null) {
65 enableTypePreset(typePreset);
66 typesConfigSeen++;
67 }
68
69 if (typesConfigSeen > 1) {
70 throw new SettingsException(SETTINGS_EXCP_PREFIX + "Only one of " + ALLOW_TYPES_KEY
71 + ", " + DENY_TYPES_KEY + ", or " + TYPE_PRESET_KEY + " is allowed.");
72 }
73 }
74
75 protected static Set<Integer> parseTypeList(@Nullable List<String> listOfTypeStrings)
76 throws SettingsException {
77 if (listOfTypeStrings == null || listOfTypeStrings.isEmpty()) {
78 return emptySet();
79 }
80
81 ListIterator<String> iter = listOfTypeStrings.listIterator();
82 Set<Integer> typeSet = new HashSet<>();
83
84 while (iter.hasNext()) {
85 int typeNum = TextifyUtils.getTokenType(iter.next().toUpperCase(Locale.ENGLISH));
86 if (typeNum == -1) {
87 throw new SettingsException(SETTINGS_EXCP_PREFIX + "Token type " +
88 iter.previous() + " unknown or not allowed");
89 }
90 typeSet.add(typeNum);
91 }
92
93 return typeSet;
94 }
95
96 private void enableTypePreset(String typePreset) throws SettingsException {
97 switch (typePreset.toLowerCase(Locale.ENGLISH)) {
98 case "all":
99 icuTokRepConfig.setNoTypeLimits();
100 break;
101 case "none":
102 icuTokRepConfig.setTypeLimits(true, emptySet());
103 break;
104 case "default":
105 break;
106 default:
107 throw new SettingsException(SETTINGS_EXCP_PREFIX + "Unknown value for " +
108 TYPE_PRESET_KEY + ": " + typePreset);
109 }
110 }
111
112 private void parseScriptLimits(Settings settings) throws SettingsException {
113 int scriptConfigSeen = 0;
114
115 if (settings.get(ALLOW_SCRIPTS_KEY) != null) {
116 icuTokRepConfig.setScriptLimits(true,
117 TextifyUtils.parseICUTokenRepairScriptList(settings.getAsList(ALLOW_SCRIPTS_KEY)));
118 scriptConfigSeen++;
119 }
120
121 String scriptPreset = settings.get(SCRIPT_PRESET_KEY);
122 if (scriptPreset != null) {
123 enableScriptPreset(scriptPreset);
124 scriptConfigSeen++;
125 }
126
127 if (scriptConfigSeen > 1) {
128 throw new SettingsException(SETTINGS_EXCP_PREFIX + "Only one of " + ALLOW_SCRIPTS_KEY
129 + " or " + SCRIPT_PRESET_KEY + " is allowed.");
130 }
131 }
132
133 private void enableScriptPreset(String scriptPreset) throws SettingsException {
134 switch (scriptPreset.toLowerCase(Locale.ENGLISH)) {
135 case "all":
136 icuTokRepConfig.setNoScriptLimits();
137 break;
138 case "none":
139 icuTokRepConfig.setScriptLimits("");
140 break;
141 case "default":
142 break;
143 default:
144 throw new SettingsException(SETTINGS_EXCP_PREFIX + "Unknown value for " +
145 SCRIPT_PRESET_KEY + ": " + scriptPreset);
146 }
147 }
148
149 @Override public TokenStream create(TokenStream tokenStream) {
150 return new ICUTokenRepairFilter(tokenStream, icuTokRepConfig);
151 }
152
153 }