View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import static java.util.Collections.emptySet;
4   import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterConfig.DEFAULT_MAX_TOK_LEN;
5   import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterConfig.DEFAULT_KEEP_CAMEL_SPLIT;
6   import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterConfig.DEFAULT_MERGE_NUM_ONLY;
7   
8   import java.util.HashSet;
9   import java.util.List;
10  import java.util.ListIterator;
11  import java.util.Locale;
12  import java.util.Set;
13  
14  import javax.annotation.Nullable;
15  
16  import org.apache.lucene.analysis.TokenStream;
17  import org.elasticsearch.common.settings.Settings;
18  import org.elasticsearch.common.settings.SettingsException;
19  import org.elasticsearch.env.Environment;
20  import org.elasticsearch.index.IndexSettings;
21  import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
22  
23  public class ICUTokenRepairFilterFactory extends AbstractTokenFilterFactory {
24  
25      private static final String SETTINGS_EXCP_PREFIX = "icu_token_repair configuration error: ";
26  
27      public static final String MAX_TOK_LEN_KEY = "max_token_length";
28      public static final String KEEP_CAMEL_KEY = "keep_camel_split";
29      public static final String NUM_ONLY_KEY = "merge_numbers_only";
30      public static final String ALLOW_TYPES_KEY = "allow_types";
31      public static final String DENY_TYPES_KEY = "deny_types";
32      public static final String TYPE_PRESET_KEY = "type_preset";
33      public static final String ALLOW_SCRIPTS_KEY = "allow_scripts";
34      public static final String SCRIPT_PRESET_KEY = "script_preset";
35  
36      private ICUTokenRepairFilterConfig icuTokRepConfig = new ICUTokenRepairFilterConfig();
37  
38      ICUTokenRepairFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
39          super(indexSettings, name, settings);
40  
41          icuTokRepConfig.setMaxTokenLength(settings.getAsInt(MAX_TOK_LEN_KEY, DEFAULT_MAX_TOK_LEN));
42          icuTokRepConfig.setKeepCamelSplit(settings.getAsBoolean(KEEP_CAMEL_KEY, DEFAULT_KEEP_CAMEL_SPLIT));
43          icuTokRepConfig.setMergeNumOnly(settings.getAsBoolean(NUM_ONLY_KEY, DEFAULT_MERGE_NUM_ONLY));
44  
45          parseTypeLimits(settings);
46          parseScriptLimits(settings);
47  
48      }
49  
50      private void parseTypeLimits(Settings settings) throws SettingsException {
51          int typesConfigSeen = 0;
52  
53          if (settings.get(ALLOW_TYPES_KEY) != null) {
54              icuTokRepConfig.setTypeLimits(true, parseTypeList(settings.getAsList(ALLOW_TYPES_KEY)));
55              typesConfigSeen++;
56          }
57  
58          if (settings.get(DENY_TYPES_KEY) != null) {
59              icuTokRepConfig.setTypeLimits(false, parseTypeList(settings.getAsList(DENY_TYPES_KEY)));
60              typesConfigSeen++;
61          }
62  
63          String typePreset = settings.get(TYPE_PRESET_KEY);
64          if (typePreset != null) {
65              enableTypePreset(typePreset);
66              typesConfigSeen++;
67          }
68  
69          if (typesConfigSeen > 1) {
70              throw new SettingsException(SETTINGS_EXCP_PREFIX + "Only one of " + ALLOW_TYPES_KEY
71                  + ", " + DENY_TYPES_KEY + ", or " + TYPE_PRESET_KEY + " is allowed.");
72          }
73      }
74  
75      protected static Set<Integer> parseTypeList(@Nullable List<String> listOfTypeStrings)
76              throws SettingsException {
77          if (listOfTypeStrings == null || listOfTypeStrings.isEmpty()) {
78              return emptySet();
79          }
80  
81          ListIterator<String> iter = listOfTypeStrings.listIterator();
82          Set<Integer> typeSet = new HashSet<>();
83  
84          while (iter.hasNext()) {
85              int typeNum = TextifyUtils.getTokenType(iter.next().toUpperCase(Locale.ENGLISH));
86              if (typeNum == -1) { // unknown or explicit "<OTHER>" == not allowed
87                  throw new SettingsException(SETTINGS_EXCP_PREFIX + "Token type " +
88                      iter.previous() + " unknown or not allowed");
89              }
90              typeSet.add(typeNum);
91          }
92  
93          return typeSet;
94      }
95  
96      private void enableTypePreset(String typePreset) throws SettingsException {
97          switch (typePreset.toLowerCase(Locale.ENGLISH)) {
98              case "all": // create deny list, but deny nothing
99                  icuTokRepConfig.setNoTypeLimits();
100                 break;
101             case "none": // create allow list, but allow nothing
102                 icuTokRepConfig.setTypeLimits(true, emptySet());
103                 break;
104             case "default": // do nothing, default is enabled
105                 break;
106             default:
107                 throw new SettingsException(SETTINGS_EXCP_PREFIX + "Unknown value for " +
108                     TYPE_PRESET_KEY + ": " + typePreset);
109         }
110     }
111 
112     private void parseScriptLimits(Settings settings) throws SettingsException {
113         int scriptConfigSeen = 0;
114 
115         if (settings.get(ALLOW_SCRIPTS_KEY) != null) {
116             icuTokRepConfig.setScriptLimits(true,
117                 TextifyUtils.parseICUTokenRepairScriptList(settings.getAsList(ALLOW_SCRIPTS_KEY)));
118             scriptConfigSeen++;
119         }
120 
121         String scriptPreset = settings.get(SCRIPT_PRESET_KEY);
122         if (scriptPreset != null) {
123             enableScriptPreset(scriptPreset);
124             scriptConfigSeen++;
125         }
126 
127         if (scriptConfigSeen > 1) {
128             throw new SettingsException(SETTINGS_EXCP_PREFIX + "Only one of " + ALLOW_SCRIPTS_KEY
129                 + " or " + SCRIPT_PRESET_KEY + " is allowed.");
130         }
131     }
132 
133     private void enableScriptPreset(String scriptPreset) throws SettingsException {
134         switch (scriptPreset.toLowerCase(Locale.ENGLISH)) {
135             case "all": // don't filter by scripts
136                 icuTokRepConfig.setNoScriptLimits();
137                 break;
138             case "none": // do filter by scripts, but don't allow anything
139                 icuTokRepConfig.setScriptLimits("");
140                 break;
141             case "default": // do nothing, default is enabled
142                 break;
143             default:
144                 throw new SettingsException(SETTINGS_EXCP_PREFIX + "Unknown value for " +
145                     SCRIPT_PRESET_KEY + ": " + scriptPreset);
146         }
147     }
148 
149     @Override public TokenStream create(TokenStream tokenStream) {
150         return new ICUTokenRepairFilter(tokenStream, icuTokRepConfig);
151     }
152 
153 }