View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import static java.util.Collections.unmodifiableMap;
4   
5   import java.io.Reader;
6   import java.util.HashMap;
7   import java.util.List;
8   import java.util.Map;
9   import java.util.regex.Matcher;
10  import java.util.regex.Pattern;
11  
12  import org.elasticsearch.common.settings.Settings;
13  import org.elasticsearch.common.settings.SettingsException;
14  import org.elasticsearch.env.Environment;
15  import org.elasticsearch.index.IndexSettings;
16  import org.elasticsearch.index.analysis.AbstractCharFilterFactory;
17  import org.elasticsearch.index.analysis.Analysis;
18  
19  public class LimitedMappingCharFilterFactory extends AbstractCharFilterFactory {
20  
21      private final Map<Integer, Integer> oneCharMap;
22  
23      LimitedMappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
24          super(indexSettings, name);
25  
26          List<String> mappings = Analysis.getWordList(env, settings, "mappings");
27          if (mappings == null) {
28              throw new SettingsException("mapping requires `mappings` to be configured");
29          }
30          oneCharMap = parseMappings(mappings);
31      }
32  
33      @Override
34      public Reader create(Reader reader) {
35          // add LimitedMapping
36          return new LimitedMappingCharFilter(oneCharMap, reader);
37      }
38  
39      private static final Pattern MAPPING_PATTERN =
40          Pattern.compile("^(.+)=>(.*)$", Pattern.DOTALL);
41  
42      protected static Map<Integer, Integer> parseMappings(List<String> mappings) {
43          Map<Integer, Integer> map = new HashMap<>();
44          for (String mapping : mappings) {
45              Matcher m = MAPPING_PATTERN.matcher(mapping);
46              if (!m.find()) {
47                  throw new SettingsException("Invalid mapping rule: [" + mapping + "]");
48              }
49              Integer src = parseChar(m.group(1), false); // no map *from* empty
50              Integer dst = parseChar(m.group(2), true); // map *to* empty is ok
51              if (map.get(src) != null) {
52                  throw new SettingsException("Multiple mappings for [" + src + "]");
53              }
54              map.put(src, dst);
55          }
56          return unmodifiableMap(map);
57      }
58  
59      private static Integer parseChar(String s, boolean allowEmpty) {
60          int len = s.length();
61  
62          if (len == 0) {
63              if (allowEmpty) {
64                  return -1;
65              }
66          } else {
67              char c = s.charAt(0);
68              if (len == 1) {
69                  return (int) c;
70              }
71              if (c == '\\') {
72                  if (len == 2) {
73                      c = s.charAt(1);
74                      switch (c) {
75                          case '\\':
76                          case '\'':
77                          case '"':
78                              return (int) c;
79                          case 't':
80                              return (int) '\t';
81                          case 'n':
82                              return (int) '\n';
83                          case 'r':
84                              return (int) '\r';
85                          default:
86                              break;
87                      }
88                  } else if (len == 6 && s.charAt(1) == 'u') {
89                      return Integer.parseInt(s.substring(2, 6), 16);
90                  }
91              }
92          }
93          throw new SettingsException("Invalid escaped character: [" + s + "]");
94      }
95  
96  }