1 package org.wikimedia.search.extra.analysis.textify;
2
3 import static java.util.Collections.unmodifiableMap;
4
5 import java.io.Reader;
6 import java.util.HashMap;
7 import java.util.List;
8 import java.util.Map;
9 import java.util.regex.Matcher;
10 import java.util.regex.Pattern;
11
12 import org.elasticsearch.common.settings.Settings;
13 import org.elasticsearch.common.settings.SettingsException;
14 import org.elasticsearch.env.Environment;
15 import org.elasticsearch.index.IndexSettings;
16 import org.elasticsearch.index.analysis.AbstractCharFilterFactory;
17 import org.elasticsearch.index.analysis.Analysis;
18
19 public class LimitedMappingCharFilterFactory extends AbstractCharFilterFactory {
20
21 private final Map<Integer, Integer> oneCharMap;
22
23 LimitedMappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
24 super(indexSettings, name);
25
26 List<String> mappings = Analysis.getWordList(env, settings, "mappings");
27 if (mappings == null) {
28 throw new SettingsException("mapping requires `mappings` to be configured");
29 }
30 oneCharMap = parseMappings(mappings);
31 }
32
33 @Override
34 public Reader create(Reader reader) {
35
36 return new LimitedMappingCharFilter(oneCharMap, reader);
37 }
38
39 private static final Pattern MAPPING_PATTERN =
40 Pattern.compile("^(.+)=>(.*)$", Pattern.DOTALL);
41
42 protected static Map<Integer, Integer> parseMappings(List<String> mappings) {
43 Map<Integer, Integer> map = new HashMap<>();
44 for (String mapping : mappings) {
45 Matcher m = MAPPING_PATTERN.matcher(mapping);
46 if (!m.find()) {
47 throw new SettingsException("Invalid mapping rule: [" + mapping + "]");
48 }
49 Integer src = parseChar(m.group(1), false);
50 Integer dst = parseChar(m.group(2), true);
51 if (map.get(src) != null) {
52 throw new SettingsException("Multiple mappings for [" + src + "]");
53 }
54 map.put(src, dst);
55 }
56 return unmodifiableMap(map);
57 }
58
59 private static Integer parseChar(String s, boolean allowEmpty) {
60 int len = s.length();
61
62 if (len == 0) {
63 if (allowEmpty) {
64 return -1;
65 }
66 } else {
67 char c = s.charAt(0);
68 if (len == 1) {
69 return (int) c;
70 }
71 if (c == '\\') {
72 if (len == 2) {
73 c = s.charAt(1);
74 switch (c) {
75 case '\\':
76 case '\'':
77 case '"':
78 return (int) c;
79 case 't':
80 return (int) '\t';
81 case 'n':
82 return (int) '\n';
83 case 'r':
84 return (int) '\r';
85 default:
86 break;
87 }
88 } else if (len == 6 && s.charAt(1) == 'u') {
89 return Integer.parseInt(s.substring(2, 6), 16);
90 }
91 }
92 }
93 throw new SettingsException("Invalid escaped character: [" + s + "]");
94 }
95
96 }