1 package org.wikimedia.search.extra.regex;
2
3 import org.apache.lucene.util.automaton.Automaton;
4 import org.apache.lucene.util.automaton.RunAutomaton;
5
6 class ContainsCharacterRunAutomaton extends RunAutomaton {
7 ContainsCharacterRunAutomaton(Automaton a) {
8 super(a, Character.MAX_CODE_POINT);
9 }
10
11 /**
12 * Does s contain a substring which matches the automaton?
13 *
14 * @param s string to check
15 */
16 public boolean contains(String s) {
17 /*
18 * By requiring all callers to wrap their regex in `.*(regexp)` we
19 * can make a single pass on s to determine if a match exists.
20 */
21 for (int cp, p = 0, i = 0; i < s.length(); i += Character.charCount(cp)) {
22 cp = s.codePointAt(i);
23 p = step(p, lowerCaseIfNeeded(cp));
24 if (p == -1) {
25 break;
26 }
27 if (isAccept(p)) {
28 return true;
29 }
30 }
31 return false;
32 }
33
34 protected int lowerCaseIfNeeded(int cp) {
35 return cp;
36 }
37
38 static class LowerCasing extends ContainsCharacterRunAutomaton {
39 LowerCasing(Automaton a) {
40 super(a);
41 }
42
43 @Override
44 protected int lowerCaseIfNeeded(int cp) {
45 return Character.toLowerCase(cp);
46 }
47 }
48
49 static class GreekLowerCasing extends ContainsCharacterRunAutomaton {
50 GreekLowerCasing(Automaton a) {
51 super(a);
52 }
53
54 /**
55 * Lowercase cp in Greek compatible way. This method is a copy of
56 * Lucene's GreekLowerCaseFilter's lowerCase method. If that method had
57 * been public and static we wouldn't need to do this.
58 */
59 @SuppressWarnings("CyclomaticComplexity")
60 @Override
61 protected int lowerCaseIfNeeded(int cp) {
62 switch (cp) {
63 /*
64 * There are two lowercase forms of sigma: U+03C2: small final sigma
65 * (end of word) U+03C3: small sigma (otherwise)
66 *
67 * Standardize both to U+03C3
68 */
69 case '\u03C2': /* small final sigma */
70 return '\u03C3'; /* small sigma */
71
72 /*
73 * Some greek characters contain diacritics. This filter removes
74 * these, converting to the lowercase base form.
75 */
76
77 case '\u0386': /* capital alpha with tonos */
78 case '\u03AC': /* small alpha with tonos */
79 return '\u03B1'; /* small alpha */
80
81 case '\u0388': /* capital epsilon with tonos */
82 case '\u03AD': /* small epsilon with tonos */
83 return '\u03B5'; /* small epsilon */
84
85 case '\u0389': /* capital eta with tonos */
86 case '\u03AE': /* small eta with tonos */
87 return '\u03B7'; /* small eta */
88
89 case '\u038A': /* capital iota with tonos */
90 case '\u03AA': /* capital iota with dialytika */
91 case '\u03AF': /* small iota with tonos */
92 case '\u03CA': /* small iota with dialytika */
93 case '\u0390': /* small iota with dialytika and tonos */
94 return '\u03B9'; /* small iota */
95
96 case '\u038E': /* capital upsilon with tonos */
97 case '\u03AB': /* capital upsilon with dialytika */
98 case '\u03CD': /* small upsilon with tonos */
99 case '\u03CB': /* small upsilon with dialytika */
100 case '\u03B0': /* small upsilon with dialytika and tonos */
101 return '\u03C5'; /* small upsilon */
102
103 case '\u038C': /* capital omicron with tonos */
104 case '\u03CC': /* small omicron with tonos */
105 return '\u03BF'; /* small omicron */
106
107 case '\u038F': /* capital omega with tonos */
108 case '\u03CE': /* small omega with tonos */
109 return '\u03C9'; /* small omega */
110
111 /*
112 * The previous implementation did the conversion below. Only
113 * implemented for backwards compatibility with old indexes.
114 */
115
116 case '\u03A2': /* reserved */
117 return '\u03C2'; /* small final sigma */
118
119 default:
120 return Character.toLowerCase(cp);
121 }
122 }
123 }
124 }