View Javadoc
1   package org.wikimedia.search.extra.regex;
2   
3   import org.apache.lucene.util.automaton.Automaton;
4   import org.apache.lucene.util.automaton.RunAutomaton;
5   
6   class ContainsCharacterRunAutomaton extends RunAutomaton {
7       ContainsCharacterRunAutomaton(Automaton a) {
8           super(a, Character.MAX_CODE_POINT);
9       }
10  
11      /**
12       * Does s contain a substring which matches the automaton?
13       *
14       * @param s string to check
15       */
16      public boolean contains(String s) {
17          /*
18           * By requiring all callers to wrap their regex in `.*(regexp)` we
19           * can make a single pass on s to determine if a match exists.
20           */
21          for (int cp, p = 0, i = 0; i < s.length(); i += Character.charCount(cp)) {
22              cp = s.codePointAt(i);
23              p = step(p, lowerCaseIfNeeded(cp));
24              if (p == -1) {
25                  break;
26              }
27              if (isAccept(p)) {
28                  return true;
29              }
30          }
31          return false;
32      }
33  
34      protected int lowerCaseIfNeeded(int cp) {
35          return cp;
36      }
37  
38      static class LowerCasing extends ContainsCharacterRunAutomaton {
39          LowerCasing(Automaton a) {
40              super(a);
41          }
42  
43          @Override
44          protected int lowerCaseIfNeeded(int cp) {
45              return Character.toLowerCase(cp);
46          }
47      }
48  
49      static class GreekLowerCasing extends ContainsCharacterRunAutomaton {
50          GreekLowerCasing(Automaton a) {
51              super(a);
52          }
53  
54          /**
55           * Lowercase cp in Greek compatible way. This method is a copy of
56           * Lucene's GreekLowerCaseFilter's lowerCase method. If that method had
57           * been public and static we wouldn't need to do this.
58           */
59          @SuppressWarnings("CyclomaticComplexity")
60          @Override
61          protected int lowerCaseIfNeeded(int cp) {
62              switch (cp) {
63                  /*
64                   * There are two lowercase forms of sigma: U+03C2: small final sigma
65                   * (end of word) U+03C3: small sigma (otherwise)
66                   *
67                   * Standardize both to U+03C3
68                   */
69                  case '\u03C2': /* small final sigma */
70                      return '\u03C3'; /* small sigma */
71  
72                      /*
73                       * Some greek characters contain diacritics. This filter removes
74                       * these, converting to the lowercase base form.
75                       */
76  
77                  case '\u0386': /* capital alpha with tonos */
78                  case '\u03AC': /* small alpha with tonos */
79                      return '\u03B1'; /* small alpha */
80  
81                  case '\u0388': /* capital epsilon with tonos */
82                  case '\u03AD': /* small epsilon with tonos */
83                      return '\u03B5'; /* small epsilon */
84  
85                  case '\u0389': /* capital eta with tonos */
86                  case '\u03AE': /* small eta with tonos */
87                      return '\u03B7'; /* small eta */
88  
89                  case '\u038A': /* capital iota with tonos */
90                  case '\u03AA': /* capital iota with dialytika */
91                  case '\u03AF': /* small iota with tonos */
92                  case '\u03CA': /* small iota with dialytika */
93                  case '\u0390': /* small iota with dialytika and tonos */
94                      return '\u03B9'; /* small iota */
95  
96                  case '\u038E': /* capital upsilon with tonos */
97                  case '\u03AB': /* capital upsilon with dialytika */
98                  case '\u03CD': /* small upsilon with tonos */
99                  case '\u03CB': /* small upsilon with dialytika */
100                 case '\u03B0': /* small upsilon with dialytika and tonos */
101                     return '\u03C5'; /* small upsilon */
102 
103                 case '\u038C': /* capital omicron with tonos */
104                 case '\u03CC': /* small omicron with tonos */
105                     return '\u03BF'; /* small omicron */
106 
107                 case '\u038F': /* capital omega with tonos */
108                 case '\u03CE': /* small omega with tonos */
109                     return '\u03C9'; /* small omega */
110 
111                     /*
112                      * The previous implementation did the conversion below. Only
113                      * implemented for backwards compatibility with old indexes.
114                      */
115 
116                 case '\u03A2': /* reserved */
117                     return '\u03C2'; /* small final sigma */
118 
119                 default:
120                     return Character.toLowerCase(cp);
121             }
122         }
123     }
124 }