View Javadoc
1   /*
2    * The WMF licenses this file to you under the Apache License, Version
3    * 2.0 (the "License"); you may not use this file except in compliance
4    * with the License. You may obtain a copy of the License at
5    *
6    *      http://www.apache.org/licenses/LICENSE-2.0
7    *
8    * Unless required by applicable law or agreed to in writing, software
9    * distributed under the License is distributed on an "AS IS" BASIS,
10   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   * See the License for the specific language governing permissions and
12   * limitations under the License.
13   */
14  package org.wikimedia.search.extra.analysis.textify;
15  
16  import java.io.IOException;
17  import java.io.Reader;
18  
19  import org.apache.lucene.analysis.charfilter.BaseCharFilter;
20  
21  public class CamelCaseCharFilter extends BaseCharFilter {
22  
23      private int outputCharCount;
24      private int cumulativeOffset;
25  
26      private boolean inputEnd;
27  
28      private boolean seenLowercaseish;
29      private int buffChar = -1;      // buffer next char when we need to insert a space
30      private int lowSurrogate = -1;  // read-ahead character when we see a high surrogate
31  
32      public CamelCaseCharFilter(Reader in) {
33          super(in);
34      }
35  
36      private int getComplexCharType(int c) throws IOException {
37          int type;
38  
39          if (c == -1) {
40              inputEnd = true;
41              return Character.UNASSIGNED;
42          } else {
43              type = TextifyUtils.getCustomCharType(c);
44          }
45  
46          if (Character.isHighSurrogate((char) c)) {
47              lowSurrogate = inputEnd ? -1 : input.read();
48              if (lowSurrogate == -1) {
49                  inputEnd = true;
50              } else if (Character.isLowSurrogate((char) lowSurrogate)) {
51                  type = Character.getType(Character.toCodePoint((char)c, (char)lowSurrogate));
52              }
53          }
54          return type;
55      }
56  
57      private int processNextChar() throws IOException {
58          int c = inputEnd ? -1 : input.read();
59          int type = getComplexCharType(c);
60  
61          // Add space between (lowercase + optional combining characters or invisibles) and
62          // (uppercase or titlecase)
63          if (type == Character.LOWERCASE_LETTER) {
64              seenLowercaseish = true;
65          } else if (TextifyUtils.isMarkOrFormatType(type)) {
66              // do nothing -- maintain seenLowercaseish state for combining and invisible characters
67          } else if (TextifyUtils.isLeadingUppercaseishType(type)) {
68              if (seenLowercaseish) {
69                  // add a space, store the current character for later,
70                  // and update the offset correction table
71                  buffChar = c;
72                  c = ' ';
73                  cumulativeOffset--;
74                  addOffCorrectMap(outputCharCount, cumulativeOffset);
75              }
76              // Titlecase (e.g., Lj) is upper on the front side and lower on the backside!
77              seenLowercaseish = TextifyUtils.isTrailingLowercaseishType(type);
78          } else {
79              seenLowercaseish = false;
80          }
81  
82          return c;
83      }
84  
85      @Override
86      public int read() throws IOException {
87          int c;
88          outputCharCount++;
89          if (buffChar != -1) {
90              c = buffChar;
91              buffChar = -1;
92          } else if (lowSurrogate != -1) {
93              c = lowSurrogate;
94              lowSurrogate = -1;
95          } else {
96              c = processNextChar();
97          }
98          return c;
99      }
100 
101     @Override
102     public int read(char[] cbuf, int offset, int len) throws IOException {
103         int charsRead = 0;
104         for (int i = offset; i < offset + len; i++) {
105             int c = read();
106             if (c == -1) {
107                 break;
108             }
109             cbuf[i] = (char) c;
110             charsRead++;
111         }
112 
113         return charsRead == 0 && len > 0 ? -1 : charsRead;
114     }
115 
116     @Override
117     public void reset() throws IOException {
118         input.reset();
119         outputCharCount = 0;
120         cumulativeOffset = 0;
121         seenLowercaseish = false;
122         buffChar = -1;
123         lowSurrogate = -1;
124         inputEnd = false;
125     }
126 
127 }