1
2
3
4
5
6
7
8
9
10
11
12
13
14 package org.wikimedia.search.extra.analysis.textify;
15
16 import java.io.IOException;
17 import java.io.Reader;
18
19 import org.apache.lucene.analysis.charfilter.BaseCharFilter;
20
21 public class CamelCaseCharFilter extends BaseCharFilter {
22
23 private int outputCharCount;
24 private int cumulativeOffset;
25
26 private boolean inputEnd;
27
28 private boolean seenLowercaseish;
29 private int buffChar = -1;
30 private int lowSurrogate = -1;
31
32 public CamelCaseCharFilter(Reader in) {
33 super(in);
34 }
35
36 private int getComplexCharType(int c) throws IOException {
37 int type;
38
39 if (c == -1) {
40 inputEnd = true;
41 return Character.UNASSIGNED;
42 } else {
43 type = TextifyUtils.getCustomCharType(c);
44 }
45
46 if (Character.isHighSurrogate((char) c)) {
47 lowSurrogate = inputEnd ? -1 : input.read();
48 if (lowSurrogate == -1) {
49 inputEnd = true;
50 } else if (Character.isLowSurrogate((char) lowSurrogate)) {
51 type = Character.getType(Character.toCodePoint((char)c, (char)lowSurrogate));
52 }
53 }
54 return type;
55 }
56
57 private int processNextChar() throws IOException {
58 int c = inputEnd ? -1 : input.read();
59 int type = getComplexCharType(c);
60
61
62
63 if (type == Character.LOWERCASE_LETTER) {
64 seenLowercaseish = true;
65 } else if (TextifyUtils.isMarkOrFormatType(type)) {
66
67 } else if (TextifyUtils.isLeadingUppercaseishType(type)) {
68 if (seenLowercaseish) {
69
70
71 buffChar = c;
72 c = ' ';
73 cumulativeOffset--;
74 addOffCorrectMap(outputCharCount, cumulativeOffset);
75 }
76
77 seenLowercaseish = TextifyUtils.isTrailingLowercaseishType(type);
78 } else {
79 seenLowercaseish = false;
80 }
81
82 return c;
83 }
84
85 @Override
86 public int read() throws IOException {
87 int c;
88 outputCharCount++;
89 if (buffChar != -1) {
90 c = buffChar;
91 buffChar = -1;
92 } else if (lowSurrogate != -1) {
93 c = lowSurrogate;
94 lowSurrogate = -1;
95 } else {
96 c = processNextChar();
97 }
98 return c;
99 }
100
101 @Override
102 public int read(char[] cbuf, int offset, int len) throws IOException {
103 int charsRead = 0;
104 for (int i = offset; i < offset + len; i++) {
105 int c = read();
106 if (c == -1) {
107 break;
108 }
109 cbuf[i] = (char) c;
110 charsRead++;
111 }
112
113 return charsRead == 0 && len > 0 ? -1 : charsRead;
114 }
115
116 @Override
117 public void reset() throws IOException {
118 input.reset();
119 outputCharCount = 0;
120 cumulativeOffset = 0;
121 seenLowercaseish = false;
122 buffChar = -1;
123 lowSurrogate = -1;
124 inputEnd = false;
125 }
126
127 }