Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
64 / 64 |
|
100.00% |
6 / 6 |
CRAP | |
100.00% |
1 / 1 |
MultiStringMatcher | |
100.00% |
64 / 64 |
|
100.00% |
6 / 6 |
27 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
4 | |||
getKeywords | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
nextState | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
5 | |||
searchIn | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
5 | |||
computeYesTransitions | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
4 | |||
computeNoTransitions | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
8 |
1 | <?php |
2 | /** |
3 | * Copyright 2015 Ori Livneh <ori@wikimedia.org> |
4 | * |
5 | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | * you may not use this file except in compliance with the License. |
7 | * You may obtain a copy of the License at |
8 | * |
9 | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | * |
11 | * Unless required by applicable law or agreed to in writing, software |
12 | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | * See the License for the specific language governing permissions and |
15 | * limitations under the License. |
16 | * |
17 | * @file |
18 | * @author Ori Livneh <ori@wikimedia.org> |
19 | */ |
20 | |
21 | namespace AhoCorasick; |
22 | |
23 | /** |
24 | * Represents a finite state machine that can find all occurrences |
25 | * of a set of search keywords in a body of text. |
26 | * |
27 | * The time it takes to construct the finite state machine is |
28 | * proportional to the sum of the lengths of the search keywords. |
29 | * Once constructed, the machine can locate all occurences of all |
30 | * search keywords in a body of text in a single pass, making exactly |
31 | * one state transition per input character. |
32 | * |
33 | * This is an implementation of the Aho-Corasick string matching |
34 | * algorithm. |
35 | * |
36 | * Alfred V. Aho and Margaret J. Corasick, "Efficient string matching: |
37 | * an aid to bibliographic search", CACM, 18(6):333-340, June 1975. |
38 | * |
39 | * @link http://xlinux.nist.gov/dads/HTML/ahoCorasick.html |
40 | */ |
41 | class MultiStringMatcher { |
42 | |
43 | /** @var string[] The set of keywords to be searched for. */ |
44 | protected $searchKeywords = []; |
45 | |
46 | /** @var int The number of possible states of the string-matching finite state machine. */ |
47 | protected $numStates = 1; |
48 | |
49 | /** @var array Mapping of states to outputs. */ |
50 | protected $outputs = []; |
51 | |
52 | /** @var array Mapping of failure transitions. */ |
53 | protected $noTransitions = []; |
54 | |
55 | /** @var array Mapping of success transitions. */ |
56 | protected $yesTransitions = []; |
57 | |
58 | /** |
59 | * Constructor. |
60 | * |
61 | * @param string[] $searchKeywords The set of keywords to be matched. |
62 | */ |
63 | public function __construct( array $searchKeywords ) { |
64 | foreach ( $searchKeywords as $keyword ) { |
65 | if ( $keyword !== '' ) { |
66 | $this->searchKeywords[$keyword] = strlen( $keyword ); |
67 | } |
68 | } |
69 | |
70 | if ( !$this->searchKeywords ) { |
71 | trigger_error( __METHOD__ . ': The set of search keywords is empty.', E_USER_WARNING ); |
72 | // Unreachable 'return' when PHPUnit detects trigger_error |
73 | return; // @codeCoverageIgnore |
74 | } |
75 | |
76 | $this->computeYesTransitions(); |
77 | $this->computeNoTransitions(); |
78 | } |
79 | |
80 | /** |
81 | * Accessor for the search keywords. |
82 | * |
83 | * @return string[] Search keywords. |
84 | */ |
85 | public function getKeywords() { |
86 | return array_keys( $this->searchKeywords ); |
87 | } |
88 | |
89 | /** |
90 | * Map the current state and input character to the next state. |
91 | * |
92 | * @param int $currentState The current state of the string-matching |
93 | * automaton. |
94 | * @param string $inputChar The character the string-matching |
95 | * automaton is currently processing. |
96 | * @return int The state the automaton should transition to. |
97 | */ |
98 | public function nextState( $currentState, $inputChar ) { |
99 | $initialState = $currentState; |
100 | while ( true ) { |
101 | $transitions =& $this->yesTransitions[$currentState]; |
102 | if ( isset( $transitions[$inputChar] ) ) { |
103 | $nextState = $transitions[$inputChar]; |
104 | // Avoid failure transitions next time. |
105 | if ( $currentState !== $initialState ) { |
106 | $this->yesTransitions[$initialState][$inputChar] = $nextState; |
107 | } |
108 | return $nextState; |
109 | } |
110 | if ( $currentState === 0 ) { |
111 | return 0; |
112 | } |
113 | $currentState = $this->noTransitions[$currentState]; |
114 | } |
115 | // Unreachable outside 'while' |
116 | } // @codeCoverageIgnore |
117 | |
118 | /** |
119 | * Locate the search keywords in some text. |
120 | * |
121 | * @param string $text The string to search in. |
122 | * @return array[] An array of matches. Each match is a vector |
123 | * containing an integer offset and the matched keyword. |
124 | * |
125 | * @par Example: |
126 | * @code |
127 | * $keywords = new MultiStringMatcher( array( 'ore', 'hell' ) ); |
128 | * $keywords->searchIn( 'She sells sea shells by the sea shore.' ); |
129 | * // result: array( array( 15, 'hell' ), array( 34, 'ore' ) ) |
130 | * @endcode |
131 | */ |
132 | public function searchIn( $text ) { |
133 | if ( !$this->searchKeywords || $text === '' ) { |
134 | return []; // fast path |
135 | } |
136 | |
137 | $state = 0; |
138 | $results = []; |
139 | $length = strlen( $text ); |
140 | |
141 | for ( $i = 0; $i < $length; $i++ ) { |
142 | $ch = $text[$i]; |
143 | $state = $this->nextState( $state, $ch ); |
144 | foreach ( $this->outputs[$state] as $match ) { |
145 | $offset = $i - $this->searchKeywords[$match] + 1; |
146 | $results[] = [ $offset, $match ]; |
147 | } |
148 | } |
149 | |
150 | return $results; |
151 | } |
152 | |
153 | /** |
154 | * Get the state transitions which the string-matching automaton |
155 | * shall make as it advances through input text. |
156 | * |
157 | * Constructs a directed tree with a root node which represents the |
158 | * initial state of the string-matching automaton and from which a |
159 | * path exists which spells out each search keyword. |
160 | */ |
161 | protected function computeYesTransitions() { |
162 | $this->yesTransitions = [ [] ]; |
163 | $this->outputs = [ [] ]; |
164 | foreach ( $this->searchKeywords as $keyword => $length ) { |
165 | $state = 0; |
166 | for ( $i = 0; $i < $length; $i++ ) { |
167 | $ch = strval( $keyword )[$i]; |
168 | if ( !empty( $this->yesTransitions[$state][$ch] ) ) { |
169 | $state = $this->yesTransitions[$state][$ch]; |
170 | } else { |
171 | $this->yesTransitions[$state][$ch] = $this->numStates; |
172 | $this->yesTransitions[] = []; |
173 | $this->outputs[] = []; |
174 | $state = $this->numStates++; |
175 | } |
176 | } |
177 | |
178 | $this->outputs[$state][] = $keyword; |
179 | } |
180 | } |
181 | |
182 | /** |
183 | * Get the state transitions which the string-matching automaton |
184 | * shall make when a partial match proves false. |
185 | */ |
186 | protected function computeNoTransitions() { |
187 | $queue = []; |
188 | $this->noTransitions = []; |
189 | |
190 | foreach ( $this->yesTransitions[0] as $ch => $toState ) { |
191 | $queue[] = $toState; |
192 | $this->noTransitions[$toState] = 0; |
193 | } |
194 | |
195 | while ( true ) { |
196 | $fromState = array_shift( $queue ); |
197 | if ( $fromState === null ) { |
198 | break; |
199 | } |
200 | foreach ( $this->yesTransitions[$fromState] as $ch => $toState ) { |
201 | $queue[] = $toState; |
202 | $state = $this->noTransitions[$fromState]; |
203 | |
204 | while ( $state !== 0 && empty( $this->yesTransitions[$state][$ch] ) ) { |
205 | $state = $this->noTransitions[$state]; |
206 | } |
207 | |
208 | if ( isset( $this->yesTransitions[$state][$ch] ) ) { |
209 | $noState = $this->yesTransitions[$state][$ch]; |
210 | } else { |
211 | $noState = 0; |
212 | } |
213 | |
214 | $this->noTransitions[$toState] = $noState; |
215 | $this->outputs[$toState] = array_merge( |
216 | $this->outputs[$toState], $this->outputs[$noState] ); |
217 | } |
218 | } |
219 | } |
220 | } |