Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 69 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
PegTokenizer | |
0.00% |
0 / 69 |
|
0.00% |
0 / 12 |
506 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
initGrammar | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
12 | |||
getOptions | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setSourceOffsets | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
process | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
processChunkily | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
12 | |||
tokenizeSync | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
30 | |||
tokenizeAs | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
tokenizeURL | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tokenizeTableCellAttributes | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getLastErrorLogMessage | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
resetState | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | /** |
5 | * Tokenizer for wikitext, using WikiPEG and a |
6 | * separate PEG grammar file |
7 | * (Grammar.pegphp) |
8 | * |
9 | * Use along with a {@link Wt2Html/TreeBuilder/TreeBuilderStage} and the |
10 | * {@link DOMPostProcessor}(s) for HTML output. |
11 | */ |
12 | |
13 | namespace Wikimedia\Parsoid\Wt2Html; |
14 | |
15 | use Generator; |
16 | use Wikimedia\Assert\Assert; |
17 | use Wikimedia\Parsoid\Config\Env; |
18 | use Wikimedia\Parsoid\Tokens\EOFTk; |
19 | use Wikimedia\Parsoid\Tokens\SourceRange; |
20 | use Wikimedia\WikiPEG\SyntaxError; |
21 | |
/**
 * Pipeline stage that tokenizes wikitext using the WikiPEG grammar.
 *
 * The grammar object is created lazily on first use. The most recent
 * syntax error (if any) is kept so that callers can retrieve a log
 * message for it via getLastErrorLogMessage().
 */
class PegTokenizer extends PipelineStage {
	/** @var array Options passed to the constructor; see getOptions() */
	private array $options;
	/** @var array May hold 'startOffset' and 'endOffset'; see setSourceOffsets() */
	private array $offsets;
	/** @var SyntaxError|null Syntax error raised by the most recent parse, if it failed */
	private ?SyntaxError $lastError = null;
	/** @var Grammar|TracingGrammar|null Created on demand by initGrammar() */
	private $grammar = null;

	/**
	 * @param Env $env
	 * @param array $options Stage options, returned unchanged by getOptions()
	 * @param string $stageId Not used by this class
	 * @param ?PipelineStage $prevStage
	 */
	public function __construct(
		Env $env, array $options = [], string $stageId = "",
		?PipelineStage $prevStage = null
	) {
		parent::__construct( $env, $prevStage );
		$this->env = $env;
		$this->options = $options;
		$this->offsets = [];
	}

	/**
	 * Create the grammar object if it does not exist yet; otherwise a no-op.
	 * The tracing variant is used when the 'grammar' trace flag is set.
	 */
	private function initGrammar(): void {
		if ( !$this->grammar ) {
			$this->grammar = $this->env->hasTraceFlag( 'grammar' ) ? new TracingGrammar : new Grammar;
		}
	}

	/**
	 * Get the constructor options.
	 *
	 * @internal
	 * @return array
	 */
	public function getOptions(): array {
		return $this->options;
	}

	/**
	 * Set start and end offsets of the source that generated this DOM.
	 *
	 * The start offset is later handed to the grammar as 'pipelineOffset'.
	 *
	 * @param SourceRange $so
	 */
	public function setSourceOffsets( SourceRange $so ): void {
		$this->offsets['startOffset'] = $so->start;
		$this->offsets['endOffset'] = $so->end;
	}

	/**
	 * See PipelineStage::process docs as well. This doc block refines
	 * the generic arg types to be specific to this pipeline stage.
	 *
	 * The options are forwarded unchanged to tokenizeSync().
	 *
	 * @param string $input wikitext to tokenize
	 * @param array{sol:bool} $opts
	 * - atTopLevel: (bool) Whether we are processing the top-level document
	 * - sol: (bool) Whether input should be processed in start-of-line context
	 * @return array|false The token array, or false for a syntax error
	 */
	public function process( $input, array $opts ) {
		Assert::invariant( is_string( $input ), "Input should be a string" );
		return $this->tokenizeSync( $input, $opts );
	}

	/**
	 * Tokenize the text incrementally using the 'start_async' rule in
	 * streaming mode, yielding whatever the grammar yields, followed by
	 * a final one-element array holding an EOFTk.
	 *
	 * Unlike tokenizeSync(), a syntax error is recorded (see
	 * getLastErrorLogMessage()) and then rethrown to the consumer.
	 *
	 * @param string $text
	 * @param array{sol:bool} $opts
	 * - sol (bool) Whether text should be processed in start-of-line context.
	 * @return Generator
	 * @throws SyntaxError
	 */
	public function processChunkily( $text, array $opts ): Generator {
		$this->initGrammar();

		Assert::invariant( is_string( $text ), "Input should be a string" );
		Assert::invariant( isset( $opts['sol'] ), "Sol should be set" );

		// Don't let an error from an earlier parse be reported for this one.
		$this->lastError = null;

		// Kick it off!
		$pipelineOffset = $this->offsets['startOffset'] ?? 0;
		$args = [
			'env' => $this->env,
			'pipelineId' => $this->getPipelineId(),
			'pegTokenizer' => $this,
			'pipelineOffset' => $pipelineOffset,
			'sol' => $opts['sol'],
			'stream' => true,
			'startRule' => 'start_async',
		];

		try {
			// Wrap wikipeg's generator with our own generator so that
			// a syntax error can be recorded before it propagates.
			// @phan-suppress-next-line PhanTypeInvalidYieldFrom
			yield from $this->grammar->parse( $text, $args );
			yield [ new EOFTk() ];
		} catch ( SyntaxError $e ) {
			$this->lastError = $e;
			throw $e;
		}
	}

	/**
	 * Tokenize via a rule passed in as an arg.
	 * The text is tokenized synchronously in one shot.
	 *
	 * Keys supplied in $args take precedence over the defaults filled in
	 * here (tokenizer, pipeline id, pipeline offset, 'start' rule, env).
	 *
	 * @param string $text
	 * @param array{sol:bool} $args
	 * - sol: (bool) Whether input should be processed in start-of-line context.
	 * - startRule: (string) which tokenizer rule to tokenize with
	 * @return array|false The token array, or false for a syntax error
	 */
	public function tokenizeSync( string $text, array $args ) {
		$this->initGrammar();
		Assert::invariant( isset( $args['sol'] ), "Sol should be set" );
		// The array union only adds keys that the caller did not provide.
		$args += [
			'pegTokenizer' => $this,
			'pipelineId' => $this->getPipelineId(),
			'pipelineOffset' => $this->offsets['startOffset'] ?? 0,
			'startRule' => 'start',
			'env' => $this->env
		];

		// Don't let an error from an earlier parse be reported for this one.
		$this->lastError = null;

		$start = null;
		$profile = null;
		if ( $this->env->profiling() ) {
			$profile = $this->env->getCurrentProfile();
			$start = microtime( true );
		}

		try {
			$toks = $this->grammar->parse( $text, $args );
		} catch ( SyntaxError $e ) {
			// Remember the error for getLastErrorLogMessage() and signal
			// failure through the return value instead of throwing.
			$this->lastError = $e;
			return false;
		}

		if ( $profile ) {
			// microtime() is in seconds; the value passed on is in milliseconds.
			$profile->bumpTimeUse(
				'PEG', 1000 * ( microtime( true ) - $start ), 'PEG' );
		}
		return $toks;
	}

	/**
	 * Tokenizes a string as a rule
	 *
	 * The pipeline offset is forced to 0 here, overriding any start
	 * offset set through setSourceOffsets().
	 *
	 * @param string $text The input text
	 * @param string $rule The rule name
	 * @param bool $sol Start of line flag
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeAs( string $text, string $rule, bool $sol ) {
		$args = [
			'startRule' => $rule,
			'sol' => $sol,
			'pipelineOffset' => 0
		];
		return $this->tokenizeSync( $text, $args );
	}

	/**
	 * Tokenize a URL.
	 * @param string $text
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeURL( string $text ) {
		return $this->tokenizeAs( $text, 'url', /* sol */true );
	}

	/**
	 * Tokenize table cell attributes.
	 * @param string $text
	 * @param bool $sol
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeTableCellAttributes( string $text, bool $sol ) {
		return $this->tokenizeAs( $text, 'row_syntax_table_args', $sol );
	}

	/**
	 * If the most recent tokenize call failed with a syntax error, this will
	 * return a string describing the error, suitable for use in a log entry.
	 * If the most recent call did not fail (or nothing has been tokenized yet),
	 * returns false.
	 *
	 * @return string|false
	 */
	public function getLastErrorLogMessage() {
		if ( $this->lastError ) {
			return "Tokenizer parse error at input location {$this->lastError->location}: " .
				$this->lastError->getMessage();
		} else {
			return false;
		}
	}

	/**
	 * @inheritDoc
	 */
	public function resetState( array $opts ): void {
		TokenizerUtils::resetAnnotationIncludeRegex();
		// The grammar is created lazily, so there may be nothing to reset yet.
		if ( $this->grammar ) {
			$this->grammar->resetState();
		}
		parent::resetState( $opts );
	}
}