Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 82 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
PegTokenizer | |
0.00% |
0 / 82 |
|
0.00% |
0 / 11 |
600 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
initGrammar | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
12 | |||
getOptions | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setSourceOffsets | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
process | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
processChunkily | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
12 | |||
tokenizeSync | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
72 | |||
tokenizeAs | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
tokenizeURL | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tokenizeTableCellAttributes | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
resetState | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html; |
5 | |
6 | use Generator; |
7 | use Wikimedia\Assert\Assert; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\Tokens\EOFTk; |
12 | use Wikimedia\Parsoid\Tokens\SourceRange; |
13 | use Wikimedia\Parsoid\Tokens\Token; |
14 | use Wikimedia\WikiPEG\SyntaxError; |
15 | |
/**
 * Tokenizer for wikitext, using WikiPEG and a
 * separate PEG grammar file (Grammar.pegphp).
 *
 * The grammar object is created lazily on first use. Results of the
 * synchronous entry points are memoized in a per-environment token cache.
 */
class PegTokenizer extends PipelineStage {
	/** Options handed to the constructor; returned verbatim by getOptions(). */
	private array $options;
	/** Holds 'startOffset' / 'endOffset' once setSourceOffsets() has been called. */
	private array $offsets;
	/** @var Grammar|TracingGrammar|null Created on demand by initGrammar(). */
	private $grammar = null;
	/** True when the environment has the 'grammar' trace flag set. */
	private bool $tracing;
	/**
	 * No need to retokenize identical strings
	 * Cache <src,startRule> --> token array.
	 * Expected benefits:
	 * - same expanded template source used multiple times on a page
	 * - convertToString calls
	 * - calls from TableFixups and elsewhere to tokenize* methods
	 */
	private TokenCache $cache;

	/**
	 * @param Env $env
	 * @param array $options Stage options; stored and exposed via getOptions()
	 * @param string $stageId Accepted but not used by this class
	 * @param ?PipelineStage $prevStage Forwarded to the parent constructor
	 */
	public function __construct(
		Env $env, array $options = [], string $stageId = "",
		?PipelineStage $prevStage = null
	) {
		parent::__construct( $env, $prevStage );
		$this->env = $env;
		$this->options = $options;
		$this->offsets = [];
		$this->tracing = $env->hasTraceFlag( 'grammar' );
		// Cache only on seeing the same source the second time.
		// This minimizes cache bloat & token cloning penalties.
		$this->cache = $this->env->getCache(
			"PegTokenizer",
			[ "repeatThreshold" => 1, "cloneValue" => true ]
		);
	}

	/**
	 * Instantiate the grammar if that has not happened yet.
	 *
	 * Idempotent, so callers may invoke it unconditionally. The tracing
	 * variant is picked when the 'grammar' trace flag was set at construction.
	 */
	private function initGrammar(): void {
		$this->grammar ??= ( $this->tracing ? new TracingGrammar : new Grammar );
	}

	/**
	 * Get the constructor options.
	 *
	 * @internal
	 * @return array
	 */
	public function getOptions(): array {
		return $this->options;
	}

	/**
	 * Set start and end offsets of the source that generated this DOM.
	 *
	 * The start offset is later passed to the grammar as 'pipelineOffset'.
	 *
	 * @param SourceRange $so
	 */
	public function setSourceOffsets( SourceRange $so ): void {
		$this->offsets['startOffset'] = $so->start;
		$this->offsets['endOffset'] = $so->end;
	}

	/**
	 * See PipelineStage::process docs as well. This doc block refines
	 * the generic arg types to be specific to this pipeline stage.
	 *
	 * @param string|array|DocumentFragment|Element $input
	 *   Wikitext to tokenize. In practice this should be a string.
	 * @param array{sol:bool} $options
	 *   Forwarded unchanged to tokenizeSync().
	 *   - atTopLevel: (bool) Whether we are processing the top-level document
	 *   - sol: (bool) Whether input should be processed in start-of-line context
	 *
	 * @return array The token array
	 * @throws SyntaxError
	 */
	public function process(
		string|array|DocumentFragment|Element $input,
		array $options
	): array|Element|DocumentFragment {
		Assert::invariant( is_string( $input ), "Input should be a string" );
		$result = $this->tokenizeSync( $input, $options, $exception );
		if ( $result === false ) {
			// Should never happen.
			throw $exception;
		}
		return $result;
	}

	/**
	 * The text is tokenized in chunks (one per top-level block).
	 *
	 * Since this is a generator, nothing below (including the invariant
	 * checks) runs until the caller starts iterating.
	 *
	 * @param string|array|DocumentFragment|Element $input
	 *   Wikitext to tokenize. In practice this should be a string.
	 * @param array{atTopLevel:bool,sol:bool} $options
	 *   - atTopLevel: (bool) Whether we are processing the top-level document
	 *   - sol (bool) Whether text should be processed in start-of-line context.
	 * @return Generator<list<Token|string>>
	 */
	public function processChunkily(
		string|array|DocumentFragment|Element $input,
		array $options
	): Generator {
		$this->initGrammar();

		Assert::invariant( is_string( $input ), "Input should be a string" );
		Assert::invariant( isset( $options['sol'] ), "Sol should be set" );

		// Kick it off!
		$pipelineOffset = $this->offsets['startOffset'] ?? 0;
		$args = [
			'env' => $this->env,
			'pipelineId' => $this->getPipelineId(),
			'pegTokenizer' => $this,
			'pipelineOffset' => $pipelineOffset,
			'sol' => $options['sol'],
			'stream' => true,
			'startRule' => 'start_async',
		];

		if ( $this->tracing ) {
			$args['tracer'] = new Tracer( $input );
		}

		// Relay every chunk produced by wikipeg's generator, then
		// terminate the stream with a chunk holding the EOF token.
		// @phan-suppress-next-line PhanTypeInvalidYieldFrom
		yield from $this->grammar->parse( $input, $args );
		yield [ new EOFTk() ];
	}

	/**
	 * Tokenize via a rule passed in as an arg.
	 * The text is tokenized synchronously in one shot.
	 *
	 * Array results are stored in the token cache; a later call with the
	 * same text, sol flag, start rule and pipeline offset is answered from
	 * the cache without invoking the grammar.
	 *
	 * @param string $text
	 * @param array{sol:bool} $args
	 *   - sol: (bool) Whether input should be processed in start-of-line context.
	 *   - startRule: (string) which tokenizer rule to tokenize with
	 *     (defaults to 'start')
	 *   - pipelineOffset: offset applied to token positions (defaults to the
	 *     start offset from setSourceOffsets(), or 0 if none was set)
	 * @param SyntaxError|null &$exception a syntax error, if thrown.
	 *   Always reset to null on entry, so it is non-null after the call
	 *   only when this call itself failed.
	 * @return array|false The token array, or false for a syntax error
	 */
	public function tokenizeSync( string $text, array $args, &$exception = null ) {
		// Clear the out-param first: a caller reusing the same variable must
		// not observe an exception left over from a previous invocation.
		$exception = null;

		$this->initGrammar();
		Assert::invariant( isset( $args['sol'] ), "Sol should be set" );
		// Array union: values supplied by the caller take precedence.
		$args += [
			'pegTokenizer' => $this,
			'pipelineId' => $this->getPipelineId(),
			'pipelineOffset' => $this->offsets['startOffset'] ?? 0,
			'startRule' => 'start',
			'env' => $this->env
		];

		// crc32 is much faster than md5 and since we are verifying a
		// $text match when reusing cache contents, hash collisions are okay.
		//
		// NOTE about inclusion of pipelineOffset in the cache key:
		// The PEG tokenizer returns tokens with offsets shifted by
		// $args['pipelineOffset'], so we cannot reuse tokens across
		// differing values of this option. If required, we could refactor
		// to move that and the logging code into this file.
		$cacheKey = crc32( $text ) .
			"|" . (int)$args['sol'] .
			"|" . $args['startRule'] .
			"|" . $args['pipelineOffset'];
		$cached = $this->cache->lookup( $cacheKey, $text );
		if ( $cached !== null ) {
			return $cached;
		}

		if ( $this->tracing ) {
			$args['tracer'] = new Tracer( $text );
		}

		$start = null;
		$profile = null;
		if ( $this->env->profiling() ) {
			$profile = $this->env->getCurrentProfile();
			$start = hrtime( true );
		}

		try {
			$toks = $this->grammar->parse( $text, $args );
		} catch ( SyntaxError $e ) {
			$exception = $e;
			return false;
		} finally {
			// Runs on success and on failure alike, so time spent in a
			// parse that ends in a syntax error is accounted for as well.
			if ( $profile ) {
				$profile->bumpTimeUse( 'PEG', hrtime( true ) - $start, 'PEG' );
			}
		}

		if ( is_array( $toks ) ) {
			$this->cache->cache( $cacheKey, $toks, $text );
		}

		return $toks;
	}

	/**
	 * Tokenizes a string as a rule
	 *
	 * The pipeline offset is pinned to 0 for these calls, regardless of
	 * any offsets set through setSourceOffsets().
	 *
	 * @param string $text The input text
	 * @param string $rule The rule name
	 * @param bool $sol Start of line flag
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeAs( string $text, string $rule, bool $sol ) {
		$args = [
			'startRule' => $rule,
			'sol' => $sol,
			'pipelineOffset' => 0
		];
		return $this->tokenizeSync( $text, $args );
	}

	/**
	 * Tokenize a URL.
	 * @param string $text
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeURL( string $text ) {
		return $this->tokenizeAs( $text, 'url', /* sol */true );
	}

	/**
	 * Tokenize table cell attributes.
	 * @param string $text
	 * @param bool $sol
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeTableCellAttributes( string $text, bool $sol ) {
		return $this->tokenizeAs( $text, 'row_syntax_table_args', $sol );
	}

	/**
	 * @inheritDoc
	 */
	public function resetState( array $options ): void {
		TokenizerUtils::resetAnnotationIncludeRegex();
		// The grammar only exists once something has been tokenized.
		if ( $this->grammar ) {
			$this->grammar->resetState();
		}
		parent::resetState( $options );
	}
}
265 | } |