Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 69 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
PegTokenizer | |
0.00% |
0 / 69 |
|
0.00% |
0 / 12 |
420 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
initGrammar | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
getOptions | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setSourceOffsets | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
process | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
processChunkily | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
12 | |||
tokenizeSync | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
30 | |||
tokenizeAs | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
tokenizeURL | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
tokenizeTableCellAttributes | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getLastErrorLogMessage | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
resetState | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | /** |
5 | * Tokenizer for wikitext, using WikiPEG and a |
6 | * separate PEG grammar file |
7 | * (Grammar.pegphp) |
8 | * |
9 | * Use along with a {@link Wt2Html/TreeBuilder/TreeBuilderStage} and the |
10 | * {@link DOMPostProcessor}(s) for HTML output. |
11 | */ |
12 | |
13 | namespace Wikimedia\Parsoid\Wt2Html; |
14 | |
15 | use Generator; |
16 | use Wikimedia\Assert\Assert; |
17 | use Wikimedia\Parsoid\Config\Env; |
18 | use Wikimedia\Parsoid\Tokens\EOFTk; |
19 | use Wikimedia\Parsoid\Tokens\SourceRange; |
20 | use Wikimedia\Parsoid\Utils\PHPUtils; |
21 | use Wikimedia\WikiPEG\SyntaxError; |
22 | |
class PegTokenizer extends PipelineStage {
	/** @var array Options handed to the constructor; exposed via getOptions() */
	private $options;

	/** @var array Source offsets ('startOffset' / 'endOffset') set by setSourceOffsets() */
	private $offsets;

	/** @var SyntaxError|null Most recent syntax error caught while tokenizing */
	private $lastError;

	/** @var Grammar|null Created lazily by initGrammar() */
	private $grammar;

	/**
	 * @param Env $env
	 * @param array $options Stored as-is and returned by getOptions()
	 * @param string $stageId Accepted for signature compatibility; not used here
	 * @param ?PipelineStage $prevStage
	 */
	public function __construct(
		Env $env, array $options = [], string $stageId = "",
		?PipelineStage $prevStage = null
	) {
		parent::__construct( $env, $prevStage );
		$this->env = $env;
		$this->options = $options;
		$this->offsets = [];
	}

	/**
	 * Create the grammar object on first use. This is the single place
	 * that decides whether the grammar still needs to be built, so callers
	 * can invoke it unconditionally.
	 */
	private function initGrammar(): void {
		if ( $this->grammar ) {
			return;
		}
		$this->grammar = new Grammar();
	}

	/**
	 * Get the constructor options.
	 *
	 * @internal
	 * @return array
	 */
	public function getOptions(): array {
		return $this->options;
	}

	/**
	 * Set start and end offsets of the source that generated this DOM.
	 *
	 * Only the start offset is read back by this class (as the
	 * 'pipelineOffset' argument passed to the grammar).
	 *
	 * @param SourceRange $so
	 */
	public function setSourceOffsets( SourceRange $so ): void {
		$this->offsets['startOffset'] = $so->start;
		$this->offsets['endOffset'] = $so->end;
	}

	/**
	 * See PipelineStage::process docs as well. This doc block refines
	 * the generic arg types to be specific to this pipeline stage.
	 *
	 * @param string $input wikitext to tokenize
	 * @param ?array $opts
	 * - atTopLevel: (bool) Whether we are processing the top-level document
	 * - sol: (bool) Whether input should be processed in start-of-line context
	 * @return array|false The token array, or false for a syntax error
	 */
	public function process( $input, ?array $opts = null ) {
		Assert::invariant( is_string( $input ), "Input should be a string" );
		PHPUtils::assertValidUTF8( $input ); // Transitional check for PHP port
		return $this->tokenizeSync( $input, $opts ?? [] );
	}

	/**
	 * Tokenize the text as a stream of chunks.
	 *
	 * Runs the grammar in streaming mode ('stream' => true) with the
	 * 'start_async' rule and yields whatever the grammar yields, followed
	 * by one final chunk containing a single EOFTk.
	 *
	 * Because this is a generator, nothing here (including the input
	 * assertions) runs until the caller starts iterating.
	 *
	 * @param string $text
	 * @param ?array $opts
	 * - sol (bool) Whether text should be processed in start-of-line context.
	 * @return Generator
	 * @throws SyntaxError Rethrown after being recorded as the last error
	 */
	public function processChunkily( $text, ?array $opts ): Generator {
		$this->initGrammar();

		Assert::invariant( is_string( $text ), "Input should be a string" );
		PHPUtils::assertValidUTF8( $text ); // Transitional check for PHP port

		$grammarArgs = [
			'env' => $this->env,
			'pipelineId' => $this->getPipelineId(),
			'pegTokenizer' => $this,
			'pipelineOffset' => $this->offsets['startOffset'] ?? 0,
			'sol' => !empty( $opts['sol'] ), // defaults to false
			'stream' => true,
			'startRule' => 'start_async',
		];

		try {
			// Wrap wikipeg's generator with our own generator so that a
			// syntax error can be recorded before it propagates.
			// @phan-suppress-next-line PhanTypeInvalidYieldFrom
			yield from $this->grammar->parse( $text, $grammarArgs );
			yield [ new EOFTk() ];
		} catch ( SyntaxError $e ) {
			$this->lastError = $e;
			throw $e;
		}
	}

	/**
	 * Tokenize via a rule passed in as an arg.
	 * The text is tokenized synchronously in one shot.
	 *
	 * Caller-supplied entries in $args win; the defaults below only fill
	 * in keys that are absent.
	 *
	 * @param string $text
	 * @param array $args
	 * - sol: (bool) Whether input should be processed in start-of-line context.
	 * - startRule: (string) which tokenizer rule to tokenize with
	 * @return array|false The token array, or false for a syntax error
	 */
	public function tokenizeSync( string $text, array $args = [] ) {
		$this->initGrammar();
		PHPUtils::assertValidUTF8( $text ); // Transitional check for PHP port

		// Array union keeps existing keys, so each entry here is a default.
		$args += [
			'pegTokenizer' => $this,
			'pipelineId' => $this->getPipelineId(),
			'pipelineOffset' => $this->offsets['startOffset'] ?? 0,
			'startRule' => 'start',
			'sol' => true, // defaults to true
			'env' => $this->env
		];

		$profile = null;
		$start = null;
		if ( $this->env->profiling() ) {
			$profile = $this->env->getCurrentProfile();
			$start = microtime( true );
		}

		try {
			return $this->grammar->parse( $text, $args );
		} catch ( SyntaxError $e ) {
			$this->lastError = $e;
			return false;
		} finally {
			// Account for the time spent in the parser whether or not the
			// parse succeeded; failed parses cost time too.
			if ( $profile ) {
				$profile->bumpTimeUse(
					'PEG', 1000 * ( microtime( true ) - $start ), 'PEG' );
			}
		}
	}

	/**
	 * Tokenizes a string as a rule
	 *
	 * The pipeline offset is forced to 0 for these calls, regardless of
	 * any offsets set through setSourceOffsets().
	 *
	 * @param string $text The input text
	 * @param string $rule The rule name
	 * @param bool $sol Start of line flag
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeAs( string $text, string $rule, bool $sol ) {
		return $this->tokenizeSync( $text, [
			'startRule' => $rule,
			'sol' => $sol,
			'pipelineOffset' => 0
		] );
	}

	/**
	 * Tokenize a URL.
	 * @param string $text
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeURL( string $text ) {
		return $this->tokenizeAs( $text, 'url', /* sol */true );
	}

	/**
	 * Tokenize table cell attributes.
	 * @param string $text
	 * @param bool $sol
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeTableCellAttributes( string $text, bool $sol ) {
		return $this->tokenizeAs( $text, 'row_syntax_table_args', $sol );
	}

	/**
	 * If a tokenize method returned false, this will return a string describing the error,
	 * suitable for use in a log entry. If there has not been any error, returns false.
	 *
	 * Note that the recorded error is never cleared by this class, so the
	 * message describes the most recent failure, which may predate the
	 * latest (successful) tokenize call.
	 *
	 * @return string|false
	 */
	public function getLastErrorLogMessage() {
		if ( !$this->lastError ) {
			return false;
		}
		return "Tokenizer parse error at input location {$this->lastError->location}: " .
			$this->lastError->getMessage();
	}

	/**
	 * @inheritDoc
	 */
	public function resetState( array $opts ): void {
		TokenizerUtils::resetAnnotationIncludeRegex();
		parent::resetState( $opts );
	}
}