Code Coverage for /src/src/Wt2Html/PegTokenizer.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	0.00% covered (danger)	0.00%	0 / 69	0.00% covered (danger)	0.00%	0 / 12	CRAP	0.00% covered (danger)	0.00%	0 / 1
PegTokenizer	0.00% covered (danger)	0.00%	0 / 69	0.00% covered (danger)	0.00%	0 / 12	420	0.00% covered (danger)	0.00%	0 / 1
__construct	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	2
initGrammar	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	6
getOptions	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
setSourceOffsets	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2
process	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
processChunkily	0.00% covered (danger)	0.00%	0 / 19	0.00% covered (danger)	0.00%	0 / 1	12
tokenizeSync	0.00% covered (danger)	0.00%	0 / 24	0.00% covered (danger)	0.00%	0 / 1	30
tokenizeAs	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	2
tokenizeURL	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
tokenizeTableCellAttributes	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
getLastErrorLogMessage	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
resetState	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2	declare( strict_types = 1 );
3
4	/**
5	* Tokenizer for wikitext, using WikiPEG and a
6	* separate PEG grammar file
7	* (Grammar.pegphp)
8	*
9	* Use along with a {@link Wt2Html/TreeBuilder/TreeBuilderStage} and the
10	* {@link DOMPostProcessor}(s) for HTML output.
11	*/
12
13	namespace Wikimedia\Parsoid\Wt2Html;
14
15	use Generator;
16	use Wikimedia\Assert\Assert;
17	use Wikimedia\Parsoid\Config\Env;
18	use Wikimedia\Parsoid\Tokens\EOFTk;
19	use Wikimedia\Parsoid\Tokens\SourceRange;
20	use Wikimedia\Parsoid\Utils\PHPUtils;
21	use Wikimedia\WikiPEG\SyntaxError;
22
/**
 * Pipeline stage that turns wikitext into tokens by running the
 * WikiPEG-generated Grammar over the input.
 *
 * Two modes are offered: processChunkily() returns a generator that
 * forwards whatever the grammar yields, while tokenizeSync() (and the
 * helpers built on it) parse the whole input in a single call.
 */
class PegTokenizer extends PipelineStage {
	/** @var array Options handed to the constructor */
	private $options;

	/** @var array Source offsets; setSourceOffsets() fills 'startOffset' and 'endOffset' */
	private $offsets;

	/** @var SyntaxError|null Most recent syntax error caught by this tokenizer, if any */
	private $lastError;

	/** @var Grammar|null Created on first use by initGrammar() */
	private $grammar;

	/**
	 * @param Env $env
	 * @param array $options Stored as-is; retrievable through getOptions()
	 * @param string $stageId Not used by this class
	 * @param ?PipelineStage $prevStage Forwarded to the parent constructor
	 */
	public function __construct(
		Env $env, array $options = [], string $stageId = "",
		?PipelineStage $prevStage = null
	) {
		parent::__construct( $env, $prevStage );
		$this->offsets = [];
		$this->options = $options;
		$this->env = $env;
	}

	/**
	 * Instantiate the grammar unless one has already been created.
	 */
	private function initGrammar() {
		if ( $this->grammar ) {
			return;
		}
		$this->grammar = new Grammar;
	}

	/**
	 * Return the options array that was passed to the constructor.
	 *
	 * @internal
	 * @return array
	 */
	public function getOptions(): array {
		return $this->options;
	}

	/**
	 * Record the start and end offsets of the source range being tokenized.
	 * The start offset is later passed to the grammar as 'pipelineOffset'.
	 *
	 * @param SourceRange $so
	 */
	public function setSourceOffsets( SourceRange $so ): void {
		$this->offsets['endOffset'] = $so->end;
		$this->offsets['startOffset'] = $so->start;
	}

	/**
	 * Tokenize a complete wikitext string in one call.
	 *
	 * See PipelineStage::process as well; this doc block narrows the
	 * generic argument types to the ones this stage accepts. The options
	 * are forwarded unchanged to tokenizeSync() as its $args.
	 *
	 * @param string $input wikitext to tokenize
	 * @param ?array $opts
	 * - atTopLevel: (bool) Whether we are processing the top-level document
	 * - sol: (bool) Whether input should be processed in start-of-line context
	 * @return array|false The token array, or false for a syntax error
	 */
	public function process( $input, ?array $opts = null ) {
		Assert::invariant( is_string( $input ), "Input should be a string" );
		PHPUtils::assertValidUTF8( $input ); // Transitional check for PHP port
		$args = $opts ?? [];
		return $this->tokenizeSync( $input, $args );
	}

	/**
	 * Tokenize the text incrementally.
	 *
	 * The grammar is run with the 'start_async' rule in streaming mode and
	 * everything it yields is passed through to the caller. Once the
	 * grammar is exhausted, a final one-element array holding an EOFTk is
	 * yielded. Because this is a generator, no work happens until the
	 * caller starts iterating.
	 *
	 * A SyntaxError raised while iterating is remembered (see
	 * getLastErrorLogMessage()) and then rethrown.
	 *
	 * @param string $text
	 * @param ?array $opts
	 * - sol (bool) Whether text should be processed in start-of-line context.
	 *   Defaults to false.
	 * @return Generator
	 */
	public function processChunkily( $text, ?array $opts ): Generator {
		// initGrammar() is a no-op when the grammar already exists.
		$this->initGrammar();

		Assert::invariant( is_string( $text ), "Input should be a string" );
		PHPUtils::assertValidUTF8( $text ); // Transitional check for PHP port

		$startOfLine = !empty( $opts['sol'] ); // defaults to false
		$args = [
			'env' => $this->env,
			'pipelineId' => $this->getPipelineId(),
			'pegTokenizer' => $this,
			'pipelineOffset' => $this->offsets['startOffset'] ?? 0,
			'sol' => $startOfLine,
			'stream' => true,
			'startRule' => 'start_async',
		];

		try {
			// Delegate to wikipeg's generator from inside our own so that
			// syntax errors surface here and can be recorded.
			// @phan-suppress-next-line PhanTypeInvalidYieldFrom
			yield from $this->grammar->parse( $text, $args );
			yield [ new EOFTk() ];
		} catch ( SyntaxError $syntaxError ) {
			$this->lastError = $syntaxError;
			throw $syntaxError;
		}
	}

	/**
	 * Tokenize the whole text synchronously, starting from the rule named
	 * in $args (or 'start' when none is given).
	 *
	 * Caller-supplied $args take precedence; any of the keys
	 * 'pegTokenizer', 'pipelineId', 'pipelineOffset', 'startRule', 'sol'
	 * and 'env' that are absent are filled in with defaults. When the
	 * environment has profiling enabled, the time spent in a successful
	 * parse is reported to the current profile under 'PEG'.
	 *
	 * @param string $text
	 * @param array $args
	 * - sol: (bool) Whether input should be processed in start-of-line context.
	 *   Defaults to true.
	 * - startRule: (string) which tokenizer rule to tokenize with
	 * @return array|false The token array, or false for a syntax error
	 */
	public function tokenizeSync( string $text, array $args = [] ) {
		// initGrammar() is a no-op when the grammar already exists.
		$this->initGrammar();
		PHPUtils::assertValidUTF8( $text ); // Transitional check for PHP port

		// Array union keeps every key the caller supplied, so each entry
		// below only applies when the caller left that key out.
		$defaults = [
			'pegTokenizer' => $this,
			'pipelineId' => $this->getPipelineId(),
			'pipelineOffset' => $this->offsets['startOffset'] ?? 0,
			'startRule' => 'start',
			'sol' => true,
			'env' => $this->env
		];
		$args = $args + $defaults;

		$profile = $this->env->profiling() ? $this->env->getCurrentProfile() : null;
		$startedAt = $profile ? microtime( true ) : null;

		try {
			$tokens = $this->grammar->parse( $text, $args );
		} catch ( SyntaxError $syntaxError ) {
			$this->lastError = $syntaxError;
			return false;
		}

		if ( $profile ) {
			// microtime() reports seconds; the profile is given milliseconds.
			$elapsedMs = 1000 * ( microtime( true ) - $startedAt );
			$profile->bumpTimeUse( 'PEG', $elapsedMs, 'PEG' );
		}
		return $tokens;
	}

	/**
	 * Tokenize a string using a specific grammar rule, with the pipeline
	 * offset fixed at 0.
	 *
	 * @param string $text The input text
	 * @param string $rule The rule name
	 * @param bool $sol Start of line flag
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeAs( string $text, string $rule, bool $sol ) {
		return $this->tokenizeSync( $text, [
			'startRule' => $rule,
			'sol' => $sol,
			'pipelineOffset' => 0
		] );
	}

	/**
	 * Tokenize a URL with the 'url' rule in start-of-line context.
	 *
	 * @param string $text
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeURL( string $text ) {
		$sol = true;
		return $this->tokenizeAs( $text, 'url', $sol );
	}

	/**
	 * Tokenize table cell attributes with the 'row_syntax_table_args' rule.
	 *
	 * @param string $text
	 * @param bool $sol
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeTableCellAttributes( string $text, bool $sol ) {
		$rule = 'row_syntax_table_args';
		return $this->tokenizeAs( $text, $rule, $sol );
	}

	/**
	 * Describe the most recently recorded syntax error in a form suitable
	 * for a log entry. Intended to be called after a tokenize method has
	 * returned false.
	 *
	 * @return string|false The message, or false when no error has been recorded
	 */
	public function getLastErrorLogMessage() {
		if ( !$this->lastError ) {
			return false;
		}
		return "Tokenizer parse error at input location {$this->lastError->location}: " .
			$this->lastError->getMessage();
	}

	/**
	 * Calls TokenizerUtils::resetAnnotationIncludeRegex() and then
	 * delegates to the parent implementation.
	 *
	 * @inheritDoc
	 */
	public function resetState( array $opts ): void {
		TokenizerUtils::resetAnnotationIncludeRegex();
		parent::resetState( $opts );
	}
}