Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
66.06% |
72 / 109 |
|
25.00% |
2 / 8 |
CRAP | |
0.00% |
0 / 1 |
ConstrainedText | |
66.06% |
72 / 109 |
|
25.00% |
2 / 8 |
106.75 | |
0.00% |
0 / 1 |
escapeLine | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
3 | |||
__construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
cast | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
escape | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
equals | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
matches | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
fromSelSer | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
5.01 | |||
fromSelSerImpl | |
75.86% |
44 / 58 |
|
0.00% |
0 / 1 |
27.20 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt\ConstrainedText; |
5 | |
6 | use Wikimedia\Parsoid\Config\Env; |
7 | use Wikimedia\Parsoid\DOM\Element; |
8 | use Wikimedia\Parsoid\DOM\Node; |
9 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
10 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
11 | use Wikimedia\Parsoid\Utils\DOMCompat; |
12 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
13 | use Wikimedia\Parsoid\Utils\DOMUtils; |
14 | use Wikimedia\Parsoid\Utils\PHPUtils; |
15 | use Wikimedia\Parsoid\Utils\Utils; |
16 | |
/**
 * A chunk of wikitext output.  This base class contains the
 * wikitext and a pointer to the DOM node which is responsible for
 * generating it.  Subclasses can add additional properties to record
 * context or wikitext boundary restrictions for proper escaping.
 * The chunk is serialized with the `escape` method, which might
 * alter the wikitext in order to ensure it doesn't run together
 * with its context (usually by adding `<nowiki>` tags).
 *
 * The main entry point is the static function `ConstrainedText::escapeLine()`.
 */
class ConstrainedText {
	/**
	 * This adds necessary escapes to a line of chunks.  We provide
	 * the `ConstrainedText::escape()` method with its left and right
	 * context, and it can determine what escapes are needed.
	 *
	 * The `line` parameter is an array of `ConstrainedText` *chunks*
	 * which make up a line (or part of a line, in some cases of nested
	 * processing).
	 *
	 * @param ConstrainedText[] $line List-indexed (0..n-1) array of chunks.
	 * @return string The concatenation of every chunk's escape prefix,
	 *   text and escape suffix, in order.
	 */
	public static function escapeLine( array $line ): string {
		// The left context will be precise (that is, it is the result
		// of `ConstrainedText::escape()` and will include any escapes
		// triggered by chunks on the left), but the right context
		// is just the (unescaped) text property from the chunk.
		// As we work left to right we will piece together a fully-escaped
		// string.  Be careful not to shoot yourself in the foot -- if the
		// escaped text is significantly different from the chunk's `text`
		// property, the preceding chunk may not have made the correct
		// decisions about emitting an escape suffix.  We could solve
		// this by looping until the state converges (or until we detect
		// a loop) but for now let's hope that's not necessary.
		$state = new State( $line );
		$safeLeft = '';
		// $line is a by-value local which is never modified here, so its
		// size is loop-invariant.
		$numChunks = count( $line );
		// The loop counter lives on the state object so that each chunk's
		// `escape` can see its own position in the line.
		for ( $state->pos = 0; $state->pos < $numChunks; $state->pos++ ) {
			$chunk = $line[$state->pos];
			// Drop the current chunk's own text from the front of the right
			// context, then process the escapes for this chunk given the
			// already-escaped previous chunks.
			$state->rightContext = substr( $state->rightContext, strlen( $chunk->text ) );
			$thisEscape = $chunk->escape( $state );
			$state->leftContext .=
				( $thisEscape->prefix ?? '' ) .
				$thisEscape->text .
				( $thisEscape->suffix ?? '' );
			if ( $thisEscape->greedy ) {
				// protect the left context: this will be matched greedily
				// by this chunk, so there's no chance that a subsequent
				// token will include this in its prefix.
				$safeLeft .= $state->leftContext;
				$state->leftContext = '';
			}
		}
		// right context should be empty here.
		return $safeLeft . $state->leftContext;
	}

	/**
	 * The wikitext string associated with this chunk.
	 * @var string
	 */
	public $text;
	/**
	 * The DOM Node associated with this chunk.
	 * @var Node
	 */
	public $node;
	/**
	 * The prefix string to add if the start of the chunk doesn't match its
	 * constraints.
	 * @var ?string
	 */
	public $prefix;
	/**
	 * The suffix string to add if the end of the chunk doesn't match its
	 * constraints.
	 * @var ?string
	 */
	public $suffix;
	/**
	 * Does this chunk come from selser?
	 * @var bool
	 */
	public $selser;
	/**
	 * Suppress separators?
	 * @var bool
	 */
	public $noSep;

	/**
	 * Build a chunk from an options array.  `prefix` and `suffix` default
	 * to null when absent; `selser` and `noSep` always start out false.
	 *
	 * @param array{text:string,node:Node,prefix?:string,suffix?:string} $args Options.
	 */
	public function __construct( array $args ) {
		$this->text = $args['text'];
		$this->node = $args['node'];
		$this->prefix = $args['prefix'] ?? null;
		$this->suffix = $args['suffix'] ?? null;
		$this->selser = false;
		$this->noSep = false;
	}

	/**
	 * Ensure that the argument `o`, which is perhaps a string, is an
	 * instance of `ConstrainedText`.
	 *
	 * An existing `ConstrainedText` (of any subclass) is returned as-is;
	 * anything else is wrapped in a new plain `ConstrainedText`, with null
	 * treated as the empty string.
	 *
	 * @param string|ConstrainedText|null $o
	 * @param Node $node
	 *  The {@link Node} corresponding to `o`.
	 * @return ConstrainedText
	 */
	public static function cast( $o, Node $node ): ConstrainedText {
		if ( $o instanceof ConstrainedText ) {
			return $o;
		}
		return new ConstrainedText( [ 'text' => $o ?? '', 'node' => $node ] );
	}

	/**
	 * Use the provided `state`, which gives context and access to the entire
	 * list of chunks, to determine the proper escape prefix/suffix.
	 * Returns an object with a `text` property as well as optional
	 * `prefix` and `suffix` properties giving desired escape strings.
	 *
	 * @param State $state Context state (unused by this base implementation).
	 * @return Result
	 */
	public function escape( State $state ): Result {
		// default implementation: no escaping; the chunk's own text, prefix
		// and suffix are passed through unchanged.
		return new Result( $this->text, $this->prefix, $this->suffix );
	}

	/**
	 * Simple equality.  This enforces type equality
	 * (ie subclasses are not equal): apart from identity, two chunks are
	 * equal only if both are exactly `ConstrainedText` and their `text`
	 * is identical.  Other properties are not compared.
	 *
	 * @param ConstrainedText $ct
	 * @return bool
	 */
	public function equals( ConstrainedText $ct ): bool {
		return $this === $ct || (
			get_class( $this ) === self::class &&
			get_class( $ct ) === self::class &&
			$this->text === $ct->text
		);
	}

	/**
	 * Useful shortcut: execute a regular expression on the raw wikitext.
	 *
	 * @param string $re PCRE pattern, including delimiters.
	 * @param Env $env Used to log the PCRE error if matching fails.
	 * @return array|null
	 *  An array containing the matched results or null if there were no matches.
	 * @throws \Error if `preg_match` reports a failure.
	 */
	public function matches( string $re, Env $env ): ?array {
		$r = preg_match( $re, $this->text, $m );
		if ( $r === false ) {
			// preg_last_error_msg() is available from PHP 8.0.0 onwards, so
			// the version test must include 8.0.0 itself.
			if ( version_compare( PHP_VERSION, '8.0.0', '>=' ) ) {
				$error_msg = preg_last_error_msg();
			} else {
				$error_msg = "preg_last_error: " . preg_last_error();
			}
			$env->log( 'error', $error_msg, $re, $this->text );
			throw new \Error( 'Bad regular expression' );
		}
		return $r === 0 ? null : $m;
	}

	/**
	 * SelSer support: when we come across an unmodified node during
	 * selective serialization, we know we can use the original wikitext
	 * for that node unmodified.  *But* there may be boundary conditions
	 * on the left and right sides of the selser'ed text which are going
	 * to require escaping.
	 *
	 * So rather than turning the node into a plain old `ConstrainedText`
	 * chunk, allow subclasses of `ConstrainedText` to register as potential
	 * handlers of selser nodes.  A selser'ed magic link, for example,
	 * will then turn into a `MagicLinkText` and thus be able to enforce
	 * the proper boundary constraints.
	 *
	 * @param string $text
	 * @param Element $node
	 * @param DataParsoid $dataParsoid
	 * @param Env $env
	 * @param array $opts May contain boolean `ignorePrefix` / `ignoreSuffix`.
	 * @return ConstrainedText[] Chunks, each flagged with `selser = true`.
	 * @throws \Error if no registered type produced a result.
	 */
	public static function fromSelSer(
		string $text, Element $node, DataParsoid $dataParsoid,
		Env $env, array $opts = []
	): array {
		// Main dispatch point: iterate through registered subclasses, asking
		// each if they can handle this node (by invoking `fromSelSerImpl`).

		// We define parent types before subtypes, so search the list backwards
		// to be sure we check subtypes before parent types.
		$types = self::$types;
		for ( $i = count( $types ) - 1; $i >= 0; $i-- ) {
			$ct = call_user_func(
				[ $types[$i], 'fromSelSerImpl' ],
				$text, $node, $dataParsoid, $env, $opts
			);
			if ( !$ct ) {
				// This type declined the node; try the next one.
				continue;
			}
			// Implementations may return a single chunk or a list of chunks.
			if ( !is_array( $ct ) ) {
				$ct = [ $ct ];
			}
			// tag these chunks as coming from selser
			foreach ( $ct as $t ) {
				$t->selser = true;
			}
			return $ct;
		}
		// ConstrainedText::fromSelSerImpl should handle everything which reaches it
		// so nothing should make it here.
		throw new \Error( 'Should never happen.' );
	}

	/**
	 * Base case: the given node type does not correspond to a special
	 * `ConstrainedText` subclass.  We still have to be careful: the leftmost
	 * (rightmost) children of `node` may still be exposed to our left (right)
	 * context.  If so (ie, their DSR bounds coincide) split the selser text
	 * and emit multiple `ConstrainedText` chunks to preserve the proper
	 * boundary conditions.
	 *
	 * @param string $text
	 * @param Element $node
	 * @param DataParsoid $dataParsoid
	 * @param Env $env
	 * @param array $opts May contain boolean `ignorePrefix` / `ignoreSuffix`.
	 * @return ConstrainedText|ConstrainedText[]
	 *  This base implementation always returns a non-empty array; the
	 *  dispatcher (`fromSelSer`) also tolerates a single chunk or a falsy
	 *  value from other implementations.
	 */
	protected static function fromSelSerImpl(
		string $text, Element $node, DataParsoid $dataParsoid,
		Env $env, array $opts
	) {
		// look at leftmost and rightmost children, it may be that we need
		// to turn these into ConstrainedText chunks in order to preserve
		// the proper escape conditions on the prefix/suffix text.
		$firstChild = DiffDOMUtils::firstNonDeletedChild( $node );
		$lastChild = DiffDOMUtils::lastNonDeletedChild( $node );
		$firstChildDp = $firstChild instanceof Element ?
			DOMDataUtils::getDataParsoid( $firstChild ) : null;
		$lastChildDp = $lastChild instanceof Element ?
			DOMDataUtils::getDataParsoid( $lastChild ) : null;
		$prefixChunks = [];
		$suffixChunks = [];
		$ignorePrefix = $opts['ignorePrefix'] ?? false;
		$ignoreSuffix = $opts['ignoreSuffix'] ?? false;
		// check to see if first child's DSR start is the same as this node's
		// DSR start.  If so, the first child is exposed to the (modified)
		// left-hand context, and so recursively convert it to the proper
		// list of specialized chunks.
		if (
			!$ignorePrefix &&
			$firstChildDp && Utils::isValidDSR( $firstChildDp->dsr ?? null ) &&
			$dataParsoid->dsr->start === $firstChildDp->dsr->start
		) {
			DOMUtils::assertElt( $firstChild ); // implied by $firstChildDp
			$len = $firstChildDp->dsr->length();
			if ( $len < 0 ) { // T254412: Bad DSR
				$env->log( "error/html2wt/dsr",
					"Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ),
					"Node: " . DOMCompat::getOuterHTML( $firstChild ) );
			} else {
				if ( $len > strlen( $text ) ) { // T254412: Bad DSR
					$env->log( "error/html2wt/dsr",
						"Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ),
						"Node: " . DOMCompat::getOuterHTML( $firstChild ) );
					// Clamp so we never slice past the end of the text.
					$len = strlen( $text );
				}
				$prefixChunks = self::fromSelSer(
					substr( $text, 0, $len ), $firstChild, $firstChildDp, $env,
					// this child node's right context will be protected:
					[ 'ignoreSuffix' => true ]
				);
				$text = substr( $text, $len );
			}
		}
		// check to see if last child's DSR end is the same as this node's
		// DSR end.  If so, the last child is exposed to the (modified)
		// right-hand context, and so recursively convert it to the proper
		// list of specialized chunks.
		if (
			!$ignoreSuffix && $lastChild !== $firstChild &&
			$lastChildDp && Utils::isValidDSR( $lastChildDp->dsr ?? null ) &&
			$dataParsoid->dsr->end === $lastChildDp->dsr->end
		) {
			DOMUtils::assertElt( $lastChild ); // implied by $lastChildDp
			$len = $lastChildDp->dsr->length();
			if ( $len < 0 ) { // T254412: Bad DSR
				$env->log( "error/html2wt/dsr",
					"Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ),
					"Node: " . DOMCompat::getOuterHTML( $lastChild ) );
			} else {
				if ( $len > strlen( $text ) ) { // T254412: Bad DSR
					$env->log( "error/html2wt/dsr",
						"Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ),
						"Node: " . DOMCompat::getOuterHTML( $lastChild ) );
					// Clamp so we never slice before the start of the text.
					$len = strlen( $text );
				}
				// Split at an explicit non-negative offset rather than with
				// a negative one: `-$len` is 0 when $len is 0, which would
				// make substr() hand the *whole* text to the last child and
				// leave nothing for this node.  $len <= strlen( $text ) is
				// guaranteed by the clamp above.
				$splitAt = strlen( $text ) - $len;
				$suffixChunks = self::fromSelSer(
					substr( $text, $splitAt ), $lastChild, $lastChildDp, $env,
					// this child node's left context will be protected:
					[ 'ignorePrefix' => true ]
				);
				$text = substr( $text, 0, $splitAt );
			}
		}
		// glue together prefixChunks, whatever's left of `text`, and suffixChunks
		$chunks = [ self::cast( $text, $node ) ];
		$chunks = array_merge( $prefixChunks, $chunks, $suffixChunks );
		// top-level chunks only:
		if ( !( $ignorePrefix || $ignoreSuffix ) ) {
			// ensure that the first chunk belongs to `node` in order to
			// emit separators correctly before `node`
			if ( $chunks[0]->node !== $node ) {
				array_unshift( $chunks, self::cast( '', $node ) );
			}
			// set 'noSep' flag on all but the first chunk, so we don't get
			// extra separators from `SSP.emitChunk`
			for ( $i = 1, $n = count( $chunks ); $i < $n; $i++ ) {
				$chunks[$i]->noSep = true;
			}
		}
		return $chunks;
	}

	/**
	 * List of types we attempt `fromSelSer` with.  This should include all the
	 * concrete subclasses of `ConstrainedText` (`RegExpConstrainedText` is
	 * missing since it is an abstract class).  We also include the
	 * `ConstrainedText` base class itself as the first element, as a little
	 * bit of a hack: the list is searched backwards, so the base class is
	 * tried last and its `fromSelSerImpl` acts as the catch-all, which
	 * simplifies `ConstrainedText::fromSelSer()`.
	 * @var class-string[]
	 */
	private static $types = [
		// Base class is first, as a special case
		self::class,
		// All concrete subclasses of ConstrainedText
		WikiLinkText::class, ExtLinkText::class, AutoURLLinkText::class,
		MagicLinkText::class, LanguageVariantText::class
	];
}