Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
65.09% |
69 / 106 |
|
25.00% |
2 / 8 |
CRAP | |
0.00% |
0 / 1 |
ConstrainedText | |
65.09% |
69 / 106 |
|
25.00% |
2 / 8 |
112.49 | |
0.00% |
0 / 1 |
escapeLine | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
3 | |||
__construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
cast | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
escape | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
equals | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
matches | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
fromSelSer | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
5.01 | |||
fromSelSerImpl | |
75.00% |
42 / 56 |
|
0.00% |
0 / 1 |
27.89 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt\ConstrainedText; |
5 | |
6 | use Wikimedia\Parsoid\Config\Env; |
7 | use Wikimedia\Parsoid\DOM\Element; |
8 | use Wikimedia\Parsoid\DOM\Node; |
9 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
10 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
11 | use Wikimedia\Parsoid\Utils\DOMCompat; |
12 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
13 | use Wikimedia\Parsoid\Utils\PHPUtils; |
14 | use Wikimedia\Parsoid\Utils\Utils; |
15 | |
16 | /** |
17 | * A chunk of wikitext output. This base class contains the |
18 | * wikitext and a pointer to the DOM node which is responsible for |
19 | * generating it. Subclasses can add additional properties to record |
20 | * context or wikitext boundary restrictions for proper escaping. |
21 | * The chunk is serialized with the `escape` method, which might |
22 | * alter the wikitext in order to ensure it doesn't run together |
23 | * with its context (usually by adding `<nowiki>` tags). |
24 | * |
25 | * The main entry point is the static function `ConstrainedText::escapeLine()`. |
26 | */ |
27 | class ConstrainedText { |
/**
 * Serialize a line of chunks to a string, adding whatever escapes each
 * chunk asks for.
 *
 * Every chunk's `ConstrainedText#escape` method is called in order with
 * a shared `State` describing the text to its left and right, and the
 * resulting prefix/text/suffix triples are concatenated.
 *
 * `$line` is a list of `ConstrainedText` *chunks* making up a line (or
 * part of a line, in some cases of nested processing).
 *
 * @param ConstrainedText[] $line
 * @return string
 */
public static function escapeLine( array $line ): string {
	// Contexts are asymmetric.  The left context is exact: it is built
	// from the output of `ConstrainedText#escape`, so it already holds
	// any escapes emitted by earlier chunks.  The right context is only
	// the raw (unescaped) `text` of the chunks still to come.  If a
	// chunk's escaped form ends up very different from its `text`, the
	// chunk before it may have chosen its suffix based on stale
	// information.  Iterating to a fixed point would solve that, but so
	// far a single left-to-right pass is assumed to be good enough.
	$state = new State( $line );
	$protected = '';
	$numChunks = count( $line );
	$state->pos = 0;
	while ( $state->pos < $numChunks ) {
		$current = $line[$state->pos];
		// The current chunk is no longer "to the right": strip its raw
		// text off the front of the right context before escaping it.
		$state->rightContext = substr(
			$state->rightContext, strlen( $current->text )
		);
		$escaped = $current->escape( $state );
		$pieces = ( $escaped->prefix ?? '' ) . $escaped->text . ( $escaped->suffix ?? '' );
		$state->leftContext .= $pieces;
		if ( $escaped->greedy ) {
			// A greedy chunk swallows everything to its left, so no later
			// chunk can pull that text into its own prefix match.  Move it
			// out of the live left context.
			$protected .= $state->leftContext;
			$state->leftContext = '';
		}
		$state->pos++;
	}
	// The right context is expected to be exhausted at this point.
	return $protected . $state->leftContext;
}
74 | |
75 | /** |
76 | * The wikitext string associated with this chunk. |
77 | * @var string |
78 | */ |
79 | public $text; |
80 | /** |
81 | * The DOM Node associated with this chunk. |
82 | * @var Node |
83 | */ |
84 | public $node; |
85 | /** |
86 | * The prefix string to add if the start of the chunk doesn't match its |
87 | * constraints. |
88 | * @var ?string |
89 | */ |
90 | public $prefix; |
91 | /** |
92 | * The suffix string to add if the end of the chunk doesn't match its |
93 | * constraints. |
94 | * @var ?string |
95 | */ |
96 | public $suffix; |
97 | /** |
98 | * Does this chunk come from selser? |
99 | * @var bool |
100 | */ |
101 | public $selser; |
102 | /** |
103 | * Suppress separators? |
104 | * @var bool |
105 | */ |
106 | public $noSep; |
107 | |
/**
 * Build a chunk from an options array.  `text` and `node` are required;
 * `prefix` and `suffix` default to null when absent.  The `selser` and
 * `noSep` flags always start out false.
 *
 * @param array{text:string,node:Node,prefix?:?string,suffix?:?string} $args Options.
 */
public function __construct( array $args ) {
	// Required keys
	[ 'text' => $this->text, 'node' => $this->node ] = $args;
	// Optional escape strings
	$this->prefix = $args['prefix'] ?? null;
	$this->suffix = $args['suffix'] ?? null;
	// Flags are set later by whoever produces the chunk
	$this->selser = false;
	$this->noSep = false;
}
119 | |
/**
 * Coerce `$o` to a `ConstrainedText`.  An existing instance is returned
 * as-is; anything else (normally a string, with null treated as the
 * empty string) is wrapped in a new plain `ConstrainedText`.
 *
 * @param string|ConstrainedText $o
 * @param Node $node
 *   The {@link Node} corresponding to `$o`; only used when wrapping.
 * @return ConstrainedText
 */
public static function cast( $o, Node $node ): ConstrainedText {
	return ( $o instanceof ConstrainedText )
		? $o
		: new ConstrainedText( [ 'text' => $o ?? '', 'node' => $node ] );
}
134 | |
/**
 * Compute the escaped form of this chunk.  `$state` supplies the
 * surrounding context and the full chunk list so that an implementation
 * can pick an appropriate escape prefix/suffix.
 *
 * The result carries a `text` property plus optional `prefix` and
 * `suffix` escape strings.
 *
 * @param State $state Context state
 * @return Result
 */
public function escape( State $state ): Result {
	// Base behaviour: the state is not consulted; the chunk's own text,
	// prefix and suffix are passed through untouched.
	$unescaped = new Result( $this->text, $this->prefix, $this->suffix );
	return $unescaped;
}
147 | |
/**
 * Simple equality check.  Two distinct chunks are equal only when both
 * are exactly `ConstrainedText` (subclass instances never compare equal
 * to anything but themselves) and their `text` is identical.
 *
 * @param ConstrainedText $ct
 * @return bool
 */
public function equals( ConstrainedText $ct ): bool {
	// Identity always wins, whatever the concrete class.
	if ( $ct === $this ) {
		return true;
	}
	// Type equality is enforced: either side being a subclass means "no".
	if ( get_class( $this ) !== self::class || get_class( $ct ) !== self::class ) {
		return false;
	}
	return $ct->text === $this->text;
}
161 | |
/**
 * Useful shortcut: execute a regular expression on the raw wikitext.
 *
 * @param string $re
 * @param Env $env Used to log the PCRE failure before throwing.
 * @return array|null
 *   An array containing the matched results or null if there were no matches.
 * @throws \Error If `preg_match` itself fails (returns false).
 */
public function matches( string $re, Env $env ): ?array {
	$r = preg_match( $re, $this->text, $m );
	if ( $r === false ) {
		// preg_last_error_msg() was added in PHP 8.0.0.  Probe for the
		// function itself rather than comparing version strings, so that
		// 8.0.0 is not excluded by an off-by-one in the comparison.
		if ( function_exists( 'preg_last_error_msg' ) ) {
			$error_msg = preg_last_error_msg();
		} else {
			$error_msg = "preg_last_error: " . preg_last_error();
		}
		$env->log( 'error', $error_msg, $re, $this->text );
		throw new \Error( 'Bad regular expression' );
	}
	// preg_match returns 0 for "no match" and 1 for "matched".
	return $r === 0 ? null : $m;
}
182 | |
/**
 * SelSer support.  When selective serialization meets an unmodified
 * node, the node's original wikitext can be reused verbatim.  *But* the
 * left and right edges of that reused text may still need escaping
 * against their (possibly changed) surroundings.
 *
 * Instead of always producing a plain `ConstrainedText`, subclasses of
 * `ConstrainedText` are given the chance to claim the node.  A selser'ed
 * magic link, for example, becomes a `MagicLinkText` and can therefore
 * enforce the right boundary constraints.
 *
 * @param string $text
 * @param Element $node
 * @param DataParsoid $dataParsoid
 * @param Env $env
 * @param array $opts
 * @return ConstrainedText[]
 */
public static function fromSelSer(
	string $text, Element $node, DataParsoid $dataParsoid,
	Env $env, array $opts = []
): array {
	// Dispatch: offer the node to each registered type's `fromSelSerImpl`
	// and take the first non-empty answer.  The registry lists parent
	// types ahead of their subtypes, so walk it in reverse to let the
	// subtypes have the first say.
	foreach ( array_reverse( self::$types ) as $candidate ) {
		$result = $candidate::fromSelSerImpl(
			$text, $node, $dataParsoid, $env, $opts
		);
		if ( $result ) {
			$chunks = is_array( $result ) ? $result : [ $result ];
			// Mark every produced chunk as originating from selser.
			foreach ( $chunks as $chunk ) {
				$chunk->selser = true;
			}
			return $chunks;
		}
	}
	// ConstrainedText::fromSelSerImpl should handle everything which reaches it
	// so nothing should make it here.
	throw new \Error( 'Should never happen.' );
}
233 | |
/**
 * Base case: the given node type does not correspond to a special
 * `ConstrainedText` subclass. We still have to be careful: the leftmost
 * (rightmost) children of `node` may still be exposed to our left (right)
 * context. If so (ie, their DSR bounds coincide) split the selser text
 * and emit multiple `ConstrainedText` chunks to preserve the proper
 * boundary conditions.
 *
 * A child whose DSR length is negative is logged and left unsplit; a
 * length exceeding the remaining text is logged and clamped to it.
 *
 * @param string $text
 * @param Element $node
 * @param DataParsoid $dataParsoid
 * @param Env $env
 * @param array $opts
 *   Recognized keys: `ignorePrefix` and `ignoreSuffix` (both default to
 *   false), which suppress splitting on the respective side.
 * @return ConstrainedText|ConstrainedText[]
 */
protected static function fromSelSerImpl(
	string $text, Element $node, DataParsoid $dataParsoid,
	Env $env, array $opts
) {
	// look at leftmost and rightmost children, it may be that we need
	// to turn these into ConstrainedText chunks in order to preserve
	// the proper escape conditions on the prefix/suffix text.
	$firstChild = DiffDOMUtils::firstNonDeletedChild( $node );
	$lastChild = DiffDOMUtils::lastNonDeletedChild( $node );
	$firstChildDp = $firstChild instanceof Element ?
		DOMDataUtils::getDataParsoid( $firstChild ) : null;
	$lastChildDp = $lastChild instanceof Element ?
		DOMDataUtils::getDataParsoid( $lastChild ) : null;
	$prefixChunks = [];
	$suffixChunks = [];
	$ignorePrefix = $opts['ignorePrefix'] ?? false;
	$ignoreSuffix = $opts['ignoreSuffix'] ?? false;
	// check to see if first child's DSR start is the same as this node's
	// DSR start. If so, the first child is exposed to the (modified)
	// left-hand context, and so recursively convert it to the proper
	// list of specialized chunks.
	if (
		!$ignorePrefix &&
		$firstChildDp && Utils::isValidDSR( $firstChildDp->dsr ?? null ) &&
		$dataParsoid->dsr->start === $firstChildDp->dsr->start
	) {
		'@phan-var Element $firstChild'; // @var Element $firstChild - implied by $firstChildDp
		$len = $firstChildDp->dsr->length();
		if ( $len < 0 ) { // T254412: Bad DSR
			$env->log( "error/html2wt/dsr",
				"Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ),
				"Node: " . DOMCompat::getOuterHTML( $firstChild ) );
		} else {
			if ( $len > strlen( $text ) ) { // T254412: Bad DSR
				$env->log( "error/html2wt/dsr",
					"Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ),
					"Node: " . DOMCompat::getOuterHTML( $firstChild ) );
				$len = strlen( $text );
			}
			$prefixChunks = self::fromSelSer(
				substr( $text, 0, $len ), $firstChild, $firstChildDp, $env,
				// this child node's right context will be protected:
				[ 'ignoreSuffix' => true ]
			);
			$text = substr( $text, $len );
		}
	}
	// check to see if last child's DSR end is the same as this node's
	// DSR end. If so, the last child is exposed to the (modified)
	// right-hand context, and so recursively convert it to the proper
	// list of specialized chunks.
	if (
		!$ignoreSuffix && $lastChild !== $firstChild &&
		$lastChildDp && Utils::isValidDSR( $lastChildDp->dsr ?? null ) &&
		$dataParsoid->dsr->end === $lastChildDp->dsr->end
	) {
		'@phan-var Element $lastChild'; // @var Element $lastChild - implied by $lastChildDp
		$len = $lastChildDp->dsr->length();
		if ( $len < 0 ) { // T254412: Bad DSR
			$env->log( "error/html2wt/dsr",
				"Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ),
				"Node: " . DOMCompat::getOuterHTML( $lastChild ) );
		} else {
			if ( $len > strlen( $text ) ) { // T254412: Bad DSR
				$env->log( "error/html2wt/dsr",
					"Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ),
					"Node: " . DOMCompat::getOuterHTML( $lastChild ) );
				$len = strlen( $text );
			}
			// Split at an absolute offset instead of using `-$len`.  With a
			// zero-length child, `-$len` is 0, so `substr( $text, -$len )`
			// would return the *whole* text and `substr( $text, 0, -$len )`
			// the empty string, handing all of the text to the child.
			$splitAt = strlen( $text ) - $len;
			$suffixChunks = self::fromSelSer(
				substr( $text, $splitAt ), $lastChild, $lastChildDp, $env,
				// this child node's left context will be protected:
				[ 'ignorePrefix' => true ]
			);
			$text = substr( $text, 0, $splitAt );
		}
	}
	// glue together prefixChunks, whatever's left of `text`, and suffixChunks
	$chunks = [ self::cast( $text, $node ) ];
	$chunks = array_merge( $prefixChunks, $chunks, $suffixChunks );
	// top-level chunks only:
	if ( !( $ignorePrefix || $ignoreSuffix ) ) {
		// ensure that the first chunk belongs to `node` in order to
		// emit separators correctly before `node`
		if ( $chunks[0]->node !== $node ) {
			array_unshift( $chunks, self::cast( '', $node ) );
		}
		// set 'noSep' flag on all but the first chunk, so we don't get
		// extra separators from `SSP.emitChunk`
		foreach ( $chunks as $i => $t ) {
			if ( $i > 0 ) {
				$t->noSep = true;
			}
		}
	}
	return $chunks;
}
347 | |
348 | /** |
349 | * List of types we attempt `fromSelSer` with. This should include all the |
350 | * concrete subclasses of `ConstrainedText` (`RegExpConstrainedText` is |
351 | * missing since it is an abstract class). We also include the |
352 | `ConstrainedText` class as the first element (even though it is
353 | the base class, not a subclass) as a little bit of a hack: it simplifies
354 | * `ConstrainedText.fromSelSer` by factoring some of its work into |
355 | * `ConstrainedText.fromSelSerImpl`. |
356 | * @var class-string[] |
357 | */ |
358 | private static $types = [ |
359 | // Base class is first, as a special case |
360 | self::class, |
361 | // All concrete subclasses of ConstrainedText |
362 | WikiLinkText::class, ExtLinkText::class, AutoURLLinkText::class, |
363 | MagicLinkText::class, LanguageVariantText::class |
364 | ]; |
365 | } |