Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
67.92% |
72 / 106 |
|
25.00% |
2 / 8 |
CRAP | |
0.00% |
0 / 1 |
| ConstrainedText | |
67.92% |
72 / 106 |
|
25.00% |
2 / 8 |
92.80 | |
0.00% |
0 / 1 |
| escapeLine | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
3 | |||
| __construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
| cast | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| escape | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| equals | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
| matches | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| fromSelSer | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
5.01 | |||
| fromSelSerImpl | |
75.86% |
44 / 58 |
|
0.00% |
0 / 1 |
27.20 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Html2Wt\ConstrainedText; |
| 5 | |
| 6 | use Wikimedia\Parsoid\Config\Env; |
| 7 | use Wikimedia\Parsoid\DOM\Element; |
| 8 | use Wikimedia\Parsoid\DOM\Node; |
| 9 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
| 10 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
| 11 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 12 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 13 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 14 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 15 | use Wikimedia\Parsoid\Utils\Utils; |
| 16 | |
| 17 | /** |
| 18 | * A chunk of wikitext output. This base class contains the |
| 19 | * wikitext and a pointer to the DOM node which is responsible for |
| 20 | * generating it. Subclasses can add additional properties to record |
| 21 | * context or wikitext boundary restrictions for proper escaping. |
| 22 | * The chunk is serialized with the `escape` method, which might |
| 23 | * alter the wikitext in order to ensure it doesn't run together |
| 24 | * with its context (usually by adding `<nowiki>` tags). |
| 25 | * |
| 26 | * The main entry point is the static function `ConstrainedText::escapeLine()`. |
| 27 | */ |
| 28 | class ConstrainedText { |
| 29 | /**
| 30 | * This adds necessary escapes to a line of chunks. We provide
| 31 | * the `ConstrainedText#escape` function with its left and right
| 32 | * context, and it can determine what escapes are needed.
| 33 | *
| 34 | * The `line` parameter is an array of `ConstrainedText` *chunks*
| 35 | * which make up a line (or part of a line, in some cases of nested
| 36 | * processing).
| 37 | *
| 38 | * @param ConstrainedText[] $line Chunks to escape, visited from index 0 upwards.
| 39 | * @return string The escaped prefix, text and suffix of every chunk, concatenated in order.
| 40 | */
| 41 | public static function escapeLine( array $line ): string {
| 42 | // The left context will be precise (that is, it is the result
| 43 | // of `ConstrainedText#escape` and will include any escapes
| 44 | // triggered by chunks on the left), but the right context
| 45 | // is just the (unescaped) text property from the chunk.
| 46 | // As we work left to right we will piece together a fully-escaped
| 47 | // string. Be careful not to shoot yourself in the foot -- if the
| 48 | // escaped text is significantly different from the chunk's `text`
| 49 | // property, the preceding chunk may not have made the correct
| 50 | // decisions about emitting an escape suffix. We could solve
| 51 | // this by looping until the state converges (or until we detect
| 52 | // a loop) but for now let's hope that's not necessary.
| 53 | $state = new State( $line );
| 54 | $safeLeft = '';
| 55 | for ( $state->pos = 0; $state->pos < count( $line ); $state->pos++ ) {
| 56 | $chunk = $line[$state->pos];
| 57 | // Process the escapes for this chunk, given the already-escaped previous chunks
| 58 | $state->rightContext = substr( $state->rightContext, strlen( $chunk->text ) ); // drop this chunk's own text from the right context
| 59 | $thisEscape = $chunk->escape( $state );
| 60 | $state->leftContext .=
| 61 | ( $thisEscape->prefix ?? '' ) .
| 62 | $thisEscape->text .
| 63 | ( $thisEscape->suffix ?? '' );
| 64 | if ( $thisEscape->greedy ) {
| 65 | // protect the left context: this will be matched greedily
| 66 | // by this chunk, so there's no chance that a subsequent
| 67 | // token will include this in its prefix.
| 68 | $safeLeft .= $state->leftContext;
| 69 | $state->leftContext = '';
| 70 | }
| 71 | }
| 72 | // right context should be empty here; the result is the protected part plus whatever left context remains.
| 73 | return $safeLeft . $state->leftContext;
| 74 | }
| 75 | |
| 76 | /**
| 77 | * The wikitext string associated with this chunk (taken from `$args['text']` by the constructor).
| 78 | * @var string
| 79 | */
| 80 | public $text;
| 81 | /**
| 82 | * The DOM Node associated with this chunk (taken from `$args['node']` by the constructor).
| 83 | * @var Node
| 84 | */
| 85 | public $node;
| 86 | /**
| 87 | * The prefix string to add if the start of the chunk doesn't match its
| 88 | * constraints. Null when no `prefix` was passed to the constructor.
| 89 | * @var ?string
| 90 | */
| 91 | public $prefix;
| 92 | /**
| 93 | * The suffix string to add if the end of the chunk doesn't match its
| 94 | * constraints. Null when no `suffix` was passed to the constructor.
| 95 | * @var ?string
| 96 | */
| 97 | public $suffix;
| 98 | /**
| 99 | * Does this chunk come from selser? Starts out false; `fromSelSer()` sets it to true on the chunks it returns.
| 100 | * @var bool
| 101 | */
| 102 | public $selser;
| 103 | /**
| 104 | * Suppress separators? Starts out false; `fromSelSerImpl()` sets it on all but the first top-level chunk.
| 105 | * @var bool
| 106 | */
| 107 | public $noSep;
| 108 | |
| 109 | /**
| 110 | * @param array{text:string,node:Node,prefix?:string,suffix?:string} $args Options; a missing `prefix`/`suffix` becomes null, and `selser`/`noSep` always start out false.
| 111 | */
| 112 | public function __construct( array $args ) {
| 113 | $this->text = $args['text'];
| 114 | $this->node = $args['node'];
| 115 | $this->prefix = $args['prefix'] ?? null;
| 116 | $this->suffix = $args['suffix'] ?? null;
| 117 | $this->selser = false;
| 118 | $this->noSep = false;
| 119 | }
| 120 | |
| 121 | /**
| 122 | * Ensure that the argument `o`, which is perhaps a string, is an instance of
| 123 | * `ConstrainedText`. An existing instance is returned as-is; null is treated as ''.
| 124 | * @param string|ConstrainedText $o
| 125 | * @param Node $node
| 126 | * The {@link Node} corresponding to `o` (only used when a new chunk is created).
| 127 | * @return ConstrainedText
| 128 | */
| 129 | public static function cast( $o, Node $node ): ConstrainedText {
| 130 | if ( $o instanceof ConstrainedText ) {
| 131 | return $o;
| 132 | }
| 133 | return new ConstrainedText( [ 'text' => $o ?? '', 'node' => $node ] );
| 134 | }
| 135 | |
| 136 | /**
| 137 | * Use the provided `state`, which gives context and access to the entire
| 138 | * list of chunks, to determine the proper escape prefix/suffix.
| 139 | * Returns an object with a `text` property as well as optional
| 140 | * `prefix` and `suffix` properties giving desired escape strings.
| 141 | * @param State $state Context state (not consulted by this base implementation)
| 142 | * @return Result
| 143 | */
| 144 | public function escape( State $state ): Result {
| 145 | // default implementation: no escaping; the chunk's own text, prefix and suffix are passed through as-is.
| 146 | return new Result( $this->text, $this->prefix, $this->suffix );
| 147 | }
| 148 | |
| 149 | /**
| 150 | * Simple equality: true for the very same object, or when both chunks are exactly
| 151 | * `ConstrainedText` (ie subclasses are not equal) and have identical `text`.
| 152 | * @param ConstrainedText $ct The chunk to compare against.
| 153 | * @return bool Only `text` is compared; `node`, `prefix` and `suffix` are not.
| 154 | */
| 155 | public function equals( ConstrainedText $ct ): bool {
| 156 | return $this === $ct || (
| 157 | get_class( $this ) === self::class &&
| 158 | get_class( $ct ) === self::class &&
| 159 | $this->text === $ct->text
| 160 | );
| 161 | }
| 162 | |
| 163 | /**
| 164 | * Useful shortcut: execute a regular expression on the raw wikitext.
| 165 | * @param string $re Pattern handed unchanged to preg_match().
| 166 | * @param Env $env Used to log the PCRE error message when matching fails.
| 167 | * @return array|null The preg_match() match array, or null if there were no matches.
| 168 | * @throws \Error If preg_match() returns false (the error is logged first).
| 169 | */
| 170 | public function matches( string $re, Env $env ): ?array {
| 171 | $r = preg_match( $re, $this->text, $m );
| 172 | if ( $r === false ) {
| 173 | $env->log( 'error', preg_last_error_msg(), $re, $this->text );
| 174 | throw new \Error( 'Bad regular expression' );
| 175 | }
| 176 | return $r === 0 ? null : $m;
| 177 | }
| 178 | |
| 179 | /**
| 180 | * SelSer support: when we come across an unmodified node during
| 181 | * selective serialization, we know we can use the original wikitext
| 182 | * for that node unmodified. *But* there may be boundary conditions
| 183 | * on the left and right sides of the selser'ed text which are going
| 184 | * to require escaping.
| 185 | *
| 186 | * So rather than turning the node into a plain old `ConstrainedText`
| 187 | * chunk, allow subclasses of `ConstrainedText` to register as potential
| 188 | * handlers of selser nodes. A selser'ed magic link, for example,
| 189 | * will then turn into a `MagicLinkText` and thus be able to enforce
| 190 | * the proper boundary constraints.
| 191 | *
| 192 | * @param string $text
| 193 | * @param Element $node
| 194 | * @param DataParsoid $dataParsoid
| 195 | * @param Env $env
| 196 | * @param array $opts Passed through unchanged to each `fromSelSerImpl`.
| 197 | * @return ConstrainedText[] Chunks from the first handler giving a truthy result, each flagged `selser`.
| 198 | */
| 199 | public static function fromSelSer(
| 200 | string $text, Element $node, DataParsoid $dataParsoid,
| 201 | Env $env, array $opts = []
| 202 | ): array {
| 203 | // Main dispatch point: iterate through registered subclasses, asking
| 204 | // each if they can handle this node (by invoking `fromSelSerImpl`).
| 205 | 
| 206 | // We define parent types before subtypes, so search the list backwards
| 207 | // to be sure we check subtypes before parent types.
| 208 | $types = self::$types;
| 209 | for ( $i = count( $types ) - 1; $i >= 0; $i-- ) {
| 210 | $ct = call_user_func(
| 211 | [ $types[$i], 'fromSelSerImpl' ],
| 212 | $text, $node, $dataParsoid, $env, $opts
| 213 | );
| 214 | if ( !$ct ) {
| 215 | continue; // this type declined the node; try the next one
| 216 | }
| 217 | if ( !is_array( $ct ) ) {
| 218 | $ct = [ $ct ]; // normalize a single chunk to a one-element list
| 219 | }
| 220 | // tag these chunks as coming from selser
| 221 | foreach ( $ct as $t ) {
| 222 | $t->selser = true;
| 223 | }
| 224 | return $ct;
| 225 | }
| 226 | // ConstrainedText::fromSelSerImpl should handle everything which reaches it
| 227 | // so nothing should make it here.
| 228 | throw new \Error( 'Should never happen.' );
| 229 | }
| 230 | |
| 231 | /**
| 232 | * Base case: the given node type does not correspond to a special
| 233 | * `ConstrainedText` subclass. We still have to be careful: the leftmost
| 234 | * (rightmost) children of `node` may still be exposed to our left (right)
| 235 | * context. If so (ie, their DSR bounds coincide) split the selser text
| 236 | * and emit multiple `ConstrainedText` chunks to preserve the proper
| 237 | * boundary conditions. A zero-length child DSR splits off an empty chunk.
| 238 | *
| 239 | * @param string $text
| 240 | * @param Element $node
| 241 | * @param DataParsoid $dataParsoid
| 242 | * @param Env $env
| 243 | * @param array $opts Boolean `ignorePrefix` / `ignoreSuffix` keys; both default to false.
| 244 | * @return ConstrainedText|ConstrainedText[] This implementation always returns an array of chunks.
| 245 | */
| 246 | protected static function fromSelSerImpl(
| 247 | string $text, Element $node, DataParsoid $dataParsoid,
| 248 | Env $env, array $opts
| 249 | ) {
| 250 | // look at leftmost and rightmost children, it may be that we need
| 251 | // to turn these into ConstrainedText chunks in order to preserve
| 252 | // the proper escape conditions on the prefix/suffix text.
| 253 | $firstChild = DiffDOMUtils::firstNonDeletedChild( $node );
| 254 | $lastChild = DiffDOMUtils::lastNonDeletedChild( $node );
| 255 | $firstChildDp = $firstChild instanceof Element ?
| 256 | DOMDataUtils::getDataParsoid( $firstChild ) : null;
| 257 | $lastChildDp = $lastChild instanceof Element ?
| 258 | DOMDataUtils::getDataParsoid( $lastChild ) : null;
| 259 | $prefixChunks = [];
| 260 | $suffixChunks = [];
| 261 | $len = null;
| 262 | $ignorePrefix = $opts['ignorePrefix'] ?? false;
| 263 | $ignoreSuffix = $opts['ignoreSuffix'] ?? false;
| 264 | // check to see if first child's DSR start is the same as this node's
| 265 | // DSR start. If so, the first child is exposed to the (modified)
| 266 | // left-hand context, and so recursively convert it to the proper
| 267 | // list of specialized chunks.
| 268 | if (
| 269 | !$ignorePrefix &&
| 270 | $firstChildDp && Utils::isValidDSR( $firstChildDp->dsr ?? null ) &&
| 271 | $dataParsoid->dsr->start === $firstChildDp->dsr->start
| 272 | ) {
| 273 | DOMUtils::assertElt( $firstChild ); // implied by $firstChildDp
| 274 | $len = $firstChildDp->dsr->length();
| 275 | if ( $len < 0 ) { // T254412: Bad DSR
| 276 | $env->log( "error/html2wt/dsr",
| 277 | "Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ),
| 278 | "Node: " . DOMCompat::getOuterHTML( $firstChild ) );
| 279 | } else {
| 280 | if ( $len > strlen( $text ) ) { // T254412: Bad DSR
| 281 | $env->log( "error/html2wt/dsr",
| 282 | "Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ),
| 283 | "Node: " . DOMCompat::getOuterHTML( $firstChild ) );
| 284 | $len = strlen( $text );
| 285 | }
| 286 | $prefixChunks = self::fromSelSer(
| 287 | substr( $text, 0, $len ), $firstChild, $firstChildDp, $env,
| 288 | // this child node's right context will be protected:
| 289 | [ 'ignoreSuffix' => true ]
| 290 | );
| 291 | $text = substr( $text, $len );
| 292 | }
| 293 | }
| 294 | // check to see if last child's DSR end is the same as this node's
| 295 | // DSR end. If so, the last child is exposed to the (modified)
| 296 | // right-hand context, and so recursively convert it to the proper
| 297 | // list of specialized chunks.
| 298 | if (
| 299 | !$ignoreSuffix && $lastChild !== $firstChild &&
| 300 | $lastChildDp && Utils::isValidDSR( $lastChildDp->dsr ?? null ) &&
| 301 | $dataParsoid->dsr->end === $lastChildDp->dsr->end
| 302 | ) {
| 303 | DOMUtils::assertElt( $lastChild ); // implied by $lastChildDp
| 304 | $len = $lastChildDp->dsr->length();
| 305 | if ( $len < 0 ) { // T254412: Bad DSR
| 306 | $env->log( "error/html2wt/dsr",
| 307 | "Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ),
| 308 | "Node: " . DOMCompat::getOuterHTML( $lastChild ) );
| 309 | } else {
| 310 | if ( $len > strlen( $text ) ) { // T254412: Bad DSR
| 311 | $env->log( "error/html2wt/dsr",
| 312 | "Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ),
| 313 | "Node: " . DOMCompat::getOuterHTML( $lastChild ) );
| 314 | $len = strlen( $text );
| 315 | }
| 316 | $suffixChunks = self::fromSelSer( // explicit offset below: substr( $text, -0 ) would return all of $text
| 317 | substr( $text, strlen( $text ) - $len ), $lastChild, $lastChildDp, $env,
| 318 | // this child node's left context will be protected:
| 319 | [ 'ignorePrefix' => true ]
| 320 | );
| 321 | $text = substr( $text, 0, strlen( $text ) - $len );
| 322 | }
| 323 | }
| 324 | // glue together prefixChunks, whatever's left of `text`, and suffixChunks
| 325 | $chunks = [ self::cast( $text, $node ) ];
| 326 | $chunks = array_merge( $prefixChunks, $chunks, $suffixChunks );
| 327 | // top-level chunks only:
| 328 | if ( !( $ignorePrefix || $ignoreSuffix ) ) {
| 329 | // ensure that the first chunk belongs to `node` in order to
| 330 | // emit separators correctly before `node`
| 331 | if ( $chunks[0]->node !== $node ) {
| 332 | array_unshift( $chunks, self::cast( '', $node ) );
| 333 | }
| 334 | // set 'noSep' flag on all but the first chunk, so we don't get
| 335 | // extra separators from `SSP.emitChunk`
| 336 | foreach ( $chunks as $i => $t ) {
| 337 | if ( $i > 0 ) {
| 338 | $t->noSep = true;
| 339 | }
| 340 | }
| 341 | }
| 342 | return $chunks;
| 343 | }
| 344 | |
| 345 | /**
| 346 | * List of types we attempt `fromSelSer` with. This should include all the
| 347 | * concrete subclasses of `ConstrainedText` (`RegExpConstrainedText` is
| 348 | * missing since it is an abstract class). We also include the
| 349 | * `ConstrainedText` class as the first element (even though it is
| 350 | * the base class) as a little bit of a hack: it simplifies
| 351 | * `ConstrainedText::fromSelSer` by factoring some of its work into
| 352 | * `ConstrainedText::fromSelSerImpl`. The list is searched back to front.
| 353 | * @var class-string[]
| 354 | */
| 355 | private static $types = [
| 356 | // Base class is first, as a special case (so it is tried last)
| 357 | self::class,
| 358 | // All concrete subclasses of ConstrainedText
| 359 | WikiLinkText::class, ExtLinkText::class, AutoURLLinkText::class,
| 360 | MagicLinkText::class, LanguageVariantText::class
| 361 | ];
| 362 | } |