Code Coverage |
||||||||||
Classes and Traits |
Functions and Methods |
Lines |
||||||||
Total | |
0.00% |
0 / 1 |
|
25.00% |
2 / 8 |
CRAP | |
65.09% |
69 / 106 |
ConstrainedText | |
0.00% |
0 / 1 |
|
25.00% |
2 / 8 |
117.02 | |
65.09% |
69 / 106 |
escapeLine | |
100.00% |
1 / 1 |
3 | |
100.00% |
14 / 14 |
|||
__construct | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 7 |
|||
cast | |
0.00% |
0 / 1 |
6 | |
0.00% |
0 / 3 |
|||
escape | |
100.00% |
1 / 1 |
1 | |
100.00% |
1 / 1 |
|||
equals | |
0.00% |
0 / 1 |
20 | |
0.00% |
0 / 4 |
|||
matches | |
0.00% |
0 / 1 |
30 | |
0.00% |
0 / 8 |
|||
fromSelSer | |
0.00% |
0 / 1 |
5.01 | |
92.31% |
12 / 13 |
|||
fromSelSerImpl | |
0.00% |
0 / 1 |
27.89 | |
75.00% |
42 / 56 |
<?php | |
declare( strict_types = 1 ); | |
namespace Wikimedia\Parsoid\Html2Wt\ConstrainedText; | |
use Wikimedia\Parsoid\Config\Env; | |
use Wikimedia\Parsoid\DOM\Element; | |
use Wikimedia\Parsoid\DOM\Node; | |
use Wikimedia\Parsoid\NodeData\DataParsoid; | |
use Wikimedia\Parsoid\Utils\DOMCompat; | |
use Wikimedia\Parsoid\Utils\DOMDataUtils; | |
use Wikimedia\Parsoid\Utils\DOMUtils; | |
use Wikimedia\Parsoid\Utils\PHPUtils; | |
use Wikimedia\Parsoid\Utils\Utils; | |
/** | |
* A chunk of wikitext output. This base class contains the | |
* wikitext and a pointer to the DOM node which is responsible for | |
* generating it. Subclasses can add additional properties to record | |
* context or wikitext boundary restrictions for proper escaping. | |
* The chunk is serialized with the `escape` method, which might | |
* alter the wikitext in order to ensure it doesn't run together | |
* with its context (usually by adding `<nowiki>` tags). | |
* | |
* The main entry point is the static function `ConstrainedText::escapeLine()`. | |
*/ | |
class ConstrainedText { | |
/** | |
* This adds necessary escapes to a line of chunks. We provide | |
* the `ConstrainedText#escape` function with its left and right | |
* context, and it can determine what escapes are needed. | |
* | |
* The `line` parameter is an array of `ConstrainedText` *chunks* | |
* which make up a line (or part of a line, in some cases of nested | |
* processing). | |
* | |
* @param ConstrainedText[] $line | |
* @return string | |
*/ | |
public static function escapeLine( array $line ): string { | |
// The left context will be precise (that is, it is the result | |
// of `ConstrainedText#escape` and will include any escapes | |
// triggered by chunks on the left), but the right context | |
// is just the (unescaped) text property from the chunk. | |
// As we work left to right we will piece together a fully-escaped | |
// string. Be careful not to shoot yourself in the foot -- if the | |
// escaped text is significantly different from the chunk's `text` | |
// property, the preceding chunk may not have made the correct | |
// decisions about emitting an escape suffix. We could solve | |
// this by looping until the state converges (or until we detect | |
// a loop) but for now let's hope that's not necessary. | |
$state = new State( $line ); | |
$safeLeft = ''; | |
for ( $state->pos = 0; $state->pos < count( $line ); $state->pos++ ) { | |
$chunk = $line[$state->pos]; | |
// Process the escapes for this chunk, given escaped previous chunk | |
$state->rightContext = substr( $state->rightContext, strlen( $chunk->text ) ); | |
$thisEscape = $chunk->escape( $state ); | |
$state->leftContext .= | |
( $thisEscape->prefix ?? '' ) . | |
$thisEscape->text . | |
( $thisEscape->suffix ?? '' ); | |
if ( $thisEscape->greedy ) { | |
// protect the left context: this will be matched greedily | |
// by this chunk, so there's no chance that a subsequent | |
// token will include this in its prefix. | |
$safeLeft .= $state->leftContext; | |
$state->leftContext = ''; | |
} | |
} | |
// right context should be empty here. | |
return $safeLeft . $state->leftContext; | |
} | |
/** | |
* The wikitext string associated with this chunk. | |
* @var string | |
*/ | |
public $text; | |
/** | |
* The DOM Node associated with this chunk. | |
* @var Node | |
*/ | |
public $node; | |
/** | |
* The prefix string to add if the start of the chunk doesn't match its | |
* constraints. | |
* @var string | |
*/ | |
public $prefix; | |
/** | |
* The suffix string to add if the end of the chunk doesn't match its | |
* constraints. | |
* @var string | |
*/ | |
public $suffix; | |
/** | |
* Does this chunk come from selser? | |
* @var bool | |
*/ | |
public $selser; | |
/** | |
* Suppress separators? | |
* @var bool | |
*/ | |
public $noSep; | |
/** | |
* @param array{text:string,node:Node,prefix?:string,suffix?:string} $args Options. | |
*/ | |
public function __construct( array $args ) { | |
$this->text = $args['text']; | |
$this->node = $args['node']; | |
$this->prefix = $args['prefix'] ?? null; | |
$this->suffix = $args['suffix'] ?? null; | |
$this->selser = false; | |
$this->noSep = false; | |
} | |
/** | |
* Ensure that the argument `o`, which is perhaps a string, is a instance of | |
* `ConstrainedText`. | |
* @param string|ConstrainedText $o | |
* @param Node $node | |
* The {@link Node} corresponding to `o`. | |
* @return ConstrainedText | |
*/ | |
public static function cast( $o, Node $node ): ConstrainedText { | |
if ( $o instanceof ConstrainedText ) { | |
return $o; | |
} | |
return new ConstrainedText( [ 'text' => $o ?? '', 'node' => $node ] ); | |
} | |
/** | |
* Use the provided `state`, which gives context and access to the entire | |
* list of chunks, to determine the proper escape prefix/suffix. | |
* Returns an object with a `text` property as well as optional | |
* `prefix` and 'suffix' properties giving desired escape strings. | |
* @param State $state Context state | |
* @return Result | |
*/ | |
public function escape( State $state ): Result { | |
// default implementation: no escaping, no prefixes or suffixes. | |
return new Result( $this->text, $this->prefix, $this->suffix ); | |
} | |
/** | |
* Simple equality. This enforces type equality | |
* (ie subclasses are not equal). | |
* @param ConstrainedText $ct | |
* @return bool | |
*/ | |
public function equals( ConstrainedText $ct ): bool { | |
return $this === $ct || ( | |
get_class( $this ) === self::class && | |
get_class( $ct ) === self::class && | |
$this->text === $ct->text | |
); | |
} | |
/** | |
* Useful shortcut: execute a regular expression on the raw wikitext. | |
* @param string $re | |
* @param Env $env | |
* @return array|null | |
* An array containing the matched results or null if there were no matches. | |
*/ | |
public function matches( string $re, Env $env ): ?array { | |
$r = preg_match( $re, $this->text, $m ); | |
if ( $r === false ) { | |
if ( version_compare( PHP_VERSION, '8.0.0', '>' ) ) { | |
$error_msg = preg_last_error_msg(); | |
} else { | |
$error_msg = "preg_last_error: " . preg_last_error(); | |
} | |
$env->log( 'error', $error_msg, $re, $this->text ); | |
throw new \Error( 'Bad regular expression' ); | |
} | |
return $r === 0 ? null : $m; | |
} | |
/** | |
* SelSer support: when we come across an unmodified node in during | |
* selective serialization, we know we can use the original wikitext | |
* for that node unmodified. *But* there may be boundary conditions | |
* on the left and right sides of the selser'ed text which are going | |
* to require escaping. | |
* | |
* So rather than turning the node into a plain old `ConstrainedText` | |
* chunk, allow subclasses of `ConstrainedText` to register as potential | |
* handlers of selser nodes. A selser'ed magic link, for example, | |
* will then turn into a `MagicLinkText` and thus be able to enforce | |
* the proper boundary constraints. | |
* | |
* @param string $text | |
* @param Element $node | |
* @param DataParsoid $dataParsoid | |
* @param Env $env | |
* @param array $opts | |
* @return ConstrainedText[] | |
*/ | |
public static function fromSelSer( | |
string $text, Element $node, DataParsoid $dataParsoid, | |
Env $env, array $opts = [] | |
): array { | |
// Main dispatch point: iterate through registered subclasses, asking | |
// each if they can handle this node (by invoking `fromSelSerImpl`). | |
// We define parent types before subtypes, so search the list backwards | |
// to be sure we check subtypes before parent types. | |
$types = self::$types; | |
for ( $i = count( $types ) - 1; $i >= 0; $i-- ) { | |
$ct = call_user_func( | |
[ $types[$i], 'fromSelSerImpl' ], | |
$text, $node, $dataParsoid, $env, $opts | |
); | |
if ( !$ct ) { | |
continue; | |
} | |
if ( !is_array( $ct ) ) { | |
$ct = [ $ct ]; | |
} | |
// tag these chunks as coming from selser | |
foreach ( $ct as $t ) { | |
$t->selser = true; | |
} | |
return $ct; | |
} | |
// ConstrainedText::fromSelSerImpl should handle everything which reaches it | |
// so nothing should make it here. | |
throw new \Error( 'Should never happen.' ); | |
} | |
/** | |
* Base case: the given node type does not correspond to a special | |
* `ConstrainedText` subclass. We still have to be careful: the leftmost | |
* (rightmost) children of `node` may still be exposed to our left (right) | |
* context. If so (ie, their DSR bounds coincide) split the selser text | |
* and emit multiple `ConstrainedText` chunks to preserve the proper | |
* boundary conditions. | |
* | |
* @param string $text | |
* @param Element $node | |
* @param DataParsoid $dataParsoid | |
* @param Env $env | |
* @param array $opts | |
* @return ConstrainedText|ConstrainedText[] | |
*/ | |
protected static function fromSelSerImpl( | |
string $text, Element $node, DataParsoid $dataParsoid, | |
Env $env, array $opts | |
) { | |
// look at leftmost and rightmost children, it may be that we need | |
// to turn these into ConstrainedText chunks in order to preserve | |
// the proper escape conditions on the prefix/suffix text. | |
$firstChild = DOMUtils::firstNonDeletedChild( $node ); | |
$lastChild = DOMUtils::lastNonDeletedChild( $node ); | |
$firstChildDp = $firstChild instanceof Element ? | |
DOMDataUtils::getDataParsoid( $firstChild ) : null; | |
$lastChildDp = $lastChild instanceof Element ? | |
DOMDataUtils::getDataParsoid( $lastChild ) : null; | |
$prefixChunks = []; | |
$suffixChunks = []; | |
$len = null; | |
$ignorePrefix = $opts['ignorePrefix'] ?? false; | |
$ignoreSuffix = $opts['ignoreSuffix'] ?? false; | |
// check to see if first child's DSR start is the same as this node's | |
// DSR start. If so, the first child is exposed to the (modified) | |
// left-hand context, and so recursively convert it to the proper | |
// list of specialized chunks. | |
if ( | |
!$ignorePrefix && | |
$firstChildDp && Utils::isValidDSR( $firstChildDp->dsr ?? null ) && | |
$dataParsoid->dsr->start === $firstChildDp->dsr->start | |
) { | |
DOMUtils::assertElt( $firstChild ); // implied by $firstChildDp | |
$len = $firstChildDp->dsr->length(); | |
if ( $len < 0 ) { // T254412: Bad DSR | |
$env->log( "error/html2wt/dsr", | |
"Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ), | |
"Node: " . DOMCompat::getOuterHTML( $firstChild ) ); | |
} else { | |
if ( $len > strlen( $text ) ) { // T254412: Bad DSR | |
$env->log( "error/html2wt/dsr", | |
"Bad DSR: " . PHPUtils::jsonEncode( $firstChildDp->dsr ), | |
"Node: " . DOMCompat::getOuterHTML( $firstChild ) ); | |
$len = strlen( $text ); | |
} | |
$prefixChunks = self::fromSelSer( | |
substr( $text, 0, $len ), $firstChild, $firstChildDp, $env, | |
// this child node's right context will be protected: | |
[ 'ignoreSuffix' => true ] | |
); | |
$text = substr( $text, $len ); | |
} | |
} | |
// check to see if last child's DSR end is the same as this node's | |
// DSR end. If so, the last child is exposed to the (modified) | |
// right-hand context, and so recursively convert it to the proper | |
// list of specialized chunks. | |
if ( | |
!$ignoreSuffix && $lastChild !== $firstChild && | |
$lastChildDp && Utils::isValidDSR( $lastChildDp->dsr ?? null ) && | |
$dataParsoid->dsr->end === $lastChildDp->dsr->end | |
) { | |
DOMUtils::assertElt( $lastChild ); // implied by $lastChildDp | |
$len = $lastChildDp->dsr->length(); | |
if ( $len < 0 ) { // T254412: Bad DSR | |
$env->log( "error/html2wt/dsr", | |
"Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ), | |
"Node: " . DOMCompat::getOuterHTML( $lastChild ) ); | |
} else { | |
if ( $len > strlen( $text ) ) { // T254412: Bad DSR | |
$env->log( "error/html2wt/dsr", | |
"Bad DSR: " . PHPUtils::jsonEncode( $lastChildDp->dsr ), | |
"Node: " . DOMCompat::getOuterHTML( $lastChild ) ); | |
$len = strlen( $text ); | |
} | |
$suffixChunks = self::fromSelSer( | |
substr( $text, -$len ), $lastChild, $lastChildDp, $env, | |
// this child node's left context will be protected: | |
[ 'ignorePrefix' => true ] | |
); | |
$text = substr( $text, 0, -$len ); | |
} | |
} | |
// glue together prefixChunks, whatever's left of `text`, and suffixChunks | |
$chunks = [ self::cast( $text, $node ) ]; | |
$chunks = array_merge( $prefixChunks, $chunks, $suffixChunks ); | |
// top-level chunks only: | |
if ( !( $ignorePrefix || $ignoreSuffix ) ) { | |
// ensure that the first chunk belongs to `node` in order to | |
// emit separators correctly before `node` | |
if ( $chunks[0]->node !== $node ) { | |
array_unshift( $chunks, self::cast( '', $node ) ); | |
} | |
// set 'noSep' flag on all but the first chunk, so we don't get | |
// extra separators from `SSP.emitChunk` | |
foreach ( $chunks as $i => $t ) { | |
if ( $i > 0 ) { | |
$t->noSep = true; | |
} | |
} | |
} | |
return $chunks; | |
} | |
/** | |
* List of types we attempt `fromSelSer` with. This should include all the | |
* concrete subclasses of `ConstrainedText` (`RegExpConstrainedText` is | |
* missing since it is an abstract class). We also include the | |
* `ConstrainedText` class as the first element (even though it is | |
* an abstract base class) as a little bit of a hack: it simplifies | |
* `ConstrainedText.fromSelSer` by factoring some of its work into | |
* `ConstrainedText.fromSelSerImpl`. | |
* @var class-string[] | |
*/ | |
private static $types = [ | |
// Base class is first, as a special case | |
self::class, | |
// All concrete subclasses of ConstrainedText | |
WikiLinkText::class, ExtLinkText::class, AutoURLLinkText::class, | |
MagicLinkText::class, LanguageVariantText::class | |
]; | |
} |