Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
98.16% |
586 / 597 |
|
85.29% |
29 / 34 |
CRAP | |
0.00% |
0 / 1 |
Linter | |
98.16% |
586 / 597 |
|
85.29% |
29 / 34 |
255 | |
0.00% |
0 / 1 |
getTagsWithChangedMisnestingBehavior | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
7 | |||
leftMostMisnestedDescendent | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
8 | |||
getMatchingMisnestedNode | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
findEnclosingTemplateName | |
77.78% |
14 / 18 |
|
0.00% |
0 / 1 |
7.54 | |||
findLintDSR | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
5 | |||
hasIdenticalNestedTag | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
5 | |||
hasMisnestableContent | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
10 | |||
endTagOptional | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getHeadingAncestor | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
matchedOpenTagPairExists | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
7.05 | |||
lintTreeBuilderFixup | |
100.00% |
59 / 59 |
|
100.00% |
1 / 1 |
31 | |||
lintFostered | |
100.00% |
28 / 28 |
|
100.00% |
1 / 1 |
11 | |||
lintObsoleteTag | |
100.00% |
38 / 38 |
|
100.00% |
1 / 1 |
17 | |||
lintBogusImageOptions | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
8 | |||
lintDeletableTableTag | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
7 | |||
findMatchingChild | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
hasNoWrapCSS | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
lintPWrapBugWorkaround | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
5 | |||
lintMiscTidyReplacementIssues | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
8 | |||
lintTidyWhitespaceBug | |
98.78% |
81 / 82 |
|
0.00% |
0 / 1 |
29 | |||
lintMultipleUnclosedFormattingTags | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
9 | |||
postProcessLints | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getWikitextListItemAncestor | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
6 | |||
lintMultilineHtmlTableInList | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
5 | |||
lintWikilinksInExtlink | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
11 | |||
recordLargeTablesLint | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
skipNonElementNodes | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
lintLargeTables | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
8 | |||
lintNightModeUnawareBackgroundColor | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
lintMissingAltText | |
100.00% |
25 / 25 |
|
100.00% |
1 / 1 |
9 | |||
lintDuplicateIds | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
4 | |||
logWikitextFixups | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
findLints | |
88.24% |
30 / 34 |
|
0.00% |
0 / 1 |
11.20 | |||
run | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
4.02 |
1 | <?php |
2 | |
3 | declare( strict_types = 1 ); |
4 | |
5 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; |
6 | |
7 | use stdClass; |
8 | use Wikimedia\Assert\UnreachableException; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\DOM\Comment; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Node; |
14 | use Wikimedia\Parsoid\DOM\Text; |
15 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
16 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
17 | use Wikimedia\Parsoid\NodeData\TempData; |
18 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
19 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
20 | use Wikimedia\Parsoid\Utils\DOMCompat; |
21 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
22 | use Wikimedia\Parsoid\Utils\DOMUtils; |
23 | use Wikimedia\Parsoid\Utils\PHPUtils; |
24 | use Wikimedia\Parsoid\Utils\Timing; |
25 | use Wikimedia\Parsoid\Utils\Utils; |
26 | use Wikimedia\Parsoid\Utils\WTUtils; |
27 | use Wikimedia\Parsoid\Wikitext\Consts; |
28 | use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; |
29 | |
30 | /** |
31 | * DOM pass that walks the DOM tree, detects specific wikitext patterns, |
32 | * and emits them as linter events. |
33 | */ |
34 | class Linter implements Wt2HtmlDOMProcessor { |
35 | private ?ParsoidExtensionAPI $extApi = null; |
36 | private ?string $obsoleteTagsRE = null; |
37 | private array $seenIds = []; |
38 | |
39 | /** @var array<string,bool>|null */ |
40 | private ?array $tagsWithChangedMisnestingBehavior = null; |
41 | |
/**
 * Return (computing it on first use) the set of HTML5 tags that behave
 * differently from HTML4 in some misnesting scenarios around wikitext
 * paragraphs.
 *
 * Ex: Input:        <p><small>a</p><p>b</small></p>
 *     Tidy output:  <p><small>a</small></p><p><small>b</small></p>
 *     HTML5 output: <p><small>a</small></p><p><small>b</small></p>
 *
 * So, all good here.
 * But, see how output changes when we use <span> instead
 *
 * Ex: Input:        <p><span>a</p><p>b</span></p>
 *     Tidy output:  <p><span>a</span></p><p><span>b</span></p>
 *     HTML5 output: <p><span>a</span></p><p>b</p>
 *
 * The source wikitext is "<span>a\n\nb</span>". The difference persists even
 * when you have "<span>a\n\n<div>b</div>" or "<span>a\n\n{|\n|x\n|}\nbar".
 *
 * This is because Tidy seems to be doing the equivalent of the
 * HTML5 treebuilder's active formatting element reconstruction step on
 * all *inline* elements. However, HTML5 parsers only do that on formatting
 * elements. So, we need to compute which HTML5 tags are subject to this
 * differential behavior.
 *
 * We compute that by excluding the following tags from the list of all HTML5 tags
 * - If our sanitizer doesn't allow them, they will be escaped => ignore them
 * - HTML4 block tags are excluded (obviously)
 * - Void tags don't matter since they cannot wrap anything (obviously)
 * - Active formatting elements have special handling in the HTML5 tree building
 *   algorithm where they are reconstructed to wrap all originally intended content.
 *   (ex: <small> above)
 *
 * Here is the list of 22 HTML5 tags that are affected:
 *    ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, KBD, MARK,
 *    Q, RB, RP, RT, RTC, RUBY, SAMP, SPAN, SUB, SUP, TIME, VAR
 *
 * https://phabricator.wikimedia.org/T176363#3628173 verifies that this list of
 * tags all demonstrate this behavior.
 *
 * @return array
 * @phan-return array<string,bool>
 */
private function getTagsWithChangedMisnestingBehavior(): array {
	// Already computed on an earlier call: reuse the cached set.
	if ( $this->tagsWithChangedMisnestingBehavior !== null ) {
		return $this->tagsWithChangedMisnestingBehavior;
	}

	// This set is frozen in time. It gets us down to the requisite
	// 22 HTML5 tags above, but shouldn't be used for anything other
	// than that.
	$tidyBlockTags = PHPUtils::makeSet( [
		'div', 'p',
		// tables
		'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td',
		// lists
		'ul', 'ol', 'li', 'dl', 'dt', 'dd',
		// HTML5 heading content
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
		// HTML5 sectioning content
		'article', 'aside', 'nav', 'section', 'footer', 'header',
		'figure', 'figcaption', 'fieldset', 'details', 'blockquote',
		// other
		'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed',
		'map', 'object', 'pre', 'progress', 'video',
	] );

	$affectedTags = [];
	foreach ( Consts::$HTML['HTML5Tags'] as $tag => $unused ) {
		// Tags the sanitizer does not allow are escaped and never reach the DOM.
		if ( !isset( Consts::$Sanitizer['AllowedLiteralTags'][$tag] ) ) {
			continue;
		}
		// Block, formatting and void tags do not show the differential behavior.
		if ( isset( $tidyBlockTags[$tag] ) ||
			isset( Consts::$HTML['FormattingTags'][$tag] ) ||
			isset( Consts::$HTML['VoidTags'][$tag] )
		) {
			continue;
		}
		$affectedTags[$tag] = true;
	}

	$this->tagsWithChangedMisnestingBehavior = $affectedTags;
	return $affectedTags;
}
117 | |
/**
 * Finds a matching node at the "start" of this node.
 *
 * Walks down the first-child chain starting at $node and stops at the
 * first element that settles the search:
 * - a stripped-tag marker meta: it is the result iff its recorded name
 *   equals $match's node name;
 * - an element with $match's node name and syntax ("stx") that has an
 *   auto-inserted start tag: it is the result, unless its end tag was
 *   auto-inserted as well, in which case the search continues from that
 *   element via getMatchingMisnestedNode().
 *
 * @param ?Node $node Where the search starts
 * @param Element $match The element whose counterpart we are looking for
 * @return ?Element The matching element, or null if the chain ends
 *   (or hits a non-element) before a match is found
 */
private function leftMostMisnestedDescendent( ?Node $node, Element $match ): ?Element {
	for ( $cur = $node; $cur instanceof Element; $cur = $cur->firstChild ) {
		if ( DOMUtils::isMarkerMeta( $cur, 'mw:Placeholder/StrippedTag' ) ) {
			$strippedName = DOMDataUtils::getDataParsoid( $cur )->name ?? null;
			if ( $strippedName === DOMCompat::nodeName( $match ) ) {
				return $cur;
			}
			return null;
		}

		if ( DOMCompat::nodeName( $cur ) !== DOMCompat::nodeName( $match ) ) {
			// Different tag: keep descending
			continue;
		}

		$curDp = DOMDataUtils::getDataParsoid( $cur );
		$sameSyntax = ( DOMDataUtils::getDataParsoid( $match )->stx ?? null ) === ( $curDp->stx ?? null );
		if ( $sameSyntax && !empty( $curDp->autoInsertedStart ) ) {
			return empty( $curDp->autoInsertedEnd )
				? $cur
				: $this->getMatchingMisnestedNode( $cur, $match );
		}
	}

	return null;
}
146 | |
/**
 * $node has an 'autoInsertedEnd' flag set on it. We are looking for
 * its matching node that has an 'autoInsertedStart' flag set on it.
 * This happens when the tree-builder fixes up misnested tags.
 * This "adjacency" is wrt the HTML string. In a DOM, this can either
 * be the next sibling OR, it might be the left-most-descendent of
 * $node's parent's sibling (and so on up the ancestor chain).
 *
 * @param Node $node Node to search from
 * @param Element $match The element whose counterpart we are looking for
 * @return ?Element The matching element, or null once the top of the
 *   tree is reached without finding one
 */
private function getMatchingMisnestedNode( Node $node, Element $match ): ?Element {
	if ( DOMUtils::atTheTop( $node ) ) {
		return null;
	}

	$sibling = DiffDOMUtils::nextNonSepSibling( $node );

	// With a following sibling, the match (if any) sits at its left edge.
	// Otherwise, repeat the search one level up.
	return $sibling
		? $this->leftMostMisnestedDescendent( $sibling, $match )
		: $this->getMatchingMisnestedNode( $node->parentNode, $match );
}
166 | |
/**
 * Given a tplInfo object, determine whether we are:
 * - Not processing template content (could be extension or top level page).
 *   If so, return null.
 * - Processing encapsulated content that is produced by a single template.
 *   If so, return [ 'name' => <template name> ].
 * - Processing encapsulated content that comes from multiple templates.
 *   If so, return [ 'multiPartTemplateBlock' => true ].
 *
 * FIXME: We might potentially be computing this information redundantly
 * for every lint we find within this template's content. It could probably
 * be cached in tplInfo after it is computed once.
 *
 * @param Env $env Unused in this method
 * @param ?stdClass $tplInfo Encapsulation info; its `first` node is inspected
 * @return ?array
 * @throws UnreachableException If the single part is not a TemplateInfo
 */
public static function findEnclosingTemplateName( Env $env, ?stdClass $tplInfo ): ?array {
	// Only transclusion wrappers count as template content
	if ( !$tplInfo || !DOMUtils::hasTypeOf( $tplInfo->first, 'mw:Transclusion' ) ) {
		return null;
	}

	$dataMw = DOMDataUtils::getDataMw( $tplInfo->first );

	// This count check is conservative in that link suffixes and prefixes
	// could artificially add an extra element to the parts array but we
	// don't have a good way of distinguishing that right now. It will require
	// a non-string representation for them and a change in spec along with
	// a version bump and all that song and dance. If linting accuracy in these
	// scenarios become a problem, we can revisit this.
	if ( empty( $dataMw->parts ) || count( $dataMw->parts ) !== 1 ) {
		return [ 'multiPartTemplateBlock' => true ];
	}

	$part = $dataMw->parts[0];
	if ( !( $part instanceof TemplateInfo ) ) {
		throw new UnreachableException(
			"a single part will always be a TemplateInfo not a string"
		);
	}

	// An href is present for templates (could be "function"); without one,
	// fall back to the target wikitext (type === 'templatearg' or 'template').
	// PORT-FIXME: Should that be SiteConfig::relativeLinkPrefix() rather than './'?
	$name = empty( $part->href )
		? trim( $part->targetWt )
		: PHPUtils::stripPrefix( $part->href, './' );

	return [ 'name' => $name ];
}
217 | |
/**
 * Compute the DSR information for the lint object.
 * - In the common case, this is simply the DSR value of the node
 *   that generated the lint. But, occasionally, for some lints,
 *   we might have to post-process the node's DSR.
 * - If the lint is found in template content, then the DSR spans
 *   the transclusion markup in the toplevel page source.
 *
 * @param ?array $tplLintInfo Result of findEnclosingTemplateName()
 * @param ?stdClass $tplInfo Encapsulation info; `first` is the wrapper node
 * @param ?DomSourceRange $nodeDSR DSR of the node that generated the lint
 * @param ?callable $updateNodeDSR Optional post-processor applied to
 *   $nodeDSR when the node's own DSR is used
 * @return ?DomSourceRange
 */
public static function findLintDSR(
	?array $tplLintInfo, ?stdClass $tplInfo, ?DomSourceRange $nodeDSR,
	?callable $updateNodeDSR = null
): ?DomSourceRange {
	// Use the wrapper's DSR for template content, and also for encapsulated
	// content whose own DSR is not valid.
	$useWrapperDSR = $tplLintInfo !== null ||
		( $tplInfo && !Utils::isValidDSR( $nodeDSR ) );

	if ( !$useWrapperDSR ) {
		if ( $updateNodeDSR ) {
			return $updateNodeDSR( $nodeDSR );
		}
		return $nodeDSR;
	}

	return DOMDataUtils::getDataParsoid( $tplInfo->first )->dsr ?? null;
}
236 | |
/**
 * Determine if a node has an identical nested tag.
 *
 * Follows the chain of "first element child" downwards from $node and
 * reports whether any element on that chain is named $name and does not
 * have an auto-inserted end tag.
 *
 * NOTE(review): only the first element child at each level is inspected;
 * later element siblings are never visited. This looks like it could miss
 * nested tags that are not on the left edge -- confirm whether that is
 * intentional before changing it.
 *
 * @param Element $node Element whose descendants are inspected
 * @param string $name Node name to look for
 * @return bool
 */
private function hasIdenticalNestedTag( Element $node, string $name ): bool {
	$parent = $node;
	while ( true ) {
		// Skip over non-element children to the first element child
		$child = $parent->firstChild;
		while ( $child && !( $child instanceof Element ) ) {
			$child = $child->nextSibling;
		}

		if ( !$child ) {
			return false;
		}

		if ( DOMCompat::nodeName( $child ) === $name &&
			empty( DOMDataUtils::getDataParsoid( $child )->autoInsertedEnd )
		) {
			return true;
		}

		// Descend into that element and repeat
		$parent = $child;
	}
}
259 | |
/**
 * Determine if a node has misnestable content, i.e. whether there is
 * content following $node (or following one of its ancestors) that an
 * unclosed $name tag could end up wrapping.
 *
 * @param Node $node Node with the unclosed tag
 * @param string $name Node name of the unclosed tag
 * @return bool
 */
private function hasMisnestableContent( Node $node, string $name ): bool {
	// For A, TD, TH, H* tags, Tidy doesn't seem to propagate
	// the unclosed tag outside these tags.
	// No need to check for tr/table since content cannot show up there
	if ( DOMUtils::atTheTop( $node ) ||
		preg_match( '/^(?:a|td|th|h\d)$/D', DOMCompat::nodeName( $node ) )
	) {
		return false;
	}

	$sibling = DiffDOMUtils::nextNonSepSibling( $node );
	if ( !$sibling ) {
		// Nothing follows at this level: look after the parent instead
		return $this->hasMisnestableContent( $node->parentNode, $name );
	}

	// For a <p> that is not a literal HTML node, the content of interest
	// is its first non-separator child rather than the <p> itself.
	$isNonLiteralPara = DOMCompat::nodeName( $sibling ) === 'p' &&
		!WTUtils::isLiteralHTMLNode( $sibling );
	$content = $isNonLiteralPara ? DiffDOMUtils::firstNonSepChild( $sibling ) : $sibling;
	if ( !$content ) {
		return false;
	}

	// If the first "content" node we find is a matching
	// stripped tag, we have nothing that can get misnested
	$isMatchingStrippedTag = $content instanceof Element &&
		DOMUtils::isMarkerMeta( $content, 'mw:Placeholder/StrippedTag' ) &&
		( DOMDataUtils::getDataParsoid( $content )->name ?? null ) === $name;

	return !$isMatchingStrippedTag;
}
292 | |
/**
 * Indicate whether an end tag is optional for this node
 *
 * See https://www.w3.org/TR/html5/syntax.html#optional-tags
 *
 * End tags for tr/td/th/li are entirely optional since they
 * require a parent container and can only be followed by like
 * kind.
 *
 * Caveat: <li>foo</li><ol>..</ol> and <li>foo<ol>..</ol>
 * generate different DOM trees, so explicit </li> tag
 * is required to specify which of the two was intended.
 *
 * With that one caveat around nesting, the parse with/without
 * the end tag is identical. For now, ignoring that caveat
 * since they aren't likely to show up in our corpus much.
 *
 * For the other tags in that w3c spec section, I haven't reasoned
 * through when exactly they are optional. Not handling that complexity
 * for now since those are likely uncommon use cases in our corpus.
 *
 * @param Node $node
 * @return bool True iff the node is a tr, td, th or li
 */
private function endTagOptional( Node $node ): bool {
	$optionalEndTags = [ 'tr' => true, 'td' => true, 'th' => true, 'li' => true ];
	return isset( $optionalEndTags[DOMCompat::nodeName( $node )] );
}
318 | |
/**
 * Find the nearest heading, starting at $node itself and moving up
 * through its ancestors.
 *
 * @param Node $node
 * @return ?Node The heading node, or null if there is none on the way up
 */
private function getHeadingAncestor( Node $node ): ?Node {
	for ( $cur = $node; $cur !== null; $cur = $cur->parentNode ) {
		if ( DOMUtils::isHeading( $cur ) ) {
			return $cur;
		}
	}
	return null;
}
328 | |
/**
 * For formatting tags, Tidy seems to be doing this "smart" fixup of
 * unclosed tags by looking for matching unclosed pairs of identical tags
 * and if the content ends in non-whitespace text, it treats the second
 * unclosed opening tag as a closing tag. But, a HTML5 parser won't do this.
 * So, detect this pattern and flag for linter fixup.
 *
 * @param Node $c The outer unclosed tag
 * @param DataParsoid $dp Data-parsoid of $c
 * @return bool True iff $c's last child is an element with the same name
 *   and syntax, has an auto-inserted end tag, and is immediately preceded
 *   by a text node that does not end in whitespace
 */
private function matchedOpenTagPairExists( Node $c, DataParsoid $dp ): bool {
	$last = $c->lastChild;
	if ( !( $last instanceof Element ) ) {
		return false;
	}
	if ( DOMCompat::nodeName( $last ) !== DOMCompat::nodeName( $c ) ) {
		return false;
	}

	$lastDp = DOMDataUtils::getDataParsoid( $last );
	if ( empty( $lastDp->autoInsertedEnd ) ) {
		return false;
	}
	if ( ( $lastDp->stx ?? null ) !== ( $dp->stx ?? null ) ) {
		return false;
	}

	$before = $last->previousSibling;
	// PORT-FIXME: Do we care about non-ASCII whitespace here?
	return $before instanceof Text && !preg_match( '/\s$/D', $before->nodeValue );
}
355 | |
/**
 * Lint Treebuilder fixups marked by ProcessTreeBuilderFixups
 *
 * It handles the following scenarios:
 *
 * 1. Unclosed end tags (`missing-end-tag`, `missing-end-tag-in-heading`)
 * 2. Invalid self-closed tags (`self-closed-tag`)
 * 3. Stripped tags (`stripped-tag`)
 *
 * In addition, we have specialized categories for some patterns
 * where we encounter unclosed end tags.
 *
 * 4. misnested-tag
 * 5. html5-misnesting
 * 6. multiple-unclosed-formatting-tags
 * 7. unclosed-quotes-in-heading
 *
 * @param Env $env Lints are recorded on this
 * @param Element $c Element being inspected
 * @param DataParsoid $dp Data-parsoid of $c
 * @param ?stdClass $tplInfo Encapsulation info if $c is in encapsulated content
 */
private function lintTreeBuilderFixup(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// This might have been processed as part of
	// misnested-tag category identification.
	if ( $dp->getTempFlag( TempData::LINTED ) ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	// During DSR computation, stripped meta tags
	// surrender their width to its previous sibling.
	// We record the original DSR in the tmp attribute
	// for that reason.
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->tmp->origDSR ?? $dp->dsr ?? null );

	if ( DOMUtils::isMarkerMeta( $c, 'mw:Placeholder/StrippedTag' ) ) {
		$env->recordLint( 'stripped-tag', [
			'dsr' => $dsr,
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $dp->name ?? null ],
		] );
	}

	// Don't bother linting for auto-inserted start/end or self-closing-tag if:
	// 1. c is a void element
	//    Void elements won't have auto-inserted start/end tags
	//    and self-closing versions are valid for them.
	//
	// 2. c is tbody (FIXME: don't remember why we have this exception)
	//
	// 3. c is not an HTML element (unless they are i/b quotes or tables)
	//
	// 4. c doesn't have DSR info and doesn't come from a template either
	$cNodeName = DOMCompat::nodeName( $c );
	$isHtmlElement = WTUtils::hasLiteralHTMLMarker( $dp );
	if ( Utils::isVoidElement( $cNodeName ) ||
		$cNodeName === 'tbody' ||
		!( $isHtmlElement || DOMUtils::isQuoteElt( $c ) || $cNodeName === 'table' ) ||
		( $tplInfo === null && $dsr === null )
	) {
		return;
	}

	// Every lint recorded below starts from this same shape
	$lintObj = [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [ 'name' => $cNodeName ],
	];

	if ( !empty( $dp->selfClose ) && $cNodeName !== 'meta' ) {
		$env->recordLint( 'self-closed-tag', $lintObj );
		// The other checks won't pass - no need to test them.
		return;
	}

	// Everything below is about auto-inserted end tags. Outside of
	// encapsulated content, the open tag must have a non-zero source width.
	if ( ( $dp->autoInsertedEnd ?? null ) !== true ||
		!( $tplInfo || ( $dsr->openWidth ?? 0 ) > 0 )
	) {
		return;
	}

	// FIXME: This literal html marker check is strictly not required
	// (a) we've already checked that above and know that isQuoteElt is
	//     not one of our tags.
	// (b) none of the tags in the list have native wikitext syntax =>
	//     they will show up as literal html tags.
	// But, in the interest of long-term maintenance in the face of
	// changes (to wikitext or html specs), let us make it explicit.
	if ( $isHtmlElement &&
		isset( $this->getTagsWithChangedMisnestingBehavior()[$cNodeName] ) &&
		$this->hasMisnestableContent( $c, $cNodeName ) &&
		// Tidy WTF moment here!
		// I don't know why Tidy does something very different
		// when there is an identical nested tag here.
		//
		// <p><span id='1'>a<span>X</span></p><p>b</span></p>
		// vs.
		// <p><span id='1'>a</p><p>b</span></p>  OR
		// <p><span id='1'>a<del>X</del></p><p>b</span></p>
		//
		// For the first snippet, Tidy only wraps "a" with the id='1' span
		// For the second and third snippets, Tidy wraps "b" with the id='1' span as well.
		//
		// For the corresponding wikitext that generates the above token stream,
		// Parsoid (and Remex) won't wrap 'b' with the id=1' span at all.
		!$this->hasIdenticalNestedTag( $c, $cNodeName )
	) {
		$env->recordLint( 'html5-misnesting', $lintObj );
		return;
	}

	if ( !$isHtmlElement && DOMUtils::isQuoteElt( $c ) ) {
		$ancestor = $this->getHeadingAncestor( $c->parentNode );
		if ( $ancestor ) {
			$lintObj['params']['ancestorName'] = DOMCompat::nodeName( $ancestor );
			$env->recordLint( 'unclosed-quotes-in-heading', $lintObj );
			return;
		}
	}

	$adjNode = $this->getMatchingMisnestedNode( $c, $c );
	if ( $adjNode ) {
		// Mark the counterpart so that it isn't linted a second time
		DOMDataUtils::getDataParsoid( $adjNode )->setTempFlag( TempData::LINTED );
		$env->recordLint( 'misnested-tag', $lintObj );
		return;
	}

	if ( $this->endTagOptional( $c ) || !empty( $dp->autoInsertedStart ) ) {
		return;
	}

	$lintObj['params']['inTable'] = DOMUtils::hasNameOrHasAncestorOfName( $c, 'table' );
	$category = $this->getHeadingAncestor( $c ) ?
		'missing-end-tag-in-heading' : 'missing-end-tag';

	// Skip if covered by deletable-table-tag: that lint is recorded on a
	// table whose previous sibling is an auto-closed table, so $c is only
	// covered when the node following it is itself a table.
	// (This used to test $c's own name again, which is always 'table' at
	// this point, and so skipped the lint for any following sibling.)
	$next = DiffDOMUtils::nextNonSepSibling( $c );
	$coveredByDeletableTableTag = $cNodeName === 'table' && $next &&
		DOMCompat::nodeName( $next ) === 'table';
	if ( !$coveredByDeletableTableTag ) {
		$env->recordLint( $category, $lintObj );
	}

	if ( isset( Consts::$HTML['FormattingTags'][$cNodeName] ) &&
		$this->matchedOpenTagPairExists( $c, $dp )
	) {
		$env->recordLint( 'multiple-unclosed-formatting-tags', $lintObj );
	}
}
499 | |
/**
 * Lint fostered content marked by MarkFosteredContent.
 *
 * Lint category: `fostered`, `fostered-transparent`
 *
 * This will log cases like:
 *
 * {|
 * foo
 * |-
 * | bar
 * |}
 *
 * Here 'foo' gets fostered out.
 *
 * @param Env $env Lints are recorded on this
 * @param Element $node Element being inspected; only tables are considered
 * @param DataParsoid $dp Data-parsoid of $node
 * @param ?stdClass $tplInfo Encapsulation info if $node is in encapsulated content
 */
private function lintFostered(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// Emit "fostered" or "fostered-transparent" depending on if the fostered
	// content is entirely transparent or not.
	//
	// We're trying to find a balance between creating noise for wikignomes
	// and avoiding dirty-diffs from DiscussionTools. DiscussionTools
	// expects to know when pages have fostered content otherwise it can
	// lead to corruption on edit. However, rendering transparent nodes
	// often end up in fosterable positions, like category links from
	// templates or include directives on template pages.
	$foundNonTransparent = false;
	$foundTransparentOutsideTemplate = false;

	// The top-level nodes in the foster box are span/p wrapped
	// and so, if we have fostered content, previous siblings to
	// the table are expected to be elements.
	$prev = $node->previousSibling;
	while (
		$prev instanceof Element &&
		!empty( DOMDataUtils::getDataParsoid( $prev )->fostered )
	) {
		// TODO: Section tags are rendering transparent but not sol transparent,
		// and that method only considers WTUtils::isSolTransparentLink, though
		// there is a FIXME to consider all link nodes.
		$isTransparent = WTUtils::isRenderingTransparentNode( $prev ) ||
			( DOMCompat::nodeName( $prev ) === 'link' &&
				DOMUtils::hasTypeOf( $prev, 'mw:Extension/section' ) );
		if ( !$isTransparent ) {
			$foundNonTransparent = true;
			break;
		}

		// Skip rendering-transparent nodes if they come from a template,
		// since they'll roundtrip cleanly regardless
		if ( !$tplInfo ) {
			$foundTransparentOutsideTemplate = true;
		}

		$prev = $prev->previousSibling;
	}

	if ( $foundNonTransparent ) {
		$type = 'fostered';
	} elseif ( $foundTransparentOutsideTemplate ) {
		$type = 'fostered-transparent';
	} else {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( $type, [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
	] );
}
575 | |
/**
 * Lint obsolete HTML tags.
 *
 * Lint category: `obsolete-tag`, `tidy-font-bug`
 *
 * @param Env $env Lints are recorded on this
 * @param Element $c Element being inspected
 * @param DataParsoid $dp Data-parsoid of $c
 * @param ?stdClass $tplInfo Encapsulation info if $c is in encapsulated content
 */
private function lintObsoleteTag(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Build the tag-matching regexp once and keep it on the instance
	if ( !$this->obsoleteTagsRE ) {
		$alternatives = [];
		foreach ( Consts::$HTML['OlderHTMLTags'] as $tag => $unused ) {
			// Looks like all existing editors let editors add the <big> tag.
			// VE has a button to add <big>, it seems so does the WikiEditor
			// and JS wikitext editor. So, don't flag BIG as an obsolete tag.
			if ( $tag === 'big' ) {
				continue;
			}
			$alternatives[] = preg_quote( $tag, '/' );
		}
		$this->obsoleteTagsRE = '/^(?:' . implode( '|', $alternatives ) . ')$/D';
	}

	$tagName = DOMCompat::nodeName( $c );

	// An element whose start and end tags were both auto-inserted is not flagged
	$fullyAutoInserted = !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd );
	if ( !$fullyAutoInserted && preg_match( $this->obsoleteTagsRE, $tagName ) ) {
		$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
		$env->recordLint( 'obsolete-tag', [
			'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $tagName ],
		] );
	}

	if ( $tagName !== 'font' || !$c->hasAttribute( 'color' ) ) {
		return;
	}

	/* ----------------------------------------------------------
	 * Tidy migrates <font> into the link in these cases
	 *     <font>[[Foo]]</font>
	 *     <font>[[Foo]]l</font> (link-trail)
	 *     <font><!--boo-->[[Foo]]</font>
	 *     <font>__NOTOC__[[Foo]]</font>
	 *     <font>[[Category:Foo]][[Foo]]</font>
	 *     <font>{{1x|[[Foo]]}}</font>
	 *
	 * Tidy does not migrate <font> into the link in these cases
	 *     <font> [[Foo]]</font>
	 *     <font>[[Foo]] </font>
	 *     <font>[[Foo]]L</font> (not a link-trail)
	 *     <font>[[Foo]][[Bar]]</font>
	 *
	 * <font> is special.
	 * This behavior is not seen with other formatting tags.
	 *
	 * Remex/parsoid won't do any of this.
	 * This difference in behavior only matters when the font tag
	 * specifies a link colour because the link no longer renders
	 * as blue/red but in the font-specified colour.
	 * ---------------------------------------------------------- */
	$child = $c->firstChild;
	// An empty <font> can never trigger the bug
	$tidyFontBug = $child !== null;
	$linkSeen = false;
	while ( $child ) {
		$childName = DOMCompat::nodeName( $child );

		// Anything other than a link, a rendering-transparent node or a
		// template marker meta rules the pattern out.
		if ( $childName !== 'a' &&
			!WTUtils::isRenderingTransparentNode( $child ) &&
			!WTUtils::isTplMarkerMeta( $child )
		) {
			$tidyFontBug = false;
			break;
		}

		if ( $childName === 'a' || $childName === 'figure' ) {
			if ( $linkSeen ) {
				// A second link rules the pattern out as well
				$tidyFontBug = false;
				break;
			}
			$linkSeen = true;
		}

		$child = $child->nextSibling;
	}

	if ( !$tidyFontBug ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'tidy-font-bug', [
		'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [ 'name' => 'font' ]
	] );
}
667 | |
/**
 * Log bogus (=unrecognized) media options.
 *
 * See - https://www.mediawiki.org/wiki/Help:Images#Syntax
 *
 * Lint category: `bogus-image-options`
 *
 * @param Env $env Lints are recorded on this
 * @param Node $c Node being inspected
 * @param DataParsoid $dp Data-parsoid of $c; its optList is scanned
 * @param ?stdClass $tplInfo Encapsulation info if $c is in encapsulated content
 */
private function lintBogusImageOptions(
	Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Despite the lint category name, this checks all media, not just images
	if ( !WTUtils::isGeneratedFigure( $c ) || empty( $dp->optList ) ) {
		return;
	}

	// When the BOGUS_PX flag is set, width options are reported as well
	$bogusPx = $dp->getTempFlag( TempData::BOGUS_PX );
	$bogusOptions = [];
	foreach ( $dp->optList as $option ) {
		$isBogus = $option['ck'] === 'bogus' ||
			( $bogusPx && $option['ck'] === 'width' );
		if ( $isBogus ) {
			$bogusOptions[] = $option['ak'];
		}
	}

	if ( !$bogusOptions ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'bogus-image-options', [
		'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [ 'items' => $bogusOptions ]
	] );
}
700 | |
701 | /** |
702 | * Lint tables Tidy deletes. |
703 | * |
704 | * Lint category: `deletable-table-tag` |
705 | * |
706 | * In this example below, the second table is in a fosterable position |
707 | * (inside a <tr>). The tree builder closes the first table at that point |
708 | * and starts a new table there. We are detecting this pattern because |
709 | * Tidy does something very different here. It strips the inner table |
710 | * and retains the outer table. So, for preserving rendering of pages |
711 | * that are tailored for Tidy, editors have to fix up this wikitext |
712 | * to strip the inner table (to mimic what Tidy does). |
713 | * |
714 | * {| style='border:1px solid red;' |
715 | * |a |
716 | * |- |
717 | * {| style='border:1px solid blue;' |
718 | * |b |
719 | * |c |
720 | * |} |
721 | * |} |
722 | */ |
private function lintDeletableTableTag(
	Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Only <table> nodes are candidates
	if ( DOMCompat::nodeName( $c ) !== 'table' ) {
		return;
	}

	// The pattern of interest: the previous non-separator sibling is
	// itself a <table> element whose end tag was auto-inserted.
	$prevSibling = DiffDOMUtils::previousNonSepSibling( $c );
	if ( !( $prevSibling instanceof Element ) ) {
		return;
	}
	if ( DOMCompat::nodeName( $prevSibling ) !== 'table' ) {
		return;
	}
	if ( empty( DOMDataUtils::getDataParsoid( $prevSibling )->autoInsertedEnd ) ) {
		return;
	}

	// Identify the dsr-span of the opening tag
	// of the table that needs to be deleted
	$openTagSpan = static function ( ?DomSourceRange $nodeDSR ): ?DomSourceRange {
		if ( $nodeDSR === null ) {
			return null;
		}
		// Work on a copy so that the node's own DSR stays untouched
		$span = clone $nodeDSR;
		if ( !empty( $span->openWidth ) ) {
			$span->end = $span->innerStart();
			$span->openWidth = 0;
			$span->closeWidth = 0;
		}
		return $span;
	};

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR(
		$tplLintInfo, $tplInfo, $dp->dsr ?? null, $openTagSpan
	);
	$env->recordLint( 'deletable-table-tag', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [ 'name' => 'table' ],
	] );
}
757 | |
758 | /** |
759 | * Find the first child passing the filter. |
760 | */ |
private function findMatchingChild( Node $node, callable $filter ): ?Node {
	// Scan the direct children in document order and stop at the
	// first one that the filter accepts.
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( $filter( $child ) ) {
			return $child;
		}
	}

	// No children, or none of them passed the filter
	return null;
}
769 | |
770 | /** |
771 | * Test if the node has a 'nowrap' CSS rule |
772 | * |
773 | * In the general case, this CSS can come from a class, |
774 | * or from a <style> tag or a stylesheet or even from JS code. |
775 | * But, for now, we are restricting this inspection to inline CSS |
776 | * since the intent is to aid editors in fixing patterns that |
777 | * can be automatically detected. |
778 | * |
779 | * Special case for enwiki that has Template:nowrap which |
780 | * assigns class='nowrap' with CSS white-space:nowrap in |
781 | * MediaWiki:Common.css |
782 | */ |
private function hasNoWrapCSS( Node $node ): bool {
	// Only elements are inspected; any other node never matches
	if ( !( $node instanceof Element ) ) {
		return false;
	}

	// Either the inline style mentions 'nowrap' (a missing style
	// attribute counts as an empty string), or the element has the
	// 'nowrap' class.
	$inlineStyle = DOMCompat::getAttribute( $node, 'style' ) ?? '';
	return str_contains( $inlineStyle, 'nowrap' ) ||
		DOMUtils::hasClass( $node, 'nowrap' );
}
789 | |
790 | /** |
791 | * Lint bad P wrapping. |
792 | * |
793 | * Lint category: `pwrap-bug-workaround` |
794 | */ |
private function lintPWrapBugWorkaround(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Candidate shape: a non-block node that sits directly inside a
	// wikitext block node and has 'nowrap' CSS on it.
	if ( DOMUtils::isWikitextBlockNode( $node ) ) {
		return;
	}
	$parent = $node->parentNode;
	if ( !DOMUtils::isWikitextBlockNode( $parent ) ) {
		return;
	}
	if ( !$this->hasNoWrapCSS( $node ) ) {
		return;
	}

	// The lint is only recorded when the node has a <p> child
	$isParagraph = static function ( $child ) {
		return DOMCompat::nodeName( $child ) === 'p';
	};
	if ( !$this->findMatchingChild( $node, $isParagraph ) ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'pwrap-bug-workaround', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [
			'root' => DOMCompat::nodeName( $parent ),
			'child' => DOMCompat::nodeName( $node ),
		]
	] );
}
820 | |
821 | /** |
822 | * Lint Tidy div span flip. |
823 | * |
824 | * Lint category: `misc-tidy-replacement-issues` |
825 | */ |
private function lintMiscTidyReplacementIssues(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Only a <span> can take part in the div-span flip
	if ( DOMCompat::nodeName( $node ) !== 'span' ) {
		return;
	}

	// ... and its first non-separator child has to be a <div> element
	$leadingChild = DiffDOMUtils::firstNonSepChild( $node );
	$leadsWithDiv = $leadingChild instanceof Element &&
		DOMCompat::nodeName( $leadingChild ) === 'div';
	if ( !$leadsWithDiv ) {
		return;
	}

	// With no class/style attribute on either the span or the div,
	// this won't affect rendering, so there is nothing to report.
	$isStyled = $node->hasAttribute( 'class' ) ||
		$node->hasAttribute( 'style' ) ||
		$leadingChild->hasAttribute( 'class' ) ||
		$leadingChild->hasAttribute( 'style' );
	if ( !$isStyled ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'misc-tidy-replacement-issues', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [ 'subtype' => 'div-span-flip' ]
	] );
}
853 | |
854 | /** |
855 | * Lint tidy whitespace bug. |
856 | * |
857 | * Lint category: `tidy-whitespace-bug` |
858 | */ |
859 | private function lintTidyWhitespaceBug( |
860 | Env $env, Node $node, DataParsoid $dp, ?stdClass $tplInfo |
861 | ): void { |
862 | // We handle a run of nodes in one shot. |
863 | // No need to reprocess repeatedly. |
864 | if ( $dp->getTempFlag( TempData::PROCESSED_TIDY_WS_BUG ) ) { |
865 | return; |
866 | } |
867 | |
868 | // Find the longest run of nodes that are affected by white-space:nowrap CSS |
869 | // in a way that leads to unsightly rendering in HTML5 compliant browsers. |
870 | // |
871 | // Check if Tidy does buggy whitespace hoisting there to provide the browser |
872 | // opportunities to split the content in short segments. |
873 | & |