Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
98.26% |
566 / 576 |
|
87.88% |
29 / 33 |
CRAP | |
0.00% |
0 / 1 |
Linter | |
98.26% |
566 / 576 |
|
87.88% |
29 / 33 |
248 | |
0.00% |
0 / 1 |
getTagsWithChangedMisnestingBehavior | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
7 | |||
leftMostMisnestedDescendent | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
8 | |||
getMatchingMisnestedNode | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
findEnclosingTemplateName | |
77.78% |
14 / 18 |
|
0.00% |
0 / 1 |
7.54 | |||
findLintDSR | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
5 | |||
hasIdenticalNestedTag | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
5 | |||
hasMisnestableContent | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
10 | |||
endTagOptional | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getHeadingAncestor | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
matchedOpenTagPairExists | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
7.05 | |||
lintTreeBuilderFixup | |
100.00% |
59 / 59 |
|
100.00% |
1 / 1 |
31 | |||
lintFostered | |
100.00% |
25 / 25 |
|
100.00% |
1 / 1 |
9 | |||
lintObsoleteTag | |
100.00% |
38 / 38 |
|
100.00% |
1 / 1 |
17 | |||
lintBogusImageOptions | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
8 | |||
lintDeletableTableTag | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
7 | |||
findMatchingChild | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
hasNoWrapCSS | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
lintPWrapBugWorkaround | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
5 | |||
lintMiscTidyReplacementIssues | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
8 | |||
lintTidyWhitespaceBug | |
98.78% |
81 / 82 |
|
0.00% |
0 / 1 |
29 | |||
lintMultipleUnclosedFormattingTags | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
9 | |||
postProcessLints | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getWikitextListItemAncestor | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
6 | |||
lintMultilineHtmlTableInList | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
5 | |||
lintWikilinksInExtlink | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
11 | |||
recordLargeTablesLint | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
skipNonElementNodes | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
lintLargeTables | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
8 | |||
lintNightModeUnawareBackgroundColor | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
lintMissingAltText | |
100.00% |
25 / 25 |
|
100.00% |
1 / 1 |
9 | |||
logWikitextFixups | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
1 | |||
findLints | |
88.24% |
30 / 34 |
|
0.00% |
0 / 1 |
11.20 | |||
run | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | declare( strict_types = 1 ); |
4 | |
5 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; |
6 | |
7 | use stdClass; |
8 | use Wikimedia\Assert\UnreachableException; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\DOM\Comment; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Node; |
14 | use Wikimedia\Parsoid\DOM\Text; |
15 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
16 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
17 | use Wikimedia\Parsoid\NodeData\TempData; |
18 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
19 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
20 | use Wikimedia\Parsoid\Utils\DOMCompat; |
21 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
22 | use Wikimedia\Parsoid\Utils\DOMUtils; |
23 | use Wikimedia\Parsoid\Utils\PHPUtils; |
24 | use Wikimedia\Parsoid\Utils\Timing; |
25 | use Wikimedia\Parsoid\Utils\Utils; |
26 | use Wikimedia\Parsoid\Utils\WTUtils; |
27 | use Wikimedia\Parsoid\Wikitext\Consts; |
28 | use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; |
29 | |
30 | /** |
31 | * DOM pass that walks the DOM tree, detects specific wikitext patterns, |
32 | * and emits them as linter events. |
33 | */ |
34 | class Linter implements Wt2HtmlDOMProcessor { |
35 | /** @var ParsoidExtensionAPI */ |
36 | private $extApi = null; |
37 | |
38 | /** @var array<string,bool>|null */ |
39 | private $tagsWithChangedMisnestingBehavior = null; |
40 | |
41 | /** @var string|null */ |
42 | private $obsoleteTagsRE = null; |
43 | |
/**
 * Lazily compute (and memoize on this instance) the set of HTML5 tags whose
 * misnesting behavior around wikitext paragraphs differs between Tidy
 * (HTML4-era) and an HTML5 tree builder.
 *
 * Ex: Input:        <p><small>a</p><p>b</small></p>
 *     Tidy output:  <p><small>a</small></p><p><small>b</small></p>
 *     HTML5 output: <p><small>a</small></p><p><small>b</small></p>
 *
 * No difference there. But with <span> instead of <small>:
 *
 * Ex: Input:        <p><span>a</p><p>b</span></p>
 *     Tidy output:  <p><span>a</span></p><p><span>b</span></p>
 *     HTML5 output: <p><span>a</span></p><p>b</p>
 *
 * The source wikitext is "<span>a\n\nb</span>". The difference persists even
 * when you have "<span>a\n\n<div>b</div>" or "<span>a\n\n{|\n|x\n|}\nbar".
 *
 * Tidy seems to apply the equivalent of the HTML5 tree builder's "reconstruct
 * the active formatting elements" step to all *inline* elements, whereas
 * HTML5 parsers only do that for formatting elements. The affected tags are
 * therefore all HTML5 tags minus:
 * - tags our sanitizer doesn't allow (they get escaped => irrelevant)
 * - HTML4 block tags
 * - void tags (they cannot wrap anything)
 * - active formatting elements (the HTML5 algorithm reconstructs them so
 *   that they wrap all originally intended content, ex: <small> above)
 *
 * That leaves these 22 HTML5 tags:
 *    ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, KBD, MARK,
 *    Q, RB, RP, RT, RTC, RUBY, SAMP, SPAN, SUB, SUP, TIME, VAR
 *
 * https://phabricator.wikimedia.org/T176363#3628173 verifies that all the
 * tags in this list demonstrate this behavior.
 *
 * @return array
 * @phan-return array<string,bool>
 */
private function getTagsWithChangedMisnestingBehavior(): array {
	// Already computed on an earlier call
	if ( $this->tagsWithChangedMisnestingBehavior !== null ) {
		return $this->tagsWithChangedMisnestingBehavior;
	}

	// This set is frozen in time. It gets us down to the requisite
	// 22 HTML5 tags above, but shouldn't be used for anything other
	// than that.
	$HTML4TidyBlockTags = PHPUtils::makeSet( [
		'div', 'p',
		# tables
		'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td',
		# lists
		'ul', 'ol', 'li', 'dl', 'dt', 'dd',
		# HTML5 heading content
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
		# HTML5 sectioning content
		'article', 'aside', 'nav', 'section', 'footer', 'header',
		'figure', 'figcaption', 'fieldset', 'details', 'blockquote',
		# other
		'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed',
		'map', 'object', 'pre', 'progress', 'video',
	] );

	$affectedTags = [];
	foreach ( Consts::$HTML['HTML5Tags'] as $tag => $unused ) {
		// Tags the sanitizer rejects never reach the tree builder
		if ( !isset( Consts::$Sanitizer['AllowedLiteralTags'][$tag] ) ) {
			continue;
		}
		// Block, formatting and void tags are not subject to the difference
		if ( isset( $HTML4TidyBlockTags[$tag] ) ||
			isset( Consts::$HTML['FormattingTags'][$tag] ) ||
			isset( Consts::$HTML['VoidTags'][$tag] )
		) {
			continue;
		}
		$affectedTags[$tag] = true;
	}

	$this->tagsWithChangedMisnestingBehavior = $affectedTags;
	return $affectedTags;
}
119 | |
/**
 * Walk down the first-child chain starting at $node, looking for a node
 * that pairs up with $match at the "start" of $node.
 *
 * At each element on the chain:
 * - a stripped-tag marker meta ends the search: it is the result if its
 *   recorded name equals $match's node name, otherwise there is no result;
 * - an element with the same name and syntax (stx) as $match whose start
 *   tag was auto-inserted is a hit. If its end tag was auto-inserted as
 *   well, the search continues from that node via
 *   getMatchingMisnestedNode(); otherwise the node itself is returned.
 *
 * The walk stops with null as soon as the chain reaches a non-element.
 */
private function leftMostMisnestedDescendent( ?Node $node, Element $match ): ?Element {
	$wantedName = DOMCompat::nodeName( $match );

	for ( $cur = $node; $cur instanceof Element; $cur = $cur->firstChild ) {
		if ( DOMUtils::isMarkerMeta( $cur, 'mw:Placeholder/StrippedTag' ) ) {
			$strippedName = DOMDataUtils::getDataParsoid( $cur )->name ?? null;
			return $strippedName === $wantedName ? $cur : null;
		}

		if ( DOMCompat::nodeName( $cur ) !== $wantedName ) {
			continue;
		}

		$curDp = DOMDataUtils::getDataParsoid( $cur );
		$sameSyntax =
			( DOMDataUtils::getDataParsoid( $match )->stx ?? null ) === ( $curDp->stx ?? null );
		if ( $sameSyntax && !empty( $curDp->autoInsertedStart ) ) {
			return empty( $curDp->autoInsertedEnd )
				? $cur
				: $this->getMatchingMisnestedNode( $cur, $match );
		}
	}

	return null;
}
148 | |
/**
 * $node has an 'autoInsertedEnd' flag set on it. Find the node carrying the
 * corresponding 'autoInsertedStart' flag, i.e. the other half produced when
 * the tree builder fixed up misnested tags.
 *
 * The two halves are adjacent in the HTML string. In the DOM that means the
 * match is either reachable from $node's next (non-separator) sibling or,
 * failing that, from the next sibling of one of $node's ancestors, in each
 * case as the left-most descendant.
 *
 * Returns null once the search reaches the top of the tree.
 */
private function getMatchingMisnestedNode( Node $node, Element $match ): ?Element {
	if ( DOMUtils::atTheTop( $node ) ) {
		return null;
	}

	$sibling = DiffDOMUtils::nextNonSepSibling( $node );
	if ( !$sibling ) {
		// Nothing follows at this level; retry one level up
		return $this->getMatchingMisnestedNode( $node->parentNode, $match );
	}

	return $this->leftMostMisnestedDescendent( $sibling, $match );
}
168 | |
/**
 * Classify the encapsulation context described by $tplInfo:
 * - null: not processing template content (could be an extension or the
 *   top-level page), or the first encapsulated node is not a transclusion
 * - [ 'name' => ... ]: the content is produced by a single template
 * - [ 'multiPartTemplateBlock' => true ]: the content comes from multiple
 *   template parts
 *
 * FIXME: We might potentially be computing this information redundantly
 * for every lint we find within this template's content. It could probably
 * be cached in tplInfo after it is computed once.
 */
private function findEnclosingTemplateName( Env $env, ?stdClass $tplInfo ): ?array {
	if ( !$tplInfo || !DOMUtils::hasTypeOf( $tplInfo->first, 'mw:Transclusion' ) ) {
		return null;
	}

	$dataMw = DOMDataUtils::getDataMw( $tplInfo->first );

	// This count check is conservative in that link suffixes and prefixes
	// could artificially add an extra element to the parts array but we
	// don't have a good way of distinguishing that right now. It will require
	// a non-string representation for them and a change in spec along with
	// a version bump and all that song and dance. If linting accuracy in these
	// scenarios become a problem, we can revisit this.
	if ( empty( $dataMw->parts ) || count( $dataMw->parts ) !== 1 ) {
		return [ 'multiPartTemplateBlock' => true ];
	}

	$part = $dataMw->parts[0];
	if ( !( $part instanceof TemplateInfo ) ) {
		throw new UnreachableException(
			"a single part will always be a TemplateInfo not a string"
		);
	}

	// An href may be absent (could be "function"); then fall back to the
	// target wikitext (type === 'templatearg' or 'template').
	// PORT-FIXME: Should that be SiteConfig::relativeLinkPrefix() rather than './'?
	$templateName = !empty( $part->href )
		? PHPUtils::stripPrefix( $part->href, './' )
		: trim( $part->targetWt );

	return [ 'name' => $templateName ];
}
219 | |
/**
 * Pick the DSR to report on a lint object.
 *
 * - When the lint comes from template content ($tplLintInfo is set), or
 *   when we are inside encapsulated content and the node's own DSR is not
 *   valid, report the DSR of the first encapsulated node, i.e. the span of
 *   the transclusion markup in the top-level page source.
 * - Otherwise report the node's own DSR, optionally post-processed by
 *   $updateNodeDSR.
 */
private function findLintDSR(
	?array $tplLintInfo, ?stdClass $tplInfo, ?DomSourceRange $nodeDSR,
	?callable $updateNodeDSR = null
): ?DomSourceRange {
	$useEncapsulationDSR = $tplLintInfo !== null ||
		( $tplInfo && !Utils::isValidDSR( $nodeDSR ) );

	if ( $useEncapsulationDSR ) {
		return DOMDataUtils::getDataParsoid( $tplInfo->first )->dsr ?? null;
	}

	if ( $updateNodeDSR ) {
		return $updateNodeDSR( $nodeDSR );
	}

	return $nodeDSR;
}
238 | |
/**
 * Check whether $node contains a nested tag named $name whose end tag was
 * not auto-inserted.
 *
 * Only the first element child is examined at each level: leading
 * non-element children are skipped, and the answer is decided by that first
 * element (either it matches, or the search recurses into it). Later
 * element siblings are never inspected.
 *
 * NOTE(review): it is unclear whether ignoring later siblings is deliberate
 * (mimicking observed Tidy behavior) or an oversight -- confirm before
 * changing.
 */
private function hasIdenticalNestedTag( Element $node, string $name ): bool {
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( !( $child instanceof Element ) ) {
			continue;
		}

		$isIdenticalTag = DOMCompat::nodeName( $child ) === $name &&
			empty( DOMDataUtils::getDataParsoid( $child )->autoInsertedEnd );

		// The first element child settles the result
		return $isIdenticalTag || $this->hasIdenticalNestedTag( $child, $name );
	}

	return false;
}
261 | |
/**
 * Check whether content follows $node (or, when $node has no following
 * non-separator sibling, one of its ancestors) that could get misnested
 * by an unclosed $name tag.
 */
private function hasMisnestableContent( Node $node, string $name ): bool {
	if ( DOMUtils::atTheTop( $node ) ) {
		return false;
	}
	// For A, TD, TH, H* tags, Tidy doesn't seem to propagate
	// the unclosed tag outside these tags.
	// No need to check for tr/table since content cannot show up there
	if ( preg_match( '/^(?:a|td|th|h\d)$/D', DOMCompat::nodeName( $node ) ) ) {
		return false;
	}

	$following = DiffDOMUtils::nextNonSepSibling( $node );
	if ( !$following ) {
		// Nothing after this node: look after its parent instead
		return $this->hasMisnestableContent( $node->parentNode, $name );
	}

	// For a wikitext-generated paragraph, look at its first content child
	$isWikitextPara = DOMCompat::nodeName( $following ) === 'p' &&
		!WTUtils::isLiteralHTMLNode( $following );
	$content = $isWikitextPara ? DiffDOMUtils::firstNonSepChild( $following ) : $following;

	if ( !$content ) {
		return false;
	}

	// If the first "content" node we find is a matching
	// stripped tag, we have nothing that can get misnested
	$isMatchingStrippedTag = $content instanceof Element &&
		DOMUtils::isMarkerMeta( $content, 'mw:Placeholder/StrippedTag' ) &&
		( DOMDataUtils::getDataParsoid( $content )->name ?? null ) === $name;

	return !$isMatchingStrippedTag;
}
294 | |
/**
 * Tell whether the end tag may be omitted for this node. True only for
 * tr, td, th and li.
 *
 * See https://www.w3.org/TR/html5/syntax.html#optional-tags
 *
 * End tags for tr/td/th/li are entirely optional since they require a
 * parent container and can only be followed by like kind.
 *
 * Caveat: <li>foo</li><ol>..</ol> and <li>foo<ol>..</ol> generate
 * different DOM trees, so an explicit </li> tag is required to specify
 * which of the two was intended. Apart from that nesting caveat, the parse
 * with/without the end tag is identical. The caveat is ignored for now
 * since such markup isn't likely to show up in our corpus much.
 *
 * The other tags in that spec section are not handled: when exactly their
 * end tags are optional hasn't been reasoned through, and they are likely
 * uncommon in our corpus.
 */
private function endTagOptional( Node $node ): bool {
	static $optionalEndTags = [
		'tr' => true,
		'td' => true,
		'th' => true,
		'li' => true,
	];
	return isset( $optionalEndTags[DOMCompat::nodeName( $node )] );
}
320 | |
/**
 * Return the nearest heading on the ancestor chain, starting the search at
 * $node itself, or null if there is none.
 */
private function getHeadingAncestor( Node $node ): ?Node {
	for ( $cur = $node; $cur; $cur = $cur->parentNode ) {
		if ( DOMUtils::isHeading( $cur ) ) {
			return $cur;
		}
	}
	return null;
}
330 | |
/**
 * For formatting tags, Tidy seems to do a "smart" fixup of unclosed tags:
 * it looks for matching unclosed pairs of identical tags and, if the
 * content ends in non-whitespace text, treats the second unclosed opening
 * tag as a closing tag. An HTML5 parser won't do this, so this pattern is
 * detected and flagged for linter fixup.
 *
 * Concretely, this is true when $c's last child is an element with the
 * same name and syntax (stx) as $c, that child's end tag was auto-inserted,
 * and the child is immediately preceded by a text node that does not end
 * in whitespace.
 */
private function matchedOpenTagPairExists( Node $c, DataParsoid $dp ): bool {
	$last = $c->lastChild;
	if ( !( $last instanceof Element ) ) {
		return false;
	}
	if ( DOMCompat::nodeName( $last ) !== DOMCompat::nodeName( $c ) ) {
		return false;
	}

	$lastDp = DOMDataUtils::getDataParsoid( $last );
	if ( empty( $lastDp->autoInsertedEnd ) ) {
		return false;
	}
	if ( ( $lastDp->stx ?? null ) !== ( $dp->stx ?? null ) ) {
		return false;
	}

	$before = $last->previousSibling;
	// PORT-FIXME: Do we care about non-ASCII whitespace here?
	return $before instanceof Text && !preg_match( '/\s$/D', $before->nodeValue );
}
357 | |
/**
 * Lint Treebuilder fixups marked by dom.markTreeBuilderFixup.js
 *
 * It handles the following scenarios:
 *
 * 1. Unclosed end tags (`missing-end-tag`, `missing-end-tag-in-heading`)
 * 2. Invalid self-closed tags (`self-closed-tag`)
 * 3. Stripped tags (`stripped-tag`)
 *
 * In addition, we have specialized categories for some patterns
 * where we encounter unclosed end tags.
 *
 * 4. misnested-tag
 * 5. html5-misnesting
 * 6. multiple-unclosed-formatting-tags
 * 7. unclosed-quotes-in-heading
 *
 * @param Env $env Receives the lints via recordLint()
 * @param Element $c The element being inspected
 * @param DataParsoid $dp $c's data-parsoid
 * @param ?stdClass $tplInfo Encapsulation info when $c is inside
 *   encapsulated content, null otherwise
 */
private function lintTreeBuilderFixup(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// This might have been processed as part of
	// misnested-tag category identification.
	if ( $dp->getTempFlag( TempData::LINTED ) ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	// During DSR computation, stripped meta tags
	// surrender their width to its previous sibling.
	// We record the original DSR in the tmp attribute
	// for that reason.
	$dsr = $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->tmp->origDSR ?? $dp->dsr ?? null );
	$lintObj = null;
	if ( DOMUtils::isMarkerMeta( $c, 'mw:Placeholder/StrippedTag' ) ) {
		$lintObj = [
			'dsr' => $dsr,
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $dp->name ?? null ],
		];
		$env->recordLint( 'stripped-tag', $lintObj );
	}

	// Dont bother linting for auto-inserted start/end or self-closing-tag if:
	// 1. c is a void element
	//    Void elements won't have auto-inserted start/end tags
	//    and self-closing versions are valid for them.
	//
	// 2. c is tbody (FIXME: don't remember why we have this exception)
	//
	// 3. c is not an HTML element (unless they are i/b quotes or tables)
	//
	// 4. c doesn't have DSR info and doesn't come from a template either
	$cNodeName = DOMCompat::nodeName( $c );
	$ancestor = null;
	$isHtmlElement = WTUtils::hasLiteralHTMLMarker( $dp );
	if ( !Utils::isVoidElement( $cNodeName ) &&
		$cNodeName !== 'tbody' &&
		( $isHtmlElement || DOMUtils::isQuoteElt( $c ) || $cNodeName === 'table' ) &&
		( $tplInfo !== null || $dsr !== null )
	) {
		if ( !empty( $dp->selfClose ) && $cNodeName !== 'meta' ) {
			$lintObj = [
				'dsr' => $dsr,
				'templateInfo' => $tplLintInfo,
				'params' => [ 'name' => $cNodeName ],
			];
			$env->recordLint( 'self-closed-tag', $lintObj );
			// The other checks won't pass - no need to test them.
			return;
		}

		if (
			( $dp->autoInsertedEnd ?? null ) === true &&
			( $tplInfo || ( $dsr->openWidth ?? 0 ) > 0 )
		) {
			$lintObj = [
				'dsr' => $dsr,
				'templateInfo' => $tplLintInfo,
				'params' => [ 'name' => $cNodeName ],
			];

			// FIXME: This literal html marker check is strictly not required
			// (a) we've already checked that above and know that isQuoteElt is
			//     not one of our tags.
			// (b) none of the tags in the list have native wikitext syntax =>
			//     they will show up as literal html tags.
			// But, in the interest of long-term maintenance in the face of
			// changes (to wikitext or html specs), let us make it explicit.
			if ( $isHtmlElement &&
				isset( $this->getTagsWithChangedMisnestingBehavior()[$cNodeName] ) &&
				$this->hasMisnestableContent( $c, $cNodeName ) &&
				// Tidy WTF moment here!
				// I don't know why Tidy does something very different
				// when there is an identical nested tag here.
				//
				// <p><span id='1'>a<span>X</span></p><p>b</span></p>
				// vs.
				// <p><span id='1'>a</p><p>b</span></p>  OR
				// <p><span id='1'>a<del>X</del></p><p>b</span></p>
				//
				// For the first snippet, Tidy only wraps "a" with the id='1' span
				// For the second and third snippets, Tidy wraps "b" with the id='1' span as well.
				//
				// For the corresponding wikitext that generates the above token stream,
				// Parsoid (and Remex) won't wrap 'b' with the id=1' span at all.
				!$this->hasIdenticalNestedTag( $c, $cNodeName )
			) {
				$env->recordLint( 'html5-misnesting', $lintObj );
			} elseif (
				!$isHtmlElement && DOMUtils::isQuoteElt( $c ) &&
				// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.Found
				( $ancestor = $this->getHeadingAncestor( $c->parentNode ) )
			) {
				$lintObj['params']['ancestorName'] = DOMCompat::nodeName( $ancestor );
				$env->recordLint( 'unclosed-quotes-in-heading', $lintObj );
			} else {
				$adjNode = $this->getMatchingMisnestedNode( $c, $c );
				if ( $adjNode ) {
					// Mark the other half so it isn't linted a second time
					$adjDp = DOMDataUtils::getDataParsoid( $adjNode );
					$adjDp->setTempFlag( TempData::LINTED );
					$env->recordLint( 'misnested-tag', $lintObj );
				} elseif ( !$this->endTagOptional( $c ) && empty( $dp->autoInsertedStart ) ) {
					$lintObj['params']['inTable'] = DOMUtils::hasNameOrHasAncestorOfName( $c, 'table' );
					$category = $this->getHeadingAncestor( $c ) ?
						'missing-end-tag-in-heading' : 'missing-end-tag';
					$next = DiffDOMUtils::nextNonSepSibling( $c );
					// Skip if covered by deletable-table-tag: that lint fires
					// on a table whose previous non-separator sibling is a table
					// with an auto-inserted end tag, so the *next* sibling must
					// be the table here. (Testing $c again was redundant with
					// the $cNodeName check and suppressed this lint for any
					// unclosed table followed by any sibling at all.)
					$coveredByDeletableTableTag = $cNodeName === 'table' && $next &&
						DOMCompat::nodeName( $next ) === 'table';
					if ( !$coveredByDeletableTableTag ) {
						$env->recordLint( $category, $lintObj );
					}
					if ( isset( Consts::$HTML['FormattingTags'][$cNodeName] ) &&
						$this->matchedOpenTagPairExists( $c, $dp )
					) {
						$env->recordLint( 'multiple-unclosed-formatting-tags', $lintObj );
					}
				}
			}
		}
	}
}
501 | |
/**
 * Lint fostered content marked by MarkFosteredContent.
 *
 * Lint category: `fostered`
 *
 * This will log cases like:
 *
 *    {|
 *    foo
 *    |-
 *    | bar
 *    |}
 *
 * Here 'foo' gets fostered out.
 *
 * Only table nodes are inspected: the lint is recorded when the nearest
 * preceding sibling (after skipping rendering-transparent elements and
 * section-extension links) is an element flagged as fostered.
 */
private function lintFostered(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// The top-level nodes in the foster box are span/p wrapped
	// and so, if we have fostered content, previous siblings to
	// the table are expected to be elements.
	$candidate = $node->previousSibling;

	$isTemplatePage = $env->getContextTitle()->getNamespace() === 10; // NS_TEMPLATE

	// Skip rendering-transparent nodes if they come from a template or
	// we're on a template page
	//
	// We're trying to find a balance between creating noise for wikignomes
	// and avoiding dirty-diffs from DiscussionTools.  DiscussionTools
	// expects to know when pages have fostered content otherwise it can
	// lead to corruption on edit.  However, rendering transparent nodes
	// often end up in fosterable positions, like category links from
	// templates or include directives on template pages. Neither of which
	// seem particularly concerning for DT.
	//
	// FIXME(T369317): Not skipping rendering transparent nodes is proving too
	// costly to wikignomes work. We should explore other alternatives like
	// surfacing if fostered content is all rendering transparent in params
	// and then suppressing those lints from Linter UI. Or, introduce a new
	// hidden category, 'fostered-transparent' or some such.
	// $skipRenderingTransparentNodes = ( $tplInfo || $isTemplatePage );
	$skipRenderingTransparentNodes = true;

	// @phan-suppress-next-line PhanInfiniteLoop
	while ( $skipRenderingTransparentNodes && $candidate instanceof Element ) {
		// TODO: Section tags are rendering transparent but not sol transparent,
		// and that method only considers WTUtils::isSolTransparentLink, though
		// there is a FIXME to consider all link nodes.
		if ( !WTUtils::isRenderingTransparentNode( $candidate ) &&
			!( DOMCompat::nodeName( $candidate ) === 'link' &&
				DOMUtils::hasTypeOf( $candidate, 'mw:Extension/section' ) )
		) {
			break;
		}
		$candidate = $candidate->previousSibling;
	}

	if ( !( $candidate instanceof Element ) ) {
		return;
	}
	if ( empty( DOMDataUtils::getDataParsoid( $candidate )->fostered ) ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$lintDsr = $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'fostered', [
		'dsr' => $lintDsr,
		'templateInfo' => $tplLintInfo,
	] );
}
578 | |
/**
 * Lint obsolete HTML tags.
 *
 * Lint category: `obsolete-tag`, `tidy-font-bug`
 *
 * `obsolete-tag` is recorded for any tag in Consts::$HTML['OlderHTMLTags']
 * other than <big>, unless both its start and end tags were auto-inserted.
 * `tidy-font-bug` is recorded for <font color=...> wrapping a single link
 * (see the comment in the body for the exact patterns).
 */
private function lintObsoleteTag(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Build the matcher once per Linter instance
	if ( !$this->obsoleteTagsRE ) {
		$alternatives = [];
		foreach ( Consts::$HTML['OlderHTMLTags'] as $tag => $unused ) {
			// Looks like all existing editors let editors add the <big> tag.
			// VE has a button to add <big>, it seems so does the WikiEditor
			// and JS wikitext editor. So, don't flag BIG as an obsolete tag.
			if ( $tag === 'big' ) {
				continue;
			}
			$alternatives[] = preg_quote( $tag, '/' );
		}
		$this->obsoleteTagsRE = '/^(?:' . implode( '|', $alternatives ) . ')$/D';
	}

	$tagName = DOMCompat::nodeName( $c );

	$hasExplicitTag = empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd );
	if ( $hasExplicitTag && preg_match( $this->obsoleteTagsRE, $tagName ) ) {
		$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
		$env->recordLint( 'obsolete-tag', [
			'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $tagName ],
		] );
	}

	if ( $tagName !== 'font' || !$c->hasAttribute( 'color' ) ) {
		return;
	}

	/* ----------------------------------------------------------
	 * Tidy migrates <font> into the link in these cases
	 *     <font>[[Foo]]</font>
	 *     <font>[[Foo]]l</font> (link-trail)
	 *     <font><!--boo-->[[Foo]]</font>
	 *     <font>__NOTOC__[[Foo]]</font>
	 *     <font>[[Category:Foo]][[Foo]]</font>
	 *     <font>{{1x|[[Foo]]}}</font>
	 *
	 * Tidy does not migrate <font> into the link in these cases
	 *     <font> [[Foo]]</font>
	 *     <font>[[Foo]] </font>
	 *     <font>[[Foo]]L</font> (not a link-trail)
	 *     <font>[[Foo]][[Bar]]</font>
	 *     <font>[[Foo]][[Bar]]</font>
	 *
	 * <font> is special.
	 * This behavior is not seen with other formatting tags.
	 *
	 * Remex/parsoid won't do any of this.
	 * This difference in behavior only matters when the font tag
	 * specifies a link colour because the link no longer renders
	 * as blue/red but in the font-specified colour.
	 * ---------------------------------------------------------- */
	$tidyFontBug = $c->firstChild !== null;
	$linkSeen = false;
	$child = $c->firstChild;
	while ( $child ) {
		$childName = DOMCompat::nodeName( $child );

		// Anything other than a link, a rendering-transparent node or a
		// template marker meta rules the pattern out.
		if ( $childName !== 'a' &&
			!WTUtils::isRenderingTransparentNode( $child ) &&
			!WTUtils::isTplMarkerMeta( $child )
		) {
			$tidyFontBug = false;
			break;
		}

		if ( $childName === 'a' || $childName === 'figure' ) {
			if ( $linkSeen ) {
				// A second link rules the pattern out
				$tidyFontBug = false;
				break;
			}
			$linkSeen = true;
		}

		$child = $child->nextSibling;
	}

	if ( !$tidyFontBug ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'tidy-font-bug', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [ 'name' => 'font' ]
	] );
}
670 | |
/**
 * Log bogus (=unrecognized) media options.
 *
 * See - https://www.mediawiki.org/wiki/Help:Images#Syntax
 *
 * Lint category: `bogus-image-options`
 *
 * An option is reported when its canonical key ('ck') is 'bogus', or when
 * it is a 'width' option and the BOGUS_PX temp flag is set on the node.
 * The reported items are the options' 'ak' values.
 */
private function lintBogusImageOptions(
	Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Despite the lint category name, this checks all media, not just images
	if ( !WTUtils::isGeneratedFigure( $c ) || empty( $dp->optList ) ) {
		return;
	}

	$hasBogusPx = $dp->getTempFlag( TempData::BOGUS_PX );
	$bogusItems = [];
	foreach ( $dp->optList as $opt ) {
		$isBogus = $opt['ck'] === 'bogus' ||
			( $hasBogusPx && $opt['ck'] === 'width' );
		if ( $isBogus ) {
			$bogusItems[] = $opt['ak'];
		}
	}

	if ( !$bogusItems ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'bogus-image-options', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [ 'items' => $bogusItems ]
	] );
}
703 | |
/**
 * Lint tables Tidy deletes.
 *
 * Lint category: `deletable-table-tag`
 *
 * In the example below, the second table is in a fosterable position
 * (inside a <tr>). The tree builder closes the first table at that point
 * and starts a new table there. We are detecting this pattern because
 * Tidy does something very different here. It strips the inner table
 * and retains the outer table. So, for preserving rendering of pages
 * that are tailored for Tidy, editors have to fix up this wikitext
 * to strip the inner table (to mimic what Tidy does).
 *
 *   {| style='border:1px solid red;'
 *   |a
 *   |-
 *   {| style='border:1px solid blue;'
 *   |b
 *   |c
 *   |}
 *   |}
 *
 * The lint is recorded on a table whose previous non-separator sibling is
 * a table with an auto-inserted end tag.
 */
private function lintDeletableTableTag(
	Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $c ) !== 'table' ) {
		return;
	}

	$before = DiffDOMUtils::previousNonSepSibling( $c );
	if ( !( $before instanceof Element ) || DOMCompat::nodeName( $before ) !== 'table' ) {
		return;
	}
	if ( empty( DOMDataUtils::getDataParsoid( $before )->autoInsertedEnd ) ) {
		return;
	}

	// Identify the dsr-span of the opening tag
	// of the table that needs to be deleted
	$narrowToOpeningTag = static function ( ?DomSourceRange $nodeDSR ): ?DomSourceRange {
		if ( $nodeDSR === null ) {
			return null;
		}
		// Work on a copy so the node's own DSR stays untouched
		$span = clone $nodeDSR;
		if ( !empty( $span->openWidth ) ) {
			$span->end = $span->innerStart();
			$span->openWidth = 0;
			$span->closeWidth = 0;
		}
		return $span;
	};

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$lintDsr = $this->findLintDSR(
		$tplLintInfo, $tplInfo, $dp->dsr ?? null, $narrowToOpeningTag
	);
	$env->recordLint( 'deletable-table-tag', [
		'dsr' => $lintDsr,
		'templateInfo' => $tplLintInfo,
		'params' => [ 'name' => 'table' ],
	] );
}
760 | |
/**
 * Return the first direct child of $node accepted by $filter,
 * or null when no child is accepted.
 */
private function findMatchingChild( Node $node, callable $filter ): ?Node {
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( $filter( $child ) ) {
			return $child;
		}
	}

	return null;
}
772 | |
/**
 * Test if the node carries a 'nowrap' CSS rule.
 *
 * In general such CSS could come from a class, a <style> tag, a
 * stylesheet or even JS code. For now the inspection is limited to
 * inline CSS, since the goal is to help editors fix patterns that
 * can be detected automatically.
 *
 * Special case: enwiki has Template:nowrap, which assigns
 * class='nowrap' with CSS white-space:nowrap in MediaWiki:Common.css,
 * so that class name is checked as well.
 */
private function hasNoWrapCSS( Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}

	$inlineStyle = DOMCompat::getAttribute( $node, 'style' ) ?? '';
	if ( str_contains( $inlineStyle, 'nowrap' ) ) {
		return true;
	}

	return DOMUtils::hasClass( $node, 'nowrap' );
}
792 | |
/**
 * Lint bad P wrapping.
 *
 * Flags a non-block node with nowrap CSS that sits directly inside a
 * wikitext block node and has a <p> among its direct children.
 *
 * Lint category: `pwrap-bug-workaround`
 */
private function lintPWrapBugWorkaround(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMUtils::isWikitextBlockNode( $node ) ) {
		return;
	}
	if ( !DOMUtils::isWikitextBlockNode( $node->parentNode ) ) {
		return;
	}
	if ( !$this->hasNoWrapCSS( $node ) ) {
		return;
	}

	$isParagraph = static function ( $candidate ) {
		return DOMCompat::nodeName( $candidate ) === 'p';
	};
	if ( !$this->findMatchingChild( $node, $isParagraph ) ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$lintDSR = $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'pwrap-bug-workaround', [
		'dsr' => $lintDSR,
		'templateInfo' => $tplLintInfo,
		'params' => [
			'root' => DOMCompat::nodeName( $node->parentNode ),
			'child' => DOMCompat::nodeName( $node ),
		]
	] );
}
823 | |
/**
 * Lint Tidy div span flip.
 *
 * Flags a <span> whose first non-separator child is a <div>, provided
 * at least one of the two carries a class or style attribute.
 *
 * Lint category: `misc-tidy-replacement-issues`
 */
private function lintMiscTidyReplacementIssues(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'span' ) {
		return;
	}

	$firstChild = DiffDOMUtils::firstNonSepChild( $node );
	$firstChildIsDiv = $firstChild instanceof Element &&
		DOMCompat::nodeName( $firstChild ) === 'div';
	if ( !$firstChildIsDiv ) {
		return;
	}

	// Without any style/class attribute on either element,
	// the flip cannot affect rendering.
	$isStyled = $node->hasAttribute( 'class' ) || $node->hasAttribute( 'style' ) ||
		$firstChild->hasAttribute( 'class' ) || $firstChild->hasAttribute( 'style' );
	if ( !$isStyled ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'misc-tidy-replacement-issues', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [ 'subtype' => 'div-span-flip' ]
	] );
}
856 | |
/**
 * Lint tidy whitespace bug.
 *
 * Starting at $node, collects a run of sibling nodes and records a
 * `tidy-whitespace-bug` lint for each nowrap node in that run whose last
 * non-comment child is text ending in whitespace, provided the run
 * (extended backwards over preceding siblings) is at least
 * `tidyWhitespaceBugMaxLength` long (default 100).
 *
 * All Element nodes of the collected run are flagged with
 * TempData::PROCESSED_TIDY_WS_BUG so that a later call on any of them
 * returns immediately.
 *
 * Lint category: `tidy-whitespace-bug`
 *
 * @param Env $env Used to read the linter config and to record lints
 * @param Node $node First node of the run to inspect
 * @param DataParsoid $dp Data-parsoid of $node (only its temp flag is read here)
 * @param ?stdClass $tplInfo Info about the enclosing encapsulated content, if any
 */
private function lintTidyWhitespaceBug(
	Env $env, Node $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// We handle a run of nodes in one shot.
	// No need to reprocess repeatedly.
	if ( $dp->getTempFlag( TempData::PROCESSED_TIDY_WS_BUG ) ) {
		return;
	}

	// Find the longest run of nodes that are affected by white-space:nowrap CSS
	// in a way that leads to unsightly rendering in HTML5 compliant browsers.
	//
	// Check if Tidy does buggy whitespace hoisting there to provide the browser
	// opportunities to split the content in short segments.
	//
	// If so, editors would need to edit this run of nodes to introduce
	// whitespace breaks as necessary so that HTML5 browsers get that
	// same opportunity when Tidy is removed.
	$s = null;
	$nowrapNodes = [];
	'@phan-var array<array{node:Node,tidybug:bool,hasLeadingWS:bool}> $nowrapNodes';
	$startNode = $node;
	$haveTidyBug = false;
	// Accumulated via strlen(), i.e. measured in bytes rather than characters.
	$runLength = 0;

	// <br>, <wbr>, <hr> break a line
	while ( $node && !DOMUtils::isRemexBlockNode( $node ) &&
		!in_array( DOMCompat::nodeName( $node ), [ 'hr', 'br', 'wbr' ], true )
	) {
		if ( $node instanceof Text || !$this->hasNoWrapCSS( $node ) ) {
			// No CSS property that affects whitespace.
			$s = $node->textContent;
			if ( preg_match( '/^(\S*)\s/', $s, $m ) ) { // PORT-FIXME: non-ASCII whitespace?
				// The first whitespace in this node ends the run: only the
				// non-whitespace prefix counts towards the run length.
				$runLength += strlen( $m[1] );
				$nowrapNodes[] = [
					'node' => $node,
					'tidybug' => false,
					'hasLeadingWS' => ( preg_match( '/^\s/', $s ) === 1 ), // PORT-FIXME: non-ASCII whitespace?
				];
				break;
			} else {
				// No whitespace at all: the whole text extends the run.
				// ('hasLeadingWS' is left unset; it is read with empty() below.)
				$nowrapNodes[] = [ 'node' => $node, 'tidybug' => false ];
				$runLength += strlen( $s );
			}
		} else {
			// Find last non-comment child of node
			$last = $node->lastChild;
			while ( $last instanceof Comment ) {
				$last = $last->previousSibling;
			}

			$bug = false;
			if ( $last instanceof Text &&
				preg_match( '/\s$/D', $last->nodeValue ) // PORT-FIXME: non-ASCII whitespace?
			) {
				// In this scenario, when Tidy hoists the whitespace to
				// after the node, that whitespace is not subject to the
				// nowrap CSS => browsers can break content there.
				//
				// But, non-Tidy libraries won't hoist the whitespace.
				// So, browsers don't have a place to break content.
				$bug = true;
				$haveTidyBug = true;
			}

			$nowrapNodes[] = [ 'node' => $node, 'tidybug' => $bug ];
			$runLength += strlen( $node->textContent );
		}

		// Don't cross template boundaries at the top-level
		if ( $tplInfo && $tplInfo->last === $node ) {
			// Exiting a top-level template
			break;
		} elseif ( !$tplInfo && WTUtils::findFirstEncapsulationWrapperNode( $node ) ) {
			// Entering a top-level template
			break;
		}

		// Move to the next non-comment sibling
		$node = $node->nextSibling;
		while ( $node instanceof Comment ) {
			$node = $node->nextSibling;
		}
	}

	// Helper: flag every Element of the collected run as processed so the
	// early return at the top of this method fires for them.
	$markProcessedNodes = static function () use ( &$nowrapNodes ) { // Helper
		foreach ( $nowrapNodes as $o ) {
			// Phan fails at applying the instanceof type restriction to the array member when analyzing the
			// following call, but is fine when it's copied to a local variable.
			$node = $o['node'];
			if ( $node instanceof Element ) {
				DOMDataUtils::getDataParsoid( $node )->setTempFlag( TempData::PROCESSED_TIDY_WS_BUG );
			}
		}
	};

	if ( !$haveTidyBug ) {
		// Mark processed nodes and bail
		$markProcessedNodes();
		return;
	}

	// Find run before startNode that doesn't have a whitespace break
	$prev = $startNode->previousSibling;
	while ( $prev && !DOMUtils::isRemexBlockNode( $prev ) ) {
		if ( !( $prev instanceof Comment ) ) {
			$s = $prev->textContent;
			// Find the last \s in the string
			if ( preg_match( '/\s(\S*)$/D', $s, $m ) ) { // PORT-FIXME: non-ASCII whitespace here?
				// Only the text after the last whitespace belongs to the run.
				$runLength += strlen( $m[1] );
				break;
			} else {
				$runLength += strlen( $s );
			}
		}
		$prev = $prev->previousSibling;
	}

	$lintConfig = $env->getLinterConfig();
	$tidyWhitespaceBugMaxLength = $lintConfig['tidyWhitespaceBugMaxLength'] ?? 100;

	// Runs shorter than the configured threshold are not reported.
	if ( $runLength < $tidyWhitespaceBugMaxLength ) {
		// Mark processed nodes and bail
		$markProcessedNodes();
		return;
	}

	// For every node where Tidy hoists whitespace,
	// emit an event to flag a whitespace fixup opportunity.
	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$n = count( $nowrapNodes ) - 1;
	foreach ( $nowrapNodes as $i => $o ) {
		// Skip the last node of the run, and nodes whose successor in the
		// run already starts with whitespace.
		if ( $o['tidybug'] && $i < $n && empty( $nowrapNodes[$i + 1]['hasLeadingWS'] ) ) {
			$nowrapNode = $o['node']; // (see above)
			$lintObj = [
				'dsr' => $this->findLintDSR(
					$tplLintInfo,
					$tplInfo,
					$nowrapNode instanceof Element
						? DOMDataUtils::getDataParsoid( $nowrapNode )->dsr ?? null
						: null
				),
				'templateInfo' => $tplLintInfo,
				'params' => [
					'node' => DOMCompat::nodeName( $o['node'] ),
					// NOTE(review): this is the raw DOM sibling; comments are skipped
					// when building the run, so this may name a comment node -- confirm
					// that is intended.
					'sibling' => DOMCompat::nodeName( $o['node']->nextSibling )
				]
			];

			$env->recordLint( 'tidy-whitespace-bug', $lintObj );
		}
	}

	$markProcessedNodes();
}
1017 | |
/**
 * Detect multiple-unclosed-formatting-tags errors.
 *
 * Since unclosed <small> and <big> tags accumulate their effects
 * in HTML5 parsers (unlike in Tidy where it seems to suppress
 * multiple unclosed elements of the same name), such pages will
 * break pretty spectacularly with Remex.
 *
 * Ex: https://it.wikipedia.org/wiki/Hubert_H._Humphrey_Metrodome?oldid=93017491#Note
 *
 * Scans the already recorded lints for `missing-end-tag` entries outside
 * tables. As soon as a second unclosed <small> (or a second unclosed
 * <big>) is seen, one lint is recorded that reuses the params, dsr and
 * templateInfo of the *first* unclosed tag of that name.
 *
 * Lint category: `multiple-unclosed-formatting-tags`
 *
 * @param array $lints Previously recorded lints; each entry is read as an
 *   array with 'type', 'params' and optionally 'dsr' / 'templateInfo'
 * @param Env $env Used to record the new lint
 */
private function lintMultipleUnclosedFormattingTags( array $lints, Env $env ): void {
	$firstUnclosedTag = [
		'small' => null,
		'big' => null
	];
	$multiUnclosedTagName = null;
	foreach ( $lints as $item ) {
		if ( ( $item['type'] ?? null ) !== 'missing-end-tag' ) {
			continue;
		}
		// Unclosed tags in tables don't leak out of the table.
		// empty() tolerates entries that carry no 'inTable' param at all.
		if ( !empty( $item['params']['inTable'] ) ) {
			continue;
		}
		$tagName = $item['params']['name'] ?? null;
		if ( $tagName !== 'small' && $tagName !== 'big' ) {
			continue;
		}
		if ( !$firstUnclosedTag[$tagName] ) {
			$firstUnclosedTag[$tagName] = $item;
		} else {
			$multiUnclosedTagName = $tagName;
			break;
		}
	}

	if ( $multiUnclosedTagName === null ) {
		return;
	}

	$item = $firstUnclosedTag[$multiUnclosedTagName];
	// 'dsr' and 'templateInfo' may be absent from a stored lint; read them
	// defensively so a missing key yields null instead of an
	// "undefined array key" warning.
	$dsr = isset( $item['dsr'] )
		? DomSourceRange::newFromJsonArray( $item['dsr'] )
		: null;
	$env->recordLint( 'multiple-unclosed-formatting-tags', [
		'params' => $item['params'],
		'dsr' => $dsr,
		'templateInfo' => $item['templateInfo'] ?? null,
	] );
}
1064 | |
/**
 * Post-process an array of lints.
 *
 * Currently this only runs the multiple-unclosed-formatting-tags
 * detection over the given lints.
 *
 * @param array $lints Lints to inspect
 * @param Env $env Passed through to the detection, which records lints on it
 */
private function postProcessLints( array $lints, Env $env ): void {
	$this->lintMultipleUnclosedFormattingTags( $lints, $env );
}
1071 | |
/**
 * Get the nearest list item at or above $node, but only if that list
 * item is neither literal HTML nor part of <references> extension
 * content. Returns null otherwise.
 */
private function getWikitextListItemAncestor( ?Node $node ): ?Node {
	$listItem = $node;
	while ( $listItem ) {
		if ( DOMUtils::isListItem( $listItem ) ) {
			break;
		}
		$listItem = $listItem->parentNode;
	}

	if ( !$listItem ) {
		return null;
	}
	if ( WTUtils::isLiteralHTMLNode( $listItem ) ) {
		return null;
	}
	if ( WTUtils::fromExtensionContent( $listItem, 'references' ) ) {
		return null;
	}

	return $listItem;
}
1088 | |
/**
 * Lint a PHP parser bug.
 *
 * When an HTML table is nested inside a list and any part of the table
 * is on a new line, the PHP parser misnests the list and the table.
 * Tidy fixes the misnesting one way (puts table inside/outside the list),
 * an HTML5 parser fixes it another way (list expands to rest of the page!)
 *
 * Lint category: `multiline-html-table-in-list`
 */
private function lintMultilineHtmlTableInList(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Only literal HTML <table> tags are of interest.
	if ( !WTUtils::isLiteralHTMLNode( $node ) ) {
		return;
	}
	if ( DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// ... and only when nested in a wikitext list item ...
	$listItem = $this->getWikitextListItemAncestor( $node );
	if ( !$listItem ) {
		return;
	}

	// ... and only when the table's outer HTML spans more than one line.
	$tableHtml = DOMCompat::getOuterHTML( $node );
	if ( !str_contains( $tableHtml, "\n" ) ) {
		return;
	}

	// An HTML table nested inside a list with a newline break in its
	// outer HTML => trouble with the PHP Parser + Remex combo
	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'multiline-html-table-in-list', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [
			'name' => 'table',
			'ancestorName' => DOMCompat::nodeName( $listItem ),
		],
	] );
}
1126 | |
/**
 * Log wikilinks or media in external links.
 *
 * HTML tags can be nested, but <a> tags cannot: nested <a> tags end up
 * adjacent to each other in the output. In the example below,
 * [[Google]] is a wikilink nested in the outer external link:
 *   [http://google.com This is [[Google]]'s search page]
 *
 * Linter category: `wikilink-in-extlink`
 */
private function lintWikilinksInExtlink(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'a' ) {
		return;
	}
	if ( !DOMUtils::hasRel( $node, "mw:ExtLink" ) ) {
		return;
	}
	// Images in extlinks end up with broken up extlinks inside the
	// <figure> DOM. Those carry the 'misnested' flag. Ignore those.
	if ( !empty( $dp->misnested ) ) {
		return;
	}

	$sibling = $node->nextSibling;
	$shouldLint = $sibling instanceof Element &&
		!empty( DOMDataUtils::getDataParsoid( $sibling )->misnested ) &&
		// This check may not be necessary but ensures that we are
		// really in a link-in-link misnested scenario.
		DOMUtils::treeHasElement( $sibling, 'a', true );

	// Media, as opposed to most instances of img (barring the link= trick),
	// doesn't result in misnesting according to the html5 spec since links
	// are actively suppressed in its structure. However, timed media is
	// inherently clickable, so nesting it in an extlink could surprise a
	// user clicking on it by navigating away from the page.
	if ( !$shouldLint ) {
		$flagTimedMedia = static function ( $descendant ) use ( &$shouldLint ) {
			if ( !( $descendant instanceof Element ) ) {
				return;
			}
			if ( in_array( DOMCompat::nodeName( $descendant ), [ 'audio', 'video' ], true ) ) {
				$shouldLint = true;
			}
		};
		DOMUtils::visitDOM( $node, $flagTimedMedia );
	}

	if ( !$shouldLint ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'wikilink-in-extlink', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
	] );
}
1179 | |
/**
 * Record a `large-tables` lint for the given table node.
 *
 * @param Env $env Used to record the lint
 * @param ?stdClass $tplInfo Info about the enclosing encapsulated content, if any
 * @param Element $node The table being reported
 * @param int $numColumns Number of cells counted in the offending row
 * @param int $columnsMax Threshold that was exceeded
 */
private function recordLargeTablesLint(
	Env $env, ?stdClass $tplInfo, Element $node, int $numColumns, int $columnsMax
): void {
	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$tableDSR = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
	$lintDSR = $this->findLintDSR( $tplLintInfo, $tplInfo, $tableDSR );

	$env->recordLint( 'large-tables', [
		'dsr' => $lintDSR,
		'templateInfo' => $tplLintInfo,
		'params' => [
			'name' => 'table',
			'columns' => $numColumns,
			'columnsMax' => $columnsMax,
		],
	] );
}
1197 | |
/**
 * Return $n itself if it is an Element, otherwise the first following
 * sibling that is an Element, or null if there is none.
 *
 * TODO: In the future, this may merit being moved to DOMUtils
 * along with its "previous" variant.
 */
private function skipNonElementNodes( ?Node $n ): ?Element {
	for ( $candidate = $n; $candidate; $candidate = $candidate->nextSibling ) {
		if ( $candidate instanceof Element ) {
			return $candidate;
		}
	}
	return null;
}
1208 | |
/**
 * Lint large tables.
 *
 * Identify articles having overly-large tables
 * to help editors optimize their articles.
 *
 * Inspects up to `maxTableRowsToCheck` (default 10) element children of
 * the table's first <tbody> and records a lint as soon as one of them
 * contains more than `maxTableColumnHeuristic` (default 5) <th> cells
 * or more than that many <td> cells.
 *
 * Linter category: `large-tables`
 *
 * @param Env $env Used to read the linter config and to record the lint
 * @param Element $node Node to inspect; ignored unless it is a <table>
 * @param DataParsoid $dp Unused here; kept for signature parity with the other lint methods
 * @param ?stdClass $tplInfo Info about the enclosing encapsulated content, if any
 */
private function lintLargeTables(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// Skip tables that have nested tables in them as they are likely
	// to be used for layout and not for data representation.
	// We may check nested tables in the next iteration of this lint.
	$nestedTables = $node->getElementsByTagName( 'table' );
	if ( $nestedTables->length > 0 ) {
		return;
	}

	$lintConfig = $env->getLinterConfig();
	$maxColumns = $lintConfig['maxTableColumnHeuristic'] ?? 5;
	$maxRowsToCheck = $lintConfig['maxTableRowsToCheck'] ?? 10;

	$tbody = DOMCompat::querySelector( $node, 'tbody' );
	// empty table
	if ( !$tbody ) {
		return;
	}

	// skipNonElementNodes() is an instance method, so call it through
	// $this like every other helper in this class.
	$trCount = 0;
	$tr = $this->skipNonElementNodes( $tbody->firstChild );
	while ( $tr && $trCount < $maxRowsToCheck ) {
		// Header cells are checked before data cells; the first kind
		// that exceeds the threshold is the one reported.
		foreach ( [ 'th', 'td' ] as $cellName ) {
			$numCells = $tr->getElementsByTagName( $cellName )->length;
			if ( $numCells > $maxColumns ) {
				$this->recordLargeTablesLint( $env, $tplInfo, $node, $numCells, $maxColumns );
				return;
			}
		}

		$tr = $this->skipNonElementNodes( $tr->nextSibling );
		$trCount++;
	}
}
1260 | |
/**
 * Log inline background color style rules without a color style rule.
 *
 * Looks at the element's inline style attribute and records a lint when
 * it sets `background` / `background-color` but no `color`. This helps
 * editors make their articles comply with WCAG color contrast rules.
 *
 * Linter category: `night-mode-unaware-background-color`
 */
private function lintNightModeUnawareBackgroundColor(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	$inlineStyle = DOMCompat::getAttribute( $node, 'style' );
	if ( $inlineStyle === null ) {
		// No inline style at all
		return;
	}

	// Nothing to report unless a background color is set ...
	if ( !preg_match( '/(^|;)\s*background(-color)?\s*:/i', $inlineStyle ) ) {
		return;
	}
	// ... while the font color is left unspecified.
	if ( preg_match( '/(^|;)\s*color\s*:/i', $inlineStyle ) ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'night-mode-unaware-background-color', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
	] );
}
1291 | |
/**
 * Lint for missing image alt text.
 *
 * Records a `missing-image-alt-text` lint for generated figures whose
 * <img> has no alt attribute, unless the image or one of its ancestors
 * is marked aria-hidden=true or has role presentation/none.
 */
private function lintMissingAltText(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( !WTUtils::isGeneratedFigure( $c ) ) {
		return;
	}

	// The media element is expected in its standard place:
	// first child of the figure's first child.
	$media = $c->firstChild->firstChild ?? null;
	$isImage = $media instanceof Element && DOMCompat::nodeName( $media ) === 'img';
	if ( !$isImage ) {
		// Videos and such are handled differently; only simple
		// image output is checked for alt text.
		return;
	}

	// An alt attribute counts as present whether it came from explicit
	// markup, an inline caption or any other source. An explicitly empty
	// alt text counts as present too, since that may be deliberate
	// (spacer images or similar).
	if ( $media->hasAttribute( 'alt' ) ) {
		return;
	}

	// Walk up the parent chain looking for aria-hidden=true or equivalent roles
	$ancestor = $media;
	while ( $ancestor->parentNode ) {
		$hidden = strtolower( DOMCompat::getAttribute( $ancestor, 'aria-hidden' ) ?? '' );
		$role = strtolower( DOMCompat::getAttribute( $ancestor, 'role' ) ?? '' );
		if ( $hidden === 'true' || in_array( $role, [ 'presentation', 'none' ], true ) ) {
			// This entire subtree is excluded from the accessibility tree.
			return;
		}
		$ancestor = $ancestor->parentNode;
	}

	$resource = DOMCompat::getAttribute( $media, 'resource' ) ?? '';
	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'missing-image-alt-text', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [
			'file' => basename( urldecode( $resource ) ),
		]
	] );
}
1346 | |
/**
 * Log wikitext fixups.
 *
 * Runs every per-node lint check on $node, in a fixed order. Each check
 * decides for itself whether the node is relevant and records its own
 * lints on $env.
 *
 * @param Element $node Node to lint
 * @param Env $env Receives the recorded lints
 * @param ?stdClass $tplInfo Info about the enclosing encapsulated content, if any
 */
private function logWikitextFixups(
	Element $node, Env $env, ?stdClass $tplInfo
): void {
	// The node's data-parsoid is fetched once and shared by all checks.
	$dp = DOMDataUtils::getDataParsoid( $node );
	$this->lintTreeBuilderFixup( $env, $node, $dp, $tplInfo );
	$this->lintDeletableTableTag( $env, $node, $dp, $tplInfo ); // For T161341
	$this->lintPWrapBugWorkaround( $env, $node, $dp, $tplInfo ); // For T161306
	$this->lintObsoleteTag( $env, $node, $dp, $tplInfo );
	$this->lintBogusImageOptions( $env, $node, $dp, $tplInfo );
	$this->lintTidyWhitespaceBug( $env, $node, $dp, $tplInfo );
	$this->lintMiscTidyReplacementIssues( $env, $node, $dp, $tplInfo );
	$this->lintMultilineHtmlTableInList( $env, $node, $dp, $tplInfo );
	$this->lintWikilinksInExtlink( $env, $node, $dp, $tplInfo );
	$this->lintLargeTables( $env, $node, $dp, $tplInfo );
	$this->lintNightModeUnawareBackgroundColor( $env, $node, $dp, $tplInfo );
	$this->lintFostered( $env, $node, $dp, $tplInfo );
	$this->lintMissingAltText( $env, $node, $dp, $tplInfo );
}
1368 | |
/**
 * Walk the DOM and compute lints for the entire tree.
 * - When we enter encapsulated content (templates or extensions),
 *   compute "tplInfo" (misnamed given that it can be an extension)
 *   so that lints from the templates' content can be mapped back
 *   to the transclusion that generated them.
 * - When we process extensions, if we have a lint handler for the
 *   extension, let the extension's lint handler compute lints.
 *
 * Only Element children of $root are visited; the method recurses into
 * each element's subtree unless a native extension handled it.
 *
 * @param Node $root Node whose children are linted
 * @param Env $env Receives the recorded lints
 * @param ?stdClass $tplInfo Encapsulation info inherited from the caller, if any
 */
private function findLints(
	Node $root, Env $env, ?stdClass $tplInfo = null
): void {
	$node = $root->firstChild;
	while ( $node !== null ) {
		// Text, comments, etc. are not linted on their own.
		if ( !$node instanceof Element ) {
			$node = $node->nextSibling;
			continue;
		}

		// !tplInfo check is to protect against templated content in
		// extensions which might in turn be nested in templated content.
		if ( !$tplInfo && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
			$aboutSibs = WTUtils::getAboutSiblings( $node, DOMCompat::getAttribute( $node, 'about' ) );
			$tplInfo = (object)[
				'first' => $node,
				// Last node of the encapsulated range; used below to know
				// when the range has been left.
				'last' => end( $aboutSibs ),
				'dsr' => DOMDataUtils::getDataParsoid( $node )->dsr ?? null,
				// FIXME: This is not being used. Instead the code is recomputing
				// this info in findEnclosingTemplateName.
				'isTemplated' => DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ),
			];
		}

		$handled = false;

		// Let native extensions lint their content
		$nativeExt = WTUtils::getNativeExt( $env, $node );
		if ( $nativeExt ) {
			// The extension API object is created lazily and reused.
			if ( !$this->extApi ) {
				$this->extApi = new ParsoidExtensionAPI( $env );
			}
			$handled = $nativeExt->lintHandler(
				$this->extApi,
				$node,
				// Callback the extension can use to lint a subtree with the
				// default handlers; tplInfo is only forwarded when the
				// extension tag itself came from a transclusion.
				function ( $extRootNode ) use ( $env, $tplInfo ) {
					$this->findLints(
						$extRootNode, $env,
						empty( $tplInfo->isTemplated ) ? null : $tplInfo
					);
				}
			);
			// NOTE: See the note in WrapSectionsState::shouldOmitFromTOC()
			// but we've assumed extension content is contained in a single
			// wrapper node and it's safe to move to $node->nextSibling.
		}

		// Default node handler
		// (strict comparison: any value other than false counts as handled)
		if ( $handled === false ) {
			// Lint this node
			$this->logWikitextFixups( $node, $env, $tplInfo );

			// Lint subtree
			$this->findLints( $node, $env, $tplInfo );
		}

		// Leaving the encapsulated range: stop attributing lints to it.
		if ( $tplInfo && $tplInfo->last === $node ) {
			$tplInfo = null;
		}

		$node = $node->nextSibling;
	}
}
1441 | |
/**
 * This is only invoked on the top-level document.
 *
 * Lints the whole tree below $root, post-processes the collected lints
 * and, when a metrics object is available, reports the time spent
 * under the "linting" key.
 *
 * @inheritDoc
 */
public function run(
	Env $env, Node $root, array $options = [], bool $atTopLevel = false
): void {
	// Time spent linting is tracked so the benefit of moving this code
	// off the critical path into its own post processor can be evaluated.
	$metrics = $env->getSiteConfig()->metrics();
	$timer = $metrics ? Timing::start( $metrics ) : null;

	$this->findLints( $root, $env );
	$collectedLints = $env->getLints();
	$this->postProcessLints( $collectedLints, $env );

	if ( $metrics ) {
		$timer->end( "linting" );
	}
}
1465 | |
1466 | } |