Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
98.15% |
583 / 594 |
|
85.29% |
29 / 34 |
CRAP | |
0.00% |
0 / 1 |
Linter | |
98.15% |
583 / 594 |
|
85.29% |
29 / 34 |
253 | |
0.00% |
0 / 1 |
getTagsWithChangedMisnestingBehavior | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
7 | |||
leftMostMisnestedDescendent | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
8 | |||
getMatchingMisnestedNode | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
findEnclosingTemplateName | |
77.78% |
14 / 18 |
|
0.00% |
0 / 1 |
7.54 | |||
findLintDSR | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
5 | |||
hasIdenticalNestedTag | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
5 | |||
hasMisnestableContent | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
10 | |||
endTagOptional | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getHeadingAncestor | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
matchedOpenTagPairExists | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
7.05 | |||
lintTreeBuilderFixup | |
100.00% |
59 / 59 |
|
100.00% |
1 / 1 |
31 | |||
lintFostered | |
100.00% |
28 / 28 |
|
100.00% |
1 / 1 |
11 | |||
lintObsoleteTag | |
100.00% |
38 / 38 |
|
100.00% |
1 / 1 |
17 | |||
lintBogusImageOptions | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
8 | |||
lintDeletableTableTag | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
7 | |||
findMatchingChild | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
hasNoWrapCSS | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
lintPWrapBugWorkaround | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
5 | |||
lintMiscTidyReplacementIssues | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
8 | |||
lintTidyWhitespaceBug | |
98.78% |
81 / 82 |
|
0.00% |
0 / 1 |
29 | |||
lintMultipleUnclosedFormattingTags | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
9 | |||
postProcessLints | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getWikitextListItemAncestor | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
6 | |||
lintMultilineHtmlTableInList | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
5 | |||
lintWikilinksInExtlink | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
11 | |||
recordLargeTablesLint | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
skipNonElementNodes | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
lintLargeTables | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
8 | |||
lintNightModeUnawareBackgroundColor | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
lintMissingAltText | |
100.00% |
25 / 25 |
|
100.00% |
1 / 1 |
9 | |||
lintDuplicateIds | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
4 | |||
logWikitextFixups | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
findLints | |
88.24% |
30 / 34 |
|
0.00% |
0 / 1 |
11.20 | |||
run | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
2.01 |
1 | <?php |
2 | |
3 | declare( strict_types = 1 ); |
4 | |
5 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; |
6 | |
7 | use stdClass; |
8 | use Wikimedia\Assert\UnreachableException; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\DOM\Comment; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Node; |
14 | use Wikimedia\Parsoid\DOM\Text; |
15 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
16 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
17 | use Wikimedia\Parsoid\NodeData\TempData; |
18 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
19 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
20 | use Wikimedia\Parsoid\Utils\DOMCompat; |
21 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
22 | use Wikimedia\Parsoid\Utils\DOMUtils; |
23 | use Wikimedia\Parsoid\Utils\PHPUtils; |
24 | use Wikimedia\Parsoid\Utils\Timing; |
25 | use Wikimedia\Parsoid\Utils\Utils; |
26 | use Wikimedia\Parsoid\Utils\WTUtils; |
27 | use Wikimedia\Parsoid\Wikitext\Consts; |
28 | use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; |
29 | |
30 | /** |
31 | * DOM pass that walks the DOM tree, detects specific wikitext patterns, |
32 | * and emits them as linter events. |
33 | */ |
34 | class Linter implements Wt2HtmlDOMProcessor { |
35 | private ?ParsoidExtensionAPI $extApi = null; |
36 | private ?string $obsoleteTagsRE = null; |
37 | private array $seenIds = []; |
38 | |
39 | /** @var array<string,bool>|null */ |
40 | private ?array $tagsWithChangedMisnestingBehavior = null; |
41 | |
42 | /** |
43 | * We are trying to find HTML5 tags that have different behavior compared to HTML4 |
44 | * in some misnesting scenarios around wikitext paragraphs. |
45 | * |
46 | * Ex: Input: <p><small>a</p><p>b</small></p> |
47 | * Tidy output: <p><small>a</small></p><p><small>b</small></p> |
48 | * HTML5 output: <p><small>a</small></p><p><small>b</small></p> |
49 | * |
50 | * So, all good here. |
51 | * But, see how output changes when we use <span> instead |
52 | * |
53 | * Ex: Input: <p><span>a</p><p>b</span></p> |
54 | * Tidy output: <p><span>a</span></p><p><span>b</span></p> |
55 | * HTML5 output: <p><span>a</span></p><p>b</p> |
56 | * |
57 | * The source wikitext is "<span>a\n\nb</span>". The difference persists even |
58 | * when you have "<span>a\n\n<div>b</div>" or "<span>a\n\n{|\n|x\n|}\nbar". |
59 | * |
60 | * This is because Tidy seems to be doing the equivalent of HTML5-treebuilder's
61 | * active formatting element reconstruction step on all *inline* elements. |
62 | * However, HTML5 parsers only do that on formatting elements. So, we need |
63 | * to compute which HTML5 tags are subject to this differential behavior. |
64 | * |
65 | * We compute that by excluding the following tags from the list of all HTML5 tags |
66 | * - If our sanitizer doesn't allow them, they will be escaped => ignore them |
67 | * - HTML4 block tags are excluded (obviously) |
68 | * - Void tags don't matter since they cannot wrap anything (obviously) |
69 | * - Active formatting elements have special handling in the HTML5 tree building |
70 | * algorithm where they are reconstructed to wrap all originally intended content. |
71 | * (ex: <small> above) |
72 | * |
73 | * Here is the list of 22 HTML5 tags that are affected: |
74 | * ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, KBD, MARK, |
75 | * Q, RB, RP, RT, RTC, RUBY, SAMP, SPAN, SUB, SUP, TIME, VAR |
76 | * |
77 | * https://phabricator.wikimedia.org/T176363#3628173 verifies that this list of |
78 | * tags all demonstrate this behavior. |
79 | * |
80 | * @return array |
81 | * @phan-return array<string,bool> |
82 | */ |
private function getTagsWithChangedMisnestingBehavior(): array {
	// Computed once per Linter instance and then served from the cache.
	if ( $this->tagsWithChangedMisnestingBehavior !== null ) {
		return $this->tagsWithChangedMisnestingBehavior;
	}

	// This set is frozen in time. It gets us down to the requisite
	// 22 HTML5 tags listed in the method documentation, but shouldn't
	// be used for anything other than that.
	$tidyBlockTags = PHPUtils::makeSet( [
		'div', 'p',
		// tables
		'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td',
		// lists
		'ul', 'ol', 'li', 'dl', 'dt', 'dd',
		// HTML5 heading content
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
		// HTML5 sectioning content
		'article', 'aside', 'nav', 'section', 'footer', 'header',
		'figure', 'figcaption', 'fieldset', 'details', 'blockquote',
		// other
		'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed',
		'map', 'object', 'pre', 'progress', 'video',
	] );

	$affectedTags = [];
	foreach ( Consts::$HTML['HTML5Tags'] as $tagName => $unused ) {
		// A tag is excluded when the sanitizer doesn't allow it, or when it
		// is a (Tidy) block tag, a formatting tag, or a void tag.
		$isExcluded = !isset( Consts::$Sanitizer['AllowedLiteralTags'][$tagName] ) ||
			isset( $tidyBlockTags[$tagName] ) ||
			isset( Consts::$HTML['FormattingTags'][$tagName] ) ||
			isset( Consts::$HTML['VoidTags'][$tagName] );
		if ( !$isExcluded ) {
			$affectedTags[$tagName] = true;
		}
	}

	$this->tagsWithChangedMisnestingBehavior = $affectedTags;
	return $affectedTags;
}
117 | |
118 | /** |
119 | * Finds a matching node at the "start" of this node. |
120 | */ |
private function leftMostMisnestedDescendent( ?Node $node, Element $match ): ?Element {
	$wantedName = DOMCompat::nodeName( $match );

	// Walk down the first-child chain for as long as we keep seeing elements.
	for ( $cur = $node; $cur instanceof Element; $cur = $cur->firstChild ) {
		// A stripped-tag placeholder ends the search: it either carries
		// the name we are after, or there is no match on this path.
		if ( DOMUtils::isMarkerMeta( $cur, 'mw:Placeholder/StrippedTag' ) ) {
			$strippedName = DOMDataUtils::getDataParsoid( $cur )->name ?? null;
			return $strippedName === $wantedName ? $cur : null;
		}

		if ( DOMCompat::nodeName( $cur ) !== $wantedName ) {
			continue;
		}

		$curDp = DOMDataUtils::getDataParsoid( $cur );
		$sameSyntax =
			( DOMDataUtils::getDataParsoid( $match )->stx ?? null ) === ( $curDp->stx ?? null );
		if ( !$sameSyntax || empty( $curDp->autoInsertedStart ) ) {
			continue;
		}

		// Same tag, same syntax, auto-inserted start tag. If its end tag was
		// auto-inserted as well, continue the search from this node onwards.
		return empty( $curDp->autoInsertedEnd )
			? $cur
			: $this->getMatchingMisnestedNode( $cur, $match );
	}

	return null;
}
146 | |
147 | /** |
148 | * $node has an 'autoInsertedEnd' flag set on it. We are looking for |
149 | * its matching node that has an 'autoInsertedStart' flag set on it. |
150 | * This happens when the tree-builder fixes up misnested tags. |
151 | * This "adjacency" is wrt the HTML string. In a DOM, this can either |
152 | * be the next sibling OR, it might be the left-most-descendent
153 | * of $node's parent's sibling (and so on up the ancestor chain).
154 | */ |
private function getMatchingMisnestedNode( Node $node, Element $match ): ?Element {
	// Climb the ancestor chain until some node on it has a following
	// non-separator sibling, then search that sibling's left edge.
	for ( $cur = $node; !DOMUtils::atTheTop( $cur ); $cur = $cur->parentNode ) {
		$following = DiffDOMUtils::nextNonSepSibling( $cur );
		if ( $following ) {
			return $this->leftMostMisnestedDescendent( $following, $match );
		}
	}

	// Reached the top without finding any following sibling.
	return null;
}
166 | |
167 | /** |
168 | * Given a tplInfo object, determine whether we are: |
169 | * - Not processing template content (could be extension or top level page) |
170 | * - Processing encapsulated content that is produced by a single template. |
171 | * If so, return the name of that template. |
172 | * - Processing encapsulated content that comes from multiple templates. |
173 | * If so, return a flag indicating this. |
174 | * |
175 | * FIXME: We might potentially be computing this information redundantly |
176 | * for every lint we find within this template's content. It could probably |
177 | * be cached in tplInfo after it is computed once. |
178 | */ |
public static function findEnclosingTemplateName( Env $env, ?stdClass $tplInfo ): ?array {
	// No template info, or the first node isn't a transclusion:
	// this isn't template content.
	if ( !$tplInfo || !DOMUtils::hasTypeOf( $tplInfo->first, 'mw:Transclusion' ) ) {
		return null;
	}

	$dataMw = DOMDataUtils::getDataMw( $tplInfo->first );

	// This count check is conservative in that link suffixes and prefixes
	// could artificially add an extra element to the parts array but we
	// don't have a good way of distinguishing that right now. It will require
	// a non-string representation for them and a change in spec along with
	// a version bump and all that song and dance. If linting accuracy in these
	// scenarios become a problem, we can revisit this.
	$hasSinglePart = !empty( $dataMw->parts ) && count( $dataMw->parts ) === 1;
	if ( !$hasSinglePart ) {
		return [ 'multiPartTemplateBlock' => true ];
	}

	$onlyPart = $dataMw->parts[0];
	if ( !( $onlyPart instanceof TemplateInfo ) ) {
		throw new UnreachableException(
			"a single part will always be a TemplateInfo not a string"
		);
	}

	// An href may be absent (could be "function"); in that case fall back
	// to the target wikitext (type === 'templatearg' or 'template').
	// PORT-FIXME: Should that be SiteConfig::relativeLinkPrefix() rather than './'?
	$templateName = !empty( $onlyPart->href )
		? PHPUtils::stripPrefix( $onlyPart->href, './' )
		: trim( $onlyPart->targetWt );

	return [ 'name' => $templateName ];
}
217 | |
218 | /** |
219 | * Compute the DSR information for the lint object. |
220 | * - In the common case, this is simply the DSR value of the node |
221 | * that generated the lint. But, occasionally, for some lints, |
222 | * we might have to post-process the node's DSR. |
223 | * - If the lint is found in template content, then the DSR spans |
224 | * the transclusion markup in the toplevel page source. |
225 | */ |
public static function findLintDSR(
	?array $tplLintInfo, ?stdClass $tplInfo, ?DomSourceRange $nodeDSR,
	?callable $updateNodeDSR = null
): ?DomSourceRange {
	// Use the DSR of the encapsulation's first node when the lint comes from
	// template content, or when we are inside an encapsulation and the node's
	// own DSR isn't valid.
	$useEncapsulationDSR = $tplLintInfo !== null ||
		( $tplInfo && !Utils::isValidDSR( $nodeDSR ) );
	if ( $useEncapsulationDSR ) {
		return DOMDataUtils::getDataParsoid( $tplInfo->first )->dsr ?? null;
	}

	// Otherwise use the node's DSR, post-processed if the caller asked for it.
	if ( $updateNodeDSR ) {
		return $updateNodeDSR( $nodeDSR );
	}

	return $nodeDSR;
}
236 | |
237 | /** |
238 | * Determine if a node has an identical nested tag (?) |
239 | */ |
private function hasIdenticalNestedTag( Element $node, string $name ): bool {
	// NOTE(review): only the *first* element child at each level is ever
	// inspected; later element siblings are not visited. This mirrors the
	// existing behavior -- confirm whether that is intentional.
	$scope = $node;
	while ( true ) {
		// Skip over non-element children to find the first element child.
		$firstElt = $scope->firstChild;
		while ( $firstElt && !( $firstElt instanceof Element ) ) {
			$firstElt = $firstElt->nextSibling;
		}

		if ( !$firstElt ) {
			return false;
		}

		if (
			DOMCompat::nodeName( $firstElt ) === $name &&
			empty( DOMDataUtils::getDataParsoid( $firstElt )->autoInsertedEnd )
		) {
			return true;
		}

		// Descend into that element and repeat.
		$scope = $firstElt;
	}
}
259 | |
260 | /** |
261 | * Determine if a node has misnestable content |
262 | */ |
private function hasMisnestableContent( Node $node, string $name ): bool {
	// Climb up until we find a node that has a following non-separator sibling.
	$cur = $node;
	while ( true ) {
		// For A, TD, TH, H* tags, Tidy doesn't seem to propagate
		// the unclosed tag outside these tags.
		// No need to check for tr/table since content cannot show up there
		if (
			DOMUtils::atTheTop( $cur ) ||
			preg_match( '/^(?:a|td|th|h\d)$/D', DOMCompat::nodeName( $cur ) )
		) {
			return false;
		}

		$next = DiffDOMUtils::nextNonSepSibling( $cur );
		if ( $next ) {
			break;
		}
		$cur = $cur->parentNode;
	}

	// For a non-literal <p>, look at its first non-separator child instead
	// of the <p> itself.
	$isWikitextParagraph =
		DOMCompat::nodeName( $next ) === 'p' && !WTUtils::isLiteralHTMLNode( $next );
	$contentNode = $isWikitextParagraph ? DiffDOMUtils::firstNonSepChild( $next ) : $next;
	if ( !$contentNode ) {
		return false;
	}

	// If the first "content" node we find is a matching
	// stripped tag, we have nothing that can get misnested
	$isMatchingStrippedTag = $contentNode instanceof Element &&
		DOMUtils::isMarkerMeta( $contentNode, 'mw:Placeholder/StrippedTag' ) &&
		isset( DOMDataUtils::getDataParsoid( $contentNode )->name ) &&
		DOMDataUtils::getDataParsoid( $contentNode )->name === $name;

	return !$isMatchingStrippedTag;
}
292 | |
293 | /** |
294 | * Indicate whether an end tag is optional for this node |
295 | * |
296 | * See https://www.w3.org/TR/html5/syntax.html#optional-tags |
297 | * |
298 | * End tags for tr/td/th/li are entirely optional since they |
299 | * require a parent container and can only be followed by like |
300 | * kind. |
301 | * |
302 | * Caveat: <li>foo</li><ol>..</ol> and <li>foo<ol>..</ol> |
303 | * generate different DOM trees, so explicit </li> tag |
304 | * is required to specify which of the two was intended. |
305 | * |
306 | * With that one caveat around nesting, the parse with/without |
307 | * the end tag is identical. For now, ignoring that caveat |
308 | * since they aren't likely to show up in our corpus much.
309 | * |
310 | * For the other tags in that w3c spec section, I haven't reasoned |
311 | * through when exactly they are optional. Not handling that complexity |
312 | * for now since those are likely uncommon use cases in our corpus. |
313 | */ |
private function endTagOptional( Node $node ): bool {
	// Tags whose end tag this linter treats as optional.
	static $optionalEndTags = [
		'tr' => true,
		'td' => true,
		'th' => true,
		'li' => true,
	];

	return isset( $optionalEndTags[DOMCompat::nodeName( $node )] );
}
318 | |
319 | /** |
320 | * Find the nearest ancestor heading tag |
321 | */ |
private function getHeadingAncestor( Node $node ): ?Node {
	// The search starts at $node itself and then moves up the parent chain.
	for ( $cur = $node; $cur; $cur = $cur->parentNode ) {
		if ( DOMUtils::isHeading( $cur ) ) {
			return $cur;
		}
	}

	return null;
}
328 | |
329 | /** |
330 | * For formatting tags, Tidy seems to be doing this "smart" fixup of |
331 | * unclosed tags by looking for matching unclosed pairs of identical tags |
332 | * and if the content ends in non-whitespace text, it treats the second |
333 | * unclosed opening tag as a closing tag. But, a HTML5 parser won't do this. |
334 | * So, detect this pattern and flag for linter fixup. |
335 | */ |
private function matchedOpenTagPairExists( Node $c, DataParsoid $dp ): bool {
	// The last child must be an element with the same tag name as $c ...
	$last = $c->lastChild;
	if (
		!( $last instanceof Element ) ||
		DOMCompat::nodeName( $last ) !== DOMCompat::nodeName( $c )
	) {
		return false;
	}

	// ... that is itself unclosed and uses the same syntax as $c.
	$lastDp = DOMDataUtils::getDataParsoid( $last );
	if ( empty( $lastDp->autoInsertedEnd ) || ( $lastDp->stx ?? null ) !== ( $dp->stx ?? null ) ) {
		return false;
	}

	// Finally, the node right before it must be text that doesn't end in whitespace.
	// PORT-FIXME: Do we care about non-ASCII whitespace here?
	$preceding = $last->previousSibling;
	return $preceding instanceof Text && !preg_match( '/\s$/D', $preceding->nodeValue );
}
355 | |
356 | /** |
357 | * Lint Treebuilder fixups marked by ProcessTreeBuilderFixups |
358 | * |
359 | * It handles the following scenarios: |
360 | * |
361 | * 1. Unclosed end tags (`missing-end-tag`, `missing-end-tag-in-heading`) |
362 | * 2. Invalid self-closed tags (`self-closed-tag`) |
363 | * 3. Stripped tags (`stripped-tag`) |
364 | * |
365 | * In addition, we have specialized categories for some patterns |
366 | * where we encounter unclosed end tags. |
367 | * |
368 | * 4. misnested-tag |
369 | * 5. html5-misnesting |
370 | * 6. multiple-unclosed-formatting-tags |
371 | * 7. unclosed-quotes-in-heading |
372 | */ |
private function lintTreeBuilderFixup(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// This might have been processed as part of
	// misnested-tag category identification.
	if ( $dp->getTempFlag( TempData::LINTED ) ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	// During DSR computation, stripped meta tags
	// surrender their width to its previous sibling.
	// We record the original DSR in the tmp attribute
	// for that reason.
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->tmp->origDSR ?? $dp->dsr ?? null );
	if ( DOMUtils::isMarkerMeta( $c, 'mw:Placeholder/StrippedTag' ) ) {
		$env->recordLint( 'stripped-tag', [
			'dsr' => $dsr,
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $dp->name ?? null ],
		] );
	}

	// Dont bother linting for auto-inserted start/end or self-closing-tag if:
	// 1. c is a void element
	//    Void elements won't have auto-inserted start/end tags
	//    and self-closing versions are valid for them.
	//
	// 2. c is tbody (FIXME: don't remember why we have this exception)
	//
	// 3. c is not an HTML element (unless they are i/b quotes or tables)
	//
	// 4. c doesn't have DSR info and doesn't come from a template either
	$cNodeName = DOMCompat::nodeName( $c );
	$isHtmlElement = WTUtils::hasLiteralHTMLMarker( $dp );
	$isLintable = !Utils::isVoidElement( $cNodeName ) &&
		$cNodeName !== 'tbody' &&
		( $isHtmlElement || DOMUtils::isQuoteElt( $c ) || $cNodeName === 'table' ) &&
		( $tplInfo !== null || $dsr !== null );
	if ( !$isLintable ) {
		return;
	}

	if ( !empty( $dp->selfClose ) && $cNodeName !== 'meta' ) {
		$env->recordLint( 'self-closed-tag', [
			'dsr' => $dsr,
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $cNodeName ],
		] );
		// The other checks won't pass - no need to test them.
		return;
	}

	// Everything below is about auto-inserted end tags. Outside of
	// templates, we additionally require a non-zero open tag width.
	if (
		( $dp->autoInsertedEnd ?? null ) !== true ||
		!( $tplInfo || ( $dsr->openWidth ?? 0 ) > 0 )
	) {
		return;
	}

	$lintObj = [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [ 'name' => $cNodeName ],
	];

	// FIXME: This literal html marker check is strictly not required
	// (a) we've already checked that above and know that isQuoteElt is
	//     not one of our tags.
	// (b) none of the tags in the list have native wikitext syntax =>
	//     they will show up as literal html tags.
	// But, in the interest of long-term maintenance in the face of
	// changes (to wikitext or html specs), let us make it explicit.
	if ( $isHtmlElement &&
		isset( $this->getTagsWithChangedMisnestingBehavior()[$cNodeName] ) &&
		$this->hasMisnestableContent( $c, $cNodeName ) &&
		// Tidy WTF moment here!
		// I don't know why Tidy does something very different
		// when there is an identical nested tag here.
		//
		// <p><span id='1'>a<span>X</span></p><p>b</span></p>
		// vs.
		// <p><span id='1'>a</p><p>b</span></p>  OR
		// <p><span id='1'>a<del>X</del></p><p>b</span></p>
		//
		// For the first snippet, Tidy only wraps "a" with the id='1' span
		// For the second and third snippets, Tidy wraps "b" with the id='1' span as well.
		//
		// For the corresponding wikitext that generates the above token stream,
		// Parsoid (and Remex) won't wrap 'b' with the id=1' span at all.
		!$this->hasIdenticalNestedTag( $c, $cNodeName )
	) {
		$env->recordLint( 'html5-misnesting', $lintObj );
		return;
	}

	if ( !$isHtmlElement && DOMUtils::isQuoteElt( $c ) ) {
		$ancestor = $this->getHeadingAncestor( $c->parentNode );
		if ( $ancestor ) {
			$lintObj['params']['ancestorName'] = DOMCompat::nodeName( $ancestor );
			$env->recordLint( 'unclosed-quotes-in-heading', $lintObj );
			return;
		}
	}

	$adjNode = $this->getMatchingMisnestedNode( $c, $c );
	if ( $adjNode ) {
		// Mark the matching node so that it isn't linted a second time.
		$adjDp = DOMDataUtils::getDataParsoid( $adjNode );
		$adjDp->setTempFlag( TempData::LINTED );
		$env->recordLint( 'misnested-tag', $lintObj );
		return;
	}

	if ( $this->endTagOptional( $c ) || !empty( $dp->autoInsertedStart ) ) {
		return;
	}

	$lintObj['params']['inTable'] = DOMUtils::hasNameOrHasAncestorOfName( $c, 'table' );
	$category = $this->getHeadingAncestor( $c ) ?
		'missing-end-tag-in-heading' : 'missing-end-tag';

	// Skip if covered by deletable-table-tag, i.e. when this auto-closed
	// table is immediately followed by another table. The check has to look
	// at the *following* node: testing $c again here would be redundant with
	// the $cNodeName test and would suppress the lint for any next sibling.
	$next = DiffDOMUtils::nextNonSepSibling( $c );
	$coveredByDeletableTableTag = $cNodeName === 'table' && $next &&
		DOMCompat::nodeName( $next ) === 'table';
	if ( !$coveredByDeletableTableTag ) {
		$env->recordLint( $category, $lintObj );
	}

	if ( isset( Consts::$HTML['FormattingTags'][$cNodeName] ) &&
		$this->matchedOpenTagPairExists( $c, $dp )
	) {
		$env->recordLint( 'multiple-unclosed-formatting-tags', $lintObj );
	}
}
499 | |
500 | /** |
501 | * Lint fostered content marked by MarkFosteredContent. |
502 | * |
503 | * Lint category: `fostered`, `fostered-transparent` |
504 | * |
505 | * This will log cases like: |
506 | * |
507 | * {| |
508 | * foo |
509 | * |- |
510 | * | bar |
511 | * |} |
512 | * |
513 | * Here 'foo' gets fostered out. |
514 | */ |
private function lintFostered(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// Emit "fostered" or "fostered-transparent" depending on if the fostered
	// content is entirely transparent or not.
	//
	// We're trying to find a balance between creating noise for wikignomes
	// and avoiding dirty-diffs from DiscussionTools. DiscussionTools
	// expects to know when pages have fostered content otherwise it can
	// lead to corruption on edit. However, rendering transparent nodes
	// often end up in fosterable positions, like category links from
	// templates or include directives on template pages.
	$foundOpaqueFosteredContent = false;
	$foundTransparentOutsideTemplate = false;

	// The top-level nodes in the foster box are span/p wrapped
	// and so, if we have fostered content, previous siblings to
	// the table are expected to be elements.
	$candidate = $node->previousSibling;
	while (
		$candidate instanceof Element &&
		!empty( DOMDataUtils::getDataParsoid( $candidate )->fostered )
	) {
		// TODO: Section tags are rendering transparent but not sol transparent,
		// and that method only considers WTUtils::isSolTransparentLink, though
		// there is a FIXME to consider all link nodes.
		$isTransparent = WTUtils::isRenderingTransparentNode( $candidate ) ||
			( DOMCompat::nodeName( $candidate ) === 'link' &&
				DOMUtils::hasTypeOf( $candidate, 'mw:Extension/section' ) );
		if ( !$isTransparent ) {
			// A fostered element that isn't rendering-transparent.
			$foundOpaqueFosteredContent = true;
			break;
		}

		// Skip rendering-transparent nodes if they come from a template,
		// since they'll roundtrip cleanly regardless
		if ( !$tplInfo ) {
			$foundTransparentOutsideTemplate = true;
		}

		$candidate = $candidate->previousSibling;
	}

	if ( $foundOpaqueFosteredContent ) {
		$category = 'fostered';
	} elseif ( $foundTransparentOutsideTemplate ) {
		$category = 'fostered-transparent';
	} else {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( $category, [
		'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
	] );
}
575 | |
576 | /** |
577 | * Lint obsolete HTML tags. |
578 | * |
579 | * Lint category: `obsolete-tag`, `tidy-font-bug` |
580 | */ |
private function lintObsoleteTag(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Lazily build the regexp matching obsolete tag names.
	if ( !$this->obsoleteTagsRE ) {
		$alternatives = [];
		foreach ( Consts::$HTML['OlderHTMLTags'] as $tagName => $unused ) {
			// Looks like all existing editors let editors add the <big> tag.
			// VE has a button to add <big>, it seems so does the WikiEditor
			// and JS wikitext editor. So, don't flag BIG as an obsolete tag.
			if ( $tagName === 'big' ) {
				continue;
			}
			$alternatives[] = preg_quote( $tagName, '/' );
		}
		$this->obsoleteTagsRE = '/^(?:' . implode( '|', $alternatives ) . ')$/D';
	}

	$cName = DOMCompat::nodeName( $c );

	// Tags where both the start and the end tag were auto-inserted are not flagged.
	$hasAuthoredTag = empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd );
	if ( $hasAuthoredTag && preg_match( $this->obsoleteTagsRE, $cName ) ) {
		$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
		$env->recordLint( 'obsolete-tag', [
			'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $cName ],
		] );
	}

	if ( $cName !== 'font' || !$c->hasAttribute( 'color' ) ) {
		return;
	}

	/* ----------------------------------------------------------
	 * Tidy migrates <font> into the link in these cases
	 *     <font>[[Foo]]</font>
	 *     <font>[[Foo]]l</font> (link-trail)
	 *     <font><!--boo-->[[Foo]]</font>
	 *     <font>__NOTOC__[[Foo]]</font>
	 *     <font>[[Category:Foo]][[Foo]]</font>
	 *     <font>{{1x|[[Foo]]}}</font>
	 *
	 * Tidy does not migrate <font> into the link in these cases
	 *     <font> [[Foo]]</font>
	 *     <font>[[Foo]] </font>
	 *     <font>[[Foo]]L</font> (not a link-trail)
	 *     <font>[[Foo]][[Bar]]</font>
	 *     <font>[[Foo]][[Bar]]</font>
	 *
	 * <font> is special.
	 * This behavior is not seen with other formatting tags.
	 *
	 * Remex/parsoid won't do any of this.
	 * This difference in behavior only matters when the font tag
	 * specifies a link colour because the link no longer renders
	 * as blue/red but in the font-specified colour.
	 * ---------------------------------------------------------- */
	// An empty <font> is never affected. Otherwise, scan the children and
	// give up at the first child that disqualifies the pattern.
	$tidyFontBug = $c->firstChild !== null;
	$linkCount = 0;
	$child = $c->firstChild;
	while ( $child && $tidyFontBug ) {
		$childName = DOMCompat::nodeName( $child );
		if ( $childName !== 'a' &&
			!WTUtils::isRenderingTransparentNode( $child ) &&
			!WTUtils::isTplMarkerMeta( $child )
		) {
			// Something other than a link / transparent node / template marker.
			$tidyFontBug = false;
		} elseif ( ( $childName === 'a' || $childName === 'figure' ) && ++$linkCount > 1 ) {
			// More than one link-like child.
			$tidyFontBug = false;
		}
		$child = $child->nextSibling;
	}

	if ( $tidyFontBug ) {
		$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
		$env->recordLint( 'tidy-font-bug', [
			'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => 'font' ]
		] );
	}
}
667 | |
668 | /** |
669 | * Log bogus (=unrecognized) media options. |
670 | * |
671 | * See - https://www.mediawiki.org/wiki/Help:Images#Syntax |
672 | * |
673 | * Lint category: `bogus-image-options` |
674 | */ |
private function lintBogusImageOptions(
	Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Despite the lint category name, this checks all media, not just images
	if ( !WTUtils::isGeneratedFigure( $c ) || empty( $dp->optList ) ) {
		return;
	}

	// When the BOGUS_PX flag is set, width options are reported as well.
	$bogusPx = $dp->getTempFlag( TempData::BOGUS_PX );
	$bogusItems = [];
	foreach ( $dp->optList as $option ) {
		$kind = $option['ck'];
		if ( $kind === 'bogus' || ( $bogusPx && $kind === 'width' ) ) {
			$bogusItems[] = $option['ak'];
		}
	}

	if ( !$bogusItems ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'bogus-image-options', [
		'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [ 'items' => $bogusItems ]
	] );
}
700 | |
/**
 * Lint tables Tidy deletes.
 *
 * Lint category: `deletable-table-tag`
 *
 * In the example below, the second table is in a fosterable position
 * (inside a <tr>). The tree builder closes the first table at that point
 * and starts a new table there. We detect this pattern because Tidy
 * does something very different here: it strips the inner table and
 * retains the outer table. So, to preserve rendering of pages that are
 * tailored for Tidy, editors have to fix up this wikitext to strip the
 * inner table (to mimic what Tidy does).
 *
 *   {| style='border:1px solid red;'
 *   |a
 *   |-
 *   {| style='border:1px solid blue;'
 *   |b
 *   |c
 *   |}
 *   |}
 *
 * The check itself: $c is a table whose previous non-separator sibling
 * is a table with an auto-inserted end tag.
 */
private function lintDeletableTableTag(
	Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $c ) !== 'table' ) {
		return;
	}

	$precedingSib = DiffDOMUtils::previousNonSepSibling( $c );
	if ( !( $precedingSib instanceof Element ) ||
		DOMCompat::nodeName( $precedingSib ) !== 'table' ||
		empty( DOMDataUtils::getDataParsoid( $precedingSib )->autoInsertedEnd )
	) {
		return;
	}

	// Narrow the dsr to the span of the opening tag of the table
	// that needs to be deleted.
	$toOpenTagSpan = static function ( ?DomSourceRange $nodeDSR ): ?DomSourceRange {
		if ( $nodeDSR === null ) {
			return null;
		}
		// Work on a copy so that the node's own dsr is left untouched
		$span = clone $nodeDSR;
		if ( !empty( $span->openWidth ) ) {
			$span->end = $span->innerStart();
			$span->openWidth = 0;
			$span->closeWidth = 0;
		}
		return $span;
	};

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null, $toOpenTagSpan );
	$env->recordLint( 'deletable-table-tag', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [ 'name' => 'table' ],
	] );
}
757 | |
/**
 * Return the first child of $node accepted by $filter,
 * or null if no child is accepted.
 */
private function findMatchingChild( Node $node, callable $filter ): ?Node {
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( $filter( $child ) ) {
			return $child;
		}
	}

	return null;
}
769 | |
/**
 * Test if the node has a 'nowrap' CSS rule
 *
 * In the general case, this CSS can come from a class, a <style> tag,
 * a stylesheet, or even from JS code. For now, the inspection is
 * restricted to inline CSS since the intent is to aid editors in
 * fixing patterns that can be automatically detected.
 *
 * Special case for enwiki that has Template:nowrap which
 * assigns class='nowrap' with CSS white-space:nowrap in
 * MediaWiki:Common.css
 */
private function hasNoWrapCSS( Node $node ): bool {
	// Only elements can carry a style or class attribute
	if ( !( $node instanceof Element ) ) {
		return false;
	}

	$inlineStyle = DOMCompat::getAttribute( $node, 'style' ) ?? '';
	return str_contains( $inlineStyle, 'nowrap' ) ||
		DOMUtils::hasClass( $node, 'nowrap' );
}
789 | |
/**
 * Lint bad P wrapping.
 *
 * Flags a non-block node with nowrap CSS that sits directly inside
 * a wikitext block node and has a <p> among its children.
 *
 * Lint category: `pwrap-bug-workaround`
 */
private function lintPWrapBugWorkaround(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMUtils::isWikitextBlockNode( $node ) ) {
		return;
	}
	if ( !DOMUtils::isWikitextBlockNode( $node->parentNode ) ) {
		return;
	}
	if ( !$this->hasNoWrapCSS( $node ) ) {
		return;
	}

	$paragraph = $this->findMatchingChild(
		$node,
		static fn ( $e ) => DOMCompat::nodeName( $e ) === 'p'
	);
	if ( !$paragraph ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'pwrap-bug-workaround', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [
			'root' => DOMCompat::nodeName( $node->parentNode ),
			'child' => DOMCompat::nodeName( $node ),
		]
	] );
}
820 | |
/**
 * Lint Tidy div span flip.
 *
 * Flags a <span> whose first non-separator child is a <div>, as long
 * as at least one of the two carries a class or style attribute.
 *
 * Lint category: `misc-tidy-replacement-issues`
 */
private function lintMiscTidyReplacementIssues(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'span' ) {
		return;
	}

	$firstChild = DiffDOMUtils::firstNonSepChild( $node );
	$isDiv = $firstChild instanceof Element && DOMCompat::nodeName( $firstChild ) === 'div';
	if ( !$isDiv ) {
		return;
	}

	// Without any style/class attributes, this won't affect rendering
	$hasStyling = $node->hasAttribute( 'class' ) || $node->hasAttribute( 'style' ) ||
		$firstChild->hasAttribute( 'class' ) || $firstChild->hasAttribute( 'style' );
	if ( !$hasStyling ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'misc-tidy-replacement-issues', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [ 'subtype' => 'div-span-flip' ]
	] );
}
853 | |
/**
 * Lint tidy whitespace bug.
 *
 * Starting at $node, this collects a run of sibling nodes that is not
 * interrupted by a block node, a line-breaking tag, or whitespace in
 * content that is not under nowrap CSS. If the run contains a nowrap
 * node ending in whitespace and the total run length (in bytes, also
 * counting the unbroken text before the start node) reaches the
 * configured maximum, a lint is recorded for each such node.
 *
 * Lint category: `tidy-whitespace-bug`
 */
private function lintTidyWhitespaceBug(
	Env $env, Node $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// A whole run of nodes is handled in one shot, so nodes that were
	// part of an earlier run don't need to be looked at again.
	if ( $dp->getTempFlag( TempData::PROCESSED_TIDY_WS_BUG ) ) {
		return;
	}

	// Find the longest run of nodes that are affected by white-space:nowrap CSS
	// in a way that leads to unsightly rendering in HTML5 compliant browsers.
	//
	// Check if Tidy does buggy whitespace hoisting there to provide the browser
	// opportunities to split the content in short segments.
	//
	// If so, editors would need to edit this run of nodes to introduce
	// whitespace breaks as necessary so that HTML5 browsers get that
	// same opportunity when Tidy is removed.
	$nowrapNodes = [];
	'@phan-var array<array{node:Node,tidybug:bool,hasLeadingWS:bool}> $nowrapNodes';
	$startNode = $node;
	$haveTidyBug = false;
	$runLength = 0;

	// <br>, <wbr>, <hr> break a line
	$lineBreakingTags = [ 'hr', 'br', 'wbr' ];
	while ( $node && !DOMUtils::isRemexBlockNode( $node ) &&
		!in_array( DOMCompat::nodeName( $node ), $lineBreakingTags, true )
	) {
		if ( $node instanceof Text || !$this->hasNoWrapCSS( $node ) ) {
			// No CSS property that affects whitespace.
			$text = $node->textContent;
			if ( preg_match( '/^(\S*)\s/', $text, $m ) ) { // PORT-FIXME: non-ASCII whitespace?
				// The run ends at the first whitespace in this node
				$runLength += strlen( $m[1] );
				$nowrapNodes[] = [
					'node' => $node,
					'tidybug' => false,
					'hasLeadingWS' => ( preg_match( '/^\s/', $text ) === 1 ), // PORT-FIXME: non-ASCII whitespace?
				];
				break;
			}
			$nowrapNodes[] = [ 'node' => $node, 'tidybug' => false ];
			$runLength += strlen( $text );
		} else {
			// Find last non-comment child of node
			$last = $node->lastChild;
			while ( $last instanceof Comment ) {
				$last = $last->previousSibling;
			}

			// In this scenario, when Tidy hoists the whitespace to
			// after the node, that whitespace is not subject to the
			// nowrap CSS => browsers can break content there.
			//
			// But, non-Tidy libraries won't hoist the whitespace.
			// So, browsers don't have a place to break content.
			$endsInWS = $last instanceof Text &&
				preg_match( '/\s$/D', $last->nodeValue ); // PORT-FIXME: non-ASCII whitespace?
			if ( $endsInWS ) {
				$haveTidyBug = true;
			}

			$nowrapNodes[] = [ 'node' => $node, 'tidybug' => $endsInWS ];
			$runLength += strlen( $node->textContent );
		}

		// Don't cross template boundaries at the top-level:
		// either we are exiting a top-level template, or
		// we are about to enter one.
		if ( $tplInfo ) {
			if ( $tplInfo->last === $node ) {
				break;
			}
		} elseif ( WTUtils::findFirstEncapsulationWrapperNode( $node ) ) {
			break;
		}

		// Move to the next non-comment sibling
		do {
			$node = $node->nextSibling;
		} while ( $node instanceof Comment );
	}

	// Helper: flag every element of the run so it isn't reprocessed
	$markProcessedNodes = static function () use ( $nowrapNodes ) {
		foreach ( $nowrapNodes as $entry ) {
			// Phan fails at applying the instanceof type restriction to the array member when analyzing the
			// following call, but is fine when it's copied to a local variable.
			$runNode = $entry['node'];
			if ( $runNode instanceof Element ) {
				DOMDataUtils::getDataParsoid( $runNode )->setTempFlag( TempData::PROCESSED_TIDY_WS_BUG );
			}
		}
	};

	if ( !$haveTidyBug ) {
		// Mark processed nodes and bail
		$markProcessedNodes();
		return;
	}

	// Find run before startNode that doesn't have a whitespace break
	for (
		$prev = $startNode->previousSibling;
		$prev && !DOMUtils::isRemexBlockNode( $prev );
		$prev = $prev->previousSibling
	) {
		if ( $prev instanceof Comment ) {
			continue;
		}
		$text = $prev->textContent;
		// Find the last \s in the string
		if ( preg_match( '/\s(\S*)$/D', $text, $m ) ) { // PORT-FIXME: non-ASCII whitespace here?
			$runLength += strlen( $m[1] );
			break;
		}
		$runLength += strlen( $text );
	}

	$lintConfig = $env->getLinterConfig();
	$tidyWhitespaceBugMaxLength = $lintConfig['tidyWhitespaceBugMaxLength'] ?? 100;

	if ( $runLength < $tidyWhitespaceBugMaxLength ) {
		// Mark processed nodes and bail
		$markProcessedNodes();
		return;
	}

	// For every node where Tidy hoists whitespace,
	// emit an event to flag a whitespace fixup opportunity.
	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$lastIndex = count( $nowrapNodes ) - 1;
	foreach ( $nowrapNodes as $i => $entry ) {
		// Skip nodes without the bug, the final node of the run, and
		// nodes whose successor in the run starts with whitespace.
		if ( !$entry['tidybug'] || $i >= $lastIndex ||
			!empty( $nowrapNodes[$i + 1]['hasLeadingWS'] )
		) {
			continue;
		}

		$nowrapNode = $entry['node']; // (see above)
		$nodeDSR = null;
		if ( $nowrapNode instanceof Element ) {
			$nodeDSR = DOMDataUtils::getDataParsoid( $nowrapNode )->dsr ?? null;
		}

		$env->recordLint( 'tidy-whitespace-bug', [
			'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $nodeDSR ),
			'templateInfo' => $tplLintInfo,
			'params' => [
				'node' => DOMCompat::nodeName( $entry['node'] ),
				'sibling' => DOMCompat::nodeName( $entry['node']->nextSibling )
			]
		] );
	}

	$markProcessedNodes();
}
1014 | |
/**
 * Detect multiple-unclosed-formatting-tags errors.
 *
 * Since unclosed <small> and <big> tags accumulate their effects
 * in HTML5 parsers (unlike in Tidy where it seems to suppress
 * multiple unclosed elements of the same name), such pages will
 * break pretty spectacularly with Remex.
 *
 * Ex: https://it.wikipedia.org/wiki/Hubert_H._Humphrey_Metrodome?oldid=93017491#Note
 *
 * Scans $lints for `missing-end-tag` entries outside tables. As soon as
 * a second unclosed <small> (or a second unclosed <big>) is seen, a lint
 * is recorded using the params, dsr and templateInfo of the FIRST
 * unclosed tag of that name.
 *
 * Lint category: `multiple-unclosed-formatting-tags`
 *
 * @param array $lints Lint entries; each is read via its 'type', 'params',
 *   'dsr' and 'templateInfo' keys
 * @param Env $env Receives the new lint through recordLint()
 */
private function lintMultipleUnclosedFormattingTags( array $lints, Env $env ): void {
	$firstUnclosedTag = [
		'small' => null,
		'big' => null
	];
	$multiUnclosedTagName = null;
	foreach ( $lints as $item ) {
		if ( $item['type'] !== 'missing-end-tag' ) {
			continue;
		}
		// Unclosed tags in tables don't leak out of the table.
		// empty() keeps a missing 'inTable' key from raising a warning.
		if ( !empty( $item['params']['inTable'] ) ) {
			continue;
		}
		$tagName = $item['params']['name'];
		if ( $tagName !== 'small' && $tagName !== 'big' ) {
			continue;
		}
		if ( !$firstUnclosedTag[$tagName] ) {
			$firstUnclosedTag[$tagName] = $item;
		} else {
			$multiUnclosedTagName = $tagName;
			break;
		}
	}

	if ( !$multiUnclosedTagName ) {
		return;
	}

	$item = $firstUnclosedTag[$multiUnclosedTagName];
	// The dsr is read from its JSON-array form and converted back to an
	// object. Read it null-safely so that the guard and the use agree.
	$dsr = $item['dsr'] ?? null;
	if ( $dsr !== null ) {
		$dsr = DomSourceRange::newFromJsonArray( $dsr );
	}
	$env->recordLint( 'multiple-unclosed-formatting-tags', [
		'params' => $item['params'],
		'dsr' => $dsr,
		'templateInfo' => $item['templateInfo'] ?? null,
	] );
}
1061 | |
/**
 * Run the checks that operate on the collected lints rather than
 * on the DOM. Currently that is only the check for multiple
 * unclosed formatting tags.
 */
private function postProcessLints( array $lints, Env $env ): void {
	// Derives one more lint from the 'missing-end-tag' lints in $lints
	$this->lintMultipleUnclosedFormattingTags( $lints, $env );
}
1068 | |
/**
 * Get wikitext list item ancestor
 *
 * Walks up from $node (inclusive) to the nearest list item. That list
 * item is returned only if it is neither a literal HTML node nor from
 * the content of a `references` extension; otherwise null is returned.
 */
private function getWikitextListItemAncestor( ?Node $node ): ?Node {
	for ( ; $node; $node = $node->parentNode ) {
		if ( DOMUtils::isListItem( $node ) ) {
			break;
		}
	}

	if ( !$node ) {
		// Ran off the top of the tree without meeting a list item
		return null;
	}
	if ( WTUtils::isLiteralHTMLNode( $node ) ) {
		return null;
	}
	if ( WTUtils::fromExtensionContent( $node, 'references' ) ) {
		return null;
	}

	return $node;
}
1085 | |
/**
 * Lint a PHP parser bug.
 *
 * When an HTML table is nested inside a list, if any part of the table
 * is on a new line, the PHP parser misnests the list and the table.
 * Tidy fixes the misnesting one way (puts table inside/outside the list)
 * HTML5 parser fixes it another way (list expands to rest of the page!)
 *
 * Lint category: `multiline-html-table-in-list`
 */
private function lintMultilineHtmlTableInList(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Only literal HTML <table> tags are of interest
	if ( !WTUtils::isLiteralHTMLNode( $node ) ) {
		return;
	}
	if ( DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// ... and only when nested in a wikitext list item
	$li = $this->getWikitextListItemAncestor( $node );
	if ( !$li ) {
		return;
	}

	// ... and only when the table's outer HTML spans multiple lines
	if ( !str_contains( DOMCompat::getOuterHTML( $node ), "\n" ) ) {
		return;
	}

	// We have an HTML table nested inside a list
	// that has a newline break in its outer HTML
	// => we are in trouble with the PHP Parser + Remex combo
	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'multiline-html-table-in-list', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [
			'name' => 'table',
			'ancestorName' => DOMCompat::nodeName( $li ),
		],
	] );
}
1123 | |
/**
 * Log wikilinks or media in external links.
 *
 * HTML tags can be nested, but this is not the case for <a> tags:
 * nested <a> tags are output adjacent to each other.
 * In the example below, [[Google]] is a wikilink that is nested
 * in the outer external link
 * [http://google.com This is [[Google]]'s search page]
 *
 * Linter category: `wikilink-in-extlink`
 */
private function lintWikilinksInExtlink(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'a' ) {
		return;
	}
	if ( !DOMUtils::hasRel( $node, "mw:ExtLink" ) ) {
		return;
	}
	// Images in extlinks will end up with broken up extlinks inside the
	// <figure> DOM. Those have 'misnested' flag set on them. Ignore those.
	if ( !empty( $dp->misnested ) ) {
		return;
	}

	$sibling = $node->nextSibling;
	$lintError = $sibling instanceof Element &&
		!empty( DOMDataUtils::getDataParsoid( $sibling )->misnested ) &&
		// This check may not be necessary but ensures that we are
		// really in a link-in-link misnested scenario.
		DOMUtils::treeHasElement( $sibling, 'a', true );

	// Media as opposed to most instances of img (barring the link= trick), don't result
	// in misnesting according the html5 spec since we're actively suppressing links in
	// their structure. However, since timed media is inherently clickable, being nested
	// in an extlink could surprise a user clicking on it by navigating away from the page.
	if ( !$lintError ) {
		DOMUtils::visitDOM( $node, static function ( $element ) use ( &$lintError ) {
			if ( !( $element instanceof Element ) ) {
				return;
			}
			$tag = DOMCompat::nodeName( $element );
			if ( $tag === 'audio' || $tag === 'video' ) {
				$lintError = true;
			}
		} );
	}

	if ( !$lintError ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'wikilink-in-extlink', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
	] );
}
1176 | |
/**
 * Record a `large-tables` lint for the given table node.
 *
 * @param Env $env
 * @param ?stdClass $tplInfo
 * @param Element $node The table; its data-parsoid dsr locates the lint
 * @param int $numColumns Reported as the `columns` param
 * @param int $columnsMax Reported as the `columnsMax` param
 */
private function recordLargeTablesLint(
	Env $env, ?stdClass $tplInfo, Element $node, int $numColumns, int $columnsMax
): void {
	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$tableDSR = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
	$env->recordLint( 'large-tables', [
		'dsr' => self::findLintDSR( $tplLintInfo, $tplInfo, $tableDSR ),
		'templateInfo' => $tplLintInfo,
		'params' => [
			'name' => 'table',
			'columns' => $numColumns,
			'columnsMax' => $columnsMax,
		],
	] );
}
1194 | |
/**
 * Return $n if it is an element, else the first element among its
 * following siblings, or null if there is none.
 *
 * TODO: In the future, this may merit being moved to DOMUtils
 * along with its "previous" variant.
 */
private function skipNonElementNodes( ?Node $n ): ?Element {
	for ( ; $n; $n = $n->nextSibling ) {
		if ( $n instanceof Element ) {
			return $n;
		}
	}
	return null;
}
1205 | |
/**
 * Lint large tables.
 *
 * Identify articles having overly-large tables
 * to help editors optimize their articles.
 *
 * A table is reported when one of the first `maxTableRowsToCheck`
 * element children of its <tbody> contains more than
 * `maxTableColumnHeuristic` <th> elements or more than that many
 * <td> elements.
 *
 * Linter category: `large-tables`
 */
private function lintLargeTables(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// Skip tables that have nested tables in them as they are likely
	// to be used for layout and not for data representation.
	// We may check nested tables in the next iteration of this lint.
	if ( $node->getElementsByTagName( 'table' )->length > 0 ) {
		return;
	}

	$lintConfig = $env->getLinterConfig();
	$maxColumns = $lintConfig['maxTableColumnHeuristic'] ?? 5;
	$maxRowsToCheck = $lintConfig['maxTableRowsToCheck'] ?? 10;

	$tbody = DOMCompat::querySelector( $node, 'tbody' );
	if ( !$tbody ) {
		// empty table
		return;
	}

	$row = $this->skipNonElementNodes( $tbody->firstChild );
	for ( $rowsSeen = 0; $row && $rowsSeen < $maxRowsToCheck; $rowsSeen++ ) {
		// Header cells are counted first, then data cells;
		// the first count over the limit is the one reported.
		foreach ( [ 'th', 'td' ] as $cellTag ) {
			$numCells = $row->getElementsByTagName( $cellTag )->length;
			if ( $numCells > $maxColumns ) {
				$this->recordLargeTablesLint( $env, $tplInfo, $node, $numCells, $maxColumns );
				return;
			}
		}

		$row = $this->skipNonElementNodes( $row->nextSibling );
	}
}
1257 | |
/**
 * Log inline background color style rules without a color style rule.
 *
 * Looks only at the element's inline style attribute: a lint is
 * recorded when it sets `background` or `background-color` without
 * also setting `color`. This helps editors make their articles
 * comply with WCAG color contrast rules.
 *
 * Linter category: `night-mode-unaware-background-color`
 */
private function lintNightModeUnawareBackgroundColor(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	$inlineStyle = DOMCompat::getAttribute( $node, 'style' );
	if ( $inlineStyle === null ) {
		// No inline style at all
		return;
	}

	// Background color has to be set ...
	if ( !preg_match( '/(^|;)\s*background(-color)?\s*:/i', $inlineStyle ) ) {
		return;
	}
	// ... while the font color is not
	if ( preg_match( '/(^|;)\s*color\s*:/i', $inlineStyle ) ) {
		return;
	}

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'night-mode-unaware-background-color', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
	] );
}
1288 | |
/**
 * Lint for missing image alt text
 *
 * Applies to generated figures whose first grandchild is an <img>
 * without an `alt` attribute, unless the image or one of its ancestors
 * is hidden via aria-hidden=true or a presentation/none role.
 *
 * Linter category: `missing-image-alt-text`
 */
private function lintMissingAltText(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( !WTUtils::isGeneratedFigure( $c ) ) {
		return;
	}

	// Extract the media element in its standard place
	$media = $c->firstChild->firstChild ?? null;
	$isImage = $media instanceof Element && DOMCompat::nodeName( $media ) === 'img';
	if ( !$isImage ) {
		// Videos and such are handled differently; check only
		// simple image output for alt text.
		return;
	}

	// An alt attribute is present and accounted for, either via
	// explicit markup or filling in from an inline caption or other
	// future source.
	//
	// Note that an explicit empty alt text will be counted
	// as present, as this may be done deliberately for
	// spacer images or similar.
	if ( $media->hasAttribute( 'alt' ) ) {
		return;
	}

	// Follow the parent tree looking for aria-hidden=true or equivalent roles
	$ancestor = $media;
	while ( $ancestor->parentNode ) {
		$hidden = strtolower( DOMCompat::getAttribute( $ancestor, 'aria-hidden' ) ?? '' );
		$role = strtolower( DOMCompat::getAttribute( $ancestor, 'role' ) ?? '' );
		if ( $hidden === 'true' || in_array( $role, [ 'presentation', 'none' ], true ) ) {
			// This entire subtree is excluded from the accessibility tree.
			return;
		}
		$ancestor = $ancestor->parentNode;
	}

	// Report the file by the base name of the (URL-decoded) resource
	$resource = DOMCompat::getAttribute( $media, 'resource' ) ?? '';
	$file = basename( urldecode( $resource ) );

	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$dsr = self::findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'missing-image-alt-text', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [
			'file' => $file,
		]
	] );
}
1345 | |
/**
 * Lint duplicate ids in the page
 *
 * The first element seen with a given non-empty id is remembered in
 * $this->seenIds; every later element with the same id is reported.
 *
 * Linter category: `duplicate-ids`
 */
private function lintDuplicateIds(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	$id = DOMCompat::getAttribute( $node, 'id' );
	if ( $id === null || $id === '' ) {
		// A missing or empty id cannot clash with anything
		return;
	}
	if ( !isset( $this->seenIds[$id] ) ) {
		// First occurrence: remember it, but don't report it
		$this->seenIds[$id] = 1;
		return;
	}
	$tplLintInfo = self::findEnclosingTemplateName( $env, $tplInfo );
	$lintObj = [
		'dsr' => self::findLintDSR(
			$tplLintInfo, $tplInfo, $dp->dsr ?? null
		),
		'templateInfo' => $tplLintInfo,
		'params' => [ 'id' => $id ],
	];
	$env->recordLint( 'duplicate-ids', $lintObj );
}
1372 | |
/**
 * Log wikitext fixups
 *
 * Runs every per-node lint check on $node, in a fixed order,
 * handing each of them the node's data-parsoid.
 */
private function logWikitextFixups(
	Element $node, Env $env, ?stdClass $tplInfo
): void {
	// Fetched once and shared by all the checks below
	$dataParsoid = DOMDataUtils::getDataParsoid( $node );

	$this->lintTreeBuilderFixup( $env, $node, $dataParsoid, $tplInfo );
	// For T161341
	$this->lintDeletableTableTag( $env, $node, $dataParsoid, $tplInfo );
	// For T161306
	$this->lintPWrapBugWorkaround( $env, $node, $dataParsoid, $tplInfo );
	$this->lintObsoleteTag( $env, $node, $dataParsoid, $tplInfo );
	$this->lintBogusImageOptions( $env, $node, $dataParsoid, $tplInfo );
	$this->lintTidyWhitespaceBug( $env, $node, $dataParsoid, $tplInfo );
	$this->lintMiscTidyReplacementIssues( $env, $node, $dataParsoid, $tplInfo );
	$this->lintMultilineHtmlTableInList( $env, $node, $dataParsoid, $tplInfo );
	$this->lintWikilinksInExtlink( $env, $node, $dataParsoid, $tplInfo );
	$this->lintLargeTables( $env, $node, $dataParsoid, $tplInfo );
	$this->lintNightModeUnawareBackgroundColor( $env, $node, $dataParsoid, $tplInfo );
	$this->lintFostered( $env, $node, $dataParsoid, $tplInfo );
	$this->lintMissingAltText( $env, $node, $dataParsoid, $tplInfo );
	$this->lintDuplicateIds( $env, $node, $dataParsoid, $tplInfo );
}
1395 | |
/**
 * Walk the DOM and compute lints for the entire tree.
 * - When we enter encapsulated content (templates or extensions),
 *   compute "tplInfo" (misnamed given that it can be an extension)
 *   so that lints from the templates' content can be mapped back
 *   to the transclusion that generated them.
 * - When we process extensions, if we have a lint handler for the
 *   extension, let the extension's lint handler compute lints.
 *
 * Only element children of $root are visited; other nodes are skipped.
 */
private function findLints(
	Node $root, Env $env, ?stdClass $tplInfo = null
): void {
	for ( $node = $root->firstChild; $node !== null; $node = $node->nextSibling ) {
		if ( !( $node instanceof Element ) ) {
			continue;
		}

		// !tplInfo check is to protect against templated content in
		// extensions which might in turn be nested in templated content.
		if ( !$tplInfo && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
			$about = DOMCompat::getAttribute( $node, 'about' );
			$aboutSibs = WTUtils::getAboutSiblings( $node, $about );
			$tplInfo = (object)[
				'first' => $node,
				'last' => end( $aboutSibs ),
				'dsr' => DOMDataUtils::getDataParsoid( $node )->dsr ?? null,
				// FIXME: This is not being used. Instead the code is recomputing
				// this info in findEnclosingTemplateName.
				'isTemplated' => DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ),
			];
		}

		$handled = false;

		// Let native extensions lint their content
		$nativeExt = WTUtils::getNativeExt( $env, $node );
		if ( $nativeExt ) {
			if ( !$this->extApi ) {
				// Created on first use and reused afterwards
				$this->extApi = new ParsoidExtensionAPI( $env );
			}
			// The callback lets the extension recurse into its own
			// content; template info is only passed along for
			// templated content.
			$lintExtContent = function ( $extRootNode ) use ( $env, $tplInfo ) {
				$this->findLints(
					$extRootNode, $env,
					empty( $tplInfo->isTemplated ) ? null : $tplInfo
				);
			};
			$handled = $nativeExt->lintHandler( $this->extApi, $node, $lintExtContent );
			// NOTE: See the note in WrapSectionsState::shouldOmitFromTOC()
			// but we've assumed extension content is contained in a single
			// wrapper node and it's safe to move to $node->nextSibling.
		}

		// Default node handler
		if ( $handled === false ) {
			// Lint this node, then its subtree
			$this->logWikitextFixups( $node, $env, $tplInfo );
			$this->findLints( $node, $env, $tplInfo );
		}

		// Leaving the encapsulated content: forget its info
		if ( $tplInfo && $tplInfo->last === $node ) {
			$tplInfo = null;
		}
	}
}
1468 | |
/**
 * This is only invoked on the top-level document
 *
 * Does nothing unless linting is enabled in $env. Otherwise lints the
 * tree under $root, post-processes the collected lints, and reports
 * the time spent.
 *
 * @inheritDoc
 */
public function run(
	Env $env, Node $root, array $options = [], bool $atTopLevel = false
): void {
	if ( !$env->linting() ) {
		return;
	}

	// Track time spent linting so we can evaluate benefits
	// of migrating this code off the critical path to its own
	// post processor.
	$timer = Timing::start( $env->getSiteConfig() );

	$this->findLints( $root, $env );
	$this->postProcessLints( $env->getLints(), $env );

	$timer->end( "linting", "linting", [] );
}
1491 | |
1492 | } |