Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
98.37% |
542 / 551 |
|
87.50% |
28 / 32 |
CRAP | |
0.00% |
0 / 1 |
Linter | |
98.37% |
542 / 551 |
|
87.50% |
28 / 32 |
238 | |
0.00% |
0 / 1 |
getTagsWithChangedMisnestingBehavior | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
7 | |||
leftMostMisnestedDescendent | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
8 | |||
getMatchingMisnestedNode | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
findEnclosingTemplateName | |
80.00% |
12 / 15 |
|
0.00% |
0 / 1 |
7.39 | |||
findLintDSR | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
5 | |||
hasIdenticalNestedTag | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
5 | |||
hasMisnestableContent | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
10 | |||
endTagOptional | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getHeadingAncestor | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
matchedOpenTagPairExists | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
7.05 | |||
lintTreeBuilderFixup | |
100.00% |
59 / 59 |
|
100.00% |
1 / 1 |
31 | |||
lintFostered | |
100.00% |
23 / 23 |
|
100.00% |
1 / 1 |
8 | |||
lintObsoleteTag | |
100.00% |
38 / 38 |
|
100.00% |
1 / 1 |
17 | |||
lintBogusImageOptions | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
8 | |||
lintDeletableTableTag | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
7 | |||
findMatchingChild | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
hasNoWrapCSS | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
lintPWrapBugWorkaround | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
5 | |||
lintMiscTidyReplacementIssues | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
8 | |||
lintTidyWhitespaceBug | |
98.78% |
81 / 82 |
|
0.00% |
0 / 1 |
29 | |||
lintMultipleUnclosedFormattingTags | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
9 | |||
postProcessLints | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getWikitextListItemAncestor | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
6 | |||
lintMultilineHtmlTableInList | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
5 | |||
lintWikilinksInExtlink | |
100.00% |
23 / 23 |
|
100.00% |
1 / 1 |
11 | |||
recordLargeTablesLint | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
skipNonElementNodes | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
lintLargeTables | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
8 | |||
lintNightModeUnawareBackgroundColor | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
4 | |||
logWikitextFixups | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
findLints | |
88.24% |
30 / 34 |
|
0.00% |
0 / 1 |
11.20 | |||
run | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | declare( strict_types = 1 ); |
4 | |
5 | namespace Wikimedia\Parsoid\Wt2Html\PP\Processors; |
6 | |
7 | use stdClass; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\Core\DomSourceRange; |
10 | use Wikimedia\Parsoid\DOM\Comment; |
11 | use Wikimedia\Parsoid\DOM\Element; |
12 | use Wikimedia\Parsoid\DOM\Node; |
13 | use Wikimedia\Parsoid\DOM\Text; |
14 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
15 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
16 | use Wikimedia\Parsoid\NodeData\TempData; |
17 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
18 | use Wikimedia\Parsoid\Utils\DOMCompat; |
19 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
20 | use Wikimedia\Parsoid\Utils\DOMUtils; |
21 | use Wikimedia\Parsoid\Utils\PHPUtils; |
22 | use Wikimedia\Parsoid\Utils\Timing; |
23 | use Wikimedia\Parsoid\Utils\Utils; |
24 | use Wikimedia\Parsoid\Utils\WTUtils; |
25 | use Wikimedia\Parsoid\Wikitext\Consts; |
26 | use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; |
27 | |
28 | /** |
29 | * DOM pass that walks the DOM tree, detects specific wikitext patterns, |
30 | * and emits them as linter events. |
31 | */ |
32 | class Linter implements Wt2HtmlDOMProcessor { |
33 | /** @var ParsoidExtensionAPI */ |
34 | private $extApi = null; |
35 | |
36 | /** @var array<string,bool>|null */ |
37 | private $tagsWithChangedMisnestingBehavior = null; |
38 | |
39 | /** @var string|null */ |
40 | private $obsoleteTagsRE = null; |
41 | |
/**
 * Return the set of HTML5 tags that behave differently from HTML4 (Tidy)
 * in some misnesting scenarios around wikitext paragraphs.
 *
 * Ex: Input:         <p><small>a</p><p>b</small></p>
 *     Tidy output:   <p><small>a</small></p><p><small>b</small></p>
 *     HTML5 output:  <p><small>a</small></p><p><small>b</small></p>
 *
 * Both agree here. But the output changes when <span> is used instead:
 *
 * Ex: Input:         <p><span>a</p><p>b</span></p>
 *     Tidy output:   <p><span>a</span></p><p><span>b</span></p>
 *     HTML5 output:  <p><span>a</span></p><p>b</p>
 *
 * The source wikitext is "<span>a\n\nb</span>". The difference persists even
 * when you have "<span>a\n\n<div>b</div>" or "<span>a\n\n{|\n|x\n|}\nbar".
 *
 * This is because Tidy seems to be doing the equivalent of the HTML5
 * tree builder's active formatting element reconstruction step on all
 * *inline* elements, whereas HTML5 parsers only do that on formatting
 * elements. So, we need to compute which HTML5 tags are subject to this
 * differential behavior.
 *
 * Starting from the list of all HTML5 tags, the following are excluded:
 * - Tags our sanitizer doesn't allow: they will be escaped => ignore them
 * - HTML4 block tags
 * - Void tags: they cannot wrap anything
 * - Active formatting elements: the HTML5 tree building algorithm
 *   reconstructs them to wrap all originally intended content
 *   (ex: <small> above)
 *
 * Here is the list of 22 HTML5 tags that are affected:
 *    ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, KBD, MARK,
 *    Q, RB, RP, RT, RTC, RUBY, SAMP, SPAN, SUB, SUP, TIME, VAR
 *
 * https://phabricator.wikimedia.org/T176363#3628173 verifies that this list of
 * tags all demonstrate this behavior.
 *
 * The result is computed on first use and cached on the instance.
 *
 * @return array
 * @phan-return array<string,bool>
 */
private function getTagsWithChangedMisnestingBehavior(): array {
	if ( $this->tagsWithChangedMisnestingBehavior !== null ) {
		return $this->tagsWithChangedMisnestingBehavior;
	}

	// This set is frozen in time. It gets us down to the requisite
	// 22 HTML5 tags above, but shouldn't be used for anything other
	// than that.
	$tidyBlockTags = PHPUtils::makeSet( [
		'div', 'p',
		// tables
		'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td',
		// lists
		'ul', 'ol', 'li', 'dl', 'dt', 'dd',
		// HTML5 heading content
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
		// HTML5 sectioning content
		'article', 'aside', 'nav', 'section', 'footer', 'header',
		'figure', 'figcaption', 'fieldset', 'details', 'blockquote',
		// other
		'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed',
		'map', 'object', 'pre', 'progress', 'video',
	] );

	$affected = [];
	foreach ( Consts::$HTML['HTML5Tags'] as $tag => $unused ) {
		$excluded = !isset( Consts::$Sanitizer['AllowedLiteralTags'][$tag] ) ||
			isset( $tidyBlockTags[$tag] ) ||
			isset( Consts::$HTML['FormattingTags'][$tag] ) ||
			isset( Consts::$HTML['VoidTags'][$tag] );
		if ( !$excluded ) {
			$affected[$tag] = true;
		}
	}
	$this->tagsWithChangedMisnestingBehavior = $affected;

	return $affected;
}
117 | |
/**
 * Finds a node matching $match at the "start" of $node, i.e. along the
 * chain $node, $node->firstChild, $node->firstChild->firstChild, ...
 *
 * The walk stops at the first non-element (returning null) or at the first
 * stripped-tag marker meta (which matches only if its recorded name equals
 * $match's node name). An element with the same name and `stx` as $match
 * that has `autoInsertedStart` set is the match; if it also has
 * `autoInsertedEnd` set, the search continues from it via
 * getMatchingMisnestedNode().
 */
private function leftMostMisnestedDescendent( ?Node $node, Element $match ): ?Element {
	$matchName = DOMCompat::nodeName( $match );

	for ( $cur = $node; $cur instanceof Element; $cur = $cur->firstChild ) {
		if ( DOMUtils::isMarkerMeta( $cur, 'mw:Placeholder/StrippedTag' ) ) {
			$strippedName = DOMDataUtils::getDataParsoid( $cur )->name ?? null;
			return $strippedName === $matchName ? $cur : null;
		}

		if ( DOMCompat::nodeName( $cur ) !== $matchName ) {
			continue;
		}

		$curDp = DOMDataUtils::getDataParsoid( $cur );
		$matchStx = DOMDataUtils::getDataParsoid( $match )->stx ?? null;
		if ( $matchStx !== ( $curDp->stx ?? null ) || empty( $curDp->autoInsertedStart ) ) {
			continue;
		}

		// Auto-inserted at both ends: this is an intermediate fragment of
		// the misnested tag, so keep following the chain from here.
		return empty( $curDp->autoInsertedEnd )
			? $cur
			: $this->getMatchingMisnestedNode( $cur, $match );
	}

	return null;
}
146 | |
/**
 * $node has an 'autoInsertedEnd' flag set on it. We are looking for
 * its matching node that has an 'autoInsertedStart' flag set on it.
 * This happens when the tree-builder fixes up misnested tags.
 * This "adjacency" is wrt the HTML string. In a DOM, this can either
 * be the next sibling OR it might be the left-most descendant of
 * $node's parent's sibling (and so on up the ancestor chain).
 *
 * Returns null if the top of the tree is reached without finding a
 * following non-separator sibling.
 */
private function getMatchingMisnestedNode( Node $node, Element $match ): ?Element {
	for ( $cur = $node; !DOMUtils::atTheTop( $cur ); $cur = $cur->parentNode ) {
		$sibling = DiffDOMUtils::nextNonSepSibling( $cur );
		if ( $sibling ) {
			// Only the first ancestor level that has a following sibling
			// is inspected; the result of that inspection is final.
			return $this->leftMostMisnestedDescendent( $sibling, $match );
		}
	}

	return null;
}
166 | |
/**
 * Given a tplInfo object, determine whether we are:
 * - Not processing template content (could be extension or top level page):
 *   returns null.
 * - Processing encapsulated content that is produced by a single template:
 *   returns [ 'name' => <template name> ].
 * - Processing encapsulated content that comes from multiple templates:
 *   returns [ 'multiPartTemplateBlock' => true ].
 *
 * $env is not used by this method.
 *
 * FIXME: We might potentially be computing this information redundantly
 * for every lint we find within this template's content. It could probably
 * be cached in tplInfo after it is computed once.
 */
private function findEnclosingTemplateName( Env $env, ?stdClass $tplInfo ): ?array {
	if ( !$tplInfo || !DOMUtils::hasTypeOf( $tplInfo->first, 'mw:Transclusion' ) ) {
		return null;
	}

	$dmw = DOMDataUtils::getDataMw( $tplInfo->first );
	// This count check is conservative in that link suffixes and prefixes
	// could artificially add an extra element to the parts array but we
	// don't have a good way of distinguishing that right now. It will require
	// a non-string representation for them and a change in spec along with
	// a version bump and all that song and dance. If linting accuracy in these
	// scenarios becomes a problem, we can revisit this.
	if ( empty( $dmw->parts ) || count( $dmw->parts ) !== 1 ) {
		return [ 'multiPartTemplateBlock' => true ];
	}

	$p0 = $dmw->parts[0];
	// If just a single part (guaranteed with count above), it will be stdclass
	'@phan-var \stdClass $p0';

	if ( !empty( $p0->template->target->href ) ) { // Could be "function"
		// PORT-FIXME: Should that be SiteConfig::relativeLinkPrefix() rather than './'?
		$name = PHPUtils::stripPrefix( $p0->template->target->href, './' );
	} elseif ( !empty( $p0->template ) ) {
		$name = trim( $p0->template->target->wt );
	} else {
		$name = trim( $p0->templatearg->target->wt );
	}

	return [ 'name' => $name ];
}
212 | |
/**
 * Compute the DSR information for the lint object.
 * - In the common case, this is simply the DSR value of the node
 *   that generated the lint, optionally post-processed by
 *   $updateNodeDSR.
 * - If the lint is found in template content (or the node sits inside
 *   a template and has no valid DSR of its own), the DSR of the
 *   template's first node is used instead, i.e. the span of the
 *   transclusion markup in the toplevel page source.
 */
private function findLintDSR(
	?array $tplLintInfo, ?stdClass $tplInfo, ?DomSourceRange $nodeDSR,
	?callable $updateNodeDSR = null
): ?DomSourceRange {
	$useTransclusionDSR = $tplLintInfo !== null ||
		( $tplInfo && !Utils::isValidDSR( $nodeDSR ) );

	if ( $useTransclusionDSR ) {
		return DOMDataUtils::getDataParsoid( $tplInfo->first )->dsr ?? null;
	}

	if ( $updateNodeDSR ) {
		return $updateNodeDSR( $nodeDSR );
	}

	return $nodeDSR;
}
231 | |
/**
 * Determine if a node has an identical nested tag, i.e. a descendant
 * element named $name whose end tag was not auto-inserted.
 *
 * Only the first element child is followed at each level: text and
 * other non-element nodes are skipped, but later element siblings are
 * never examined.
 * NOTE(review): it is unclear whether ignoring later siblings is intended
 * (this mirrors observed Tidy behavior) — confirm before changing.
 */
private function hasIdenticalNestedTag( Element $node, string $name ): bool {
	$parent = $node;

	while ( true ) {
		// Locate the first element child of the current level
		$child = $parent->firstChild;
		while ( $child && !( $child instanceof Element ) ) {
			$child = $child->nextSibling;
		}

		if ( !$child ) {
			return false;
		}

		$isExplicitlyClosedMatch = DOMCompat::nodeName( $child ) === $name &&
			empty( DOMDataUtils::getDataParsoid( $child )->autoInsertedEnd );
		if ( $isExplicitlyClosedMatch ) {
			return true;
		}

		// Descend into that first element child
		$parent = $child;
	}
}
254 | |
/**
 * Determine if a node has misnestable content: content following $node
 * (or following one of its ancestors) that an unclosed $name tag could
 * end up wrapping.
 */
private function hasMisnestableContent( Node $node, string $name ): bool {
	// Walk up until some level has a following non-separator sibling
	$cur = $node;
	while ( true ) {
		// For A, TD, TH, H* tags, Tidy doesn't seem to propagate
		// the unclosed tag outside these tags.
		// No need to check for tr/table since content cannot show up there
		if (
			DOMUtils::atTheTop( $cur ) ||
			preg_match( '/^(?:a|td|th|h\d)$/D', DOMCompat::nodeName( $cur ) )
		) {
			return false;
		}

		$next = DiffDOMUtils::nextNonSepSibling( $cur );
		if ( $next ) {
			break;
		}

		$cur = $cur->parentNode;
	}

	// For a wikitext-generated paragraph, look at its first content child
	// instead of the paragraph itself.
	$isWikitextP = DOMCompat::nodeName( $next ) === 'p' &&
		!WTUtils::isLiteralHTMLNode( $next );
	$contentNode = $isWikitextP ? DiffDOMUtils::firstNonSepChild( $next ) : $next;

	if ( !$contentNode ) {
		return false;
	}

	// If the first "content" node we find is a matching
	// stripped tag, we have nothing that can get misnested
	$isMatchingStrippedTag = $contentNode instanceof Element &&
		DOMUtils::isMarkerMeta( $contentNode, 'mw:Placeholder/StrippedTag' ) &&
		isset( DOMDataUtils::getDataParsoid( $contentNode )->name ) &&
		DOMDataUtils::getDataParsoid( $contentNode )->name === $name;

	return !$isMatchingStrippedTag;
}
287 | |
/**
 * Indicate whether an end tag is optional for this node.
 * True only for tr, td, th and li.
 *
 * See https://www.w3.org/TR/html5/syntax.html#optional-tags
 *
 * End tags for tr/td/th/li are entirely optional since they
 * require a parent container and can only be followed by like
 * kind.
 *
 * Caveat: <li>foo</li><ol>..</ol> and <li>foo<ol>..</ol>
 * generate different DOM trees, so an explicit </li> tag
 * is required to specify which of the two was intended.
 *
 * With that one caveat around nesting, the parse with/without
 * the end tag is identical. For now, that caveat is ignored
 * since such cases aren't likely to show up in our corpus much.
 *
 * For the other tags in that w3c spec section, I haven't reasoned
 * through when exactly they are optional. Not handling that complexity
 * for now since those are likely uncommon use cases in our corpus.
 */
private function endTagOptional( Node $node ): bool {
	static $optionalEndTags = [
		'tr' => true,
		'td' => true,
		'th' => true,
		'li' => true,
	];

	return isset( $optionalEndTags[DOMCompat::nodeName( $node )] );
}
313 | |
/**
 * Find the nearest heading, starting with $node itself and then walking
 * up the parentNode chain. Returns null if none is found.
 */
private function getHeadingAncestor( Node $node ): ?Node {
	for ( $cur = $node; $cur; $cur = $cur->parentNode ) {
		if ( DOMUtils::isHeading( $cur ) ) {
			return $cur;
		}
	}

	return null;
}
323 | |
/**
 * For formatting tags, Tidy seems to be doing this "smart" fixup of
 * unclosed tags by looking for matching unclosed pairs of identical tags
 * and if the content ends in non-whitespace text, it treats the second
 * unclosed opening tag as a closing tag. But, a HTML5 parser won't do this.
 * So, detect this pattern and flag for linter fixup.
 *
 * Concretely, this is true when $c's last child is an element with the
 * same name and `stx` as $c, has an auto-inserted end tag, and is directly
 * preceded by a text node that does not end in whitespace.
 */
private function matchedOpenTagPairExists( Node $c, DataParsoid $dp ): bool {
	$lc = $c->lastChild;

	if ( $lc instanceof Element && DOMCompat::nodeName( $lc ) === DOMCompat::nodeName( $c ) ) {
		$lcDP = DOMDataUtils::getDataParsoid( $lc );

		if ( !empty( $lcDP->autoInsertedEnd ) && ( $lcDP->stx ?? null ) === ( $dp->stx ?? null ) ) {
			$precedingNode = $lc->previousSibling;
			// PORT-FIXME: Do we care about non-ASCII whitespace here?
			return $precedingNode instanceof Text &&
				!preg_match( '/\s$/D', $precedingNode->nodeValue );
		}
	}

	return false;
}
350 | |
/**
 * Lint tree-builder fixups recorded in a node's data-parsoid
 * (the original comment attributes the marking to dom.markTreeBuilderFixup.js).
 *
 * It handles the following scenarios:
 *
 * 1. Missing end tags (`missing-end-tag`, `missing-end-tag-in-heading`)
 * 2. Invalid self-closed tags (`self-closed-tag`)
 * 3. Stripped tags (`stripped-tag`)
 *
 * In addition, we have specialized categories for some patterns
 * where we encounter missing end tags.
 *
 * 4. misnested-tag
 * 5. html5-misnesting
 * 6. multiple-unclosed-formatting-tags
 * 7. unclosed-quotes-in-heading
 *
 * @param Env $env Receives the lints through recordLint()
 * @param Element $c Node being inspected
 * @param DataParsoid $dp data-parsoid of $c
 * @param ?stdClass $tplInfo Info about the enclosing encapsulated content,
 *   or null when $c is not inside any
 */
private function lintTreeBuilderFixup(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// This might have been processed as part of
	// misnested-tag category identification.
	if ( $dp->getTempFlag( TempData::LINTED ) ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	// During DSR computation, stripped meta tags
	// surrender their width to their previous sibling.
	// We record the original DSR in the tmp attribute
	// for that reason, so prefer it when it is present.
	$dsr = $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->tmp->origDSR ?? $dp->dsr ?? null );
	if ( DOMUtils::isMarkerMeta( $c, 'mw:Placeholder/StrippedTag' ) ) {
		$lintObj = [
			'dsr' => $dsr,
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $dp->name ?? null ],
		];
		$env->recordLint( 'stripped-tag', $lintObj );
	}

	// Don't bother linting for auto-inserted start/end or self-closing-tag if:
	// 1. c is a void element
	//    Void elements won't have auto-inserted start/end tags
	//    and self-closing versions are valid for them.
	//
	// 2. c is tbody (FIXME: don't remember why we have this exception)
	//
	// 3. c is not an HTML element (unless they are i/b quotes or tables)
	//
	// 4. c doesn't have DSR info and doesn't come from a template either
	$cNodeName = DOMCompat::nodeName( $c );
	$ancestor = null;
	$isHtmlElement = WTUtils::hasLiteralHTMLMarker( $dp );
	if ( !Utils::isVoidElement( $cNodeName ) &&
		$cNodeName !== 'tbody' &&
		( $isHtmlElement || DOMUtils::isQuoteElt( $c ) || $cNodeName === 'table' ) &&
		( $tplInfo !== null || $dsr !== null )
	) {
		if ( !empty( $dp->selfClose ) && $cNodeName !== 'meta' ) {
			$lintObj = [
				'dsr' => $dsr,
				'templateInfo' => $tplLintInfo,
				'params' => [ 'name' => $cNodeName ],
			];
			$env->recordLint( 'self-closed-tag', $lintObj );
			// The other checks won't pass - no need to test them.
			return;
		}

		if (
			( $dp->autoInsertedEnd ?? null ) === true &&
			( $tplInfo || ( $dsr->openWidth ?? 0 ) > 0 )
		) {
			$lintObj = [
				'dsr' => $dsr,
				'templateInfo' => $tplLintInfo,
				'params' => [ 'name' => $cNodeName ],
			];

			// FIXME: This literal html marker check is strictly not required
			// (a) we've already checked that above and know that isQuoteElt is
			//     not one of our tags.
			// (b) none of the tags in the list have native wikitext syntax =>
			//     they will show up as literal html tags.
			// But, in the interest of long-term maintenance in the face of
			// changes (to wikitext or html specs), let us make it explicit.
			if ( $isHtmlElement &&
				isset( $this->getTagsWithChangedMisnestingBehavior()[$cNodeName] ) &&
				$this->hasMisnestableContent( $c, $cNodeName ) &&
				// Tidy WTF moment here!
				// I don't know why Tidy does something very different
				// when there is an identical nested tag here.
				//
				// <p><span id='1'>a<span>X</span></p><p>b</span></p>
				// vs.
				// <p><span id='1'>a</p><p>b</span></p>  OR
				// <p><span id='1'>a<del>X</del></p><p>b</span></p>
				//
				// For the first snippet, Tidy only wraps "a" with the id='1' span
				// For the second and third snippets, Tidy wraps "b" with the id='1' span as well.
				//
				// For the corresponding wikitext that generates the above token stream,
				// Parsoid (and Remex) won't wrap 'b' with the id='1' span at all.
				!$this->hasIdenticalNestedTag( $c, $cNodeName )
			) {
				$env->recordLint( 'html5-misnesting', $lintObj );
			} elseif (
				!$isHtmlElement && DOMUtils::isQuoteElt( $c ) &&
				// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.Found
				( $ancestor = $this->getHeadingAncestor( $c->parentNode ) )
			) {
				$lintObj['params']['ancestorName'] = DOMCompat::nodeName( $ancestor );
				$env->recordLint( 'unclosed-quotes-in-heading', $lintObj );
			} else {
				$adjNode = $this->getMatchingMisnestedNode( $c, $c );
				if ( $adjNode ) {
					// Mark the matching node so that it isn't linted again
					// when the DOM walk reaches it.
					$adjDp = DOMDataUtils::getDataParsoid( $adjNode );
					$adjDp->setTempFlag( TempData::LINTED );
					$env->recordLint( 'misnested-tag', $lintObj );
				} elseif ( !$this->endTagOptional( $c ) && empty( $dp->autoInsertedStart ) ) {
					$lintObj['params']['inTable'] = DOMUtils::hasNameOrHasAncestorOfName( $c, 'table' );
					$category = $this->getHeadingAncestor( $c ) ?
						'missing-end-tag-in-heading' : 'missing-end-tag';
					$next = DiffDOMUtils::nextNonSepSibling( $c );
					// Skip if covered by deletable-table-tag: that lint is
					// emitted for a table that directly follows an auto-closed
					// table, so it is the *next* sibling that must be a table.
					// (Testing $c here would be redundant with $cNodeName and
					// would suppress the lint for any table with a sibling.)
					$coveredByDeletableTableTag = $cNodeName === 'table' && $next &&
						DOMCompat::nodeName( $next ) === 'table';
					if ( !$coveredByDeletableTableTag ) {
						$env->recordLint( $category, $lintObj );
					}
					if ( isset( Consts::$HTML['FormattingTags'][$cNodeName] ) &&
						$this->matchedOpenTagPairExists( $c, $dp )
					) {
						$env->recordLint( 'multiple-unclosed-formatting-tags', $lintObj );
					}
				}
			}
		}
	}
}
494 | |
/**
 * Lint fostered content marked by MarkFosteredContent.
 *
 * Lint category: `fostered`
 *
 * This will log cases like:
 *
 * {|
 * foo
 * |-
 * | bar
 * |}
 *
 * Here 'foo' gets fostered out.
 *
 * Only table nodes are considered; the lint is recorded against the table
 * when its nearest relevant previous sibling carries the `fostered` flag.
 */
private function lintFostered(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// The top-level nodes in the foster box are span/p wrapped
	// and so, if we have fostered content, previous siblings to
	// the table are expected to be elements.
	$candidate = $node->previousSibling;

	// Skip rendering-transparent nodes
	while ( $candidate instanceof Element ) {
		// TODO: Section tags are rendering transparent but not sol transparent,
		// and that method only considers WTUtils::isSolTransparentLink, though
		// there is a FIXME to consider all link nodes.
		$skippable = WTUtils::isRenderingTransparentNode( $candidate ) ||
			( DOMCompat::nodeName( $candidate ) === 'link' &&
				DOMUtils::hasTypeOf( $candidate, 'mw:Extension/section' ) );
		if ( !$skippable ) {
			break;
		}
		$candidate = $candidate->previousSibling;
	}

	$isFostered = $candidate instanceof Element &&
		!empty( DOMDataUtils::getDataParsoid( $candidate )->fostered );
	if ( !$isFostered ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$dsr = $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null );
	$env->recordLint( 'fostered', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
	] );
}
550 | |
/**
 * Lint obsolete HTML tags.
 *
 * Lint category: `obsolete-tag`, `tidy-font-bug`
 *
 * `obsolete-tag` is recorded for any tag listed in
 * Consts::$HTML['OlderHTMLTags'] (except <big>) unless both its start and
 * end tags were auto-inserted. `tidy-font-bug` is recorded for a
 * <font color=...> whose content is a single link, optionally surrounded
 * by rendering-transparent nodes and template marker metas.
 */
private function lintObsoleteTag(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( !$this->obsoleteTagsRE ) {
		$alternatives = [];
		foreach ( Consts::$HTML['OlderHTMLTags'] as $tag => $unused ) {
			// Looks like all existing editors let editors add the <big> tag.
			// VE has a button to add <big>, it seems so does the WikiEditor
			// and JS wikitext editor. So, don't flag BIG as an obsolete tag.
			if ( $tag === 'big' ) {
				continue;
			}
			$alternatives[] = preg_quote( $tag, '/' );
		}
		$this->obsoleteTagsRE = '/^(?:' . implode( '|', $alternatives ) . ')$/D';
	}

	$cName = DOMCompat::nodeName( $c );

	// A tag that was auto-inserted at both ends has no source to fix
	$fullyAutoInserted = !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd );
	if ( !$fullyAutoInserted && preg_match( $this->obsoleteTagsRE, $cName ) ) {
		$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
		$env->recordLint( 'obsolete-tag', [
			'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => $cName ],
		] );
	}

	if ( $cName !== 'font' || !$c->hasAttribute( 'color' ) ) {
		return;
	}

	/* ----------------------------------------------------------
	 * Tidy migrates <font> into the link in these cases
	 *     <font>[[Foo]]</font>
	 *     <font>[[Foo]]l</font> (link-trail)
	 *     <font><!--boo-->[[Foo]]</font>
	 *     <font>__NOTOC__[[Foo]]</font>
	 *     <font>[[Category:Foo]][[Foo]]</font>
	 *     <font>{{1x|[[Foo]]}}</font>
	 *
	 * Tidy does not migrate <font> into the link in these cases
	 *     <font> [[Foo]]</font>
	 *     <font>[[Foo]] </font>
	 *     <font>[[Foo]]L</font> (not a link-trail)
	 *     <font>[[Foo]][[Bar]]</font>
	 *
	 * <font> is special.
	 * This behavior is not seen with other formatting tags.
	 *
	 * Remex/parsoid won't do any of this.
	 * This difference in behavior only matters when the font tag
	 * specifies a link colour because the link no longer renders
	 * as blue/red but in the font-specified colour.
	 * ---------------------------------------------------------- */
	$child = $c->firstChild;
	$tidyFontBug = $child !== null;
	$linksSeen = 0;
	while ( $child && $tidyFontBug ) {
		$childName = DOMCompat::nodeName( $child );
		$isAnchor = $childName === 'a';
		if ( !$isAnchor &&
			!WTUtils::isRenderingTransparentNode( $child ) &&
			!WTUtils::isTplMarkerMeta( $child )
		) {
			// Visible non-link content: Tidy leaves <font> alone
			$tidyFontBug = false;
		} elseif ( ( $isAnchor || $childName === 'figure' ) && ++$linksSeen > 1 ) {
			// More than one link: Tidy leaves <font> alone
			$tidyFontBug = false;
		}
		$child = $child->nextSibling;
	}

	if ( $tidyFontBug ) {
		$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
		$env->recordLint( 'tidy-font-bug', [
			'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
			'templateInfo' => $tplLintInfo,
			'params' => [ 'name' => 'font' ]
		] );
	}
}
642 | |
/**
 * Log bogus (=unrecognized) media options.
 *
 * See - https://www.mediawiki.org/wiki/Help:Images#Syntax
 *
 * Lint category: `bogus-image-options`
 *
 * An option is reported when its `ck` is 'bogus', or when its `ck` is
 * 'width' and the node carries the BOGUS_PX temp flag. The reported
 * value is the option's `ak`.
 */
private function lintBogusImageOptions(
	Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Despite the lint category name, this checks all media, not just images
	if ( !WTUtils::isGeneratedFigure( $c ) || empty( $dp->optList ) ) {
		return;
	}

	$bogusPx = $dp->getTempFlag( TempData::BOGUS_PX );
	$bogusItems = [];
	foreach ( $dp->optList as $opt ) {
		$kind = $opt['ck'];
		if ( $kind === 'bogus' || ( $bogusPx && $kind === 'width' ) ) {
			$bogusItems[] = $opt['ak'];
		}
	}

	if ( !$bogusItems ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'bogus-image-options', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [ 'items' => $bogusItems ]
	] );
}
675 | |
/**
 * Lint tables Tidy deletes.
 *
 * Lint category: `deletable-table-tag`
 *
 * In this example below, the second table is in a fosterable position
 * (inside a <tr>). The tree builder closes the first table at that point
 * and starts a new table there. We are detecting this pattern because
 * Tidy does something very different here. It strips the inner table
 * and retains the outer table. So, for preserving rendering of pages
 * that are tailored for Tidy, editors have to fix up this wikitext
 * to strip the inner table (to mimic what Tidy does).
 *
 *   {| style='border:1px solid red;'
 *   |a
 *   |-
 *   {| style='border:1px solid blue;'
 *   |b
 *   |c
 *   |}
 *   |}
 *
 * The lint is recorded on a table whose previous non-separator sibling
 * is a table with an auto-inserted end tag.
 */
private function lintDeletableTableTag(
	Env $env, Node $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $c ) !== 'table' ) {
		return;
	}

	$prev = DiffDOMUtils::previousNonSepSibling( $c );
	$followsAutoClosedTable = $prev instanceof Element &&
		DOMCompat::nodeName( $prev ) === 'table' &&
		!empty( DOMDataUtils::getDataParsoid( $prev )->autoInsertedEnd );
	if ( !$followsAutoClosedTable ) {
		return;
	}

	// Identify the dsr-span of the opening tag
	// of the table that needs to be deleted
	$narrowToOpenTag = static function ( ?DomSourceRange $nodeDSR ): ?DomSourceRange {
		if ( $nodeDSR === null ) {
			return null;
		}
		// Work on a copy so the node's own DSR stays untouched
		$openTagDSR = clone $nodeDSR;
		if ( !empty( $openTagDSR->openWidth ) ) {
			$openTagDSR->end = $openTagDSR->innerStart();
			$openTagDSR->openWidth = 0;
			$openTagDSR->closeWidth = 0;
		}
		return $openTagDSR;
	};

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$dsr = $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null, $narrowToOpenTag );
	$env->recordLint( 'deletable-table-tag', [
		'dsr' => $dsr,
		'templateInfo' => $tplLintInfo,
		'params' => [ 'name' => 'table' ],
	] );
}
732 | |
/**
 * Return the first direct child of $node accepted by $filter.
 *
 * Children are visited in document order and the walk stops at the
 * first one for which $filter returns a truthy value.
 *
 * @param Node $node Parent whose children are inspected
 * @param callable $filter Predicate invoked with each child node
 * @return ?Node The first accepted child, or null if none is accepted
 */
private function findMatchingChild( Node $node, callable $filter ): ?Node {
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( $filter( $child ) ) {
			return $child;
		}
	}

	return null;
}
744 | |
/**
 * Check whether a node carries a 'nowrap' CSS rule that we can detect.
 *
 * Such CSS could in general come from a class, a <style> tag, a
 * stylesheet or even JS code. For now only the inline style attribute
 * is inspected, since the goal is to help editors fix patterns that
 * can be detected automatically.
 *
 * As a special case, class='nowrap' is recognized as well: enwiki's
 * Template:nowrap assigns that class, and MediaWiki:Common.css maps it
 * to white-space:nowrap.
 *
 * @param Node $node
 * @return bool True only for elements whose style attribute contains
 *   the substring 'nowrap' or that have the 'nowrap' class
 */
private function hasNoWrapCSS( Node $node ): bool {
	// Non-element nodes have neither a style attribute nor classes.
	if ( !( $node instanceof Element ) ) {
		return false;
	}

	$inlineStyle = DOMCompat::getAttribute( $node, 'style' ) ?? '';
	return str_contains( $inlineStyle, 'nowrap' ) || DOMUtils::hasClass( $node, 'nowrap' );
}
764 | |
/**
 * Lint bad P wrapping.
 *
 * Flags a non-block node with detectable nowrap CSS that sits directly
 * inside a wikitext block node and has a <p> among its own children.
 *
 * Lint category: `pwrap-bug-workaround`
 *
 * @param Env $env
 * @param Element $node Node being linted
 * @param DataParsoid $dp data-parsoid of $node (its dsr locates the lint)
 * @param ?stdClass $tplInfo Enclosing template info, if any
 */
private function lintPWrapBugWorkaround(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	$parent = $node->parentNode;
	if (
		DOMUtils::isWikitextBlockNode( $node ) ||
		!DOMUtils::isWikitextBlockNode( $parent ) ||
		!$this->hasNoWrapCSS( $node )
	) {
		return;
	}

	// The lint only applies when a <p> is a direct child of the node
	$isParagraph = static function ( $e ) {
		return DOMCompat::nodeName( $e ) === 'p';
	};
	if ( !$this->findMatchingChild( $node, $isParagraph ) ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'pwrap-bug-workaround', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [
			'root' => DOMCompat::nodeName( $parent ),
			'child' => DOMCompat::nodeName( $node ),
		]
	] );
}
795 | |
/**
 * Lint Tidy div span flip.
 *
 * Flags a <span> whose first non-separator child is a <div>, provided
 * at least one of the two has a class or style attribute.
 *
 * Lint category: `misc-tidy-replacement-issues`
 *
 * @param Env $env
 * @param Element $node Node being linted
 * @param DataParsoid $dp data-parsoid of $node (its dsr locates the lint)
 * @param ?stdClass $tplInfo Enclosing template info, if any
 */
private function lintMiscTidyReplacementIssues(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'span' ) {
		return;
	}

	$firstChild = DiffDOMUtils::firstNonSepChild( $node );
	if ( !( $firstChild instanceof Element ) || DOMCompat::nodeName( $firstChild ) !== 'div' ) {
		return;
	}

	// Without any style/class attribute on either node,
	// rendering is unaffected and there is nothing to report.
	$hasStyling =
		$node->hasAttribute( 'class' ) || $node->hasAttribute( 'style' ) ||
		$firstChild->hasAttribute( 'class' ) || $firstChild->hasAttribute( 'style' );
	if ( !$hasStyling ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$env->recordLint( 'misc-tidy-replacement-issues', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $dp->dsr ?? null ),
		'templateInfo' => $tplLintInfo,
		'params' => [ 'subtype' => 'div-span-flip' ]
	] );
}
828 | |
/**
 * Lint tidy whitespace bug.
 *
 * Starting at $node, this collects the longest run of siblings that is
 * affected by white-space:nowrap CSS in a way that leads to unsightly
 * rendering in HTML5 compliant browsers. Tidy hoists trailing whitespace
 * out of such nodes, which gives browsers a place to break the content
 * into short segments. If the unbroken run is long enough, one lint is
 * emitted per hoisting opportunity so that editors can introduce
 * whitespace breaks themselves once Tidy is removed.
 *
 * Every element of the run is flagged as processed, so the run is
 * analyzed only once.
 *
 * Lint category: `tidy-whitespace-bug`
 *
 * @param Env $env
 * @param Node $node First node of the candidate run
 * @param DataParsoid $dp data-parsoid of $node
 * @param ?stdClass $tplInfo Enclosing template info, if any
 */
private function lintTidyWhitespaceBug(
	Env $env, Node $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// This node was already covered by a run examined earlier.
	if ( $dp->getTempFlag( TempData::PROCESSED_TIDY_WS_BUG ) ) {
		return;
	}

	$nowrapNodes = [];
	'@phan-var array<array{node:Node,tidybug:bool,hasLeadingWS:bool}> $nowrapNodes';
	$runStart = $node;
	$sawHoisting = false;
	$unbrokenLen = 0;
	// <br>, <wbr>, <hr> break a line
	$lineBreakers = [ 'hr', 'br', 'wbr' ];

	// Forward scan: extend the run until a block node, a line-breaking
	// element, a whitespace break, or a top-level template boundary.
	for ( $cur = $node; $cur; ) {
		if ( DOMUtils::isRemexBlockNode( $cur ) ||
			in_array( DOMCompat::nodeName( $cur ), $lineBreakers, true )
		) {
			break;
		}

		if ( !( $cur instanceof Text ) && $this->hasNoWrapCSS( $cur ) ) {
			// Look at the last non-comment child of the nowrap node
			$tail = $cur->lastChild;
			while ( $tail instanceof Comment ) {
				$tail = $tail->previousSibling;
			}

			// When the node ends in whitespace, Tidy hoists that whitespace
			// to after the node, where it is no longer subject to the nowrap
			// CSS => browsers can break content there. Non-Tidy libraries
			// won't hoist it, so browsers have no place to break content.
			// PORT-FIXME: non-ASCII whitespace?
			$hoists = $tail instanceof Text && preg_match( '/\s$/D', $tail->nodeValue );
			if ( $hoists ) {
				$sawHoisting = true;
			}

			$nowrapNodes[] = [ 'node' => $cur, 'tidybug' => $hoists ];
			$unbrokenLen += strlen( $cur->textContent );
		} else {
			// No CSS property that affects whitespace.
			$text = $cur->textContent;
			// PORT-FIXME: non-ASCII whitespace?
			if ( preg_match( '/^(\S*)\s/', $text, $m ) ) {
				// The run ends at the first whitespace in this node;
				// only the text before it counts towards the run.
				$unbrokenLen += strlen( $m[1] );
				$nowrapNodes[] = [
					'node' => $cur,
					'tidybug' => false,
					// PORT-FIXME: non-ASCII whitespace?
					'hasLeadingWS' => ( preg_match( '/^\s/', $text ) === 1 ),
				];
				break;
			}
			$nowrapNodes[] = [ 'node' => $cur, 'tidybug' => false ];
			$unbrokenLen += strlen( $text );
		}

		// Don't cross template boundaries at the top-level
		if ( $tplInfo ) {
			if ( $tplInfo->last === $cur ) {
				// Exiting a top-level template
				break;
			}
		} elseif ( WTUtils::findFirstEncapsulationWrapperNode( $cur ) ) {
			// Entering a top-level template
			break;
		}

		// Advance to the next non-comment sibling
		do {
			$cur = $cur->nextSibling;
		} while ( $cur instanceof Comment );
	}

	// Helper: flag every element of the collected run as processed.
	$flagRunAsProcessed = static function () use ( $nowrapNodes ): void {
		foreach ( $nowrapNodes as $entry ) {
			// Phan cannot narrow the type of an array member via instanceof,
			// but it can for a local variable.
			$member = $entry['node'];
			if ( $member instanceof Element ) {
				DOMDataUtils::getDataParsoid( $member )->setTempFlag( TempData::PROCESSED_TIDY_WS_BUG );
			}
		}
	};

	if ( !$sawHoisting ) {
		// Nothing for Tidy to hoist in this run
		$flagRunAsProcessed();
		return;
	}

	// Backward scan: add the length of the content preceding the run
	// start, up to the closest whitespace break or block node.
	for (
		$prev = $runStart->previousSibling;
		$prev && !DOMUtils::isRemexBlockNode( $prev );
		$prev = $prev->previousSibling
	) {
		if ( $prev instanceof Comment ) {
			continue;
		}
		$text = $prev->textContent;
		// Locate the last \s in the string
		// PORT-FIXME: non-ASCII whitespace here?
		if ( preg_match( '/\s(\S*)$/D', $text, $m ) ) {
			$unbrokenLen += strlen( $m[1] );
			break;
		}
		$unbrokenLen += strlen( $text );
	}

	$lintConfig = $env->getLinterConfig();
	$tidyWhitespaceBugMaxLength = $lintConfig['tidyWhitespaceBugMaxLength'] ?? 100;

	if ( $unbrokenLen < $tidyWhitespaceBugMaxLength ) {
		// The unbroken run is too short to be worth reporting
		$flagRunAsProcessed();
		return;
	}

	// Emit one lint for every node where Tidy hoists whitespace, unless it
	// is the last node of the run or the node following it in the run
	// already starts with whitespace.
	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$lastIdx = count( $nowrapNodes ) - 1;
	foreach ( $nowrapNodes as $idx => $entry ) {
		if ( !$entry['tidybug'] || $idx >= $lastIdx ||
			!empty( $nowrapNodes[$idx + 1]['hasLeadingWS'] )
		) {
			continue;
		}

		$hoistedNode = $entry['node']; // local copy so Phan can narrow the type
		$hoistedDSR = $hoistedNode instanceof Element
			? DOMDataUtils::getDataParsoid( $hoistedNode )->dsr ?? null
			: null;

		$env->recordLint( 'tidy-whitespace-bug', [
			'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $hoistedDSR ),
			'templateInfo' => $tplLintInfo,
			'params' => [
				'node' => DOMCompat::nodeName( $entry['node'] ),
				'sibling' => DOMCompat::nodeName( $entry['node']->nextSibling )
			]
		] );
	}

	$flagRunAsProcessed();
}
989 | |
/**
 * Detect multiple-unclosed-formatting-tags errors.
 *
 * Since unclosed <small> and <big> tags accumulate their effects
 * in HTML5 parsers (unlike in Tidy where it seems to suppress
 * multiple unclosed elements of the same name), such pages will
 * break pretty spectacularly with Remex.
 *
 * Ex: https://it.wikipedia.org/wiki/Hubert_H._Humphrey_Metrodome?oldid=93017491#Note
 *
 * The already-recorded lints are scanned for `missing-end-tag` entries
 * for <small> or <big> outside of tables. As soon as a second such entry
 * is seen for the same tag name, a single new lint is recorded that
 * reuses the params, dsr and template info of the first entry for that
 * tag name.
 *
 * Lint category: `multiple-unclosed-formatting-tags`
 *
 * @param array $lints Lints recorded so far (each entry is an array with
 *   at least 'type' and 'params' keys)
 * @param Env $env
 */
private function lintMultipleUnclosedFormattingTags( array $lints, Env $env ): void {
	$firstUnclosedTag = [
		'small' => null,
		'big' => null
	];
	$multiUnclosedTagName = null;
	foreach ( $lints as $item ) {
		// Unclosed tags in tables don't leak out of the table
		if ( $item['type'] === 'missing-end-tag' && !$item['params']['inTable'] ) {
			if ( $item['params']['name'] === 'small' || $item['params']['name'] === 'big' ) {
				$tagName = $item['params']['name'];
				// @phan-suppress-next-line PhanPossiblyUndeclaredVariable
				if ( !$firstUnclosedTag[$tagName] ) {
					$firstUnclosedTag[$tagName] = $item;
				} else {
					// Second unclosed tag of the same name: stop scanning
					$multiUnclosedTagName = $tagName;
					break;
				}
			}
		}
	}

	if ( $multiUnclosedTagName ) {
		$item = $firstUnclosedTag[$multiUnclosedTagName];
		if ( isset( $item['dsr'] ) ) {
			// Recorded lints hold the dsr in array form; recordLint callers
			// in this class pass a DomSourceRange object.
			$item['dsr'] = DomSourceRange::fromArray( $item['dsr'] );
		}
		$env->recordLint( 'multiple-unclosed-formatting-tags', [
			'params' => $item['params'] ?? [],
			'dsr' => $item['dsr'] ?? null,
			// Every lint recorded by this class stores its template info
			// under 'templateInfo', so read that key first. The previously
			// used 'tplLintInfo' key is kept as a fallback.
			'templateInfo' => $item['templateInfo'] ?? $item['tplLintInfo'] ?? null
		] );
	}
}
1036 | |
/**
 * Run the checks that operate on the full list of recorded lints
 * rather than on individual DOM nodes.
 *
 * @param array $lints Lints recorded so far
 * @param Env $env
 */
private function postProcessLints( array $lints, Env $env ): void {
	// Currently the only whole-list check
	$this->lintMultipleUnclosedFormattingTags( $lints, $env );
}
1043 | |
/**
 * Get wikitext list item ancestor
 *
 * Walks from $node (inclusive) up the parent chain to the closest list
 * item. That list item is returned only if it is neither a literal HTML
 * node nor part of `references` extension content.
 *
 * @param ?Node $node Node to start from
 * @return ?Node The qualifying list item, or null
 */
private function getWikitextListItemAncestor( ?Node $node ): ?Node {
	for ( $anc = $node; $anc; $anc = $anc->parentNode ) {
		if ( !DOMUtils::isListItem( $anc ) ) {
			continue;
		}

		// Only the closest list item is ever considered
		$qualifies = !WTUtils::isLiteralHTMLNode( $anc ) &&
			!WTUtils::fromExtensionContent( $anc, 'references' );
		return $qualifies ? $anc : null;
	}

	return null;
}
1060 | |
/**
 * Lint a PHP parser bug.
 *
 * When an HTML table is nested inside a list, if any part of the table
 * is on a new line, the PHP parser misnests the list and the table.
 * Tidy fixes the misnesting one way (puts table inside/outside the list)
 * HTML5 parser fixes it another way (list expands to rest of the page!)
 *
 * Lint category: `multiline-html-table-in-list`
 *
 * @param Env $env
 * @param Element $node Node being linted
 * @param DataParsoid $dp data-parsoid of $node
 * @param ?stdClass $tplInfo Enclosing template info, if any
 */
private function lintMultilineHtmlTableInList(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Only literal HTML <table> tags are relevant
	if ( !WTUtils::isLiteralHTMLNode( $node ) || DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// ... that are nested in a wikitext list item
	$li = $this->getWikitextListItemAncestor( $node );
	if ( !$li ) {
		return;
	}

	// ... and whose outer HTML contains a newline
	if ( !str_contains( DOMCompat::getOuterHTML( $node ), "\n" ) ) {
		return;
	}

	// An HTML table nested inside a list with a newline break in its
	// outer HTML => trouble with the PHP Parser + Remex combo
	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$nodeDSR = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
	$env->recordLint( 'multiline-html-table-in-list', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $nodeDSR ),
		'templateInfo' => $tplLintInfo,
		'params' => [
			'name' => 'table',
			'ancestorName' => DOMCompat::nodeName( $li ),
		],
	] );
}
1100 | |
/**
 * Log wikilinks or media in external links.
 *
 * HTML tags can be nested, but <a> tags cannot: nested <a> tags are
 * output adjacent to each other instead. In the example below,
 * [[Google]] is a wikilink nested in the outer external link
 *   [http://google.com This is [[Google]]'s search page]
 *
 * Linter category: `wikilink-in-extlink`
 *
 * @param Env $env
 * @param Element $c Node being linted
 * @param DataParsoid $dp data-parsoid of $c
 * @param ?stdClass $tplInfo Enclosing template info, if any
 */
private function lintWikilinksInExtlink(
	Env $env, Element $c, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $c ) !== 'a' || !DOMUtils::hasRel( $c, "mw:ExtLink" ) ) {
		return;
	}
	// Images in extlinks will end up with broken up extlinks inside the
	// <figure> DOM. Those have 'misnested' flag set on them. Ignore those.
	if ( !empty( DOMDataUtils::getDataParsoid( $c )->misnested ) ) {
		return;
	}

	// Link-in-link: the following sibling is a misnested element.
	// The treeHasElement check may not be necessary but ensures that
	// we are really in a link-in-link misnested scenario.
	$sibling = $c->nextSibling;
	$flagged = $sibling instanceof Element &&
		!empty( DOMDataUtils::getDataParsoid( $sibling )->misnested ) &&
		DOMUtils::treeHasElement( $sibling, 'a', true );

	// Media as opposed to most instances of img (barring the link= trick), don't result
	// in misnesting according the html5 spec since we're actively suppressing links in
	// their structure. However, since timed media is inherently clickable, being nested
	// in an extlink could surprise a user clicking on it by navigating away from the page.
	if ( !$flagged ) {
		DOMUtils::visitDOM( $c, static function ( $element ) use ( &$flagged ) {
			if ( !( $element instanceof Element ) ) {
				return;
			}
			$tag = DOMCompat::nodeName( $element );
			if ( $tag === 'audio' || $tag === 'video' ) {
				$flagged = true;
			}
		} );
	}

	if ( !$flagged ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$linkDSR = DOMDataUtils::getDataParsoid( $c )->dsr ?? null;
	$env->recordLint( 'wikilink-in-extlink', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $linkDSR ),
		'templateInfo' => $tplLintInfo,
	] );
}
1154 | |
/**
 * Record a `large-tables` lint for the given table.
 *
 * @param Env $env
 * @param ?stdClass $tplInfo Enclosing template info, if any
 * @param Element $node The table being reported
 * @param int $numColumns Cell count that triggered the lint
 * @param int $columnsMax Limit that $numColumns was compared against
 */
private function recordLargeTablesLint(
	Env $env, ?stdClass $tplInfo, Element $node, int $numColumns, int $columnsMax
): void {
	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$tableDSR = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
	$env->recordLint( 'large-tables', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $tableDSR ),
		'templateInfo' => $tplLintInfo,
		'params' => [
			'name' => 'table',
			'columns' => $numColumns,
			'columnsMax' => $columnsMax,
		],
	] );
}
1172 | |
/**
 * Return $n itself if it is an element, otherwise the first element
 * among its following siblings, or null if there is none.
 *
 * TODO: In the future, this may merit being moved to DOMUtils
 * along with its "previous" variant.
 *
 * @param ?Node $n Node to start from
 * @return ?Element
 */
private function skipNonElementNodes( ?Node $n ): ?Element {
	for ( ; $n !== null; $n = $n->nextSibling ) {
		if ( $n instanceof Element ) {
			return $n;
		}
	}
	return null;
}
1183 | |
/**
 * Lint large tables.
 *
 * Identify articles having overly-large tables
 * to help editors optimize their articles.
 *
 * A table is reported when one of the first element children of its
 * <tbody> (up to the configured number of rows) contains more <th> or
 * more <td> cells than the configured column limit.
 *
 * Linter category: `large-tables`
 *
 * @param Env $env
 * @param Element $node Node being linted
 * @param DataParsoid $dp data-parsoid of $node
 * @param ?stdClass $tplInfo Enclosing template info, if any
 */
private function lintLargeTables(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	if ( DOMCompat::nodeName( $node ) !== 'table' ) {
		return;
	}

	// Tables containing nested tables are likely used for layout rather
	// than data representation, so they are skipped.
	// We may check nested tables in the next iteration of this lint.
	if ( $node->getElementsByTagName( 'table' )->length > 0 ) {
		return;
	}

	$lintConfig = $env->getLinterConfig();
	$maxColumns = $lintConfig['maxTableColumnHeuristic'] ?? 5;
	$maxRowsToCheck = $lintConfig['maxTableRowsToCheck'] ?? 10;

	$tbody = DOMCompat::querySelector( $node, 'tbody' );
	if ( !$tbody ) {
		// empty table
		return;
	}

	$row = $this->skipNonElementNodes( $tbody->firstChild );
	for ( $rowsChecked = 0; $row && $rowsChecked < $maxRowsToCheck; $rowsChecked++ ) {
		// Header cells are counted first, then data cells; the first
		// count over the limit is the one that gets reported.
		foreach ( [ 'th', 'td' ] as $cellTag ) {
			$numCells = $row->getElementsByTagName( $cellTag )->length;
			if ( $numCells > $maxColumns ) {
				$this->recordLargeTablesLint( $env, $tplInfo, $node, $numCells, $maxColumns );
				return;
			}
		}

		$row = $this->skipNonElementNodes( $row->nextSibling );
	}
}
1235 | |
/**
 * Log inline background color style rules without a color style rule.
 *
 * Elements whose inline style attribute sets `background` or
 * `background-color` but has no `color` rule are reported, to help
 * editors make their articles comply with WCAG color contrast rules.
 *
 * Linter category: `night-mode-unaware-background-color`
 *
 * @param Env $env
 * @param Element $node Node being linted
 * @param DataParsoid $dp data-parsoid of $node
 * @param ?stdClass $tplInfo Enclosing template info, if any
 */
private function lintNightModeUnawareBackgroundColor(
	Env $env, Element $node, DataParsoid $dp, ?stdClass $tplInfo
): void {
	// Nothing to inspect without an inline style attribute
	$inlineStyle = DOMCompat::getAttribute( $node, 'style' );
	if ( $inlineStyle === null ) {
		return;
	}

	// A background color has to be set ...
	if ( !preg_match( '/(^|;)\s*background(-color)?\s*:/i', $inlineStyle ) ) {
		return;
	}
	// ... while the font color is left unset
	if ( preg_match( '/(^|;)\s*color\s*:/i', $inlineStyle ) ) {
		return;
	}

	$tplLintInfo = $this->findEnclosingTemplateName( $env, $tplInfo );
	$nodeDSR = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
	$env->recordLint( 'night-mode-unaware-background-color', [
		'dsr' => $this->findLintDSR( $tplLintInfo, $tplInfo, $nodeDSR ),
		'templateInfo' => $tplLintInfo,
	] );
}
1268 | |
/**
 * Log wikitext fixups
 *
 * Runs every per-node lint check on $node, in a fixed order. The node's
 * data-parsoid is fetched once and shared by all of the checks.
 *
 * @param Element $node Node being linted
 * @param Env $env
 * @param ?stdClass $tplInfo Enclosing template info, if any
 */
private function logWikitextFixups(
	Element $node, Env $env, ?stdClass $tplInfo
): void {
	$dp = DOMDataUtils::getDataParsoid( $node );

	$this->lintTreeBuilderFixup( $env, $node, $dp, $tplInfo );
	// For T161341
	$this->lintDeletableTableTag( $env, $node, $dp, $tplInfo );
	// For T161306
	$this->lintPWrapBugWorkaround( $env, $node, $dp, $tplInfo );
	$this->lintObsoleteTag( $env, $node, $dp, $tplInfo );
	$this->lintBogusImageOptions( $env, $node, $dp, $tplInfo );
	$this->lintTidyWhitespaceBug( $env, $node, $dp, $tplInfo );
	$this->lintMiscTidyReplacementIssues( $env, $node, $dp, $tplInfo );
	$this->lintMultilineHtmlTableInList( $env, $node, $dp, $tplInfo );
	$this->lintWikilinksInExtlink( $env, $node, $dp, $tplInfo );
	$this->lintLargeTables( $env, $node, $dp, $tplInfo );
	$this->lintNightModeUnawareBackgroundColor( $env, $node, $dp, $tplInfo );
	$this->lintFostered( $env, $node, $dp, $tplInfo );
}
1289 | |
/**
 * Walk the DOM and compute lints for the entire tree.
 * - When we enter encapsulated content (templates or extensions),
 *   compute "tplInfo" (misnamed given that it can be an extension)
 *   so that lints from the templates' content can be mapped back
 *   to the transclusion that generated them.
 * - When we process extensions, if we have a lint handler for the
 *   extension, let the extension's lint handler compute lints.
 *
 * @param Node $root Node whose children are linted (recursively)
 * @param Env $env
 * @param ?stdClass $tplInfo Info about the encapsulated content we are
 *   currently inside of, if any
 */
private function findLints(
	Node $root, Env $env, ?stdClass $tplInfo = null
): void {
	for ( $node = $root->firstChild; $node !== null; $node = $node->nextSibling ) {
		// Only elements are linted
		if ( !( $node instanceof Element ) ) {
			continue;
		}

		// !tplInfo check is to protect against templated content in
		// extensions which might in turn be nested in templated content.
		if ( !$tplInfo && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
			$about = DOMCompat::getAttribute( $node, 'about' );
			$aboutSibs = WTUtils::getAboutSiblings( $node, $about );
			$tplInfo = (object)[
				'first' => $node,
				'last' => end( $aboutSibs ),
				'dsr' => DOMDataUtils::getDataParsoid( $node )->dsr ?? null,
				// FIXME: This is not being used. Instead the code is recomputing
				// this info in findEnclosingTemplateName.
				'isTemplated' => DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ),
			];
		}

		$handled = false;

		// Let native extensions lint their content
		$nativeExt = WTUtils::getNativeExt( $env, $node );
		if ( $nativeExt ) {
			if ( !$this->extApi ) {
				$this->extApi = new ParsoidExtensionAPI( $env );
			}

			// Callback through which the extension asks for its content
			// to be linted; tplInfo is forwarded only for templated content.
			$lintExtContent = function ( $extRootNode ) use ( $env, $tplInfo ) {
				$innerTplInfo = empty( $tplInfo->isTemplated ) ? null : $tplInfo;
				$this->findLints( $extRootNode, $env, $innerTplInfo );
			};
			$handled = $nativeExt->lintHandler( $this->extApi, $node, $lintExtContent );
			// NOTE: See the note in WrapSectionsState::shouldOmitFromTOC()
			// but we've assumed extension content is contained in a single
			// wrapper node and it's safe to move to $node->nextSibling.
		}

		// Default node handler: lint this node, then its subtree
		if ( $handled === false ) {
			$this->logWikitextFixups( $node, $env, $tplInfo );
			$this->findLints( $node, $env, $tplInfo );
		}

		// Leaving the encapsulated content once its last node is done
		if ( $tplInfo && $tplInfo->last === $node ) {
			$tplInfo = null;
		}
	}
}
1362 | |
/**
 * This is only invoked on the top-level document
 *
 * Lints the whole tree under $root, then post-processes the lints
 * recorded on $env. When a metrics object is available, the work is
 * timed and reported under the "linting" key.
 *
 * @inheritDoc
 */
public function run(
	Env $env, Node $root, array $options = [], bool $atTopLevel = false
): void {
	// Track time spent linting so we can evaluate benefits
	// of migrating this code off the critical path to its own
	// post processor.
	$metrics = $env->getSiteConfig()->metrics();
	$timer = $metrics ? Timing::start( $metrics ) : null;

	$this->findLints( $root, $env );
	$this->postProcessLints( $env->getLints(), $env );

	if ( $timer !== null ) {
		$timer->end( "linting" );
	}
}
1386 | |
1387 | } |