Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
94.12% |
656 / 697 |
|
65.38% |
17 / 26 |
CRAP | |
0.00% |
0 / 1 |
CommentParser | |
94.12% |
656 / 697 |
|
65.38% |
17 / 26 |
260.52 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
1 | |||
parse | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
nextInterestingLeafNode | |
95.45% |
21 / 22 |
|
0.00% |
0 / 1 |
8 | |||
regexpAlternateGroup | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getMessages | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getTimestampRegexp | |
88.54% |
85 / 96 |
|
0.00% |
0 / 1 |
32.45 | |||
getTimestampParser | |
93.85% |
122 / 130 |
|
0.00% |
0 / 1 |
52.63 | |||
getLocalTimestampRegexps | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
getLocalTimestampParsers | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
getUsernameFromLink | |
97.06% |
33 / 34 |
|
0.00% |
0 / 1 |
17 | |||
findSignature | |
100.00% |
44 / 44 |
|
100.00% |
1 / 1 |
16 | |||
acceptOnlyNodesAllowingComments | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
11 | |||
getCodepointOffset | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
findTimestamp | |
100.00% |
45 / 45 |
|
100.00% |
1 / 1 |
11 | |||
adjustSigRange | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
3 | |||
buildThreadItems | |
98.98% |
97 / 98 |
|
0.00% |
0 / 1 |
22 | |||
computeTranscludedFrom | |
69.23% |
36 / 52 |
|
0.00% |
0 / 1 |
45.69 | |||
titleCanExist | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
parseTitle | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getTransclusionTitles | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
getTransclusionRange | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
5 | |||
truncateForId | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
computeId | |
96.88% |
31 / 32 |
|
0.00% |
0 / 1 |
14 | |||
computeName | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
4.01 | |||
buildThreads | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
9 | |||
computeIdsAndNames | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\DiscussionTools; |
4 | |
5 | use DateInterval; |
6 | use DateTime; |
7 | use DateTimeImmutable; |
8 | use DateTimeZone; |
9 | use InvalidArgumentException; |
10 | use Language; |
11 | use LogicException; |
12 | use MediaWiki\Config\Config; |
13 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentCommentItem; |
14 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentHeadingItem; |
15 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentThreadItem; |
16 | use MediaWiki\Languages\LanguageConverterFactory; |
17 | use MediaWiki\Title\MalformedTitleException; |
18 | use MediaWiki\Title\TitleParser; |
19 | use MediaWiki\Title\TitleValue; |
20 | use MediaWiki\Utils\MWTimestamp; |
21 | use RuntimeException; |
22 | use Wikimedia\Assert\Assert; |
23 | use Wikimedia\IPUtils; |
24 | use Wikimedia\Parsoid\DOM\Element; |
25 | use Wikimedia\Parsoid\DOM\Node; |
26 | use Wikimedia\Parsoid\DOM\Text; |
27 | use Wikimedia\Parsoid\Utils\DOMCompat; |
28 | use Wikimedia\Timestamp\TimestampException; |
29 | |
30 | // TODO consider making timestamp parsing not a returned function |
31 | |
32 | class CommentParser { |
33 | |
/**
 * How far backwards we look for a signature associated with a timestamp before giving up.
 * Note that this is not a hard limit on the length of signatures we detect.
 *
 * Compared against the accumulated character count of trimmed text nodes in findSignature().
 */
private const SIGNATURE_SCAN_LIMIT = 100;

// Injected services
private Config $config;
private Language $language;
private LanguageConverterFactory $languageConverterFactory;
private TitleParser $titleParser;

// Content-language data copied from LanguageData::getLocalData() by the constructor

/** @var string[] Date format, indexed by content language variant */
private array $dateFormat;
/** @var string[][] Localised digits, indexed by content language variant */
private array $digits;
/** @var string[][] Message texts, indexed by content language variant, then by message key */
private array $contLangMessages;
private string $localTimezone;
/** @var string[][] Timezone abbreviation maps, indexed by content language variant */
private array $timezones;
private string $specialContributionsName;

// Per-parse state, assigned by parse()
private Element $rootNode;
private TitleValue $title;
58 | |
/**
 * Store the injected services and copy the content-language data used while parsing.
 *
 * @param Config $config
 * @param Language $language Content language
 * @param LanguageConverterFactory $languageConverterFactory
 * @param LanguageData $languageData Provides the local date format, digits, messages,
 *  timezone data and the name of Special:Contributions via getLocalData()
 * @param TitleParser $titleParser
 */
public function __construct(
	Config $config,
	Language $language,
	LanguageConverterFactory $languageConverterFactory,
	LanguageData $languageData,
	TitleParser $titleParser
) {
	// Services
	$this->titleParser = $titleParser;
	$this->languageConverterFactory = $languageConverterFactory;
	$this->language = $language;
	$this->config = $config;

	// Localisation data, fetched once per parser instance
	$localData = $languageData->getLocalData();
	$this->specialContributionsName = $localData['specialContributionsName'];
	$this->timezones = $localData['timezones'];
	$this->localTimezone = $localData['localTimezone'];
	$this->contLangMessages = $localData['contLangMessages'];
	$this->digits = $localData['digits'];
	$this->dateFormat = $localData['dateFormat'];
}
86 | |
/**
 * Parse a discussion page.
 *
 * Remembers the root node and title on the instance (they are read by the private helpers),
 * then runs buildThreadItems(), buildThreads() and computeIdsAndNames() in that order
 * on the same item set.
 *
 * @param Element $rootNode Root node of content to parse
 * @param TitleValue $title Title of the page being parsed
 * @return ContentThreadItemSet
 */
public function parse( Element $rootNode, TitleValue $title ): ContentThreadItemSet {
	$this->title = $title;
	$this->rootNode = $rootNode;

	$threadItemSet = $this->buildThreadItems();
	$this->buildThreads( $threadItemSet );
	$this->computeIdsAndNames( $threadItemSet );

	return $threadItemSet;
}
104 | |
/**
 * Return the next leaf node in the tree order that is likely a part of a discussion comment,
 * rather than some boring "separator" element.
 *
 * Currently, this can return a Text node with content other than whitespace, or an Element node
 * that is a "void element" or "text element", except some special cases that we treat as comment
 * separators (isCommentSeparator()).
 *
 * NOTE(review): the "not found" exception only fires if the walker's currentNode is empty
 * after nextNode(). If TreeWalker leaves currentNode untouched when there is no next node,
 * the starting node (or the root) is returned instead - confirm against TreeWalker.
 *
 * @param ?Node $node Node after which to start searching
 *  (if null, start at the beginning of the document).
 * @return Node
 * @throws RuntimeException If the walker has no current node after advancing
 */
private function nextInterestingLeafNode( ?Node $node ): Node {
	$filter = static function ( $candidate ) use ( $node ) {
		if (
			// Skip past the starting node and its descendants
			$candidate === $node ||
			$candidate->parentNode === $node ||
			// Ignore some elements usually used as separators or headers (and their descendants)
			CommentUtils::isCommentSeparator( $candidate ) ||
			// Ignore nodes with no rendering that mess up our indentation detection
			CommentUtils::isRenderingTransparentNode( $candidate )
		) {
			return NodeFilter::FILTER_REJECT;
		}
		// Anything else is either comment content, or a container we look inside of
		return CommentUtils::isCommentContent( $candidate ) ?
			NodeFilter::FILTER_ACCEPT :
			NodeFilter::FILTER_SKIP;
	};

	$walker = new TreeWalker(
		$this->rootNode,
		NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
		$filter
	);
	if ( $node ) {
		$walker->currentNode = $node;
	}
	$walker->nextNode();

	$found = $walker->currentNode;
	if ( !$found ) {
		throw new RuntimeException( 'nextInterestingLeafNode not found' );
	}
	return $found;
}
150 | |
/**
 * Build a capturing group that matches any one of the given literal strings.
 *
 * Each value is escaped for use inside a '/'-delimited pattern.
 *
 * @param string[] $values Values to match
 * @return string Regular expression
 */
private static function regexpAlternateGroup( array $values ): string {
	$quoted = array_map(
		static fn ( string $value ): string => preg_quote( $value, '/' ),
		$values
	);
	return '(' . implode( '|', $quoted ) . ')';
}
160 | |
/**
 * Look up the text of localisation messages in the content language.
 *
 * Array keys of $messages are preserved in the result.
 *
 * @param string $contLangVariant Content language variant
 * @param string[] $messages Message keys
 * @return string[] Message values
 */
private function getMessages( string $contLangVariant, array $messages ): array {
	return array_map(
		fn ( string $key ) => $this->contLangMessages[$contLangVariant][$key],
		$messages
	);
}
173 | |
/**
 * Get a regexp that matches timestamps generated using the given date format.
 *
 * This only supports format characters that are used by the default date format in any of
 * MediaWiki's languages, namely: D, d, F, G, H, i, j, l, M, m, n, s, Y, xg, xkY, xn, xx
 * (and escape characters), and only dates when MediaWiki existed, let's say 2000 onwards
 * (Thai dates before 1941 are complicated).
 *
 * Every date component (numeric or named) becomes one capturing group, in format order;
 * literal text adds no groups. The final capturing group is the timezone abbreviation.
 * getTimestampParser() relies on this group order.
 *
 * @param string $contLangVariant Content language variant
 * @param string $format Date format
 * @param string $digitsRegexp Regular expression matching a single localised digit, e.g. '[0-9]'
 * @param array $tzAbbrs Associative array mapping localised timezone abbreviations to
 *   IANA abbreviations, for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ]
 * @return string Regular expression
 */
private function getTimestampRegexp(
	string $contLangVariant, string $format, string $digitsRegexp, array $tzAbbrs
): string {
	$formatLength = strlen( $format );
	$s = '';
	// Set by 'xn': the next numeric component uses ASCII digits instead of localised ones
	$raw = false;
	// Adapted from Language::sprintfDate()
	for ( $p = 0; $p < $formatLength; $p++ ) {
		// Repetition count for a numeric component (e.g. '2' or '1,2'), false for anything else
		$num = false;
		$code = $format[ $p ];
		// 'x' introduces a two-character code, 'xk' a three-character one
		if ( $code === 'x' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}
		if ( $code === 'xk' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}

		switch ( $code ) {
			case 'xx':
				$s .= 'x';
				break;
			case 'xg':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::MONTH_GENITIVE_MESSAGES )
				);
				break;
			case 'xn':
				$raw = true;
				break;
			case 'd':
				$num = '2';
				break;
			case 'D':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::WEEKDAY_ABBREVIATED_MESSAGES )
				);
				break;
			case 'j':
				$num = '1,2';
				break;
			case 'l':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::WEEKDAY_MESSAGES )
				);
				break;
			case 'F':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::MONTH_MESSAGES )
				);
				break;
			case 'M':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::MONTH_ABBREVIATED_MESSAGES )
				);
				break;
			case 'm':
				$num = '2';
				break;
			case 'n':
				$num = '1,2';
				break;
			case 'Y':
				$num = '4';
				break;
			case 'xkY':
				$num = '4';
				break;
			case 'G':
				$num = '1,2';
				break;
			case 'H':
				$num = '2';
				break;
			case 'i':
				$num = '2';
				break;
			case 's':
				$num = '2';
				break;
			case '\\':
				// Backslash escaping
				if ( $p < $formatLength - 1 ) {
					$s .= preg_quote( $format[++$p], '/' );
				} else {
					$s .= preg_quote( '\\', '/' );
				}
				break;
			case '"':
				// Quoted literal
				if ( $p < $formatLength - 1 ) {
					$endQuote = strpos( $format, '"', $p + 1 );
					if ( $endQuote === false ) {
						// No terminating quote, assume literal "
						$s .= '"';
					} else {
						$s .= preg_quote( substr( $format, $p + 1, $endQuote - $p - 1 ), '/' );
						$p = $endQuote;
					}
				} else {
					// Quote at end of string, assume literal "
					$s .= '"';
				}
				break;
			default:
				// Copy whole characters together, instead of single bytes
				$char = mb_substr( mb_strcut( $format, $p, 4 ), 0, 1 );
				$s .= preg_quote( $char, '/' );
				$p += strlen( $char ) - 1;
		}
		if ( $num !== false ) {
			if ( $raw ) {
				$s .= '([0-9]{' . $num . '})';
				$raw = false;
			} else {
				$s .= '(' . $digitsRegexp . '{' . $num . '})';
			}
		}
		// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T308448)
		$s .= '[\\x{200E}\\x{200F}]?';
	}

	// self:: rather than static::, because the helper is private and cannot be overridden
	$tzRegexp = self::regexpAlternateGroup( array_keys( $tzAbbrs ) );

	// Hard-coded parentheses and space like in Parser::pstPass2
	// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T245784)
	// \uNNNN syntax can only be used from PHP 7.3
	return '/' . $s . ' [\\x{200E}\\x{200F}]?\\(' . $tzRegexp . '\\)/u';
}
317 | |
/**
 * Get a function that parses timestamps generated using the given date format, based on the result
 * of matching the regexp returned by getTimestampRegexp()
 *
 * The returned function takes the regexp match data (with or without PREG_OFFSET_CAPTURE
 * tuples) and returns either null, when MWTimestamp rejects the resulting date, or an array:
 * - DateTimeImmutable 'date' The parsed time, converted to UTC
 * - string|null 'warning' Set when the timezone abbreviation did not agree with the parsed time
 *
 * @param string $contLangVariant Content language variant
 * @param string $format Date format, as used by MediaWiki
 * @param array<int,string>|null $digits Localised digits from 0 to 9, e.g. `[ '0', '1', ..., '9' ]`
 * @param string $localTimezone Local timezone IANA name, e.g. `America/New_York`
 * @param array $tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
 *   for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ]
 * @return callable Parser function
 */
private function getTimestampParser(
	string $contLangVariant, string $format, ?array $digits, string $localTimezone, array $tzAbbrs
): callable {
	// Build the localised-digit => value map once, rather than on every conversion
	$digitMap = $digits ? array_flip( $digits ) : null;
	$untransformDigits = static function ( string $text ) use ( $digitMap ): int {
		return (int)( $digitMap ? strtr( $text, $digitMap ) : $text );
	};

	// Walk the format the same way getTimestampRegexp() does, recording which format code
	// produced each capturing group
	$formatLength = strlen( $format );
	$matchingGroups = [];
	for ( $p = 0; $p < $formatLength; $p++ ) {
		$code = $format[$p];
		if ( $code === 'x' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}
		if ( $code === 'xk' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}

		switch ( $code ) {
			case 'xx':
			case 'xn':
				// These add no capturing group to the regexp
				break;
			case 'xg':
			case 'd':
			case 'j':
			case 'D':
			case 'l':
			case 'F':
			case 'M':
			case 'm':
			case 'n':
			case 'Y':
			case 'xkY':
			case 'G':
			case 'H':
			case 'i':
			case 's':
				$matchingGroups[] = $code;
				break;
			case '\\':
				// Backslash escaping
				if ( $p < $formatLength - 1 ) {
					$p++;
				}
				break;
			case '"':
				// Quoted literal
				if ( $p < $formatLength - 1 ) {
					$endQuote = strpos( $format, '"', $p + 1 );
					if ( $endQuote !== false ) {
						$p = $endQuote;
					}
				}
				break;
			default:
				break;
		}
	}

	return function ( array $match ) use (
		$matchingGroups, $untransformDigits, $localTimezone, $tzAbbrs, $contLangVariant
	) {
		if ( is_array( $match[0] ) ) {
			// Strip PREG_OFFSET_CAPTURE data
			unset( $match['offset'] );
			$match = array_map( static function ( array $tuple ) {
				return $tuple[0];
			}, $match );
		}
		$year = 0;
		$monthIdx = 0;
		$day = 0;
		$hour = 0;
		$minute = 0;
		foreach ( $matchingGroups as $i => $code ) {
			// Group 0 is the whole match, so component groups start at index 1
			$text = $match[$i + 1];
			switch ( $code ) {
				case 'xg':
					$monthIdx = array_search(
						$text,
						$this->getMessages( $contLangVariant, Language::MONTH_GENITIVE_MESSAGES ),
						true
					);
					break;
				case 'd':
				case 'j':
					$day = $untransformDigits( $text );
					break;
				case 'D':
				case 'l':
					// Day of the week - unused
					break;
				case 'F':
					$monthIdx = array_search(
						$text,
						$this->getMessages( $contLangVariant, Language::MONTH_MESSAGES ),
						true
					);
					break;
				case 'M':
					$monthIdx = array_search(
						$text,
						$this->getMessages( $contLangVariant, Language::MONTH_ABBREVIATED_MESSAGES ),
						true
					);
					break;
				case 'm':
				case 'n':
					$monthIdx = $untransformDigits( $text ) - 1;
					break;
				case 'Y':
					$year = $untransformDigits( $text );
					break;
				case 'xkY':
					// Thai year
					$year = $untransformDigits( $text ) - 543;
					break;
				case 'G':
				case 'H':
					$hour = $untransformDigits( $text );
					break;
				case 'i':
					$minute = $untransformDigits( $text );
					break;
				case 's':
					// Seconds - unused, because most timestamp formats omit them
					break;
				default:
					throw new LogicException( 'Not implemented' );
			}
		}

		// The last matching group is the timezone abbreviation
		$tzAbbr = $tzAbbrs[ end( $match ) ];

		// Most of the time, the timezone abbreviation is not necessary to parse the date, since we
		// can assume all times are in the wiki's local timezone.
		$date = new DateTime();
		// setTimezone must be called before setDate/setTime
		$date->setTimezone( new DateTimeZone( $localTimezone ) );
		$date->setDate( $year, $monthIdx + 1, $day );
		$date->setTime( $hour, $minute, 0 );

		// But during the "fall back" at the end of DST, some times will happen twice.
		// Since the timezone abbreviation disambiguates the DST/non-DST times, we can detect
		// when PHP chose the wrong one, and then try the other one. It appears that PHP always
		// uses the later (non-DST) hour, but that behavior isn't documented, so we account for both.
		$dateWarning = null;
		if ( $date->format( 'T' ) !== $tzAbbr ) {
			$altDate = clone $date;
			if ( $date->format( 'I' ) ) {
				// Parsed time is DST, try non-DST by advancing one hour
				$altDate->add( new DateInterval( 'PT1H' ) );
			} else {
				// Parsed time is non-DST, try DST by going back one hour
				$altDate->sub( new DateInterval( 'PT1H' ) );
			}
			if ( $altDate->format( 'T' ) === $tzAbbr ) {
				$date = $altDate;
				$dateWarning = 'Timestamp has timezone abbreviation for the wrong time';
			} else {
				$dateWarning = 'Ambiguous time at DST switchover was parsed';
			}
		}

		// Now set the timezone back to UTC for formatting
		$date->setTimezone( new DateTimeZone( 'UTC' ) );
		$date = DateTimeImmutable::createFromMutable( $date );

		// We require the date to be compatible with our libraries, for example zero or negative years (T352455)
		// In PHP we need to check with MWTimestamp.
		// In JS we need to check with Moment.
		try {
			// @phan-suppress-next-line PhanNoopNew
			new MWTimestamp( $date->format( 'c' ) );
		} catch ( TimestampException $ex ) {
			return null;
		}

		return [
			'date' => $date,
			'warning' => $dateWarning,
		];
	};
}
515 | |
/**
 * Get a regexp that matches timestamps in the local date format, for each language variant.
 *
 * This calls getTimestampRegexp() with predefined data for the current wiki. The result has
 * one entry per variant of the content language, in the order (and with the keys) returned
 * by the language converter.
 *
 * @return string[] Regular expressions
 */
public function getLocalTimestampRegexps(): array {
	$variants = $this->languageConverterFactory
		->getLanguageConverter( $this->language )
		->getVariants();

	$regexps = [];
	foreach ( $variants as $index => $variant ) {
		// Character class matching any single localised digit of this variant
		$digitsRegexp = '[' . implode( '', $this->digits[$variant] ) . ']';
		$regexps[$index] = $this->getTimestampRegexp(
			$variant,
			$this->dateFormat[$variant],
			$digitsRegexp,
			$this->timezones[$variant]
		);
	}
	return $regexps;
}
534 | |
/**
 * Get a function that parses timestamps in the local date format, for each language variant,
 * based on the result of matching the regexp returned by getLocalTimestampRegexps().
 *
 * This calls getTimestampParser() with predefined data for the current wiki. Entries are
 * keyed the same way as the result of getLocalTimestampRegexps(), so a regexp and its parser
 * share an index.
 *
 * @return callable[] Parser functions
 */
private function getLocalTimestampParsers(): array {
	$variants = $this->languageConverterFactory
		->getLanguageConverter( $this->language )
		->getVariants();

	$parsers = [];
	foreach ( $variants as $index => $variant ) {
		$parsers[$index] = $this->getTimestampParser(
			$variant,
			$this->dateFormat[$variant],
			$this->digits[$variant],
			$this->localTimezone,
			$this->timezones[$variant]
		);
	}
	return $parsers;
}
555 | |
/**
 * Given a link node (`<a>`), if it's a link to a user-related page, return their username.
 *
 * Recognised targets are pages in the user and user talk namespaces (subpages are ignored)
 * and the contributions special page with a single subpage parameter.
 *
 * @param Element $link
 * @return array|null Array, or null:
 * - string 'username' Username
 * - string|null 'displayName' Display name (link text if link target was in the user namespace)
 */
private function getUsernameFromLink( Element $link ): ?array {
	// Selflink: use title of current page
	if ( DOMCompat::getClassList( $link )->contains( 'mw-selflink' ) ) {
		$title = $this->title;
	} else {
		$titleString = CommentUtils::getTitleFromUrl( $link->getAttribute( 'href' ) ?? '', $this->config ) ?? '';
		// Performance optimization, skip strings that obviously don't contain a namespace
		if ( $titleString === '' || !str_contains( $titleString, ':' ) ) {
			return null;
		}
		$title = $this->parseTitle( $titleString );
		if ( !$title ) {
			return null;
		}
	}

	$username = null;
	$displayName = null;
	$mainText = $title->getText();

	if ( $title->inNamespace( NS_USER ) || $title->inNamespace( NS_USER_TALK ) ) {
		$username = $mainText;
		if ( str_contains( $username, '/' ) ) {
			return null;
		}
		if ( $title->inNamespace( NS_USER ) ) {
			// Use regex trim for consistency with JS implementation
			$text = preg_replace( [ '/^[\s]+/u', '/[\s]+$/u' ], '', $link->textContent ?? '' );
			// Record the display name if it has been customised beyond changing case.
			// Compare against null/'' explicitly: a plain truthiness test would discard "0".
			if (
				$text !== null && $text !== '' &&
				mb_strtolower( $text ) !== mb_strtolower( $username )
			) {
				$displayName = $text;
			}
		}
	} elseif ( $title->inNamespace( NS_SPECIAL ) ) {
		$parts = explode( '/', $mainText );
		if ( count( $parts ) === 2 && $parts[0] === $this->specialContributionsName ) {
			// Normalize the username: users may link to their contributions with an unnormalized name
			$userpage = $this->titleParser->makeTitleValueSafe( NS_USER, $parts[1] );
			if ( !$userpage ) {
				return null;
			}
			$username = $userpage->getText();
		}
	}
	// Explicit checks rather than `!$username`, so that the username "0" is not rejected
	if ( $username === null || $username === '' ) {
		return null;
	}
	if ( IPUtils::isIPv6( $username ) ) {
		// Bot-generated links "Preceding unsigned comment added by" have non-standard case
		$username = strtoupper( $username );
	}
	return [
		'username' => $username,
		'displayName' => $displayName,
	];
}
620 | |
/**
 * Find a user signature preceding a timestamp.
 *
 * The signature includes the timestamp node.
 *
 * A signature must contain at least one link to the user's userpage, discussion page or
 * contributions (and may contain other links). The link may be nested in other elements.
 *
 * @param Text $timestampNode
 * @param Node|null $until Node to stop searching at
 * @return array Result, an associative array with the following keys:
 *   - Node[] `nodes` Sibling nodes comprising the signature, in reverse order (with
 *     $timestampNode or its parent node as the first element)
 *   - string|null `username` Username, null for unsigned comments
 *   - string|null `displayName` Customised link text of a userpage link, if one was found
 */
private function findSignature( Text $timestampNode, ?Node $until = null ): array {
	$sigUsername = null;
	$sigDisplayName = null;
	// Number of characters of (trimmed) text walked over so far
	$length = 0;
	// Furthest-back node known to belong to the signature
	$lastLinkNode = $timestampNode;

	CommentUtils::linearWalkBackwards(
		$timestampNode,
		function ( string $event, Node $node ) use (
			&$sigUsername, &$sigDisplayName, &$lastLinkNode, &$length,
			$until, $timestampNode
		) {
			// Each `return true` below ends the search
			// NOTE(review): assumes linearWalkBackwards() stops when the callback returns true - confirm
			if ( $event === 'enter' && $node === $until ) {
				return true;
			}
			// self:: rather than static::, because the constant is private
			if ( $length >= self::SIGNATURE_SCAN_LIMIT ) {
				return true;
			}
			if ( CommentUtils::isBlockElement( $node ) ) {
				// Don't allow reaching into preceding paragraphs
				return true;
			}

			if ( $event === 'leave' && $node !== $timestampNode ) {
				$length += $node instanceof Text ?
					mb_strlen( CommentUtils::htmlTrim( $node->textContent ?? '' ) ) : 0;
			}

			// Find the closest link before timestamp that links to the user's user page.
			//
			// Support timestamps being linked to the diff introducing the comment:
			// if the timestamp node is the only child of a link node, use the link node instead
			//
			// Handle links nested in formatting elements.
			if ( $event === 'leave' && $node instanceof Element && strtolower( $node->tagName ) === 'a' ) {
				$classList = DOMCompat::getClassList( $node );
				// Generated timestamp links sometimes look like username links (e.g. on user talk pages)
				// so ignore these.
				if ( !$classList->contains( 'ext-discussiontools-init-timestamplink' ) ) {
					$user = $this->getUsernameFromLink( $node );
					if ( $user ) {
						// Accept the first link to the user namespace, then only accept links to that user
						if ( $sigUsername === null ) {
							$sigUsername = $user['username'];
						}
						if ( $user['username'] === $sigUsername ) {
							$lastLinkNode = $node;
							// Null means "no custom display name"; any string (even "0") counts
							if ( $user['displayName'] !== null ) {
								$sigDisplayName = $user['displayName'];
							}
						}
					}
					// Keep looking if a node with links wasn't a link to a user page
					// "Doc James (talk · contribs · email)"
				}
			}
		}
	);

	$range = new ImmutableRange(
		$lastLinkNode->parentNode,
		CommentUtils::childIndexOf( $lastLinkNode ),
		$timestampNode->parentNode,
		CommentUtils::childIndexOf( $timestampNode ) + 1
	);

	// Expand the range so that it covers sibling nodes.
	// This will include any wrapping formatting elements as part of the signature.
	//
	// Helpful accidental feature: users whose signature is not detected in full (due to
	// text formatting) can just wrap it in a <span> to fix that.
	// "Ten Pound Hammer • (What did I screw up now?)"
	// "« Saper // dyskusja »"
	//
	// TODO Not sure if this is actually good, might be better to just use the range...
	$sigNodes = array_reverse( CommentUtils::getCoveredSiblings( $range ) );

	return [
		'nodes' => $sigNodes,
		'username' => $sigUsername,
		'displayName' => $sigDisplayName,
	];
}
719 | |
/**
 * TreeWalker filter callback that rejects nodes inside which comments (or section headings)
 * should not be detected; every other node is accepted.
 *
 * @param Node $node
 * @return int Appropriate NodeFilter constant
 */
public static function acceptOnlyNodesAllowingComments( Node $node ): int {
	if ( $node instanceof Element ) {
		// The table of contents has a heading that gets erroneously detected as a section
		if ( $node->getAttribute( 'id' ) === 'toc' ) {
			return NodeFilter::FILTER_REJECT;
		}

		$tagName = strtolower( $node->tagName );
		// Don't detect comments within quotes (T275881)
		if ( in_array( $tagName, [ 'blockquote', 'cite', 'q' ], true ) ) {
			return NodeFilter::FILTER_REJECT;
		}

		$classList = DOMCompat::getClassList( $node );
		// Don't attempt to parse blocks marked 'mw-notalk'
		if ( $classList->contains( 'mw-notalk' ) ) {
			return NodeFilter::FILTER_REJECT;
		}
		// Don't detect comments within references. We can't add replies to them without bungling up
		// the structure in some cases (T301213), and you're not supposed to do that anyway…
		// <ol class="references"> is the only reliably consistent thing between the two parsers
		if ( $tagName === 'ol' && $classList->contains( 'references' ) ) {
			return NodeFilter::FILTER_REJECT;
		}
	}

	// Don't detect comments within headings (but don't reject the headings themselves)
	$parent = $node->parentNode;
	$isHeadingChild = $parent instanceof Element && preg_match( '/^h([1-6])$/i', $parent->tagName );

	return $isHeadingChild ? NodeFilter::FILTER_REJECT : NodeFilter::FILTER_ACCEPT;
}
764 | |
/**
 * Convert a byte offset within a text node to a unicode codepoint offset.
 *
 * Counts the characters in the first $byteOffset bytes of the node's value; a missing
 * node value is treated as an empty string.
 *
 * @param Text $node Text node
 * @param int $byteOffset Byte offset
 * @return int Codepoint offset
 */
private static function getCodepointOffset( Text $node, int $byteOffset ): int {
	$value = $node->nodeValue ?? '';
	$leadingBytes = substr( $value, 0, $byteOffset );
	return mb_strlen( $leadingBytes );
}
775 | |
776 | /** |
777 | * Find a timestamp in a given text node |
778 | * |
779 | * @param Text $node |
780 | * @param string[] $timestampRegexps |
781 | * @return array|null Array with the following keys: |
782 | * - int 'offset' Length of extra text preceding the node that was used for matching (in bytes) |
783 | * - int 'parserIndex' Which of the regexps matched |
784 | * - array 'matchData' Regexp match data, which specifies the location of the match, |
785 | * and which can be parsed using getLocalTimestampParsers() (offsets are in bytes) |
786 | * - ImmutableRange 'range' Range covering the timestamp |
787 | */ |
788 | public function findTimestamp( Text $node, array $timestampRegexps ): ?array { |
789 | $nodeText = ''; |
790 | $offset = 0; |
791 | // Searched nodes (reverse order) |
792 | $nodes = []; |
793 | |
794 | while ( $node ) { |
795 | $nodeText = $node->nodeValue . $nodeText; |
796 | $nodes[] = $node; |
797 | |
798 | // In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML |
799 | // entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces, |
800 | // which apparently are often turned into entities by buggy editing tools. To handle |
801 | // this, we must piece together the text, so that our regexp can match those timestamps. |
802 | if ( |
803 | ( $previousSibling = $node->previousSibling ) && |
804 | $previousSibling instanceof Element && |
805 | $previousSibling->getAttribute( 'typeof' ) === 'mw:Entity' |
806 | ) { |
807 | $nodeText = $previousSibling->firstChild->nodeValue . $nodeText; |
808 | $offset += strlen( $previousSibling->firstChild->nodeValue ?? '' ); |
809 | $nodes[] = $previousSibling->firstChild; |
810 | |
811 | // If the entity is preceded by more text, do this again |
812 | if ( |
813 | $previousSibling->previousSibling && |
814 | $previousSibling->previousSibling instanceof Text |
815 | ) { |
816 | $offset += strlen( $previousSibling->previousSibling->nodeValue ?? '' ); |
817 | $node = $previousSibling->previousSibling; |
818 | } else { |
819 | $node = null; |
820 | } |
821 | } else { |
822 | $node = null; |
823 | } |
824 | } |
825 | |
826 | foreach ( $timestampRegexps as $i => $timestampRegexp ) { |
827 | $matchData = null; |
828 | // Allows us to mimic match.index in #getComments |
829 | if ( preg_match( $timestampRegexp, $nodeText, $matchData, PREG_OFFSET_CAPTURE ) ) { |
830 | $timestampLength = strlen( $matchData[0][0] ); |
831 | // Bytes at the end of the last node which aren't part of the match |
832 | $tailLength = strlen( $nodeText ) - $timestampLength - $matchData[0][1]; |
833 | // We are moving right to left, but we start to the right of the end of |
834 | // the timestamp if there is trailing garbage, so that is a negative offset. |
835 | $count = -$tailLength; |
836 | $endNode = $nodes[0]; |
837 | $endOffset = strlen( $endNode->nodeValue ?? '' ) - $tailLength; |
838 | |
839 | foreach ( $nodes as $n ) { |
840 | $count += strlen( $n->nodeValue ?? '' ); |
841 | // If we have counted to beyond the start of the timestamp, we are in the |
842 | // start node of the timestamp |
843 | if ( $count >= $timestampLength ) { |
844 | $startNode = $n; |
845 | // Offset is how much we overshot the start by |
846 | $startOffset = $count - $timestampLength; |
847 | break; |
848 | } |
849 | } |
850 | Assert::precondition( $endNode instanceof Node, 'endNode of timestamp is a Node' ); |
851 | Assert::precondition( $startNode instanceof Node, 'startNode of timestamp range found' ); |
852 | Assert::precondition( is_int( $startOffset ), 'startOffset of timestamp range found' ); |
853 | |
854 | $startOffset = static::getCodepointOffset( $startNode, $startOffset ); |
855 | $endOffset = static::getCodepointOffset( $endNode, $endOffset ); |
856 | |
857 | $range = new ImmutableRange( $startNode, $startOffset, $endNode, $endOffset ); |
858 | |
859 | return [ |
860 | 'matchData' => $matchData, |
861 | // Bytes at the start of the first node which aren't part of the match |
862 | // TODO: Remove this and use 'range' instead |
863 | 'offset' => $offset, |
864 | 'range' => $range, |
865 | 'parserIndex' => $i, |
866 | ]; |
867 | } |
868 | } |
869 | return null; |
870 | } |
871 | |
/**
 * Build the range covering a signature, from its first node up to the end of its timestamp.
 *
 * @param Node[] $sigNodes Signature nodes; the first entry is treated as the last node of the
 *   signature, and the final entry as its first node
 * @param array $match Timestamp match data, in the format returned by findTimestamp()
 * @param Text $node Text node in which the timestamp was found
 * @return ImmutableRange
 */
private function adjustSigRange( array $sigNodes, array $match, Text $node ): ImmutableRange {
	$lastSigNode = $sigNodes[0];
	$firstSigNode = end( $sigNodes );

	// TODO Document why this needs to be so complicated
	[ $matchedText, $matchStart ] = $match['matchData'][0];
	// Byte position of the end of the timestamp, with the extra text that was prepended for
	// matching subtracted again
	$endByteOffset = $matchStart + strlen( $matchedText ) - $match['offset'];

	if ( $lastSigNode === $node ) {
		// The signature ends inside the text node: end right after the timestamp
		$endContainer = $node;
		$endOffset = static::getCodepointOffset( $node, $endByteOffset );
	} else {
		// Otherwise end right after the last signature node, within its parent
		$endContainer = $lastSigNode->parentNode;
		$endOffset = CommentUtils::childIndexOf( $lastSigNode ) + 1;
	}

	return new ImmutableRange(
		$firstSigNode->parentNode,
		CommentUtils::childIndexOf( $firstSigNode ),
		$endContainer,
		$endOffset
	);
}
897 | |
/**
 * Walk the document below the root node and collect every heading and comment found in it.
 *
 * Heading elements (h1–h6) become ContentHeadingItems. Text nodes containing a timestamp that
 * has a signature with a username become ContentCommentItems, covering everything from the end
 * of the previous item up to the end of the "paragraph" containing the signature.
 *
 * @return ContentThreadItemSet Items in the order they were found
 */
private function buildThreadItems(): ContentThreadItemSet {
	$result = new ContentThreadItemSet();

	$timestampRegexps = $this->getLocalTimestampRegexps();
	$dfParsers = $this->getLocalTimestampParsers();

	// Last node of the previously found heading or comment
	$curCommentEnd = null;

	$treeWalker = new TreeWalker(
		$this->rootNode,
		NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
		[ static::class, 'acceptOnlyNodesAllowingComments' ]
	);
	while ( $node = $treeWalker->nextNode() ) {
		// Headings
		if ( $node instanceof Element && preg_match( '/^h([1-6])$/i', $node->tagName, $headingMatch ) ) {
			$headingNodeAndOffset = CommentUtils::getHeadlineNodeAndOffset( $node );
			$headingNode = $headingNodeAndOffset['node'];
			$headingRange = new ImmutableRange(
				$headingNode,
				$headingNodeAndOffset['offset'],
				$headingNode,
				$headingNode->childNodes->length
			);
			$headingItem = new ContentHeadingItem(
				$headingRange,
				$this->computeTranscludedFrom( $headingRange ),
				(int)( $headingMatch[ 1 ] )
			);
			$headingItem->setRootNode( $this->rootNode );
			$result->addThreadItem( $headingItem );
			$curCommentEnd = $node;
			continue;
		}

		// Comments: only text nodes containing a timestamp are of interest
		if ( !( $node instanceof Text ) ) {
			continue;
		}
		$match = $this->findTimestamp( $node, $timestampRegexps );
		if ( !$match ) {
			continue;
		}

		$foundSignature = $this->findSignature( $node, $curCommentEnd );
		$author = $foundSignature['username'];
		if ( !$author ) {
			// Ignore timestamps for which we couldn't find a signature. It's probably not a real
			// comment, but just a false match due to a copypasted timestamp.
			continue;
		}

		$warnings = [];
		$sigRanges = [ $this->adjustSigRange( $foundSignature['nodes'], $match, $node ) ];
		$timestampRanges = [ $match['range'] ];

		// Everything from the last comment up to here is the next comment
		$startNode = $this->nextInterestingLeafNode( $curCommentEnd );
		$endNode = $foundSignature['nodes'][0];

		// Skip to the end of the "paragraph". This only looks at tag names and can be fooled by CSS, but
		// avoiding that would be more difficult and slower.
		//
		// If this skips over another potential signature, also skip it in the main TreeWalker loop, to
		// avoid generating multiple comments when there is more than one signature on a single "line".
		// Often this is done when someone edits their comment later and wants to add a note about that.
		// (Or when another person corrects a typo, or strikes out a comment, etc.) Multiple comments
		// within one paragraph/list-item result in a confusing double "Reply" button, and we also have
		// no way to indicate which one you're replying to (this might matter in the future for
		// notifications or something).
		CommentUtils::linearWalk(
			$endNode,
			function ( string $event, Node $n ) use (
				&$endNode, &$sigRanges, &$timestampRanges,
				$treeWalker, $timestampRegexps, $node
			) {
				if ( CommentUtils::isBlockElement( $n ) || CommentUtils::isCommentSeparator( $n ) ) {
					// Stop when entering or leaving a block node
					return true;
				}
				if ( $event === 'leave' ) {
					if ( $n instanceof Text && $n !== $node ) {
						$extraMatch = $this->findTimestamp( $n, $timestampRegexps );
						if ( $extraMatch ) {
							// If this skips over another potential signature, also skip it in the main
							// TreeWalker loop
							$treeWalker->currentNode = $n;
							// …and add it as another signature to this comment (regardless of the author
							// and timestamp)
							$extraSignature = $this->findSignature( $n, $node );
							if ( $extraSignature['username'] ) {
								$sigRanges[] = $this->adjustSigRange( $extraSignature['nodes'], $extraMatch, $n );
								$timestampRanges[] = $extraMatch['range'];
							}
						}
					}
					// Take the last complete node which we skipped past
					$endNode = $n;
				}
			}
		);

		if ( $endNode instanceof Text ) {
			// End before any trailing whitespace of the text node
			$length = mb_strlen( rtrim( $endNode->nodeValue ?? '', "\t\n\f\r " ) );
		} else {
			// PHP bug: childNodes can be null for comment nodes
			// (it should always be a NodeList, even if the node can't have children)
			$length = $endNode->childNodes ? $endNode->childNodes->length : 0;
		}
		$range = new ImmutableRange(
			$startNode->parentNode,
			CommentUtils::childIndexOf( $startNode ),
			$endNode,
			$length
		);
		$transcludedFrom = $this->computeTranscludedFrom( $range );

		$startLevel = CommentUtils::getIndentLevel( $startNode, $this->rootNode ) + 1;
		$endLevel = CommentUtils::getIndentLevel( $node, $this->rootNode ) + 1;
		if ( $startLevel !== $endLevel ) {
			$warnings[] = 'Comment starts and ends with different indentation';
		}
		// Should this use the indent level of $startNode or $node?
		$level = min( $startLevel, $endLevel );

		$parserResult = $dfParsers[ $match['parserIndex'] ]( $match['matchData'] );
		if ( !$parserResult ) {
			// The timestamp matched the regexp, but the parser produced no result for it
			continue;
		}
		$dateTime = $parserResult['date'];
		$dateWarning = $parserResult['warning'];
		if ( $dateWarning ) {
			$warnings[] = $dateWarning;
		}

		$commentItem = new ContentCommentItem(
			$level,
			$range,
			$transcludedFrom,
			$sigRanges,
			$timestampRanges,
			$dateTime,
			$author,
			$foundSignature['displayName']
		);
		$commentItem->setRootNode( $this->rootNode );
		if ( $warnings ) {
			$commentItem->addWarnings( $warnings );
		}
		if ( $result->isEmpty() ) {
			// Add a fake placeholder heading if there are any comments in the 0th section
			// (before the first real heading)
			$placeholderRange = new ImmutableRange( $this->rootNode, 0, $this->rootNode, 0 );
			$fakeHeading = new ContentHeadingItem( $placeholderRange, false, null );
			$fakeHeading->setRootNode( $this->rootNode );
			$result->addThreadItem( $fakeHeading );
		}
		$result->addThreadItem( $commentItem );
		$curCommentEnd = $commentItem->getRange()->endContainer;
	}

	return $result;
}
1046 | |
/**
 * Get the name of the page from which this thread item is transcluded (if any). Replies to
 * transcluded items must be posted on that page, instead of the current one.
 *
 * This is tricky, because we don't want to mark items as transcluded when they're just using a
 * template (e.g. {{ping|…}} or a non-substituted signature template). Sometimes the whole comment
 * can be template-generated (e.g. when using some wrapper templates), but as long as a reply can
 * be added outside of that template, we should not treat it as transcluded.
 *
 * The start/end boundary points of comment ranges and Parsoid transclusion ranges don't line up
 * exactly, even when to a human it's obvious that they cover the same content, making this more
 * complicated.
 *
 * @param ImmutableRange $commentRange Range covered by the thread item
 * @return string|bool `false` if this item is not transcluded. A string if it's transcluded
 *   from a single page (the page title, in text form with spaces). `true` if it's transcluded, but
 *   we can't determine the source.
 * @throws LogicException If the comparison of the comment range with a transclusion range gives
 *   a result that should be impossible here
 */
public function computeTranscludedFrom( ImmutableRange $commentRange ) {
	// Collapsed ranges should otherwise be impossible, but they're not (T299583)
	// TODO: See if we can fix the root cause, and remove this?
	if ( $commentRange->collapsed ) {
		return false;
	}

	// General approach:
	//
	// Compare the comment range to each transclusion range on the page, and if it overlaps any of
	// them, examine the overlap. There are a few cases:
	//
	// * Comment and transclusion do not overlap:
	//   → Not transcluded.
	// * Comment contains the transclusion:
	//   → Not transcluded (just a template).
	// * Comment is contained within the transclusion:
	//   → Transcluded, we can determine the source page (unless it's a complex transclusion).
	// * Comment and transclusion overlap partially:
	//   → Transcluded, but we can't determine the source page.
	// * Comment (almost) exactly matches the transclusion:
	//   → Maybe transcluded (it could be that the source page only contains that single comment),
	//     maybe not transcluded (it could be a wrapper template that covers a single comment).
	//     This is very sad, and we decide based on the namespace.
	//
	// Most transclusion ranges on the page trivially fall in the "do not overlap" or "contains"
	// cases, and we only have to carefully examine the two transclusion ranges that contain the
	// first and last node of the comment range.
	//
	// To check for almost exact matches, we walk between the relevant boundary points, and if we
	// only find uninteresting nodes (that would be ignored when detecting comments), we treat them
	// like exact matches.

	$startTransclNode = CommentUtils::getTranscludedFromElement(
		CommentUtils::getRangeFirstNode( $commentRange )
	);
	$endTransclNode = CommentUtils::getTranscludedFromElement(
		CommentUtils::getRangeLastNode( $commentRange )
	);

	// We only have to examine the two transclusion ranges that contain the first/last node of the
	// comment range (if they exist). Ignore ranges outside the comment or in the middle of it.
	$transclNodes = [];
	if ( $startTransclNode ) {
		$transclNodes[] = $startTransclNode;
	}
	if ( $endTransclNode && $endTransclNode !== $startTransclNode ) {
		$transclNodes[] = $endTransclNode;
	}

	foreach ( $transclNodes as $transclNode ) {
		// getTransclusionRange() is an instance method, call it like the other instance methods
		$transclRange = $this->getTransclusionRange( $transclNode );
		$compared = CommentUtils::compareRanges( $commentRange, $transclRange );
		$transclTitles = $this->getTransclusionTitles( $transclNode );
		// Only set when the transclusion consists of exactly one page with a parseable title
		$simpleTransclTitle = count( $transclTitles ) === 1 && $transclTitles[0] !== null ?
			$this->parseTitle( $transclTitles[0] ) : null;

		switch ( $compared ) {
			case 'equal':
				// Comment (almost) exactly matches the transclusion
				if ( $simpleTransclTitle === null ) {
					// Allow replying to some accidental complex transclusions consisting of only templates
					// and wikitext (T313093)
					if ( count( $transclTitles ) > 1 ) {
						foreach ( $transclTitles as $transclTitleString ) {
							if ( $transclTitleString !== null ) {
								$transclTitle = $this->parseTitle( $transclTitleString );
								if ( $transclTitle && !$transclTitle->inNamespace( NS_TEMPLATE ) ) {
									return true;
								}
							}
						}
						// Continue examining the other ranges.
						break;
					}
					// Multi-template transclusion, or a parser function call, or template-affected wikitext outside
					// of a template call, or a mix of the above
					return true;

				} elseif ( $simpleTransclTitle->inNamespace( NS_TEMPLATE ) ) {
					// Is that a subpage transclusion with a single comment, or a wrapper template
					// transclusion on this page? We don't know, but let's guess based on the namespace.
					// (T289873)
					// Continue examining the other ranges.
					break;
				} elseif ( !$this->titleCanExist( $simpleTransclTitle ) ) {
					// Special page transclusion (T344622) or something else weird. Don't return the title,
					// since it's useless for replying, and can't be stored in the permalink database.
					return true;
				} else {
					Assert::precondition( $transclTitles[0] !== null, "Simple transclusion found" );
					return strtr( $transclTitles[0], '_', ' ' );
				}

			case 'contains':
				// Comment contains the transclusion

				// If the entire transclusion is contained within the comment range, that's just a
				// template. This is the same as a transclusion in the middle of the comment, which we
				// ignored earlier, it just takes us longer to get here in this case.

				// Continue examining the other ranges.
				break;

			case 'contained':
				// Comment is contained within the transclusion
				if ( $simpleTransclTitle === null ) {
					return true;
				} elseif ( !$this->titleCanExist( $simpleTransclTitle ) ) {
					// Special page transclusion (T344622) or something else weird. Don't return the title,
					// since it's useless for replying, and can't be stored in the permalink database.
					return true;
				} else {
					Assert::precondition( $transclTitles[0] !== null, "Simple transclusion found" );
					return strtr( $transclTitles[0], '_', ' ' );
				}

			case 'after':
			case 'before':
				// Comment and transclusion do not overlap

				// This should be impossible, because we ignored these ranges earlier.
				throw new LogicException( 'Unexpected transclusion or comment range' );

			case 'overlapstart':
			case 'overlapend':
				// Comment and transclusion overlap partially
				return true;

			default:
				throw new LogicException( 'Unexpected return value from compareRanges()' );
		}
	}

	// If we got here, the comment range was not contained by or overlapping any of the transclusion
	// ranges. Comment is not transcluded.
	return false;
}
1202 | |
/**
 * Check that a title is in namespace NS_MAIN or above, is not external, and has non-empty text.
 *
 * @param TitleValue $title
 * @return bool
 */
private function titleCanExist( TitleValue $title ): bool {
	if ( $title->getNamespace() < NS_MAIN ) {
		return false;
	}
	if ( $title->isExternal() ) {
		return false;
	}
	return $title->getText() !== '';
}
1208 | |
/**
 * Parse a title string using the title parser.
 *
 * @param string $titleString
 * @return TitleValue|null The parsed title, or null if the parser rejected it as malformed
 */
private function parseTitle( string $titleString ): ?TitleValue {
	$title = null;
	try {
		$title = $this->titleParser->parseTitle( $titleString );
	} catch ( MalformedTitleException $err ) {
		// Malformed titles are reported to the caller as null
	}
	return $title;
}
1216 | |
/**
 * Return the page titles for each part of the transclusion, or nulls for each part that isn't
 * transcluded from another page.
 *
 * If the node represents a single-page transclusion, this will return an array containing a
 * single string.
 *
 * The titles are read from the 'parts' list in the node's 'data-mw' attribute. If the attribute
 * is missing, is not valid JSON, or has no 'parts', an empty array is returned.
 *
 * @param Element $node
 * @return array<string|null>
 */
private function getTransclusionTitles( Element $node ): array {
	$dataMw = json_decode( $node->getAttribute( 'data-mw' ) ?? '', true );
	$out = [];

	foreach ( $dataMw['parts'] ?? [] as $part ) {
		if (
			!is_string( $part ) &&
			// 'href' will be unset if this is a parser function rather than a template
			isset( $part['template']['target']['href'] )
		) {
			$parsoidHref = $part['template']['target']['href'];
			Assert::precondition( substr( $parsoidHref, 0, 2 ) === './', "href has valid format" );
			// The href is a URL path, not form-encoded data, so a '+' in it is a literal plus sign.
			// urldecode() would turn it into a space (breaking titles like "C++"), so decode only
			// the percent-escapes.
			$out[] = rawurldecode( substr( $parsoidHref, 2 ) );
		} else {
			$out[] = null;
		}
	}

	return $out;
}
1247 | |
/**
 * Given a transclusion's first node (e.g. returned by CommentUtils::getTranscludedFromElement()),
 * return a range starting before the node and ending after the transclusion's last node.
 *
 * The last node is found by following next siblings for as long as they are elements with the
 * same 'about' attribute as the element before them.
 *
 * @param Element $startNode
 * @return ImmutableRange
 */
private function getTransclusionRange( Element $startNode ): ImmutableRange {
	$endNode = $startNode;
	for (
		$sibling = $startNode->nextSibling;
		$sibling instanceof Element &&
			$sibling->getAttribute( 'about' ) === $endNode->getAttribute( 'about' );
		$sibling = $sibling->nextSibling
	) {
		$endNode = $sibling;
	}

	return new ImmutableRange(
		$startNode->parentNode,
		CommentUtils::childIndexOf( $startNode ),
		$endNode->parentNode,
		CommentUtils::childIndexOf( $endNode ) + 1
	);
}
1277 | |
/**
 * Shorten a user generated part of an ID, so that the full ID always fits within a database
 * field of length 255.
 *
 * @param string $text Text
 * @return string Truncated text, with nothing appended to mark the truncation
 */
private function truncateForId( string $text ): string {
	$limit = 80;
	$ellipsis = '';

	return $this->language->truncateForDatabase( $text, $limit, $ellipsis );
}
1287 | |
/**
 * Given a thread item, return an identifier for it that is unique within the page.
 *
 * The ID is built from the item's own heading anchor or author and timestamp, followed by the
 * same information for its parent, and (for headings) the timestamp of the oldest reply. If the
 * result is already taken in $previousItems, a sequential number is appended.
 *
 * @param ContentThreadItem $threadItem
 * @param ContentThreadItemSet $previousItems Items whose IDs have already been assigned
 * @return string
 */
private function computeId( ContentThreadItem $threadItem, ContentThreadItemSet $previousItems ): string {
	// ID fragment for a real (non-placeholder) heading
	$headingPart = function ( ContentHeadingItem $heading ): string {
		// <span class="mw-headline" …>, or <hN …> in Parsoid HTML
		$headline = $heading->getRange()->startContainer;
		Assert::precondition( $headline instanceof Element, 'HeadingItem refers to an element node' );
		return $this->truncateForId( $headline->getAttribute( 'id' )
			?: $headline->getAttribute( 'data-mw-anchor' ) ?? '' );
	};
	// ID fragment for a comment
	$commentPart = function ( ContentCommentItem $comment ): string {
		return $this->truncateForId( str_replace( ' ', '_', $comment->getAuthor() ) ) .
			'-' . $comment->getTimestampString();
	};

	if ( $threadItem instanceof ContentHeadingItem ) {
		// For the placeholder heading, the range points to the root node, using it like for real
		// headings results in silly values
		$id = $threadItem->isPlaceholderHeading() ? 'h-' : 'h-' . $headingPart( $threadItem );
	} elseif ( $threadItem instanceof ContentCommentItem ) {
		$id = 'c-' . $commentPart( $threadItem );
	} else {
		throw new InvalidArgumentException( 'Unknown ThreadItem type' );
	}

	// If there would be multiple comments with the same ID (i.e. the user left multiple comments
	// in one edit, or within a minute), add the parent ID to disambiguate them.
	$parentItem = $threadItem->getParent();
	if ( $parentItem instanceof ContentHeadingItem && !$parentItem->isPlaceholderHeading() ) {
		$id .= '-' . $headingPart( $parentItem );
	} elseif ( $parentItem instanceof ContentCommentItem ) {
		$id .= '-' . $commentPart( $parentItem );
	}

	if ( $threadItem instanceof ContentHeadingItem ) {
		// To avoid old threads re-appearing on popular pages when someone uses a vague title
		// (e.g. dozens of threads titled "question" on [[Wikipedia:Help desk]]: https://w.wiki/fbN),
		// include the oldest timestamp in the thread (i.e. date the thread was started) in the
		// heading ID.
		$oldestComment = $threadItem->getOldestReply();
		if ( $oldestComment ) {
			$id .= '-' . $oldestComment->getTimestampString();
		}
	}

	if ( $previousItems->findCommentById( $id ) ) {
		// Well, that's tough
		$threadItem->addWarning( 'Duplicate comment ID' );
		// Finally, disambiguate by adding sequential numbers, to allow replying to both comments
		$number = 1;
		$candidate = "$id-$number";
		while ( $previousItems->findCommentById( $candidate ) ) {
			$number++;
			$candidate = "$id-$number";
		}
		$id = $candidate;
	}

	return $id;
}
1348 | |
/**
 * Given a thread item, return an identifier for it that is consistent across all pages and
 * revisions where this comment might appear.
 *
 * Multiple comments on a page can have the same name; use ID to distinguish them.
 *
 * The name of a comment is based on its own author and timestamp. The name of a heading is
 * based on those of its oldest reply, and is just the bare prefix if there is none.
 *
 * @param ContentThreadItem $threadItem
 * @return string
 */
private function computeName( ContentThreadItem $threadItem ): string {
	if ( $threadItem instanceof ContentHeadingItem ) {
		$prefix = 'h-';
		$mainComment = $threadItem->getOldestReply();
	} elseif ( $threadItem instanceof ContentCommentItem ) {
		$prefix = 'c-';
		$mainComment = $threadItem;
	} else {
		throw new InvalidArgumentException( 'Unknown ThreadItem type' );
	}

	if ( !$mainComment ) {
		return $prefix;
	}

	$author = $this->truncateForId( str_replace( ' ', '_', $mainComment->getAuthor() ) );
	return $prefix . $author . '-' . $mainComment->getTimestampString();
}
1375 | |
/**
 * Connect the thread items into trees, by setting the parent and replies of each item based on
 * the order and the levels of the items.
 *
 * @param ContentThreadItemSet $result Items to connect; they are modified in place
 */
private function buildThreads( ContentThreadItemSet $result ): void {
	$lastHeading = null;
	// Most recent item seen at each level, indexed by level
	$replies = [];

	foreach ( $result->getThreadItems() as $threadItem ) {
		$level = $threadItem->getLevel();

		if ( count( $replies ) < $level ) {
			// Someone skipped an indentation level (or several). Pretend that the previous reply
			// covers multiple indentation levels, so that following comments get connected to it.
			$threadItem->addWarning( 'Comment skips indentation level' );
			$replies = array_pad( $replies, $level, end( $replies ) );
		}

		if ( $threadItem instanceof ContentHeadingItem ) {
			// New root (thread)
			// Attach as a sub-thread to preceding higher-level heading.
			// Any replies will appear in the tree twice, under the main-thread and the sub-thread.
			$maybeParent = $lastHeading;
			while ( $maybeParent && $maybeParent->getHeadingLevel() >= $threadItem->getHeadingLevel() ) {
				$maybeParent = $maybeParent->getParent();
			}
			if ( $maybeParent ) {
				$threadItem->setParent( $maybeParent );
				$maybeParent->addReply( $threadItem );
			}
			$lastHeading = $threadItem;
		} elseif ( isset( $replies[ $level - 1 ] ) ) {
			// Add as a reply to the closest less-nested comment
			$threadItem->setParent( $replies[ $level - 1 ] );
			$threadItem->getParent()->addReply( $threadItem );
		} else {
			$threadItem->addWarning( 'Comment could not be connected to a thread' );
		}

		$replies[ $level ] = $threadItem;
		// Cut off more deeply nested replies
		array_splice( $replies, $level + 1 );
	}
}
1416 | |
/**
 * Set the IDs and names used to refer to comments and headings.
 * This has to be a separate pass because we don't have the list of replies before
 * this point.
 *
 * Each item is registered in the set's ID and name maps right after being processed, and the
 * set is passed to computeId() to detect IDs that are already taken.
 *
 * @param ContentThreadItemSet $result Items to name; they are modified in place
 */
private function computeIdsAndNames( ContentThreadItemSet $result ): void {
	foreach ( $result->getThreadItems() as $item ) {
		$item->setName( $this->computeName( $item ) );
		$item->setId( $this->computeId( $item, $result ) );

		$result->updateIdAndNameMaps( $item );
	}
}
1433 | } |