Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
94.09% |
653 / 694 |
|
65.38% |
17 / 26 |
CRAP | |
0.00% |
0 / 1 |
CommentParser | |
94.09% |
653 / 694 |
|
65.38% |
17 / 26 |
260.68 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
1 | |||
parse | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
nextInterestingLeafNode | |
95.45% |
21 / 22 |
|
0.00% |
0 / 1 |
8 | |||
regexpAlternateGroup | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getMessages | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getTimestampRegexp | |
88.54% |
85 / 96 |
|
0.00% |
0 / 1 |
32.45 | |||
getTimestampParser | |
93.85% |
122 / 130 |
|
0.00% |
0 / 1 |
52.63 | |||
getLocalTimestampRegexps | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
getLocalTimestampParsers | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
getUsernameFromLink | |
97.06% |
33 / 34 |
|
0.00% |
0 / 1 |
17 | |||
findSignature | |
100.00% |
43 / 43 |
|
100.00% |
1 / 1 |
15 | |||
acceptOnlyNodesAllowingComments | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
11 | |||
getCodepointOffset | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
findTimestamp | |
100.00% |
45 / 45 |
|
100.00% |
1 / 1 |
11 | |||
adjustSigRange | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
3 | |||
buildThreadItems | |
98.96% |
95 / 96 |
|
0.00% |
0 / 1 |
22 | |||
computeTranscludedFrom | |
69.23% |
36 / 52 |
|
0.00% |
0 / 1 |
45.69 | |||
titleCanExist | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
parseTitle | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getTransclusionTitles | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
getTransclusionRange | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
5 | |||
truncateForId | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
computeId | |
96.30% |
26 / 27 |
|
0.00% |
0 / 1 |
13 | |||
computeName | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
4.01 | |||
buildThreads | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
9 | |||
computeIdsAndNames | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\DiscussionTools; |
4 | |
5 | use DateInterval; |
6 | use DateTime; |
7 | use DateTimeImmutable; |
8 | use DateTimeZone; |
9 | use InvalidArgumentException; |
10 | use LogicException; |
11 | use MediaWiki\Config\Config; |
12 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentCommentItem; |
13 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentHeadingItem; |
14 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentThreadItem; |
15 | use MediaWiki\Language\Language; |
16 | use MediaWiki\Languages\LanguageConverterFactory; |
17 | use MediaWiki\Title\MalformedTitleException; |
18 | use MediaWiki\Title\TitleParser; |
19 | use MediaWiki\Title\TitleValue; |
20 | use MediaWiki\Utils\MWTimestamp; |
21 | use RuntimeException; |
22 | use Wikimedia\Assert\Assert; |
23 | use Wikimedia\IPUtils; |
24 | use Wikimedia\Parsoid\DOM\Element; |
25 | use Wikimedia\Parsoid\DOM\Node; |
26 | use Wikimedia\Parsoid\DOM\Text; |
27 | use Wikimedia\Parsoid\Utils\DOMCompat; |
28 | use Wikimedia\Parsoid\Utils\DOMUtils; |
29 | use Wikimedia\Timestamp\TimestampException; |
30 | |
31 | // TODO consider making timestamp parsing not a returned function |
32 | |
33 | class CommentParser { |
34 | |
35 | /** |
36 | * How far backwards we look for a signature associated with a timestamp before giving up. |
37 | * Note that this is not a hard limit on the length of signatures we detect. |
38 | */ |
39 | private const SIGNATURE_SCAN_LIMIT = 100; |
40 | |
41 | private Config $config; |
42 | private Language $language; |
43 | private LanguageConverterFactory $languageConverterFactory; |
44 | private TitleParser $titleParser; |
45 | |
46 | /** @var string[] */ |
47 | private array $dateFormat; |
48 | /** @var string[][] */ |
49 | private array $digits; |
50 | /** @var string[][] */ |
51 | private $contLangMessages; |
52 | private string $localTimezone; |
53 | /** @var string[][] */ |
54 | private array $timezones; |
55 | private string $specialContributionsName; |
56 | |
57 | private Element $rootNode; |
58 | private TitleValue $title; |
59 | |
/**
 * Store the injected services and unpack the local language data.
 *
 * @param Config $config
 * @param Language $language Content language
 * @param LanguageConverterFactory $languageConverterFactory
 * @param LanguageData $languageData Source of the local data array; its 'dateFormat',
 *  'digits', 'contLangMessages', 'localTimezone', 'timezones' and
 *  'specialContributionsName' entries are copied onto this object
 * @param TitleParser $titleParser
 */
public function __construct(
	Config $config,
	Language $language,
	LanguageConverterFactory $languageConverterFactory,
	LanguageData $languageData,
	TitleParser $titleParser
) {
	// Injected services
	$this->config = $config;
	$this->language = $language;
	$this->languageConverterFactory = $languageConverterFactory;
	$this->titleParser = $titleParser;

	// Unpack the local data in one go; entries are assigned in the order listed
	[
		'dateFormat' => $this->dateFormat,
		'digits' => $this->digits,
		'contLangMessages' => $this->contLangMessages,
		'localTimezone' => $this->localTimezone,
		'timezones' => $this->timezones,
		'specialContributionsName' => $this->specialContributionsName,
	] = $languageData->getLocalData();
}
87 | |
/**
 * Parse a discussion page.
 *
 * Remembers the root node and title on this object, then runs the three passes in order:
 * buildThreadItems(), buildThreads() and computeIdsAndNames().
 *
 * @param Element $rootNode Root node of content to parse
 * @param TitleValue $title Title of the page being parsed
 * @return ContentThreadItemSet
 */
public function parse( Element $rootNode, TitleValue $title ): ContentThreadItemSet {
	// The helper methods read these from the object rather than taking them as arguments
	$this->rootNode = $rootNode;
	$this->title = $title;

	$threadItemSet = $this->buildThreadItems();
	// Both of these operate on the item set built above
	$this->buildThreads( $threadItemSet );
	$this->computeIdsAndNames( $threadItemSet );

	return $threadItemSet;
}
105 | |
/**
 * Return the next leaf node in the tree order that is likely a part of a discussion comment,
 * rather than some boring "separator" element.
 *
 * A node is returned when CommentUtils::isCommentContent() accepts it. Comment separators
 * (CommentUtils::isCommentSeparator()) and rendering-transparent nodes
 * (CommentUtils::isRenderingTransparentNode()) are rejected, as are the starting node and
 * its direct children.
 *
 * @param ?Node $node Node after which to start searching
 *  (if null, start at the beginning of the document).
 * @return Node
 */
private function nextInterestingLeafNode( ?Node $node ): Node {
	$filter = static function ( $candidate ) use ( $node ) {
		$shouldReject =
			// The starting node itself, or one of its direct children
			$candidate === $node ||
			$candidate->parentNode === $node ||
			// Elements usually used as separators or headers
			CommentUtils::isCommentSeparator( $candidate ) ||
			// Nodes with no rendering that mess up our indentation detection
			CommentUtils::isRenderingTransparentNode( $candidate );
		if ( $shouldReject ) {
			return NodeFilter::FILTER_REJECT;
		}
		return CommentUtils::isCommentContent( $candidate ) ?
			NodeFilter::FILTER_ACCEPT :
			NodeFilter::FILTER_SKIP;
	};

	$walker = new TreeWalker(
		$this->rootNode,
		NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
		$filter
	);
	if ( $node ) {
		// Resume the walk from the given node instead of the root
		$walker->currentNode = $node;
	}
	$walker->nextNode();

	// NOTE(review): this inspects currentNode rather than the return value of nextNode().
	// If TreeWalker (not visible here) follows the DOM specification, currentNode stays on
	// the starting node when nothing is found, so this exception would never fire and the
	// starting node would be returned instead - confirm before relying on the exception.
	$found = $walker->currentNode;
	if ( !$found ) {
		throw new RuntimeException( 'nextInterestingLeafNode not found' );
	}
	return $found;
}
151 | |
/**
 * Build a capturing group that matches any one of the given literal strings.
 *
 * Every value is escaped with preg_quote() for use inside a `/`-delimited pattern.
 *
 * @param string[] $values Values to match
 * @return string Regular expression
 */
private static function regexpAlternateGroup( array $values ): string {
	$escaped = array_map(
		static fn ( string $value ) => preg_quote( $value, '/' ),
		$values
	);
	return '(' . implode( '|', $escaped ) . ')';
}
161 | |
/**
 * Get text of localisation messages in content language.
 *
 * Each key is looked up in the stored message table for the given variant; the result
 * has the same keys and order as $messages.
 *
 * @param string $contLangVariant Content language variant
 * @param string[] $messages Message keys
 * @return string[] Message values
 */
private function getMessages( string $contLangVariant, array $messages ): array {
	return array_map(
		fn ( string $key ) => $this->contLangMessages[$contLangVariant][$key],
		$messages
	);
}
174 | |
/**
 * Get a regexp that matches timestamps generated using the given date format.
 *
 * This only supports format characters that are used by the default date format in any of
 * MediaWiki's languages, namely: D, d, F, G, H, i, j, l, M, n, Y, xg, xkY (and escape characters),
 * and only dates when MediaWiki existed, let's say 2000 onwards (Thai dates before 1941 are
 * complicated).
 *
 * The pattern has one capturing group per name or number format code, in format order,
 * followed by a final capturing group for the timezone abbreviation.
 *
 * @param string $contLangVariant Content language variant
 * @param string $format Date format
 * @param string $digitsRegexp Regular expression matching a single localised digit, e.g. '[0-9]'
 * @param array $tzAbbrs Associative array mapping localised timezone abbreviations to
 *  IANA abbreviations, for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ]
 * @return string Regular expression
 */
private function getTimestampRegexp(
	string $contLangVariant, string $format, string $digitsRegexp, array $tzAbbrs
): string {
	// Format codes that match one of a fixed list of localised names => message keys
	$nameLists = [
		'xg' => Language::MONTH_GENITIVE_MESSAGES,
		'D' => Language::WEEKDAY_ABBREVIATED_MESSAGES,
		'l' => Language::WEEKDAY_MESSAGES,
		'F' => Language::MONTH_MESSAGES,
		'M' => Language::MONTH_ABBREVIATED_MESSAGES,
	];
	// Format codes that match a run of digits => regexp repetition count
	$digitCounts = [
		'd' => '2',
		'j' => '1,2',
		'm' => '2',
		'n' => '1,2',
		'Y' => '4',
		'xkY' => '4',
		'G' => '1,2',
		'H' => '2',
		'i' => '2',
		's' => '2',
	];
	// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T308448)
	$directionMarks = '[\\x{200E}\\x{200F}]?';

	$pattern = '';
	// Set by 'xn': the next number is matched as ASCII digits instead of localised ones
	$rawDigits = false;
	$end = strlen( $format );
	$lastIndex = $end - 1;
	$pos = 0;
	// Adapted from Language::sprintfDate()
	while ( $pos < $end ) {
		// Read a one-, two- ('x?') or three-byte ('xk?') format code
		$code = $format[$pos];
		if ( $code === 'x' && $pos < $lastIndex ) {
			$code .= $format[++$pos];
		}
		if ( $code === 'xk' && $pos < $lastIndex ) {
			$code .= $format[++$pos];
		}

		if ( isset( $digitCounts[$code] ) ) {
			$digitClass = $rawDigits ? '[0-9]' : $digitsRegexp;
			$pattern .= '(' . $digitClass . '{' . $digitCounts[$code] . '})';
			$rawDigits = false;
		} elseif ( isset( $nameLists[$code] ) ) {
			$pattern .= static::regexpAlternateGroup(
				$this->getMessages( $contLangVariant, $nameLists[$code] )
			);
		} elseif ( $code === 'xx' ) {
			// Escaped literal 'x'
			$pattern .= 'x';
		} elseif ( $code === 'xn' ) {
			$rawDigits = true;
		} elseif ( $code === '\\' ) {
			// Backslash escaping: the next byte is literal (or the backslash itself at the end)
			$pattern .= preg_quote( $pos < $lastIndex ? $format[++$pos] : '\\', '/' );
		} elseif ( $code === '"' ) {
			// Quoted literal
			$closingQuote = $pos < $lastIndex ? strpos( $format, '"', $pos + 1 ) : false;
			if ( $closingQuote === false ) {
				// Quote at end of string or no terminating quote, assume literal "
				$pattern .= '"';
			} else {
				$pattern .= preg_quote( substr( $format, $pos + 1, $closingQuote - $pos - 1 ), '/' );
				$pos = $closingQuote;
			}
		} else {
			// Copy whole characters together, instead of single bytes
			$char = mb_substr( mb_strcut( $format, $pos, 4 ), 0, 1 );
			$pattern .= preg_quote( $char, '/' );
			$pos += strlen( $char ) - 1;
		}

		$pattern .= $directionMarks;
		$pos++;
	}

	$tzRegexp = static::regexpAlternateGroup( array_keys( $tzAbbrs ) );

	// Hard-coded parentheses and space like in Parser::pstPass2
	// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T245784)
	// \uNNNN syntax can only be used from PHP 7.3
	return '/' . $pattern . ' [\\x{200E}\\x{200F}]?\\(' . $tzRegexp . '\\)/u';
}
318 | |
/**
 * Get a function that parses timestamps generated using the given date format, based on the result
 * of matching the regexp returned by getTimestampRegexp()
 *
 * The returned function takes the regexp match data (with or without PREG_OFFSET_CAPTURE
 * tuples) and returns either null, when MWTimestamp rejects the resulting date, or an array
 * with the keys 'date' (DateTimeImmutable in UTC) and 'warning' (string or null).
 *
 * @param string $contLangVariant Content language variant
 * @param string $format Date format, as used by MediaWiki
 * @param array<int,string>|null $digits Localised digits from 0 to 9, e.g. `[ '0', '1', ..., '9' ]`
 * @param string $localTimezone Local timezone IANA name, e.g. `America/New_York`
 * @param array $tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
 *  for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ]
 * @return callable Parser function
 */
private function getTimestampParser(
	string $contLangVariant, string $format, ?array $digits, string $localTimezone, array $tzAbbrs
): callable {
	// Turn a string of localised digits into an integer
	$untransformDigits = static function ( string $text ) use ( $digits ): int {
		if ( !$digits ) {
			return (int)$text;
		}
		return (int)strtr( $text, array_flip( $digits ) );
	};

	// Format codes that produce a capturing group in getTimestampRegexp()
	$capturingCodes = [
		'xg', 'd', 'j', 'D', 'l', 'F', 'M', 'm', 'n', 'Y', 'xkY', 'G', 'H', 'i', 's',
	];
	// Month-name format codes => message keys, in month order
	$monthNameLists = [
		'xg' => Language::MONTH_GENITIVE_MESSAGES,
		'F' => Language::MONTH_MESSAGES,
		'M' => Language::MONTH_ABBREVIATED_MESSAGES,
	];

	// Work out which format code each capturing group belongs to
	$matchingGroups = [];
	$end = strlen( $format );
	$lastIndex = $end - 1;
	$pos = 0;
	while ( $pos < $end ) {
		$code = $format[$pos];
		if ( $code === 'x' && $pos < $lastIndex ) {
			$code .= $format[++$pos];
		}
		if ( $code === 'xk' && $pos < $lastIndex ) {
			$code .= $format[++$pos];
		}

		if ( in_array( $code, $capturingCodes, true ) ) {
			$matchingGroups[] = $code;
		} elseif ( $code === '\\' ) {
			// Backslash escaping: skip the escaped byte
			if ( $pos < $lastIndex ) {
				$pos++;
			}
		} elseif ( $code === '"' && $pos < $lastIndex ) {
			// Quoted literal: skip to the closing quote, if there is one
			$closingQuote = strpos( $format, '"', $pos + 1 );
			if ( $closingQuote !== false ) {
				$pos = $closingQuote;
			}
		}
		// Anything else ('xx', 'xn', literal bytes) produces no capturing group
		$pos++;
	}

	return function ( array $match ) use (
		$matchingGroups, $untransformDigits, $localTimezone, $tzAbbrs, $contLangVariant, $monthNameLists
	) {
		if ( is_array( $match[0] ) ) {
			// Strip PREG_OFFSET_CAPTURE data
			unset( $match['offset'] );
			$match = array_map( static fn ( array $tuple ) => $tuple[0], $match );
		}

		$year = 0;
		$monthIdx = 0;
		$day = 0;
		$hour = 0;
		$minute = 0;
		foreach ( $matchingGroups as $i => $code ) {
			// Group 0 is the whole match, so the groups are offset by one
			$text = $match[$i + 1];
			if ( isset( $monthNameLists[$code] ) ) {
				$monthIdx = array_search(
					$text,
					$this->getMessages( $contLangVariant, $monthNameLists[$code] ),
					true
				);
			} elseif ( $code === 'd' || $code === 'j' ) {
				$day = $untransformDigits( $text );
			} elseif ( $code === 'm' || $code === 'n' ) {
				$monthIdx = $untransformDigits( $text ) - 1;
			} elseif ( $code === 'Y' ) {
				$year = $untransformDigits( $text );
			} elseif ( $code === 'xkY' ) {
				// Thai year
				$year = $untransformDigits( $text ) - 543;
			} elseif ( $code === 'G' || $code === 'H' ) {
				$hour = $untransformDigits( $text );
			} elseif ( $code === 'i' ) {
				$minute = $untransformDigits( $text );
			} elseif ( !in_array( $code, [ 'D', 'l', 's' ], true ) ) {
				// 'D'/'l' (day of the week) and 's' (seconds) are matched but unused;
				// any other code should not have been collected above
				throw new LogicException( 'Not implemented' );
			}
		}

		// The last matching group is the timezone abbreviation
		$tzAbbr = $tzAbbrs[ end( $match ) ];

		// Most of the time, the timezone abbreviation is not necessary to parse the date, since we
		// can assume all times are in the wiki's local timezone.
		$date = new DateTime();
		// setTimezone must be called before setDate/setTime
		$date->setTimezone( new DateTimeZone( $localTimezone ) );
		$date->setDate( $year, $monthIdx + 1, $day );
		$date->setTime( $hour, $minute, 0 );

		// But during the "fall back" at the end of DST, some times will happen twice.
		// Since the timezone abbreviation disambiguates the DST/non-DST times, we can detect
		// when PHP chose the wrong one, and then try the other one. It appears that PHP always
		// uses the later (non-DST) hour, but that behavior isn't documented, so we account for both.
		$dateWarning = null;
		if ( $date->format( 'T' ) !== $tzAbbr ) {
			$oneHour = new DateInterval( 'PT1H' );
			$altDate = clone $date;
			if ( $date->format( 'I' ) ) {
				// Parsed time is DST, try non-DST by advancing one hour
				$altDate->add( $oneHour );
			} else {
				// Parsed time is non-DST, try DST by going back one hour
				$altDate->sub( $oneHour );
			}
			if ( $altDate->format( 'T' ) === $tzAbbr ) {
				$date = $altDate;
				$dateWarning = 'Timestamp has timezone abbreviation for the wrong time';
			} else {
				$dateWarning = 'Ambiguous time at DST switchover was parsed';
			}
		}

		// Now set the timezone back to UTC for formatting
		$date->setTimezone( new DateTimeZone( 'UTC' ) );
		$date = DateTimeImmutable::createFromMutable( $date );

		// The date must be accepted by our libraries; zero or negative years, for example,
		// are not (T352455). In PHP we need to check with MWTimestamp.
		// In JS we need to check with Moment.
		try {
			// @phan-suppress-next-line PhanNoopNew
			new MWTimestamp( $date->format( 'c' ) );
		} catch ( TimestampException $ex ) {
			return null;
		}

		return [
			'date' => $date,
			'warning' => $dateWarning,
		];
	};
}
516 | |
/**
 * Get a regexp that matches timestamps in the local date format, for each language variant.
 *
 * This calls getTimestampRegexp() with predefined data for the current wiki. The result
 * has one entry per variant returned by the language converter, with the same keys.
 *
 * @return string[] Regular expressions
 */
public function getLocalTimestampRegexps(): array {
	$variants = $this->languageConverterFactory
		->getLanguageConverter( $this->language )
		->getVariants();

	$regexps = [];
	foreach ( $variants as $key => $contLangVariant ) {
		$regexps[$key] = $this->getTimestampRegexp(
			$contLangVariant,
			$this->dateFormat[$contLangVariant],
			// Character class matching any single localised digit of this variant
			'[' . implode( '', $this->digits[$contLangVariant] ) . ']',
			$this->timezones[$contLangVariant]
		);
	}
	return $regexps;
}
535 | |
/**
 * Get a function that parses timestamps in the local date format, for each language variant,
 * based on the result of matching the regexp returned by getLocalTimestampRegexps().
 *
 * This calls getTimestampParser() with predefined data for the current wiki. The result
 * has one entry per variant returned by the language converter, with the same keys.
 *
 * @return callable[] Parser functions
 */
private function getLocalTimestampParsers(): array {
	$variants = $this->languageConverterFactory
		->getLanguageConverter( $this->language )
		->getVariants();

	$parsers = [];
	foreach ( $variants as $key => $contLangVariant ) {
		$parsers[$key] = $this->getTimestampParser(
			$contLangVariant,
			$this->dateFormat[$contLangVariant],
			$this->digits[$contLangVariant],
			$this->localTimezone,
			$this->timezones[$contLangVariant]
		);
	}
	return $parsers;
}
556 | |
/**
 * Given a link node (`<a>`), if it's a link to a user-related page, return their username.
 *
 * Recognised targets are pages in the user and user talk namespaces (subpages are refused)
 * and special pages named like $this->specialContributionsName with exactly one subpage
 * part. A link with the 'mw-selflink' class is resolved to the title of the page being parsed.
 *
 * @param Element $link
 * @return array|null Null if the link does not identify a user, otherwise an array:
 *  - string 'username' Username
 *  - string|null 'displayName' Display name (link text if link target was in the user
 *    namespace and the text differs from the username by more than case)
 */
private function getUsernameFromLink( Element $link ): ?array {
	// Selflink: use title of current page
	if ( DOMUtils::hasClass( $link, 'mw-selflink' ) ) {
		$title = $this->title;
	} else {
		$titleString = CommentUtils::getTitleFromUrl( $link->getAttribute( 'href' ) ?? '', $this->config ) ?? '';
		// Performance optimization, skip strings that obviously don't contain a namespace
		if ( $titleString === '' || !str_contains( $titleString, ':' ) ) {
			return null;
		}
		$title = $this->parseTitle( $titleString );
		if ( !$title ) {
			return null;
		}
	}

	$username = null;
	$displayName = null;
	$mainText = $title->getText();

	if ( $title->inNamespace( NS_USER ) || $title->inNamespace( NS_USER_TALK ) ) {
		$username = $mainText;
		// Subpages of user pages are not treated as links to the user
		if ( str_contains( $username, '/' ) ) {
			return null;
		}
		if ( $title->inNamespace( NS_USER ) ) {
			// Use regex trim for consistency with JS implementation
			$text = preg_replace( [ '/^[\s]+/u', '/[\s]+$/u' ], '', $link->textContent ?? '' );
			// Record the display name if it has been customised beyond changing case.
			// Compare against null (preg_replace() failure) and '' explicitly: a plain
			// truthiness test would also discard the valid link text "0".
			if (
				$text !== null &&
				$text !== '' &&
				mb_strtolower( $text ) !== mb_strtolower( $username )
			) {
				$displayName = $text;
			}
		}
	} elseif ( $title->inNamespace( NS_SPECIAL ) ) {
		$parts = explode( '/', $mainText );
		if ( count( $parts ) === 2 && $parts[0] === $this->specialContributionsName ) {
			// Normalize the username: users may link to their contributions with an unnormalized name
			$userpage = $this->titleParser->makeTitleValueSafe( NS_USER, $parts[1] );
			if ( !$userpage ) {
				return null;
			}
			$username = $userpage->getText();
		}
	}

	if ( $username === null ) {
		return null;
	}
	if ( IPUtils::isIPv6( $username ) ) {
		// Bot-generated links "Preceding unsigned comment added by" have non-standard case
		$username = strtoupper( $username );
	}
	return [
		'username' => $username,
		'displayName' => $displayName,
	];
}
621 | |
/**
 * Find a user signature preceding a timestamp.
 *
 * The signature includes the timestamp node.
 *
 * A signature must contain at least one link to the user's userpage, discussion page or
 * contributions (and may contain other links). The link may be nested in other elements.
 *
 * @param Text $timestampNode
 * @param Node|null $until Node to stop searching at
 * @return array Result, an associative array with the following keys:
 *  - Node[] `nodes` Sibling nodes comprising the signature, in reverse order (with
 *    $timestampNode or its parent node as the first element)
 *  - string|null `username` Username, null for unsigned comments
 *  - string|null `displayName` Customised display name taken from a link to the user,
 *    null if none was found
 */
private function findSignature( Text $timestampNode, ?Node $until = null ): array {
	$sigUsername = null;
	$sigDisplayName = null;
	// Trimmed length of the text nodes walked over so far
	$length = 0;
	// Earliest node known to belong to the signature; the timestamp itself until a link is found
	$lastLinkNode = $timestampNode;

	CommentUtils::linearWalkBackwards(
		$timestampNode,
		function ( string $event, Node $node ) use (
			&$sigUsername, &$sigDisplayName, &$lastLinkNode, &$length,
			$until, $timestampNode
		) {
			// Returning true ends the walk
			if ( $event === 'enter' && $node === $until ) {
				return true;
			}
			if ( $length >= self::SIGNATURE_SCAN_LIMIT ) {
				return true;
			}
			if ( CommentUtils::isBlockElement( $node ) ) {
				// Don't allow reaching into preceding paragraphs
				return true;
			}

			if ( $event === 'leave' && $node !== $timestampNode ) {
				$length += $node instanceof Text ?
					mb_strlen( CommentUtils::htmlTrim( $node->textContent ?? '' ) ) : 0;
			}

			// Find the closest link before timestamp that links to the user's user page.
			//
			// Support timestamps being linked to the diff introducing the comment:
			// if the timestamp node is the only child of a link node, use the link node instead
			//
			// Handle links nested in formatting elements.
			if ( $event === 'leave' && $node instanceof Element && strtolower( $node->tagName ) === 'a' ) {
				$classList = DOMCompat::getClassList( $node );
				// Generated timestamp links sometimes look like username links (e.g. on user talk pages)
				// so ignore these.
				if ( !$classList->contains( 'ext-discussiontools-init-timestamplink' ) ) {
					$user = $this->getUsernameFromLink( $node );
					if ( $user ) {
						// Accept the first link to the user namespace, then only accept links to that user
						$sigUsername ??= $user['username'];
						if ( $user['username'] === $sigUsername ) {
							$lastLinkNode = $node;
							// Explicit comparison rather than truthiness, so that a display
							// name of "0" is not discarded
							$linkDisplayName = $user['displayName'] ?? '';
							if ( $linkDisplayName !== '' ) {
								$sigDisplayName = $linkDisplayName;
							}
						}
					}
					// Keep looking if a node with links wasn't a link to a user page
					// "Doc James (talk · contribs · email)"
				}
			}
		}
	);

	// From just before the earliest accepted link to just after the timestamp node
	$range = new ImmutableRange(
		$lastLinkNode->parentNode,
		CommentUtils::childIndexOf( $lastLinkNode ),
		$timestampNode->parentNode,
		CommentUtils::childIndexOf( $timestampNode ) + 1
	);

	// Expand the range so that it covers sibling nodes.
	// This will include any wrapping formatting elements as part of the signature.
	//
	// Helpful accidental feature: users whose signature is not detected in full (due to
	// text formatting) can just wrap it in a <span> to fix that.
	// "Ten Pound Hammer • (What did I screw up now?)"
	// "« Saper // dyskusja »"
	//
	// TODO Not sure if this is actually good, might be better to just use the range...
	$sigNodes = array_reverse( CommentUtils::getCoveredSiblings( $range ) );

	return [
		'nodes' => $sigNodes,
		'username' => $sigUsername,
		'displayName' => $sigDisplayName,
	];
}
718 | |
/**
 * Callback for TreeWalker that will skip over nodes where we don't want to detect
 * comments (or section headings).
 *
 * @param Node $node
 * @return int NodeFilter::FILTER_REJECT for excluded nodes, NodeFilter::FILTER_ACCEPT otherwise
 */
public static function acceptOnlyNodesAllowingComments( Node $node ): int {
	if ( $node instanceof Element ) {
		$tag = strtolower( $node->tagName );
		$isExcluded =
			// The table of contents has a heading that gets erroneously detected as a section
			$node->getAttribute( 'id' ) === 'toc' ||
			// Don't detect comments within quotes (T275881)
			in_array( $tag, [ 'blockquote', 'cite', 'q' ], true ) ||
			// Don't attempt to parse blocks marked 'mw-notalk'
			DOMUtils::hasClass( $node, 'mw-notalk' ) ||
			// Don't detect comments within references. We can't add replies to them without
			// bungling up the structure in some cases (T301213), and you're not supposed to do
			// that anyway. <ol class="references"> is the only reliably consistent thing
			// between the two parsers.
			( $tag === 'ol' && DOMUtils::hasClass( $node, 'references' ) );
		if ( $isExcluded ) {
			return NodeFilter::FILTER_REJECT;
		}
	}

	// Don't detect comments within headings (but don't reject the headings themselves)
	$parent = $node->parentNode;
	$isInsideHeading = $parent instanceof Element && preg_match( '/^h([1-6])$/i', $parent->tagName );

	return $isInsideHeading ? NodeFilter::FILTER_REJECT : NodeFilter::FILTER_ACCEPT;
}
762 | |
/**
 * Convert a byte offset within a text node to a unicode codepoint offset
 *
 * Counts the characters in the first $byteOffset bytes of the node's value; a missing
 * node value is treated as an empty string.
 *
 * @param Text $node Text node
 * @param int $byteOffset Byte offset
 * @return int Codepoint offset
 */
private static function getCodepointOffset( Text $node, int $byteOffset ): int {
	$value = $node->nodeValue ?? '';
	$leadingBytes = substr( $value, 0, $byteOffset );
	return mb_strlen( $leadingBytes );
}
773 | |
/**
 * Find a timestamp in a given text node
 *
 * The text of directly preceding 'mw:Entity' elements (and of the text nodes before them) is
 * prepended to the node's text before matching, so that a timestamp split across those nodes
 * can still be found.
 *
 * @param Text $node
 * @param string[] $timestampRegexps
 * @return array|null Null if none of the regexps matched, otherwise an array with the following keys:
 *   - int 'offset' Length of extra text preceding the node that was used for matching (in bytes)
 *   - int 'parserIndex' Which of the regexps matched
 *   - array 'matchData' Regexp match data, which specifies the location of the match,
 *     and which can be parsed using getLocalTimestampParsers() (offsets are in bytes)
 *   - ImmutableRange 'range' Range covering the timestamp
 */
public function findTimestamp( Text $node, array $timestampRegexps ): ?array {
	$nodeText = '';
	$offset = 0;
	// Searched nodes (reverse order)
	$nodes = [];

	while ( $node ) {
		$nodeText = $node->nodeValue . $nodeText;
		$nodes[] = $node;

		// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
		// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
		// which apparently are often turned into entities by buggy editing tools. To handle
		// this, we must piece together the text, so that our regexp can match those timestamps.
		$previousSibling = $node->previousSibling;
		if (
			!( $previousSibling instanceof Element ) ||
			$previousSibling->getAttribute( 'typeof' ) !== 'mw:Entity'
		) {
			break;
		}

		// Don't assume the entity element has a child: an empty one contributes no text, and
		// must not put a null entry into the list of searched nodes.
		$entityChild = $previousSibling->firstChild;
		$entityText = $entityChild ? ( $entityChild->nodeValue ?? '' ) : '';
		$nodeText = $entityText . $nodeText;
		$offset += strlen( $entityText );
		if ( $entityChild ) {
			$nodes[] = $entityChild;
		}

		// If the entity is preceded by more text, do this again
		$beforeEntity = $previousSibling->previousSibling;
		if ( !( $beforeEntity instanceof Text ) ) {
			break;
		}
		$offset += strlen( $beforeEntity->nodeValue ?? '' );
		$node = $beforeEntity;
	}

	foreach ( $timestampRegexps as $i => $timestampRegexp ) {
		$matchData = null;
		// Allows us to mimic match.index in #getComments
		if ( !preg_match( $timestampRegexp, $nodeText, $matchData, PREG_OFFSET_CAPTURE ) ) {
			continue;
		}

		$timestampLength = strlen( $matchData[0][0] );
		// Bytes at the end of the last node which aren't part of the match
		$tailLength = strlen( $nodeText ) - $timestampLength - $matchData[0][1];
		// We are moving right to left, but we start to the right of the end of
		// the timestamp if there is trailing garbage, so that is a negative offset.
		$count = -$tailLength;
		$endNode = $nodes[0];
		$endOffset = strlen( $endNode->nodeValue ?? '' ) - $tailLength;

		// Initialized up front so the assertions below fail cleanly (rather than also reading
		// undefined variables) if the start of the timestamp is somehow never reached.
		$startNode = null;
		$startOffset = null;
		foreach ( $nodes as $n ) {
			$count += strlen( $n->nodeValue ?? '' );
			// If we have counted to beyond the start of the timestamp, we are in the
			// start node of the timestamp
			if ( $count >= $timestampLength ) {
				$startNode = $n;
				// Offset is how much we overshot the start by
				$startOffset = $count - $timestampLength;
				break;
			}
		}
		Assert::precondition( $endNode instanceof Node, 'endNode of timestamp is a Node' );
		Assert::precondition( $startNode instanceof Node, 'startNode of timestamp range found' );
		Assert::precondition( is_int( $startOffset ), 'startOffset of timestamp range found' );

		// Range offsets are counted in codepoints, while everything above was counted in bytes
		$startOffset = static::getCodepointOffset( $startNode, $startOffset );
		$endOffset = static::getCodepointOffset( $endNode, $endOffset );

		$range = new ImmutableRange( $startNode, $startOffset, $endNode, $endOffset );

		return [
			'matchData' => $matchData,
			// Bytes at the start of the first node which aren't part of the match
			// TODO: Remove this and use 'range' instead
			'offset' => $offset,
			'range' => $range,
			'parserIndex' => $i,
		];
	}

	return null;
}
869 | |
/**
 * Build the range covering a signature, from its first node up to the end of its timestamp.
 *
 * @param Node[] $sigNodes Signature nodes; the first entry is treated as the last node of the
 *   signature, and the last entry as its first node
 * @param array $match Timestamp match data ('matchData' and 'offset' keys are used)
 * @param Text $node Text node the timestamp was searched in
 * @return ImmutableRange
 */
private function adjustSigRange( array $sigNodes, array $match, Text $node ): ImmutableRange {
	$firstSigNode = end( $sigNodes );
	$lastSigNode = $sigNodes[0];

	// TODO Document why this needs to be so complicated
	[ $matchedText, $matchedAt ] = $match['matchData'][0];
	$endByteOffset = $matchedAt + strlen( $matchedText ) - $match['offset'];

	if ( $lastSigNode === $node ) {
		// The signature ends inside the timestamp's text node: end right after the timestamp
		$endContainer = $node;
		$endOffset = static::getCodepointOffset( $node, $endByteOffset );
	} else {
		// Otherwise end right after the last signature node, within its parent
		$endContainer = $lastSigNode->parentNode;
		$endOffset = CommentUtils::childIndexOf( $lastSigNode ) + 1;
	}

	return new ImmutableRange(
		$firstSigNode->parentNode,
		CommentUtils::childIndexOf( $firstSigNode ),
		$endContainer,
		$endOffset
	);
}
895 | |
/**
 * Walk the nodes under the root node and collect headings and comments as thread items.
 *
 * Each h1-h6 element becomes a heading item. Each text node containing a timestamp, for which
 * a signature with a username can be found, ends a comment item that starts after the previous
 * thread item.
 *
 * @return ContentThreadItemSet
 */
private function buildThreadItems(): ContentThreadItemSet {
	$result = new ContentThreadItemSet();

	$timestampRegexps = $this->getLocalTimestampRegexps();
	$dfParsers = $this->getLocalTimestampParsers();

	$curCommentEnd = null;

	$treeWalker = new TreeWalker(
		$this->rootNode,
		NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
		[ static::class, 'acceptOnlyNodesAllowingComments' ]
	);
	while ( $node = $treeWalker->nextNode() ) {
		if ( $node instanceof Element && preg_match( '/^h([1-6])$/i', $node->tagName, $headingMatch ) ) {
			// A heading starts a new thread
			$headingNode = CommentUtils::getHeadlineNode( $node );
			$headingRange = new ImmutableRange(
				$headingNode, 0, $headingNode, $headingNode->childNodes->length
			);
			$heading = new ContentHeadingItem(
				$headingRange,
				$this->computeTranscludedFrom( $headingRange ),
				(int)( $headingMatch[ 1 ] )
			);
			$heading->setRootNode( $this->rootNode );
			$result->addThreadItem( $heading );
			$curCommentEnd = $node;
			continue;
		}

		if ( !( $node instanceof Text ) ) {
			continue;
		}
		$match = $this->findTimestamp( $node, $timestampRegexps );
		if ( !$match ) {
			continue;
		}

		$warnings = [];
		$foundSignature = $this->findSignature( $node, $curCommentEnd );
		$author = $foundSignature['username'];

		if ( $author === null ) {
			// Ignore timestamps for which we couldn't find a signature. It's probably not a real
			// comment, but just a false match due to a copypasted timestamp.
			continue;
		}

		$sigRanges = [ $this->adjustSigRange( $foundSignature['nodes'], $match, $node ) ];
		$timestampRanges = [ $match['range'] ];

		// Everything from the last comment up to here is the next comment
		$startNode = $this->nextInterestingLeafNode( $curCommentEnd );
		$endNode = $foundSignature['nodes'][0];

		// Skip to the end of the "paragraph". This only looks at tag names and can be fooled by CSS, but
		// avoiding that would be more difficult and slower.
		//
		// If this skips over another potential signature, also skip it in the main TreeWalker loop, to
		// avoid generating multiple comments when there is more than one signature on a single "line".
		// Often this is done when someone edits their comment later and wants to add a note about that.
		// (Or when another person corrects a typo, or strikes out a comment, etc.) Multiple comments
		// within one paragraph/list-item result in a confusing double "Reply" button, and we also have
		// no way to indicate which one you're replying to (this might matter in the future for
		// notifications or something).
		CommentUtils::linearWalk(
			$endNode,
			function ( string $event, Node $walked ) use (
				&$endNode, &$sigRanges, &$timestampRanges,
				$treeWalker, $timestampRegexps, $node
			) {
				if ( CommentUtils::isBlockElement( $walked ) || CommentUtils::isCommentSeparator( $walked ) ) {
					// Stop when entering or leaving a block node
					return true;
				}
				if ( $event !== 'leave' ) {
					return null;
				}
				if ( $walked instanceof Text && $walked !== $node ) {
					$extraMatch = $this->findTimestamp( $walked, $timestampRegexps );
					if ( $extraMatch ) {
						// If this skips over another potential signature, also skip it in the main TreeWalker loop
						$treeWalker->currentNode = $walked;
						// …and add it as another signature to this comment (regardless of the author and timestamp)
						$extraSignature = $this->findSignature( $walked, $node );
						if ( $extraSignature['username'] !== null ) {
							$sigRanges[] = $this->adjustSigRange( $extraSignature['nodes'], $extraMatch, $walked );
							$timestampRanges[] = $extraMatch['range'];
						}
					}
				}
				// Take the last complete node which we skipped past
				$endNode = $walked;
				return null;
			}
		);

		if ( $endNode instanceof Text ) {
			// End before any trailing whitespace of the text node
			$length = mb_strlen( rtrim( $endNode->nodeValue ?? '', "\t\n\f\r " ) );
		} else {
			// PHP bug: childNodes can be null for comment nodes
			// (it should always be a NodeList, even if the node can't have children)
			$length = $endNode->childNodes ? $endNode->childNodes->length : 0;
		}
		$range = new ImmutableRange(
			$startNode->parentNode,
			CommentUtils::childIndexOf( $startNode ),
			$endNode,
			$length
		);
		$transcludedFrom = $this->computeTranscludedFrom( $range );

		$startLevel = CommentUtils::getIndentLevel( $startNode, $this->rootNode ) + 1;
		$endLevel = CommentUtils::getIndentLevel( $node, $this->rootNode ) + 1;
		if ( $startLevel !== $endLevel ) {
			$warnings[] = 'Comment starts and ends with different indentation';
		}
		// Should this use the indent level of $startNode or $node?
		$level = min( $startLevel, $endLevel );

		$timestampParser = $dfParsers[ $match['parserIndex'] ];
		$parserResult = $timestampParser( $match['matchData'] );
		if ( !$parserResult ) {
			continue;
		}
		$dateTime = $parserResult['date'];
		$dateWarning = $parserResult['warning'];

		if ( $dateWarning ) {
			$warnings[] = $dateWarning;
		}

		$curComment = new ContentCommentItem(
			$level,
			$range,
			$transcludedFrom,
			$sigRanges,
			$timestampRanges,
			$dateTime,
			$author,
			$foundSignature['displayName']
		);
		$curComment->setRootNode( $this->rootNode );
		if ( $warnings ) {
			$curComment->addWarnings( $warnings );
		}
		if ( $result->isEmpty() ) {
			// Add a fake placeholder heading if there are any comments in the 0th section
			// (before the first real heading)
			$placeholderRange = new ImmutableRange( $this->rootNode, 0, $this->rootNode, 0 );
			$fakeHeading = new ContentHeadingItem( $placeholderRange, false, null );
			$fakeHeading->setRootNode( $this->rootNode );
			$result->addThreadItem( $fakeHeading );
		}
		$result->addThreadItem( $curComment );
		$curCommentEnd = $curComment->getRange()->endContainer;
	}

	return $result;
}
1042 | |
/**
 * Get the name of the page from which this thread item is transcluded (if any). Replies to
 * transcluded items must be posted on that page, instead of the current one.
 *
 * This is tricky, because we don't want to mark items as transcluded when they're just using a
 * template (e.g. {{ping|…}} or a non-substituted signature template). Sometimes the whole comment
 * can be template-generated (e.g. when using some wrapper templates), but as long as a reply can
 * be added outside of that template, we should not treat it as transcluded.
 *
 * The start/end boundary points of comment ranges and Parsoid transclusion ranges don't line up
 * exactly, even when to a human it's obvious that they cover the same content, making this more
 * complicated.
 *
 * @param ImmutableRange $commentRange Range of the thread item to examine
 * @return string|bool `false` if this item is not transcluded. A string if it's transcluded
 *   from a single page (the page title, in text form with spaces). `true` if it's transcluded, but
 *   we can't determine the source.
 * @throws LogicException If the range comparison yields a result that should be impossible here
 */
public function computeTranscludedFrom( ImmutableRange $commentRange ) {
	// Collapsed ranges should otherwise be impossible, but they're not (T299583)
	// TODO: See if we can fix the root cause, and remove this?
	if ( $commentRange->collapsed ) {
		return false;
	}

	// General approach:
	//
	// Compare the comment range to each transclusion range on the page, and if it overlaps any of
	// them, examine the overlap. There are a few cases:
	//
	// * Comment and transclusion do not overlap:
	//   → Not transcluded.
	// * Comment contains the transclusion:
	//   → Not transcluded (just a template).
	// * Comment is contained within the transclusion:
	//   → Transcluded, we can determine the source page (unless it's a complex transclusion).
	// * Comment and transclusion overlap partially:
	//   → Transcluded, but we can't determine the source page.
	// * Comment (almost) exactly matches the transclusion:
	//   → Maybe transcluded (it could be that the source page only contains that single comment),
	//     maybe not transcluded (it could be a wrapper template that covers a single comment).
	//     This is very sad, and we decide based on the namespace.
	//
	// Most transclusion ranges on the page trivially fall in the "do not overlap" or "contains"
	// cases, and we only have to carefully examine the two transclusion ranges that contain the
	// first and last node of the comment range.
	//
	// To check for almost exact matches, we walk between the relevant boundary points, and if we
	// only find uninteresting nodes (that would be ignored when detecting comments), we treat them
	// like exact matches.

	$startTransclNode = CommentUtils::getTranscludedFromElement(
		CommentUtils::getRangeFirstNode( $commentRange )
	);
	$endTransclNode = CommentUtils::getTranscludedFromElement(
		CommentUtils::getRangeLastNode( $commentRange )
	);

	// We only have to examine the two transclusion ranges that contain the first/last node of the
	// comment range (if they exist). Ignore ranges outside the comment or in the middle of it.
	$transclNodes = [];
	if ( $startTransclNode ) {
		$transclNodes[] = $startTransclNode;
	}
	if ( $endTransclNode && $endTransclNode !== $startTransclNode ) {
		$transclNodes[] = $endTransclNode;
	}

	foreach ( $transclNodes as $transclNode ) {
		// getTransclusionRange() is an instance method, so call it on the instance
		$transclRange = $this->getTransclusionRange( $transclNode );
		$compared = CommentUtils::compareRanges( $commentRange, $transclRange );
		$transclTitles = $this->getTransclusionTitles( $transclNode );
		// Only a transclusion consisting of exactly one template call has a "simple" title
		$simpleTransclTitle = ( count( $transclTitles ) === 1 && $transclTitles[0] !== null ) ?
			$this->parseTitle( $transclTitles[0] ) : null;

		switch ( $compared ) {
			case 'equal':
				// Comment (almost) exactly matches the transclusion
				if ( $simpleTransclTitle === null ) {
					// Allow replying to some accidental complex transclusions consisting of only templates
					// and wikitext (T313093)
					if ( count( $transclTitles ) > 1 ) {
						foreach ( $transclTitles as $transclTitleString ) {
							if ( $transclTitleString !== null ) {
								$transclTitle = $this->parseTitle( $transclTitleString );
								if ( $transclTitle && !$transclTitle->inNamespace( NS_TEMPLATE ) ) {
									return true;
								}
							}
						}
						// Continue examining the other ranges.
						break;
					}
					// Multi-template transclusion, or a parser function call, or template-affected wikitext outside
					// of a template call, or a mix of the above
					return true;

				} elseif ( $simpleTransclTitle->inNamespace( NS_TEMPLATE ) ) {
					// Is that a subpage transclusion with a single comment, or a wrapper template
					// transclusion on this page? We don't know, but let's guess based on the namespace.
					// (T289873)
					// Continue examining the other ranges.
					break;
				} elseif ( !$this->titleCanExist( $simpleTransclTitle ) ) {
					// Special page transclusion (T344622) or something else weird. Don't return the title,
					// since it's useless for replying, and can't be stored in the permalink database.
					return true;
				} else {
					Assert::precondition( $transclTitles[0] !== null, "Simple transclusion found" );
					return strtr( $transclTitles[0], '_', ' ' );
				}

			case 'contains':
				// Comment contains the transclusion

				// If the entire transclusion is contained within the comment range, that's just a
				// template. This is the same as a transclusion in the middle of the comment, which we
				// ignored earlier, it just takes us longer to get here in this case.

				// Continue examining the other ranges.
				break;

			case 'contained':
				// Comment is contained within the transclusion
				if ( $simpleTransclTitle === null ) {
					return true;
				} elseif ( !$this->titleCanExist( $simpleTransclTitle ) ) {
					// Special page transclusion (T344622) or something else weird. Don't return the title,
					// since it's useless for replying, and can't be stored in the permalink database.
					return true;
				} else {
					Assert::precondition( $transclTitles[0] !== null, "Simple transclusion found" );
					return strtr( $transclTitles[0], '_', ' ' );
				}

			case 'after':
			case 'before':
				// Comment and transclusion do not overlap

				// This should be impossible, because we ignored these ranges earlier.
				throw new LogicException( 'Unexpected transclusion or comment range' );

			case 'overlapstart':
			case 'overlapend':
				// Comment and transclusion overlap partially
				return true;

			default:
				throw new LogicException( 'Unexpected return value from compareRanges()' );
		}
	}

	// If we got here, the comment range was not contained by or overlapping any of the transclusion
	// ranges. Comment is not transcluded.
	return false;
}
1198 | |
/**
 * Check whether a title could refer to a page: its namespace must not be below NS_MAIN,
 * it must not be external, and its text must not be empty.
 *
 * @param TitleValue $title
 * @return bool
 */
private function titleCanExist( TitleValue $title ): bool {
	if ( $title->getNamespace() < NS_MAIN ) {
		return false;
	}
	if ( $title->isExternal() ) {
		return false;
	}
	return $title->getText() !== '';
}
1204 | |
/**
 * Parse a title string using the title parser, treating malformed titles as missing.
 *
 * @param string $titleString
 * @return TitleValue|null Null if the title is malformed
 */
private function parseTitle( string $titleString ): ?TitleValue {
	$title = null;
	try {
		$title = $this->titleParser->parseTitle( $titleString );
	} catch ( MalformedTitleException $err ) {
		// Malformed title: leave the result as null
	}
	return $title;
}
1212 | |
/**
 * List the page title for every part of a transclusion. Parts that aren't transcluded from
 * another page are represented by null.
 *
 * For a node representing a single-page transclusion, the result is an array containing a
 * single string.
 *
 * @param Element $node Element carrying the transclusion's 'data-mw' attribute
 * @return array<string|null>
 */
private function getTransclusionTitles( Element $node ): array {
	$dataMw = json_decode( $node->getAttribute( 'data-mw' ) ?? '', true );
	$titles = [];

	foreach ( $dataMw['parts'] ?? [] as $part ) {
		// 'href' will be unset if this is a parser function rather than a template
		if ( is_string( $part ) || !isset( $part['template']['target']['href'] ) ) {
			$titles[] = null;
			continue;
		}

		$parsoidHref = $part['template']['target']['href'];
		Assert::precondition( substr( $parsoidHref, 0, 2 ) === './', "href has valid format" );
		// Drop the leading './' and undo the URL encoding
		$titles[] = rawurldecode( substr( $parsoidHref, 2 ) );
	}

	return $titles;
}
1243 | |
/**
 * Given the first node of a transclusion (e.g. returned by
 * CommentUtils::getTranscludedFromElement()), return a range that starts before that node and
 * ends after the last node of the transclusion.
 *
 * @param Element $startNode
 * @return ImmutableRange
 */
private function getTransclusionRange( Element $startNode ): ImmutableRange {
	// Advance over the following sibling elements for as long as they share the 'about' attribute
	$endNode = $startNode;
	while ( true ) {
		$candidate = $endNode->nextSibling;
		if ( !( $candidate instanceof Element ) ) {
			break;
		}
		if ( $candidate->getAttribute( 'about' ) !== $endNode->getAttribute( 'about' ) ) {
			break;
		}
		$endNode = $candidate;
	}

	return new ImmutableRange(
		$startNode->parentNode,
		CommentUtils::childIndexOf( $startNode ),
		$endNode->parentNode,
		CommentUtils::childIndexOf( $endNode ) + 1
	);
}
1273 | |
/**
 * Shorten the user generated parts of IDs, so that the full ID always fits within a database
 * field of length 255
 *
 * nb: By this point, spaces in the text should already have been replaced with underscores.
 *
 * @param string $text Text
 * @param bool $legacy Generate legacy ID, not needed in JS implementation
 * @return string Truncated text
 */
private function truncateForId( string $text, bool $legacy = false ): string {
	$truncated = $this->language->truncateForDatabase( $text, 80, '' );

	// Only non-legacy IDs have the surrounding underscores removed
	return $legacy ? $truncated : trim( $truncated, '_' );
}
1290 | |
/**
 * Compute an identifier for a thread item that is unique within the page.
 *
 * @param ContentThreadItem $threadItem
 * @param ContentThreadItemSet $previousItems Items whose IDs must not be reused
 * @param bool $legacy Generate legacy ID, not needed in JS implementation
 * @return string
 */
private function computeId(
	ContentThreadItem $threadItem, ContentThreadItemSet $previousItems, bool $legacy = false
): string {
	if ( $threadItem instanceof ContentHeadingItem ) {
		$id = 'h-';
		// For a placeholder heading the range points to the root node, using it like below
		// results in silly values
		if ( !$threadItem->isPlaceholderHeading() ) {
			$id .= $this->truncateForId( $threadItem->getLinkableId(), $legacy );
		}
	} elseif ( $threadItem instanceof ContentCommentItem ) {
		$authorPart = $this->truncateForId( str_replace( ' ', '_', $threadItem->getAuthor() ), $legacy );
		$id = 'c-' . $authorPart . '-' . $threadItem->getTimestampString();
	} else {
		throw new InvalidArgumentException( 'Unknown ThreadItem type' );
	}

	// If there would be multiple comments with the same ID (i.e. the user left multiple comments
	// in one edit, or within a minute), add the parent ID to disambiguate them.
	$parentItem = $threadItem->getParent();
	if ( $parentItem instanceof ContentHeadingItem && !$parentItem->isPlaceholderHeading() ) {
		$id .= '-' . $this->truncateForId( $parentItem->getLinkableId(), $legacy );
	} elseif ( $parentItem instanceof ContentCommentItem ) {
		$parentAuthorPart = $this->truncateForId( str_replace( ' ', '_', $parentItem->getAuthor() ), $legacy );
		$id .= '-' . $parentAuthorPart . '-' . $parentItem->getTimestampString();
	}

	if ( $threadItem instanceof ContentHeadingItem ) {
		// To avoid old threads re-appearing on popular pages when someone uses a vague title
		// (e.g. dozens of threads titled "question" on [[Wikipedia:Help desk]]: https://w.wiki/fbN),
		// include the oldest timestamp in the thread (i.e. date the thread was started) in the
		// heading ID.
		$oldestComment = $threadItem->getOldestReply();
		if ( $oldestComment ) {
			$id .= '-' . $oldestComment->getTimestampString();
		}
	}

	if ( !$previousItems->findCommentById( $id ) ) {
		return $id;
	}

	// Well, that's tough
	if ( !$legacy ) {
		$threadItem->addWarning( 'Duplicate comment ID' );
	}
	// Finally, disambiguate by adding sequential numbers, to allow replying to both comments
	$number = 0;
	do {
		$number++;
		$numberedId = $id . '-' . $number;
	} while ( $previousItems->findCommentById( $numberedId ) );

	return $numberedId;
}
1352 | |
/**
 * Compute an identifier for a thread item that stays the same across all pages and revisions
 * where this comment might appear.
 *
 * Several comments on a page can share a name; use the ID to tell them apart.
 *
 * @param ContentThreadItem $threadItem
 * @return string
 */
private function computeName( ContentThreadItem $threadItem ): string {
	if ( $threadItem instanceof ContentHeadingItem ) {
		// Headings are named after their oldest reply (if there is one)
		$prefix = 'h-';
		$mainComment = $threadItem->getOldestReply();
	} elseif ( $threadItem instanceof ContentCommentItem ) {
		$prefix = 'c-';
		$mainComment = $threadItem;
	} else {
		throw new InvalidArgumentException( 'Unknown ThreadItem type' );
	}

	if ( !$mainComment ) {
		return $prefix;
	}

	$authorPart = $this->truncateForId( str_replace( ' ', '_', $mainComment->getAuthor() ) );

	return $prefix . $authorPart . '-' . $mainComment->getTimestampString();
}
1379 | |
/**
 * Connect the thread items into trees, by setting the parent and replies of each item
 * based on heading levels and indentation levels.
 *
 * @param ContentThreadItemSet $result
 */
private function buildThreads( ContentThreadItemSet $result ): void {
	$lastHeading = null;
	// Most recent thread item seen at each level
	$replies = [];

	foreach ( $result->getThreadItems() as $threadItem ) {
		$level = $threadItem->getLevel();

		if ( count( $replies ) < $level ) {
			// Someone skipped an indentation level (or several). Pretend that the previous reply
			// covers multiple indentation levels, so that following comments get connected to it.
			$threadItem->addWarning( 'Comment skips indentation level' );
			$replies = array_pad( $replies, $level, end( $replies ) );
		}

		if ( $threadItem instanceof ContentHeadingItem ) {
			// New root (thread)
			// Attach as a sub-thread to preceding higher-level heading.
			// Any replies will appear in the tree twice, under the main-thread and the sub-thread.
			$ancestor = $lastHeading;
			while ( $ancestor && $ancestor->getHeadingLevel() >= $threadItem->getHeadingLevel() ) {
				$ancestor = $ancestor->getParent();
			}
			if ( $ancestor ) {
				$threadItem->setParent( $ancestor );
				$ancestor->addReply( $threadItem );
			}
			$lastHeading = $threadItem;
		} elseif ( isset( $replies[ $level - 1 ] ) ) {
			// Add as a reply to the closest less-nested comment
			$parent = $replies[ $level - 1 ];
			$threadItem->setParent( $parent );
			$parent->addReply( $threadItem );
		} else {
			$threadItem->addWarning( 'Comment could not be connected to a thread' );
		}

		$replies[ $level ] = $threadItem;
		// Cut off more deeply nested replies
		$replies = array_slice( $replies, 0, $level + 1 );
	}
}
1420 | |
/**
 * Assign the IDs and names used to refer to comments and headings.
 * This needs its own pass, because the list of replies isn't available before this point.
 *
 * @param ContentThreadItemSet $result
 */
private function computeIdsAndNames( ContentThreadItemSet $result ): void {
	foreach ( $result->getThreadItems() as $threadItem ) {
		$threadItem->setName( $this->computeName( $threadItem ) );

		$id = $this->computeId( $threadItem, $result );
		$threadItem->setId( $id );

		// Only store the legacy ID when it differs from the current one
		$legacyId = $this->computeId( $threadItem, $result, true );
		if ( $legacyId !== $id ) {
			$threadItem->setLegacyId( $legacyId );
		}

		$result->updateIdAndNameMaps( $threadItem );
	}
}
1441 | } |