Code Coverage for /workspace/src/extensions/DiscussionTools/includes/CommentParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	94.11% covered (success)	94.11%	655 / 696	65.38% covered (warning)	65.38%	17 / 26	CRAP	0.00% covered (danger)	0.00%	0 / 1
CommentParser	94.11% covered (success)	94.11%	655 / 696	65.38% covered (warning)	65.38%	17 / 26	261.67	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	11 / 11	100.00% covered (success)	100.00%	1 / 1	1
parse	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	1
nextInterestingLeafNode	95.45% covered (success)	95.45%	21 / 22	0.00% covered (danger)	0.00%	0 / 1	8
regexpAlternateGroup	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
getMessages	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
getTimestampRegexp	88.54% covered (warning)	88.54%	85 / 96	0.00% covered (danger)	0.00%	0 / 1	32.45
getTimestampParser	93.85% covered (success)	93.85%	122 / 130	0.00% covered (danger)	0.00%	0 / 1	52.63
getLocalTimestampRegexps	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	1
getLocalTimestampParsers	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	1
getUsernameFromLink	97.06% covered (success)	97.06%	33 / 34	0.00% covered (danger)	0.00%	0 / 1	17
findSignature	100.00% covered (success)	100.00%	44 / 44	100.00% covered (success)	100.00%	1 / 1	16
acceptOnlyNodesAllowingComments	100.00% covered (success)	100.00%	18 / 18	100.00% covered (success)	100.00%	1 / 1	11
getCodepointOffset	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
findTimestamp	100.00% covered (success)	100.00%	45 / 45	100.00% covered (success)	100.00%	1 / 1	11
adjustSigRange	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	3
buildThreadItems	98.96% covered (success)	98.96%	95 / 96	0.00% covered (danger)	0.00%	0 / 1	22
computeTranscludedFrom	69.23% covered (warning)	69.23%	36 / 52	0.00% covered (danger)	0.00%	0 / 1	45.69
titleCanExist	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	3
parseTitle	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
getTransclusionTitles	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	4
getTransclusionRange	100.00% covered (success)	100.00%	13 / 13	100.00% covered (success)	100.00%	1 / 1	5
truncateForId	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
computeId	96.30% covered (success)	96.30%	26 / 27	0.00% covered (danger)	0.00%	0 / 1	13
computeName	91.67% covered (success)	91.67%	11 / 12	0.00% covered (danger)	0.00%	0 / 1	4.01
buildThreads	95.24% covered (success)	95.24%	20 / 21	0.00% covered (danger)	0.00%	0 / 1	9
computeIdsAndNames	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	3

1	<?php
2
3	namespace MediaWiki\Extension\DiscussionTools;
4
5	use DateInterval;
6	use DateTime;
7	use DateTimeImmutable;
8	use DateTimeZone;
9	use InvalidArgumentException;
10	use Language;
11	use LogicException;
12	use MediaWiki\Config\Config;
13	use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentCommentItem;
14	use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentHeadingItem;
15	use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentThreadItem;
16	use MediaWiki\Languages\LanguageConverterFactory;
17	use MediaWiki\Title\MalformedTitleException;
18	use MediaWiki\Title\TitleParser;
19	use MediaWiki\Title\TitleValue;
20	use MediaWiki\Utils\MWTimestamp;
21	use RuntimeException;
22	use Wikimedia\Assert\Assert;
23	use Wikimedia\IPUtils;
24	use Wikimedia\Parsoid\DOM\Element;
25	use Wikimedia\Parsoid\DOM\Node;
26	use Wikimedia\Parsoid\DOM\Text;
27	use Wikimedia\Parsoid\Utils\DOMCompat;
28	use Wikimedia\Timestamp\TimestampException;
29
30	// TODO consider making timestamp parsing not a returned function
31
32	class CommentParser {
33
34	/**
35	* How far backwards we look for a signature associated with a timestamp before giving up.
36	* Note that this is not a hard limit on the length of signatures we detect.
37	*/
38	private const SIGNATURE_SCAN_LIMIT = 100;
39
40	private Config $config;
41	private Language $language;
42	private LanguageConverterFactory $languageConverterFactory;
43	private TitleParser $titleParser;
44
45	/** @var string[] */
46	private array $dateFormat;
47	/** @var string[][] */
48	private array $digits;
49	/** @var string[][] */
50	private $contLangMessages;
51	private string $localTimezone;
52	/** @var string[][] */
53	private array $timezones;
54	private string $specialContributionsName;
55
56	private Element $rootNode;
57	private TitleValue $title;
58
/**
 * Store the injected services and copy the language-specific data onto the instance.
 *
 * The data array returned by LanguageData::getLocalData() is read once; its 'dateFormat',
 * 'digits', 'contLangMessages', 'localTimezone', 'timezones' and 'specialContributionsName'
 * entries become the properties of the same names.
 *
 * @param Config $config
 * @param Language $language Content language
 * @param LanguageConverterFactory $languageConverterFactory
 * @param LanguageData $languageData
 * @param TitleParser $titleParser
 */
public function __construct(
	Config $config,
	Language $language,
	LanguageConverterFactory $languageConverterFactory,
	LanguageData $languageData,
	TitleParser $titleParser
) {
	// Services
	$this->config = $config;
	$this->language = $language;
	$this->languageConverterFactory = $languageConverterFactory;
	$this->titleParser = $titleParser;

	// Language-specific data, unpacked straight into the matching properties
	[
		'dateFormat' => $this->dateFormat,
		'digits' => $this->digits,
		'contLangMessages' => $this->contLangMessages,
		'localTimezone' => $this->localTimezone,
		'timezones' => $this->timezones,
		'specialContributionsName' => $this->specialContributionsName,
	] = $languageData->getLocalData();
}
86
/**
 * Parse a discussion page.
 *
 * Remembers the root node and title on the instance (other methods read them), then runs
 * the three passes in order: find thread items, group them into threads, and assign
 * IDs and names.
 *
 * @param Element $rootNode Root node of content to parse
 * @param TitleValue $title Title of the page being parsed
 * @return ContentThreadItemSet
 */
public function parse( Element $rootNode, TitleValue $title ): ContentThreadItemSet {
	$this->rootNode = $rootNode;
	$this->title = $title;

	$threadItemSet = $this->buildThreadItems();
	// Both of these work on the set that was just built
	$this->buildThreads( $threadItemSet );
	$this->computeIdsAndNames( $threadItemSet );

	return $threadItemSet;
}
104
/**
 * Return the next leaf node in the tree order that is likely a part of a discussion comment,
 * rather than some boring "separator" element.
 *
 * Currently, this can return a Text node with content other than whitespace, or an Element node
 * that is a "void element" or "text element", except some special cases that we treat as comment
 * separators (isCommentSeparator()).
 *
 * @param ?Node $node Node after which to start searching
 *   (if null, start at the beginning of the document).
 * @return Node
 * @throws RuntimeException If the walker has no current node after advancing
 */
private function nextInterestingLeafNode( ?Node $node ): Node {
	$filter = static function ( $n ) use ( $node ) {
		// Rejecting a node also rejects its descendants. Reject, in this order:
		// - the starting node itself and its children,
		// - elements usually used as separators or headers,
		// - nodes with no rendering that mess up our indentation detection.
		if (
			$n === $node || $n->parentNode === $node ||
			CommentUtils::isCommentSeparator( $n ) ||
			CommentUtils::isRenderingTransparentNode( $n )
		) {
			return NodeFilter::FILTER_REJECT;
		}
		// Anything else is either comment content, or a container we look inside of
		return CommentUtils::isCommentContent( $n ) ?
			NodeFilter::FILTER_ACCEPT :
			NodeFilter::FILTER_SKIP;
	};

	$treeWalker = new TreeWalker(
		$this->rootNode,
		NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
		$filter
	);
	if ( $node ) {
		$treeWalker->currentNode = $node;
	}
	$treeWalker->nextNode();
	// NOTE(review): if TreeWalker follows the DOM specification, currentNode is left unchanged
	// when nextNode() finds nothing, so this guard may never fire - confirm against TreeWalker.
	if ( !$treeWalker->currentNode ) {
		throw new RuntimeException( 'nextInterestingLeafNode not found' );
	}
	return $treeWalker->currentNode;
}
150
/**
 * Build a capturing group that matches any one of the given literal strings.
 *
 * Each value is escaped with preg_quote() for use between '/' delimiters, and the values are
 * joined with the regexp alternation operator.
 *
 * @param string[] $values Values to match
 * @return string Regular expression, e.g. '(Jan|Feb)'
 */
private static function regexpAlternateGroup( array $values ): string {
	$quoted = array_map(
		static fn ( string $x ): string => preg_quote( $x, '/' ),
		$values
	);
	// Must be a bare '|': an escaped pipe would match a literal "|" character instead
	return '(' . implode( '|', $quoted ) . ')';
}
160
/**
 * Look up the text of localisation messages for one content language variant.
 *
 * Values are read from the message data stored on this instance; the result has the same
 * keys and order as $messages.
 *
 * @param string $contLangVariant Content language variant
 * @param string[] $messages Message keys
 * @return string[] Message values
 */
private function getMessages( string $contLangVariant, array $messages ): array {
	return array_map(
		fn ( string $key ) => $this->contLangMessages[$contLangVariant][$key],
		$messages
	);
}
173
/**
 * Get a regexp that matches timestamps generated using the given date format.
 *
 * This only supports format characters that are used by the default date format in any of
 * MediaWiki's languages, namely: D, d, F, G, H, i, j, l, M, n, Y, xg, xkY (and escape characters),
 * and only dates when MediaWiki existed, let's say 2000 onwards (Thai dates before 1941 are
 * complicated).
 *
 * Every name or number in the format becomes one capturing group, in format order; the
 * timezone abbreviation (which must appear in parentheses after a space) is the last group.
 *
 * @param string $contLangVariant Content language variant
 * @param string $format Date format
 * @param string $digitsRegexp Regular expression matching a single localised digit, e.g. '[0-9]'
 * @param array $tzAbbrs Associative array mapping localised timezone abbreviations to
 *   IANA abbreviations, for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ]
 * @return string Regular expression
 */
private function getTimestampRegexp(
	string $contLangVariant, string $format, string $digitsRegexp, array $tzAbbrs
): string {
	// Format codes that expand to one of a fixed list of localised names
	$nameLists = [
		'xg' => Language::MONTH_GENITIVE_MESSAGES,
		'D' => Language::WEEKDAY_ABBREVIATED_MESSAGES,
		'l' => Language::WEEKDAY_MESSAGES,
		'F' => Language::MONTH_MESSAGES,
		'M' => Language::MONTH_ABBREVIATED_MESSAGES,
	];
	// Format codes that expand to digits, mapped to the regexp quantifier for how many
	$digitCounts = [
		'd' => '2',
		'j' => '1,2',
		'm' => '2',
		'n' => '1,2',
		'Y' => '4',
		'xkY' => '4',
		'G' => '1,2',
		'H' => '2',
		'i' => '2',
		's' => '2',
	];
	// Invisible Unicode characters that often sneak into copy-pasted timestamps (T308448, T245784)
	// \uNNNN syntax can only be used from PHP 7.3
	$directionMarks = '[\\x{200E}\\x{200F}]?';

	$formatLength = strlen( $format );
	$s = '';
	// Set by 'xn': the next number uses ASCII digits rather than localised ones
	$raw = false;
	// Adapted from Language::sprintfDate()
	for ( $p = 0; $p < $formatLength; $p++ ) {
		$num = false;
		$code = $format[$p];
		// Two- and three-character codes start with 'x' / 'xk'
		if ( $code === 'x' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}
		if ( $code === 'xk' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}

		if ( isset( $nameLists[$code] ) ) {
			$s .= self::regexpAlternateGroup(
				$this->getMessages( $contLangVariant, $nameLists[$code] )
			);
		} elseif ( isset( $digitCounts[$code] ) ) {
			$num = $digitCounts[$code];
		} else {
			switch ( $code ) {
				case 'xx':
					// Escaped literal "x"
					$s .= 'x';
					break;
				case 'xn':
					$raw = true;
					break;
				case '\\':
					// Backslash escaping
					if ( $p < $formatLength - 1 ) {
						$s .= preg_quote( $format[++$p], '/' );
					} else {
						$s .= preg_quote( '\\', '/' );
					}
					break;
				case '"':
					// Quoted literal
					$endQuote = $p < $formatLength - 1 ? strpos( $format, '"', $p + 1 ) : false;
					if ( $endQuote === false ) {
						// Quote at end of string, or no terminating quote: assume literal "
						$s .= '"';
					} else {
						$s .= preg_quote( substr( $format, $p + 1, $endQuote - $p - 1 ), '/' );
						$p = $endQuote;
					}
					break;
				default:
					// Copy whole characters together, instead of single bytes
					$char = mb_substr( mb_strcut( $format, $p, 4 ), 0, 1 );
					$s .= preg_quote( $char, '/' );
					$p += strlen( $char ) - 1;
			}
		}

		if ( $num !== false ) {
			if ( $raw ) {
				$s .= '([0-9]{' . $num . '})';
				$raw = false;
			} else {
				$s .= '(' . $digitsRegexp . '{' . $num . '})';
			}
		}
		$s .= $directionMarks;
	}

	$tzRegexp = self::regexpAlternateGroup( array_keys( $tzAbbrs ) );

	// Hard-coded parentheses and space like in Parser::pstPass2
	return '/' . $s . ' ' . $directionMarks . '\\(' . $tzRegexp . '\\)/u';
}
317
/**
 * Get a function that parses timestamps generated using the given date format, based on the result
 * of matching the regexp returned by getTimestampRegexp()
 *
 * The returned function takes the match array (with or without PREG_OFFSET_CAPTURE data) and
 * returns either null, when MWTimestamp rejects the resulting date, or an array with keys
 * 'date' (DateTimeImmutable in UTC) and 'warning' (string or null).
 *
 * @param string $contLangVariant Content language variant
 * @param string $format Date format, as used by MediaWiki
 * @param array<int,string>|null $digits Localised digits from 0 to 9, e.g. `[ '0', '1', ..., '9' ]`
 * @param string $localTimezone Local timezone IANA name, e.g. `America/New_York`
 * @param array $tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
 *   for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ]
 * @return callable Parser function
 */
private function getTimestampParser(
	string $contLangVariant, string $format, ?array $digits, string $localTimezone, array $tzAbbrs
): callable {
	// Converts a number written with localised digits to an integer
	$untransformDigits = static fn ( string $text ): int =>
		(int)( $digits ? strtr( $text, array_flip( $digits ) ) : $text );

	// Format codes that produce a capturing group in getTimestampRegexp()
	$capturingCodes = [
		'xg', 'd', 'j', 'D', 'l', 'F', 'M', 'm', 'n', 'Y', 'xkY', 'G', 'H', 'i', 's',
	];

	// Walk the format once to learn which code each capturing group corresponds to
	$formatLength = strlen( $format );
	$matchingGroups = [];
	$p = 0;
	while ( $p < $formatLength ) {
		$code = $format[$p];
		if ( $code === 'x' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}
		if ( $code === 'xk' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}

		if ( in_array( $code, $capturingCodes, true ) ) {
			$matchingGroups[] = $code;
		} elseif ( $code === '\\' ) {
			// Backslash escaping: skip the escaped character
			if ( $p < $formatLength - 1 ) {
				$p++;
			}
		} elseif ( $code === '"' && $p < $formatLength - 1 ) {
			// Quoted literal: skip to the closing quote, if there is one
			$endQuote = strpos( $format, '"', $p + 1 );
			if ( $endQuote !== false ) {
				$p = $endQuote;
			}
		}
		// Everything else ('xx', 'xn', literal characters) produces no group
		$p++;
	}

	return function ( array $match ) use (
		$matchingGroups, $untransformDigits, $localTimezone, $tzAbbrs, $contLangVariant
	) {
		if ( is_array( $match[0] ) ) {
			// Strip PREG_OFFSET_CAPTURE data
			unset( $match['offset'] );
			$match = array_map( static fn ( array $tuple ) => $tuple[0], $match );
		}

		// Message lists to search for each kind of month name
		$monthNameLists = [
			'xg' => Language::MONTH_GENITIVE_MESSAGES,
			'F' => Language::MONTH_MESSAGES,
			'M' => Language::MONTH_ABBREVIATED_MESSAGES,
		];

		$year = $monthIdx = $day = $hour = $minute = 0;
		foreach ( $matchingGroups as $groupIdx => $code ) {
			// Group 0 is the whole match, so groups are offset by one
			$text = $match[$groupIdx + 1];
			switch ( $code ) {
				case 'xg':
				case 'F':
				case 'M':
					$monthIdx = array_search(
						$text,
						$this->getMessages( $contLangVariant, $monthNameLists[$code] ),
						true
					);
					break;
				case 'm':
				case 'n':
					$monthIdx = $untransformDigits( $text ) - 1;
					break;
				case 'd':
				case 'j':
					$day = $untransformDigits( $text );
					break;
				case 'Y':
					$year = $untransformDigits( $text );
					break;
				case 'xkY':
					// Thai year
					$year = $untransformDigits( $text ) - 543;
					break;
				case 'G':
				case 'H':
					$hour = $untransformDigits( $text );
					break;
				case 'i':
					$minute = $untransformDigits( $text );
					break;
				case 'D':
				case 'l':
					// Day of the week - unused
				case 's':
					// Seconds - unused, because most timestamp formats omit them
					break;
				default:
					throw new LogicException( 'Not implemented' );
			}
		}

		// The last matching group is the timezone abbreviation
		$tzAbbr = $tzAbbrs[ end( $match ) ];

		// Most of the time, the timezone abbreviation is not necessary to parse the date, since we
		// can assume all times are in the wiki's local timezone.
		// setTimezone must be called before setDate/setTime
		$date = ( new DateTime() )
			->setTimezone( new DateTimeZone( $localTimezone ) )
			->setDate( $year, $monthIdx + 1, $day )
			->setTime( $hour, $minute, 0 );

		// But during the "fall back" at the end of DST, some times will happen twice.
		// Since the timezone abbreviation disambiguates the DST/non-DST times, we can detect
		// when PHP chose the wrong one, and then try the other one. It appears that PHP always
		// uses the later (non-DST) hour, but that behavior isn't documented, so we account for both.
		$dateWarning = null;
		if ( $date->format( 'T' ) !== $tzAbbr ) {
			$oneHour = new DateInterval( 'PT1H' );
			$altDate = clone $date;
			if ( $date->format( 'I' ) ) {
				// Parsed time is DST, try non-DST by advancing one hour
				$altDate->add( $oneHour );
			} else {
				// Parsed time is non-DST, try DST by going back one hour
				$altDate->sub( $oneHour );
			}
			if ( $altDate->format( 'T' ) === $tzAbbr ) {
				$date = $altDate;
				$dateWarning = 'Timestamp has timezone abbreviation for the wrong time';
			} else {
				$dateWarning = 'Ambiguous time at DST switchover was parsed';
			}
		}

		// Now set the timezone back to UTC for formatting
		$date->setTimezone( new DateTimeZone( 'UTC' ) );
		$date = DateTimeImmutable::createFromMutable( $date );

		// We require the date to be compatible with our libraries, for example zero or negative years (T352455)
		// In PHP we need to check with MWTimestamp.
		// In JS we need to check with Moment.
		try {
			// @phan-suppress-next-line PhanNoopNew
			new MWTimestamp( $date->format( 'c' ) );
		} catch ( TimestampException $ex ) {
			return null;
		}

		return [
			'date' => $date,
			'warning' => $dateWarning,
		];
	};
}
515
/**
 * Get a regexp that matches timestamps in the local date format, for each language variant.
 *
 * This calls getTimestampRegexp() once per variant of the content language, using the date
 * format, digits and timezone abbreviations stored on this instance for that variant.
 *
 * @return string[] Regular expressions, in the same order as the language variants
 */
public function getLocalTimestampRegexps(): array {
	$variants = $this->languageConverterFactory
		->getLanguageConverter( $this->language )
		->getVariants();

	return array_map(
		fn ( $contLangVariant ) => $this->getTimestampRegexp(
			$contLangVariant,
			$this->dateFormat[$contLangVariant],
			// Character class matching any one localised digit
			'[' . implode( '', $this->digits[$contLangVariant] ) . ']',
			$this->timezones[$contLangVariant]
		),
		$variants
	);
}
534
/**
 * Get a function that parses timestamps in the local date format, for each language variant,
 * based on the result of matching the regexp returned by getLocalTimestampRegexps().
 *
 * This calls getTimestampParser() once per variant of the content language, using the date
 * format, digits, local timezone and timezone abbreviations stored on this instance.
 *
 * @return callable[] Parser functions, in the same order as the language variants
 */
private function getLocalTimestampParsers(): array {
	$variants = $this->languageConverterFactory
		->getLanguageConverter( $this->language )
		->getVariants();

	return array_map(
		fn ( $contLangVariant ) => $this->getTimestampParser(
			$contLangVariant,
			$this->dateFormat[$contLangVariant],
			$this->digits[$contLangVariant],
			$this->localTimezone,
			$this->timezones[$contLangVariant]
		),
		$variants
	);
}
555
/**
 * Given a link node (`<a>`), if it's a link to a user-related page, return their username.
 *
 * Recognised targets are pages in the user and user talk namespaces (subpages are ignored)
 * and the contributions special page with exactly one subpage part.
 *
 * @param Element $link
 * @return array|null Array, or null:
 * - string 'username' Username
 * - string|null 'displayName' Display name (link text if link target was in the user namespace)
 */
private function getUsernameFromLink( Element $link ): ?array {
	if ( DOMCompat::getClassList( $link )->contains( 'mw-selflink' ) ) {
		// Selflink: use title of current page
		$title = $this->title;
	} else {
		$titleString = CommentUtils::getTitleFromUrl( $link->getAttribute( 'href' ) ?? '', $this->config ) ?? '';
		// Performance optimization, skip strings that obviously don't contain a namespace
		if ( $titleString === '' || !str_contains( $titleString, ':' ) ) {
			return null;
		}
		$title = $this->parseTitle( $titleString );
		if ( !$title ) {
			return null;
		}
	}

	$username = null;
	$displayName = null;
	$mainText = $title->getText();

	if ( $title->inNamespace( NS_USER ) || $title->inNamespace( NS_USER_TALK ) ) {
		$username = $mainText;
		if ( str_contains( $username, '/' ) ) {
			return null;
		}
		if ( $title->inNamespace( NS_USER ) ) {
			// Use regex trim for consistency with JS implementation
			$text = preg_replace( [ '/^[\s]+/u', '/[\s]+$/u' ], '', $link->textContent ?? '' );
			// Record the display name if it has been customised beyond changing case.
			// Compare against null/'' explicitly: the string '0' is falsy in PHP, but it is
			// a perfectly good display name.
			if (
				$text !== null && $text !== '' &&
				mb_strtolower( $text ) !== mb_strtolower( $username )
			) {
				$displayName = $text;
			}
		}
	} elseif ( $title->inNamespace( NS_SPECIAL ) ) {
		$parts = explode( '/', $mainText );
		if ( count( $parts ) === 2 && $parts[0] === $this->specialContributionsName ) {
			// Normalize the username: users may link to their contributions with an unnormalized name
			$userpage = $this->titleParser->makeTitleValueSafe( NS_USER, $parts[1] );
			if ( !$userpage ) {
				return null;
			}
			$username = $userpage->getText();
		}
	}

	// Explicit comparison rather than !$username, so that the username "0" is not discarded
	if ( $username === null || $username === '' ) {
		return null;
	}
	if ( IPUtils::isIPv6( $username ) ) {
		// Bot-generated links "Preceding unsigned comment added by" have non-standard case
		$username = strtoupper( $username );
	}
	return [
		'username' => $username,
		'displayName' => $displayName,
	];
}
620
/**
 * Find a user signature preceding a timestamp.
 *
 * The signature includes the timestamp node.
 *
 * A signature must contain at least one link to the user's userpage, discussion page or
 * contributions (and may contain other links). The link may be nested in other elements.
 *
 * @param Text $timestampNode
 * @param Node|null $until Node to stop searching at
 * @return array Result, an associative array with the following keys:
 * - Node[] `nodes` Sibling nodes comprising the signature, in reverse order (with
 *   $timestampNode or its parent node as the first element)
 * - string|null `username` Username, null for unsigned comments
 * - string|null `displayName` Customised display name taken from a matching link, if any
 */
private function findSignature( Text $timestampNode, ?Node $until = null ): array {
	$sigUsername = null;
	$sigDisplayName = null;
	// Amount of (trimmed) text walked over so far, in characters
	$length = 0;
	$lastLinkNode = $timestampNode;

	CommentUtils::linearWalkBackwards(
		$timestampNode,
		function ( string $event, Node $node ) use (
			&$sigUsername, &$sigDisplayName, &$lastLinkNode, &$length,
			$until, $timestampNode
		) {
			// Returning true stops the walk
			if ( $event === 'enter' && $node === $until ) {
				return true;
			}
			// self:: rather than static::, because the constant is private and would not
			// resolve through late static binding if this class were ever extended
			if ( $length >= self::SIGNATURE_SCAN_LIMIT ) {
				return true;
			}
			if ( CommentUtils::isBlockElement( $node ) ) {
				// Don't allow reaching into preceding paragraphs
				return true;
			}

			// Everything below only happens when leaving a node
			if ( $event !== 'leave' ) {
				return null;
			}

			if ( $node !== $timestampNode && $node instanceof Text ) {
				$length += mb_strlen( CommentUtils::htmlTrim( $node->textContent ?? '' ) );
			}

			// Find the closest link before timestamp that links to the user's user page.
			//
			// Support timestamps being linked to the diff introducing the comment:
			// if the timestamp node is the only child of a link node, use the link node instead
			//
			// Handle links nested in formatting elements.
			if ( !( $node instanceof Element ) || strtolower( $node->tagName ) !== 'a' ) {
				return null;
			}
			// Generated timestamp links sometimes look like username links (e.g. on user talk pages)
			// so ignore these.
			if ( DOMCompat::getClassList( $node )->contains( 'ext-discussiontools-init-timestamplink' ) ) {
				return null;
			}
			$user = $this->getUsernameFromLink( $node );
			if ( !$user ) {
				// Keep looking if a node with links wasn't a link to a user page
				// "Doc James (talk · contribs · email)"
				return null;
			}
			// Accept the first link to the user namespace, then only accept links to that user
			$sigUsername ??= $user['username'];
			if ( $user['username'] === $sigUsername ) {
				$lastLinkNode = $node;
				// Explicit comparison, so that the display name "0" is not discarded
				if ( $user['displayName'] !== null && $user['displayName'] !== '' ) {
					$sigDisplayName = $user['displayName'];
				}
			}
			return null;
		}
	);

	$range = new ImmutableRange(
		$lastLinkNode->parentNode,
		CommentUtils::childIndexOf( $lastLinkNode ),
		$timestampNode->parentNode,
		CommentUtils::childIndexOf( $timestampNode ) + 1
	);

	// Expand the range so that it covers sibling nodes.
	// This will include any wrapping formatting elements as part of the signature.
	//
	// Helpful accidental feature: users whose signature is not detected in full (due to
	// text formatting) can just wrap it in a <span> to fix that.
	// "Ten Pound Hammer • (What did I screw up now?)"
	// "« Saper // dyskusja »"
	//
	// TODO Not sure if this is actually good, might be better to just use the range...
	$sigNodes = array_reverse( CommentUtils::getCoveredSiblings( $range ) );

	return [
		'nodes' => $sigNodes,
		'username' => $sigUsername,
		'displayName' => $sigDisplayName,
	];
}
719
/**
 * Callback for TreeWalker that will skip over nodes where we don't want to detect
 * comments (or section headings).
 *
 * Rejected: the element with id "toc", quotation elements, elements with the 'mw-notalk'
 * class, reference lists, and any direct child of a heading element.
 *
 * @param Node $node
 * @return int Appropriate NodeFilter constant
 */
public static function acceptOnlyNodesAllowingComments( Node $node ): int {
	if ( $node instanceof Element ) {
		// The table of contents has a heading that gets erroneously detected as a section
		if ( $node->getAttribute( 'id' ) === 'toc' ) {
			return NodeFilter::FILTER_REJECT;
		}
		$tagName = strtolower( $node->tagName );
		// Don't detect comments within quotes (T275881)
		if ( in_array( $tagName, [ 'blockquote', 'cite', 'q' ], true ) ) {
			return NodeFilter::FILTER_REJECT;
		}
		$classList = DOMCompat::getClassList( $node );
		// Don't attempt to parse blocks marked 'mw-notalk'
		if ( $classList->contains( 'mw-notalk' ) ) {
			return NodeFilter::FILTER_REJECT;
		}
		// Don't detect comments within references. We can't add replies to them without bungling up
		// the structure in some cases (T301213), and you're not supposed to do that anyway…
		// <ol class="references"> is the only reliably consistent thing between the two parsers
		if ( $tagName === 'ol' && $classList->contains( 'references' ) ) {
			return NodeFilter::FILTER_REJECT;
		}
	}
	$parentNode = $node->parentNode;
	// Don't detect comments within headings (but don't reject the headings themselves)
	if ( $parentNode instanceof Element && preg_match( '/^h([1-6])$/i', $parentNode->tagName ) ) {
		return NodeFilter::FILTER_REJECT;
	}
	return NodeFilter::FILTER_ACCEPT;
}
764
/**
 * Convert a byte offset within a text node to a unicode codepoint offset
 *
 * Done by counting the characters in the first $byteOffset bytes of the node's value.
 *
 * @param Text $node Text node
 * @param int $byteOffset Byte offset
 * @return int Codepoint offset
 */
private static function getCodepointOffset( Text $node, int $byteOffset ): int {
	$value = $node->nodeValue ?? '';
	$leadingBytes = substr( $value, 0, $byteOffset );
	return mb_strlen( $leadingBytes );
}
775
/**
 * Find a timestamp in a given text node
 *
 * The regexps are tried in order against the node's text (with any directly preceding
 * entity nodes and text nodes prepended); the first one that matches wins.
 *
 * @param Text $node
 * @param string[] $timestampRegexps
 * @return array|null Null if no regexp matched, otherwise an array with the following keys:
 * - int 'offset' Length of extra text preceding the node that was used for matching (in bytes)
 * - int 'parserIndex' Which of the regexps matched
 * - array 'matchData' Regexp match data, which specifies the location of the match,
 *   and which can be parsed using getLocalTimestampParsers() (offsets are in bytes)
 * - ImmutableRange 'range' Range covering the timestamp
 */
public function findTimestamp( Text $node, array $timestampRegexps ): ?array {
	$nodeText = '';
	$offset = 0;
	// Searched nodes (reverse order)
	$nodes = [];

	while ( $node ) {
		$nodeText = $node->nodeValue . $nodeText;
		$nodes[] = $node;

		// In Parsoid HTML, entities are represented as a 'mw:Entity' node, rather than normal HTML
		// entities. On Arabic Wikipedia, the "UTC" timezone name contains some non-breaking spaces,
		// which apparently are often turned into &nbsp; entities by buggy editing tools. To handle
		// this, we must piece together the text, so that our regexp can match those timestamps.
		$entity = $node->previousSibling;
		if ( !( $entity instanceof Element ) || $entity->getAttribute( 'typeof' ) !== 'mw:Entity' ) {
			break;
		}
		$entityText = $entity->firstChild->nodeValue ?? '';
		$nodeText = $entityText . $nodeText;
		$offset += strlen( $entityText );
		$nodes[] = $entity->firstChild;

		// If the entity is preceded by more text, do this again
		$precedingText = $entity->previousSibling;
		if ( !( $precedingText instanceof Text ) ) {
			break;
		}
		$offset += strlen( $precedingText->nodeValue ?? '' );
		$node = $precedingText;
	}

	foreach ( $timestampRegexps as $i => $timestampRegexp ) {
		$matchData = null;
		// Allows us to mimic match.index in #getComments
		if ( !preg_match( $timestampRegexp, $nodeText, $matchData, PREG_OFFSET_CAPTURE ) ) {
			continue;
		}

		$timestampLength = strlen( $matchData[0][0] );
		// Bytes at the end of the last node which aren't part of the match
		$tailLength = strlen( $nodeText ) - $timestampLength - $matchData[0][1];
		// We are moving right to left, but we start to the right of the end of
		// the timestamp if there is trailing garbage, so that is a negative offset.
		$count = -$tailLength;
		$endNode = $nodes[0];
		$endOffset = strlen( $endNode->nodeValue ?? '' ) - $tailLength;

		// Initialised so that the assertions below report a failed search cleanly,
		// instead of reading variables that were never set
		$startNode = null;
		$startOffset = null;
		foreach ( $nodes as $n ) {
			$count += strlen( $n->nodeValue ?? '' );
			// If we have counted to beyond the start of the timestamp, we are in the
			// start node of the timestamp
			if ( $count >= $timestampLength ) {
				$startNode = $n;
				// Offset is how much we overshot the start by
				$startOffset = $count - $timestampLength;
				break;
			}
		}
		Assert::precondition( $endNode instanceof Node, 'endNode of timestamp is a Node' );
		Assert::precondition( $startNode instanceof Node, 'startNode of timestamp range found' );
		Assert::precondition( is_int( $startOffset ), 'startOffset of timestamp range found' );

		// Ranges use codepoint offsets, but everything above was counted in bytes
		$startOffset = static::getCodepointOffset( $startNode, $startOffset );
		$endOffset = static::getCodepointOffset( $endNode, $endOffset );

		$range = new ImmutableRange( $startNode, $startOffset, $endNode, $endOffset );

		return [
			'matchData' => $matchData,
			// Bytes at the start of the first node which aren't part of the match
			// TODO: Remove this and use 'range' instead
			'offset' => $offset,
			'range' => $range,
			'parserIndex' => $i,
		];
	}
	return null;
}
871
/**
 * Build the DOM range covering a signature, from its first node up to the end of its timestamp.
 *
 * The nodes are expected in reverse order: `$sigNodes[0]` is treated as the last node of the
 * signature and the final array element as the first one.
 *
 * @param Node[] $sigNodes Signature nodes, last node first
 * @param array $match Timestamp match as returned by findTimestamp(); the 'matchData' and
 *  'offset' keys are read here
 * @param Text $node Text node in which the timestamp was found
 * @return ImmutableRange
 */
private function adjustSigRange( array $sigNodes, array $match, Text $node ): ImmutableRange {
	$firstSigNode = end( $sigNodes );
	$lastSigNode = $sigNodes[0];

	// Byte position of the end of the timestamp match, relative to the start of $node
	// TODO Document why this needs to be so complicated
	$matchStart = $match['matchData'][0][1];
	$matchLength = strlen( $match['matchData'][0][0] );
	$endByteOffset = $matchStart + $matchLength - $match['offset'];

	if ( $lastSigNode === $node ) {
		// The signature ends inside the timestamp's text node: end the range within that node
		$endContainer = $node;
		$endOffset = static::getCodepointOffset( $node, $endByteOffset );
	} else {
		// Otherwise end the range right after the last signature node, within its parent
		$endContainer = $lastSigNode->parentNode;
		$endOffset = CommentUtils::childIndexOf( $lastSigNode ) + 1;
	}

	return new ImmutableRange(
		$firstSigNode->parentNode,
		CommentUtils::childIndexOf( $firstSigNode ),
		$endContainer,
		$endOffset
	);
}
897
/**
 * Walk the document and collect all headings and comments found in it, in document order.
 *
 * Every h1-h6 element becomes a ContentHeadingItem. Every text node that contains a timestamp
 * and for which a signature can be found becomes a ContentCommentItem, spanning from the end of
 * the previous item up to the end of the "paragraph" containing the signature. If the first item
 * found is a comment, a placeholder heading is inserted before it.
 *
 * @return ContentThreadItemSet
 */
private function buildThreadItems(): ContentThreadItemSet {
	$result = new ContentThreadItemSet();

	$timestampRegexps = $this->getLocalTimestampRegexps();
	$dfParsers = $this->getLocalTimestampParsers();

	// Last node of the previous thread item; the next comment starts after it
	$curCommentEnd = null;

	$treeWalker = new TreeWalker(
		$this->rootNode,
		NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
		[ static::class, 'acceptOnlyNodesAllowingComments' ]
	);
	while ( $node = $treeWalker->nextNode() ) {
		if ( $node instanceof Element && preg_match( '/^h([1-6])$/i', $node->tagName, $match ) ) {
			$headingNode = CommentUtils::getHeadlineNode( $node );
			$range = new ImmutableRange(
				$headingNode, 0, $headingNode, $headingNode->childNodes->length
			);
			$transcludedFrom = $this->computeTranscludedFrom( $range );
			$curComment = new ContentHeadingItem( $range, $transcludedFrom, (int)( $match[ 1 ] ) );
			$curComment->setRootNode( $this->rootNode );
			$result->addThreadItem( $curComment );
			$curCommentEnd = $node;
		} elseif ( $node instanceof Text && ( $match = $this->findTimestamp( $node, $timestampRegexps ) ) ) {
			$warnings = [];
			$foundSignature = $this->findSignature( $node, $curCommentEnd );
			$author = $foundSignature['username'];

			if ( !$author ) {
				// Ignore timestamps for which we couldn't find a signature. It's probably not a real
				// comment, but just a false match due to a copypasted timestamp.
				continue;
			}

			$sigRanges = [];
			$timestampRanges = [];

			$sigRanges[] = $this->adjustSigRange( $foundSignature['nodes'], $match, $node );
			$timestampRanges[] = $match['range'];

			// Everything from the last comment up to here is the next comment
			$startNode = $this->nextInterestingLeafNode( $curCommentEnd );
			$endNode = $foundSignature['nodes'][0];

			// Skip to the end of the "paragraph". This only looks at tag names and can be fooled by CSS, but
			// avoiding that would be more difficult and slower.
			//
			// If this skips over another potential signature, also skip it in the main TreeWalker loop, to
			// avoid generating multiple comments when there is more than one signature on a single "line".
			// Often this is done when someone edits their comment later and wants to add a note about that.
			// (Or when another person corrects a typo, or strikes out a comment, etc.) Multiple comments
			// within one paragraph/list-item result in a confusing double "Reply" button, and we also have
			// no way to indicate which one you're replying to (this might matter in the future for
			// notifications or something).
			CommentUtils::linearWalk(
				$endNode,
				function ( string $event, Node $n ) use (
					&$endNode, &$sigRanges, &$timestampRanges,
					$treeWalker, $timestampRegexps, $node
				) {
					if ( CommentUtils::isBlockElement( $n ) || CommentUtils::isCommentSeparator( $n ) ) {
						// Stop when entering or leaving a block node
						return true;
					}
					if (
						$event === 'leave' &&
						$n instanceof Text && $n !== $node &&
						( $match2 = $this->findTimestamp( $n, $timestampRegexps ) )
					) {
						// If this skips over another potential signature, also skip it in the main TreeWalker loop
						$treeWalker->currentNode = $n;
						// …and add it as another signature to this comment (regardless of the author and timestamp)
						$foundSignature2 = $this->findSignature( $n, $node );
						if ( $foundSignature2['username'] ) {
							$sigRanges[] = $this->adjustSigRange( $foundSignature2['nodes'], $match2, $n );
							$timestampRanges[] = $match2['range'];
						}
					}
					if ( $event === 'leave' ) {
						// Take the last complete node which we skipped past
						$endNode = $n;
					}
				}
			);

			// End offset of the comment range: for a text node, its length in characters ignoring
			// trailing whitespace; for other nodes, the number of children.
			$length = ( $endNode instanceof Text ) ?
				mb_strlen( rtrim( $endNode->nodeValue ?? '', "\t\n\f\r " ) ) :
				// PHP bug: childNodes can be null for comment nodes
				// (it should always be a NodeList, even if the node can't have children)
				( $endNode->childNodes ? $endNode->childNodes->length : 0 );
			$range = new ImmutableRange(
				$startNode->parentNode,
				CommentUtils::childIndexOf( $startNode ),
				$endNode,
				$length
			);
			$transcludedFrom = $this->computeTranscludedFrom( $range );

			$startLevel = CommentUtils::getIndentLevel( $startNode, $this->rootNode ) + 1;
			$endLevel = CommentUtils::getIndentLevel( $node, $this->rootNode ) + 1;
			if ( $startLevel !== $endLevel ) {
				$warnings[] = 'Comment starts and ends with different indentation';
			}
			// Should this use the indent level of $startNode or $node?
			$level = min( $startLevel, $endLevel );

			// Use the parser that belongs to the regexp which matched this timestamp
			$parserResult = $dfParsers[ $match['parserIndex'] ]( $match['matchData'] );
			if ( !$parserResult ) {
				continue;
			}
			[ 'date' => $dateTime, 'warning' => $dateWarning ] = $parserResult;

			if ( $dateWarning ) {
				$warnings[] = $dateWarning;
			}

			$curComment = new ContentCommentItem(
				$level,
				$range,
				$transcludedFrom,
				$sigRanges,
				$timestampRanges,
				$dateTime,
				$author,
				$foundSignature['displayName']
			);
			$curComment->setRootNode( $this->rootNode );
			if ( $warnings ) {
				$curComment->addWarnings( $warnings );
			}
			if ( $result->isEmpty() ) {
				// Add a fake placeholder heading if there are any comments in the 0th section
				// (before the first real heading)
				$range = new ImmutableRange( $this->rootNode, 0, $this->rootNode, 0 );
				$fakeHeading = new ContentHeadingItem( $range, false, null );
				$fakeHeading->setRootNode( $this->rootNode );
				$result->addThreadItem( $fakeHeading );
			}
			$result->addThreadItem( $curComment );
			$curCommentEnd = $curComment->getRange()->endContainer;
		}
	}

	return $result;
}
1044
/**
 * Get the name of the page from which this thread item is transcluded (if any). Replies to
 * transcluded items must be posted on that page, instead of the current one.
 *
 * This is tricky, because we don't want to mark items as transcluded when they're just using a
 * template (e.g. {{ping|…}} or a non-substituted signature template). Sometimes the whole comment
 * can be template-generated (e.g. when using some wrapper templates), but as long as a reply can
 * be added outside of that template, we should not treat it as transcluded.
 *
 * The start/end boundary points of comment ranges and Parsoid transclusion ranges don't line up
 * exactly, even when to a human it's obvious that they cover the same content, making this more
 * complicated.
 *
 * @param ImmutableRange $commentRange Range of the thread item to examine
 * @return string|bool `false` if this item is not transcluded. A string if it's transcluded
 *   from a single page (the page title, in text form with spaces). `true` if it's transcluded, but
 *   we can't determine the source.
 * @throws LogicException If the range comparison yields a result that should be impossible here
 */
public function computeTranscludedFrom( ImmutableRange $commentRange ) {
	// Collapsed ranges should otherwise be impossible, but they're not (T299583)
	// TODO: See if we can fix the root cause, and remove this?
	if ( $commentRange->collapsed ) {
		return false;
	}

	// General approach:
	//
	// Compare the comment range to each transclusion range on the page, and if it overlaps any of
	// them, examine the overlap. There are a few cases:
	//
	// * Comment and transclusion do not overlap:
	//   → Not transcluded.
	// * Comment contains the transclusion:
	//   → Not transcluded (just a template).
	// * Comment is contained within the transclusion:
	//   → Transcluded, we can determine the source page (unless it's a complex transclusion).
	// * Comment and transclusion overlap partially:
	//   → Transcluded, but we can't determine the source page.
	// * Comment (almost) exactly matches the transclusion:
	//   → Maybe transcluded (it could be that the source page only contains that single comment),
	//     maybe not transcluded (it could be a wrapper template that covers a single comment).
	//     This is very sad, and we decide based on the namespace.
	//
	// Most transclusion ranges on the page trivially fall in the "do not overlap" or "contains"
	// cases, and we only have to carefully examine the two transclusion ranges that contain the
	// first and last node of the comment range.
	//
	// To check for almost exact matches, we walk between the relevant boundary points, and if we
	// only find uninteresting nodes (that would be ignored when detecting comments), we treat them
	// like exact matches.

	$startTransclNode = CommentUtils::getTranscludedFromElement(
		CommentUtils::getRangeFirstNode( $commentRange )
	);
	$endTransclNode = CommentUtils::getTranscludedFromElement(
		CommentUtils::getRangeLastNode( $commentRange )
	);

	// We only have to examine the two transclusion ranges that contain the first/last node of the
	// comment range (if they exist). Ignore ranges outside the comment or in the middle of it.
	$transclNodes = [];
	if ( $startTransclNode ) {
		$transclNodes[] = $startTransclNode;
	}
	if ( $endTransclNode && $endTransclNode !== $startTransclNode ) {
		$transclNodes[] = $endTransclNode;
	}

	foreach ( $transclNodes as $transclNode ) {
		// getTransclusionRange() is a private instance method, so call it on $this rather than
		// through late static binding.
		$transclRange = $this->getTransclusionRange( $transclNode );
		$compared = CommentUtils::compareRanges( $commentRange, $transclRange );
		$transclTitles = $this->getTransclusionTitles( $transclNode );
		// Only set when the transclusion consists of exactly one part that has a parseable title
		$simpleTransclTitle = count( $transclTitles ) === 1 && $transclTitles[0] !== null ?
			$this->parseTitle( $transclTitles[0] ) : null;

		switch ( $compared ) {
			case 'equal':
				// Comment (almost) exactly matches the transclusion
				if ( $simpleTransclTitle === null ) {
					// Allow replying to some accidental complex transclusions consisting of only templates
					// and wikitext (T313093)
					if ( count( $transclTitles ) > 1 ) {
						foreach ( $transclTitles as $transclTitleString ) {
							if ( $transclTitleString !== null ) {
								$transclTitle = $this->parseTitle( $transclTitleString );
								if ( $transclTitle && !$transclTitle->inNamespace( NS_TEMPLATE ) ) {
									return true;
								}
							}
						}
						// Continue examining the other ranges.
						break;
					}
					// Multi-template transclusion, or a parser function call, or template-affected wikitext outside
					// of a template call, or a mix of the above
					return true;

				} elseif ( $simpleTransclTitle->inNamespace( NS_TEMPLATE ) ) {
					// Is that a subpage transclusion with a single comment, or a wrapper template
					// transclusion on this page? We don't know, but let's guess based on the namespace.
					// (T289873)
					// Continue examining the other ranges.
					break;
				} elseif ( !$this->titleCanExist( $simpleTransclTitle ) ) {
					// Special page transclusion (T344622) or something else weird. Don't return the title,
					// since it's useless for replying, and can't be stored in the permalink database.
					return true;
				} else {
					Assert::precondition( $transclTitles[0] !== null, "Simple transclusion found" );
					return strtr( $transclTitles[0], '_', ' ' );
				}

			case 'contains':
				// Comment contains the transclusion

				// If the entire transclusion is contained within the comment range, that's just a
				// template. This is the same as a transclusion in the middle of the comment, which we
				// ignored earlier, it just takes us longer to get here in this case.

				// Continue examining the other ranges.
				break;

			case 'contained':
				// Comment is contained within the transclusion
				if ( $simpleTransclTitle === null ) {
					return true;
				} elseif ( !$this->titleCanExist( $simpleTransclTitle ) ) {
					// Special page transclusion (T344622) or something else weird. Don't return the title,
					// since it's useless for replying, and can't be stored in the permalink database.
					return true;
				} else {
					Assert::precondition( $transclTitles[0] !== null, "Simple transclusion found" );
					return strtr( $transclTitles[0], '_', ' ' );
				}

			case 'after':
			case 'before':
				// Comment and transclusion do not overlap

				// This should be impossible, because we ignored these ranges earlier.
				throw new LogicException( 'Unexpected transclusion or comment range' );

			case 'overlapstart':
			case 'overlapend':
				// Comment and transclusion overlap partially
				return true;

			default:
				throw new LogicException( 'Unexpected return value from compareRanges()' );
		}
	}

	// If we got here, the comment range was not contained by or overlapping any of the transclusion
	// ranges. Comment is not transcluded.
	return false;
}
1200
/**
 * Check whether a title is in NS_MAIN or a higher-numbered namespace, is not external, and has
 * non-empty text. computeTranscludedFrom() uses this to avoid returning titles that are useless
 * for replying (e.g. special page transclusions).
 *
 * @param TitleValue $title
 * @return bool
 */
private function titleCanExist( TitleValue $title ): bool {
	if ( $title->getNamespace() < NS_MAIN ) {
		return false;
	}
	if ( $title->isExternal() ) {
		return false;
	}
	return $title->getText() !== '';
}
1206
/**
 * Parse a title string with the injected title parser.
 *
 * @param string $titleString
 * @return TitleValue|null The parsed title, or null if the parser rejects the string as malformed
 */
private function parseTitle( string $titleString ): ?TitleValue {
	try {
		$parsed = $this->titleParser->parseTitle( $titleString );
	} catch ( MalformedTitleException $malformed ) {
		// A malformed title is reported to the caller as "no title"
		$parsed = null;
	}
	return $parsed;
}
1214
/**
 * List the source page title of every part of a transclusion, using null for each part that
 * isn't transcluded from another page.
 *
 * For a node representing a single-page transclusion, the result is an array holding exactly
 * one string.
 *
 * @param Element $node Element carrying the transclusion's `data-mw` attribute
 * @return array<string|null>
 */
private function getTransclusionTitles( Element $node ): array {
	$dataMw = json_decode( $node->getAttribute( 'data-mw' ) ?? '', true );
	$parts = $dataMw['parts'] ?? [];

	$titles = [];
	foreach ( $parts as $part ) {
		// String parts have no template target. The 'href' is also absent when the part is a
		// parser function rather than a template.
		$parsoidHref = is_string( $part ) ? null : ( $part['template']['target']['href'] ?? null );
		if ( $parsoidHref === null ) {
			$titles[] = null;
			continue;
		}

		Assert::precondition( substr( $parsoidHref, 0, 2 ) === './', "href has valid format" );
		// Drop the leading './' and undo the URL-encoding to get the title text
		$titles[] = rawurldecode( substr( $parsoidHref, 2 ) );
	}

	return $titles;
}
1245
/**
 * Compute the range covered by a transclusion, given its first node (e.g. as returned by
 * CommentUtils::getTranscludedFromElement()). The range starts right before that node and ends
 * right after the last following sibling element that shares its 'about' attribute.
 *
 * @param Element $startNode
 * @return ImmutableRange
 */
private function getTransclusionRange( Element $startNode ): ImmutableRange {
	$endNode = $startNode;
	// Extend over consecutive sibling elements belonging to the same transclusion
	while ( true ) {
		$candidate = $endNode->nextSibling;
		if ( !( $candidate instanceof Element ) ) {
			// No more siblings, or the next one is not an element
			break;
		}
		if ( $candidate->getAttribute( 'about' ) !== $endNode->getAttribute( 'about' ) ) {
			break;
		}
		$endNode = $candidate;
	}

	return new ImmutableRange(
		$startNode->parentNode,
		CommentUtils::childIndexOf( $startNode ),
		$endNode->parentNode,
		CommentUtils::childIndexOf( $endNode ) + 1
	);
}
1275
/**
 * Shorten a user generated part of an ID, so that the full ID always fits within a database
 * field of length 255.
 *
 * nb: Spaces in the text should already have been replaced with underscores by this point.
 *
 * @param string $text Text
 * @param bool $legacy Generate legacy ID, not needed in JS implementation. Legacy IDs keep
 *  leading and trailing underscores.
 * @return string Truncated text
 */
private function truncateForId( string $text, bool $legacy = false ): string {
	$shortened = $this->language->truncateForDatabase( $text, 80, '' );
	return $legacy ? $shortened : trim( $shortened, '_' );
}
1292
/**
 * Compute an identifier for a thread item that is unique within the page.
 *
 * @param ContentThreadItem $threadItem
 * @param ContentThreadItemSet $previousItems Items that already have IDs, used to detect clashes
 * @param bool $legacy Generate legacy ID, not needed in JS implementation
 * @return string
 */
private function computeId(
	ContentThreadItem $threadItem, ContentThreadItemSet $previousItems, bool $legacy = false
): string {
	if ( $threadItem instanceof ContentHeadingItem ) {
		$id = 'h-';
		// A placeholder heading's range points to the root node, using it like for real headings
		// results in silly values
		if ( !$threadItem->isPlaceholderHeading() ) {
			$id .= $this->truncateForId( $threadItem->getLinkableId(), $legacy );
		}
	} elseif ( $threadItem instanceof ContentCommentItem ) {
		$authorPart = $this->truncateForId( str_replace( ' ', '_', $threadItem->getAuthor() ), $legacy );
		$id = 'c-' . $authorPart . '-' . $threadItem->getTimestampString();
	} else {
		throw new InvalidArgumentException( 'Unknown ThreadItem type' );
	}

	// Several comments could end up with the same ID (i.e. the user left multiple comments in one
	// edit, or within a minute), so append the parent's ID to tell them apart.
	$parentItem = $threadItem->getParent();
	if ( $parentItem instanceof ContentHeadingItem && !$parentItem->isPlaceholderHeading() ) {
		$id .= '-' . $this->truncateForId( $parentItem->getLinkableId(), $legacy );
	} elseif ( $parentItem instanceof ContentCommentItem ) {
		$parentAuthorPart = $this->truncateForId( str_replace( ' ', '_', $parentItem->getAuthor() ), $legacy );
		$id .= '-' . $parentAuthorPart . '-' . $parentItem->getTimestampString();
	}

	if ( $threadItem instanceof ContentHeadingItem ) {
		// To avoid old threads re-appearing on popular pages when someone uses a vague title
		// (e.g. dozens of threads titled "question" on [[Wikipedia:Help desk]]: https://w.wiki/fbN),
		// include the oldest timestamp in the thread (i.e. date the thread was started) in the
		// heading ID.
		$firstReply = $threadItem->getOldestReply();
		if ( $firstReply ) {
			$id .= '-' . $firstReply->getTimestampString();
		}
	}

	if ( $previousItems->findCommentById( $id ) ) {
		// Well, that's tough
		if ( !$legacy ) {
			$threadItem->addWarning( 'Duplicate comment ID' );
		}
		// As a last resort, append the first free sequential number, to allow replying to both comments
		$number = 0;
		do {
			$number++;
			$numberedId = "$id-$number";
		} while ( $previousItems->findCommentById( $numberedId ) );
		$id = $numberedId;
	}

	return $id;
}
1354
/**
 * Compute an identifier for a thread item that stays the same across all pages and revisions
 * where this comment might appear.
 *
 * Names are not unique: multiple comments on a page can share one. Use the ID to distinguish
 * them.
 *
 * @param ContentThreadItem $threadItem
 * @return string
 */
private function computeName( ContentThreadItem $threadItem ): string {
	if ( $threadItem instanceof ContentHeadingItem ) {
		$prefix = 'h-';
		// A heading is named after its oldest reply, if it has one
		$mainComment = $threadItem->getOldestReply();
	} elseif ( $threadItem instanceof ContentCommentItem ) {
		$prefix = 'c-';
		$mainComment = $threadItem;
	} else {
		throw new InvalidArgumentException( 'Unknown ThreadItem type' );
	}

	if ( !$mainComment ) {
		return $prefix;
	}

	$authorPart = $this->truncateForId( str_replace( ' ', '_', $mainComment->getAuthor() ) );
	return $prefix . $authorPart . '-' . $mainComment->getTimestampString();
}
1381
/**
 * Connect the thread items into trees by setting their parents and replies, based on the
 * indentation level of comments and the heading level of headings.
 *
 * @param ContentThreadItemSet $result Set whose items are linked together in place
 */
private function buildThreads( ContentThreadItemSet $result ): void {
	$previousHeading = null;
	// Most recent thread item seen at each indentation level, indexed by level
	$itemsByLevel = [];

	foreach ( $result->getThreadItems() as $threadItem ) {
		if ( count( $itemsByLevel ) < $threadItem->getLevel() ) {
			// Someone skipped an indentation level (or several). Pretend that the previous reply
			// covers multiple indentation levels, so that following comments get connected to it.
			$threadItem->addWarning( 'Comment skips indentation level' );
			$itemsByLevel = array_pad( $itemsByLevel, $threadItem->getLevel(), end( $itemsByLevel ) );
		}

		if ( $threadItem instanceof ContentHeadingItem ) {
			// New root (thread)
			// Attach as a sub-thread to preceding higher-level heading.
			// Any replies will appear in the tree twice, under the main-thread and the sub-thread.
			$candidate = $previousHeading;
			while ( $candidate && $candidate->getHeadingLevel() >= $threadItem->getHeadingLevel() ) {
				$candidate = $candidate->getParent();
			}
			if ( $candidate ) {
				$threadItem->setParent( $candidate );
				$candidate->addReply( $threadItem );
			}
			$previousHeading = $threadItem;
		} elseif ( isset( $itemsByLevel[ $threadItem->getLevel() - 1 ] ) ) {
			// Add as a reply to the closest less-nested comment
			$threadItem->setParent( $itemsByLevel[ $threadItem->getLevel() - 1 ] );
			$threadItem->getParent()->addReply( $threadItem );
		} else {
			$threadItem->addWarning( 'Comment could not be connected to a thread' );
		}

		$itemsByLevel[ $threadItem->getLevel() ] = $threadItem;
		// Cut off more deeply nested replies
		array_splice( $itemsByLevel, $threadItem->getLevel() + 1 );
	}
}
1422
/**
 * Assign the IDs and names used to refer to comments and headings.
 * This is done in a separate pass, because the list of replies isn't available before this
 * point.
 *
 * @param ContentThreadItemSet $result Set whose items are updated in place
 */
private function computeIdsAndNames( ContentThreadItemSet $result ): void {
	foreach ( $result->getThreadItems() as $item ) {
		$item->setName( $this->computeName( $item ) );

		$id = $this->computeId( $item, $result );
		$item->setId( $id );

		// Only store the legacy ID when it differs from the current one
		$legacyId = $this->computeId( $item, $result, true );
		if ( $legacyId !== $id ) {
			$item->setLegacyId( $legacyId );
		}

		$result->updateIdAndNameMaps( $item );
	}
}
1443	}