Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
94.11% |
655 / 696 |
|
65.38% |
17 / 26 |
CRAP | |
0.00% |
0 / 1 |
CommentParser | |
94.11% |
655 / 696 |
|
65.38% |
17 / 26 |
261.67 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
1 | |||
parse | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
nextInterestingLeafNode | |
95.45% |
21 / 22 |
|
0.00% |
0 / 1 |
8 | |||
regexpAlternateGroup | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getMessages | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getTimestampRegexp | |
88.54% |
85 / 96 |
|
0.00% |
0 / 1 |
32.45 | |||
getTimestampParser | |
93.85% |
122 / 130 |
|
0.00% |
0 / 1 |
52.63 | |||
getLocalTimestampRegexps | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
getLocalTimestampParsers | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
getUsernameFromLink | |
97.06% |
33 / 34 |
|
0.00% |
0 / 1 |
17 | |||
findSignature | |
100.00% |
44 / 44 |
|
100.00% |
1 / 1 |
16 | |||
acceptOnlyNodesAllowingComments | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
11 | |||
getCodepointOffset | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
findTimestamp | |
100.00% |
45 / 45 |
|
100.00% |
1 / 1 |
11 | |||
adjustSigRange | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
3 | |||
buildThreadItems | |
98.96% |
95 / 96 |
|
0.00% |
0 / 1 |
22 | |||
computeTranscludedFrom | |
69.23% |
36 / 52 |
|
0.00% |
0 / 1 |
45.69 | |||
titleCanExist | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
parseTitle | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getTransclusionTitles | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
getTransclusionRange | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
5 | |||
truncateForId | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
computeId | |
96.30% |
26 / 27 |
|
0.00% |
0 / 1 |
13 | |||
computeName | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
4.01 | |||
buildThreads | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
9 | |||
computeIdsAndNames | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\DiscussionTools; |
4 | |
5 | use DateInterval; |
6 | use DateTime; |
7 | use DateTimeImmutable; |
8 | use DateTimeZone; |
9 | use InvalidArgumentException; |
10 | use Language; |
11 | use LogicException; |
12 | use MediaWiki\Config\Config; |
13 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentCommentItem; |
14 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentHeadingItem; |
15 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentThreadItem; |
16 | use MediaWiki\Languages\LanguageConverterFactory; |
17 | use MediaWiki\Title\MalformedTitleException; |
18 | use MediaWiki\Title\TitleParser; |
19 | use MediaWiki\Title\TitleValue; |
20 | use MediaWiki\Utils\MWTimestamp; |
21 | use RuntimeException; |
22 | use Wikimedia\Assert\Assert; |
23 | use Wikimedia\IPUtils; |
24 | use Wikimedia\Parsoid\DOM\Element; |
25 | use Wikimedia\Parsoid\DOM\Node; |
26 | use Wikimedia\Parsoid\DOM\Text; |
27 | use Wikimedia\Parsoid\Utils\DOMCompat; |
28 | use Wikimedia\Timestamp\TimestampException; |
29 | |
30 | // TODO consider making timestamp parsing not a returned function |
31 | |
32 | class CommentParser { |
33 | |
/**
 * How far backwards we look for a signature associated with a timestamp before giving up.
 * Note that this is not a hard limit on the length of signatures we detect.
 */
private const SIGNATURE_SCAN_LIMIT = 100;

// Services received through the constructor
private Config $config;
private Language $language;
private LanguageConverterFactory $languageConverterFactory;
private TitleParser $titleParser;

// Local data, copied by the constructor from LanguageData::getLocalData()
/** @var string[] */
private array $dateFormat;
/** @var string[][] */
private array $digits;
/** @var string[][] Indexed by content language variant, then by message key (see getMessages()) */
private array $contLangMessages;
private string $localTimezone;
/** @var string[][] */
private array $timezones;
private string $specialContributionsName;

// Inputs of the current parse() call
private Element $rootNode;
private TitleValue $title;
58 | |
/**
 * Store the injected services and unpack the local data provided by LanguageData.
 *
 * @param Config $config
 * @param Language $language Content language
 * @param LanguageConverterFactory $languageConverterFactory
 * @param LanguageData $languageData Its getLocalData() result supplies the date format, digits,
 *  content language messages, local timezone, timezones and Special:Contributions name fields
 * @param TitleParser $titleParser
 */
public function __construct(
	Config $config,
	Language $language,
	LanguageConverterFactory $languageConverterFactory,
	LanguageData $languageData,
	TitleParser $titleParser
) {
	$this->titleParser = $titleParser;
	$this->languageConverterFactory = $languageConverterFactory;
	$this->language = $language;
	$this->config = $config;

	// Each key of the local data maps onto the property of the same name
	[
		'dateFormat' => $this->dateFormat,
		'digits' => $this->digits,
		'contLangMessages' => $this->contLangMessages,
		'localTimezone' => $this->localTimezone,
		'timezones' => $this->timezones,
		'specialContributionsName' => $this->specialContributionsName
	] = $languageData->getLocalData();
}
86 | |
/**
 * Parse a discussion page.
 *
 * The thread items are built first, then arranged into threads, and finally given their
 * IDs and names.
 *
 * @param Element $rootNode Root node of content to parse
 * @param TitleValue $title Title of the page being parsed
 * @return ContentThreadItemSet
 */
public function parse( Element $rootNode, TitleValue $title ): ContentThreadItemSet {
	// Keep the inputs on the instance, where the helper methods read them from
	// (e.g. nextInterestingLeafNode() uses the root node)
	$this->title = $title;
	$this->rootNode = $rootNode;

	$threadItemSet = $this->buildThreadItems();
	$this->buildThreads( $threadItemSet );
	$this->computeIdsAndNames( $threadItemSet );

	return $threadItemSet;
}
104 | |
/**
 * Return the next leaf node in the tree order that is likely a part of a discussion comment,
 * rather than some boring "separator" element.
 *
 * Currently, this can return a Text node with content other than whitespace, or an Element node
 * that is a "void element" or "text element", except some special cases that we treat as comment
 * separators (isCommentSeparator()).
 *
 * @param ?Node $node Node after which to start searching
 *   (if null, start at the beginning of the document).
 * @return Node
 * @throws RuntimeException If the tree walker has no current node after advancing
 */
private function nextInterestingLeafNode( ?Node $node ): Node {
	$filter = static function ( $candidate ) use ( $node ) {
		// Rejecting a node also excludes everything below it. We reject:
		// - the starting node and its descendants,
		// - elements usually used as separators or headers,
		// - nodes with no rendering that mess up our indentation detection.
		$isStartOrItsChild = $candidate === $node || $candidate->parentNode === $node;
		if (
			$isStartOrItsChild ||
			CommentUtils::isCommentSeparator( $candidate ) ||
			CommentUtils::isRenderingTransparentNode( $candidate )
		) {
			return NodeFilter::FILTER_REJECT;
		}

		// Anything else is either comment content, or something to look inside of
		return CommentUtils::isCommentContent( $candidate )
			? NodeFilter::FILTER_ACCEPT
			: NodeFilter::FILTER_SKIP;
	};

	$walker = new TreeWalker(
		$this->rootNode,
		NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
		$filter
	);
	if ( $node ) {
		// Continue from the given node instead of from the root
		$walker->currentNode = $node;
	}
	$walker->nextNode();

	$found = $walker->currentNode;
	if ( !$found ) {
		throw new RuntimeException( 'nextInterestingLeafNode not found' );
	}
	return $found;
}
150 | |
/**
 * Build a capturing group matching any one of the given literal strings.
 *
 * @param string[] $values Values to match; each one is escaped for use between '/' delimiters
 * @return string Regular expression fragment, e.g. '(foo|bar)'
 */
private static function regexpAlternateGroup( array $values ): string {
	$escaped = array_map(
		static fn ( string $value ): string => preg_quote( $value, '/' ),
		$values
	);
	return '(' . implode( '|', $escaped ) . ')';
}
160 | |
/**
 * Look up the text of localisation messages in the given content language variant.
 *
 * @param string $contLangVariant Content language variant
 * @param string[] $messages Message keys
 * @return string[] Message values, under the same array keys as the requested message keys
 */
private function getMessages( string $contLangVariant, array $messages ): array {
	$lookup = fn ( string $key ) => $this->contLangMessages[$contLangVariant][$key];
	return array_map( $lookup, $messages );
}
173 | |
/**
 * Get a regexp that matches timestamps generated using the given date format.
 *
 * This only supports format characters that are used by the default date format in any of
 * MediaWiki's languages, namely: D, d, F, G, H, i, j, l, M, n, Y, xg, xkY (and escape characters),
 * and only dates when MediaWiki existed, let's say 2000 onwards (Thai dates before 1941 are
 * complicated).
 *
 * Every name or number field of the format becomes one capturing group, in format order, and
 * the timezone abbreviation becomes the last capturing group.
 *
 * @param string $contLangVariant Content language variant
 * @param string $format Date format
 * @param string $digitsRegexp Regular expression matching a single localised digit, e.g. '[0-9]'
 * @param array $tzAbbrs Associative array mapping localised timezone abbreviations to
 *   IANA abbreviations, for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ]
 * @return string Regular expression
 */
private function getTimestampRegexp(
	string $contLangVariant, string $format, string $digitsRegexp, array $tzAbbrs
): string {
	// Optionally matches one of the invisible Unicode characters that often sneak into
	// copy-pasted timestamps (T308448, T245784), written as PCRE \x{...} escapes
	$invisibleMark = '[\\x{200E}\\x{200F}]?';

	$formatLength = strlen( $format );
	$pattern = '';
	// Set by 'xn': the next number field is matched with ASCII digits, not localised ones
	$asciiDigitsNext = false;

	// Adapted from Language::sprintfDate()
	$pos = 0;
	while ( $pos < $formatLength ) {
		// Read one format code: a single byte, or an 'x…' / 'xk…' sequence
		$code = $format[$pos];
		if ( $code === 'x' && $pos < $formatLength - 1 ) {
			$code .= $format[++$pos];
		}
		if ( $code === 'xk' && $pos < $formatLength - 1 ) {
			$code .= $format[++$pos];
		}

		// A code yields at most one of: a group of message texts, or a group of digits
		$messageKeys = null;
		$digitCount = null;

		switch ( $code ) {
			case 'xx':
				$pattern .= 'x';
				break;
			case 'xn':
				$asciiDigitsNext = true;
				break;

			// Names, matched against the content language messages
			case 'xg':
				$messageKeys = Language::MONTH_GENITIVE_MESSAGES;
				break;
			case 'D':
				$messageKeys = Language::WEEKDAY_ABBREVIATED_MESSAGES;
				break;
			case 'l':
				$messageKeys = Language::WEEKDAY_MESSAGES;
				break;
			case 'F':
				$messageKeys = Language::MONTH_MESSAGES;
				break;
			case 'M':
				$messageKeys = Language::MONTH_ABBREVIATED_MESSAGES;
				break;

			// Numbers, grouped by the number of digits they may have
			case 'd':
			case 'm':
			case 'H':
			case 'i':
			case 's':
				$digitCount = '2';
				break;
			case 'j':
			case 'n':
			case 'G':
				$digitCount = '1,2';
				break;
			case 'Y':
			case 'xkY':
				$digitCount = '4';
				break;

			case '\\':
				// Backslash escaping: the next byte is literal (or the backslash itself at the end)
				$pattern .= preg_quote( $pos < $formatLength - 1 ? $format[++$pos] : '\\', '/' );
				break;
			case '"':
				// Quoted literal; without a terminating quote, assume a literal "
				$endQuote = $pos < $formatLength - 1 ? strpos( $format, '"', $pos + 1 ) : false;
				if ( $endQuote === false ) {
					$pattern .= '"';
				} else {
					$pattern .= preg_quote( substr( $format, $pos + 1, $endQuote - $pos - 1 ), '/' );
					$pos = $endQuote;
				}
				break;
			default:
				// Copy whole characters together, instead of single bytes
				$char = mb_substr( mb_strcut( $format, $pos, 4 ), 0, 1 );
				$pattern .= preg_quote( $char, '/' );
				$pos += strlen( $char ) - 1;
		}

		if ( $messageKeys !== null ) {
			$pattern .= static::regexpAlternateGroup(
				$this->getMessages( $contLangVariant, $messageKeys )
			);
		} elseif ( $digitCount !== null ) {
			$digitClass = $asciiDigitsNext ? '[0-9]' : $digitsRegexp;
			$pattern .= '(' . $digitClass . '{' . $digitCount . '})';
			// 'xn' only affects a single number
			$asciiDigitsNext = false;
		}

		$pattern .= $invisibleMark;
		$pos++;
	}

	$tzRegexp = static::regexpAlternateGroup( array_keys( $tzAbbrs ) );

	// Hard-coded parentheses and space like in Parser::pstPass2
	return '/' . $pattern . ' ' . $invisibleMark . '\\(' . $tzRegexp . '\\)/u';
}
317 | |
318 | /** |
319 | * Get a function that parses timestamps generated using the given date format, based on the result |
320 | * of matching the regexp returned by getTimestampRegexp() |
321 | * |
322 | * @param string $contLangVariant Content language variant |
323 | * @param string $format Date format, as used by MediaWiki |
324 | * @param array<int,string>|null $digits Localised digits from 0 to 9, e.g. `[ '0', '1', ..., '9' ]` |
325 | * @param string $localTimezone Local timezone IANA name, e.g. `America/New_York` |
326 | * @param array $tzAbbrs Map of localised timezone abbreviations to IANA abbreviations |
327 | * for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ] |
328 | * @return callable Parser function |
329 | */ |
330 | private function getTimestampParser( |
331 | string $contLangVariant, string $format, ?array $digits, string $localTimezone, array $tzAbbrs |
332 | ): callable { |
333 | $untransformDigits = static function ( string $text ) use ( $digits ): int { |
334 | return (int)( $digits ? strtr( $text, array_flip( $digits ) ) : $text ); |
335 | }; |
336 | |
337 | $formatLength = strlen( $format ); |
338 | $matchingGroups = []; |
339 | for ( $p = 0; $p < $formatLength; $p++ ) { |
340 | $code = $format[$p]; |
341 | if ( $code === 'x' && $p < $formatLength - 1 ) { |
342 | $code .= $format[++$p]; |
343 | } |
344 | if ( $code === 'xk' && $p < $formatLength - 1 ) { |
345 | $code .= $format[++$p]; |
346 | } |
347 | |
348 | switch ( $code ) { |
349 | case 'xx': |
350 | case 'xn': |
351 | break; |
352 | case 'xg': |
353 | case 'd': |
354 | case 'j': |
355 | case 'D': |
356 | case 'l': |
357 | case 'F': |
358 | case 'M': |
359 | case 'm': |
360 | case 'n': |
361 | case 'Y': |
362 | case 'xkY': |
363 | case 'G': |
364 | case 'H': |
365 | case 'i': |
366 | case 's': |
367 | $matchingGroups[] = $code; |