Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
95.75% |
586 / 612 |
|
61.90% |
13 / 21 |
CRAP | |
0.00% |
0 / 1 |
CommentParser | |
95.75% |
586 / 612 |
|
61.90% |
13 / 21 |
201 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
1 | |||
parse | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
nextInterestingLeafNode | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
8 | |||
regexpAlternateGroup | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getMessages | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getTimestampRegexp | |
88.54% |
85 / 96 |
|
0.00% |
0 / 1 |
32.45 | |||
getTimestampParser | |
93.33% |
126 / 135 |
|
0.00% |
0 / 1 |
51.77 | |||
getLocalTimestampRegexps | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
getLocalTimestampParsers | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
getUsernameFromLink | |
96.97% |
32 / 33 |
|
0.00% |
0 / 1 |
15 | |||
findSignature | |
100.00% |
42 / 42 |
|
100.00% |
1 / 1 |
15 | |||
acceptOnlyNodesAllowingComments | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
11 | |||
getCodepointOffset | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
findTimestamp | |
100.00% |
45 / 45 |
|
100.00% |
1 / 1 |
11 | |||
adjustSigRange | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
3 | |||
buildThreadItems | |
98.95% |
94 / 95 |
|
0.00% |
0 / 1 |
21 | |||
truncateForId | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
computeId | |
96.67% |
29 / 30 |
|
0.00% |
0 / 1 |
12 | |||
computeName | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
4.01 | |||
buildThreads | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
9 | |||
computeIdsAndNames | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\DiscussionTools; |
4 | |
5 | use Config; |
6 | use DateInterval; |
7 | use DateTime; |
8 | use DateTimeImmutable; |
9 | use DateTimeZone; |
10 | use InvalidArgumentException; |
11 | use Language; |
12 | use LogicException; |
13 | use MalformedTitleException; |
14 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentCommentItem; |
15 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentHeadingItem; |
16 | use MediaWiki\Extension\DiscussionTools\ThreadItem\ContentThreadItem; |
17 | use MediaWiki\Languages\LanguageConverterFactory; |
18 | use RuntimeException; |
19 | use TitleParser; |
20 | use TitleValue; |
21 | use Wikimedia\Assert\Assert; |
22 | use Wikimedia\IPUtils; |
23 | use Wikimedia\Parsoid\DOM\Element; |
24 | use Wikimedia\Parsoid\DOM\Node; |
25 | use Wikimedia\Parsoid\DOM\Text; |
26 | use Wikimedia\Parsoid\Utils\DOMCompat; |
27 | |
28 | // TODO consider making timestamp parsing not a returned function |
29 | |
30 | class CommentParser { |
31 | |
32 | /** |
33 | * How far backwards we look for a signature associated with a timestamp before giving up. |
34 | * Note that this is not a hard limit on the length of signatures we detect. |
35 | */ |
36 | private const SIGNATURE_SCAN_LIMIT = 100; |
37 | |
38 | private Config $config; |
39 | private Language $language; |
40 | private LanguageConverterFactory $languageConverterFactory; |
41 | private TitleParser $titleParser; |
42 | |
43 | private $dateFormat; |
44 | private $digits; |
45 | /** @var string[][] */ |
46 | private $contLangMessages; |
47 | private $localTimezone; |
48 | private $timezones; |
49 | private $specialContributionsName; |
50 | |
51 | private Element $rootNode; |
52 | private TitleValue $title; |
53 | |
54 | /** |
55 | * @param Config $config |
56 | * @param Language $language Content language |
57 | * @param LanguageConverterFactory $languageConverterFactory |
58 | * @param LanguageData $languageData |
59 | * @param TitleParser $titleParser |
60 | */ |
public function __construct(
	Config $config,
	Language $language,
	LanguageConverterFactory $languageConverterFactory,
	LanguageData $languageData,
	TitleParser $titleParser
) {
	// Keep the injected collaborators for later use by the parsing methods
	$this->titleParser = $titleParser;
	$this->languageConverterFactory = $languageConverterFactory;
	$this->language = $language;
	$this->config = $config;

	// Copy each entry of the local language data into the property of the same name
	[
		'dateFormat' => $this->dateFormat,
		'digits' => $this->digits,
		'contLangMessages' => $this->contLangMessages,
		'localTimezone' => $this->localTimezone,
		'timezones' => $this->timezones,
		'specialContributionsName' => $this->specialContributionsName,
	] = $languageData->getLocalData();
}
81 | |
82 | /** |
83 | * Parse a discussion page. |
84 | * |
85 | * @param Element $rootNode Root node of content to parse |
86 | * @param TitleValue $title Title of the page being parsed |
87 | * @return ContentThreadItemSet |
88 | */ |
public function parse( Element $rootNode, TitleValue $title ): ContentThreadItemSet {
	// The inputs are kept on the instance; helpers such as nextInterestingLeafNode()
	// read $this->rootNode rather than taking it as an argument.
	$this->title = $title;
	$this->rootNode = $rootNode;

	$threadItemSet = $this->buildThreadItems();

	// The return values of these two passes are not used here.
	// NOTE(review): presumably they update $threadItemSet in place - confirm against their definitions.
	$this->buildThreads( $threadItemSet );
	$this->computeIdsAndNames( $threadItemSet );

	return $threadItemSet;
}
99 | |
100 | /** |
101 | * Return the next leaf node in the tree order that is likely a part of a discussion comment, |
102 | * rather than some boring "separator" element. |
103 | * |
104 | * Currently, this can return a Text node with content other than whitespace, or an Element node |
105 | * that is a "void element" or "text element", except some special cases that we treat as comment |
106 | * separators (isCommentSeparator()). |
107 | * |
108 | * @param Node $node Node to start searching at. This node's children are ignored. |
109 | * @return Node |
110 | */ |
private function nextInterestingLeafNode( Node $node ): Node {
	$root = $this->rootNode;
	// The starting node and its children are excluded from the search, unless the starting
	// node is the root node (this is a special case for "fakeHeading" handling).
	$excludeStart = $node !== $root;

	$filter = static function ( $candidate ) use ( $node, $excludeStart ) {
		if ( $excludeStart && ( $candidate === $node || $candidate->parentNode === $node ) ) {
			return NodeFilter::FILTER_REJECT;
		}
		// Reject elements usually used as separators or headers, and nodes with no rendering
		// that mess up our indentation detection. Rejecting also discards their descendants.
		if (
			CommentUtils::isCommentSeparator( $candidate ) ||
			CommentUtils::isRenderingTransparentNode( $candidate )
		) {
			return NodeFilter::FILTER_REJECT;
		}
		// Anything else is either comment content, or is skipped while its children are still visited
		return CommentUtils::isCommentContent( $candidate )
			? NodeFilter::FILTER_ACCEPT
			: NodeFilter::FILTER_SKIP;
	};

	$walker = new TreeWalker(
		$root,
		NodeFilter::SHOW_ELEMENT | NodeFilter::SHOW_TEXT,
		$filter
	);
	$walker->currentNode = $node;
	$walker->nextNode();

	$found = $walker->currentNode;
	if ( !$found ) {
		throw new RuntimeException( 'nextInterestingLeafNode not found' );
	}
	return $found;
}
143 | |
144 | /** |
145 | * @param string[] $values Values to match |
146 | * @return string Regular expression |
147 | */ |
private static function regexpAlternateGroup( array $values ): string {
	// Escape every value (including the "/" delimiter) so that it can only match literally
	$escaped = array_map(
		static fn ( string $value ) => preg_quote( $value, '/' ),
		$values
	);
	// Join the alternatives and wrap them in a single capturing group
	$alternation = implode( '|', $escaped );
	return '(' . $alternation . ')';
}
153 | |
154 | /** |
155 | * Get text of localisation messages in content language. |
156 | * |
157 | * @param string $contLangVariant Content language variant |
158 | * @param string[] $messages Message keys |
159 | * @return string[] Message values |
160 | */ |
private function getMessages( string $contLangVariant, array $messages ): array {
	// Look each key up in the message table of the requested variant.
	// The result keeps the keys and order of $messages.
	$lookup = fn ( string $key ) => $this->contLangMessages[$contLangVariant][$key];

	return array_map( $lookup, $messages );
}
166 | |
167 | /** |
168 | * Get a regexp that matches timestamps generated using the given date format. |
169 | * |
170 | * This only supports format characters that are used by the default date format in any of |
171 | * MediaWiki's languages, namely: D, d, F, G, H, i, j, l, M, n, Y, xg, xkY (and escape characters), |
172 | * and only dates when MediaWiki existed, let's say 2000 onwards (Thai dates before 1941 are |
173 | * complicated). |
174 | * |
175 | * @param string $contLangVariant Content language variant |
176 | * @param string $format Date format |
177 | * @param string $digitsRegexp Regular expression matching a single localised digit, e.g. '[0-9]' |
178 | * @param array $tzAbbrs Associative array mapping localised timezone abbreviations to |
179 | * IANA abbreviations, for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ] |
180 | * @return string Regular expression |
181 | */ |
private function getTimestampRegexp(
	string $contLangVariant, string $format, string $digitsRegexp, array $tzAbbrs
): string {
	// Walk $format one format code at a time and append the matching regexp fragment to $s.
	// Every date component (numeric or named) contributes exactly one capturing group, in
	// format order; literal text contributes none. A final group matches the timezone
	// abbreviation.
	$formatLength = strlen( $format );
	$s = '';
	// Set by the 'xn' code: the next numeric component is matched with ASCII digits
	// instead of $digitsRegexp, and the flag is then cleared again.
	$raw = false;
	// Adapted from Language::sprintfDate()
	for ( $p = 0; $p < $formatLength; $p++ ) {
		// Repetition count for a numeric component ('2', '1,2' or '4'), false for anything else
		$num = false;
		$code = $format[$p];
		// Codes starting with 'x' are two bytes long, and those starting with 'xk' are three
		if ( $code === 'x' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}
		if ( $code === 'xk' && $p < $formatLength - 1 ) {
			$code .= $format[++$p];
		}

		// The private static helper is called through self:: rather than static::, so that the
		// call can never be late-bound to a same-named method declared by a subclass.
		switch ( $code ) {
			case 'xx':
				// Escaped literal 'x'
				$s .= 'x';
				break;
			case 'xn':
				$raw = true;
				break;
			case 'xg':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::MONTH_GENITIVE_MESSAGES )
				);
				break;
			case 'D':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::WEEKDAY_ABBREVIATED_MESSAGES )
				);
				break;
			case 'l':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::WEEKDAY_MESSAGES )
				);
				break;
			case 'F':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::MONTH_MESSAGES )
				);
				break;
			case 'M':
				$s .= self::regexpAlternateGroup(
					$this->getMessages( $contLangVariant, Language::MONTH_ABBREVIATED_MESSAGES )
				);
				break;
			case 'd':
			case 'm':
			case 'H':
			case 'i':
			case 's':
				// Components matched as exactly two digits
				$num = '2';
				break;
			case 'j':
			case 'n':
			case 'G':
				// Components matched as one or two digits
				$num = '1,2';
				break;
			case 'Y':
			case 'xkY':
				// Components matched as exactly four digits
				$num = '4';
				break;
			case '\\':
				// Backslash escaping
				if ( $p < $formatLength - 1 ) {
					$s .= preg_quote( $format[++$p], '/' );
				} else {
					$s .= preg_quote( '\\', '/' );
				}
				break;
			case '"':
				// Quoted literal
				if ( $p < $formatLength - 1 ) {
					$endQuote = strpos( $format, '"', $p + 1 );
					if ( $endQuote === false ) {
						// No terminating quote, assume literal "
						$s .= '"';
					} else {
						$s .= preg_quote( substr( $format, $p + 1, $endQuote - $p - 1 ), '/' );
						// Continue after the closing quote
						$p = $endQuote;
					}
				} else {
					// Quote at end of string, assume literal "
					$s .= '"';
				}
				break;
			default:
				// Copy whole characters together, instead of single bytes
				$char = mb_substr( mb_strcut( $format, $p, 4 ), 0, 1 );
				$s .= preg_quote( $char, '/' );
				// Skip the remaining bytes of a multi-byte character
				$p += strlen( $char ) - 1;
		}
		if ( $num !== false ) {
			if ( $raw ) {
				$s .= '([0-9]{' . $num . '})';
				$raw = false;
			} else {
				$s .= '(' . $digitsRegexp . '{' . $num . '})';
			}
		}
		// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T308448)
		$s .= '[\\x{200E}\\x{200F}]?';
	}

	$tzRegexp = self::regexpAlternateGroup( array_keys( $tzAbbrs ) );

	// Hard-coded parentheses and space like in Parser::pstPass2
	// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T245784)
	// These are written as PCRE \x{...} escapes, which need the 'u' modifier for code points above 0xFF
	return '/' . $s . ' [\\x{200E}\\x{200F}]?\\(' . $tzRegexp . '\\)/u';
}
310 | |
311 | /** |
312 | * Get a function that parses timestamps generated using the given date format, based on the result |
313 | * of matching the regexp returned by getTimestampRegexp() |
314 | * |
315 | * @param string $contLangVariant Content language variant |
316 | * @param string $format Date format, as used by MediaWiki |
317 | * @param string[]|null $digits Localised digits from 0 to 9, e.g. `[ '0', '1', ..., '9' ]` |
318 | * @param string $localTimezone Local timezone IANA name, e.g. `America/New_York` |
319 | * @param array $tzAbbrs Map of localised timezone abbreviations to IANA abbreviations |
320 | * for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ] |
321 | * @return callable Parser function |
322 | */ |
323 | private function getTimestampParser( |
324 | string $contLangVariant, string $format, ?array $digits, string $localTimezone, array $tzAbbrs |
325 | ): callable { |
326 | $untransformDigits = static function ( string $text ) use ( $digits ) { |
327 | if ( !$digits ) { |
328 | return $text; |
329 | } |
330 | return preg_replace_callback( |
331 | '/[' . implode( '', $digits ) . ']/u', |
332 | static function ( array $m ) use ( $digits ) { |
333 | return (string)array_search( $m[0], $digits, true ); |
334 | }, |
335 | $text |
336 | ); |
337 | }; |
338 | |
339 | $formatLength = strlen( $format ); |
340 | $matchingGroups = []; |
341 | for ( $p = 0; $p < $formatLength; $p++ ) { |
342 | $code = $format[$p]; |
343 | if ( $code === 'x' && $p < $formatLength - 1 ) { |
344 | $code .= $format[++$p]; |
345 | } |
346 | if ( $code === 'xk' && $p < $formatLength - 1 ) { |
347 | $code .= $format[++$p]; |
348 | } |
349 | |
350 | switch ( $code ) { |
351 | case 'xx': |
352 | case 'xn': |
353 | break; |
354 | case 'xg': |
355 | case 'd': |
356 | case 'j': |
357 | case 'D': |
358 | case 'l': |
359 | case 'F': |
360 | case 'M': |
361 | case 'm': |
362 | case 'n': |
363 | case 'Y': |
364 | case 'xkY': |
365 | case 'G': |
366 | case 'H': |
367 | case 'i': |
368 | case 's': |
369 | $matchingGroups[] = $code; |
370 | break; |
371 | case '\\': |
372 | // Backslash escaping |
373 | if ( $p < $formatLength - 1 ) { |
374 | $p++; |
375 | } |
376 | break; |
377 | case '"': |
378 | // Quoted literal |
379 | if ( $p < $formatLength - 1 ) { |
380 | $endQuote = strpos( $format, '"', $p + 1 ); |
381 | if ( $endQuote !== false ) { |
382 | $p = $endQuote; |
383 | } |
384 | } |
385 | break; |
386 | default: |
387 | break; |
388 | } |
389 | } |
390 | |
391 | return function ( array $match ) use ( |
392 | $matchingGroups, $untransformDigits, $localTimezone, $tzAbbrs, $contLangVariant |
393 | ) { |