Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
96.81% covered (success)
96.81%
303 / 313
57.14% covered (warning)
57.14%
4 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
LazyVariableComputer
96.81% covered (success)
96.81%
303 / 313
57.14% covered (warning)
57.14%
4 / 7
79
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 compute
97.61% covered (success)
97.61%
245 / 251
0.00% covered (danger)
0.00%
0 / 1
64
 getLinksFromDB
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
 getLastPageAuthors
96.97% covered (success)
96.97%
32 / 33
0.00% covered (danger)
0.00%
0 / 1
4
 getRevisionFromParameters
70.00% covered (warning)
70.00%
7 / 10
0.00% covered (danger)
0.00%
0 / 1
5.68
 getContentModelFromRevision
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 parseNonEditWikitext
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
1<?php
2
3namespace MediaWiki\Extension\AbuseFilter\Variables;
4
5use InvalidArgumentException;
6use MediaWiki\Content\ContentHandler;
7use MediaWiki\Content\TextContent;
8use MediaWiki\Extension\AbuseFilter\Hooks\AbuseFilterHookRunner;
9use MediaWiki\Extension\AbuseFilter\Parser\AFPData;
10use MediaWiki\Extension\AbuseFilter\TextExtractor;
11use MediaWiki\ExternalLinks\ExternalLinksLookup;
12use MediaWiki\ExternalLinks\LinkFilter;
13use MediaWiki\Language\Language;
14use MediaWiki\Page\PageIdentity;
15use MediaWiki\Page\WikiPage;
16use MediaWiki\Parser\ParserFactory;
17use MediaWiki\Parser\ParserOptions;
18use MediaWiki\Permissions\PermissionManager;
19use MediaWiki\Permissions\RestrictionStore;
20use MediaWiki\RecentChanges\RecentChange;
21use MediaWiki\Revision\RevisionLookup;
22use MediaWiki\Revision\RevisionRecord;
23use MediaWiki\Revision\RevisionStore;
24use MediaWiki\Revision\SlotRecord;
25use MediaWiki\Storage\PreparedUpdate;
26use MediaWiki\Title\Title;
27use MediaWiki\User\ExternalUserNames;
28use MediaWiki\User\User;
29use MediaWiki\User\UserEditTracker;
30use MediaWiki\User\UserGroupManager;
31use MediaWiki\User\UserIdentity;
32use MediaWiki\User\UserIdentityUtils;
33use MediaWiki\User\UserNameUtils;
34use Psr\Log\LoggerInterface;
35use stdClass;
36use StringUtils;
37use UnexpectedValueException;
38use Wikimedia\Diff\Diff;
39use Wikimedia\Diff\UnifiedDiffFormatter;
40use Wikimedia\IPUtils;
41use Wikimedia\ObjectCache\WANObjectCache;
42use Wikimedia\Rdbms\Database;
43use Wikimedia\Rdbms\LBFactory;
44use Wikimedia\Rdbms\SelectQueryBuilder;
45
46/**
47 * Service used to compute lazy-loaded variable.
48 * @internal
49 */
50class LazyVariableComputer {
51    public const SERVICE_NAME = 'AbuseFilterLazyVariableComputer';
52
53    /**
54     * @var float The amount of time to subtract from profiling
55     * @todo This is a hack
56     */
57    public static $profilingExtraTime = 0;
58
/**
 * Receives the services this class delegates to. Every argument is stored as a
 * private readonly promoted property; the constructor does nothing else.
 *
 * @param TextExtractor $textExtractor Turns revisions into text (revisionToString)
 * @param AbuseFilterHookRunner $hookRunner Runs the intercept/compute variable hooks
 * @param LoggerInterface $logger Receives debug messages about where links are loaded from
 * @param LBFactory $lbFactory Source of the replica DB connection and of external link lookups
 * @param WANObjectCache $wanCache Caches the list of recent page authors
 * @param RevisionLookup $revisionLookup Looks up first, previous and specific revisions
 * @param RevisionStore $revisionStore Provides the revision query info for the authors query
 * @param Language $contentLanguage Its code is the value of the 'get-wiki-language' method
 * @param ParserFactory $parserFactory Provides the parser for wikitext that is not being saved
 * @param UserEditTracker $userEditTracker Provides user edit counts
 * @param UserGroupManager $userGroupManager Provides effective user groups
 * @param PermissionManager $permissionManager Provides user permissions
 * @param RestrictionStore $restrictionStore Provides page restrictions
 * @param UserIdentityUtils $userIdentityUtils Tells temporary and named users apart
 * @param UserNameUtils $userNameUtils Checks whether a username is creatable
 * @param string $wikiID Returned as-is for the 'get-wiki-name' method
 */
59    public function __construct(
60        private readonly TextExtractor $textExtractor,
61        private readonly AbuseFilterHookRunner $hookRunner,
62        private readonly LoggerInterface $logger,
63        private readonly LBFactory $lbFactory,
64        private readonly WANObjectCache $wanCache,
65        private readonly RevisionLookup $revisionLookup,
66        private readonly RevisionStore $revisionStore,
67        private readonly Language $contentLanguage,
68        private readonly ParserFactory $parserFactory,
69        private readonly UserEditTracker $userEditTracker,
70        private readonly UserGroupManager $userGroupManager,
71        private readonly PermissionManager $permissionManager,
72        private readonly RestrictionStore $restrictionStore,
73        private readonly UserIdentityUtils $userIdentityUtils,
74        private readonly UserNameUtils $userNameUtils,
75        private readonly string $wikiID
76    ) {
77    }
78
79    /**
       * Compute the value of a lazy-loaded variable.
       *
       * onAbuseFilter_interceptVariable() runs first: when it returns false, whatever it left in $result
       * is returned and nothing else is computed. Otherwise the variable's method name selects one of the
       * built-in computations in the switch below; unknown method names are handed to
       * onAbuseFilter_computeVariable().
       *
       * XXX: $getVarCB is a hack to hide the cyclic dependency with VariablesManager. See T261069 for possible
       * solutions. This might also be merged into VariablesManager, but it would bring a ton of dependencies.
       * @todo Should we remove $vars parameter (check hooks)?
       *
       * @param LazyLoadedVariable $var Supplies the compute method name and its parameters
       * @param VariableHolder $vars Only passed through to the two hooks
       * @param callable $getVarCB Called with a variable name to obtain that variable's value
       * @phan-param callable(string $name):AFPData $getVarCB
       * @return AFPData The computed value; results that are not AFPData already are converted with
       *   AFPData::newFromPHPVar()
       * @throws UnexpectedValueException If the method is unknown and onAbuseFilter_computeVariable()
       *   returns true
       */
90    public function compute( LazyLoadedVariable $var, VariableHolder $vars, callable $getVarCB ) {
91        $parameters = $var->getParameters();
92        $varMethod = $var->getMethod();
93        $result = null;
94
          // A false return value from the hook short-circuits everything below.
95        if ( !$this->hookRunner->onAbuseFilter_interceptVariable(
96            $varMethod,
97            $vars,
98            $parameters,
99            $result
100        ) ) {
101            return $result instanceof AFPData
102                ? $result : AFPData::newFromPHPVar( $result );
103        }
104
105        switch ( $varMethod ) {
106            case 'diff':
107                $text1Var = $parameters['oldtext-var'];
108                $text2Var = $parameters['newtext-var'];
109                $text1 = $getVarCB( $text1Var )->toString();
110                $text2 = $getVarCB( $text2Var )->toString();
111                // T74329: if there's no text, don't return an array with the empty string
112                $text1 = $text1 === '' ? [] : explode( "\n", $text1 );
113                $text2 = $text2 === '' ? [] : explode( "\n", $text2 );
114                $diffs = new Diff( $text1, $text2 );
115                $format = new UnifiedDiffFormatter();
116                $result = $format->format( $diffs );
117                break;
118            case 'diff-split':
119                $diff = $getVarCB( $parameters['diff-var'] )->toString();
120                $line_prefix = $parameters['line-prefix'];
121                $diff_lines = explode( "\n", $diff );
122                $result = [];
                   // Keep the lines whose first character is the prefix, without that character.
123                foreach ( $diff_lines as $line ) {
124                    if ( ( $line[0] ?? '' ) === $line_prefix ) {
125                        $result[] = substr( $line, 1 );
126                    }
127                }
128                break;
129            case 'array-diff':
130                $baseVar = $parameters['base-var'];
131                $minusVar = $parameters['minus-var'];
132
133                $baseArray = $getVarCB( $baseVar )->toNative();
134                $minusArray = $getVarCB( $minusVar )->toNative();
135
136                $result = array_diff( $baseArray, $minusArray );
137                break;
138            case 'links-from-wikitext':
139                // This should ONLY be used when sharing a parse operation with the edit.
140
141                /** @var WikiPage $article */
142                $article = $parameters['article'];
143                if ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
144                    // Shared with the edit, don't count it in profiling
145                    $startTime = microtime( true );
146                    $textVar = $parameters['text-var'];
147
148                    $new_text = $getVarCB( $textVar )->toString();
149                    $content = ContentHandler::makeContent( $new_text, $article->getTitle() );
150                    $editInfo = $article->prepareContentForEdit(
151                        $content,
152                        null,
153                        $parameters['contextUserIdentity']
154                    );
155                    $result = LinkFilter::getIndexedUrlsNonReversed(
156                        array_keys( $editInfo->output->getExternalLinks() )
157                    );
158                    self::$profilingExtraTime += ( microtime( true ) - $startTime );
159                    break;
160                }
161            // No break for other content models: deliberately fall through to the next case
162            case 'links-from-wikitext-or-database':
163                // TODO: use Content object instead, if available!
                   // $article is already set when falling through from 'links-from-wikitext'
164                /** @var WikiPage $article */
165                $article ??= $parameters['article'];
166
167                // this inference is ugly, but the name isn't accessible from here
168                // and we only want this for debugging
169                $textVar = $parameters['text-var'];
170                $varName = str_starts_with( $textVar, 'old_' ) ? 'old_links' : 'new_links';
171                if ( $parameters['forFilter'] ?? false ) {
172                    $this->logger->debug( "Loading $varName from DB" );
173                    $links = $this->getLinksFromDB( $article );
174                } elseif ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
175                    $this->logger->debug( "Loading $varName from Parser" );
176
177                    $wikitext = $getVarCB( $textVar )->toString();
178                    $editInfo = $this->parseNonEditWikitext(
179                        $wikitext,
180                        $article,
181                        $parameters['contextUserIdentity']
182                    );
183                    $links = LinkFilter::getIndexedUrlsNonReversed(
184                        array_keys( $editInfo->output->getExternalLinks() )
185                    );
186                } else {
187                    // TODO: Get links from Content object. But we don't have the content object.
188                    // And for non-text content, $wikitext is usually not going to be a valid
189                    // serialization, but rather some dummy text for filtering.
190                    $links = [];
191                }
192
193                $result = $links;
194                break;
195            case 'links-from-update':
196                /** @var PreparedUpdate $update */
197                $update = $parameters['update'];
198                // Shared with the edit, don't count it in profiling
199                $startTime = microtime( true );
200                $result = LinkFilter::getIndexedUrlsNonReversed(
201                    array_keys( $update->getParserOutputForMetaData()->getExternalLinks() )
202                );
203                self::$profilingExtraTime += ( microtime( true ) - $startTime );
204                break;
205            case 'links-from-database':
206                /** @var PageIdentity $article */
207                $article = $parameters['article'];
208                $this->logger->debug( 'Loading old_links from DB' );
209                $result = $this->getLinksFromDB( $article );
210                break;
211            case 'parse-wikitext':
212                // Should ONLY be used when sharing a parse operation with the edit.
213                // TODO: use Content object instead, if available!
214                /** @var WikiPage $article */
215                $article = $parameters['article'];
216                if ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
217                    // Shared with the edit, don't count it in profiling
218                    $startTime = microtime( true );
219                    $textVar = $parameters['wikitext-var'];
220
221                    $new_text = $getVarCB( $textVar )->toString();
222                    $content = ContentHandler::makeContent( $new_text, $article->getTitle() );
223                    $editInfo = $article->prepareContentForEdit(
224                        $content,
225                        null,
226                        $parameters['contextUserIdentity']
227                    );
228                    if ( isset( $parameters['pst'] ) && $parameters['pst'] ) {
229                        $result = $editInfo->pstContent->serialize( $editInfo->format );
230                    } else {
231                        // Note: as of core change r727361, the PP limit comments (which we don't want to be here)
232                        // are already excluded.
233                        $popts = $editInfo->popts;
234                        $result = $editInfo->getOutput()->runOutputPipeline( $popts, [] )->getContentHolderText();
235                    }
236                    self::$profilingExtraTime += ( microtime( true ) - $startTime );
237                } else {
238                    $result = '';
239                }
240                break;
241            case 'pst-from-update':
242                /** @var PreparedUpdate $update */
243                $update = $parameters['update'];
244                $result = $this->textExtractor->revisionToString(
245                    $update->getRevision(),
246                    $parameters['contextUser']
247                );
248                break;
249            case 'html-from-update':
250                /** @var PreparedUpdate $update */
251                $update = $parameters['update'];
252                // Shared with the edit, don't count it in profiling
253                $startTime = microtime( true );
254                $popts = $update->getRenderedRevision()->getOptions();
255                $result = $update->getCanonicalParserOutput()->runOutputPipeline( $popts, [] )->getContentHolderText();
256                self::$profilingExtraTime += ( microtime( true ) - $startTime );
257                break;
258            case 'strip-html':
259                $htmlVar = $parameters['html-var'];
260                $html = $getVarCB( $htmlVar )->toString();
261                $stripped = StringUtils::delimiterReplace( '<', '>', '', $html );
262                // We strip extra spaces to the right because the stripping above
263                // could leave a lot of whitespace.
264                // @fixme Find a better way to do this.
265                $result = TextContent::normalizeLineEndings( $stripped );
266                break;
267            case 'load-recent-authors':
268                $result = $this->getLastPageAuthors( $parameters['title'] );
269                break;
270            case 'load-first-author':
271                $revision = $this->revisionLookup->getFirstRevision( $parameters['title'] );
272                if ( $revision ) {
273                    // TODO T233241
274                    $user = $revision->getUser();
275                    $result = $user === null ? '' : $user->getName();
276                } else {
277                    $result = '';
278                }
279                break;
280            case 'get-page-restrictions':
281                $action = $parameters['action'];
282                /** @var Title $title */
283                $title = $parameters['title'];
284                $result = $this->restrictionStore->getRestrictions( $title, $action );
285                break;
286            case 'account-type':
287                /** @var User $createdUser */
288                $createdUser = $parameters['createdUser'];
289                $isTemp = $this->userIdentityUtils->isTemp( $createdUser );
290                if ( $parameters['autocreate'] && $isTemp ) {
291                    $result = 'temp';
292                } elseif ( !$isTemp && $this->userNameUtils->isCreatable( $createdUser->getName() ) ) {
293                    // At this point the account hasn't been written to the DB yet, so:
294                    // - User::getId() is still 0
295                    // - User::isRegistered() will always be false
296                    // - and User::isNamed() can't be trusted here
297                    //
298                    // That means the only thing we can really rely on during pre-auth/pre-creation
299                    // is the username itself. If it's not a temporary account and the username is
300                    // creatable, then this is effectively a named account creation attempt.
301                    $result = 'named';
302                } else {
303                    $result = 'unknown';
304                }
305                break;
306            case 'user-unnamed-ip':
307                /** @var User $user */
308                $user = $parameters['user'];
309                $result = null;
310
311                // Reveal IPs for:
312                // - temporary accounts: temporary account names will replace the IP in the `user_name`
313                //   variable. This variable restores this access.
314                // - logged-out users: This supports the transition to the use of temporary accounts
315                //   so that filter maintainers on pre-transition wikis can migrate `user_name` to `user_unnamed_ip`
316                //   where necessary and see no disruption on transition.
317                //
318                // This variable should only ever be exposed for these use cases and shouldn't be extended
319                // to registered accounts, as that would leak account PII to users without the right to see
320                // that information
321                if (
322                    $this->userIdentityUtils->isTemp( $user ) ||
323                    IPUtils::isIPAddress( $user->getName() )
324                ) {
325                    /** @var RecentChange|null $rc */
326                    $rc = $parameters['rc'];
327                    if ( $rc !== null ) {
328                        $result = $rc->getAttribute( 'rc_ip' );
329                    } else {
330                        $result = $user->getRequest()->getIP();
331                    }
332                }
333                break;
334            case 'user-type':
335                /** @var UserIdentity $userIdentity */
336                $userIdentity = $parameters['user-identity'];
337                if ( $this->userIdentityUtils->isNamed( $userIdentity ) ) {
338                    $result = 'named';
339                } elseif ( $this->userIdentityUtils->isTemp( $userIdentity ) ) {
340                    $result = 'temp';
341                } elseif ( IPUtils::isIPAddress( $userIdentity->getName() ) ) {
342                    $result = 'ip';
343                } elseif ( ExternalUserNames::isExternal( $userIdentity->getName() ) ) {
344                    $result = 'external';
345                } else {
346                    $result = 'unknown';
347                }
348                break;
349            case 'user-editcount':
350                /** @var UserIdentity $userIdentity */
351                $userIdentity = $parameters['user-identity'];
352                $result = $this->userEditTracker->getUserEditCount( $userIdentity );
353                break;
354            case 'user-emailconfirm':
355                /** @var User $user */
356                $user = $parameters['user'];
357                $result = $user->getEmailAuthenticationTimestamp();
358                break;
359            case 'user-groups':
360                /** @var UserIdentity $userIdentity */
361                $userIdentity = $parameters['user-identity'];
362                $result = $this->userGroupManager->getUserEffectiveGroups( $userIdentity );
363                break;
364            case 'user-rights':
365                /** @var UserIdentity $userIdentity */
366                $userIdentity = $parameters['user-identity'];
367                $result = $this->permissionManager->getUserPermissions( $userIdentity );
368                break;
369            case 'user-block':
370                // @todo Support partial blocks?
371                /** @var User $user */
372                $user = $parameters['user'];
373                $result = (bool)$user->getBlock();
374                break;
375            case 'user-age':
376                /** @var User $user */
377                $user = $parameters['user'];
378                $asOf = $parameters['asof'];
379
380                if ( !$user->isRegistered() ) {
381                    $result = 0;
382                } else {
383                    // HACK: If there's no registration date, assume 2008-01-15, Wikipedia Day
384                    // in the year before the new user log was created. See T243469.
385                    $registration = $user->getRegistration() ?? "20080115000000";
386                    $result = (int)wfTimestamp( TS_UNIX, $asOf ) - (int)wfTimestamp( TS_UNIX, $registration );
387                }
388                break;
389            case 'page-age':
390                /** @var Title $title */
391                $title = $parameters['title'];
392
393                $firstRev = $this->revisionLookup->getFirstRevision( $title );
394                $firstRevisionTime = $firstRev?->getTimestamp();
395                if ( !$firstRevisionTime ) {
396                    $result = 0;
397                    break;
398                }
399
400                $asOf = $parameters['asof'];
401                $result = (int)wfTimestamp( TS_UNIX, $asOf ) - (int)wfTimestamp( TS_UNIX, $firstRevisionTime );
402                break;
403            case 'revision-age':
404                $revRec = $this->getRevisionFromParameters( $parameters );
405                if ( !$revRec ) {
406                    $result = null;
407                    break;
408                }
409                $asOf = $parameters['asof'];
410                $result = (int)wfTimestamp( TS_UNIX, $asOf ) - (int)wfTimestamp( TS_UNIX, $revRec->getTimestamp() );
411                break;
412            case 'length':
413                $s = $getVarCB( $parameters['length-var'] )->toString();
                   // strlen() counts bytes, not characters
414                $result = strlen( $s );
415                break;
416            case 'subtract-int':
417                $v1 = $getVarCB( $parameters['val1-var'] )->toInt();
418                $v2 = $getVarCB( $parameters['val2-var'] )->toInt();
419                $result = $v1 - $v2;
420                break;
421            case 'content-model':
422                $revRec = $this->getRevisionFromParameters( $parameters );
423                $result = $this->getContentModelFromRevision( $revRec );
424                break;
425            case 'revision-text':
426                $revRec = $this->getRevisionFromParameters( $parameters );
427                $result = $this->textExtractor->revisionToString( $revRec, $parameters['contextUser'] );
428                break;
429            case 'get-wiki-name':
430                $result = $this->wikiID;
431                break;
432            case 'get-wiki-language':
433                $result = $this->contentLanguage->getCode();
434                break;
435            default:
                   // Not a built-in method: let the hook compute it. A true return value is treated
                   // as "not handled" and results in an exception.
436                if ( $this->hookRunner->onAbuseFilter_computeVariable(
437                    $varMethod,
438                    $vars,
439                    $parameters,
440                    $result
441                ) ) {
442                    throw new UnexpectedValueException( 'Unknown variable compute type ' . $varMethod );
443                }
444        }
445
           // Wrap plain PHP values so that the return value is always an AFPData
446        return $result instanceof AFPData ? $result : AFPData::newFromPHPVar( $result );
447    }
448
/**
 * Look up the external links stored for a page.
 *
 * @param PageIdentity $page
 * @return array Whatever ExternalLinksLookup::getExtLinksForPage() returns for the page,
 *   or an empty array when the page has no (non-zero) ID
 */
private function getLinksFromDB( PageIdentity $page ): array {
    $pageId = $page->getId();

    // A falsy ID means there is nothing to look up
    return $pageId
        ? ExternalLinksLookup::getExtLinksForPage( $pageId, $this->lbFactory, __METHOD__ )
        : [];
}
465
/**
 * Fetch the names of the most recent distinct authors of a page.
 *
 * Only the 100 newest revisions with rev_deleted = 0 are inspected, and the first 10 distinct
 * author names found among them are returned. The list is stored in the WAN cache with a
 * one-minute TTL under a key that includes the page's latest revision ID.
 *
 * @todo Move to MW core (T272050)
 * @param Title $title
 * @return string[] Usernames of the last 10 (unique) authors from $title, newest first;
 *   empty if the title does not exist
 */
private function getLastPageAuthors( Title $title ) {
    if ( !$title->exists() ) {
        return [];
    }

    // Captured here because __METHOD__ evaluated inside the closure would not name this method
    $fname = __METHOD__;
    $cacheKey = $this->wanCache->makeKey( 'last-10-authors', 'revision', $title->getLatestRevID() );

    return $this->wanCache->getWithSetCallback(
        $cacheKey,
        WANObjectCache::TTL_MINUTE,
        function ( $oldValue, &$ttl, array &$setOpts ) use ( $title, $fname ) {
            $dbr = $this->lbFactory->getReplicaDatabase();

            $setOpts += Database::getCacheSetOptions( $dbr );
            // Get the last 100 edit authors with a trivial query (avoid T116557)
            $revQuery = $this->revisionStore->getQueryInfo();
            $revAuthors = $dbr->newSelectQueryBuilder()
                ->tables( $revQuery['tables'] )
                ->field( $revQuery['fields']['rev_user_text'] )
                ->where( [
                    'rev_page' => $title->getArticleID(),
                    // TODO Should deleted names be counted in the 10 authors? If yes, this check should
                    // be moved inside the foreach
                    'rev_deleted' => 0
                ] )
                ->caller( $fname )
                // Some pages have < 10 authors but many revisions (e.g. bot pages)
                ->orderBy( [ 'rev_timestamp', 'rev_id' ], SelectQueryBuilder::SORT_DESC )
                ->limit( 100 )
                // Force index per T116557
                ->useIndex( [ 'revision' => 'rev_page_timestamp' ] )
                ->joinConds( $revQuery['joins'] )
                ->fetchFieldValues();

            // Get the last 10 distinct authors within this set of edits; array keys
            // give de-duplication while preserving first-seen order.
            $seen = [];
            foreach ( $revAuthors as $author ) {
                $seen[$author] = true;
                if ( count( $seen ) >= 10 ) {
                    break;
                }
            }

            // PHP casts integer-like string keys (e.g. a user named "2024") to int, so
            // array_keys() alone could return ints. Cast back to honour the string[] contract.
            return array_map( strval( ... ), array_keys( $seen ) );
        }
    );
}
517
/**
 * Resolve the revision described by a lazy variable's parameters.
 *
 * 'revid' takes precedence over 'title'. When 'parent' is set to a truthy value and a revision
 * was found, the revision preceding it is returned instead.
 *
 * @param array{revid?:int,title?:Title,parent?:true} $params
 * @return ?RevisionRecord Null when the lookup (or the parent lookup) finds nothing
 * @throws InvalidArgumentException If neither 'revid' nor 'title' is set
 */
private function getRevisionFromParameters( array $params ): ?RevisionRecord {
    $hasRevId = isset( $params['revid'] );
    if ( !$hasRevId && !isset( $params['title'] ) ) {
        throw new InvalidArgumentException(
            "Either 'revid' or 'title' are mandatory revision specifiers"
        );
    }

    $revision = $hasRevId
        ? $this->revisionLookup->getRevisionById( $params['revid'] )
        : $this->revisionLookup->getRevisionByTitle( $params['title'] );

    $wantsParent = $params['parent'] ?? false;
    if ( !$wantsParent || $revision === null ) {
        return $revision;
    }

    return $this->revisionLookup->getPreviousRevision( $revision );
}
537
/**
 * Get the content model of a revision's main slot, read with the RAW audience.
 *
 * @param ?RevisionRecord $revision
 * @return string The model ID, or an empty string when there is no revision
 */
private function getContentModelFromRevision( ?RevisionRecord $revision ): string {
    if ( $revision === null ) {
        return '';
    }

    // Main slot + RAW audience: per the original author, this matches what
    // RunVariableGenerator and RCVariableGenerator do in various places.
    $mainContent = $revision->getContent( SlotRecord::MAIN, RevisionRecord::RAW );

    return $mainContent->getModel() ?? '';
}
544
/**
 * Parse wikitext that is not part of the edit being saved (usually old wikitext).
 * It's like WikiPage::prepareContentForEdit, but not for editing.
 *
 * Results are memoised in a static array keyed by the MD5 of the wikitext plus the page's
 * prefixed title. NOTE(review): the context user is not part of that key, so a cached result
 * is reused regardless of whose parser options produced it.
 *
 * @param string $wikitext
 * @param WikiPage $article
 * @param UserIdentity $userIdentity Context user
 *
 * @return stdClass Object whose only property, 'output', holds the parser's result
 */
private function parseNonEditWikitext( $wikitext, WikiPage $article, UserIdentity $userIdentity ) {
    static $cache = [];

    $title = $article->getTitle();
    $cacheKey = md5( $wikitext ) . ':' . $title->getPrefixedText();

    if ( isset( $cache[$cacheKey] ) ) {
        return $cache[$cacheKey];
    }

    $options = ParserOptions::newFromUser( $userIdentity );
    $parsed = new stdClass();
    $parsed->output = $this->parserFactory->getInstance()->parse( $wikitext, $title, $options );
    $cache[$cacheKey] = $parsed;

    return $parsed;
}
568}