Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
96.81% covered (success)
96.81%
303 / 313
57.14% covered (warning)
57.14%
4 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
LazyVariableComputer
96.81% covered (success)
96.81%
303 / 313
57.14% covered (warning)
57.14%
4 / 7
79
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 compute
97.61% covered (success)
97.61%
245 / 251
0.00% covered (danger)
0.00%
0 / 1
64
 getLinksFromDB
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
 getLastPageAuthors
96.97% covered (success)
96.97%
32 / 33
0.00% covered (danger)
0.00%
0 / 1
4
 getRevisionFromParameters
70.00% covered (warning)
70.00%
7 / 10
0.00% covered (danger)
0.00%
0 / 1
5.68
 getContentModelFromRevision
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 parseNonEditWikitext
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
1<?php
2
3namespace MediaWiki\Extension\AbuseFilter\Variables;
4
5use InvalidArgumentException;
6use MediaWiki\Content\ContentHandler;
7use MediaWiki\Content\TextContent;
8use MediaWiki\Extension\AbuseFilter\Hooks\AbuseFilterHookRunner;
9use MediaWiki\Extension\AbuseFilter\Parser\AFPData;
10use MediaWiki\Extension\AbuseFilter\TextExtractor;
11use MediaWiki\ExternalLinks\ExternalLinksLookup;
12use MediaWiki\ExternalLinks\LinkFilter;
13use MediaWiki\Language\Language;
14use MediaWiki\Page\PageIdentity;
15use MediaWiki\Page\WikiPage;
16use MediaWiki\Parser\ParserFactory;
17use MediaWiki\Parser\ParserOptions;
18use MediaWiki\Permissions\PermissionManager;
19use MediaWiki\Permissions\RestrictionStore;
20use MediaWiki\RecentChanges\RecentChange;
21use MediaWiki\Revision\RevisionLookup;
22use MediaWiki\Revision\RevisionRecord;
23use MediaWiki\Revision\RevisionStore;
24use MediaWiki\Revision\SlotRecord;
25use MediaWiki\Storage\PreparedUpdate;
26use MediaWiki\Title\Title;
27use MediaWiki\User\ExternalUserNames;
28use MediaWiki\User\User;
29use MediaWiki\User\UserEditTracker;
30use MediaWiki\User\UserGroupManager;
31use MediaWiki\User\UserIdentity;
32use MediaWiki\User\UserIdentityUtils;
33use MediaWiki\User\UserNameUtils;
34use Psr\Log\LoggerInterface;
35use stdClass;
36use UnexpectedValueException;
37use Wikimedia\Diff\Diff;
38use Wikimedia\Diff\UnifiedDiffFormatter;
39use Wikimedia\IPUtils;
40use Wikimedia\ObjectCache\WANObjectCache;
41use Wikimedia\Rdbms\Database;
42use Wikimedia\Rdbms\LBFactory;
43use Wikimedia\Rdbms\SelectQueryBuilder;
44use Wikimedia\StringUtils\StringUtils;
45
46/**
47 * Service used to compute lazy-loaded variables.
48 * @internal
49 */
50class LazyVariableComputer {
51    public const SERVICE_NAME = 'AbuseFilterLazyVariableComputer';
52
53    /**
54     * @var float The amount of time to subtract from profiling
55     * @todo This is a hack
56     */
57    public static $profilingExtraTime = 0;
58
59    public function __construct(
60        private readonly TextExtractor $textExtractor,
61        private readonly AbuseFilterHookRunner $hookRunner,
62        private readonly LoggerInterface $logger,
63        private readonly LBFactory $lbFactory,
64        private readonly WANObjectCache $wanCache,
65        private readonly RevisionLookup $revisionLookup,
66        private readonly RevisionStore $revisionStore,
67        private readonly Language $contentLanguage,
68        private readonly ParserFactory $parserFactory,
69        private readonly UserEditTracker $userEditTracker,
70        private readonly UserGroupManager $userGroupManager,
71        private readonly PermissionManager $permissionManager,
72        private readonly RestrictionStore $restrictionStore,
73        private readonly UserIdentityUtils $userIdentityUtils,
74        private readonly UserNameUtils $userNameUtils,
75        private readonly string $wikiID
76    ) {
77    }
78
79    /**
80     * XXX: $getVarCB is a hack to hide the cyclic dependency with VariablesManager. See T261069 for possible
81     * solutions. This might also be merged into VariablesManager, but it would bring a ton of dependencies.
82     * @todo Should we remove $vars parameter (check hooks)?
83     *
84     * @param LazyLoadedVariable $var
85     * @param VariableHolder $vars
86     * @param callable(string $name):AFPData $getVarCB
87     * @return AFPData
88     */
89    public function compute( LazyLoadedVariable $var, VariableHolder $vars, callable $getVarCB ) {
90        $parameters = $var->getParameters();
91        $varMethod = $var->getMethod();
92        $result = null;
93
94        if ( !$this->hookRunner->onAbuseFilter_interceptVariable(
95            $varMethod,
96            $vars,
97            $parameters,
98            $result
99        ) ) {
100            return $result instanceof AFPData
101                ? $result : AFPData::newFromPHPVar( $result );
102        }
103
104        switch ( $varMethod ) {
105            case 'diff':
106                $text1Var = $parameters['oldtext-var'];
107                $text2Var = $parameters['newtext-var'];
108                $text1 = $getVarCB( $text1Var )->toString();
109                $text2 = $getVarCB( $text2Var )->toString();
110                // T74329: if there's no text, don't return an array with the empty string
111                $text1 = $text1 === '' ? [] : explode( "\n", $text1 );
112                $text2 = $text2 === '' ? [] : explode( "\n", $text2 );
113                $diffs = new Diff( $text1, $text2 );
114                $format = new UnifiedDiffFormatter();
115                $result = $format->format( $diffs );
116                break;
117            case 'diff-split':
118                $diff = $getVarCB( $parameters['diff-var'] )->toString();
119                $line_prefix = $parameters['line-prefix'];
120                $diff_lines = explode( "\n", $diff );
121                $result = [];
122                foreach ( $diff_lines as $line ) {
123                    if ( ( $line[0] ?? '' ) === $line_prefix ) {
124                        $result[] = substr( $line, 1 );
125                    }
126                }
127                break;
128            case 'array-diff':
129                $baseVar = $parameters['base-var'];
130                $minusVar = $parameters['minus-var'];
131
132                $baseArray = $getVarCB( $baseVar )->toNative();
133                $minusArray = $getVarCB( $minusVar )->toNative();
134
135                $result = array_diff( $baseArray, $minusArray );
136                break;
137            case 'links-from-wikitext':
138                // This should ONLY be used when sharing a parse operation with the edit.
139
140                /** @var WikiPage $article */
141                $article = $parameters['article'];
142                if ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
143                    // Shared with the edit, don't count it in profiling
144                    $startTime = microtime( true );
145                    $textVar = $parameters['text-var'];
146
147                    $new_text = $getVarCB( $textVar )->toString();
148                    $content = ContentHandler::makeContent( $new_text, $article->getTitle() );
149                    $editInfo = $article->prepareContentForEdit(
150                        $content,
151                        null,
152                        $parameters['contextUserIdentity']
153                    );
154                    $result = LinkFilter::getIndexedUrlsNonReversed(
155                        array_keys( $editInfo->output->getExternalLinks() )
156                    );
157                    self::$profilingExtraTime += ( microtime( true ) - $startTime );
158                    break;
159                }
160            // Otherwise fall back to database
161            case 'links-from-wikitext-or-database':
162                // TODO: use Content object instead, if available!
163                /** @var WikiPage $article */
164                $article ??= $parameters['article'];
165
166                // this inference is ugly, but the name isn't accessible from here
167                // and we only want this for debugging
168                $textVar = $parameters['text-var'];
169                $varName = str_starts_with( $textVar, 'old_' ) ? 'old_links' : 'new_links';
170                if ( $parameters['forFilter'] ?? false ) {
171                    $this->logger->debug( "Loading $varName from DB" );
172                    $links = $this->getLinksFromDB( $article );
173                } elseif ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
174                    $this->logger->debug( "Loading $varName from Parser" );
175
176                    $wikitext = $getVarCB( $textVar )->toString();
177                    $editInfo = $this->parseNonEditWikitext(
178                        $wikitext,
179                        $article,
180                        $parameters['contextUserIdentity']
181                    );
182                    $links = LinkFilter::getIndexedUrlsNonReversed(
183                        array_keys( $editInfo->output->getExternalLinks() )
184                    );
185                } else {
186                    // TODO: Get links from Content object. But we don't have the content object.
187                    // And for non-text content, $wikitext is usually not going to be a valid
188                    // serialization, but rather some dummy text for filtering.
189                    $links = [];
190                }
191
192                $result = $links;
193                break;
194            case 'links-from-update':
195                /** @var PreparedUpdate $update */
196                $update = $parameters['update'];
197                // Shared with the edit, don't count it in profiling
198                $startTime = microtime( true );
199                $result = LinkFilter::getIndexedUrlsNonReversed(
200                    array_keys( $update->getParserOutputForMetaData()->getExternalLinks() )
201                );
202                self::$profilingExtraTime += ( microtime( true ) - $startTime );
203                break;
204            case 'links-from-database':
205                /** @var PageIdentity $article */
206                $article = $parameters['article'];
207                $this->logger->debug( 'Loading old_links from DB' );
208                $result = $this->getLinksFromDB( $article );
209                break;
210            case 'parse-wikitext':
211                // Should ONLY be used when sharing a parse operation with the edit.
212                // TODO: use Content object instead, if available!
213                /* @var WikiPage $article */
214                $article = $parameters['article'];
215                if ( $article->getContentModel() === CONTENT_MODEL_WIKITEXT ) {
216                    // Shared with the edit, don't count it in profiling
217                    $startTime = microtime( true );
218                    $textVar = $parameters['wikitext-var'];
219
220                    $new_text = $getVarCB( $textVar )->toString();
221                    $content = ContentHandler::makeContent( $new_text, $article->getTitle() );
222                    $editInfo = $article->prepareContentForEdit(
223                        $content,
224                        null,
225                        $parameters['contextUserIdentity']
226                    );
227                    if ( isset( $parameters['pst'] ) && $parameters['pst'] ) {
228                        $result = $editInfo->pstContent->serialize( $editInfo->format );
229                    } else {
230                        // Note: as of core change r727361, the PP limit comments (which we don't want to be here)
231                        // are already excluded.
232                        $popts = $editInfo->popts;
233                        $result = $editInfo->getOutput()->runOutputPipeline( $popts, [] )->getContentHolderText();
234                    }
235                    self::$profilingExtraTime += ( microtime( true ) - $startTime );
236                } else {
237                    $result = '';
238                }
239                break;
240            case 'pst-from-update':
241                /** @var PreparedUpdate $update */
242                $update = $parameters['update'];
243                $result = $this->textExtractor->revisionToString(
244                    $update->getRevision(),
245                    $parameters['contextUser']
246                );
247                break;
248            case 'html-from-update':
249                /** @var PreparedUpdate $update */
250                $update = $parameters['update'];
251                // Shared with the edit, don't count it in profiling
252                $startTime = microtime( true );
253                $popts = $update->getRenderedRevision()->getOptions();
254                $result = $update->getCanonicalParserOutput()->runOutputPipeline( $popts, [] )->getContentHolderText();
255                self::$profilingExtraTime += ( microtime( true ) - $startTime );
256                break;
257            case 'strip-html':
258                $htmlVar = $parameters['html-var'];
259                $html = $getVarCB( $htmlVar )->toString();
260                $stripped = StringUtils::delimiterReplace( '<', '>', '', $html );
261                // We strip extra spaces to the right because the stripping above
262                // could leave a lot of whitespace.
263                // @fixme Find a better way to do this.
264                $result = TextContent::normalizeLineEndings( $stripped );
265                break;
266            case 'load-recent-authors':
267                $result = $this->getLastPageAuthors( $parameters['title'] );
268                break;
269            case 'load-first-author':
270                $revision = $this->revisionLookup->getFirstRevision( $parameters['title'] );
271                if ( $revision ) {
272                    // TODO T233241
273                    $user = $revision->getUser();
274                    $result = $user === null ? '' : $user->getName();
275                } else {
276                    $result = '';
277                }
278                break;
279            case 'get-page-restrictions':
280                $action = $parameters['action'];
281                /** @var Title $title */
282                $title = $parameters['title'];
283                $result = $this->restrictionStore->getRestrictions( $title, $action );
284                break;
285            case 'account-type':
286                /** @var User $createdUser */
287                $createdUser = $parameters['createdUser'];
288                $isTemp = $this->userIdentityUtils->isTemp( $createdUser );
289                if ( $parameters['autocreate'] && $isTemp ) {
290                    $result = 'temp';
291                } elseif ( !$isTemp && $this->userNameUtils->isCreatable( $createdUser->getName() ) ) {
292                    // At this point the account hasn't been written to the DB yet, so:
293                    // - User::getId() is still 0
294                    // - User::isRegistered() will always be false
295                    // - and User::isNamed() can't be trusted here
296                    //
297                    // That means the only thing we can really rely on during pre-auth/pre-creation
298                    // is the username itself. If it's not a temporary account and the username is
299                    // creatable, then this is effectively a named account creation attempt.
300                    $result = 'named';
301                } else {
302                    $result = 'unknown';
303                }
304                break;
305            case 'user-unnamed-ip':
306                /** @var User $user */
307                $user = $parameters['user'];
308                $result = null;
309
310                // Reveal IPs for:
311                // - temporary accounts: temporary account names will replace the IP in the `user_name`
312                //   variable. This variable restores this access.
313                // - logged-out users: This supports the transition to the use of temporary accounts
314                //   so that filter maintainers on pre-transition wikis can migrate `user_name` to `user_unnamed_ip`
315                //   where necessary and see no disruption on transition.
316                //
317                // This variable should only ever be exposed for these use cases and shouldn't be extended
318                // to registered accounts, as that would leak account PII to users without the right to see
319                // that information
320                if (
321                    $this->userIdentityUtils->isTemp( $user ) ||
322                    IPUtils::isIPAddress( $user->getName() )
323                ) {
324                    /** @var RecentChange|null $rc */
325                    $rc = $parameters['rc'];
326                    if ( $rc !== null ) {
327                        $result = $rc->getAttribute( 'rc_ip' );
328                    } else {
329                        $result = $user->getRequest()->getIP();
330                    }
331                }
332                break;
333            case 'user-type':
334                /** @var UserIdentity $userIdentity */
335                $userIdentity = $parameters['user-identity'];
336                if ( $this->userIdentityUtils->isNamed( $userIdentity ) ) {
337                    $result = 'named';
338                } elseif ( $this->userIdentityUtils->isTemp( $userIdentity ) ) {
339                    $result = 'temp';
340                } elseif ( IPUtils::isIPAddress( $userIdentity->getName() ) ) {
341                    $result = 'ip';
342                } elseif ( ExternalUserNames::isExternal( $userIdentity->getName() ) ) {
343                    $result = 'external';
344                } else {
345                    $result = 'unknown';
346                }
347                break;
348            case 'user-editcount':
349                /** @var UserIdentity $userIdentity */
350                $userIdentity = $parameters['user-identity'];
351                $result = $this->userEditTracker->getUserEditCount( $userIdentity );
352                break;
353            case 'user-emailconfirm':
354                /** @var User $user */
355                $user = $parameters['user'];
356                $result = $user->getEmailAuthenticationTimestamp();
357                break;
358            case 'user-groups':
359                /** @var UserIdentity $userIdentity */
360                $userIdentity = $parameters['user-identity'];
361                $result = $this->userGroupManager->getUserEffectiveGroups( $userIdentity );
362                break;
363            case 'user-rights':
364                /** @var UserIdentity $userIdentity */
365                $userIdentity = $parameters['user-identity'];
366                $result = $this->permissionManager->getUserPermissions( $userIdentity );
367                break;
368            case 'user-block':
369                // @todo Support partial blocks?
370                /** @var User $user */
371                $user = $parameters['user'];
372                $result = (bool)$user->getBlock();
373                break;
374            case 'user-age':
375                /** @var User $user */
376                $user = $parameters['user'];
377                $asOf = $parameters['asof'];
378
379                if ( !$user->isRegistered() ) {
380                    $result = 0;
381                } else {
382                    // HACK: If there's no registration date, assume 2008-01-15, Wikipedia Day
383                    // in the year before the new user log was created. See T243469.
384                    $registration = $user->getRegistration() ?? "20080115000000";
385                    $result = (int)wfTimestamp( TS_UNIX, $asOf ) - (int)wfTimestamp( TS_UNIX, $registration );
386                }
387                break;
388            case 'page-age':
389                /** @var Title $title */
390                $title = $parameters['title'];
391
392                $firstRev = $this->revisionLookup->getFirstRevision( $title );
393                $firstRevisionTime = $firstRev?->getTimestamp();
394                if ( !$firstRevisionTime ) {
395                    $result = 0;
396                    break;
397                }
398
399                $asOf = $parameters['asof'];
400                $result = (int)wfTimestamp( TS_UNIX, $asOf ) - (int)wfTimestamp( TS_UNIX, $firstRevisionTime );
401                break;
402            case 'revision-age':
403                $revRec = $this->getRevisionFromParameters( $parameters );
404                if ( !$revRec ) {
405                    $result = null;
406                    break;
407                }
408                $asOf = $parameters['asof'];
409                $result = (int)wfTimestamp( TS_UNIX, $asOf ) - (int)wfTimestamp( TS_UNIX, $revRec->getTimestamp() );
410                break;
411            case 'length':
412                $s = $getVarCB( $parameters['length-var'] )->toString();
413                $result = strlen( $s );
414                break;
415            case 'subtract-int':
416                $v1 = $getVarCB( $parameters['val1-var'] )->toInt();
417                $v2 = $getVarCB( $parameters['val2-var'] )->toInt();
418                $result = $v1 - $v2;
419                break;
420            case 'content-model':
421                $revRec = $this->getRevisionFromParameters( $parameters );
422                $result = $this->getContentModelFromRevision( $revRec );
423                break;
424            case 'revision-text':
425                $revRec = $this->getRevisionFromParameters( $parameters );
426                $result = $this->textExtractor->revisionToString( $revRec, $parameters['contextUser'] );
427                break;
428            case 'get-wiki-name':
429                $result = $this->wikiID;
430                break;
431            case 'get-wiki-language':
432                $result = $this->contentLanguage->getCode();
433                break;
434            default:
435                if ( $this->hookRunner->onAbuseFilter_computeVariable(
436                    $varMethod,
437                    $vars,
438                    $parameters,
439                    $result
440                ) ) {
441                    throw new UnexpectedValueException( 'Unknown variable compute type ' . $varMethod );
442                }
443        }
444
445        return $result instanceof AFPData ? $result : AFPData::newFromPHPVar( $result );
446    }
447
448    /**
449     * @param PageIdentity $page
450     * @return string[]
451     */
452    private function getLinksFromDB( PageIdentity $page ): array {
453        $id = $page->getId();
454        if ( !$id ) {
455            return [];
456        }
457
458        return ExternalLinksLookup::getExtLinksForPage(
459            $id,
460            $this->lbFactory,
461            __METHOD__
462        );
463    }
464
465    /**
466     * @todo Move to MW core (T272050)
467     * @param Title $title
468     * @return string[] Usernames of the last 10 (unique) authors from $title
469     */
470    private function getLastPageAuthors( Title $title ) {
471        if ( !$title->exists() ) {
472            return [];
473        }
474
475        $fname = __METHOD__;
476
477        return $this->wanCache->getWithSetCallback(
478            $this->wanCache->makeKey( 'last-10-authors', 'revision', $title->getLatestRevID() ),
479            WANObjectCache::TTL_MINUTE,
480            function ( $oldValue, &$ttl, array &$setOpts ) use ( $title, $fname ) {
481                $dbr = $this->lbFactory->getReplicaDatabase();
482
483                $setOpts += Database::getCacheSetOptions( $dbr );
484                // Get the last 100 edit authors with a trivial query (avoid T116557)
485                $revQuery = $this->revisionStore->getQueryInfo();
486                $revAuthors = $dbr->newSelectQueryBuilder()
487                    ->tables( $revQuery['tables'] )
488                    ->field( $revQuery['fields']['rev_user_text'] )
489                    ->where( [
490                        'rev_page' => $title->getArticleID(),
491                        // TODO Should deleted names be counted in the 10 authors? If yes, this check should
492                        // be moved inside the foreach
493                        'rev_deleted' => 0
494                    ] )
495                    ->caller( $fname )
496                    // Some pages have < 10 authors but many revisions (e.g. bot pages)
497                    ->orderBy( [ 'rev_timestamp', 'rev_id' ], SelectQueryBuilder::SORT_DESC )
498                    ->limit( 100 )
499                    // Force index per T116557
500                    ->useIndex( [ 'revision' => 'rev_page_timestamp' ] )
501                    ->joinConds( $revQuery['joins'] )
502                    ->fetchFieldValues();
503                // Get the last 10 distinct authors within this set of edits
504                $users = [];
505                foreach ( $revAuthors as $author ) {
506                    $users[$author] = 1;
507                    if ( count( $users ) >= 10 ) {
508                        break;
509                    }
510                }
511
512                return array_keys( $users );
513            }
514        );
515    }
516
517    /**
518     * @param array{revid?:int,title?:Title,parent?:true} $params
519     * @return ?RevisionRecord
520     */
521    private function getRevisionFromParameters( array $params ): ?RevisionRecord {
522        if ( isset( $params['revid'] ) ) {
523            $revision = $this->revisionLookup->getRevisionById( $params['revid'] );
524        } elseif ( isset( $params['title'] ) ) {
525            $revision = $this->revisionLookup->getRevisionByTitle( $params['title'] );
526        } else {
527            throw new InvalidArgumentException(
528                "Either 'revid' or 'title' are mandatory revision specifiers"
529            );
530        }
531        if ( ( $params['parent'] ?? false ) && $revision !== null ) {
532            $revision = $this->revisionLookup->getPreviousRevision( $revision );
533        }
534        return $revision;
535    }
536
537    private function getContentModelFromRevision( ?RevisionRecord $revision ): string {
538        // this is consistent with what is done on various places in RunVariableGenerator
539        // and RCVariableGenerator
540        return $revision?->getContent( SlotRecord::MAIN, RevisionRecord::RAW )
541            ->getModel() ?? '';
542    }
543
544    /**
545     * It's like WikiPage::prepareContentForEdit, but not for editing (old wikitext usually)
546     *
547     * @param string $wikitext
548     * @param WikiPage $article
549     * @param UserIdentity $userIdentity Context user
550     *
551     * @return stdClass
552     */
553    private function parseNonEditWikitext( $wikitext, WikiPage $article, UserIdentity $userIdentity ) {
554        static $cache = [];
555
556        $cacheKey = md5( $wikitext ) . ':' . $article->getTitle()->getPrefixedText();
557
558        if ( !isset( $cache[$cacheKey] ) ) {
559            $options = ParserOptions::newFromUser( $userIdentity );
560            $cache[$cacheKey] = (object)[
561                'output' => $this->parserFactory->getInstance()->parse( $wikitext, $article->getTitle(), $options )
562            ];
563        }
564
565        return $cache[$cacheKey];
566    }
567}