Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
87.64% |
227 / 259 |
|
62.50% |
10 / 16 |
CRAP | |
0.00% |
0 / 1 |
PotentialInviteesFinder | |
87.64% |
227 / 259 |
|
62.50% |
10 / 16 |
57.10 | |
0.00% |
0 / 1 |
__construct | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
1.00 | |||
setDebugLogger | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
generate | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
4.13 | |||
getAllRevisionsForWiki | |
86.42% |
70 / 81 |
|
0.00% |
0 / 1 |
14.49 | |||
getRevisionFilterConditions | |
70.83% |
34 / 48 |
|
0.00% |
0 / 1 |
5.62 | |||
rankUsers | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
3 | |||
getDeltasByUser | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
5 | |||
getUserDataByWiki | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
getUserScore | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
1 | |||
getOverallBytesScore | |
80.00% |
12 / 15 |
|
0.00% |
0 / 1 |
4.13 | |||
getBytesScoreForPage | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getEditCountScore | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
getEditCount | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
3 | |||
getRecentActivityScore | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
getDaysSinceLastEdit | |
93.33% |
14 / 15 |
|
0.00% |
0 / 1 |
4.00 | |||
filterUsersByPreference | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | declare( strict_types=1 ); |
4 | |
5 | namespace MediaWiki\Extension\CampaignEvents\Invitation; |
6 | |
7 | use ChangeTags; |
8 | use MediaWiki\DAO\WikiAwareEntity; |
9 | use MediaWiki\Extension\CampaignEvents\Hooks\Handlers\GetPreferencesHandler; |
10 | use MediaWiki\Page\PageIdentity; |
11 | use MediaWiki\Revision\RevisionStoreFactory; |
12 | use MediaWiki\Storage\NameTableAccessException; |
13 | use MediaWiki\Storage\NameTableStoreFactory; |
14 | use MediaWiki\User\Options\UserOptionsLookup; |
15 | use MediaWiki\User\UserIdentityValue; |
16 | use MediaWiki\Utils\MWTimestamp; |
17 | use MediaWiki\WikiMap\WikiMap; |
18 | use RuntimeException; |
19 | use UnexpectedValueException; |
20 | use Wikimedia\Rdbms\IConnectionProvider; |
21 | use Wikimedia\Rdbms\IReadableDatabase; |
22 | use Wikimedia\Rdbms\SelectQueryBuilder; |
23 | use Wikimedia\Timestamp\ConvertibleTimestamp; |
24 | |
/**
 * This class generates a list of potential event participants ("invitation list") by looking at who contributed
 * to a given list of pages ("worklist").
 *
 * The public entry point is generate(): it collects the relevant revisions of every page in the worklist, drops
 * users who opted out of invitations, scores each remaining user (bytes added, edit count, recent activity) and
 * returns the top-scoring users.
 */
class PotentialInviteesFinder {
	public const SERVICE_NAME = 'CampaignEventsPotentialInviteesFinder';

	/** How many days to look back into the past when scanning revisions. */
	private const CUTOFF_DAYS = 3 * 365;
	/** Maximum number of users returned by generate(). */
	public const RESULT_USER_LIMIT = 200;
	/** Approximate maximum number of revisions scanned for a single page. */
	private const REVISIONS_PER_PAGE_LIMIT = 5_000;
	/** Users whose overall score (0-100) is below this value are left out of the list. */
	private const MIN_SCORE = 5;
	/** How many pages are looked up by a single series of revision queries. */
	private const PAGES_PER_QUERY = 25;
	/** Maximum number of revision rows fetched by a single query. */
	private const REVISION_BATCH_SIZE = 2500;

	private RevisionStoreFactory $revisionStoreFactory;
	private IConnectionProvider $dbProvider;
	private NameTableStoreFactory $nameTableStoreFactory;
	private int $blockTargetMigrationStage;
	/**
	 * @var callable
	 * @phan-var callable(string $msg):void
	 */
	private $debugLogger;
	private UserOptionsLookup $userOptionsLookup;

	/**
	 * @param RevisionStoreFactory $revisionStoreFactory
	 * @param IConnectionProvider $dbProvider
	 * @param NameTableStoreFactory $nameTableStoreFactory
	 * @param int $blockTargetMigrationStage Bitfield tested against SCHEMA_COMPAT_READ_OLD to choose the block schema
	 * @param UserOptionsLookup $userOptionsLookup
	 */
	public function __construct(
		RevisionStoreFactory $revisionStoreFactory,
		IConnectionProvider $dbProvider,
		NameTableStoreFactory $nameTableStoreFactory,
		int $blockTargetMigrationStage,
		UserOptionsLookup $userOptionsLookup
	) {
		$this->revisionStoreFactory = $revisionStoreFactory;
		$this->dbProvider = $dbProvider;
		$this->nameTableStoreFactory = $nameTableStoreFactory;
		$this->blockTargetMigrationStage = $blockTargetMigrationStage;
		$this->userOptionsLookup = $userOptionsLookup;
		// No-op logger by default, so that the rest of the class can always call it unconditionally.
		$this->debugLogger = static function ( string $msg ): void {
		};
	}

	/**
	 * Replaces the (by default no-op) callback that receives debug and progress messages.
	 *
	 * @param callable $debugLogger
	 * @phan-param callable(string $msg):void $debugLogger
	 */
	public function setDebugLogger( callable $debugLogger ): void {
		$this->debugLogger = $debugLogger;
	}

	/**
	 * Generates the invitation list for the given worklist.
	 *
	 * @param Worklist $worklist
	 * @return array<string,int> Map of username to score (sorted from highest to lowest), limited to
	 *   self::RESULT_USER_LIMIT entries. Numeric usernames may appear as integer keys.
	 * @throws UnexpectedValueException If the worklist contains pages from a wiki other than the current one.
	 */
	public function generate( Worklist $worklist ): array {
		$currentWikiID = WikiMap::getCurrentWikiId();
		$revisionsByWiki = [];
		foreach ( $worklist->getPagesByWiki() as $wiki => $pages ) {
			// PHP turns numeric-looking array keys into integers; go back to a string for the strict comparison
			// and the string type hint below.
			$wiki = (string)$wiki;
			if ( $wiki !== $currentWikiID ) {
				// TODO: Re-implement support for multi-wiki worklists. Currently not doable because we'd need to read,
				// and possibly merge, cross-wiki user preferences. Note that this code is currently unreachable.
				throw new UnexpectedValueException( "Unexpected foreign page on $wiki" );
			}
			$wikiRevisions = $this->getAllRevisionsForWiki( $wiki, $pages );
			if ( $wikiRevisions ) {
				// Wikis without any relevant revision are simply left out.
				$revisionsByWiki[$wiki] = $wikiRevisions;
			}
		}
		if ( !$revisionsByWiki ) {
			return [];
		}

		$rankedUsers = $this->rankUsers( $revisionsByWiki );
		// Return just the top scores to avoid useless mile-long invitation lists. Preserve integer keys
		// in case a username is numeric and PHP cast it to int.
		return array_slice( $rankedUsers, 0, self::RESULT_USER_LIMIT, true );
	}

	/**
	 * Scans the revisions of the given pages, in batches, and returns the ones that pass the filters built by
	 * getRevisionFilterConditions(), each with the size difference from its parent revision.
	 *
	 * @param string $wikiIDStr
	 * @param PageIdentity[] $pages
	 * @return array[] List of arrays with revision data. The page is only included for debugging, and callers should
	 * not rely on its format.
	 * @phan-return list<array{username:string,userID:int,actorID:int,page:string,delta:int}>
	 */
	private function getAllRevisionsForWiki( string $wikiIDStr, array $pages ): array {
		$wikiID = $wikiIDStr === WikiMap::getCurrentWikiId()
			? WikiAwareEntity::LOCAL
			: $wikiIDStr;
		$revisionStore = $this->revisionStoreFactory->getRevisionStore( $wikiID );
		// This script may potentially scan a lot of revisions. Although the queries can use good indexes, sending them
		// to vslow hosts shouldn't hurt.
		$dbr = $this->dbProvider->getReplicaDatabase( $wikiID, 'vslow' );

		$pagesByID = [];
		foreach ( $pages as $page ) {
			$pagesByID[$page->getId( $wikiID )] = $page;
		}
		// For simplicity (e.g. below when limiting the number of revisions), order the list of page IDs. The IDs are
		// the array keys, hence ksort() and not asort() (which would compare the page objects instead).
		ksort( $pagesByID );
		$pageChunks = array_chunk( array_keys( $pagesByID ), self::PAGES_PER_QUERY );
		$totalPageChunks = count( $pageChunks );

		$baseWhereConds = $this->getRevisionFilterConditions( $wikiID, $dbr );

		$batchSize = self::REVISION_BATCH_SIZE;
		$scannedRevisionsPerPage = [];
		$revisions = [];
		$pageBatchIdx = 1;

		// Process the list of pages in smaller chunks, to avoid the optimizer making wrong decisions, and also to keep
		// the queries more readable.
		foreach ( $pageChunks as $batchPageIDs ) {
			$firstPageID = min( $batchPageIDs );
			// Pagination cursor: last ( page, timestamp, revision ID ) tuple seen so far in this chunk.
			$lastPage = 0;
			$lastTimestamp = null;
			$lastRevID = null;
			$innerBatchIdx = 1;
			do {
				$hasRevisionCursor = $lastTimestamp !== null && $lastRevID !== null;

				$progressMsg = "Running $wikiIDStr batch #$pageBatchIdx.$innerBatchIdx of $totalPageChunks " .
					"from pageID=" . $firstPageID;
				if ( $hasRevisionCursor ) {
					$progressMsg .= ", ts=$lastTimestamp, rev=$lastRevID";
				}
				( $this->debugLogger )( $progressMsg );

				// The order of the keys matters: it must match the ORDER BY of the query below.
				$paginationConds = [ 'rev_page' => $lastPage ];
				if ( $hasRevisionCursor ) {
					$paginationConds['rev_timestamp'] = $lastTimestamp;
					$paginationConds['rev_id'] = $lastRevID;
				}

				$res = $revisionStore->newSelectQueryBuilder( $dbr )
					->field( 'actor_name' )
					// Needed for the user_is_temp check.
					->joinUser()
					->where( $baseWhereConds )
					->andWhere( [ 'rev_page' => $batchPageIDs ] )
					->andWhere( $dbr->buildComparison( '>', $paginationConds ) )
					->orderBy( [ 'rev_page', 'rev_timestamp', 'rev_id' ], SelectQueryBuilder::SORT_ASC )
					->limit( $batchSize )
					->caller( __METHOD__ )
					->fetchResultSet();

				// First pass: collect the parent of each revision, so that all parent sizes can be looked up at once.
				$parents = [];
				foreach ( $res as $row ) {
					$parentID = (int)$row->rev_parent_id;
					if ( $parentID !== 0 ) {
						$parents[(int)$row->rev_id] = $parentID;
					}
				}

				$parentSizes = $revisionStore->getRevisionSizes( array_values( $parents ) );

				// Second pass: compute the delta of each revision and advance the pagination cursor.
				foreach ( $res as $row ) {
					$pageID = (int)$row->rev_page;
					$revID = (int)$row->rev_id;
					$parentID = $parents[$revID] ?? null;
					// A revision without a parent, or whose parent size could not be looked up, counts from zero.
					$parentSize = $parentID !== null ? ( $parentSizes[$parentID] ?? 0 ) : 0;
					$revisions[] = [
						'username' => $row->actor_name,
						'userID' => (int)$row->rev_user,
						'actorID' => (int)$row->rev_actor,
						'page' => $pagesByID[$pageID]->__toString(),
						'delta' => (int)$row->rev_len - $parentSize
					];
					$scannedRevisionsPerPage[$pageID] = ( $scannedRevisionsPerPage[$pageID] ?? 0 ) + 1;
					$lastPage = $pageID;
					$lastTimestamp = $row->rev_timestamp;
					$lastRevID = $revID;
				}

				// Nothing is recorded for $lastPage if no row has been returned for this chunk yet.
				if ( ( $scannedRevisionsPerPage[$lastPage] ?? 0 ) >= self::REVISIONS_PER_PAGE_LIMIT ) {
					// If we've already analyzed enough revisions for this page, move on to the next one.
					// Ideally we'd set a limit in the query above, but that seems difficult, especially considering
					// the limited subset of SQL we can use. So, just use this as an approximate limit. The ultimate
					// goal is to avoid choking on pages with lots of revisions, so the limit doesn't have to be exact.
					// Unsetting all the pagination conditions except for the page one makes us go straight to the
					// next page in the list.
					$lastTimestamp = null;
					$lastRevID = null;
				}
				$innerBatchIdx++;
				if ( !defined( 'MW_PHPUNIT_TEST' ) ) {
					// Sleep after every batch to avoid putting too much load on DB servers with the revision queries.
					sleep( 1 );
				}
				// A full batch means that there may be more rows to fetch for this chunk.
			} while ( $res->numRows() >= $batchSize );
			$pageBatchIdx++;
		}

		return $revisions;
	}

	/**
	 * Returns an array of filters to apply to the revision query.
	 *
	 * @param string|false $wikiID
	 * @param IReadableDatabase $dbr
	 * @return array
	 */
	private function getRevisionFilterConditions( $wikiID, IReadableDatabase $dbr ): array {
		$filterConditions = [];

		// Exclude all sorts of deleted revisions to avoid any chance of data leaks.
		$filterConditions['rev_deleted'] = 0;

		// Exclude anons and temp users.
		$filterConditions[] = $dbr->expr( 'actor_user', '!=', null );
		$filterConditions['user_is_temp'] = 0;

		// Exclude anything too old.
		$cutoffSeconds = self::CUTOFF_DAYS * 24 * 60 * 60;
		$startTime = (int)ConvertibleTimestamp::now( TS_UNIX ) - $cutoffSeconds;
		$filterConditions[] = $dbr->expr( 'rev_timestamp', '>=', $dbr->timestamp( $startTime ) );

		// Exclude both edits that have been reverted, and edits that revert other edits. Neither of these is relevant,
		// and can easily skew the deltas.
		$changeTagDefStore = $this->nameTableStoreFactory->getChangeTagDef( $wikiID );
		$revertTagIDs = [];
		foreach ( [ ...ChangeTags::REVERT_TAGS, ChangeTags::TAG_REVERTED ] as $tagName ) {
			try {
				$revertTagIDs[] = $changeTagDefStore->getId( $tagName );
			} catch ( NameTableAccessException $e ) {
				// There's no tag ID if no revisions have ever been tagged with this tag.
			}
		}
		if ( $revertTagIDs ) {
			$tagSubquery = $dbr->newSelectQueryBuilder()
				->select( '1' )
				->from( 'change_tag' )
				->where( [ 'ct_rev_id = rev_id', 'ct_tag_id' => $revertTagIDs ] );
			$filterConditions[] = 'NOT EXISTS(' . $tagSubquery->getSQL() . ')';
		}

		// Exclude users who have a sitewide infinite block. Which tables are queried depends on the migration stage.
		$readOldBlockSchema = (bool)( $this->blockTargetMigrationStage & SCHEMA_COMPAT_READ_OLD );
		$blocksSubquery = $dbr->newSelectQueryBuilder()->select( '1' );
		if ( $readOldBlockSchema ) {
			$blocksSubquery
				->from( 'ipblocks' )
				->where( [
					$dbr->expr( 'ipb_user', '!=', 0 ),
					'actor_rev_user.actor_user = ipb_user',
					'ipb_expiry' => $dbr->getInfinity(),
					'ipb_sitewide' => 1,
				] );
		} else {
			$blocksSubquery
				->from( 'block' )
				->join( 'block_target', null, 'bt_id=bl_target' )
				->where( [
					$dbr->expr( 'bt_user', '!=', null ),
					'actor_rev_user.actor_user = bt_user',
					'bl_expiry' => $dbr->getInfinity(),
					'bl_sitewide' => 1,
				] );
		}
		$filterConditions[] = 'NOT EXISTS(' . $blocksSubquery->getSQL() . ')';

		// Exclude bots. Note, this only checks whether a user is *currently* a bot, not whether
		// they were a bot at the time the edit was made.
		// XXX: Ideally we would use GroupPermissionLookup to list user groups with the 'bot' right, but that
		// only works for the local wiki.
		$botSubquery = $dbr->newSelectQueryBuilder()
			->select( '1' )
			->from( 'user_groups' )
			->where( [
				'actor_rev_user.actor_user = ug_user',
				'ug_group' => 'bot',
			] );
		$filterConditions[] = 'NOT EXISTS(' . $botSubquery->getSQL() . ')';

		return $filterConditions;
	}

	/**
	 * This method takes a list of contributors along with the total number of bytes they added for each page, and
	 * returns a list of the same users, ranked by the likelihood of them being interested in the event.
	 *
	 * @param array[] $revisionsByWiki
	 * @phpcs:ignore Generic.Files.LineLength
	 * @phan-param array<string,list<array{username:string,userID:int,actorID:int,page:string,delta:int}>> $revisionsByWiki
	 * @return array<string,int> List of users along with their score, sorted from highest to lowest.
	 */
	private function rankUsers( array $revisionsByWiki ): array {
		// Remove the revisions made by users who have opted out of invitation lists.
		$filteredRevisions = $this->filterUsersByPreference( $revisionsByWiki );
		$deltasByUser = $this->getDeltasByUser( $filteredRevisions );
		$userDataByWiki = $this->getUserDataByWiki( $filteredRevisions );
		( $this->debugLogger )( "==Scoring debug info==" );
		$rankedUsers = [];
		foreach ( $deltasByUser as $username => $byteDeltas ) {
			// Make sure the username is a string to satisfy the type hint. PHP will have transformed it to an integer
			// if the username is numeric (when used as array key).
			$score = $this->getUserScore( (string)$username, $byteDeltas, $userDataByWiki[$username] );
			if ( $score >= self::MIN_SCORE ) {
				$rankedUsers[$username] = $score;
			}
		}
		arsort( $rankedUsers );
		( $this->debugLogger )( "\n" );
		return $rankedUsers;
	}

	/**
	 * Sums the byte deltas of each user for each page, and keeps only the pages where the total is positive.
	 *
	 * @param array[] $revisionsByWiki
	 * @phpcs:ignore Generic.Files.LineLength
	 * @phan-param array<string,list<array{username:string,userID:int,actorID:int,page:string,delta:int}>> $revisionsByWiki
	 * @return array<string,int[]> For each user, this contains a list of deltas in bytes across all relevant pages.
	 */
	private function getDeltasByUser( array $revisionsByWiki ): array {
		$listByUser = [];
		// Flatten the list, merging revisions from all wikis.
		$revisions = array_merge( ...array_values( $revisionsByWiki ) );
		foreach ( $revisions as [ 'username' => $username, 'page' => $pageKey, 'delta' => $delta ] ) {
			$listByUser[$username][$pageKey] = ( $listByUser[$username][$pageKey] ?? 0 ) + $delta;
		}

		$deltas = [];
		( $this->debugLogger )( "==Contributions==" );
		foreach ( $listByUser as $user => $userDeltas ) {
			foreach ( $userDeltas as $pageKey => $delta ) {
				( $this->debugLogger )( "$user - $pageKey - $delta" );
			}
			// TODO: What should we do with negative totals? Large negative deltas do not necessarily indicate that a
			// user is not interested in the article. This problem is somewhat mitigated by the exclusion of reverts,
			// but there are still situations where a negative delta might be a good thing. For instance, if someone has
			// moved a section of the article to a separate page. In general, the byte count itself is far from being
			// perfect as a metric. For now, we're excluding negative deltas because some of the formulas below expect
			// the total delta to be positive.
			$positiveDeltas = array_filter( $userDeltas, static fn ( $x ) => $x > 0 );
			if ( $positiveDeltas ) {
				$deltas[$user] = array_values( $positiveDeltas );
			}
		}
		( $this->debugLogger )( "\n" );
		return $deltas;
	}

	/**
	 * Returns user identifiers (name, ID, actor ID) for each contributor, for each wiki where they made edits to
	 * articles in the worklist. This can't just use UserIdentity because that doesn't include the actor ID, which we
	 * need for other queries later (particularly in getDaysSinceLastEdit()). Alternatively we could use
	 * ActorNormalization or a join on the user table, but both seem unnecessary (and potentially slow) when we already
	 * have the actor ID available.
	 *
	 * @param array[] $revisionsByWiki
	 * @phpcs:ignore Generic.Files.LineLength
	 * @phan-param array<string,list<array{username:string,userID:int,actorID:int,page:string,delta:int}>> $revisionsByWiki
	 * @return int[][][] Indexed by username first, then wiki ID.
	 * @phan-return array<string,array<string,array{userID:int,actorID:int}>>
	 */
	private function getUserDataByWiki( array $revisionsByWiki ): array {
		$userData = [];
		foreach ( $revisionsByWiki as $wiki => $revisions ) {
			foreach ( $revisions as $revision ) {
				$userData[$revision['username']][$wiki] = [
					'userID' => $revision['userID'],
					'actorID' => $revision['actorID'],
				];
			}
		}
		return $userData;
	}

	/**
	 * Returns a score from 0 to 100 for a given user.
	 *
	 * @param string $username
	 * @param int[] $byteDeltas
	 * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ]
	 * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki
	 * @return int
	 */
	private function getUserScore( string $username, array $byteDeltas, array $userDataByWiki ): int {
		// Idea: Maybe check how many edits each user has for each page, and handle each edit separately.
		// This would allow us to better handle outliers like single edits that add a lot of content.
		// Unsure how valuable that would be though, because huge edits can represent at least two different things:
		// 1 - Automated maintenance operation (e.g., adding archive links via IABot)
		// 2 - Substantial additions of content (for example, but not necessarily, upon page creation)
		// Our goal is to avoid 1, while catching 2, which might be difficult. Still, if a user has multiple edits for a
		// given page, it's more likely that they may have a genuine interest in the article subject, as opposed to them
		// performing some mass-maintenance operation that happened to touch a certain article.

		$bytesScore = $this->getOverallBytesScore( $username, $byteDeltas );
		$editCountScore = $this->getEditCountScore( $username, $userDataByWiki );
		$recentActivityScore = $this->getRecentActivityScore( $username, $userDataByWiki );

		// Once we have a (0, 1) score for each criterion, we combine them to obtain an overall score. This is currently
		// doing a weighted geometric mean. Amongst the advantages of the geometric mean is that it's conveniently
		// sensitive to small values. In practice, this means that even a single low score (around zero) will bring the
		// overall score down to around zero.
		$bytesWeight = 4;
		$editCountWeight = 1;
		$recentActivityWeight = 5;
		$totalWeight = $bytesWeight + $editCountWeight + $recentActivityWeight;
		$weightedProduct = ( $bytesScore ** $bytesWeight ) *
			( $editCountScore ** $editCountWeight ) *
			( $recentActivityScore ** $recentActivityWeight );
		$overallScore = $weightedProduct ** ( 1 / $totalWeight );
		return (int)round( 100 * $overallScore );
	}

	/**
	 * Returns a (0, 1) score based on the number and size of contributions that a single user made across all pages
	 * in the worklist.
	 *
	 * @param string $username
	 * @param int[] $deltas Total delta for each page. An empty list yields a score of 0.
	 * @return float
	 */
	private function getOverallBytesScore( string $username, array $deltas ): float {
		// This function computes a (0, 1) score for each page. Then, we get the maximum of those scores and "boost"
		// it by using the other scores. Let us indicate the overall scoring function with f(x), where x is a
		// k-dimensional vector. Let x_m be the component in x with the maximum value. We then have f(x) = x_m * b(x),
		// where b(x) is the boosting function, which outputs values in [ 1, 1 / x_m ].
		// f(x) satisfies the following conditions:
		// * f(x) ∈ [0, 1]
		// * f( x_1 ) = x_1 (single variable case)
		// * f( 0, ..., 0 ) = 0, f( 1, ..., 1 ) = 1
		// * f(x) >= x_m, where the equality holds true iff all components (at most with the exception of x_m) are 0,
		// or x_m = 1
		// Note that the case x_m = 0 is defined separately to avoid annoyances with denominators.
		// The b(x) currently used is calculated by taking x_m and linearly amplifying it by a factor proportional to
		// the second largest component, then iterating the process for every component. This is very empirical, and
		// it would be better to scale the amplification based on the actual byte values, by establishing more rigorous
		// relationship to determine how much we should favour a given total delta being spread across multiple pages,
		// as opposed to being concentrated in a single page. Hopefully, this simple approach would suffice for now.
		// The two-dimensional version of the function can be visualised in https://www.desmos.com/3d/41a47c8129
		if ( !$deltas ) {
			// Nothing to score; also avoids reading a non-existing first element below.
			return 0.0;
		}
		rsort( $deltas );
		$numPages = count( $deltas );
		$maxBytesScore = $this->getBytesScoreForPage( $deltas[0] );
		( $this->debugLogger )( "User $username max bytes $deltas[0] with score $maxBytesScore" );
		$bytesScore = $maxBytesScore;
		if ( $maxBytesScore !== 0.0 ) {
			$damping = 1;
			for ( $i = 1; $i < $numPages; $i++ ) {
				$curBytesScore = $this->getBytesScoreForPage( $deltas[$i] );
				if ( $curBytesScore === 0.0 ) {
					// Scores from here on no longer have any effect.
					break;
				}
				( $this->debugLogger )( "User $username bytes score #$i: $curBytesScore" );
				$bytesScore *= 1 + ( $curBytesScore ** $damping ) * ( 1 / $bytesScore - 1 );
			}
		}
		( $this->debugLogger )( "User $username overall bytes score $bytesScore" );
		return $bytesScore;
	}

	/**
	 * Returns a (0, 1) score based on the contributions made to a single page.
	 *
	 * @param int $delta Bytes added to the page. Zero or negative values score 0.
	 * @return float
	 */
	private function getBytesScoreForPage( int $delta ): float {
		if ( $delta <= 0 ) {
			// The formula below is only defined for positive deltas: it divides by a power of the delta, and raises
			// the delta to a fractional power.
			return 0.0;
		}
		// Because we use bytes as the main metric in determining the overall score, it's important that the score
		// function is as good as possible. This is a logistic-like model, but multiplied by a function that's really
		// flat near 0, which acts as a sort of high-pass filter.
		// The values for the two parameters have been computed numerically via gradient descent in order to minimize
		// the sum of squared residuals. Then, they've been approximated to more readable values. Both the original fit
		// and the rounded fit can be visualized in https://www.desmos.com/calculator/eu7u0kwkd6
		$scaledX = $delta / 1000;
		$baseScore = 2 / ( 1 + exp( -0.42 * ( $scaledX ** 1.1 ) ) ) - 1;
		$flatteningFactor = exp( -0.00001 / ( $scaledX ** 10 ) );
		return $baseScore * $flatteningFactor;
	}

	/**
	 * Returns a (0, 1) score based on the edit count of the given user.
	 *
	 * @param string $username
	 * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ]
	 * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki
	 * @return float
	 */
	private function getEditCountScore( string $username, array $userDataByWiki ): float {
		$editCount = $this->getEditCount( $userDataByWiki );
		// This one uses the same base model as the bytes score, but with different parameters (approximated in the
		// same way). The graph can be visualised in https://www.desmos.com/calculator/4mhrtnhf4i
		$scaledEC = $editCount / 1000;
		$editCountScore = 2 / ( 1 + exp( -3 * ( $scaledEC ** 0.66 ) ) ) - 1;
		( $this->debugLogger )( "User $username edit count $editCount, score $editCountScore" );
		return $editCountScore;
	}

	/**
	 * Returns the sum of the user's edit counts on all the given wikis.
	 *
	 * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ]
	 * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki
	 * @return int
	 */
	private function getEditCount( array $userDataByWiki ): int {
		// XXX: UserEditTracker is only available for the local wiki, and the global edit count is a CentralAuth thing
		$totalEditCount = 0;
		foreach ( $userDataByWiki as $wiki => [ 'userID' => $userID ] ) {
			$curWikiEditCount = $this->dbProvider->getReplicaDatabase( $wiki )
				->newSelectQueryBuilder()
				->select( 'user_editcount' )
				->from( 'user' )
				->where( [ 'user_id' => $userID ] )
				->caller( __METHOD__ )
				->fetchField();
			// Both "no such row" and "no stored edit count" contribute nothing to the total.
			// NOTE(review): presumably fetchField() returns false when no row matches; confirm.
			if ( $curWikiEditCount !== false && $curWikiEditCount !== null ) {
				$totalEditCount += (int)$curWikiEditCount;
			}
		}
		return $totalEditCount;
	}

	/**
	 * Returns a (0, 1) score based on the recent activity (edits) of the given user.
	 *
	 * @param string $username
	 * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ]
	 * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki
	 * @return float
	 */
	private function getRecentActivityScore( string $username, array $userDataByWiki ): float {
		// This uses a rational function, so that it does not decay exponentially over time. See
		// https://www.desmos.com/calculator/vzhiigbxc9
		// Note: we already have a hard cutoff in the revision query (self::CUTOFF_DAYS), so anything before that won't
		// even be scored.
		$daysSinceLastEdit = $this->getDaysSinceLastEdit( $userDataByWiki );
		$xMonths = $daysSinceLastEdit / 30;
		// TODO: This may have to be scaled down. Think of what the overall score would be in function of the recent
		// activity score assuming that the other scores are all 1. For instance, with c=7 and d=0.5 to speed up decay,
		// and then reducing the recent activity weight.
		$recentActivityScore = ( $xMonths ** 2 + 1.2 * $xMonths + 0.7 ) /
			( 0.3 * ( $xMonths ** 3 ) + $xMonths ** 2 + 1.2 * $xMonths + 0.7 );
		( $this->debugLogger )( "User $username last edit $daysSinceLastEdit days ago, score $recentActivityScore" );
		return $recentActivityScore;
	}

	/**
	 * Returns how many days have passed since the most recent edit made by the user on any of the given wikis.
	 *
	 * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ]
	 * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki
	 * @return float
	 * @throws RuntimeException If no edit can be found for the user on any of the given wikis.
	 */
	private function getDaysSinceLastEdit( array $userDataByWiki ): float {
		// XXX: UserEditTracker is only available for the local wiki, so just use its query directly.
		$lastEditTS = 0;
		foreach ( $userDataByWiki as $wiki => [ 'actorID' => $actorID ] ) {
			$curWikiTS = $this->dbProvider->getReplicaDatabase( $wiki )
				->newSelectQueryBuilder()
				->select( 'rev_timestamp' )
				->from( 'revision' )
				->where( [ 'rev_actor' => $actorID ] )
				->orderBy( 'rev_timestamp', SelectQueryBuilder::SORT_DESC )
				->caller( __METHOD__ )
				->fetchField();
			if ( $curWikiTS ) {
				$lastEditTS = max( $lastEditTS, (int)MWTimestamp::convert( TS_UNIX, $curWikiTS ) );
			}
		}
		if ( $lastEditTS === 0 ) {
			throw new RuntimeException( "No last edit from user who has edits?!" );
		}
		$secondsPerDay = 60 * 60 * 24;
		return ( MWTimestamp::time() - $lastEditTS ) / $secondsPerDay;
	}

	/**
	 * Removes the revisions made by users for whom the "allow invitations" preference is not enabled.
	 *
	 * @param array[] $revisionsByWiki Lists of revision data (as returned by getAllRevisionsForWiki()), by wiki.
	 * @return array[] Same structure as the input. Every wiki is preserved (possibly with an empty list), and each
	 *   list is re-indexed.
	 */
	private function filterUsersByPreference( array $revisionsByWiki ): array {
		$filteredByWiki = [];
		foreach ( $revisionsByWiki as $wiki => $revisions ) {
			// Look up the preference only once per user, as a user typically has several revisions in the list.
			$allowedByUserID = [];
			$keptRevisions = [];
			foreach ( $revisions as $revision ) {
				$userID = (int)$revision['userID'];
				if ( !array_key_exists( $userID, $allowedByUserID ) ) {
					$user = UserIdentityValue::newRegistered( $userID, $revision['username'] );
					$allowedByUserID[$userID] = $this->userOptionsLookup->getBoolOption(
						$user,
						GetPreferencesHandler::ALLOW_INVITATIONS_PREFERENCE
					);
				}
				if ( $allowedByUserID[$userID] ) {
					$keptRevisions[] = $revision;
				}
			}
			$filteredByWiki[$wiki] = $keptRevisions;
		}
		return $filteredByWiki;
	}
}