Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
89.49% |
247 / 276 |
|
60.00% |
9 / 15 |
CRAP | |
0.00% |
0 / 1 |
GenerateInvitationList | |
89.82% |
247 / 275 |
|
60.00% |
9 / 15 |
55.97 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
execute | |
87.50% |
14 / 16 |
|
0.00% |
0 / 1 |
4.03 | |||
getArticlesByWiki | |
75.00% |
21 / 28 |
|
0.00% |
0 / 1 |
11.56 | |||
getAllRevisionsForWiki | |
98.48% |
65 / 66 |
|
0.00% |
0 / 1 |
8 | |||
getRevisionFilterConditions | |
72.00% |
36 / 50 |
|
0.00% |
0 / 1 |
5.55 | |||
rankUsers | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
getDeltasByUser | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
5 | |||
getUserDataByWiki | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
getUserScore | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
1 | |||
getOverallBytesScore | |
80.00% |
12 / 15 |
|
0.00% |
0 / 1 |
4.13 | |||
getBytesScoreForPage | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getEditCountScore | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
getEditCount | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
3 | |||
getRecentActivityScore | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
getDaysSinceLastEdit | |
93.75% |
15 / 16 |
|
0.00% |
0 / 1 |
4.00 |
1 | <?php |
2 | |
3 | declare( strict_types=1 ); |
4 | |
5 | namespace MediaWiki\Extension\CampaignEvents\Maintenance; |
6 | |
7 | use ChangeTags; |
8 | use Maintenance; |
9 | use MediaWiki\MainConfigNames; |
10 | use MediaWiki\MediaWikiServices; |
11 | use MediaWiki\Page\PageIdentity; |
12 | use MediaWiki\Storage\NameTableAccessException; |
13 | use MediaWiki\Utils\MWTimestamp; |
14 | use MediaWiki\WikiMap\WikiMap; |
15 | use Wikimedia\Rdbms\IReadableDatabase; |
16 | use Wikimedia\Rdbms\SelectQueryBuilder; |
17 | use Wikimedia\Timestamp\ConvertibleTimestamp; |
18 | |
19 | /** |
20 | * This script takes a list of pages as input, and outputs a list of users who've edited those pages the most. Each |
21 | * user is given a score from 0 to 100 based on how likely they are to be a productive participant in an event that is |
22 | * focused on improving the given pages. This score is based on how many edits someone made (and how big they are) to |
23 | * the pages in the list, their global edit count, and their recent global activity. |
24 | * NOTE: This script is just a demo / proof of concept. It is not based on any real-world data and the calculations |
25 | * are very much non-rigorous. |
26 | */ |
class GenerateInvitationList extends Maintenance {
	/**
	 * How many days to look back into the past when scanning revisions. Anything older than this is
	 * excluded from the revision query entirely (see getRevisionFilterConditions()).
	 * TODO: Is 3 years OK?
	 */
	public const CUTOFF_DAYS = 3 * 365;
	/**
	 * Declares the script description, the extension requirement, and the `listfile` option.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Generates a list of potential event participants by looking at who contributed to a given list of pages'
		);
		$this->requireExtension( 'CampaignEvents' );
		// The two trailing booleans presumably mark the option as required and as taking a value,
		// matching Maintenance::addOption( $name, $description, $required, $withArg ) — TODO confirm.
		$this->addOption(
			'listfile',
			'Path to a file with a list of articles to get contributors for. The file should have one page per ' .
			'line, in the following format: `[wikiID, or empty for the local wiki]:[page title]`. All the pages ' .
			'must be in the mainspace.',
			true,
			true
		);
	}
49 | |
50 | /** |
51 | * @inheritDoc |
52 | */ |
53 | public function execute(): void { |
54 | $pagesByWiki = $this->getArticlesByWiki(); |
55 | $this->output( |
56 | "==Articles==\n" . implode( "\n", array_merge( ...array_values( $pagesByWiki ) ) ) . "\n\n" |
57 | ); |
58 | |
59 | $revisionsByWiki = []; |
60 | foreach ( $pagesByWiki as $wiki => $pages ) { |
61 | $revisionsByWiki[$wiki] = $this->getAllRevisionsForWiki( $wiki, $pages ); |
62 | } |
63 | $revisionsByWiki = array_filter( $revisionsByWiki ); |
64 | if ( !$revisionsByWiki ) { |
65 | $this->output( "No revisions found.\n" ); |
66 | return; |
67 | } |
68 | |
69 | $rankedUsers = $this->rankUsers( $revisionsByWiki ); |
70 | $out = "\n==Contributor scores==\n"; |
71 | foreach ( $rankedUsers as $username => $score ) { |
72 | $out .= "$username - $score\n"; |
73 | } |
74 | $this->output( $out . "\n\n" ); |
75 | } |
76 | |
77 | /** |
78 | * Reads a list of articles from the file passed as `listfile` to the script. |
79 | * |
80 | * @return PageIdentity[][] Map of [ wiki ID => non-empty list of articles ] |
81 | * @phan-return non-empty-array<string|false,non-empty-list<PageIdentity>> |
82 | */ |
83 | private function getArticlesByWiki(): array { |
84 | $listPath = $this->getOption( 'listfile' ); |
85 | // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged |
86 | $rawList = @file_get_contents( $listPath ); |
87 | if ( $rawList === false ) { |
88 | $this->fatalError( "Cannot read list of articles" ); |
89 | } |
90 | |
91 | $curWikiID = WikiMap::getCurrentWikiId(); |
92 | $pageStoreFactory = MediaWikiServices::getInstance()->getPageStoreFactory(); |
93 | $pagesByWiki = []; |
94 | foreach ( explode( "\n", $rawList ) as $line ) { |
95 | if ( $line === '' ) { |
96 | continue; |
97 | } |
98 | |
99 | $lineParts = explode( ':', $line, 2 ); |
100 | if ( count( $lineParts ) !== 2 ) { |
101 | $this->fatalError( "Line without wiki ID: $line" ); |
102 | } |
103 | // XXX: We're using the actual wiki ID instead of WikiAwareEntity::LOCAL for the local wiki, so that PHP |
104 | // won't autocast it to `0` when used as array key. |
105 | $wikiID = $lineParts[0] === '' ? $curWikiID : $lineParts[0]; |
106 | $title = $lineParts[1]; |
107 | $pageStore = $pageStoreFactory->getPageStore( $wikiID ); |
108 | // Note: If $title happens to contain a namespace identifier, or really anything that cannot be parsed in |
109 | // the context of the current wiki, this method won't behave correctly due to T353916. There doesn't seem |
110 | // to be much that we can do about it. |
111 | $page = $pageStore->getPageByText( $title ); |
112 | if ( !$page ) { |
113 | $this->fatalError( "Invalid title: $title" ); |
114 | } elseif ( !$page->exists() ) { |
115 | $this->fatalError( "Page does not exist: $title" ); |
116 | } elseif ( $page->getNamespace() !== NS_MAIN ) { |
117 | $this->fatalError( "Page is not in the mainspace: $title" ); |
118 | } |
119 | $pagesByWiki[$wikiID] ??= []; |
120 | $pagesByWiki[$wikiID][] = $page; |
121 | } |
122 | |
123 | if ( !$pagesByWiki ) { |
124 | $this->fatalError( "Empty list of articles" ); |
125 | } |
126 | |
127 | return $pagesByWiki; |
128 | } |
129 | |
	/**
	 * Fetches the qualifying revisions made to the given pages on a single wiki.
	 *
	 * Pages are processed in chunks of 25, and within each chunk revisions are read in batches of up to
	 * 2500 rows, using keyset pagination on (rev_page, rev_timestamp, rev_id). The filter conditions come
	 * from getRevisionFilterConditions().
	 *
	 * @param string|false $wikiID
	 * @param PageIdentity[] $pages
	 * @return array[] List of arrays with revision data. The page is only included for debugging, and callers should
	 * not rely on its format.
	 * @phan-return list<array{username:string,userID:int,actorID:int,page:string,delta:int}>
	 */
	private function getAllRevisionsForWiki( $wikiID, array $pages ): array {
		$revisionStore = MediaWikiServices::getInstance()->getRevisionStoreFactory()->getRevisionStore( $wikiID );
		$dbProvider = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
		// This script may potentially scan a lot of revisions. Although the queries can use good indexes, sending them
		// to vslow hosts shouldn't hurt.
		$dbr = $dbProvider->getReplicaDatabase( $wikiID, 'vslow' );

		$pagesByID = [];
		foreach ( $pages as $page ) {
			$pageID = $page->getId( $wikiID );
			$pagesByID[$pageID] = $page;
		}
		// NOTE(review): asort() sorts by the PageIdentity *values*, not by the numeric page-ID keys;
		// ksort() looks like the intent here. This only affects the order of the chunks below, not which
		// revisions end up being collected.
		asort( $pagesByID );
		$pageChunks = array_chunk( array_keys( $pagesByID ), 25 );
		$totalPageChunks = count( $pageChunks );

		$baseWhereConds = $this->getRevisionFilterConditions( $wikiID, $dbr );

		$batchSize = 2500;
		$revisions = [];
		$pageBatchIdx = 1;

		// Process the list of pages in smaller chunks, to avoid the optimizer making wrong decisions, and also to keep
		// the queries more readable.
		foreach ( $pageChunks as $batchPageIDs ) {
			// Keyset-pagination cursor: resume strictly after the last (page, timestamp, rev ID) tuple seen.
			$lastPage = 0;
			$lastTimestamp = $dbr->timestamp( '20000101000000' );
			$lastRevID = 0;
			$innerBatchIdx = 1;
			do {
				$progressMsg = "Running $wikiID batch #$pageBatchIdx.$innerBatchIdx of $totalPageChunks " .
					"from pageID=" . min( $batchPageIDs );
				if ( $lastRevID !== 0 ) {
					$progressMsg .= ", ts=$lastTimestamp, rev=$lastRevID";
				}
				$this->output( $progressMsg . "\n" );
				$revQueryBuilder = $revisionStore->newSelectQueryBuilder( $dbr );
				$res = $revQueryBuilder
					->field( 'actor_name' )
					// Needed for the user_is_temp check.
					->joinUser()
					->where( $baseWhereConds )
					->andWhere( [ 'rev_page' => $batchPageIDs ] )
					->andWhere( [
						$dbr->buildComparison( '>', [
							'rev_page' => $lastPage,
							'rev_timestamp' => $lastTimestamp,
							'rev_id' => $lastRevID
						] )
					] )
					->orderBy( [ 'rev_page', 'rev_timestamp', 'rev_id' ], SelectQueryBuilder::SORT_ASC )
					->limit( $batchSize )
					->caller( __METHOD__ )
					->fetchResultSet();

				// First pass: collect the parent revision IDs, so that their sizes can be fetched in one batch.
				$parents = [];
				foreach ( $res as $row ) {
					$parentID = (int)$row->rev_parent_id;
					if ( $parentID !== 0 ) {
						$parents[$row->rev_id] = $parentID;
					}
				}

				$parentSizes = $revisionStore->getRevisionSizes( array_values( $parents ) );

				// Second pass: compute each revision's byte delta and advance the pagination cursor.
				foreach ( $res as $row ) {
					$parentID = $parents[$row->rev_id] ?? null;
					// Revisions without a parent (parent ID 0) count their full length as the delta.
					$parentSize = $parentID ? $parentSizes[$parentID] : 0;
					$revisions[] = [
						'username' => $row->actor_name,
						'userID' => $row->rev_user,
						'actorID' => $row->rev_actor,
						'page' => $pagesByID[$row->rev_page]->__toString(),
						'delta' => (int)$row->rev_len - $parentSize
					];
					$lastPage = (int)$row->rev_page;
					$lastTimestamp = $row->rev_timestamp;
					$lastRevID = (int)$row->rev_id;
				}

				$innerBatchIdx++;
				// Pause between batches, presumably to go easy on the replicas — TODO confirm.
				sleep( 1 );
			} while ( $res->numRows() >= $batchSize );
			$pageBatchIdx++;
		}

		return $revisions;
	}
225 | |
	/**
	 * Returns an array of filters to apply to the revision query.
	 *
	 * The conditions exclude: deleted revisions; anons and temp users; revisions older than the cutoff;
	 * reverts and reverted edits; users with a sitewide infinite block; and bots. Some conditions reference
	 * actor/user columns (e.g. `actor_rev_user.actor_user`, `user_is_temp`), so the revision query they are
	 * attached to must join against those tables.
	 *
	 * @param string|false $wikiID
	 * @param IReadableDatabase $dbr
	 * @return array
	 */
	private function getRevisionFilterConditions( $wikiID, IReadableDatabase $dbr ): array {
		$filterConditions = [];

		// Exclude all sorts of deleted revisions to avoid any chance of data leaks.
		$filterConditions['rev_deleted'] = 0;

		// Exclude anons and temp users.
		$filterConditions[] = $dbr->expr( 'actor_user', '!=', null );
		$filterConditions['user_is_temp'] = 0;

		// Exclude anything too old.
		$startTime = (int)ConvertibleTimestamp::now( TS_UNIX ) - self::CUTOFF_DAYS * 24 * 60 * 60;
		$filterConditions[] = $dbr->expr( 'rev_timestamp', '>=', $dbr->timestamp( $startTime ) );

		// Exclude both edits that have been reverted, and edits that revert other edits. Neither of these is relevant,
		// and can easily skew the deltas.
		$nameTableStoreFactory = MediaWikiServices::getInstance()->getNameTableStoreFactory();
		$changeTagDefStore = $nameTableStoreFactory->getChangeTagDef( $wikiID );
		$revertTagIDs = [];
		foreach ( [ ...ChangeTags::REVERT_TAGS, ChangeTags::TAG_REVERTED ] as $tagName ) {
			try {
				$revertTagIDs[] = $changeTagDefStore->getId( $tagName );
			} catch ( NameTableAccessException $e ) {
				// There's no tag ID if no revisions have ever been tagged with this tag.
			}
		}
		if ( $revertTagIDs ) {
			// Correlated NOT EXISTS subquery: `ct_rev_id = rev_id` references the outer revision row.
			$tagSubquery = $dbr->newSelectQueryBuilder()
				->select( '1' )
				->from( 'change_tag' )
				->where( [ 'ct_rev_id = rev_id', 'ct_tag_id' => $revertTagIDs ] );
			$filterConditions[] = 'NOT EXISTS(' . $tagSubquery->getSQL() . ')';
		}

		// Exclude users who have a sitewide infinite block.
		// Pick the legacy `ipblocks` schema or the new `block`/`block_target` schema, depending on the
		// block target migration stage of this MediaWiki installation.
		$readOldBlockSchema = (bool)( MediaWikiServices::getInstance()->getMainConfig()
			->get( MainConfigNames::BlockTargetMigrationStage ) & SCHEMA_COMPAT_READ_OLD );
		if ( $readOldBlockSchema ) {
			$blocksSubquery = $dbr->newSelectQueryBuilder()
				->select( '1' )
				->from( 'ipblocks' )
				->where( [
					$dbr->expr( 'ipb_user', '!=', 0 ),
					'actor_rev_user.actor_user = ipb_user',
					'ipb_expiry' => $dbr->getInfinity(),
					'ipb_sitewide' => 1,
				] );
		} else {
			$blocksSubquery = $dbr->newSelectQueryBuilder()
				->select( '1' )
				->from( 'block' )
				->join( 'block_target', null, 'bt_id=bl_target' )
				->where( [
					$dbr->expr( 'bt_user', '!=', null ),
					'actor_rev_user.actor_user = bt_user',
					'bl_expiry' => $dbr->getInfinity(),
					'bl_sitewide' => 1,
				] );
		}
		$filterConditions[] = 'NOT EXISTS(' . $blocksSubquery->getSQL() . ')';

		// Exclude bots. Note, this only checks whether a user is *currently* a bot, not whether
		// they were a bot at the time the edit was made.
		// XXX: Ideally we would use GroupPermissionLookup to list user groups with the 'bot' right, but that
		// only works for the local wiki.
		$botSubquery = $dbr->newSelectQueryBuilder()
			->select( '1' )
			->from( 'user_groups' )
			->where( [
				'actor_rev_user.actor_user = ug_user',
				'ug_group' => 'bot',
			] );
		$filterConditions[] = 'NOT EXISTS(' . $botSubquery->getSQL() . ')';

		return $filterConditions;
	}
309 | |
310 | /** |
311 | * This method takes a list of contributors along with the total number of bytes they added for each page, and |
312 | * returns a list of the same users, ranked by the likelihood of them being interested in the event. |
313 | * |
314 | * @param array[] $revisionsByWiki |
315 | * @phpcs:ignore Generic.Files.LineLength |
316 | * @phan-param array<string,list<array{username:string,userID:int,actorID:int,page:string,delta:int}>> $revisionsByWiki |
317 | * @return array<string,int> List of users along with their score, sorted from highest to lowest. |
318 | */ |
319 | private function rankUsers( array $revisionsByWiki ): array { |
320 | $deltasByUser = $this->getDeltasByUser( $revisionsByWiki ); |
321 | $userDataByWiki = $this->getUserDataByWiki( $revisionsByWiki ); |
322 | $this->output( "==Scoring debug info==\n" ); |
323 | $rankedUsers = []; |
324 | foreach ( $deltasByUser as $username => $byteDeltas ) { |
325 | // Make sure the username is a string to satisfy the type hint. PHP will have transformed it to an integer |
326 | // if the username is numeric (when used as array key). |
327 | $score = $this->getUserScore( (string)$username, $byteDeltas, $userDataByWiki[$username] ); |
328 | $rankedUsers[$username] = $score; |
329 | } |
330 | arsort( $rankedUsers ); |
331 | $this->output( "\n\n" ); |
332 | return $rankedUsers; |
333 | } |
334 | |
335 | /** |
336 | * @param array[] $revisionsByWiki |
337 | * * @phpcs:ignore Generic.Files.LineLength |
338 | * * @phan-param array<string,list<array{username:string,userID:int,actorID:int,page:string,delta:int}>> $revisionsByWiki |
339 | * @return array<string,int[]> For each user, this contains a list of deltas in bytes across all relevant pages. |
340 | */ |
341 | private function getDeltasByUser( array $revisionsByWiki ): array { |
342 | $listByUser = []; |
343 | // Flatten the list, merging revisions from all wikis. |
344 | $revisions = array_merge( ...array_values( $revisionsByWiki ) ); |
345 | foreach ( $revisions as [ 'username' => $username, 'page' => $pageKey, 'delta' => $delta ] ) { |
346 | $listByUser[$username] ??= []; |
347 | $listByUser[$username][$pageKey] ??= 0; |
348 | $listByUser[$username][$pageKey] += $delta; |
349 | } |
350 | |
351 | $deltas = []; |
352 | $this->output( "==Contributions==\n" ); |
353 | foreach ( $listByUser as $user => $userDeltas ) { |
354 | foreach ( $userDeltas as $pageKey => $delta ) { |
355 | $this->output( "$user - $pageKey - $delta\n" ); |
356 | } |
357 | // TODO: What should we do with negative totals? Large negative deltas do not necessarily indicate that a |
358 | // user is not interested in the article. This problem is somewhat mitigated by the exclusion of reverts, |
359 | // but there are still situations where a negative delta might be a good thing. For instance, if someone has |
360 | // moved a section of the article to a separate page. In general, the byte count itself is far from being |
361 | // perfect as a metric. For now, we're excluding negative deltas because some of the formulas below expect |
362 | // the total delta to be positive. |
363 | $positiveDeltas = array_filter( $userDeltas, static fn ( $x ) => $x > 0 ); |
364 | if ( $positiveDeltas ) { |
365 | $deltas[$user] = array_values( $positiveDeltas ); |
366 | } |
367 | } |
368 | $this->output( "\n\n" ); |
369 | return $deltas; |
370 | } |
371 | |
372 | /** |
373 | * Returns user identifiers (name, ID, actor ID) for each contributor, for each wiki where they made edits to |
374 | * articles in the worklist. This can't just use UserIdentity because that doesn't include the actor ID, which we |
375 | * need for other queries later (particularly in getDaysSinceLastEdit()). Alternatively we could use |
376 | * ActorNormalization or a join on the user table, but both seem unnecessary (and potentially slow) when we already |
377 | * have the actor ID available. |
378 | * |
379 | * @param array[] $revisionsByWiki |
380 | * @phpcs:ignore Generic.Files.LineLength |
381 | * @phan-param array<string,list<array{username:string,userID:int,actorID:int,page:string,delta:int}>> $revisionsByWiki |
382 | * @return int[][][] Indexed by username first, then wiki ID. |
383 | * @phan-return array<string,array<string,array{userID:int,actorID:int}>> |
384 | */ |
385 | private function getUserDataByWiki( array $revisionsByWiki ): array { |
386 | $userData = []; |
387 | foreach ( $revisionsByWiki as $wiki => $revisions ) { |
388 | foreach ( $revisions as [ 'username' => $username, 'userID' => $userID, 'actorID' => $actorID ] ) { |
389 | $userData[$username][$wiki] = [ 'userID' => $userID, 'actorID' => $actorID ]; |
390 | } |
391 | } |
392 | return $userData; |
393 | } |
394 | |
395 | /** |
396 | * Returns a score from 0 to 100 for a given user. |
397 | * |
398 | * @param string $username |
399 | * @param int[] $byteDeltas |
400 | * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ] |
401 | * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki |
402 | * @return int |
403 | */ |
404 | private function getUserScore( string $username, array $byteDeltas, array $userDataByWiki ): int { |
405 | // Idea: Maybe check how many edits each user has for each page, and handle each edit separately. |
406 | // This would allow us to better handle outliers like single edits that add a lot of content. |
407 | // Unsure how valuable that would be though, because huge edits can represent at least two different things: |
408 | // 1 - Automated maintenance operation (e.g., adding archive links via IABot) |
409 | // 2 - Substantial additions of content (for example, but not necessarily, upon page creation) |
410 | // Our goal is to avoid 1, while catching 2, which might be difficult. Still, if a user has multiple edits for a |
411 | // given page, it's more likely that they may have a genuine interest in the article subject, as opposed to them |
412 | // performing some mass-maintenance operation that happened to touch a certain article. |
413 | |
414 | $bytesScore = $this->getOverallBytesScore( $username, $byteDeltas ); |
415 | $editCountScore = $this->getEditCountScore( $username, $userDataByWiki ); |
416 | $recentActivityScore = $this->getRecentActivityScore( $username, $userDataByWiki ); |
417 | |
418 | // Once we have a (0, 1) score for each criterion, we combine them to obtain an overall score. This is currently |
419 | // doing a weighted geometric mean. Amongst the advantages of the geometric mean is that it's conveniently |
420 | // sensitive to small values. In practice, this means that even a single low score (around zero) will bring the |
421 | // overall score down to around zero. |
422 | $bytesWeight = 4; |
423 | $editCountWeight = 1; |
424 | $recentActivityWeight = 5; |
425 | $overallScore = ( |
426 | ( $bytesScore ** $bytesWeight ) * |
427 | ( $editCountScore ** $editCountWeight ) * |
428 | ( $recentActivityScore ** $recentActivityWeight ) |
429 | ) ** ( 1 / ( $bytesWeight + $editCountWeight + $recentActivityWeight ) ); |
430 | return (int)round( 100 * $overallScore ); |
431 | } |
432 | |
433 | /** |
434 | * Returns a (0, 1) score based on the number and size of contributions that a single user made across all pages |
435 | * in the worklist. |
436 | * |
437 | * @param string $username |
438 | * @param int[] $deltas |
439 | * @return float |
440 | */ |
441 | private function getOverallBytesScore( string $username, array $deltas ): float { |
442 | // This function computed a (0, 1) score for each page. Then, we get the maximum of those scores and "boost" |
443 | // it by using the other scores. Let us indicate the overall scoring function with f(x), where x is a |
444 | // k-dimentional vector. Let x_m be the component in x with the maximum value. We then have f(x) = x_m * b(x), |
445 | // where b(x) is the boosting function, which outputs values in [ 1, 1 / x_m ]. |
446 | // f(x) satisfies the following conditions: |
447 | // * f(x) ∈ [0, 1] |
448 | // * f( x_1 ) = x_1 (single variable case) |
449 | // * f( 0, ..., 0 ) = 0, f( 1, ..., 1 ) = 1 |
450 | // * f(x) >= x_m, where the equality holds true iff all components (at most with the exception of x_m) are 0, |
451 | // or x_m = 1 |
452 | // Note that the case x_m = 0 is defined separately to avoid annoyances with denominators. |
453 | // The b(x) currently used is calculated by taking x_m and linearly amplifying it by a factor proportional to |
454 | // the second largest component, then iterating the process for every component. This is very empirical, and |
455 | // it would be better to scale the amplification based on the actual byte values, by establishing more rigorous |
456 | // relationship to determine how much we shuld favour a given total delta being spread across multiple pages, as |
457 | // opposed to being concentrated in a single page. Hopefully, this simple approach would suffice for now. |
458 | // The two-dimensional version of the function can be visualised in https://www.desmos.com/3d/41a47c8129 |
459 | rsort( $deltas ); |
460 | $numPages = count( $deltas ); |
461 | $maxBytesScore = $this->getBytesScoreForPage( $deltas[0] ); |
462 | $this->output( "User $username max bytes $deltas[0] with score $maxBytesScore\n" ); |
463 | $bytesScore = $maxBytesScore; |
464 | if ( $maxBytesScore !== 0.0 ) { |
465 | for ( $i = 1; $i < $numPages; $i++ ) { |
466 | $curBytesScore = $this->getBytesScoreForPage( $deltas[$i] ); |
467 | if ( $curBytesScore === 0.0 ) { |
468 | // Scores from here on no longer have any effect. |
469 | break; |
470 | } |
471 | $this->output( "User $username bytes score #$i: $curBytesScore\n" ); |
472 | $damping = 1; |
473 | $bytesScore *= 1 + ( $curBytesScore ** $damping ) * ( 1 / $bytesScore - 1 ); |
474 | } |
475 | } |
476 | $this->output( "User $username overall bytes score $bytesScore\n" ); |
477 | return $bytesScore; |
478 | } |
479 | |
480 | /** |
481 | * Returns a (0, 1) score based on the contributions made to a single page. |
482 | * |
483 | * @param int $delta |
484 | * @return float |
485 | */ |
486 | private function getBytesScoreForPage( int $delta ): float { |
487 | // Because we use bytes as the main metric in determining the overall score, it's important that the score |
488 | // function is as good as possible. This is a logistic-like model, but multiplied by a function that's really |
489 | // flat near 0, which acts as a sort of high-pass filter. |
490 | // The values for the two parameters have been computed numerically via gradient descent in order to minimize |
491 | // the sum of squared residuals. Then, they've been approximated to more readable values. Both the original fit |
492 | // and the rounded fit can be visualized in https://www.desmos.com/calculator/eu7u0kwkd6 |
493 | $scaledX = $delta / 1000; |
494 | $baseScore = 2 / ( 1 + exp( -0.42 * ( $scaledX ** 1.1 ) ) ) - 1; |
495 | $flatteningFactor = exp( -0.00001 / ( $scaledX ** 10 ) ); |
496 | return $baseScore * $flatteningFactor; |
497 | } |
498 | |
499 | /** |
500 | * Returns a (0, 1) score based on the edit count of the given user. |
501 | * |
502 | * @param string $username |
503 | * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ] |
504 | * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki |
505 | * @return float |
506 | */ |
507 | private function getEditCountScore( string $username, array $userDataByWiki ): float { |
508 | $editCount = $this->getEditCount( $userDataByWiki ); |
509 | // This one uses the same base model as the bytes score, but with different parameters (approximated in the |
510 | // same way). The graph can be visualised in https://www.desmos.com/calculator/4mhrtnhf4i |
511 | $scaledEC = $editCount / 1000; |
512 | $editCountScore = 2 / ( 1 + exp( -3 * ( $scaledEC ** 0.66 ) ) ) - 1; |
513 | $this->output( "User $username edit count $editCount, score $editCountScore\n" ); |
514 | return $editCountScore; |
515 | } |
516 | |
517 | /** |
518 | * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ] |
519 | * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki |
520 | * @return int |
521 | */ |
522 | private function getEditCount( array $userDataByWiki ): int { |
523 | // XXX: UserEditTracker is only available for the local wiki, and the global edit count is a CentralAuth thing |
524 | $totalEditCount = 0; |
525 | foreach ( $userDataByWiki as $wiki => [ 'userID' => $userID ] ) { |
526 | $dbr = MediaWikiServices::getInstance()->getDBLoadBalancerFactory() |
527 | ->getReplicaDatabase( $wiki ); |
528 | $curWikiEditCount = $dbr->newSelectQueryBuilder() |
529 | ->select( 'user_editcount' ) |
530 | ->from( 'user' ) |
531 | ->where( [ 'user_id' => $userID ] ) |
532 | ->caller( __METHOD__ ) |
533 | ->fetchField(); |
534 | if ( $curWikiEditCount !== null ) { |
535 | $totalEditCount += (int)$curWikiEditCount; |
536 | } |
537 | } |
538 | return $totalEditCount; |
539 | } |
540 | |
541 | /** |
542 | * Returns a (0, 1) score based on the recent activity (edits) of the given user. |
543 | * |
544 | * @param string $username |
545 | * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ] |
546 | * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki |
547 | * @return float |
548 | */ |
549 | private function getRecentActivityScore( string $username, array $userDataByWiki ): float { |
550 | // This uses a rational function, so that it does not decay exponentially over time. See |
551 | // https://www.desmos.com/calculator/vzhiigbxc9 |
552 | // Note: we already have a hard cutoff in the revision query (self::CUTOFF_DAYS), so anything before that won't |
553 | // even be scored. |
554 | $daysSinceLastEdit = $this->getDaysSinceLastEdit( $userDataByWiki ); |
555 | $xMonths = $daysSinceLastEdit / 30; |
556 | // TODO: This may have to be scaled down. Think of what the overall score would be in function of the recent |
557 | // activity score assuming that the other scores are all 1. For instance, with c=7 and d=0.5 to speed up decay, |
558 | // and then reducing the recent activity weight. |
559 | $recentActivityScore = ( $xMonths ** 2 + 1.2 * $xMonths + 0.7 ) / |
560 | ( 0.3 * ( $xMonths ** 3 ) + $xMonths ** 2 + 1.2 * $xMonths + 0.7 ); |
561 | $this->output( "User $username last edit $daysSinceLastEdit days ago, score $recentActivityScore\n" ); |
562 | return $recentActivityScore; |
563 | } |
564 | |
565 | /** |
566 | * @param int[][] $userDataByWiki Map of [ wiki => [ userID: int, actorID: int ] ] |
567 | * @phan-param array<string,array{userID:int,actorID:int}> $userDataByWiki |
568 | * @return float |
569 | */ |
570 | private function getDaysSinceLastEdit( array $userDataByWiki ): float { |
571 | // XXX: UserEditTracker is only available for the local wiki, so just use its query directly. |
572 | $lastEditTS = 0; |
573 | foreach ( $userDataByWiki as $wiki => [ 'actorID' => $actorID ] ) { |
574 | $dbr = MediaWikiServices::getInstance()->getDBLoadBalancerFactory() |
575 | ->getReplicaDatabase( $wiki ); |
576 | $curWikiTS = $dbr->newSelectQueryBuilder() |
577 | ->select( 'rev_timestamp' ) |
578 | ->from( 'revision' ) |
579 | ->where( [ 'rev_actor' => $actorID ] ) |
580 | ->orderBy( 'rev_timestamp', SelectQueryBuilder::SORT_DESC ) |
581 | ->caller( __METHOD__ ) |
582 | ->fetchField(); |
583 | if ( $curWikiTS ) { |
584 | $lastEditTS = max( $lastEditTS, (int)MWTimestamp::convert( TS_UNIX, $curWikiTS ) ); |
585 | } |
586 | } |
587 | if ( $lastEditTS === 0 ) { |
588 | $this->fatalError( "No last edit from user who has edits?!" ); |
589 | } |
590 | return ( MWTimestamp::time() - $lastEditTS ) / ( 60 * 60 * 24 ); |
591 | } |
592 | } |
593 | |
// NOTE(review): the file exposes the script by returning its class name; confirm this matches the
// maintenance entry-point convention expected by this MediaWiki version's runner.
return GenerateInvitationList::class;