Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 356
0.00% covered (danger)
0.00%
0 / 6
CRAP
0.00% covered (danger)
0.00%
0 / 1
FlaggedRevsStats
0.00% covered (danger)
0.00%
0 / 356
0.00% covered (danger)
0.00%
0 / 6
1482
0.00% covered (danger)
0.00%
0 / 1
 getStats
0.00% covered (danger)
0.00%
0 / 51
0.00% covered (danger)
0.00%
0 / 1
240
 updateCache
0.00% covered (danger)
0.00%
0 / 84
0.00% covered (danger)
0.00%
0 / 1
30
 getPerNamespaceTotals
0.00% covered (danger)
0.00%
0 / 25
0.00% covered (danger)
0.00%
0 / 1
6
 dbUnixTime
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
6
 getMeanPendingEditTime
0.00% covered (danger)
0.00%
0 / 13
0.00% covered (danger)
0.00%
0 / 1
2
 getEditReviewTimes
0.00% covered (danger)
0.00%
0 / 182
0.00% covered (danger)
0.00%
0 / 1
182
1<?php
2
3use MediaWiki\MediaWikiServices;
4use MediaWiki\User\ActorMigration;
5use Wikimedia\Rdbms\IReadableDatabase;
6use Wikimedia\Rdbms\SelectQueryBuilder;
7use Wikimedia\Rdbms\SubQuery;
8
9/**
10 * FlaggedRevs stats functions
11 */
12class FlaggedRevsStats {
13    /**
14     * @return array of current FR stats
15     */
16    public static function getStats() {
17        $data = [
18            'reviewLag-anon-sampleSize' => '-',
19            'reviewLag-anon-average' => '-',
20            'reviewLag-anon-median' => '-',
21            'reviewLag-anon-percentile' => [],
22            'reviewLag-user-sampleSize' => '-',
23            'reviewLag-user-average' => '-',
24            'reviewLag-user-median' => '-',
25            'reviewLag-user-percentile' => [],
26            'totalPages-NS' => [],
27            'reviewedPages-NS' => [],
28            'syncedPages-NS' => [],
29            'pendingLag-average' => '-',
30            'statTimestamp' => '-',
31        ];
32
33        $dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
34        // Latest timestamp recorded
35        $timestamp = $dbr->newSelectQueryBuilder()
36            ->select( 'MAX(frs_timestamp)' )
37            ->from( 'flaggedrevs_statistics' )
38            ->caller( __METHOD__ )
39            ->fetchField();
40
41        if ( $timestamp !== false ) {
42            $data['statTimestamp'] = wfTimestamp( TS_MW, $timestamp );
43
44            $res = $dbr->newSelectQueryBuilder()
45                ->select( [ 'frs_stat_key', 'frs_stat_val' ] )
46                ->from( 'flaggedrevs_statistics' )
47                ->where( [ 'frs_timestamp' => $dbr->timestamp( $timestamp ) ] )
48                ->caller( __METHOD__ )
49                ->fetchResultSet();
50            foreach ( $res as $row ) {
51                $key = explode( ':', $row->frs_stat_key );
52                switch ( $key[0] ) {
53                    case 'reviewLag-anon-sampleSize':
54                    case 'reviewLag-anon-average':
55                    case 'reviewLag-anon-median':
56                    case 'reviewLag-user-sampleSize':
57                    case 'reviewLag-user-average':
58                    case 'reviewLag-user-median':
59                    case 'pendingLag-average':
60                        $data[$key[0]] = (int)$row->frs_stat_val;
61                        break;
62                    case 'reviewLag-anon-percentile': // <stat name,percentile>
63                    case 'reviewLag-user-percentile': // <stat name,percentile>
64                        $data[$key[0]][$key[1]] = (int)$row->frs_stat_val;
65                        break;
66                    case 'totalPages-NS': // <stat name,namespace>
67                    case 'reviewedPages-NS': // <stat name,namespace>
68                    case 'syncedPages-NS': // <stat name,namespace>
69                        $data[$key[0]][$key[1]] = (int)$row->frs_stat_val;
70                        break;
71                }
72            }
73        }
74
75        return $data;
76    }
77
78    /**
79     * Run a stats update and update the DB
80     * Note: this can easily be too expensive to run live
81     *
82     * @return void
83     */
84    public static function updateCache() {
85        $rNamespaces = FlaggedRevs::getReviewNamespaces();
86        $cache = ObjectCache::getLocalClusterInstance();
87        if ( !$rNamespaces ) {
88            return; // no SQL errors please :)
89        }
90
91        // Get total, reviewed, and synced page count for each namespace
92        [ $ns_total, $ns_reviewed, $ns_synced ] = self::getPerNamespaceTotals();
93
94        // Getting mean pending edit time
95        // @TODO: percentiles?
96        $avePET = self::getMeanPendingEditTime();
97
98        # Get wait (till review) time samples for anon edits...
99        $reviewDataAnon = self::getEditReviewTimes( $cache, 'anons' );
100        # Get wait (till review) time samples for logged-in user edits...
101        $reviewDataUser = self::getEditReviewTimes( $cache, 'users' );
102
103        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
104        // The timestamp to identify this whole batch of data
105        $encDataTimestamp = $dbw->timestamp();
106
107        $dataSet = [];
108        // Data range for samples...
109        $dataSet[] = [
110            'frs_stat_key'  => 'reviewLag-anon-sampleStartTimestamp',
111            'frs_stat_val'  => $reviewDataAnon['sampleStartTS'], // unix
112            'frs_timestamp' => $encDataTimestamp ];
113        $dataSet[] = [
114            'frs_stat_key'  => 'reviewLag-user-sampleStartTimestamp',
115            'frs_stat_val'  => $reviewDataUser['sampleStartTS'], // unix
116            'frs_timestamp' => $encDataTimestamp ];
117        $dataSet[] = [
118            'frs_stat_key'  => 'reviewLag-anon-sampleEndTimestamp',
119            'frs_stat_val'  => $reviewDataAnon['sampleEndTS'], // unix
120            'frs_timestamp' => $encDataTimestamp ];
121        $dataSet[] = [
122            'frs_stat_key'  => 'reviewLag-user-sampleEndTimestamp',
123            'frs_stat_val'  => $reviewDataUser['sampleEndTS'], // unix
124            'frs_timestamp' => $encDataTimestamp ];
125        // All-namespace percentiles...
126        foreach ( $reviewDataAnon['percTable'] as $percentile => $seconds ) {
127            $dataSet[] = [
128                'frs_stat_key'  => 'reviewLag-anon-percentile:' . (int)$percentile,
129                'frs_stat_val'  => $seconds,
130                'frs_timestamp' => $encDataTimestamp ];
131        }
132        foreach ( $reviewDataUser['percTable'] as $percentile => $seconds ) {
133            $dataSet[] = [
134                'frs_stat_key'  => 'reviewLag-user-percentile:' . (int)$percentile,
135                'frs_stat_val'  => $seconds,
136                'frs_timestamp' => $encDataTimestamp ];
137        }
138        // Sample sizes...
139        $dataSet[] = [
140            'frs_stat_key'  => 'reviewLag-anon-sampleSize',
141            'frs_stat_val'  => $reviewDataAnon['sampleSize'],
142            'frs_timestamp' => $encDataTimestamp ];
143        $dataSet[] = [
144            'frs_stat_key'  => 'reviewLag-user-sampleSize',
145            'frs_stat_val'  => $reviewDataUser['sampleSize'],
146            'frs_timestamp' => $encDataTimestamp ];
147
148        // All-namespace ave/med review lag & ave pending lag stats...
149        $dataSet[] = [
150            'frs_stat_key'  => 'reviewLag-anon-average',
151            'frs_stat_val'  => $reviewDataAnon['average'],
152            'frs_timestamp' => $encDataTimestamp ];
153        $dataSet[] = [
154            'frs_stat_key'  => 'reviewLag-user-average',
155            'frs_stat_val'  => $reviewDataUser['average'],
156            'frs_timestamp' => $encDataTimestamp ];
157        $dataSet[] = [
158            'frs_stat_key'  => 'reviewLag-anon-median',
159            'frs_stat_val'  => $reviewDataAnon['median'],
160            'frs_timestamp' => $encDataTimestamp ];
161        $dataSet[] = [
162            'frs_stat_key'  => 'reviewLag-user-median',
163            'frs_stat_val'  => $reviewDataUser['median'],
164            'frs_timestamp' => $encDataTimestamp ];
165        $dataSet[] = [
166            'frs_stat_key'  => 'pendingLag-average',
167            'frs_stat_val'  => $avePET,
168            'frs_timestamp' => $encDataTimestamp ];
169
170        // Per-namespace total/reviewed/synced stats...
171        foreach ( $rNamespaces as $namespace ) {
172            $dataSet[] = [
173                'frs_stat_key'  => 'totalPages-NS:' . (int)$namespace,
174                'frs_stat_val'  => $ns_total[$namespace] ?? 0,
175                'frs_timestamp' => $encDataTimestamp ];
176            $dataSet[] = [
177                'frs_stat_key'  => 'reviewedPages-NS:' . (int)$namespace,
178                'frs_stat_val'  => $ns_reviewed[$namespace] ?? 0,
179                'frs_timestamp' => $encDataTimestamp ];
180            $dataSet[] = [
181                'frs_stat_key'  => 'syncedPages-NS:' . (int)$namespace,
182                'frs_stat_val'  => $ns_synced[$namespace] ?? 0,
183                'frs_timestamp' => $encDataTimestamp ];
184        }
185
186        // Save the data...
187        $dbw->newInsertQueryBuilder()
188            ->insertInto( 'flaggedrevs_statistics' )
189            ->ignore()
190            ->rows( $dataSet )
191            ->caller( __METHOD__ )
192            ->execute();
193    }
194
195    /**
196     * @return int[][]
197     */
198    private static function getPerNamespaceTotals() {
199        $ns_total = [];
200        $ns_reviewed = [];
201        $ns_synced = [];
202        // Get total, reviewed, and synced page count for each namespace
203        $dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase( false, 'vslow' );
204        $res = $dbr->newSelectQueryBuilder()
205            ->select( [
206                'page_namespace',
207                'total' => 'COUNT(*)',
208                'reviewed' => 'COUNT(fp_page_id)',
209                'pending' => 'COUNT(fp_pending_since)'
210            ] )
211            ->from( 'page' )
212            ->leftJoin( 'flaggedpages', null, 'fp_page_id = page_id' )
213            ->where( [
214                'page_is_redirect' => 0,
215                'page_namespace' => FlaggedRevs::getReviewNamespaces()
216            ] )
217            ->groupBy( 'page_namespace' )
218            ->caller( __METHOD__ )
219            ->fetchResultSet();
220        foreach ( $res as $row ) {
221            $ns_total[$row->page_namespace] = (int)$row->total;
222            $ns_reviewed[$row->page_namespace] = (int)$row->reviewed;
223            $ns_synced[$row->page_namespace] = (int)$row->reviewed - (int)$row->pending;
224        }
225        return [ $ns_total, $ns_reviewed, $ns_synced ];
226    }
227
228    /**
229     * @param IReadableDatabase $db
230     * @param string $column
231     *
232     * @return string
233     */
234    private static function dbUnixTime( IReadableDatabase $db, $column ) {
235        return $db->getType() === 'sqlite' ? "strftime('%s',$column)" : "UNIX_TIMESTAMP($column)";
236    }
237
238    /**
239     * @return int
240     */
241    private static function getMeanPendingEditTime() {
242        $dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase( false, 'vslow' );
243        $nowUnix = wfTimestamp();
244        $unixTimeCall = self::dbUnixTime( $dbr, 'fp_pending_since' );
245        return (int)$dbr->newSelectQueryBuilder()
246            ->select( "AVG( $nowUnix - $unixTimeCall )" )
247            ->from( 'flaggedpages' )
248            ->join( 'page', null, 'fp_page_id = page_id' )
249            ->where( [
250                $dbr->expr( 'fp_pending_since', '!=', null ),
251                'page_namespace' => FlaggedRevs::getReviewNamespaces() // sanity
252            ] )
253            ->caller( __METHOD__ )
254            ->fetchField();
255    }
256
257    /**
258     * Get edit review time statistics (as recent as possible)
259     * @param BagOStuff $cache
260     * @param string $users string "anons" or "users"
261     * @return array associative
262     */
263    private static function getEditReviewTimes( $cache, $users ) {
264        $result = [
265            'average'       => 0,
266            'median'        => 0,
267            'percTable'     => [],
268            'sampleSize'    => 0,
269            'sampleStartTS' => null,
270            'sampleEndTS'   => null
271        ];
272        if ( FlaggedRevs::useOnlyIfProtected() ) {
273            return $result; // disabled
274        }
275
276        $actorMigration = ActorMigration::newMigration();
277        $actorQuery = $actorMigration->getJoin( 'rev_user' );
278
279        $rPerTable = []; // review wait percentiles
280        # Only go so far back...otherwise we will get garbage values due to
281        # the fact that FlaggedRevs wasn't enabled until after a while.
282        $dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase( false, 'vslow' );
283        $installedUnix = $dbr->newSelectQueryBuilder()
284            ->select( self::dbUnixTime( $dbr, 'MIN(log_timestamp)' ) )
285            ->from( 'logging' )
286            ->where( [ 'log_type' => 'review' ] )
287            ->caller( __METHOD__ )
288            ->fetchField();
289        $dbInstalled = $dbr->timestamp( $installedUnix ?: wfTimestamp() );
290        # Skip the most recent recent revs as they are likely to just
291        # be WHERE condition misses. This also gives us more data to use.
292        # Lastly, we want to avoid bias that would make the time too low
293        # since new revisions could not have "took a long time to sight".
294        $worstLagTS = $dbr->timestamp(); // now
295        $lastTS = $dbInstalled;
296        while ( true ) { // should almost always be ~1 pass
297            # Get the page with the worst pending lag...
298            $row = $dbr->newSelectQueryBuilder()
299                ->select( [ 'fpp_page_id', 'fpp_rev_id', 'fpp_pending_since', 'fr_timestamp' ] )
300                ->from( 'flaggedpage_pending' )
301                ->join( 'flaggedrevs', null, [ 'fr_page_id = fpp_page_id', 'fr_rev_id = fpp_rev_id' ] )
302                ->where( [
303                    'fpp_quality' => 0, // "checked"
304                    $dbr->expr( 'fpp_pending_since', '>', $lastTS ), // skip failed rows
305                ] )
306                ->caller( __METHOD__ )
307                ->orderBy( 'fpp_pending_since' )
308                ->fetchRow();
309            if ( !$row ) {
310                break;
311            }
312            # Find the newest revision at the time the page was reviewed,
313            # this is the one that *should* have been reviewed.
314            $idealRev = (int)$dbr->newSelectQueryBuilder()
315                ->select( 'rev_id' )
316                ->from( 'revision' )
317                ->where( [
318                    'rev_page' => $row->fpp_page_id,
319                    $dbr->expr( 'rev_timestamp', '<', $row->fr_timestamp ),
320                ] )
321                ->caller( __METHOD__ )
322                ->orderBy( 'rev_timestamp', SelectQueryBuilder::SORT_DESC )
323                ->fetchField();
324            if ( $row->fpp_rev_id >= $idealRev ) {
325                $worstLagTS = $row->fpp_pending_since;
326                break; // sane $worstLagTS found
327            # Fudge factor to prevent deliberate reviewing of non-current revisions
328            # from squeezing the range. Shouldn't effect anything otherwise.
329            } else {
330                $lastTS = $row->fpp_pending_since; // next iteration
331            }
332        }
333        # User condition (anons/users)
334        if ( $users === 'anons' ) {
335            $userCondition = $actorMigration->isAnon( $actorQuery['fields']['rev_user'] );
336        } elseif ( $users === 'users' ) {
337            $userCondition = $actorMigration->isNotAnon( $actorQuery['fields']['rev_user'] );
338        } else {
339            throw new InvalidArgumentException( 'Invalid $users param given.' );
340        }
341        # Avoid having to censor data
342        # Note: if no edits pending, $worstLagTS is the cur time just before we checked
343        # for the worst lag. Thus, new edits *right* after the check are properly excluded.
344        $maxTSUnix = (int)wfTimestamp( TS_UNIX, $worstLagTS ) - 1; // all edits later reviewed
345        $dbMaxTS = $dbr->timestamp( $maxTSUnix );
346        # Use a one week time range
347        $days = 7;
348        $minTSUnix = $maxTSUnix - $days * 86400;
349        $dbMinTS = $dbr->timestamp( $minTSUnix );
350        # Approximate the number rows to scan
351        $rows = $dbr->newSelectQueryBuilder()
352            ->select( '1' )
353            ->from( 'revision' )
354            ->tables( $actorQuery['tables'] )
355            ->where( $userCondition )
356            ->andWhere( [
357                $dbr->expr( 'rev_timestamp', '>=', $dbMinTS ),
358                $dbr->expr( 'rev_timestamp', '<=', $dbMaxTS ),
359            ] )
360            ->joinConds( $actorQuery['joins'] )
361            ->caller( __METHOD__ )
362            ->estimateRowCount();
363        # If the range doesn't have many rows (like on small wikis), use 30 days
364        if ( $rows < 500 ) {
365            $days = 30;
366            $minTSUnix = $maxTSUnix - $days * 86400;
367            $dbMinTS = $dbr->addQuotes( $dbr->timestamp( $minTSUnix ) );
368            # Approximate rows to scan
369            $rows = $dbr->newSelectQueryBuilder()
370                ->select( '1' )
371                ->from( 'revision' )
372                ->tables( $actorQuery['tables'] )
373                ->where( $userCondition )
374                ->andWhere( [
375                    $dbr->expr( 'rev_timestamp', '>=', $dbMinTS ),
376                    $dbr->expr( 'rev_timestamp', '<=', $dbMaxTS ),
377                ] )
378                ->joinConds( $actorQuery['joins'] )
379                ->caller( __METHOD__ )
380                ->estimateRowCount();
381            # If the range doesn't have many rows (like on really tiny wikis), use 90 days
382            if ( $rows < 500 ) {
383                $days = 90;
384                $minTSUnix = $maxTSUnix - $days * 86400;
385            }
386        }
387        $sampleSize = 1500; // sample size
388        # Sanity check the starting timestamp
389        $minTSUnix = max( $minTSUnix, $installedUnix );
390        $dbMinTS = $dbr->timestamp( $minTSUnix );
391        # Get timestamp boundaries
392        $timeCondition = [
393            $dbr->expr( 'rev_timestamp', '>=', $dbMinTS ),
394            $dbr->expr( 'rev_timestamp', '<=', $dbMaxTS ),
395        ];
396        # Get mod for edit spread
397        $fname = __METHOD__;
398        $edits = $cache->getWithSetCallback(
399            $cache->makeKey( 'flaggedrevs', 'rcEditCount', $users, $days ),
400            $cache::TTL_WEEK * 2,
401            static function () use ( $dbr, $fname, $userCondition, $timeCondition, $actorQuery ) {
402                return (int)$dbr->newSelectQueryBuilder()
403                    ->select( 'COUNT(*)' )
404                    ->from( 'revision' )
405                    ->join( 'page', null, 'page_id = rev_page' )
406                    ->tables( $actorQuery['tables'] )
407                    ->where( [
408                        $userCondition,
409                        $timeCondition, // in time range
410                        'page_namespace' => FlaggedRevs::getReviewNamespaces()
411                    ] )
412                    ->joinConds( $actorQuery['joins'] )
413                    ->caller( $fname )
414                    ->fetchField();
415            }
416        );
417        $mod = max( floor( $edits / $sampleSize ), 1 ); # $mod >= 1
418        # For edits that started off pending, how long do they take to get reviewed?
419        # Edits started off pending if made when a flagged rev of the page already existed.
420        # Get the *first* reviewed rev *after* each edit and get the time difference.
421        $res = $dbr->newSelectQueryBuilder()
422            ->select( [
423                'rt' => 'rev_timestamp', // time revision was made
424                'nft' => new SubQuery( $dbr->newSelectQueryBuilder()
425                    ->select( 'MIN(fr_timestamp)' )
426                    ->from( 'flaggedrevs' )
427                    ->where( [
428                        'fr_page_id = rev_page',
429                        'fr_rev_timestamp >= rev_timestamp'
430                    ] )
431                    ->caller( __METHOD__ )
432                    ->getSQL()
433                ) // time when revision was first reviewed
434            ] )
435            ->from( 'revision' )
436            ->tables( $actorQuery['tables'] )
437            ->where( [
438                $userCondition,
439                $timeCondition,
440                "(rev_id % $mod) = 0",
441                $dbr->expr( 'rev_parent_id', '>', 0 ), // optimize (exclude new pages)
442                'EXISTS (' . $dbr->newSelectQueryBuilder()
443                    ->select( '*' )
444                    ->from( 'flaggedrevs' )
445                    ->where( [ // page was reviewed when this revision was made
446                        'fr_page_id = rev_page',
447                        'fr_rev_timestamp < rev_timestamp', // before this revision
448                        'fr_rev_id < rev_id', // not imported later
449                        'fr_timestamp < rev_timestamp', // page reviewed before revision
450                    ] )
451                    ->caller( __METHOD__ )
452                    ->getSQL() .
453                ')'
454            ] )
455            ->caller( __METHOD__ )
456            ->joinConds( $actorQuery['joins'] )
457            ->fetchResultSet();
458
459        $secondsR = 0; // total wait seconds for edits later reviewed
460        $secondsP = 0; // total wait seconds for edits still pending
461        $times = [];
462        if ( $res->numRows() ) {
463            # Get the elapsed times revs were pending (flagged time - edit time)
464            foreach ( $res as $row ) {
465                $time = (int)wfTimestamp( TS_UNIX, $row->nft ) - (int)wfTimestamp( TS_UNIX, $row->rt );
466                $time = max( $time, 0 ); // sanity
467                $secondsR += $time;
468                $times[] = $time;
469            }
470            $sampleSize = count( $times );
471            $aveRT = ( $secondsR + $secondsP ) / $sampleSize; // sample mean
472            sort( $times ); // order smallest -> largest
473            // Sample median
474            $rank = intval( round( count( $times ) / 2 + 0.5 ) - 1 );
475            $medianRT = $times[$rank];
476            // Make percentile tabulation data
477            $doPercentiles = [ 35, 45, 55, 65, 75, 85, 90, 95 ];
478            foreach ( $doPercentiles as $percentile ) {
479                $rank = intval( round( $percentile * count( $times ) / 100 + 0.5 ) - 1 );
480                $rPerTable[$percentile] = $times[$rank];
481            }
482            $result['average']       = $aveRT;
483            $result['median']        = $medianRT;
484            $result['percTable']     = $rPerTable;
485            $result['sampleSize']    = count( $times );
486            $result['sampleStartTS'] = $minTSUnix;
487            $result['sampleEndTS']   = $maxTSUnix;
488        }
489
490        return $result;
491    }
492}