Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
95.10% covered (success)
95.10%
330 / 347
71.43% covered (warning)
71.43%
5 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
FlaggedRevsStats
95.10% covered (success)
95.10%
330 / 347
71.43% covered (warning)
71.43%
5 / 7
41
0.00% covered (danger)
0.00%
0 / 1
 getStats
100.00% covered (success)
100.00%
51 / 51
100.00% covered (success)
100.00%
1 / 1
15
 updateCache
98.82% covered (success)
98.82%
84 / 85
0.00% covered (danger)
0.00%
0 / 1
5
 getPerNamespaceTotals
100.00% covered (success)
100.00%
25 / 25
100.00% covered (success)
100.00%
1 / 1
2
 dbUnixTime
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 getMeanPendingEditTime
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
1
 getEditReviewTimes
89.87% covered (warning)
89.87%
142 / 158
0.00% covered (danger)
0.00%
0 / 1
15.23
 estimateRevisionRowCount
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3use MediaWiki\MediaWikiServices;
4use Wikimedia\ObjectCache\BagOStuff;
5use Wikimedia\Rdbms\IExpression;
6use Wikimedia\Rdbms\IReadableDatabase;
7use Wikimedia\Rdbms\RawSQLExpression;
8use Wikimedia\Rdbms\SelectQueryBuilder;
9use Wikimedia\Rdbms\Subquery;
10
11/**
12 * FlaggedRevs stats functions
13 */
14class FlaggedRevsStats {
15    /**
16     * @return array of current FR stats
17     */
18    public static function getStats() {
19        $data = [
20            'reviewLag-anon-sampleSize' => '-',
21            'reviewLag-anon-average' => '-',
22            'reviewLag-anon-median' => '-',
23            'reviewLag-anon-percentile' => [],
24            'reviewLag-user-sampleSize' => '-',
25            'reviewLag-user-average' => '-',
26            'reviewLag-user-median' => '-',
27            'reviewLag-user-percentile' => [],
28            'totalPages-NS' => [],
29            'reviewedPages-NS' => [],
30            'syncedPages-NS' => [],
31            'pendingLag-average' => '-',
32            'statTimestamp' => '-',
33        ];
34
35        $dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
36        // Latest timestamp recorded
37        $timestamp = $dbr->newSelectQueryBuilder()
38            ->select( 'MAX(frs_timestamp)' )
39            ->from( 'flaggedrevs_statistics' )
40            ->caller( __METHOD__ )
41            ->fetchField();
42
43        if ( $timestamp !== false ) {
44            $data['statTimestamp'] = wfTimestamp( TS_MW, $timestamp );
45
46            $res = $dbr->newSelectQueryBuilder()
47                ->select( [ 'frs_stat_key', 'frs_stat_val' ] )
48                ->from( 'flaggedrevs_statistics' )
49                ->where( [ 'frs_timestamp' => $dbr->timestamp( $timestamp ) ] )
50                ->caller( __METHOD__ )
51                ->fetchResultSet();
52            foreach ( $res as $row ) {
53                $key = explode( ':', $row->frs_stat_key );
54                switch ( $key[0] ) {
55                    case 'reviewLag-anon-sampleSize':
56                    case 'reviewLag-anon-average':
57                    case 'reviewLag-anon-median':
58                    case 'reviewLag-user-sampleSize':
59                    case 'reviewLag-user-average':
60                    case 'reviewLag-user-median':
61                    case 'pendingLag-average':
62                        $data[$key[0]] = (int)$row->frs_stat_val;
63                        break;
64                    case 'reviewLag-anon-percentile': // <stat name,percentile>
65                    case 'reviewLag-user-percentile': // <stat name,percentile>
66                        $data[$key[0]][$key[1]] = (int)$row->frs_stat_val;
67                        break;
68                    case 'totalPages-NS': // <stat name,namespace>
69                    case 'reviewedPages-NS': // <stat name,namespace>
70                    case 'syncedPages-NS': // <stat name,namespace>
71                        $data[$key[0]][$key[1]] = (int)$row->frs_stat_val;
72                        break;
73                }
74            }
75        }
76
77        return $data;
78    }
79
80    /**
81     * Run a stats update and update the DB
82     * Note: this can easily be too expensive to run live
83     *
84     * @return void
85     */
86    public static function updateCache() {
87        $rNamespaces = FlaggedRevs::getReviewNamespaces();
88        $cache = MediaWikiServices::getInstance()
89            ->getObjectCacheFactory()->getLocalClusterInstance();
90        if ( !$rNamespaces ) {
91            return; // no SQL errors please :)
92        }
93
94        // Get total, reviewed, and synced page count for each namespace
95        [ $ns_total, $ns_reviewed, $ns_synced ] = self::getPerNamespaceTotals();
96
97        // Getting mean pending edit time
98        // @TODO: percentiles?
99        $avePET = self::getMeanPendingEditTime();
100
101        # Get wait (till review) time samples for anon edits...
102        $reviewDataAnon = self::getEditReviewTimes( $cache, 'anons' );
103        # Get wait (till review) time samples for logged-in user edits...
104        $reviewDataUser = self::getEditReviewTimes( $cache, 'users' );
105
106        $dbw = MediaWikiServices::getInstance()->getConnectionProvider()->getPrimaryDatabase();
107        // The timestamp to identify this whole batch of data
108        $encDataTimestamp = $dbw->timestamp();
109
110        $dataSet = [];
111        // Data range for samples...
112        $dataSet[] = [
113            'frs_stat_key'  => 'reviewLag-anon-sampleStartTimestamp',
114            'frs_stat_val'  => $reviewDataAnon['sampleStartTS'], // unix
115            'frs_timestamp' => $encDataTimestamp ];
116        $dataSet[] = [
117            'frs_stat_key'  => 'reviewLag-user-sampleStartTimestamp',
118            'frs_stat_val'  => $reviewDataUser['sampleStartTS'], // unix
119            'frs_timestamp' => $encDataTimestamp ];
120        $dataSet[] = [
121            'frs_stat_key'  => 'reviewLag-anon-sampleEndTimestamp',
122            'frs_stat_val'  => $reviewDataAnon['sampleEndTS'], // unix
123            'frs_timestamp' => $encDataTimestamp ];
124        $dataSet[] = [
125            'frs_stat_key'  => 'reviewLag-user-sampleEndTimestamp',
126            'frs_stat_val'  => $reviewDataUser['sampleEndTS'], // unix
127            'frs_timestamp' => $encDataTimestamp ];
128        // All-namespace percentiles...
129        foreach ( $reviewDataAnon['percTable'] as $percentile => $seconds ) {
130            $dataSet[] = [
131                'frs_stat_key'  => 'reviewLag-anon-percentile:' . (int)$percentile,
132                'frs_stat_val'  => $seconds,
133                'frs_timestamp' => $encDataTimestamp ];
134        }
135        foreach ( $reviewDataUser['percTable'] as $percentile => $seconds ) {
136            $dataSet[] = [
137                'frs_stat_key'  => 'reviewLag-user-percentile:' . (int)$percentile,
138                'frs_stat_val'  => $seconds,
139                'frs_timestamp' => $encDataTimestamp ];
140        }
141        // Sample sizes...
142        $dataSet[] = [
143            'frs_stat_key'  => 'reviewLag-anon-sampleSize',
144            'frs_stat_val'  => $reviewDataAnon['sampleSize'],
145            'frs_timestamp' => $encDataTimestamp ];
146        $dataSet[] = [
147            'frs_stat_key'  => 'reviewLag-user-sampleSize',
148            'frs_stat_val'  => $reviewDataUser['sampleSize'],
149            'frs_timestamp' => $encDataTimestamp ];
150
151        // All-namespace ave/med review lag & ave pending lag stats...
152        $dataSet[] = [
153            'frs_stat_key'  => 'reviewLag-anon-average',
154            'frs_stat_val'  => $reviewDataAnon['average'],
155            'frs_timestamp' => $encDataTimestamp ];
156        $dataSet[] = [
157            'frs_stat_key'  => 'reviewLag-user-average',
158            'frs_stat_val'  => $reviewDataUser['average'],
159            'frs_timestamp' => $encDataTimestamp ];
160        $dataSet[] = [
161            'frs_stat_key'  => 'reviewLag-anon-median',
162            'frs_stat_val'  => $reviewDataAnon['median'],
163            'frs_timestamp' => $encDataTimestamp ];
164        $dataSet[] = [
165            'frs_stat_key'  => 'reviewLag-user-median',
166            'frs_stat_val'  => $reviewDataUser['median'],
167            'frs_timestamp' => $encDataTimestamp ];
168        $dataSet[] = [
169            'frs_stat_key'  => 'pendingLag-average',
170            'frs_stat_val'  => $avePET,
171            'frs_timestamp' => $encDataTimestamp ];
172
173        // Per-namespace total/reviewed/synced stats...
174        foreach ( $rNamespaces as $namespace ) {
175            $dataSet[] = [
176                'frs_stat_key'  => 'totalPages-NS:' . (int)$namespace,
177                'frs_stat_val'  => $ns_total[$namespace] ?? 0,
178                'frs_timestamp' => $encDataTimestamp ];
179            $dataSet[] = [
180                'frs_stat_key'  => 'reviewedPages-NS:' . (int)$namespace,
181                'frs_stat_val'  => $ns_reviewed[$namespace] ?? 0,
182                'frs_timestamp' => $encDataTimestamp ];
183            $dataSet[] = [
184                'frs_stat_key'  => 'syncedPages-NS:' . (int)$namespace,
185                'frs_stat_val'  => $ns_synced[$namespace] ?? 0,
186                'frs_timestamp' => $encDataTimestamp ];
187        }
188
189        // Save the data...
190        $dbw->newInsertQueryBuilder()
191            ->insertInto( 'flaggedrevs_statistics' )
192            ->ignore()
193            ->rows( $dataSet )
194            ->caller( __METHOD__ )
195            ->execute();
196    }
197
198    /**
199     * @return int[][]
200     */
201    private static function getPerNamespaceTotals() {
202        $ns_total = [];
203        $ns_reviewed = [];
204        $ns_synced = [];
205        // Get total, reviewed, and synced page count for each namespace
206        $dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase( false, 'vslow' );
207        $res = $dbr->newSelectQueryBuilder()
208            ->select( [
209                'page_namespace',
210                'total' => 'COUNT(*)',
211                'reviewed' => 'COUNT(fp_page_id)',
212                'pending' => 'COUNT(fp_pending_since)'
213            ] )
214            ->from( 'page' )
215            ->leftJoin( 'flaggedpages', null, 'fp_page_id = page_id' )
216            ->where( [
217                'page_is_redirect' => 0,
218                'page_namespace' => FlaggedRevs::getReviewNamespaces()
219            ] )
220            ->groupBy( 'page_namespace' )
221            ->caller( __METHOD__ )
222            ->fetchResultSet();
223        foreach ( $res as $row ) {
224            $ns_total[$row->page_namespace] = (int)$row->total;
225            $ns_reviewed[$row->page_namespace] = (int)$row->reviewed;
226            $ns_synced[$row->page_namespace] = (int)$row->reviewed - (int)$row->pending;
227        }
228        return [ $ns_total, $ns_reviewed, $ns_synced ];
229    }
230
231    /**
232     * @param IReadableDatabase $db
233     * @param string $column
234     *
235     * @return string
236     */
237    private static function dbUnixTime( IReadableDatabase $db, $column ) {
238        return $db->getType() === 'sqlite' ? "strftime('%s',$column)" : "UNIX_TIMESTAMP($column)";
239    }
240
241    /**
242     * @return int
243     */
244    private static function getMeanPendingEditTime() {
245        $dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase( false, 'vslow' );
246        $nowUnix = wfTimestamp();
247        $unixTimeCall = self::dbUnixTime( $dbr, 'fp_pending_since' );
248        return (int)$dbr->newSelectQueryBuilder()
249            ->select( "AVG( $nowUnix - $unixTimeCall )" )
250            ->from( 'flaggedpages' )
251            ->join( 'page', null, 'fp_page_id = page_id' )
252            ->where( [
253                $dbr->expr( 'fp_pending_since', '!=', null ),
254                'page_namespace' => FlaggedRevs::getReviewNamespaces() // sanity
255            ] )
256            ->caller( __METHOD__ )
257            ->fetchField();
258    }
259
260    /**
261     * Get edit review time statistics (as recent as possible)
262     * @param BagOStuff $cache
263     * @param string $users string "anons" or "users"
264     * @return array associative
265     */
266    private static function getEditReviewTimes( $cache, $users ) {
267        $result = [
268            'average'       => 0,
269            'median'        => 0,
270            'percTable'     => [],
271            'sampleSize'    => 0,
272            'sampleStartTS' => null,
273            'sampleEndTS'   => null
274        ];
275        if ( FlaggedRevs::useOnlyIfProtected() ) {
276            return $result; // disabled
277        }
278
279        $rPerTable = []; // review wait percentiles
280        # Only go so far back...otherwise we will get garbage values due to
281        # the fact that FlaggedRevs wasn't enabled until after a while.
282        $dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase( false, 'vslow' );
283        $installedUnix = $dbr->newSelectQueryBuilder()
284            ->select( self::dbUnixTime( $dbr, 'MIN(log_timestamp)' ) )
285            ->from( 'logging' )
286            ->where( [ 'log_type' => 'review' ] )
287            ->caller( __METHOD__ )
288            ->fetchField();
289        $dbInstalled = $dbr->timestamp( $installedUnix ?: wfTimestamp() );
290        # Skip the most recent recent revs as they are likely to just
291        # be WHERE condition misses. This also gives us more data to use.
292        # Lastly, we want to avoid bias that would make the time too low
293        # since new revisions could not have "took a long time to sight".
294        $worstLagTS = $dbr->timestamp(); // now
295        $lastTS = $dbInstalled;
296        while ( true ) { // should almost always be ~1 pass
297            # Get the page with the worst pending lag...
298            $row = $dbr->newSelectQueryBuilder()
299                ->select( [ 'fp_page_id', 'fr_rev_id', 'fp_pending_since', 'fr_timestamp' ] )
300                ->from( 'flaggedpages' )
301                ->join( 'flaggedrevs', null, [ 'fr_page_id = fp_page_id', 'fr_rev_id = fp_stable' ] )
302                ->where( [
303                    $dbr->expr( 'fp_pending_since', '!=', null ),
304                    $dbr->expr( 'fp_pending_since', '>', $lastTS ), // skip failed rows
305                ] )
306                ->orderBy( 'fp_pending_since' )
307                ->caller( __METHOD__ )->fetchRow();
308            if ( !$row ) {
309                break;
310            }
311            # Find the newest revision at the time the page was reviewed,
312            # this is the one that *should* have been reviewed.
313            $idealRev = (int)$dbr->newSelectQueryBuilder()
314                ->select( 'rev_id' )
315                ->from( 'revision' )
316                ->where( [
317                    'rev_page' => $row->fp_page_id,
318                    $dbr->expr( 'rev_timestamp', '<', $row->fr_timestamp ),
319                ] )
320                ->caller( __METHOD__ )
321                ->orderBy( 'rev_timestamp', SelectQueryBuilder::SORT_DESC )
322                ->fetchField();
323            if ( $row->fr_rev_id >= $idealRev ) {
324                $worstLagTS = $row->fp_pending_since;
325                break; // sane $worstLagTS found
326            # Fudge factor to prevent deliberate reviewing of non-current revisions
327            # from squeezing the range. Shouldn't effect anything otherwise.
328            } else {
329                $lastTS = $row->fp_pending_since; // next iteration
330            }
331        }
332
333        $tempUserConfig = MediaWikiServices::getInstance()->getTempUserConfig();
334
335        # User condition (anons/users)
336        if ( $users === 'anons' ) {
337            $anonConds = [ new RawSQLExpression( 'actor_user IS NULL' ) ];
338            if ( $tempUserConfig->isKnown() ) {
339                $anonConds[] = $tempUserConfig->getMatchCondition( $dbr, 'actor_name', IExpression::LIKE );
340            }
341
342            $userCondition = $dbr->orExpr( $anonConds );
343        } elseif ( $users === 'users' ) {
344            $userConds = [ new RawSQLExpression( 'actor_user IS NOT NULL' ) ];
345            if ( $tempUserConfig->isKnown() ) {
346                $userConds[] = $tempUserConfig->getMatchCondition( $dbr, 'actor_name', IExpression::NOT_LIKE );
347            }
348
349            $userCondition = $dbr->andExpr( $userConds );
350        } else {
351            throw new InvalidArgumentException( 'Invalid $users param given.' );
352        }
353        # Avoid having to censor data
354        # Note: if no edits pending, $worstLagTS is the cur time just before we checked
355        # for the worst lag. Thus, new edits *right* after the check are properly excluded.
356        $maxTSUnix = (int)wfTimestamp( TS_UNIX, $worstLagTS ) - 1; // all edits later reviewed
357        $dbMaxTS = $dbr->timestamp( $maxTSUnix );
358        # Use a one week time range
359        $days = 7;
360        $minTSUnix = $maxTSUnix - $days * 86400;
361        # Approximate the number rows to scan
362        $rows = self::estimateRevisionRowCount( $dbr, $maxTSUnix, 7, $userCondition );
363        # If the range doesn't have many rows (like on small wikis), use 30 days
364        if ( $rows < 500 ) {
365            $rows = self::estimateRevisionRowCount( $dbr, $maxTSUnix, 30, $userCondition );
366            # If the range doesn't have many rows (like on really tiny wikis), use 90 days
367            if ( $rows < 500 ) {
368                $days = 90;
369                $minTSUnix = $maxTSUnix - $days * 86400;
370            }
371        }
372
373        $sampleSize = 1500; // sample size
374        # Sanity check the starting timestamp
375        $minTSUnix = max( $minTSUnix, $installedUnix );
376        $dbMinTS = $dbr->timestamp( $minTSUnix );
377        # Get timestamp boundaries
378        $timeCondition = [
379            $dbr->expr( 'rev_timestamp', '>=', $dbMinTS ),
380            $dbr->expr( 'rev_timestamp', '<=', $dbMaxTS ),
381        ];
382        # Get mod for edit spread
383        $fname = __METHOD__;
384        $edits = $cache->getWithSetCallback(
385            $cache->makeKey( 'flaggedrevs', 'rcEditCount', $users, $days ),
386            $cache::TTL_WEEK * 2,
387            static function () use ( $dbr, $fname, $userCondition, $timeCondition ) {
388                return (int)$dbr->newSelectQueryBuilder()
389                    ->select( 'COUNT(*)' )
390                    ->from( 'revision' )
391                    ->join( 'page', null, 'page_id = rev_page' )
392                    ->join( 'actor', null, 'rev_actor = actor_id' )
393                    ->where( $timeCondition )
394                    ->andWhere( [
395                        $userCondition,
396                        'page_namespace' => FlaggedRevs::getReviewNamespaces()
397                    ] )
398                    ->caller( $fname )
399                    ->fetchField();
400            }
401        );
402        $mod = max( floor( $edits / $sampleSize ), 1 ); # $mod >= 1
403        # For edits that started off pending, how long do they take to get reviewed?
404        # Edits started off pending if made when a flagged rev of the page already existed.
405        # Get the *first* reviewed rev *after* each edit and get the time difference.
406        $res = $dbr->newSelectQueryBuilder()
407            ->select( [
408                'rt' => 'rev_timestamp', // time revision was made
409                'nft' => new Subquery( $dbr->newSelectQueryBuilder()
410                    ->select( 'MIN(fr_timestamp)' )
411                    ->from( 'flaggedrevs' )
412                    ->where( [
413                        'fr_page_id = rev_page',
414                        'fr_rev_timestamp >= rev_timestamp'
415                    ] )
416                    ->caller( __METHOD__ )
417                    ->getSQL()
418                ) // time when revision was first reviewed
419            ] )
420            ->from( 'revision' )
421            ->join( 'actor', null, 'rev_actor = actor_id' )
422            ->where( [
423                $userCondition,
424                "(rev_id % $mod) = 0",
425                $dbr->expr( 'rev_parent_id', '>', 0 ), // optimize (exclude new pages)
426                'EXISTS (' . $dbr->newSelectQueryBuilder()
427                    ->select( '*' )
428                    ->from( 'flaggedrevs' )
429                    ->where( [ // page was reviewed when this revision was made
430                        'fr_page_id = rev_page',
431                        'fr_rev_timestamp < rev_timestamp', // before this revision
432                        'fr_rev_id < rev_id', // not imported later
433                        'fr_timestamp < rev_timestamp', // page reviewed before revision
434                    ] )
435                    ->caller( __METHOD__ )
436                    ->getSQL() .
437                ')'
438            ] )
439            ->andWhere( $timeCondition )
440            ->caller( __METHOD__ )
441            ->fetchResultSet();
442
443        $secondsR = 0; // total wait seconds for edits later reviewed
444        $secondsP = 0; // total wait seconds for edits still pending
445        $times = [];
446        if ( $res->numRows() ) {
447            # Get the elapsed times revs were pending (flagged time - edit time)
448            foreach ( $res as $row ) {
449                $time = (int)wfTimestamp( TS_UNIX, $row->nft ) - (int)wfTimestamp( TS_UNIX, $row->rt );
450                $time = max( $time, 0 ); // sanity
451                $secondsR += $time;
452                $times[] = $time;
453            }
454            $sampleSize = count( $times );
455            $aveRT = ( $secondsR + $secondsP ) / $sampleSize; // sample mean
456            sort( $times ); // order smallest -> largest
457            // Sample median
458            $rank = intval( round( count( $times ) / 2 + 0.5 ) - 1 );
459            $medianRT = $times[$rank];
460            // Make percentile tabulation data
461            $doPercentiles = [ 35, 45, 55, 65, 75, 85, 90, 95 ];
462            foreach ( $doPercentiles as $percentile ) {
463                $rank = intval( round( $percentile * count( $times ) / 100 + 0.5 ) - 1 );
464                $rPerTable[$percentile] = $times[$rank];
465            }
466            $result['average']       = $aveRT;
467            $result['median']        = $medianRT;
468            $result['percTable']     = $rPerTable;
469            $result['sampleSize']    = count( $times );
470            $result['sampleStartTS'] = $minTSUnix;
471            $result['sampleEndTS']   = $maxTSUnix;
472        }
473
474        return $result;
475    }
476
477    /**
478     * Convenience function to estimate the number of revisions authored by a given user type
479     * within a given timeframe.
480     *
481     * @param IReadableDatabase $dbr Replica DB connection handle.
482     * @param int $maxTSUnix Only consider revisions created before this UNIX timestamp.
483     * @param int $days Only consider revisions created at most this many days before $maxTSUnix.
484     * @param IExpression|string $userCondition SQL condition to filter revisions by user type.
485     *
486     * @return int The estimated number of revisions matching the given time and user constraints.
487     */
488    private static function estimateRevisionRowCount(
489        IReadableDatabase $dbr,
490        int $maxTSUnix,
491        int $days,
492        $userCondition
493    ): int {
494        $minTSUnix = $maxTSUnix - $days * 86400;
495        $dbMinTS = $dbr->timestamp( $minTSUnix );
496        $dbMaxTS = $dbr->timestamp( $maxTSUnix );
497
498        return $dbr->newSelectQueryBuilder()
499            ->select( '1' )
500            ->from( 'revision' )
501            ->join( 'actor', null, 'rev_actor = actor_id' )
502            ->where( $userCondition )
503            ->andWhere( [
504                $dbr->expr( 'rev_timestamp', '>=', $dbMinTS ),
505                $dbr->expr( 'rev_timestamp', '<=', $dbMaxTS ),
506            ] )
507            ->caller( __METHOD__ )
508            ->estimateRowCount();
509    }
510}