Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 197
0.00% covered (danger)
0.00%
0 / 12
CRAP
0.00% covered (danger)
0.00%
0 / 1
FlowRemoveOldTopics
0.00% covered (danger)
0.00%
0 / 191
0.00% covered (danger)
0.00%
0 / 12
1122
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
2
 execute
0.00% covered (danger)
0.00%
0 / 8
0.00% covered (danger)
0.00%
0 / 1
2
 removeHeader
0.00% covered (danger)
0.00%
0 / 57
0.00% covered (danger)
0.00%
0 / 1
72
 removeTopics
0.00% covered (danger)
0.00%
0 / 24
0.00% covered (danger)
0.00%
0 / 1
6
 removeTopicsWithFlowUpdates
0.00% covered (danger)
0.00%
0 / 35
0.00% covered (danger)
0.00%
0 / 1
6
 removeWorkflows
0.00% covered (danger)
0.00%
0 / 10
0.00% covered (danger)
0.00%
0 / 1
12
 removeTopicList
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
6
 removeSummary
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
12
 sortSubtree
0.00% covered (danger)
0.00%
0 / 8
0.00% covered (danger)
0.00%
0 / 1
6
 removePosts
0.00% covered (danger)
0.00%
0 / 15
0.00% covered (danger)
0.00%
0 / 1
30
 removeReferences
0.00% covered (danger)
0.00%
0 / 16
0.00% covered (danger)
0.00%
0 / 1
12
 multiRemove
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace Flow\Maintenance;
4
5use Flow\Container;
6use Flow\Data\ManagerGroup;
7use Flow\Data\Utils\RawSql;
8use Flow\DbFactory;
9use Flow\Exception\FlowException;
10use Flow\Hooks;
11use Flow\Model\AbstractRevision;
12use Flow\Model\Header;
13use Flow\Model\PostRevision;
14use Flow\Model\UUID;
15use Flow\Model\Workflow;
16use Flow\Repository\TreeRepository;
17use Maintenance;
18use MediaWiki\WikiMap\WikiMap;
19use Wikimedia\Rdbms\DBUnexpectedError;
20
// Locate the MediaWiki installation: use the MW_INSTALL_PATH environment
// variable when it is set, otherwise fall back to three directories above
// the directory containing this script.
$IP = getenv( 'MW_INSTALL_PATH' ) !== false
    ? getenv( 'MW_INSTALL_PATH' )
    : __DIR__ . '/../../..';

// Load the Maintenance base class file from that installation.
require_once "$IP/maintenance/Maintenance.php";
27
/**
 * Maintenance script that deletes Flow data older than a cutoff date.
 *
 * Three passes are run, in order (see execute()):
 *  1. header revisions whose whole history predates the cutoff;
 *  2. topic workflows last updated before the cutoff;
 *  3. topic workflows updated after the cutoff, but only by the Flow talk
 *     page manager user.
 *
 * Every batch runs inside its own transaction on the primary database; with
 * --dryrun that transaction is rolled back instead of committed.
 *
 * @ingroup Maintenance
 */
class FlowRemoveOldTopics extends Maintenance {
    /**
     * When true, every batch transaction is rolled back instead of committed.
     *
     * @var bool
     */
    protected $dryRun = false;

    /**
     * Storage used to look up and remove Flow objects.
     *
     * @var ManagerGroup
     */
    protected $storage;

    /**
     * Tree repository used to fetch and delete post subtrees.
     *
     * @var TreeRepository
     */
    protected $treeRepo;

    /**
     * Source of the replica and primary database connections.
     *
     * @var DbFactory
     */
    protected $dbFactory;

    /**
     * Declares the script description, its options and the default batch size.
     */
    public function __construct() {
        parent::__construct();

        $this->addDescription( "Deletes old topics" );

        $this->addOption( 'date', 'Date cutoff (in any format understood by wfTimestamp), topics ' .
            'older than this date will be deleted.', true, true );
        $this->addOption( 'dryrun', 'Simulate script run, without actually deleting anything' );

        $this->setBatchSize( 10 );

        $this->requireExtension( 'Flow' );
    }

    /**
     * Entry point: resolves services, validates the cutoff date and runs the
     * three removal passes.
     */
    public function execute() {
        // 'dryrun' is a flag option, so its mere presence is what matters
        $this->dryRun = $this->hasOption( 'dryrun' );
        $this->storage = Container::get( 'storage' );
        $this->treeRepo = Container::get( 'repository.tree' );
        $this->dbFactory = Container::get( 'db.factory' );

        $timestamp = wfTimestamp( TS_MW, $this->getOption( 'date' ) );
        if ( $timestamp === false ) {
            // wfTimestamp() returns false for input it cannot parse; never let
            // an invalid cutoff reach the (destructive) queries below
            $this->fatalError( 'Invalid value for --date: not a timestamp understood by wfTimestamp.' );
        }

        $this->removeHeader( $timestamp );
        // remove topics that are older than the given timestamp
        $this->removeTopics( $timestamp );
        // remove topics that have more recent updates, but only from Flow talk
        // page manager
        $this->removeTopicsWithFlowUpdates( $timestamp );
    }

    /**
     * Removes all revisions (and their references) of headers that have no
     * revision at or after the cutoff.
     *
     * @param string $timestamp Timestamp in TS_MW format
     */
    protected function removeHeader( $timestamp ) {
        $dbr = $this->dbFactory->getDB( DB_REPLICA );
        $batchSize = $this->getBatchSize();
        $wikiId = WikiMap::getCurrentWikiId();

        // we don't store a timestamp with revisions - the id also holds date
        // info, so that's what we should compare against
        $endId = UUID::getComparisonUUID( $timestamp );

        // start from around unix epoch - there can be no Flow data before that
        $startId = UUID::getComparisonUUID( '1' );

        // The loop ends only when a batch query comes back empty. Each step
        // uses its own variable so that the loop condition can never be
        // affected by intermediate lookups.
        while ( true ) {
            /** @var Header[] $firstRevisions */
            $firstRevisions = $this->storage->find(
                'Header',
                [
                    'rev_user_wiki' => $wikiId,
                    'rev_type' => 'header',
                    new RawSql( 'rev_id > ' . $dbr->addQuotes( $startId->getBinary() ) ),
                    new RawSql( 'rev_id < ' . $dbr->addQuotes( $endId->getBinary() ) ),
                    // only fetch original post at this point: we still need to
                    // narrow down the results
                    'rev_parent_id' => null,
                ],
                [
                    'limit' => $batchSize,
                    'sort' => 'rev_id',
                    'order' => 'ASC',
                ]
            );

            if ( !$firstRevisions ) {
                break;
            }

            // prepare for next batch, which will start at this
            /** @var UUID $startId */
            $startId = end( $firstRevisions )->getRevisionId();

            // we've now found all first revisions prior to a certain date, but we
            // don't want to remove those that have revisions after that date cutoff
            // (we don't want to break history)
            // let's see if any has revisions more recent than timestamp
            $conds = [];
            $uuids = [];
            foreach ( $firstRevisions as $revision ) {
                $collectionId = $revision->getCollectionId();

                // keep track of UUIDs we may want to delete
                $uuids[$collectionId->getAlphadecimal()] = $collectionId;

                $conds[] = [
                    'rev_user_wiki' => $wikiId,
                    'rev_type' => 'header',
                    new RawSql( 'rev_id >= ' . $dbr->addQuotes( $endId->getBinary() ) ),
                    'rev_type_id' => $collectionId->getBinary(),
                ];
            }

            /** @var Header[][] $recent */
            $recent = $this->storage->findMulti( 'Header', $conds, [ 'limit' => 1 ] );

            // now exclude collection ids where there's a revision that is more
            // recent than the timestamp cutoff
            foreach ( $recent as $recentRevisions ) {
                foreach ( $recentRevisions as $revision ) {
                    unset( $uuids[$revision->getCollectionId()->getAlphadecimal()] );
                }
            }

            // by now, there may be nothing left to remove, so move on to the
            // next batch...
            if ( !$uuids ) {
                continue;
            }

            // fetch every revision of the headers that are safe to delete
            $doomedRevisions = $this->storage->find(
                'Header',
                [
                    'rev_user_wiki' => $wikiId,
                    'rev_type' => 'header',
                    'rev_type_id' => UUID::convertUUIDs( $uuids ),
                ]
            );

            if ( !$doomedRevisions ) {
                // nothing could be loaded for this batch; keep scanning
                continue;
            }

            $this->output( 'Removing ' . count( $doomedRevisions ) . ' header revisions from ' .
                count( $uuids ) . ' headers (up to ' . $startId->getTimestamp() . ")\n" );

            $this->dbFactory->getDB( DB_PRIMARY )->begin( __METHOD__ );

            foreach ( $doomedRevisions as $revision ) {
                $this->removeReferences( $revision );
            }

            $this->multiRemove( $doomedRevisions );

            $this->finishTransaction( __METHOD__ );
        }
    }

    /**
     * Removes, batch by batch, all topic workflows whose last update is older
     * than the cutoff.
     *
     * @param string $timestamp Timestamp in TS_MW format
     * @throws \Flow\Exception\FlowException
     */
    protected function removeTopics( $timestamp ) {
        $dbr = $this->dbFactory->getDB( DB_REPLICA );
        $batchSize = $this->getBatchSize();

        // start from around unix epoch - there can be no Flow data before that
        $startId = UUID::getComparisonUUID( '1' );

        while ( true ) {
            $workflows = $this->storage->find(
                'Workflow',
                [
                    new RawSql( 'workflow_id > ' . $dbr->addQuotes( $startId->getBinary() ) ),
                    'workflow_wiki' => WikiMap::getCurrentWikiId(),
                    'workflow_type' => 'topic',
                    new RawSql( 'workflow_last_update_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $timestamp ) ) ),
                ],
                [
                    'limit' => $batchSize,
                    'sort' => 'workflow_id',
                    'order' => 'ASC',
                ]
            );

            if ( !$workflows ) {
                break;
            }

            // prepare for next batch
            /** @var UUID $startId */
            $startId = end( $workflows )->getId();

            $this->output( 'Removing ' . count( $workflows ) .
                ' topic workflows (up to ' . $startId->getTimestamp() . ")\n" );
            $this->removeWorkflows( $workflows );
        }
    }

    /**
     * Removes topic workflows that were updated at or after the cutoff, but
     * whose post-cutoff revisions were all made by the Flow talk page manager.
     *
     * @param string $timestamp Timestamp in TS_MW format
     * @throws DBUnexpectedError
     * @throws FlowException
     */
    protected function removeTopicsWithFlowUpdates( $timestamp ) {
        $dbr = $this->dbFactory->getDB( DB_REPLICA );
        $batchSize = $this->getBatchSize();
        $talkpageManager = Hooks::getOccupationController()->getTalkpageManager();

        // start from around unix epoch - there can be no Flow data before that
        $batchStartId = UUID::getComparisonUUID( '1' );

        // we only care about revisions since cutoff here
        $cutoffStartId = UUID::getComparisonUUID( $timestamp );

        while ( true ) {
            $workflowIds = $dbr->selectFieldValues(
                [ 'flow_workflow', 'flow_tree_node', 'flow_revision' ],
                'workflow_id',
                [
                    // revisions more recent than cutoff time
                    'rev_id > ' . $dbr->addQuotes( $cutoffStartId->getBinary() ),
                    // workflow_id condition is only used to batch, the exact
                    // $batchStartId otherwise doesn't matter (unlike rev_id)
                    'workflow_id > ' . $dbr->addQuotes( $batchStartId->getBinary() ),
                    'workflow_wiki' => WikiMap::getCurrentWikiId(),
                    'workflow_type' => 'topic',
                    'workflow_last_update_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $timestamp ) ),
                ],
                __METHOD__,
                [
                    'LIMIT' => $batchSize,
                    'ORDER BY' => 'workflow_id ASC',
                    // we only want to find topics that were only altered by talk
                    // page manager: as long as anyone else edited any post, we're
                    // not interested in it
                    'GROUP BY' => 'workflow_id',
                    'HAVING' => [ 'GROUP_CONCAT(DISTINCT rev_user_id)' => $talkpageManager->getId() ],
                ],
                [
                    'flow_tree_node' => [ 'INNER JOIN', [ 'tree_ancestor_id = workflow_id' ] ],
                    'flow_revision' => [ 'INNER JOIN', [ 'rev_type_id = tree_descendant_id' ] ],
                ]
            );

            if ( !$workflowIds ) {
                break;
            }

            // prepare for next batch: the id list is ordered by the query
            // itself, so its last entry is the batch cursor even when some
            // workflow objects cannot be loaded below
            // NOTE(review): relies on UUID::create() accepting the raw
            // workflow_id column value - confirm
            /** @var UUID $batchStartId */
            $batchStartId = UUID::create( end( $workflowIds ) );

            $workflows = $this->storage->getMulti( 'Workflow', $workflowIds );
            if ( !$workflows ) {
                // nothing loadable in this batch; don't call end() on an
                // empty result, just move on to the next batch
                continue;
            }

            $this->output( 'Removing ' . count( $workflows ) . ' topic workflows with recent ' .
                'Flow updates (up to ' . $batchStartId->getTimestamp() . ")\n" );
            $this->removeWorkflows( $workflows );
        }
    }

    /**
     * Removes the given workflows together with their summaries, posts and
     * topic list entries, all within a single transaction.
     *
     * @param Workflow[] $workflows
     * @throws DBUnexpectedError
     */
    protected function removeWorkflows( array $workflows ) {
        $this->dbFactory->getDB( DB_PRIMARY )->begin( __METHOD__ );

        foreach ( $workflows as $workflow ) {
            $this->removeSummary( $workflow );
            $this->removePosts( $workflow );
            $this->removeTopicList( $workflow );
        }

        $this->multiRemove( $workflows );

        $this->finishTransaction( __METHOD__ );
    }

    /**
     * Ends the transaction opened on the primary database: rolls it back in
     * dry-run mode, otherwise commits it and waits for replicas.
     *
     * @param string $fname Caller name, as passed to the matching begin()
     */
    private function finishTransaction( $fname ) {
        $dbw = $this->dbFactory->getDB( DB_PRIMARY );

        if ( $this->dryRun ) {
            $dbw->rollback( $fname );
            return;
        }

        $dbw->commit( $fname );
        $this->dbFactory->waitForReplicas();
    }

    /**
     * Removes the topic list entries that point at the given topic workflow.
     *
     * @param Workflow $workflow
     */
    protected function removeTopicList( Workflow $workflow ) {
        $entries = $this->storage->find( 'TopicListEntry', [ 'topic_id' => $workflow->getId() ] );
        if ( !$entries ) {
            return;
        }

        $this->output( 'Removing ' . count( $entries ) . " topiclist entries.\n" );
        $this->multiRemove( $entries );
    }

    /**
     * Removes all summary revisions of the given topic workflow, along with
     * the references recorded for them.
     *
     * @param Workflow $workflow
     */
    protected function removeSummary( Workflow $workflow ) {
        $revisions = $this->storage->find( 'PostSummary', [ 'rev_type_id' => $workflow->getId() ] );
        if ( !$revisions ) {
            return;
        }

        foreach ( $revisions as $revision ) {
            $this->removeReferences( $revision );
        }

        $this->output( 'Removing ' . count( $revisions ) . " summary revisions from 1 topic.\n" );
        $this->multiRemove( $revisions );
    }

    /**
     * Flattens a subtree into a list of ids in which every child precedes its
     * parent.
     *
     * @param UUID $parentId Id of the subtree's root node
     * @param array $subtree Node data; its 'children' entry maps child id to
     *  that child's own node data
     * @return UUID[] Descendant ids first, $parentId last
     */
    protected function sortSubtree( UUID $parentId, array $subtree ) {
        $flat = [];

        // first recursively process all children, so they come first in $flat
        foreach ( $subtree['children'] as $id => $data ) {
            $flat = array_merge(
                $flat,
                $this->sortSubtree( UUID::create( $id ), $data )
            );
        }

        // then add parent, which should come last in $flat
        $flat[] = $parentId;

        return $flat;
    }

    /**
     * Removes every post revision of a topic, the references recorded for
     * them, and the posts' tree nodes.
     *
     * @param Workflow $workflow
     */
    protected function removePosts( Workflow $workflow ) {
        // fetch all children (posts) from a topic & reverse-sort all the posts:
        // deepest-nested children should come first, parents last
        $subtree = $this->treeRepo->fetchSubtree( $workflow->getId() );
        $uuids = $this->sortSubtree( $workflow->getId(), $subtree );

        $conds = [];
        foreach ( $uuids as $id ) {
            $conds[] = [ 'rev_type_id' => $id ];
        }

        $posts = $this->storage->findMulti( 'PostRevision', $conds );
        $count = 0;
        // alphadecimal collection ids whose tree node was already deleted
        $deletedNodes = [];
        foreach ( $posts as $revisions ) {
            /** @var PostRevision[] $revisions */
            foreach ( $revisions as $revision ) {
                $this->removeReferences( $revision );
            }

            $count += count( $revisions );
            $this->multiRemove( $revisions );

            // several revisions share one collection id (the post itself):
            // delete each post's tree node only once
            foreach ( $revisions as $revision ) {
                $collectionId = $revision->getCollectionId();
                $key = $collectionId->getAlphadecimal();
                if ( isset( $deletedNodes[$key] ) ) {
                    continue;
                }

                $deletedNodes[$key] = true;
                $this->treeRepo->delete( $collectionId );
            }
        }
        $this->output( 'Removing ' . $count . ' post revisions from ' . count( $posts ) . " posts.\n" );
    }

    /**
     * Removes the wiki and URL references recorded for the given revision's
     * collection on the current wiki.
     *
     * @param AbstractRevision $revision
     */
    protected function removeReferences( AbstractRevision $revision ) {
        // both reference types are looked up with the same conditions
        $conds = [
            'ref_src_wiki' => WikiMap::getCurrentWikiId(),
            'ref_src_object_type' => $revision->getRevisionType(),
            'ref_src_object_id' => $revision->getCollectionId(),
        ];

        // storage class => label used in the progress output
        $referenceTypes = [
            'WikiReference' => 'wiki',
            'URLReference' => 'url',
        ];

        foreach ( $referenceTypes as $class => $label ) {
            $references = $this->storage->find( $class, $conds );
            if ( !$references ) {
                continue;
            }

            $this->output( 'Removing ' . count( $references ) . " $label references from 1 revision.\n" );
            $this->multiRemove( $references );
        }
    }

    /**
     * Removes the given objects through the storage layer. Every removal in
     * this script goes through this method.
     *
     * @param array $objects
     */
    protected function multiRemove( array $objects ) {
        $this->storage->multiRemove( $objects );
    }
}
406
// Name the class to run, then include the file named by the
// RUN_MAINTENANCE_IF_MAIN constant.
// NOTE(review): presumably that constant is defined by Maintenance.php and
// starts the script only when this file is the entry point - confirm.
407$maintClass = FlowRemoveOldTopics::class;
408require_once RUN_MAINTENANCE_IF_MAIN;