MediaWiki REL1_37
deduplicateArchiveRevId.php
Go to the documentation of this file.
1<?php
2
4
5require_once __DIR__ . '/Maintenance.php';
6
15
/** @var int Number of archive rows deleted so far; incremented in processArRevIds() */
16 private $deleted = 0;
/** @var int Number of archive rows given a new ar_rev_id so far; incremented in processArRevIds() */
17 private $reassigned = 0;
18
/**
 * Set up the script: register its description and the default batch size.
 *
 * The batch size is the width of the ar_rev_id range that doDBUpdates()
 * handles per transaction.
 */
public function __construct() {
	parent::__construct();

	$description =
		'Clean up duplicate ar_rev_id, both within archive and between archive and revision.';
	$this->addDescription( $description );

	$defaultBatchSize = 10000;
	$this->setBatchSize( $defaultBatchSize );
}
26
/**
 * Get the update key name to go in the update log table.
 *
 * @return string The name of this class
 */
protected function getUpdateKey() {
	return self::class;
}
30
/**
 * Walk the archive table in ar_rev_id ranges of getBatchSize() and clean up
 * every ar_rev_id that is either shared by several archive rows or also
 * present as a rev_id in the revision table.
 *
 * Each range is handled in its own transaction: the affected rows are locked,
 * the suspect IDs are collected, and processArRevIds() deletes or reassigns
 * the offending archive rows.
 *
 * @return bool Always true
 */
protected function doDBUpdates() {
	$this->output( "Deduplicating ar_rev_id...\n" );
	$dbw = $this->getDB( DB_PRIMARY );
	// Sanity check. If this is a new install, we don't need to do anything here.
	if ( PopulateArchiveRevId::isNewInstall( $dbw ) ) {
		$this->output( "New install, nothing to do here.\n" );
		return true;
	}

	// Check for (and work around) a MySQL auto-increment bug before any
	// ar_rev_id values get reassigned below.
	PopulateArchiveRevId::checkMysqlAutoIncrementBug( $dbw );

	$minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
	$maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );
	$batchSize = $this->getBatchSize();

	$revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );

	for ( $id = $minId; $id <= $maxId; $id += $batchSize ) {
		// Inclusive upper bound of this batch, clamped to the largest known ID.
		$endId = min( $maxId, $id + $batchSize - 1 );

		$this->beginTransaction( $dbw, __METHOD__ );

		// Lock the archive and revision table rows for the IDs we're checking
		// to try to prevent deletions or undeletions from confusing things.
		$dbw->selectRowCount(
			'archive',
			'1',
			[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
			__METHOD__,
			[ 'FOR UPDATE' ]
		);
		$dbw->selectRowCount(
			'revision',
			'1',
			[ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ],
			__METHOD__,
			[ 'LOCK IN SHARE MODE' ]
		);

		// Figure out the ar_rev_ids we actually need to look at.
		// First: IDs in this range that exist in both archive and revision.
		// NOTE(review): the array unions below only add the actor-query entries
		// whose keys do not collide with 0..n of the literal lists; this assumes
		// getJoin() returns string-keyed 'tables' and 'fields' - confirm.
		$res = $dbw->select(
			[ 'archive', 'revision' ] + $revActorQuery['tables'],
			[ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
			[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
			__METHOD__,
			[ 'DISTINCT' ],
			[ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
		);
		// Index the revision rows by rev_id for lookup in processArRevIds().
		$revRows = [];
		foreach ( $res as $row ) {
			$revRows[$row->rev_id] = $row;
		}

		// Second: IDs in this range used by more than one archive row.
		$arRevIds = $dbw->selectFieldValues(
			[ 'archive' ],
			'ar_rev_id',
			[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
			__METHOD__,
			[ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
		);
		// Merge both sets into one de-duplicated, re-indexed list.
		$arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );

		if ( $arRevIds ) {
			$this->processArRevIds( $dbw, $arRevIds, $revRows );
		}

		$this->output( "... $id-$endId\n" );
		$this->commitTransaction( $dbw, __METHOD__ );
	}

	$this->output(
		"Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
		. "$this->reassigned assigned new IDs.\n"
	);
	return true;
}
107
/**
 * Process a set of ar_rev_ids.
 *
 * Loads every archive row using one of the given IDs and decides per row
 * whether it is kept, deleted (it duplicates the revision row or an archive
 * row already seen) or handed to PopulateArchiveRevId::reassignArRevIds()
 * (it shares the ID with a different change). Updates the $deleted and
 * $reassigned counters.
 *
 * @param IDatabase $dbw Database handle used for the select and the writes
 * @param array $arRevIds ar_rev_id values to examine
 * @param array $revRows Revision rows indexed by rev_id; rev_id,
 *  rev_timestamp, rev_sha1 and rev_actor are read from each row
 */
private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
	// Pull the columns needed to compare archive rows with each other and
	// with the revision table.
	$archiveRows = $dbw->select(
		[ 'archive' ],
		[ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_actor',
			'ar_timestamp', 'ar_sha1' ],
		[ 'ar_rev_id' => $arRevIds ],
		__METHOD__
	);

	// $seen maps rev_id => [ 'first' => description of the first holder,
	// <seen key> => ar_id of the first archive row for that change, ... ]
	$seen = [];
	$deleteIds = [];
	$reassignIds = [];
	foreach ( $archiveRows as $arRow ) {
		if ( isset( $revRows[$arRow->ar_rev_id] ) ) {
			// A live revision owns this ID.
			$revRow = $revRows[$arRow->ar_rev_id];

			// Mark the ID as taken so that any archive row that survives the
			// check below is either deleted or reassigned, never kept.
			if ( !isset( $seen[$revRow->rev_id] ) ) {
				$seen[$revRow->rev_id] = [ 'first' => "revision row" ];
			}

			// Page is deliberately not compared: moves can change IDs and
			// titles, so timestamp + sha1 + actor identify the same change.
			$sameAsRevision = $arRow->ar_timestamp === $revRow->rev_timestamp
				&& $arRow->ar_sha1 === $revRow->rev_sha1
				&& $arRow->ar_actor === $revRow->rev_actor;
			if ( $sameAsRevision ) {
				$this->output(
					"Row $arRow->ar_id duplicates revision row for rev_id $revRow->rev_id, deleting\n"
				);
				$deleteIds[] = $arRow->ar_id;
				continue;
			}
		}

		$revId = $arRow->ar_rev_id;
		$changeKey = $this->getSeenKey( $arRow );
		if ( isset( $seen[$revId][$changeKey] ) ) {
			// Same ID and same change as an earlier archive row: redundant copy.
			$this->output(
				"Row $arRow->ar_id duplicates archive row {$seen[$revId][$changeKey]} "
				. "for rev_id $revId, deleting\n"
			);
			$deleteIds[] = $arRow->ar_id;
		} elseif ( isset( $seen[$revId] ) ) {
			// Same ID but a different change: it needs an ID of its own.
			$seen[$revId][$changeKey] = $arRow->ar_id;
			$this->output(
				"Row $arRow->ar_id conflicts with {$seen[$revId]['first']} "
				. "for rev_id $revId, reassigning\n"
			);
			$reassignIds[] = $arRow->ar_id;
		} else {
			// First holder of this ID: keep the row and remember it.
			$seen[$revId] = [
				'first' => "archive row $arRow->ar_id",
				$changeKey => $arRow->ar_id,
			];
		}
	}

	// Apply the collected changes.
	if ( $deleteIds ) {
		$dbw->delete( 'archive', [ 'ar_id' => $deleteIds ], __METHOD__ );
		$this->deleted += $dbw->affectedRows();
	}
	if ( $reassignIds ) {
		$this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $reassignIds );
	}
}
188
/**
 * Make a key identifying a "unique" change from a row.
 *
 * The key is the row's namespace, title, timestamp, sha1 and actor, in that
 * order, separated by newlines.
 *
 * @param stdClass $row Archive row with ar_namespace, ar_title,
 *  ar_timestamp, ar_sha1 and ar_actor
 * @return string
 */
private function getSeenKey( $row ) {
	return "{$row->ar_namespace}\n{$row->ar_title}\n{$row->ar_timestamp}\n"
		. "{$row->ar_sha1}\n{$row->ar_actor}";
}
203
204}
205
// Name the maintenance class defined in this file, then include the file that
// the RUN_MAINTENANCE_IF_MAIN constant points to.
// NOTE(review): presumably that include reads $maintClass and runs the script
// when this file is the entry point - the constant is defined outside this file; confirm.
206$maintClass = DeduplicateArchiveRevId::class;
207require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
Maintenance script that cleans up archive rows with duplicated ar_rev_id, both within archive and between archive and revision.
__construct()
Default constructor.
getUpdateKey()
Get the update key name to go in the update log table.
getSeenKey( $row)
Make a key identifying a "unique" change from a row.
processArRevIds(IDatabase $dbw, array $arRevIds, array $revRows)
Process a set of ar_rev_ids.
Class for scripts that perform database maintenance and want to log the update in updatelog so we can...
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
setBatchSize( $s=0)
static checkMysqlAutoIncrementBug(IDatabase $dbw)
Check for (and work around) a MySQL auto-increment bug.
static reassignArRevIds(IDatabase $dbw, array $arIds, array $conds=[])
Assign new ar_rev_ids to a set of ar_ids.
static isNewInstall(IDatabase $dbw)
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:38
select( $table, $vars, $conds='', $fname=__METHOD__, $options=[], $join_conds=[])
Execute a SELECT query constructed using the various parameters provided.
affectedRows()
Get the number of rows affected by the last write query.
delete( $table, $conds, $fname=__METHOD__)
Delete all rows in a table that match a condition.
const DB_PRIMARY
Definition defines.php:27