MediaWiki  master
deduplicateArchiveRevId.php
Go to the documentation of this file.
1 <?php
2 
4 
5 require_once __DIR__ . '/Maintenance.php';
6 
15 
/** @var int Running total of archive rows removed; incremented by affectedRows() after each delete in processArRevIds() */
16  private $deleted = 0;
/** @var int Running total of archive rows given a new ar_rev_id; incremented by the return value of PopulateArchiveRevId::reassignArRevIds() */
17  private $reassigned = 0;
18 
/**
 * Set up the script: run the parent constructor, register the
 * description text and set the batch size to 10000.
 */
public function __construct() {
	parent::__construct();

	$description = 'Clean up duplicate ar_rev_id, both within archive and between archive and revision.';
	$this->addDescription( $description );

	$this->setBatchSize( 10000 );
}
26 
/**
 * Get the update key name to go in the update log table.
 *
 * @return string The name of this class
 */
protected function getUpdateKey() {
	return self::class;
}
30 
/**
 * Do the actual work: walk the archive table in ar_rev_id ranges of one
 * batch each and deduplicate every range inside its own transaction.
 *
 * Returns early (still reporting success) for a new install and for an
 * archive table that holds no ar_rev_id values at all.
 *
 * @return bool Always true
 */
protected function doDBUpdates() {
	$this->output( "Deduplicating ar_rev_id...\n" );
	$dbw = $this->getDB( DB_PRIMARY );
	// Sanity check. If this is a new install, we don't need to do anything here.
	if ( PopulateArchiveRevId::isNewInstall( $dbw ) ) {
		$this->output( "New install, nothing to do here.\n" );
		return true;
	}

	// NOTE(review): the listing this code was taken from skips one line number
	// at this point. Confirm no statement belongs here (possibly a call to
	// PopulateArchiveRevId::checkMysqlAutoIncrementBug( $dbw )) before merging.

	$minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
	$maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );

	// MIN()/MAX() are NULL when archive has no non-NULL ar_rev_id (and
	// selectField() may report a missing row as false). Without this guard the
	// loop below would still run once, because null <= null is true, locking
	// and querying the meaningless range 0-0.
	if ( $minId === null || $minId === false || $maxId === null || $maxId === false ) {
		$this->output( "No ar_rev_id values found in archive, nothing to do.\n" );
		return true;
	}

	// Work with real integers from here on rather than numeric strings.
	$minId = (int)$minId;
	$maxId = (int)$maxId;
	$batchSize = $this->getBatchSize();

	$revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );

	for ( $id = $minId; $id <= $maxId; $id += $batchSize ) {
		// Inclusive upper bound of this batch, clamped to the largest known ID.
		$endId = min( $maxId, $id + $batchSize - 1 );

		$this->beginTransaction( $dbw, __METHOD__ );

		// Lock the archive and revision table rows for the IDs we're checking
		// to try to prevent deletions or undeletions from confusing things.
		$dbw->selectRowCount(
			'archive',
			'1',
			[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
			__METHOD__,
			[ 'FOR UPDATE' ]
		);
		$dbw->selectRowCount(
			'revision',
			'1',
			[ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ],
			__METHOD__,
			[ 'LOCK IN SHARE MODE' ]
		);

		// Figure out the ar_rev_ids we actually need to look at.
		// First: revision rows whose rev_id is also used as an ar_rev_id.
		// NOTE(review): the array unions below only keep the actor-query
		// entries if those arrays are string-keyed - presumably so; verify.
		$res = $dbw->select(
			[ 'archive', 'revision' ] + $revActorQuery['tables'],
			[ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
			[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
			__METHOD__,
			[ 'DISTINCT' ],
			[ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
		);
		$revRows = [];
		foreach ( $res as $row ) {
			$revRows[$row->rev_id] = $row;
		}

		// Second: ar_rev_ids that occur more than once within archive itself.
		$arRevIds = $dbw->selectFieldValues(
			[ 'archive' ],
			'ar_rev_id',
			[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
			__METHOD__,
			[ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
		);
		// Union of both sets, without repeats.
		$arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );

		if ( $arRevIds ) {
			$this->processArRevIds( $dbw, $arRevIds, $revRows );
		}

		$this->output( "... $id-$endId\n" );
		$this->commitTransaction( $dbw, __METHOD__ );
	}

	$this->output(
		"Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
		. "$this->reassigned assigned new IDs.\n"
	);
	return true;
}
107 
/**
 * Process a set of ar_rev_ids.
 *
 * Loads every archive row using one of the given IDs and decides, row by
 * row, whether to keep it, delete it as a duplicate, or hand it to
 * PopulateArchiveRevId::reassignArRevIds() for a new ar_rev_id. The
 * deleted/reassigned counters on this object are updated accordingly.
 *
 * @param IDatabase $dbw Database handle used for the select, delete and reassignment
 * @param array $arRevIds ar_rev_id values to examine
 * @param array $revRows Revision rows, indexed by rev_id
 */
private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
	// Fetch everything needed to compare archive rows with each other and
	// with the revision rows.
	$archiveRows = $dbw->select(
		[ 'archive' ],
		[
			'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_actor',
			'ar_timestamp', 'ar_sha1',
		],
		[ 'ar_rev_id' => $arRevIds ],
		__METHOD__
	);

	// Per rev_id: a 'first' label naming who claimed the ID first, plus one
	// entry per distinct change key mapping to the ar_id that introduced it.
	$known = [];
	$deleteIds = [];
	$reassignIds = [];

	foreach ( $archiveRows as $arRow ) {
		$liveRow = $revRows[$arRow->ar_rev_id] ?? null;

		if ( $liveRow !== null ) {
			// The revision table owns this ID, so make sure it counts as
			// seen: any archive row that survives the check below must then
			// be reassigned or deleted.
			if ( !isset( $known[$liveRow->rev_id] ) ) {
				$known[$liveRow->rev_id] = [ 'first' => "revision row" ];
			}

			// Page and title are deliberately not compared, because moves
			// can change IDs and titles.
			$sameChange = $arRow->ar_timestamp === $liveRow->rev_timestamp
				&& $arRow->ar_sha1 === $liveRow->rev_sha1
				&& $arRow->ar_actor === $liveRow->rev_actor;
			if ( $sameChange ) {
				$this->output(
					"Row {$arRow->ar_id} duplicates revision row for rev_id {$liveRow->rev_id}, deleting\n"
				);
				$deleteIds[] = $arRow->ar_id;
				continue;
			}
		}

		$changeKey = $this->getSeenKey( $arRow );

		// First sighting of this rev_id: just remember it.
		if ( !isset( $known[$arRow->ar_rev_id] ) ) {
			$known[$arRow->ar_rev_id] = [
				'first' => "archive row {$arRow->ar_id}",
				$changeKey => $arRow->ar_id,
			];
			continue;
		}

		// Same rev_id and same change as an earlier archive row: drop it.
		if ( isset( $known[$arRow->ar_rev_id][$changeKey] ) ) {
			$this->output(
				"Row {$arRow->ar_id} duplicates archive row {$known[$arRow->ar_rev_id][$changeKey]} "
				. "for rev_id {$arRow->ar_rev_id}, deleting\n"
			);
			$deleteIds[] = $arRow->ar_id;
			continue;
		}

		// Same rev_id but a different change: it needs an ID of its own.
		$known[$arRow->ar_rev_id][$changeKey] = $arRow->ar_id;
		$this->output(
			"Row {$arRow->ar_id} conflicts with {$known[$arRow->ar_rev_id]['first']} "
			. "for rev_id {$arRow->ar_rev_id}, reassigning\n"
		);
		$reassignIds[] = $arRow->ar_id;
	}

	// Apply the decisions: deletions first, then reassignments.
	if ( $deleteIds ) {
		$dbw->delete( 'archive', [ 'ar_id' => $deleteIds ], __METHOD__ );
		$this->deleted += $dbw->affectedRows();
	}
	if ( $reassignIds ) {
		$this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $reassignIds );
	}
}
188 
/**
 * Make a key identifying a "unique" change from a row.
 *
 * The key is the row's namespace, title, timestamp, sha1 and actor, in
 * that order, joined by newlines.
 *
 * @param object $row Row providing the five ar_* fields read below
 * @return string
 */
private function getSeenKey( $row ) {
	$parts = [];
	foreach ( [ 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1', 'ar_actor' ] as $field ) {
		$parts[] = $row->$field;
	}

	return implode( "\n", $parts );
}
203 
204 }
205 
// Record this script's class name in $maintClass, then include the file named
// by RUN_MAINTENANCE_IF_MAIN. NOTE(review): that constant is defined outside
// this file; presumably the include runs the class when the script is the
// entry point - confirm against Maintenance.php.
206 $maintClass = DeduplicateArchiveRevId::class;
207 require_once RUN_MAINTENANCE_IF_MAIN;
Wikimedia\Rdbms\IDatabase\affectedRows
affectedRows()
Get the number of rows affected by the last write query.
DeduplicateArchiveRevId\$reassigned
$reassigned
Definition: deduplicateArchiveRevId.php:17
DeduplicateArchiveRevId\getSeenKey
getSeenKey( $row)
Make a key identifying a "unique" change from a row.
Definition: deduplicateArchiveRevId.php:194
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:329
$maintClass
$maintClass
Definition: deduplicateArchiveRevId.php:206
DeduplicateArchiveRevId\doDBUpdates
doDBUpdates()
Do the actual work.
Definition: deduplicateArchiveRevId.php:31
$res
$res
Definition: testCompression.php:57
ActorMigration\newMigration
static newMigration()
Static constructor.
Definition: ActorMigration.php:76
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
Maintenance\beginTransaction
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
Definition: Maintenance.php:1393
PopulateArchiveRevId\checkMysqlAutoIncrementBug
static checkMysqlAutoIncrementBug(IDatabase $dbw)
Check for (and work around) a MySQL auto-increment bug.
Definition: populateArchiveRevId.php:113
DeduplicateArchiveRevId\__construct
__construct()
Default constructor.
Definition: deduplicateArchiveRevId.php:19
LoggedUpdateMaintenance
Class for scripts that perform database maintenance and want to log the update in updatelog so we can...
Definition: LoggedUpdateMaintenance.php:26
DeduplicateArchiveRevId\processArRevIds
processArRevIds(IDatabase $dbw, array $arRevIds, array $revRows)
Process a set of ar_rev_ids.
Definition: deduplicateArchiveRevId.php:114
PopulateArchiveRevId\reassignArRevIds
static reassignArRevIds(IDatabase $dbw, array $arIds, array $conds=[])
Assign new ar_rev_ids to a set of ar_ids.
Definition: populateArchiveRevId.php:158
DB_PRIMARY
const DB_PRIMARY
Definition: defines.php:27
Maintenance\commitTransaction
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
Definition: Maintenance.php:1408
Maintenance\getDB
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Definition: Maintenance.php:1362
DeduplicateArchiveRevId
Maintenance script that cleans up archive rows with duplicated ar_rev_id, both within archive and between archive and revision.
Definition: deduplicateArchiveRevId.php:14
Maintenance\getBatchSize
getBatchSize()
Returns batch size.
Definition: Maintenance.php:368
Wikimedia\Rdbms\IDatabase\select
select( $table, $vars, $conds='', $fname=__METHOD__, $options=[], $join_conds=[])
Execute a SELECT query constructed using the various parameters provided.
PopulateArchiveRevId\isNewInstall
static isNewInstall(IDatabase $dbw)
Definition: populateArchiveRevId.php:52
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:435
DeduplicateArchiveRevId\$deleted
$deleted
Definition: deduplicateArchiveRevId.php:16
DeduplicateArchiveRevId\getUpdateKey
getUpdateKey()
Get the update key name to go in the update log table.
Definition: deduplicateArchiveRevId.php:27
Wikimedia\Rdbms\IDatabase\delete
delete( $table, $conds, $fname=__METHOD__)
Delete all rows in a table that match a condition.
Maintenance\setBatchSize
setBatchSize( $s=0)
Definition: Maintenance.php:375