MediaWiki  1.34.0
deduplicateArchiveRevId.php
Go to the documentation of this file.
1 <?php
2 
4 
5 require_once __DIR__ . '/Maintenance.php';
6 
15 
/** @var array|null Join info from ActorMigration's getJoin( 'ar_user' ); null until doDBUpdates() assigns it */
17  private $arActorQuery = null;
18 
/** @var int Running total of archive rows removed, shown in the final summary */
19  private $deleted = 0;
/** @var int Running total of archive rows given a new ar_rev_id, shown in the final summary */
20  private $reassigned = 0;
21 
/**
 * Set up the script: run the parent constructor, then register the
 * description text and the default batch size.
 */
public function __construct() {
	parent::__construct();

	$description = 'Clean up duplicate ar_rev_id, both within archive and between archive and revision.';
	$this->addDescription( $description );

	// Default width, in ar_rev_id values, of one batch of the scan in doDBUpdates().
	$defaultBatchSize = 10000;
	$this->setBatchSize( $defaultBatchSize );
}
29 
/**
 * Get the update key name to go in the update log table.
 *
 * The name of the class this method is declared in is used as the key.
 *
 * @return string
 */
protected function getUpdateKey() {
	$updateKey = __CLASS__;

	return $updateKey;
}
33 
/**
 * Do the actual work: walk the archive table in ar_rev_id ranges of one
 * batch size each and clean up every ar_rev_id that is either used by more
 * than one archive row or also present in the revision table.
 *
 * Each range is handled in its own transaction. Nothing is scanned when this
 * is a new install, or when archive holds no non-NULL ar_rev_id at all.
 *
 * @return bool Always true
 */
protected function doDBUpdates() {
	$this->output( "Deduplicating ar_rev_id...\n" );
	$dbw = $this->getDB( DB_MASTER );
	// Sanity check. If this is a new install, we don't need to do anything here.
	if ( PopulateArchiveRevId::isNewInstall( $dbw ) ) {
		$this->output( "New install, nothing to do here.\n" );
		return true;
	}

	// Check for (and work around) the MySQL auto-increment bug before any
	// archive row can be handed a freshly assigned ar_rev_id.
	PopulateArchiveRevId::checkMysqlAutoIncrementBug( $dbw );

	$minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
	$maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );
	$batchSize = $this->getBatchSize();

	$this->arActorQuery = ActorMigration::newMigration()->getJoin( 'ar_user' );
	$revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );

	// MIN()/MAX() give no usable bound when archive has no non-NULL ar_rev_id.
	// Without this guard "null <= null" is true, so the loop would run one
	// meaningless batch over id 0 and print an empty "... -" progress line.
	$hasRange = $minId !== null && $minId !== false
		&& $maxId !== null && $maxId !== false;

	if ( $hasRange ) {
		// The database layer hands the bounds back as strings; do the range
		// arithmetic on real integers.
		$minId = (int)$minId;
		$maxId = (int)$maxId;

		for ( $id = $minId; $id <= $maxId; $id += $batchSize ) {
			$endId = min( $maxId, $id + $batchSize - 1 );

			$this->beginTransaction( $dbw, __METHOD__ );
			$this->deduplicateIdRange( $dbw, $id, $endId, $revActorQuery, __METHOD__ );
			$this->output( "... $id-$endId\n" );
			$this->commitTransaction( $dbw, __METHOD__ );
		}
	}

	$this->output(
		"Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
		. "$this->reassigned assigned new IDs.\n"
	);
	return true;
}

/**
 * Lock one ar_rev_id range, work out which ar_rev_ids in it need attention
 * and hand them to processArRevIds().
 *
 * Meant to be called between the caller's beginTransaction() and
 * commitTransaction(), so the row locks taken here last for the whole batch.
 *
 * @param IDatabase $dbw
 * @param int $id First ar_rev_id of the range (inclusive)
 * @param int $endId Last ar_rev_id of the range (inclusive)
 * @param array $revActorQuery Join info from ActorMigration's getJoin( 'rev_user' )
 * @param string $fname Caller name to attribute the queries to
 */
private function deduplicateIdRange( IDatabase $dbw, $id, $endId, array $revActorQuery, $fname ) {
	$archiveRange = [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ];
	$revisionRange = [ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ];

	// Lock the archive and revision table rows for the IDs we're checking
	// to try to prevent deletions or undeletions from confusing things.
	$dbw->selectRowCount( 'archive', 1, $archiveRange, $fname, [ 'FOR UPDATE' ] );
	$dbw->selectRowCount( 'revision', 1, $revisionRange, $fname, [ 'LOCK IN SHARE MODE' ] );

	// Revision rows whose rev_id is also used as an ar_rev_id in this range.
	$res = $dbw->select(
		[ 'archive', 'revision' ] + $revActorQuery['tables'],
		[ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
		$archiveRange,
		$fname,
		[ 'DISTINCT' ],
		[ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
	);
	$revRows = [];
	foreach ( $res as $row ) {
		$revRows[$row->rev_id] = $row;
	}

	// ar_rev_ids that occur on more than one archive row in this range.
	$arRevIds = $dbw->selectFieldValues(
		[ 'archive' ],
		'ar_rev_id',
		$archiveRange,
		$fname,
		[ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
	);

	// Both kinds of conflict are processed together, each id once.
	$arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );

	if ( $arRevIds ) {
		$this->processArRevIds( $dbw, $arRevIds, $revRows );
	}
}
111 
/**
 * Process a set of ar_rev_ids.
 *
 * Loads every archive row using one of the given ids and sorts each row into
 * one of three outcomes: kept, deleted (it repeats a revision row or an
 * archive row seen earlier) or reassigned (it shares the id with a different
 * change). The deletions and reassignments are then applied and added to the
 * running totals.
 *
 * @param IDatabase $dbw
 * @param int[] $arRevIds ar_rev_id values to examine
 * @param object[] $revRows Revision-table rows, keyed by rev_id
 */
private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
	$actorQuery = $this->arActorQuery;

	// Select all the data we need for deduplication
	$archiveRows = $dbw->select(
		[ 'archive' ] + $actorQuery['tables'],
		[ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
			+ $actorQuery['fields'],
		[ 'ar_rev_id' => $arRevIds ],
		__METHOD__,
		[],
		$actorQuery['joins']
	);

	// Per ar_rev_id: 'first' describes what claimed the id first, and every
	// other key is a change key (see getSeenKey()) mapped to the ar_id that owns it.
	$seenByRevId = [];
	$deleteIds = [];
	$reassignIds = [];

	foreach ( $archiveRows as $arRow ) {
		$arId = $arRow->ar_id;
		$revId = $arRow->ar_rev_id;

		if ( isset( $revRows[$revId] ) ) {
			$revRow = $revRows[$revId];

			// The revision row claims the id, so every archive row using it
			// ends up either deleted or reassigned below.
			if ( !isset( $seenByRevId[$revRow->rev_id] ) ) {
				$seenByRevId[$revRow->rev_id] = [ 'first' => "revision row" ];
			}

			// Page is deliberately not compared, because moves can change
			// IDs and titles.
			$sameChange = $arRow->ar_timestamp === $revRow->rev_timestamp
				&& $arRow->ar_sha1 === $revRow->rev_sha1
				&& $arRow->ar_user === $revRow->rev_user
				&& $arRow->ar_user_text === $revRow->rev_user_text;
			if ( $sameChange ) {
				$this->output(
					"Row {$arId} duplicates revision row for rev_id {$revRow->rev_id}, deleting\n"
				);
				$deleteIds[] = $arId;
				continue;
			}
		}

		$changeKey = $this->getSeenKey( $arRow );

		// First sighting of this rev_id: just remember it.
		if ( !isset( $seenByRevId[$revId] ) ) {
			$seenByRevId[$revId] = [
				'first' => "archive row $arId",
				$changeKey => $arId,
			];
			continue;
		}

		// Same rev_id and same change as an earlier archive row: plain duplicate.
		if ( isset( $seenByRevId[$revId][$changeKey] ) ) {
			$keptId = $seenByRevId[$revId][$changeKey];
			$this->output(
				"Row {$arId} duplicates archive row {$keptId} "
				. "for rev_id {$revId}, deleting\n"
			);
			$deleteIds[] = $arId;
			continue;
		}

		// Same rev_id but a different change: this row needs an id of its own.
		$seenByRevId[$revId][$changeKey] = $arId;
		$firstOwner = $seenByRevId[$revId]['first'];
		$this->output(
			"Row {$arId} conflicts with {$firstOwner} "
			. "for rev_id {$revId}, reassigning\n"
		);
		$reassignIds[] = $arId;
	}

	// Perform the updates
	if ( $deleteIds ) {
		$dbw->delete( 'archive', [ 'ar_id' => $deleteIds ], __METHOD__ );
		$this->deleted += $dbw->affectedRows();
	}
	if ( $reassignIds ) {
		$this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $reassignIds );
	}
}
195 
/**
 * Make a key identifying a "unique" change from a row.
 *
 * The key is the row's namespace, title, timestamp, sha1, user id and user
 * name, in that order, joined with newlines.
 *
 * @param object $row Archive row exposing the ar_* fields listed in the body
 * @return string
 */
private function getSeenKey( $row ) {
	$identifyingFields = [
		'ar_namespace',
		'ar_title',
		'ar_timestamp',
		'ar_sha1',
		'ar_user',
		'ar_user_text',
	];

	$parts = [];
	foreach ( $identifyingFields as $field ) {
		$parts[] = $row->$field;
	}

	return implode( "\n", $parts );
}
211 
212 }
213 
// Name the class implementing this script, then pull in the file named by
// RUN_MAINTENANCE_IF_MAIN (presumably it runs that class when this file is
// the entry point -- confirm against Maintenance.php).
$maintClass = 'DeduplicateArchiveRevId';
require_once RUN_MAINTENANCE_IF_MAIN;
RUN_MAINTENANCE_IF_MAIN
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
Wikimedia\Rdbms\IDatabase\affectedRows
affectedRows()
Get the number of rows affected by the last write query.
DeduplicateArchiveRevId\$reassigned
$reassigned
Definition: deduplicateArchiveRevId.php:20
DeduplicateArchiveRevId\getSeenKey
getSeenKey( $row)
Make a key identifying a "unique" change from a row.
Definition: deduplicateArchiveRevId.php:201
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:348
$maintClass
$maintClass
Definition: deduplicateArchiveRevId.php:214
DeduplicateArchiveRevId\doDBUpdates
doDBUpdates()
Do the actual work.
Definition: deduplicateArchiveRevId.php:34
$res
$res
Definition: testCompression.php:52
DeduplicateArchiveRevId\$arActorQuery
array|null $arActorQuery
Definition: deduplicateArchiveRevId.php:17
ActorMigration\newMigration
static newMigration()
Static constructor.
Definition: ActorMigration.php:136
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relational database handles.
Definition: IDatabase.php:38
Maintenance\beginTransaction
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
Definition: Maintenance.php:1426
PopulateArchiveRevId\checkMysqlAutoIncrementBug
static checkMysqlAutoIncrementBug(IDatabase $dbw)
Check for (and work around) a MySQL auto-increment bug.
Definition: populateArchiveRevId.php:110
DeduplicateArchiveRevId\__construct
__construct()
Default constructor.
Definition: deduplicateArchiveRevId.php:22
LoggedUpdateMaintenance
Class for scripts that perform database maintenance and want to log the update in updatelog so we can...
Definition: Maintenance.php:1727
DeduplicateArchiveRevId\processArRevIds
processArRevIds(IDatabase $dbw, array $arRevIds, array $revRows)
Process a set of ar_rev_ids.
Definition: deduplicateArchiveRevId.php:118
DB_MASTER
const DB_MASTER
Definition: defines.php:26
PopulateArchiveRevId\reassignArRevIds
static reassignArRevIds(IDatabase $dbw, array $arIds, array $conds=[])
Assign new ar_rev_ids to a set of ar_ids.
Definition: populateArchiveRevId.php:154
Maintenance\commitTransaction
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
Definition: Maintenance.php:1441
Maintenance\getDB
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Definition: Maintenance.php:1396
DeduplicateArchiveRevId
Maintenance script that cleans up archive rows with duplicated ar_rev_id, both within archive and bet...
Definition: deduplicateArchiveRevId.php:14
Maintenance\getBatchSize
getBatchSize()
Returns batch size.
Definition: Maintenance.php:386
Wikimedia\Rdbms\IDatabase\select
select( $table, $vars, $conds='', $fname=__METHOD__, $options=[], $join_conds=[])
Execute a SELECT query constructed using the various parameters provided.
PopulateArchiveRevId\isNewInstall
static isNewInstall(IDatabase $dbw)
Definition: populateArchiveRevId.php:50
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:453
DeduplicateArchiveRevId\$deleted
$deleted
Definition: deduplicateArchiveRevId.php:19
DeduplicateArchiveRevId\getUpdateKey
getUpdateKey()
Get the update key name to go in the update log table.
Definition: deduplicateArchiveRevId.php:30
Wikimedia\Rdbms\IDatabase\delete
delete( $table, $conds, $fname=__METHOD__)
DELETE query wrapper.
Maintenance\setBatchSize
setBatchSize( $s=0)
Set the batch size.
Definition: Maintenance.php:394