MediaWiki  master
deduplicateArchiveRevId.php
Go to the documentation of this file.
1 <?php
2 
4 
5 require_once __DIR__ . '/Maintenance.php';
6 
15 
/**
 * Actor-migration join info (with 'tables', 'fields' and 'joins' entries) for the
 * archive table; assigned in doDBUpdates() and read in processArRevIds().
 * @var array[]|null
 */
private $arActorQuery = null;

/** @var int Running total of archive rows deleted, as reported by affectedRows() */
private $deleted = 0;
/** @var int Running total of archive rows that were given a new ar_rev_id */
private $reassigned = 0;
24 
/**
 * Set up the script: register its description and default to batches of 10000 IDs.
 */
public function __construct() {
	parent::__construct();

	$description = 'Clean up duplicate ar_rev_id, both within archive and between archive and revision.';
	$this->addDescription( $description );

	$defaultBatchSize = 10000;
	$this->setBatchSize( $defaultBatchSize );
}
32 
/**
 * Get the update key name to go in the update log table.
 *
 * @return string The name of this class.
 */
protected function getUpdateKey() {
	$updateKey = self::class;
	return $updateKey;
}
36 
/**
 * Do the actual work: walk the archive table in ar_rev_id ranges of the configured
 * batch size and clean up duplicated IDs in each range.
 *
 * Progress is printed per range, followed by a summary of how many rows were
 * deleted and how many were assigned new IDs.
 *
 * @return bool Always true.
 */
protected function doDBUpdates() {
	$this->output( "Deduplicating ar_rev_id...\n" );
	$dbw = $this->getDB( DB_MASTER );
	// Sanity check. If this is a new install, we don't need to do anything here.
	if ( PopulateArchiveRevId::isNewInstall( $dbw ) ) {
		$this->output( "New install, nothing to do here.\n" );
		return true;
	}

	// Check for (and work around) a MySQL auto-increment bug before any rows are touched.
	PopulateArchiveRevId::checkMysqlAutoIncrementBug( $dbw );

	$minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
	$maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );
	// A batch size below 1 would never advance the loop counter, so clamp it.
	$batchSize = max( 1, (int)$this->getBatchSize() );

	$this->arActorQuery = ActorMigration::newMigration()->getJoin( 'ar_user' );
	$revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );

	// MIN()/MAX() give no numeric value when there is no ar_rev_id to aggregate.
	// Without this guard the loop would compare null <= null (true) and run one
	// meaningless batch over ID 0.
	if ( is_numeric( $minId ) && is_numeric( $maxId ) ) {
		$minId = (int)$minId;
		$maxId = (int)$maxId;

		for ( $id = $minId; $id <= $maxId; $id += $batchSize ) {
			$endId = min( $maxId, $id + $batchSize - 1 );
			$this->deduplicateRange( $dbw, $id, $endId, $revActorQuery );
		}
	}

	$this->output(
		"Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
		. "$this->reassigned assigned new IDs.\n"
	);
	return true;
}

/**
 * Deduplicate one inclusive range of ar_rev_id values inside its own transaction.
 *
 * @param IDatabase $dbw Database handle to read from and write to
 * @param int $id First ar_rev_id of the range
 * @param int $endId Last ar_rev_id of the range
 * @param array $revActorQuery Join info from ActorMigration::getJoin( 'rev_user' )
 */
private function deduplicateRange( IDatabase $dbw, $id, $endId, array $revActorQuery ) {
	$arRange = [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ];

	$this->beginTransaction( $dbw, __METHOD__ );

	// Lock the archive and revision table rows for the IDs we're checking
	// to try to prevent deletions or undeletions from confusing things.
	$dbw->selectRowCount(
		'archive',
		'1',
		$arRange,
		__METHOD__,
		[ 'FOR UPDATE' ]
	);
	$dbw->selectRowCount(
		'revision',
		'1',
		[ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ],
		__METHOD__,
		[ 'LOCK IN SHARE MODE' ]
	);

	// Revision rows whose rev_id is also used as an ar_rev_id in this range
	$res = $dbw->select(
		[ 'archive', 'revision' ] + $revActorQuery['tables'],
		[ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
		$arRange,
		__METHOD__,
		[ 'DISTINCT' ],
		[ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
	);
	$revRows = [];
	foreach ( $res as $row ) {
		$revRows[$row->rev_id] = $row;
	}

	// ar_rev_ids that occur on more than one archive row in this range
	$arRevIds = $dbw->selectFieldValues(
		[ 'archive' ],
		'ar_rev_id',
		$arRange,
		__METHOD__,
		[ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
	);
	// Both kinds of conflict need processing; merge them into one de-duplicated list.
	$arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );

	if ( $arRevIds ) {
		$this->processArRevIds( $dbw, $arRevIds, $revRows );
	}

	$this->output( "... $id-$endId\n" );
	$this->commitTransaction( $dbw, __METHOD__ );
}
114 
/**
 * Process a set of ar_rev_ids.
 *
 * Each archive row using one of the given IDs is either kept, queued for deletion
 * (it repeats the revision row or an earlier archive row for the same change), or
 * queued to receive a new ar_rev_id (it is a different change sharing the ID).
 * The queued deletions and reassignments are then applied, and the running
 * totals are updated.
 *
 * @param IDatabase $dbw Database handle to read from and write to
 * @param int[] $arRevIds The ar_rev_id values to examine
 * @param object[] $revRows Revision-table rows, keyed by rev_id
 */
private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
	$actorQuery = $this->arActorQuery;

	// Fetch everything needed to tell one change from another
	$wantedFields = [ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
		+ $actorQuery['fields'];
	$archiveRows = $dbw->select(
		[ 'archive' ] + $actorQuery['tables'],
		$wantedFields,
		[ 'ar_rev_id' => $arRevIds ],
		__METHOD__,
		[],
		$actorQuery['joins']
	);

	// Per rev_id: a 'first' label naming what claimed the ID first, plus one
	// entry per distinct change (see getSeenKey()) mapping to its ar_id.
	$seenByRevId = [];
	$deleteIds = [];
	$reassignIds = [];

	foreach ( $archiveRows as $arRow ) {
		$arId = $arRow->ar_id;
		$revId = $arRow->ar_rev_id;

		if ( isset( $revRows[$revId] ) ) {
			$liveRow = $revRows[$revId];

			// The revision row always claims the ID first, so every archive row
			// that reaches the checks further down gets deleted or reassigned.
			if ( !isset( $seenByRevId[$liveRow->rev_id] ) ) {
				$seenByRevId[$liveRow->rev_id] = [ 'first' => "revision row" ];
			}

			// Page identity is deliberately left out of the comparison,
			// because moves can change IDs and titles.
			$sameAsRevision = $arRow->ar_timestamp === $liveRow->rev_timestamp &&
				$arRow->ar_sha1 === $liveRow->rev_sha1 &&
				$arRow->ar_user === $liveRow->rev_user &&
				$arRow->ar_user_text === $liveRow->rev_user_text;
			if ( $sameAsRevision ) {
				$this->output(
					"Row $arId duplicates revision row for rev_id $liveRow->rev_id, deleting\n"
				);
				$deleteIds[] = $arId;
				continue;
			}
		}

		$changeKey = $this->getSeenKey( $arRow );

		// First sighting of this rev_id: just remember it.
		if ( !isset( $seenByRevId[$revId] ) ) {
			$seenByRevId[$revId] = [
				'first' => "archive row $arId",
				$changeKey => $arId,
			];
			continue;
		}

		// Same rev_id and same change as a row already recorded: redundant copy.
		if ( isset( $seenByRevId[$revId][$changeKey] ) ) {
			$this->output(
				"Row $arId duplicates archive row {$seenByRevId[$revId][$changeKey]} "
				. "for rev_id $revId, deleting\n"
			);
			$deleteIds[] = $arId;
			continue;
		}

		// Same rev_id but a different change: it needs an ID of its own.
		$seenByRevId[$revId][$changeKey] = $arId;
		$this->output(
			"Row $arId conflicts with {$seenByRevId[$revId]['first']} "
			. "for rev_id $revId, reassigning\n"
		);
		$reassignIds[] = $arId;
	}

	// Apply what was decided above
	if ( $deleteIds ) {
		$dbw->delete( 'archive', [ 'ar_id' => $deleteIds ], __METHOD__ );
		$this->deleted += $dbw->affectedRows();
	}
	if ( $reassignIds ) {
		$this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $reassignIds );
	}
}
198 
/**
 * Make a key identifying a "unique" change from a row.
 *
 * @param object $row Archive row providing the ar_* fields listed below
 * @return string The identifying field values, one per line
 */
private function getSeenKey( $row ) {
	$identifyingFields = [
		'ar_namespace',
		'ar_title',
		'ar_timestamp',
		'ar_sha1',
		'ar_user',
		'ar_user_text',
	];

	$values = [];
	foreach ( $identifyingFields as $field ) {
		$values[] = $row->$field;
	}

	return implode( "\n", $values );
}
214 
215 }
216 
// Name this class as the maintenance script for this file, then include the path held
// in RUN_MAINTENANCE_IF_MAIN (presumably what runs the script when this file is the
// entry point - confirm against Maintenance.php).
$maintClass = DeduplicateArchiveRevId::class;
require_once RUN_MAINTENANCE_IF_MAIN;
RUN_MAINTENANCE_IF_MAIN
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:38
Wikimedia\Rdbms\IDatabase\affectedRows
affectedRows()
Get the number of rows affected by the last write query.
DeduplicateArchiveRevId\$reassigned
$reassigned
Definition: deduplicateArchiveRevId.php:23
DeduplicateArchiveRevId\getSeenKey
getSeenKey( $row)
Make a key identifying a "unique" change from a row.
Definition: deduplicateArchiveRevId.php:204
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:327
$maintClass
$maintClass
Definition: deduplicateArchiveRevId.php:217
DeduplicateArchiveRevId\doDBUpdates
doDBUpdates()
Do the actual work.
Definition: deduplicateArchiveRevId.php:37
$res
$res
Definition: testCompression.php:57
ActorMigration\newMigration
static newMigration()
Static constructor.
Definition: ActorMigration.php:156
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relational database handles.
Definition: IDatabase.php:38
Maintenance\beginTransaction
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
Definition: Maintenance.php:1401
PopulateArchiveRevId\checkMysqlAutoIncrementBug
static checkMysqlAutoIncrementBug(IDatabase $dbw)
Check for (and work around) a MySQL auto-increment bug.
Definition: populateArchiveRevId.php:112
DeduplicateArchiveRevId\__construct
__construct()
Default constructor.
Definition: deduplicateArchiveRevId.php:25
LoggedUpdateMaintenance
Class for scripts that perform database maintenance and want to log the update in updatelog so we can...
Definition: LoggedUpdateMaintenance.php:26
DeduplicateArchiveRevId\processArRevIds
processArRevIds(IDatabase $dbw, array $arRevIds, array $revRows)
Process a set of ar_rev_ids.
Definition: deduplicateArchiveRevId.php:121
DB_MASTER
const DB_MASTER
Definition: defines.php:26
PopulateArchiveRevId\reassignArRevIds
static reassignArRevIds(IDatabase $dbw, array $arIds, array $conds=[])
Assign new ar_rev_ids to a set of ar_ids.
Definition: populateArchiveRevId.php:157
Maintenance\commitTransaction
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
Definition: Maintenance.php:1416
Maintenance\getDB
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Definition: Maintenance.php:1370
DeduplicateArchiveRevId
Maintenance script that cleans up archive rows with duplicated ar_rev_id, both within archive and between archive and revision.
Definition: deduplicateArchiveRevId.php:14
Maintenance\getBatchSize
getBatchSize()
Returns batch size.
Definition: Maintenance.php:366
DeduplicateArchiveRevId\$arActorQuery
array[]|null $arActorQuery
-var array{tables:string[],fields:string[],joins:array}|null
Definition: deduplicateArchiveRevId.php:20
Wikimedia\Rdbms\IDatabase\select
select( $table, $vars, $conds='', $fname=__METHOD__, $options=[], $join_conds=[])
Execute a SELECT query constructed using the various parameters provided.
PopulateArchiveRevId\isNewInstall
static isNewInstall(IDatabase $dbw)
Definition: populateArchiveRevId.php:51
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:434
DeduplicateArchiveRevId\$deleted
$deleted
Definition: deduplicateArchiveRevId.php:22
DeduplicateArchiveRevId\getUpdateKey
getUpdateKey()
Get the update key name to go in the update log table.
Definition: deduplicateArchiveRevId.php:33
Wikimedia\Rdbms\IDatabase\delete
delete( $table, $conds, $fname=__METHOD__)
Delete all rows in a table that match a condition.
Maintenance\setBatchSize
setBatchSize( $s=0)
Set the batch size.
Definition: Maintenance.php:374