MediaWiki 1.39.10
populateContentTables.php
Go to the documentation of this file.
1<?php
28use Wikimedia\Assert\Assert;
31
32require_once __DIR__ . '/Maintenance.php';
33
39
/** Database handle obtained from getDB( DB_PRIMARY ) in initServices(); used for the populate queries. */
private $dbw;

/** Store returned by MediaWikiServices::getContentModelStore(); acquireId() turns a model name into an ID. */
private $contentModelStore;

/** Store returned by MediaWikiServices::getSlotRoleStore(); used to acquire the main slot role ID. */
private $slotRoleStore;

/** Blob store returned by MediaWikiServices::getBlobStore(); used to load blobs when len/sha1 are missing. */
private $blobStore;

/** ID acquired from the slot role store for SlotRecord::MAIN; written to slot_role_id. */
private $mainRoleId;

/**
 * Map of "model:address" keys to content_id values.
 * Stays null unless the reuse-content option is set, in which case loadContentMap() fills it.
 */
private $contentRowMap = null;

/** Number of slot rows inserted for the table currently being populated. */
private $count = 0;

/** Number of slot rows inserted across all tables during this run. */
private $totalCount = 0;
59
/**
 * Register the script description, its command-line options and the default batch size.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Populate content and slot tables' );

	// Each entry is [ option name, help text, whether the option takes a value ].
	$optionSpecs = [
		[ 'table', 'revision or archive table, or `all` to populate both', true ],
		[
			'reuse-content',
			'Reuse content table rows when the address and model are the same. '
				. 'This will increase the script\'s time and memory usage, perhaps significantly.',
			false,
		],
		[ 'start-revision', 'The rev_id to start at', true ],
		[ 'start-archive', 'The ar_rev_id to start at', true ],
	];
	foreach ( $optionSpecs as [ $name, $help, $takesValue ] ) {
		// None of the options is mandatory.
		$this->addOption( $name, $help, false, $takesValue );
	}

	$this->setBatchSize( 500 );
}
74
/**
 * Fetch the database handle and the services this script works with, and
 * acquire the ID of the main slot role.
 */
private function initServices() {
	$this->dbw = $this->getDB( DB_PRIMARY );

	$serviceContainer = MediaWikiServices::getInstance();
	$this->contentModelStore = $serviceContainer->getContentModelStore();
	$this->slotRoleStore = $serviceContainer->getSlotRoleStore();
	$this->blobStore = $serviceContainer->getBlobStore();

	// The NameTableStore caches are not trusted here: a previous run may have
	// gone wrong and left them stale (see T224949#5325895), so reload both maps.
	foreach ( [ $this->contentModelStore, $this->slotRoleStore ] as $nameTableStore ) {
		$nameTableStore->reloadMap();
	}

	// Must happen after the reload above so the ID comes from fresh data.
	$this->mainRoleId = $this->slotRoleStore->acquireId( SlotRecord::MAIN );
}
88
/**
 * Script entry point: populate every requested table and report the totals.
 *
 * @return bool Always true
 */
public function execute() {
	$startedAt = microtime( true );

	$this->initServices();

	// Pre-loading the existing content rows is only needed when they are to be reused.
	$reuseContent = $this->getOption( 'reuse-content', false );
	if ( $reuseContent ) {
		$this->loadContentMap();
	}

	$tableNames = $this->getTables();
	foreach ( $tableNames as $tableName ) {
		$this->populateTable( $tableName );
	}

	$elapsed = microtime( true ) - $startedAt;
	$this->writeln( "Done. Processed {$this->totalCount} rows in {$elapsed} seconds" );

	return true;
}
106
/**
 * Resolve the `table` option into the list of tables to populate.
 *
 * Calls fatalError() when the option is not one of `all`, `revision` or `archive`.
 *
 * @return string[] Either [ 'revision', 'archive' ] (for `all`, the default) or
 *  a single-element list holding the requested table name
 */
private function getTables() {
	$table = $this->getOption( 'table', 'all' );
	$validTableOptions = [ 'all', 'revision', 'archive' ];

	// Strict comparison: with loose matching a non-string value (e.g. true or 0)
	// could pass validation and then be handed on as a table name.
	if ( !in_array( $table, $validTableOptions, true ) ) {
		$this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
	}

	return $table === 'all' ? [ 'revision', 'archive' ] : [ $table ];
}
126
/**
 * Read the whole content table in batches and remember, for every row, which
 * content_id belongs to its "model:address" key.
 *
 * The result is stored in $this->contentRowMap.
 */
private function loadContentMap() {
	$startedAt = microtime( true );
	$this->writeln( "Loading existing content table rows..." );

	$this->contentRowMap = [];
	$dbr = $this->getDB( DB_REPLICA );
	$batchSize = $this->getBatchSize();

	// Highest content_id seen so far; false until the first batch has been read.
	$lastId = false;
	for ( ; ; ) {
		$conds = '';
		if ( $lastId ) {
			$conds = "content_id > $lastId";
		}

		$batch = $dbr->select(
			'content',
			[ 'content_id', 'content_address', 'content_model' ],
			$conds,
			__METHOD__,
			[ 'ORDER BY' => 'content_id', 'LIMIT' => $batchSize ]
		);
		if ( !$batch || !$batch->numRows() ) {
			// Nothing (more) to read.
			break;
		}

		foreach ( $batch as $contentRow ) {
			$lastId = $contentRow->content_id;
			$mapKey = "{$contentRow->content_model}:{$contentRow->content_address}";
			$this->contentRowMap[$mapKey] = $contentRow->content_id;
		}
	}

	$elapsed = microtime( true ) - $startedAt;
	$this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $elapsed seconds" );
}
152
/**
 * Populate the content and slots tables for every row of the given table that
 * has no slot row yet, walking the ID range in batches.
 *
 * Does nothing (apart from a message) when the table's text ID field no longer exists.
 *
 * @param string $table 'revision' or 'archive'; any value other than
 *  'revision' is handled as the archive table
 */
private function populateTable( $table ) {
	$t0 = microtime( true );
	$this->count = 0;
	$this->writeln( "Populating $table..." );

	// Both tables are read through the same aliases (rev_id, len, sha1, ...)
	// so that the batch processing code does not need to care which one it is.
	if ( $table === 'revision' ) {
		$idField = 'rev_id';
		$tables = [ 'revision', 'slots', 'page' ];
		$fields = [
			'rev_id',
			'len' => 'rev_len',
			'sha1' => 'rev_sha1',
			'text_id' => 'rev_text_id',
			'content_model' => 'rev_content_model',
			'namespace' => 'page_namespace',
			'title' => 'page_title',
		];
		$joins = [
			'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
			'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
		];
		$startOption = 'start-revision';
	} else {
		$idField = 'ar_rev_id';
		$tables = [ 'archive', 'slots' ];
		$fields = [
			'rev_id' => 'ar_rev_id',
			'len' => 'ar_len',
			'sha1' => 'ar_sha1',
			'text_id' => 'ar_text_id',
			'content_model' => 'ar_content_model',
			'namespace' => 'ar_namespace',
			'title' => 'ar_title',
		];
		$joins = [
			'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
		];
		$startOption = 'start-archive';
	}

	if ( !$this->dbw->fieldExists( $table, $fields['text_id'], __METHOD__ ) ) {
		$this->writeln( "No need to populate, $table.{$fields['text_id']} field does not exist" );
		return;
	}

	$minmax = $this->dbw->selectRow(
		$table,
		[ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
		'',
		__METHOD__
	);
	// Only override the lower bound when a row object actually came back;
	// writing a property on a failed (falsy) result would be an error, and
	// that case is handled by the fallback right below.
	if ( $minmax && $this->hasOption( $startOption ) ) {
		$minmax->min = (int)$this->getOption( $startOption );
	}
	if ( !$minmax || !is_numeric( $minmax->min ) || !is_numeric( $minmax->max ) ) {
		// No rows? Use an empty range so that the loop below is skipped.
		$minmax = (object)[ 'min' => 1, 'max' => 0 ];
	}

	$batchSize = $this->getBatchSize();

	for ( $startId = (int)$minmax->min; $startId <= $minmax->max; $startId += $batchSize ) {
		$endId = (int)min( $startId + $batchSize - 1, $minmax->max );
		// The LEFT JOIN on slots combined with "slot_revision_id IS NULL"
		// selects exactly the rows that have no slot row yet.
		$rows = $this->dbw->select(
			$tables,
			$fields,
			[
				"$idField >= $startId",
				"$idField <= $endId",
				'slot_revision_id IS NULL',
			],
			__METHOD__,
			[ 'ORDER BY' => 'rev_id' ],
			$joins
		);
		if ( $rows->numRows() !== 0 ) {
			$this->populateContentTablesForRowBatch( $rows, $startId, $table );
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln(
			"... $table processed up to revision id $endId of {$minmax->max}"
			. " ($this->count rows in $elapsed seconds)"
		);
	}

	$elapsed = microtime( true ) - $t0;
	$this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
}
245
/**
 * Create the content and slot rows for one batch of revision/archive rows.
 *
 * All writes happen in one transaction. If an exception is thrown, the
 * transaction is rolled back and the script stops via fatalError().
 *
 * @param IResultWrapper $rows Rows as selected by populateTable()
 * @param int $startId First ID of the batch; only used in the failure message
 * @param string $table Name of the table being populated; only used in the failure message
 */
private function populateContentTablesForRowBatch( IResultWrapper $rows, $startId, $table ) {
	$this->beginTransaction( $this->dbw, __METHOD__ );

	// When a content map was pre-loaded it is updated in place (hence the
	// reference); otherwise a map that only lives for this batch is used.
	$map = [];
	if ( $this->contentRowMap !== null ) {
		$map = &$this->contentRowMap;
	}
	// Revision ID => "model:address" key, filled in step 1 and consumed in step 3.
	$keyByRevision = [];

	try {
		// Step 1: Work out which content rows still have to be inserted.
		$pendingContentRows = [];
		foreach ( $rows as $row ) {
			$revisionId = $row->rev_id;

			Assert::invariant( $revisionId !== null, 'rev_id must not be null' );

			$model = $this->getContentModel( $row );
			$modelId = $this->contentModelStore->acquireId( $model );
			$address = SqlBlobStore::makeAddressFromTextId( $row->text_id );

			// The key is built before fillMissingFields() gets a chance to
			// prefix the address with "bad:", so it always uses the plain address.
			$key = $modelId . ':' . $address;
			$keyByRevision[$revisionId] = $key;

			if ( isset( $map[$key] ) ) {
				// Already known, or already queued for insertion in this batch.
				continue;
			}

			$this->fillMissingFields( $row, $model, $address );
			// false marks the key as queued; step 2 replaces it with the real ID.
			$map[$key] = false;
			$pendingContentRows[] = [
				'content_size' => (int)$row->len,
				'content_sha1' => $row->sha1,
				'content_model' => $modelId,
				'content_address' => $address,
			];
		}

		// Step 2: Insert the new content rows, then read them back to learn their IDs.
		if ( $pendingContentRows ) {
			$maxIdBefore = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
			$this->dbw->insert( 'content', $pendingContentRows, __METHOD__ );
			$insertedRows = $this->dbw->select(
				'content',
				[ 'content_id', 'content_model', 'content_address' ],
				'content_id > ' . (int)$maxIdBefore,
				__METHOD__
			);
			foreach ( $insertedRows as $contentRow ) {
				$storedAddress = $contentRow->content_address;
				// Drop the "bad:" marker again so that the key matches the one from step 1.
				if ( strncmp( $storedAddress, 'bad:', 4 ) === 0 ) {
					$storedAddress = substr( $storedAddress, 4 );
				}
				$map[$contentRow->content_model . ':' . $storedAddress] = $contentRow->content_id;
			}
		}

		// Step 3: Insert one main-slot row per revision.
		$slotRows = [];
		foreach ( $rows as $row ) {
			$revisionId = $row->rev_id;
			$contentId = $map[$keyByRevision[$revisionId]] ?? false;
			if ( $contentId === false ) {
				throw new \RuntimeException( "Content row for $revisionId not found after content insert" );
			}
			$slotRows[] = [
				'slot_revision_id' => $revisionId,
				'slot_role_id' => $this->mainRoleId,
				'slot_content_id' => $contentId,
				// The previous revision cannot be determined reliably, so no
				// inheritance is assumed: rev_parent_id can change on undeletion,
				// and deletions can upset the rev_timestamp ordering.
				'slot_origin' => $revisionId,
			];
		}
		$this->dbw->insert( 'slots', $slotRows, __METHOD__ );

		$insertedSlots = count( $slotRows );
		$this->count += $insertedSlots;
		$this->totalCount += $insertedSlots;
	} catch ( \Exception $e ) {
		$this->rollbackTransaction( $this->dbw, __METHOD__ );
		$this->fatalError( "Failed to populate content table $table row batch starting at $startId "
			. "due to exception: " . $e->__toString() );
	}

	$this->commitTransaction( $this->dbw, __METHOD__ );
}
338
/**
 * Determine the content model name for a revision/archive row.
 *
 * @param stdClass $row Row with an optional content_model field, plus the
 *  namespace and title fields used for the fallback
 * @return string The row's own content model if it has one, otherwise the
 *  default model for the page identified by the row's namespace and title
 */
private function getContentModel( $row ) {
	if ( isset( $row->content_model ) ) {
		return $row->content_model;
	}

	// No model recorded on the row: fall back to the default model of the page.
	// Without this return the method would yield null, which the caller then
	// passes on to the content model store.
	$title = Title::makeTitle( $row->namespace, $row->title );

	return ContentHandler::getDefaultModelFor( $title );
}
352
/**
 * Output a message followed by a line break.
 *
 * @param string $msg Text to print, without the trailing newline
 */
private function writeln( $msg ) {
	$line = $msg . "\n";
	$this->output( $line );
}
359
/**
 * Make sure the row has content_model, len and sha1 values, loading the blob
 * to compute len/sha1 when they are not present.
 *
 * @param stdClass $row Row to complete; modified in place
 * @param string $model Content model name, as determined by the caller
 * @param string &$address Blob address; gets the prefix "bad:" when the blob
 *  cannot be loaded
 */
private function fillMissingFields( $row, $model, &$address ) {
	if ( !isset( $row->content_model ) ) {
		// Only set for the sake of completeness.
		$row->content_model = $model;
	}

	$hasLen = isset( $row->len );
	$hasSha1 = isset( $row->sha1 ) && $row->sha1 !== '';
	if ( $hasLen && $hasSha1 ) {
		// Everything is there already, no need to load the content.
		return;
	}

	try {
		$blob = $this->blobStore->getBlob( $address );
	} catch ( BlobAccessException $e ) {
		// Mark the address as bad and carry on as if the content were empty.
		$address = 'bad:' . $address;
		$blob = '';
	}

	if ( !$hasLen ) {
		// NOTE: The nominal size of the content may not be the length of the raw blob.
		$content = ContentHandler::makeContent( $blob, null, $model );
		$row->len = $content->getSize();
	}

	if ( !$hasSha1 ) {
		$row->sha1 = SlotRecord::base36Sha1( $blob );
	}
}
395}
396
397$maintClass = PopulateContentTables::class;
398require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
static makeContent( $text, Title $title=null, $modelId=null, $format=null)
Convenience function for creating a Content object from a given textual representation.
static getDefaultModelFor(Title $title)
Returns the name of the default content model to be used for the page with the given title.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
rollbackTransaction(IDatabase $dbw, $fname)
Rollback the transaction on a DB handle.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Service locator for MediaWiki core services.
Value object representing a content slot associated with a page revision.
Exception representing a failure to access a data blob.
Service for storing and loading Content objects.
Populate the content and slot tables.
__construct()
Default constructor.
execute()
Do the actual work.
static makeTitle( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition Title.php:638
Service for loading and storing data blobs.
Definition BlobStore.php:35
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:39
Result wrapper for grabbing data queried from an IDatabase object.
numRows()
Get the number of rows in a result object.
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28