MediaWiki fundraising/REL1_35
populateContentTables.php
Go to the documentation of this file.
1<?php
28use Wikimedia\Assert\Assert;
31
32require_once __DIR__ . '/Maintenance.php';
33
39
/** @var IDatabase Database handle obtained with DB_MASTER in initServices(). */
private $dbw;

/**
 * @var NameTableStore Content model store fetched from MediaWikiServices in
 *  initServices(); used to turn a content model name into its numeric ID.
 */
private $contentModelStore;

/**
 * @var NameTableStore Slot role store fetched from MediaWikiServices in
 *  initServices(); used to acquire the ID of the main slot role.
 */
private $slotRoleStore;

/** @var BlobStore Blob store used to load text when size or hash is missing. */
private $blobStore;

/** @var int Role ID acquired for SlotRecord::MAIN (presumably an integer row ID). */
private $mainRoleId;

/**
 * @var array|null Map "{$modelId}:{$address}" to content_id.
 *  Stays null unless --reuse-content was given.
 */
private $contentRowMap = null;

/** @var int Rows processed for the current table, and across all tables. */
private $count = 0, $totalCount = 0;
59
/**
 * Declare the script description, its command line options and the
 * default batch size.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Populate content and slot tables' );

	$reuseHelp = 'Reuse content table rows when the address and model are the same. '
		. "This will increase the script's time and memory usage, perhaps significantly.";

	// Argument order: name, description, required, withArg.
	$this->addOption(
		'table',
		'revision or archive table, or `all` to populate both',
		false,
		true
	);
	$this->addOption( 'reuse-content', $reuseHelp, false, false );
	$this->addOption( 'start-revision', 'The rev_id to start at', false, true );
	$this->addOption( 'start-archive', 'The ar_rev_id to start at', false, true );

	$this->setBatchSize( 500 );
}
74
/**
 * Fetch the database handle and the services this script works with,
 * and acquire the ID of the main slot role.
 */
private function initServices() {
	$this->dbw = $this->getDB( DB_MASTER );

	$serviceContainer = MediaWikiServices::getInstance();
	$this->contentModelStore = $serviceContainer->getContentModelStore();
	$this->slotRoleStore = $serviceContainer->getSlotRoleStore();
	$this->blobStore = $serviceContainer->getBlobStore();

	// Don't trust the cache for the NameTableStores, in case something went
	// wrong during a previous run (see T224949#5325895).
	foreach ( [ $this->contentModelStore, $this->slotRoleStore ] as $nameTableStore ) {
		$nameTableStore->reloadMap();
	}

	$this->mainRoleId = $this->slotRoleStore->acquireId( SlotRecord::MAIN );
}
88
/**
 * Script entry point: set up services, optionally preload the existing
 * content rows, then populate every selected table and report the total.
 *
 * @return bool Always true
 */
public function execute() {
	$startTime = microtime( true );

	$this->initServices();

	// Preloading is opt-in; it is only done when --reuse-content is given.
	$reuseContent = $this->getOption( 'reuse-content', false );
	if ( $reuseContent ) {
		$this->loadContentMap();
	}

	$tableNames = $this->getTables();
	foreach ( $tableNames as $tableName ) {
		$this->populateTable( $tableName );
	}

	$elapsed = microtime( true ) - $startTime;
	$this->writeln( "Done. Processed {$this->totalCount} rows in {$elapsed} seconds" );

	return true;
}
106
/**
 * Resolve the --table option into the list of tables to populate.
 *
 * The option defaults to `all`. Any value other than `all`, `revision`
 * or `archive` is reported through fatalError().
 *
 * @return string[] [ 'revision', 'archive' ] for `all`, otherwise a
 *  single-element list holding the requested table name
 */
private function getTables() {
	$table = $this->getOption( 'table', 'all' );

	// Strict comparison: only the exact option strings are acceptable.
	if ( !in_array( $table, [ 'all', 'revision', 'archive' ], true ) ) {
		$this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
	}

	return $table === 'all' ? [ 'revision', 'archive' ] : [ $table ];
}
126
/**
 * Read the whole content table from a replica, one batch at a time, and
 * index it in $this->contentRowMap as
 * "{content_model}:{content_address}" => content_id.
 */
private function loadContentMap() {
	$startTime = microtime( true );
	$this->writeln( "Loading existing content table rows..." );

	$this->contentRowMap = [];
	$dbr = $this->getDB( DB_REPLICA );

	// Highest content_id seen so far; false until the first batch is read.
	$lastId = false;
	for ( ; ; ) {
		$conds = $lastId ? "content_id > $lastId" : '';
		$batch = $dbr->select(
			'content',
			[ 'content_id', 'content_address', 'content_model' ],
			$conds,
			__METHOD__,
			[ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ]
		);
		if ( !( $batch && $batch->numRows() ) ) {
			// Nothing left to read.
			break;
		}

		foreach ( $batch as $contentRow ) {
			$mapKey = $contentRow->content_model . ':' . $contentRow->content_address;
			$this->contentRowMap[$mapKey] = $contentRow->content_id;
			$lastId = $contentRow->content_id;
		}
	}

	$elapsed = microtime( true ) - $startTime;
	$loaded = count( $this->contentRowMap );
	$this->writeln( "Loaded " . $loaded . " rows in $elapsed seconds" );
}
152
/**
 * Populate the content and slots tables for every row of one source
 * table that does not have a slot row yet.
 *
 * The ID range of the table is walked in windows of the batch size. Each
 * window selects the rows without a matching slots row (LEFT JOIN plus
 * `slot_revision_id IS NULL`) and hands them to
 * populateContentTablesForRowBatch().
 *
 * @param string $table 'revision'; any other value is handled as 'archive'
 */
private function populateTable( $table ) {
	$t0 = microtime( true );
	$this->count = 0;
	$this->writeln( "Populating $table..." );

	// Both branches alias their columns to the same names so the batch
	// code can treat revision and archive rows alike.
	if ( $table === 'revision' ) {
		$idField = 'rev_id';
		$tables = [ 'revision', 'slots', 'page' ];
		$fields = [
			'rev_id',
			'len' => 'rev_len',
			'sha1' => 'rev_sha1',
			'text_id' => 'rev_text_id',
			'content_model' => 'rev_content_model',
			'namespace' => 'page_namespace',
			'title' => 'page_title',
		];
		$joins = [
			'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
			'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
		];
		$startOption = 'start-revision';
	} else {
		$idField = 'ar_rev_id';
		$tables = [ 'archive', 'slots' ];
		$fields = [
			'rev_id' => 'ar_rev_id',
			'len' => 'ar_len',
			'sha1' => 'ar_sha1',
			'text_id' => 'ar_text_id',
			'content_model' => 'ar_content_model',
			'namespace' => 'ar_namespace',
			'title' => 'ar_title',
		];
		$joins = [
			'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
		];
		$startOption = 'start-archive';
	}

	if ( !$this->dbw->fieldExists( $table, $fields['text_id'], __METHOD__ ) ) {
		$this->writeln( "No need to populate, $table.{$fields['text_id']} field does not exist" );
		return;
	}

	$minmax = $this->dbw->selectRow(
		$table,
		[ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
		'',
		__METHOD__
	);

	// Validate the query result BEFORE touching its properties: writing to
	// a property of `false` is an Error on PHP 8.
	if ( !$minmax || !is_numeric( $minmax->min ) || !is_numeric( $minmax->max ) ) {
		// No rows? Use an empty range so the loop below does not run.
		$minId = 1;
		$maxId = 0;
	} else {
		// An explicit start option overrides the lowest ID found.
		$minId = $this->hasOption( $startOption )
			? (int)$this->getOption( $startOption )
			: (int)$minmax->min;
		$maxId = (int)$minmax->max;
	}

	$batchSize = $this->getBatchSize();

	for ( $startId = $minId; $startId <= $maxId; $startId += $batchSize ) {
		$endId = min( $startId + $batchSize - 1, $maxId );
		$rows = $this->dbw->select(
			$tables,
			$fields,
			[
				"$idField >= $startId",
				"$idField <= $endId",
				'slot_revision_id IS NULL',
			],
			__METHOD__,
			[ 'ORDER BY' => 'rev_id' ],
			$joins
		);
		if ( $rows->numRows() !== 0 ) {
			$this->populateContentTablesForRowBatch( $rows, $startId, $table );
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln(
			"... $table processed up to revision id $endId of {$maxId}"
			. " ($this->count rows in $elapsed seconds)"
		);
	}

	$elapsed = microtime( true ) - $t0;
	$this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
}
245
/**
 * Insert the content and slots rows for one batch of revision/archive
 * rows inside a single transaction.
 *
 * On any exception the transaction is rolled back and the failure is
 * reported through fatalError().
 *
 * @param IResultWrapper $rows Rows exposing rev_id, text_id, len, sha1,
 *  content_model, namespace and title
 * @param int $startId First ID of the batch, used in the error message only
 * @param string $table Source table name, used in the error message only
 */
private function populateContentTablesForRowBatch( IResultWrapper $rows, $startId, $table ) {
	$this->beginTransaction( $this->dbw, __METHOD__ );

	// With a preloaded map, write through to it so later batches can reuse
	// the content rows; otherwise use a map that only lives for this batch.
	if ( $this->contentRowMap !== null ) {
		$contentIdByKey = &$this->contentRowMap;
	} else {
		$contentIdByKey = [];
	}
	$keyByRevision = [];

	try {
		// Step 1: Figure out content rows needing insertion.
		$pendingContent = [];
		foreach ( $rows as $revRow ) {
			$revId = $revRow->rev_id;

			Assert::invariant( $revId !== null, 'rev_id must not be null' );

			$modelName = $this->getContentModel( $revRow );
			$modelId = $this->contentModelStore->acquireId( $modelName );
			$blobAddress = SqlBlobStore::makeAddressFromTextId( $revRow->text_id );

			// The key is built before fillMissingFields() runs, so it always
			// uses the address without any 'bad:' prefix.
			$mapKey = $modelId . ':' . $blobAddress;
			$keyByRevision[$revId] = $mapKey;

			if ( isset( $contentIdByKey[$mapKey] ) ) {
				// Already known, or already queued earlier in this batch.
				continue;
			}

			$this->fillMissingFields( $revRow, $modelName, $blobAddress );

			// Placeholder until the real content_id is read back in step 2.
			$contentIdByKey[$mapKey] = false;
			$pendingContent[] = [
				'content_size' => (int)$revRow->len,
				'content_sha1' => $revRow->sha1,
				'content_model' => $modelId,
				'content_address' => $blobAddress,
			];
		}

		// Step 2: Insert them, then read them back in for use in the next step.
		if ( $pendingContent ) {
			$previousMaxId = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
			$this->dbw->insert( 'content', $pendingContent, __METHOD__ );
			$inserted = $this->dbw->select(
				'content',
				[ 'content_id', 'content_model', 'content_address' ],
				'content_id > ' . (int)$previousMaxId,
				__METHOD__
			);
			foreach ( $inserted as $contentRow ) {
				// Drop the 'bad:' marker so the key matches the one from step 1.
				$storedAddress = $contentRow->content_address;
				$plainAddress = substr( $storedAddress, 0, 4 ) === 'bad:'
					? substr( $storedAddress, 4 )
					: $storedAddress;
				$contentIdByKey["{$contentRow->content_model}:{$plainAddress}"] = $contentRow->content_id;
			}
		}

		// Step 3: Insert the slot rows.
		$newSlots = [];
		foreach ( $rows as $revRow ) {
			$revId = $revRow->rev_id;
			$contentId = $contentIdByKey[$keyByRevision[$revId]] ?? false;
			if ( $contentId === false ) {
				throw new \RuntimeException( "Content row for $revId not found after content insert" );
			}
			$newSlots[] = [
				'slot_revision_id' => $revId,
				'slot_role_id' => $this->mainRoleId,
				'slot_content_id' => $contentId,
				// There's no way to really know the previous revision, so assume no inheriting.
				// rev_parent_id can get changed on undeletions, and deletions can screw up
				// rev_timestamp ordering.
				'slot_origin' => $revId,
			];
		}
		$this->dbw->insert( 'slots', $newSlots, __METHOD__ );

		$insertedSlots = count( $newSlots );
		$this->count += $insertedSlots;
		$this->totalCount += $insertedSlots;
	} catch ( \Exception $e ) {
		$this->rollbackTransaction( $this->dbw, __METHOD__ );
		$this->fatalError( "Failed to populate content table $table row batch starting at $startId "
			. "due to exception: " . $e->__toString() );
	}

	$this->commitTransaction( $this->dbw, __METHOD__ );
}
339
/**
 * Determine the content model of a row.
 *
 * @param \stdClass $row Row exposing content_model, namespace and title
 * @return string The row's content_model when it is set and not null,
 *  otherwise the default model for the title built from namespace and title
 */
private function getContentModel( $row ) {
	return $row->content_model
		?? ContentHandler::getDefaultModelFor( Title::makeTitle( $row->namespace, $row->title ) );
}
353
/**
 * Output a message followed by a newline.
 *
 * @param string $msg Text to print
 */
private function writeln( $msg ) {
	$this->output( $msg . "\n" );
}
360
/**
 * Compute any missing fields in $row.
 *
 * Sets content_model when it is unset, and derives len and/or sha1 from
 * the blob when they are missing (sha1 also counts as missing when it is
 * an empty string).
 *
 * @param \stdClass $row Row to complete; modified in place
 * @param string $model Content model to use for the row
 * @param string &$address Blob address; prefixed with 'bad:' when the blob
 *  cannot be accessed
 */
private function fillMissingFields( $row, $model, &$address ) {
	if ( !isset( $row->content_model ) ) {
		// just for completeness
		$row->content_model = $model;
	}

	$lenMissing = !isset( $row->len );
	$sha1Missing = !isset( $row->sha1 ) || $row->sha1 === '';

	if ( !$lenMissing && !$sha1Missing ) {
		// No need to load the content, quit now.
		return;
	}

	try {
		$blob = $this->blobStore->getBlob( $address );
	} catch ( BlobAccessException $e ) {
		// Mark the address as bad and carry on with empty content.
		$address = 'bad:' . $address;
		$blob = '';
	}

	if ( $lenMissing ) {
		// NOTE: The nominal size of the content may not be the length of the raw blob.
		$row->len = ContentHandler::makeContent( $blob, null, $model )->getSize();
	}

	if ( $sha1Missing ) {
		$row->sha1 = SlotRecord::base36Sha1( $blob );
	}
}
396}
397
398$maintClass = PopulateContentTables::class;
399require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
const RUN_MAINTENANCE_IF_MAIN
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
rollbackTransaction(IDatabase $dbw, $fname)
Roll back the transaction on a DB handle.
setBatchSize( $s=0)
Set the batch size.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
MediaWikiServices is the service locator for the application scope of MediaWiki.
Value object representing a content slot associated with a page revision.
Exception representing a failure to access a data blob.
Service for storing and loading Content objects.
Populate the content and slot tables.
array null $contentRowMap
Map "{$modelId}:{$address}" to content_id.
populateContentTablesForRowBatch(IResultWrapper $rows, $startId, $table)
__construct()
Default constructor.
execute()
Do the actual work.
fillMissingFields( $row, $model, &$address)
Compute any missing fields in $row.
Service for loading and storing data blobs.
Definition BlobStore.php:35
Basic database interface for live and lazy-loaded relational database handles.
Definition IDatabase.php:38
Result wrapper for grabbing data queried from an IDatabase object.
const DB_REPLICA
Definition defines.php:25
const DB_MASTER
Definition defines.php:29