MediaWiki  master
populateContentTables.php
Go to the documentation of this file.
1 <?php
30 
31 require_once __DIR__ . '/Maintenance.php';
32 
38 
/** Database handle used for all reads and writes of this script; set from getDB( DB_MASTER ) in initServices(). */
private $dbw;

/**
 * Content model name store; set in initServices() and used to turn model
 * names into content_model IDs. Declared explicitly so it is not created
 * as a dynamic property on first assignment.
 */
private $contentModelStore;

/** Slot role name store; set in initServices(). */
private $slotRoleStore;

/** Blob store used by fillMissingFields() to load a blob when len or sha1 is missing. */
private $blobStore;

/** ID of the main slot role, acquired from the slot role store in initServices(). */
private $mainRoleId;

/**
 * @var array|null Map of "{$modelId}:{$address}" to content_id.
 * Stays null unless --reuse-content is given, in which case loadContentMap() fills it.
 */
private $contentRowMap = null;

/** Number of slot rows inserted for the table currently being populated. */
private $count = 0;

/** Number of slot rows inserted across all tables during this run. */
private $totalCount = 0;
58 
/**
 * Register the script description, its command line options and the
 * default batch size.
 */
public function __construct() {
	parent::__construct();

	$reuseContentHelp = 'Reuse content table rows when the address and model are the same. '
		. 'This will increase the script\'s time and memory usage, perhaps significantly.';

	$this->addDescription( 'Populate content and slot tables' );

	// Arguments after the description are: required, withArg.
	$this->addOption(
		'table',
		'revision or archive table, or `all` to populate both',
		false,
		true
	);
	$this->addOption( 'reuse-content', $reuseContentHelp, false, false );
	$this->addOption( 'start-revision', 'The rev_id to start at', false, true );
	$this->addOption( 'start-archive', 'The ar_rev_id to start at', false, true );

	$this->setBatchSize( 500 );
}
73 
/**
 * Fetch the database handle and the services this script works with, and
 * resolve the ID of the main slot role.
 */
private function initServices() {
	$this->dbw = $this->getDB( DB_MASTER );

	$services = MediaWikiServices::getInstance();
	$this->contentModelStore = $services->getContentModelStore();
	$this->slotRoleStore = $services->getSlotRoleStore();
	$this->blobStore = $services->getBlobStore();

	// Don't trust the cache for the NameTableStores, in case something went
	// wrong during a previous run (see T224949#5325895).
	$this->contentModelStore->reloadMap();
	$this->slotRoleStore->reloadMap();

	$this->mainRoleId = $this->slotRoleStore->acquireId( SlotRecord::MAIN );
}
86 
/**
 * Script entry point: populate the content and slots tables for the
 * selected source table(s).
 *
 * @return bool False if the configured migration stage does not have the
 *  SCHEMA_COMPAT_WRITE_NEW bit set (nothing is written), true otherwise.
 */
public function execute() {
	$migrationStage = $this->getConfig()->get( 'MultiContentRevisionSchemaMigrationStage' );

	$t0 = microtime( true );

	if ( ( $migrationStage & SCHEMA_COMPAT_WRITE_NEW ) === 0 ) {
		// Single-quoted strings do not interpolate, so the dollar sign needs
		// no escaping; a backslash here would be printed literally.
		$this->writeln(
			'...cannot update while $wgMultiContentRevisionSchemaMigrationStage '
			. 'does not have the SCHEMA_COMPAT_WRITE_NEW bit set.'
		);
		return false;
	}

	$this->initServices();

	// Optionally preload existing content rows so they can be reused.
	if ( $this->getOption( 'reuse-content', false ) ) {
		$this->loadContentMap();
	}

	foreach ( $this->getTables() as $table ) {
		$this->populateTable( $table );
	}

	$elapsed = microtime( true ) - $t0;
	$this->writeln( "Done. Processed $this->totalCount rows in $elapsed seconds" );
	return true;
}
115 
/**
 * Resolve the --table option into the list of tables to populate.
 *
 * Calls fatalError() when the option is not one of `all`, `revision` or
 * `archive`.
 *
 * @return string[] [ 'revision', 'archive' ] for `all` (the default),
 *  otherwise a one-element list holding the requested table.
 */
private function getTables() {
	$table = $this->getOption( 'table', 'all' );
	$validTableOptions = [ 'all', 'revision', 'archive' ];

	// Strict comparison: only the exact strings are accepted, so a
	// non-string value can never loosely match one of the names.
	if ( !in_array( $table, $validTableOptions, true ) ) {
		$this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
	}

	return $table === 'all' ? [ 'revision', 'archive' ] : [ $table ];
}
135 
/**
 * Read the whole content table in batches and index every row in
 * $this->contentRowMap under the key "{content_model}:{content_address}".
 */
private function loadContentMap() {
	$startTime = microtime( true );
	$this->writeln( "Loading existing content table rows..." );
	$this->contentRowMap = [];
	$dbr = $this->getDB( DB_REPLICA );

	// Highest content_id seen so far; false until the first batch is read.
	$lastId = false;
	do {
		$res = $dbr->select(
			'content',
			[ 'content_id', 'content_address', 'content_model' ],
			$lastId ? "content_id > $lastId" : '',
			__METHOD__,
			[ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ]
		);

		$gotRows = $res && $res->numRows();
		if ( $gotRows ) {
			foreach ( $res as $contentRow ) {
				$lastId = $contentRow->content_id;
				$mapKey = "{$contentRow->content_model}:{$contentRow->content_address}";
				$this->contentRowMap[$mapKey] = $contentRow->content_id;
			}
		}
	} while ( $gotRows );

	$elapsed = microtime( true ) - $startTime;
	$this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $elapsed seconds" );
}
161 
/**
 * Populate content and slot rows for every row of the given table that
 * has no slot row yet, walking the ID range in batches.
 *
 * @param string $table 'revision' selects the revision table; any other
 *  value is handled as the archive table.
 */
private function populateTable( $table ) {
	$t0 = microtime( true );
	$this->count = 0;
	$this->writeln( "Populating $table..." );

	// Both tables are selected with the same field aliases so the batch
	// handler can treat their rows alike.
	if ( $table === 'revision' ) {
		$idField = 'rev_id';
		$tables = [ 'revision', 'slots', 'page' ];
		$fields = [
			'rev_id',
			'len' => 'rev_len',
			'sha1' => 'rev_sha1',
			'text_id' => 'rev_text_id',
			'content_model' => 'rev_content_model',
			'namespace' => 'page_namespace',
			'title' => 'page_title',
		];
		$joins = [
			'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
			'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
		];
		$startOption = 'start-revision';
	} else {
		$idField = 'ar_rev_id';
		$tables = [ 'archive', 'slots' ];
		$fields = [
			'rev_id' => 'ar_rev_id',
			'len' => 'ar_len',
			'sha1' => 'ar_sha1',
			'text_id' => 'ar_text_id',
			'content_model' => 'ar_content_model',
			'namespace' => 'ar_namespace',
			'title' => 'ar_title',
		];
		$joins = [
			'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
		];
		$startOption = 'start-archive';
	}

	if ( !$this->dbw->fieldExists( $table, $fields['text_id'], __METHOD__ ) ) {
		$this->writeln( "No need to populate, $table.{$fields['text_id']} field does not exist" );
		return;
	}

	$minmax = $this->dbw->selectRow(
		$table,
		[ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
		'',
		__METHOD__
	);

	// Validate the query result BEFORE touching its properties: writing
	// to $minmax->min when selectRow() returned false would operate on a
	// non-object.
	if ( !$minmax || !is_numeric( $minmax->min ) || !is_numeric( $minmax->max ) ) {
		// No rows? Use an empty range so the loop below does not run.
		$minmax = (object)[ 'min' => 1, 'max' => 0 ];
	} elseif ( $this->hasOption( $startOption ) ) {
		// Only meaningful when the table has rows.
		$minmax->min = (int)$this->getOption( $startOption );
	}

	$batchSize = $this->getBatchSize();

	for ( $startId = $minmax->min; $startId <= $minmax->max; $startId += $batchSize ) {
		$endId = min( $startId + $batchSize - 1, $minmax->max );

		// The LEFT JOIN plus "slot_revision_id IS NULL" keeps only rows
		// that have no slot row yet.
		$rows = $this->dbw->select(
			$tables,
			$fields,
			[
				"$idField >= $startId",
				"$idField <= $endId",
				'slot_revision_id IS NULL',
			],
			__METHOD__,
			[ 'ORDER BY' => 'rev_id' ],
			$joins
		);
		if ( $rows->numRows() !== 0 ) {
			$this->populateContentTablesForRowBatch( $rows, $startId, $table );
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln(
			"... $table processed up to revision id $endId of {$minmax->max}"
			. " ($this->count rows in $elapsed seconds)"
		);
	}

	$elapsed = microtime( true ) - $t0;
	$this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
}
254 
/**
 * Create the content and slot rows for one batch of source rows inside a
 * single transaction. On any exception the transaction is rolled back and
 * the script ends through fatalError().
 *
 * @param IResultWrapper $rows Batch selected by populateTable(); each row
 *  provides rev_id, len, sha1, text_id and the fields read by getContentModel()
 * @param int|string $startId First ID of the batch, used only in the failure message
 * @param string $table Table being populated, used only in the failure message
 */
private function populateContentTablesForRowBatch( IResultWrapper $rows, $startId, $table ) {
	$this->beginTransaction( $this->dbw, __METHOD__ );

	// When a preloaded content map exists, work on it by reference so
	// new entries carry over to later batches; otherwise use a map that
	// lives only for this batch.
	if ( $this->contentRowMap !== null ) {
		$contentIdByKey = &$this->contentRowMap;
	} else {
		$contentIdByKey = [];
	}
	$keyByRevision = [];

	try {
		// Step 1: Figure out content rows needing insertion.
		$pendingContent = [];
		foreach ( $rows as $sourceRow ) {
			$revId = $sourceRow->rev_id;

			Assert::invariant( $revId !== null, 'rev_id must not be null' );

			$modelName = $this->getContentModel( $sourceRow );
			$modelId = $this->contentModelStore->acquireId( $modelName );
			$blobAddress = SqlBlobStore::makeAddressFromTextId( $sourceRow->text_id );

			$mapKey = "{$modelId}:{$blobAddress}";
			$keyByRevision[$revId] = $mapKey;

			if ( isset( $contentIdByKey[$mapKey] ) ) {
				// A content row for this model/address is already known.
				continue;
			}

			$this->fillMissingFields( $sourceRow, $modelName, $blobAddress );

			// Placeholder until the real content_id is read back in step 2.
			$contentIdByKey[$mapKey] = false;
			$pendingContent[] = [
				'content_size' => (int)$sourceRow->len,
				'content_sha1' => $sourceRow->sha1,
				'content_model' => $modelId,
				'content_address' => $blobAddress,
			];
		}

		// Step 2: Insert them, then read them back in for use in the next step.
		if ( $pendingContent ) {
			$maxIdBefore = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
			$this->dbw->insert( 'content', $pendingContent, __METHOD__ );
			$inserted = $this->dbw->select(
				'content',
				[ 'content_id', 'content_model', 'content_address' ],
				'content_id > ' . (int)$maxIdBefore,
				__METHOD__
			);
			foreach ( $inserted as $contentRow ) {
				$insertedKey = $contentRow->content_model . ':' . $contentRow->content_address;
				$contentIdByKey[$insertedKey] = $contentRow->content_id;
			}
		}

		// Step 3: Insert the slot rows.
		$slotRows = [];
		foreach ( $rows as $sourceRow ) {
			$revId = $sourceRow->rev_id;
			$contentId = $contentIdByKey[$keyByRevision[$revId]] ?? false;
			if ( $contentId === false ) {
				throw new \RuntimeException( "Content row for $revId not found after content insert" );
			}
			$slotRows[] = [
				'slot_revision_id' => $revId,
				'slot_role_id' => $this->mainRoleId,
				'slot_content_id' => $contentId,
				// There's no way to really know the previous revision, so assume no inheriting.
				// rev_parent_id can get changed on undeletions, and deletions can screw up
				// rev_timestamp ordering.
				'slot_origin' => $revId,
			];
		}
		$this->dbw->insert( 'slots', $slotRows, __METHOD__ );

		$insertedSlots = count( $slotRows );
		$this->count += $insertedSlots;
		$this->totalCount += $insertedSlots;
	} catch ( \Exception $e ) {
		$this->rollbackTransaction( $this->dbw, __METHOD__ );
		$this->fatalError( "Failed to populate content table $table row batch starting at $startId "
			. "due to exception: " . $e->__toString() );
	}

	$this->commitTransaction( $this->dbw, __METHOD__ );
}
344 
/**
 * Determine the content model name for a source row.
 *
 * @param \stdClass $row Row with an optional content_model field plus
 *  namespace and title fields
 * @return string The row's own content_model when set, otherwise the
 *  default model for the page title built from the row
 */
private function getContentModel( $row ) {
	if ( isset( $row->content_model ) ) {
		return $row->content_model;
	}

	$title = Title::makeTitle( $row->namespace, $row->title );

	// Without this return the method would yield null for every row that
	// has no explicit content model.
	return ContentHandler::getDefaultModelFor( $title );
}
358 
/**
 * Output a message followed by a newline.
 *
 * @param string $msg Text to print
 */
private function writeln( $msg ) {
	$this->output( $msg . "\n" );
}
365 
/**
 * Compute any missing fields in $row. The row object is modified in
 * place: content_model, len and sha1 are set when absent (sha1 also when
 * it is the empty string).
 *
 * @param \stdClass $row Source row to complete
 * @param string $model Content model name, used to pick the content handler
 * @param string $address Blob address to load when len or sha1 is missing
 */
private function fillMissingFields( $row, $model, $address ) {
	if ( !isset( $row->content_model ) ) {
		// just for completeness
		$row->content_model = $model;
	}

	$lenMissing = !isset( $row->len );
	$sha1Missing = !isset( $row->sha1 ) || $row->sha1 === '';

	if ( !$lenMissing && !$sha1Missing ) {
		// Nothing to compute, so the blob does not have to be loaded.
		return;
	}

	$blob = $this->blobStore->getBlob( $address );

	if ( $lenMissing ) {
		// NOTE: The nominal size of the content may not be the length of the raw blob.
		$content = ContentHandler::getForModelID( $model )->unserializeContent( $blob );
		$row->len = $content->getSize();
	}

	if ( $sha1Missing ) {
		$row->sha1 = SlotRecord::base36Sha1( $blob );
	}
}
399 }
400 
// Name the maintenance class defined in this file, then include the file
// referenced by the RUN_MAINTENANCE_IF_MAIN constant
// (presumably the runner that executes the script when invoked directly — confirm).
$maintClass = 'PopulateContentTables';
require_once RUN_MAINTENANCE_IF_MAIN;
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
getOption( $name, $default=null)
Get an option, or return the default.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:86
rollbackTransaction(IDatabase $dbw, $fname)
Roll back the transaction on a DB handle.
static getDefaultModelFor(Title $title)
Returns the name of the default content model to be used for the page with the given title...
Populate the content and slot tables.
setBatchSize( $s=0)
Set the batch size.
hasOption( $name)
Checks to see if a particular option exists.
const DB_MASTER
Definition: defines.php:26
static getForModelID( $modelId)
Returns the ContentHandler singleton for the given model ID.
fillMissingFields( $row, $model, $address)
Compute any missing fields in $row.
const SCHEMA_COMPAT_WRITE_NEW
Definition: Defines.php:266
addDescription( $text)
Set the description text.
Result wrapper for grabbing data queried from an IDatabase object.
output( $out, $channel=null)
Throw some output to the user.
static makeTitle( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:586
getBatchSize()
Returns batch size.
populateContentTablesForRowBatch(IResultWrapper $rows, $startId, $table)
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
const DB_REPLICA
Definition: defines.php:25
$content
Definition: router.php:78
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
array null $contentRowMap
Map "{$modelId}:{$address}" to content_id.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.