MediaWiki  master
populateContentTables.php
Go to the documentation of this file.
1 <?php
28 use Wikimedia\Assert\Assert;
31 
32 require_once __DIR__ . '/Maintenance.php';
33 
39 
/** @var IDatabase Primary database handle; assigned in initServices(). */
private $dbw;

/**
 * @var NameTableStore Content model name store; assigned in initServices().
 * Declared explicitly because initServices() assigns it and the row batch
 * code reads it; without a declaration it would be a dynamic property.
 */
private $contentModelStore;

/** @var NameTableStore Slot role name store; assigned in initServices(). */
private $slotRoleStore;

/** @var BlobStore Used to load blobs when a row lacks its length or sha1. */
private $blobStore;

/** @var int ID acquired from the slot role store for SlotRecord::MAIN. */
private $mainRoleId;

/**
 * @var array|null Map "{$modelId}:{$address}" to content_id. Stays null
 * unless loadContentMap() has been called (the reuse-content option).
 */
private $contentRowMap = null;

/** @var int Slot rows inserted for the table currently being populated. */
private $count = 0;

/** @var int Slot rows inserted over the whole run, across all tables. */
private $totalCount = 0;
59 
/**
 * Set up the script: description, command line options and the default
 * batch size of 500.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Populate content and slot tables' );

	// Which table(s) to process; optional, takes a value.
	$this->addOption(
		'table',
		'revision or archive table, or `all` to populate both',
		false,
		true
	);

	// Optional flag without a value.
	$reuseContentHelp = "Reuse content table rows when the address and model are the same. "
		. "This will increase the script's time and memory usage, perhaps significantly.";
	$this->addOption( 'reuse-content', $reuseContentHelp, false, false );

	// Optional starting IDs, one per table; both take a value.
	$this->addOption( 'start-revision', 'The rev_id to start at', false, true );
	$this->addOption( 'start-archive', 'The ar_rev_id to start at', false, true );

	$this->setBatchSize( 500 );
}
74 
/**
 * Fetch the primary DB handle and the services this script works with,
 * and resolve the ID of the main slot role.
 */
private function initServices() {
	$this->dbw = $this->getDB( DB_PRIMARY );

	$serviceContainer = MediaWikiServices::getInstance();
	$this->contentModelStore = $serviceContainer->getContentModelStore();
	$this->slotRoleStore = $serviceContainer->getSlotRoleStore();
	$this->blobStore = $serviceContainer->getBlobStore();

	// Don't trust the cache for the NameTableStores, in case something went
	// wrong during a previous run (see T224949#5325895).
	foreach ( [ $this->contentModelStore, $this->slotRoleStore ] as $nameTableStore ) {
		$nameTableStore->reloadMap();
	}

	$this->mainRoleId = $this->slotRoleStore->acquireId( SlotRecord::MAIN );
}
88 
/**
 * Script entry point: initialise services, optionally preload the content
 * map, populate each selected table and report the total.
 *
 * @return bool Always true
 */
public function execute() {
	$startedAt = microtime( true );

	$this->initServices();

	$reuseContent = $this->getOption( 'reuse-content', false );
	if ( $reuseContent ) {
		$this->loadContentMap();
	}

	foreach ( $this->getTables() as $tableName ) {
		$this->populateTable( $tableName );
	}

	$elapsed = microtime( true ) - $startedAt;
	$this->writeln( 'Done. Processed ' . $this->totalCount . ' rows in ' . $elapsed . ' seconds' );

	return true;
}
106 
/**
 * Translate the `table` option into the list of tables to process.
 * An unrecognised value is reported through fatalError().
 *
 * @return string[] [ 'revision', 'archive' ] for `all` (the default),
 *  otherwise a single-element list holding the requested table
 */
private function getTables() {
	$requested = $this->getOption( 'table', 'all' );

	if ( !in_array( $requested, [ 'all', 'revision', 'archive' ] ) ) {
		$this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
	}

	return $requested === 'all' ? [ 'revision', 'archive' ] : [ $requested ];
}
126 
/**
 * Read the existing content table in batches, ordered by content_id, and
 * record each row in $this->contentRowMap under "<content_model>:<content_address>".
 */
private function loadContentMap() {
	$startedAt = microtime( true );
	$this->writeln( "Loading existing content table rows..." );

	$this->contentRowMap = [];
	$dbr = $this->getDB( DB_REPLICA );

	// Highest content_id seen so far; false until the first batch was read.
	$lastId = false;
	do {
		$batch = $dbr->select(
			'content',
			[ 'content_id', 'content_address', 'content_model' ],
			$lastId ? "content_id > $lastId" : '',
			__METHOD__,
			[ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ]
		);

		$hasRows = $batch && $batch->numRows();
		if ( $hasRows ) {
			foreach ( $batch as $contentRow ) {
				$lastId = $contentRow->content_id;
				$mapKey = $contentRow->content_model . ':' . $contentRow->content_address;
				$this->contentRowMap[$mapKey] = $contentRow->content_id;
			}
		}
	} while ( $hasRows );

	$elapsed = microtime( true ) - $startedAt;
	$this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $elapsed seconds" );
}
152 
/**
 * Create content and slot rows for every row of $table that has no slot row.
 *
 * The ID range of the table is walked in chunks of the configured batch
 * size. Each chunk that yields rows is passed to
 * populateContentTablesForRowBatch(), and progress is written after every
 * chunk. Nothing is done when the table's text_id field no longer exists.
 *
 * @param string $table 'revision'; any other value selects the archive table
 */
private function populateTable( $table ) {
	$t0 = microtime( true );
	$this->count = 0;
	$this->writeln( "Populating $table..." );

	// Both tables are selected under common field aliases so the batch code
	// can treat their rows alike.
	if ( $table === 'revision' ) {
		$idField = 'rev_id';
		$tables = [ 'revision', 'slots', 'page' ];
		$fields = [
			'rev_id',
			'len' => 'rev_len',
			'sha1' => 'rev_sha1',
			'text_id' => 'rev_text_id',
			'content_model' => 'rev_content_model',
			'namespace' => 'page_namespace',
			'title' => 'page_title',
		];
		$joins = [
			'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
			'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
		];
		$startOption = 'start-revision';
	} else {
		$idField = 'ar_rev_id';
		$tables = [ 'archive', 'slots' ];
		$fields = [
			'rev_id' => 'ar_rev_id',
			'len' => 'ar_len',
			'sha1' => 'ar_sha1',
			'text_id' => 'ar_text_id',
			'content_model' => 'ar_content_model',
			'namespace' => 'ar_namespace',
			'title' => 'ar_title',
		];
		$joins = [
			'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
		];
		$startOption = 'start-archive';
	}

	if ( !$this->dbw->fieldExists( $table, $fields['text_id'], __METHOD__ ) ) {
		$this->writeln( "No need to populate, $table.{$fields['text_id']} field does not exist" );
		return;
	}

	$minmax = $this->dbw->selectRow(
		$table,
		[ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
		'',
		__METHOD__
	);
	// Apply the start override only when the query produced a row: the check
	// below allows for a false result, so it must not be dereferenced first.
	if ( $minmax && $this->hasOption( $startOption ) ) {
		$minmax->min = (int)$this->getOption( $startOption );
	}
	if ( !$minmax || !is_numeric( $minmax->min ) || !is_numeric( $minmax->max ) ) {
		// No rows? Use an empty range so the loop below does not run.
		$minmax = (object)[ 'min' => 1, 'max' => 0 ];
	}

	$batchSize = $this->getBatchSize();

	for ( $startId = (int)$minmax->min; $startId <= $minmax->max; $startId += $batchSize ) {
		$endId = (int)min( $startId + $batchSize - 1, $minmax->max );
		// The LEFT JOIN plus the IS NULL condition selects only rows that
		// have no slot row.
		$rows = $this->dbw->select(
			$tables,
			$fields,
			[
				"$idField >= $startId",
				"$idField <= $endId",
				'slot_revision_id IS NULL',
			],
			__METHOD__,
			[ 'ORDER BY' => 'rev_id' ],
			$joins
		);
		if ( $rows->numRows() !== 0 ) {
			$this->populateContentTablesForRowBatch( $rows, $startId, $table );
		}

		$elapsed = microtime( true ) - $t0;
		$this->writeln(
			"... $table processed up to revision id $endId of {$minmax->max}"
			. " ($this->count rows in $elapsed seconds)"
		);
	}

	$elapsed = microtime( true ) - $t0;
	$this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
}
245 
/**
 * Insert the content and slot rows for one batch of revision/archive rows,
 * inside a single transaction.
 *
 * Any exception rolls the transaction back and ends the script through
 * fatalError(). A revision whose missing length/sha1 cannot be computed
 * (fillMissingFields() returned false after reporting the error) is left
 * out of the batch, so one unreadable blob does not abort the run.
 *
 * @param IResultWrapper $rows Rows selected by populateTable(), using the
 *  aliases rev_id, len, sha1, text_id, content_model, namespace and title
 * @param int $startId First ID of the batch; only used in the error message
 * @param string $table Table being populated; only used in the error message
 */
private function populateContentTablesForRowBatch( IResultWrapper $rows, $startId, $table ) {
	$this->beginTransaction( $this->dbw, __METHOD__ );

	// With a preloaded content map, work on it by reference so content rows
	// created here are remembered for later batches.
	if ( $this->contentRowMap === null ) {
		$map = [];
	} else {
		$map = &$this->contentRowMap;
	}
	// Revision ID => map key. Holds only the revisions that get a slot row.
	$contentKeys = [];

	try {
		// Step 1: Figure out content rows needing insertion.
		$contentRows = [];
		foreach ( $rows as $row ) {
			$revisionId = $row->rev_id;

			Assert::invariant( $revisionId !== null, 'rev_id must not be null' );

			$model = $this->getContentModel( $row );
			$modelId = $this->contentModelStore->acquireId( $model );
			$address = SqlBlobStore::makeAddressFromTextId( $row->text_id );

			$key = "{$modelId}:{$address}";

			if ( !isset( $map[$key] ) ) {
				if ( !$this->fillMissingFields( $row, $model, $address ) ) {
					// The error was already reported. No content key is
					// recorded, so step 3 skips this revision instead of
					// throwing for it.
					continue;
				}

				// Placeholder; replaced by the real content_id in step 2.
				$map[$key] = false;
				$contentRows[] = [
					'content_size' => (int)$row->len,
					'content_sha1' => $row->sha1,
					'content_model' => $modelId,
					'content_address' => $address,
				];
			}

			$contentKeys[$revisionId] = $key;
		}

		// Step 2: Insert them, then read them back in for use in the next step.
		if ( $contentRows ) {
			$id = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
			$this->dbw->insert( 'content', $contentRows, __METHOD__ );
			$res = $this->dbw->select(
				'content',
				[ 'content_id', 'content_model', 'content_address' ],
				'content_id > ' . (int)$id,
				__METHOD__
			);
			foreach ( $res as $row ) {
				$key = $row->content_model . ':' . $row->content_address;
				$map[$key] = $row->content_id;
			}
		}

		// Step 3: Insert the slot rows.
		$slotRows = [];
		foreach ( $rows as $row ) {
			$revisionId = $row->rev_id;
			if ( !isset( $contentKeys[$revisionId] ) ) {
				// Skipped in step 1.
				continue;
			}
			$contentId = $map[$contentKeys[$revisionId]] ?? false;
			if ( $contentId === false ) {
				throw new \RuntimeException( "Content row for $revisionId not found after content insert" );
			}
			$slotRows[] = [
				'slot_revision_id' => $revisionId,
				'slot_role_id' => $this->mainRoleId,
				'slot_content_id' => $contentId,
				// There's no way to really know the previous revision, so assume no inheriting.
				// rev_parent_id can get changed on undeletions, and deletions can screw up
				// rev_timestamp ordering.
				'slot_origin' => $revisionId,
			];
		}
		// Every revision of the batch may have been skipped.
		if ( $slotRows ) {
			$this->dbw->insert( 'slots', $slotRows, __METHOD__ );
		}
		$this->count += count( $slotRows );
		$this->totalCount += count( $slotRows );
	} catch ( \Exception $e ) {
		$this->rollbackTransaction( $this->dbw, __METHOD__ );
		$this->fatalError( "Failed to populate content table $table row batch starting at $startId "
			. "due to exception: " . $e->__toString() );
	}

	$this->commitTransaction( $this->dbw, __METHOD__ );
}
338 
/**
 * Determine the content model name for a row.
 *
 * @param \stdClass $row Row with the aliases content_model, namespace and title
 * @return string The row's own content_model when set, otherwise the default
 *  model for the title built from the row's namespace and title
 */
private function getContentModel( $row ) {
	if ( isset( $row->content_model ) ) {
		return $row->content_model;
	}

	$title = Title::makeTitle( $row->namespace, $row->title );

	// Without this return the fallback path yields null, which the caller
	// would then pass on as the model name.
	return ContentHandler::getDefaultModelFor( $title );
}
352 
/**
 * Output a message followed by a newline.
 *
 * @param string $msg
 */
private function writeln( $msg ) {
	$this->output( $msg . "\n" );
}
359 
/**
 * Compute any missing fields in $row. The row object is modified in place:
 * content_model, len and sha1 are filled in when absent.
 *
 * @param \stdClass $row
 * @param string $model Content model name
 * @param string $address Blob address used to load the content if needed
 * @return bool False if the blob was needed but could not be loaded (the
 *  error is reported through error()), true otherwise
 */
private function fillMissingFields( $row, $model, $address ) {
	if ( !isset( $row->content_model ) ) {
		// just for completeness
		$row->content_model = $model;
	}

	$lenMissing = !isset( $row->len );
	$sha1Missing = !isset( $row->sha1 ) || $row->sha1 === '';

	if ( !$lenMissing && !$sha1Missing ) {
		// Nothing to compute, so there is no need to load the content.
		return true;
	}

	try {
		$blob = $this->blobStore->getBlob( $address );
	} catch ( BlobAccessException $e ) {
		$this->error( $e->getMessage() );
		return false;
	}

	if ( $lenMissing ) {
		// NOTE: The nominal size of the content may not be the length of the raw blob.
		$content = ContentHandler::makeContent( $blob, null, $model );
		$row->len = $content->getSize();
	}

	if ( $sha1Missing ) {
		$row->sha1 = SlotRecord::base36Sha1( $blob );
	}

	return true;
}
398 }
399 
// Name the maintenance class defined in this file, then include the file that
// RUN_MAINTENANCE_IF_MAIN points to. Presumably that include runs the class when
// this script is the entry point; the constant is defined outside this file (confirm).
400 $maintClass = PopulateContentTables::class;
401 require_once RUN_MAINTENANCE_IF_MAIN;
PopulateContentTables\$totalCount
$totalCount
Definition: populateContentTables.php:58
PopulateContentTables\$blobStore
BlobStore $blobStore
Definition: populateContentTables.php:50
MediaWiki\Storage\BlobAccessException
Exception representing a failure to access a data blob.
Definition: BlobAccessException.php:33
PopulateContentTables\$count
$count
Definition: populateContentTables.php:58
PopulateContentTables\$slotRoleStore
NameTableStore $slotRoleStore
Definition: populateContentTables.php:47
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:193
Maintenance\fatalError
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Definition: Maintenance.php:489
MediaWiki\Storage\SqlBlobStore
Service for storing and loading Content objects.
Definition: SqlBlobStore.php:52
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:329
PopulateContentTables\$mainRoleId
int $mainRoleId
Definition: populateContentTables.php:53
PopulateContentTables\loadContentMap
loadContentMap()
Definition: populateContentTables.php:127
PopulateContentTables\populateTable
populateTable( $table)
Definition: populateContentTables.php:156
$maintClass
$maintClass
Definition: populateContentTables.php:400
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:59
PopulateContentTables\fillMissingFields
fillMissingFields( $row, $model, $address)
Compute any missing fields in $row.
Definition: populateContentTables.php:369
PopulateContentTables\$contentRowMap
array|null $contentRowMap
Map "{$modelId}:{$address}" to content_id.
Definition: populateContentTables.php:56
$res
$res
Definition: testCompression.php:57
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
PopulateContentTables\$dbw
IDatabase $dbw
Definition: populateContentTables.php:41
$dbr
$dbr
Definition: testCompression.php:54
PopulateContentTables\populateContentTablesForRowBatch
populateContentTablesForRowBatch(IResultWrapper $rows, $startId, $table)
Definition: populateContentTables.php:252
Maintenance\rollbackTransaction
rollbackTransaction(IDatabase $dbw, $fname)
Rollback the transaction on a DB handle.
Definition: Maintenance.php:1438
Maintenance\beginTransaction
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.
Definition: Maintenance.php:1393
PopulateContentTables
Populate the content and slot tables.
Definition: populateContentTables.php:38
PopulateContentTables\writeln
writeln( $msg)
Definition: populateContentTables.php:356
ContentHandler\getDefaultModelFor
static getDefaultModelFor(Title $title)
Returns the name of the default content model to be used for the page with the given title.
Definition: ContentHandler.php:198
PopulateContentTables\$contentModelStore
NameTableStore $contentModelStore
Definition: populateContentTables.php:44
Wikimedia\Rdbms\IResultWrapper
Result wrapper for grabbing data queried from an IDatabase object.
Definition: IResultWrapper.php:26
PopulateContentTables\getTables
getTables()
Definition: populateContentTables.php:110
$blob
$blob
Definition: testCompression.php:70
PopulateContentTables\__construct
__construct()
Default constructor.
Definition: populateContentTables.php:60
PopulateContentTables\execute
execute()
Do the actual work.
Definition: populateContentTables.php:89
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Definition: Maintenance.php:249
$title
$title
Definition: testCompression.php:38
Title\makeTitle
static makeTitle( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:651
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
ContentHandler\makeContent
static makeContent( $text, Title $title=null, $modelId=null, $format=null)
Convenience function for creating a Content object from a given textual representation.
Definition: ContentHandler.php:146
PopulateContentTables\initServices
initServices()
Definition: populateContentTables.php:75
DB_PRIMARY
const DB_PRIMARY
Definition: defines.php:27
Maintenance\commitTransaction
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
Definition: Maintenance.php:1408
MediaWiki\Storage\NameTableStore
Definition: NameTableStore.php:36
Maintenance\getDB
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Definition: Maintenance.php:1362
MediaWiki\Storage\BlobStore
Service for loading and storing data blobs.
Definition: BlobStore.php:35
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:286
Maintenance\getBatchSize
getBatchSize()
Returns batch size.
Definition: Maintenance.php:368
Maintenance\error
error( $err, $die=0)
Throw an error to the user.
Definition: Maintenance.php:464
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:435
PopulateContentTables\getContentModel
getContentModel( $row)
Definition: populateContentTables.php:343
Maintenance\hasOption
hasOption( $name)
Checks to see if a particular option was set.
Definition: Maintenance.php:271
MediaWiki\Revision\SlotRecord
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
Maintenance\setBatchSize
setBatchSize( $s=0)
Definition: Maintenance.php:375