MediaWiki  master
moveToExternal.php
Go to the documentation of this file.
1 <?php
27 use Wikimedia\AtEase\AtEase;
28 
29 require_once __DIR__ . '/../Maintenance.php';
30 
31 class MoveToExternal extends Maintenance {
/** @var ResolveStubs Helper that resolves the HistoryBlobStub rows collected during the move */
private $resolveStubs;

/** @var int|string Progress is reported every this many iterations (--reporting-interval) */
private $reportingInterval;

/** @var int Lowest old_id to process (--start) */
private $minID;

/** @var int Highest old_id to process (--end, or MAX(old_id) of the text table) */
private $maxID;

/** @var string External store type, e.g. "DB" or "mwstore" (first argument) */
private $esType;

/** @var string External store location, e.g. "cluster12" or "global-swift" (second argument) */
private $esLocation;

/** @var int Plain text rows shorter than this many bytes are not moved (--threshold) */
private $threshold;

/** @var bool Whether moved text is compressed with gzdeflate() (false with --skip-gzip) */
private $gzip;

/** @var bool|null Value of --skip-resolve; when set, HistoryBlobStub rows are left alone */
private $skipResolve;

/** @var string|null Legacy encoding to convert from; stays unset unless --iconv is given and one is configured */
private $legacyEncoding;

/** @var bool Value of --dry-run; when set, rows are only reported, not written */
private $dryRun;

/** @var UndoLog Performs the row updates, created from the --undo filename */
private $undoLog;
56 
/**
 * Declare the default batch size, the command-line options and the two
 * positional arguments accepted by the script.
 */
public function __construct() {
	parent::__construct();

	$this->setBatchSize( 1000 );

	// Options that take a value: [ name, description, short name (false for none) ].
	$valueOptions = [
		[ 'start', 'start old_id', 's' ],
		[ 'end', 'end old_id', 'e' ],
		[ 'threshold', 'minimum size in bytes', false ],
		[ 'reporting-interval', 'show a message after this many revisions', false ],
		[ 'undo', 'filename for undo SQL', false ],
	];
	foreach ( $valueOptions as [ $optName, $optDescription, $optShortName ] ) {
		$this->addOption( $optName, $optDescription, false, true, $optShortName );
	}

	// Plain switches that take no value.
	$switches = [
		'skip-gzip' => 'Don\'t compress individual revisions',
		'skip-resolve' =>
			'Don\'t replace HistoryBlobStub objects with direct external store pointers',
		'iconv' => 'Resolve legacy character encoding',
		'dry-run' => 'Don\'t modify any rows',
	];
	foreach ( $switches as $switchName => $switchDescription ) {
		$this->addOption( $switchName, $switchDescription );
	}

	$this->addArg( 'type', 'The external store type, e.g. "DB" or "mwstore"' );
	$this->addArg( 'location', 'e.g. "cluster12" or "global-swift"' );
}
78 
/**
 * Read the arguments and options into the object's properties, open the
 * undo log and start the move.
 *
 * Terminates through fatalError() when compression is wanted but
 * gzdeflate() does not exist, or when the undo log cannot be opened.
 */
public function execute() {
	$this->resolveStubs = new ResolveStubs;
	$this->esType = $this->getArg( 0 ); // e.g. "DB" or "mwstore"
	$this->esLocation = $this->getArg( 1 ); // e.g. "cluster12" or "global-swift"
	$dbw = $this->getDB( DB_PRIMARY );

	// Without --end, process up to the highest old_id currently in the text table.
	$maxID = $this->getOption( 'end' );
	if ( $maxID === null ) {
		$maxID = $dbw->newSelectQueryBuilder()
			->select( 'MAX(old_id)' )
			->from( 'text' )
			->caller( __METHOD__ )->fetchField();
	}
	$this->maxID = (int)$maxID;
	$this->minID = (int)$this->getOption( 'start', 1 );

	// Cast like the other numeric options: the interval is used as a modulo
	// operand later, which must not receive a non-numeric string.
	$this->reportingInterval = (int)$this->getOption( 'reporting-interval', 100 );
	$this->threshold = (int)$this->getOption( 'threshold', 0 );

	if ( $this->getOption( 'skip-gzip' ) ) {
		$this->gzip = false;
	} elseif ( !function_exists( 'gzdeflate' ) ) {
		$this->fatalError( "gzdeflate() not found. " .
			"Please run with --skip-gzip if you don't want to compress revisions." );
	} else {
		$this->gzip = true;
	}

	$this->skipResolve = $this->getOption( 'skip-resolve' );

	// The legacy encoding is only recorded when --iconv is given AND the wiki
	// actually has one configured; otherwise no conversion takes place.
	if ( $this->getOption( 'iconv' ) ) {
		$legacyEncoding = $this->getConfig()->get( MainConfigNames::LegacyEncoding );
		if ( $legacyEncoding ) {
			$this->legacyEncoding = $legacyEncoding;
		} else {
			$this->output( "iconv requested but the wiki has no legacy encoding\n" );
		}
	}
	$this->dryRun = $this->getOption( 'dry-run', false );

	$undo = $this->getOption( 'undo' );
	try {
		$this->undoLog = new UndoLog( $undo, $dbw );
	} catch ( RuntimeException $e ) {
		// Include the underlying reason so the operator can tell what went wrong.
		$this->fatalError( "Unable to open undo log: " . $e->getMessage() );
	}
	$this->resolveStubs->setUndoLog( $this->undoLog );

	$this->doMoveToExternal();
}
129 
/**
 * Walk the text table from minID to maxID in blocks of the batch size and
 * move every eligible row's text into the external store.
 *
 * Per row:
 *  - rows flagged 'error' are skipped;
 *  - 'object' rows are unserialized: HistoryBlobStub IDs are collected and
 *    resolved after the main loop (unless --skip-resolve), HistoryBlobCurStub
 *    text is copied out, ConcatenatedGzipHistoryBlob is stored as is, and
 *    anything else is reported and skipped;
 *  - other rows shorter than the threshold are skipped, the rest are
 *    converted from the legacy encoding (if enabled) and compressed.
 *
 * In dry-run mode the planned move is only printed. Otherwise the text is
 * written to the external store and the row is updated through the undo log.
 * A failed external store write terminates the script.
 */
private function doMoveToExternal() {
	$dbr = $this->getDB( DB_REPLICA );

	$count = $this->maxID - $this->minID + 1;
	// A batch size of zero would make the division below fail; use at least 1.
	$blockSize = max( 1, (int)$this->getBatchSize() );
	$numBlocks = ceil( $count / $blockSize );
	$this->output(
		"Moving text rows from {$this->minID} to {$this->maxID} to external storage\n"
	);

	$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
	$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();
	$extStore = $esFactory->getStore( $this->esType );
	$numMoved = 0;
	$stubIDs = [];

	for ( $block = 0; $block < $numBlocks; $block++ ) {
		$blockStart = $block * $blockSize + $this->minID;
		$blockEnd = $blockStart + $blockSize - 1;

		// Progress report (and replication wait) every reportingInterval blocks.
		if ( $this->reportingInterval && !( $block % $this->reportingInterval ) ) {
			$this->output( "oldid=$blockStart, moved=$numMoved\n" );
			$lbFactory->waitForReplication();
		}

		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->where( $this->getConditions( $blockStart, $blockEnd, $dbr ) )
			->caller( __METHOD__ )->fetchResultSet();
		foreach ( $res as $row ) {
			$text = $row->old_text;
			$id = $row->old_id;
			$flags = SqlBlobStore::explodeFlags( $row->old_flags );
			[ $text, $flags ] = $this->resolveText( $text, $flags );

			if ( in_array( 'error', $flags, true ) ) {
				continue;
			} elseif ( in_array( 'object', $flags, true ) ) {
				$obj = unserialize( $text );
				if ( $obj instanceof HistoryBlobStub ) {
					// Handle later, after CGZ resolution
					if ( !$this->skipResolve ) {
						$stubIDs[] = $id;
					}
					continue;
				} elseif ( $obj instanceof HistoryBlobCurStub ) {
					// Copy cur text to ES
					[ $text, $flags ] = $this->resolveLegacyEncoding( $obj->getText(), [] );
					[ $text, $flags ] = $this->compress( $text, $flags );
				} elseif ( $obj instanceof ConcatenatedGzipHistoryBlob ) {
					// Store as is
				} elseif ( !is_object( $obj ) ) {
					// unserialize() returns a non-object (e.g. false) for data it
					// cannot decode; get_class() must not be called on that.
					$this->output( "Warning: old_id=$id could not be unserialized\n" );
					continue;
				} else {
					$className = get_class( $obj );
					$this->output(
						"Warning: old_id=$id unrecognised object class \"$className\"\n"
					);
					continue;
				}
			} elseif ( strlen( $text ) < $this->threshold ) {
				// Don't move small revisions
				continue;
			} else {
				[ $text, $flags ] = $this->resolveLegacyEncoding( $text, $flags );
				[ $text, $flags ] = $this->compress( $text, $flags );
			}
			$flags[] = 'external';
			$flagsString = implode( ',', $flags );

			if ( $this->dryRun ) {
				// Show the first 30 bytes with control and high bytes escaped.
				$this->output( "Move $id => $flagsString " .
					addcslashes( substr( $text, 0, 30 ), "\0..\x1f\x7f..\xff" ) .
					"\n"
				);
				continue;
			}

			$url = $extStore->store( $this->esLocation, $text );
			if ( !$url ) {
				$this->fatalError( "Error writing to external storage" );
			}
			// The original row is passed along with the new values.
			// NOTE(review): presumably it is used as the update condition and
			// for the undo SQL — confirm against UndoLog::update().
			$moved = $this->undoLog->update(
				'text',
				[ 'old_flags' => $flagsString, 'old_text' => $url ],
				(array)$row,
				__METHOD__
			);
			if ( $moved ) {
				$numMoved++;
			} else {
				$this->output( "Update of old_id $id failed, affected zero rows\n" );
			}
		}
	}

	if ( count( $stubIDs ) ) {
		$this->resolveStubs( $stubIDs );
	}
}
225 
/**
 * Deflate the text and add the 'gzip' flag, unless compression is disabled
 * or the flags already contain 'gzip'.
 *
 * @param string $text
 * @param string[] $flags
 * @return array Two elements: the (possibly compressed) text and the flags
 */
private function compress( $text, $flags ) {
	$hasGzipFlag = in_array( 'gzip', $flags );
	if ( !$this->gzip || $hasGzipFlag ) {
		// Nothing to do: hand both values back untouched.
		return [ $text, $flags ];
	}

	$flags[] = 'gzip';
	return [ gzdeflate( $text ), $flags ];
}
233 
/**
 * Convert text from the wiki's legacy encoding to UTF-8 and flag it 'utf-8'.
 *
 * Nothing is changed when no legacy encoding is in effect, when the flags
 * already contain 'utf-8' or 'utf8', or when the text is gzip-flagged but
 * this run does not compress (it could not be re-compressed afterwards).
 *
 * If decompression or conversion fails, the original text and flags are
 * returned unchanged, so a failure value is never passed on to be stored.
 *
 * @param string $text
 * @param string[] $flags
 * @return array Two elements: the (possibly converted) text and the flags
 */
private function resolveLegacyEncoding( $text, $flags ) {
	if ( $this->legacyEncoding === null
		|| in_array( 'utf-8', $flags )
		|| in_array( 'utf8', $flags )
	) {
		return [ $text, $flags ];
	}

	$converted = $text;
	$newFlags = $flags;

	// First decompress the entry so we don't try to convert a binary gzip to utf-8
	if ( in_array( 'gzip', $flags ) ) {
		if ( !$this->gzip ) {
			return [ $text, $flags ];
		}
		$newFlags = array_diff( $flags, [ 'gzip' ] );
		$converted = gzinflate( $text );
		if ( $converted === false ) {
			$this->output( "Warning: unable to decompress text, leaving it unconverted\n" );
			return [ $text, $flags ];
		}
	}

	AtEase::suppressWarnings();
	$converted = iconv( $this->legacyEncoding, 'UTF-8//IGNORE', $converted );
	AtEase::restoreWarnings();
	if ( $converted === false ) {
		// iconv() signals failure with false; keep the row's original content.
		$this->output( "Warning: legacy encoding conversion failed, leaving text unconverted\n" );
		return [ $text, $flags ];
	}

	$newFlags[] = 'utf-8';
	return [ $converted, $newFlags ];
}
254 
/**
 * Re-read the given stub rows in batches and pass each one to the
 * ResolveStubs helper, reporting progress and a final tally.
 *
 * @param int[] $stubIDs old_id values of rows holding a HistoryBlobStub
 */
private function resolveStubs( $stubIDs ) {
	if ( $this->dryRun ) {
		$this->output( "Note: resolving stubs in dry run mode is expected to fail, " .
			"because the main blobs have not been moved to external storage.\n" );
	}

	$dbr = $this->getDB( DB_REPLICA );
	$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();
	$this->output( "Resolving " . count( $stubIDs ) . " stubs\n" );
	$numResolved = 0;
	$numTotal = 0;
	// array_chunk() rejects a chunk length below 1; use at least 1.
	$batchSize = max( 1, (int)$this->getBatchSize() );
	foreach ( array_chunk( $stubIDs, $batchSize ) as $stubBatch ) {
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->where( [ 'old_id' => $stubBatch ] )
			->caller( __METHOD__ )->fetchResultSet();
		foreach ( $res as $row ) {
			// A truthy result from resolveStub() counts as one resolved stub.
			$numResolved += $this->resolveStubs->resolveStub( $row, $this->dryRun ) ? 1 : 0;
			$numTotal++;
			if ( $this->reportingInterval
				&& $numTotal % $this->reportingInterval == 0
			) {
				$this->output( "$numTotal stubs processed\n" );
				$lbFactory->waitForReplication();
			}
		}
	}
	$this->output( "$numResolved of $numTotal stubs resolved\n" );
}
285 
/**
 * Build the WHERE conditions selecting one block of text rows: old_id within
 * the given range and old_flags not containing "external".
 *
 * @param int $blockStart First old_id of the block
 * @param int $blockEnd Last old_id of the block
 * @param mixed $dbr Database handle providing buildLike() and anyString()
 * @return string[]
 */
protected function getConditions( $blockStart, $blockEnd, $dbr ) {
	$inIdRange = "old_id BETWEEN $blockStart AND $blockEnd";

	// Matches any flags value with "external" anywhere inside it.
	$externalPattern = $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() );
	$notYetExternal = 'old_flags NOT ' . $externalPattern;

	return [ $inIdRange, $notYetExternal ];
}
292 
/**
 * Hook applied to every row before it is examined; this implementation hands
 * the text and flags back unchanged. Being protected, it can be overridden.
 *
 * @param string $text
 * @param string[] $flags
 * @return array Two elements: the text and the flags
 */
protected function resolveText( $text, $flags ) {
	$unchanged = [ $text, $flags ];

	return $unchanged;
}
296 }
297 
// Name this class as the maintenance class, then load the runner include.
// NOTE(review): presumably the include only runs the script when this file is
// the entry point, as the constant's name suggests — confirm in Maintenance.php.
298 $maintClass = MoveToExternal::class;
299 require_once RUN_MAINTENANCE_IF_MAIN;
Concatenated gzip (CGZ) storage. Improves compression ratio by concatenating like objects before gzipping.
To speed up conversion from the 1.4 to the 1.5 schema, text rows can refer to the leftover cur table as the backend.
Pointer object for an item within a CGZ blob stored in the text table.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:66
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
getServiceContainer()
Returns the main service container.
getBatchSize()
Returns batch size.
getArg( $argId=0, $default=null)
Get an argument.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
A class containing constants representing the names of configuration variables.
Update a database while optionally writing SQL that reverses the update to a file.
Definition: UndoLog.php:11
Service for storing and loading Content objects representing revision data blobs.
__construct()
Default constructor.
getConditions( $blockStart, $blockEnd, $dbr)
execute()
Do the actual work.
resolveText( $text, $flags)
$maintClass
const DB_REPLICA
Definition: defines.php:26
const DB_PRIMARY
Definition: defines.php:28