MediaWiki master
moveToExternal.php
Go to the documentation of this file.
1<?php
27use Wikimedia\AtEase\AtEase;
28
29require_once __DIR__ . '/../Maintenance.php';
30
/** @var ResolveStubs Helper created in execute(); used to resolve stub rows */
private $resolveStubs;

/** @var int|string Value of --reporting-interval (default 100); progress is printed when a counter is a multiple of it */
private $reportingInterval;

/** @var int First old_id to process (--start, default 1) */
private $minID;

/** @var int Last old_id to process (--end, default MAX(old_id) of the text table) */
private $maxID;

/** @var string External store type, first positional argument, e.g. "DB" or "mwstore" */
private $esType;

/** @var string External store location, second positional argument, e.g. "cluster12" */
private $esLocation;

/** @var int Plain-text rows shorter than this many bytes are left in place (--threshold, default 0) */
private $threshold;

/** @var bool Whether blobs are compressed with gzdeflate(); false when --skip-gzip is given */
private $gzip;

/** @var mixed Truthy when --skip-resolve was given; HistoryBlobStub rows are then not resolved */
private $skipResolve;

/** @var string|null Source encoding handed to iconv(); only set when --iconv is given and the wiki has a legacy encoding */
private $legacyEncoding;

/** @var mixed Truthy when --dry-run was given; no rows are modified */
private $dryRun;

/** @var UndoLog Performs the row updates, constructed from the --undo filename */
private $undoLog;
56
/**
 * Set the default batch size and register the script's options and
 * its two positional arguments.
 */
public function __construct() {
	parent::__construct();

	$this->setBatchSize( 1000 );

	// Options that take a value: [ name, description, short name or false ]
	$valueOptions = [
		[ 'start', 'start old_id', 's' ],
		[ 'end', 'end old_id', 'e' ],
		[ 'threshold', 'minimum size in bytes', false ],
		[ 'reporting-interval', 'show a message after this many revisions', false ],
		[ 'undo', 'filename for undo SQL', false ],
	];
	foreach ( $valueOptions as [ $optName, $optDescription, $optShortName ] ) {
		$this->addOption( $optName, $optDescription, false, true, $optShortName );
	}

	// Plain switches: name => description
	$switches = [
		'skip-gzip' => "Don't compress individual revisions",
		'skip-resolve' => "Don't replace HistoryBlobStub objects with direct external store pointers",
		'iconv' => 'Resolve legacy character encoding',
		'dry-run' => "Don't modify any rows",
	];
	foreach ( $switches as $switchName => $switchDescription ) {
		$this->addOption( $switchName, $switchDescription );
	}

	$this->addArg( 'type', 'The external store type, e.g. "DB" or "mwstore"' );
	$this->addArg( 'location', 'e.g. "cluster12" or "global-swift"' );
}
78
/**
 * Copy the command-line arguments and options into member variables,
 * open the undo log and run the migration.
 *
 * Terminates via fatalError() when compression is wanted but gzdeflate()
 * is unavailable, or when the undo log cannot be opened.
 *
 * @return bool The result of doMoveToExternal()
 */
public function execute() {
	$this->resolveStubs = new ResolveStubs;
	$this->esType = $this->getArg( 0 ); // e.g. "DB" or "mwstore"
	$this->esLocation = $this->getArg( 1 ); // e.g. "cluster12" or "global-swift"
	$dbw = $this->getPrimaryDB();

	// Without --end, process up to the highest old_id currently in the table.
	$maxID = $this->getOption( 'end' );
	if ( $maxID === null ) {
		$maxID = $dbw->newSelectQueryBuilder()
			->select( 'MAX(old_id)' )
			->from( 'text' )
			->caller( __METHOD__ )->fetchField();
	}
	$this->maxID = (int)$maxID;
	$this->minID = (int)$this->getOption( 'start', 1 );

	// Cast like the other numeric options: the value is used as a modulo
	// operand, so a non-numeric or fractional string would otherwise raise
	// a TypeError or "Modulo by zero". After the cast such input becomes 0,
	// which the callers treat as "no progress reporting".
	$this->reportingInterval = (int)$this->getOption( 'reporting-interval', 100 );
	$this->threshold = (int)$this->getOption( 'threshold', 0 );

	if ( $this->getOption( 'skip-gzip' ) ) {
		$this->gzip = false;
	} elseif ( !function_exists( 'gzdeflate' ) ) {
		$this->fatalError( "gzdeflate() not found. " .
			"Please run with --skip-gzip if you don't want to compress revisions." );
	} else {
		$this->gzip = true;
	}

	$this->skipResolve = $this->getOption( 'skip-resolve' );

	// --iconv only has an effect when the wiki configures a legacy encoding.
	if ( $this->getOption( 'iconv' ) ) {
		$legacyEncoding = $this->getConfig()->get( MainConfigNames::LegacyEncoding );
		if ( $legacyEncoding ) {
			$this->legacyEncoding = $legacyEncoding;
		} else {
			$this->output( "iconv requested but the wiki has no legacy encoding\n" );
		}
	}
	$this->dryRun = $this->getOption( 'dry-run', false );

	$undo = $this->getOption( 'undo' );
	try {
		$this->undoLog = new UndoLog( $undo, $dbw );
	} catch ( RuntimeException $e ) {
		$this->fatalError( "Unable to open undo log" );
	}
	// The stub resolver writes through the same undo log as this script.
	$this->resolveStubs->setUndoLog( $this->undoLog );

	return $this->doMoveToExternal();
}
129
/**
 * Walk the text table from minID to maxID in batches and move every
 * eligible row to external storage.
 *
 * For each row not yet flagged "external":
 *  - rows flagged "error" are skipped;
 *  - HistoryBlobStub objects are collected and resolved after the main loop
 *    (unless --skip-resolve was given);
 *  - HistoryBlobCurStub objects are replaced by their fetched text;
 *  - ConcatenatedGzipHistoryBlob objects are stored as they are;
 *  - plain text shorter than the threshold is left in place, anything else
 *    is converted from the legacy encoding (if enabled) and compressed.
 * The blob is then written to the external store and the row is updated
 * through the undo log. In dry-run mode the planned move is only printed.
 *
 * A row whose blob cannot be read, decoded or compressed is reported and
 * left untouched; it is never overwritten with an empty blob.
 *
 * @return bool False if any row failed to be processed, true otherwise
 */
private function doMoveToExternal() {
	$success = true;
	$dbr = $this->getReplicaDB();

	$count = $this->maxID - $this->minID + 1;
	$blockSize = $this->getBatchSize();
	$numBlocks = ceil( $count / $blockSize );
	print "Moving text rows from {$this->minID} to {$this->maxID} to external storage\n";

	$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
	$extStore = $esFactory->getStore( $this->esType );
	$numMoved = 0;
	$stubIDs = [];

	for ( $block = 0; $block < $numBlocks; $block++ ) {
		$blockStart = $block * $blockSize + $this->minID;
		$blockEnd = $blockStart + $blockSize - 1;

		// Progress output and replication wait happen once per reporting interval.
		if ( $this->reportingInterval && !( $block % $this->reportingInterval ) ) {
			$this->output( "oldid=$blockStart, moved=$numMoved\n" );
			$this->waitForReplication();
		}

		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->where( $this->getConditions( $blockStart, $blockEnd, $dbr ) )
			->caller( __METHOD__ )->fetchResultSet();
		foreach ( $res as $row ) {
			$text = $row->old_text;
			$id = $row->old_id;
			$flags = SqlBlobStore::explodeFlags( $row->old_flags );
			[ $text, $flags ] = $this->resolveText( $text, $flags );

			if ( $text === false ) {
				// There is no blob to store. Continuing would coerce false
				// to an empty string and overwrite the row with it.
				$success = false;
				continue;
			}

			if ( in_array( 'error', $flags ) ) {
				continue;
			} elseif ( in_array( 'object', $flags ) ) {
				$obj = unserialize( $text );
				if ( $obj instanceof HistoryBlobStub ) {
					// Handle later, after CGZ resolution
					if ( !$this->skipResolve ) {
						$stubIDs[] = $id;
					}
					continue;
				} elseif ( $obj instanceof HistoryBlobCurStub ) {
					// Copy cur text to ES
					$newText = $obj->getText();
					if ( $newText === false ) {
						print "Warning: Could not fetch revision blob {$id}: {$text}\n";
						$success = false;
						continue;
					}

					[ $text, $flags ] = $this->resolveLegacyEncoding( $newText, [] );

					if ( $text === false ) {
						print "Warning: Could not decode legacy-encoded gzip'ed revision blob {$id}: {$newText}\n";
						$success = false;
						continue;
					}

					[ $text, $flags ] = $this->compress( $text, $flags );
					if ( $text === false ) {
						print "Warning: Could not compress revision blob {$id}\n";
						$success = false;
						continue;
					}
				} elseif ( $obj instanceof ConcatenatedGzipHistoryBlob ) {
					// Store as is
				} else {
					// unserialize() returns false (or another non-object) for
					// corrupt data, and get_class() only accepts objects.
					$className = is_object( $obj ) ? get_class( $obj ) : gettype( $obj );
					print "Warning: old_id=$id unrecognised object class \"$className\"\n";
					$success = false;
					continue;
				}
			} elseif ( strlen( $text ) < $this->threshold ) {
				// Don't move small revisions
				continue;
			} else {
				[ $decoded, $flags ] = $this->resolveLegacyEncoding( $text, $flags );
				if ( $decoded === false ) {
					// Inflating or converting the blob failed; leave the row alone.
					print "Warning: Could not decode legacy-encoded revision blob {$id}\n";
					$success = false;
					continue;
				}
				[ $newText, $flags ] = $this->compress( $decoded, $flags );
				if ( $newText === false ) {
					print "Warning: Could not compress revision blob {$id}: {$decoded}\n";
					$success = false;
					continue;
				}
				$text = $newText;
			}
			$flags[] = 'external';
			$flagsString = implode( ',', $flags );

			if ( $this->dryRun ) {
				// Show the first 30 bytes with control and high bytes escaped.
				$this->output( "Move $id => $flagsString " .
					addcslashes( substr( $text, 0, 30 ), "\0..\x1f\x7f..\xff" ) .
					"\n"
				);
				continue;
			}

			$url = $extStore->store( $this->esLocation, $text );
			if ( !$url ) {
				$this->fatalError( "Error writing to external storage" );
			}
			// The fetched row is passed as the update condition.
			$moved = $this->undoLog->update(
				'text',
				[ 'old_flags' => $flagsString, 'old_text' => $url ],
				(array)$row,
				__METHOD__
			);
			if ( $moved ) {
				$numMoved++;
			} else {
				print "Update of old_id $id failed, affected zero rows\n";
				$success = false;
			}
		}
	}

	if ( count( $stubIDs ) ) {
		$this->resolveStubs( $stubIDs );
	}

	return $success;
}
253
/**
 * Deflate a blob and add the "gzip" flag, unless compression is disabled
 * or the blob already carries that flag.
 *
 * @param string $text Blob content
 * @param string[] $flags Flags describing $text
 * @return array Two elements: the (possibly compressed) text and the flags
 */
private function compress( $text, $flags ) {
	if ( !$this->gzip || in_array( 'gzip', $flags ) ) {
		// Nothing to do: hand both values back untouched.
		return [ $text, $flags ];
	}

	$flags[] = 'gzip';
	return [ gzdeflate( $text ), $flags ];
}
261
/**
 * Convert a blob from the wiki's legacy encoding to UTF-8.
 *
 * Nothing happens when no legacy encoding is set or the blob is already
 * flagged "utf-8" / "utf8". A gzip-flagged blob is inflated first and its
 * "gzip" flag removed; if compression is disabled for this run, such a
 * blob is returned unchanged instead.
 *
 * @param string $text Blob content
 * @param string[] $flags Flags describing $text
 * @return array Two elements: the converted text, or false when inflating
 *  or converting failed, and the updated flags
 */
private function resolveLegacyEncoding( $text, $flags ) {
	$needsConversion = $this->legacyEncoding !== null
		&& !in_array( 'utf-8', $flags )
		&& !in_array( 'utf8', $flags );
	if ( !$needsConversion ) {
		return [ $text, $flags ];
	}

	// First decompress the entry so we don't try to convert a binary gzip to utf-8
	if ( in_array( 'gzip', $flags ) ) {
		if ( !$this->gzip ) {
			return [ $text, $flags ];
		}
		$flags = array_diff( $flags, [ 'gzip' ] );
		$inflated = gzinflate( $text );
		if ( $inflated === false ) {
			return [ false, $flags ];
		}
		$text = $inflated;
	}

	AtEase::suppressWarnings();
	$converted = iconv( $this->legacyEncoding, 'UTF-8//IGNORE', $text );
	AtEase::restoreWarnings();
	if ( $converted === false ) {
		return [ false, $flags ];
	}

	$flags[] = 'utf-8';
	return [ $converted, $flags ];
}
290
/**
 * Re-read the collected stub rows in batches and pass each one to the
 * ResolveStubs helper, printing progress at the reporting interval.
 *
 * @param int[] $stubIDs old_id values of the HistoryBlobStub rows found
 */
private function resolveStubs( $stubIDs ) {
	if ( $this->dryRun ) {
		print "Note: resolving stubs in dry run mode is expected to fail, because the main blobs have not been moved to external storage.\n";
	}

	$dbr = $this->getReplicaDB();
	$stubCount = count( $stubIDs );
	$this->output( "Resolving $stubCount stubs\n" );

	$numResolved = 0;
	$numTotal = 0;
	$batches = array_chunk( $stubIDs, $this->getBatchSize() );
	foreach ( $batches as $batchIDs ) {
		$rows = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->where( [ 'old_id' => $batchIDs ] )
			->caller( __METHOD__ )->fetchResultSet();
		foreach ( $rows as $row ) {
			if ( $this->resolveStubs->resolveStub( $row, $this->dryRun ) ) {
				$numResolved++;
			}
			$numTotal++;

			$atReportingPoint = $this->reportingInterval
				&& $numTotal % $this->reportingInterval == 0;
			if ( $atReportingPoint ) {
				$this->output( "$numTotal stubs processed\n" );
				$this->waitForReplication();
			}
		}
	}

	$this->output( "$numResolved of $numTotal stubs resolved\n" );
}
320
/**
 * Build the WHERE conditions selecting one batch of text rows: old_id
 * within the given inclusive range and old_flags not containing "external".
 *
 * @param int $blockStart First old_id of the batch
 * @param int $blockEnd Last old_id of the batch
 * @param object $dbr Database handle used to build the LIKE expression
 * @return string[] Raw SQL condition fragments
 */
protected function getConditions( $blockStart, $blockEnd, $dbr ) {
	$idRange = "old_id BETWEEN $blockStart AND $blockEnd";
	$externalPattern = $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() );

	return [
		$idRange,
		'old_flags NOT ' . $externalPattern,
	];
}
327
/**
 * Pre-process a row's text and flags before it is examined.
 *
 * This implementation returns both values unchanged. The method is
 * protected, so it is presumably meant to be overridden by subclasses.
 * The caller treats a returned text of false as a failure and skips rows
 * whose returned flags contain "error".
 *
 * @param string $text Value of old_text
 * @param string[] $flags Exploded old_flags
 * @return array Two elements: the text and the flags
 */
protected function resolveText( $text, $flags ) {
	$resolved = [ $text, $flags ];
	return $resolved;
}
331}
332
// Name the maintenance class defined in this file, then include the
// runner referenced by RUN_MAINTENANCE_IF_MAIN.
$maintClass = MoveToExternal::class;
require_once RUN_MAINTENANCE_IF_MAIN;
Concatenated gzip (CGZ) storage. Improves compression ratio by concatenating like objects before gzipping.
To speed up conversion from 1.4 to 1.5 schema, text rows can refer to the leftover cur table as the backend.
Pointer object for an item within a CGZ blob stored in the text table.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
getServiceContainer()
Returns the main service container.
getBatchSize()
Returns batch size.
getArg( $argId=0, $default=null)
Get an argument.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
A class containing constants representing the names of configuration variables.
Update a database while optionally writing SQL that reverses the update to a file.
Definition UndoLog.php:11
Service for storing and loading Content objects representing revision data blobs.
__construct()
Default constructor.
getConditions( $blockStart, $blockEnd, $dbr)
execute()
Do the actual work.
resolveText( $text, $flags)
$maintClass