MediaWiki master
moveToExternal.php
Go to the documentation of this file.
1<?php
27use Wikimedia\AtEase\AtEase;
30
31require_once __DIR__ . '/../Maintenance.php';
32
/** @var ResolveStubs Helper instantiated in execute(); used to resolve stub rows */
35 private $resolveStubs;
/** Progress is reported every this many blocks/stubs ('reporting-interval' option, default 100) */
37 private $reportingInterval;
/** @var int First old_id to process ('start' option, default 1) */
39 private $minID;
/** @var int Last old_id to process ('end' option, or MAX(old_id) of the text table) */
41 private $maxID;
/** External store type, taken from the first positional argument, e.g. "DB" or "mwstore" */
43 private $esType;
/** External store location, taken from the second positional argument, e.g. "cluster12" */
45 private $esLocation;
/** @var int Rows whose text is shorter than this many bytes are not moved ('threshold' option) */
47 private $threshold;
/** @var bool Whether moved text is compressed with gzdeflate() (false with --skip-gzip) */
49 private $gzip;
/** Value of the 'skip-resolve' option; when set, HistoryBlobStub rows are not queued for resolution */
51 private $skipResolve;
/** Legacy encoding name from the LegacyEncoding setting; only assigned when --iconv is given */
53 private $legacyEncoding;
/** Value of the 'dry-run' option; when set, rows are reported instead of being moved */
55 private $dryRun;
/** @var UndoLog Log object through which all text row updates are made */
57 private $undoLog;
58
/**
 * Declare the batch size, command-line options and positional arguments.
 */
public function __construct() {
	parent::__construct();

	$this->setBatchSize( 1000 );

	// Options that take a value: [ name, description, short name (false for none) ]
	$valueOptions = [
		[ 'start', 'start old_id', 's' ],
		[ 'end', 'end old_id', 'e' ],
		[ 'threshold', 'minimum size in bytes', false ],
		[ 'reporting-interval', 'show a message after this many revisions', false ],
		[ 'undo', 'filename for undo SQL', false ],
	];
	foreach ( $valueOptions as [ $optName, $optDescription, $optShortName ] ) {
		$this->addOption( $optName, $optDescription, false, true, $optShortName );
	}

	// Plain switches: name => description
	$switches = [
		'skip-gzip' => "Don't compress individual revisions",
		'skip-resolve' => "Don't replace HistoryBlobStub objects with direct external store pointers",
		'iconv' => 'Resolve legacy character encoding',
		'dry-run' => "Don't modify any rows",
	];
	foreach ( $switches as $switchName => $switchDescription ) {
		$this->addOption( $switchName, $switchDescription );
	}

	// Positional arguments, in order
	$this->addArg( 'type', 'The external store type, e.g. "DB" or "mwstore"' );
	$this->addArg( 'location', 'e.g. "cluster12" or "global-swift"' );
}
80
/**
 * Read the command-line configuration, open the undo log and run the move.
 *
 * Terminates via fatalError() when compression is wanted but gzdeflate()
 * is unavailable, or when the undo log cannot be opened.
 *
 * @return bool Result of doMoveToExternal()
 */
public function execute() {
	$this->resolveStubs = new ResolveStubs;
	$this->esType = $this->getArg( 0 ); // e.g. "DB" or "mwstore"
	$this->esLocation = $this->getArg( 1 ); // e.g. "cluster12" or "global-swift"
	$dbw = $this->getPrimaryDB();

	// Without --end, process up to the highest existing old_id
	$maxID = $this->getOption( 'end' );
	if ( $maxID === null ) {
		$maxID = $dbw->newSelectQueryBuilder()
			->select( 'MAX(old_id)' )
			->from( 'text' )
			->caller( __METHOD__ )->fetchField();
	}
	$this->maxID = (int)$maxID;
	$this->minID = (int)$this->getOption( 'start', 1 );

	// Cast like the other numeric options: the value is used as a modulo
	// operand in the progress checks, so it must always be an integer.
	// A value of 0 disables progress reporting.
	$this->reportingInterval = (int)$this->getOption( 'reporting-interval', 100 );
	$this->threshold = (int)$this->getOption( 'threshold', 0 );

	if ( $this->getOption( 'skip-gzip' ) ) {
		$this->gzip = false;
	} elseif ( !function_exists( 'gzdeflate' ) ) {
		$this->fatalError( "gzdeflate() not found. " .
			"Please run with --skip-gzip if you don't want to compress revisions." );
	} else {
		$this->gzip = true;
	}

	$this->skipResolve = $this->getOption( 'skip-resolve' );

	if ( $this->getOption( 'iconv' ) ) {
		$legacyEncoding = $this->getConfig()->get( MainConfigNames::LegacyEncoding );
		if ( $legacyEncoding ) {
			$this->legacyEncoding = $legacyEncoding;
		} else {
			$this->output( "iconv requested but the wiki has no legacy encoding\n" );
		}
	}
	$this->dryRun = $this->getOption( 'dry-run', false );

	$undo = $this->getOption( 'undo' );
	try {
		$this->undoLog = new UndoLog( $undo, $dbw );
	} catch ( RuntimeException $e ) {
		// Surface the underlying reason instead of discarding it
		$this->fatalError( "Unable to open undo log: " . $e->getMessage() );
	}
	$this->resolveStubs->setUndoLog( $this->undoLog );

	return $this->doMoveToExternal();
}
131
/**
 * Move the text rows in the range [minID, maxID] to external storage.
 *
 * The range is walked in blocks of the configured batch size. For each row
 * that is not yet flagged "external":
 *  - rows flagged "error" are skipped;
 *  - HistoryBlobStub objects are queued and resolved after the main loop
 *    (unless --skip-resolve was given);
 *  - HistoryBlobCurStub objects are replaced by the text they point to;
 *  - ConcatenatedGzipHistoryBlob objects are stored as they are;
 *  - plain text shorter than the threshold is left alone, anything else is
 *    optionally converted from the legacy encoding and compressed.
 * The resulting blob is written to the external store and the text row is
 * updated through the undo log. In dry-run mode the row is only reported.
 *
 * @return bool True if no row produced a warning or failed to update
 */
private function doMoveToExternal() {
	$success = true;
	$dbr = $this->getReplicaDB();

	$count = $this->maxID - $this->minID + 1;
	$blockSize = $this->getBatchSize();
	$numBlocks = ceil( $count / $blockSize );
	print "Moving text rows from {$this->minID} to {$this->maxID} to external storage\n";

	$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
	$extStore = $esFactory->getStore( $this->esType );
	$numMoved = 0;
	$stubIDs = [];

	for ( $block = 0; $block < $numBlocks; $block++ ) {
		$blockStart = $block * $blockSize + $this->minID;
		$blockEnd = $blockStart + $blockSize - 1;

		if ( $this->reportingInterval && !( $block % $this->reportingInterval ) ) {
			$this->output( "oldid=$blockStart, moved=$numMoved\n" );
			$this->waitForReplication();
		}

		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->where( $this->getConditions( $blockStart, $blockEnd, $dbr ) )
			->caller( __METHOD__ )->fetchResultSet();
		foreach ( $res as $row ) {
			$text = $row->old_text;
			$id = $row->old_id;
			$flags = SqlBlobStore::explodeFlags( $row->old_flags );
			[ $text, $flags ] = $this->resolveText( $text, $flags );

			if ( $text === false ) {
				// Resolution failed: there is no usable text to move, and carrying
				// on would end up storing a blob built from the value false.
				$success = false;
				continue;
			}

			if ( in_array( 'error', $flags ) ) {
				continue;
			} elseif ( in_array( 'object', $flags ) ) {
				$obj = unserialize( $text );
				if ( $obj instanceof HistoryBlobStub ) {
					// Handle later, after CGZ resolution
					if ( !$this->skipResolve ) {
						$stubIDs[] = $id;
					}
					continue;
				} elseif ( $obj instanceof HistoryBlobCurStub ) {
					// Copy cur text to ES
					$newText = $obj->getText();
					if ( $newText === false ) {
						print "Warning: Could not fetch revision blob {$id}: {$text}\n";
						$success = false;
						continue;
					}

					[ $text, $flags ] = $this->resolveLegacyEncoding( $newText, [] );

					if ( $text === false ) {
						print "Warning: Could not decode legacy-encoded gzip'ed revision blob {$id}: {$newText}\n";
						$success = false;
						continue;
					}

					[ $text, $flags ] = $this->compress( $text, $flags );
				} elseif ( $obj instanceof ConcatenatedGzipHistoryBlob ) {
					// Store as is
				} else {
					// unserialize() may yield a non-object (e.g. false on corrupt data),
					// for which get_class() would throw
					$className = is_object( $obj ) ? get_class( $obj ) : gettype( $obj );
					print "Warning: old_id=$id unrecognised object class \"$className\"\n";
					$success = false;
					continue;
				}
			} elseif ( strlen( $text ) < $this->threshold ) {
				// Don't move small revisions
				continue;
			} else {
				[ $decoded, $flags ] = $this->resolveLegacyEncoding( $text, $flags );
				if ( $decoded === false ) {
					// Without this check gzdeflate() would receive false and produce
					// a valid but empty blob, silently losing the revision text
					print "Warning: Could not decode legacy-encoded revision blob {$id}\n";
					$success = false;
					continue;
				}
				$text = $decoded;

				[ $newText, $flags ] = $this->compress( $text, $flags );
				if ( $newText === false ) {
					print "Warning: Could not compress revision blob {$id}: {$text}\n";
					$success = false;
					continue;
				}
				$text = $newText;
			}
			$flags[] = 'external';
			$flagsString = implode( ',', $flags );

			if ( $this->dryRun ) {
				// Show the new flags and an escaped preview of the blob only
				$this->output( "Move $id => $flagsString " .
					addcslashes( substr( $text, 0, 30 ), "\0..\x1f\x7f..\xff" ) .
					"\n"
				);
				continue;
			}

			$url = $extStore->store( $this->esLocation, $text );
			if ( !$url ) {
				$this->fatalError( "Error writing to external storage" );
			}
			// The original row is passed as the third argument of UndoLog::update()
			$moved = $this->undoLog->update(
				'text',
				[ 'old_flags' => $flagsString, 'old_text' => $url ],
				(array)$row,
				__METHOD__
			);
			if ( $moved ) {
				$numMoved++;
			} else {
				print "Update of old_id $id failed, affected zero rows\n";
				$success = false;
			}
		}
	}

	if ( count( $stubIDs ) ) {
		$this->resolveStubs( $stubIDs );
	}

	return $success;
}
255
/**
 * Deflate the text unless compression is disabled or it is already gzipped.
 *
 * @param string $text Blob text
 * @param string[] $flags Flags describing the blob
 * @return array Two elements: the (possibly deflated) text, which is whatever
 *  gzdeflate() returned when compression was applied, and the flags, with
 *  'gzip' appended when compression was applied
 */
private function compress( $text, $flags ) {
	$wantsCompression = $this->gzip && !in_array( 'gzip', $flags );
	if ( !$wantsCompression ) {
		return [ $text, $flags ];
	}

	$flags[] = 'gzip';
	return [ gzdeflate( $text ), $flags ];
}
263
/**
 * Convert a blob from the wiki's legacy encoding to UTF-8 when required.
 *
 * Nothing is done when no legacy encoding is configured or when the flags
 * already mark the blob as UTF-8. A gzipped blob is inflated first; if
 * compression is disabled for this run it is returned untouched instead.
 *
 * @param string $text Blob text
 * @param string[] $flags Flags describing the blob
 * @return array Two elements: the resulting text, or false if inflating or
 *  converting failed, and the updated flags
 */
private function resolveLegacyEncoding( $text, $flags ) {
	if ( $this->legacyEncoding === null
		|| in_array( 'utf-8', $flags )
		|| in_array( 'utf8', $flags )
	) {
		return [ $text, $flags ];
	}

	if ( in_array( 'gzip', $flags ) ) {
		if ( !$this->gzip ) {
			return [ $text, $flags ];
		}
		// Work on the inflated text so that binary gzip data is never fed to iconv
		$flags = array_diff( $flags, [ 'gzip' ] );
		$inflated = gzinflate( $text );
		if ( $inflated === false ) {
			return [ false, $flags ];
		}
		$text = $inflated;
	}

	AtEase::suppressWarnings();
	$converted = iconv( $this->legacyEncoding, 'UTF-8//IGNORE', $text );
	AtEase::restoreWarnings();
	if ( $converted === false ) {
		return [ false, $flags ];
	}

	$flags[] = 'utf-8';
	return [ $converted, $flags ];
}
292
/**
 * Resolve the stub rows collected while moving text, one batch at a time.
 *
 * Each row is handed to ResolveStubs::resolveStub() together with the dry-run
 * setting. Progress is printed, followed by a wait for replication, every
 * reportingInterval rows.
 *
 * @param int[] $stubIDs old_id values of the text rows holding stubs
 */
private function resolveStubs( $stubIDs ) {
	if ( $this->dryRun ) {
		print "Note: resolving stubs in dry run mode is expected to fail, " .
			"because the main blobs have not been moved to external storage.\n";
	}

	$dbr = $this->getReplicaDB();
	$this->output( "Resolving " . count( $stubIDs ) . " stubs\n" );

	$resolved = 0;
	$processed = 0;
	$batches = array_chunk( $stubIDs, $this->getBatchSize() );
	foreach ( $batches as $batchIDs ) {
		$stubRows = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->where( [ 'old_id' => $batchIDs ] )
			->caller( __METHOD__ )->fetchResultSet();

		foreach ( $stubRows as $stubRow ) {
			if ( $this->resolveStubs->resolveStub( $stubRow, $this->dryRun ) ) {
				$resolved++;
			}
			$processed++;

			// Only report on multiples of the reporting interval
			if ( !$this->reportingInterval
				|| $processed % $this->reportingInterval != 0
			) {
				continue;
			}
			$this->output( "$processed stubs processed\n" );
			$this->waitForReplication();
		}
	}

	$this->output( "$resolved of $processed stubs resolved\n" );
}
322
/**
 * Build the WHERE conditions selecting one block of text rows to move.
 *
 * Matches rows with $blockStart <= old_id <= $blockEnd whose old_flags do
 * not contain "external".
 *
 * @param int $blockStart First old_id of the block (inclusive)
 * @param int $blockEnd Last old_id of the block (inclusive)
 * @param mixed $dbr Database handle, used for expr() and anyString()
 * @return array Conditions for SelectQueryBuilder::where()
 */
protected function getConditions( $blockStart, $blockEnd, $dbr ) {
	return [
		$dbr->expr( 'old_id', '>=', $blockStart ),
		// Upper bound of the block: must be "<=", not a second ">="
		$dbr->expr( 'old_id', '<=', $blockEnd ),
		$dbr->expr( 'old_flags', IExpression::NOT_LIKE,
			new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() ) ),
	];
}
331
/**
 * Pre-process a row's text and flags before it is considered for moving.
 *
 * This implementation passes both values through untouched. The caller,
 * doMoveToExternal(), treats a text of false as a failure and skips rows
 * whose flags contain 'error'.
 *
 * @param string $text Value of old_text
 * @param string[] $flags Exploded old_flags
 * @return array Two elements: the text and the flags
 */
protected function resolveText( $text, $flags ) {
	$unchanged = [ $text, $flags ];
	return $unchanged;
}
335}
336
// Name of the maintenance class defined in this file; presumably picked up by the
// file included on the next line - NOTE(review): confirm against Maintenance.php.
337$maintClass = MoveToExternal::class;
338require_once RUN_MAINTENANCE_IF_MAIN;
Concatenated gzip (CGZ) storage Improves compression ratio by concatenating like objects before gzipp...
To speed up conversion from 1.4 to 1.5 schema, text rows can refer to the leftover cur table as the b...
Pointer object for an item within a CGZ blob stored in the text table.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
getServiceContainer()
Returns the main service container.
getBatchSize()
Returns batch size.
getArg( $argId=0, $default=null)
Get an argument.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
A class containing constants representing the names of configuration variables.
Update a database while optionally writing SQL that reverses the update to a file.
Definition UndoLog.php:11
Service for storing and loading Content objects representing revision data blobs.
__construct()
Default constructor.
getConditions( $blockStart, $blockEnd, $dbr)
execute()
Do the actual work.
resolveText( $text, $flags)
Content of like value.
Definition LikeValue.php:14
$maintClass