MediaWiki 1.41.2
moveToExternal.php
Go to the documentation of this file.
1<?php
27use Wikimedia\AtEase\AtEase;
28
29require_once __DIR__ . '/../Maintenance.php';
30
/** @var ResolveStubs Helper instantiated in execute(); resolveStubs() passes stub rows to it */
33 private $resolveStubs;
/** @var int|string Value of the --reporting-interval option (default 100); a falsy value disables progress messages */
35 private $reportingInterval;
/** @var int First old_id to process, from --start (default 1) */
37 private $minID;
/** @var int Last old_id to process, from --end or, when that is absent, MAX(old_id) of the text table */
39 private $maxID;
/** @var string First positional argument: the external store type, e.g. "DB" or "mwstore" */
41 private $esType;
/** @var string Second positional argument: the store location, e.g. "cluster12" or "global-swift" */
43 private $esLocation;
/** @var int Minimum size in bytes, from --threshold (default 0); shorter plain-text rows are not moved */
45 private $threshold;
/** @var bool Whether compress() should gzdeflate text; false when --skip-gzip is given */
47 private $gzip;
/** @var mixed Value of the --skip-resolve option; when truthy, HistoryBlobStub rows are not queued for resolution */
49 private $skipResolve;
/** @var string|null Legacy encoding to convert from; only assigned when --iconv is given and LegacyEncoding is configured */
51 private $legacyEncoding;
/** @var mixed Value of the --dry-run option (default false); when truthy no rows are written */
53 private $dryRun;
/** @var UndoLog Performs the row updates; constructed in execute() from the --undo filename and the primary DB */
55 private $undoLog;
56
/**
 * Declare the script's command line interface.
 *
 * Sets a default batch size of 1000 and registers, in this order: the
 * options that take a value, the boolean switches, and the two positional
 * arguments (store type and store location).
 */
public function __construct() {
	parent::__construct();

	$this->setBatchSize( 1000 );

	// name, description, short name (false = none). All of these take a value.
	$valueOptions = [
		[ 'start', 'start old_id', 's' ],
		[ 'end', 'end old_id', 'e' ],
		[ 'threshold', 'minimum size in bytes', false ],
		[ 'reporting-interval', 'show a message after this many revisions', false ],
		[ 'undo', 'filename for undo SQL', false ],
	];
	foreach ( $valueOptions as [ $optName, $optDesc, $optShort ] ) {
		$this->addOption( $optName, $optDesc, false, true, $optShort );
	}

	// Plain switches: optional, no value.
	$switches = [
		'skip-gzip' => "Don't compress individual revisions",
		'skip-resolve' => "Don't replace HistoryBlobStub objects with direct external store pointers",
		'iconv' => 'Resolve legacy character encoding',
		'dry-run' => "Don't modify any rows",
	];
	foreach ( $switches as $switchName => $switchDesc ) {
		$this->addOption( $switchName, $switchDesc );
	}

	$positional = [
		'type' => 'The external store type, e.g. "DB" or "mwstore"',
		'location' => 'e.g. "cluster12" or "global-swift"',
	];
	foreach ( $positional as $argName => $argDesc ) {
		$this->addArg( $argName, $argDesc );
	}
}
78
/**
 * Entry point: copy the command line options into the object's state, open
 * the undo log and start the move.
 *
 * The script terminates through fatalError() when compression is wanted
 * but gzdeflate() is unavailable, or when the undo log cannot be opened.
 */
public function execute() {
	$this->resolveStubs = new ResolveStubs;
	$this->esType = $this->getArg( 0 ); // e.g. "DB" or "mwstore"
	$this->esLocation = $this->getArg( 1 ); // e.g. "cluster12" or "global-swift"
	$dbw = $this->getDB( DB_PRIMARY );

	// Without --end, process everything up to the highest existing old_id.
	$maxID = $this->getOption( 'end' );
	if ( $maxID === null ) {
		$maxID = $dbw->newSelectQueryBuilder()
			->select( 'MAX(old_id)' )
			->from( 'text' )
			->caller( __METHOD__ )->fetchField();
	}
	$this->maxID = (int)$maxID;
	$this->minID = (int)$this->getOption( 'start', 1 );

	// Cast like the other numeric options: the value is later used as the
	// right-hand operand of %, where a non-numeric string is a TypeError on
	// PHP 8 and a string such as "0.5" is truthy yet means modulo by zero.
	$this->reportingInterval = (int)$this->getOption( 'reporting-interval', 100 );
	$this->threshold = (int)$this->getOption( 'threshold', 0 );

	if ( $this->getOption( 'skip-gzip' ) ) {
		$this->gzip = false;
	} elseif ( !function_exists( 'gzdeflate' ) ) {
		$this->fatalError( "gzdeflate() not found. " .
			"Please run with --skip-gzip if you don't want to compress revisions." );
	} else {
		$this->gzip = true;
	}

	$this->skipResolve = $this->getOption( 'skip-resolve' );

	// $this->legacyEncoding stays unset (null) unless --iconv is given AND
	// the wiki actually has a legacy encoding configured.
	if ( $this->getOption( 'iconv' ) ) {
		$legacyEncoding = $this->getConfig()->get( MainConfigNames::LegacyEncoding );
		if ( $legacyEncoding ) {
			$this->legacyEncoding = $legacyEncoding;
		} else {
			$this->output( "iconv requested but the wiki has no legacy encoding\n" );
		}
	}
	$this->dryRun = $this->getOption( 'dry-run', false );

	$undo = $this->getOption( 'undo' );
	try {
		$this->undoLog = new UndoLog( $undo, $dbw );
	} catch ( RuntimeException $e ) {
		$this->fatalError( "Unable to open undo log" );
	}
	// The stub resolver writes through the same undo log as this script.
	$this->resolveStubs->setUndoLog( $this->undoLog );

	$this->doMoveToExternal();
}
129
/**
 * Walk the text table in blocks of old_id values and move every eligible
 * row to external storage.
 *
 * For each row not yet flagged "external" (see getConditions()):
 *  - rows flagged "error" are skipped;
 *  - rows flagged "object" are unserialized: HistoryBlobStub rows are
 *    queued for resolveStubs() (unless --skip-resolve), HistoryBlobCurStub
 *    rows have their text copied out, ConcatenatedGzipHistoryBlob rows are
 *    stored as they are, anything else is skipped with a warning;
 *  - other rows shorter than the threshold are skipped, the rest are
 *    re-encoded and compressed.
 * The resulting text is written to the external store and the row is
 * updated, through the undo log, to hold the returned URL. In dry-run mode
 * the intended move is only printed.
 */
private function doMoveToExternal() {
	$dbr = $this->getDB( DB_REPLICA );

	$count = $this->maxID - $this->minID + 1;
	$blockSize = $this->getBatchSize();
	$numBlocks = ceil( $count / $blockSize );
	print "Moving text rows from {$this->minID} to {$this->maxID} to external storage\n";

	$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
	$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();
	$extStore = $esFactory->getStore( $this->esType );
	$numMoved = 0;
	$stubIDs = [];

	for ( $block = 0; $block < $numBlocks; $block++ ) {
		$blockStart = $block * $blockSize + $this->minID;
		// Clamp the final block: without this, a range that is not a multiple
		// of the batch size would also process rows beyond the requested end.
		$blockEnd = min( $blockStart + $blockSize - 1, $this->maxID );

		if ( $this->reportingInterval && !( $block % $this->reportingInterval ) ) {
			$this->output( "oldid=$blockStart, moved=$numMoved\n" );
			$lbFactory->waitForReplication();
		}

		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->where( $this->getConditions( $blockStart, $blockEnd, $dbr ) )
			->caller( __METHOD__ )->fetchResultSet();
		foreach ( $res as $row ) {
			$text = $row->old_text;
			$id = $row->old_id;
			$flags = SqlBlobStore::explodeFlags( $row->old_flags );
			[ $text, $flags ] = $this->resolveText( $text, $flags );

			if ( in_array( 'error', $flags ) ) {
				continue;
			} elseif ( in_array( 'object', $flags ) ) {
				// NOTE(review): unserialize() is applied to data read from the
				// text table; this assumes the table contents are trusted.
				$obj = unserialize( $text );
				if ( !is_object( $obj ) ) {
					// unserialize() gives false for corrupt data; get_class() on a
					// non-object throws a TypeError on PHP 8, so skip the row here.
					print "Warning: old_id=$id has the \"object\" flag " .
						"but did not unserialize to an object\n";
					continue;
				}
				if ( $obj instanceof HistoryBlobStub ) {
					// Handle later, after CGZ resolution
					if ( !$this->skipResolve ) {
						$stubIDs[] = $id;
					}
					continue;
				} elseif ( $obj instanceof HistoryBlobCurStub ) {
					// Copy cur text to ES
					[ $text, $flags ] = $this->resolveLegacyEncoding( $obj->getText(), [] );
					[ $text, $flags ] = $this->compress( $text, $flags );
				} elseif ( $obj instanceof ConcatenatedGzipHistoryBlob ) {
					// Store as is
				} else {
					$className = get_class( $obj );
					print "Warning: old_id=$id unrecognised object class \"$className\"\n";
					continue;
				}
			} elseif ( strlen( $text ) < $this->threshold ) {
				// Don't move small revisions
				continue;
			} else {
				[ $text, $flags ] = $this->resolveLegacyEncoding( $text, $flags );
				[ $text, $flags ] = $this->compress( $text, $flags );
			}
			$flags[] = 'external';
			$flagsString = implode( ',', $flags );

			if ( $this->dryRun ) {
				// Show the new flags and an escaped preview of the first 30 bytes.
				$this->output( "Move $id => $flagsString " .
					addcslashes( substr( $text, 0, 30 ), "\0..\x1f\x7f..\xff" ) .
					"\n"
				);
				continue;
			}

			$url = $extStore->store( $this->esLocation, $text );
			if ( !$url ) {
				$this->fatalError( "Error writing to external storage" );
			}
			// The complete row as it was read is passed along with the new
			// values; presumably UndoLog uses it as the update condition, so a
			// row changed in the meantime is left alone (hence the "affected
			// zero rows" case below) - confirm against UndoLog::update().
			$moved = $this->undoLog->update(
				'text',
				[ 'old_flags' => $flagsString, 'old_text' => $url ],
				(array)$row,
				__METHOD__
			);
			if ( $moved ) {
				$numMoved++;
			} else {
				print "Update of old_id $id failed, affected zero rows\n";
			}
		}
	}

	if ( count( $stubIDs ) ) {
		$this->resolveStubs( $stubIDs );
	}
}
225
/**
 * Deflate the text and add the "gzip" flag, unless compression is turned
 * off or the flags say the text is compressed already.
 *
 * @param string $text
 * @param string[] $flags
 * @return array Two elements: the (possibly compressed) text and the flags
 */
private function compress( $text, $flags ) {
	if ( !$this->gzip || in_array( 'gzip', $flags ) ) {
		// Nothing to do: hand the input straight back.
		return [ $text, $flags ];
	}

	$deflated = gzdeflate( $text );
	$flags[] = 'gzip';

	return [ $deflated, $flags ];
}
233
/**
 * Convert text from the wiki's legacy encoding to UTF-8 and add the
 * "utf-8" flag.
 *
 * The input is returned unchanged when no legacy encoding is in effect,
 * when the flags already contain "utf-8" or "utf8", when the text is
 * gzipped but this run is not compressing (--skip-gzip), or when
 * decompression or conversion fails. In the failure case the original
 * bytes are kept rather than replaced by an empty value.
 *
 * @param string $text
 * @param string[] $flags
 * @return array Two elements: the (possibly converted) text and the flags
 */
private function resolveLegacyEncoding( $text, $flags ) {
	if ( $this->legacyEncoding === null
		|| in_array( 'utf-8', $flags )
		|| in_array( 'utf8', $flags )
	) {
		return [ $text, $flags ];
	}

	$converted = $text;
	$newFlags = $flags;

	// First decompress the entry so we don't try to convert a binary gzip to utf-8
	if ( in_array( 'gzip', $flags ) ) {
		if ( !$this->gzip ) {
			// Presumably because the text could not be compressed again
			// afterwards; the entry is left exactly as it is.
			return [ $text, $flags ];
		}
		$converted = gzinflate( $text );
		if ( $converted === false ) {
			// Undecodable data: keep the stored bytes instead of passing
			// false on, which would end up as an empty blob.
			return [ $text, $flags ];
		}
		$newFlags = array_diff( $flags, [ 'gzip' ] );
	}

	AtEase::suppressWarnings();
	$converted = iconv( $this->legacyEncoding, 'UTF-8//IGNORE', $converted );
	AtEase::restoreWarnings();
	if ( $converted === false ) {
		// iconv() reports failure with false; do not replace the text with it.
		return [ $text, $flags ];
	}

	$newFlags[] = 'utf-8';
	return [ $converted, $newFlags ];
}
254
/**
 * Resolve the stub rows collected during the main pass.
 *
 * The IDs are fetched from the replica in chunks of the batch size and each
 * row is handed to the ResolveStubs helper together with the dry-run flag.
 * Progress is reported, followed by a wait for replication, every
 * $reportingInterval rows, and a summary line is written at the end.
 *
 * @param array $stubIDs old_id values of the HistoryBlobStub rows
 */
private function resolveStubs( $stubIDs ) {
	if ( $this->dryRun ) {
		print "Note: resolving stubs in dry run mode is expected to fail, because " .
			"the main blobs have not been moved to external storage.\n";
	}

	$replica = $this->getDB( DB_REPLICA );
	$loadBalancers = $this->getServiceContainer()->getDBLoadBalancerFactory();
	$stubCount = count( $stubIDs );
	$this->output( "Resolving $stubCount stubs\n" );

	$resolved = 0;
	$seen = 0;
	$batches = array_chunk( $stubIDs, $this->getBatchSize() );
	foreach ( $batches as $batchIDs ) {
		$rows = $replica->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->where( [ 'old_id' => $batchIDs ] )
			->caller( __METHOD__ )
			->fetchResultSet();

		foreach ( $rows as $stubRow ) {
			if ( $this->resolveStubs->resolveStub( $stubRow, $this->dryRun ) ) {
				$resolved++;
			}
			$seen++;

			$reportNow = $this->reportingInterval
				&& $seen % $this->reportingInterval == 0;
			if ( $reportNow ) {
				$this->output( "$seen stubs processed\n" );
				$loadBalancers->waitForReplication();
			}
		}
	}

	$this->output( "$resolved of $seen stubs resolved\n" );
}
285
/**
 * Build the WHERE conditions selecting the rows of one block: old_id within
 * the given inclusive range, and old_flags not containing "external".
 *
 * @param int $blockStart First old_id of the block
 * @param int $blockEnd Last old_id of the block
 * @param mixed $dbr Database handle, used to build the LIKE expression
 * @return string[] SQL condition fragments
 */
protected function getConditions( $blockStart, $blockEnd, $dbr ) {
	$idRange = 'old_id BETWEEN ' . $blockStart . ' AND ' . $blockEnd;

	// Matches any flags string that has "external" somewhere in it.
	$externalLike = $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() );
	$notExternal = 'old_flags NOT ' . $externalLike;

	return [ $idRange, $notExternal ];
}
292
/**
 * Called for each row before it is examined. This implementation passes
 * the text and flags through untouched; being protected, it is presumably
 * meant as an override point for subclasses.
 *
 * @param string $text
 * @param string[] $flags
 * @return array Two elements: the text and the flags
 */
protected function resolveText( $text, $flags ) {
	$passThrough = [ $text, $flags ];
	return $passThrough;
}
296}
297
// Record this script's class name in $maintClass, then include the file
// named by RUN_MAINTENANCE_IF_MAIN (defined outside this file; presumably
// it runs the class when the script is invoked directly).
298$maintClass = MoveToExternal::class;
299require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
Concatenated gzip (CGZ) storage. Improves compression ratio by concatenating like objects before gzipp...
To speed up conversion from 1.4 to 1.5 schema, text rows can refer to the leftover cur table as the b...
Pointer object for an item within a CGZ blob stored in the text table.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
getServiceContainer()
Returns the main service container.
getBatchSize()
Returns batch size.
getArg( $argId=0, $default=null)
Get an argument.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
A class containing constants representing the names of configuration variables.
Update a database while optionally writing SQL that reverses the update to a file.
Definition UndoLog.php:11
Service for storing and loading Content objects representing revision data blobs.
__construct()
Default constructor.
getConditions( $blockStart, $blockEnd, $dbr)
execute()
Do the actual work.
resolveText( $text, $flags)
$maintClass
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28