MediaWiki master
copyFileBackend.php
Go to the documentation of this file.
1<?php
12
13// @codeCoverageIgnoreStart
14require_once __DIR__ . '/Maintenance.php';
15// @codeCoverageIgnoreEnd
16
/** @var array|null (path sha1 => stat) Pre-computed dst stat entries from listings */
30 protected $statCache = null;
31
/**
 * Register the script description, its command-line options and the
 * default batch size.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Copy files in one backend to another.' );

	// Options that take a value: name => [ description, required ]
	$valueOptions = [
		'src' => [ 'Backend containing the source files', true ],
		'dst' => [ 'Backend where files should be copied to', true ],
		'containers' => [ 'Pipe separated list of containers', true ],
		'subdir' => [ 'Only do items in this child directory', false ],
		'ratefile' => [ 'File to check periodically for batch size', false ],
	];
	foreach ( $valueOptions as $name => $spec ) {
		$this->addOption( $name, $spec[0], $spec[1], true );
	}

	// Plain switches: name => description
	$switches = [
		'prestat' => 'Stat the destination files first (try to use listings)',
		'skiphash' => 'Skip SHA-1 sync checks for files',
		'missingonly' => 'Only copy files missing from destination listing',
		'syncviadelete' => 'Delete destination files missing from source listing',
		'utf8only' => 'Skip source files that do not have valid UTF-8 names',
	];
	foreach ( $switches as $name => $description ) {
		$this->addOption( $name, $description );
	}

	$this->setBatchSize( 50 );
}
47
/**
 * Copy the files of each requested container from the source backend to the
 * destination backend, and optionally delete destination files that are not
 * in the source listing.
 *
 * Per container this builds the list of source paths (all of them, or only
 * those missing from the destination listing with --missingonly), optionally
 * pre-builds the destination stat cache (--prestat), copies in batches, and
 * finally runs the delete pass when --syncviadelete is set.
 */
public function execute() {
	$backendGroup = $this->getServiceContainer()->getFileBackendGroup();
	$src = $backendGroup->get( $this->getOption( 'src' ) );
	$dst = $backendGroup->get( $this->getOption( 'dst' ) );
	$containers = explode( '|', $this->getOption( 'containers' ) );
	$subDir = rtrim( $this->getOption( 'subdir', '' ), '/' );
	$rateFile = $this->getOption( 'ratefile' );
	$missingOnly = $this->hasOption( 'missingonly' );

	foreach ( $containers as $container ) {
		if ( $subDir !== '' ) {
			$backendRel = "$container/$subDir";
			$this->output( "Doing container '$container', directory '$subDir'...\n" );
		} else {
			$backendRel = $container;
			$this->output( "Doing container '$container'...\n" );
		}

		if ( $missingOnly ) {
			$this->output( "\tBuilding list of missing files..." );
			$srcPathsRel = $this->getListingDiffRel( $src, $dst, $backendRel );
			$this->output( count( $srcPathsRel ) . " file(s) need to be copied.\n" );
		} else {
			$srcPathsRel = $src->getFileList( [
				'dir' => $src->getRootStoragePath() . "/$backendRel",
				'adviseStat' => true // avoid HEADs
			] );
			if ( $srcPathsRel === null ) {
				$this->fatalError( "Could not list files in $container." );
			}
		}

		// The stat cache is only consulted by filesAreSame(), which is never
		// called in --missingonly mode, so skip building it there.
		if ( $this->hasOption( 'prestat' ) && !$missingOnly ) {
			$this->output( "\tBuilding destination stat cache..." );
			$dstPathsRel = $dst->getFileList( [
				'dir' => $dst->getRootStoragePath() . "/$backendRel",
				'adviseStat' => true // avoid HEADs
			] );
			if ( $dstPathsRel === null ) {
				$this->fatalError( "Could not list files in $container." );
			}
			$this->statCache = [];
			foreach ( $dstPathsRel as $dstPathRel ) {
				$path = $dst->getRootStoragePath() . "/$backendRel/$dstPathRel";
				$this->statCache[sha1( $path )] = $dst->getFileStat( [ 'src' => $path ] );
			}
			$this->output( "done [" . count( $this->statCache ) . " file(s)]\n" );
		}

		$this->output( "\tCopying file(s)...\n" );
		$count = $this->processInBatches(
			$srcPathsRel,
			$rateFile,
			function ( array $batch ) use ( $backendRel, $src, $dst ) {
				$this->copyFileBatch( $batch, $backendRel, $src, $dst );
			}
		);
		$this->output( "\tCopied $count file(s).\n" );

		if ( $this->hasOption( 'syncviadelete' ) ) {
			$this->output( "\tBuilding list of excess destination files..." );
			// Swapping the backends yields the paths that only the destination has
			$delPathsRel = $this->getListingDiffRel( $dst, $src, $backendRel );
			$this->output( count( $delPathsRel ) . " file(s) need to be deleted.\n" );

			$this->output( "\tDeleting file(s)...\n" );
			$count = $this->processInBatches(
				$delPathsRel,
				$rateFile,
				function ( array $batch ) use ( $backendRel, $dst ) {
					$this->delFileBatch( $batch, $backendRel, $dst );
				}
			);
			$this->output( "\tDeleted $count file(s).\n" );
		}

		if ( $subDir !== '' ) {
			$this->output( "Finished container '$container', directory '$subDir'.\n" );
		} else {
			$this->output( "Finished container '$container'.\n" );
		}
	}

	$this->output( "Done.\n" );
}

/**
 * Walk a listing of relative paths and hand them to $handler in batches of
 * at most the current batch size.
 *
 * @param iterable $pathsRel Relative paths to process
 * @param string|null $rateFile Optional file that is re-read every 500 paths
 *  (including before the first one) to adjust the batch size
 * @param callable $handler Invoked with one array of relative paths per batch
 * @return int Number of paths iterated over
 */
private function processInBatches( $pathsRel, $rateFile, callable $handler ) {
	$count = 0;
	$batchPaths = [];
	foreach ( $pathsRel as $pathRel ) {
		// Check up on the rate file periodically to adjust the concurrency
		if ( $rateFile && ( $count % 500 ) === 0 ) {
			$this->updateBatchSizeFromRateFile( $rateFile );
		}
		$batchPaths[$pathRel] = 1; // keyed by path to remove duplicates
		if ( count( $batchPaths ) >= $this->getBatchSize() ) {
			$handler( array_keys( $batchPaths ) );
			$batchPaths = [];
		}
		++$count;
	}
	if ( $batchPaths ) { // left-overs
		$handler( array_keys( $batchPaths ) );
	}

	return $count;
}

/**
 * Set the batch size from the integer stored in the rate file.
 *
 * Values below 1 are clamped to 1. If the file cannot be read, the current
 * batch size is kept and the problem is reported instead of silently
 * dropping to a batch size of 1.
 *
 * @param string $rateFile Path of the rate file
 */
private function updateBatchSizeFromRateFile( $rateFile ) {
	$contents = is_readable( $rateFile ) ? file_get_contents( $rateFile ) : false;
	if ( $contents === false ) {
		$this->error( "\tCould not read rate file '$rateFile'; " .
			"batch size stays at {$this->getBatchSize()}." );
		return;
	}
	$this->setBatchSize( max( 1, (int)$contents ) );
	$this->output( "\tBatch size is now {$this->getBatchSize()}.\n" );
}
156
/**
 * List the relative paths found under a directory of $src that are not in
 * the listing of the same directory of $dst.
 *
 * @param FileBackend $src Backend whose listing supplies the candidates
 * @param FileBackend $dst Backend whose listing is subtracted
 * @param string $backendRel Directory path relative to each backend root
 * @return array Relative paths present only in the $src listing
 */
protected function getListingDiffRel( FileBackend $src, FileBackend $dst, $backendRel ) {
	$srcListing = $src->getFileList( [
		'dir' => $src->getRootStoragePath() . "/$backendRel"
	] );
	if ( $srcListing === null ) {
		$this->fatalError( "Could not list files in source container." );
	}

	$dstListing = $dst->getFileList( [
		'dir' => $dst->getRootStoragePath() . "/$backendRel"
	] );
	if ( $dstListing === null ) {
		$this->fatalError( "Could not list files in destination container." );
	}

	// Index what the destination has, keyed by the SHA-1 of each relative path
	$seenInDst = [];
	foreach ( $dstListing as $relPath ) {
		$seenInDst[sha1( $relPath )] = 1;
	}
	unset( $dstListing ); // release the listing

	// Keep every source path whose hash is not in the destination index
	$onlyInSrc = [];
	foreach ( $srcListing as $relPath ) {
		if ( isset( $seenInDst[sha1( $relPath )] ) ) {
			continue;
		}
		$onlyInSrc[] = $relPath;
	}
	unset( $srcListing ); // release the listing

	return $onlyInSrc;
}
191
/**
 * Copy one batch of files from $src to the same relative location in $dst.
 *
 * Files are skipped (with a message) when --utf8only rejects their path, when
 * filesAreSame() reports them as already copied (not checked in --missingonly
 * mode), or when no usable local reference can be obtained. The remaining
 * files are stored in one doQuickOperations() call, retried once after a
 * 10 second pause; a second failure is fatal.
 *
 * @param array $srcPathsRel Paths relative to $backendRel
 * @param string $backendRel Directory path relative to each backend root
 * @param FileBackend $src Backend to read from
 * @param FileBackend $dst Backend to write to
 */
protected function copyFileBatch(
	array $srcPathsRel, $backendRel, FileBackend $src, FileBackend $dst
) {
	$domainId = $src->getDomainId();
	$missingOnly = $this->hasOption( 'missingonly' );

	// In --missingonly mode every file will be needed, so fetch them all at once
	$localRefs = [];
	if ( $missingOnly ) {
		$absSrcPaths = [];
		foreach ( $srcPathsRel as $relPath ) {
			$absSrcPaths[] = $src->getRootStoragePath() . "/$backendRel/$relPath";
		}
		$startedAt = microtime( true );
		$localRefs = $src->getLocalReferenceMulti( [ 'srcs' => $absSrcPaths, 'latest' => 1 ] );
		$elapsedMs = floor( ( microtime( true ) - $startedAt ) * 1000 );
		$this->output( "\n\tDownloaded these file(s) [{$elapsedMs}ms]:\n\t" .
			implode( "\n\t", $absSrcPaths ) . "\n\n" );
	}

	// Work out which files actually have to be stored
	$storeOps = [];
	$storedRel = []; // reported in the summary message
	foreach ( $srcPathsRel as $relPath ) {
		$from = $src->getRootStoragePath() . "/$backendRel/$relPath";
		$to = $dst->getRootStoragePath() . "/$backendRel/$relPath";

		if ( $this->hasOption( 'utf8only' ) && !mb_check_encoding( $from, 'UTF-8' ) ) {
			$this->error( "$domainId: Detected illegal (non-UTF8) path for $from." );
			continue;
		}
		if ( !$missingOnly && $this->filesAreSame( $src, $dst, $from, $to ) ) {
			$this->output( "\tAlready have $relPath.\n" );
			continue; // treated as already copied
		}

		// Reuse the prefetched reference when there is one
		if ( array_key_exists( $from, $localRefs ) ) {
			$localFile = $localRefs[$from];
		} else {
			$localFile = $src->getLocalReference( [ 'src' => $from, 'latest' => 1 ] );
		}

		if ( !$localFile ) {
			$src->clearCache( [ $from ] );
			if ( $src->fileExists( [ 'src' => $from, 'latest' => 1 ] ) === false ) {
				$this->error( "$domainId: File '$from' was listed but does not exist." );
			} else {
				$this->error( "$domainId: Could not get local copy of $from." );
			}
			continue;
		}
		if ( !$localFile->exists() ) {
			// FSFileBackends just return the path for getLocalReference() and paths
			// with illegal slashes may get normalized to a different path, so the
			// local reference may not exist. Skip such broken files.
			$this->error( "$domainId: Detected possible illegal path for $from." );
			continue;
		}
		$localRefs[] = $localFile; // keep TempFSFile objects alive as needed

		// Note: prepare() is usually fast for key/value backends
		$prepared = $dst->prepare( [ 'dir' => dirname( $to ), 'bypassReadOnly' => true ] );
		if ( !$prepared->isOK() ) {
			$this->error( $prepared );
			$this->fatalError( "$domainId: Could not copy $from to $to." );
		}

		$storeOps[] = [
			'op' => 'store',
			'src' => $localFile->getPath(),
			'dst' => $to,
			'overwrite' => true
		];
		$storedRel[] = $relPath;
	}

	// Store the whole batch, retrying once after a pause
	$startedAt = microtime( true );
	$status = $dst->doQuickOperations( $storeOps, [ 'bypassReadOnly' => true ] );
	if ( !$status->isOK() ) {
		sleep( 10 );
		$status = $dst->doQuickOperations( $storeOps, [ 'bypassReadOnly' => true ] );
	}
	$elapsedMs = floor( ( microtime( true ) - $startedAt ) * 1000 );

	if ( !$status->isOK() ) {
		$this->error( $status );
		$this->fatalError( "$domainId: Could not copy file batch." );
		return;
	}
	if ( $storedRel !== [] ) {
		$this->output( "\n\tCopied these file(s) [{$elapsedMs}ms]:\n\t" .
			implode( "\n\t", $storedRel ) . "\n\n" );
	}
}
279
/**
 * Delete one batch of files from the destination backend.
 *
 * The deletions are issued in one doQuickOperations() call, retried once
 * after a 10 second pause; a second failure is fatal.
 *
 * @param array $dstPathsRel Paths relative to $backendRel
 * @param string $backendRel Directory path relative to the backend root
 * @param FileBackend $dst Backend to delete from
 */
protected function delFileBatch(
	array $dstPathsRel, $backendRel, FileBackend $dst
) {
	$domainId = $dst->getDomainId();

	// One delete operation per destination file
	$deleteOps = [];
	foreach ( $dstPathsRel as $relPath ) {
		$deleteOps[] = [
			'op' => 'delete',
			'src' => $dst->getRootStoragePath() . "/$backendRel/$relPath"
		];
	}

	// Delete the batch of destination files, retrying once after a pause
	$startedAt = microtime( true );
	$status = $dst->doQuickOperations( $deleteOps, [ 'bypassReadOnly' => true ] );
	if ( !$status->isOK() ) {
		sleep( 10 );
		$status = $dst->doQuickOperations( $deleteOps, [ 'bypassReadOnly' => true ] );
	}
	$elapsedMs = floor( ( microtime( true ) - $startedAt ) * 1000 );

	if ( !$status->isOK() ) {
		$this->error( $status );
		$this->fatalError( "$domainId: Could not delete file batch." );
		return;
	}
	if ( $dstPathsRel !== [] ) {
		$this->output( "\n\tDeleted these file(s) [{$elapsedMs}ms]:\n\t" .
			implode( "\n\t", $dstPathsRel ) . "\n\n" );
	}
}
316
/**
 * Decide whether the destination already holds the same file as the source.
 *
 * Both files must have stat info with equal sizes. After that the first
 * applicable check decides: MD5 values from the stat info, modification
 * times (--skiphash), or a SHA-1 comparison of both files.
 *
 * @param FileBackend $src Source backend
 * @param FileBackend $dst Destination backend
 * @param string $sPath Source storage path
 * @param string $dPath Destination storage path
 * @return bool True if the file can be treated as already copied
 */
protected function filesAreSame( FileBackend $src, FileBackend $dst, $sPath, $dPath ) {
	$skipHash = $this->hasOption( 'skiphash' );
	$srcStat = $src->getFileStat( [ 'src' => $sPath ] );
	if ( $this->statCache !== null ) {
		// All dst files are already in the stat cache (keyed by path SHA-1);
		// a path without an entry is treated as having no stat info
		$dstStat = $this->statCache[sha1( $dPath )] ?? false;
	} else {
		$dstStat = $dst->getFileStat( [ 'src' => $dPath ] );
	}
	// Initial fast checks to see if files are obviously different
	$sameFast = (
		is_array( $srcStat )
		&& is_array( $dstStat ) // dest exists
		&& $srcStat['size'] === $dstStat['size']
	);
	// More thorough checks against files
	if ( !$sameFast ) {
		$same = false; // no need to look farther
	} elseif ( isset( $srcStat['md5'] ) && isset( $dstStat['md5'] ) ) {
		// If MD5 was already in the stat info, just use it.
		// This is useful as many objects stores can return this in object listing,
		// so we can use it to avoid slow per-file HEADs.
		$same = ( $srcStat['md5'] === $dstStat['md5'] );
	} elseif ( $skipHash ) {
		// This mode is good for copying to a backup location or resyncing clone
		// backends in FileBackendMultiWrite (since they get writes second, they have
		// higher timestamps). However, when copying the other way, this hits loads of
		// false positives (possibly 100%) and wastes a bunch of time on GETs/PUTs.
		// @phan-suppress-next-line PhanTypeArraySuspiciousNullable
		$same = ( $srcStat['mtime'] <= $dstStat['mtime'] );
	} else {
		// This is the slowest method which does many per-file HEADs (unless an object
		// store tracks SHA-1 in listings).
		$srcSha1 = $src->getFileSha1Base36( [ 'src' => $sPath, 'latest' => 1 ] );
		$dstSha1 = $dst->getFileSha1Base36( [ 'src' => $dPath, 'latest' => 1 ] );
		// A hash lookup that fails on both sides would compare as equal (the
		// FileBackend API reports failure with a non-string value), which would
		// skip the copy. Only an actual hash string may count as a match.
		$same = ( is_string( $srcSha1 ) && $srcSha1 === $dstSha1 );
	}

	return $same;
}
364}
365
366// @codeCoverageIgnoreStart
367$maintClass = CopyFileBackend::class;
368require_once RUN_MAINTENANCE_IF_MAIN;
369// @codeCoverageIgnoreEnd
Copy all files in one container of one backend to another.
__construct()
Default constructor.
filesAreSame(FileBackend $src, FileBackend $dst, $sPath, $dPath)
array|null $statCache
(path sha1 => stat) Pre-computed dst stat entries from listings
delFileBatch(array $dstPathsRel, $backendRel, FileBackend $dst)
execute()
Do the actual work.
getListingDiffRel(FileBackend $src, FileBackend $dst, $backendRel)
copyFileBatch(array $srcPathsRel, $backendRel, FileBackend $src, FileBackend $dst)
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
getBatchSize()
Returns batch size.
output( $out, $channel=null)
Throw some output to the user.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
hasOption( $name)
Checks to see if a particular option was set.
getOption( $name, $default=null)
Get an option, or return the default.
error( $err, $die=0)
Throw an error to the user.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Base class for all file backend classes (including multi-write backends).
prepare(array $params)
Prepare a storage directory for usage.
getFileList(array $params)
Get an iterator to list all stored files under a storage directory.
getLocalReferenceMulti(array $params)
Like getLocalReference() except it takes an array of storage paths and yields an order-preserved map ...
getFileStat(array $params)
Get quick information about a file at a storage path in the backend.
fileExists(array $params)
Check if a file exists at a storage path in the backend.
getFileSha1Base36(array $params)
Get a SHA-1 hash of the content of the file at a storage path in the backend.
getRootStoragePath()
Get the root storage path of this backend.
getDomainId()
Get the domain identifier used for this backend (possibly empty).
getLocalReference(array $params)
Returns a file system file, identical in content to the file at a storage path.
doQuickOperations(array $ops, array $opts=[])
Perform a set of independent file operations on some files.
clearCache(?array $paths=null)
Invalidate any in-process file stat and property cache.