MediaWiki master
dumpIterator.php
Go to the documentation of this file.
1<?php
37
38// @codeCoverageIgnoreStart
39require_once __DIR__ . '/Maintenance.php';
40// @codeCoverageIgnoreEnd
41
/**
 * Base class for iterating over a dump.
 *
 * Feeds every revision of an XML dump read from stdin (--dump -), or the
 * single revision built from a text file (--file), to processRevision().
 * Child classes implement processRevision() and may override checkOptions()
 * and conclusions().
 */
abstract class DumpIterator extends Maintenance {
	/** @var int Revisions seen so far; restarted at 1 once the --from title is reached */
	private $count = 0;

	/** @var float|null Value of microtime( true ) taken right before the dump is read */
	private $startTime;

	/** @var string|false|null Title to start processing at; null or false means "do not skip" */
	private $from;

	/**
	 * Default constructor: registers the description and the file/dump/from options.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Does something with a dump' );
		$this->addOption( 'file', 'File with text to run.', false, true );
		$this->addOption( 'dump', 'XML dump to execute all revisions.', false, true );
		$this->addOption( 'from', 'Article from XML dump to start from.', false, true );
	}

	/**
	 * Do the actual work.
	 *
	 * Exactly one of --file and --dump must be given. With --file, a single
	 * revision is built from the file and handed to handleRevision(). With
	 * --dump, the dump is read from stdin, every revision goes through
	 * handleRevision(), and timing and memory statistics are reported at the end.
	 */
	public function execute() {
		// The two modes are mutually exclusive, and one of them is mandatory.
		if ( !( $this->hasOption( 'file' ) xor $this->hasOption( 'dump' ) ) ) {
			$this->fatalError( "You must provide a file or dump" );
		}

		$this->checkOptions();

		if ( $this->hasOption( 'file' ) ) {
			$file = $this->getOption( 'file' );

			$text = file_get_contents( $file );
			if ( $text === false ) {
				// Do not go on and build content out of boolean false.
				$this->fatalError( "Unable to read file '$file'" );
			}

			// The page title is the URL-decoded file name without its .txt suffix.
			$title = Title::newFromText( rawurldecode( basename( $file, '.txt' ) ) );
			if ( !$title ) {
				// NOTE(review): assumes Title::newFromText() yields null for an invalid
				// title, as the null check in handleRevision() suggests.
				$this->fatalError( "Cannot derive a valid title from file name '$file'" );
			}

			$revision = new WikiRevision();
			$revision->setTitle( $title );
			$content = ContentHandler::makeContent( $text, $title );
			$revision->setContent( SlotRecord::MAIN, $content );

			// Nothing to skip in single-file mode.
			$this->from = false;
			$this->handleRevision( $revision );

			return;
		}

		$this->startTime = microtime( true );

		if ( $this->getOption( 'dump' ) === '-' ) {
			$source = new ImportStreamSource( $this->getStdin() );
		} else {
			$this->fatalError( "Sorry, I don't support dump filenames yet. "
				. "Use - and provide it on stdin on the meantime." );
		}

		$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );

		$importer = $this->getServiceContainer()
			->getWikiImporterFactory()
			->getWikiImporter( $source, new UltimateAuthority( $user ) );

		// Revisions are routed to handleRevision(); importer notices are echoed.
		$importer->setRevisionCallback(
			[ $this, 'handleRevision' ] );
		$importer->setNoticeCallback( static function ( $msg, $params ) {
			echo wfMessage( $msg, $params )->text() . "\n";
		} );

		// null when --from was not given, i.e. start with the first revision.
		$this->from = $this->getOption( 'from', null );
		$this->count = 0;
		$importer->doImport();

		$this->conclusions();

		$delta = microtime( true ) - $this->startTime;
		$this->error( "Done {$this->count} revisions in " . round( $delta, 2 ) . " seconds " );
		if ( $delta > 0 ) {
			$this->error( round( $this->count / $delta, 2 ) . " pages/sec" );
		}

		# Perform the memory_get_peak_usage() when all the other data has been
		# output so there's no damage if it dies. It is only available since
		# 5.2.0 (since 5.2.1 if you haven't compiled with --enable-memory-limit)
		$this->error( "Memory peak usage of " . memory_get_peak_usage() . " bytes\n" );
	}

	/**
	 * Handle some last-minute setup here.
	 *
	 * When the script reports Maintenance::DB_NONE as its DB type, interwiki
	 * prefix loading is short-circuited via disableInterwikis(), database
	 * messages are switched off and the localisation cache uses LCStoreNull.
	 *
	 * @param SettingsBuilder $settingsBuilder
	 */
	public function finalSetup( SettingsBuilder $settingsBuilder ) {
		parent::finalSetup( $settingsBuilder );

		if ( $this->getDbType() == Maintenance::DB_NONE ) {
			// TODO: Allow hooks to be registered via SettingsBuilder as well!
			// This matches the idea of unifying SettingsBuilder with ExtensionRegistry.
			// phpcs:disable MediaWiki.Usage.DeprecatedGlobalVariables.Deprecated$wgHooks
			global $wgHooks;
			$wgHooks['InterwikiLoadPrefix'][] = 'DumpIterator::disableInterwikis';

			$settingsBuilder->putConfigValues( [
				MainConfigNames::UseDatabaseMessages => false,
				MainConfigNames::LocalisationCacheConf => [ 'storeClass' => LCStoreNull::class ],
			] );
		}
	}

	/**
	 * Handler registered for the InterwikiLoadPrefix hook by finalSetup().
	 *
	 * @param string $prefix Unused
	 * @param mixed &$data Unused
	 * @return bool Always false
	 */
	public static function disableInterwikis( $prefix, &$data ) {
		# Title::newFromText will check on each namespaced article if it's an interwiki.
		# We always answer that it is not.

		return false;
	}

	/**
	 * Callback function for each revision, child classes should override
	 * processRevision instead.
	 *
	 * Revisions without a title are reported and dropped. While a start title
	 * is pending, revisions are counted but not processed until that title shows up.
	 *
	 * @param WikiRevision $rev
	 */
	public function handleRevision( $rev ) {
		$title = $rev->getTitle();
		if ( !$title ) {
			$this->error( "Got bogus revision with null title!" );

			return;
		}

		$this->count++;
		// Both null (no --from given, or start title already reached) and false
		// (single-file mode) mean there is nothing to skip. Testing for false
		// alone made null act as a start title that never matches, so every
		// revision was silently dropped.
		if ( $this->from !== null && $this->from !== false ) {
			// Compare as strings so numeric-looking titles are not juggled.
			if ( $this->from !== (string)$title ) {
				return;
			}
			$this->output( "Skipped " . ( $this->count - 1 ) . " pages\n" );

			$this->count = 1;
			$this->from = null;
		}

		$this->processRevision( $rev );
	}

	/**
	 * Stub function for processing additional options.
	 */
	public function checkOptions() {
	}

	/**
	 * Stub function for giving data about what was computed.
	 */
	public function conclusions() {
	}

	/**
	 * Core function which does whatever the maintenance script is designed to do.
	 *
	 * @param WikiRevision $rev
	 */
	abstract public function processRevision( WikiRevision $rev );
}
192
199
/**
 * Default constructor.
 *
 * Sets the script description and registers the 'regex' option on top of
 * whatever the parent constructor sets up.
 */
public function __construct() {
	parent::__construct();

	$description = 'Runs a regex in the revisions from a dump';
	$this->addDescription( $description );

	// 'regex' is required (third argument) and takes a value (fourth argument).
	$regexHelp = 'Searching regex';
	$this->addOption( 'regex', $regexHelp, true, true );
}
205
/**
 * Reports the DB access type of this script.
 *
 * @return mixed The Maintenance::DB_NONE constant
 */
public function getDbType() {
	$dbType = Maintenance::DB_NONE;

	return $dbType;
}
209
/**
 * Prints the title and timestamp of a revision whose search-index text
 * matches the pattern given in the 'regex' option.
 *
 * @param WikiRevision $rev Revision to test
 */
public function processRevision( WikiRevision $rev ) {
	$pattern = $this->getOption( 'regex' );
	$searchText = $rev->getContent()->getTextForSearchIndex();

	// Nothing to report unless the pattern matched.
	if ( !preg_match( $pattern, $searchText ) ) {
		return;
	}

	$title = $rev->getTitle();
	$line = $title . " matches at edit from " . $rev->getTimestamp() . "\n";
	$this->output( $line );
}
215}
216
217// @codeCoverageIgnoreStart
// Names SearchDump as the maintenance class, then includes the file behind
// RUN_MAINTENANCE_IF_MAIN (presumably the runner that executes it - TODO confirm).
218$maintClass = SearchDump::class;
219require_once RUN_MAINTENANCE_IF_MAIN;
220// @codeCoverageIgnoreEnd
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
array $params
The job parameters.
Base class for iterating over a dump.
static disableInterwikis( $prefix, &$data)
__construct()
Default constructor.
execute()
Do the actual work.
processRevision(WikiRevision $rev)
Core function which does whatever the maintenance script is designed to do.
conclusions()
Stub function for giving data about what was computed.
checkOptions()
Stub function for processing additional options.
handleRevision( $rev)
Callback function for each revision, child classes should override processRevision instead.
finalSetup(SettingsBuilder $settingsBuilder)
Handle some last-minute setup here.
Imports an XML dump from a file (either from file upload, files on disk, or HTTP)
A content handler knows how to deal with a specific type of content on a wiki page.
A class containing constants representing the names of configuration variables.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
getDbType()
Does the script need different DB access? By default, we give Maintenance scripts normal rights to th...
output( $out, $channel=null)
Throw some output to the user.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
hasOption( $name)
Checks to see if a particular option was set.
getOption( $name, $default=null)
Get an option, or return the default.
error( $err, $die=0)
Throw an error to the user.
getServiceContainer()
Returns the main service container.
getStdin( $len=null)
Return input from stdin.
addDescription( $text)
Set the description text.
Represents an authority that has all permissions.
Value object representing a content slot associated with a page revision.
Builder class for constructing a Config object from a set of sources during bootstrap.
putConfigValues(array $values)
Sets the value of multiple config variables.
Represents a title within MediaWiki.
Definition Title.php:78
User class for the MediaWiki software.
Definition User.php:119
Maintenance script that runs a regex in the revisions from a dump.
__construct()
Default constructor.
processRevision(WikiRevision $rev)
Core function which does whatever the maintenance script is designed to do.
getDbType()
Does the script need different DB access? By default, we give Maintenance scripts normal rights to th...
Represents a revision, log entry or upload during the import process.
getContent( $role=SlotRecord::MAIN)
$wgHooks
Config variable stub for the Hooks setting, for use by phpdoc and IDEs.
$maintClass
$source