Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
55.20% covered (warning)
55.20%
308 / 558
27.45% covered (danger)
27.45%
14 / 51
CRAP
0.00% covered (danger)
0.00%
0 / 1
WikiImporter
55.20% covered (warning)
55.20%
308 / 558
27.45% covered (danger)
27.45%
14 / 51
3578.71
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
25 / 25
100.00% covered (success)
100.00%
1 / 1
2
 getReader
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 throwXmlError
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 debug
50.00% covered (danger)
50.00%
1 / 2
0.00% covered (danger)
0.00%
0 / 1
2.50
 warn
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 notice
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 setDebug
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 setNoUpdates
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 setPageOffset
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 setNoticeCallback
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 setPageCallback
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 setPageOutCallback
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 setRevisionCallback
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 setUploadCallback
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 setLogItemCallback
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 setSiteInfoCallback
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 setImportTitleFactory
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 setTargetNamespace
0.00% covered (danger)
0.00%
0 / 21
0.00% covered (danger)
0.00%
0 / 1
20
 setTargetRootPage
0.00% covered (danger)
0.00%
0 / 29
0.00% covered (danger)
0.00%
0 / 1
56
 setImageBasePath
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 setImportUploads
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 setUsernamePrefix
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 disableStatisticsUpdate
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 beforeImportPage
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 importRevision
17.65% covered (danger)
17.65%
3 / 17
0.00% covered (danger)
0.00%
0 / 1
8.03
 importLogItem
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 importUpload
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 finishImportPage
72.22% covered (warning)
72.22%
13 / 18
0.00% covered (danger)
0.00%
0 / 1
5.54
 siteInfoCallback
33.33% covered (danger)
33.33%
2 / 6
0.00% covered (danger)
0.00%
0 / 1
3.19
 pageCallback
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 pageOutCallback
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 revisionCallback
83.33% covered (warning)
83.33%
5 / 6
0.00% covered (danger)
0.00%
0 / 1
2.02
 logItemCallback
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
6
 nodeAttribute
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 nodeContents
80.00% covered (warning)
80.00%
8 / 10
0.00% covered (danger)
0.00%
0 / 1
7.39
 doImport
79.17% covered (warning)
79.17%
38 / 48
0.00% covered (danger)
0.00%
0 / 1
17.03
 handleSiteInfo
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
6
 handleLogItem
0.00% covered (danger)
0.00%
0 / 17
0.00% covered (danger)
0.00%
0 / 1
72
 processLogItem
0.00% covered (danger)
0.00%
0 / 22
0.00% covered (danger)
0.00%
0 / 1
72
 handlePage
93.02% covered (success)
93.02%
40 / 43
0.00% covered (danger)
0.00%
0 / 1
17.10
 handleRevision
100.00% covered (success)
100.00%
24 / 24
100.00% covered (success)
100.00%
1 / 1
11
 handleContent
88.24% covered (warning)
88.24%
15 / 17
0.00% covered (danger)
0.00%
0 / 1
8.10
 makeContent
80.00% covered (warning)
80.00%
20 / 25
0.00% covered (danger)
0.00%
0 / 1
6.29
 processRevision
86.21% covered (warning)
86.21%
25 / 29
0.00% covered (danger)
0.00%
0 / 1
9.21
 handleUpload
0.00% covered (danger)
0.00%
0 / 31
0.00% covered (danger)
0.00%
0 / 1
240
 dumpTemp
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
2
 processUpload
0.00% covered (danger)
0.00%
0 / 29
0.00% covered (danger)
0.00%
0 / 1
42
 handleContributor
92.31% covered (success)
92.31%
12 / 13
0.00% covered (danger)
0.00%
0 / 1
6.02
 processTitle
65.22% covered (warning)
65.22%
15 / 23
0.00% covered (danger)
0.00%
0 / 1
7.51
 openReader
38.89% covered (danger)
38.89%
7 / 18
0.00% covered (danger)
0.00%
0 / 1
7.65
 syntaxCheckXML
93.33% covered (success)
93.33%
14 / 15
0.00% covered (danger)
0.00%
0 / 1
4.00
1<?php
2/**
3 * MediaWiki page data importer.
4 *
5 * Copyright © 2003,2005 Brooke Vibber <bvibber@wikimedia.org>
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup SpecialPage
25 */
26
27use MediaWiki\Cache\CacheKeyHelper;
28use MediaWiki\Config\Config;
29use MediaWiki\Content\IContentHandlerFactory;
30use MediaWiki\Deferred\DeferredUpdates;
31use MediaWiki\Deferred\SiteStatsUpdate;
32use MediaWiki\HookContainer\HookContainer;
33use MediaWiki\HookContainer\HookRunner;
34use MediaWiki\MainConfigNames;
35use MediaWiki\Page\PageIdentity;
36use MediaWiki\Page\WikiPageFactory;
37use MediaWiki\Permissions\Authority;
38use MediaWiki\Revision\SlotRecord;
39use MediaWiki\Revision\SlotRoleRegistry;
40use MediaWiki\Status\Status;
41use MediaWiki\Title\ForeignTitle;
42use MediaWiki\Title\ImportTitleFactory;
43use MediaWiki\Title\NaiveForeignTitleFactory;
44use MediaWiki\Title\NaiveImportTitleFactory;
45use MediaWiki\Title\NamespaceAwareForeignTitleFactory;
46use MediaWiki\Title\NamespaceImportTitleFactory;
47use MediaWiki\Title\NamespaceInfo;
48use MediaWiki\Title\SubpageImportTitleFactory;
49use MediaWiki\Title\Title;
50use MediaWiki\Title\TitleFactory;
51use MediaWiki\User\ExternalUserNames;
52use Wikimedia\AtEase\AtEase;
53use Wikimedia\NormalizedException\NormalizedException;
54
55/**
56 * XML file reader for the page data importer.
57 *
58 * implements Special:Import
59 * @ingroup SpecialPage
60 */
61class WikiImporter {
62    /** @var XMLReader|null */
63    private $reader;
64
65    /** @var string */
66    private $sourceAdapterId;
67
68    /** @var array|null */
69    private $foreignNamespaces = null;
70
71    /** @var callable */
72    private $mLogItemCallback;
73
74    /** @var callable */
75    private $mUploadCallback;
76
77    /** @var callable|null */
78    private $mRevisionCallback;
79
80    /** @var callable|null */
81    private $mPageCallback;
82
83    /** @var callable|null */
84    private $mSiteInfoCallback;
85
86    /** @var callable|null */
87    private $mPageOutCallback;
88
89    /** @var callable|null */
90    private $mNoticeCallback;
91
92    /** @var bool|null */
93    private $mDebug;
94
95    /** @var bool|null */
96    private $mImportUploads;
97
98    /** @var string|null */
99    private $mImageBasePath;
100
101    /** @var bool */
102    private $mNoUpdates = false;
103
104    /** @var int */
105    private $pageOffset = 0;
106
107    private ImportTitleFactory $importTitleFactory;
108    private ExternalUserNames $externalUserNames;
109
110    /** @var array */
111    private $countableCache = [];
112
113    /** @var bool */
114    private $disableStatisticsUpdate = false;
115
116    /**
117     * Authority used for permission checks only (to ensure that the user performing the import is
118     * allowed to edit the pages they're importing). To skip the checks, use UltimateAuthority.
119     *
120     * If you want to also log the import actions, see ImportReporter.
121     */
122    private Authority $performer;
123
124    private Config $config;
125    private HookRunner $hookRunner;
126    private Language $contentLanguage;
127    private NamespaceInfo $namespaceInfo;
128    private TitleFactory $titleFactory;
129    private WikiPageFactory $wikiPageFactory;
130    private UploadRevisionImporter $uploadRevisionImporter;
131    private IContentHandlerFactory $contentHandlerFactory;
132    private SlotRoleRegistry $slotRoleRegistry;
133
134    /**
135     * Creates a WikiImporter drawing from the source provided
136     */
public function __construct(
    ImportSource $source,
    Authority $performer,
    Config $config,
    HookContainer $hookContainer,
    Language $contentLanguage,
    NamespaceInfo $namespaceInfo,
    TitleFactory $titleFactory,
    WikiPageFactory $wikiPageFactory,
    UploadRevisionImporter $uploadRevisionImporter,
    IContentHandlerFactory $contentHandlerFactory,
    SlotRoleRegistry $slotRoleRegistry
) {
    // Context of the import
    $this->performer = $performer;
    $this->config = $config;
    $this->contentLanguage = $contentLanguage;

    // Injected services; the hook container is wrapped in a HookRunner
    $this->hookRunner = new HookRunner( $hookContainer );
    $this->namespaceInfo = $namespaceInfo;
    $this->titleFactory = $titleFactory;
    $this->wikiPageFactory = $wikiPageFactory;
    $this->uploadRevisionImporter = $uploadRevisionImporter;
    $this->contentHandlerFactory = $contentHandlerFactory;
    $this->slotRoleRegistry = $slotRoleRegistry;

    // Register the 'uploadsource' stream wrapper unless it is already registered
    $wrapperKnown = in_array( 'uploadsource', stream_get_wrappers() );
    if ( !$wrapperKnown ) {
        stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
    }

    // The source must be registered before the reader is opened
    $this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );
    $this->openReader();

    // Default callbacks: the importer's own handlers
    $this->setPageCallback( [ $this, 'beforeImportPage' ] );
    $this->setPageOutCallback( [ $this, 'finishImportPage' ] );
    $this->setRevisionCallback( [ $this, 'importRevision' ] );
    $this->setUploadCallback( [ $this, 'importUpload' ] );
    $this->setLogItemCallback( [ $this, 'importLogItem' ] );

    // Defaults that callers can replace through setImportTitleFactory() /
    // setTargetNamespace() / setTargetRootPage() and setUsernamePrefix()
    $this->importTitleFactory = new NaiveImportTitleFactory(
        $this->contentLanguage,
        $this->namespaceInfo,
        $this->titleFactory
    );
    $this->externalUserNames = new ExternalUserNames( 'imported', false );
}
182
183    /**
184     * @return null|XMLReader
185     */
public function getReader() {
    // Plain accessor; the property is declared as XMLReader|null
    return $this->reader;
}
189
190    /**
191     * @param string $err
192     */
public function throwXmlError( $err ) {
    // Despite the method name, nothing is thrown here: the error is only
    // logged, once through debug() (debug mode only) and once unconditionally.
    $this->debug( 'FAILURE: ' . $err );
    wfDebug( 'WikiImporter XML error: ' . $err );
}
197
198    /**
199     * @param string $data
200     */
public function debug( $data ) {
    // Silent unless debug mode was switched on via setDebug()
    if ( !$this->mDebug ) {
        return;
    }
    wfDebug( 'IMPORT: ' . $data );
}
206
207    /**
208     * @param string $data
209     */
public function warn( $data ) {
    // Unlike debug(), this logs regardless of the debug flag
    wfDebug( 'IMPORT: ' . $data );
}
213
214    /**
215     * @param string $msg
216     * @param mixed ...$params
217     */
public function notice( $msg, ...$params ) {
    if ( !is_callable( $this->mNoticeCallback ) ) {
        // No ImportReporter -> CLI
        // T177997: the command line importers should call setNoticeCallback()
        // for their own custom callback to echo the notice
        wfDebug( wfMessage( $msg, $params )->text() );
        return;
    }

    // The callback receives the message key and the parameters as ONE array argument
    call_user_func( $this->mNoticeCallback, $msg, $params );
}
227
228    /**
229     * Set debug mode...
230     * @param bool $debug
231     */
public function setDebug( $debug ) {
    // Read by debug() to decide whether to log
    $this->mDebug = $debug;
}
235
236    /**
237     * Set 'no updates' mode. In this mode, the link tables will not be updated by the importer
238     * @param bool $noupdates
239     */
public function setNoUpdates( $noupdates ) {
    // Stored only; the flag is consumed elsewhere in the class
    $this->mNoUpdates = $noupdates;
}
243
244    /**
245     * Sets 'pageOffset' value. So it will skip the first n-1 pages
246     * and start from the nth page. It's 1-based indexing.
247     * @param int $nthPage
248     * @since 1.29
249     */
public function setPageOffset( $nthPage ) {
    // doImport() skips top-level nodes until this many <page> tags have been seen;
    // a falsy value (the default 0) disables skipping
    $this->pageOffset = $nthPage;
}
253
254    /**
255     * Set a callback that displays notice messages
256     *
257     * @param callable $callback
258     * @return callable
259     */
public function setNoticeCallback( $callback ) {
    // Delegates to the global wfSetVar() helper, which is defined outside this file.
    // NOTE(review): unlike the sibling set*Callback() methods, which assign directly,
    // wfSetVar() may treat a null $callback specially - confirm before relying on it.
    $previous = wfSetVar( $this->mNoticeCallback, $callback );
    return $previous;
}
263
264    /**
265     * Sets the action to perform as each new page in the stream is reached.
266     * @param callable|null $callback
267     * @return callable|null
268     */
public function setPageCallback( $callback ) {
    // Swap in the new callback and hand the old one back so callers can restore it
    $oldCallback = $this->mPageCallback;
    $this->mPageCallback = $callback;

    return $oldCallback;
}
274
275    /**
276     * Sets the action to perform as each page in the stream is completed.
277     * Callback accepts the page title (as a Title object), a second object
278     * with the original title form (in case it's been overridden into a
279     * local namespace), and a count of revisions.
280     *
281     * @param callable|null $callback
282     * @return callable|null
283     */
public function setPageOutCallback( $callback ) {
    // Swap in the new callback and hand the old one back so callers can restore it
    $oldCallback = $this->mPageOutCallback;
    $this->mPageOutCallback = $callback;

    return $oldCallback;
}
289
290    /**
291     * Sets the action to perform as each page revision is reached.
292     * @param callable|null $callback
293     * @return callable|null
294     */
public function setRevisionCallback( $callback ) {
    // Swap in the new callback and hand the old one back so callers can restore it
    $oldCallback = $this->mRevisionCallback;
    $this->mRevisionCallback = $callback;

    return $oldCallback;
}
300
301    /**
302     * Sets the action to perform as each file upload version is reached.
303     * @param callable $callback
304     * @return callable
305     */
public function setUploadCallback( $callback ) {
    // Swap in the new callback and hand the old one back so callers can restore it
    $oldCallback = $this->mUploadCallback;
    $this->mUploadCallback = $callback;

    return $oldCallback;
}
311
312    /**
313     * Sets the action to perform as each log item is reached.
314     * @param callable $callback
315     * @return callable
316     */
public function setLogItemCallback( $callback ) {
    // Swap in the new callback and hand the old one back so callers can restore it
    $oldCallback = $this->mLogItemCallback;
    $this->mLogItemCallback = $callback;

    return $oldCallback;
}
322
323    /**
324     * Sets the action to perform when site info is encountered
325     * @param callable $callback
326     * @return callable
327     */
public function setSiteInfoCallback( $callback ) {
    // No default is installed by the constructor, so the returned previous
    // value is null until a callback has been set
    $oldCallback = $this->mSiteInfoCallback;
    $this->mSiteInfoCallback = $callback;

    return $oldCallback;
}
333
334    /**
335     * Sets the factory object to use to convert ForeignTitle objects into local
336     * Title objects
337     * @param ImportTitleFactory $factory
338     */
public function setImportTitleFactory( $factory ) {
    // Replaces the factory installed by the constructor or by
    // setTargetNamespace() / setTargetRootPage()
    $this->importTitleFactory = $factory;
}
342
343    /**
344     * Set a target namespace to override the defaults
345     * @param null|int $namespace
346     * @return bool
347     */
public function setTargetNamespace( $namespace ) {
    if ( $namespace === null ) {
        // Don't override namespaces
        $naiveFactory = new NaiveImportTitleFactory(
            $this->contentLanguage,
            $this->namespaceInfo,
            $this->titleFactory
        );
        $this->setImportTitleFactory( $naiveFactory );
        return true;
    }

    // Reject negative namespaces before asking whether the namespace exists
    if ( !( $namespace >= 0 ) ) {
        return false;
    }

    $namespaceId = intval( $namespace );
    if ( !$this->namespaceInfo->exists( $namespaceId ) ) {
        return false;
    }

    // Valid target: every imported page is mapped into this namespace
    $namespaceFactory = new NamespaceImportTitleFactory(
        $this->namespaceInfo,
        $this->titleFactory,
        $namespaceId
    );
    $this->setImportTitleFactory( $namespaceFactory );
    return true;
}
376
377    /**
378     * Set a target root page under which all pages are imported
379     * @param null|string $rootpage
380     * @return Status
381     */
public function setTargetRootPage( $rootpage ) {
    $status = Status::newGood();
    $nsInfo = $this->namespaceInfo;

    if ( $rootpage === null ) {
        // No rootpage
        $this->setImportTitleFactory(
            new NaiveImportTitleFactory( $this->contentLanguage, $nsInfo, $this->titleFactory )
        );
        return $status;
    }

    if ( $rootpage === '' ) {
        // An empty string leaves the current title factory untouched
        return $status;
    }

    $rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
    $rootTitle = Title::newFromText( $rootpage );
    if ( !$rootTitle || $rootTitle->isExternal() ) {
        $status->fatal( 'import-rootpage-invalid' );
        return $status;
    }

    $rootNamespace = $rootTitle->getNamespace();
    if ( !$nsInfo->hasSubpages( $rootNamespace ) ) {
        // The main namespace has no name of its own, so use the placeholder message
        if ( $rootNamespace === NS_MAIN ) {
            $displayNSText = wfMessage( 'blanknamespace' )->text();
        } else {
            $displayNSText = $this->contentLanguage->getNsText( $rootNamespace );
        }
        $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
        return $status;
    }

    // set namespace to 'all', so the namespace check in processTitle() can pass
    $this->setTargetNamespace( null );
    $this->setImportTitleFactory(
        new SubpageImportTitleFactory( $nsInfo, $this->titleFactory, $rootTitle )
    );
    return $status;
}
419
420    /**
421     * @param string $dir
422     */
public function setImageBasePath( $dir ) {
    // Stored only; the path is consumed elsewhere in the class
    $this->mImageBasePath = $dir;
}
426
427    /**
428     * @param bool $import
429     */
public function setImportUploads( $import ) {
    // Stored only; the flag is consumed elsewhere in the class
    $this->mImportUploads = $import;
}
433
434    /**
435     * @since 1.31
436     * @param string $usernamePrefix Prefix to apply to unknown (and possibly also known) usernames
437     * @param bool $assignKnownUsers Whether to apply the prefix to usernames that exist locally
438     */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
    // Replaces the constructor's default of ExternalUserNames( 'imported', false )
    $this->externalUserNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
}
442
443    /**
444     * Disable the statistics update, which can take a lot of time
445     * @since 1.29
446     */
public function disableStatisticsUpdate() {
    // One-way switch: finishImportPage() skips its article-count adjustment once set
    $this->disableStatisticsUpdate = true;
}
450
451    /**
452     * Default per-page callback. Sets up some things related to site statistics
453     * @param array $titleAndForeignTitle Two-element array, with Title object at
454     * index 0 and ForeignTitle object at index 1
455     * @return bool
456     */
public function beforeImportPage( $titleAndForeignTitle ) {
    $title = $titleAndForeignTitle[0];
    $page = $this->wikiPageFactory->newFromTitle( $title );

    // Remember whether the page was countable BEFORE the import, so that
    // finishImportPage() can adjust the article count by the difference.
    // The key must be built exactly as finishImportPage() builds it
    // ('title_' . CacheKeyHelper::getKeyForPage()); a key based on the prefixed
    // text would not match that lookup and the adjustment would be skipped.
    $countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
    $this->countableCache[$countKey] = $page->isCountable();

    return true;
}
463
464    /**
465     * Default per-revision callback, performs the import.
466     * @param WikiRevision $revision
467     * @return bool
468     */
public function importRevision( $revision ) {
    // Message key of the failure to report; stays null only on the success path,
    // which returns from inside the try block
    $failureMessage = null;

    if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
        // The content model is not allowed on the target title
        $failureMessage = 'import-error-bad-location';
    } else {
        try {
            return $revision->importOldRevision();
        } catch ( MWContentSerializationException $ex ) {
            // The revision content could not be unserialized
            $failureMessage = 'import-error-unserialize';
        }
    }

    $this->notice(
        $failureMessage,
        $revision->getTitle()->getPrefixedText(),
        $revision->getID(),
        $revision->getModel(),
        $revision->getFormat()
    );

    return false;
}
494
495    /**
496     * Default per-log-item callback, performs the import.
497     * @param WikiRevision $revision
498     * @return bool
499     */
public function importLogItem( $revision ) {
    // The WikiRevision object carries the log entry and knows how to store it
    return $revision->importLogItem();
}
503
504    /**
505     * Default per-upload callback, performs the import via the UploadRevisionImporter.
506     * @param WikiRevision $revision
507     * @return bool
508     */
public function importUpload( $revision ) {
    // Delegate to the injected importer and reduce its status to a plain bool
    return $this->uploadRevisionImporter->import( $revision )->isGood();
}
513
514    /**
515     * Mostly for hook use
516     * @param PageIdentity $pageIdentity
517     * @param ForeignTitle $foreignTitle
518     * @param int $revCount
519     * @param int $sRevCount
520     * @param array $pageInfo
521     * @return bool
522     */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
    $sRevCount, $pageInfo
) {
    // Update article count statistics (T42009)
    // The normal counting logic in WikiPage->doEditUpdates() is designed for
    // one-revision-at-a-time editing, not bulk imports. In this situation it
    // suffers from issues of replica DB lag. We let WikiPage handle the total page
    // and revision count, and we implement our own custom logic for the
    // article (content page) count.
    if ( !$this->disableStatisticsUpdate ) {
        $wikiPage = $this->wikiPageFactory->newFromTitle( $pageIdentity );
        $wikiPage->loadPageData( IDBAccessObject::READ_LATEST );

        if ( $wikiPage->getRevisionRecord() === null ) {
            wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
                ' because WikiPage::getRevisionRecord() returned null' );
        } else {
            $preparedUpdate = $wikiPage->newPageUpdater( $this->performer )->prepareUpdate();
            $countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
            $countableNow = $preparedUpdate->isCountable();

            // Only adjust when a pre-import value was cached and the state flipped
            $hasCachedState = array_key_exists( $countKey, $this->countableCache );
            if ( $hasCachedState && $countableNow != $this->countableCache[$countKey] ) {
                // +1 when the page became countable, -1 when it stopped being countable
                $articleDelta = (int)$countableNow - (int)$this->countableCache[$countKey];
                DeferredUpdates::addUpdate(
                    SiteStatsUpdate::factory( [ 'articles' => ( $articleDelta ) ] )
                );
            }
        }
    }

    // The hook is run whether or not statistics were updated
    $title = Title::newFromPageIdentity( $pageIdentity );
    return $this->hookRunner->onAfterImportPage(
        $title,
        $foreignTitle,
        $revCount,
        $sRevCount,
        $pageInfo
    );
}
558
559    /**
560     * Notify the callback function of site info
561     * @param array $siteInfo
562     * @return mixed|false
563     */
private function siteInfoCallback( $siteInfo ) {
    // No callback registered: report false
    if ( !isset( $this->mSiteInfoCallback ) ) {
        return false;
    }

    // The callback gets the site info array plus this importer
    return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
574
575    /**
576     * Notify the callback function when a new "<page>" is reached.
577     * @param array $title
578     */
public function pageCallback( $title ) {
    if ( !isset( $this->mPageCallback ) ) {
        return;
    }

    // The callback's return value is deliberately not propagated
    call_user_func_array( $this->mPageCallback, [ $title ] );
}
584
585    /**
586     * Notify the callback function when a "</page>" is closed.
587     * @param PageIdentity $pageIdentity
588     * @param ForeignTitle $foreignTitle
589     * @param int $revCount
590     * @param int $sucCount Number of revisions for which callback returned true
591     * @param array $pageInfo Associative array of page information
592     */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
    $sucCount, $pageInfo
) {
    if ( !isset( $this->mPageOutCallback ) ) {
        return;
    }

    // Forward every argument that was passed in, unchanged and in order
    call_user_func( $this->mPageOutCallback, ...func_get_args() );
}
599
600    /**
601     * Notify the callback function of a revision
602     * @param WikiRevision $revision
603     * @return bool|mixed
604     */
private function revisionCallback( $revision ) {
    // No callback registered: report false
    if ( !isset( $this->mRevisionCallback ) ) {
        return false;
    }

    // The callback gets the revision plus this importer
    return call_user_func( $this->mRevisionCallback, $revision, $this );
}
615
616    /**
617     * Notify the callback function of a new log item
618     * @param WikiRevision $revision
619     * @return mixed|false
620     */
private function logItemCallback( $revision ) {
    // No callback registered: report false
    if ( !isset( $this->mLogItemCallback ) ) {
        return false;
    }

    // The callback gets the log item (a WikiRevision) plus this importer
    return call_user_func( $this->mLogItemCallback, $revision, $this );
}
631
632    /**
633     * Retrieves the contents of the named attribute of the current element.
634     * @param string $attr The name of the attribute
635     * @return string The value of the attribute or an empty string if it is not set in the current
636     * element.
637     */
public function nodeAttribute( $attr ) {
    $value = $this->reader->getAttribute( $attr );

    // Normalise a null result to an empty string so callers always get a string
    return $value === null ? '' : $value;
}
641
642    /**
643     * Shouldn't something like this be built-in to XMLReader?
644     * Fetches text contents of the current element, assuming
645     * no sub-elements or such scary things.
646     * @return string
647     * @internal
648     */
public function nodeContents() {
    if ( $this->reader->isEmptyElement ) {
        return '';
    }

    // Node types whose value is appended to the result
    $textNodeTypes = [
        XMLReader::TEXT,
        XMLReader::CDATA,
        XMLReader::SIGNIFICANT_WHITESPACE,
    ];

    $text = '';
    while ( $this->reader->read() ) {
        $nodeType = $this->reader->nodeType;

        // The first end tag encountered terminates the element
        if ( $nodeType === XMLReader::END_ELEMENT ) {
            return $text;
        }
        if ( in_array( $nodeType, $textNodeTypes, true ) ) {
            $text .= $this->reader->value;
        }
    }

    // read() failed before an end tag was seen: close the reader and
    // discard whatever text had been collected
    $this->reader->close();
    return '';
}
669
670    /**
671     * Primary entry point
672     * @throws Exception
673     * @return bool
674     */
public function doImport() {
    $this->syntaxCheckXML();

    // Calls to reader->read need to be wrapped in calls to
    // libxml_disable_entity_loader() to avoid local file
    // inclusion attacks (T48932).
    // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
    $oldDisable = @libxml_disable_entity_loader( true );
    try {
        $this->reader->read();

        // The document root must be <mediawiki>
        if ( $this->reader->localName != 'mediawiki' ) {
            // phpcs:ignore Generic.PHP.NoSilencedErrors
            @libxml_disable_entity_loader( $oldDisable );
            $xmlError = libxml_get_last_error();
            if ( !$xmlError ) {
                throw new UnexpectedValueException(
                    "Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
                );
            }
            throw new NormalizedException( "XML error at line {line}: {message}", [
                'line' => $xmlError->line,
                'message' => $xmlError->message,
            ] );
        }
        $this->debug( "<mediawiki> tag is correct." );

        $this->debug( "Starting primary dump processing loop." );

        $hasNode = $this->reader->read();
        $pagesSeen = 0;
        while ( $hasNode ) {
            $tag = $this->reader->localName;

            // Honour setPageOffset(): step over whole top-level nodes until
            // the requested <page> has been reached
            if ( $this->pageOffset ) {
                if ( $tag === 'page' ) {
                    $pagesSeen++;
                }
                if ( $pagesSeen < $this->pageOffset ) {
                    $hasNode = $this->reader->next();
                    continue;
                }
            }

            // Captured before the hook runs, as the hook is handed this importer
            $type = $this->reader->nodeType;

            // True when the current node should be stepped over with next()
            $skipNode = false;

            // A hook handler returning false suppresses the built-in handling
            if ( $this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
                if ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
                    break;
                }

                switch ( $tag ) {
                    case 'siteinfo':
                        $this->handleSiteInfo();
                        break;
                    case 'page':
                        $this->handlePage();
                        break;
                    case 'logitem':
                        $this->handleLogItem();
                        break;
                    case '#text':
                        // Text between top-level elements is ignored
                        break;
                    default:
                        $this->warn( "Unhandled top-level XML tag $tag" );
                        $skipNode = true;
                }
            }

            if ( $skipNode ) {
                $hasNode = $this->reader->next();
                $this->debug( "Skip" );
            } else {
                $hasNode = $this->reader->read();
            }
        }
    } finally {
        // Always restore the previous entity-loader setting and release the reader
        // phpcs:ignore Generic.PHP.NoSilencedErrors
        @libxml_disable_entity_loader( $oldDisable );
        $this->reader->close();
    }

    return true;
}
753
/**
 * Reads nodes up to the closing "</siteinfo>" tag, collecting the simple fields
 * and the foreign namespace names, then passes the result to siteInfoCallback().
 */
private function handleSiteInfo() {
    $this->debug( "Enter site info handler." );
    $siteInfo = [];

    while ( $this->reader->read() ) {
        $tag = $this->reader->localName;

        if ( $tag == 'siteinfo' && $this->reader->nodeType == XMLReader::END_ELEMENT ) {
            break;
        }

        switch ( $tag ) {
            case 'namespace':
                // Read the attribute first: nodeContents() moves the reader forward
                $namespaceKey = $this->nodeAttribute( 'key' );
                $this->foreignNamespaces[$namespaceKey] = $this->nodeContents();
                break;

            // Fields that can just be stuffed in the siteInfo object
            case 'sitename':
            case 'base':
            case 'generator':
            case 'case':
                $siteInfo[$tag] = $this->nodeContents();
                break;
        }
    }

    $siteInfo['_namespaces'] = $this->foreignNamespaces;
    $this->siteInfoCallback( $siteInfo );
}
780
781    private function handleLogItem() {
782        $this->debug( "Enter log item handler." );
783        $logInfo = [];
784
785        // Fields that can just be stuffed in the pageInfo object
786        $normalFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
787            'logtitle', 'params' ];
788
789        while ( $this->reader->read() ) {
790            if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
791                    $this->reader->localName == 'logitem' ) {
792                break;
793            }
794
795            $tag = $this->reader->localName;
796
797            if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
798                // Do nothing
799            } elseif ( in_array( $tag, $normalFields ) ) {
800                $logInfo[$tag] = $this->nodeContents();
801            } elseif ( $tag == 'contributor' ) {
802                $logInfo['contributor'] = $this->handleContributor();
803            } elseif ( $tag != '#text' ) {
804                $this->warn( "Unhandled log-item XML tag $tag" );
805            }
806        }
807
808        $this->processLogItem( $logInfo );
809    }
810
811    /**
812     * @param array $logInfo
813     * @return mixed|false
814     */
815    private function processLogItem( $logInfo ) {
816        $revision = new WikiRevision();
817
818        if ( isset( $logInfo['id'] ) ) {
819            $revision->setID( $logInfo['id'] );
820        }
821        $revision->setType( $logInfo['type'] );
822        $revision->setAction( $logInfo['action'] );
823        if ( isset( $logInfo['timestamp'] ) ) {
824            $revision->setTimestamp( $logInfo['timestamp'] );
825        }
826        if ( isset( $logInfo['params'] ) ) {
827            $revision->setParams( $logInfo['params'] );
828        }
829        if ( isset( $logInfo['logtitle'] ) ) {
830            // @todo Using Title for non-local titles is a recipe for disaster.
831            // We should use ForeignTitle here instead.
832            $revision->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
833        }
834
835        $revision->setNoUpdates( $this->mNoUpdates );
836
837        if ( isset( $logInfo['comment'] ) ) {
838            $revision->setComment( $logInfo['comment'] );
839        }
840
841        if ( isset( $logInfo['contributor']['username'] ) ) {
842            $revision->setUsername(
843                $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] )
844            );
845        } elseif ( isset( $logInfo['contributor']['ip'] ) ) {
846            $revision->setUserIP( $logInfo['contributor']['ip'] );
847        } else {
848            $revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
849        }
850
851        return $this->logItemCallback( $revision );
852    }
853
854    private function handlePage() {
855        // Handle page data.
856        $this->debug( "Enter page handler." );
857        $pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];
858
859        // Fields that can just be stuffed in the pageInfo object
860        $normalFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];
861
862        $skip = false;
863        $badTitle = false;
864
865        while ( $skip ? $this->reader->next() : $this->reader->read() ) {
866            if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
867                    $this->reader->localName == 'page' ) {
868                break;
869            }
870
871            $skip = false;
872
873            $tag = $this->reader->localName;
874
875            if ( $badTitle ) {
876                // The title is invalid, bail out of this page
877                $skip = true;
878            } elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
879                // Do nothing
880            } elseif ( in_array( $tag, $normalFields ) ) {
881                // An XML snippet:
882                // <page>
883                //     <id>123</id>
884                //     <title>Page</title>
885                //     <redirect title="NewTitle"/>
886                //     ...
887                // Because the redirect tag is built differently, we need special handling for that case.
888                if ( $tag == 'redirect' ) {
889                    $pageInfo[$tag] = $this->nodeAttribute( 'title' );
890                } else {
891                    $pageInfo[$tag] = $this->nodeContents();
892                }
893            } elseif ( $tag == 'revision' || $tag == 'upload' ) {
894                if ( !isset( $title ) ) {
895                    $title = $this->processTitle( $pageInfo['title'],
896                        $pageInfo['ns'] ?? null );
897
898                    // $title is either an array of two titles or false.
899                    if ( is_array( $title ) ) {
900                        $this->pageCallback( $title );
901                        [ $pageInfo['_title'], $foreignTitle ] = $title;
902                    } else {
903                        $badTitle = true;
904                        $skip = true;
905                    }
906                }
907
908                if ( $title ) {
909                    if ( $tag == 'revision' ) {
910                        $this->handleRevision( $pageInfo );
911                    } else {
912                        $this->handleUpload( $pageInfo );
913                    }
914                }
915            } elseif ( $tag != '#text' ) {
916                $this->warn( "Unhandled page XML tag $tag" );
917                $skip = true;
918            }
919        }
920
921        // @note $pageInfo is only set if a valid $title is processed above with
922        //       no error. If we have a valid $title, then pageCallback is called
923        //       above, $pageInfo['title'] is set and we do pageOutCallback here.
924        //       If $pageInfo['_title'] is not set, then $foreignTitle is also not
925        //       set since they both come from $title above.
926        if ( array_key_exists( '_title', $pageInfo ) ) {
927            /** @var Title $title */
928            $title = $pageInfo['_title'];
929            $this->pageOutCallback(
930                $title,
931                // @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
932                $foreignTitle,
933                $pageInfo['revisionCount'],
934                $pageInfo['successfulRevisionCount'],
935                $pageInfo
936            );
937        }
938    }
939
940    /**
941     * @param array &$pageInfo
942     */
943    private function handleRevision( &$pageInfo ) {
944        $this->debug( "Enter revision handler" );
945        $revisionInfo = [];
946
947        $normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
948            'model', 'format', 'text', 'sha1' ];
949
950        $skip = false;
951
952        while ( $skip ? $this->reader->next() : $this->reader->read() ) {
953            if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
954                    $this->reader->localName == 'revision' ) {
955                break;
956            }
957
958            $tag = $this->reader->localName;
959
960            if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
961                $this, $pageInfo, $revisionInfo )
962            ) {
963                // Do nothing
964            } elseif ( in_array( $tag, $normalFields ) ) {
965                $revisionInfo[$tag] = $this->nodeContents();
966            } elseif ( $tag == 'content' ) {
967                // We can have multiple content tags, so make this an array.
968                $revisionInfo[$tag][] = $this->handleContent();
969            } elseif ( $tag == 'contributor' ) {
970                $revisionInfo['contributor'] = $this->handleContributor();
971            } elseif ( $tag != '#text' ) {
972                $this->warn( "Unhandled revision XML tag $tag" );
973                $skip = true;
974            }
975        }
976
977        $pageInfo['revisionCount']++;
978        if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
979            $pageInfo['successfulRevisionCount']++;
980        }
981    }
982
983    private function handleContent() {
984        $this->debug( "Enter content handler" );
985        $contentInfo = [];
986
987        $normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];
988
989        $skip = false;
990
991        while ( $skip ? $this->reader->next() : $this->reader->read() ) {
992            if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
993                $this->reader->localName == 'content' ) {
994                break;
995            }
996
997            $tag = $this->reader->localName;
998
999            if ( !$this->hookRunner->onImportHandleContentXMLTag(
1000                $this, $contentInfo )
1001            ) {
1002                // Do nothing
1003            } elseif ( in_array( $tag, $normalFields ) ) {
1004                $contentInfo[$tag] = $this->nodeContents();
1005            } elseif ( $tag != '#text' ) {
1006                $this->warn( "Unhandled content XML tag $tag" );
1007                $skip = true;
1008            }
1009        }
1010
1011        return $contentInfo;
1012    }
1013
1014    /**
1015     * @param PageIdentity $page
1016     * @param int $revisionId
1017     * @param array $contentInfo
1018     *
1019     * @return Content
1020     */
1021    private function makeContent( PageIdentity $page, $revisionId, $contentInfo ) {
1022        $maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );
1023
1024        if ( !isset( $contentInfo['text'] ) ) {
1025            throw new InvalidArgumentException( 'Missing text field in import.' );
1026        }
1027
1028        // Make sure revisions won't violate $wgMaxArticleSize, which could lead to
1029        // database errors and instability. Testing for revisions with only listed
1030        // content models, as other content models might use serialization formats
1031        // which aren't checked against $wgMaxArticleSize.
1032        if ( ( !isset( $contentInfo['model'] ) ||
1033                in_array( $contentInfo['model'], [
1034                    'wikitext',
1035                    'css',
1036                    'json',
1037                    'javascript',
1038                    'text',
1039                    ''
1040                ] ) ) &&
1041            strlen( $contentInfo['text'] ) > $maxArticleSize * 1024
1042        ) {
1043            throw new RuntimeException( 'The text of ' .
1044                ( $revisionId ?
1045                    "the revision with ID $revisionId" :
1046                    'a revision'
1047                ) . " exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
1048        }
1049
1050        $role = $contentInfo['role'] ?? SlotRecord::MAIN;
1051        $model = $contentInfo['model'] ?? $this->slotRoleRegistry
1052            ->getRoleHandler( $role )
1053            ->getDefaultModel( $page );
1054        $handler = $this->contentHandlerFactory->getContentHandler( $model );
1055
1056        $text = $handler->importTransform( $contentInfo['text'] );
1057
1058        return $handler->unserializeContent( $text );
1059    }
1060
1061    /**
1062     * @param array $pageInfo
1063     * @param array $revisionInfo
1064     * @return mixed|false
1065     */
1066    private function processRevision( $pageInfo, $revisionInfo ) {
1067        $revision = new WikiRevision();
1068
1069        $revId = $revisionInfo['id'] ?? 0;
1070        if ( $revId ) {
1071            $revision->setID( $revisionInfo['id'] );
1072        }
1073
1074        $title = $pageInfo['_title'];
1075        $revision->setTitle( $title );
1076
1077        $content = $this->makeContent( $title, $revId, $revisionInfo );
1078        $revision->setContent( SlotRecord::MAIN, $content );
1079
1080        foreach ( $revisionInfo['content'] ?? [] as $slotInfo ) {
1081            if ( !isset( $slotInfo['role'] ) ) {
1082                throw new RuntimeException( "Missing role for imported slot." );
1083            }
1084
1085            $content = $this->makeContent( $title, $revId, $slotInfo );
1086            $revision->setContent( $slotInfo['role'], $content );
1087        }
1088        $revision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );
1089
1090        if ( isset( $revisionInfo['comment'] ) ) {
1091            $revision->setComment( $revisionInfo['comment'] );
1092        }
1093
1094        if ( isset( $revisionInfo['minor'] ) ) {
1095            $revision->setMinor( true );
1096        }
1097        if ( isset( $revisionInfo['contributor']['username'] ) ) {
1098            $revision->setUsername(
1099                $this->externalUserNames->applyPrefix( $revisionInfo['contributor']['username'] )
1100            );
1101        } elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
1102            $revision->setUserIP( $revisionInfo['contributor']['ip'] );
1103        } else {
1104            $revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
1105        }
1106        if ( isset( $revisionInfo['sha1'] ) ) {
1107           &n