Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
55.20% |
308 / 558 |
|
27.45% |
14 / 51 |
CRAP | |
0.00% |
0 / 1 |
WikiImporter | |
55.20% |
308 / 558 |
|
27.45% |
14 / 51 |
3578.71 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
25 / 25 |
|
100.00% |
1 / 1 |
2 | |||
getReader | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
throwXmlError | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
debug | |
50.00% |
1 / 2 |
|
0.00% |
0 / 1 |
2.50 | |||
warn | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
notice | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
setDebug | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setNoUpdates | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setPageOffset | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setNoticeCallback | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setPageCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setPageOutCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setRevisionCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setUploadCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setLogItemCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setSiteInfoCallback | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
setImportTitleFactory | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setTargetNamespace | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
20 | |||
setTargetRootPage | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
56 | |||
setImageBasePath | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setImportUploads | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setUsernamePrefix | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
disableStatisticsUpdate | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
beforeImportPage | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
importRevision | |
17.65% |
3 / 17 |
|
0.00% |
0 / 1 |
8.03 | |||
importLogItem | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
importUpload | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
finishImportPage | |
72.22% |
13 / 18 |
|
0.00% |
0 / 1 |
5.54 | |||
siteInfoCallback | |
33.33% |
2 / 6 |
|
0.00% |
0 / 1 |
3.19 | |||
pageCallback | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
pageOutCallback | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
revisionCallback | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
2.02 | |||
logItemCallback | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
nodeAttribute | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
nodeContents | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
7.39 | |||
doImport | |
79.17% |
38 / 48 |
|
0.00% |
0 / 1 |
17.03 | |||
handleSiteInfo | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
6 | |||
handleLogItem | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
72 | |||
processLogItem | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
72 | |||
handlePage | |
93.02% |
40 / 43 |
|
0.00% |
0 / 1 |
17.10 | |||
handleRevision | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
11 | |||
handleContent | |
88.24% |
15 / 17 |
|
0.00% |
0 / 1 |
8.10 | |||
makeContent | |
80.00% |
20 / 25 |
|
0.00% |
0 / 1 |
6.29 | |||
processRevision | |
86.21% |
25 / 29 |
|
0.00% |
0 / 1 |
9.21 | |||
handleUpload | |
0.00% |
0 / 31 |
|
0.00% |
0 / 1 |
240 | |||
dumpTemp | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
processUpload | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
42 | |||
handleContributor | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
6.02 | |||
processTitle | |
65.22% |
15 / 23 |
|
0.00% |
0 / 1 |
7.51 | |||
openReader | |
38.89% |
7 / 18 |
|
0.00% |
0 / 1 |
7.65 | |||
syntaxCheckXML | |
93.33% |
14 / 15 |
|
0.00% |
0 / 1 |
4.00 |
1 | <?php |
2 | /** |
3 | * MediaWiki page data importer. |
4 | * |
5 | * Copyright © 2003,2005 Brooke Vibber <bvibber@wikimedia.org> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup SpecialPage |
25 | */ |
26 | |
27 | use MediaWiki\Cache\CacheKeyHelper; |
28 | use MediaWiki\Config\Config; |
29 | use MediaWiki\Content\IContentHandlerFactory; |
30 | use MediaWiki\Deferred\DeferredUpdates; |
31 | use MediaWiki\Deferred\SiteStatsUpdate; |
32 | use MediaWiki\HookContainer\HookContainer; |
33 | use MediaWiki\HookContainer\HookRunner; |
34 | use MediaWiki\MainConfigNames; |
35 | use MediaWiki\Page\PageIdentity; |
36 | use MediaWiki\Page\WikiPageFactory; |
37 | use MediaWiki\Permissions\Authority; |
38 | use MediaWiki\Revision\SlotRecord; |
39 | use MediaWiki\Revision\SlotRoleRegistry; |
40 | use MediaWiki\Status\Status; |
41 | use MediaWiki\Title\ForeignTitle; |
42 | use MediaWiki\Title\ImportTitleFactory; |
43 | use MediaWiki\Title\NaiveForeignTitleFactory; |
44 | use MediaWiki\Title\NaiveImportTitleFactory; |
45 | use MediaWiki\Title\NamespaceAwareForeignTitleFactory; |
46 | use MediaWiki\Title\NamespaceImportTitleFactory; |
47 | use MediaWiki\Title\NamespaceInfo; |
48 | use MediaWiki\Title\SubpageImportTitleFactory; |
49 | use MediaWiki\Title\Title; |
50 | use MediaWiki\Title\TitleFactory; |
51 | use MediaWiki\User\ExternalUserNames; |
52 | use Wikimedia\AtEase\AtEase; |
53 | use Wikimedia\NormalizedException\NormalizedException; |
54 | |
55 | /** |
56 | * XML file reader for the page data importer. |
57 | * |
58 | * implements Special:Import |
59 | * @ingroup SpecialPage |
60 | */ |
61 | class WikiImporter { |
62 | /** @var XMLReader|null */ |
63 | private $reader; |
64 | |
65 | /** @var string */ |
66 | private $sourceAdapterId; |
67 | |
68 | /** @var array|null */ |
69 | private $foreignNamespaces = null; |
70 | |
71 | /** @var callable */ |
72 | private $mLogItemCallback; |
73 | |
74 | /** @var callable */ |
75 | private $mUploadCallback; |
76 | |
77 | /** @var callable|null */ |
78 | private $mRevisionCallback; |
79 | |
80 | /** @var callable|null */ |
81 | private $mPageCallback; |
82 | |
83 | /** @var callable|null */ |
84 | private $mSiteInfoCallback; |
85 | |
86 | /** @var callable|null */ |
87 | private $mPageOutCallback; |
88 | |
89 | /** @var callable|null */ |
90 | private $mNoticeCallback; |
91 | |
92 | /** @var bool|null */ |
93 | private $mDebug; |
94 | |
95 | /** @var bool|null */ |
96 | private $mImportUploads; |
97 | |
98 | /** @var string|null */ |
99 | private $mImageBasePath; |
100 | |
101 | /** @var bool */ |
102 | private $mNoUpdates = false; |
103 | |
104 | /** @var int */ |
105 | private $pageOffset = 0; |
106 | |
107 | private ImportTitleFactory $importTitleFactory; |
108 | private ExternalUserNames $externalUserNames; |
109 | |
110 | /** @var array */ |
111 | private $countableCache = []; |
112 | |
113 | /** @var bool */ |
114 | private $disableStatisticsUpdate = false; |
115 | |
116 | /** |
117 | * Authority used for permission checks only (to ensure that the user performing the import is |
118 | * allowed to edit the pages they're importing). To skip the checks, use UltimateAuthority. |
119 | * |
120 | * If you want to also log the import actions, see ImportReporter. |
121 | */ |
122 | private Authority $performer; |
123 | |
124 | private Config $config; |
125 | private HookRunner $hookRunner; |
126 | private Language $contentLanguage; |
127 | private NamespaceInfo $namespaceInfo; |
128 | private TitleFactory $titleFactory; |
129 | private WikiPageFactory $wikiPageFactory; |
130 | private UploadRevisionImporter $uploadRevisionImporter; |
131 | private IContentHandlerFactory $contentHandlerFactory; |
132 | private SlotRoleRegistry $slotRoleRegistry; |
133 | |
/**
 * Builds an importer that reads its XML from the given import source.
 *
 * Stores the injected services, makes sure the "uploadsource" stream wrapper
 * is registered, registers the source with UploadSourceAdapter, opens the
 * XML reader and installs the default callbacks, title factory and username
 * prefix handling.
 *
 * @param ImportSource $source Where the XML dump is read from
 * @param Authority $performer Used for permission checks
 * @param Config $config
 * @param HookContainer $hookContainer Wrapped in a HookRunner
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 */
public function __construct(
	ImportSource $source,
	Authority $performer,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->performer = $performer;
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// Register the stream wrapper only if it is not known yet.
	$knownWrappers = stream_get_wrappers();
	if ( !in_array( 'uploadsource', $knownWrappers ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	// By default titles are mapped without any namespace or root page override.
	$defaultTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->importTitleFactory = $defaultTitleFactory;
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
182 | |
/**
 * Gives access to the underlying XML reader.
 *
 * @return null|XMLReader The reader currently held by this importer, if any
 */
public function getReader() {
	return $this->reader;
}
189 | |
/**
 * Records an XML error in the debug log.
 *
 * Despite its name this method does not throw: it forwards the message to
 * debug() (which only logs in debug mode) and always writes it via wfDebug().
 *
 * @param string $err Error description
 */
public function throwXmlError( $err ) {
	$this->debug( "FAILURE: $err" );
	wfDebug( "WikiImporter XML error: $err" );
}
197 | |
/**
 * Writes a message to the debug log, but only when debug mode is enabled
 * (see setDebug()).
 *
 * @param string $data Message text
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}
	wfDebug( "IMPORT: $data" );
}
206 | |
/**
 * Writes a warning to the debug log regardless of the debug mode setting.
 *
 * @param string $data Message text
 */
public function warn( $data ) {
	wfDebug( "IMPORT: $data" );
}
213 | |
/**
 * Emits a notice, either through the registered notice callback or, when
 * none is callable, through the debug log.
 *
 * The callback receives the message key and the parameters as one array.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		# No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
227 | |
/**
 * Switches debug mode on or off. When on, debug() writes to the debug log.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
235 | |
/**
 * Enables or disables 'no updates' mode. In this mode, the link tables will
 * not be updated by the importer; the flag is handed on to the revisions
 * this importer creates.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
243 | |
/**
 * Sets the 1-based index of the first page to import: the first n-1 pages
 * of the stream are skipped and processing starts at the nth page.
 * A value of 0 (the default) disables skipping.
 *
 * @param int $nthPage
 * @since 1.29
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
253 | |
/**
 * Registers the callback that notice() uses to display notice messages.
 *
 * The assignment is delegated to wfSetVar().
 * NOTE(review): wfSetVar() is expected to keep the current value when passed
 * null, unlike the other callback setters of this class - confirm.
 *
 * @param callable $callback
 * @return callable The value returned by wfSetVar(), i.e. the previous callback
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );
	return $previous;
}
263 | |
/**
 * Replaces the action performed as each new page in the stream is reached.
 *
 * @param callable|null $callback New callback, or null for none
 * @return callable|null The callback that was installed before
 */
public function setPageCallback( $callback ) {
	$replaced = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $replaced;
}
274 | |
/**
 * Replaces the action performed as each page in the stream is completed.
 *
 * The callback accepts the page title (as a Title object), a second object
 * with the original title form (in case it's been overridden into a
 * local namespace), and a count of revisions.
 *
 * @param callable|null $callback New callback, or null for none
 * @return callable|null The callback that was installed before
 */
public function setPageOutCallback( $callback ) {
	$replaced = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $replaced;
}
289 | |
/**
 * Replaces the action performed as each page revision is reached.
 *
 * @param callable|null $callback New callback, or null for none
 * @return callable|null The callback that was installed before
 */
public function setRevisionCallback( $callback ) {
	$replaced = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $replaced;
}
300 | |
/**
 * Replaces the action performed as each file upload version is reached.
 *
 * @param callable $callback New callback
 * @return callable The callback that was installed before
 */
public function setUploadCallback( $callback ) {
	$replaced = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $replaced;
}
311 | |
/**
 * Replaces the action performed as each log item is reached.
 *
 * @param callable $callback New callback
 * @return callable The callback that was installed before
 */
public function setLogItemCallback( $callback ) {
	$replaced = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $replaced;
}
322 | |
/**
 * Replaces the action performed when site info is encountered.
 *
 * @param callable $callback New callback
 * @return callable The callback that was installed before
 */
public function setSiteInfoCallback( $callback ) {
	$replaced = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $replaced;
}
333 | |
/**
 * Installs the factory used to convert ForeignTitle objects into local
 * Title objects, replacing whichever factory was set before.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
342 | |
/**
 * Chooses the namespace all imported pages are placed in.
 *
 * Passing null restores the default behaviour (no namespace override).
 * Any other value must be non-negative and name an existing namespace.
 *
 * @param null|int $namespace
 * @return bool True if the title factory was updated, false if the
 *  namespace was rejected
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$naiveFactory = new NaiveImportTitleFactory(
			$this->contentLanguage,
			$this->namespaceInfo,
			$this->titleFactory
		);
		$this->setImportTitleFactory( $naiveFactory );
		return true;
	}

	// The existence lookup only happens for non-negative values.
	$isUsable = $namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) );
	if ( !$isUsable ) {
		return false;
	}

	$namespace = intval( $namespace );
	$namespaceFactory = new NamespaceImportTitleFactory(
		$this->namespaceInfo,
		$this->titleFactory,
		$namespace
	);
	$this->setImportTitleFactory( $namespaceFactory );
	return true;
}
376 | |
/**
 * Chooses a root page under which all pages are imported as subpages.
 *
 * - null resets the title factory to the default (no root page).
 * - An empty string changes nothing.
 * - Anything else is parsed as a title (trailing slashes removed); it must
 *   be a valid, non-external title in a namespace that supports subpages,
 *   otherwise a fatal error is added to the returned status.
 *
 * @param null|string $rootpage
 * @return Status Good unless the root page was rejected
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;

	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
		return $status;
	}

	if ( $rootpage === '' ) {
		return $status;
	}

	$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
	$title = Title::newFromText( $rootpage );

	if ( !$title || $title->isExternal() ) {
		$status->fatal( 'import-rootpage-invalid' );
		return $status;
	}

	$rootNamespace = $title->getNamespace();
	if ( !$nsInfo->hasSubpages( $rootNamespace ) ) {
		// The main namespace has no name of its own, so use the placeholder message.
		if ( $rootNamespace === NS_MAIN ) {
			$displayNSText = wfMessage( 'blanknamespace' )->text();
		} else {
			$displayNSText = $this->contentLanguage->getNsText( $rootNamespace );
		}
		$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		return $status;
	}

	// set namespace to 'all', so the namespace check in processTitle() can pass
	$this->setTargetNamespace( null );
	$this->setImportTitleFactory(
		new SubpageImportTitleFactory(
			$nsInfo,
			$this->titleFactory,
			$title
		)
	);

	return $status;
}
419 | |
/**
 * Stores the base path for images.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
426 | |
/**
 * Stores the flag that says whether uploads should be imported.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
433 | |
/**
 * Replaces the username handling with a new ExternalUserNames instance
 * built from the given settings.
 *
 * @since 1.31
 * @param string $usernamePrefix Prefix to apply to unknown (and possibly also known) usernames
 * @param bool $assignKnownUsers Whether to apply the prefix to usernames that exist locally
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $userNames;
}
442 | |
/**
 * Turns off the article count adjustment done in finishImportPage(),
 * because the statistics update can take a lot of time.
 *
 * @since 1.29
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
450 | |
/**
 * Default per-page callback. Remembers whether the page was countable
 * before the import so that finishImportPage() can adjust the site's
 * article count afterwards.
 *
 * @param array $titleAndForeignTitle Two-element array, with Title object at
 *   index 0 and ForeignTitle object at index 1
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );

	// The key has to be built exactly the way finishImportPage() builds it
	// ('title_' . CacheKeyHelper::getKeyForPage()). Keying by prefixed text
	// instead meant the lookup there never matched and the article count
	// was never adjusted.
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();

	return true;
}
463 | |
/**
 * Default per-revision callback, performs the import.
 *
 * A notice is emitted and false is returned when the revision's content
 * handler cannot be used on the target title, or when importing the
 * revision fails with a MWContentSerializationException.
 *
 * @param WikiRevision $revision
 * @return bool Result of WikiRevision::importOldRevision(), or false on
 *  one of the errors above
 */
public function importRevision( $revision ) {
	// Parameters shared by both error notices; only evaluated when a notice is sent.
	$noticeParams = static function () use ( $revision ) {
		return [
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat(),
		];
	};

	$handler = $revision->getContentHandler();
	if ( !$handler->canBeUsedOn( $revision->getTitle() ) ) {
		$this->notice( 'import-error-bad-location', ...$noticeParams() );
		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice( 'import-error-unserialize', ...$noticeParams() );
		return false;
	}
}
494 | |
/**
 * Default per-log-item callback, performs the import by delegating to
 * WikiRevision::importLogItem().
 *
 * @param WikiRevision $revision Revision object carrying the log entry data
 * @return bool
 */
public function importLogItem( $revision ) {
	$imported = $revision->importLogItem();
	return $imported;
}
503 | |
/**
 * Default per-upload callback: hands the revision to the injected
 * UploadRevisionImporter.
 *
 * @param WikiRevision $revision
 * @return bool Whether the status returned by the importer is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
513 | |
/**
 * Default page-out callback. Adjusts the article count statistics (unless
 * disabled) and then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity The imported page
 * @param ForeignTitle $foreignTitle Title of the page on the source wiki
 * @param int $revCount
 * @param int $sRevCount
 * @param array $pageInfo
 * @return bool Result of the AfterImportPage hook
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = $this->wikiPageFactory->newFromTitle( $pageIdentity );
		$page->loadPageData( IDBAccessObject::READ_LATEST );

		if ( $page->getRevisionRecord() === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$update = $page->newPageUpdater( $this->performer )->prepareUpdate();
			$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$countable = $update->isCountable();

			// Only pages whose countable state was recorded before the import
			// and has changed since then affect the article count.
			$wasRecorded = array_key_exists( $countKey, $this->countableCache );
			if ( $wasRecorded && $countable != $this->countableCache[$countKey] ) {
				$delta = (int)$countable - (int)$this->countableCache[$countKey];
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => $delta
				] ) );
			}
		}
	}

	$title = Title::newFromPageIdentity( $pageIdentity );
	return $this->hookRunner->onAfterImportPage(
		$title,
		$foreignTitle,
		$revCount,
		$sRevCount,
		$pageInfo
	);
}
558 | |
/**
 * Passes the collected site info on to the site info callback, if one is set.
 * The callback receives the site info array and this importer.
 *
 * @param array $siteInfo
 * @return mixed|false The callback's return value, or false without a callback
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
574 | |
/**
 * Invokes the page callback, if one is set, when a new "<page>" is reached.
 *
 * @param array $title Passed to the callback unchanged
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
584 | |
/**
 * Invokes the page-out callback, if one is set, when a "</page>" is closed.
 * All arguments given to this method are forwarded to the callback as-is.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount Number of revisions for which callback returned true
 * @param array $pageInfo Associative array of page information
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	call_user_func( $this->mPageOutCallback, ...func_get_args() );
}
599 | |
/**
 * Passes a revision on to the revision callback, if one is set.
 * The callback receives the revision and this importer.
 *
 * @param WikiRevision $revision
 * @return bool|mixed The callback's return value, or false without a callback
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
615 | |
/**
 * Passes a new log item on to the log item callback, if one is set.
 * The callback receives the revision object and this importer.
 *
 * @param WikiRevision $revision
 * @return mixed|false The callback's return value, or false without a callback
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
631 | |
/**
 * Reads the named attribute of the element the reader is positioned on.
 *
 * @param string $attr The name of the attribute
 * @return string The value of the attribute or an empty string if it is not set in the current
 *  element.
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );
	return $value ?? '';
}
641 | |
/**
 * Fetches the text contents of the current element, assuming it has no
 * sub-elements or such scary things.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated until the
 * next end-of-element node. If the reader runs out of input before that,
 * the reader is closed and an empty string is returned.
 *
 * @return string
 * @internal
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$text = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;

		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $text;
		}

		$isTextual = $nodeType === XMLReader::TEXT
			|| $nodeType === XMLReader::CDATA
			|| $nodeType === XMLReader::SIGNIFICANT_WHITESPACE;
		if ( $isTextual ) {
			$text .= $this->reader->value;
		}
	}

	// Ran out of input before the element was closed.
	$this->reader->close();
	return '';
}
669 | |
/**
 * Primary entry point: checks the XML syntax, verifies the root element and
 * then walks the top-level elements of the dump, dispatching to the
 * siteinfo, page and log item handlers.
 *
 * The reader is closed when this method finishes, whether normally or
 * through an exception.
 *
 * @throws Exception
 * @return bool Always true when no exception is thrown
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$previousLoaderState = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $previousLoaderState );
			$xmlError = libxml_get_last_error();
			if ( !$xmlError ) {
				throw new UnexpectedValueException(
					"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
				);
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $xmlError->line,
				'message' => $xmlError->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$hasNode = $this->reader->read();
		$pagesSeen = 0;
		while ( $hasNode ) {
			$tag = $this->reader->localName;

			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				// Jump over whole subtrees until the requested page is reached.
				if ( $pagesSeen < $this->pageOffset ) {
					$hasNode = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;

			// Set when the current node is unknown and its subtree must be skipped.
			$skipSubtree = false;

			// A hook handler returning false has taken care of the tag itself.
			if ( $this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				if ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
					break;
				} elseif ( $tag == 'siteinfo' ) {
					$this->handleSiteInfo();
				} elseif ( $tag == 'page' ) {
					$this->handlePage();
				} elseif ( $tag == 'logitem' ) {
					$this->handleLogItem();
				} elseif ( $tag != '#text' ) {
					$this->warn( "Unhandled top-level XML tag $tag" );
					$skipSubtree = true;
				}
			}

			if ( $skipSubtree ) {
				$hasNode = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$hasNode = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $previousLoaderState );
		$this->reader->close();
	}

	return true;
}
753 | |
/**
 * Reads the "<siteinfo>" element: collects the simple text fields and the
 * foreign namespace names, then notifies the site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$siteInfo = [];

	// Fields that can just be stuffed in the siteInfo object
	$normalFields = [ 'sitename', 'base', 'generator', 'case' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// Read the attribute first: nodeContents() moves the reader forward.
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $normalFields ) ) {
			$siteInfo[$tag] = $this->nodeContents();
		}
	}

	$siteInfo['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $siteInfo );
}
780 | |
/**
 * Reads a "<logitem>" element into an array and hands it to processLogItem().
 *
 * Each child node is first offered to the ImportHandleLogItemXMLTag hook;
 * when a handler returns false the node is not processed any further here.
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$logInfo = [];

	// Fields that can just be stuffed in the logInfo array
	$normalFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Handled by a hook
			continue;
		}

		if ( in_array( $tag, $normalFields ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
810 | |
/**
 * Turns the data collected for a log item into a WikiRevision object and
 * passes it to the log item callback.
 *
 * 'type' and 'action' are read unconditionally; every other field is
 * optional. The user is taken from the contributor's username (with the
 * configured prefix handling applied), else from the contributor's IP,
 * else set to a prefixed 'Unknown user'.
 *
 * @param array $logInfo
 * @return mixed|false Result of the log item callback
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}

	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );

	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}

	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}

	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logEntry->setTitle( $logTitle );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	if ( isset( $logInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$logInfo['contributor']['username']
		);
		$logEntry->setUsername( $prefixedName );
	} elseif ( isset( $logInfo['contributor']['ip'] ) ) {
		$logEntry->setUserIP( $logInfo['contributor']['ip'] );
	} else {
		$logEntry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $logEntry );
}
853 | |
/**
 * Walk the children of the current <page> element.
 *
 * Simple child tags are copied into a page info array, each <revision> and
 * <upload> child is passed to its own handler once the page title has been
 * resolved, and pageOutCallback() is invoked at the end if a usable title was
 * found.
 */
private function handlePage() {
	$this->debug( "Enter page handler." );

	// Data collected for this page; both counters are passed to pageOutCallback().
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Child tags whose value is stored in $pageInfo under the tag name
	$simpleTags = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleRejected = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atPageEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'page';
		if ( $atPageEnd ) {
			break;
		}

		$skipSubtree = false;
		$nodeName = $this->reader->localName;

		if ( $titleRejected ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
			continue;
		}

		if ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// The hook returned false: nothing more to do for this node
			continue;
		}

		if ( in_array( $nodeName, $simpleTags ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// The redirect target is carried in an attribute, the other
			// values are the element contents.
			$pageInfo[$nodeName] = $nodeName == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
			continue;
		}

		if ( $nodeName == 'revision' || $nodeName == 'upload' ) {
			if ( !isset( $resolved ) ) {
				// First revision/upload of the page: resolve the title once.
				$resolved = $this->processTitle(
					$pageInfo['title'],
					$pageInfo['ns'] ?? null
				);

				// processTitle() gives either an array of two titles or false.
				if ( is_array( $resolved ) ) {
					$this->pageCallback( $resolved );
					[ $pageInfo['_title'], $foreignTitle ] = $resolved;
				} else {
					$titleRejected = true;
					$skipSubtree = true;
				}
			}

			if ( $resolved ) {
				if ( $nodeName == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $nodeName != '#text' ) {
			$this->warn( "Unhandled page XML tag $nodeName" );
			$skipSubtree = true;
		}
	}

	// @note $pageInfo['_title'] is only set when processTitle() returned a
	// title pair above; $foreignTitle is assigned in the same statement, so
	// it is set exactly when the '_title' key exists.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		/** @var Title $localTitle */
		$localTitle = $pageInfo['_title'];
		$this->pageOutCallback(
			$localTitle,
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
939 | |
/**
 * Parse the children of the current <revision> element and hand the collected
 * data to processRevision().
 *
 * @param array &$pageInfo Page data shared with handlePage(). 'revisionCount'
 *   is incremented for every revision handled here, and
 *   'successfulRevisionCount' when processRevision() returns a truthy value.
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	// Child tags whose text content is stored under the tag name
	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// A skip request only applies to the node that triggered it. Reset
		// the flag, as handlePage() does, so that one unhandled tag does not
		// switch every later node of this revision over to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			// Step over the unknown element and everything inside it
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
982 | |
/**
 * Parse the children of the current <content> element.
 *
 * @return array Text content of the recognised child tags ('role', 'origin',
 *   'model', 'format', 'text'), keyed by tag name
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	// Child tags whose text content is stored under the tag name
	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// A skip request only applies to the node that triggered it. Reset
		// the flag, as handlePage() does, so that one unhandled tag does not
		// switch every later node of this element over to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			// Step over the unknown element and everything inside it
			$skip = true;
		}
	}

	return $contentInfo;
}
1013 | |
/**
 * Build a Content object from the text of an imported revision or slot.
 *
 * @param Title $title Passed to the role handler when a default content
 *   model has to be determined
 * @param int $revisionId Only used in the size error message; a falsy value
 *   produces a generic wording
 * @param array $contentInfo Must contain 'text'; 'model' and 'role' are
 *   optional
 *
 * @return Content
 * @throws InvalidArgumentException If $contentInfo has no 'text'
 * @throws RuntimeException If the text of a size-checked model exceeds the
 *   configured maximum article size
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new InvalidArgumentException( 'Missing text field in import.' );
	}
	$rawText = $contentInfo['text'];

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Only the listed content models (and a
	// missing model) are checked, as other content models might use
	// serialization formats which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [ 'wikitext', 'css', 'json', 'javascript', 'text', '' ];
	$isSizeChecked = !isset( $contentInfo['model'] )
		|| in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $isSizeChecked && strlen( $rawText ) > $maxArticleSize * 1024 ) {
		$subject = $revisionId ? "the revision with ID $revisionId" : 'a revision';
		throw new RuntimeException(
			'The text of ' . $subject . " exceeds the maximum allowable size ({$maxArticleSize} KiB)"
		);
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;

	if ( isset( $contentInfo['model'] ) ) {
		$model = $contentInfo['model'];
	} else {
		// No model given: use the default model of the slot role for this title
		$roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );
		$model = $roleHandler->getDefaultModel( $title );
	}

	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	return $handler->unserializeContent( $handler->importTransform( $rawText ) );
}
1060 | |
/**
 * Turn the parsed fields of one revision into a WikiRevision and pass it to
 * the revision callback.
 *
 * @param array $pageInfo Page data; '_title' is used as the revision title
 * @param array $revisionInfo Parsed revision fields. The main slot is built
 *   from the top-level fields, every entry of 'content' adds one further slot.
 * @return mixed|false The value returned by revisionCallback()
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$imported = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$imported->setID( $revId );
	}

	$title = $pageInfo['_title'];
	$imported->setTitle( $title );

	// Main slot first, then any additional slots listed under 'content'
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$imported->setContent( SlotRecord::MAIN, $mainContent );

	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new RuntimeException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$imported->setContent( $slotInfo['role'], $slotContent );
	}

	// Without a timestamp in the dump, the current time is used
	$imported->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$imported->setComment( $revisionInfo['comment'] );
	}

	// Only the presence of the 'minor' field matters, not its value
	if ( isset( $revisionInfo['minor'] ) ) {
		$imported->setMinor( true );
	}

	// Attribute the revision: a user name wins over an IP address; with
	// neither, a prefixed placeholder user name is used.
	$contributor = $revisionInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $contributor['username'] );
		$imported->setUsername( $prefixedName );
	} elseif ( isset( $contributor['ip'] ) ) {
		$imported->setUserIP( $contributor['ip'] );
	} else {
		$imported->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$imported->setSha1Base36( $revisionInfo['sha1'] );
	}

	$imported->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $imported );
}
1113 | |
/**
 * Parse the children of the current <upload> element and, if upload imports
 * are enabled, hand the collected data to processUpload().
 *
 * @param array &$pageInfo Page data shared with handlePage(); passed to the
 *   upload tag hook and to processUpload()
 * @return mixed The value returned by processUpload(), or null when uploads
 *   are not being imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	// Child tags whose text content is stored under the tag name
	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// A skip request only applies to the node that triggered it. Reset
		// the flag, as handlePage() does, so that one unhandled tag does not
		// switch every later node of this upload over to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			$contents = $this->nodeContents();
			$encoding = $this->reader->getAttribute( 'encoding' );
			// Embedded file data is only used when it is base64-encoded; it
			// is written to a temporary file that serves as the file source.
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			// Step over the unknown element and everything inside it
			$skip = true;
		}
	}

	// An existing file below the configured image base path takes precedence
	// over embedded contents.
	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}

	return null;
}
1166 | |
/**
 * Write the given data to a newly created temporary file.
 *
 * @param string $contents Data to write
 * @return string Path of the temporary file
 * @throws RuntimeException If the temporary file cannot be created or written
 */
private function dumpTemp( $contents ) {
	$filename = tempnam( wfTempDir(), 'importupload' );
	if ( $filename === false ) {
		// Without a file name there is nothing to write to; fail here rather
		// than handing false to the caller as a file source.
		throw new RuntimeException( 'Could not create a temporary file for an imported upload.' );
	}

	if ( file_put_contents( $filename, $contents ) === false ) {
		throw new RuntimeException( "Could not write imported upload data to $filename." );
	}

	return $filename;
}
1176 | |
/**
 * Turn the parsed fields of one upload into a WikiRevision and pass it to the
 * upload callback.
 *
 * @param array $pageInfo Page data; 'id' and '_title' are read
 * @param array $uploadInfo Parsed upload fields. 'timestamp', 'filename',
 *   'src' and 'size' are read unconditionally; 'text', 'archivename',
 *   'fileSrc', 'isTempSrc', 'sha1base36', 'comment' and 'contributor' are
 *   optional.
 * @return mixed The value returned by the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$revision = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$revision->setTitle( $title );
	$revision->setID( $revId );
	$revision->setTimestamp( $uploadInfo['timestamp'] );
	$revision->setContent( SlotRecord::MAIN, $content );
	$revision->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$revision->setArchiveName( $uploadInfo['archivename'] );
	}
	$revision->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$revision->setFileSrc( $uploadInfo['fileSrc'],
			!empty( $uploadInfo['isTempSrc'] )
		);
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$revision->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$revision->setSize( intval( $uploadInfo['size'] ) );
	// The comment may be missing from the upload data. Only set it when
	// present, as processRevision() and processLogItem() do, instead of
	// reading an undefined array key and passing null on.
	if ( isset( $uploadInfo['comment'] ) ) {
		$revision->setComment( $uploadInfo['comment'] );
	}

	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] )
		);
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $revision );
}
1221 | |
/**
 * Parse the children of the current <contributor> element.
 *
 * @return array Text content of the 'id', 'ip' and 'username' child tags that
 *   were found, keyed by tag name; empty for an empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	$contributor = [];

	// An empty element has no children to read
	if ( $this->reader->isEmptyElement ) {
		return $contributor;
	}

	$knownFields = [ 'id', 'ip', 'username' ];

	while ( $this->reader->read() ) {
		$atContributorEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'contributor';
		if ( $atContributorEnd ) {
			break;
		}

		$nodeName = $this->reader->localName;

		// Any other node is ignored without a warning
		if ( !in_array( $nodeName, $knownFields ) ) {
			continue;
		}

		$contributor[$nodeName] = $this->nodeContents();
	}

	return $contributor;
}
1250 | |
/**
 * Resolve the title text and namespace of an imported page to a local title
 * and check that the page may be imported under it.
 *
 * A notice is emitted and false is returned when no title could be created,
 * or when the title is external, cannot exist, or cannot be edited by the
 * performer.
 *
 * @param string $text Page title as given in the import source
 * @param string|null $ns Namespace as given in the import source; converted
 *   with intval()
 * @return array|false [ local title, foreign title ] on success, false if the
 *   page must not be imported
 */
private function processTitle( $text, $ns = null ) {
	// Without foreign namespace information, fall back to the naive factory
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$this->performer->definitelyCan( 'edit', $title ) ) {
		# Do not import if the importing wiki user cannot edit this page
		$this->notice( 'import-error-edit', $title->getPrefixedText() );
		return false;
	}

	return [ $title, $foreignTitle ];
}
1290 | |
/**
 * Open the XMLReader connected to the source adapter id
 *
 * The libxml entity loader is enabled while the reader is being opened and is
 * put back to its previous state afterwards, whether or not opening succeeds.
 *
 * @throws RuntimeException If the reader could not be opened
 * @suppress PhanStaticCallToNonStatic, UnusedSuppression -- for PHP 7.4 support
 */
private function openReader() {
	$uri = 'uploadsource://' . $this->sourceAdapterId;

	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		if ( PHP_VERSION_ID >= 80000 ) {
			// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
			$reader = XMLReader::open( $uri, null, LIBXML_PARSEHUGE );
			$status = $reader instanceof XMLReader;
			if ( $status ) {
				$this->reader = $reader;
			}
		} else {
			// A static call generated a deprecation warning prior to PHP 8.0
			$this->reader = new XMLReader;
			$status = $this->reader->open( $uri, null, LIBXML_PARSEHUGE );
		}

		// Capture the error before anything else touches libxml
		$error = $status ? false : libxml_get_last_error();
	} finally {
		// Restore the previous entity loader state on every path, including
		// an exception thrown while opening.
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}

	if ( !$status ) {
		// libxml_get_last_error() returns false when libxml recorded no
		// error, so the message property cannot be read unconditionally.
		$detail = $error ? $error->message : 'no libxml error details available';
		throw new RuntimeException(
			'Encountered an internal error while initializing WikiImporter object: ' . $detail
		);
	}
}
1328 | |
/**
 * Check the syntax of the given xml
 *
 * Sources that are not seekable are not checked. Otherwise the whole document
 * is read once; if libxml reported an error, an exception is thrown. After a
 * clean pass the source is rewound and the reader reopened for the real
 * import.
 *
 * @throws RuntimeException If libxml reported an error while reading
 */
private function syntaxCheckXML() {
	if ( !UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId ) ) {
		return;
	}
	AtEase::suppressWarnings();
	$oldDisable = libxml_disable_entity_loader( false );
	try {
		// Drop any error left over from earlier libxml use in this process,
		// so that only errors raised by the read pass below are reported.
		libxml_clear_errors();

		// Read through the whole document; only the error state matters here
		while ( $this->reader->read() );
		$error = libxml_get_last_error();
		if ( $error ) {
			$errorMessage = 'XML error at line ' . $error->line . ': ' . $error->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new RuntimeException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $oldDisable );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1356 | } |