Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
55.20% |
308 / 558 |
|
27.45% |
14 / 51 |
CRAP | |
0.00% |
0 / 1 |
WikiImporter | |
55.20% |
308 / 558 |
|
27.45% |
14 / 51 |
3578.71 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
25 / 25 |
|
100.00% |
1 / 1 |
2 | |||
getReader | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
throwXmlError | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
debug | |
50.00% |
1 / 2 |
|
0.00% |
0 / 1 |
2.50 | |||
warn | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
notice | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
setDebug | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setNoUpdates | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setPageOffset | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setNoticeCallback | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setPageCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setPageOutCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setRevisionCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setUploadCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setLogItemCallback | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setSiteInfoCallback | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
setImportTitleFactory | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setTargetNamespace | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
20 | |||
setTargetRootPage | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
56 | |||
setImageBasePath | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setImportUploads | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setUsernamePrefix | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
disableStatisticsUpdate | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
beforeImportPage | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
importRevision | |
17.65% |
3 / 17 |
|
0.00% |
0 / 1 |
8.03 | |||
importLogItem | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
importUpload | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
finishImportPage | |
72.22% |
13 / 18 |
|
0.00% |
0 / 1 |
5.54 | |||
siteInfoCallback | |
33.33% |
2 / 6 |
|
0.00% |
0 / 1 |
3.19 | |||
pageCallback | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
pageOutCallback | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
revisionCallback | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
2.02 | |||
logItemCallback | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
nodeAttribute | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
nodeContents | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
7.39 | |||
doImport | |
79.17% |
38 / 48 |
|
0.00% |
0 / 1 |
17.03 | |||
handleSiteInfo | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
6 | |||
handleLogItem | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
72 | |||
processLogItem | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
72 | |||
handlePage | |
93.02% |
40 / 43 |
|
0.00% |
0 / 1 |
17.10 | |||
handleRevision | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
11 | |||
handleContent | |
88.24% |
15 / 17 |
|
0.00% |
0 / 1 |
8.10 | |||
makeContent | |
80.00% |
20 / 25 |
|
0.00% |
0 / 1 |
6.29 | |||
processRevision | |
86.21% |
25 / 29 |
|
0.00% |
0 / 1 |
9.21 | |||
handleUpload | |
0.00% |
0 / 31 |
|
0.00% |
0 / 1 |
240 | |||
dumpTemp | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
processUpload | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
42 | |||
handleContributor | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
6.02 | |||
processTitle | |
65.22% |
15 / 23 |
|
0.00% |
0 / 1 |
7.51 | |||
openReader | |
38.89% |
7 / 18 |
|
0.00% |
0 / 1 |
7.65 | |||
syntaxCheckXML | |
93.33% |
14 / 15 |
|
0.00% |
0 / 1 |
4.00 |
1 | <?php |
2 | /** |
3 | * MediaWiki page data importer. |
4 | * |
5 | * Copyright © 2003,2005 Brooke Vibber <bvibber@wikimedia.org> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup SpecialPage |
25 | */ |
26 | |
27 | use MediaWiki\Cache\CacheKeyHelper; |
28 | use MediaWiki\Config\Config; |
29 | use MediaWiki\Content\Content; |
30 | use MediaWiki\Content\IContentHandlerFactory; |
31 | use MediaWiki\Deferred\DeferredUpdates; |
32 | use MediaWiki\Deferred\SiteStatsUpdate; |
33 | use MediaWiki\HookContainer\HookContainer; |
34 | use MediaWiki\HookContainer\HookRunner; |
35 | use MediaWiki\Language\Language; |
36 | use MediaWiki\MainConfigNames; |
37 | use MediaWiki\Page\PageIdentity; |
38 | use MediaWiki\Page\WikiPageFactory; |
39 | use MediaWiki\Permissions\Authority; |
40 | use MediaWiki\Revision\SlotRecord; |
41 | use MediaWiki\Revision\SlotRoleRegistry; |
42 | use MediaWiki\Status\Status; |
43 | use MediaWiki\Title\ForeignTitle; |
44 | use MediaWiki\Title\ImportTitleFactory; |
45 | use MediaWiki\Title\NaiveForeignTitleFactory; |
46 | use MediaWiki\Title\NaiveImportTitleFactory; |
47 | use MediaWiki\Title\NamespaceAwareForeignTitleFactory; |
48 | use MediaWiki\Title\NamespaceImportTitleFactory; |
49 | use MediaWiki\Title\NamespaceInfo; |
50 | use MediaWiki\Title\SubpageImportTitleFactory; |
51 | use MediaWiki\Title\Title; |
52 | use MediaWiki\Title\TitleFactory; |
53 | use MediaWiki\User\ExternalUserNames; |
54 | use Wikimedia\AtEase\AtEase; |
55 | use Wikimedia\Message\MessageParam; |
56 | use Wikimedia\Message\MessageSpecifier; |
57 | use Wikimedia\NormalizedException\NormalizedException; |
58 | use Wikimedia\Rdbms\IDBAccessObject; |
59 | |
60 | /** |
61 | * XML file reader for the page data importer. |
62 | * |
63 | * implements Special:Import |
64 | * @ingroup SpecialPage |
65 | */ |
66 | class WikiImporter { |
67 | /** @var XMLReader|null */ |
68 | private $reader; |
69 | |
70 | /** @var string */ |
71 | private $sourceAdapterId; |
72 | |
73 | /** @var array|null */ |
74 | private $foreignNamespaces = null; |
75 | |
76 | /** @var callable|null */ |
77 | private $mLogItemCallback; |
78 | |
79 | /** @var callable */ |
80 | private $mUploadCallback; |
81 | |
82 | /** @var callable|null */ |
83 | private $mRevisionCallback; |
84 | |
85 | /** @var callable|null */ |
86 | private $mPageCallback; |
87 | |
88 | /** @var callable|null */ |
89 | private $mSiteInfoCallback; |
90 | |
91 | /** @var callable|null */ |
92 | private $mPageOutCallback; |
93 | |
94 | /** @var callable|null */ |
95 | private $mNoticeCallback; |
96 | |
97 | /** @var bool|null */ |
98 | private $mDebug; |
99 | |
100 | /** @var bool|null */ |
101 | private $mImportUploads; |
102 | |
103 | /** @var string|null */ |
104 | private $mImageBasePath; |
105 | |
106 | /** @var bool */ |
107 | private $mNoUpdates = false; |
108 | |
109 | /** @var int */ |
110 | private $pageOffset = 0; |
111 | |
112 | private ImportTitleFactory $importTitleFactory; |
113 | private ExternalUserNames $externalUserNames; |
114 | |
115 | /** @var array */ |
116 | private $countableCache = []; |
117 | |
118 | /** @var bool */ |
119 | private $disableStatisticsUpdate = false; |
120 | |
121 | /** |
122 | * Authority used for permission checks only (to ensure that the user performing the import is |
123 | * allowed to edit the pages they're importing). To skip the checks, use UltimateAuthority. |
124 | * |
125 | * If you want to also log the import actions, see ImportReporter. |
126 | */ |
127 | private Authority $performer; |
128 | |
129 | private Config $config; |
130 | private HookRunner $hookRunner; |
131 | private Language $contentLanguage; |
132 | private NamespaceInfo $namespaceInfo; |
133 | private TitleFactory $titleFactory; |
134 | private WikiPageFactory $wikiPageFactory; |
135 | private UploadRevisionImporter $uploadRevisionImporter; |
136 | private IContentHandlerFactory $contentHandlerFactory; |
137 | private SlotRoleRegistry $slotRoleRegistry; |
138 | |
139 | /** |
140 | * Creates a WikiImporter that reads its XML from the provided import source |
141 | */ |
public function __construct(
	ImportSource $source,
	Authority $performer,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	// Injected collaborators
	$this->performer = $performer;
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// Make sure the 'uploadsource' stream wrapper exists (registering it only once),
	// then register the source with the adapter and remember the id it hands back.
	$wrapperRegistered = in_array( 'uploadsource', stream_get_wrappers(), true );
	if ( !$wrapperRegistered ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks: each one points at the public method of this class
	// that implements the stock behaviour; callers may replace them via the setters.
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, 'importRevision' ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	// Default title mapping and username prefixing; see setImportTitleFactory()
	// and setUsernamePrefix() for overriding them.
	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
187 | |
188 | /** |
189 | * @return null|XMLReader |
190 | */ |
public function getReader() {
	// Hands out the underlying XMLReader instance itself, not a copy.
	$reader = $this->reader;
	return $reader;
}
194 | |
195 | /** |
196 | * @param string $err |
197 | */ |
public function throwXmlError( $err ) {
	// Despite the method name, nothing is thrown here: the error is only logged,
	// once through debug() (debug mode only) and once unconditionally.
	$this->debug( 'FAILURE: ' . $err );
	wfDebug( 'WikiImporter XML error: ' . $err );
}
202 | |
203 | /** |
204 | * @param string $data |
205 | */ |
public function debug( $data ) {
	// Silent unless debug mode has been switched on with setDebug().
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( 'IMPORT: ' . $data );
}
211 | |
212 | /** |
213 | * @param string $data |
214 | */ |
public function warn( $data ) {
	// Unlike debug(), this is logged regardless of the debug flag.
	wfDebug( 'IMPORT: ' . $data );
}
218 | |
219 | /** |
220 | * @param string $msg |
221 | * @phpcs:ignore Generic.Files.LineLength |
222 | * @param MessageParam|MessageSpecifier|string|int|float|list<MessageParam|MessageSpecifier|string|int|float> ...$params |
223 | * See Message::params() |
224 | */ |
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		// No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	// The callback receives the message key and the parameters as ONE array,
	// not spread out as individual arguments.
	( $this->mNoticeCallback )( $msg, $params );
}
234 | |
235 | /** |
236 | * Set debug mode... |
237 | * @param bool $debug |
238 | */ |
public function setDebug( $debug ) {
	// This flag gates the output of debug().
	$this->mDebug = $debug;
}
242 | |
243 | /** |
244 | * Set 'no updates' mode. In this mode, the link tables will not be updated by the importer |
245 | * @param bool $noupdates |
246 | */ |
public function setNoUpdates( $noupdates ) {
	// The flag is handed on to the WikiRevision objects built during import
	// (see processLogItem()).
	$this->mNoUpdates = $noupdates;
}
250 | |
251 | /** |
252 | * Sets 'pageOffset' value. So it will skip the first n-1 pages |
253 | * and start from the nth page. It's 1-based indexing. |
254 | * @param int $nthPage |
255 | * @since 1.29 |
256 | */ |
public function setPageOffset( $nthPage ) {
	// doImport() skips top-level nodes until this many <page> elements have been seen.
	$this->pageOffset = $nthPage;
}
260 | |
261 | /** |
262 | * Set a callback that displays notice messages |
263 | * |
264 | * @param callable $callback |
265 | * @return callable |
266 | */ |
public function setNoticeCallback( $callback ) {
	// NOTE(review): unlike the other callback setters this goes through wfSetVar();
	// presumably it returns the previous value - confirm against its definition.
	$previous = wfSetVar( $this->mNoticeCallback, $callback );
	return $previous;
}
270 | |
271 | /** |
272 | * Sets the action to perform as each new page in the stream is reached. |
273 | * @param callable|null $callback |
274 | * @return callable|null |
275 | */ |
public function setPageCallback( $callback ) {
	// Swap in the new callback and hand back the one it replaces,
	// so the caller can restore it later.
	$replaced = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $replaced;
}
281 | |
282 | /** |
283 | * Sets the action to perform as each page in the stream is completed. |
284 | * Callback accepts the page title (as a Title object), a second object |
285 | * with the original title form (in case it's been overridden into a |
286 | * local namespace), and a count of revisions. |
287 | * |
288 | * @param callable|null $callback |
289 | * @return callable|null |
290 | */ |
public function setPageOutCallback( $callback ) {
	// Swap in the new callback and hand back the one it replaces.
	$replaced = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $replaced;
}
296 | |
297 | /** |
298 | * Sets the action to perform as each page revision is reached. |
299 | * @param callable|null $callback |
300 | * @return callable|null |
301 | */ |
public function setRevisionCallback( $callback ) {
	// Swap in the new callback and hand back the one it replaces.
	$replaced = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $replaced;
}
307 | |
308 | /** |
309 | * Sets the action to perform as each file upload version is reached. |
310 | * @param callable $callback |
311 | * @return callable |
312 | */ |
public function setUploadCallback( $callback ) {
	// Swap in the new callback and hand back the one it replaces
	// (null when called for the first time from the constructor).
	$replaced = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $replaced;
}
318 | |
319 | /** |
320 | * Sets the action to perform as each log item is reached. |
321 | * @param callable $callback |
322 | * @return callable |
323 | */ |
public function setLogItemCallback( $callback ) {
	// Swap in the new callback and hand back the one it replaces.
	$replaced = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $replaced;
}
329 | |
330 | /** |
331 | * Sets the action to perform when site info is encountered |
332 | * @param callable $callback |
333 | * @return callable |
334 | */ |
public function setSiteInfoCallback( $callback ) {
	// No default is installed by the constructor, so the first call returns null.
	$replaced = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $replaced;
}
340 | |
341 | /** |
342 | * Sets the factory object to use to convert ForeignTitle objects into local |
343 | * Title objects |
344 | * @param ImportTitleFactory $factory |
345 | */ |
public function setImportTitleFactory( $factory ) {
	// The property is typed ImportTitleFactory, so anything else fails at assignment.
	$this->importTitleFactory = $factory;
}
349 | |
350 | /** |
351 | * Set a target namespace to override the defaults |
352 | * @param null|int $namespace |
353 | * @return bool |
354 | */ |
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$naiveFactory = new NaiveImportTitleFactory(
			$this->contentLanguage,
			$this->namespaceInfo,
			$this->titleFactory
		);
		$this->setImportTitleFactory( $naiveFactory );
		return true;
	}

	// Reject negative values and namespaces that do not exist on this wiki.
	if ( !( $namespace >= 0 ) || !$this->namespaceInfo->exists( intval( $namespace ) ) ) {
		return false;
	}

	// Force every imported page into the requested namespace.
	$namespace = intval( $namespace );
	$namespaceFactory = new NamespaceImportTitleFactory(
		$this->namespaceInfo,
		$this->titleFactory,
		$namespace
	);
	$this->setImportTitleFactory( $namespaceFactory );
	return true;
}
383 | |
384 | /** |
385 | * Set a target root page under which all pages are imported |
386 | * @param null|string $rootpage |
387 | * @return Status |
388 | */ |
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;

	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory( $this->contentLanguage, $nsInfo, $this->titleFactory )
		);
		return $status;
	}

	if ( $rootpage === '' ) {
		// An empty string leaves the current title factory untouched.
		return $status;
	}

	$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
	$title = Title::newFromText( $rootpage );

	if ( !$title || $title->isExternal() ) {
		$status->fatal( 'import-rootpage-invalid' );
		return $status;
	}

	$rootNamespace = $title->getNamespace();
	if ( !$nsInfo->hasSubpages( $rootNamespace ) ) {
		// The main namespace has no name of its own, so use the 'blanknamespace' message.
		if ( $rootNamespace === NS_MAIN ) {
			$displayNSText = wfMessage( 'blanknamespace' )->text();
		} else {
			$displayNSText = $this->contentLanguage->getNsText( $rootNamespace );
		}
		$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		return $status;
	}

	// set namespace to 'all', so the namespace check in processTitle() can pass
	$this->setTargetNamespace( null );
	$this->setImportTitleFactory(
		new SubpageImportTitleFactory( $nsInfo, $this->titleFactory, $title )
	);

	return $status;
}
426 | |
427 | /** |
428 | * @param string $dir |
429 | */ |
public function setImageBasePath( $dir ) {
	// Stored as given; no validation or normalisation happens here.
	$this->mImageBasePath = $dir;
}
433 | |
434 | /** |
435 | * @param bool $import |
436 | */ |
public function setImportUploads( $import ) {
	// Stored as given on the instance.
	$this->mImportUploads = $import;
}
440 | |
441 | /** |
442 | * @since 1.31 |
443 | * @param string $usernamePrefix Prefix to apply to unknown (and possibly also known) usernames |
444 | * @param bool $assignKnownUsers Whether to apply the prefix to usernames that exist locally |
445 | */ |
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	// Replaces the default helper created in the constructor ('imported', false).
	$userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $userNames;
}
449 | |
450 | /** |
451 | * Disable the article count statistics update, which can take a lot of time |
452 | * @since 1.29 |
453 | */ |
public function disableStatisticsUpdate() {
	// One-way switch: finishImportPage() skips its article count adjustment once set.
	$this->disableStatisticsUpdate = true;
}
457 | |
458 | /** |
459 | * Default per-page callback. Sets up some things related to site statistics |
460 | * @param array $titleAndForeignTitle Two-element array, with Title object at |
461 | * index 0 and ForeignTitle object at index 1 |
462 | * @return bool |
463 | */ |
public function beforeImportPage( $titleAndForeignTitle ) {
	// Index 0 holds the local Title, index 1 the ForeignTitle (unused here).
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );

	// Remember whether the page counted as an article BEFORE the import.
	// The key has to be built exactly like the lookup key in finishImportPage(),
	// which uses CacheKeyHelper::getKeyForPage(). Keying by prefixed text here
	// meant the lookup never matched and the article count was never adjusted.
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();

	return true;
}
470 | |
471 | /** |
472 | * Default per-revision callback, performs the import. |
473 | * @param WikiRevision $revision |
474 | * @return bool |
475 | */ |
public function importRevision( $revision ) {
	// Work out which failure (if any) occurred, then report it from a single place.
	if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
		// The content model is not allowed on the target title.
		$errorKey = 'import-error-bad-location';
	} else {
		try {
			return $revision->importOldRevision();
		} catch ( MWContentSerializationException $ex ) {
			// The revision text could not be unserialized for its content model.
			$errorKey = 'import-error-unserialize';
		}
	}

	$this->notice(
		$errorKey,
		$revision->getTitle()->getPrefixedText(),
		$revision->getID(),
		$revision->getModel(),
		$revision->getFormat()
	);

	return false;
}
501 | |
502 | /** |
503 | * Default per-log-item callback, performs the import. |
504 | * @param WikiRevision $revision |
505 | * @return bool |
506 | */ |
public function importLogItem( $revision ) {
	// The WikiRevision object itself knows how to store the log entry.
	$imported = $revision->importLogItem();
	return $imported;
}
510 | |
511 | /** |
512 | * Default per-upload callback, imports the upload via the UploadRevisionImporter. |
513 | * @param WikiRevision $revision |
514 | * @return bool |
515 | */ |
public function importUpload( $revision ) {
	// Only a fully good status counts as success.
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
520 | |
521 | /** |
522 | * Mostly for hook use |
523 | * @param PageIdentity $pageIdentity |
524 | * @param ForeignTitle $foreignTitle |
525 | * @param int $revCount |
526 | * @param int $sRevCount |
527 | * @param array $pageInfo |
528 | * @return bool |
529 | */ |
public function finishImportPage(
	PageIdentity $pageIdentity,
	$foreignTitle,
	$revCount,
	$sRevCount,
	$pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = $this->wikiPageFactory->newFromTitle( $pageIdentity );
		$page->loadPageData( IDBAccessObject::READ_LATEST );

		if ( $page->getRevisionRecord() === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$countable = $page->newPageUpdater( $this->performer )
				->prepareUpdate()
				->isCountable();
			$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );

			// Only touch the site stats when a pre-import state was recorded
			// and the countable state actually changed.
			$hasBaseline = array_key_exists( $countKey, $this->countableCache );
			if ( $hasBaseline && $countable != $this->countableCache[$countKey] ) {
				// +1 when the page became countable, -1 when it stopped being countable.
				$delta = (int)$countable - (int)$this->countableCache[$countKey];
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => $delta
				] ) );
			}
		}
	}

	// The hook decides the return value.
	$title = Title::newFromPageIdentity( $pageIdentity );
	return $this->hookRunner->onAfterImportPage(
		$title,
		$foreignTitle,
		$revCount,
		$sRevCount,
		$pageInfo
	);
}
565 | |
566 | /** |
567 | * Notify the callback function of site info |
568 | * @param array $siteInfo |
569 | * @return mixed|false |
570 | */ |
private function siteInfoCallback( $siteInfo ) {
	// Without a registered callback there is nothing to notify.
	if ( !$this->mSiteInfoCallback ) {
		return false;
	}

	// The importer itself is passed along as the second argument.
	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
581 | |
582 | /** |
583 | * Notify the callback function when a new "<page>" is reached. |
584 | * @param array $title |
585 | */ |
public function pageCallback( $title ) {
	// Nothing to do when the page callback has been cleared.
	if ( !$this->mPageCallback ) {
		return;
	}

	// The callback's return value is deliberately ignored.
	call_user_func( $this->mPageCallback, $title );
}
591 | |
592 | /** |
593 | * Notify the callback function when a "</page>" is closed. |
594 | * @param PageIdentity $pageIdentity |
595 | * @param ForeignTitle $foreignTitle |
596 | * @param int $revCount |
597 | * @param int $sucCount Number of revisions for which callback returned true |
598 | * @param array $pageInfo Associative array of page information |
599 | */ |
private function pageOutCallback(
	PageIdentity $pageIdentity,
	$foreignTitle,
	$revCount,
	$sucCount,
	$pageInfo
) {
	if ( !$this->mPageOutCallback ) {
		return;
	}

	// Forward every argument exactly as it was received.
	call_user_func( $this->mPageOutCallback, ...func_get_args() );
}
606 | |
607 | /** |
608 | * Notify the callback function of a revision |
609 | * @param WikiRevision $revision |
610 | * @return bool|mixed |
611 | */ |
private function revisionCallback( $revision ) {
	// Without a registered callback the revision counts as not imported.
	if ( !$this->mRevisionCallback ) {
		return false;
	}

	// The importer itself is passed along as the second argument.
	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
622 | |
623 | /** |
624 | * Notify the callback function of a new log item |
625 | * @param WikiRevision $revision |
626 | * @return mixed|false |
627 | */ |
private function logItemCallback( $revision ) {
	// Without a registered callback the log item counts as not imported.
	if ( !$this->mLogItemCallback ) {
		return false;
	}

	// The importer itself is passed along as the second argument.
	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
638 | |
639 | /** |
640 | * Retrieves the contents of the named attribute of the current element. |
641 | * @param string $attr The name of the attribute |
642 | * @return string The value of the attribute or an empty string if it is not set in the current |
643 | * element. |
644 | */ |
public function nodeAttribute( $attr ) {
	// A null result from getAttribute() is normalised to an empty string.
	$value = $this->reader->getAttribute( $attr );
	return $value ?? '';
}
648 | |
649 | /** |
650 | * Shouldn't something like this be built-in to XMLReader? |
651 | * Fetches text contents of the current element, assuming |
652 | * no sub-elements or such scary things. |
653 | * @return string |
654 | * @internal |
655 | */ |
public function nodeContents() {
	// Self-closing elements (<foo/>) have no content and no end tag to wait for.
	if ( $this->reader->isEmptyElement ) {
		return '';
	}

	$text = '';
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;

		// The first end tag we meet terminates the element.
		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $text;
		}

		// Collect character data; every other node type is skipped.
		if (
			$nodeType === XMLReader::TEXT ||
			$nodeType === XMLReader::CDATA ||
			$nodeType === XMLReader::SIGNIFICANT_WHITESPACE
		) {
			$text .= $this->reader->value;
		}
	}

	// Input ran out before the end tag: close the reader and discard what was collected.
	$this->reader->close();
	return '';
}
676 | |
677 | /** |
678 | * Primary entry point |
679 | * @throws Exception |
680 | * @return bool |
681 | */ |
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $oldDisable );
			$error = libxml_get_last_error();
			if ( !$error ) {
				// The document parsed, but its root element is not <mediawiki>.
				throw new UnexpectedValueException(
					"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
				);
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $error->line,
				'message' => $error->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$pageCount = 0;
		$keepReading = $this->reader->read();
		while ( $keepReading ) {
			$tag = $this->reader->localName;

			// Honour setPageOffset(): count <page> nodes and jump over whole
			// top-level subtrees until the requested page has been reached.
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pageCount++;
				}
				if ( $pageCount < $this->pageOffset ) {
					$keepReading = $this->reader->next();
					continue;
				}
			}

			// Captured before the hook runs, like $tag above.
			$type = $this->reader->nodeType;
			$skip = false;

			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skip = true;
			}

			if ( $skip ) {
				// next() jumps over the whole subtree of the unknown element.
				$keepReading = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$keepReading = $this->reader->read();
			}
		}
	} finally {
		// Restore the entity loader state and release the reader on every exit path.
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	return true;
}
760 | |
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$siteInfo = [];

	// Fields that can just be stuffed in the siteInfo object
	$normalFields = [ 'sitename', 'base', 'generator', 'case' ];

	// Consume nodes until the closing </siteinfo> tag (or the end of input).
	while (
		$this->reader->read() &&
		!(
			$this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'siteinfo'
		)
	) {
		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// The attribute has to be read first: nodeContents() moves the reader forward.
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $normalFields, true ) ) {
			$siteInfo[$tag] = $this->nodeContents();
		}
	}

	$siteInfo['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $siteInfo );
}
787 | |
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$logInfo = [];

	// Fields that can just be stuffed in the logInfo array
	$normalFields = [
		'id',
		'comment',
		'type',
		'action',
		'timestamp',
		'logtitle',
		'params',
	];

	// Consume nodes until the closing </logitem> tag (or the end of input).
	while (
		$this->reader->read() &&
		!(
			$this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem'
		)
	) {
		$tag = $this->reader->localName;

		// A hook returning false claims the tag, so the built-in handling is skipped.
		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields, true ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
817 | |
818 | /** |
819 | * @param array $logInfo |
820 | * @return mixed|false |
821 | */ |
private function processLogItem( $logInfo ) {
	$revision = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$revision->setID( $logInfo['id'] );
	}

	// 'type' and 'action' are read without an isset() guard,
	// unlike the optional fields handled around them.
	$revision->setType( $logInfo['type'] );
	$revision->setAction( $logInfo['action'] );

	if ( isset( $logInfo['timestamp'] ) ) {
		$revision->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$revision->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$revision->setTitle( $logTitle );
	}

	$revision->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$revision->setComment( $logInfo['comment'] );
	}

	// Attribution: a username wins over an IP address; when neither is
	// present the entry is credited to a prefixed 'Unknown user'.
	$contributor = $logInfo['contributor'] ?? null;
	if ( isset( $contributor['username'] ) ) {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$revision->setUserIP( $contributor['ip'] );
	} else {
		$revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $revision );
}
860 | |
/**
 * Read one <page> element from the XML reader.
 *
 * Simple child tags are stored in the page info array. The page title is
 * resolved when the first <revision> or <upload> child is encountered; if
 * it cannot be resolved, the remaining children of the page are skipped.
 * Once the closing </page> tag is reached (or the reader runs out of
 * nodes), pageOutCallback() is invoked if a title was resolved.
 */
private function handlePage() {
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Tags whose value is copied straight into $pageInfo
	$simpleTags = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleRejected = false;

	while ( true ) {
		$advanced = $skipSubtree ? $this->reader->next() : $this->reader->read();
		if ( !$advanced ) {
			break;
		}

		$atPageEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'page';
		if ( $atPageEnd ) {
			break;
		}

		$skipSubtree = false;
		$tag = $this->reader->localName;

		if ( $titleRejected ) {
			// The title is invalid, so step over everything left in this page
			$skipSubtree = true;
			continue;
		}

		if ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// The hook returned false: no default handling for this node
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			// <redirect title="NewTitle"/> carries its value in an attribute,
			// unlike the other simple tags such as <id>123</id> or
			// <title>Page</title>, which carry it as element text.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
			continue;
		}

		if ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $titlePair ) ) {
				// processTitle() yields either [ title, foreign title ] or false
				$titlePair = $this->processTitle(
					$pageInfo['title'],
					$pageInfo['ns'] ?? null
				);

				if ( is_array( $titlePair ) ) {
					$this->pageCallback( $titlePair );
					[ $pageInfo['_title'], $foreignTitle ] = $titlePair;
				} else {
					$titleRejected = true;
					$skipSubtree = true;
				}
			}

			if ( $titlePair ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skipSubtree = true;
		}
	}

	// @note $pageInfo['_title'] is only assigned above when the title was
	// resolved successfully, in which case pageCallback has already run and
	// $foreignTitle was assigned in the same statement.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		/** @var Title $resolvedTitle */
		$resolvedTitle = $pageInfo['_title'];
		$this->pageOutCallback(
			$resolvedTitle,
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
946 | |
/**
 * Read one <revision> element and import it.
 *
 * Child tags are collected into a revision info array: plain fields are
 * stored by tag name, each <content> child is appended to the 'content'
 * list, and <contributor> is parsed by handleContributor(). Unknown tags
 * produce a warning and their subtree is skipped.
 *
 * Afterwards $pageInfo['revisionCount'] is incremented, and
 * $pageInfo['successfulRevisionCount'] is incremented as well when
 * processRevision() returns a truthy value.
 *
 * @param array &$pageInfo Page state shared with handlePage(); updated in place
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// A skip request applies only to the node that asked for it. Without
		// this reset (which handlePage() already performs) one unknown tag
		// would switch every later step from read() to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// The hook returned false: no default handling for this node
		} elseif ( in_array( $tag, $normalFields, true ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
989 | |
/**
 * Read one <content> element and return its fields.
 *
 * The recognised child tags ('role', 'origin', 'model', 'format', 'text')
 * are stored by tag name. Unknown tags produce a warning and their subtree
 * is skipped.
 *
 * @return array Content fields keyed by XML tag name
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// A skip request applies only to the node that asked for it. Without
		// this reset (which handlePage() already performs) one unknown tag
		// would switch every later step from read() to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// The hook returned false: no default handling for this node
		} elseif ( in_array( $tag, $normalFields, true ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1020 | |
/**
 * Create a Content object from the fields of an imported revision or slot.
 *
 * @param PageIdentity $page Page used to look up the default content model
 *   of the slot role when $contentInfo names no model
 * @param int $revisionId Revision ID, used only in the size error message
 * @param array $contentInfo Must contain 'text'; may contain 'role' and 'model'
 *
 * @return Content
 * @throws InvalidArgumentException If $contentInfo has no 'text' entry
 * @throws RuntimeException If the text is larger than MaxArticleSize allows
 */
private function makeContent( PageIdentity $page, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new InvalidArgumentException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Only the content models listed here
	// (or no model at all) are checked, as other content models might use
	// serialization formats which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [ 'wikitext', 'css', 'json', 'javascript', 'text', '' ];
	$isSizeChecked = !isset( $contentInfo['model'] )
		|| in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		if ( $revisionId ) {
			$subject = "the revision with ID $revisionId";
		} else {
			$subject = 'a revision';
		}
		throw new RuntimeException(
			'The text of ' . $subject . " exceeds the maximum allowable size ({$maxArticleSize} KiB)"
		);
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;

	if ( isset( $contentInfo['model'] ) ) {
		$model = $contentInfo['model'];
	} else {
		// No explicit model: use the default model of the slot role for this page
		$roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );
		$model = $roleHandler->getDefaultModel( $page );
	}

	$handler = $this->contentHandlerFactory->getContentHandler( $model );
	$transformedText = $handler->importTransform( $contentInfo['text'] );

	return $handler->unserializeContent( $transformedText );
}
1067 | |
/**
 * Build a WikiRevision from collected revision fields and pass it to the
 * revision callback.
 *
 * The main slot content is built from $revisionInfo itself. Every entry of
 * $revisionInfo['content'] becomes an additional slot and must name a role.
 * A missing timestamp is replaced by the current time.
 *
 * @param array $pageInfo Page state; the '_title' entry is used as the title
 * @param array $revisionInfo Revision fields keyed by XML tag name
 * @return mixed|false Return value of revisionCallback()
 * @throws RuntimeException If an entry of $revisionInfo['content'] has no role
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$importedRevision = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$importedRevision->setID( $revId );
	}

	$title = $pageInfo['_title'];
	$importedRevision->setTitle( $title );

	// Main slot
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$importedRevision->setContent( SlotRecord::MAIN, $mainContent );

	// Additional slots
	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new RuntimeException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$importedRevision->setContent( $slotInfo['role'], $slotContent );
	}

	if ( isset( $revisionInfo['timestamp'] ) ) {
		$importedRevision->setTimestamp( $revisionInfo['timestamp'] );
	} else {
		$importedRevision->setTimestamp( wfTimestampNow() );
	}

	if ( isset( $revisionInfo['comment'] ) ) {
		$importedRevision->setComment( $revisionInfo['comment'] );
	}

	if ( isset( $revisionInfo['minor'] ) ) {
		$importedRevision->setMinor( true );
	}

	// Attribute the revision: named user first, then IP, then a placeholder.
	$contributor = $revisionInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $contributor['username'] );
		$importedRevision->setUsername( $prefixedName );
	} elseif ( isset( $contributor['ip'] ) ) {
		$importedRevision->setUserIP( $contributor['ip'] );
	} else {
		$placeholder = $this->externalUserNames->addPrefix( 'Unknown user' );
		$importedRevision->setUsername( $placeholder );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$importedRevision->setSha1Base36( $revisionInfo['sha1'] );
	}

	$importedRevision->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $importedRevision );
}
1120 | |
/**
 * Read one <upload> element and, if upload importing is enabled, import it.
 *
 * Plain child tags are stored by tag name and <contributor> is parsed by
 * handleContributor(). A <contents encoding="base64"> child is decoded and
 * written to a temporary file, which becomes the file source. If an image
 * base path is configured and the file named by the 'rel' field exists
 * below it, that file becomes the file source instead.
 *
 * Temporary files written here that are not handed on to processUpload()
 * are deleted before returning.
 *
 * @param array &$pageInfo Page state shared with handlePage()
 * @return mixed Return value of processUpload(), or null when upload
 *   importing is disabled
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];
	// Every temporary file written for a base64 <contents> child
	$tempFiles = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// A skip request applies only to the node that asked for it. Without
		// this reset (which handlePage() already performs) one unknown tag
		// would switch every later step from read() to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// The hook returned false: no default handling for this node
		} elseif ( in_array( $tag, $normalFields, true ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute before the element text. NOTE(review):
			// nodeContents() presumably advances the reader beyond the
			// opening tag - confirm against its implementation.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$tempFile = $this->dumpTemp( base64_decode( $contents ) );
				$tempFiles[] = $tempFile;
				$uploadInfo['fileSrc'] = $tempFile;
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	// Only the file source that is actually passed to processUpload() must
	// survive. Any other temporary file written above is referenced nowhere
	// else and would otherwise be left behind in the temp directory.
	$sourceInUse = $this->mImportUploads ? ( $uploadInfo['fileSrc'] ?? null ) : null;
	foreach ( $tempFiles as $tempFile ) {
		if ( is_string( $tempFile ) && $tempFile !== $sourceInUse && is_file( $tempFile ) ) {
			unlink( $tempFile );
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}

	return null;
}
1173 | |
/**
 * Write the given data to a newly created temporary file.
 *
 * @param string $contents Data to write
 * @return string Path of the temporary file
 * @throws RuntimeException If the file cannot be created or written
 */
private function dumpTemp( $contents ) {
	$filename = tempnam( wfTempDir(), 'importupload' );
	if ( $filename === false ) {
		// tempnam() reports failure by returning false; writing to that
		// "path" cannot succeed, so stop here with a clear message.
		throw new RuntimeException(
			'Could not create a temporary file for an imported upload.'
		);
	}

	if ( file_put_contents( $filename, $contents ) === false ) {
		// Do not leave the unusable file behind
		if ( is_file( $filename ) ) {
			unlink( $filename );
		}
		throw new RuntimeException(
			"Could not write imported upload data to temporary file $filename."
		);
	}

	return $filename;
}
1183 | |
/**
 * Build a WikiRevision describing an uploaded file and pass it to the
 * upload callback.
 *
 * The page's 'id' entry is used as the revision ID. The 'timestamp',
 * 'filename', 'src', 'size' and 'comment' entries of $uploadInfo are read
 * unconditionally; 'archivename', 'fileSrc', 'sha1base36' and the
 * contributor are applied only when set.
 *
 * @param array $pageInfo Page state; the 'id' and '_title' entries are used
 * @param array $uploadInfo Upload fields keyed by XML tag name
 * @return mixed Return value of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$uploadRevision = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];

	// T292348: text key may be absent, force addition if null
	if ( !isset( $uploadInfo['text'] ) ) {
		$uploadInfo['text'] = '';
	}
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$uploadRevision->setTitle( $title );
	$uploadRevision->setID( $revId );
	$uploadRevision->setTimestamp( $uploadInfo['timestamp'] );
	$uploadRevision->setContent( SlotRecord::MAIN, $content );
	$uploadRevision->setFilename( $uploadInfo['filename'] );

	if ( isset( $uploadInfo['archivename'] ) ) {
		$uploadRevision->setArchiveName( $uploadInfo['archivename'] );
	}

	$uploadRevision->setSrc( $uploadInfo['src'] );

	if ( isset( $uploadInfo['fileSrc'] ) ) {
		// The second argument says whether the source is a temporary file
		$isTemporary = (bool)( $uploadInfo['isTempSrc'] ?? false );
		$uploadRevision->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}

	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$uploadRevision->setSha1Base36( $uploadInfo['sha1base36'] );
	}

	$uploadRevision->setSize( (int)$uploadInfo['size'] );
	$uploadRevision->setComment( $uploadInfo['comment'] );

	// Attribute the upload to a named user or an IP. When neither is given,
	// no user is set on the revision here.
	$contributor = $uploadInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $contributor['username'] );
		$uploadRevision->setUsername( $prefixedName );
	} elseif ( isset( $contributor['ip'] ) ) {
		$uploadRevision->setUserIP( $contributor['ip'] );
	}

	$uploadRevision->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $uploadRevision );
}
1228 | |
/**
 * Read one <contributor> element and return the recognised fields.
 *
 * Only the 'id', 'ip' and 'username' child tags are stored; any other node
 * is ignored. An empty element yields an empty array.
 *
 * @return array Contributor fields keyed by XML tag name
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	$contributor = [];
	if ( $this->reader->isEmptyElement ) {
		return $contributor;
	}

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		switch ( $tag ) {
			case 'id':
			case 'ip':
			case 'username':
				$contributor[$tag] = $this->nodeContents();
				break;
		}
	}

	return $contributor;
}
1257 | |
/**
 * Resolve the title text of an imported page to a local title.
 *
 * A notice is issued and false is returned when no local title can be
 * created, when the title is external, when it cannot exist, or when the
 * performer is not definitely allowed to edit it.
 *
 * @param string $text Title text taken from the import
 * @param string|null $ns Namespace ID taken from the import; passed through intval()
 * @return array|false [ local title, foreign title ] on success, false otherwise
 */
private function processTitle( $text, $ns = null ) {
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$rejection = 'import-error-interwiki';
	} elseif ( !$title->canExist() ) {
		$rejection = 'import-error-special';
	} elseif ( !$this->performer->definitelyCan( 'edit', $title ) ) {
		# Do not import if the importing wiki user cannot edit this page
		$rejection = 'import-error-edit';
	} else {
		return [ $title, $foreignTitle ];
	}

	$this->notice( $rejection, $title->getPrefixedText() );
	return false;
}
1297 | |
/**
 * Open the XMLReader connected to the source adapter id.
 *
 * The libxml entity loader is enabled while the reader is being opened and
 * its previous state is restored afterwards, whether or not opening
 * succeeded.
 *
 * @suppress PhanStaticCallToNonStatic, UnusedSuppression -- for PHP 7.4 support
 * @throws RuntimeException If the reader could not be opened
 */
private function openReader() {
	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		$uri = 'uploadsource://' . $this->sourceAdapterId;

		if ( PHP_VERSION_ID >= 80000 ) {
			// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
			$reader = XMLReader::open( $uri, null, LIBXML_PARSEHUGE );
			$status = $reader instanceof XMLReader;
			if ( $status ) {
				$this->reader = $reader;
			}
		} else {
			// A static call generated a deprecation warning prior to PHP 8.0
			$this->reader = new XMLReader;
			$status = $this->reader->open( $uri, null, LIBXML_PARSEHUGE );
		}

		if ( !$status ) {
			// libxml_get_last_error() returns false when libxml has no error
			// recorded, so the message must not be read unconditionally.
			$error = libxml_get_last_error();
			$detail = $error ? $error->message : 'unknown error';
			throw new RuntimeException(
				'Encountered an internal error while initializing WikiImporter object: ' . $detail
			);
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
1335 | |
/**
 * Check the syntax of the XML source by reading it through once.
 *
 * Nothing is done for sources that are not seekable. For seekable sources
 * the reader is run to the end; if libxml reports an error afterwards, a
 * RuntimeException is thrown. On success the source is rewound to offset 0
 * and the reader is reopened for the real import.
 *
 * @throws RuntimeException If libxml reports an error after reading the source
 */
private function syntaxCheckXML() {
	$isSeekable = UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId );
	if ( !$isSeekable ) {
		return;
	}

	AtEase::suppressWarnings();
	$previousLoaderState = libxml_disable_entity_loader( false );
	try {
		// Consume every node; only the error state afterwards matters
		do {
			$hasMore = $this->reader->read();
		} while ( $hasMore );

		$lastError = libxml_get_last_error();
		if ( $lastError ) {
			$errorMessage = 'XML error at line ' . $lastError->line . ': ' . $lastError->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new RuntimeException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $previousLoaderState );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1363 | } |