MediaWiki master
XmlTypeCheck.php
Go to the documentation of this file.
1<?php
7namespace Wikimedia\Mime;
8
9use Exception;
10use XMLReader;
11
/**
 * Will be set to true or false to indicate whether the file is
 * well-formed XML. Stays null until validation has decided.
 * @var bool|null
 */
29 public $wellFormed = null;
30
/**
 * Will be set to true if the optional element filter (or one of the
 * processing-instruction / DTD handlers) returned a match at some point.
 * @var bool
 */
35 public $filterMatch = false;
36
/**
 * Will contain the type of filter hit (the truthy value the callback
 * returned) if a filter returned a match at some point.
 * @var mixed
 */
42 public $filterMatchType = false;
43
/**
 * Name of the document's root element, including any namespace
 * as an expanded URL.
 * @var string
 */
48 public $rootElement = '';
49
/**
 * A stack of strings containing the data of each xml element
 * as it's processed. One entry is pushed per open element.
 * @var string[]
 */
55 protected $elementData = [];
56
/**
 * A stack of [ element name, attributes ] pairs, as we process them.
 * @var array
 */
60 protected $elementDataContext = [];
61
/**
 * Current depth of the data stack.
 * @var int
 */
65 protected $stackDepth = 0;
66
/**
 * Optional element filter, stored by the constructor and invoked with
 * ( $name, $attribs, $data ) each time an element is closed.
 * @var callable|null
 */
68 protected $filterCallback;
69
/**
 * Default parser options; the constructor overlays caller-supplied
 * options on top of these.
 * - processing_instruction_handler: if truthy, called with ( $target, $data )
 *   for every processing instruction; a truthy return is recorded as a filter match.
 * - external_dtd_handler: if truthy, called with ( $type, $publicId, $systemId )
 *   when the doctype declares a PUBLIC or SYSTEM external identifier.
 * - dtd_handler: if truthy, called with the full text of the doctype declaration.
 * - require_safe_dtd: if truthy, an internal DTD subset that does not pass
 *   checkDTDIsSafe() marks the document as not well-formed.
 * @var array
 */
73 private $parserOptions = [
74 'processing_instruction_handler' => null,
75 'external_dtd_handler' => '',
76 'dtd_handler' => '',
77 'require_safe_dtd' => true
78 ];
79
/**
 * Allow filtering an XML file.
 *
 * Stores the filter callback, overlays $options on the default parser
 * options and then validates $input straight away. The outcome is exposed
 * through $wellFormed, $filterMatch, $filterMatchType and $rootElement.
 *
 * @param string $input Filename or XML text, depending on $isFile
 * @param callable|null $filterCallback Called with ( $name, $attribs, $data )
 *   whenever an element is closed; a truthy return is recorded as a filter match
 * @param bool $isFile True if $input is a filename, false if it is XML text
 * @param array $options Overrides for the default parser options
 */
public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
	$effectiveOptions = array_merge( $this->parserOptions, $options );

	$this->parserOptions = $effectiveOptions;
	$this->filterCallback = $filterCallback;

	$this->validateFromInput( $input, $isFile );
}
111
/**
 * Alternative constructor: from filename.
 *
 * @param string $fname Path of the file to check
 * @param callable|null $filterCallback Optional element filter, passed to the constructor
 * @return self
 */
public static function newFromFilename( $fname, $filterCallback = null ) {
	$isFile = true;

	return new self( $fname, $filterCallback, $isFile );
}
126
/**
 * Alternative constructor: from string.
 *
 * @param string $string XML text to check
 * @param callable|null $filterCallback Optional element filter, passed to the constructor
 * @return self
 */
public static function newFromString( $string, $filterCallback = null ) {
	$isFile = false;

	return new self( $string, $filterCallback, $isFile );
}
141
/**
 * Get the root element.
 *
 * @return string Name of the root element with any namespace expanded,
 *   or '' if no element was seen
 */
public function getRootElement() {
	$root = $this->rootElement;

	return $root;
}
150
/**
 * Open the input with XMLReader and run the validation pass over it.
 *
 * If the input cannot be opened, $wellFormed is set to false and nothing
 * else happens.
 *
 * @param string $xml Filename or XML text, depending on $isFile
 * @param bool $isFile True to open $xml as a file, false to parse it as text
 * @throws Exception Anything thrown during validation is rethrown after
 *   $wellFormed has been set to false
 */
private function validateFromInput( $xml, $isFile ) {
	// LIBXML_PARSEHUGE allows text and attr nodes over 10 MB, e.g. embedded
	// raster images in SVG (T387969).
	// Low limits are used while running tests, because
	// XmlTypeCheckTest::testRecursiveEntity() requires over 1 GB of memory and
	// over a minute of time with huge limits (T392782).
	// Maybe this should be configurable for low-resource deployments?
	$hugeFlag = defined( 'MW_PHPUNIT_TEST' ) ? 0 : LIBXML_PARSEHUGE;
	$libxmlFlags = LIBXML_NOERROR | LIBXML_NOWARNING | $hugeFlag;

	$reader = new XMLReader();
	$opened = $isFile
		? $reader->open( $xml, null, $libxmlFlags )
		: $reader->XML( $xml, null, $libxmlFlags );

	if ( $opened !== true ) {
		// Couldn't open the XML
		$this->wellFormed = false;
		return;
	}

	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
	try {
		$this->validate( $reader );
	} catch ( Exception $e ) {
		// Calling this malformed, because we didn't parse the whole
		// thing. Maybe just an external entity reference.
		$this->wellFormed = false;
		throw $e;
	} finally {
		// Always release the reader and put the entity-loader setting back.
		$reader->close();
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
191
/**
 * Advance the reader by one node, treating any PHP error raised while
 * reading as a sign that the document is not well-formed.
 *
 * A temporary error handler is installed around the read; it sets
 * $wellFormed to false and returns true so the error is not passed on to
 * PHP's standard handler. The previous handler is restored even if the
 * read throws.
 *
 * @param XMLReader $reader
 * @return bool Result of XMLReader::read(): false when there is nothing more to read
 */
private function readNext( XMLReader $reader ): bool {
	set_error_handler( function ( $errno, $errstr ) {
		$this->wellFormed = false;
		return true;
	} );
	try {
		return $reader->read();
	} finally {
		// Must run on every exit path, otherwise the temporary handler
		// would stay installed for the rest of the request.
		restore_error_handler();
	}
}
201
/**
 * Walk the whole document with the reader, dispatching elements, text,
 * processing instructions and the doctype to their handlers, and settle
 * the final value of $wellFormed.
 *
 * @param XMLReader $reader Reader positioned before the first node
 */
private function validate( XMLReader $reader ) {
	// Prolog: move through anything that isn't an element, handing
	// processing instructions and the doctype to their handlers.
	for ( ; ; ) {
		if ( !$this->readNext( $reader ) ) {
			// Hit the end of the document before any elements
			$this->wellFormed = false;
			return;
		}
		if ( $reader->nodeType === XMLReader::ELEMENT ) {
			break;
		}
		if ( $reader->nodeType === XMLReader::PI ) {
			$this->processingInstructionHandler( $reader->name, $reader->value );
		} elseif ( $reader->nodeType === XMLReader::DOC_TYPE ) {
			$this->dtdHandler( $reader );
		}
	}

	// Node types whose value is collected as element data.
	$textTypes = [
		XMLReader::WHITESPACE,
		XMLReader::SIGNIFICANT_WHITESPACE,
		XMLReader::CDATA,
		XMLReader::TEXT,
	];

	// Process the rest of the document, starting with the first element
	// the reader is already positioned on.
	do {
		$nodeType = $reader->nodeType;

		if ( $nodeType === XMLReader::ELEMENT ) {
			$expandedName = $this->expandNS( $reader->name, $reader->namespaceURI );
			if ( $this->rootElement === '' ) {
				$this->rootElement = $expandedName;
			}
			// Read the empty-element flag before getAttributesArray() moves
			// the reader onto the attribute nodes.
			$isEmpty = $reader->isEmptyElement;
			$attributes = $this->getAttributesArray( $reader );
			$this->elementOpen( $expandedName, $attributes );
			if ( $isEmpty ) {
				// No END_ELEMENT is handled for this element here, so it is
				// closed immediately.
				$this->elementClose();
			}
		} elseif ( $nodeType === XMLReader::END_ELEMENT ) {
			$this->elementClose();
		} elseif ( in_array( $nodeType, $textTypes, true ) ) {
			$this->elementData( $reader->value );
		} elseif ( $nodeType === XMLReader::PI ) {
			// Processing instructions can happen after the header too
			$this->processingInstructionHandler( $reader->name, $reader->value );
		} elseif ( $nodeType === XMLReader::DOC_TYPE ) {
			// We should never see a doctype after first element.
			$this->wellFormed = false;
		}
		// Everything else is deliberately not sent to the filter, because
		// xml_parse didn't send it either: ENTITY_REF (unexpanded entity,
		// maybe external?), COMMENT, and DOC, ENTITY, END_ENTITY, NOTATION
		// or XML_DECLARATION.
	} while ( $this->readNext( $reader ) );

	if ( $this->stackDepth !== 0 ) {
		// Some element was opened but never closed.
		$this->wellFormed = false;
		return;
	}
	if ( $this->wellFormed === null ) {
		// Nothing along the way marked the document as bad.
		$this->wellFormed = true;
	}
}
283
/**
 * Collect the attributes of the element the reader is positioned on.
 *
 * The reader is moved across the attribute nodes while doing so.
 *
 * @param XMLReader $r
 * @return array Attribute values keyed by attribute name, with any
 *   namespace expanded by expandNS()
 */
private function getAttributesArray( XMLReader $r ) {
	$collected = [];
	while ( $r->moveToNextAttribute() ) {
		$uri = $r->namespaceURI;
		// XMLReader treats xmlns attributes as normal attributes, while
		// xml_parse doesn't, so they are left out.
		if ( $uri !== 'http://www.w3.org/2000/xmlns/' ) {
			$key = $this->expandNS( $r->name, $uri );
			$collected[$key] = $r->value;
		}
	}
	return $collected;
}
302
/**
 * Expand a possibly prefixed name into "namespaceURI:localname".
 *
 * @param string $name Element or attribute name, possibly with a prefix
 * @param string $namespaceURI Namespace URI; when empty the name is returned unchanged
 * @return string
 */
private function expandNS( $name, $namespaceURI ) {
	if ( !$namespaceURI ) {
		return $name;
	}
	// The local name is whatever follows the last colon, or the whole
	// name when there is no colon at all.
	$segments = explode( ':', $name );
	$local = end( $segments );
	return "$namespaceURI:$local";
}
316
/**
 * Record that an element has been opened: push its name and attributes
 * plus an empty data buffer, and deepen the stack by one.
 *
 * @param string $name Expanded element name
 * @param array $attribs Attributes of the element
 */
private function elementOpen( $name, $attribs ) {
	$this->stackDepth++;
	$this->elementData[] = '';
	$this->elementDataContext[] = [ $name, $attribs ];
}
326
/**
 * Pop the innermost open element and run the filter callback on it.
 *
 * The callback, if callable, receives the element's name, attributes and
 * collected data. A truthy return value is recorded in $filterMatch and
 * $filterMatchType.
 */
private function elementClose() {
	$context = array_pop( $this->elementDataContext );
	$text = array_pop( $this->elementData );
	$this->stackDepth--;
	[ $name, $attribs ] = $context;

	if ( !is_callable( $this->filterCallback ) ) {
		return;
	}

	$verdict = ( $this->filterCallback )( $name, $attribs, $text );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $verdict;
	}
}
342
/**
 * Append trimmed character data to the buffer of the innermost open element.
 *
 * @param string $data Text, CDATA or whitespace content from the reader
 */
private function elementData( $data ) {
	// Only collected here; the callback is run in elementClose().
	$top = $this->stackDepth - 1;
	$this->elementData[$top] .= trim( $data );
}
350
/**
 * Pass a processing instruction to the configured handler, if there is one.
 *
 * A truthy return value from the handler is recorded in $filterMatch and
 * $filterMatchType.
 *
 * @param string $target Target (name) of the processing instruction
 * @param string $data Content of the processing instruction
 */
private function processingInstructionHandler( $target, $data ) {
	$handler = $this->parserOptions['processing_instruction_handler'];
	if ( !$handler ) {
		return;
	}

	// @phan-suppress-next-line PhanTypeInvalidCallable false positive
	$verdict = $handler( $target, $data );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $verdict;
	}
}
370
/**
 * Handle the doctype declaration: run the configured DTD callbacks on it
 * and, if required, check that its internal subset is safe.
 *
 * Does nothing when neither callback is configured and the safety check
 * is turned off.
 *
 * @param XMLReader $reader Reader positioned on the DOC_TYPE node
 */
private function dtdHandler( XMLReader $reader ) {
	$onExternal = $this->parserOptions['external_dtd_handler'];
	$onAnyDtd = $this->parserOptions['dtd_handler'];
	$mustBeSafe = $this->parserOptions['require_safe_dtd'];
	if ( !( $onExternal || $onAnyDtd || $mustBeSafe ) ) {
		return;
	}

	$dtd = $reader->readOuterXml();

	// The general callback sees the doctype text as a whole.
	if ( $onAnyDtd ) {
		$verdict = $onAnyDtd( $dtd );
		if ( $verdict ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $verdict;
		}
	}

	$parsedDTD = $this->parseDTD( $dtd );

	// The external callback only runs when a PUBLIC or SYSTEM identifier
	// was found.
	if ( $onExternal && isset( $parsedDTD['type'] ) ) {
		$verdict = $onExternal(
			$parsedDTD['type'],
			$parsedDTD['publicid'] ?? null,
			$parsedDTD['systemid'] ?? null
		);
		if ( $verdict ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $verdict;
		}
	}

	if ( $mustBeSafe && isset( $parsedDTD['internal'] ) ) {
		if ( !$this->checkDTDIsSafe( $parsedDTD['internal'] ) ) {
			$this->wellFormed = false;
		}
	}
}
416
/**
 * Check whether an internal DTD subset contains only whitelisted declarations.
 *
 * The subset is accepted when it is nothing but a sequence of:
 * - ENTITY declarations whose quoted value is either a single entity
 *   reference (name of 1 to 64 characters) or at most 255 items, each
 *   being a character other than the quote, "%" and "&", or one of the
 *   references &amp; and &quot; (or &apos; for single-quoted values);
 * - comments;
 * - the one ATTLIST declaration that fixes xmlns:xlink on svg to
 *   http://www.w3.org/1999/xlink.
 *
 * The dots of the namespace URI are escaped so that only the literal
 * "www.w3.org" host is accepted, not any character in those positions.
 *
 * @param string $internalSubset Text between "[" and "]" of the doctype
 * @return bool True if the subset is safe. False otherwise, including when
 *   the regular expression engine reports an error.
 */
private function checkDTDIsSafe( $internalSubset ) {
	$res = preg_match(
		'/^(?:\s*<!ENTITY\s+\S+\s+' .
			'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
			'|\'(?:&[^\'%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
			'|\s*<!--(?:[^-]|-[^-])*-->' .
			'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
			'"http:\/\/www\.w3\.org\/1999\/xlink">)*\s*$/',
		$internalSubset
	);

	// preg_match() returns 1, 0 or false; only a definite match counts as safe.
	return (bool)$res;
}
450
/**
 * Split a doctype declaration into its external identifier and internal subset.
 *
 * If the declaration does not match the expected shape, $wellFormed is set
 * to false and an empty array is returned.
 *
 * @param string $dtd Full text of the doctype declaration
 * @return array Any of the following keys, each present only when non-empty:
 *   - type: 'PUBLIC' or 'SYSTEM'
 *   - publicid: public identifier
 *   - systemid: system identifier
 *   - internal: text of the internal subset
 */
private function parseDTD( $dtd ) {
	$m = [];
	// The system identifier of the PUBLIC form has its own (?:...) group.
	// Without it the "|" between the double- and single-quoted variants would
	// split the whole external-identifier alternation, so a PUBLIC doctype
	// with a single-quoted system literal would not match at all.
	$res = preg_match(
		'/^<!DOCTYPE\s*\S+\s*' .
		'(?:(?P<typepublic>PUBLIC)\s*' .
			'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
			'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
		'|(?P<typesystem>SYSTEM)\s*' .
			'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
		')?\s*' .
		'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
		$dtd,
		$m
	);
	if ( !$res ) {
		// No match, or a regex engine error: treat the document as bad.
		$this->wellFormed = false;
		return [];
	}
	$parsed = [];
	foreach ( $m as $field => $value ) {
		// Skip groups that did not take part in the match and the numeric
		// duplicates of the named groups.
		if ( $value === '' || is_numeric( $field ) ) {
			continue;
		}
		switch ( $field ) {
			case 'typepublic':
			case 'typesystem':
				$parsed['type'] = $value;
				break;
			case 'pubquote':
			case 'pubapos':
				$parsed['publicid'] = $value;
				break;
			case 'pubsysquote':
			case 'pubsysapos':
			case 'sysquote':
			case 'sysapos':
				$parsed['systemid'] = $value;
				break;
			case 'internal':
				$parsed['internal'] = $value;
				break;
		}
	}
	return $parsed;
}
504}
505
// Make the namespaced class also reachable under the global name 'XmlTypeCheck'
// (presumably kept for callers that still use the un-namespaced name).
507class_alias( XmlTypeCheck::class, 'XmlTypeCheck' );
XML syntax and type checker.
__construct( $input, $filterCallback=null, $isFile=true, $options=[])
Allow filtering an XML file.
int $stackDepth
Current depth of the data stack.
static newFromString( $string, $filterCallback=null)
Alternative constructor: from string.
string[] $elementData
A stack of strings containing the data of each xml element as it's processed.
bool|null $wellFormed
Will be set to true or false to indicate whether the file is well-formed XML.
static newFromFilename( $fname, $filterCallback=null)
Alternative constructor: from filename.
bool $filterMatch
Will be set to true if the optional element filter returned a match at some point.
array $elementDataContext
A stack of element names and attributes, as we process them.
getRootElement()
Get the root element.
mixed $filterMatchType
Will contain the type of filter hit if the optional element filter returned a match at some point.
string $rootElement
Name of the document's root element, including any namespace as an expanded URL.