MediaWiki master
XmlTypeCheck.php
Go to the documentation of this file.
1<?php
21namespace Wikimedia\Mime;
22
23use Exception;
24use XMLReader;
25
/**
 * Will be set to true or false to indicate whether the file is
 * well-formed XML. Stays null until parsing has reached a verdict.
 * @var bool|null
 */
43 public $wellFormed = null;
44
/**
 * Will be set to true if the optional element filter (or one of the
 * processing-instruction / DTD handlers) returned a match at some point.
 * @var bool
 */
49 public $filterMatch = false;
50
/**
 * Will contain the (truthy) value returned by the callback that produced
 * the most recent filter match; false while there has been no match.
 * @var mixed
 */
56 public $filterMatchType = false;
57
/**
 * Name of the document's root element, including any namespace
 * as an expanded URL. Empty string until the first element is seen.
 * @var string
 */
62 public $rootElement = '';
63
/**
 * A stack of strings containing the character data of each XML element
 * that is currently open, indexed by nesting depth.
 * @var string[]
 */
69 protected $elementData = [];
70
/**
 * A stack of [ element name, attributes ] pairs, one per open element.
 * @var array
 */
74 protected $elementDataContext = [];
75
/**
 * Current depth of the data stack (number of currently open elements).
 * @var int
 */
79 protected $stackDepth = 0;
80
/**
 * Element filter; when callable it is invoked as
 * ( $name, $attribs, $data ) each time an element is closed.
 * @var callable|null
 */
82 protected $filterCallback;
83
/**
 * Parser options; the constructor merges caller-supplied options over these defaults.
 *  - processing_instruction_handler: callback invoked as ( $target, $data )
 *    for every processing instruction; a truthy return is recorded as a filter match.
 *  - external_dtd_handler: callback invoked as ( $type, $publicId, $systemId )
 *    when the doctype declares a PUBLIC or SYSTEM identifier.
 *  - dtd_handler: callback invoked with the full text of the doctype declaration.
 *  - require_safe_dtd: when truthy, an internal DTD subset that fails
 *    checkDTDIsSafe() marks the document as not well-formed.
 * @var array
 */
87 private $parserOptions = [
88 'processing_instruction_handler' => null,
89 'external_dtd_handler' => '',
90 'dtd_handler' => '',
91 'require_safe_dtd' => true
92 ];
93
/**
 * Allow filtering an XML file.
 *
 * The input is parsed immediately; afterwards the outcome can be read from
 * $wellFormed, $filterMatch, $filterMatchType and getRootElement().
 *
 * @param string $input Filename or XML text, depending on $isFile
 * @param callable|null $filterCallback Invoked as ( $name, $attribs, $data )
 *   whenever an element is closed; a truthy return value is recorded as a filter match
 * @param bool $isFile True if $input is a filename, false if it is the XML itself
 * @param array $options Overrides for the default parser options
 *   (processing_instruction_handler, external_dtd_handler, dtd_handler, require_safe_dtd)
 * @throws Exception Any exception raised while parsing is re-thrown
 */
public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
	// Caller-supplied options win over the defaults on key collisions.
	$effectiveOptions = array_merge( $this->parserOptions, $options );
	$this->parserOptions = $effectiveOptions;
	$this->filterCallback = $filterCallback;

	// Everything must be configured before parsing starts.
	$this->validateFromInput( $input, $isFile );
}
125
/**
 * Alternative constructor: from filename.
 *
 * @param string $fname Path of the XML file to check
 * @param callable|null $filterCallback Element filter, see __construct()
 * @return self
 */
public static function newFromFilename( $fname, $filterCallback = null ) {
	$inputIsFile = true;
	return new self( $fname, $filterCallback, $inputIsFile );
}
140
/**
 * Alternative constructor: from string.
 *
 * @param string $string The XML text to check
 * @param callable|null $filterCallback Element filter, see __construct()
 * @return self
 */
public static function newFromString( $string, $filterCallback = null ) {
	$inputIsFile = false;
	return new self( $string, $filterCallback, $inputIsFile );
}
155
/**
 * Get the root element.
 *
 * @return string Name of the first element encountered, with any namespace
 *   expanded to its URI; empty string if no element was found
 */
public function getRootElement() {
	$rootName = $this->rootElement;
	return $rootName;
}
164
/**
 * Open the input with XMLReader and run the validation pass over it.
 *
 * Sets $wellFormed to false when the source is empty or cannot be opened,
 * or when validate() throws an Exception (which is then re-thrown).
 *
 * @param string $xml Filename or XML text, depending on $isFile
 * @param bool $isFile True if $xml is a filename, false if it is the XML itself
 * @throws Exception Whatever validate() (or a callback it runs) throws
 */
private function validateFromInput( $xml, $isFile ) {
	// An empty source can never be well-formed. Bail out before touching
	// XMLReader: PHP 8 raises a ValueError for an empty source, where older
	// versions merely returned false.
	if ( $xml === null || $xml === '' ) {
		$this->wellFormed = false;
		return;
	}

	$reader = new XMLReader();
	$libxmlFlags = LIBXML_NOERROR | LIBXML_NOWARNING;
	$opened = $isFile
		? $reader->open( $xml, null, $libxmlFlags )
		: $reader->XML( $xml, null, $libxmlFlags );
	if ( $opened !== true ) {
		// Couldn't open the XML
		$this->wellFormed = false;
		return;
	}

	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	try {
		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
		$this->validate( $reader );
	} catch ( Exception $e ) {
		// Calling this malformed, because we didn't parse the whole
		// thing. Maybe just an external entity reference.
		$this->wellFormed = false;
		throw $e;
	} finally {
		// Runs on success, on Exception and on Error alike, so the reader is
		// always closed and the process-wide entity loader setting is always
		// put back to what it was.
		$reader->close();
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
199
/**
 * Advance the reader to the next node, treating any PHP error raised
 * during the read as a sign that the document is not well-formed.
 *
 * @param XMLReader $reader
 * @return bool Result of XMLReader::read(): false once no further node can be read
 */
private function readNext( XMLReader $reader ) {
	// The handler receives the error level and message; neither is needed,
	// the mere fact that an error was raised is what matters.
	set_error_handler( function ( $errno, $errstr ) {
		$this->wellFormed = false;
		// Returning true marks the error as handled.
		return true;
	} );
	try {
		return $reader->read();
	} finally {
		// Always remove the temporary handler, even if read() throws, so it
		// can never leak into unrelated code.
		restore_error_handler();
	}
}
209
/**
 * Walk the whole document, feeding elements, character data, processing
 * instructions and the doctype to the respective handlers, and decide
 * whether the document is well-formed.
 *
 * @param XMLReader $reader An opened reader positioned before the first node
 */
private function validate( $reader ) {
	// Phase 1: skip everything in front of the first element, handing
	// processing instructions and the doctype to their handlers on the way.
	while ( true ) {
		if ( !$this->readNext( $reader ) ) {
			// Hit the end of the document before any elements
			$this->wellFormed = false;
			return;
		}
		$prologType = $reader->nodeType;
		if ( $prologType === XMLReader::ELEMENT ) {
			break;
		}
		if ( $prologType === XMLReader::PI ) {
			$this->processingInstructionHandler( $reader->name, $reader->value );
		} elseif ( $prologType === XMLReader::DOC_TYPE ) {
			$this->dtdHandler( $reader );
		}
	}

	// Node types whose value is accumulated as character data of the
	// enclosing element.
	$textNodeTypes = [
		XMLReader::WHITESPACE,
		XMLReader::SIGNIFICANT_WHITESPACE,
		XMLReader::CDATA,
		XMLReader::TEXT,
	];

	// Phase 2: the reader now sits on the first element; process it and
	// everything after it.
	do {
		$nodeType = $reader->nodeType;

		if ( $nodeType === XMLReader::ELEMENT ) {
			$expandedName = $this->expandNS( $reader->name, $reader->namespaceURI );
			if ( $this->rootElement === '' ) {
				$this->rootElement = $expandedName;
			}
			// Read the empty-element flag first: collecting the attributes
			// repositions the reader onto the attribute nodes.
			$selfClosing = $reader->isEmptyElement;
			$attributes = $this->getAttributesArray( $reader );
			$this->elementOpen( $expandedName, $attributes );
			if ( $selfClosing ) {
				// No END_ELEMENT node is handled for this element, so close it right away.
				$this->elementClose();
			}
		} elseif ( $nodeType === XMLReader::END_ELEMENT ) {
			$this->elementClose();
		} elseif ( in_array( $nodeType, $textNodeTypes, true ) ) {
			$this->elementData( $reader->value );
		} elseif ( $nodeType === XMLReader::PI ) {
			// Processing instructions can happen after the header too
			$this->processingInstructionHandler( $reader->name, $reader->value );
		} elseif ( $nodeType === XMLReader::DOC_TYPE ) {
			// We should never see a doctype after first element.
			$this->wellFormed = false;
		}
		// Everything else is deliberately not sent to the filter
		// (xml_parse didn't send these either): ENTITY_REF (unexpanded,
		// maybe external, entity), COMMENT, and DOC, ENTITY, END_ENTITY,
		// NOTATION or XML_DECLARATION.
	} while ( $this->readNext( $reader ) );

	if ( $this->stackDepth !== 0 ) {
		// Some element was never closed.
		$this->wellFormed = false;
		return;
	}
	if ( $this->wellFormed === null ) {
		// Nothing flagged a problem along the way.
		$this->wellFormed = true;
	}
}
291
/**
 * Collect the attributes of the element the reader is positioned on.
 *
 * @param XMLReader $r Reader positioned on an element; it is moved across
 *   the element's attributes while collecting
 * @return array Attribute values keyed by attribute name, with namespaced
 *   names expanded via expandNS()
 */
private function getAttributesArray( XMLReader $r ) {
	$collected = [];
	while ( $r->moveToNextAttribute() ) {
		$attrNamespace = $r->namespaceURI;
		// XMLReader treats xmlns attributes as normal attributes, while
		// xml_parse doesn't, so namespace declarations are left out.
		if ( $attrNamespace !== 'http://www.w3.org/2000/xmlns/' ) {
			$expandedName = $this->expandNS( $r->name, $attrNamespace );
			$collected[$expandedName] = $r->value;
		}
	}
	return $collected;
}
310
/**
 * Turn a possibly prefixed name into "namespaceURI:localname".
 *
 * @param string $name Element or attribute name, optionally with a prefix
 * @param string $namespaceURI Namespace URI of the node; falsy if it has none
 * @return string "$namespaceURI:localname" when a namespace is given,
 *   otherwise $name unchanged
 */
private function expandNS( $name, $namespaceURI ) {
	if ( !$namespaceURI ) {
		return $name;
	}
	// The local name is whatever follows the last colon, or the whole
	// name when there is no prefix.
	$lastColon = strrpos( $name, ':' );
	$localname = $lastColon === false ? $name : substr( $name, $lastColon + 1 );
	return "$namespaceURI:$localname";
}
324
/**
 * Record that an element has been opened: push its name and attributes
 * and an empty character-data buffer, and increase the depth.
 *
 * @param string $name Expanded element name
 * @param array $attribs Attributes of the element
 */
private function elementOpen( $name, $attribs ) {
	$this->stackDepth++;
	// Fresh, empty text buffer for the new element.
	array_push( $this->elementData, '' );
	array_push( $this->elementDataContext, [ $name, $attribs ] );
}
334
/**
 * Record that the innermost open element has been closed and, if a
 * filter callback is set, run it on that element's name, attributes and
 * collected character data.
 */
private function elementClose() {
	$context = array_pop( $this->elementDataContext );
	[ $elementName, $elementAttribs ] = $context;
	$text = array_pop( $this->elementData );
	$this->stackDepth--;

	if ( !is_callable( $this->filterCallback ) ) {
		return;
	}

	$verdict = ( $this->filterCallback )( $elementName, $elementAttribs, $text );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $verdict;
	}
}
350
/**
 * Append a chunk of character data to the buffer of the innermost open
 * element. The chunk is trimmed before it is appended.
 *
 * @param string $data Text, CDATA or whitespace content
 */
private function elementData( $data ) {
	// Only collect here; the callback runs in elementClose().
	$innermost = $this->stackDepth - 1;
	$chunk = trim( $data );
	$this->elementData[$innermost] .= $chunk;
}
358
/**
 * Pass a processing instruction to the configured
 * 'processing_instruction_handler', if any, and record a filter match
 * when the handler returns a truthy value.
 *
 * @param string $target Target (name) of the processing instruction
 * @param string $data Content of the processing instruction
 */
private function processingInstructionHandler( $target, $data ) {
	$handler = $this->parserOptions['processing_instruction_handler'];
	if ( !$handler ) {
		return;
	}

	// @phan-suppress-next-line PhanTypeInvalidCallable false positive
	$verdict = $handler( $target, $data );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $verdict;
	}
}
378
/**
 * Handle the doctype declaration: run the configured DTD callbacks on it
 * and, if required, check that its internal subset is safe.
 *
 * Does nothing when neither callback is configured and 'require_safe_dtd'
 * is off.
 *
 * @param XMLReader $reader Reader positioned on the DOCTYPE node
 */
private function dtdHandler( XMLReader $reader ) {
	$onExternalDtd = $this->parserOptions['external_dtd_handler'];
	$onAnyDtd = $this->parserOptions['dtd_handler'];
	$mustBeSafe = $this->parserOptions['require_safe_dtd'];
	if ( !( $onExternalDtd || $onAnyDtd || $mustBeSafe ) ) {
		return;
	}

	$doctypeText = $reader->readOuterXml();

	// 1. The general handler sees the raw declaration.
	if ( $onAnyDtd ) {
		$verdict = $onAnyDtd( $doctypeText );
		if ( $verdict ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $verdict;
		}
	}

	// 2. The external handler only runs when the declaration names a
	// PUBLIC or SYSTEM identifier.
	$parts = $this->parseDTD( $doctypeText );
	if ( $onExternalDtd && isset( $parts['type'] ) ) {
		$verdict = $onExternalDtd(
			$parts['type'],
			$parts['publicid'] ?? null,
			$parts['systemid'] ?? null
		);
		if ( $verdict ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $verdict;
		}
	}

	// 3. An internal subset containing anything beyond the allow-listed
	// declarations makes the document count as not well-formed.
	if ( $mustBeSafe && isset( $parts['internal'] ) ) {
		if ( !$this->checkDTDIsSafe( $parts['internal'] ) ) {
			$this->wellFormed = false;
		}
	}
}
424
/**
 * Check whether the internal subset of a DTD consists solely of
 * allow-listed declarations.
 *
 * Accepted, in any number and order, separated by optional whitespace:
 *  - <!ENTITY name "value"> (or with single quotes), where the value is
 *    either one entity reference of 1 to 64 characters, or at most 255
 *    characters containing no '%' and no '&' other than &amp; and the
 *    escape for the surrounding quote (&quot; or &apos;);
 *  - comments that contain no "--";
 *  - the exact declaration
 *    <!ATTLIST svg xmlns:xlink CDATA #FIXED "http://www.w3.org/1999/xlink">.
 *
 * Anything else makes the subset count as unsafe, as does a failure of
 * the regular expression engine.
 *
 * @param string $internalSubset Text between the [ ] of the doctype declaration
 * @return bool True if the subset is considered safe
 */
private function checkDTDIsSafe( $internalSubset ) {
	$res = preg_match(
		'/^(?:\s*<!ENTITY\s+\S+\s+' .
			'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
			'|\'(?:&[^\'%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
			'|\s*<!--(?:[^-]|-[^-])*-->' .
			'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
			// The dots are escaped so that only the literal xlink
			// namespace URL is accepted, not any look-alike.
			'"http:\/\/www\.w3\.org\/1999\/xlink">)*\s*$/',
		$internalSubset
	);

	// preg_match() yields 1 on a match, 0 on no match and false on error;
	// only an actual match counts as safe.
	return $res === 1;
}
458
/**
 * Split a doctype declaration into its parts.
 *
 * If the declaration cannot be parsed, $wellFormed is set to false and an
 * empty array is returned.
 *
 * @param string $dtd The complete <!DOCTYPE ...> declaration
 * @return array With these keys, each present only when the corresponding
 *   part exists and is non-empty:
 *    - 'type': 'PUBLIC' or 'SYSTEM'
 *    - 'publicid': the public identifier
 *    - 'systemid': the system identifier
 *    - 'internal': the text of the internal subset
 */
private function parseDTD( $dtd ) {
	$m = [];
	$res = preg_match(
		'/^<!DOCTYPE\s*\S+\s*' .
		'(?:(?P<typepublic>PUBLIC)\s*' .
			'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
			// The system identifier alternatives must be grouped; otherwise
			// the "|" splits the whole PUBLIC branch and a single-quoted
			// system identifier is never matched as part of it.
			'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
		'|(?P<typesystem>SYSTEM)\s*' .
			'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
		')?\s*' .
		'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
		$dtd,
		$m
	);
	if ( !$res ) {
		$this->wellFormed = false;
		return [];
	}

	// Named capture group => key in the result. The double-quoted and
	// single-quoted variants of a part map to the same key; at most one
	// of them can have matched.
	$groupToKey = [
		'typepublic' => 'type',
		'pubquote' => 'publicid',
		'pubapos' => 'publicid',
		'pubsysquote' => 'systemid',
		'pubsysapos' => 'systemid',
		'typesystem' => 'type',
		'sysquote' => 'systemid',
		'sysapos' => 'systemid',
		'internal' => 'internal',
	];

	$parsed = [];
	foreach ( $groupToKey as $group => $key ) {
		// Groups that did not take part in the match are either missing
		// or empty strings; both are skipped.
		if ( isset( $m[$group] ) && $m[$group] !== '' ) {
			$parsed[$key] = $m[$group];
		}
	}
	return $parsed;
}
512}
513
// Keep the class reachable under the un-namespaced name 'XmlTypeCheck' as well.
515class_alias( XmlTypeCheck::class, 'XmlTypeCheck' );
XML syntax and type checker.
__construct( $input, $filterCallback=null, $isFile=true, $options=[])
Allow filtering an XML file.
int $stackDepth
Current depth of the data stack.
static newFromString( $string, $filterCallback=null)
Alternative constructor: from string.
string[] $elementData
A stack of strings containing the data of each xml element as it's processed.
bool|null $wellFormed
Will be set to true or false to indicate whether the file is well-formed XML.
static newFromFilename( $fname, $filterCallback=null)
Alternative constructor: from filename.
bool $filterMatch
Will be set to true if the optional element filter returned a match at some point.
array $elementDataContext
A stack of element names and attributes, as we process them.
getRootElement()
Get the root element.
mixed $filterMatchType
Will contain the type of filter hit if the optional element filter returned a match at some point.
string $rootElement
Name of the document's root element, including any namespace as an expanded URL.