MediaWiki master
XmlTypeCheck.php
Go to the documentation of this file.
1<?php
/**
 * @var bool|null Will be set to true or false to indicate whether the file is
 * well-formed XML. Stays null until a check has produced a verdict.
 */
38 public $wellFormed = null;
39
/**
 * @var bool Will be set to true if the optional element filter returned a
 * match at some point.
 */
44 public $filterMatch = false;
45
/**
 * @var mixed Will contain the type of filter hit if the optional element
 * filter returned a match at some point (the truthy value the callback returned).
 */
51 public $filterMatchType = false;
52
/**
 * @var string Name of the document's root element, including any namespace
 * as an expanded URL.
 */
57 public $rootElement = '';
58
/**
 * @var string[] A stack of strings containing the data of each xml element
 * as it's processed.
 */
64 protected $elementData = [];
65
/**
 * @var array A stack of element names and attributes, as we process them.
 * Each entry is a [ name, attributes ] pair.
 */
69 protected $elementDataContext = [];
70
/**
 * @var int Current depth of the data stack.
 */
74 protected $stackDepth = 0;
75
/** @var callable|null Called as ( $name, $attribs, $data ) each time an element is closed. */
77 protected $filterCallback;
78
/**
 * @var array Parser options; the defaults below are overlaid with the
 * $options passed to the constructor.
 * - processing_instruction_handler: called as ( $target, $data ) for every
 *   processing instruction; a truthy return value is recorded as a filter match.
 * - external_dtd_handler: called as ( $type, $publicId, $systemId ) when the
 *   doctype declares an external identifier; truthy return is a filter match.
 * - dtd_handler: called with the full doctype text; truthy return is a filter match.
 * - require_safe_dtd: when true, an internal DTD subset that fails
 *   checkDTDIsSafe() marks the document as not well-formed.
 */
82 private $parserOptions = [
83 'processing_instruction_handler' => null,
84 'external_dtd_handler' => '',
85 'dtd_handler' => '',
86 'require_safe_dtd' => true
87 ];
88
/**
 * Allow filtering an XML file.
 *
 * Overlays $options on the default parser options, remembers the element
 * filter and immediately runs the check. The outcome is left in
 * $wellFormed, $filterMatch, $filterMatchType and $rootElement.
 *
 * @param string $input A filename or an XML string, depending on $isFile
 * @param callable|null $filterCallback Called as ( $name, $attribs, $data )
 *   for every closed element; a truthy return value is recorded as a match
 * @param bool $isFile True if $input is a filename, false if it is XML text
 * @param array $options Overrides for the parser options
 *   (processing_instruction_handler, external_dtd_handler, dtd_handler,
 *   require_safe_dtd)
 */
public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
	$effectiveOptions = array_merge( $this->parserOptions, $options );
	$this->parserOptions = $effectiveOptions;
	$this->filterCallback = $filterCallback;

	$this->validateFromInput( $input, $isFile );
}
120
/**
 * Alternative constructor: from filename.
 *
 * @param string $fname Path of the XML file to check
 * @param callable|null $filterCallback Element filter, passed through to the constructor
 * @return self
 */
public static function newFromFilename( $fname, $filterCallback = null ) {
	$inputIsFile = true;

	return new self( $fname, $filterCallback, $inputIsFile );
}
135
/**
 * Alternative constructor: from string.
 *
 * @param string $string The XML text to check
 * @param callable|null $filterCallback Element filter, passed through to the constructor
 * @return self
 */
public static function newFromString( $string, $filterCallback = null ) {
	$inputIsFile = false;

	return new self( $string, $filterCallback, $inputIsFile );
}
150
/**
 * Get the root element.
 *
 * @return string Name of the document's root element, including any
 *   namespace as an expanded URL; '' if no element has been seen
 */
public function getRootElement() {
	$root = $this->rootElement;

	return $root;
}
159
/**
 * Open the input with XMLReader and run the validation pass over it.
 *
 * Sets $wellFormed to false when the input cannot be opened. While
 * validating, the libxml entity loader is disabled and entity substitution
 * is switched on; the reader is closed and the previous entity-loader
 * setting restored on every exit path, including when something is thrown.
 *
 * @param string $xml A filename or an XML string, depending on $isFile
 * @param bool $isFile True if $xml is a filename
 * @throws Exception Anything thrown during validation is rethrown after the
 *   document has been marked as not well-formed
 */
private function validateFromInput( $xml, $isFile ) {
	if ( $xml === '' ) {
		// XMLReader::open() / XMLReader::XML() reject an empty source
		// (a thrown ValueError on PHP 8, a warning on PHP 7); there is
		// nothing to parse either way, so report it as not well-formed.
		$this->wellFormed = false;
		return;
	}

	$reader = new XMLReader();
	$flags = LIBXML_NOERROR | LIBXML_NOWARNING;
	$opened = $isFile
		? $reader->open( $xml, null, $flags )
		: $reader->XML( $xml, null, $flags );
	if ( $opened !== true ) {
		// Couldn't open the XML
		$this->wellFormed = false;
		return;
	}

	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	try {
		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
		$this->validate( $reader );
	} catch ( Exception $e ) {
		// Calling this malformed, because we didn't parse the whole
		// thing. Maybe just an external entity reference.
		$this->wellFormed = false;
		throw $e;
	} finally {
		// Runs for normal completion, Exceptions and Errors alike, so the
		// reader and the global libxml setting are never left dangling.
		$reader->close();
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
194
/**
 * Advance the reader to the next node.
 *
 * A temporary error handler is installed around the read: any PHP
 * warning raised while reading marks the document as not well-formed
 * instead of being reported. The previous handler is restored in a
 * finally block so it is put back even if read() throws.
 *
 * @param XMLReader $reader
 * @return bool Result of XMLReader::read(): true if a node was read
 */
private function readNext( XMLReader $reader ) {
	// The handler receives ( errno, errstr, ... ); neither value is
	// needed, the mere fact that it fired is what matters.
	set_error_handler( function ( $errno, $errstr ) {
		$this->wellFormed = false;
		return true;
	} );
	try {
		return $reader->read();
	} finally {
		restore_error_handler();
	}
}
204
/**
 * Walk the whole document with the reader, feeding the callbacks.
 *
 * Everything before the first element is skipped, except that processing
 * instructions and the doctype are passed to their handlers. From the
 * first element on, elements and their text are pushed through
 * elementOpen() / elementData() / elementClose(). When reading stops,
 * $wellFormed becomes false if elements are still open, or true if nothing
 * has flagged a problem so far.
 *
 * @param XMLReader $reader A reader that has already been opened
 */
private function validate( $reader ) {
	// Prolog: advance until the first element shows up.
	while ( true ) {
		if ( !$this->readNext( $reader ) ) {
			// Hit the end of the document before any elements
			$this->wellFormed = false;
			return;
		}
		$kind = $reader->nodeType;
		if ( $kind === XMLReader::PI ) {
			$this->processingInstructionHandler( $reader->name, $reader->value );
		} elseif ( $kind === XMLReader::DOC_TYPE ) {
			$this->dtdHandler( $reader );
		}
		if ( $kind === XMLReader::ELEMENT ) {
			break;
		}
	}

	// Node types whose value is collected as element text.
	$textKinds = [
		XMLReader::WHITESPACE,
		XMLReader::SIGNIFICANT_WHITESPACE,
		XMLReader::CDATA,
		XMLReader::TEXT,
	];

	// Body: the reader is currently positioned on the first element.
	do {
		$kind = $reader->nodeType;

		if ( $kind === XMLReader::ELEMENT ) {
			$qualifiedName = $this->expandNS( $reader->name, $reader->namespaceURI );
			if ( $this->rootElement === '' ) {
				$this->rootElement = $qualifiedName;
			}
			// Must be read before the attributes are walked, because that
			// moves the reader off the element node.
			$selfClosing = $reader->isEmptyElement;
			$attributes = $this->getAttributesArray( $reader );
			$this->elementOpen( $qualifiedName, $attributes );
			if ( $selfClosing ) {
				$this->elementClose();
			}
		} elseif ( $kind === XMLReader::END_ELEMENT ) {
			$this->elementClose();
		} elseif ( in_array( $kind, $textKinds, true ) ) {
			$this->elementData( $reader->value );
		} elseif ( $kind === XMLReader::PI ) {
			// Processing instructions can happen after the header too
			$this->processingInstructionHandler( $reader->name, $reader->value );
		} elseif ( $kind === XMLReader::DOC_TYPE ) {
			// We should never see a doctype after first element.
			$this->wellFormed = false;
		}
		// Anything else (ENTITY_REF, COMMENT, DOC, ENTITY, END_ENTITY,
		// NOTATION, XML_DECLARATION) is not sent to the filter, because
		// xml_parse didn't send these either.
	} while ( $this->readNext( $reader ) );

	$allClosed = ( $this->stackDepth === 0 );
	if ( !$allClosed ) {
		$this->wellFormed = false;
	} elseif ( $this->wellFormed === null ) {
		$this->wellFormed = true;
	}
}
286
/**
 * Collect the attributes of the element the reader is positioned on.
 *
 * Namespace declarations (xmlns attributes) are left out: XMLReader treats
 * them as normal attributes, while xml_parse doesn't. The reader is left
 * on whichever attribute was visited last.
 *
 * @param XMLReader $r
 * @return array Attribute values keyed by namespace-expanded attribute name
 */
private function getAttributesArray( XMLReader $r ) {
	$collected = [];
	for ( $more = $r->moveToNextAttribute(); $more; $more = $r->moveToNextAttribute() ) {
		$uri = $r->namespaceURI;
		if ( $uri !== 'http://www.w3.org/2000/xmlns/' ) {
			$key = $this->expandNS( $r->name, $uri );
			$collected[$key] = $r->value;
		}
	}
	return $collected;
}
305
/**
 * Build the namespace-expanded form of a name.
 *
 * With a namespace URI the result is "<namespaceURI>:<local name>", where
 * the local name is whatever follows the last colon of $name. Without a
 * namespace URI the name is returned untouched.
 *
 * @param string $name Possibly prefixed name, e.g. "svg:rect"
 * @param string $namespaceURI Namespace URI, or an empty value for none
 * @return string
 */
private function expandNS( $name, $namespaceURI ) {
	if ( !$namespaceURI ) {
		return $name;
	}
	$colonAt = strrpos( $name, ':' );
	$local = ( $colonAt === false ) ? $name : substr( $name, $colonAt + 1 );

	return $namespaceURI . ':' . $local;
}
319
/**
 * Record that an element has been opened.
 *
 * Pushes the element's name and attributes on the context stack, starts
 * an empty text buffer for it and increases the stack depth.
 *
 * @param string $name Namespace-expanded element name
 * @param array $attribs Attributes of the element
 */
private function elementOpen( $name, $attribs ) {
	$this->stackDepth += 1;
	array_push( $this->elementData, '' );
	array_push( $this->elementDataContext, [ $name, $attribs ] );
}
329
/**
 * Record that the innermost open element has been closed.
 *
 * Pops the element's context and collected text, decreases the stack
 * depth and, if a callable filter is set, calls it with
 * ( $name, $attribs, $data ). A truthy return value is stored as the
 * filter match type.
 */
private function elementClose() {
	[ $name, $attribs ] = array_pop( $this->elementDataContext );
	$text = array_pop( $this->elementData );
	$this->stackDepth -= 1;

	if ( !is_callable( $this->filterCallback ) ) {
		return;
	}

	$verdict = ( $this->filterCallback )( $name, $attribs, $text );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatchType = $verdict;
		$this->filterMatch = true;
	}
}
345
/**
 * Append text to the buffer of the innermost open element.
 *
 * The text is trimmed before it is appended. Nothing is passed to the
 * filter here; that happens in elementClose().
 *
 * @param string $data Text content reported by the reader
 */
private function elementData( $data ) {
	$top = $this->stackDepth - 1;
	$this->elementData[$top] = $this->elementData[$top] . trim( $data );
}
353
/**
 * Pass a processing instruction to the configured handler, if any.
 *
 * The handler is the 'processing_instruction_handler' parser option. A
 * truthy return value is recorded as a filter match.
 *
 * @param string $target Target (name) of the processing instruction
 * @param string $data Content of the processing instruction
 */
private function processingInstructionHandler( $target, $data ) {
	$handler = $this->parserOptions['processing_instruction_handler'];
	if ( !$handler ) {
		return;
	}

	$verdict = $handler( $target, $data );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatchType = $verdict;
		$this->filterMatch = true;
	}
}
373
/**
 * Run the doctype-related checks for the node the reader is on.
 *
 * Does nothing unless at least one of the 'external_dtd_handler',
 * 'dtd_handler' or 'require_safe_dtd' options is set. Otherwise, in order:
 *  - the general handler gets the raw doctype text,
 *  - the doctype is parsed (a parse failure marks the document malformed),
 *  - the external handler gets ( type, public id, system id ) if the
 *    doctype declares an external identifier,
 *  - the internal subset, if present and safety is required, is checked
 *    with checkDTDIsSafe(); failing it marks the document malformed.
 * A truthy return value from either handler is recorded as a filter match.
 *
 * @param XMLReader $reader Reader positioned on the doctype node
 */
private function dtdHandler( XMLReader $reader ) {
	$options = $this->parserOptions;
	$onExternal = $options['external_dtd_handler'];
	$onAnyDtd = $options['dtd_handler'];
	$mustBeSafe = $options['require_safe_dtd'];
	if ( !( $onExternal || $onAnyDtd || $mustBeSafe ) ) {
		return;
	}

	$dtd = $reader->readOuterXml();

	$generalVerdict = $onAnyDtd ? $onAnyDtd( $dtd ) : false;
	if ( $generalVerdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $generalVerdict;
	}

	$parsedDTD = $this->parseDTD( $dtd );

	$externalVerdict = false;
	if ( $onExternal && isset( $parsedDTD['type'] ) ) {
		$externalVerdict = $onExternal(
			$parsedDTD['type'],
			$parsedDTD['publicid'] ?? null,
			$parsedDTD['systemid'] ?? null
		);
	}
	if ( $externalVerdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $externalVerdict;
	}

	if ( !$mustBeSafe || !isset( $parsedDTD['internal'] ) ) {
		return;
	}
	if ( !$this->checkDTDIsSafe( $parsedDTD['internal'] ) ) {
		$this->wellFormed = false;
	}
}
419
/**
 * Check whether an internal DTD subset only uses an allowed set of constructs.
 *
 * The subset is accepted when it consists solely of (each optionally
 * preceded by whitespace):
 *  - <!ENTITY name "value"> declarations whose quoted value is either a
 *    single entity reference of at most 64 characters, or at most 255
 *    characters containing no '%' and no '&' other than &amp; and the
 *    escape for the surrounding quote (&quot; or &apos;),
 *  - comments,
 *  - the exact declaration
 *    <!ATTLIST svg xmlns:xlink CDATA #FIXED "http://www.w3.org/1999/xlink">.
 * Anything else makes the subset unsafe. If the regular expression engine
 * fails, the result is also false.
 *
 * @param string $internalSubset Text between the [ ] of the doctype
 * @return bool True if the subset is considered safe
 */
private function checkDTDIsSafe( $internalSubset ) {
	$res = preg_match(
		'/^(?:\s*<!ENTITY\s+\S+\s+' .
			'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
			'|\'(?:&[^\'%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
			'|\s*<!--(?:[^-]|-[^-])*-->' .
			'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
			// The dots are escaped so that only the literal xlink namespace
			// URI is accepted, not any look-alike string.
			'"http:\/\/www\.w3\.org\/1999\/xlink">)*\s*$/',
		$internalSubset
	);

	return (bool)$res;
}
453
/**
 * Split a doctype declaration into its parts.
 *
 * Recognises an optional external identifier (PUBLIC with a public and a
 * system identifier, or SYSTEM with a system identifier; each literal in
 * single or double quotes) and an optional internal subset in [ ].
 * If the text does not match this shape, the document is marked as not
 * well-formed and an empty array is returned.
 *
 * @param string $dtd The full doctype text, starting with <!DOCTYPE
 * @return array Zero or more of the keys:
 *   - 'type': 'PUBLIC' or 'SYSTEM'
 *   - 'publicid': the public identifier
 *   - 'systemid': the system identifier
 *   - 'internal': the text of the internal subset
 *   Parts that are absent or empty are left out.
 */
private function parseDTD( $dtd ) {
	$m = [];
	$res = preg_match(
		'/^<!DOCTYPE\s*\S+\s*' .
		'(?:(?P<typepublic>PUBLIC)\s*' .
			'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
			// Both quoting styles must sit inside one group; otherwise the
			// single-quoted form becomes a top-level alternative of its own.
			'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
		'|(?P<typesystem>SYSTEM)\s*' .
			'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
		')?\s*' .
		'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
		$dtd,
		$m
	);
	if ( !$res ) {
		$this->wellFormed = false;
		return [];
	}

	// Capture group => key in the result, in the order the groups appear.
	$groupToKey = [
		'typepublic' => 'type',
		'pubquote' => 'publicid',
		'pubapos' => 'publicid',
		'pubsysquote' => 'systemid',
		'pubsysapos' => 'systemid',
		'typesystem' => 'type',
		'sysquote' => 'systemid',
		'sysapos' => 'systemid',
		'internal' => 'internal',
	];
	$parsed = [];
	foreach ( $groupToKey as $group => $key ) {
		// Groups that did not take part in the match are missing or ''.
		if ( isset( $m[$group] ) && $m[$group] !== '' ) {
			$parsed[$key] = $m[$group];
		}
	}
	return $parsed;
}
507}
XML syntax and type checker.
int $stackDepth
Current depth of the data stack.
callable|null $filterCallback
getRootElement()
Get the root element.
__construct( $input, $filterCallback=null, $isFile=true, $options=[])
Allow filtering an XML file.
string[] $elementData
A stack of strings containing the data of each xml element as it's processed.
bool|null $wellFormed
Will be set to true or false to indicate whether the file is well-formed XML.
string $rootElement
Name of the document's root element, including any namespace as an expanded URL.
bool $filterMatch
Will be set to true if the optional element filter returned a match at some point.
static newFromFilename( $fname, $filterCallback=null)
Alternative constructor: from filename.
mixed $filterMatchType
Will contain the type of filter hit if the optional element filter returned a match at some point.
array $elementDataContext
A stack of element names and attributes, as we process them.
static newFromString( $string, $filterCallback=null)
Alternative constructor: from string.