MediaWiki REL1_34
XmlTypeCheck.php
Go to the documentation of this file.
1<?php
/**
 * @var bool|null Will be set to true or false to indicate whether the file is
 * well-formed XML. Stays null until parsing has reached a verdict.
 */
33 public $wellFormed = null;
34
/**
 * @var bool Will be set to true if the optional element filter returned
 * a match at some point.
 */
39 public $filterMatch = false;
40
/**
 * @var mixed Will contain the type of filter hit if the optional element filter
 * returned a match at some point (the callback's truthy return value).
 */
46 public $filterMatchType = false;
47
/**
 * @var string Name of the document's root element, including any namespace
 * as an expanded URL.
 */
52 public $rootElement = '';
53
/**
 * @var string[] A stack of strings containing the data of each xml element
 * as it's processed. Pushed in elementOpen(), popped in elementClose().
 */
59 protected $elementData = [];
60
/**
 * @var array A stack of element names and attributes, as we process them.
 * Each entry is a [ name, attributes ] pair.
 */
64 protected $elementDataContext = [];
65
/**
 * @var int Current depth of the data stack.
 */
69 protected $stackDepth = 0;
70
/** @var callable|null Element filter, called as ( name, attributes, data ) when an element closes */
72 protected $filterCallback;
73
/**
 * @var array Additional parsing options:
 *  - processing_instruction_handler: callback invoked as ( target, data ) for
 *    each processing instruction; a truthy return is recorded as a filter hit.
 *  - external_dtd_handler: callback invoked as ( type, publicid, systemid ) when
 *    the doctype declares a PUBLIC or SYSTEM identifier; truthy return = filter hit.
 *  - dtd_handler: callback invoked with the full doctype text; truthy return = filter hit.
 *  - require_safe_dtd: when true, an internal DTD subset that fails
 *    checkDTDIsSafe() marks the document as not well-formed.
 */
77 private $parserOptions = [
78 'processing_instruction_handler' => null,
79 'external_dtd_handler' => '',
80 'dtd_handler' => '',
81 'require_safe_dtd' => true
82 ];
83
/**
 * Allow filtering an XML file.
 *
 * Layers the caller's options over the default parser options, remembers the
 * element filter, and then parses $input straight away. The outcome is left in
 * the public properties ($wellFormed, $filterMatch, $filterMatchType, $rootElement).
 *
 * @param string $input A filename when $isFile is true, otherwise the XML text itself
 * @param callable|null $filterCallback Called as ( name, attributes, data ) each time
 *   an element closes; a truthy return value is recorded as a filter hit
 * @param bool $isFile Whether $input names a file (true) or holds the XML (false)
 * @param array $options Overrides for the default parser options
 */
function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
	$effectiveOptions = array_merge( $this->parserOptions, $options );

	$this->parserOptions = $effectiveOptions;
	$this->filterCallback = $filterCallback;

	// Parsing happens as part of construction.
	$this->validateFromInput( $input, $isFile );
}
115
/**
 * Alternative constructor: from filename.
 *
 * @param string $fname Path of the XML file to check
 * @param callable|null $filterCallback Element filter, passed through to the constructor
 * @return XmlTypeCheck
 */
public static function newFromFilename( $fname, $filterCallback = null ) {
	$inputIsFile = true;

	return new self( $fname, $filterCallback, $inputIsFile );
}
130
/**
 * Alternative constructor: from string.
 *
 * @param string $string The XML text to check
 * @param callable|null $filterCallback Element filter, passed through to the constructor
 * @return XmlTypeCheck
 */
public static function newFromString( $string, $filterCallback = null ) {
	$inputIsFile = false;

	return new self( $string, $filterCallback, $inputIsFile );
}
145
/**
 * Get the root element.
 *
 * @return string Name of the first element encountered (namespace-expanded),
 *   or the empty string if no element has been seen
 */
public function getRootElement() {
	$root = $this->rootElement;

	return $root;
}
154
/**
 * Open the input with XMLReader and run the validation pass over it.
 *
 * If the input cannot be opened, $wellFormed is set to false and nothing is
 * parsed. Otherwise the libxml entity loader is disabled for the duration of
 * the parse and restored afterwards, whatever the outcome.
 *
 * @param string $xml A filename or the XML text, depending on $isFile
 * @param bool $isFile True if $xml is a filename, false if it is the XML itself
 * @throws Exception Anything thrown while validating is rethrown after
 *   $wellFormed has been set to false
 */
private function validateFromInput( $xml, $isFile ) {
	$reader = new XMLReader();
	$libxmlFlags = LIBXML_NOERROR | LIBXML_NOWARNING;
	if ( $isFile ) {
		$s = $reader->open( $xml, null, $libxmlFlags );
	} else {
		$s = $reader->XML( $xml, null, $libxmlFlags );
	}
	if ( $s !== true ) {
		// Couldn't open the XML
		$this->wellFormed = false;
		return;
	}

	// The entity loader is switched off only after the reader has been opened;
	// presumably opening a file needs the loader -- confirm before reordering.
	// NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0.
	$oldDisable = libxml_disable_entity_loader( true );
	try {
		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
		$this->validate( $reader );
	} catch ( Exception $e ) {
		// Calling this malformed, because we didn't parse the whole
		// thing. Maybe just an external entity reference.
		$this->wellFormed = false;
		throw $e;
	} finally {
		// Runs for every exit path (normal return, Exception, Error), so the
		// reader is always closed and the process-wide entity-loader setting
		// is always put back to what it was.
		$reader->close();
		libxml_disable_entity_loader( $oldDisable );
	}
}
186
/**
 * Advance the reader by one node, routing any PHP error or warning raised
 * during the read to XmlErrorHandler() (which marks the document as not
 * well-formed).
 *
 * @param XMLReader $reader
 * @return bool Result of XMLReader::read(): false when no further node was read
 */
private function readNext( XMLReader $reader ) {
	set_error_handler( [ $this, 'XmlErrorHandler' ] );
	try {
		return $reader->read();
	} finally {
		// Always uninstall the temporary handler, even if read() throws,
		// so it cannot keep swallowing unrelated errors later on.
		restore_error_handler();
	}
}
193
/**
 * Error handler installed by readNext() around each XMLReader::read() call.
 *
 * Any error reaching this handler marks the document as not well-formed.
 * Neither argument is inspected.
 *
 * @param int $errno Error level (unused)
 * @param string $errstr Error message (unused)
 */
public function XmlErrorHandler( $errno, $errstr ) {
	// A single reported problem is enough to fail the document.
	$this->wellFormed = false;
}
197
/**
 * Walk the document node by node, feeding the configured callbacks and
 * recording the verdict in $wellFormed.
 *
 * Phase one skips everything before the first element, handing processing
 * instructions and the doctype to their handlers. Phase two processes the
 * first element and everything after it. At the end the document counts as
 * well-formed only if every opened element was closed and nothing along the
 * way has already set $wellFormed to false.
 *
 * @param XMLReader $reader An opened reader that has not been read from yet
 */
private function validate( XMLReader $reader ) {
	// First, move through anything that isn't an element, and
	// handle any processing instructions with the callback
	do {
		if ( !$this->readNext( $reader ) ) {
			// Hit the end of the document before any elements
			$this->wellFormed = false;
			return;
		}
		if ( $reader->nodeType === XMLReader::PI ) {
			$this->processingInstructionHandler( $reader->name, $reader->value );
		}
		if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
			$this->DTDHandler( $reader );
		}
	} while ( $reader->nodeType !== XMLReader::ELEMENT );

	// Process the rest of the document
	do {
		switch ( $reader->nodeType ) {
			case XMLReader::ELEMENT:
				$name = $this->expandNS(
					$reader->name,
					$reader->namespaceURI
				);
				if ( $this->rootElement === '' ) {
					// The first element seen is the root
					$this->rootElement = $name;
				}
				// Read before getAttributesArray() moves the cursor
				// onto the element's attribute nodes.
				$empty = $reader->isEmptyElement;
				$attrs = $this->getAttributesArray( $reader );
				$this->elementOpen( $name, $attrs );
				if ( $empty ) {
					// Self-closing element: open and close in one step
					$this->elementClose();
				}
				break;

			case XMLReader::END_ELEMENT:
				$this->elementClose();
				break;

			case XMLReader::WHITESPACE:
			case XMLReader::SIGNIFICANT_WHITESPACE:
			case XMLReader::CDATA:
			case XMLReader::TEXT:
				$this->elementData( $reader->value );
				break;

			case XMLReader::ENTITY_REF:
				// Unexpanded entity (maybe external?),
				// don't send to the filter (xml_parse didn't)
				break;

			case XMLReader::COMMENT:
				// Don't send to the filter (xml_parse didn't)
				break;

			case XMLReader::PI:
				// Processing instructions can happen after the header too
				$this->processingInstructionHandler(
					$reader->name,
					$reader->value
				);
				break;

			case XMLReader::DOC_TYPE:
				// We should never see a doctype after first
				// element.
				$this->wellFormed = false;
				break;

			default:
				// One of DOC, ENTITY, END_ENTITY,
				// NOTATION, or XML_DECLARATION
				// xml_parse didn't send these to the filter, so we won't.
		}
	} while ( $this->readNext( $reader ) );

	if ( $this->stackDepth !== 0 ) {
		// Some element was opened but never closed
		$this->wellFormed = false;
	} elseif ( $this->wellFormed === null ) {
		// Nothing flagged a problem along the way
		$this->wellFormed = true;
	}
}
279
/**
 * Get all of the attributes for an XMLReader's current node.
 *
 * Namespace declarations (attributes in the xmlns namespace) are left out.
 * The reader's cursor is moved across the attributes while collecting them.
 *
 * @param XMLReader $r Reader positioned on an element
 * @return array Map of namespace-expanded attribute name => attribute value
 */
private function getAttributesArray( XMLReader $r ) {
	$collected = [];
	while ( $r->moveToNextAttribute() ) {
		// XMLReader treats xmlns attributes as normal
		// attributes, while xml_parse doesn't
		$isNamespaceDeclaration = ( $r->namespaceURI === 'http://www.w3.org/2000/xmlns/' );
		if ( !$isNamespaceDeclaration ) {
			$expandedName = $this->expandNS( $r->name, $r->namespaceURI );
			$collected[$expandedName] = $r->value;
		}
	}
	return $collected;
}
298
/**
 * Build the namespace-expanded form of a (possibly prefixed) name.
 *
 * With a namespace URI, the result is "<namespaceURI>:<local name>", where the
 * local name is whatever follows the last colon of $name. Without one, $name
 * is returned untouched.
 *
 * @param string $name Element or attribute name, possibly "prefix:local"
 * @param string|null $namespaceURI Namespace URI of the node, if any
 * @return string
 */
private function expandNS( $name, $namespaceURI ) {
	if ( !$namespaceURI ) {
		return $name;
	}

	// Keep only what follows the last colon (the whole name if there is none)
	$lastColon = strrpos( $name, ':' );
	$localname = ( $lastColon === false ) ? $name : substr( $name, $lastColon + 1 );

	return $namespaceURI . ':' . $localname;
}
312
/**
 * Record the start of an element: push its name/attributes and an empty
 * text buffer onto their stacks, and go one level deeper.
 *
 * @param string $name Namespace-expanded element name
 * @param array $attribs Attribute name => value map
 */
private function elementOpen( $name, $attribs ) {
	$this->stackDepth++;
	// Text buffer that elementData() will append to for this element
	array_push( $this->elementData, '' );
	array_push( $this->elementDataContext, [ $name, $attribs ] );
}
322
/**
 * Record the end of an element: pop its context and collected text, go one
 * level up, and run the element filter (if one is callable) on the result.
 * A truthy filter result is stored as the filter match.
 */
private function elementClose() {
	$context = array_pop( $this->elementDataContext );
	list( $name, $attribs ) = $context;
	$text = array_pop( $this->elementData );
	$this->stackDepth--;

	if ( !is_callable( $this->filterCallback ) ) {
		// No usable filter, nothing to report
		return;
	}

	$verdict = call_user_func( $this->filterCallback, $name, $attribs, $text );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $verdict;
	}
}
343
/**
 * Append a chunk of character data to the buffer of the innermost open
 * element. The chunk is trimmed first; the filter callback sees the
 * accumulated text later, in elementClose().
 *
 * @param string $data Text, CDATA or whitespace content of the current node
 */
private function elementData( $data ) {
	$chunk = trim( $data );
	$innermost = $this->stackDepth - 1;
	$this->elementData[$innermost] .= $chunk;
}
351
/**
 * Pass a processing instruction to the 'processing_instruction_handler'
 * option, if one is set, and record a truthy result as a filter match.
 *
 * @param string $target Target (name) of the processing instruction
 * @param string $data Content of the processing instruction
 */
private function processingInstructionHandler( $target, $data ) {
	$handler = $this->parserOptions['processing_instruction_handler'];
	if ( !$handler ) {
		// No handler configured, so there is nothing that could match
		return;
	}

	$verdict = call_user_func( $handler, $target, $data );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $verdict;
	}
}
371
/**
 * Handle coming across a <!DOCTYPE declaration.
 *
 * Depending on the parser options this will, in order:
 *  - give the raw doctype text to 'dtd_handler',
 *  - give type / public id / system id to 'external_dtd_handler' when the
 *    doctype declares an external identifier,
 *  - with 'require_safe_dtd', mark the document as not well-formed if the
 *    internal subset does not pass checkDTDIsSafe().
 * A truthy result from either callback is recorded as a filter match.
 *
 * @param XMLReader $reader Reader positioned on the doctype node
 */
private function DTDHandler( XMLReader $reader ) {
	$onExternalId = $this->parserOptions['external_dtd_handler'];
	$onAnyDtd = $this->parserOptions['dtd_handler'];
	$mustBeSafe = $this->parserOptions['require_safe_dtd'];

	if ( !( $onExternalId || $onAnyDtd || $mustBeSafe ) ) {
		// Nobody is interested in the doctype
		return;
	}

	$dtd = $reader->readOuterXml();

	if ( $onAnyDtd ) {
		$verdict = call_user_func( $onAnyDtd, $dtd );
		if ( $verdict ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $verdict;
		}
	}

	// Always parsed, even without an external handler: parseDTD() itself
	// flags an unparseable doctype as not well-formed.
	$parts = $this->parseDTD( $dtd );

	if ( $onExternalId && isset( $parts['type'] ) ) {
		$verdict = call_user_func(
			$onExternalId,
			$parts['type'],
			$parts['publicid'] ?? null,
			$parts['systemid'] ?? null
		);
		if ( $verdict ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $verdict;
		}
	}

	if ( $mustBeSafe && isset( $parts['internal'] ) ) {
		if ( !$this->checkDTDIsSafe( $parts['internal'] ) ) {
			$this->wellFormed = false;
		}
	}
}
419
/**
 * Check if the internal subset of the DTD is safe.
 *
 * The subset passes only if it consists entirely of (whitespace-separated):
 *  - <!ENTITY name "value"> declarations whose quoted value is either one
 *    entity reference (at most 64 characters between & and ;) or at most
 *    255 characters containing no %, and no & other than &amp; and the
 *    escaped form of the enclosing quote (&quot; or &apos;);
 *  - comments;
 *  - the exact declaration
 *    <!ATTLIST svg xmlns:xlink CDATA #FIXED "http://www.w3.org/1999/xlink">.
 * Anything else fails the check, including parameter entities and entities
 * using SYSTEM/PUBLIC. A regex engine error also counts as "not safe".
 *
 * NOTE(review): the single-quoted entity branch excludes '"' rather than "'"
 * in its entity-reference character class; looks like a copy/paste of the
 * double-quoted branch -- confirm whether that is intended.
 *
 * @param string $internalSubset Text between the [ ] of the doctype
 * @return bool True if the subset matches the whitelist above
 */
private function checkDTDIsSafe( $internalSubset ) {
	$res = preg_match(
		'/^(?:\s*<!ENTITY\s+\S+\s+' .
			'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
			'|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
			'|\s*<!--(?:[^-]|-[^-])*-->' .
			'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
			// Dots are escaped so only the literal namespace URI is accepted
			'"http:\/\/www\.w3\.org\/1999\/xlink">)*\s*$/',
		$internalSubset
	);

	// preg_match() gives 1 on match, 0 on no match, false on error
	return (bool)$res;
}
454
/**
 * Parse DTD into parts.
 *
 * If the doctype text cannot be parsed, $wellFormed is set to false and an
 * empty array is returned.
 *
 * @param string $dtd The full doctype declaration, from "<!DOCTYPE" to ">"
 * @return array Any of the following keys, present only when non-empty:
 *  - type: 'PUBLIC' or 'SYSTEM'
 *  - publicid: the public identifier
 *  - systemid: the system identifier
 *  - internal: the internal subset (text between the square brackets)
 */
private function parseDTD( $dtd ) {
	$m = [];
	$res = preg_match(
		'/^<!DOCTYPE\s*\S+\s*' .
		'(?:(?P<typepublic>PUBLIC)\s*' .
			'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
			// The system identifier alternatives are grouped so that the
			// single-quoted form is only accepted as part of a PUBLIC id
			// pair, not as a stand-alone alternative.
			'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
		'|(?P<typesystem>SYSTEM)\s*' .
			'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
		')?\s*' .
		'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
		$dtd,
		$m
	);
	if ( !$res ) {
		$this->wellFormed = false;
		return [];
	}

	$parsed = [];
	foreach ( $m as $field => $value ) {
		// Skip groups that did not take part in the match, and the
		// numeric duplicates preg_match() adds for every named group
		if ( $value === '' || is_numeric( $field ) ) {
			continue;
		}
		switch ( $field ) {
			case 'typepublic':
			case 'typesystem':
				$parsed['type'] = $value;
				break;
			case 'pubquote':
			case 'pubapos':
				$parsed['publicid'] = $value;
				break;
			case 'pubsysquote':
			case 'pubsysapos':
			case 'sysquote':
			case 'sysapos':
				$parsed['systemid'] = $value;
				break;
			case 'internal':
				$parsed['internal'] = $value;
				break;
		}
	}
	return $parsed;
}
508}
int $stackDepth
Current depth of the data stack.
callable null $filterCallback
validate( $reader)
getRootElement()
Get the root element.
__construct( $input, $filterCallback=null, $isFile=true, $options=[])
Allow filtering an XML file.
parseDTD( $dtd)
Parse DTD into parts.
elementData( $data)
string[] $elementData
A stack of strings containing the data of each xml element as it's processed.
bool null $wellFormed
Will be set to true or false to indicate whether the file is well-formed XML.
string $rootElement
Name of the document's root element, including any namespace as an expanded URL.
DTDHandler(XMLReader $reader)
Handle coming across a <!DOCTYPE declaration.
bool $filterMatch
Will be set to true if the optional element filter returned a match at some point.
static newFromFilename( $fname, $filterCallback=null)
Alternative constructor: from filename.
readNext(XMLReader $reader)
elementOpen( $name, $attribs)
validateFromInput( $xml, $isFile)
array $parserOptions
Additional parsing options.
mixed $filterMatchType
Will contain the type of filter hit if the optional element filter returned a match at some point.
XmlErrorHandler( $errno, $errstr)
array $elementDataContext
A stack of element names and attributes, as we process them.
expandNS( $name, $namespaceURI)
static newFromString( $string, $filterCallback=null)
Alternative constructor: from string.
processingInstructionHandler( $target, $data)
checkDTDIsSafe( $internalSubset)
Check if the internal subset of the DTD is safe.
getAttributesArray(XMLReader $r)
Get all of the attributes for an XMLReader's current node.
return true
Definition router.php:94