MediaWiki  master
XmlTypeCheck.php
Go to the documentation of this file.
1 <?php
/**
 * Streams an XML document through XMLReader, records whether it parsed
 * cleanly, and lets optional callbacks inspect every element, processing
 * instruction and DOCTYPE declaration.
 *
 * A callback signals a "filter hit" by returning a truthy value; the hit is
 * recorded in $filterMatch / $filterMatchType and parsing continues.
 */
class XmlTypeCheck {
	/**
	 * @var bool|null True or false once the input has been checked; null
	 *  until then.
	 */
	public $wellFormed = null;

	/**
	 * @var bool Becomes true as soon as any callback reports a hit.
	 */
	public $filterMatch = false;

	/**
	 * @var mixed The truthy value returned by the most recent callback that
	 *  reported a hit; false if there was none.
	 */
	public $filterMatchType = false;

	/**
	 * @var string Name of the first element encountered, with the namespace
	 *  prefix replaced by the namespace URI (see expandNS()).
	 */
	public $rootElement = '';

	/**
	 * @var string[] Stack of accumulated (trimmed) character data, one entry
	 *  per currently open element.
	 */
	protected $elementData = [];

	/**
	 * @var array[] Stack of [ name, attributes ] pairs, one entry per
	 *  currently open element.
	 */
	protected $elementDataContext = [];

	/**
	 * @var int Number of currently open elements.
	 */
	protected $stackDepth = 0;

	/** @var callable|null Called as ( $name, $attribs, $data ) on element close. */
	protected $filterCallback;

	/**
	 * @var array Parsing options; the defaults below are merged with the
	 *  $options constructor argument.
	 */
	private $parserOptions = [
		'processing_instruction_handler' => null,
		'external_dtd_handler' => '',
		'dtd_handler' => '',
		'require_safe_dtd' => true
	];

	/**
	 * Check the given input immediately; results are available through the
	 * public properties afterwards.
	 *
	 * @param string $input Filename, or the XML itself when $isFile is false
	 * @param callable|null $filterCallback Called for every closed element
	 *  with ( $name, $attribs, $data ); a truthy return value is a filter hit
	 * @param bool $isFile Whether $input is a filename (default) or XML text
	 * @param array $options Overrides for the parser options:
	 *  - processing_instruction_handler: callable ( $target, $data )
	 *  - external_dtd_handler: callable ( $type, $publicId, $systemId )
	 *  - dtd_handler: callable ( $dtd ), given the whole DOCTYPE declaration
	 *  - require_safe_dtd: bool, reject internal subsets failing checkDTDIsSafe()
	 */
	public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
		$this->filterCallback = $filterCallback;
		$this->parserOptions = array_merge( $this->parserOptions, $options );
		$this->validateFromInput( $input, $isFile );
	}

	/**
	 * Alternative constructor: check the file at $fname.
	 *
	 * @param string $fname
	 * @param callable|null $filterCallback
	 * @return XmlTypeCheck
	 */
	public static function newFromFilename( $fname, $filterCallback = null ) {
		return new self( $fname, $filterCallback, true );
	}

	/**
	 * Alternative constructor: check the XML contained in $string.
	 *
	 * @param string $string
	 * @param callable|null $filterCallback
	 * @return XmlTypeCheck
	 */
	public static function newFromString( $string, $filterCallback = null ) {
		return new self( $string, $filterCallback, false );
	}

	/**
	 * @return string The expanded name of the root element ('' if none was seen)
	 */
	public function getRootElement() {
		return $this->rootElement;
	}

	/**
	 * Open the input with XMLReader and run the validation pass.
	 *
	 * @param string $xml Filename or XML text
	 * @param bool $isFile Whether $xml is a filename
	 * @throws Exception Anything thrown during validation (e.g. by a
	 *  callback) is rethrown after $wellFormed has been set to false
	 */
	private function validateFromInput( $xml, $isFile ) {
		$reader = new XMLReader();
		$flags = LIBXML_NOERROR | LIBXML_NOWARNING;
		$opened = $isFile
			? $reader->open( $xml, null, $flags )
			: $reader->XML( $xml, null, $flags );

		if ( $opened !== true ) {
			// Couldn't open the XML
			$this->wellFormed = false;
			return;
		}

		// Entities are substituted below, so keep the external entity loader
		// switched off for the duration of the parse.
		$oldDisable = libxml_disable_entity_loader( true );
		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
		try {
			$this->validate( $reader );
		} catch ( Exception $e ) {
			// Calling this malformed, because we didn't parse the whole
			// thing. Maybe just an external entity reference.
			$this->wellFormed = false;
			throw $e;
		} finally {
			// Always release the reader and restore the process-wide entity
			// loader setting, whatever interrupted the parse.
			$reader->close();
			libxml_disable_entity_loader( $oldDisable );
		}
	}

	/**
	 * Advance the reader by one node, routing any PHP error raised by the
	 * read to XmlErrorHandler().
	 *
	 * @param XMLReader $reader
	 * @return bool Result of XMLReader::read()
	 */
	private function readNext( XMLReader $reader ) {
		set_error_handler( [ $this, 'XmlErrorHandler' ] );
		try {
			return $reader->read();
		} finally {
			restore_error_handler();
		}
	}

	/**
	 * Error handler installed around XMLReader::read(): any error raised
	 * there marks the document as not well-formed.
	 *
	 * @param int $errno
	 * @param string $errstr
	 */
	public function XmlErrorHandler( $errno, $errstr ) {
		$this->wellFormed = false;
	}

	/**
	 * Walk the whole document, dispatching nodes to the element, PI and
	 * DOCTYPE handlers, and settle the final value of $wellFormed.
	 *
	 * @param XMLReader $reader
	 */
	private function validate( XMLReader $reader ) {
		// First, move through anything that isn't an element, and
		// handle any processing instructions with the callback
		do {
			if ( !$this->readNext( $reader ) ) {
				// Hit the end of the document before any elements
				$this->wellFormed = false;
				return;
			}
			if ( $reader->nodeType === XMLReader::PI ) {
				$this->processingInstructionHandler( $reader->name, $reader->value );
			}
			if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
				$this->DTDHandler( $reader );
			}
		} while ( $reader->nodeType !== XMLReader::ELEMENT );

		// Process the rest of the document
		do {
			switch ( $reader->nodeType ) {
				case XMLReader::ELEMENT:
					$name = $this->expandNS(
						$reader->name,
						$reader->namespaceURI
					);
					if ( $this->rootElement === '' ) {
						$this->rootElement = $name;
					}
					// Must be read before getAttributesArray() moves the
					// reader onto the attribute nodes.
					$empty = $reader->isEmptyElement;
					$attrs = $this->getAttributesArray( $reader );
					$this->elementOpen( $name, $attrs );
					if ( $empty ) {
						// Self-closing elements produce no END_ELEMENT node
						$this->elementClose();
					}
					break;

				case XMLReader::END_ELEMENT:
					$this->elementClose();
					break;

				case XMLReader::WHITESPACE:
				case XMLReader::SIGNIFICANT_WHITESPACE:
				case XMLReader::CDATA:
				case XMLReader::TEXT:
					$this->elementData( $reader->value );
					break;

				case XMLReader::ENTITY_REF:
					// Unexpanded entity (maybe external?),
					// don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::COMMENT:
					// Don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::PI:
					// Processing instructions can happen after the header too
					$this->processingInstructionHandler(
						$reader->name,
						$reader->value
					);
					break;

				case XMLReader::DOC_TYPE:
					// We should never see a doctype after first
					// element.
					$this->wellFormed = false;
					break;

				default:
					// One of DOC, ENTITY, END_ENTITY,
					// NOTATION, or XML_DECLARATION
					// xml_parse didn't send these to the filter, so we won't.
			}
		} while ( $this->readNext( $reader ) );

		if ( $this->stackDepth !== 0 ) {
			// Reading stopped while elements were still open
			$this->wellFormed = false;
		} elseif ( $this->wellFormed === null ) {
			// Nothing flagged a problem along the way
			$this->wellFormed = true;
		}
	}

	/**
	 * Collect the attributes of the reader's current element, keyed by
	 * expanded name. Leaves the reader positioned on the last attribute.
	 *
	 * @param XMLReader $r
	 * @return array Map of expanded attribute name => value
	 */
	private function getAttributesArray( XMLReader $r ) {
		$attrs = [];
		while ( $r->moveToNextAttribute() ) {
			if ( $r->namespaceURI === 'http://www.w3.org/2000/xmlns/' ) {
				// XMLReader treats xmlns attributes as normal
				// attributes, while xml_parse doesn't
				continue;
			}
			$name = $this->expandNS( $r->name, $r->namespaceURI );
			$attrs[$name] = $r->value;
		}
		return $attrs;
	}

	/**
	 * Replace the prefix of a qualified name with its namespace URI, giving
	 * "<namespaceURI>:<localname>". Names without a namespace are returned
	 * unchanged.
	 *
	 * @param string $name Qualified name as reported by XMLReader
	 * @param string $namespaceURI
	 * @return string
	 */
	private function expandNS( $name, $namespaceURI ) {
		if ( $namespaceURI ) {
			$parts = explode( ':', $name );
			$localname = array_pop( $parts );
			return "$namespaceURI:$localname";
		}
		return $name;
	}

	/**
	 * Push a newly opened element onto the context and data stacks.
	 *
	 * @param string $name Expanded element name
	 * @param array $attribs Expanded attribute name => value
	 */
	private function elementOpen( $name, $attribs ) {
		$this->elementDataContext[] = [ $name, $attribs ];
		$this->elementData[] = '';
		$this->stackDepth++;
	}

	/**
	 * Pop the innermost open element and hand its name, attributes and
	 * collected text to the filter callback, recording a hit if it returns
	 * a truthy value.
	 */
	private function elementClose() {
		list( $name, $attribs ) = array_pop( $this->elementDataContext );
		$data = array_pop( $this->elementData );
		$this->stackDepth--;
		$callbackReturn = false;

		if ( is_callable( $this->filterCallback ) ) {
			$callbackReturn = call_user_func(
				$this->filterCallback,
				$name,
				$attribs,
				$data
			);
		}
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
		}
	}

	/**
	 * Append trimmed character data to the innermost open element.
	 *
	 * @param string $data
	 */
	private function elementData( $data ) {
		// Collect any data here, and we'll run the callback in elementClose
		$this->elementData[ $this->stackDepth - 1 ] .= trim( $data );
	}

	/**
	 * Pass a processing instruction to the configured handler, recording a
	 * hit if it returns a truthy value.
	 *
	 * @param string $target
	 * @param string $data
	 */
	private function processingInstructionHandler( $target, $data ) {
		$callbackReturn = false;
		if ( $this->parserOptions['processing_instruction_handler'] ) {
			$callbackReturn = call_user_func(
				$this->parserOptions['processing_instruction_handler'],
				$target,
				$data
			);
		}
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
		}
	}

	/**
	 * Handle a DOCTYPE declaration: run the general and external DTD
	 * callbacks and, when required, reject an unsafe internal subset.
	 *
	 * @param XMLReader $reader Positioned on the DOC_TYPE node
	 */
	private function DTDHandler( XMLReader $reader ) {
		$externalCallback = $this->parserOptions['external_dtd_handler'];
		$generalCallback = $this->parserOptions['dtd_handler'];
		$checkIfSafe = $this->parserOptions['require_safe_dtd'];
		if ( !$externalCallback && !$generalCallback && !$checkIfSafe ) {
			// Nobody is interested in the DTD
			return;
		}
		$dtd = $reader->readOuterXml();
		$callbackReturn = false;

		if ( $generalCallback ) {
			$callbackReturn = call_user_func( $generalCallback, $dtd );
		}
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
			$callbackReturn = false;
		}

		$parsedDTD = $this->parseDTD( $dtd );
		if ( $externalCallback && isset( $parsedDTD['type'] ) ) {
			$callbackReturn = call_user_func(
				$externalCallback,
				$parsedDTD['type'],
				$parsedDTD['publicid'] ?? null,
				$parsedDTD['systemid'] ?? null
			);
		}
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
			$callbackReturn = false;
		}

		if ( $checkIfSafe && isset( $parsedDTD['internal'] ) &&
			!$this->checkDTDIsSafe( $parsedDTD['internal'] )
		) {
			$this->wellFormed = false;
		}
	}

	/**
	 * Check whether an internal DTD subset consists only of constructs on
	 * the allow-list encoded in the pattern below: simple <!ENTITY>
	 * declarations whose quoted value is either a single entity reference
	 * or at most 255 characters without '%' (and with '&' only as part of
	 * the listed escapes), comments, and the fixed xmlns:xlink ATTLIST
	 * declaration for svg.
	 *
	 * @param string $internalSubset Text between the DOCTYPE's [ and ]
	 * @return bool True if the whole subset matches the allow-list
	 */
	private function checkDTDIsSafe( $internalSubset ) {
		$res = preg_match(
			'/^(?:\s*<!ENTITY\s+\S+\s+' .
				'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
				'|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
				'|\s*<!--(?:[^-]|-[^-])*-->' .
				'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
				'"http:\/\/www.w3.org\/1999\/xlink">)*\s*$/',
			$internalSubset
		);

		return (bool)$res;
	}

	/**
	 * Split a serialized DOCTYPE declaration into its parts. Sets
	 * $wellFormed to false if the declaration cannot be parsed.
	 *
	 * @param string $dtd The complete <!DOCTYPE ...> declaration
	 * @return array Subset of the keys 'type' (PUBLIC or SYSTEM),
	 *  'publicid', 'systemid' and 'internal'; parts that are absent or
	 *  empty are omitted. Empty array if the declaration did not parse.
	 */
	private function parseDTD( $dtd ) {
		$m = [];
		$res = preg_match(
			'/^<!DOCTYPE\s*\S+\s*' .
			'(?:(?P<typepublic>PUBLIC)\s*' .
				'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
				// system identifier; the group keeps the quote alternation
				// from splitting the enclosing PUBLIC/SYSTEM alternation
				'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' .
			'|(?P<typesystem>SYSTEM)\s*' .
				'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
			')?\s*' .
			'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
			$dtd,
			$m
		);
		if ( !$res ) {
			$this->wellFormed = false;
			return [];
		}
		$parsed = [];
		foreach ( $m as $field => $value ) {
			// Skip groups that did not participate and the numeric
			// duplicates preg_match() adds for named groups.
			if ( $value === '' || is_numeric( $field ) ) {
				continue;
			}
			switch ( $field ) {
				case 'typepublic':
				case 'typesystem':
					$parsed['type'] = $value;
					break;
				case 'pubquote':
				case 'pubapos':
					$parsed['publicid'] = $value;
					break;
				case 'pubsysquote':
				case 'pubsysapos':
				case 'sysquote':
				case 'sysapos':
					$parsed['systemid'] = $value;
					break;
				case 'internal':
					$parsed['internal'] = $value;
					break;
			}
		}
		return $parsed;
	}
}
processingInstructionHandler( $target, $data)
readNext(XMLReader $reader)
bool null $wellFormed
Will be set to true or false to indicate whether the file is well-formed XML.
validate( $reader)
elementData( $data)
validateFromInput( $xml, $isFile)
array __construct( $input, $filterCallback=null, $isFile=true, $options=[])
Allow filtering an XML file.
getRootElement()
Get the root element.
checkDTDIsSafe( $internalSubset)
Check if the internal subset of the DTD is safe.
static newFromFilename( $fname, $filterCallback=null)
Alternative constructor: from filename.
elementOpen( $name, $attribs)
static newFromString( $string, $filterCallback=null)
Alternative constructor: from string.
int $stackDepth
Current depth of the data stack.
bool $filterMatch
Will be set to true if the optional element filter returned a match at some point.
mixed $filterMatchType
Will contain the type of filter hit if the optional element filter returned a match at some point...
parseDTD( $dtd)
Parse DTD into parts.
string [] $elementData
A stack of strings containing the data of each xml element as it's processed.
callable null $filterCallback
XmlErrorHandler( $errno, $errstr)
getAttributesArray(XMLReader $r)
Get all of the attributes for an XMLReader's current node.
expandNS( $name, $namespaceURI)
array $elementDataContext
A stack of element names and attributes, as we process them.
DTDHandler(XMLReader $reader)
Handle coming across a <!DOCTYPE declaration.
string $rootElement
Name of the document's root element, including any namespace as an expanded URL.
return true
Definition: router.php:92
array $parserOptions
Additional parsing options.