MediaWiki  master
XmlTypeCheck.php
Go to the documentation of this file.
1 <?php
/**
 * Streams an XML document through XMLReader, recording whether it parsed
 * cleanly and whether any caller-supplied filter callback reported a match.
 *
 * Parsing happens in the constructor; results are exposed through the public
 * properties $wellFormed, $filterMatch, $filterMatchType and $rootElement.
 */
class XmlTypeCheck {
	/**
	 * @var bool|null Null until parsing has reached a verdict, then true or
	 *  false depending on whether the input was accepted as well-formed.
	 */
	public $wellFormed = null;

	/**
	 * @var bool Set to true once any filter callback (element, processing
	 *  instruction or DTD) returned a truthy value.
	 */
	public $filterMatch = false;

	/**
	 * @var mixed The truthy value returned by the most recent matching filter
	 *  callback, or false if no callback has matched.
	 */
	public $filterMatchType = false;

	/**
	 * @var string Name of the first element encountered, with any namespace
	 *  expanded to "namespaceURI:localname". Empty until an element is seen.
	 */
	public $rootElement = '';

	/**
	 * @var string[] Stack of accumulated (trimmed) character data, one entry
	 *  per currently open element.
	 */
	protected $elementData = [];

	/**
	 * @var array[] Stack of [ name, attributes ] pairs, one entry per
	 *  currently open element.
	 */
	protected $elementDataContext = [];

	/**
	 * @var int Number of currently open elements.
	 */
	protected $stackDepth = 0;

	/** @var callable|null Element filter, invoked whenever an element is closed */
	protected $filterCallback;

	/**
	 * @var array Parser options; see __construct() for the meaning of each key.
	 */
	private $parserOptions = [
		'processing_instruction_handler' => null,
		'external_dtd_handler' => '',
		'dtd_handler' => '',
		'require_safe_dtd' => true
	];

	/**
	 * Parse the given input immediately and record the results on this object.
	 *
	 * @param string $input Filename (when $isFile is true) or the XML text
	 *  itself (when $isFile is false).
	 * @param callable|null $filterCallback Called as
	 *  ( $elementName, $attributes, $characterData ) each time an element is
	 *  closed. A truthy return value is recorded as a filter match.
	 * @param bool $isFile Whether $input is a filename rather than XML text.
	 * @param array $options Overrides for the parser options:
	 *  - processing_instruction_handler: callable ( $target, $data ); a truthy
	 *    return value is recorded as a filter match.
	 *  - external_dtd_handler: callable ( $type, $publicId, $systemId ), called
	 *    when the doctype declares a PUBLIC or SYSTEM identifier; a truthy
	 *    return value is recorded as a filter match.
	 *  - dtd_handler: callable ( $dtd ) receiving the whole doctype
	 *    declaration as text; a truthy return value is recorded as a match.
	 *  - require_safe_dtd: when truthy, an internal DTD subset that fails
	 *    checkDTDIsSafe() marks the document as not well-formed.
	 */
	public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
		$this->filterCallback = $filterCallback;
		$this->parserOptions = array_merge( $this->parserOptions, $options );
		$this->validateFromInput( $input, $isFile );
	}

	/**
	 * Alternative constructor: check the XML stored in a file.
	 * Parser options keep their defaults.
	 *
	 * @param string $fname Path of the file to read
	 * @param callable|null $filterCallback Element filter, see __construct()
	 * @return XmlTypeCheck
	 */
	public static function newFromFilename( $fname, $filterCallback = null ) {
		return new self( $fname, $filterCallback, true );
	}

	/**
	 * Alternative constructor: check XML held in a string.
	 * Parser options keep their defaults.
	 *
	 * @param string $string The XML text
	 * @param callable|null $filterCallback Element filter, see __construct()
	 * @return XmlTypeCheck
	 */
	public static function newFromString( $string, $filterCallback = null ) {
		return new self( $string, $filterCallback, false );
	}

	/**
	 * Get the name of the root element, namespace-expanded.
	 *
	 * @return string Empty string if no element was encountered
	 */
	public function getRootElement() {
		return $this->rootElement;
	}

	/**
	 * Open the input with XMLReader and run the validation pass over it.
	 *
	 * @param string $xml Filename or XML text, depending on $isFile
	 * @param bool $isFile Whether $xml is a filename
	 * @throws Exception Rethrows anything raised during validation (for
	 *  example by a filter callback) after marking the input as malformed.
	 */
	private function validateFromInput( $xml, $isFile ) {
		$reader = new XMLReader();
		if ( $isFile ) {
			$s = $reader->open( $xml, null, LIBXML_NOERROR | LIBXML_NOWARNING );
		} else {
			$s = $reader->XML( $xml, null, LIBXML_NOERROR | LIBXML_NOWARNING );
		}
		if ( $s !== true ) {
			// Couldn't open the XML
			$this->wellFormed = false;
			return;
		}

		// Entities are substituted below, so keep the external entity loader
		// switched off for the duration of the parse and restore the previous
		// setting afterwards.
		$oldDisable = libxml_disable_entity_loader( true );
		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
		try {
			$this->validate( $reader );
		} catch ( Exception $e ) {
			// Calling this malformed, because we didn't parse the whole
			// thing. Maybe just an external entity reference.
			$this->wellFormed = false;
			throw $e;
		} finally {
			// Runs on success, on Exception and on Error alike, so the reader
			// and the global entity loader state are never left dangling.
			$reader->close();
			libxml_disable_entity_loader( $oldDisable );
		}
	}

	/**
	 * Advance the reader by one node, treating any PHP error raised while
	 * reading as a sign that the document is not well-formed.
	 *
	 * @param XMLReader $reader
	 * @return bool Result of XMLReader::read(); false once no node is left
	 */
	private function readNext( XMLReader $reader ) {
		set_error_handler( function ( $errno, $errstr ) {
			$this->wellFormed = false;
		} );
		try {
			return $reader->read();
		} finally {
			// Always put the previous handler back, even if read() throws
			restore_error_handler();
		}
	}

	/**
	 * Walk the whole document, feeding elements, character data, processing
	 * instructions and the doctype to the respective handlers, then settle
	 * the final value of $wellFormed.
	 *
	 * @param XMLReader $reader Reader positioned before the first node
	 */
	private function validate( XMLReader $reader ) {
		// First, move through anything that isn't an element, and
		// handle any processing instructions with the callback
		do {
			if ( !$this->readNext( $reader ) ) {
				// Hit the end of the document before any elements
				$this->wellFormed = false;
				return;
			}
			if ( $reader->nodeType === XMLReader::PI ) {
				$this->processingInstructionHandler( $reader->name, $reader->value );
			}
			if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
				$this->dtdHandler( $reader );
			}
		} while ( $reader->nodeType !== XMLReader::ELEMENT );

		// Process the rest of the document
		do {
			switch ( $reader->nodeType ) {
				case XMLReader::ELEMENT:
					$name = $this->expandNS(
						$reader->name,
						$reader->namespaceURI
					);
					if ( $this->rootElement === '' ) {
						$this->rootElement = $name;
					}
					// Must be read before iterating the attributes, which
					// moves the reader off the element node
					$empty = $reader->isEmptyElement;
					$attrs = $this->getAttributesArray( $reader );
					$this->elementOpen( $name, $attrs );
					if ( $empty ) {
						// Self-closing elements produce no END_ELEMENT node
						$this->elementClose();
					}
					break;

				case XMLReader::END_ELEMENT:
					$this->elementClose();
					break;

				case XMLReader::WHITESPACE:
				case XMLReader::SIGNIFICANT_WHITESPACE:
				case XMLReader::CDATA:
				case XMLReader::TEXT:
					$this->elementData( $reader->value );
					break;

				case XMLReader::ENTITY_REF:
					// Unexpanded entity (maybe external?),
					// don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::COMMENT:
					// Don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::PI:
					// Processing instructions can happen after the header too
					$this->processingInstructionHandler(
						$reader->name,
						$reader->value
					);
					break;

				case XMLReader::DOC_TYPE:
					// We should never see a doctype after first
					// element.
					$this->wellFormed = false;
					break;

				default:
					// One of DOC, ENTITY, END_ENTITY,
					// NOTATION, or XML_DECLARATION
					// xml_parse didn't send these to the filter, so we won't.
			}
		} while ( $this->readNext( $reader ) );

		if ( $this->stackDepth !== 0 ) {
			// Some element was opened but never closed
			$this->wellFormed = false;
		} elseif ( $this->wellFormed === null ) {
			// Nothing along the way flagged a problem
			$this->wellFormed = true;
		}
	}

	/**
	 * Collect the attributes of the reader's current element, skipping
	 * namespace declarations.
	 *
	 * @param XMLReader $r Reader positioned on an element node
	 * @return array Map of namespace-expanded attribute name to value
	 */
	private function getAttributesArray( XMLReader $r ) {
		$attrs = [];
		while ( $r->moveToNextAttribute() ) {
			if ( $r->namespaceURI === 'http://www.w3.org/2000/xmlns/' ) {
				// XMLReader treats xmlns attributes as normal
				// attributes, while xml_parse doesn't
				continue;
			}
			$name = $this->expandNS( $r->name, $r->namespaceURI );
			$attrs[$name] = $r->value;
		}
		return $attrs;
	}

	/**
	 * Replace a name's prefix with its namespace URI.
	 *
	 * @param string $name Possibly prefixed name, e.g. "svg:rect"
	 * @param string $namespaceURI Namespace of the node; may be empty
	 * @return string "namespaceURI:localname" when a namespace is given,
	 *  otherwise $name unchanged
	 */
	private function expandNS( $name, $namespaceURI ) {
		if ( $namespaceURI ) {
			$parts = explode( ':', $name );
			$localname = array_pop( $parts );
			return "$namespaceURI:$localname";
		}
		return $name;
	}

	/**
	 * Push a newly opened element onto the stacks.
	 *
	 * @param string $name Namespace-expanded element name
	 * @param array $attribs Namespace-expanded attributes
	 */
	private function elementOpen( $name, $attribs ) {
		$this->elementDataContext[] = [ $name, $attribs ];
		$this->elementData[] = '';
		$this->stackDepth++;
	}

	/**
	 * Pop the innermost open element and hand it, together with the
	 * character data collected for it, to the element filter callback.
	 */
	private function elementClose() {
		list( $name, $attribs ) = array_pop( $this->elementDataContext );
		$data = array_pop( $this->elementData );
		$this->stackDepth--;

		if ( is_callable( $this->filterCallback ) ) {
			$this->recordFilterResult(
				( $this->filterCallback )( $name, $attribs, $data )
			);
		}
	}

	/**
	 * Append trimmed character data to the innermost open element.
	 *
	 * @param string $data
	 */
	private function elementData( $data ) {
		if ( $this->stackDepth < 1 ) {
			// No open element to attach the data to; writing to index -1
			// would only raise a notice and leave a stray stack entry.
			return;
		}
		// Collect any data here, and we'll run the callback in elementClose
		$this->elementData[ $this->stackDepth - 1 ] .= trim( $data );
	}

	/**
	 * Pass a processing instruction to the configured handler, if any.
	 *
	 * @param string $target Processing instruction target
	 * @param string $data Processing instruction content
	 */
	private function processingInstructionHandler( $target, $data ) {
		$handler = $this->parserOptions['processing_instruction_handler'];
		if ( $handler ) {
			$this->recordFilterResult( $handler( $target, $data ) );
		}
	}

	/**
	 * Handle a doctype declaration: run the configured DTD callbacks and,
	 * if required, verify that the internal subset is harmless.
	 *
	 * @param XMLReader $reader Reader positioned on the DOC_TYPE node
	 */
	private function dtdHandler( XMLReader $reader ) {
		$externalCallback = $this->parserOptions['external_dtd_handler'];
		$generalCallback = $this->parserOptions['dtd_handler'];
		$checkIfSafe = $this->parserOptions['require_safe_dtd'];
		if ( !$externalCallback && !$generalCallback && !$checkIfSafe ) {
			// Nobody is interested in the doctype
			return;
		}
		$dtd = $reader->readOuterXml();

		if ( $generalCallback ) {
			$this->recordFilterResult( $generalCallback( $dtd ) );
		}

		$parsedDTD = $this->parseDTD( $dtd );
		if ( $externalCallback && isset( $parsedDTD['type'] ) ) {
			$this->recordFilterResult( $externalCallback(
				$parsedDTD['type'],
				$parsedDTD['publicid'] ?? null,
				$parsedDTD['systemid'] ?? null
			) );
		}

		if ( $checkIfSafe && isset( $parsedDTD['internal'] ) &&
			!$this->checkDTDIsSafe( $parsedDTD['internal'] )
		) {
			$this->wellFormed = false;
		}
	}

	/**
	 * Check whether the internal subset of a DTD consists solely of
	 * constructs considered harmless. Per the pattern below these are:
	 *  - <!ENTITY> declarations whose quoted value is either a single entity
	 *    reference (name of 1 to 64 characters) or at most 255 items that are
	 *    plain characters (no "%" or "&") or one of the escapes &amp; and
	 *    &quot; (double-quoted value) / &apos; (single-quoted value);
	 *  - comments;
	 *  - the fixed xmlns:xlink <!ATTLIST> declaration for svg.
	 *
	 * @param string $internalSubset Text between "[" and "]" of the doctype
	 * @return bool True if nothing else appears in the subset
	 */
	private function checkDTDIsSafe( $internalSubset ) {
		$res = preg_match(
			'/^(?:\s*<!ENTITY\s+\S+\s+' .
				'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
				'|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
				'|\s*<!--(?:[^-]|-[^-])*-->' .
				'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
				'"http:\/\/www.w3.org\/1999\/xlink">)*\s*$/',
			$internalSubset
		);

		return (bool)$res;
	}

	/**
	 * Split a doctype declaration into its parts. Marks the document as not
	 * well-formed when the declaration cannot be parsed.
	 *
	 * @param string $dtd The complete "<!DOCTYPE ...>" declaration
	 * @return array Map with any of the keys 'type' ("PUBLIC" or "SYSTEM"),
	 *  'publicid', 'systemid' and 'internal'; keys whose part is absent or
	 *  empty are left out. Empty array if parsing failed.
	 */
	private function parseDTD( $dtd ) {
		$m = [];
		$res = preg_match(
			'/^<!DOCTYPE\s*\S+\s*' .
			'(?:(?P<typepublic>PUBLIC)\s*' .
				'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
				// System identifier. The alternation has to be grouped,
				// otherwise the single-quoted form becomes an alternative to
				// the entire PUBLIC clause.
				'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' .
			'|(?P<typesystem>SYSTEM)\s*' .
				'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
			')?\s*' .
			'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
			$dtd,
			$m
		);
		if ( !$res ) {
			$this->wellFormed = false;
			return [];
		}

		// Which result key each named capture group feeds into
		$fieldMap = [
			'typepublic' => 'type',
			'typesystem' => 'type',
			'pubquote' => 'publicid',
			'pubapos' => 'publicid',
			'pubsysquote' => 'systemid',
			'pubsysapos' => 'systemid',
			'sysquote' => 'systemid',
			'sysapos' => 'systemid',
			'internal' => 'internal',
		];
		$parsed = [];
		foreach ( $m as $field => $value ) {
			// Groups that did not take part in the match show up as '';
			// numeric keys merely duplicate the named groups.
			if ( $value === '' || !isset( $fieldMap[$field] ) ) {
				continue;
			}
			$parsed[ $fieldMap[$field] ] = $value;
		}
		return $parsed;
	}

	/**
	 * Record the outcome of a filter callback: any truthy value counts as a
	 * hit and is remembered as the match type.
	 *
	 * @param mixed $callbackReturn Value returned by the callback
	 */
	private function recordFilterResult( $callbackReturn ) {
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
		}
	}
}
XmlTypeCheck\$elementData
string[] $elementData
A stack of strings containing the data of each xml element as it's processed.
Definition: XmlTypeCheck.php:59
XmlTypeCheck\parseDTD
parseDTD( $dtd)
Parse DTD into parts.
Definition: XmlTypeCheck.php:453
XmlTypeCheck\$elementDataContext
array $elementDataContext
A stack of element names and attributes, as we process them.
Definition: XmlTypeCheck.php:64
XmlTypeCheck\$rootElement
string $rootElement
Name of the document's root element, including any namespace as an expanded URL.
Definition: XmlTypeCheck.php:52
XmlTypeCheck\dtdHandler
dtdHandler(XMLReader $reader)
Handle coming across a <!DOCTYPE declaration.
Definition: XmlTypeCheck.php:370
true
return true
Definition: router.php:90
XmlTypeCheck\newFromString
static newFromString( $string, $filterCallback=null)
Alternative constructor: from string.
Definition: XmlTypeCheck.php:142
$file
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition: router.php:42
XmlTypeCheck\newFromFilename
static newFromFilename( $fname, $filterCallback=null)
Alternative constructor: from filename.
Definition: XmlTypeCheck.php:127
$s
$s
Definition: mergeMessageFileList.php:185
$res
$res
Definition: testCompression.php:57
XmlTypeCheck\expandNS
expandNS( $name, $namespaceURI)
Definition: XmlTypeCheck.php:302
XmlTypeCheck\getAttributesArray
getAttributesArray(XMLReader $r)
Get all of the attributes for an XMLReader's current node.
Definition: XmlTypeCheck.php:283
XmlTypeCheck\$wellFormed
bool|null $wellFormed
Will be set to true or false to indicate whether the file is well-formed XML.
Definition: XmlTypeCheck.php:33
XmlTypeCheck\$stackDepth
int $stackDepth
Current depth of the data stack.
Definition: XmlTypeCheck.php:69
XmlTypeCheck\__construct
__construct( $input, $filterCallback=null, $isFile=true, $options=[])
Allow filtering an XML file.
Definition: XmlTypeCheck.php:110
XmlTypeCheck\$parserOptions
array $parserOptions
Additional parsing options.
Definition: XmlTypeCheck.php:77
XmlTypeCheck\readNext
readNext(XMLReader $reader)
Definition: XmlTypeCheck.php:187
XmlTypeCheck\elementData
elementData( $data)
Definition: XmlTypeCheck.php:340
$line
$line
Definition: mcc.php:119
XmlTypeCheck\$filterMatch
bool $filterMatch
Will be set to true if the optional element filter returned a match at some point.
Definition: XmlTypeCheck.php:39
XmlTypeCheck\getRootElement
getRootElement()
Get the root element.
Definition: XmlTypeCheck.php:151
XmlTypeCheck
Definition: XmlTypeCheck.php:28
XmlTypeCheck\elementOpen
elementOpen( $name, $attribs)
Definition: XmlTypeCheck.php:315
XmlTypeCheck\processingInstructionHandler
processingInstructionHandler( $target, $data)
Definition: XmlTypeCheck.php:349
XmlTypeCheck\validateFromInput
validateFromInput( $xml, $isFile)
Definition: XmlTypeCheck.php:159
XmlTypeCheck\validate
validate( $reader)
Definition: XmlTypeCheck.php:196
XmlTypeCheck\elementClose
elementClose()
Definition: XmlTypeCheck.php:321
XmlTypeCheck\$filterMatchType
mixed $filterMatchType
Will contain the type of filter hit if the optional element filter returned a match at some point.
Definition: XmlTypeCheck.php:46
XmlTypeCheck\checkDTDIsSafe
checkDTDIsSafe( $internalSubset)
Check if the internal subset of the DTD is safe.
Definition: XmlTypeCheck.php:431
XmlTypeCheck\$filterCallback
callable|null $filterCallback
Definition: XmlTypeCheck.php:72