MediaWiki  master
XmlTypeCheck.php
Go to the documentation of this file.
1 <?php
/**
 * Reads an XML document with XMLReader, records whether it could be read
 * cleanly, and optionally hands every element, processing instruction and
 * DOCTYPE declaration to caller-supplied filter callbacks.
 *
 * Results are exposed through the public properties $wellFormed,
 * $filterMatch, $filterMatchType and through getRootElement().
 */
class XmlTypeCheck {
	/**
	 * @var bool|null Will be set to true or false to indicate whether the file
	 *  is well-formed XML. Stays null if validation never ran to completion.
	 */
	public $wellFormed = null;

	/**
	 * @var bool Will be set to true if the optional element filter (or one of
	 *  the optional PI/DTD handlers) returned a match at some point.
	 */
	public $filterMatch = false;

	/**
	 * @var mixed Will contain the type of filter hit, i.e. the truthy value a
	 *  callback returned, if a filter returned a match at some point.
	 */
	public $filterMatchType = false;

	/**
	 * @var string Name of the document's root element, including any namespace
	 *  as an expanded URL.
	 */
	public $rootElement = '';

	/**
	 * @var string[] A stack of strings containing the data of each xml element
	 *  as it's processed. Appended to in elementOpen(), popped in elementClose().
	 */
	protected $elementData = [];

	/**
	 * @var array A stack of [ element name, attributes ] pairs, as we process them.
	 */
	protected $elementDataContext = [];

	/**
	 * @var int Current depth of the data stack.
	 */
	protected $stackDepth = 0;

	/** @var callable|null Element filter, called as ( $name, $attribs, $data ) */
	protected $filterCallback;

	/**
	 * @var array Additional parsing options:
	 *  - processing_instruction_handler: callable invoked as ( $target, $data )
	 *    for every processing instruction; a truthy return counts as a filter hit.
	 *  - external_dtd_handler: callable invoked as ( $type, $publicId, $systemId )
	 *    when the DOCTYPE declares a PUBLIC or SYSTEM identifier.
	 *  - dtd_handler: callable invoked with the full DOCTYPE string.
	 *  - require_safe_dtd: when true, an internal DTD subset that does not pass
	 *    checkDTDIsSafe() marks the document as not well-formed.
	 */
	private $parserOptions = [
		'processing_instruction_handler' => null,
		'external_dtd_handler' => '',
		'dtd_handler' => '',
		'require_safe_dtd' => true
	];

	/**
	 * Allow filtering an XML file.
	 *
	 * The document is validated immediately; inspect the public properties
	 * afterwards for the outcome.
	 *
	 * @param string $input A filename or the XML text itself, depending on $isFile
	 * @param callable|null $filterCallback Called once per element, when the
	 *  element closes, with the expanded element name, its attributes and its
	 *  trimmed text content. A truthy return value is recorded as a filter hit.
	 * @param bool $isFile True if $input is a filename, false if it is XML text
	 * @param array $options Overrides merged over the default parser options
	 * @throws Exception Any exception raised while validating (for example by a
	 *  callback) is rethrown after $wellFormed has been set to false.
	 */
	public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
		$this->filterCallback = $filterCallback;
		$this->parserOptions = array_merge( $this->parserOptions, $options );
		$this->validateFromInput( $input, $isFile );
	}

	/**
	 * Alternative constructor: from filename.
	 *
	 * @param string $fname The filename of an XML document
	 * @param callable|null $filterCallback Element filter, see __construct()
	 * @return XmlTypeCheck
	 */
	public static function newFromFilename( $fname, $filterCallback = null ) {
		return new self( $fname, $filterCallback, true );
	}

	/**
	 * Alternative constructor: from string.
	 *
	 * @param string $string A string containing an XML document
	 * @param callable|null $filterCallback Element filter, see __construct()
	 * @return XmlTypeCheck
	 */
	public static function newFromString( $string, $filterCallback = null ) {
		return new self( $string, $filterCallback, false );
	}

	/**
	 * Get the root element. Simple accessor to $rootElement.
	 *
	 * @return string
	 */
	public function getRootElement() {
		return $this->rootElement;
	}

	/**
	 * Open the input with XMLReader and run the validation pass over it.
	 *
	 * @param string $xml A filename or XML text
	 * @param bool $isFile Whether $xml is a filename
	 * @throws Exception Rethrows anything thrown during validate()
	 */
	private function validateFromInput( $xml, $isFile ) {
		$reader = new XMLReader();
		$flags = LIBXML_NOERROR | LIBXML_NOWARNING;
		$opened = $isFile
			? $reader->open( $xml, null, $flags )
			: $reader->XML( $xml, null, $flags );
		if ( $opened !== true ) {
			// Couldn't open the XML
			$this->wellFormed = false;
			return;
		}

		// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
		$oldDisable = @libxml_disable_entity_loader( true );
		try {
			$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
			$this->validate( $reader );
		} catch ( Exception $e ) {
			// Calling this malformed, because we didn't parse the whole
			// thing. Maybe just an external entity reference.
			$this->wellFormed = false;
			throw $e;
		} finally {
			// Runs on success, on Exception and on any other Throwable, so the
			// reader is always released and the entity loader setting restored.
			$reader->close();
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $oldDisable );
		}
	}

	/**
	 * Advance the reader by one node. Any PHP error raised while reading is
	 * swallowed and recorded by setting $wellFormed to false.
	 *
	 * @param XMLReader $reader
	 * @return bool The result of XMLReader::read()
	 */
	private function readNext( XMLReader $reader ) {
		set_error_handler( function ( $errno, $errstr ) {
			$this->wellFormed = false;
			return true;
		} );
		try {
			return $reader->read();
		} finally {
			// Always pop our handler, even if read() throws
			restore_error_handler();
		}
	}

	/**
	 * Walk the whole document, dispatching each node to the matching handler,
	 * and settle the final value of $wellFormed.
	 *
	 * @param XMLReader $reader An opened reader positioned before the first node
	 */
	private function validate( $reader ) {
		// First, move through anything that isn't an element, and
		// handle any processing instructions with the callback
		do {
			if ( !$this->readNext( $reader ) ) {
				// Hit the end of the document before any elements
				$this->wellFormed = false;
				return;
			}
			if ( $reader->nodeType === XMLReader::PI ) {
				$this->processingInstructionHandler( $reader->name, $reader->value );
			}
			if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
				$this->dtdHandler( $reader );
			}
		} while ( $reader->nodeType !== XMLReader::ELEMENT );

		// Process the rest of the document
		do {
			switch ( $reader->nodeType ) {
				case XMLReader::ELEMENT:
					$name = $this->expandNS(
						$reader->name,
						$reader->namespaceURI
					);
					if ( $this->rootElement === '' ) {
						$this->rootElement = $name;
					}
					// Must be read before getAttributesArray(), which moves the
					// reader off the element and onto its attributes.
					$empty = $reader->isEmptyElement;
					$attrs = $this->getAttributesArray( $reader );
					$this->elementOpen( $name, $attrs );
					if ( $empty ) {
						// Self-closing elements produce no END_ELEMENT node
						$this->elementClose();
					}
					break;

				case XMLReader::END_ELEMENT:
					$this->elementClose();
					break;

				case XMLReader::WHITESPACE:
				case XMLReader::SIGNIFICANT_WHITESPACE:
				case XMLReader::CDATA:
				case XMLReader::TEXT:
					$this->elementData( $reader->value );
					break;

				case XMLReader::ENTITY_REF:
					// Unexpanded entity (maybe external?),
					// don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::COMMENT:
					// Don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::PI:
					// Processing instructions can happen after the header too
					$this->processingInstructionHandler(
						$reader->name,
						$reader->value
					);
					break;

				case XMLReader::DOC_TYPE:
					// We should never see a doctype after first
					// element.
					$this->wellFormed = false;
					break;

				default:
					// One of DOC, ENTITY, END_ENTITY,
					// NOTATION, or XML_DECLARATION
					// xml_parse didn't send these to the filter, so we won't.
			}
		} while ( $this->readNext( $reader ) );

		if ( $this->stackDepth !== 0 ) {
			// Reading stopped while elements were still open
			$this->wellFormed = false;
		} elseif ( $this->wellFormed === null ) {
			// Nothing flagged a problem along the way
			$this->wellFormed = true;
		}
	}

	/**
	 * Collect the attributes of the element the reader is positioned on.
	 * Leaves the reader positioned on the last attribute it visited.
	 *
	 * @param XMLReader $r
	 * @return string[] Attribute values keyed by expanded attribute name
	 */
	private function getAttributesArray( XMLReader $r ) {
		$attrs = [];
		while ( $r->moveToNextAttribute() ) {
			if ( $r->namespaceURI === 'http://www.w3.org/2000/xmlns/' ) {
				// XMLReader treats xmlns attributes as normal
				// attributes, while xml_parse doesn't
				continue;
			}
			$name = $this->expandNS( $r->name, $r->namespaceURI );
			$attrs[$name] = $r->value;
		}
		return $attrs;
	}

	/**
	 * Turn a possibly prefixed name into "namespaceURI:localname".
	 *
	 * @param string $name Element or attribute name, possibly "prefix:local"
	 * @param string $namespaceURI Namespace of the node; empty if it has none
	 * @return string $name unchanged when there is no namespace, otherwise the
	 *  namespace URI and the part of $name after the last colon, joined by ':'
	 */
	private function expandNS( $name, $namespaceURI ) {
		if ( $namespaceURI ) {
			$parts = explode( ':', $name );
			$localname = array_pop( $parts );
			return "$namespaceURI:$localname";
		}
		return $name;
	}

	/**
	 * Push a newly opened element onto the context and data stacks.
	 *
	 * @param string $name Expanded element name
	 * @param array $attribs Attributes as returned by getAttributesArray()
	 */
	private function elementOpen( $name, $attribs ) {
		$this->elementDataContext[] = [ $name, $attribs ];
		$this->elementData[] = '';
		$this->stackDepth++;
	}

	/**
	 * Pop the innermost open element and run the element filter on it with
	 * the text data collected while it was open.
	 */
	private function elementClose() {
		[ $name, $attribs ] = array_pop( $this->elementDataContext );
		$data = array_pop( $this->elementData );
		$this->stackDepth--;

		if ( is_callable( $this->filterCallback ) ) {
			$this->recordFilterResult(
				( $this->filterCallback )( $name, $attribs, $data )
			);
		}
	}

	/**
	 * Append trimmed character data to the innermost open element.
	 *
	 * @param string $data
	 */
	private function elementData( $data ) {
		if ( $this->stackDepth < 1 ) {
			// No element is open, so there is no slot to append to; writing to
			// index -1 would raise a warning and pollute the stack.
			return;
		}
		// Collect any data here, and we'll run the callback in elementClose
		$this->elementData[ $this->stackDepth - 1 ] .= trim( $data );
	}

	/**
	 * Pass a processing instruction to the configured handler, if any.
	 *
	 * @param string $target
	 * @param string $data
	 */
	private function processingInstructionHandler( $target, $data ) {
		$handler = $this->parserOptions['processing_instruction_handler'];
		if ( $handler ) {
			// @phan-suppress-next-line PhanTypeInvalidCallable false positive
			$this->recordFilterResult( $handler( $target, $data ) );
		}
	}

	/**
	 * Run the DTD callbacks and the safety check on the DOCTYPE declaration
	 * the reader is currently positioned on.
	 *
	 * @param XMLReader $reader Reader positioned on a DOC_TYPE node
	 */
	private function dtdHandler( XMLReader $reader ) {
		$externalCallback = $this->parserOptions['external_dtd_handler'];
		$generalCallback = $this->parserOptions['dtd_handler'];
		$checkIfSafe = $this->parserOptions['require_safe_dtd'];
		if ( !$externalCallback && !$generalCallback && !$checkIfSafe ) {
			// Nothing wants to look at the DTD
			return;
		}
		$dtd = $reader->readOuterXml();

		if ( $generalCallback ) {
			$this->recordFilterResult( $generalCallback( $dtd ) );
		}

		$parsedDTD = $this->parseDTD( $dtd );
		if ( $externalCallback && isset( $parsedDTD['type'] ) ) {
			$this->recordFilterResult( $externalCallback(
				$parsedDTD['type'],
				$parsedDTD['publicid'] ?? null,
				$parsedDTD['systemid'] ?? null
			) );
		}

		if ( $checkIfSafe && isset( $parsedDTD['internal'] ) &&
			!$this->checkDTDIsSafe( $parsedDTD['internal'] )
		) {
			$this->wellFormed = false;
		}
	}

	/**
	 * Record a filter hit if a callback returned a truthy value. A falsy
	 * value never clears a hit recorded earlier.
	 *
	 * @param mixed $callbackReturn Value returned by a filter callback
	 */
	private function recordFilterResult( $callbackReturn ) {
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
		}
	}

	/**
	 * Check whether an internal DTD subset consists only of allowed
	 * declarations. The whole subset must be a sequence of:
	 *  - <!ENTITY name "value"> declarations whose quoted value is either one
	 *    entity reference (&...; with 1-64 characters between & and ;) or at
	 *    most 255 characters containing no '%' and no '&' other than &amp; and
	 *    the escape for the surrounding quote character (&quot; or &apos;);
	 *  - comments;
	 *  - the exact declaration
	 *    <!ATTLIST svg xmlns:xlink CDATA #FIXED "http://www.w3.org/1999/xlink">.
	 *
	 * @param string $internalSubset The text between [ and ] of the DOCTYPE
	 * @return bool True if only allowed declarations were found. A regex
	 *  failure yields false, so the check fails closed.
	 */
	private function checkDTDIsSafe( $internalSubset ) {
		$res = preg_match(
			'/^(?:\s*<!ENTITY\s+\S+\s+' .
				'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
				'|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
				'|\s*<!--(?:[^-]|-[^-])*-->' .
				'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
				// Dots are escaped so only the literal xlink URL is accepted
				'"http:\/\/www\.w3\.org\/1999\/xlink">)*\s*$/',
			$internalSubset
		);

		return (bool)$res;
	}

	/**
	 * Split a DOCTYPE declaration into its parts. If the declaration cannot
	 * be parsed, $wellFormed is set to false and an empty array is returned.
	 *
	 * @param string $dtd The full DOCTYPE declaration
	 * @return string[] May contain the keys 'type' (PUBLIC or SYSTEM),
	 *  'publicid', 'systemid' and 'internal'. Parts that are absent or empty
	 *  are omitted.
	 */
	private function parseDTD( $dtd ) {
		$m = [];
		$res = preg_match(
			'/^<!DOCTYPE\s*\S+\s*' .
			'(?:(?P<typepublic>PUBLIC)\s*' .
				'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
				// The alternation must be grouped; otherwise the "|" splits the
				// whole PUBLIC branch and a single-quoted system identifier
				// after a public identifier can never match.
				'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
			'|(?P<typesystem>SYSTEM)\s*' .
				'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
			')?\s*' .
			'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
			$dtd,
			$m
		);
		if ( !$res ) {
			$this->wellFormed = false;
			return [];
		}
		$parsed = [];
		foreach ( $m as $field => $value ) {
			// Skip groups that did not take part in the match and the numeric
			// duplicates preg_match() adds for every named group
			if ( $value === '' || is_numeric( $field ) ) {
				continue;
			}
			switch ( $field ) {
				case 'typepublic':
				case 'typesystem':
					$parsed['type'] = $value;
					break;
				case 'pubquote':
				case 'pubapos':
					$parsed['publicid'] = $value;
					break;
				case 'pubsysquote':
				case 'pubsysapos':
				case 'sysquote':
				case 'sysapos':
					$parsed['systemid'] = $value;
					break;
				case 'internal':
					$parsed['internal'] = $value;
					break;
			}
		}
		return $parsed;
	}
}
int $stackDepth
Current depth of the data stack.
callable null $filterCallback
getRootElement()
Get the root element.
__construct( $input, $filterCallback=null, $isFile=true, $options=[])
Allow filtering an XML file.
string[] $elementData
A stack of strings containing the data of each xml element as it's processed.
bool null $wellFormed
Will be set to true or false to indicate whether the file is well-formed XML.
string $rootElement
Name of the document's root element, including any namespace as an expanded URL.
bool $filterMatch
Will be set to true if the optional element filter returned a match at some point.
static newFromFilename( $fname, $filterCallback=null)
Alternative constructor: from filename.
mixed $filterMatchType
Will contain the type of filter hit if the optional element filter returned a match at some point.
array $elementDataContext
A stack of element names and attributes, as we process them.
static newFromString( $string, $filterCallback=null)
Alternative constructor: from string.
return true
Definition: router.php:90
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition: router.php:42