MediaWiki  master
XmlTypeCheck.php
Go to the documentation of this file.
1 <?php
/**
 * Allow filtering an XML file.
 *
 * The input is streamed through XMLReader. Every element is reported to an
 * optional filter callback once it closes, processing instructions and the
 * DOCTYPE declaration are reported to optional handlers, and the outcome is
 * exposed through the public $wellFormed, $filterMatch, $filterMatchType and
 * $rootElement properties.
 */
class XmlTypeCheck {
	/**
	 * Will be set to true or false to indicate whether the file is
	 * well-formed XML. Remains null until a verdict has been reached.
	 * @var bool|null
	 */
	public $wellFormed = null;

	/**
	 * Will be set to true if the optional element filter returned
	 * a match at some point.
	 * @var bool
	 */
	public $filterMatch = false;

	/**
	 * Will contain the type of filter hit if the optional element filter
	 * returned a match at some point (the truthy value the callback returned).
	 * @var mixed
	 */
	public $filterMatchType = false;

	/**
	 * Name of the document's root element, including any namespace
	 * as an expanded URL.
	 * @var string
	 */
	public $rootElement = '';

	/**
	 * A stack of strings containing the data of each xml element as
	 * it's processed.
	 * @var string[]
	 */
	protected $elementData = [];

	/**
	 * A stack of element names and attributes, as we process them.
	 * Each entry is a [ name, attributes ] pair.
	 * @var array[]
	 */
	protected $elementDataContext = [];

	/**
	 * Current depth of the data stack.
	 * @var int
	 */
	protected $stackDepth = 0;

	/**
	 * Optional element filter, invoked as ( $name, $attribs, $data ) each
	 * time an element closes. Declared explicitly so that assigning it in
	 * the constructor does not create a dynamic property.
	 * @var callable|null
	 */
	protected $filterCallback;

	/**
	 * Additional parsing options.
	 * @var array
	 */
	private $parserOptions = [
		'processing_instruction_handler' => null,
		'external_dtd_handler' => '',
		'dtd_handler' => '',
		'require_safe_dtd' => true
	];

	/**
	 * Allow filtering an XML file.
	 *
	 * The input is parsed immediately; inspect the public properties
	 * afterwards to read the result.
	 *
	 * @param string $input A filename or an XML string, depending on $isFile
	 * @param callable|null $filterCallback Called as ( $name, $attribs, $data )
	 *  for every element once it closes. A truthy return value is recorded
	 *  as a filter match.
	 * @param bool $isFile True if $input is a filename, false if it is XML text
	 * @param array $options Overrides for the parser options:
	 *  - processing_instruction_handler: callable ( $target, $data )
	 *  - external_dtd_handler: callable ( $type, $publicId, $systemId )
	 *  - dtd_handler: callable ( $dtd ), given the whole DOCTYPE declaration
	 *  - require_safe_dtd: bool, whether the internal subset must pass
	 *    checkDTDIsSafe()
	 */
	public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
		$this->filterCallback = $filterCallback;
		$this->parserOptions = array_merge( $this->parserOptions, $options );
		$this->validateFromInput( $input, $isFile );
	}

	/**
	 * Alternative constructor: from filename.
	 *
	 * @param string $fname The filename of an XML document
	 * @param callable|null $filterCallback Element filter, see __construct()
	 * @return XmlTypeCheck
	 */
	public static function newFromFilename( $fname, $filterCallback = null ) {
		return new self( $fname, $filterCallback, true );
	}

	/**
	 * Alternative constructor: from string.
	 *
	 * @param string $string A string containing an XML document
	 * @param callable|null $filterCallback Element filter, see __construct()
	 * @return XmlTypeCheck
	 */
	public static function newFromString( $string, $filterCallback = null ) {
		return new self( $string, $filterCallback, false );
	}

	/**
	 * Get the root element.
	 *
	 * @return string Root element name, or '' if no element was seen
	 */
	public function getRootElement() {
		return $this->rootElement;
	}

	/**
	 * Open the input with XMLReader and run the validation pass over it.
	 *
	 * @param string $xml Filename or XML text
	 * @param bool $isFile Whether $xml is a filename
	 * @throws Exception Rethrows anything raised while validating (for
	 *  instance by a callback) after marking the input as not well-formed
	 */
	private function validateFromInput( $xml, $isFile ) {
		if ( $xml === null || $xml === '' ) {
			// Nothing to parse. On PHP 8 XMLReader raises a ValueError for
			// empty input instead of returning false, so settle it here.
			$this->wellFormed = false;
			return;
		}

		$reader = new XMLReader();
		$flags = LIBXML_NOERROR | LIBXML_NOWARNING;
		$opened = $isFile
			? $reader->open( $xml, null, $flags )
			: $reader->XML( $xml, null, $flags );
		if ( $opened !== true ) {
			// Couldn't open the XML
			$this->wellFormed = false;
			return;
		}

		// Entity substitution is switched on below, so external entity
		// loading must stay off. The function is deprecated as of PHP 8.0;
		// only the deprecation notice is silenced, the call is kept.
		$oldDisable = @libxml_disable_entity_loader( true );
		try {
			$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
			$this->validate( $reader );
		} catch ( Exception $e ) {
			// Calling this malformed, because we didn't parse the whole
			// thing. Maybe just an external entity reference.
			$this->wellFormed = false;
			throw $e;
		} finally {
			// Single cleanup path for both the normal and the error case.
			$reader->close();
			@libxml_disable_entity_loader( $oldDisable );
		}
	}

	/**
	 * Advance the reader by one node, routing any PHP error raised while
	 * reading to XmlErrorHandler().
	 *
	 * @param XMLReader $reader
	 * @return bool Result of XMLReader::read()
	 */
	private function readNext( XMLReader $reader ) {
		set_error_handler( [ $this, 'XmlErrorHandler' ] );
		try {
			return $reader->read();
		} finally {
			// Always put the previous handler back, even if read() throws.
			restore_error_handler();
		}
	}

	/**
	 * Error handler installed while reading: any error reported during
	 * XMLReader::read() marks the document as not well-formed.
	 *
	 * @param int $errno
	 * @param string $errstr
	 */
	public function XmlErrorHandler( $errno, $errstr ) {
		$this->wellFormed = false;
	}

	/**
	 * Walk the whole document, feeding elements, character data,
	 * processing instructions and the DOCTYPE to their handlers, and set
	 * $wellFormed once the end of the input is reached.
	 *
	 * @param XMLReader $reader
	 */
	private function validate( $reader ) {
		// First, move through anything that isn't an element, and
		// handle any processing instructions with the callback
		do {
			if ( !$this->readNext( $reader ) ) {
				// Hit the end of the document before any elements
				$this->wellFormed = false;
				return;
			}
			if ( $reader->nodeType === XMLReader::PI ) {
				$this->processingInstructionHandler( $reader->name, $reader->value );
			}
			if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
				$this->DTDHandler( $reader );
			}
		} while ( $reader->nodeType !== XMLReader::ELEMENT );

		// Process the rest of the document
		do {
			switch ( $reader->nodeType ) {
				case XMLReader::ELEMENT:
					$name = $this->expandNS(
						$reader->name,
						$reader->namespaceURI
					);
					if ( $this->rootElement === '' ) {
						$this->rootElement = $name;
					}
					// Read this before collecting attributes, which moves
					// the reader off the element node.
					$empty = $reader->isEmptyElement;
					$attrs = $this->getAttributesArray( $reader );
					$this->elementOpen( $name, $attrs );
					if ( $empty ) {
						// No END_ELEMENT node follows a self-closing tag
						$this->elementClose();
					}
					break;

				case XMLReader::END_ELEMENT:
					$this->elementClose();
					break;

				case XMLReader::WHITESPACE:
				case XMLReader::SIGNIFICANT_WHITESPACE:
				case XMLReader::CDATA:
				case XMLReader::TEXT:
					$this->elementData( $reader->value );
					break;

				case XMLReader::ENTITY_REF:
					// Unexpanded entity (maybe external?),
					// don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::COMMENT:
					// Don't send to the filter (xml_parse didn't)
					break;

				case XMLReader::PI:
					// Processing instructions can happen after the header too
					$this->processingInstructionHandler(
						$reader->name,
						$reader->value
					);
					break;

				case XMLReader::DOC_TYPE:
					// We should never see a doctype after first
					// element.
					$this->wellFormed = false;
					break;

				default:
					// One of DOC, ENTITY, END_ENTITY,
					// NOTATION, or XML_DECLARATION
					// xml_parse didn't send these to the filter, so we won't.
			}
		} while ( $this->readNext( $reader ) );

		if ( $this->stackDepth !== 0 ) {
			// Reading stopped while elements were still open
			$this->wellFormed = false;
		} elseif ( $this->wellFormed === null ) {
			// Nothing along the way flagged a problem
			$this->wellFormed = true;
		}
	}

	/**
	 * Get all of the attributes for an XMLReader's current node.
	 *
	 * @param XMLReader $r
	 * @return array Attribute values keyed by namespace-expanded name
	 */
	private function getAttributesArray( XMLReader $r ) {
		$attrs = [];
		while ( $r->moveToNextAttribute() ) {
			if ( $r->namespaceURI === 'http://www.w3.org/2000/xmlns/' ) {
				// XMLReader treats xmlns attributes as normal
				// attributes, while xml_parse doesn't
				continue;
			}
			$name = $this->expandNS( $r->name, $r->namespaceURI );
			$attrs[$name] = $r->value;
		}
		return $attrs;
	}

	/**
	 * Build the expanded name of an element or attribute.
	 *
	 * @param string $name Qualified name, possibly with a prefix
	 * @param string $namespaceURI Namespace URI, or '' when there is none
	 * @return string "namespaceURI:localname", or $name unchanged when
	 *  there is no namespace
	 */
	private function expandNS( $name, $namespaceURI ) {
		if ( $namespaceURI ) {
			$parts = explode( ':', $name );
			$localname = array_pop( $parts );
			return "$namespaceURI:$localname";
		}
		return $name;
	}

	/**
	 * Push a newly opened element onto the stacks.
	 *
	 * @param string $name Expanded element name
	 * @param array $attribs Attributes of the element
	 */
	private function elementOpen( $name, $attribs ) {
		$this->elementDataContext[] = [ $name, $attribs ];
		$this->elementData[] = '';
		$this->stackDepth++;
	}

	/**
	 * Pop the innermost element and run the element filter on it.
	 */
	private function elementClose() {
		list( $name, $attribs ) = array_pop( $this->elementDataContext );
		$data = array_pop( $this->elementData );
		$this->stackDepth--;

		if ( is_callable( $this->filterCallback ) ) {
			$this->recordFilterHit(
				call_user_func( $this->filterCallback, $name, $attribs, $data )
			);
		}
	}

	/**
	 * Append character data to the innermost open element.
	 *
	 * @param string $data
	 */
	private function elementData( $data ) {
		if ( $this->stackDepth < 1 ) {
			// Defensive: with no open element there is nothing to attach
			// the text to; never index the stack at -1.
			return;
		}
		// Collect any data here, and we'll run the callback in elementClose
		$this->elementData[ $this->stackDepth - 1 ] .= trim( $data );
	}

	/**
	 * Pass a processing instruction to the configured handler, if any.
	 *
	 * @param string $target
	 * @param string $data
	 */
	private function processingInstructionHandler( $target, $data ) {
		$handler = $this->parserOptions['processing_instruction_handler'];
		if ( $handler ) {
			$this->recordFilterHit( call_user_func( $handler, $target, $data ) );
		}
	}

	/**
	 * Handle coming across a <!DOCTYPE declaration.
	 *
	 * Runs the general and external DTD handlers when configured and,
	 * when 'require_safe_dtd' is set, marks the document as not
	 * well-formed if the internal subset fails checkDTDIsSafe().
	 *
	 * @param XMLReader $reader Reader positioned on the DOC_TYPE node
	 */
	private function DTDHandler( XMLReader $reader ) {
		$externalCallback = $this->parserOptions['external_dtd_handler'];
		$generalCallback = $this->parserOptions['dtd_handler'];
		$checkIfSafe = $this->parserOptions['require_safe_dtd'];
		if ( !$externalCallback && !$generalCallback && !$checkIfSafe ) {
			// Nobody is interested in the DTD
			return;
		}
		$dtd = $reader->readOuterXml();

		if ( $generalCallback ) {
			$this->recordFilterHit( call_user_func( $generalCallback, $dtd ) );
		}

		$parsedDTD = $this->parseDTD( $dtd );
		if ( $externalCallback && isset( $parsedDTD['type'] ) ) {
			$this->recordFilterHit( call_user_func(
				$externalCallback,
				$parsedDTD['type'],
				$parsedDTD['publicid'] ?? null,
				$parsedDTD['systemid'] ?? null
			) );
		}

		if ( $checkIfSafe && isset( $parsedDTD['internal'] ) &&
			!$this->checkDTDIsSafe( $parsedDTD['internal'] )
		) {
			$this->wellFormed = false;
		}
	}

	/**
	 * Record the result of a filter/handler callback: any truthy value
	 * counts as a hit and is kept as the match type.
	 *
	 * @param mixed $callbackReturn Value returned by the callback
	 */
	private function recordFilterHit( $callbackReturn ) {
		if ( $callbackReturn ) {
			// Filter hit!
			$this->filterMatch = true;
			$this->filterMatchType = $callbackReturn;
		}
	}

	/**
	 * Check if the internal subset of the DTD is safe.
	 *
	 * The subset passes only if it consists entirely of:
	 *  - <!ENTITY name "value"> declarations whose value is either a single
	 *    entity reference of at most 64 characters, or at most 255
	 *    characters containing no '%' and no '&' other than &amp; and the
	 *    escape for the quote character in use,
	 *  - comments,
	 *  - the fixed xmlns:xlink ATTLIST declaration for svg.
	 *
	 * @param string $internalSubset The internal subset of the DTD
	 * @return bool True if safe; false otherwise, including when the
	 *  regular expression engine reports an error
	 */
	private function checkDTDIsSafe( $internalSubset ) {
		$res = preg_match(
			'/^(?:\s*<!ENTITY\s+\S+\s+' .
				'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"' .
				'|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>' .
				'|\s*<!--(?:[^-]|-[^-])*-->' .
				'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
				// Dots are escaped so only the literal xlink URI matches
				'"http:\/\/www\.w3\.org\/1999\/xlink">)*\s*$/',
			$internalSubset
		);

		return (bool)$res;
	}

	/**
	 * Parse DTD into parts.
	 *
	 * If the declaration cannot be parsed, $wellFormed is set to false
	 * and an empty array is returned.
	 *
	 * @param string $dtd The full DOCTYPE declaration
	 * @return array Possible keys (empty parts are omitted):
	 *  - 'type': 'PUBLIC' or 'SYSTEM'
	 *  - 'publicid': the public identifier
	 *  - 'systemid': the system identifier
	 *  - 'internal': the internal subset
	 */
	private function parseDTD( $dtd ) {
		$m = [];
		$res = preg_match(
			'/^<!DOCTYPE\s*\S+\s*' .
			'(?:(?P<typepublic>PUBLIC)\s*' .
				'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
				// The system identifier alternatives must be grouped; without
				// the group the "|" splits the whole PUBLIC branch and an
				// apostrophe-quoted system identifier is never matched.
				'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
			'|(?P<typesystem>SYSTEM)\s*' .
				'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
			')?\s*' .
			'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
			$dtd,
			$m
		);
		if ( !$res ) {
			$this->wellFormed = false;
			return [];
		}
		$parsed = [];
		foreach ( $m as $field => $value ) {
			// Skip the numbered duplicates and any group that did not take part
			if ( $value === '' || is_numeric( $field ) ) {
				continue;
			}
			switch ( $field ) {
				case 'typepublic':
				case 'typesystem':
					$parsed['type'] = $value;
					break;
				case 'pubquote':
				case 'pubapos':
					$parsed['publicid'] = $value;
					break;
				case 'pubsysquote':
				case 'pubsysapos':
				case 'sysquote':
				case 'sysapos':
					$parsed['systemid'] = $value;
					break;
				case 'internal':
					$parsed['internal'] = $value;
					break;
			}
		}
		return $parsed;
	}
}
processingInstructionHandler( $target, $data)
readNext(XMLReader $reader)
$parserOptions
Additional parsing options.
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
if(is_array( $mode)) switch( $mode) $input
processing should stop and the error should be shown to the user * false
Definition: hooks.txt:187
validate( $reader)
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses & $ret
Definition: hooks.txt:1982
elementData( $data)
div flags Integer display flags(NO_ACTION_LINK, NO_EXTRA_USER_LINKS) 'LogException' returning false will NOT prevent logging $e
Definition: hooks.txt:2159
validateFromInput( $xml, $isFile)
$value
getRootElement()
Get the root element.
checkDTDIsSafe( $internalSubset)
Check if the internal subset of the DTD is safe.
static newFromFilename( $fname, $filterCallback=null)
Alternative constructor: from filename.
elementOpen( $name, $attribs)
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return true
Definition: hooks.txt:1982
static newFromString( $string, $filterCallback=null)
Alternative constructor: from string.
$stackDepth
Current depth of the data stack.
$res
Definition: database.txt:21
mixed $filterMatchType
Will contain the type of filter hit if the optional element filter returned a match at some point...
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped & $options
Definition: hooks.txt:1982
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1982
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:780
parseDTD( $dtd)
Parse DTD into parts.
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
if(defined( 'MW_SETUP_CALLBACK')) $fname
Customization point after all loading (constants, functions, classes, DefaultSettings, LocalSettings).
Definition: Setup.php:123
$rootElement
Name of the document's root element, including any namespace as an expanded URL.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
$elementData
A stack of strings containing the data of each xml element as it's processed.
$filterMatch
Will be set to true if the optional element filter returned a match at some point.
XmlErrorHandler( $errno, $errstr)
getAttributesArray(XMLReader $r)
Get all of the attributes for an XMLReader's current node.
expandNS( $name, $namespaceURI)
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:271
DTDHandler(XMLReader $reader)
Handle coming across a <!DOCTYPE declaration.
__construct( $input, $filterCallback=null, $isFile=true, $options=[])
Allow filtering an XML file.
$elementDataContext
A stack of element names and attributes, as we process them.
$wellFormed
Will be set to true or false to indicate whether the file is well-formed XML.