MediaWiki REL1_35
XmlTypeCheck.php
Go to the documentation of this file.
1<?php
/**
 * Will be set to true or false to indicate whether the file is
 * well-formed XML. Starts out as null, i.e. "not decided yet".
 * @var bool|null
 */
public $wellFormed = null;

/**
 * Will be set to true if the optional element filter returned
 * a match at some point.
 * @var bool
 */
public $filterMatch = false;

/**
 * Will contain the type of filter hit if the optional element filter
 * returned a match at some point.
 * @var mixed
 */
public $filterMatchType = false;

/**
 * Name of the document's root element, including any namespace
 * as an expanded URL.
 * @var string
 */
public $rootElement = '';

/**
 * A stack of strings containing the data of each xml element
 * as it's processed.
 * @var string[]
 */
protected $elementData = [];

/**
 * A stack of element names and attributes, as we process them.
 * @var array
 */
protected $elementDataContext = [];

/**
 * Current depth of the data stack.
 * @var int
 */
protected $stackDepth = 0;

/** @var callable|null */
protected $filterCallback;

/**
 * Additional parsing options. The constructor merges caller-supplied
 * options over these defaults.
 * @var array
 */
private $parserOptions = [
	'processing_instruction_handler' => null,
	'external_dtd_handler' => '',
	'dtd_handler' => '',
	'require_safe_dtd' => true,
];
83
/**
 * Allow filtering an XML file.
 *
 * The input is validated straight away; the outcome is available afterwards
 * through $wellFormed, $filterMatch, $filterMatchType and getRootElement().
 *
 * @param string $input A filename or a string of XML, depending on $isFile
 * @param callable|null $filterCallback Called for every closed element with
 *   ( $name, $attribs, $data ); a truthy return value is recorded as a filter hit
 * @param bool $isFile True if $input is a filename, false if it is XML text
 * @param array $options Overrides for the default parser options
 *   (processing_instruction_handler, external_dtd_handler, dtd_handler,
 *   require_safe_dtd)
 */
public function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
	// Caller-supplied options win over the defaults declared on the property.
	$effectiveOptions = array_merge( $this->parserOptions, $options );
	$this->parserOptions = $effectiveOptions;
	$this->filterCallback = $filterCallback;

	$this->validateFromInput( $input, $isFile );
}
115
/**
 * Alternative constructor: from filename.
 *
 * @param string $fname Name of the XML file to check
 * @param callable|null $filterCallback Optional element filter, handed to the
 *   constructor unchanged
 * @return self
 */
public static function newFromFilename( $fname, $filterCallback = null ) {
	$isFile = true;

	return new self( $fname, $filterCallback, $isFile );
}
130
/**
 * Alternative constructor: from string.
 *
 * @param string $string The XML text to check
 * @param callable|null $filterCallback Optional element filter, handed to the
 *   constructor unchanged
 * @return self
 */
public static function newFromString( $string, $filterCallback = null ) {
	$isFile = false;

	return new self( $string, $filterCallback, $isFile );
}
145
/**
 * Get the root element.
 *
 * @return string Name of the first element seen during validation, with any
 *   namespace expanded to its URI; the empty string if no element was reached
 */
public function getRootElement() {
	return $this->rootElement;
}
154
/**
 * Open the input with XMLReader and run the validation over it.
 *
 * If the input cannot be opened, $wellFormed is set to false and nothing
 * else happens. Otherwise the libxml entity loader is disabled for the
 * duration of the check and its previous state is restored afterwards,
 * whether validation finishes normally or throws.
 *
 * @param string $xml A filename or a string of XML, depending on $isFile
 * @param bool $isFile True if $xml is a filename, false if it is XML text
 * @throws Exception Whatever validate() throws is rethrown after marking
 *   the document as not well-formed
 */
private function validateFromInput( $xml, $isFile ) {
	$reader = new XMLReader();
	$libxmlFlags = LIBXML_NOERROR | LIBXML_NOWARNING;
	if ( $isFile ) {
		$s = $reader->open( $xml, null, $libxmlFlags );
	} else {
		$s = $reader->XML( $xml, null, $libxmlFlags );
	}
	if ( $s !== true ) {
		// Couldn't open the XML
		$this->wellFormed = false;
		return;
	}

	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	try {
		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, true );
		$this->validate( $reader );
	} catch ( Exception $e ) {
		// Calling this malformed, because we didn't parse the whole
		// thing. Maybe just an external entity reference.
		$this->wellFormed = false;
		throw $e;
	} finally {
		// Runs on success, on Exception and on any other Throwable, so the
		// reader is always released and the entity loader setting never
		// leaks out of this method.
		$reader->close();
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
189
/**
 * Advance the reader by one node, treating any PHP error raised while
 * reading as a sign that the document is not well-formed.
 *
 * @param XMLReader $reader
 * @return bool The return value of XMLReader::read()
 */
private function readNext( XMLReader $reader ) {
	// The handler is called with ( $errno, $errstr, ... ); the details are
	// not needed. It returns nothing, so the error is considered handled.
	set_error_handler( function ( $errno, $errstr ) {
		$this->wellFormed = false;
	} );
	try {
		return $reader->read();
	} finally {
		// Always put the previous handler back, even if read() throws.
		restore_error_handler();
	}
}
198
/**
 * Walk the whole document, feeding elements, text, processing
 * instructions and the doctype to the matching handlers, and decide
 * whether the document is well-formed.
 *
 * $wellFormed ends up false when the document has no element at all, when
 * a doctype shows up after the first element, when elements are left open
 * at the end, or when any handler or read error already marked it false.
 * It is set to true only if nothing decided otherwise.
 *
 * @param XMLReader $reader A reader that has been opened on the input
 */
private function validate( XMLReader $reader ) {
	// First, move through anything that isn't an element, and
	// handle any processing instructions with the callback
	do {
		if ( !$this->readNext( $reader ) ) {
			// Hit the end of the document before any elements
			$this->wellFormed = false;
			return;
		}
		if ( $reader->nodeType === XMLReader::PI ) {
			$this->processingInstructionHandler( $reader->name, $reader->value );
		}
		if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
			$this->dtdHandler( $reader );
		}
	} while ( $reader->nodeType !== XMLReader::ELEMENT );

	// Process the rest of the document
	do {
		switch ( $reader->nodeType ) {
			case XMLReader::ELEMENT:
				$name = $this->expandNS(
					$reader->name,
					$reader->namespaceURI
				);
				if ( $this->rootElement === '' ) {
					// The first element we meet is the root
					$this->rootElement = $name;
				}
				$empty = $reader->isEmptyElement;
				$attrs = $this->getAttributesArray( $reader );
				$this->elementOpen( $name, $attrs );
				if ( $empty ) {
					// <foo/> gets no END_ELEMENT node, so close it here
					$this->elementClose();
				}
				break;

			case XMLReader::END_ELEMENT:
				$this->elementClose();
				break;

			case XMLReader::WHITESPACE:
			case XMLReader::SIGNIFICANT_WHITESPACE:
			case XMLReader::CDATA:
			case XMLReader::TEXT:
				$this->elementData( $reader->value );
				break;

			case XMLReader::ENTITY_REF:
				// Unexpanded entity (maybe external?),
				// don't send to the filter (xml_parse didn't)
				break;

			case XMLReader::COMMENT:
				// Don't send to the filter (xml_parse didn't)
				break;

			case XMLReader::PI:
				// Processing instructions can happen after the header too
				$this->processingInstructionHandler(
					$reader->name,
					$reader->value
				);
				break;

			case XMLReader::DOC_TYPE:
				// We should never see a doctype after first
				// element.
				$this->wellFormed = false;
				break;

			default:
				// One of DOC, ENTITY, END_ENTITY,
				// NOTATION, or XML_DECLARATION
				// xml_parse didn't send these to the filter, so we won't.
		}
	} while ( $this->readNext( $reader ) );

	if ( $this->stackDepth !== 0 ) {
		// Some element was opened but never closed
		$this->wellFormed = false;
	} elseif ( $this->wellFormed === null ) {
		// Nothing complained along the way
		$this->wellFormed = true;
	}
}
280
/**
 * Get all of the attributes for an XMLReader's current node.
 *
 * Attribute names are passed through expandNS(), so namespaced attributes
 * are keyed by "<namespace URI>:<local name>". Namespace declarations
 * (xmlns attributes) are left out.
 *
 * @param XMLReader $r
 * @return array Attribute values keyed by (expanded) attribute name
 */
private function getAttributesArray( XMLReader $r ) {
	$collected = [];
	while ( $r->moveToNextAttribute() ) {
		// XMLReader treats xmlns attributes as normal
		// attributes, while xml_parse doesn't
		$isNamespaceDeclaration = $r->namespaceURI === 'http://www.w3.org/2000/xmlns/';
		if ( !$isNamespaceDeclaration ) {
			$key = $this->expandNS( $r->name, $r->namespaceURI );
			$collected[$key] = $r->value;
		}
	}
	return $collected;
}
299
/**
 * Turn a possibly prefixed name into "<namespace URI>:<local name>".
 *
 * @param string $name Element or attribute name, possibly with a prefix
 * @param string $namespaceURI Namespace of the node; when it is empty
 *   (falsy) the name is returned untouched
 * @return string
 */
private function expandNS( $name, $namespaceURI ) {
	if ( !$namespaceURI ) {
		return $name;
	}

	// Everything up to and including the last colon is the prefix
	$segments = explode( ':', $name );
	$localname = end( $segments );

	return $namespaceURI . ':' . $localname;
}
313
/**
 * Record that an element has been opened: push its name and attributes,
 * start an empty text buffer for it and increase the stack depth.
 *
 * @param string $name Expanded element name
 * @param array $attribs Attributes of the element
 */
private function elementOpen( $name, $attribs ) {
	$this->stackDepth++;
	array_push( $this->elementDataContext, [ $name, $attribs ] );
	array_push( $this->elementData, '' );
}
323
/**
 * Close the innermost open element and run the element filter on it.
 *
 * The filter callback, if callable, receives the element's name, its
 * attributes and the text collected for it. A truthy return value is
 * stored as the filter hit.
 */
private function elementClose() {
	[ $name, $attribs ] = array_pop( $this->elementDataContext );
	$data = array_pop( $this->elementData );
	$this->stackDepth--;

	$filter = $this->filterCallback;
	if ( !is_callable( $filter ) ) {
		// No filter to consult
		return;
	}

	$verdict = $filter( $name, $attribs, $data );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $verdict;
	}
}
339
/**
 * Append trimmed character data to the buffer of the innermost open element.
 *
 * @param string $data Text, CDATA or whitespace content
 */
private function elementData( $data ) {
	// Collect any data here, and we'll run the callback in elementClose
	$innermost = $this->stackDepth - 1;
	$this->elementData[$innermost] .= trim( $data );
}
347
/**
 * Pass a processing instruction to the 'processing_instruction_handler'
 * parser option, if one is set, and record a truthy return value as a
 * filter hit.
 *
 * @param string $target Name of the processing instruction
 * @param string $data Content of the processing instruction
 */
private function processingInstructionHandler( $target, $data ) {
	$handler = $this->parserOptions['processing_instruction_handler'];
	if ( !$handler ) {
		// Nobody is interested in processing instructions
		return;
	}

	$verdict = $handler( $target, $data );
	if ( $verdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $verdict;
	}
}
367
/**
 * Handle coming across a <!DOCTYPE declaration.
 *
 * Runs, in this order: the 'dtd_handler' option on the raw declaration,
 * the 'external_dtd_handler' option on the parsed external identifier
 * (only when the declaration has a PUBLIC/SYSTEM type), and, when
 * 'require_safe_dtd' is on, the safety check of the internal subset.
 * Truthy handler results are recorded as filter hits; an unsafe internal
 * subset marks the document as not well-formed.
 *
 * @param XMLReader $reader Reader positioned on the doctype node
 */
private function dtdHandler( XMLReader $reader ) {
	$onExternalDtd = $this->parserOptions['external_dtd_handler'];
	$onDtd = $this->parserOptions['dtd_handler'];
	$mustBeSafe = $this->parserOptions['require_safe_dtd'];
	if ( !( $onExternalDtd || $onDtd || $mustBeSafe ) ) {
		// Nothing is configured to look at the doctype
		return;
	}

	$dtd = $reader->readOuterXml();

	$generalVerdict = $onDtd ? $onDtd( $dtd ) : false;
	if ( $generalVerdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $generalVerdict;
	}

	$parts = $this->parseDTD( $dtd );

	$externalVerdict = false;
	if ( $onExternalDtd && isset( $parts['type'] ) ) {
		$externalVerdict = $onExternalDtd(
			$parts['type'],
			$parts['publicid'] ?? null,
			$parts['systemid'] ?? null
		);
	}
	if ( $externalVerdict ) {
		// Filter hit!
		$this->filterMatch = true;
		$this->filterMatchType = $externalVerdict;
	}

	if ( $mustBeSafe && isset( $parts['internal'] ) ) {
		if ( !$this->checkDTDIsSafe( $parts['internal'] ) ) {
			$this->wellFormed = false;
		}
	}
}
413
/**
 * Check if the internal subset of the DTD is safe.
 *
 * The subset is accepted only if it consists of nothing but:
 *  - <!ENTITY name "value"> / <!ENTITY name 'value'> declarations whose
 *    value is either a single entity reference of at most 64 characters,
 *    or at most 255 characters free of '%' and '&' (apart from &amp; and
 *    the escape for the surrounding quote character),
 *  - comments,
 *  - the exact xmlns:xlink ATTLIST declaration for svg,
 * separated by optional whitespace. An empty subset is accepted.
 *
 * @param string $internalSubset The text between [ and ] of the doctype
 * @return bool True if the subset matches the whitelist
 */
private function checkDTDIsSafe( $internalSubset ) {
	$whitelistPattern = implode( '', [
		'/^(?:\s*<!ENTITY\s+\S+\s+',
		'(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&amp;|&quot;){0,255})"',
		'|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&amp;|&apos;){0,255})\')\s*>',
		'|\s*<!--(?:[^-]|-[^-])*-->',
		'|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ',
		'"http:\/\/www.w3.org\/1999\/xlink">)*\s*$/',
	] );

	// preg_match() yields 1 on a match, 0 on no match and false on error
	return preg_match( $whitelistPattern, $internalSubset ) === 1;
}
447
/**
 * Parse DTD into parts.
 *
 * If the declaration cannot be parsed, the document is marked as not
 * well-formed and an empty array is returned.
 *
 * @param string $dtd The full <!DOCTYPE ...> declaration
 * @return array Any of the following keys, each present only when the
 *   corresponding part is there and non-empty:
 *   - 'type': 'PUBLIC' or 'SYSTEM'
 *   - 'publicid': the public identifier
 *   - 'systemid': the system identifier
 *   - 'internal': the internal subset (text between [ and ])
 */
private function parseDTD( $dtd ) {
	$m = [];
	$res = preg_match(
		'/^<!DOCTYPE\s*\S+\s*' .
		'(?:(?P<typepublic>PUBLIC)\s*' .
			'(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifier
			// The two quoting styles must be grouped; otherwise the
			// single-quoted form becomes an alternative to the whole
			// PUBLIC branch instead of to the double-quoted system id.
			'\s*(?:"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\')' . // system identifier
		'|(?P<typesystem>SYSTEM)\s*' .
			'(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
		')?\s*' .
		'(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
		$dtd,
		$m
	);
	if ( !$res ) {
		$this->wellFormed = false;
		return [];
	}

	// Which named capture group feeds which key of the result
	$groupToKey = [
		'typepublic' => 'type',
		'typesystem' => 'type',
		'pubquote' => 'publicid',
		'pubapos' => 'publicid',
		'pubsysquote' => 'systemid',
		'pubsysapos' => 'systemid',
		'sysquote' => 'systemid',
		'sysapos' => 'systemid',
		'internal' => 'internal',
	];

	$parsed = [];
	foreach ( $m as $field => $value ) {
		// Skip groups that did not take part in the match, and the
		// numeric duplicates preg_match() adds for every named group
		if ( $value === '' || is_numeric( $field ) ) {
			continue;
		}
		if ( isset( $groupToKey[$field] ) ) {
			$parsed[ $groupToKey[$field] ] = $value;
		}
	}
	return $parsed;
}
501}
int $stackDepth
Current depth of the data stack.
callable null $filterCallback
validate( $reader)
getRootElement()
Get the root element.
__construct( $input, $filterCallback=null, $isFile=true, $options=[])
Allow filtering an XML file.
parseDTD( $dtd)
Parse DTD into parts.
elementData( $data)
string[] $elementData
A stack of strings containing the data of each xml element as it's processed.
bool null $wellFormed
Will be set to true or false to indicate whether the file is well-formed XML.
string $rootElement
Name of the document's root element, including any namespace as an expanded URL.
bool $filterMatch
Will be set to true if the optional element filter returned a match at some point.
static newFromFilename( $fname, $filterCallback=null)
Alternative constructor: from filename.
readNext(XMLReader $reader)
elementOpen( $name, $attribs)
validateFromInput( $xml, $isFile)
array $parserOptions
Additional parsing options.
mixed $filterMatchType
Will contain the type of filter hit if the optional element filter returned a match at some point.
array $elementDataContext
A stack of element names and attributes, as we process them.
expandNS( $name, $namespaceURI)
dtdHandler(XMLReader $reader)
Handle coming across a <!DOCTYPE declaration.
static newFromString( $string, $filterCallback=null)
Alternative constructor: from string.
processingInstructionHandler( $target, $data)
checkDTDIsSafe( $internalSubset)
Check if the internal subset of the DTD is safe.
getAttributesArray(XMLReader $r)
Get all of the attributes for an XMLReader's current node.
$line
Definition mcc.php:119
return true
Definition router.php:92
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition router.php:42