MediaWiki  1.23.0
HtmlFormatter.php
Go to the documentation of this file.
1 <?php
/**
 * DOM tree built from $html; stays unset until getDoc() is first called.
 * @var DOMDocument
 */
private $doc;

/**
 * Raw HTML handed to the constructor.
 * @var string
 */
private $html;

/**
 * Selectors queued through remove(), consumed by parseItemsToRemove().
 * @var array
 */
private $itemsToRemove = array();

/**
 * Tag names (or regex fragments) queued through flatten(); getText()
 * joins them with '|' into a single tag-stripping regex.
 * Declared with an empty array so array_merge() in flatten() and the
 * truthiness check in getText() operate on a defined value.
 * @var array
 */
private $elementsToFlatten = array();

/**
 * When true, parseItemsToRemove() adds img, audio and video to the tags to remove.
 * @var bool
 */
protected $removeMedia = false;
33 
/**
 * Constructor.
 *
 * Only stores the markup; no parsing happens until getDoc() is called.
 *
 * @param string $html Text to process
 */
public function __construct( $html ) {
    $this->html = $html;
}
42 
/**
 * Turns a chunk of HTML into a complete document by surrounding it with
 * a doctype, an empty <head> and a <body> element.
 *
 * @param string $html Fragment to place inside <body>
 * @return string The wrapped document
 */
public static function wrapHTML( $html ) {
    $opening = '<!doctype html><html><head></head><body>';
    $closing = '</body></html>';

    return $opening . $html . $closing;
}
51 
/**
 * Hook for descendant classes: getText() passes the serialized HTML
 * through this method before flattening tags. The base implementation
 * returns its argument untouched.
 *
 * @param string $html HTML produced by getText() so far
 * @return string HTML to continue processing with
 */
protected function onHtmlReady( $html ) {
    return $html;
}
60 
/**
 * Returns the DOM representation of the HTML given to the constructor,
 * building it on the first call and reusing it afterwards.
 *
 * @return DOMDocument
 */
public function getDoc() {
    if ( !$this->doc ) {
        $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );

        // Workaround for bug that caused spaces before references
        // to disappear during processing:
        // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
        //
        // Please replace with a better fix if one can be found.
        $html = str_replace( ' <', '&#32;<', $html );

        // Buffer libxml diagnostics internally while parsing, remembering
        // the caller's setting so it can be restored rather than
        // unconditionally switched off afterwards.
        $previousErrorMode = libxml_use_internal_errors( true );
        $loader = libxml_disable_entity_loader();
        $this->doc = new DOMDocument();
        $this->doc->strictErrorChecking = false;
        $this->doc->loadHTML( $html );
        libxml_disable_entity_loader( $loader );
        // Discard whatever loadHTML() buffered: nobody reads it here, and
        // leaving it would let it pile up and show up in later
        // libxml_get_errors() calls made by unrelated code.
        libxml_clear_errors();
        libxml_use_internal_errors( $previousErrorMode );
        $this->doc->encoding = 'UTF-8';
    }
    return $this->doc;
}
86 
/**
 * Sets whether images/videos/sounds should be removed from output.
 * The flag is read by parseItemsToRemove().
 *
 * @param bool $flag New value for the setting (defaults to true)
 */
public function setRemoveMedia( $flag = true ) {
    $this->removeMedia = $flag;
}
94 
/**
 * Queues one or more selectors whose matching elements filterContent()
 * will delete. The accepted selector forms are those recognised by
 * parseSelector(): "tag", ".class", "#id" and "tag.class".
 *
 * @param array|string $selectors A single selector or a list of them
 */
public function remove( $selectors ) {
    // A lone string is cast to a one-element array before merging.
    $additions = (array)$selectors;
    $this->itemsToRemove = array_merge( $this->itemsToRemove, $additions );
}
109 
/**
 * Queues one or more element names to flatten, i.e. getText() strips the
 * opening and closing tags while leaving their content in place. Entries
 * are inserted unescaped into a regex by getText(), so regex fragments
 * work as well as plain tag names.
 *
 * @param array|string $elements A single name/pattern or a list of them
 */
public function flatten( $elements ) {
    // A lone string is cast to a one-element array before merging.
    $additions = (array)$elements;
    $this->elementsToFlatten = array_merge( $this->elementsToFlatten, $additions );
}
122 
/**
 * Instructs the formatter to flatten all tags, by queueing a regex
 * fragment that matches any alphanumeric tag name, optionally preceded
 * by "?" or "!".
 */
public function flattenAllTags() {
    $anyTagName = '[?!]?[a-z0-9]+';
    $this->flatten( $anyTagName );
}
129 
/**
 * Removes content we've chosen to remove: every element matched by the
 * selectors queued with remove(), plus img/audio/video when media
 * removal is enabled.
 *
 * Removal happens in four passes (tag names, ids, classes, tag.class
 * pairs) on the DOM returned by getDoc(). When there is nothing to
 * remove the method returns without building the DOM at all.
 */
public function filterContent() {
    wfProfileIn( __METHOD__ );
    $removals = $this->parseItemsToRemove();

    // parseItemsToRemove() always returns an array with all four keys, so
    // "nothing to do" means every per-type list is empty; testing the
    // outer array for emptiness would never trigger.
    if ( !array_filter( $removals ) ) {
        wfProfileOut( __METHOD__ );
        return;
    }

    $doc = $this->getDoc();

    // Remove tags

    // You can't remove DOMNodes from a DOMNodeList as you're iterating
    // over them in a foreach loop. It will seemingly leave the internal
    // iterator on the foreach out of wack and results will be quite
    // strange. Though, making a queue of items to remove seems to work.
    $domElemsToRemove = array();
    foreach ( $removals['TAG'] as $tagToRemove ) {
        foreach ( $doc->getElementsByTagName( $tagToRemove ) as $tagToRemoveNode ) {
            $domElemsToRemove[] = $tagToRemoveNode;
        }
    }
    $this->removeElements( $domElemsToRemove );

    // Elements with named IDs
    $domElemsToRemove = array();
    foreach ( $removals['ID'] as $itemToRemove ) {
        $itemToRemoveNode = $doc->getElementById( $itemToRemove );
        if ( $itemToRemoveNode ) {
            $domElemsToRemove[] = $itemToRemoveNode;
        }
    }
    $this->removeElements( $domElemsToRemove );

    // CSS Classes
    $domElemsToRemove = array();
    $xpath = new DOMXpath( $doc );
    foreach ( $removals['CLASS'] as $classToRemove ) {
        // contains() is only a coarse substring pre-filter; the exact
        // check is done per element below.
        $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );

        foreach ( $elements as $element ) {
            // Compare against whole whitespace-separated class tokens.
            // A \b-delimited regex would treat "-" as a boundary (so
            // ".foo" would also hit class="foo-bar") and would break on
            // class names containing regex metacharacters.
            $classes = preg_split(
                '/\s+/',
                $element->getAttribute( 'class' ),
                -1,
                PREG_SPLIT_NO_EMPTY
            );
            if ( in_array( $classToRemove, $classes, true ) && $element->parentNode ) {
                $domElemsToRemove[] = $element;
            }
        }
    }
    $this->removeElements( $domElemsToRemove );

    // Tags with CSS Classes
    foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
        // "tag.class" => array( 'tag', 'class' ); the class attribute
        // has to equal the given class exactly for the element to match.
        $parts = explode( '.', $classToRemove );

        $elements = $xpath->query(
            '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
        );

        $this->removeElements( $elements );
    }

    wfProfileOut( __METHOD__ );
}
201 
/**
 * Removes a list of elements from the DOMDocument.
 *
 * @param array|DOMNodeList $elements Nodes to detach from their parents
 */
private function removeElements( $elements ) {
    // Snapshot the nodes into a plain array first, so that detaching
    // nodes never happens while a DOMNodeList is being iterated.
    $queue = array();
    foreach ( $elements as $node ) {
        $queue[] = $node;
    }

    foreach ( $queue as $node ) {
        $parent = $node->parentNode;
        // Nodes without a parent have nothing to be detached from.
        if ( $parent ) {
            $parent->removeChild( $node );
        }
    }
}
221 
/**
 * Post-processes HTML serialized by libxml: escapes the ampersand of the
 * four entities &quot; &amp; &lt; &gt; and then converts the text from
 * HTML-ENTITIES to UTF-8. Presumably the extra escaping is what lets
 * those four come out of the conversion as entities rather than as
 * literal characters.
 *
 * @param string $html HTML as produced by DOMDocument::saveHTML()
 * @return string UTF-8 HTML
 */
private function fixLibXML( $html ) {
    wfProfileIn( __METHOD__ );

    // Built once and shared by all calls.
    static $entityGuard = null;
    if ( $entityGuard === null ) {
        // We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
        // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
        $rules = array(
            '&quot;' => '&amp;quot;',
            '&amp;' => '&amp;amp;',
            '&lt;' => '&amp;lt;',
            '&gt;' => '&amp;gt;',
        );
        $entityGuard = new ReplacementArray( $rules );
    }

    $guarded = $entityGuard->replace( $html );
    $decoded = mb_convert_encoding( $guarded, 'UTF-8', 'HTML-ENTITIES' );

    wfProfileOut( __METHOD__ );
    return $decoded;
}
246 
/**
 * Performs final transformations and returns resulting HTML.
 *
 * If the DOM has been built (see getDoc()), it is serialized; otherwise
 * the original HTML string is used as-is. Comments and everything
 * outside <body>...</body> are stripped, onHtmlReady() is applied, and
 * finally the tags queued with flatten() are removed.
 *
 * @param DOMElement|string|null $element Element, or id of the element,
 *   whose HTML should be returned instead of the whole body. Ignored when
 *   the DOM has not been built or the id is not found.
 * @return string Processed HTML
 */
public function getText( $element = null ) {
    wfProfileIn( __METHOD__ );

    if ( !$this->doc ) {
        // Nothing touched the DOM, so the input string is still current.
        $html = $this->html;
    } else {
        $isDomElement = $element instanceof DOMElement;
        if ( $element !== null && !$isDomElement ) {
            // Anything that is not already an element is treated as an id.
            $element = $this->doc->getElementById( $element );
        }
        if ( $element ) {
            // Make the requested element the sole child of <body>.
            $body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
            $children = array();
            foreach ( $body->childNodes as $child ) {
                $children[] = $child;
            }
            foreach ( $children as $child ) {
                $body->removeChild( $child );
            }
            $body->appendChild( $element );
        }
        $html = $this->fixLibXML( $this->doc->saveHTML() );
    }

    if ( wfIsWindows() ) {
        // Appears to be cleanup for CRLF misprocessing of unknown origin
        // when running server on Windows platform.
        //
        // If this error continues in the future, please track it down in the
        // XML code paths if possible and fix there.
        $html = str_replace( '&#13;', '', $html );
    }

    // Drop comments, everything up to and including <body>, and
    // everything from </body> onwards.
    $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
    $html = $this->onHtmlReady( $html );

    if ( $this->elementsToFlatten ) {
        // Strip opening and closing tags of the queued elements while
        // keeping whatever is between them.
        $tagAlternation = implode( '|', $this->elementsToFlatten );
        $html = preg_replace( "#</?($tagAlternation)\\b[^>]*>#is", '', $html );
    }

    wfProfileOut( __METHOD__ );
    return $html;
}
295 
/**
 * Classifies a single selector and extracts the name to look for.
 *
 * Recognised forms: ".name" (CLASS), "#name" (ID), "tag.name"
 * (TAG_CLASS, returned unsplit) and a bare tag name without square
 * brackets (TAG).
 *
 * @param string $selector Selector to classify
 * @param string &$type Receives 'CLASS', 'ID', 'TAG_CLASS' or 'TAG'
 * @param string &$rawName Receives the selector without its leading
 *   "." or "#", or the unchanged selector for TAG and TAG_CLASS
 * @return bool Always true
 * @throws MWException When the selector matches none of the forms
 */
protected function parseSelector( $selector, &$type, &$rawName ) {
    $dotPos = strpos( $selector, '.' );

    if ( $dotPos === 0 ) {
        $type = 'CLASS';
        $rawName = substr( $selector, 1 );
        return true;
    }

    if ( strpos( $selector, '#' ) === 0 ) {
        $type = 'ID';
        $rawName = substr( $selector, 1 );
        return true;
    }

    // A dot at position 0 was handled above, so any dot found here sits
    // somewhere after the tag name.
    if ( $dotPos !== false ) {
        $type = 'TAG_CLASS';
        $rawName = $selector;
        return true;
    }

    $hasBracket = strpos( $selector, '[' ) !== false || strpos( $selector, ']' ) !== false;
    if ( $hasBracket ) {
        throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
    }

    $type = 'TAG';
    $rawName = $selector;
    return true;
}
321 
/**
 * Transforms the queued CSS selectors into an internal representation
 * suitable for processing: an array keyed by selector type, each entry
 * listing the names to remove.
 *
 * @return array Lists under the keys 'ID', 'TAG', 'CLASS' and
 *   'TAG_CLASS'; every key is present even when its list is empty
 */
protected function parseItemsToRemove() {
    wfProfileIn( __METHOD__ );

    $removals = array(
        'ID' => array(),
        'TAG' => array(),
        'CLASS' => array(),
        'TAG_CLASS' => array(),
    );

    foreach ( $this->itemsToRemove as $selector ) {
        $type = '';
        $rawName = '';
        // parseSelector() fills $type and $rawName by reference.
        if ( $this->parseSelector( $selector, $type, $rawName ) ) {
            $removals[$type][] = $rawName;
        }
    }

    if ( $this->removeMedia ) {
        // Media removal is expressed as three extra tag selectors.
        foreach ( array( 'img', 'audio', 'video' ) as $mediaTag ) {
            $removals['TAG'][] = $mediaTag;
        }
    }

    wfProfileOut( __METHOD__ );
    return $removals;
}
352 }
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
HtmlFormatter\parseSelector
parseSelector( $selector, &$type, &$rawName)
Definition: HtmlFormatter.php:301
HtmlFormatter\removeElements
removeElements( $elements)
Removes a list of elements from DOMDocument.
Definition: HtmlFormatter.php:205
HtmlFormatter\$removeMedia
$removeMedia
Definition: HtmlFormatter.php:31
HtmlFormatter\$html
$html
Definition: HtmlFormatter.php:28
wfProfileIn
wfProfileIn( $functionname)
Begin profiling of a function.
Definition: Profiler.php:33
HtmlFormatter\parseItemsToRemove
parseItemsToRemove()
Transforms CSS selectors into an internal representation suitable for processing.
Definition: HtmlFormatter.php:325
HtmlFormatter\$elementsToFlatten
$elementsToFlatten
Definition: HtmlFormatter.php:30
HtmlFormatter\onHtmlReady
onHtmlReady( $html)
Override this in descendant class to modify HTML after it has been converted from DOM tree.
Definition: HtmlFormatter.php:56
HtmlFormatter\setRemoveMedia
setRemoveMedia( $flag=true)
Sets whether images/videos/sounds should be removed from output.
Definition: HtmlFormatter.php:90
HtmlFormatter\getText
getText( $element=null)
Performs final transformations and returns resulting HTML.
Definition: HtmlFormatter.php:252
MWException
MediaWiki exception.
Definition: MWException.php:26
wfProfileOut
wfProfileOut( $functionname='missing')
Stop profiling of a function.
Definition: Profiler.php:46
HtmlFormatter\$itemsToRemove
$itemsToRemove
Definition: HtmlFormatter.php:29
HtmlFormatter\__construct
__construct( $html)
Constructor.
Definition: HtmlFormatter.php:38
HtmlFormatter\flattenAllTags
flattenAllTags()
Instructs the formatter to flatten all tags.
Definition: HtmlFormatter.php:125
HtmlFormatter\wrapHTML
static wrapHTML( $html)
Turns a chunk of HTML into a proper document.
Definition: HtmlFormatter.php:47
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
ReplacementArray
Replacement array for FSS with fallback to strtr() Supports lazy initialisation of FSS resource.
Definition: StringUtils.php:411
HtmlFormatter\filterContent
filterContent()
Removes content we've chosen to remove.
Definition: HtmlFormatter.php:132
$selector
$selector
Definition: styleTest.css.php:43
wfIsWindows
wfIsWindows()
Check if the operating system is Windows.
Definition: GlobalFunctions.php:2524
HtmlFormatter\flatten
flatten( $elements)
Adds one or more element names to the list to flatten (remove tag, but not its content). Can accept und...
Definition: HtmlFormatter.php:118
HtmlFormatter
Definition: HtmlFormatter.php:23
HtmlFormatter\$doc
DOMDocument $doc
Definition: HtmlFormatter.php:26
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
HtmlFormatter\getDoc
getDoc()
Definition: HtmlFormatter.php:63
HtmlFormatter\fixLibXML
fixLibXML( $html)
libxml in its usual pointlessness converts many chars to entities - this function performs a reverse c...
Definition: HtmlFormatter.php:227
$type
$type
Definition: testCompression.php:46