MediaWiki REL1_34
ExtractFormatter.php
Go to the documentation of this file.
1<?php
2
3namespace TextExtracts;
4
5use DOMElement;
6use HtmlFormatter\HtmlFormatter;
7
/**
 * Provides text-only or limited-HTML extracts of page HTML.
 *
 * In plain-text mode every tag is flattened and each heading is preceded by a
 * marker of the form SECTION_MARKER_START . <heading level> . SECTION_MARKER_END.
 * In limited-HTML mode only <a> tags are flattened.
 */
class ExtractFormatter extends HtmlFormatter {
	/** Two control bytes (0x01 0x02) emitted immediately before a heading level digit. */
	const SECTION_MARKER_START = "\1\2";
	/** Two control bytes (0x02 0x01) emitted immediately after a heading level digit. */
	const SECTION_MARKER_END = "\2\1";

	/**
	 * @var bool Only ever evaluated as a condition; truthy selects plain-text output.
	 */
	private $plainText;

	/**
	 * @param string $text HTML to extract from; it is passed through
	 *  HtmlFormatter::wrapHTML() before being handed to the parent constructor
	 * @param bool $plainText Truthy to produce plain text, falsy to keep limited HTML
	 */
	public function __construct( $text, $plainText ) {
		parent::__construct( HtmlFormatter::wrapHTML( $text ) );
		$this->plainText = $plainText;

		$this->setRemoveMedia( true );

		if ( $plainText ) {
			$this->flattenAllTags();
		} else {
			$this->flatten( [ 'a' ] );
		}
	}

	/**
	 * Performs final transformations (such as newline replacement for the
	 * plain-text option) and returns the resulting text.
	 *
	 * @param mixed $element Not used by this override: the parent's getText()
	 *  is always called without arguments. Kept for signature compatibility.
	 * @return string Extract with leading and trailing whitespace trimmed
	 */
	public function getText( $element = null ) {
		$this->filterContent();
		$text = parent::getText();

		if ( !$this->plainText ) {
			return trim( $text );
		}

		$text = html_entity_decode( $text );
		// Replace non-breaking spaces with ordinary spaces.
		$text = str_replace( "\u{00A0}", ' ', $text );
		// Normalise Windows (CRLF) and bare CR line endings to LF. CRLF has to be
		// replaced as a unit first, otherwise a single line ending would turn into
		// two newlines, i.e. a paragraph break.
		$text = str_replace( [ "\r\n", "\r" ], "\n", $text );
		// Collapse runs of three or more newlines into exactly two. preg_replace()
		// returns null on a PCRE error; keep the uncollapsed text in that case
		// rather than losing the whole extract.
		$text = preg_replace( "/\n{3,}/", "\n\n", $text ) ?? $text;

		return trim( $text );
	}

	/**
	 * Inserts section markers in front of headings when producing plain text.
	 *
	 * Any whitespace directly before an opening <h1>..<h6> tag is replaced by two
	 * newlines followed by SECTION_MARKER_START, the heading level digit and
	 * SECTION_MARKER_END; the opening tag itself is kept. In limited-HTML mode
	 * the input is returned untouched.
	 *
	 * NOTE(review): presumably invoked by the parent class once its HTML has been
	 * serialised - confirm against HtmlFormatter.
	 *
	 * @param string $html
	 * @return string
	 */
	public function onHtmlReady( $html ) {
		if ( !$this->plainText ) {
			return $html;
		}

		$replacement = "\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1';
		$marked = preg_replace( '/\s*(<h([1-6])\b)/i', $replacement, $html );

		// preg_replace() returns null on a PCRE error; returning that would
		// discard the entire document, so fall back to the unmarked HTML.
		return $marked ?? $html;
	}

	/**
	 * Removes content we've chosen to remove, then removes the class and style
	 * attributes from the remaining span elements.
	 *
	 * @return mixed Whatever the parent's filterContent() returned (presumably
	 *  the removed elements - confirm against HtmlFormatter)
	 */
	public function filterContent() {
		// The parent filter must run first so that only surviving spans are touched.
		$removed = parent::filterContent();

		/** @var DOMElement $span */
		foreach ( $this->getDoc()->getElementsByTagName( 'span' ) as $span ) {
			$span->removeAttribute( 'class' );
			$span->removeAttribute( 'style' );
		}

		return $removed;
	}
}
Provides text-only or limited-HTML extracts of page HTML.
getText( $element=null)
Performs final transformations (such as newline replacement for the plaintext option) and returns the resulting text.
filterContent()
Removes content we've chosen to remove, then removes class and style attributes from the remaining span elements.