MediaWiki  1.34.0
ExtractFormatter.php
Go to the documentation of this file.
1 <?php
2 
3 namespace TextExtracts;
4 
5 use DOMElement;
6 use HtmlFormatter\HtmlFormatter;
7 
/**
 * Provides text-only or limited-HTML extracts of page HTML.
 */
class ExtractFormatter extends HtmlFormatter {
	/** Marker bytes placed before the heading level in plain-text output. */
	const SECTION_MARKER_START = "\1\2";
	/** Marker bytes placed after the heading level in plain-text output. */
	const SECTION_MARKER_END = "\2\1";

	/**
	 * @var bool Whether plain text (rather than limited HTML) is produced
	 */
	private $plainText;

	/**
	 * @param string $text Text to convert; it is passed through
	 *  HtmlFormatter::wrapHTML() before being handed to the parent constructor
	 * @param bool $plainText Whether extracts should be plain text
	 */
	public function __construct( $text, $plainText ) {
		parent::__construct( HtmlFormatter::wrapHTML( $text ) );
		$this->plainText = $plainText;

		$this->setRemoveMedia( true );

		if ( $plainText ) {
			// Plain text: no markup should survive.
			$this->flattenAllTags();
		} else {
			// Limited HTML: only links are flattened.
			$this->flatten( [ 'a' ] );
		}
	}

	/**
	 * Performs final transformations (such as newline replacement for the
	 * plaintext option) and returns the resulting text.
	 *
	 * @param mixed $element Not used by this implementation; the parent
	 *  getText() is always called without arguments
	 * @return string Processed text with leading/trailing whitespace trimmed
	 */
	public function getText( $element = null ) {
		$this->filterContent();
		$text = parent::getText();
		if ( $this->plainText ) {
			$text = html_entity_decode( $text );
			// replace nbsp with space
			$text = str_replace( "\u{00A0}", ' ', $text );
			// Normalise Windows (CRLF) and bare CR line endings to one LF each.
			// CRLF must be handled first: replacing "\r" alone would turn every
			// "\r\n" into "\n\n" and double the line breaks.
			$text = str_replace( [ "\r\n", "\r" ], "\n", $text );
			// normalise newlines: collapse runs of 3+ into a single blank line
			$text = preg_replace( "/\n{3,}/", "\n\n", $text );
		}
		return trim( $text );
	}

	/**
	 * In plain-text mode, prefixes every opening <h1>..<h6> tag with two
	 * newlines and a section marker (SECTION_MARKER_START, the heading level
	 * digit, SECTION_MARKER_END), consuming any whitespace that preceded the
	 * tag. In HTML mode the input is returned unchanged.
	 *
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	public function onHtmlReady( $html ) {
		if ( $this->plainText ) {
			$html = preg_replace( '/\s*(<h([1-6])\b)/i',
				"\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
				$html
			);
		}
		return $html;
	}

	/**
	 * Runs the parent's content filtering, then removes the class and style
	 * attributes from every span element left in the document.
	 *
	 * @return mixed Whatever parent::filterContent() returned (presumably the
	 *  removed elements; confirm against HtmlFormatter)
	 */
	public function filterContent() {
		$removed = parent::filterContent();

		$doc = $this->getDoc();
		$spans = $doc->getElementsByTagName( 'span' );

		// Only attributes are removed, never nodes, so iterating the node list
		// directly is safe here.
		foreach ( $spans as $span ) {
			$span->removeAttribute( 'class' );
			$span->removeAttribute( 'style' );
		}

		return $removed;
	}
}
TextExtracts\ExtractFormatter\getText
getText( $element=null)
Performs final transformations (such as newline replacement for plaintext option) and returns the resulting text.
Definition: ExtractFormatter.php:47
TextExtracts\ExtractFormatter\filterContent
filterContent()
Removes content we've chosen to remove then removes class and style attributes from the remaining spans.
Definition: ExtractFormatter.php:82
TextExtracts\ExtractFormatter\__construct
__construct( $text, $plainText)
Definition: ExtractFormatter.php:26
TextExtracts\ExtractFormatter\$plainText
bool $plainText
Definition: ExtractFormatter.php:20
TextExtracts
Definition: ApiQueryExtracts.php:3
TextExtracts\ExtractFormatter\onHtmlReady
onHtmlReady( $html)
Definition: ExtractFormatter.php:66
TextExtracts\ExtractFormatter
Provides text-only or limited-HTML extracts of page HTML.
Definition: ExtractFormatter.php:13