MediaWiki
REL1_34
ExtractFormatter.php
Go to the documentation of this file.
1
<?php
2
3
namespace
TextExtracts
;
4
5
use DOMElement;
6
use HtmlFormatter\HtmlFormatter;
7
13
/**
 * Provides text-only or limited-HTML extracts of page HTML.
 *
 * Builds on HtmlFormatter: the constructor configures which tags get
 * flattened, onHtmlReady() tags headings with section markers in plain-text
 * mode, and getText() applies the final plain-text clean-up.
 */
class ExtractFormatter extends HtmlFormatter {
	/**
	 * Byte pair 0x01 0x02 ("\1" and "\2" are octal escapes) emitted immediately
	 * before the heading level digit in plain-text mode.
	 */
	const SECTION_MARKER_START = "\1\2";

	/**
	 * Byte pair 0x02 0x01 emitted immediately after the heading level digit in
	 * plain-text mode.
	 */
	const SECTION_MARKER_END = "\2\1";

	/**
	 * @var bool Whether the extract is produced as plain text
	 */
	private $plainText;

	/**
	 * @param string $text HTML to build the extract from; it is passed through
	 *  HtmlFormatter::wrapHTML() before being handed to the parent constructor
	 * @param bool $plainText Whether the extract should be plain text
	 */
	public function __construct( $text, $plainText ) {
		parent::__construct( HtmlFormatter::wrapHTML( $text ) );
		$this->plainText = $plainText;

		$this->setRemoveMedia( true );

		if ( $plainText ) {
			// Plain text: no tag survives.
			$this->flattenAllTags();
		} else {
			// Limited HTML: only links are flattened.
			$this->flatten( [ 'a' ] );
		}
	}

	/**
	 * Performs final transformations (such as newline replacement for the
	 * plain-text option) and returns the resulting text.
	 *
	 * In plain-text mode the text is entity-decoded, non-breaking spaces become
	 * regular spaces, CRLF and bare CR line endings become a single LF, and
	 * runs of three or more newlines collapse to two. The result is trimmed in
	 * both modes.
	 *
	 * @param DOMElement|string|null $element Not used by this implementation;
	 *  accepted only so the signature matches the parent method
	 * @return string Filtered and trimmed extract text
	 */
	public function getText( $element = null ) {
		$this->filterContent();
		$text = parent::getText();
		if ( $this->plainText ) {
			$text = html_entity_decode( $text );
			// replace nbsp with space
			$text = str_replace( "\u{00A0}", ' ', $text );
			// for Windows: str_replace() applies the search strings in order, so
			// "\r\n" is turned into one "\n" before any leftover bare "\r" is
			// handled. Replacing "\r" alone would turn every CRLF into "\n\n".
			$text = str_replace( [ "\r\n", "\r" ], "\n", $text );
			// normalise newlines
			$text = preg_replace( "/\n{3,}/", "\n\n", $text );
		}
		return trim( $text );
	}

	/**
	 * Adjusts the serialised HTML before it is returned.
	 * NOTE(review): presumably called by HtmlFormatter once the HTML string is
	 * available - confirm against the parent class.
	 *
	 * In plain-text mode, any whitespace in front of an opening <h1>..<h6> tag
	 * is replaced with two newlines followed by SECTION_MARKER_START, the
	 * heading level digit and SECTION_MARKER_END; the start of the tag itself
	 * is kept. In limited-HTML mode the input is returned untouched.
	 *
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	public function onHtmlReady( $html ) {
		if ( $this->plainText ) {
			// $1 is the tag start ("<hN"), $2 the heading level digit.
			$html = preg_replace(
				'/\s*(<h([1-6])\b)/i',
				"\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
				$html
			);
		}
		return $html;
	}

	/**
	 * Removes content we've chosen to remove, then removes class and style
	 * attributes from the remaining span elements.
	 *
	 * @return mixed Whatever parent::filterContent() returned, passed through
	 *  unchanged (presumably the removed elements - confirm against
	 *  HtmlFormatter)
	 */
	public function filterContent() {
		$removed = parent::filterContent();

		$doc = $this->getDoc();
		$spans = $doc->getElementsByTagName( 'span' );

		foreach ( $spans as $span ) {
			$span->removeAttribute( 'class' );
			$span->removeAttribute( 'style' );
		}

		return $removed;
	}
}
TextExtracts\ExtractFormatter
Provides text-only or limited-HTML extracts of page HTML.
Definition
ExtractFormatter.php:13
TextExtracts\ExtractFormatter\getText
getText( $element=null)
Performs final transformations (such as newline replacement for plaintext option) and returns resulting text.
Definition
ExtractFormatter.php:47
TextExtracts\ExtractFormatter\__construct
__construct( $text, $plainText)
Definition
ExtractFormatter.php:26
TextExtracts\ExtractFormatter\$plainText
bool $plainText
Definition
ExtractFormatter.php:20
TextExtracts\ExtractFormatter\filterContent
filterContent()
Removes content we've chosen to remove then removes class and style attributes from the remaining span elements.
Definition
ExtractFormatter.php:82
TextExtracts\ExtractFormatter\onHtmlReady
onHtmlReady( $html)
Definition
ExtractFormatter.php:66
TextExtracts
Definition
ApiQueryExtracts.php:3
extensions
TextExtracts
includes
ExtractFormatter.php
Generated on Mon Nov 25 2024 16:04:45 for MediaWiki by
1.10.0