MediaWiki master
WikiTextStructure.php
Go to the documentation of this file.
1<?php
2
3namespace MediaWiki\Content;
4
7use Wikimedia\Parsoid\Utils\DOMCompat;
8use Wikimedia\Parsoid\Utils\DOMUtils;
9
14
/** @var string|null Text found before the first heading; null when not extracted or when there is none */
private ?string $openingText = null;

/** @var string|null Main text of the page; null until the extraction has run */
private ?string $allText = null;

/** @var string[] Stripped text of each element matched by an auxiliary selector */
private array $auxText = [];

/** @var ParserOutput Parser output passed to the constructor */
private ParserOutput $parserOutput;
20
/**
 * Selectors for elements that are removed from the document before any
 * text is extracted.
 */
private const EXCLUDED_ELEMENT_SELECTORS = [
	// Media elements: the "it looks like you don't have javascript enabled..."
	// notice does not need to be indexed.
	'audio',
	'video',
	// Stylesheets are not content.
	'style',
	// The [1] reference markers from Cite.
	'sup.reference',
	// The ↑ backlink next to each entry in the references section from Cite.
	'.mw-cite-backlink',
	// Headings are already indexed in their own field.
	'h1',
	'h2',
	'h3',
	'h4',
	'h5',
	'h6',
	// Collapsed fields are hidden by default, so they should not show up.
	'.autocollapse',
	// Content that editors explicitly marked as not searchable, such as
	// custom navigation templates.
	'.navigation-not-searchable',
	// User-facing interface prompt from WikibaseMediaInfo.
	'.wbmi-entityview-emptyCaption',
];
43
/**
 * Selectors for elements whose text is collected as auxiliary text and
 * then removed from the main text.
 */
private const AUXILIARY_ELEMENT_SELECTORS = [
	// Thumbnail captions are not really part of the text proper.
	'.thumbcaption',
	'figcaption',
	// Neither are tables.
	'table',
	// Common style for "See also:".
	'.rellink',
	// Common style for helpful links called out at the top of the article.
	'.dablink',
	// Class that users can apply to mark content as auxiliary to searches.
	'.searchaux',
];
60
/**
 * @param ParserOutput $parserOutput Parser output whose structure is explored
 */
public function __construct( ParserOutput $parserOutput ) {
	$this->parserOutput = $parserOutput;
}
64
/**
 * Gets headings from the page.
 *
 * Every section line from the table-of-contents data is cleaned of
 * reference markers and HTML tags. Headings that appear in the list
 * returned by getIgnoredHeadings() are left out.
 *
 * @return string[] Cleaned heading texts in section order; empty when the
 *  parser output has no TOC data
 */
public function headings() {
	$headings = [];
	$tocData = $this->parserOutput->getTOCData();
	if ( $tocData === null ) {
		return $headings;
	}
	$ignoredHeadings = $this->getIgnoredHeadings();
	foreach ( $tocData->getSections() as $section ) {
		$heading = $section->line;

		// Some wikis wrap the brackets in a span:
		// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Normalize [] so the following regexp would work.
		$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
		// Drop reference markers such as <sup>[1]</sup>.
		$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

		// Strip tags from the heading or else we'll display them (escaped) in search results
		$heading = trim( Sanitizer::stripAllTags( $heading ) );

		// Note that we don't take the level of the heading into account - all headings are equal.
		// Except the ones we ignore. The comparison is strict so that numeric-looking
		// headings (e.g. "1.0" vs. an ignored "1") are not treated as equal.
		if ( !in_array( $heading, $ignoredHeadings, true ) ) {
			$headings[] = $heading;
		}
	}

	return $headings;
}
111
/**
 * Parse a message content into an array.
 *
 * The message is split on newlines. On each line everything from the first
 * '#' onwards is dropped as a comment and surrounding whitespace is trimmed.
 * Lines that end up empty are discarded; the remaining entries keep their
 * original line index as array key.
 *
 * @param string $message
 * @return string[]
 */
public static function parseSettingsInMessage( $message ) {
	$lines = explode( "\n", $message );
	// Remove comments
	$lines = preg_replace( '/#.*$/', '', $lines );
	// Remove extra spaces
	$lines = array_map( 'trim', $lines );

	// Remove empty lines only. A bare array_filter() would also discard a
	// line consisting of "0", because PHP treats that string as falsy.
	return array_filter( $lines, static function ( $line ) {
		return $line !== '';
	} );
}
130
/**
 * Get the list of heading texts that headings() leaves out.
 *
 * The list is read from the 'search-ignored-headings' message in the content
 * language and kept in a function-level static variable, so the message is
 * only looked up on the first call.
 *
 * @return string[] Ignored headings; empty when the message is disabled
 */
private function getIgnoredHeadings() {
	static $ignoredHeadings = null;
	if ( $ignoredHeadings !== null ) {
		return $ignoredHeadings;
	}

	$ignoredHeadings = [];
	$message = wfMessage( 'search-ignored-headings' )->inContentLanguage();
	if ( $message->isDisabled() ) {
		return $ignoredHeadings;
	}

	// After parsing, only the heading texts are left.
	$ignoredHeadings = self::parseSettingsInMessage( $message->plain() );

	return $ignoredHeadings;
}
150
/**
 * Fill in the opening text, auxiliary text and main text from the raw text
 * of the parser output.
 *
 * Returns immediately when the main text has already been set.
 */
private function extractWikitextParts() {
	if ( $this->allText !== null ) {
		// Extraction has already run.
		return;
	}

	$html = $this->parserOutput->getRawText();
	if ( $html === '' ) {
		// Empty text - there is nothing to look for.
		$this->allText = "";

		return;
	}

	$this->openingText = $this->extractTextBeforeFirstHeading( $html );

	$document = DOMUtils::parseHTML( $html );

	// Remove the elements that should never end up in the search text.
	foreach ( self::EXCLUDED_ELEMENT_SELECTORS as $excludedSelector ) {
		$excludedNodes = DOMCompat::querySelectorAll( $document, $excludedSelector );
		foreach ( $excludedNodes as $node ) {
			$node->parentNode->removeChild( $node );
		}
	}

	// Pull out the elements that hold auxiliary text. These will still be
	// searched, but matches will be ranked lower and non-auxiliary matches
	// will be preferred in highlighting.
	foreach ( self::AUXILIARY_ELEMENT_SELECTORS as $auxiliarySelector ) {
		$auxiliaryNodes = DOMCompat::querySelectorAll( $document, $auxiliarySelector );
		foreach ( $auxiliaryNodes as $node ) {
			$innerHtml = DOMCompat::getInnerHTML( $node );
			$this->auxText[] = trim( Sanitizer::stripAllTags( $innerHtml ) );
			$node->parentNode->removeChild( $node );
		}
	}

	// Whatever is left in the body is the main text.
	$bodyHtml = DOMCompat::getInnerHTML( DOMCompat::getBody( $document ) );
	$this->allText = trim( Sanitizer::stripAllTags( $bodyHtml ) );
}
189
/**
 * Get the plain text that precedes the first heading tag in the given HTML.
 *
 * Elements matching the excluded or auxiliary selectors are removed before
 * the tags are stripped.
 *
 * @param string $text HTML to search for the first <h1>..<h6> tag
 * @return string|null The stripped text before the first heading, or null
 *  when there is no heading or no text is left before it
 */
private function extractTextBeforeFirstHeading( $text ) {
	$matches = [];
	if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
		// There isn't a first heading, so we interpret this as the article
		// being entirely without heading.
		return null;
	}
	// The captured offset is a byte offset, which is what substr() expects.
	$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
	// Compare against '' explicitly: a plain truthiness check would also
	// reject the text "0".
	if ( $text === '' ) {
		// There isn't any text before the first heading, so we declare there isn't
		// a first heading.
		return null;
	}

	$doc = DOMUtils::parseHTML( $text );
	foreach ( array_merge( self::EXCLUDED_ELEMENT_SELECTORS, self::AUXILIARY_ELEMENT_SELECTORS ) as $selector ) {
		foreach ( DOMCompat::querySelectorAll( $doc, $selector ) as $element ) {
			$element->parentNode->removeChild( $element );
		}
	}

	$text = trim( Sanitizer::stripAllTags( DOMCompat::getInnerHTML( DOMCompat::getBody( $doc ) ) ) );

	if ( $text === '' ) {
		// There isn't any text after filtering before the first heading, so we declare
		// that there isn't a first heading.
		return null;
	}

	return $text;
}
228
/**
 * Get the text before the first heading, running the extraction first if needed.
 *
 * @return string|null Opening text, or null when none was found
 */
public function getOpeningText() {
	$this->extractWikitextParts();
	return $this->openingText;
}
237
/**
 * Get the main text, running the extraction first if needed.
 *
 * @return string|null Text left after excluded and auxiliary elements were removed
 */
public function getMainText() {
	$this->extractWikitextParts();
	return $this->allText;
}
246
/**
 * Get the auxiliary text pieces, running the extraction first if needed.
 *
 * @return string[] One entry per element matched by an auxiliary selector
 */
public function getAuxiliaryText() {
	$this->extractWikitextParts();
	return $this->auxText;
}
255
/**
 * Get the "defaultsort" property.
 *
 * @return mixed The value of the 'defaultsort' page property, with a value
 *  of false mapped to null
 */
public function getDefaultSort() {
	$defaultSort = $this->parserOutput->getPageProperty( 'defaultsort' );

	return $defaultSort === false ? null : $defaultSort;
}
269}
270
// Also make the class available under the un-namespaced name 'WikiTextStructure'.
class_alias( WikiTextStructure::class, 'WikiTextStructure' );
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Class that allows exploring the structure of parsed wikitext.
__construct(ParserOutput $parserOutput)
headings()
Gets headings from the page.
static parseSettingsInMessage( $message)
Parse a message content into an array.
getDefaultSort()
Get the "defaultsort" property.
ParserOutput is a rendering of a Content object or a message.
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:32
$source