MediaWiki master
WikiTextStructure.php
Go to the documentation of this file.
1<?php
2
3use HtmlFormatter\HtmlFormatter;
5
/**
 * @var string|null Text found before the first heading, as produced by
 *  extractTextBeforeFirstHeading(); null when no such text was found.
 */
13 private $openingText;
/**
 * @var string|null Tag-stripped page text left after excluded and auxiliary
 *  elements are removed; null until extractWikitextParts() has run.
 */
17 private $allText;
/**
 * @var string[] Tag-stripped text of each element that was removed as auxiliary.
 */
21 private $auxText = [];
/**
 * @var ParserOutput The parser output handed to the constructor.
 */
25 private $parserOutput;
26
/**
 * @var string[] Selectors for elements that are removed from the HTML before
 *  any text is extracted (passed to HtmlFormatter::remove()).
 */
30 private $excludedElementSelectors = [
31 // "it looks like you don't have javascript enabled..." – do not need to index
32 'audio', 'video',
33 // CSS stylesheets aren't content
34 'style',
35 // The [1] for references from Cite
36 'sup.reference',
37 // The ↑ next to references in the references section from Cite
38 '.mw-cite-backlink',
39 // Headings are already indexed in their own field.
40 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
41 // Collapsed fields are hidden by default so we don't want them showing up.
42 '.autocollapse',
43 // Content explicitly decided to be not searchable by editors such
44 // as custom navigation templates.
45 '.navigation-not-searchable',
46 // User-facing interface code prompting the user to act from WikibaseMediaInfo
47 '.wbmi-entityview-emptyCaption',
48 ];
49
/**
 * @var string[] Selectors for elements whose text is pulled out of the main
 *  text and collected separately as auxiliary text (passed to
 *  HtmlFormatter::remove() after the excluded elements are gone).
 */
53 private $auxiliaryElementSelectors = [
54 // Thumbnail captions aren't really part of the text proper
55 '.thumbcaption',
56 'figcaption',
57 // Neither are tables
58 'table',
59 // Common style for "See also:".
60 '.rellink',
61 // Common style for calling out helpful links at the top of the article.
62 '.dablink',
63 // New class users can use to mark stuff as auxiliary to searches.
64 '.searchaux',
65 ];
66
/**
 * Wrap a parser output so its headings and text parts can be explored.
 *
 * Nothing is extracted here; the text parts are computed on first access
 * through the getters.
 *
 * @param ParserOutput $parserOutput Parser output to read from
 */
public function __construct( ParserOutput $parserOutput ) {
	$this->parserOutput = $parserOutput;
}
73
/**
 * Get headings on the page.
 *
 * Each table-of-contents section line is cleaned up (attribute-less <span>
 * tags dropped, numeric reference markers such as <sup>[1]</sup> removed,
 * all remaining tags stripped, surrounding whitespace trimmed) and kept
 * unless it appears in the list of ignored headings.
 *
 * @return string[] Cleaned heading texts in table-of-contents order; empty
 *  when the parser output carries no TOC data
 */
public function headings() {
	$tocData = $this->parserOutput->getTOCData();
	if ( $tocData === null ) {
		return [];
	}

	$ignoredHeadings = $this->getIgnoredHeadings();
	$headings = [];
	foreach ( $tocData->getSections() as $section ) {
		$heading = $section->line;

		// Some wikis wrap the brackets in a span:
		// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Normalize [] so the following regexp would work.
		$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
		$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

		// Strip tags from the heading or else we'll display them (escaped) in search results
		$heading = trim( Sanitizer::stripAllTags( $heading ) );

		// Note that we don't take the level of the heading into account - all headings are equal.
		// Except the ones we ignore. The comparison is strict: a loose comparison treats
		// numeric-looking strings as numbers, so a heading "2.0" would wrongly match an
		// ignored heading "2".
		if ( !in_array( $heading, $ignoredHeadings, true ) ) {
			$headings[] = $heading;
		}
	}
	return $headings;
}
115
/**
 * Parse a message content into an array.
 *
 * The message is split on newlines; everything from a '#' to the end of a
 * line is treated as a comment and dropped; each line is trimmed; lines that
 * end up empty are discarded.
 *
 * @param string $message Raw message text, one setting per line
 * @return string[] Remaining non-empty lines. Keys are the original line
 *  indexes, so the array may have gaps.
 */
public static function parseSettingsInMessage( $message ) {
	$lines = explode( "\n", $message );
	// Remove comments
	$lines = preg_replace( '/#.*$/', '', $lines );
	// Remove extra spaces
	$lines = array_map( 'trim', $lines );
	// Remove empty lines. Compare against '' explicitly: array_filter() without a
	// callback drops every falsy value, which would also discard a line reading "0".
	return array_filter( $lines, static function ( $line ) {
		return $line !== '';
	} );
}
130
/**
 * Get the list of heading texts that should not be reported by headings().
 *
 * The list is read from the 'search-ignored-headings' message in the content
 * language, falling back to the older 'cirrussearch-ignored-headings' message
 * when the former is blank. The result is kept in a function-static variable,
 * so the messages are looked up only on the first call.
 *
 * @return string[] Ignored heading texts; empty when the message is disabled
 */
private function getIgnoredHeadings() {
	static $ignoredHeadings = null;
	if ( $ignoredHeadings === null ) {
		$ignoredHeadings = [];
		$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
		if ( $source->isBlank() ) {
			// Try old version too, just in case
			$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
		}
		if ( !$source->isDisabled() ) {
			// Parse the raw message text; without this step the list of
			// lines would be read from an undefined variable.
			$lines = self::parseSettingsInMessage( $source->plain() );
			// Now we just have headings!
			$ignoredHeadings = $lines;
		}
	}
	return $ignoredHeadings;
}
151
/**
 * Populate the opening text, main text and auxiliary text from the parser output.
 *
 * Does nothing when the main text has already been computed, so the getters
 * can call it unconditionally.
 */
private function extractWikitextParts() {
	if ( $this->allText !== null ) {
		// Already extracted on an earlier call.
		return;
	}

	$html = $this->parserOutput->getText( [
		'enableSectionEditTokens' => false,
		'allowTOC' => false,
	] );
	if ( $html === '' ) {
		// Empty text - nothing to seek here. Opening and auxiliary text keep
		// their initial values.
		$this->allText = "";
		return;
	}

	$this->openingText = $this->extractTextBeforeFirstHeading( $html );

	$htmlFormatter = new HtmlFormatter( $html );

	// First pass: drop the elements we never want in the search text.
	$htmlFormatter->remove( $this->excludedElementSelectors );
	$htmlFormatter->filterContent();

	// Second pass: pull out the auxiliary elements. They are still searched,
	// but matches are ranked lower and non-auxiliary matches are preferred
	// in highlighting.
	$htmlFormatter->remove( $this->auxiliaryElementSelectors );
	$removedAuxElements = $htmlFormatter->filterContent();

	// Whatever remains in the formatter is the main text.
	$mainHtml = $htmlFormatter->getText();
	$this->allText = trim( Sanitizer::stripAllTags( $mainHtml ) );

	foreach ( $removedAuxElements as $element ) {
		$auxHtml = $htmlFormatter->getText( $element );
		$this->auxText[] = trim( Sanitizer::stripAllTags( $auxHtml ) );
	}
}
188
/**
 * Extract the plain text that precedes the first heading of the page.
 *
 * @param string $text Rendered HTML of the page
 * @return string|null Tag-stripped, trimmed text before the first heading,
 *  or null when there is no heading, nothing precedes it, or nothing is
 *  left after the excluded and auxiliary elements are removed
 */
private function extractTextBeforeFirstHeading( $text ) {
	$headingMatch = [];
	$found = preg_match( '/<h[123456]>/', $text, $headingMatch, PREG_OFFSET_CAPTURE );
	if ( !$found ) {
		// No heading at all: the article is treated as being entirely
		// without heading, hence without opening text.
		return null;
	}

	// With PREG_OFFSET_CAPTURE each match is [ matched string, byte offset ].
	$headingOffset = $headingMatch[ 0 ][ 1 ];
	$leadHtml = substr( $text, 0, $headingOffset );
	if ( !$leadHtml ) {
		// Nothing precedes the first heading.
		return null;
	}

	$formatter = new HtmlFormatter( $leadHtml );
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->remove( $this->auxiliaryElementSelectors );
	$formatter->filterContent();
	$leadText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

	// An empty result after filtering also counts as "no opening text".
	return $leadText ?: null;
}
222
/**
 * Get the text found before the first heading.
 *
 * Triggers extraction of the text parts if that has not happened yet.
 *
 * @return string|null Opening text, or null when none was found
 */
public function getOpeningText() {
	$this->extractWikitextParts();

	return $this->openingText;
}
230
/**
 * Get the main text of the page, with excluded and auxiliary elements removed.
 *
 * Triggers extraction of the text parts if that has not happened yet.
 *
 * @return string Tag-stripped main text; empty string for an empty page
 */
public function getMainText() {
	$this->extractWikitextParts();

	return $this->allText;
}
238
/**
 * Get the text of the elements that were set aside as auxiliary.
 *
 * Triggers extraction of the text parts if that has not happened yet.
 *
 * @return string[] One tag-stripped string per removed auxiliary element
 */
public function getAuxiliaryText() {
	$this->extractWikitextParts();

	return $this->auxText;
}
246
/**
 * Get the defaultsort property.
 *
 * @return mixed|null Value of the 'defaultsort' page property, or null when
 *  the parser output reports it as false
 */
public function getDefaultSort() {
	$defaultSort = $this->parserOutput->getPageProperty( 'defaultsort' );

	return $defaultSort === false ? null : $defaultSort;
}
258}
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:46
Rendered output of a wiki page, as parsed from wikitext.
Class allowing to explore structure of parsed wikitext.
getDefaultSort()
Get the defaultsort property.
static parseSettingsInMessage( $message)
Parse a message content into an array.
headings()
Get headings on the page.
__construct(ParserOutput $parserOutput)
$source
if(!file_exists( $CREDITS)) $lines