MediaWiki 1.40.4
WikiTextStructure.php
Go to the documentation of this file.
1<?php
2
3use HtmlFormatter\HtmlFormatter;
4
/**
 * @var string|null Text found before the first heading; null when none was extracted
 */
private $openingText;

/**
 * @var string|null Main page text with tags stripped; null until extraction has run
 */
private $allText;

/**
 * @var string[] Tag-stripped text of each auxiliary element
 */
private $auxText = [];

/**
 * @var ParserOutput Parser output that all accessors of this class read from
 */
private $parserOutput;
25
/**
 * @var string[] Selectors for elements that are removed from the text before it is
 *  used as search content.
 */
private $excludedElementSelectors = [
	// Media fallbacks ("it looks like you don't have javascript enabled...") are not worth indexing
	'audio',
	'video',
	// Stylesheets are not content
	'style',
	// The [1] reference markers produced by Cite
	'sup.reference',
	// The ↑ back-links next to entries in the Cite references section
	'.mw-cite-backlink',
	// Headings are indexed in a field of their own
	'h1',
	'h2',
	'h3',
	'h4',
	'h5',
	'h6',
	// Collapsed fields are hidden by default, so keep them out of the text
	'.autocollapse',
	// Content that editors explicitly marked as not searchable,
	// for example custom navigation templates
	'.navigation-not-searchable',
	// WikibaseMediaInfo interface text prompting the user to act
	'.wbmi-entityview-emptyCaption',
];
48
/**
 * @var string[] Selectors for elements whose text is kept apart from the main text
 *  and collected as auxiliary text instead.
 */
private $auxiliaryElementSelectors = [
	// Thumbnail captions are not really part of the text proper
	'.thumbcaption',
	// Tables are not either
	'table',
	// Common style for "See also:"
	'.rellink',
	// Common style for helpful links called out at the top of the article
	'.dablink',
	// Class that users can apply to mark content as auxiliary for search
	'.searchaux',
];
64
/**
 * Store the parser output that the rest of this object reads from.
 *
 * @param ParserOutput $parserOutput Parsed page to explore
 */
public function __construct( ParserOutput $parserOutput ) {
	$this->parserOutput = $parserOutput;
}
71
/**
 * Get headings on the page.
 *
 * Every section line from the table-of-contents data is cleaned up (bare span
 * tags, numeric reference markers and any remaining markup are removed) and
 * collected, unless the cleaned text is one of the ignored headings.
 *
 * @return string[] Cleaned headings in table-of-contents order; an empty array
 *  when the parser output has no table-of-contents data.
 */
public function headings() {
	$tocData = $this->parserOutput->getTOCData();
	if ( $tocData === null ) {
		return [];
	}

	$ignoredHeadings = $this->getIgnoredHeadings();
	$headings = [];
	foreach ( $tocData->getSections() as $section ) {
		$heading = $section->line;

		// Some wikis wrap the brackets in a span:
		// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Normalize the bracket entities so the reference-marker regexp below matches.
		$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
		$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

		// Strip tags from the heading or else we'll display them (escaped) in search results
		$heading = trim( Sanitizer::stripAllTags( $heading ) );

		// The heading level is not taken into account - all headings are equal,
		// except the ones we ignore. The comparison is strict so that
		// numeric-looking headings (e.g. "1.0" and "1") are never treated as equal.
		if ( !in_array( $heading, $ignoredHeadings, true ) ) {
			$headings[] = $heading;
		}
	}

	return $headings;
}
113
/**
 * Parse a message content into an array.
 *
 * The message is split on newlines; on each line everything from the first
 * '#' onwards is dropped as a comment, surrounding whitespace is trimmed, and
 * lines that end up empty are discarded. A line whose whole content is "0" is
 * kept: only truly empty lines are removed.
 *
 * @param string $message Raw message text
 * @return string[] Remaining lines, keyed by their original zero-based line index
 */
public static function parseSettingsInMessage( $message ) {
	$lines = explode( "\n", $message );
	// Remove comments
	$lines = preg_replace( '/#.*$/', '', $lines );
	// Remove extra spaces
	$lines = array_map( 'trim', $lines );
	// Remove empty lines. An explicit comparison is used because array_filter()
	// without a callback would also discard the line "0".
	return array_filter( $lines, static function ( $line ) {
		return $line !== '';
	} );
}
128
/**
 * Get the list of headings to leave out of headings().
 *
 * The list is read from the 'search-ignored-headings' message in the content
 * language, falling back to the older 'cirrussearch-ignored-headings' message
 * when the first one is blank. The result is kept in a function-level static
 * variable, so the messages are only looked up on the first call.
 *
 * @return string[] Ignored headings; empty when the message is disabled
 */
private function getIgnoredHeadings() {
	static $ignoredHeadings = null;
	if ( $ignoredHeadings !== null ) {
		return $ignoredHeadings;
	}

	$ignoredHeadings = [];
	$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
	if ( $source->isBlank() ) {
		// Try old version too, just in case
		$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
	}
	if ( !$source->isDisabled() ) {
		// Strip comments and blank lines from the message text so that
		// only the headings themselves remain.
		$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
	}

	return $ignoredHeadings;
}
149
/**
 * Fill in the opening, main and auxiliary text from the parser output.
 *
 * Does nothing when the main text has already been set, so the extraction
 * work happens at most once per object.
 */
private function extractWikitextParts() {
	// A non-null main text marks the extraction as already done.
	if ( $this->allText !== null ) {
		return;
	}

	$renderOptions = [
		'enableSectionEditTokens' => false,
		'allowTOC' => false,
	];
	$html = $this->parserOutput->getText( $renderOptions );
	if ( $html === '' ) {
		// Nothing to look at in an empty page.
		$this->allText = "";
		return;
	}

	$this->openingText = $this->extractHeadingBeforeFirstHeading( $html );

	$formatter = new HtmlFormatter( $html );

	// First pass: drop the elements that must never appear in the search text.
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->filterContent();

	// Second pass: pull out the auxiliary elements. Their text is still
	// searched, but matches rank lower and non-auxiliary matches are preferred
	// for highlighting.
	$formatter->remove( $this->auxiliaryElementSelectors );
	$auxNodes = $formatter->filterContent();

	// What is left in the formatter is the main text.
	$mainHtml = $formatter->getText();
	$this->allText = trim( Sanitizer::stripAllTags( $mainHtml ) );

	foreach ( $auxNodes as $auxNode ) {
		$auxHtml = $formatter->getText( $auxNode );
		$this->auxText[] = trim( Sanitizer::stripAllTags( $auxHtml ) );
	}
}
186
/**
 * Get the text that comes before the first heading.
 *
 * Despite the method name, the result is not a heading: it is the part of
 * $text preceding the first attribute-less <h1>..<h6> opening tag, with the
 * excluded and auxiliary elements removed and all tags stripped.
 *
 * @param string $text HTML of the page
 * @return string|null The cleaned text, or null when $text contains no
 *  heading or when nothing is left in front of the first heading.
 */
private function extractHeadingBeforeFirstHeading( $text ) {
	$matches = [];
	if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
		// There isn't a first heading so we interpret this as the article
		// being entirely without heading.
		return null;
	}

	// $matches[0][1] is the byte offset of the heading tag within $text.
	$leadingHtml = substr( $text, 0, $matches[ 0 ][ 1 ] );
	// Compare against '' explicitly: a plain truthiness test would also
	// reject the string "0".
	if ( $leadingHtml === '' ) {
		// There isn't any text before the first heading so we declare there isn't
		// a first heading.
		return null;
	}

	$formatter = new HtmlFormatter( $leadingHtml );
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->remove( $this->auxiliaryElementSelectors );
	$formatter->filterContent();
	$openingText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

	if ( $openingText === '' ) {
		// There isn't any text after filtering before the first heading so we declare
		// that there isn't a first heading.
		return null;
	}

	return $openingText;
}
220
/**
 * Get the text before the first heading, running the extraction first if needed.
 *
 * @return string|null Null when no opening text was extracted
 */
public function getOpeningText() {
	$this->extractWikitextParts();
	return $this->openingText;
}
228
/**
 * Get the main text of the page, running the extraction first if needed.
 *
 * @return string Tag-stripped main text; an empty string for an empty page
 */
public function getMainText() {
	$this->extractWikitextParts();
	return $this->allText;
}
236
/**
 * Get the auxiliary text of the page, running the extraction first if needed.
 *
 * @return string[] Tag-stripped text of each auxiliary element
 */
public function getAuxiliaryText() {
	$this->extractWikitextParts();
	return $this->auxText;
}
244
/**
 * Get the defaultsort property.
 *
 * @return mixed|null Value of the 'defaultsort' page property, or null when
 *  the parser output reports false for it.
 */
public function getDefaultSort() {
	$defaultSort = $this->parserOutput->getPageProperty( 'defaultsort' );

	// Translate the "false" marker into null for callers.
	return $defaultSort === false ? null : $defaultSort;
}
256}
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Class allowing to explore structure of parsed wikitext.
getDefaultSort()
Get the defaultsort property.
static parseSettingsInMessage( $message)
Parse a message content into an array.
headings()
Get headings on the page.
__construct(ParserOutput $parserOutput)
$source
if(!file_exists( $CREDITS)) $lines