MediaWiki master
WikiTextStructure.php
Go to the documentation of this file.
1<?php
2
3use HtmlFormatter\HtmlFormatter;
6
11
/**
 * @var string|null Text found before the first heading, or null when none was extracted.
 */
private ?string $openingText = null;

/**
 * @var string|null Main text of the page with markup stripped; null until
 *  extractWikitextParts() has run.
 */
private ?string $allText = null;

/**
 * @var string[] Text of each auxiliary element that was removed from the main text.
 */
private array $auxText = [];

/**
 * @var ParserOutput Parser output given to the constructor.
 */
private ParserOutput $parserOutput;
17
/**
 * Selectors of elements that are removed from the page before any search
 * text is extracted from it.
 */
private const EXCLUDED_ELEMENT_SELECTORS = [
	// Media players: the "it looks like you don't have javascript enabled..."
	// text does not need to be indexed.
	'audio', 'video',
	// Stylesheets are CSS, not page content.
	'style',
	// Cite: the [1] reference markers.
	'sup.reference',
	// Cite: the ↑ shown next to references in the references section.
	'.mw-cite-backlink',
	// Headings already have their own index field.
	'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
	// Collapsed fields are hidden by default, so keep them out of the results.
	'.autocollapse',
	// Content that editors explicitly marked as not searchable, such as
	// custom navigation templates.
	'.navigation-not-searchable',
	// WikibaseMediaInfo: user-facing interface text prompting the user to act.
	'.wbmi-entityview-emptyCaption',
];
40
/**
 * Selectors of elements whose text is kept apart from the main text and
 * collected as auxiliary text instead.
 */
private const AUXILIARY_ELEMENT_SELECTORS = [
	// Thumbnail captions are not really part of the text proper.
	'.thumbcaption',
	'figcaption',
	// Tables are not part of the text proper either.
	'table',
	// Common style for "See also:".
	'.rellink',
	// Common style for helpful links called out at the top of the article.
	'.dablink',
	// Class that users can apply to mark content as auxiliary to searches.
	'.searchaux',
];
57
/**
 * Wrap a parser output so that its text structure can be inspected.
 *
 * @param ParserOutput $parserOutput Parser output that headings, text and
 *  page properties are read from.
 */
public function __construct( ParserOutput $parserOutput ) {
	$this->parserOutput = $parserOutput;
}
64
/**
 * Collect the section headings listed in the parser output's TOC data.
 *
 * Each heading line is cleaned up before it is returned: bare <span> tags and
 * <sup>[n]</sup> reference markers are dropped, then all remaining tags are
 * stripped and the result is trimmed. Headings that exactly match an entry
 * of the ignored-headings list are left out.
 *
 * @return string[] Cleaned heading texts in TOC order; empty when the parser
 *  output has no TOC data.
 */
public function headings() {
	$tocData = $this->parserOutput->getTOCData();
	if ( $tocData === null ) {
		return [];
	}

	$ignoredHeadings = $this->getIgnoredHeadings();
	$headings = [];
	foreach ( $tocData->getSections() as $section ) {
		$heading = $section->line;

		// Some wikis wrap the brackets in a span:
		// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Normalize the entity-encoded brackets so the next regexp can match them.
		$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
		$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

		// Strip tags from the heading or else we'll display them (escaped) in search results
		$heading = trim( Sanitizer::stripAllTags( $heading ) );

		// The heading level is not taken into account - all headings are equal,
		// except the ignored ones. The comparison is strict so that numeric-looking
		// headings are only ignored on an exact string match ("1e3" must not be
		// dropped because "1000" is on the list).
		if ( !in_array( $heading, $ignoredHeadings, true ) ) {
			$headings[] = $heading;
		}
	}
	return $headings;
}
109
/**
 * Parse a message content into an array of settings, one per line.
 *
 * Everything from the first "#" to the end of a line is treated as a comment
 * and removed, surrounding whitespace is trimmed, and lines that end up empty
 * are dropped. The keys of the returned array are the original zero-based
 * line indexes, so they may be non-contiguous.
 *
 * @param string $message Raw message text, lines separated by "\n".
 * @return string[] Non-empty setting lines, keyed by their original line index.
 */
public static function parseSettingsInMessage( $message ) {
	$lines = explode( "\n", $message );
	// Remove comments
	$lines = preg_replace( '/#.*$/', '', $lines );
	// Remove extra spaces
	$lines = array_map( 'trim', $lines );
	// Remove empty lines only. A bare array_filter() would also discard
	// the legitimate setting "0", because that string is falsy in PHP.
	return array_filter( $lines, static fn ( $line ) => $line !== '' );
}
126
/**
 * Get the list of heading texts that headings() must leave out.
 *
 * The list is read from the 'search-ignored-headings' message in the content
 * language, falling back to the older 'cirrussearch-ignored-headings' message
 * when the former is blank. The result is kept in a function-static variable,
 * so the messages are only looked up on the first call.
 *
 * @return string[] Ignored heading texts; empty when the message is disabled.
 */
private function getIgnoredHeadings() {
	static $ignoredHeadings = null;
	if ( $ignoredHeadings === null ) {
		$ignoredHeadings = [];
		$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
		if ( $source->isBlank() ) {
			// Try the old version too, just in case
			$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
		}
		if ( !$source->isDisabled() ) {
			// Turn the raw message text into one heading per line, without
			// comments or blank lines. Without this step the list would stay
			// empty and no heading would ever be ignored.
			$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
		}
	}
	return $ignoredHeadings;
}
148
/**
 * Populate the opening, main and auxiliary text from the parser output.
 *
 * The work is done only once: as soon as the main text has been set, later
 * calls return immediately.
 */
private function extractWikitextParts() {
	if ( $this->allText !== null ) {
		// Already extracted.
		return;
	}

	$html = $this->parserOutput->getRawText();
	if ( $html === '' ) {
		// Nothing to extract from an empty page.
		$this->allText = "";
		return;
	}

	$this->openingText = $this->extractTextBeforeFirstHeading( $html );

	$formatter = new HtmlFormatter( $html );

	// First pass: drop the elements that must never reach the search text.
	$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
	$formatter->filterContent();

	// Second pass: pull out the auxiliary elements. They are still searched,
	// but matches in them rank lower and non-auxiliary matches are preferred
	// for highlighting.
	$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
	$removedAuxiliary = $formatter->filterContent();

	// What is left in the formatter is the main text.
	$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

	foreach ( $removedAuxiliary as $element ) {
		$elementHtml = $formatter->getText( $element );
		$this->auxText[] = trim( Sanitizer::stripAllTags( $elementHtml ) );
	}
}
182
/**
 * Extract the plain text that precedes the first heading tag.
 *
 * Excluded and auxiliary elements are removed from that leading part before
 * its tags are stripped.
 *
 * @param string $text HTML of the page.
 * @return string|null The trimmed opening text, or null when the page has no
 *  heading, nothing precedes the first heading, or nothing is left after
 *  filtering.
 */
private function extractTextBeforeFirstHeading( $text ) {
	$matches = [];
	if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
		// There isn't a first heading, so we interpret this as the article
		// being entirely without heading.
		return null;
	}

	// With PREG_OFFSET_CAPTURE, $matches[0][1] is the byte offset of the heading tag.
	$leadHtml = substr( $text, 0, $matches[0][1] );
	// Compare against '' explicitly: a truthiness test would also reject "0".
	if ( $leadHtml === '' ) {
		// There isn't any text before the first heading, so we declare there isn't
		// a first heading.
		return null;
	}

	$formatter = new HtmlFormatter( $leadHtml );
	$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
	$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
	$formatter->filterContent();
	$leadText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

	if ( $leadText === '' ) {
		// There isn't any text after filtering before the first heading, so we declare
		// that there isn't a first heading.
		return null;
	}

	return $leadText;
}
216
/**
 * Get the text found before the first heading, extracting it on first use.
 *
 * @return string|null Opening text, or null when none was extracted.
 */
public function getOpeningText() {
	// Lazily fill the text fields; a no-op once they are populated.
	$this->extractWikitextParts();

	return $this->openingText;
}
224
/**
 * Get the main text of the page, extracting it on first use.
 *
 * @return string Tag-stripped text left after excluded and auxiliary
 *  elements were removed.
 */
public function getMainText() {
	// Lazily fill the text fields; a no-op once they are populated.
	$this->extractWikitextParts();

	return $this->allText;
}
232
/**
 * Get the auxiliary text pieces of the page, extracting them on first use.
 *
 * @return string[] Tag-stripped text of each removed auxiliary element.
 */
public function getAuxiliaryText() {
	// Lazily fill the text fields; a no-op once they are populated.
	$this->extractWikitextParts();

	return $this->auxText;
}
240
/**
 * Get the "defaultsort" property.
 *
 * @return string|null The page property value, with a false result from the
 *  parser output mapped to null.
 */
public function getDefaultSort() {
	$defaultSort = $this->parserOutput->getPageProperty( 'defaultsort' );

	return $defaultSort === false ? null : $defaultSort;
}
252}
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Rendered output of a wiki page, as parsed from wikitext.
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:46
Class allowing to explore the structure of parsed wikitext.
getDefaultSort()
Get the "defaultsort" property.
static parseSettingsInMessage( $message)
Parse a message content into an array.
headings()
Gets headings from the page.
__construct(ParserOutput $parserOutput)
$source
if(!file_exists( $CREDITS)) $lines