MediaWiki REL1_31
WikiTextStructure.php
Go to the documentation of this file.
1<?php
2
3use HtmlFormatter\HtmlFormatter;
4
12 private $openingText;
16 private $allText;
20 private $auxText = [];
25
30 // "it looks like you don't have javascript enabled..." – do not need to index
31 'audio', 'video',
32 // CSS stylesheets aren't content
33 'style',
34 // The [1] for references
35 'sup.reference',
36 // The ↑ next to references in the references section
37 '.mw-cite-backlink',
38 // Headings are already indexed in their own field.
39 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
40 // Collapsed fields are hidden by default so we don't want them showing up.
41 '.autocollapse',
42 // Content explicitly decided to be not searchable by editors such
43 // as custom navigation templates.
44 '.navigation-not-searchable'
45 ];
46
51 // Thumbnail captions aren't really part of the text proper
52 '.thumbcaption',
53 // Neither are tables
54 'table',
55 // Common style for "See also:".
56 '.rellink',
57 // Common style for calling out helpful links at the top of the article.
58 '.dablink',
59 // New class users can use to mark stuff as auxiliary to searches.
60 '.searchaux',
61 ];
62
67 $this->parserOutput = $parserOutput;
68 }
69
/**
 * Get headings on the page.
 *
 * Takes the 'line' entry of every section reported by the parser output,
 * removes reference markers and all markup from it, and drops headings that
 * appear in the ignored-headings list.
 *
 * @return string[] Plain-text headings in section order.
 */
public function headings() {
	$headings = [];
	$ignoredHeadings = $this->getIgnoredHeadings();
	foreach ( $this->parserOutput->getSections() as $section ) {
		$heading = $section['line'];

		// Some wikis wrap the brackets in a span:
		// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Normalize the entity-encoded brackets so the reference regexp below matches.
		$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
		// Drop reference markers such as <sup>[1]</sup>.
		$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

		// Strip tags from the heading or else we'll display them (escaped) in search results
		$heading = trim( Sanitizer::stripAllTags( $heading ) );

		// Note that we don't take the level of the heading into account - all headings are equal.
		// Except the ones we ignore. The comparison is strict so that numeric-looking
		// headings (e.g. "1e3" and "1000") are not considered equal to each other.
		if ( !in_array( $heading, $ignoredHeadings, true ) ) {
			$headings[] = $heading;
		}
	}
	return $headings;
}
107
/**
 * Parse a message content into an array.
 *
 * The message is split on newlines; everything from a '#' to the end of a
 * line is treated as a comment and removed, surrounding whitespace is
 * trimmed and lines that end up empty are dropped.
 *
 * @param string $message Raw message text, one setting per line.
 * @return string[] Remaining lines, keyed by their original (0-based) line index.
 */
public static function parseSettingsInMessage( $message ) {
	$lines = explode( "\n", $message );
	$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
	$lines = array_map( 'trim', $lines ); // Remove extra spaces
	// Remove empty lines only. A bare array_filter() would also discard a
	// line consisting of "0", because PHP treats that string as falsy.
	$lines = array_filter( $lines, function ( $line ) {
		return $line !== '';
	} );
	return $lines;
}
122
/**
 * Get list of headings to ignore.
 *
 * The list is read from the 'search-ignored-headings' message in the content
 * language, falling back to the older 'cirrussearch-ignored-headings' message
 * when the former is blank. The result is kept in a function-static variable,
 * so the messages are only consulted on the first call.
 *
 * @return string[] Headings that must not be indexed; empty when the message is disabled.
 */
private function getIgnoredHeadings() {
	static $ignoredHeadings = null;
	if ( $ignoredHeadings === null ) {
		$ignoredHeadings = [];
		$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
		if ( $source->isBlank() ) {
			// Try old version too, just in case
			$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
		}
		if ( !$source->isDisabled() ) {
			// Strip comments and blank lines from the message text:
			// what is left is one heading per line.
			$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
		}
	}
	return $ignoredHeadings;
}
143
/**
 * Extract parts of the text - opening, main and auxiliary.
 *
 * Fills $this->openingText, $this->allText and $this->auxText from the parser
 * output. The work is done once: later calls return immediately because
 * $this->allText is no longer null.
 */
private function extractWikitextParts() {
	if ( $this->allText !== null ) {
		// Already extracted.
		return;
	}
	$text = $this->parserOutput->getText( [
		'enableSectionEditTokens' => false,
		'allowTOC' => false,
	] );
	if ( strlen( $text ) === 0 ) {
		// empty text - nothing to seek here
		$this->allText = "";
		return;
	}

	// Must run on the unmodified HTML, before the <br> spacing tweak below.
	$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

	// Add extra spacing around break tags so text crammed together like<br>this
	// doesn't make one word.
	$text = str_replace( '<br', "\n<br", $text );

	$formatter = new HtmlFormatter( $text );

	// Strip elements from the page that we never want in the search text.
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->filterContent();

	// Strip elements from the page that are auxiliary text. These will still be
	// searched but matches will be ranked lower and non-auxiliary matches will be
	// preferred in highlighting.
	$formatter->remove( $this->auxiliaryElementSelectors );
	$auxiliaryElements = $formatter->filterContent();

	// Whatever is left after both filtering passes is the main text.
	$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
	foreach ( $auxiliaryElements as $auxiliaryElement ) {
		$this->auxText[] =
			trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
	}
}
185
/**
 * Get text before first heading.
 *
 * Cuts the HTML at the first attribute-less <h1>..<h6> opening tag, removes
 * the excluded and auxiliary elements from that leading part and returns
 * what remains as plain text.
 *
 * @param string $text Rendered HTML of the page.
 * @return string|null Plain opening text, or null when the page has no
 *  heading or nothing remains before the first one.
 */
private function extractHeadingBeforeFirstHeading( $text ) {
	$matches = [];
	if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
		// There isn't a first heading so we interpret this as the article
		// being entirely without heading.
		return null;
	}
	// $matches[0][1] is the byte offset of the first heading tag.
	$text = substr( $text, 0, $matches[0][1] );
	if ( $text === false || $text === '' ) {
		// There isn't any text before the first heading so we declare there isn't
		// a first heading.
		return null;
	}

	$formatter = new HtmlFormatter( $text );
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->remove( $this->auxiliaryElementSelectors );
	$formatter->filterContent();
	$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

	// Compare against the empty string explicitly: an opening text of "0" is
	// falsy in PHP but is still real content.
	if ( $text === '' ) {
		// There isn't any text after filtering before the first heading so we declare
		// that there isn't a first heading.
		return null;
	}

	return $text;
}
219
/**
 * Get opening text.
 *
 * @return string|null Text found before the first heading; null when none was extracted.
 */
public function getOpeningText() {
	// Make sure the parsed text has been split into its parts (no-op after the first call).
	$this->extractWikitextParts();

	$opening = $this->openingText;
	return $opening;
}
228
/**
 * Get main text.
 *
 * @return string Page text with excluded and auxiliary elements removed.
 */
public function getMainText() {
	// Make sure the parsed text has been split into its parts (no-op after the first call).
	$this->extractWikitextParts();

	$main = $this->allText;
	return $main;
}
237
/**
 * Get auxiliary text.
 *
 * @return string[] Plain text of every element that matched an auxiliary selector.
 */
public function getAuxiliaryText() {
	// Make sure the parsed text has been split into its parts (no-op after the first call).
	$this->extractWikitextParts();

	$auxiliary = $this->auxText;
	return $auxiliary;
}
246
/**
 * Get the defaultsort property.
 *
 * @return mixed Whatever the parser output reports for its 'defaultsort' property.
 */
public function getDefaultSort() {
	$sortKey = $this->parserOutput->getProperty( 'defaultsort' );
	return $sortKey;
}
254}
Class that allows exploring the structure of parsed wikitext.
getDefaultSort()
Get the defaultsort property.
extractHeadingBeforeFirstHeading( $text)
Get text before first heading.
string[] $auxiliaryElementSelectors
selectors to elements that are considered auxiliary to article text for search
ParserOutput $parserOutput
static parseSettingsInMessage( $message)
Parse a message content into an array.
extractWikitextParts()
Extract parts of the text - opening, main and auxiliary.
getOpeningText()
Get opening text.
getMainText()
Get main text.
string[] $excludedElementSelectors
selectors to elements that are excluded entirely from search
headings()
Get headings on the page.
getIgnoredHeadings()
Get the list of headings to ignore.
getAuxiliaryText()
Get auxiliary text.
__construct(ParserOutput $parserOutput)
either a unescaped string or a HtmlArmor object after in associative array form externallinks including delete and has completed for all link tables whether this was an auto creation default is conds Array Extra conditions for the No matching items in log is displayed if loglist is empty msgKey Array If you want a nice box with a set this to the key of the message First element is the message additional optional elements are parameters for the key that are processed with wfMessage() -> params() ->parseAsBlock() - offset Set to overwrite offset parameter in $wgRequest set to '' to unset offset - wrap String Wrap the message in html(usually something like "&lt;div ...>$1&lt;/div>"). - flags Integer display flags(NO_ACTION_LINK, NO_EXTRA_USER_LINKS) 'LogException':Called before an exception(or PHP error) is logged. This is meant for integration with external error aggregation services
$source
$lines
Definition router.php:61