MediaWiki  master
WikiTextStructure.php
Go to the documentation of this file.
1 <?php
2 
4 
12  private $openingText;
16  private $allText;
20  private $auxText = [];
24  private $parserOutput;
25 
30  // "it looks like you don't have javascript enabled..." – do not need to index
31  'audio', 'video',
32  // CSS stylesheets aren't content
33  'style',
34  // The [1] for references from Cite
35  'sup.reference',
36  // The ↑ next to references in the references section from Cite
37  '.mw-cite-backlink',
38  // Headings are already indexed in their own field.
39  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
40  // Collapsed fields are hidden by default so we don't want them showing up.
41  '.autocollapse',
42  // Content explicitly decided to be not searchable by editors such
43  // as custom navigation templates.
44  '.navigation-not-searchable',
45  // User-facing interface code prompting the user to act from WikibaseMediaInfo
46  '.wbmi-entityview-emptyCaption',
47  ];
48 
53  // Thumbnail captions aren't really part of the text proper
54  '.thumbcaption',
55  // Neither are tables
56  'table',
57  // Common style for "See also:".
58  '.rellink',
59  // Common style for calling out helpful links at the top of the article.
60  '.dablink',
61  // New class users can use to mark stuff as auxiliary to searches.
62  '.searchaux',
63  ];
64 
/**
 * Wrap a parser output so that its text structure can be explored.
 *
 * The constructor only stores the object; no text is extracted here.
 *
 * @param ParserOutput $parserOutput Parsed page to inspect
 */
public function __construct( ParserOutput $parserOutput ) {
	$this->parserOutput = $parserOutput;
}
71 
/**
 * Get headings on the page.
 *
 * Each section's 'line' value is cleaned up before it is returned:
 * bare <span> wrappers are dropped, numeric reference markers such as
 * <sup>[1]</sup> are removed, and all remaining tags are stripped.
 * Headings listed by getIgnoredHeadings() are left out.
 *
 * @return string[] Plain-text headings, in the order the sections are reported
 */
public function headings() {
	$headings = [];
	$ignoredHeadings = $this->getIgnoredHeadings();
	foreach ( $this->parserOutput->getSections() as $section ) {
		$heading = $section[ 'line' ];

		// Some wikis wrap the brackets in a span:
		// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Normalize [] so the following regexp would work.
		$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
		$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

		// Strip tags from the heading or else we'll display them (escaped) in search results
		$heading = trim( Sanitizer::stripAllTags( $heading ) );

		// Note that we don't take the level of the heading into account - all headings are equal.
		// Except the ones we ignore. The comparison is strict so that numeric-looking
		// headings (e.g. "1e3" vs "1000") are not treated as equal by loose comparison.
		if ( !in_array( $heading, $ignoredHeadings, true ) ) {
			$headings[] = $heading;
		}
	}
	return $headings;
}
109 
/**
 * Parse a message content into an array.
 *
 * The message is split into lines. Everything from a '#' to the end of
 * a line is treated as a comment and removed, surrounding whitespace is
 * trimmed, and lines that end up empty are dropped.
 *
 * Keys of the returned array are the original zero-based line indexes,
 * so they are not necessarily consecutive.
 *
 * @param string $message Raw message text, one setting per line
 * @return string[] Non-empty setting lines
 */
public static function parseSettingsInMessage( $message ) {
	$lines = explode( "\n", $message );
	$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
	$lines = array_map( 'trim', $lines ); // Remove extra spaces
	// Remove empty lines. An explicit comparison is used because the default
	// array_filter() callback would also discard a line consisting of "0".
	$lines = array_filter( $lines, static function ( $line ) {
		return $line !== '';
	} );
	return $lines;
}
124 
/**
 * Get the list of headings to ignore.
 *
 * The list is read from the 'search-ignored-headings' message in the
 * content language, falling back to the older
 * 'cirrussearch-ignored-headings' message when the former is blank.
 * The result is kept in a function-level static, so the messages are
 * only consulted on the first call.
 *
 * @return string[] Headings to leave out; empty when the message is disabled
 */
private function getIgnoredHeadings() {
	static $cachedHeadings = null;

	if ( $cachedHeadings !== null ) {
		return $cachedHeadings;
	}

	// Start from an empty list, as before, so the cache is populated even
	// if the message lookup below does not complete.
	$cachedHeadings = [];

	$message = wfMessage( 'search-ignored-headings' )->inContentLanguage();
	if ( $message->isBlank() ) {
		// Try old version too, just in case
		$message = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
	}

	// After comment stripping, each remaining line is a heading.
	$cachedHeadings = $message->isDisabled()
		? []
		: self::parseSettingsInMessage( $message->plain() );

	return $cachedHeadings;
}
145 
/**
 * Extract parts of the text - opening, main and auxiliary.
 *
 * Populates $this->openingText, $this->allText and $this->auxText.
 * The work is done at most once: a non-null $this->allText marks the
 * extraction as complete. For empty parser output only $this->allText
 * is set (to an empty string); the other fields keep their defaults.
 */
private function extractWikitextParts() {
	if ( $this->allText !== null ) {
		// Already extracted.
		return;
	}
	$text = $this->parserOutput->getText( [
		'enableSectionEditTokens' => false,
		'allowTOC' => false,
	] );
	if ( $text === '' ) {
		$this->allText = "";
		// empty text - nothing to seek here
		return;
	}

	$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

	$formatter = new HtmlFormatter( $text );

	// Strip elements from the page that we never want in the search text.
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->filterContent();

	// Strip elements from the page that are auxiliary text. These will still be
	// searched but matches will be ranked lower and non-auxiliary matches will be
	// preferred in highlighting.
	$formatter->remove( $this->auxiliaryElementSelectors );
	$auxiliaryElements = $formatter->filterContent();
	$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
	foreach ( $auxiliaryElements as $auxiliaryElement ) {
		$this->auxText[] =
			trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
	}
}
183 
/**
 * Get text before first heading.
 *
 * Looks for the first attribute-less <h1>..<h6> opening tag and returns
 * the plain text that precedes it, with the excluded and auxiliary
 * elements removed and all tags stripped.
 *
 * @param string $text Rendered HTML of the page
 * @return string|null Opening text, or null when there is no heading,
 *  nothing precedes the first heading, or nothing is left after filtering
 */
private function extractHeadingBeforeFirstHeading( $text ) {
	$matches = [];
	if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
		// There isn't a first heading so we interpret this as the article
		// being entirely without heading.
		return null;
	}
	$headingOffset = $matches[ 0 ][ 1 ];
	if ( $headingOffset === 0 ) {
		// There isn't any text before the first heading so we declare there isn't
		// a first heading.
		return null;
	}
	$text = substr( $text, 0, $headingOffset );

	$formatter = new HtmlFormatter( $text );
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->remove( $this->auxiliaryElementSelectors );
	$formatter->filterContent();
	$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

	// Compare against the empty string explicitly: a truthiness check would
	// also reject an opening text of "0".
	if ( $text === '' ) {
		// There isn't any text after filtering before the first heading so we declare
		// that there isn't a first heading.
		return null;
	}

	return $text;
}
217 
/**
 * Get opening text.
 *
 * Triggers the (one-time) text extraction if it has not happened yet.
 *
 * @return string|null Text preceding the first heading, or null when
 *  none was found
 */
public function getOpeningText() {
	$this->extractWikitextParts();
	return $this->openingText;
}
226 
/**
 * Get main text.
 *
 * Triggers the (one-time) text extraction if it has not happened yet.
 *
 * @return string Tag-stripped page text with the excluded and auxiliary
 *  elements removed; an empty string for empty parser output
 */
public function getMainText() {
	$this->extractWikitextParts();
	return $this->allText;
}
235 
/**
 * Get auxiliary text.
 *
 * Triggers the (one-time) text extraction if it has not happened yet.
 *
 * @return string[] One tag-stripped string per auxiliary element that was
 *  removed from the main text; empty when there were none
 */
public function getAuxiliaryText() {
	$this->extractWikitextParts();
	return $this->auxText;
}
244 
/**
 * Get the defaultsort property.
 *
 * @return mixed Whatever the parser output reports for the 'defaultsort'
 *  property. NOTE(review): presumably a string, or false when the
 *  property is unset - confirm against ParserOutput::getProperty().
 */
public function getDefaultSort() {
	$defaultSort = $this->parserOutput->getProperty( 'defaultsort' );
	return $defaultSort;
}
252 }
ParserOutput $parserOutput
string [] $auxiliaryElementSelectors
selectors to elements that are considered auxiliary to article text for search
getAuxiliaryText()
Get auxiliary text.
getMainText()
Get main text.
getOpeningText()
Get opening text.
static parseSettingsInMessage( $message)
Parse a message content into an array.
headings()
Get headings on the page.
Class for exploring the structure of parsed wikitext.
$source
extractHeadingBeforeFirstHeading( $text)
Get text before first heading.
static stripAllTags( $html)
Take a fragment of (potentially invalid) HTML and return a version with any tags removed, encoded as plain text.
Definition: Sanitizer.php:2041
string [] $excludedElementSelectors
selectors to elements that are excluded entirely from search
extractWikitextParts()
Extract parts of the text - opening, main and auxiliary.
getIgnoredHeadings()
Get the list of headings to ignore.
__construct(ParserOutput $parserOutput)
$lines
Definition: router.php:61
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
getDefaultSort()
Get the defaultsort property.
$matches