Code Coverage for /workspace/src/includes/content/WikiTextStructure.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	76.47% covered (warning)	76.47%	52 / 68	40.00% covered (danger)	40.00%	4 / 10	CRAP	0.00% covered (danger)	0.00%	0 / 1
WikiTextStructure	77.61% covered (warning)	77.61%	52 / 67	40.00% covered (danger)	40.00%	4 / 10	27.43	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
headings	92.86% covered (success)	92.86%	13 / 14	0.00% covered (danger)	0.00%	0 / 1	4.01
parseSettingsInMessage	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	2
getIgnoredHeadings	37.50% covered (danger)	37.50%	3 / 8	0.00% covered (danger)	0.00%	0 / 1	5.20
extractWikitextParts	87.50% covered (warning)	87.50%	14 / 16	0.00% covered (danger)	0.00%	0 / 1	4.03
extractTextBeforeFirstHeading	85.71% covered (warning)	85.71%	12 / 14	0.00% covered (danger)	0.00%	0 / 1	4.05
getOpeningText	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
getMainText	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
getAuxiliaryText	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
getDefaultSort	75.00% covered (warning)	75.00%	3 / 4	0.00% covered (danger)	0.00%	0 / 1	2.06

1	<?php
2
3	namespace MediaWiki\Content;
4
5	use HtmlFormatter\HtmlFormatter;
6	use MediaWiki\Parser\ParserOutput;
7	use MediaWiki\Parser\Sanitizer;
8
9	/**
10	* Class allowing to explore the structure of parsed wikitext.
11	*/
class WikiTextStructure {

	/**
	 * Text found before the first heading, or null when there is none
	 * (or when the text has not been extracted yet).
	 */
	private ?string $openingText = null;

	/**
	 * Main searchable text; null until extractWikitextParts() has run.
	 */
	private ?string $allText = null;

	/** @var string[] Text of the elements classified as auxiliary */
	private array $auxText = [];

	/** The parser output whose structure is being explored */
	private ParserOutput $parserOutput;

	/**
	 * Selectors to elements that are excluded entirely from search
	 */
	private const EXCLUDED_ELEMENT_SELECTORS = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio', 'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references from Cite
		'sup.reference',
		// The ↑ next to references in the references section from Cite
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default, so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable',
		// User-facing interface code prompting the user to act from WikibaseMediaInfo
		'.wbmi-entityview-emptyCaption',
	];

	/**
	 * Selectors to elements that are considered auxiliary to the article text for search
	 */
	private const AUXILIARY_ELEMENT_SELECTORS = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		'figcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Gets headings from the page.
	 *
	 * Each heading has reference markers and HTML tags removed and is trimmed.
	 * Headings listed in the search-ignored-headings message are left out.
	 * The heading level is not taken into account.
	 *
	 * @return string[] Empty when the parser output has no TOC data.
	 */
	public function headings() {
		$tocData = $this->parserOutput->getTOCData();
		if ( $tocData === null ) {
			return [];
		}

		$ignoredHeadings = $this->getIgnoredHeadings();
		$headings = [];
		foreach ( $tocData->getSections() as $section ) {
			$heading = self::stripReferenceMarkers( (string)$section->line );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore. Both sides are strings, so compare strictly to
			// keep numeric-looking headings ("010" vs "10") from matching each other.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}

		return $headings;
	}

	/**
	 * Remove things that look like references from a heading.
	 *
	 * We can't use HTML filtering because the references come back as <sup> tags without
	 * a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag.
	 *
	 * We also don't want to strip the <sup> tag and remove everything that looks like [2]
	 * because there might be a band named "Word [2] Foo" or something.
	 *
	 * So we only strip things that look like <sup> tags wrapping a reference. And since the
	 * data looks like:
	 *  Reference in heading <sup>[1]</sup><sup>[2]</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @param string $heading Heading HTML
	 *
	 * @return string Heading with reference markers removed; other markup is kept.
	 */
	private static function stripReferenceMarkers( string $heading ): string {
		// Some wikis wrap the brackets in a span:
		// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading ) ?? $heading;

		// Normalize entity-encoded brackets to [] so the following regexp works.
		$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );

		// Whitespace around the brackets and the number is optional.
		return preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading ) ?? $heading;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Each line is one entry. Everything from a "#" to the end of the line is
	 * a comment, surrounding whitespace is ignored, and empty lines are dropped.
	 * The keys of the result are the original (0-based) line indexes.
	 *
	 * @param string $message
	 *
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );

		// Remove empty lines. Compare against '' explicitly: the default
		// array_filter() callback would also discard a line consisting of "0".
		return array_filter(
			$lines,
			static fn ( string $line ): bool => $line !== ''
		);
	}

	/**
	 * Gets a list of headings to ignore.
	 *
	 * The list is read from the search-ignored-headings message in the content
	 * language and is kept in a static variable, so the message is parsed only once.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings(): array {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			$ignoredHeadings = [];
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			if ( !$source->isDisabled() ) {
				$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
			}
		}

		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does nothing when the parts have already been extracted.
	 */
	private function extractWikitextParts(): void {
		if ( $this->allText !== null ) {
			return;
		}

		$text = $this->parserOutput->getRawText();
		if ( $text === '' ) {
			// empty text - nothing to seek here
			$this->allText = '';
			return;
		}

		$this->openingText = $this->extractTextBeforeFirstHeading( $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text. These will still be
		// searched, but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$auxiliaryElements = $formatter->filterContent();

		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text
	 *
	 * @return string|null Null when there is no heading, or when nothing is left
	 *  in front of the first heading once excluded and auxiliary elements are removed.
	 */
	private function extractTextBeforeFirstHeading( $text ) {
		$matches = [];
		if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading, so we interpret this as the article
			// being entirely without heading.
			return null;
		}

		// The captured offset is in bytes, which is what substr() expects.
		$text = substr( $text, 0, $matches[0][1] );
		if ( $text === '' ) {
			// There isn't any text before the first heading, so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading, so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get the text found before the first heading.
	 *
	 * @return string|null
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();

		return $this->openingText;
	}

	/**
	 * Get the main text, with excluded and auxiliary elements removed.
	 *
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();

		return $this->allText;
	}

	/**
	 * Get the text of each element classified as auxiliary.
	 *
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();

		return $this->auxText;
	}

	/**
	 * Get the "defaultsort" property
	 *
	 * @return string|null
	 */
	public function getDefaultSort() {
		$sort = $this->parserOutput->getPageProperty( 'defaultsort' );
		if ( $sort === false ) {
			return null;
		}

		return $sort;
	}
}
265
// Register the un-namespaced global name 'WikiTextStructure' as an alias of the
// namespaced class declared in this file, so references to the old name still resolve.
/** @deprecated class alias since 1.43 */
class_alias( WikiTextStructure::class, 'WikiTextStructure' );