Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
76.47% |
52 / 68 |
|
40.00% |
4 / 10 |
CRAP | |
0.00% |
0 / 1 |
WikiTextStructure | |
77.61% |
52 / 67 |
|
40.00% |
4 / 10 |
27.43 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
headings | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
4.01 | |||
parseSettingsInMessage | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getIgnoredHeadings | |
37.50% |
3 / 8 |
|
0.00% |
0 / 1 |
5.20 | |||
extractWikitextParts | |
87.50% |
14 / 16 |
|
0.00% |
0 / 1 |
4.03 | |||
extractTextBeforeFirstHeading | |
85.71% |
12 / 14 |
|
0.00% |
0 / 1 |
4.05 | |||
getOpeningText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getMainText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getAuxiliaryText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getDefaultSort | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Content; |
4 | |
5 | use HtmlFormatter\HtmlFormatter; |
6 | use MediaWiki\Parser\ParserOutput; |
7 | use MediaWiki\Parser\Sanitizer; |
8 | |
/**
 * Explores the structure of parsed wikitext on behalf of search.
 *
 * Given a ParserOutput, this class exposes the page headings, the text that
 * precedes the first heading ("opening text"), the main text, the auxiliary
 * text (captions, tables, ...) and the "defaultsort" page property.
 *
 * The text parts are extracted lazily on first access and then kept on the
 * instance.
 */
class WikiTextStructure {

	/** Filtered text before the first heading; null when there is none. */
	private ?string $openingText = null;
	/** Main text with excluded and auxiliary elements removed; null until extracted. */
	private ?string $allText = null;
	/** @var string[] Text of every element matched by AUXILIARY_ELEMENT_SELECTORS */
	private array $auxText = [];
	private ParserOutput $parserOutput;

	/**
	 * Selectors to elements that are excluded entirely from search
	 */
	private const EXCLUDED_ELEMENT_SELECTORS = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio', 'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references from Cite
		'sup.reference',
		// The ↑ next to references in the references section from Cite
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default, so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable',
		// User-facing interface code prompting the user to act from WikibaseMediaInfo
		'.wbmi-entityview-emptyCaption',
	];

	/**
	 * Selectors to elements that are considered auxiliary to the article text for search
	 */
	private const AUXILIARY_ELEMENT_SELECTORS = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		'figcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput Parser output whose structure is explored
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Gets headings from the page.
	 *
	 * Each table-of-contents section line is cleaned up before being returned:
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag.
	 *
	 * We also don't want to strip the <sup> tag and remove everything that looks like [2]
	 * because a legitimate heading could contain such text (say, a band named "Word [2] Foo").
	 *
	 * So we only strip things that look like <sup> tags wrapping a reference. And since the data
	 * looks like:
	 *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * Headings listed by getIgnoredHeadings() are left out. The heading level is not
	 * taken into account.
	 *
	 * @return string[] Plain-text headings; empty when there is no TOC data
	 */
	public function headings() {
		$tocData = $this->parserOutput->getTOCData();
		if ( $tocData === null ) {
			return [];
		}

		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $tocData->getSections() as $section ) {
			$heading = $section->line;

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Brackets arrive as numeric character references; turn them into literal
			// brackets so the reference-stripping regexp below can match them.
			$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Strict comparison: both sides are strings, and a loose comparison would
			// treat numeric-looking headings such as "1e1" and "10" as equal.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}

		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * The message is split on newlines; everything from a "#" to the end of a line
	 * is dropped as a comment, surrounding whitespace is trimmed and empty lines are
	 * removed. Original line indexes are preserved as array keys, and because
	 * array_filter() is used without a callback a line consisting solely of "0" is
	 * removed as well.
	 *
	 * @param string $message
	 *
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );

		// Remove empty lines
		return array_filter( $lines );
	}

	/**
	 * Gets the list of headings to ignore.
	 *
	 * The list is read from the 'search-ignored-headings' message in the content
	 * language and cached in a function-static variable, so the message is parsed
	 * at most once per process. A disabled message yields an empty list.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings(): array {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings !== null ) {
			return $ignoredHeadings;
		}

		$ignoredHeadings = [];
		$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
		if ( !$source->isDisabled() ) {
			$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
		}

		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does nothing when the main text has already been extracted.
	 */
	private function extractWikitextParts(): void {
		if ( $this->allText !== null ) {
			return;
		}

		$text = $this->parserOutput->getRawText();
		if ( $text === '' ) {
			// empty text - nothing to seek here
			$this->allText = "";
			return;
		}

		$this->openingText = $this->extractTextBeforeFirstHeading( $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text. These will still be
		// searched, but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$auxiliaryElements = $formatter->filterContent();

		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text HTML to search for the first <h1>..<h6> tag
	 *
	 * @return string|null Filtered plain text, or null when there is no heading, no text
	 *  before the first heading, or nothing left of that text after filtering
	 */
	private function extractTextBeforeFirstHeading( $text ): ?string {
		$matches = [];
		if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading, so we interpret this as the article
			// being entirely without heading.
			return null;
		}

		// $matches[0][1] is the byte offset of the first heading tag.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		if ( !$text ) {
			// There isn't any text before the first heading, so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( !$text ) {
			// There isn't any text after filtering before the first heading, so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get the filtered text that precedes the first heading.
	 *
	 * @return string|null Null when no opening text could be extracted
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();

		return $this->openingText;
	}

	/**
	 * Get the main text, without excluded and auxiliary elements.
	 *
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();

		return $this->allText;
	}

	/**
	 * Get the text of the elements considered auxiliary to the main text.
	 *
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();

		return $this->auxText;
	}

	/**
	 * Get the "defaultsort" property
	 *
	 * @return string|null The property value, with a false lookup result mapped to null
	 */
	public function getDefaultSort() {
		$sort = $this->parserOutput->getPageProperty( 'defaultsort' );
		if ( $sort === false ) {
			return null;
		}

		return $sort;
	}
}
265 | |
266 | /** Registers the un-namespaced name 'WikiTextStructure' as an alias of this class. @deprecated class alias since 1.43 */ |
267 | class_alias( WikiTextStructure::class, 'WikiTextStructure' ); |