Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
91.18% |
62 / 68 |
|
70.00% |
7 / 10 |
CRAP | |
0.00% |
0 / 1 |
WikiTextStructure | |
92.54% |
62 / 67 |
|
70.00% |
7 / 10 |
22.20 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
headings | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
parseSettingsInMessage | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getIgnoredHeadings | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 | |||
extractWikitextParts | |
87.50% |
14 / 16 |
|
0.00% |
0 / 1 |
4.03 | |||
extractTextBeforeFirstHeading | |
85.71% |
12 / 14 |
|
0.00% |
0 / 1 |
4.05 | |||
getOpeningText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getMainText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getAuxiliaryText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getDefaultSort | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Content; |
4 | |
5 | use HtmlFormatter\HtmlFormatter; |
6 | use MediaWiki\Parser\ParserOutput; |
7 | use MediaWiki\Parser\Sanitizer; |
8 | |
/**
 * Class allowing to explore the structure of parsed wikitext.
 *
 * Wraps a ParserOutput and exposes the pieces that are of interest to search: the section
 * headings, the text before the first heading ("opening text"), the main text, the
 * auxiliary text and the "defaultsort" page property. The text parts are extracted lazily,
 * on the first call to one of the text getters.
 */
class WikiTextStructure {

	/** Text found before the first heading, or null when there is none. */
	private ?string $openingText = null;
	/** Main text of the page; null until the text parts have been extracted. */
	private ?string $allText = null;
	/** @var string[] Text of each element matched by AUXILIARY_ELEMENT_SELECTORS */
	private array $auxText = [];
	private ParserOutput $parserOutput;

	/**
	 * Selectors to elements that are excluded entirely from search
	 */
	private const EXCLUDED_ELEMENT_SELECTORS = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio', 'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references from Cite
		'sup.reference',
		// The ↑ next to references in the references section from Cite
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default, so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable',
		// User-facing interface code prompting the user to act from WikibaseMediaInfo
		'.wbmi-entityview-emptyCaption',
	];

	/**
	 * Selectors to elements that are considered auxiliary to the article text for search
	 */
	private const AUXILIARY_ELEMENT_SELECTORS = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		'figcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput Parser output whose structure is explored
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Gets headings from the page.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag.
	 *
	 * We also don't want to strip the <sup> tag and remove everything that looks like [2]
	 * because, I don't know, maybe there is a band named Word [2] Foo or something. Whatever.
	 *
	 * So we only strip things that look like <sup> tags wrapping a reference. And since the
	 * data looks like:
	 *      Reference in heading <sup>[1]</sup><sup>[2]</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Plain-text headings in table-of-contents order, without the headings
	 *  listed in the search-ignored-headings message. Empty when there is no TOC data.
	 */
	public function headings() {
		$tocData = $this->parserOutput->getTOCData();
		if ( $tocData === null ) {
			return [];
		}

		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $tocData->getSections() as $section ) {
			$heading = $section->line;

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize entity-encoded brackets to literal [] so the following regexp works.
			// This is a plain string replacement: an unescaped "[" is not a valid pattern.
			$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings
			// are equal. Except the ones we ignore. Compare strictly so that numeric-looking
			// headings (e.g. "1.0" and "1") are not treated as the same heading.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}

		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Each line is one entry; everything from a "#" to the end of the line is a comment,
	 * surrounding whitespace is trimmed and empty lines are dropped.
	 *
	 * @param string $message
	 *
	 * @return string[] Non-empty lines, keyed by their original (zero-based) line index
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );

		// Remove empty lines. Compare against '' explicitly: the default array_filter()
		// callback would also discard a line consisting of "0".
		return array_filter(
			$lines,
			static fn ( $line ) => $line !== ''
		);
	}

	/**
	 * Gets a list of heading to ignore.
	 *
	 * The list is read from the search-ignored-headings message in the content language and
	 * kept in a function-level static, so the message is parsed at most once and the result
	 * is shared by all instances.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings(): array {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			$ignoredHeadings = [];
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			if ( !$source->isDisabled() ) {
				// After parsing we just have headings!
				$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
			}
		}

		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does nothing when the parts have already been extracted.
	 */
	private function extractWikitextParts(): void {
		if ( $this->allText !== null ) {
			return;
		}
		$text = $this->parserOutput->getRawText();
		if ( $text === '' ) {
			// empty text - nothing to seek here
			$this->allText = '';

			return;
		}

		$this->openingText = $this->extractTextBeforeFirstHeading( $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text.  These will still be
		// searched, but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text HTML of the page
	 *
	 * @return string|null Plain text preceding the first <h1>..<h6> tag, or null when there
	 *  is no heading or no text is left in front of it
	 */
	private function extractTextBeforeFirstHeading( string $text ): ?string {
		$matches = [];
		if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading, so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		// $matches[0][1] is the byte offset of the first heading tag.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		if ( $text === '' ) {
			// There isn't any text before the first heading, so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		// Explicit comparison rather than a truthiness check, so that an opening text of
		// "0" is not mistaken for an empty one.
		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading, so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get the text found before the first heading.
	 *
	 * @return string|null
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();

		return $this->openingText;
	}

	/**
	 * Get the page text with the excluded and auxiliary elements removed.
	 *
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();

		return $this->allText;
	}

	/**
	 * Get the text of each element considered auxiliary to the page text.
	 *
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();

		return $this->auxText;
	}

	/**
	 * Get the "defaultsort" property
	 *
	 * @return string|null
	 */
	public function getDefaultSort() {
		$sort = $this->parserOutput->getPageProperty( 'defaultsort' );

		// A value of false is reported to callers as null.
		return $sort === false ? null : $sort;
	}
}
262 | |
263 | /** @deprecated class alias since 1.43, use MediaWiki\Content\WikiTextStructure instead */ |
264 | class_alias( WikiTextStructure::class, 'WikiTextStructure' ); |