Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
75.36% |
52 / 69 |
|
40.00% |
4 / 10 |
CRAP | |
0.00% |
0 / 1 |
WikiTextStructure | |
75.36% |
52 / 69 |
|
40.00% |
4 / 10 |
30.91 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
headings | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
4.01 | |||
parseSettingsInMessage | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getIgnoredHeadings | |
30.00% |
3 / 10 |
|
0.00% |
0 / 1 |
9.49 | |||
extractWikitextParts | |
87.50% |
14 / 16 |
|
0.00% |
0 / 1 |
4.03 | |||
extractTextBeforeFirstHeading | |
85.71% |
12 / 14 |
|
0.00% |
0 / 1 |
4.05 | |||
getOpeningText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getMainText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getAuxiliaryText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getDefaultSort | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 |
1 | <?php |
2 | |
3 | use HtmlFormatter\HtmlFormatter; |
4 | use MediaWiki\Parser\ParserOutput; |
5 | use MediaWiki\Parser\Sanitizer; |
6 | |
/**
 * Class allowing to explore the structure of parsed wikitext.
 *
 * Wraps a ParserOutput and exposes its headings, its "defaultsort" page
 * property, and three text fields extracted from the rendered HTML: the opening
 * text (everything before the first heading), the main text and the auxiliary
 * text. The text fields are computed on first access and then kept on the
 * instance.
 */
class WikiTextStructure {

	/** Filtered text found before the first heading, or null if there is none. */
	private ?string $openingText = null;
	/** Main text with excluded and auxiliary elements removed; null until extracted. */
	private ?string $allText = null;
	/** @var string[] Text of each auxiliary element removed from the main text. */
	private array $auxText = [];
	private ParserOutput $parserOutput;

	/**
	 * Selectors to elements that are excluded entirely from search
	 */
	private const EXCLUDED_ELEMENT_SELECTORS = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio', 'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references from Cite
		'sup.reference',
		// The ↑ next to references in the references section from Cite
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default, so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable',
		// User-facing interface code prompting the user to act from WikibaseMediaInfo
		'.wbmi-entityview-emptyCaption',
	];

	/**
	 * Selectors to elements that are considered auxiliary to the article text for search
	 */
	private const AUXILIARY_ELEMENT_SELECTORS = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		'figcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Gets headings from the page.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag.
	 *
	 * We also don't want to strip the <sup> tag and remove everything that looks like [2]
	 * because there might legitimately be a heading such as "Word [2] Foo" (a band name,
	 * for instance).
	 *
	 * So we only strip things that look like <sup> tags wrapping a reference. And since the data
	 * looks like:
	 *      Reference in heading <sup>[1]</sup><sup>[2]</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Plain-text headings in TOC order, without the ignored ones.
	 */
	public function headings() {
		$headings = [];
		$tocData = $this->parserOutput->getTOCData();
		if ( $tocData === null ) {
			return $headings;
		}
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $tocData->getSections() as $section ) {
			$heading = $section->line;

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Brackets may arrive as numeric character references; decode them so
			// the reference-stripping regexp below can match. These are literal
			// strings, so no regexp is needed (a bare "[" is not a valid pattern).
			$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore. Compare strictly so that numeric-looking
			// headings ("1e3" vs "1000") are not treated as the same heading.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * One setting per line; everything from "#" to the end of a line is a
	 * comment. Surrounding whitespace is trimmed and lines that end up empty
	 * are dropped. Array keys are the original (0-based) line indexes, so the
	 * result may have gaps.
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );
		// Remove empty lines. Compare against '' explicitly: a bare array_filter()
		// would also discard the perfectly valid setting "0".
		return array_filter( $lines, static fn ( $line ) => $line !== '' );
	}

	/**
	 * Gets a list of heading to ignore.
	 *
	 * The list is read from the 'search-ignored-headings' message in the content
	 * language, falling back to 'cirrussearch-ignored-headings' when the former
	 * is blank. The result is memoized in a function-level static, so the
	 * messages are only consulted on the first call.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings(): array {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings !== null ) {
			return $ignoredHeadings;
		}

		$ignoredHeadings = [];
		$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
		if ( $source->isBlank() ) {
			// Try the old version too, just in case
			$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
		}
		if ( !$source->isDisabled() ) {
			// After parsing, every remaining line is a heading to ignore.
			$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does nothing if the parts were already extracted ($allText is set).
	 */
	private function extractWikitextParts(): void {
		if ( $this->allText !== null ) {
			return;
		}
		$text = $this->parserOutput->getRawText();
		if ( $text === '' ) {
			$this->allText = "";
			// empty text - nothing to seek here
			return;
		}

		$this->openingText = $this->extractTextBeforeFirstHeading( $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text.  These will still be
		// searched, but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text Rendered HTML of the page
	 * @return string|null Filtered plain text preceding the first <h1>..<h6>, or
	 *  null when the page has no heading or nothing (after filtering) precedes it.
	 */
	private function extractTextBeforeFirstHeading( $text ): ?string {
		$matches = [];
		if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading, so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		// $matches[0][1] is the byte offset of the heading tag; keep what precedes it.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		// Compare against '' rather than relying on truthiness so that an
		// opening text of "0" is not mistaken for "no text".
		if ( $text === '' ) {
			// There isn't any text before the first heading, so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading, so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * @return string|null Text before the first heading, or null if there is none
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * @return string Page text without excluded and auxiliary elements
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * @return string[] Text of each auxiliary element, one entry per element
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}

	/**
	 * Get the "defaultsort" property
	 *
	 * A value of false from ParserOutput::getPageProperty() is mapped to null;
	 * any other value (including null) is returned unchanged.
	 *
	 * @return string|null
	 */
	public function getDefaultSort() {
		$sort = $this->parserOutput->getPageProperty( 'defaultsort' );
		return $sort === false ? null : $sort;
	}
}