Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
43.15% |
85 / 197 |
|
66.67% |
10 / 15 |
CRAP | |
0.00% |
0 / 1 |
ApiQueryExtracts | |
43.15% |
85 / 197 |
|
66.67% |
10 / 15 |
569.19 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
execute | |
0.00% |
0 / 42 |
|
0.00% |
0 / 1 |
272 | |||
getCacheMode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getExtract | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
42 | |||
cacheKey | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
getFromCache | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setCache | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getFirstSection | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
parse | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
72 | |||
convertText | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
truncate | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
doSections | |
93.75% |
15 / 16 |
|
0.00% |
0 / 1 |
5.01 | |||
getAllowedParams | |
100.00% |
30 / 30 |
|
100.00% |
1 / 1 |
1 | |||
getExamplesMessages | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getHelpUrls | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace TextExtracts; |
4 | |
5 | use ApiBase; |
6 | use ApiMain; |
7 | use ApiQueryBase; |
8 | use ApiUsageException; |
9 | use MediaWiki\Config\Config; |
10 | use MediaWiki\Config\ConfigFactory; |
11 | use MediaWiki\Languages\LanguageConverterFactory; |
12 | use MediaWiki\Logger\LoggerFactory; |
13 | use MediaWiki\MediaWikiServices; |
14 | use MediaWiki\Page\WikiPageFactory; |
15 | use MediaWiki\Request\FauxRequest; |
16 | use MediaWiki\Title\Title; |
17 | use ParserOptions; |
18 | use WANObjectCache; |
19 | use Wikimedia\ParamValidator\ParamValidator; |
20 | use WikiPage; |
21 | |
/**
 * Query API module (used as prop=extracts, parameter prefix "ex") that
 * returns plain-text or HTML extracts of the requested pages.
 *
 * Extracts are cached in the WAN object cache, keyed per page revision
 * state, language variant, output format and intro/full mode.
 *
 * @license GPL-2.0-or-later
 */
class ApiQueryExtracts extends ApiQueryBase {

	/**
	 * Bump when memcache needs clearing
	 */
	private const CACHE_VERSION = 2;

	/**
	 * Prefix prepended to every parameter name of this module
	 */
	private const PREFIX = 'ex';

	/**
	 * Request parameters as returned by extractRequestParams(); set in execute()
	 * @var array
	 */
	private $params;

	/**
	 * The 'textextracts' configuration instance
	 * @var Config
	 */
	private $config;
	/**
	 * @var WANObjectCache
	 */
	private $cache;
	/**
	 * @var LanguageConverterFactory
	 */
	private $langConvFactory;
	/**
	 * @var WikiPageFactory
	 */
	private $wikiPageFactory;

	// TODO: Allow extensions to hook into this to opt-in.
	// This is partly for security reasons; see T107170.
	/**
	 * Content models for which an extract is generated; pages using any
	 * other model get a warning and an empty extract.
	 * @var string[]
	 */
	private $supportedContentModels = [ 'wikitext' ];

	/**
	 * @param \ApiQuery $query API query module object
	 * @param string $moduleName Name of this query module
	 * @param ConfigFactory $configFactory
	 * @param WANObjectCache $cache
	 * @param LanguageConverterFactory $langConvFactory
	 * @param WikiPageFactory $wikiPageFactory
	 */
	public function __construct(
		$query,
		$moduleName,
		ConfigFactory $configFactory,
		WANObjectCache $cache,
		LanguageConverterFactory $langConvFactory,
		WikiPageFactory $wikiPageFactory
	) {
		parent::__construct( $query, $moduleName, self::PREFIX );
		$this->config = $configFactory->makeConfig( 'textextracts' );
		$this->cache = $cache;
		$this->langConvFactory = $langConvFactory;
		$this->wikiPageFactory = $wikiPageFactory;
	}

	/**
	 * Evaluates the parameters, performs the requested extraction of text,
	 * and sets up the result
	 *
	 * For every good title (up to the effective limit) an 'extract' value is
	 * added under [ 'query', 'pages', <id> ]. Titles in the File namespace get
	 * an empty extract and trigger a single warning after the loop. A
	 * 'continue' parameter is set when the limit is hit or the result is full.
	 */
	public function execute() {
		$titles = $this->getPageSet()->getGoodTitles();
		if ( $titles === [] ) {
			return;
		}
		// Internal callers and the XML printer get the text wrapped as [ '*' => $text ]
		$isXml = $this->getMain()->isInternalMode()
			|| $this->getMain()->getPrinter()->getFormat() === 'XML';
		$result = $this->getResult();
		$params = $this->params = $this->extractRequestParams();
		$this->requireMaxOneParameter( $params, 'chars', 'sentences' );
		$continue = 0;
		$limit = intval( $params['limit'] );
		// Whole-page extracts are only produced one page per request
		if ( $limit > 1 && !$params['intro'] && count( $titles ) > 1 ) {
			$limit = 1;
			$this->addWarning( [ 'apiwarn-textextracts-limit', $limit ] );
		}
		if ( isset( $params['continue'] ) ) {
			$continue = intval( $params['continue'] );
			$this->dieContinueUsageIf( $continue < 0 || $continue > count( $titles ) );
			// Preserve keys: they are the page IDs used as result paths below
			$titles = array_slice( $titles, $continue, null, true );
		}
		$count = 0;
		$titleInFileNamespace = false;
		/** @var Title $t */
		foreach ( $titles as $id => $t ) {
			if ( ++$count > $limit ) {
				$this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
				break;
			}

			if ( $t->inNamespace( NS_FILE ) ) {
				$text = '';
				$titleInFileNamespace = true;
			} else {
				$text = $this->getExtract( $t );
				$text = $this->truncate( $text );
				if ( $params['plaintext'] ) {
					$text = $this->doSections( $text );
				} else {
					if ( $params['sentences'] ) {
						$this->addWarning( $this->msg( 'apiwarn-textextracts-sentences-and-html', self::PREFIX ) );
					}
					$this->addWarning( 'apiwarn-textextracts-malformed-html' );
				}
			}

			if ( $isXml ) {
				$fit = $result->addValue( [ 'query', 'pages', $id ], 'extract', [ '*' => $text ] );
			} else {
				$fit = $result->addValue( [ 'query', 'pages', $id ], 'extract', $text );
			}
			if ( !$fit ) {
				// Result is full: resume from this same title on the next request
				$this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
				break;
			}
		}
		if ( $titleInFileNamespace ) {
			$this->addWarning( 'apiwarn-textextracts-title-in-file-namespace' );
		}
	}

	/**
	 * @param array $params Ignored parameters
	 * @return string Always 'public'
	 */
	public function getCacheMode( $params ) {
		return 'public';
	}

	/**
	 * Returns a processed, but not trimmed extract
	 *
	 * Lookup order: cached extract for the requested mode; when only the intro
	 * is wanted, the cached full-page extract cut down to its first section;
	 * finally a fresh parse, whose converted result is stored in the cache.
	 *
	 * @param Title $title
	 * @return string Empty string for unsupported content models or when the
	 *  page could not be parsed
	 */
	private function getExtract( Title $title ) {
		$contentModel = $title->getContentModel();
		if ( !in_array( $contentModel, $this->supportedContentModels, true ) ) {
			$this->addWarning( [
				'apiwarn-textextracts-unsupportedmodel',
				wfEscapeWikiText( $title->getPrefixedText() ),
				$contentModel
			] );
			return '';
		}

		$page = $this->wikiPageFactory->newFromTitle( $title );

		$introOnly = $this->params['intro'];
		$text = $this->getFromCache( $page, $introOnly );
		// if we need just first section, try retrieving full page and getting first section out of it
		if ( $text === false && $introOnly ) {
			$text = $this->getFromCache( $page, false );
			if ( $text !== false ) {
				$text = $this->getFirstSection( $text, $this->params['plaintext'] );
			}
		}
		if ( $text === false ) {
			$html = $this->parse( $page );
			if ( $html === null ) {
				// parse() failed and has already logged it. Do not hand null to
				// the formatter, and do not cache the failure.
				return '';
			}
			$text = $this->convertText( $html );
			$this->setCache( $page, $text );
		}
		return $text;
	}

	/**
	 * Builds the cache key for an extract of the given page.
	 *
	 * The key varies on the cache version, page ID, page touched timestamp,
	 * the preferred variant of the page language's converter, the output
	 * format (plaintext/html) and whether only the intro is stored.
	 *
	 * @param WANObjectCache $cache
	 * @param WikiPage $page
	 * @param bool $introOnly
	 * @return string
	 */
	private function cacheKey( WANObjectCache $cache, WikiPage $page, $introOnly ) {
		$langConv = $this->langConvFactory->getLanguageConverter( $page->getTitle()->getPageLanguage() );
		return $cache->makeKey( 'textextracts', self::CACHE_VERSION,
			$page->getId(), $page->getTouched(),
			$langConv->getPreferredVariant(),
			$this->params['plaintext'] ? 'plaintext' : 'html',
			$introOnly ? 'intro' : 'full'
		);
	}

	/**
	 * Fetches a previously stored extract.
	 *
	 * @param WikiPage $page
	 * @param bool $introOnly
	 * @return string|false The cached extract, or false on a cache miss
	 */
	private function getFromCache( WikiPage $page, $introOnly ) {
		$cache = $this->cache;
		// @TODO: replace with getWithSetCallback()
		$key = $this->cacheKey( $cache, $page, $introOnly );
		return $cache->get( $key );
	}

	/**
	 * Stores an extract under the key matching the current 'intro' parameter,
	 * with the ParserCacheExpireTime setting as its TTL.
	 *
	 * @param WikiPage $page
	 * @param string $text
	 */
	private function setCache( WikiPage $page, $text ) {
		$cache = $this->cache;
		// @TODO: replace with getWithSetCallback()
		$key = $this->cacheKey( $cache, $page, $this->params['intro'] );
		$cache->set( $key, $text, $this->getConfig()->get( 'ParserCacheExpireTime' ) );
	}

	/**
	 * Cuts the text down to everything before its first section heading,
	 * skipping over the table-of-contents heading (id "mw-toc-heading").
	 *
	 * @param string $text
	 * @param bool $plainText Whether headings are encoded with
	 *  ExtractFormatter section markers (true) or as HTML <h1>-<h6> tags (false)
	 * @return string The leading part of $text, or $text unchanged when the
	 *  pattern does not match
	 */
	private function getFirstSection( $text, $plainText ) {
		if ( $plainText ) {
			$regexp = '/^.*?(?=' . ExtractFormatter::SECTION_MARKER_START .
				'(?!.' . ExtractFormatter::SECTION_MARKER_END . '<h2 id="mw-toc-heading"))/s';
		} else {
			$regexp = '/^.*?(?=<h[1-6]\b(?! id="mw-toc-heading"))/s';
		}
		if ( preg_match( $regexp, $text, $matches ) ) {
			$text = $matches[0];
		}
		return $text;
	}

	/**
	 * Returns page HTML
	 *
	 * Uses the parser cache when possible; otherwise renders the page (or
	 * only section 0 when just the intro is requested) through an internal
	 * action=parse request.
	 *
	 * @param WikiPage $page
	 * @return string|null Null when the parse request returned no 'parse' data
	 * @throws ApiUsageException
	 */
	private function parse( WikiPage $page ) {
		$apiException = null;
		$parserOptions = ParserOptions::newFromAnon();

		// first try finding full page in parser cache
		if ( $page->shouldCheckParserCache( $parserOptions, 0 ) ) {
			// TODO inject ParserCache
			$pout = MediaWikiServices::getInstance()->getParserCache()->get( $page, $parserOptions );
			if ( $pout ) {
				$text = $pout->getText( [ 'unwrap' => true ] );
				if ( $this->params['intro'] ) {
					$text = $this->getFirstSection( $text, false );
				}
				return $text;
			}
		}
		$request = [
			'action' => 'parse',
			'page' => $page->getTitle()->getPrefixedText(),
			'prop' => 'text',
			// Invokes special handling when using partial wikitext (T168743)
			'sectionpreview' => 1,
			'wrapoutputclass' => '',
		];
		if ( $this->params['intro'] ) {
			$request['section'] = 0;
		}
		// in case of cache miss, render just the needed section
		try {
			$data = $this->runParseRequest( $request );
		} catch ( ApiUsageException $e ) {
			$apiException = $e->__toString();
			if ( $e->getStatusValue()->hasMessage( 'apierror-nosuchsection' ) ) {
				// Looks like we tried to get the intro to a page without
				// sections! Lets just grab what we can get.
				unset( $request['section'] );
				$data = $this->runParseRequest( $request );
			} else {
				// Some other unexpected error - lets just report it to the user
				// on the off chance that is the right thing.
				throw $e;
			}
		}
		if ( !array_key_exists( 'parse', $data ) ) {
			LoggerFactory::getInstance( 'textextracts' )->warning(
				'API Parse request failed while generating text extract', [
					'title' => $page->getTitle()->getFullText(),
					'url' => $this->getRequest()->getFullRequestURL(),
					'exception' => $apiException,
					'request' => $request
				] );
			return null;
		}

		return $data['parse']['text']['*'];
	}

	/**
	 * Executes an internal API request and returns its complete result data.
	 *
	 * @param array $request Request parameters for the internal API call
	 * @return array Result data fetched with empty 'BC' and 'Types' transforms
	 * @throws ApiUsageException
	 */
	private function runParseRequest( array $request ) {
		$api = new ApiMain( new FauxRequest( $request ) );
		$api->execute();
		return $api->getResult()->getResultData( null, [
			'BC' => [],
			'Types' => [],
		] );
	}

	/**
	 * Converts page HTML into an extract
	 *
	 * Elements matching the ExtractsRemoveClasses setting are removed before
	 * the text is produced.
	 *
	 * @param string $text
	 * @return string
	 */
	private function convertText( $text ) {
		$fmt = new ExtractFormatter( $text, $this->params['plaintext'] );
		$fmt->remove( $this->config->get( 'ExtractsRemoveClasses' ) );
		return $fmt->getText();
	}

	/**
	 * Truncate the given text to a certain number of characters or sentences
	 *
	 * 'chars' takes precedence over 'sentences'. An ellipsis is appended only
	 * when character truncation actually shortened the text.
	 *
	 * @param string $text The text to truncate
	 * @return string
	 */
	private function truncate( $text ) {
		$useTidy = !$this->params['plaintext'];
		$truncator = new TextTruncator( $useTidy );

		if ( $this->params['chars'] ) {
			$truncatedText = $truncator->getFirstChars( $text, $this->params['chars'] );
			if ( $truncatedText !== $text ) {
				$text = $truncatedText . $this->msg( 'ellipsis' )->text();
			}
		} elseif ( $this->params['sentences'] ) {
			$text = $truncator->getFirstSentences( $text, $this->params['sentences'] );
		}
		return $text;
	}

	/**
	 * Rewrites the section markers in plain-text extracts according to the
	 * 'sectionformat' parameter.
	 *
	 * - raw: text is returned untouched, markers included
	 * - wiki: each marker line becomes "== Heading ==" with one '=' per level
	 * - plain: each marker line becomes just the trimmed heading text
	 *
	 * @param string $text
	 * @return string
	 * @throws \LogicException On an unknown sectionformat value
	 */
	private function doSections( $text ) {
		// Group 1: single-digit heading level; group 2: rest of the line
		$pattern = '/' .
			ExtractFormatter::SECTION_MARKER_START . '(\d)' .
			ExtractFormatter::SECTION_MARKER_END . '(.*)/';

		switch ( $this->params['sectionformat'] ) {
			case 'raw':
				return $text;

			case 'wiki':
				return preg_replace_callback( $pattern, static function ( $matches ) {
					$bars = str_repeat( '=', $matches[1] );
					return "\n$bars " . trim( $matches[2] ) . " $bars";
				}, $text );

			case 'plain':
				return preg_replace_callback( $pattern, static function ( $matches ) {
					return "\n" . trim( $matches[2] );
				}, $text );

			default:
				throw new \LogicException( 'Invalid sectionformat' );
		}
	}

	/**
	 * @inheritDoc
	 */
	public function getAllowedParams() {
		return [
			'chars' => [
				ParamValidator::PARAM_TYPE => 'integer',
				ApiBase::PARAM_MIN => 1,
				ApiBase::PARAM_MAX => 1200,
			],
			'sentences' => [
				ParamValidator::PARAM_TYPE => 'integer',
				ApiBase::PARAM_MIN => 1,
				ApiBase::PARAM_MAX => 10,
			],
			'limit' => [
				ParamValidator::PARAM_DEFAULT => 20,
				ParamValidator::PARAM_TYPE => 'limit',
				ApiBase::PARAM_MIN => 1,
				ApiBase::PARAM_MAX => 20,
				ApiBase::PARAM_MAX2 => 20,
			],
			'intro' => false,
			'plaintext' => false,
			'sectionformat' => [
				ParamValidator::PARAM_TYPE => [ 'plain', 'wiki', 'raw' ],
				ParamValidator::PARAM_DEFAULT => 'wiki',
				ApiBase::PARAM_HELP_MSG_PER_VALUE => [],
			],
			'continue' => [
				ParamValidator::PARAM_TYPE => 'integer',
				ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
			],
		];
	}

	/**
	 * @inheritDoc
	 */
	protected function getExamplesMessages() {
		return [
			'action=query&prop=extracts&exchars=175&titles=Therion'
				=> 'apihelp-query+extracts-example-1',
		];
	}

	/**
	 * @inheritDoc
	 */
	public function getHelpUrls() {
		return 'https://www.mediawiki.org/wiki/Special:MyLanguage/Extension:TextExtracts#API';
	}

}
439 | } |