Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
50.29% |
86 / 171 |
|
66.67% |
10 / 15 |
CRAP | |
0.00% |
0 / 1 |
ApiQueryExtracts | |
50.29% |
86 / 171 |
|
66.67% |
10 / 15 |
330.98 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
execute | |
0.00% |
0 / 42 |
|
0.00% |
0 / 1 |
272 | |||
getCacheMode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getExtract | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
42 | |||
cacheKey | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
getFromCache | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
setCache | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getFirstSection | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
parse | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
12 | |||
convertText | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
truncate | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 | |||
doSections | |
93.75% |
15 / 16 |
|
0.00% |
0 / 1 |
5.01 | |||
getAllowedParams | |
100.00% |
30 / 30 |
|
100.00% |
1 / 1 |
1 | |||
getExamplesMessages | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getHelpUrls | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace TextExtracts; |
4 | |
use ApiBase;
use ApiQueryBase;
use ApiUsageException;
use MediaWiki\Config\Config;
use MediaWiki\Config\ConfigFactory;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Page\WikiPageFactory;
use MediaWiki\Title\TitleFormatter;
use ParserOptions;
use WANObjectCache;
use Wikimedia\ParamValidator\ParamValidator;
use Wikimedia\ParamValidator\TypeDef\IntegerDef;
use WikiPage;
20 | |
21 | /** |
22 | * @license GPL-2.0-or-later |
23 | */ |
24 | class ApiQueryExtracts extends ApiQueryBase { |
25 | |
26 | /** |
27 | * Bump when memcache needs clearing |
28 | */ |
29 | private const CACHE_VERSION = 2; |
30 | |
31 | private const PREFIX = 'ex'; |
32 | |
33 | /** |
34 | * @var array |
35 | */ |
36 | private $params; |
37 | |
38 | /** |
39 | * @var Config |
40 | */ |
41 | private $config; |
42 | /** |
43 | * @var WANObjectCache |
44 | */ |
45 | private $cache; |
46 | /** |
47 | * @var LanguageConverterFactory |
48 | */ |
49 | private $langConvFactory; |
50 | /** |
51 | * @var WikiPageFactory |
52 | */ |
53 | private $wikiPageFactory; |
54 | private TitleFormatter $titleFormatter; |
55 | |
56 | // TODO: Allow extensions to hook into this to opt-in. |
57 | // This is partly for security reasons; see T107170. |
58 | /** |
59 | * @var string[] |
60 | */ |
61 | private $supportedContentModels = [ 'wikitext' ]; |
62 | |
/**
 * Sets up the query module and stores the services it works with.
 *
 * @param \ApiQuery $query Query module this submodule is attached to
 * @param string $moduleName Name under which this module is registered
 * @param ConfigFactory $configFactory Source of the 'textextracts' configuration
 * @param WANObjectCache $cache Cache in which generated extracts are stored
 * @param LanguageConverterFactory $langConvFactory Provides the language converter consulted for cache keys
 * @param WikiPageFactory $wikiPageFactory Creates WikiPage objects for the requested titles
 * @param TitleFormatter $titleFormatter Renders page titles for warning messages
 */
public function __construct(
	$query,
	$moduleName,
	ConfigFactory $configFactory,
	WANObjectCache $cache,
	LanguageConverterFactory $langConvFactory,
	WikiPageFactory $wikiPageFactory,
	TitleFormatter $titleFormatter
) {
	// All parameters of this module are prefixed with self::PREFIX.
	parent::__construct( $query, $moduleName, self::PREFIX );

	// Injected services are kept unchanged.
	$this->cache = $cache;
	$this->langConvFactory = $langConvFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->titleFormatter = $titleFormatter;

	// Extension-specific settings live in their own config instance.
	$this->config = $configFactory->makeConfig( 'textextracts' );
}
88 | |
/**
 * Reads the request parameters, builds an extract for each good page of the
 * page set (up to the limit) and adds it to the API result.
 *
 * Pages in the File namespace receive an empty extract and cause one warning
 * at the end. When the limit is reached, or the result cannot take another
 * value, a continuation offset is set and processing stops.
 */
public function execute() {
	$pages = $this->getPageSet()->getGoodPages();
	if ( $pages === [] ) {
		return;
	}

	$main = $this->getMain();
	$isXml = $main->isInternalMode() || $main->getPrinter()->getFormat() == 'XML';
	$result = $this->getResult();

	$this->params = $this->extractRequestParams();
	$params = $this->params;
	$this->requireMaxOneParameter( $params, 'chars', 'sentences' );

	$limit = intval( $params['limit'] );
	if ( $limit > 1 && !$params['intro'] && count( $pages ) > 1 ) {
		// Full (non-intro) extracts for several pages: lower the limit to one and say so.
		$limit = 1;
		$this->addWarning( [ 'apiwarn-textextracts-limit', $limit ] );
	}

	$offset = 0;
	if ( isset( $params['continue'] ) ) {
		$offset = intval( $params['continue'] );
		$this->dieContinueUsageIf( $offset < 0 || $offset > count( $pages ) );
		// Keep the page IDs as keys; they are used as result paths below.
		$pages = array_slice( $pages, $offset, null, true );
	}

	$processed = 0;
	$sawFilePage = false;
	/** @var PageIdentity $page */
	foreach ( $pages as $pageId => $page ) {
		$processed++;
		if ( $processed > $limit ) {
			$this->setContinueEnumParameter( 'continue', $offset + $processed - 1 );
			break;
		}

		if ( $page->getNamespace() === NS_FILE ) {
			$sawFilePage = true;
			$extract = '';
		} else {
			$extract = $this->truncate( $this->getExtract( $page ) );
			if ( $params['plaintext'] ) {
				$extract = $this->doSections( $extract );
			} else {
				if ( $params['sentences'] ) {
					$this->addWarning( $this->msg( 'apiwarn-textextracts-sentences-and-html', self::PREFIX ) );
				}
				$this->addWarning( 'apiwarn-textextracts-malformed-html' );
			}
		}

		// XML and internal mode get the text wrapped under the '*' key.
		$value = $isXml ? [ '*' => $extract ] : $extract;
		if ( !$result->addValue( [ 'query', 'pages', $pageId ], 'extract', $value ) ) {
			$this->setContinueEnumParameter( 'continue', $offset + $processed - 1 );
			break;
		}
	}

	if ( $sawFilePage ) {
		$this->addWarning( 'apiwarn-textextracts-title-in-file-namespace' );
	}
}
154 | |
/**
 * Reports the cache mode of this module, which is the same for every request.
 *
 * @param array $params Request parameters (not consulted)
 * @return string Always 'public'
 */
public function getCacheMode( $params ) {
	return 'public';
}
162 | |
/**
 * Produces the processed (but not yet truncated) extract for a page.
 *
 * Pages whose content model is not supported yield a warning and an empty
 * string. Otherwise the extract comes from the cache when possible; for
 * intro-only requests a cached full extract is reused by cutting out its
 * first section. Only when both lookups miss is the page parsed, converted
 * and the result stored in the cache.
 *
 * @param PageIdentity $title Page to extract from
 * @return string
 */
private function getExtract( PageIdentity $title ) {
	$page = $this->wikiPageFactory->newFromTitle( $title );
	$model = $page->getContentModel();

	if ( !in_array( $model, $this->supportedContentModels, true ) ) {
		$prefixedText = $this->titleFormatter->getPrefixedText( $title );
		$this->addWarning( [
			'apiwarn-textextracts-unsupportedmodel',
			wfEscapeWikiText( $prefixedText ),
			$model
		] );
		return '';
	}

	$wantIntro = $this->params['intro'];

	$cached = $this->getFromCache( $page, $wantIntro );
	if ( $cached !== false ) {
		return $cached;
	}

	if ( $wantIntro ) {
		// No intro entry cached: a cached full extract can still provide the first section.
		$full = $this->getFromCache( $page, false );
		if ( $full !== false ) {
			return $this->getFirstSection( $full, $this->params['plaintext'] );
		}
	}

	$fresh = $this->convertText( $this->parse( $page ) );
	$this->setCache( $page, $fresh );
	return $fresh;
}
197 | |
/**
 * Builds the cache key under which an extract of a page is stored.
 *
 * The key is made of the cache version, the page ID and touched timestamp,
 * the preferred variant of the page language, the output format and whether
 * the extract covers only the intro or the full page.
 *
 * @param WANObjectCache $cache Cache used to assemble the key
 * @param WikiPage $page Page the extract belongs to
 * @param bool $introOnly Whether the key is for an intro-only extract
 * @return string
 */
private function cacheKey( WANObjectCache $cache, WikiPage $page, $introOnly ) {
	$pageLanguage = $page->getTitle()->getPageLanguage();
	$converter = $this->langConvFactory->getLanguageConverter( $pageLanguage );

	$format = $this->params['plaintext'] ? 'plaintext' : 'html';
	$scope = $introOnly ? 'intro' : 'full';

	return $cache->makeKey(
		'textextracts',
		self::CACHE_VERSION,
		$page->getId(),
		$page->getTouched(),
		$converter->getPreferredVariant(),
		$format,
		$scope
	);
}
213 | |
/**
 * Looks up a previously stored extract of a page.
 *
 * @param WikiPage $page Page whose extract is wanted
 * @param bool $introOnly Whether to look for the intro-only or the full extract
 * @return string|false Whatever the cache returns for the key; callers treat false as "not cached"
 */
private function getFromCache( WikiPage $page, $introOnly ) {
	// @TODO: replace with getWithSetCallback()
	return $this->cache->get(
		$this->cacheKey( $this->cache, $page, $introOnly )
	);
}
225 | |
/**
 * Stores an extract of a page in the cache.
 *
 * The key reflects the 'intro' request parameter; the entry expires after
 * the configured 'ParserCacheExpireTime'.
 *
 * @param WikiPage $page Page the extract belongs to
 * @param string $text Extract to store
 */
private function setCache( WikiPage $page, $text ) {
	// @TODO: replace with getWithSetCallback()
	$key = $this->cacheKey( $this->cache, $page, $this->params['intro'] );
	$ttl = $this->getConfig()->get( 'ParserCacheExpireTime' );
	$this->cache->set( $key, $text, $ttl );
}
236 | |
/**
 * Cuts the text down to the part preceding the first section heading.
 *
 * The table-of-contents heading (id "mw-toc-heading") is not counted as a
 * section heading. If no heading is found, the text is returned unchanged.
 *
 * @param string $text Extract or page HTML
 * @param bool $plainText True if $text uses ExtractFormatter section markers
 *  rather than HTML heading tags
 * @return string
 */
private function getFirstSection( $text, $plainText ) {
	$regexp = $plainText
		? '/^.*?(?=' . ExtractFormatter::SECTION_MARKER_START .
			'(?!.' . ExtractFormatter::SECTION_MARKER_END . '<h2 id="mw-toc-heading"))/s'
		: '/^.*?(?=<h[1-6]\b(?! id="mw-toc-heading"))/s';

	return preg_match( $regexp, $text, $matches ) ? $matches[0] : $text;
}
254 | |
/**
 * Fetches the rendered HTML of a page, reduced to its first section when
 * only the intro was requested.
 *
 * If the parser output cannot be obtained, the failure is logged to the
 * 'textextracts' channel and the request is aborted with the failed status.
 *
 * @param WikiPage $page Page to render
 * @return string Unwrapped page HTML
 * @throws ApiUsageException
 */
private function parse( WikiPage $page ) {
	$status = MediaWikiServices::getInstance()
		->getParserOutputAccess()
		->getParserOutput( $page->toPageRecord(), ParserOptions::newFromAnon() );

	if ( $status->isOK() ) {
		$html = $status->getValue()->getText( [ 'unwrap' => true ] );
		return $this->params['intro']
			? $this->getFirstSection( $html, false )
			: $html;
	}

	$logger = LoggerFactory::getInstance( 'textextracts' );
	$logger->warning(
		'Parse attempt failed while generating text extract', [
			'title' => $page->getTitle()->getFullText(),
			'url' => $this->getRequest()->getFullRequestURL(),
			'reason' => $status->getWikiText( false, false, 'en' )
		] );
	$this->dieStatus( $status );
}
284 | |
/**
 * Turns page HTML into an extract.
 *
 * The HTML is run through ExtractFormatter, in plain-text mode if the
 * 'plaintext' parameter is set, after registering the configured
 * 'ExtractsRemoveClasses' for removal.
 *
 * @param string $text Page HTML
 * @return string
 */
private function convertText( $text ) {
	$formatter = new ExtractFormatter( $text, $this->params['plaintext'] );
	$formatter->remove( $this->config->get( 'ExtractsRemoveClasses' ) );

	return $formatter->getText();
}
296 | |
/**
 * Shortens the text to the requested number of characters or sentences.
 *
 * A character limit takes precedence over a sentence limit. When the
 * character limit actually shortens the text, the 'ellipsis' message is
 * appended. Without either limit the text is returned as it is.
 *
 * @param string $text The text to truncate
 * @return string
 */
private function truncate( $text ) {
	// HTML output (i.e. not plain text) asks the truncator to tidy its result.
	$truncator = new TextTruncator( !$this->params['plaintext'] );

	$maxChars = $this->params['chars'];
	if ( $maxChars ) {
		$shortened = $truncator->getFirstChars( $text, $maxChars );
		if ( $shortened === $text ) {
			return $text;
		}
		return $shortened . $this->msg( 'ellipsis' )->text();
	}

	if ( $this->params['sentences'] ) {
		return $truncator->getFirstSentences( $text, $this->params['sentences'] );
	}

	return $text;
}
316 | |
/**
 * Rewrites the section markers of a plain-text extract according to the
 * 'sectionformat' parameter:
 *  - 'raw' leaves the text untouched,
 *  - 'wiki' turns each marker into a wikitext-style heading on a new line,
 *    with as many '=' signs on each side as the heading level,
 *  - 'plain' keeps only the trimmed heading text on a new line.
 *
 * @param string $text Plain-text extract containing section markers
 * @return string
 * @throws \LogicException If 'sectionformat' has any other value
 */
private function doSections( $text ) {
	switch ( $this->params['sectionformat'] ) {
		case 'raw':
			return $text;

		case 'wiki':
			$renderHeading = static function ( $m ) {
				$bars = str_repeat( '=', $m[1] );
				return "\n$bars " . trim( $m[2] ) . " $bars";
			};
			break;

		case 'plain':
			$renderHeading = static function ( $m ) {
				return "\n" . trim( $m[2] );
			};
			break;

		default:
			throw new \LogicException( 'Invalid sectionformat' );
	}

	// Group 1 is the single-digit heading level, group 2 the rest of the line.
	$pattern = '/' .
		ExtractFormatter::SECTION_MARKER_START . '(\d)' .
		ExtractFormatter::SECTION_MARKER_END . '(.*)/';

	return preg_replace_callback( $pattern, $renderHeading, $text );
}
345 | |
/**
 * Describes the parameters accepted by this module.
 *
 * - chars: integer between 1 and 1200
 * - sentences: integer between 1 and 10
 * - limit: 'limit' type, between 1 and 20, defaulting to 20
 * - intro, plaintext: boolean flags, off by default
 * - sectionformat: one of 'plain', 'wiki' or 'raw', defaulting to 'wiki'
 * - continue: integer continuation offset
 *
 * Generic settings use the ParamValidator / IntegerDef constants rather
 * than the deprecated ApiBase::PARAM_* aliases; only the API-specific help
 * message settings remain on ApiBase.
 *
 * @inheritDoc
 */
public function getAllowedParams() {
	return [
		'chars' => [
			ParamValidator::PARAM_TYPE => 'integer',
			IntegerDef::PARAM_MIN => 1,
			IntegerDef::PARAM_MAX => 1200,
		],
		'sentences' => [
			ParamValidator::PARAM_TYPE => 'integer',
			IntegerDef::PARAM_MIN => 1,
			IntegerDef::PARAM_MAX => 10,
		],
		'limit' => [
			ParamValidator::PARAM_DEFAULT => 20,
			ParamValidator::PARAM_TYPE => 'limit',
			IntegerDef::PARAM_MIN => 1,
			IntegerDef::PARAM_MAX => 20,
			IntegerDef::PARAM_MAX2 => 20,
		],
		'intro' => false,
		'plaintext' => false,
		'sectionformat' => [
			ParamValidator::PARAM_TYPE => [ 'plain', 'wiki', 'raw' ],
			ParamValidator::PARAM_DEFAULT => 'wiki',
			ApiBase::PARAM_HELP_MSG_PER_VALUE => [],
		],
		'continue' => [
			ParamValidator::PARAM_TYPE => 'integer',
			ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
		],
	];
}
381 | |
/**
 * Maps an example query string to the message key describing it.
 *
 * @inheritDoc
 */
protected function getExamplesMessages() {
	$exampleQuery = 'action=query&prop=extracts&exchars=175&titles=Therion';

	return [
		$exampleQuery => 'apihelp-query+extracts-example-1',
	];
}
391 | |
/**
 * Points to the API section of the extension's documentation page.
 *
 * @inheritDoc
 */
public function getHelpUrls() {
	$helpUrl = 'https://www.mediawiki.org/wiki/Special:MyLanguage/Extension:TextExtracts#API';

	return $helpUrl;
}
398 | |
399 | } |