Code Coverage for /workspace/src/extensions/TextExtracts/includes/ApiQueryExtracts.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	50.29% covered (warning)	50.29%	86 / 171	66.67% covered (warning)	66.67%	10 / 15	CRAP	0.00% covered (danger)	0.00%	0 / 1
ApiQueryExtracts	50.29% covered (warning)	50.29%	86 / 171	66.67% covered (warning)	66.67%	10 / 15	330.98	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	1
execute	0.00% covered (danger)	0.00%	0 / 42	0.00% covered (danger)	0.00%	0 / 1	272
getCacheMode	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getExtract	0.00% covered (danger)	0.00%	0 / 20	0.00% covered (danger)	0.00%	0 / 1	42
cacheKey	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	3
getFromCache	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
setCache	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
getFirstSection	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	3
parse	0.00% covered (danger)	0.00%	0 / 18	0.00% covered (danger)	0.00%	0 / 1	12
convertText	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	2
truncate	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	4
doSections	93.75% covered (success)	93.75%	15 / 16	0.00% covered (danger)	0.00%	0 / 1	5.01
getAllowedParams	100.00% covered (success)	100.00%	30 / 30	100.00% covered (success)	100.00%	1 / 1	1
getExamplesMessages	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1
getHelpUrls	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1

1	<?php
2
3	namespace TextExtracts;
4
use ApiBase;
use ApiQueryBase;
use ApiUsageException;
use MediaWiki\Config\Config;
use MediaWiki\Config\ConfigFactory;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Page\WikiPageFactory;
use MediaWiki\Title\TitleFormatter;
use ParserOptions;
use WANObjectCache;
use Wikimedia\ParamValidator\ParamValidator;
use Wikimedia\ParamValidator\TypeDef\IntegerDef;
use WikiPage;
20
/**
 * API query submodule (parameter prefix "ex") that adds an "extract" value
 * for each requested page to the query result.
 *
 * Extracts are produced from the parsed page HTML, optionally reduced to the
 * introduction, converted by ExtractFormatter, cached in the WAN object cache
 * and finally truncated to a number of characters or sentences.
 *
 * @license GPL-2.0-or-later
 */
class ApiQueryExtracts extends ApiQueryBase {

	/**
	 * Bump when memcache needs clearing
	 */
	private const CACHE_VERSION = 2;

	/**
	 * Parameter prefix of this module
	 */
	private const PREFIX = 'ex';

	/**
	 * @var array Request parameters of the current request; assigned in execute()
	 */
	private $params;

	/**
	 * @var Config The "textextracts" configuration
	 */
	private $config;
	/**
	 * @var WANObjectCache
	 */
	private $cache;
	/**
	 * @var LanguageConverterFactory
	 */
	private $langConvFactory;
	/**
	 * @var WikiPageFactory
	 */
	private $wikiPageFactory;
	private TitleFormatter $titleFormatter;

	// TODO: Allow extensions to hook into this to opt-in.
	// This is partly for security reasons; see T107170.
	/**
	 * @var string[] Content models for which an extract is generated
	 */
	private $supportedContentModels = [ 'wikitext' ];

	/**
	 * @param \ApiQuery $query API query module object
	 * @param string $moduleName Name of this query module
	 * @param ConfigFactory $configFactory Used to obtain the "textextracts" config
	 * @param WANObjectCache $cache
	 * @param LanguageConverterFactory $langConvFactory
	 * @param WikiPageFactory $wikiPageFactory
	 * @param TitleFormatter $titleFormatter
	 */
	public function __construct(
		$query,
		$moduleName,
		ConfigFactory $configFactory,
		WANObjectCache $cache,
		LanguageConverterFactory $langConvFactory,
		WikiPageFactory $wikiPageFactory,
		TitleFormatter $titleFormatter
	) {
		parent::__construct( $query, $moduleName, self::PREFIX );
		$this->config = $configFactory->makeConfig( 'textextracts' );
		$this->cache = $cache;
		$this->langConvFactory = $langConvFactory;
		$this->wikiPageFactory = $wikiPageFactory;
		$this->titleFormatter = $titleFormatter;
	}

	/**
	 * Evaluates the parameters, performs the requested extraction of text,
	 * and sets up the result
	 */
	public function execute() {
		$titles = $this->getPageSet()->getGoodPages();
		if ( $titles === [] ) {
			return;
		}
		$isXml = $this->getMain()->isInternalMode()
			|| $this->getMain()->getPrinter()->getFormat() === 'XML';
		$result = $this->getResult();
		$params = $this->params = $this->extractRequestParams();
		$this->requireMaxOneParameter( $params, 'chars', 'sentences' );
		$continue = 0;
		$limit = intval( $params['limit'] );
		// Whole-page (non-intro) extracts for several pages are served one page per request
		if ( $limit > 1 && !$params['intro'] && count( $titles ) > 1 ) {
			$limit = 1;
			$this->addWarning( [ 'apiwarn-textextracts-limit', $limit ] );
		}
		if ( isset( $params['continue'] ) ) {
			$continue = intval( $params['continue'] );
			$this->dieContinueUsageIf( $continue < 0 || $continue > count( $titles ) );
			// Preserve keys: they are used as the page keys in the result below
			$titles = array_slice( $titles, $continue, null, true );
		}
		$count = 0;
		$titleInFileNamespace = false;
		/** @var PageIdentity $t */
		foreach ( $titles as $id => $t ) {
			if ( ++$count > $limit ) {
				$this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
				break;
			}

			if ( $t->getNamespace() === NS_FILE ) {
				// File pages get an empty extract; one warning is added after the loop
				$text = '';
				$titleInFileNamespace = true;
			} else {
				$text = $this->getExtract( $t );
				$text = $this->truncate( $text );
				if ( $params['plaintext'] ) {
					$text = $this->doSections( $text );
				} else {
					if ( $params['sentences'] ) {
						$this->addWarning( $this->msg( 'apiwarn-textextracts-sentences-and-html', self::PREFIX ) );
					}
					$this->addWarning( 'apiwarn-textextracts-malformed-html' );
				}
			}

			if ( $isXml ) {
				$fit = $result->addValue( [ 'query', 'pages', $id ], 'extract', [ '*' => $text ] );
			} else {
				$fit = $result->addValue( [ 'query', 'pages', $id ], 'extract', $text );
			}
			if ( !$fit ) {
				// The value was not added: ask the client to continue from this page
				$this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
				break;
			}
		}
		if ( $titleInFileNamespace ) {
			$this->addWarning( 'apiwarn-textextracts-title-in-file-namespace' );
		}
	}

	/**
	 * @param array $params Ignored parameters
	 * @return string Always 'public'
	 */
	public function getCacheMode( $params ) {
		return 'public';
	}

	/**
	 * Returns a processed, but not trimmed extract
	 *
	 * Adds a warning and returns an empty string when the content model of the
	 * page is not listed in $supportedContentModels. Otherwise the extract is
	 * taken from the cache if possible, or generated and stored in the cache.
	 *
	 * @param PageIdentity $title
	 * @return string
	 */
	private function getExtract( PageIdentity $title ) {
		$page = $this->wikiPageFactory->newFromTitle( $title );

		$contentModel = $page->getContentModel();
		if ( !in_array( $contentModel, $this->supportedContentModels, true ) ) {
			$this->addWarning( [
				'apiwarn-textextracts-unsupportedmodel',
				wfEscapeWikiText( $this->titleFormatter->getPrefixedText( $title ) ),
				$contentModel
			] );
			return '';
		}

		$introOnly = $this->params['intro'];
		$text = $this->getFromCache( $page, $introOnly );
		// if we need just first section, try retrieving full page and getting first section out of it
		if ( $text === false && $introOnly ) {
			$text = $this->getFromCache( $page, false );
			if ( $text !== false ) {
				$text = $this->getFirstSection( $text, $this->params['plaintext'] );
			}
		}
		if ( $text === false ) {
			// Nothing usable in the cache: parse, convert and remember the result
			$text = $this->parse( $page );
			$text = $this->convertText( $text );
			$this->setCache( $page, $text );
		}
		return $text;
	}

	/**
	 * Builds the cache key for an extract of the given page
	 *
	 * The key is made of the cache version, the page ID and touched timestamp,
	 * the preferred variant of the page language's converter, the output format
	 * (plaintext or html) and whether only the intro is wanted.
	 *
	 * @param WANObjectCache $cache
	 * @param WikiPage $page
	 * @param bool $introOnly
	 * @return string
	 */
	private function cacheKey( WANObjectCache $cache, WikiPage $page, $introOnly ) {
		$langConv = $this->langConvFactory->getLanguageConverter( $page->getTitle()->getPageLanguage() );
		return $cache->makeKey( 'textextracts', self::CACHE_VERSION,
			$page->getId(), $page->getTouched(),
			$langConv->getPreferredVariant(),
			$this->params['plaintext'] ? 'plaintext' : 'html',
			$introOnly ? 'intro' : 'full'
		);
	}

	/**
	 * Looks up a previously stored extract
	 *
	 * @param WikiPage $page
	 * @param bool $introOnly Whether to look up the intro-only or the full extract
	 * @return string|false The value stored under the key; callers treat false as a miss
	 */
	private function getFromCache( WikiPage $page, $introOnly ) {
		$cache = $this->cache;
		// @TODO: replace with getWithSetCallback()
		$key = $this->cacheKey( $cache, $page, $introOnly );
		return $cache->get( $key );
	}

	/**
	 * Stores an extract under the key matching the current "intro" parameter,
	 * using the main config's ParserCacheExpireTime as the expiry
	 *
	 * @param WikiPage $page
	 * @param string $text
	 */
	private function setCache( WikiPage $page, $text ) {
		$cache = $this->cache;
		// @TODO: replace with getWithSetCallback()
		$key = $this->cacheKey( $cache, $page, $this->params['intro'] );
		$cache->set( $key, $text, $this->getConfig()->get( 'ParserCacheExpireTime' ) );
	}

	/**
	 * Cuts the text off before its first section heading
	 *
	 * A heading carrying id="mw-toc-heading" is not treated as a section start.
	 * The text is returned unchanged when the pattern does not match.
	 *
	 * @param string $text
	 * @param bool $plainText Whether $text uses ExtractFormatter section markers
	 *  instead of HTML heading tags
	 * @return string
	 */
	private function getFirstSection( $text, $plainText ) {
		if ( $plainText ) {
			$regexp = '/^.*?(?=' . ExtractFormatter::SECTION_MARKER_START .
				'(?!.' . ExtractFormatter::SECTION_MARKER_END . '<h2 id="mw-toc-heading"))/s';
		} else {
			$regexp = '/^.*?(?=<h[1-6]\b(?! id="mw-toc-heading"))/s';
		}
		if ( preg_match( $regexp, $text, $matches ) ) {
			$text = $matches[0];
		}
		return $text;
	}

	/**
	 * Returns page HTML
	 *
	 * The page is parsed with anonymous-user parser options. When only the
	 * intro was requested, the HTML is reduced to its first section. A failed
	 * parse is logged to the "textextracts" channel and reported to the client.
	 *
	 * @param WikiPage $page
	 * @return string
	 * @throws ApiUsageException
	 */
	private function parse( WikiPage $page ) {
		$parserOutputAccess = MediaWikiServices::getInstance()->getParserOutputAccess();
		$status = $parserOutputAccess->getParserOutput(
			$page->toPageRecord(),
			ParserOptions::newFromAnon()
		);
		if ( $status->isOK() ) {
			$pout = $status->getValue();
			$text = $pout->getText( [ 'unwrap' => true ] );
			if ( $this->params['intro'] ) {
				$text = $this->getFirstSection( $text, false );
			}
			return $text;
		} else {
			LoggerFactory::getInstance( 'textextracts' )->warning(
				'Parse attempt failed while generating text extract', [
					'title' => $page->getTitle()->getFullText(),
					'url' => $this->getRequest()->getFullRequestURL(),
					'reason' => $status->getWikiText( false, false, 'en' )
				] );
			$this->dieStatus( $status );
		}
	}

	/**
	 * Converts page HTML into an extract
	 *
	 * Elements matching the ExtractsRemoveClasses setting are removed first.
	 *
	 * @param string $text
	 * @return string
	 */
	private function convertText( $text ) {
		$fmt = new ExtractFormatter( $text, $this->params['plaintext'] );
		$fmt->remove( $this->config->get( 'ExtractsRemoveClasses' ) );
		return $fmt->getText();
	}

	/**
	 * Truncate the given text to a certain number of characters or sentences
	 *
	 * "chars" takes precedence over "sentences". The "ellipsis" message is
	 * appended only when the character limit actually shortened the text.
	 *
	 * @param string $text The text to truncate
	 * @return string
	 */
	private function truncate( $text ) {
		$useTidy = !$this->params['plaintext'];
		$truncator = new TextTruncator( $useTidy );

		if ( $this->params['chars'] ) {
			$truncatedText = $truncator->getFirstChars( $text, $this->params['chars'] );
			if ( $truncatedText !== $text ) {
				$text = $truncatedText . $this->msg( 'ellipsis' )->text();
			}
		} elseif ( $this->params['sentences'] ) {
			$text = $truncator->getFirstSentences( $text, $this->params['sentences'] );
		}
		return $text;
	}

	/**
	 * Rewrites the section markers of a plain-text extract according to the
	 * "sectionformat" parameter
	 *
	 * - raw: the text is returned unchanged
	 * - wiki: each marker becomes a heading wrapped in as many "=" as the level digit
	 * - plain: each marker becomes the bare heading text on a new line
	 *
	 * @param string $text
	 * @return string
	 * @throws \LogicException On any other sectionformat value
	 */
	private function doSections( $text ) {
		// Group 1 is the single-digit level, group 2 the rest of the line
		$pattern = '/' .
			ExtractFormatter::SECTION_MARKER_START . '(\d)' .
			ExtractFormatter::SECTION_MARKER_END . '(.*)/';

		switch ( $this->params['sectionformat'] ) {
			case 'raw':
				return $text;

			case 'wiki':
				return preg_replace_callback( $pattern, static function ( $matches ) {
					$bars = str_repeat( '=', (int)$matches[1] );
					return "\n$bars " . trim( $matches[2] ) . " $bars";
				}, $text );

			case 'plain':
				return preg_replace_callback( $pattern, static function ( $matches ) {
					return "\n" . trim( $matches[2] );
				}, $text );

			default:
				throw new \LogicException( 'Invalid sectionformat' );
		}
	}

	/**
	 * @inheritDoc
	 */
	public function getAllowedParams() {
		return [
			'chars' => [
				ParamValidator::PARAM_TYPE => 'integer',
				IntegerDef::PARAM_MIN => 1,
				IntegerDef::PARAM_MAX => 1200,
			],
			'sentences' => [
				ParamValidator::PARAM_TYPE => 'integer',
				IntegerDef::PARAM_MIN => 1,
				IntegerDef::PARAM_MAX => 10,
			],
			'limit' => [
				ParamValidator::PARAM_DEFAULT => 20,
				ParamValidator::PARAM_TYPE => 'limit',
				IntegerDef::PARAM_MIN => 1,
				IntegerDef::PARAM_MAX => 20,
				IntegerDef::PARAM_MAX2 => 20,
			],
			'intro' => false,
			'plaintext' => false,
			'sectionformat' => [
				ParamValidator::PARAM_TYPE => [ 'plain', 'wiki', 'raw' ],
				ParamValidator::PARAM_DEFAULT => 'wiki',
				ApiBase::PARAM_HELP_MSG_PER_VALUE => [],
			],
			'continue' => [
				ParamValidator::PARAM_TYPE => 'integer',
				ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
			],
		];
	}

	/**
	 * @inheritDoc
	 */
	protected function getExamplesMessages() {
		return [
			'action=query&prop=extracts&exchars=175&titles=Therion'
				=> 'apihelp-query+extracts-example-1',
		];
	}

	/**
	 * @inheritDoc
	 */
	public function getHelpUrls() {
		return 'https://www.mediawiki.org/wiki/Special:MyLanguage/Extension:TextExtracts#API';
	}

}