MediaWiki REL1_34
ApiQueryExtracts.php
Go to the documentation of this file.
1<?php
2
3namespace TextExtracts;
4
5use ApiBase;
6use ApiMain;
8use BagOStuff;
9use Config;
10use FauxRequest;
13use MWTidy;
14
16use Title;
18use User;
19use WikiPage;
20
25
/**
 * Version component mixed into every cache key built by cacheKey().
 */
const CACHE_VERSION = 2;

/**
 * Module prefix handed to the parent constructor and quoted in warnings.
 */
const PREFIX = 'ex';

/**
 * Request parameters for the current call; filled in by execute().
 *
 * @var array
 */
private $params;

/**
 * Extension configuration injected through the constructor.
 *
 * @var Config
 */
private $config;

// TODO: Allow extensions to hook into this to opt-in.
// This is partly for security reasons; see T107170.
/**
 * Content models for which getExtract() will produce an extract.
 *
 * @var string[]
 */
private $supportedContentModels = [ 'wikitext' ];
45
/**
 * Registers the module under the self::PREFIX prefix and stores the
 * extension configuration for later use.
 *
 * @param mixed $query Query object, passed through to the parent constructor
 * @param string $moduleName Name of this query module
 * @param Config $conf Configuration instance kept in $this->config
 */
public function __construct( $query, $moduleName, Config $conf ) {
	parent::__construct(
		$query,
		$moduleName,
		self::PREFIX
	);

	$this->config = $conf;
}
55
/**
 * Evaluates the parameters, performs the requested extraction of text,
 * and sets up the result.
 *
 * One extract is added per good title under query.pages.<id>.extract.
 * Titles in the File namespace get an empty extract plus a single warning.
 * When the limit is reached, or the result is full, a 'continue' offset
 * is set and processing stops.
 */
public function execute() {
	$goodTitles = $this->getPageSet()->getGoodTitles();
	if ( $goodTitles === [] ) {
		return;
	}

	// XML output (and internal callers) receive the text wrapped as [ '*' => ... ].
	$main = $this->getMain();
	$wrapForXml = $main->isInternalMode()
		|| $main->getPrinter()->getFormat() == 'XML';

	$result = $this->getResult();
	$this->params = $this->extractRequestParams();
	$params = $this->params;
	$this->requireMaxOneParameter( $params, 'chars', 'sentences' );

	// Whole-page extracts are restricted to one page per request.
	$limit = intval( $params['limit'] );
	if ( $limit > 1 && !$params['intro'] ) {
		$limit = 1;
		$this->addWarning( [ 'apiwarn-textextracts-limit', $limit ] );
	}

	// Resume from the offset of a previous, interrupted request.
	$offset = 0;
	if ( isset( $params['continue'] ) ) {
		$offset = intval( $params['continue'] );
		$this->dieContinueUsageIf( $offset < 0 || $offset > count( $goodTitles ) );
		$goodTitles = array_slice( $goodTitles, $offset, null, true );
	}

	$processed = 0;
	$sawFileTitle = false;
	foreach ( $goodTitles as $pageId => $title ) {
		$processed++;
		if ( $processed > $limit ) {
			$this->setContinueEnumParameter( 'continue', $offset + $processed - 1 );
			break;
		}

		if ( $title->inNamespace( NS_FILE ) ) {
			$sawFileTitle = true;
			$extract = '';
		} else {
			$extract = $this->truncate( $this->getExtract( $title ) );
			if ( $params['plaintext'] ) {
				$extract = $this->doSections( $extract );
			} else {
				if ( $params['sentences'] ) {
					$this->addWarning( $this->msg( 'apiwarn-textextracts-sentences-and-html', self::PREFIX ) );
				}
				$this->addWarning( 'apiwarn-textextracts-malformed-html' );
			}
		}

		$value = $wrapForXml ? [ '*' => $extract ] : $extract;
		if ( !$result->addValue( [ 'query', 'pages', $pageId ], 'extract', $value ) ) {
			// The result is full: ask the client to retry from this title.
			$this->setContinueEnumParameter( 'continue', $offset + $processed - 1 );
			break;
		}
	}

	if ( $sawFileTitle ) {
		$this->addWarning( 'apiwarn-textextracts-title-in-file-namespace' );
	}
}
122
/**
 * Reports the cache mode for this module; it is 'public' regardless of
 * the parameters supplied.
 *
 * @param array $params Request parameters (not consulted)
 * @return string Always 'public'
 */
public function getCacheMode( $params ) {
	return 'public';
}
130
/**
 * Returns a processed, but not trimmed extract.
 *
 * Lookup order: the cache entry matching the requested mode; for intro-only
 * requests, the cached full-page extract cut down to its first section;
 * finally a fresh parse, whose converted result is then cached.
 *
 * @param Title $title Page to extract from
 * @return string Extract text; '' when the content model is unsupported or
 *  when the page could not be parsed
 */
private function getExtract( Title $title ) {
	$contentModel = $title->getContentModel();
	if ( !in_array( $contentModel, $this->supportedContentModels, true ) ) {
		$this->addWarning( [
			'apiwarn-textextracts-unsupportedmodel',
			wfEscapeWikiText( $title->getPrefixedText() ),
			$contentModel
		] );
		return '';
	}

	$page = WikiPage::factory( $title );

	$introOnly = $this->params['intro'];
	$text = $this->getFromCache( $page, $introOnly );
	// if we need just first section, try retrieving full page and getting first section out of it
	if ( $text === false && $introOnly ) {
		$text = $this->getFromCache( $page, false );
		if ( $text !== false ) {
			$text = $this->getFirstSection( $text, $this->params['plaintext'] );
		}
	}
	if ( $text === false ) {
		$html = $this->parse( $page );
		if ( $html === null ) {
			// parse() has already logged the failure. Do not convert or cache
			// the missing result, otherwise a transient parse error would be
			// served as this page's extract until the cache entry goes away.
			return '';
		}
		$text = $this->convertText( $html );
		$this->setCache( $page, $text );
	}
	return $text;
}
165
/**
 * Builds the cache key for an extract of the given page.
 *
 * The key is made from the cache version, page id, page_touched value,
 * preferred variant of the page language, the plaintext flag of the
 * current request and the intro-only flag.
 *
 * @param BagOStuff $cache Cache used to generate the key
 * @param WikiPage $page Page the extract belongs to
 * @param bool $introOnly Whether the key is for an intro-only extract
 * @return string
 */
private function cacheKey( BagOStuff $cache, WikiPage $page, $introOnly ) {
	$pageId = $page->getId();
	$touched = $page->getTouched();
	$variant = $page->getTitle()->getPageLanguage()->getPreferredVariant();

	return $cache->makeKey(
		'textextracts',
		self::CACHE_VERSION,
		$pageId,
		$touched,
		$variant,
		$this->params['plaintext'],
		$introOnly
	);
}
173
/**
 * Looks up a previously stored extract in the global $wgMemc cache.
 *
 * @param WikiPage $page Page whose extract is wanted
 * @param bool $introOnly Whether to look for the intro-only variant
 * @return mixed Whatever $wgMemc->get() yields for the key; callers
 *  compare against false to detect a miss
 */
private function getFromCache( WikiPage $page, $introOnly ) {
	global $wgMemc;

	return $wgMemc->get( $this->cacheKey( $wgMemc, $page, $introOnly ) );
}
180
/**
 * Stores an extract in the global $wgMemc cache, keyed for the current
 * request's intro setting and kept for 'ParserCacheExpireTime'.
 *
 * @param WikiPage $page Page the extract belongs to
 * @param string $text Extract to store
 */
private function setCache( WikiPage $page, $text ) {
	global $wgMemc;

	$cacheKey = $this->cacheKey( $wgMemc, $page, $this->params['intro'] );
	$expiry = $this->getConfig()->get( 'ParserCacheExpireTime' );
	$wgMemc->set( $cacheKey, $text, $expiry );
}
187
/**
 * Cuts the text off right before its first section boundary.
 *
 * In plain-text mode the boundary is the first section marker emitted by
 * ExtractFormatter; in HTML mode it is the first <h1>..<h6> tag. Text
 * without any boundary is returned unchanged.
 *
 * @param string $text Extract text or page HTML
 * @param bool $plainText Whether $text is in plain-text form
 * @return string
 */
private function getFirstSection( $text, $plainText ) {
	$boundary = $plainText
		? ExtractFormatter::SECTION_MARKER_START
		: '<h[1-6]\b';
	$regexp = '/^(.*?)(?=' . $boundary . ')/s';

	return preg_match( $regexp, $text, $matches ) ? $matches[0] : $text;
}
199
/**
 * Returns page HTML.
 *
 * The parser cache is consulted first. On a miss, an internal action=parse
 * request renders the page, or only section 0 for intro requests. If that
 * fails with 'apierror-nosuchsection', the request is repeated without the
 * section restriction; any other API usage error is re-thrown.
 *
 * @param WikiPage $page Page to render
 * @return string|null HTML, or null (after logging a warning) when the
 *  parse response carries no 'parse' entry
 */
private function parse( WikiPage $page ) {
	$introOnly = $this->params['intro'];
	$parserOptions = new ParserOptions( new User() );

	// first try finding full page in parser cache
	if ( $page->shouldCheckParserCache( $parserOptions, 0 ) ) {
		$cachedOutput = MediaWikiServices::getInstance()->getParserCache()->get( $page, $parserOptions );
		if ( $cachedOutput ) {
			$html = $cachedOutput->getText( [ 'unwrap' => true ] );
			return $introOnly ? $this->getFirstSection( $html, false ) : $html;
		}
	}

	$request = [
		'action' => 'parse',
		'page' => $page->getTitle()->getPrefixedText(),
		'prop' => 'text',
		// Invokes special handling when using partial wikitext (T168743)
		'sectionpreview' => 1,
		'wrapoutputclass' => '',
	];
	// in case of cache miss, render just the needed section
	if ( $introOnly ) {
		$request['section'] = 0;
	}

	$transforms = [
		'BC' => [],
		'Types' => [],
	];
	$apiException = null;
	$api = new ApiMain( new FauxRequest( $request ) );
	try {
		$api->execute();
		$data = $api->getResult()->getResultData( null, $transforms );
	} catch ( ApiUsageException $e ) {
		$apiException = $e->__toString();
		if ( !$e->getStatusValue()->hasMessage( 'apierror-nosuchsection' ) ) {
			// Some other unexpected error - lets just report it to the user
			// on the off chance that is the right thing.
			throw $e;
		}

		// Looks like we tried to get the intro to a page without
		// sections! Lets just grab what we can get.
		unset( $request['section'] );
		$api = new ApiMain( new FauxRequest( $request ) );
		$api->execute();
		$data = $api->getResult()->getResultData( null, $transforms );
	}

	if ( array_key_exists( 'parse', $data ) ) {
		return $data['parse']['text']['*'];
	}

	LoggerFactory::getInstance( 'textextracts' )->warning(
		'API Parse request failed while generating text extract', [
			'title' => $page->getTitle()->getFullText(),
			'url' => $this->getRequest()->getFullRequestURL(),
			'exception' => $apiException,
			'request' => $request
		] );
	return null;
}
271
/**
 * Creates the module with the 'textextracts' configuration obtained from
 * the config factory.
 *
 * @param mixed $query Query object, forwarded to the constructor
 * @param string $name Module name
 * @return self
 */
public static function factory( $query, $name ) {
	$configFactory = MediaWikiServices::getInstance()->getConfigFactory();

	return new self( $query, $name, $configFactory->makeConfig( 'textextracts' ) );
}
281
/**
 * Converts page HTML into an extract.
 *
 * The HTML is run through ExtractFormatter, in plain-text or HTML mode
 * according to the request, after removing what the
 * 'ExtractsRemoveClasses' setting lists.
 *
 * @param string $text Page HTML
 * @return string Text returned by the formatter
 */
private function convertText( $text ) {
	$formatter = new ExtractFormatter( $text, $this->params['plaintext'] );
	$formatter->remove( $this->config->get( 'ExtractsRemoveClasses' ) );

	return $formatter->getText();
}
293
/**
 * Truncate the given text to a certain number of characters or sentences.
 *
 * With 'chars' set, the text is cut by TextTruncator::getFirstChars() and
 * the 'ellipsis' message is appended, but only if the truncator actually
 * changed the text. Text that already fits, including an empty extract, is
 * returned without a trailing ellipsis. With 'sentences' set, the first
 * sentences are returned. With neither, the text is returned unchanged.
 *
 * @param string $text Extract to shorten
 * @return string
 */
private function truncate( $text ) {
	// HTML extracts are passed through tidy, when available, by the truncator.
	if ( !$this->params['plaintext'] && MWTidy::isEnabled() ) {
		$truncator = new TextTruncator( MWTidy::singleton() );
	} else {
		$truncator = new TextTruncator();
	}

	if ( $this->params['chars'] ) {
		$truncated = $truncator->getFirstChars( $text, $this->params['chars'] );
		if ( $truncated !== $text ) {
			// Something was cut off (or rewritten): signal that with an ellipsis.
			$truncated .= $this->msg( 'ellipsis' )->text();
		}
		return $truncated;
	}

	if ( $this->params['sentences'] ) {
		return $truncator->getFirstSentences( $text, $this->params['sentences'] );
	}

	return $text;
}
314
/**
 * Rewrites the section markers of a plain-text extract according to the
 * 'sectionformat' parameter.
 *
 * - 'raw': the text is returned untouched, markers included;
 * - 'wiki': each marker becomes a wikitext-style heading, with as many
 *   '=' signs on either side as the digit captured from the marker;
 * - 'plain': each marker is replaced by the trimmed heading text on a
 *   line of its own.
 *
 * @param string $text Plain-text extract
 * @return string
 * @throws \LogicException If 'sectionformat' has any other value
 */
private function doSections( $text ) {
	$markerPattern = '/' .
		ExtractFormatter::SECTION_MARKER_START . '(\d)' .
		ExtractFormatter::SECTION_MARKER_END . '(.*)/';

	switch ( $this->params['sectionformat'] ) {
		case 'raw':
			return $text;

		case 'wiki':
			$rewriteHeading = function ( $matches ) {
				$bars = str_repeat( '=', $matches[1] );
				return "\n" . $bars . ' ' . trim( $matches[2] ) . ' ' . $bars;
			};
			break;

		case 'plain':
			$rewriteHeading = function ( $matches ) {
				return "\n" . trim( $matches[2] );
			};
			break;

		default:
			throw new \LogicException( 'Invalid sectionformat' );
	}

	return preg_replace_callback( $markerPattern, $rewriteHeading, $text );
}
339
/**
 * Return an array describing all possible parameters to this module.
 *
 * 'chars', 'sentences' and 'limit' carry a lower bound of 1 so that zero
 * or negative values are rejected. 'limit' also declares its default and
 * PARAM_MAX2, the maximum for users with the apihighlimits right, which a
 * 'limit'-typed parameter needs in addition to PARAM_MAX.
 *
 * @return array
 */
public function getAllowedParams() {
	return [
		'chars' => [
			ApiBase::PARAM_TYPE => 'integer',
			ApiBase::PARAM_MIN => 1,
			ApiBase::PARAM_MAX => 1200,
		],
		'sentences' => [
			ApiBase::PARAM_TYPE => 'integer',
			ApiBase::PARAM_MIN => 1,
			ApiBase::PARAM_MAX => 10,
		],
		'limit' => [
			// Without a default, execute() would read a limit of 0 and
			// return no extracts at all.
			ApiBase::PARAM_DFLT => 20,
			ApiBase::PARAM_TYPE => 'limit',
			ApiBase::PARAM_MIN => 1,
			ApiBase::PARAM_MAX => 20,
			ApiBase::PARAM_MAX2 => 20,
		],
		'intro' => false,
		'plaintext' => false,
		'sectionformat' => [
			ApiBase::PARAM_TYPE => [ 'plain', 'wiki', 'raw' ],
			ApiBase::PARAM_DFLT => 'wiki',
		],
		'continue' => [
			ApiBase::PARAM_TYPE => 'integer',
			ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
		],
	];
}
375
/**
 * Maps example query strings for this module to the message keys that
 * describe them.
 *
 * @return array Example query string => message key
 */
protected function getExamplesMessages() {
	$examples = [];
	$examples['action=query&prop=extracts&exchars=175&titles=Therion'] =
		'apihelp-query+extracts-example-1';

	return $examples;
}
386
/**
 * Returns the URL of the on-wiki documentation for this module.
 *
 * @return string
 */
public function getHelpUrls() {
	$helpUrl = 'https://www.mediawiki.org/wiki/Special:MyLanguage/Extension:TextExtracts#API';

	return $helpUrl;
}
394}
wfEscapeWikiText( $text)
Escapes the given text so that it may be output using addWikiText() without any linking,...
$wgMemc
Definition Setup.php:790
This abstract class implements many basic API functions, and is the base of all API classes.
Definition ApiBase.php:42
const PARAM_MAX2
(integer) Max value allowed for the parameter for users with the apihighlimits right,...
Definition ApiBase.php:103
const PARAM_MAX
(integer) Max value allowed for the parameter, for PARAM_TYPE 'integer' and 'limit'.
Definition ApiBase.php:97
const PARAM_TYPE
(string|string[]) Either an array of allowed value strings, or a string type as described below.
Definition ApiBase.php:94
const PARAM_DFLT
(null|boolean|integer|string) Default value of the parameter.
Definition ApiBase.php:55
const PARAM_MIN
(integer) Lowest value allowed for the parameter, for PARAM_TYPE 'integer' and 'limit'.
Definition ApiBase.php:106
const PARAM_HELP_MSG
(string|array|Message) Specify an alternative i18n documentation message for this parameter.
Definition ApiBase.php:131
This is the main API class, used for both external and internal processing.
Definition ApiMain.php:41
This is a base class for all Query modules.
Exception used to abort API execution with an error.
getStatusValue()
Fetch the error status.
Class representing a cache/ephemeral data store.
Definition BagOStuff.php:63
WebRequest clone which takes values from a provided array.
Class to interact with and configure Remex tidy.
Definition MWTidy.php:29
PSR-3 logger instance factory.
MediaWikiServices is the service locator for the application scope of MediaWiki.
Set options of the Parser.
getFirstSection( $text, $plainText)
execute()
Evaluates the parameters, performs the requested extraction of text, and sets up the result.
static factory( $query, $name)
truncate( $text)
Truncate the given text to a certain number of characters or sentences.
getFromCache(WikiPage $page, $introOnly)
getExtract(Title $title)
Returns a processed, but not trimmed extract.
parse(WikiPage $page)
Returns page HTML.
cacheKey(BagOStuff $cache, WikiPage $page, $introOnly)
__construct( $query, $moduleName, Config $conf)
getAllowedParams()
Return an array describing all possible parameters to this module.
setCache(WikiPage $page, $text)
convertText( $text)
Converts page HTML into an extract.
Provides text-only or limited-HTML extracts of page HTML.
This class needs to understand HTML as well as plain text.
Represents a title within MediaWiki.
Definition Title.php:42
The User object encapsulates all of the user-specific settings (user_id, name, rights,...
Definition User.php:51
Class representing a MediaWiki article and history.
Definition WikiPage.php:47
shouldCheckParserCache(ParserOptions $parserOptions, $oldId)
Should the parser cache be used?
getTitle()
Get the title object of the article.
Definition WikiPage.php:298
getTouched()
Get the page_touched field.
Definition WikiPage.php:692
const NS_FILE
Definition Defines.php:75
Interface for configuration instances.
Definition Config.php:28
$cache
Definition mcc.php:33