MediaWiki  1.34.0
ApiQueryExtracts.php
Go to the documentation of this file.
1 <?php
2 
3 namespace TextExtracts;
4 
5 use ApiBase;
6 use ApiMain;
7 use ApiQueryBase;
8 use BagOStuff;
9 use Config;
10 use FauxRequest;
13 use MWTidy;
14 
15 use ParserOptions;
16 use Title;
18 use User;
19 use WikiPage;
20 
25 
29  const CACHE_VERSION = 2;
30 
31  const PREFIX = 'ex';
32 
33  private $params;
37  private $config;
38 
39  // TODO: Allow extensions to hook into this to opt-in.
40  // This is partly for security reasons; see T107170.
44  private $supportedContentModels = [ 'wikitext' ];
45 
/**
 * Register this module under the "ex" parameter prefix and remember the
 * configuration object handed in by the factory.
 *
 * @param mixed $query Passed through unchanged to the parent constructor
 * @param string $moduleName Passed through unchanged to the parent constructor
 * @param Config $conf Configuration later read by convertText()
 */
public function __construct( $query, $moduleName, Config $conf ) {
	parent::__construct( $query, $moduleName, self::PREFIX );

	$this->config = $conf;
}
55 
/**
 * Build an extract for each requested page and add it to the API result.
 *
 * Steps, in order:
 *  - bail out when the page set has no good titles;
 *  - read the request parameters ("chars" and "sentences" are mutually
 *    exclusive);
 *  - clamp the limit to 1 (with a warning) when whole-page extracts are
 *    requested, i.e. "intro" is not set;
 *  - skip the titles already handled according to "continue";
 *  - for every remaining title, produce the extract and store it under
 *    query.pages.<id>.extract, setting a continuation offset when the limit
 *    is reached or the result does not accept the value.
 *
 * Titles in the File namespace get an empty extract and trigger a single
 * warning after the loop.
 */
public function execute() {
	$goodTitles = $this->getPageSet()->getGoodTitles();
	if ( $goodTitles === [] ) {
		return;
	}

	// In internal mode or XML output the extract is wrapped as [ '*' => ... ].
	$main = $this->getMain();
	$wrapExtract = $main->isInternalMode()
		|| $main->getPrinter()->getFormat() == 'XML';

	$apiResult = $this->getResult();
	$this->params = $this->extractRequestParams();
	$this->requireMaxOneParameter( $this->params, 'chars', 'sentences' );

	$maxPages = intval( $this->params['limit'] );
	if ( $maxPages > 1 && !$this->params['intro'] ) {
		$maxPages = 1;
		$this->addWarning( [ 'apiwarn-textextracts-limit', $maxPages ] );
	}

	$offset = 0;
	if ( isset( $this->params['continue'] ) ) {
		$offset = intval( $this->params['continue'] );
		$this->dieContinueUsageIf( $offset < 0 || $offset > count( $goodTitles ) );
		$goodTitles = array_slice( $goodTitles, $offset, null, true );
	}

	// Number of titles fully handled in this request so far.
	$handled = 0;
	$sawFileTitle = false;

	foreach ( $goodTitles as $pageId => $title ) {
		if ( $handled >= $maxPages ) {
			$this->setContinueEnumParameter( 'continue', $offset + $handled );
			break;
		}

		$extract = '';
		if ( $title->inNamespace( NS_FILE ) ) {
			$sawFileTitle = true;
		} else {
			$extract = $this->truncate( $this->getExtract( $title ) );
			if ( $this->params['plaintext'] ) {
				$extract = $this->doSections( $extract );
			} else {
				if ( $this->params['sentences'] ) {
					$this->addWarning( $this->msg( 'apiwarn-textextracts-sentences-and-html', self::PREFIX ) );
				}
				$this->addWarning( 'apiwarn-textextracts-malformed-html' );
			}
		}

		$value = $wrapExtract ? [ '*' => $extract ] : $extract;
		$fits = $apiResult->addValue( [ 'query', 'pages', $pageId ], 'extract', $value );
		if ( !$fits ) {
			// The current title was not stored, so the next request restarts at it.
			$this->setContinueEnumParameter( 'continue', $offset + $handled );
			break;
		}

		$handled++;
	}

	if ( $sawFileTitle ) {
		$this->addWarning( 'apiwarn-textextracts-title-in-file-namespace' );
	}
}
122 
/**
 * Report the cache mode of this module; it is always 'public',
 * independent of the request parameters.
 *
 * @param array $params Unused
 * @return string
 */
public function getCacheMode( $params ) {
	return 'public';
}
130 
/**
 * Return the processed, but not yet truncated, extract for a title.
 *
 * Lookup order:
 *  1. the cache entry matching the current "intro" setting;
 *  2. when only the intro is wanted, the cached full-page extract, from
 *     which the first section is cut out;
 *  3. a fresh parse, whose converted result is then cached.
 *
 * Pages whose content model is not listed in $supportedContentModels yield
 * an empty string and a warning.
 *
 * @param Title $title
 * @return string
 */
private function getExtract( Title $title ) {
	$contentModel = $title->getContentModel();
	if ( !in_array( $contentModel, $this->supportedContentModels, true ) ) {
		$this->addWarning( [
			'apiwarn-textextracts-unsupportedmodel',
			wfEscapeWikiText( $title->getPrefixedText() ),
			$contentModel
		] );
		return '';
	}

	$page = WikiPage::factory( $title );

	$introOnly = $this->params['intro'];
	$text = $this->getFromCache( $page, $introOnly );
	// If we need just the first section, try retrieving the full page from
	// the cache and cutting the first section out of it.
	if ( $text === false && $introOnly ) {
		$text = $this->getFromCache( $page, false );
		if ( $text !== false ) {
			$text = $this->getFirstSection( $text, $this->params['plaintext'] );
		}
	}

	if ( $text === false ) {
		$html = $this->parse( $page );
		if ( $html === null ) {
			// parse() signals a failed parse request with null (and logs it).
			// Do not convert or cache that: a cached empty extract would
			// outlive a transient failure until the entry expires or the
			// page is touched again.
			return '';
		}
		$text = $this->convertText( $html );
		$this->setCache( $page, $text );
	}

	return $text;
}
165 
/**
 * Build the cache key for a page's extract.
 *
 * The key is made from the cache version, the page ID, the page_touched
 * value, the preferred variant of the page language, the "plaintext"
 * parameter and the intro-only flag.
 *
 * @param BagOStuff $cache Cache whose makeKey() is used
 * @param WikiPage $page
 * @param bool $introOnly Whether the key is for an intro-only extract
 * @return string
 */
private function cacheKey( BagOStuff $cache, WikiPage $page, $introOnly ) {
	$pageId = $page->getId();
	$touched = $page->getTouched();
	$variant = $page->getTitle()->getPageLanguage()->getPreferredVariant();

	return $cache->makeKey(
		'textextracts',
		self::CACHE_VERSION,
		$pageId,
		$touched,
		$variant,
		$this->params['plaintext'],
		$introOnly
	);
}
173 
/**
 * Look up a previously stored extract in $wgMemc.
 *
 * @param WikiPage $page
 * @param bool $introOnly Whether to look for the intro-only variant
 * @return mixed Whatever $wgMemc->get() returns; callers treat false as a miss
 */
private function getFromCache( WikiPage $page, $introOnly ) {
	global $wgMemc;

	return $wgMemc->get( $this->cacheKey( $wgMemc, $page, $introOnly ) );
}
180 
/**
 * Store an extract in $wgMemc under the key matching the current "intro"
 * parameter, expiring after the configured ParserCacheExpireTime.
 *
 * @param WikiPage $page
 * @param string $text Extract to store
 */
private function setCache( WikiPage $page, $text ) {
	global $wgMemc;

	$cacheKey = $this->cacheKey( $wgMemc, $page, $this->params['intro'] );
	$expiry = $this->getConfig()->get( 'ParserCacheExpireTime' );

	$wgMemc->set( $cacheKey, $text, $expiry );
}
187 
/**
 * Cut a text down to everything that precedes its first section heading.
 *
 * In plain-text mode a heading starts at
 * ExtractFormatter::SECTION_MARKER_START; otherwise it starts at the first
 * <h1>..<h6> tag. When no heading is found the text is returned unchanged.
 *
 * @param string $text
 * @param bool $plainText Whether $text is plain text rather than HTML
 * @return string
 */
private function getFirstSection( $text, $plainText ) {
	// Fragment placed inside the lookahead that marks the end of the intro.
	$headingStart = $plainText
		? ExtractFormatter::SECTION_MARKER_START
		: '<h[1-6]\b';

	$matches = [];
	if ( !preg_match( '/^(.*?)(?=' . $headingStart . ')/s', $text, $matches ) ) {
		return $text;
	}

	return $matches[0];
}
199 
/**
 * Return the HTML of a page, or of its first section only when the "intro"
 * parameter is set.
 *
 * The parser cache is consulted first. On a miss an internal action=parse
 * API request renders just what is needed. If that request fails with
 * "apierror-nosuchsection" it is repeated without the section restriction;
 * any other API usage error is rethrown.
 *
 * Class names that the file's visible import list does not cover are
 * written fully qualified so they resolve from inside this namespace.
 *
 * @param WikiPage $page
 * @return string|null HTML, or null when the API response has no 'parse' key
 * @throws \ApiUsageException
 */
private function parse( WikiPage $page ) {
	$parserOptions = new ParserOptions( new User() );

	// First try finding the full page in the parser cache.
	if ( $page->shouldCheckParserCache( $parserOptions, 0 ) ) {
		$pout = \MediaWiki\MediaWikiServices::getInstance()
			->getParserCache()
			->get( $page, $parserOptions );
		if ( $pout ) {
			$text = $pout->getText( [ 'unwrap' => true ] );
			if ( $this->params['intro'] ) {
				$text = $this->getFirstSection( $text, false );
			}
			return $text;
		}
	}

	$request = [
		'action' => 'parse',
		'page' => $page->getTitle()->getPrefixedText(),
		'prop' => 'text',
		// Invokes special handling when using partial wikitext (T168743)
		'sectionpreview' => 1,
		'wrapoutputclass' => '',
	];
	// In case of a cache miss, render just the needed section.
	if ( $this->params['intro'] ) {
		$request['section'] = 0;
	}

	$apiException = null;
	try {
		$data = $this->runParseRequest( $request );
	} catch ( \ApiUsageException $e ) {
		if ( !$e->getStatusValue()->hasMessage( 'apierror-nosuchsection' ) ) {
			// Some other unexpected error - let's just report it to the user
			// on the off chance that is the right thing.
			throw $e;
		}
		$apiException = $e->__toString();
		// Looks like we tried to get the intro to a page without
		// sections! Let's just grab what we can get.
		unset( $request['section'] );
		$data = $this->runParseRequest( $request );
	}

	if ( !array_key_exists( 'parse', $data ) ) {
		\MediaWiki\Logger\LoggerFactory::getInstance( 'textextracts' )->warning(
			'API Parse request failed while generating text extract', [
				'title' => $page->getTitle()->getFullText(),
				'url' => $this->getRequest()->getFullRequestURL(),
				'exception' => $apiException,
				'request' => $request
			] );
		return null;
	}

	return $data['parse']['text']['*'];
}

/**
 * Execute an internal API request and return its result data.
 *
 * @param array $request Request parameters for the FauxRequest
 * @return array
 * @throws \ApiUsageException
 */
private function runParseRequest( array $request ) {
	$api = new ApiMain( new FauxRequest( $request ) );
	$api->execute();

	return $api->getResult()->getResultData( null, [
		'BC' => [],
		'Types' => [],
	] );
}
271 
/**
 * Create the module with the 'textextracts' configuration object.
 *
 * MediaWikiServices is written fully qualified because the file's visible
 * import list does not cover it; the name resolves the same way whether or
 * not an import exists.
 *
 * @param mixed $query Passed through unchanged to the constructor
 * @param string $name Module name, passed through unchanged to the constructor
 * @return self
 */
public static function factory( $query, $name ) {
	$config = \MediaWiki\MediaWikiServices::getInstance()
		->getConfigFactory()
		->makeConfig( 'textextracts' );

	return new self( $query, $name, $config );
}
281 
/**
 * Convert page HTML into an extract.
 *
 * The HTML is run through an ExtractFormatter (in plain-text mode when the
 * "plaintext" parameter is set) after removing what the
 * ExtractsRemoveClasses configuration lists.
 *
 * @param string $text Page HTML
 * @return string
 */
private function convertText( $text ) {
	$formatter = new ExtractFormatter( $text, $this->params['plaintext'] );
	$formatter->remove( $this->config->get( 'ExtractsRemoveClasses' ) );

	return $formatter->getText();
}
293 
/**
 * Truncate the given text to a certain number of characters or sentences,
 * depending on which of the "chars" / "sentences" parameters is set.
 *
 * In "chars" mode the 'ellipsis' message is appended only when the
 * truncator actually changed the text, so short or empty extracts are not
 * marked as cut off.
 *
 * @param string $text
 * @return string
 */
private function truncate( $text ) {
	// A tidy instance is handed to the truncator only for HTML output.
	if ( !$this->params['plaintext'] && MWTidy::isEnabled() ) {
		$truncator = new TextTruncator( MWTidy::singleton() );
	} else {
		$truncator = new TextTruncator();
	}

	if ( $this->params['chars'] ) {
		$truncated = $truncator->getFirstChars( $text, $this->params['chars'] );
		if ( $truncated !== $text ) {
			$truncated .= $this->msg( 'ellipsis' )->text();
		}
		return $truncated;
	}

	if ( $this->params['sentences'] ) {
		return $truncator->getFirstSentences( $text, $this->params['sentences'] );
	}

	return $text;
}
314 
/**
 * Rewrite the section markers of a plain-text extract according to the
 * "sectionformat" parameter.
 *
 *  - 'raw':   the text is returned untouched, markers included;
 *  - 'wiki':  each marker line becomes "== Heading ==", with as many "="
 *             as the digit captured from the marker;
 *  - 'plain': each marker line becomes the trimmed heading text.
 *
 * @param string $text
 * @return string
 * @throws \LogicException For any other sectionformat value
 */
private function doSections( $text ) {
	switch ( $this->params['sectionformat'] ) {
		case 'raw':
			return $text;

		case 'wiki':
			$renderHeading = static function ( $m ) {
				$bars = str_repeat( '=', $m[1] );
				return "\n$bars " . trim( $m[2] ) . " $bars";
			};
			break;

		case 'plain':
			$renderHeading = static function ( $m ) {
				return "\n" . trim( $m[2] );
			};
			break;

		default:
			throw new \LogicException( 'Invalid sectionformat' );
	}

	// Group 1: the single digit between the markers; group 2: rest of the line.
	$pattern = '/' .
		ExtractFormatter::SECTION_MARKER_START . '(\d)' .
		ExtractFormatter::SECTION_MARKER_END . '(.*)/';

	return preg_replace_callback( $pattern, $renderHeading, $text );
}
339 
/**
 * Describe the parameters accepted by this module.
 *
 * @return array Parameter name => default value or settings array
 */
public function getAllowedParams() {
	$chars = [
		ApiBase::PARAM_TYPE => 'integer',
		ApiBase::PARAM_MIN => 1,
		ApiBase::PARAM_MAX => 1200,
	];

	$sentences = [
		ApiBase::PARAM_TYPE => 'integer',
		ApiBase::PARAM_MIN => 1,
		ApiBase::PARAM_MAX => 10,
	];

	// The same ceiling applies with and without the higher API limits.
	$limit = [
		ApiBase::PARAM_DFLT => 20,
		ApiBase::PARAM_TYPE => 'limit',
		ApiBase::PARAM_MIN => 1,
		ApiBase::PARAM_MAX => 20,
		ApiBase::PARAM_MAX2 => 20,
	];

	$sectionFormat = [
		ApiBase::PARAM_TYPE => [ 'plain', 'wiki', 'raw' ],
		ApiBase::PARAM_DFLT => 'wiki',
	];

	$continue = [
		ApiBase::PARAM_TYPE => 'integer',
		ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
	];

	return [
		'chars' => $chars,
		'sentences' => $sentences,
		'limit' => $limit,
		'intro' => false,
		'plaintext' => false,
		'sectionformat' => $sectionFormat,
		'continue' => $continue,
	];
}
375 
/**
 * Provide the usage example of this module.
 *
 * @return array Example query string => message key describing it
 */
protected function getExamplesMessages() {
	$exampleQuery = 'action=query&prop=extracts&exchars=175&titles=Therion';

	return [
		$exampleQuery => 'apihelp-query+extracts-example-1',
	];
}
386 
/**
 * Point to the API section of the extension's documentation page.
 *
 * @return string
 */
public function getHelpUrls() {
	$helpUrl = 'https://www.mediawiki.org/wiki/Special:MyLanguage/Extension:TextExtracts#API';

	return $helpUrl;
}
394 }
ApiUsageException\getStatusValue
getStatusValue()
Fetch the error status.
Definition: ApiUsageException.php:101
ParserOptions
Set options of the Parser.
Definition: ParserOptions.php:42
ApiMain
This is the main API class, used for both external and internal processing.
Definition: ApiMain.php:41
FauxRequest
WebRequest clone which takes values from a provided array.
Definition: FauxRequest.php:33
ApiUsageException
Exception used to abort API execution with an error.
Definition: ApiUsageException.php:28
TextExtracts\ApiQueryExtracts\getFromCache
getFromCache(WikiPage $page, $introOnly)
Definition: ApiQueryExtracts.php:174
TextExtracts\ApiQueryExtracts\convertText
convertText( $text)
Converts page HTML into an extract.
Definition: ApiQueryExtracts.php:287
TextExtracts\ApiQueryExtracts\$params
$params
Definition: ApiQueryExtracts.php:33
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:117
ApiBase\PARAM_HELP_MSG
const PARAM_HELP_MSG
(string|array|Message) Specify an alternative i18n documentation message for this parameter.
Definition: ApiBase.php:131
WikiPage\getTouched
getTouched()
Get the page_touched field.
Definition: WikiPage.php:692
ApiBase\PARAM_TYPE
const PARAM_TYPE
(string|string[]) Either an array of allowed value strings, or a string type as described below.
Definition: ApiBase.php:94
TextExtracts\ApiQueryExtracts\parse
parse(WikiPage $page)
Returns page HTML.
Definition: ApiQueryExtracts.php:206
WikiPage
Class representing a MediaWiki article and history.
Definition: WikiPage.php:47
TextExtracts\ApiQueryExtracts\getFirstSection
getFirstSection( $text, $plainText)
Definition: ApiQueryExtracts.php:188
NS_FILE
const NS_FILE
Definition: Defines.php:66
BagOStuff
Class representing a cache/ephemeral data store.
Definition: BagOStuff.php:63
MWTidy\isEnabled
static isEnabled()
Definition: MWTidy.php:54
TextExtracts\ApiQueryExtracts\truncate
truncate( $text)
Truncate the given text to a certain number of characters or sentences.
Definition: ApiQueryExtracts.php:299
TextExtracts\ApiQueryExtracts\getHelpUrls
getHelpUrls()
Definition: ApiQueryExtracts.php:391
ApiBase
This abstract class implements many basic API functions, and is the base of all API classes.
Definition: ApiBase.php:42
$wgMemc
$wgMemc
Definition: Setup.php:791
Config
Interface for configuration instances.
Definition: Config.php:28
ApiBase\PARAM_MIN
const PARAM_MIN
(integer) Lowest value allowed for the parameter, for PARAM_TYPE 'integer' and 'limit'.
Definition: ApiBase.php:106
WikiPage\factory
static factory(Title $title)
Create a WikiPage object of the appropriate class for the given title.
Definition: WikiPage.php:142
MediaWiki\Logger\LoggerFactory
PSR-3 logger instance factory.
Definition: LoggerFactory.php:45
ApiQueryBase
This is a base class for all Query modules.
Definition: ApiQueryBase.php:34
WikiPage\getId
getId()
Definition: WikiPage.php:600
$matches
$matches
Definition: NoLocalSettings.php:24
WikiPage\shouldCheckParserCache
shouldCheckParserCache(ParserOptions $parserOptions, $oldId)
Should the parser cache be used?
Definition: WikiPage.php:1211
TextExtracts\ApiQueryExtracts
Definition: ApiQueryExtracts.php:24
WikiPage\getTitle
getTitle()
Get the title object of the article.
Definition: WikiPage.php:298
ApiBase\PARAM_MAX
const PARAM_MAX
(integer) Max value allowed for the parameter, for PARAM_TYPE 'integer' and 'limit'.
Definition: ApiBase.php:97
$t
$t
Definition: make-normalization-table.php:143
TextExtracts\ApiQueryExtracts\cacheKey
cacheKey(BagOStuff $cache, WikiPage $page, $introOnly)
Definition: ApiQueryExtracts.php:166
TextExtracts\ApiQueryExtracts\getExtract
getExtract(Title $title)
Returns a processed, but not trimmed extract.
Definition: ApiQueryExtracts.php:136
$title
$title
Definition: testCompression.php:34
MWTidy
Class to interact with and configure Remex tidy.
Definition: MWTidy.php:29
TextExtracts\TextTruncator
This class needs to understand HTML as well as plain text.
Definition: TextTruncator.php:14
TextExtracts\ApiQueryExtracts\__construct
__construct( $query, $moduleName, Config $conf)
Definition: ApiQueryExtracts.php:51
TextExtracts
Definition: ApiQueryExtracts.php:3
ApiUsageException\__toString
__toString()
Definition: ApiUsageException.php:115
wfEscapeWikiText
wfEscapeWikiText( $text)
Escapes the given text so that it may be output using addWikiText() without any linking,...
Definition: GlobalFunctions.php:1551
TextExtracts\ExtractFormatter
Provides text-only or limited-HTML extracts of page HTML.
Definition: ExtractFormatter.php:13
TextExtracts\ApiQueryExtracts\$config
Config $config
Definition: ApiQueryExtracts.php:37
TextExtracts\ApiQueryExtracts\getCacheMode
getCacheMode( $params)
Definition: ApiQueryExtracts.php:127
TextExtracts\ApiQueryExtracts\doSections
doSections( $text)
Definition: ApiQueryExtracts.php:315
TextExtracts\ApiQueryExtracts\getAllowedParams
getAllowedParams()
Return an array describing all possible parameters to this module.
Definition: ApiQueryExtracts.php:344
Title
Represents a title within MediaWiki.
Definition: Title.php:42
$cache
$cache
Definition: mcc.php:33
MWTidy\singleton
static singleton()
Definition: MWTidy.php:61
TextExtracts\ApiQueryExtracts\factory
static factory( $query, $name)
Definition: ApiQueryExtracts.php:277
TextExtracts\ApiQueryExtracts\setCache
setCache(WikiPage $page, $text)
Definition: ApiQueryExtracts.php:181
ApiBase\PARAM_DFLT
const PARAM_DFLT
(null|boolean|integer|string) Default value of the parameter.
Definition: ApiBase.php:55
ApiBase\PARAM_MAX2
const PARAM_MAX2
(integer) Max value allowed for the parameter for users with the apihighlimits right,...
Definition: ApiBase.php:103
TextExtracts\ApiQueryExtracts\execute
execute()
Evaluates the parameters, performs the requested extraction of text, and sets up the result.
Definition: ApiQueryExtracts.php:61
TextExtracts\ApiQueryExtracts\getExamplesMessages
getExamplesMessages()
Definition: ApiQueryExtracts.php:380
User
The User object encapsulates all of the user-specific settings (user_id, name, rights,...
Definition: User.php:51