Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
50.29% covered (warning)
50.29%
86 / 171
66.67% covered (warning)
66.67%
10 / 15
CRAP
0.00% covered (danger)
0.00%
0 / 1
ApiQueryExtracts
50.29% covered (warning)
50.29%
86 / 171
66.67% covered (warning)
66.67%
10 / 15
330.98
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 execute
0.00% covered (danger)
0.00%
0 / 42
0.00% covered (danger)
0.00%
0 / 1
272
 getCacheMode
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getExtract
0.00% covered (danger)
0.00%
0 / 20
0.00% covered (danger)
0.00%
0 / 1
42
 cacheKey
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 getFromCache
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 setCache
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 getFirstSection
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 parse
0.00% covered (danger)
0.00%
0 / 18
0.00% covered (danger)
0.00%
0 / 1
12
 convertText
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 truncate
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 doSections
93.75% covered (success)
93.75%
15 / 16
0.00% covered (danger)
0.00%
0 / 1
5.01
 getAllowedParams
100.00% covered (success)
100.00%
30 / 30
100.00% covered (success)
100.00%
1 / 1
1
 getExamplesMessages
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 getHelpUrls
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace TextExtracts;
4
5use ApiBase;
6use ApiQueryBase;
7use ApiUsageException;
8use MediaWiki\Config\Config;
9use MediaWiki\Config\ConfigFactory;
10use MediaWiki\Languages\LanguageConverterFactory;
11use MediaWiki\Logger\LoggerFactory;
12use MediaWiki\MediaWikiServices;
13use MediaWiki\Page\PageIdentity;
14use MediaWiki\Page\WikiPageFactory;
15use MediaWiki\Title\TitleFormatter;
16use ParserOptions;
17use WANObjectCache;
18use Wikimedia\ParamValidator\ParamValidator;
19use WikiPage;
20
21/**
22 * @license GPL-2.0-or-later
23 */
24class ApiQueryExtracts extends ApiQueryBase {
25
    /**
     * Bump when memcache needs clearing.
     *
     * The value is one component of every cache key built by cacheKey(),
     * so changing it makes previously stored extracts unreachable.
     */
    private const CACHE_VERSION = 2;

    // Parameter prefix handed to the parent constructor; also passed as a
    // message parameter for the "sentences and HTML" warning in execute().
    private const PREFIX = 'ex';

    /**
     * Request parameters, as returned by extractRequestParams() in execute().
     * The private helpers read their options from here.
     *
     * @var array
     */
    private $params;

    /**
     * Configuration obtained via ConfigFactory::makeConfig( 'textextracts' ).
     *
     * @var Config
     */
    private $config;
    /**
     * Cache used to store and look up generated extracts.
     *
     * @var WANObjectCache
     */
    private $cache;
    /**
     * Used to find the preferred language variant for the cache key.
     *
     * @var LanguageConverterFactory
     */
    private $langConvFactory;
    /**
     * Used to turn the requested page identities into WikiPage objects.
     *
     * @var WikiPageFactory
     */
    private $wikiPageFactory;
    // Used to format the prefixed page title in the unsupported-model warning.
    private TitleFormatter $titleFormatter;

    // TODO: Allow extensions to hook into this to opt-in.
    // This is partly for security reasons; see T107170.
    /**
     * Content models for which getExtract() will produce an extract; pages
     * with any other model get a warning and an empty string.
     *
     * @var string[]
     */
    private $supportedContentModels = [ 'wikitext' ];
62
63    /**
64     * @param \ApiQuery $query API query module object
65     * @param string $moduleName Name of this query module
66     * @param ConfigFactory $configFactory
67     * @param WANObjectCache $cache
68     * @param LanguageConverterFactory $langConvFactory
69     * @param WikiPageFactory $wikiPageFactory
70     * @param TitleFormatter $titleFormatter
71     */
72    public function __construct(
73        $query,
74        $moduleName,
75        ConfigFactory $configFactory,
76        WANObjectCache $cache,
77        LanguageConverterFactory $langConvFactory,
78        WikiPageFactory $wikiPageFactory,
79        TitleFormatter $titleFormatter
80    ) {
81        parent::__construct( $query, $moduleName, self::PREFIX );
82        $this->config = $configFactory->makeConfig( 'textextracts' );
83        $this->cache = $cache;
84        $this->langConvFactory = $langConvFactory;
85        $this->wikiPageFactory = $wikiPageFactory;
86        $this->titleFormatter = $titleFormatter;
87    }
88
89    /**
90     * Evaluates the parameters, performs the requested extraction of text,
91     * and sets up the result
92     */
93    public function execute() {
94        $titles = $this->getPageSet()->getGoodPages();
95        if ( $titles === [] ) {
96            return;
97        }
98        $isXml = $this->getMain()->isInternalMode()
99            || $this->getMain()->getPrinter()->getFormat() == 'XML';
100        $result = $this->getResult();
101        $params = $this->params = $this->extractRequestParams();
102        $this->requireMaxOneParameter( $params, 'chars', 'sentences' );
103        $continue = 0;
104        $limit = intval( $params['limit'] );
105        if ( $limit > 1 && !$params['intro'] && count( $titles ) > 1 ) {
106            $limit = 1;
107            $this->addWarning( [ 'apiwarn-textextracts-limit', $limit ] );
108        }
109        if ( isset( $params['continue'] ) ) {
110            $continue = intval( $params['continue'] );
111            $this->dieContinueUsageIf( $continue < 0 || $continue > count( $titles ) );
112            $titles = array_slice( $titles, $continue, null, true );
113        }
114        $count = 0;
115        $titleInFileNamespace = false;
116        /** @var PageIdentity $t */
117        foreach ( $titles as $id => $t ) {
118            if ( ++$count > $limit ) {
119                $this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
120                break;
121            }
122
123            if ( $t->getNamespace() === NS_FILE ) {
124                $text = '';
125                $titleInFileNamespace = true;
126            } else {
127                $params = $this->params;
128                $text = $this->getExtract( $t );
129                $text = $this->truncate( $text );
130                if ( $params['plaintext'] ) {
131                    $text = $this->doSections( $text );
132                } else {
133                    if ( $params['sentences'] ) {
134                        $this->addWarning( $this->msg( 'apiwarn-textextracts-sentences-and-html', self::PREFIX ) );
135                    }
136                    $this->addWarning( 'apiwarn-textextracts-malformed-html' );
137                }
138            }
139
140            if ( $isXml ) {
141                $fit = $result->addValue( [ 'query', 'pages', $id ], 'extract', [ '*' => $text ] );
142            } else {
143                $fit = $result->addValue( [ 'query', 'pages', $id ], 'extract', $text );
144            }
145            if ( !$fit ) {
146                $this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
147                break;
148            }
149        }
150        if ( $titleInFileNamespace ) {
151            $this->addWarning( 'apiwarn-textextracts-title-in-file-namespace' );
152        }
153    }
154
155    /**
156     * @param array $params Ignored parameters
157     * @return string
158     */
159    public function getCacheMode( $params ) {
160        return 'public';
161    }
162
163    /**
164     * Returns a processed, but not trimmed extract
165     * @param PageIdentity $title
166     * @return string
167     */
168    private function getExtract( PageIdentity $title ) {
169        $page = $this->wikiPageFactory->newFromTitle( $title );
170
171        $contentModel = $page->getContentModel();
172        if ( !in_array( $contentModel, $this->supportedContentModels, true ) ) {
173            $this->addWarning( [
174                'apiwarn-textextracts-unsupportedmodel',
175                wfEscapeWikiText( $this->titleFormatter->getPrefixedText( $title ) ),
176                $contentModel
177            ] );
178            return '';
179        }
180
181        $introOnly = $this->params['intro'];
182        $text = $this->getFromCache( $page, $introOnly );
183        // if we need just first section, try retrieving full page and getting first section out of it
184        if ( $text === false && $introOnly ) {
185            $text = $this->getFromCache( $page, false );
186            if ( $text !== false ) {
187                $text = $this->getFirstSection( $text, $this->params['plaintext'] );
188            }
189        }
190        if ( $text === false ) {
191            $text = $this->parse( $page );
192            $text = $this->convertText( $text );
193            $this->setCache( $page, $text );
194        }
195        return $text;
196    }
197
198    /**
199     * @param WANObjectCache $cache
200     * @param WikiPage $page
201     * @param bool $introOnly
202     * @return string
203     */
204    private function cacheKey( WANObjectCache $cache, WikiPage $page, $introOnly ) {
205        $langConv = $this->langConvFactory->getLanguageConverter( $page->getTitle()->getPageLanguage() );
206        return $cache->makeKey( 'textextracts', self::CACHE_VERSION,
207            $page->getId(), $page->getTouched(),
208            $langConv->getPreferredVariant(),
209            $this->params['plaintext'] ? 'plaintext' : 'html',
210            $introOnly ? 'intro' : 'full'
211        );
212    }
213
214    /**
215     * @param WikiPage $page
216     * @param bool $introOnly
217     * @return string|false
218     */
219    private function getFromCache( WikiPage $page, $introOnly ) {
220        $cache = $this->cache;
221        // @TODO: replace with getWithSetCallback()
222        $key = $this->cacheKey( $cache, $page, $introOnly );
223        return $cache->get( $key );
224    }
225
226    /**
227     * @param WikiPage $page
228     * @param string $text
229     */
230    private function setCache( WikiPage $page, $text ) {
231        $cache = $this->cache;
232        // @TODO: replace with getWithSetCallback()
233        $key = $this->cacheKey( $cache, $page, $this->params['intro'] );
234        $cache->set( $key, $text, $this->getConfig()->get( 'ParserCacheExpireTime' ) );
235    }
236
237    /**
238     * @param string $text
239     * @param bool $plainText
240     * @return string
241     */
242    private function getFirstSection( $text, $plainText ) {
243        if ( $plainText ) {
244            $regexp = '/^.*?(?=' . ExtractFormatter::SECTION_MARKER_START .
245                '(?!.' . ExtractFormatter::SECTION_MARKER_END . '<h2 id="mw-toc-heading"))/s';
246        } else {
247            $regexp = '/^.*?(?=<h[1-6]\b(?! id="mw-toc-heading"))/s';
248        }
249        if ( preg_match( $regexp, $text, $matches ) ) {
250            $text = $matches[0];
251        }
252        return $text;
253    }
254
255    /**
256     * Returns page HTML
257     * @param WikiPage $page
258     * @return string
259     * @throws ApiUsageException
260     */
261    private function parse( WikiPage $page ) {
262        $parserOutputAccess = MediaWikiServices::getInstance()->getParserOutputAccess();
263        $status = $parserOutputAccess->getParserOutput(
264            $page->toPageRecord(),
265            ParserOptions::newFromAnon()
266        );
267        if ( $status->isOK() ) {
268            $pout = $status->getValue();
269            $text = $pout->getText( [ 'unwrap' => true ] );
270            if ( $this->params['intro'] ) {
271                $text = $this->getFirstSection( $text, false );
272            }
273            return $text;
274        } else {
275            LoggerFactory::getInstance( 'textextracts' )->warning(
276                'Parse attempt failed while generating text extract', [
277                    'title' => $page->getTitle()->getFullText(),
278                    'url' => $this->getRequest()->getFullRequestURL(),
279                    'reason' => $status->getWikiText( false, false, 'en' )
280                ] );
281            $this->dieStatus( $status );
282        }
283    }
284
285    /**
286     * Converts page HTML into an extract
287     * @param string $text
288     * @return string
289     */
290    private function convertText( $text ) {
291        $fmt = new ExtractFormatter( $text, $this->params['plaintext'] );
292        $fmt->remove( $this->config->get( 'ExtractsRemoveClasses' ) );
293        $text = $fmt->getText();
294        return $text;
295    }
296
297    /**
298     * Truncate the given text to a certain number of characters or sentences
299     * @param string $text The text to truncate
300     * @return string
301     */
302    private function truncate( $text ) {
303        $useTidy = !$this->params['plaintext'];
304        $truncator = new TextTruncator( $useTidy );
305
306        if ( $this->params['chars'] ) {
307            $truncatedText = $truncator->getFirstChars( $text, $this->params['chars'] );
308            if ( $truncatedText !== $text ) {
309                $text = $truncatedText . $this->msg( 'ellipsis' )->text();
310            }
311        } elseif ( $this->params['sentences'] ) {
312            $text = $truncator->getFirstSentences( $text, $this->params['sentences'] );
313        }
314        return $text;
315    }
316
317    /**
318     * @param string $text
319     * @return string
320     */
321    private function doSections( $text ) {
322        $pattern = '/' .
323            ExtractFormatter::SECTION_MARKER_START . '(\d)' .
324            ExtractFormatter::SECTION_MARKER_END . '(.*)/';
325
326        switch ( $this->params['sectionformat'] ) {
327            case 'raw':
328                return $text;
329
330            case 'wiki':
331                return preg_replace_callback( $pattern, static function ( $matches ) {
332                    $bars = str_repeat( '=', $matches[1] );
333                    return "\n$bars " . trim( $matches[2] ) . " $bars";
334                }, $text );
335
336            case 'plain':
337                return preg_replace_callback( $pattern, static function ( $matches ) {
338                    return "\n" . trim( $matches[2] );
339                }, $text );
340
341            default:
342                throw new \LogicException( 'Invalid sectionformat' );
343        }
344    }
345
346    /**
347     * @inheritDoc
348     */
349    public function getAllowedParams() {
350        return [
351            'chars' => [
352                ApiBase::PARAM_TYPE => 'integer',
353                ApiBase::PARAM_MIN => 1,
354                ApiBase::PARAM_MAX => 1200,
355            ],
356            'sentences' => [
357                ApiBase::PARAM_TYPE => 'integer',
358                ApiBase::PARAM_MIN => 1,
359                ApiBase::PARAM_MAX => 10,
360            ],
361            'limit' => [
362                ParamValidator::PARAM_DEFAULT => 20,
363                ApiBase::PARAM_TYPE => 'limit',
364                ApiBase::PARAM_MIN => 1,
365                ApiBase::PARAM_MAX => 20,
366                ApiBase::PARAM_MAX2 => 20,
367            ],
368            'intro' => false,
369            'plaintext' => false,
370            'sectionformat' => [
371                ApiBase::PARAM_TYPE => [ 'plain', 'wiki', 'raw' ],
372                ParamValidator::PARAM_DEFAULT => 'wiki',
373                ApiBase::PARAM_HELP_MSG_PER_VALUE => [],
374            ],
375            'continue' => [
376                ApiBase::PARAM_TYPE => 'integer',
377                ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
378            ],
379        ];
380    }
381
382    /**
383     * @inheritDoc
384     */
385    protected function getExamplesMessages() {
386        return [
387            'action=query&prop=extracts&exchars=175&titles=Therion'
388                => 'apihelp-query+extracts-example-1',
389        ];
390    }
391
392    /**
393     * @inheritDoc
394     */
395    public function getHelpUrls() {
396        return 'https://www.mediawiki.org/wiki/Special:MyLanguage/Extension:TextExtracts#API';
397    }
398
399}