Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
50.59% covered (warning)
50.59%
86 / 170
66.67% covered (warning)
66.67%
10 / 15
CRAP
0.00% covered (danger)
0.00%
0 / 1
ApiQueryExtracts
50.59% covered (warning)
50.59%
86 / 170
66.67% covered (warning)
66.67%
10 / 15
325.95
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
 execute
0.00% covered (danger)
0.00%
0 / 42
0.00% covered (danger)
0.00%
0 / 1
272
 getCacheMode
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getExtract
0.00% covered (danger)
0.00%
0 / 20
0.00% covered (danger)
0.00%
0 / 1
42
 cacheKey
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
3
 getFromCache
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 setCache
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 getFirstSection
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 parse
0.00% covered (danger)
0.00%
0 / 17
0.00% covered (danger)
0.00%
0 / 1
12
 convertText
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 truncate
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
4
 doSections
93.75% covered (success)
93.75%
15 / 16
0.00% covered (danger)
0.00%
0 / 1
5.01
 getAllowedParams
100.00% covered (success)
100.00%
30 / 30
100.00% covered (success)
100.00%
1 / 1
1
 getExamplesMessages
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 getHelpUrls
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace MediaWiki\Extension\TextExtracts;
4
use MediaWiki\Api\ApiBase;
use MediaWiki\Api\ApiQuery;
use MediaWiki\Api\ApiQueryBase;
use MediaWiki\Api\ApiUsageException;
use MediaWiki\Config\Config;
use MediaWiki\Config\ConfigFactory;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Page\ParserOutputAccess;
use MediaWiki\Page\WikiPageFactory;
use MediaWiki\Parser\ParserOptions;
use MediaWiki\Title\TitleFormatter;
use Wikimedia\ObjectCache\WANObjectCache;
use Wikimedia\ParamValidator\ParamValidator;
use Wikimedia\ParamValidator\TypeDef\IntegerDef;
use WikiPage;
21
22/**
23 * @license GPL-2.0-or-later
24 */
25class ApiQueryExtracts extends ApiQueryBase {
26
27    /**
28     * Bump when memcache needs clearing
29     */
30    private const CACHE_VERSION = 3;
31
32    private const PREFIX = 'ex';
33
34    /**
35     * @var array
36     */
37    private $params;
38
39    private Config $config;
40    private WANObjectCache $cache;
41    private LanguageConverterFactory $langConvFactory;
42    private ParserOutputAccess $parserOutputAccess;
43    private WikiPageFactory $wikiPageFactory;
44    private TitleFormatter $titleFormatter;
45
46    // TODO: Allow extensions to hook into this to opt-in.
47    // This is partly for security reasons; see T107170.
48    /**
49     * @var string[]
50     */
51    private $supportedContentModels = [ 'wikitext' ];
52
53    public function __construct(
54        ApiQuery $query,
55        string $moduleName,
56        ConfigFactory $configFactory,
57        WANObjectCache $cache,
58        LanguageConverterFactory $langConvFactory,
59        ParserOutputAccess $parserOutputAccess,
60        WikiPageFactory $wikiPageFactory,
61        TitleFormatter $titleFormatter
62    ) {
63        parent::__construct( $query, $moduleName, self::PREFIX );
64        $this->config = $configFactory->makeConfig( 'textextracts' );
65        $this->cache = $cache;
66        $this->langConvFactory = $langConvFactory;
67        $this->parserOutputAccess = $parserOutputAccess;
68        $this->wikiPageFactory = $wikiPageFactory;
69        $this->titleFormatter = $titleFormatter;
70    }
71
72    /**
73     * Evaluates the parameters, performs the requested extraction of text,
74     * and sets up the result
75     */
76    public function execute() {
77        $titles = $this->getPageSet()->getGoodPages();
78        if ( $titles === [] ) {
79            return;
80        }
81        $isXml = $this->getMain()->isInternalMode()
82            || $this->getMain()->getPrinter()->getFormat() == 'XML';
83        $result = $this->getResult();
84        $params = $this->params = $this->extractRequestParams();
85        $this->requireMaxOneParameter( $params, 'chars', 'sentences' );
86        $continue = 0;
87        $limit = intval( $params['limit'] );
88        if ( $limit > 1 && !$params['intro'] && count( $titles ) > 1 ) {
89            $limit = 1;
90            $this->addWarning( [ 'apiwarn-textextracts-limit', $limit ] );
91        }
92        if ( isset( $params['continue'] ) ) {
93            $continue = intval( $params['continue'] );
94            $this->dieContinueUsageIf( $continue < 0 || $continue > count( $titles ) );
95            $titles = array_slice( $titles, $continue, null, true );
96        }
97        $count = 0;
98        $titleInFileNamespace = false;
99        /** @var PageIdentity $t */
100        foreach ( $titles as $id => $t ) {
101            if ( ++$count > $limit ) {
102                $this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
103                break;
104            }
105
106            if ( $t->getNamespace() === NS_FILE ) {
107                $text = '';
108                $titleInFileNamespace = true;
109            } else {
110                $params = $this->params;
111                $text = $this->getExtract( $t );
112                $text = $this->truncate( $text );
113                if ( $params['plaintext'] ) {
114                    $text = $this->doSections( $text );
115                } else {
116                    if ( $params['sentences'] ) {
117                        $this->addWarning( $this->msg( 'apiwarn-textextracts-sentences-and-html', self::PREFIX ) );
118                    }
119                    $this->addWarning( 'apiwarn-textextracts-malformed-html' );
120                }
121            }
122
123            if ( $isXml ) {
124                $fit = $result->addValue( [ 'query', 'pages', $id ], 'extract', [ '*' => $text ] );
125            } else {
126                $fit = $result->addValue( [ 'query', 'pages', $id ], 'extract', $text );
127            }
128            if ( !$fit ) {
129                $this->setContinueEnumParameter( 'continue', $continue + $count - 1 );
130                break;
131            }
132        }
133        if ( $titleInFileNamespace ) {
134            $this->addWarning( 'apiwarn-textextracts-title-in-file-namespace' );
135        }
136    }
137
138    /**
139     * @param array $params Ignored parameters
140     * @return string
141     */
142    public function getCacheMode( $params ) {
143        return 'public';
144    }
145
146    /**
147     * Returns a processed, but not trimmed extract
148     * @param PageIdentity $title
149     * @return string
150     */
151    private function getExtract( PageIdentity $title ) {
152        $page = $this->wikiPageFactory->newFromTitle( $title );
153
154        $contentModel = $page->getContentModel();
155        if ( !in_array( $contentModel, $this->supportedContentModels, true ) ) {
156            $this->addWarning( [
157                'apiwarn-textextracts-unsupportedmodel',
158                wfEscapeWikiText( $this->titleFormatter->getPrefixedText( $title ) ),
159                $contentModel
160            ] );
161            return '';
162        }
163
164        $introOnly = $this->params['intro'];
165        $text = $this->getFromCache( $page, $introOnly );
166        // if we need just first section, try retrieving full page and getting first section out of it
167        if ( $text === false && $introOnly ) {
168            $text = $this->getFromCache( $page, false );
169            if ( $text !== false ) {
170                $text = $this->getFirstSection( $text, $this->params['plaintext'] );
171            }
172        }
173        if ( $text === false ) {
174            $text = $this->parse( $page );
175            $text = $this->convertText( $text );
176            $this->setCache( $page, $text );
177        }
178        return $text;
179    }
180
181    /**
182     * @param WANObjectCache $cache
183     * @param WikiPage $page
184     * @param bool $introOnly
185     * @return string
186     */
187    private function cacheKey( WANObjectCache $cache, WikiPage $page, $introOnly ) {
188        $langConv = $this->langConvFactory->getLanguageConverter( $page->getTitle()->getPageLanguage() );
189        return $cache->makeKey( 'textextracts', self::CACHE_VERSION,
190            $page->getId(), $page->getTouched(),
191            $langConv->getPreferredVariant(),
192            $this->params['plaintext'] ? 'plaintext' : 'html',
193            $introOnly ? 'intro' : 'full'
194        );
195    }
196
197    /**
198     * @param WikiPage $page
199     * @param bool $introOnly
200     * @return string|false
201     */
202    private function getFromCache( WikiPage $page, $introOnly ) {
203        $cache = $this->cache;
204        // @TODO: replace with getWithSetCallback()
205        $key = $this->cacheKey( $cache, $page, $introOnly );
206        return $cache->get( $key );
207    }
208
209    /**
210     * @param WikiPage $page
211     * @param string $text
212     */
213    private function setCache( WikiPage $page, $text ) {
214        $cache = $this->cache;
215        // @TODO: replace with getWithSetCallback()
216        $key = $this->cacheKey( $cache, $page, $this->params['intro'] );
217        $cache->set( $key, $text, $this->getConfig()->get( 'ParserCacheExpireTime' ) );
218    }
219
220    /**
221     * @param string $text
222     * @param bool $plainText
223     * @return string
224     */
225    private function getFirstSection( $text, $plainText ) {
226        if ( $plainText ) {
227            $regexp = '/^(.*?)(?=' . ExtractFormatter::SECTION_MARKER_START . ')/s';
228        } else {
229            $regexp = '/^(.*?)(?=<h[1-6]\b)/s';
230        }
231        if ( preg_match( $regexp, $text, $matches ) ) {
232            $text = $matches[0];
233        }
234        return $text;
235    }
236
237    /**
238     * Returns page HTML
239     * @param WikiPage $page
240     * @return string
241     * @throws ApiUsageException
242     */
243    private function parse( WikiPage $page ) {
244        $status = $this->parserOutputAccess->getParserOutput(
245            $page->toPageRecord(),
246            ParserOptions::newFromAnon()
247        );
248        if ( $status->isOK() ) {
249            $pout = $status->getValue();
250            $text = $pout->getRawText();
251            if ( $this->params['intro'] ) {
252                $text = $this->getFirstSection( $text, false );
253            }
254            return $text;
255        } else {
256            LoggerFactory::getInstance( 'textextracts' )->warning(
257                'Parse attempt failed while generating text extract', [
258                    'title' => $page->getTitle()->getFullText(),
259                    'url' => $this->getRequest()->getFullRequestURL(),
260                    'reason' => $status->getWikiText( false, false, 'en' )
261                ] );
262            $this->dieStatus( $status );
263        }
264    }
265
266    /**
267     * Converts page HTML into an extract
268     * @param string $text
269     * @return string
270     */
271    private function convertText( $text ) {
272        $fmt = new ExtractFormatter( $text, $this->params['plaintext'] );
273        $fmt->remove( $this->config->get( 'ExtractsRemoveClasses' ) );
274        $text = $fmt->getText();
275        return $text;
276    }
277
278    /**
279     * Truncate the given text to a certain number of characters or sentences
280     * @param string $text The text to truncate
281     * @return string
282     */
283    private function truncate( $text ) {
284        $useTidy = !$this->params['plaintext'];
285        $truncator = new TextTruncator( $useTidy );
286
287        if ( $this->params['chars'] ) {
288            $truncatedText = $truncator->getFirstChars( $text, $this->params['chars'] );
289            if ( $truncatedText !== $text ) {
290                $text = $truncatedText . $this->msg( 'ellipsis' )->text();
291            }
292        } elseif ( $this->params['sentences'] ) {
293            $text = $truncator->getFirstSentences( $text, $this->params['sentences'] );
294        }
295        return $text;
296    }
297
298    /**
299     * @param string $text
300     * @return string
301     */
302    private function doSections( $text ) {
303        $pattern = '/' .
304            ExtractFormatter::SECTION_MARKER_START . '(\d)' .
305            ExtractFormatter::SECTION_MARKER_END . '(.*)/';
306
307        switch ( $this->params['sectionformat'] ) {
308            case 'raw':
309                return $text;
310
311            case 'wiki':
312                return preg_replace_callback( $pattern, static function ( $matches ) {
313                    $bars = str_repeat( '=', $matches[1] );
314                    return "\n$bars " . trim( $matches[2] ) . " $bars";
315                }, $text );
316
317            case 'plain':
318                return preg_replace_callback( $pattern, static function ( $matches ) {
319                    return "\n" . trim( $matches[2] );
320                }, $text );
321
322            default:
323                throw new \LogicException( 'Invalid sectionformat' );
324        }
325    }
326
327    /**
328     * @inheritDoc
329     */
330    public function getAllowedParams() {
331        return [
332            'chars' => [
333                ApiBase::PARAM_TYPE => 'integer',
334                ApiBase::PARAM_MIN => 1,
335                ApiBase::PARAM_MAX => 1200,
336            ],
337            'sentences' => [
338                ApiBase::PARAM_TYPE => 'integer',
339                ApiBase::PARAM_MIN => 1,
340                ApiBase::PARAM_MAX => 10,
341            ],
342            'limit' => [
343                ParamValidator::PARAM_DEFAULT => 20,
344                ApiBase::PARAM_TYPE => 'limit',
345                ApiBase::PARAM_MIN => 1,
346                ApiBase::PARAM_MAX => 20,
347                ApiBase::PARAM_MAX2 => 20,
348            ],
349            'intro' => false,
350            'plaintext' => false,
351            'sectionformat' => [
352                ApiBase::PARAM_TYPE => [ 'plain', 'wiki', 'raw' ],
353                ParamValidator::PARAM_DEFAULT => 'wiki',
354                ApiBase::PARAM_HELP_MSG_PER_VALUE => [],
355            ],
356            'continue' => [
357                ApiBase::PARAM_TYPE => 'integer',
358                ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
359            ],
360        ];
361    }
362
363    /**
364     * @inheritDoc
365     */
366    protected function getExamplesMessages() {
367        return [
368            'action=query&prop=extracts&exchars=175&titles=Therion'
369                => 'apihelp-query+extracts-example-1',
370        ];
371    }
372
373    /**
374     * @inheritDoc
375     */
376    public function getHelpUrls() {
377        return 'https://www.mediawiki.org/wiki/Special:MyLanguage/Extension:TextExtracts#API';
378    }
379
380}