Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
87.80% covered (warning)
87.80%
108 / 123
60.00% covered (warning)
60.00%
6 / 10
CRAP
0.00% covered (danger)
0.00%
0 / 1
SegmentPageFactory
87.80% covered (warning)
87.80%
108 / 123
60.00% covered (warning)
60.00%
6 / 10
36.10
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 setUseRevisionPropertiesCache
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setRevisionStore
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setHttpRequestFactory
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 setRequirePageRevisionProperties
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 pageProviderFactory
44.44% covered (danger)
44.44%
4 / 9
0.00% covered (danger)
0.00%
0 / 1
9.29
 loadPageRevisionProperties
77.78% covered (warning)
77.78%
7 / 9
0.00% covered (danger)
0.00%
0 / 1
4.18
 segmentPage
91.67% covered (success)
91.67%
66 / 72
0.00% covered (danger)
0.00%
0 / 1
18.19
 segmentedPageCacheKeyFactory
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
1
 pageRevisionPropertiesCacheKeyFactory
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace MediaWiki\Wikispeech\Segment;
4
5/**
6 * @file
7 * @ingroup Extensions
8 * @license GPL-2.0-or-later
9 */
10
11use InvalidArgumentException;
12use LogicException;
13use MediaWiki\Config\Config;
14use MediaWiki\Http\HttpRequestFactory;
15use MediaWiki\Revision\RevisionStore;
16use Mediawiki\Title\Title;
17use RuntimeException;
18use WANObjectCache;
19
20/**
21 * @since 0.1.10
22 */
23class SegmentPageFactory extends SegmentFactory {
24
25    /**
26     * Whether or not to use cache for page revision properties,
27     * i.e. to retrieve page id and title when only supplying a revision id.
28     *
29     * Not turning this on in a situation where consumerUrl is set
30     * will cause one extra http request to the remote wiki
31     * in order to lookup pageId, which is required to create or find
32     * utterances in UtteranceStore.
33     *
34     * @var bool
35     */
36    private $useRevisionPropertiesCache = false;
37
38    /**
39     * Required only when providing page content from a local wiki.
40     * @var RevisionStore|null
41     */
42    private $revisionStore = null;
43
44    /**
45     * Required only when providing page content from a remote wiki.
46     * @var HttpRequestFactory|null
47     */
48    private $httpRequestFactory = null;
49
50    /**
51     * If true, page id and title are always retrieved from page provider or cache
52     * and made available in the response from segmentPage.
53     *
54     * @var bool
55     */
56    private $requirePageRevisionProperties = false;
57
58    /**
59     * @since 0.1.13
60     * @param WANObjectCache $cache
61     * @param Config $config
62     * @param RevisionStore $revisionStore
63     * @param HttpRequestFactory $httpRequestFactory
64     */
65    public function __construct(
66        WANObjectCache $cache,
67        Config $config,
68        RevisionStore $revisionStore,
69        HttpRequestFactory $httpRequestFactory
70    ) {
71        parent::__construct( $cache, $config );
72
73        $this->revisionStore = $revisionStore;
74        $this->httpRequestFactory = $httpRequestFactory;
75    }
76
77    /**
78     * @see SegmentPageFactory::$useRevisionPropertiesCache
79     * @since 0.1.10
80     * @param bool $useRevisionPropertiesCache
81     * @return SegmentPageFactory $this
82     */
83    public function setUseRevisionPropertiesCache(
84        bool $useRevisionPropertiesCache
85    ): SegmentPageFactory {
86        $this->useRevisionPropertiesCache = $useRevisionPropertiesCache;
87        return $this;
88    }
89
90    /**
91     * @see SegmentPageFactory::$revisionStore
92     * @since 0.1.10
93     * @param RevisionStore|null $revisionStore
94     * @return SegmentPageFactory $this
95     */
96    public function setRevisionStore(
97        ?RevisionStore $revisionStore
98    ): SegmentPageFactory {
99        $this->revisionStore = $revisionStore;
100        return $this;
101    }
102
103    /**
104     * @see SegmentPageFactory::$httpRequestFactory
105     * @since 0.1.10
106     * @param HttpRequestFactory|null $httpRequestFactory
107     * @return SegmentPageFactory $this
108     */
109    public function setHttpRequestFactory(
110        ?HttpRequestFactory $httpRequestFactory
111    ): SegmentPageFactory {
112        $this->httpRequestFactory = $httpRequestFactory;
113        return $this;
114    }
115
116    /**
117     * If true, page id and title is always retrieved from page provider or cache
118     * and be made available in response from segmentPage.
119     *
120     * @since 0.1.10
121     * @param bool $requirePageRevisionProperties
122     * @return SegmentPageFactory $this
123     */
124    public function setRequirePageRevisionProperties(
125        bool $requirePageRevisionProperties
126    ): SegmentPageFactory {
127        $this->requirePageRevisionProperties = $requirePageRevisionProperties;
128        return $this;
129    }
130
131    /**
132     * @since 0.1.10
133     * @return PageProvider
134     */
135    protected function pageProviderFactory(): PageProvider {
136        if ( $this->consumerUrl ) {
137            if ( $this->httpRequestFactory === null ) {
138                throw new LogicException( '$httpRequestFactory is null!' );
139            }
140            return new RemoteWikiPageProvider( $this->consumerUrl, $this->httpRequestFactory );
141        } else {
142            if ( $this->contextSource === null ) {
143                throw new LogicException( '$contextSource is null!' );
144            }
145            if ( $this->revisionStore === null ) {
146                throw new LogicException( '$revisionStore is null!' );
147            }
148            return new LocalWikiPageProvider( $this->contextSource, $this->revisionStore );
149        }
150    }
151
152    /**
153     * Loads revision properties
154     * from cache (if set to use)
155     * or via pageProvider (if not using or missing in cache).
156     *
157     * @since 0.1.10
158     * @param PageProvider $pageProvider
159     * @param int $revisionId
160     * @return PageRevisionProperties
161     */
162    protected function loadPageRevisionProperties(
163        PageProvider $pageProvider,
164        int $revisionId
165    ): PageRevisionProperties {
166        $cacheKey = $this->pageRevisionPropertiesCacheKeyFactory( $pageProvider, $revisionId );
167        // Lookup title and page id given the revision id.
168        if ( $this->useRevisionPropertiesCache ) {
169            $revisionProperties = $this->cache->get( $cacheKey );
170        } else {
171            $revisionProperties = false;
172        }
173        if ( $revisionProperties === false ) {
174            $revisionProperties = $pageProvider->loadPageRevisionProperties( $revisionId );
175            if ( $this->useRevisionPropertiesCache ) {
176                $this->cache->set( $cacheKey, $revisionProperties );
177            }
178        }
179        return $revisionProperties;
180    }
181
182    /**
183     * Convenience function to build segmenter
184     * which is immediately invoked to segment page
185     * after parsing DOM of the HTML
186     * supplied by a page provider that is constructed from the factory fields,
187     * possibly loading extra properties required by invoker,
188     * and possibly bypassing all of above invocation using caches.
189     *
190     * $title XOR $revisionId must be provided.
191     *
192     * @since 0.1.10
193     * @param Title|null $title
194     * @param int|null $revisionId
195     * @return SegmentPageResponse
196     * @throws InvalidArgumentException If $title xor $revisionId is not provided.
197     */
198    public function segmentPage(
199        ?Title $title = null,
200        ?int $revisionId = null
201    ): SegmentPageResponse {
202        $pageProvider = $this->pageProviderFactory();
203
204        $segmentPageResponse = new SegmentPageResponse();
205
206        $revisionProperties = null;
207
208        if ( $title === null && $revisionId !== null ) {
209            // Lookup title and page id given the revision id.
210            $revisionProperties = $this->loadPageRevisionProperties( $pageProvider, $revisionId );
211            $segmentPageResponse->setTitle( $revisionProperties->getTitle() );
212            $segmentPageResponse->setPageId( $revisionProperties->getPageId() );
213            $segmentPageResponse->setRevisionId( $revisionId );
214        } elseif ( $title !== null && $revisionId === null ) {
215            // Set user supplied title.
216            $segmentPageResponse->setTitle( $title );
217        } else {
218            throw new InvalidArgumentException( '$title xor $revisionId must be provided.' );
219        }
220
221        $removeTags = $this->removeTags ?? $this->config->get( 'WikispeechRemoveTags' );
222        $segmentBreakingTags = $this->segmentBreakingTags ?? $this->config->get( 'WikispeechSegmentBreakingTags' );
223
224        $segmenter = $this->segmenter ?? new StandardSegmenter();
225
226        if ( $this->useSegmentsCache && $revisionId !== null ) {
227            $segments = $this->cache->get(
228                $this->segmentedPageCacheKeyFactory(
229                    $removeTags,
230                    $segmentBreakingTags,
231                    $pageProvider,
232                    $revisionId
233                )
234            );
235        } else {
236            $segments = false;
237        }
238        if ( $segments === false ) {
239            $segmentPageResponseTitle = $segmentPageResponse->getTitle();
240            if ( $segmentPageResponseTitle === null ) {
241                throw new LogicException( 'Title is null!' );
242            }
243            $pageProvider->loadData( $segmentPageResponseTitle );
244
245            $displayTitle = $pageProvider->getDisplayTitle();
246            if ( $displayTitle === null ) {
247                throw new RuntimeException( 'Display title not loaded!' );
248            }
249            $pageContent = $pageProvider->getPageContent();
250            if ( $pageContent === null ) {
251                throw new RuntimeException( 'Page content not loaded!' );
252            }
253            $providedRevisionId = $pageProvider->getRevisionId();
254            if ( $providedRevisionId === null ) {
255                throw new RuntimeException( 'Revision id not loaded!' );
256            }
257
258            if ( $revisionId !== null
259                && $revisionId !== $providedRevisionId
260            ) {
261                throw new OutdatedOrInvalidRevisionException( 'An outdated or invalid revision id was provided' );
262            }
263
264            $cleanedText = $this->cleanHtmlDom(
265                $displayTitle,
266                $pageContent,
267                $removeTags,
268                $segmentBreakingTags
269            );
270
271            $segments = new SegmentList( $segmenter->segmentSentences( $cleanedText ) );
272
273            if ( $this->useSegmentsCache ) {
274                $this->cache->set(
275                // use revision as stated by page provider, not as provided by invoker.
276                    $this->segmentedPageCacheKeyFactory(
277                        $removeTags,
278                        $segmentBreakingTags,
279                        $pageProvider,
280                        $providedRevisionId
281                    ),
282                    $segments,
283                    WANObjectCache::TTL_HOUR
284                );
285            }
286            $segmentPageResponse->setRevisionId( $providedRevisionId );
287        }
288
289        $segmentPageResponse->setSegments( $segments );
290
291        if ( $this->requirePageRevisionProperties && $revisionProperties === null ) {
292            $segmentPageResponseRevisionId = $segmentPageResponse->getRevisionId();
293            if ( $segmentPageResponseRevisionId === null ) {
294                throw new LogicException( 'Revision id is null!' );
295            }
296            $revisionProperties = $this->loadPageRevisionProperties(
297                $pageProvider,
298                $segmentPageResponseRevisionId
299            );
300            $segmentPageResponse->setTitle( $revisionProperties->getTitle() );
301            $segmentPageResponse->setPageId( $revisionProperties->getPageId() );
302        }
303
304        return $segmentPageResponse;
305    }
306
307    /**
308     * @param string[] $removeTags
309     * @param string[] $segmentBreakingTags
310     * @param PageProvider $pageProvider
311     * @param int|null $revisionId
312     * @return string
313     */
314    private function segmentedPageCacheKeyFactory(
315        array $removeTags,
316        array $segmentBreakingTags,
317        PageProvider $pageProvider,
318        ?int $revisionId
319    ): string {
320        $cacheKeyComponents = [
321            get_class( $this ),
322            get_class( $pageProvider ),
323            $revisionId,
324            var_export( $removeTags, true ),
325            implode( '-', $segmentBreakingTags ),
326            $pageProvider->getCachedSegmentsKeyComponents()
327        ];
328        return $this->cache->makeKey(
329            'Wikispeech.segments',
330            ...$cacheKeyComponents
331        );
332    }
333
334    /**
335     * @param PageProvider $pageProvider
336     * @param int $revisionId
337     * @return string
338     */
339    private function pageRevisionPropertiesCacheKeyFactory(
340        PageProvider $pageProvider,
341        int $revisionId
342    ): string {
343        $cacheKeyComponents = [
344            get_class( $this ),
345            get_class( $pageProvider ),
346            $revisionId,
347            $pageProvider->getCachedSegmentsKeyComponents()
348        ];
349        return $this->cache->makeKey(
350            'Wikispeech.pageRevisionProperties',
351            ...$cacheKeyComponents
352        );
353    }
354}