Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
86.96% covered (warning)
86.96%
120 / 138
66.67% covered (warning)
66.67%
12 / 18
CRAP
0.00% covered (danger)
0.00%
0 / 1
SegmentPageFactory
86.96% covered (warning)
86.96%
120 / 138
66.67% covered (warning)
66.67%
12 / 18
45.91
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setUseSegmentsCache
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setUseRevisionPropertiesCache
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setSegmenter
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setSegmenterByLanguage
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 setRemoveTags
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setSegmentBreakingTags
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setContextSource
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setRevisionStore
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 setHttpRequestFactory
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 setConsumerUrl
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 setRequirePageRevisionProperties
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 pageProviderFactory
44.44% covered (danger)
44.44%
4 / 9
0.00% covered (danger)
0.00%
0 / 1
9.29
 loadPageRevisionProperties
77.78% covered (warning)
77.78%
7 / 9
0.00% covered (danger)
0.00%
0 / 1
4.18
 segmentPage
91.78% covered (success)
91.78%
67 / 73
0.00% covered (danger)
0.00%
0 / 1
18.18
 segmentedPageCacheKeyFactory
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
1
 pageRevisionPropertiesCacheKeyFactory
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 cleanHtmlDom
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace MediaWiki\Wikispeech\Segment;
4
5/**
6 * @file
7 * @ingroup Extensions
8 * @license GPL-2.0-or-later
9 */
10
11use ConfigFactory;
12use IContextSource;
13use InvalidArgumentException;
14use LogicException;
15use MediaWiki\Http\HttpRequestFactory;
16use MediaWiki\Revision\RevisionStore;
17use MWException;
18use Title;
19use WANObjectCache;
20
21/**
22 * @since 0.1.10
23 */
24class SegmentPageFactory {
25
26    /** @var WANObjectCache */
27    private $cache;
28
29    /** @var ConfigFactory */
30    private $configFactory;
31
32    /**
33     * Whether or not to use cache for segmentation
34     * @var bool
35     */
36    private $useSegmentsCache = true;
37
38    /**
39     * Whether or not to use cache for page revision properties,
40     * i.e. to retrieve page id and title when only supplying a revision id.
41     *
42     * Not turning this on in a situation where consumerUrl is set
43     * will cause one extra http request to the remote wiki
44     * in order to lookup pageId, which is required to create or find
45     * utterances in UtteranceStore.
46     *
47     * @var bool
48     */
49    private $useRevisionPropertiesCache = false;
50
51    /**
52     * Will default to an instance of {@link StandardSegmenter} if not set.
53     * @var Segmenter|null
54     */
55    private $segmenter = null;
56
57    /**
58     * Will default to config setting WikispeechRemoveTags if not set.
59     * @var string[]|null
60     */
61    private $removeTags = null;
62
63    /**
64     * Will default to config setting WikispeechSegmentBreakingTags if not set.
65     * @var string[]|null
66     */
67    private $segmentBreakingTags = null;
68
69    /**
70     * Required only when providing page content from a local wiki.
71     * @var IContextSource|null
72     */
73    private $contextSource = null;
74
75    /**
76     * Required only when providing page content from a local wiki.
77     * @var RevisionStore|null
78     */
79    private $revisionStore = null;
80
81    /**
82     * Required only when providing page content from a remote wiki.
83     * @var HttpRequestFactory|null
84     */
85    private $httpRequestFactory = null;
86
87    /**
88     * Required only when providing page content from a remote wiki.
89     * @var string|null
90     */
91    private $consumerUrl = null;
92
93    /**
94     * If true, page id and title are always retrieved from page provider or cache
95     * and made available in the response from segmentPage.
96     *
97     * @var bool
98     */
99    private $requirePageRevisionProperties = false;
100
101    /**
102     * @since 0.1.10
103     * @param WANObjectCache $cache
104     * @param ConfigFactory $configFactory
105     */
106    public function __construct(
107        WANObjectCache $cache,
108        ConfigFactory $configFactory
109    ) {
110        $this->cache = $cache;
111        $this->configFactory = $configFactory;
112    }
113
114    /**
115     * @see SegmentPageFactory::$useSegmentsCache
116     * @since 0.1.10
117     * @param bool $useSegmentsCache
118     * @return SegmentPageFactory $this
119     */
120    public function setUseSegmentsCache(
121        bool $useSegmentsCache
122    ): SegmentPageFactory {
123        $this->useSegmentsCache = $useSegmentsCache;
124        return $this;
125    }
126
127    /**
128     * @see SegmentPageFactory::$useRevisionPropertiesCache
129     * @since 0.1.10
130     * @param bool $useRevisionPropertiesCache
131     * @return SegmentPageFactory $this
132     */
133    public function setUseRevisionPropertiesCache(
134        bool $useRevisionPropertiesCache
135    ): SegmentPageFactory {
136        $this->useRevisionPropertiesCache = $useRevisionPropertiesCache;
137        return $this;
138    }
139
140    /**
141     * @see SegmentPageFactory::$segmenter
142     * @since 0.1.10
143     * @param Segmenter|null $segmenter
144     * @return SegmentPageFactory $this
145     */
146    public function setSegmenter(
147        ?Segmenter $segmenter
148    ): SegmentPageFactory {
149        $this->segmenter = $segmenter;
150        return $this;
151    }
152
153    /**
154     * @see SegmentPageFactory::$setSegmenter
155     * @since 0.1.10
156     * @param string $language
157     * @return SegmentPageFactory $this
158     */
159    public function setSegmenterByLanguage(
160        string $language
161    ): SegmentPageFactory {
162        // @todo lookup segmenter by language
163        return $this->setSegmenter( new StandardSegmenter() );
164    }
165
166    /**
167     * @see SegmentPageFactory::$removeTags
168     * @since 0.1.10
169     * @param string[]|null $removeTags
170     * @return SegmentPageFactory $this
171     */
172    public function setRemoveTags(
173        ?array $removeTags
174    ): SegmentPageFactory {
175        $this->removeTags = $removeTags;
176        return $this;
177    }
178
179    /**
180     * @see SegmentPageFactory::$segmentBreakingTags
181     * @since 0.1.10
182     * @param string[]|null $segmentBreakingTags
183     * @return SegmentPageFactory $this
184     */
185    public function setSegmentBreakingTags(
186        ?array $segmentBreakingTags
187    ): SegmentPageFactory {
188        $this->segmentBreakingTags = $segmentBreakingTags;
189        return $this;
190    }
191
192    /**
193     * @see SegmentPageFactory::$contextSource
194     * @since 0.1.10
195     * @param IContextSource|null $contextSource
196     * @return SegmentPageFactory $this
197     */
198    public function setContextSource(
199        ?IContextSource $contextSource
200    ): SegmentPageFactory {
201        $this->contextSource = $contextSource;
202        return $this;
203    }
204
205    /**
206     * @see SegmentPageFactory::$revisionStore
207     * @since 0.1.10
208     * @param RevisionStore|null $revisionStore
209     * @return SegmentPageFactory $this
210     */
211    public function setRevisionStore(
212        ?RevisionStore $revisionStore
213    ): SegmentPageFactory {
214        $this->revisionStore = $revisionStore;
215        return $this;
216    }
217
218    /**
219     * @see SegmentPageFactory::$httpRequestFactory
220     * @since 0.1.10
221     * @param HttpRequestFactory|null $httpRequestFactory
222     * @return SegmentPageFactory $this
223     */
224    public function setHttpRequestFactory(
225        ?HttpRequestFactory $httpRequestFactory
226    ): SegmentPageFactory {
227        $this->httpRequestFactory = $httpRequestFactory;
228        return $this;
229    }
230
231    /**
232     * @see SegmentPageFactory::$consumerUrl
233     * @since 0.1.10
234     * @param string|null $consumerUrl
235     * @return SegmentPageFactory $this
236     */
237    public function setConsumerUrl(
238        ?string $consumerUrl
239    ): SegmentPageFactory {
240        $this->consumerUrl = $consumerUrl;
241        return $this;
242    }
243
244    /**
245     * If true, page id and title is always retrieved from page provider or cache
246     * and be made available in response from segmentPage.
247     *
248     * @since 0.1.10
249     * @param bool $requirePageRevisionProperties
250     * @return SegmentPageFactory $this
251     */
252    public function setRequirePageRevisionProperties(
253        bool $requirePageRevisionProperties
254    ): SegmentPageFactory {
255        $this->requirePageRevisionProperties = $requirePageRevisionProperties;
256        return $this;
257    }
258
259    /**
260     * @since 0.1.10
261     * @return PageProvider
262     */
263    protected function pageProviderFactory(): PageProvider {
264        if ( $this->consumerUrl ) {
265            if ( $this->httpRequestFactory === null ) {
266                throw new LogicException( '$httpRequestFactory is null!' );
267            }
268            return new RemoteWikiPageProvider( $this->consumerUrl, $this->httpRequestFactory );
269        } else {
270            if ( $this->contextSource === null ) {
271                throw new LogicException( '$contextSource is null!' );
272            }
273            if ( $this->revisionStore === null ) {
274                throw new LogicException( '$revisionStore is null!' );
275            }
276            return new LocalWikiPageProvider( $this->contextSource, $this->revisionStore );
277        }
278    }
279
280    /**
281     * Loads revision properties
282     * from cache (if set to use)
283     * or via pageProvider (if not using or missing in cache).
284     *
285     * @since 0.1.10
286     * @param PageProvider $pageProvider
287     * @param int $revisionId
288     * @return PageRevisionProperties
289     */
290    protected function loadPageRevisionProperties(
291        PageProvider $pageProvider,
292        int $revisionId
293    ): PageRevisionProperties {
294        $cacheKey = $this->pageRevisionPropertiesCacheKeyFactory( $pageProvider, $revisionId );
295        // Lookup title and page id given the revision id.
296        if ( $this->useRevisionPropertiesCache ) {
297            $revisionProperties = $this->cache->get( $cacheKey );
298        } else {
299            $revisionProperties = false;
300        }
301        if ( $revisionProperties === false ) {
302            $revisionProperties = $pageProvider->loadPageRevisionProperties( $revisionId );
303            if ( $this->useRevisionPropertiesCache ) {
304                $this->cache->set( $cacheKey, $revisionProperties );
305            }
306        }
307        return $revisionProperties;
308    }
309
310    /**
311     * Convenience function to build segmenter
312     * which is immediately invoked to segment page
313     * after parsing DOM of the HTML
314     * supplied by a page provider that is constructed from the factory fields,
315     * possibly loading extra properties required by invoker,
316     * and possibly bypassing all of above invocation using caches.
317     *
318     * $title XOR $revisionId must be provided.
319     *
320     * @since 0.1.10
321     * @param Title|null $title
322     * @param int|null $revisionId
323     * @return SegmentPageResponse
324     * @throws InvalidArgumentException If $title xor $revisionId is not provided.
325     */
326    public function segmentPage(
327        ?Title $title,
328        ?int $revisionId
329    ): SegmentPageResponse {
330        $pageProvider = $this->pageProviderFactory();
331
332        $segmentPageResponse = new SegmentPageResponse();
333
334        $revisionProperties = null;
335
336        if ( $title === null && $revisionId !== null ) {
337            // Lookup title and page id given the revision id.
338            $revisionProperties = $this->loadPageRevisionProperties( $pageProvider, $revisionId );
339            $segmentPageResponse->setTitle( $revisionProperties->getTitle() );
340            $segmentPageResponse->setPageId( $revisionProperties->getPageId() );
341            $segmentPageResponse->setRevisionId( $revisionId );
342        } elseif ( $title !== null && $revisionId === null ) {
343            // Set user supplied title.
344            $segmentPageResponse->setTitle( $title );
345        } else {
346            throw new InvalidArgumentException( '$title xor $revisionId must be provided.' );
347        }
348
349        $config = $this->configFactory->makeConfig( 'wikispeech' );
350        $removeTags = $this->removeTags ?? $config->get( 'WikispeechRemoveTags' );
351        $segmentBreakingTags = $this->segmentBreakingTags ?? $config->get( 'WikispeechSegmentBreakingTags' );
352
353        $segmenter = $this->segmenter ?? new StandardSegmenter();
354
355        if ( $this->useSegmentsCache && $revisionId !== null ) {
356            $segments = $this->cache->get(
357                $this->segmentedPageCacheKeyFactory(
358                    $removeTags,
359                    $segmentBreakingTags,
360                    $pageProvider,
361                    $revisionId
362                )
363            );
364        } else {
365            $segments = false;
366        }
367        if ( $segments === false ) {
368            $segmentPageResponseTitle = $segmentPageResponse->getTitle();
369            if ( $segmentPageResponseTitle === null ) {
370                throw new LogicException( 'Title is null!' );
371            }
372            $pageProvider->loadData( $segmentPageResponseTitle );
373
374            $displayTitle = $pageProvider->getDisplayTitle();
375            if ( $displayTitle === null ) {
376                throw new MWException( 'Display title not loaded!' );
377            }
378            $pageContent = $pageProvider->getPageContent();
379            if ( $pageContent === null ) {
380                throw new MWException( 'Page content not loaded!' );
381            }
382            $providedRevisionId = $pageProvider->getRevisionId();
383            if ( $providedRevisionId === null ) {
384                throw new MWException( 'Revision id not loaded!' );
385            }
386
387            if ( $revisionId !== null
388                && $revisionId !== $providedRevisionId
389            ) {
390                throw new OutdatedOrInvalidRevisionException( 'An outdated or invalid revision id was provided' );
391            }
392
393            $cleanedText = $this->cleanHtmlDom(
394                $displayTitle,
395                $pageContent,
396                $removeTags,
397                $segmentBreakingTags
398            );
399
400            $segments = new SegmentList( $segmenter->segmentSentences( $cleanedText ) );
401
402            if ( $this->useSegmentsCache ) {
403                $this->cache->set(
404                // use revision as stated by page provider, not as provided by invoker.
405                    $this->segmentedPageCacheKeyFactory(
406                        $removeTags,
407                        $segmentBreakingTags,
408                        $pageProvider,
409                        $providedRevisionId
410                    ),
411                    $segments,
412                    WANObjectCache::TTL_HOUR
413                );
414            }
415            $segmentPageResponse->setRevisionId( $providedRevisionId );
416        }
417
418        $segmentPageResponse->setSegments( $segments );
419
420        if ( $this->requirePageRevisionProperties && $revisionProperties === null ) {
421            $segmentPageResponseRevisionId = $segmentPageResponse->getRevisionId();
422            if ( $segmentPageResponseRevisionId === null ) {
423                throw new LogicException( 'Revision id is null!' );
424            }
425            $revisionProperties = $this->loadPageRevisionProperties(
426                $pageProvider,
427                $segmentPageResponseRevisionId
428            );
429            $segmentPageResponse->setTitle( $revisionProperties->getTitle() );
430            $segmentPageResponse->setPageId( $revisionProperties->getPageId() );
431        }
432
433        return $segmentPageResponse;
434    }
435
436    /**
437     * @param string[] $removeTags
438     * @param string[] $segmentBreakingTags
439     * @param PageProvider $pageProvider
440     * @param int|null $revisionId
441     * @return string
442     */
443    private function segmentedPageCacheKeyFactory(
444        array $removeTags,
445        array $segmentBreakingTags,
446        PageProvider $pageProvider,
447        ?int $revisionId
448    ): string {
449        $cacheKeyComponents = [
450            get_class( $this ),
451            get_class( $pageProvider ),
452            $revisionId,
453            var_export( $removeTags, true ),
454            implode( '-', $segmentBreakingTags ),
455            $pageProvider->getCachedSegmentsKeyComponents()
456        ];
457        return $this->cache->makeKey(
458            'Wikispeech.segments',
459            ...$cacheKeyComponents
460        );
461    }
462
463    /**
464     * @param PageProvider $pageProvider
465     * @param int $revisionId
466     * @return string
467     */
468    private function pageRevisionPropertiesCacheKeyFactory(
469        PageProvider $pageProvider,
470        int $revisionId
471    ): string {
472        $cacheKeyComponents = [
473            get_class( $this ),
474            get_class( $pageProvider ),
475            $revisionId,
476            $pageProvider->getCachedSegmentsKeyComponents()
477        ];
478        return $this->cache->makeKey(
479            'Wikispeech.pageRevisionProperties',
480            ...$cacheKeyComponents
481        );
482    }
483
484    /**
485     * This method exists due to need for test mocking.
486     *
487     * @see Cleaner::cleanHtmlDom()
488     * @since 0.1.10
489     * @param string $displayTitle
490     * @param string $pageContent
491     * @param string[] $removeTags
492     * @param string[] $segmentBreakingTags
493     * @return array
494     * @throws MWException
495     */
496    protected function cleanHtmlDom(
497        string $displayTitle,
498        string $pageContent,
499        array $removeTags,
500        array $segmentBreakingTags
501    ): array {
502        $cleaner = new Cleaner( $removeTags, $segmentBreakingTags );
503        return $cleaner->cleanHtmlDom( $displayTitle, $pageContent );
504    }
505}