Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
86.96% |
120 / 138 |
|
66.67% |
12 / 18 |
CRAP | |
0.00% |
0 / 1 |
SegmentPageFactory | |
86.96% |
120 / 138 |
|
66.67% |
12 / 18 |
45.91 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setUseSegmentsCache | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setUseRevisionPropertiesCache | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setSegmenter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setSegmenterByLanguage | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setRemoveTags | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setSegmentBreakingTags | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setContextSource | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setRevisionStore | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setHttpRequestFactory | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
setConsumerUrl | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
setRequirePageRevisionProperties | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
pageProviderFactory | |
44.44% |
4 / 9 |
|
0.00% |
0 / 1 |
9.29 | |||
loadPageRevisionProperties | |
77.78% |
7 / 9 |
|
0.00% |
0 / 1 |
4.18 | |||
segmentPage | |
91.78% |
67 / 73 |
|
0.00% |
0 / 1 |
18.18 | |||
segmentedPageCacheKeyFactory | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
1 | |||
pageRevisionPropertiesCacheKeyFactory | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
cleanHtmlDom | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Wikispeech\Segment; |
4 | |
5 | /** |
6 | * @file |
7 | * @ingroup Extensions |
8 | * @license GPL-2.0-or-later |
9 | */ |
10 | |
11 | use ConfigFactory; |
12 | use IContextSource; |
13 | use InvalidArgumentException; |
14 | use LogicException; |
15 | use MediaWiki\Http\HttpRequestFactory; |
16 | use MediaWiki\Revision\RevisionStore; |
17 | use MWException; |
18 | use Title; |
19 | use WANObjectCache; |
20 | |
/**
 * Configurable entry point for segmenting the content of a wiki page.
 *
 * The factory is set up through its fluent setters and then invoked with
 * {@link SegmentPageFactory::segmentPage()}. Page content is read through a
 * {@link RemoteWikiPageProvider} when a consumer URL is set, otherwise
 * through a {@link LocalWikiPageProvider}. Segments and page revision
 * properties can be cached in the supplied WANObjectCache.
 *
 * @since 0.1.10
 */
class SegmentPageFactory {

	/**
	 * Cache holding segmented pages and page revision properties.
	 * @var WANObjectCache
	 */
	private $cache;

	/**
	 * Used to create the 'wikispeech' config from which tag settings
	 * are read when they have not been set explicitly.
	 * @var ConfigFactory
	 */
	private $configFactory;

	/**
	 * Whether or not to use cache for segmentation
	 * @var bool
	 */
	private $useSegmentsCache = true;

	/**
	 * Whether or not to use cache for page revision properties,
	 * i.e. to retrieve page id and title when only supplying a revision id.
	 *
	 * Leaving this off while consumerUrl is set
	 * will cause one extra http request to the remote wiki
	 * in order to lookup pageId, which is required to create or find
	 * utterances in UtteranceStore.
	 *
	 * @var bool
	 */
	private $useRevisionPropertiesCache = false;

	/**
	 * Will default to an instance of {@link StandardSegmenter} if not set.
	 * @var Segmenter|null
	 */
	private $segmenter = null;

	/**
	 * Will default to config setting WikispeechRemoveTags if not set.
	 * @var string[]|null
	 */
	private $removeTags = null;

	/**
	 * Will default to config setting WikispeechSegmentBreakingTags if not set.
	 * @var string[]|null
	 */
	private $segmentBreakingTags = null;

	/**
	 * Required only when providing page content from a local wiki.
	 * @var IContextSource|null
	 */
	private $contextSource = null;

	/**
	 * Required only when providing page content from a local wiki.
	 * @var RevisionStore|null
	 */
	private $revisionStore = null;

	/**
	 * Required only when providing page content from a remote wiki.
	 * @var HttpRequestFactory|null
	 */
	private $httpRequestFactory = null;

	/**
	 * Required only when providing page content from a remote wiki.
	 * @var string|null
	 */
	private $consumerUrl = null;

	/**
	 * If true, page id and title are always retrieved from page provider or cache
	 * and made available in the response from segmentPage.
	 *
	 * @var bool
	 */
	private $requirePageRevisionProperties = false;

	/**
	 * @since 0.1.10
	 * @param WANObjectCache $cache
	 * @param ConfigFactory $configFactory
	 */
	public function __construct(
		WANObjectCache $cache,
		ConfigFactory $configFactory
	) {
		$this->cache = $cache;
		$this->configFactory = $configFactory;
	}

	/**
	 * @see SegmentPageFactory::$useSegmentsCache
	 * @since 0.1.10
	 * @param bool $useSegmentsCache
	 * @return SegmentPageFactory $this
	 */
	public function setUseSegmentsCache(
		bool $useSegmentsCache
	): SegmentPageFactory {
		$this->useSegmentsCache = $useSegmentsCache;
		return $this;
	}

	/**
	 * @see SegmentPageFactory::$useRevisionPropertiesCache
	 * @since 0.1.10
	 * @param bool $useRevisionPropertiesCache
	 * @return SegmentPageFactory $this
	 */
	public function setUseRevisionPropertiesCache(
		bool $useRevisionPropertiesCache
	): SegmentPageFactory {
		$this->useRevisionPropertiesCache = $useRevisionPropertiesCache;
		return $this;
	}

	/**
	 * @see SegmentPageFactory::$segmenter
	 * @since 0.1.10
	 * @param Segmenter|null $segmenter
	 * @return SegmentPageFactory $this
	 */
	public function setSegmenter(
		?Segmenter $segmenter
	): SegmentPageFactory {
		$this->segmenter = $segmenter;
		return $this;
	}

	/**
	 * Selects the segmenter to use based on a language.
	 *
	 * The language is currently ignored: a {@link StandardSegmenter}
	 * is always selected.
	 *
	 * @see SegmentPageFactory::setSegmenter()
	 * @since 0.1.10
	 * @param string $language
	 * @return SegmentPageFactory $this
	 */
	public function setSegmenterByLanguage(
		string $language
	): SegmentPageFactory {
		// @todo lookup segmenter by language
		return $this->setSegmenter( new StandardSegmenter() );
	}

	/**
	 * @see SegmentPageFactory::$removeTags
	 * @since 0.1.10
	 * @param string[]|null $removeTags
	 * @return SegmentPageFactory $this
	 */
	public function setRemoveTags(
		?array $removeTags
	): SegmentPageFactory {
		$this->removeTags = $removeTags;
		return $this;
	}

	/**
	 * @see SegmentPageFactory::$segmentBreakingTags
	 * @since 0.1.10
	 * @param string[]|null $segmentBreakingTags
	 * @return SegmentPageFactory $this
	 */
	public function setSegmentBreakingTags(
		?array $segmentBreakingTags
	): SegmentPageFactory {
		$this->segmentBreakingTags = $segmentBreakingTags;
		return $this;
	}

	/**
	 * @see SegmentPageFactory::$contextSource
	 * @since 0.1.10
	 * @param IContextSource|null $contextSource
	 * @return SegmentPageFactory $this
	 */
	public function setContextSource(
		?IContextSource $contextSource
	): SegmentPageFactory {
		$this->contextSource = $contextSource;
		return $this;
	}

	/**
	 * @see SegmentPageFactory::$revisionStore
	 * @since 0.1.10
	 * @param RevisionStore|null $revisionStore
	 * @return SegmentPageFactory $this
	 */
	public function setRevisionStore(
		?RevisionStore $revisionStore
	): SegmentPageFactory {
		$this->revisionStore = $revisionStore;
		return $this;
	}

	/**
	 * @see SegmentPageFactory::$httpRequestFactory
	 * @since 0.1.10
	 * @param HttpRequestFactory|null $httpRequestFactory
	 * @return SegmentPageFactory $this
	 */
	public function setHttpRequestFactory(
		?HttpRequestFactory $httpRequestFactory
	): SegmentPageFactory {
		$this->httpRequestFactory = $httpRequestFactory;
		return $this;
	}

	/**
	 * @see SegmentPageFactory::$consumerUrl
	 * @since 0.1.10
	 * @param string|null $consumerUrl
	 * @return SegmentPageFactory $this
	 */
	public function setConsumerUrl(
		?string $consumerUrl
	): SegmentPageFactory {
		$this->consumerUrl = $consumerUrl;
		return $this;
	}

	/**
	 * If true, page id and title are always retrieved from page provider or cache
	 * and made available in the response from segmentPage.
	 *
	 * @see SegmentPageFactory::$requirePageRevisionProperties
	 * @since 0.1.10
	 * @param bool $requirePageRevisionProperties
	 * @return SegmentPageFactory $this
	 */
	public function setRequirePageRevisionProperties(
		bool $requirePageRevisionProperties
	): SegmentPageFactory {
		$this->requirePageRevisionProperties = $requirePageRevisionProperties;
		return $this;
	}

	/**
	 * Creates the page provider matching the current configuration:
	 * a remote wiki provider if a consumer URL is set,
	 * otherwise a local wiki provider.
	 *
	 * @since 0.1.10
	 * @return PageProvider
	 * @throws LogicException If a dependency required by the selected
	 *  provider has not been set.
	 */
	protected function pageProviderFactory(): PageProvider {
		if ( $this->consumerUrl ) {
			if ( $this->httpRequestFactory === null ) {
				throw new LogicException( '$httpRequestFactory is null!' );
			}
			return new RemoteWikiPageProvider( $this->consumerUrl, $this->httpRequestFactory );
		}

		if ( $this->contextSource === null ) {
			throw new LogicException( '$contextSource is null!' );
		}
		if ( $this->revisionStore === null ) {
			throw new LogicException( '$revisionStore is null!' );
		}
		return new LocalWikiPageProvider( $this->contextSource, $this->revisionStore );
	}

	/**
	 * Loads revision properties
	 * from cache (if set to use)
	 * or via pageProvider (if not using or missing in cache).
	 *
	 * Properties loaded via the page provider are written back to the cache
	 * when the revision properties cache is in use.
	 *
	 * @since 0.1.10
	 * @param PageProvider $pageProvider
	 * @param int $revisionId
	 * @return PageRevisionProperties
	 */
	protected function loadPageRevisionProperties(
		PageProvider $pageProvider,
		int $revisionId
	): PageRevisionProperties {
		$cacheKey = $this->pageRevisionPropertiesCacheKeyFactory( $pageProvider, $revisionId );
		// Lookup title and page id given the revision id.
		if ( $this->useRevisionPropertiesCache ) {
			$revisionProperties = $this->cache->get( $cacheKey );
		} else {
			$revisionProperties = false;
		}
		// False means either a cache miss or that the cache is not in use.
		if ( $revisionProperties === false ) {
			$revisionProperties = $pageProvider->loadPageRevisionProperties( $revisionId );
			if ( $this->useRevisionPropertiesCache ) {
				// NOTE(review): no explicit TTL is passed here, unlike the segments
				// cache which uses TTL_HOUR - confirm that this is intended.
				$this->cache->set( $cacheKey, $revisionProperties );
			}
		}
		return $revisionProperties;
	}

	/**
	 * Convenience function to build segmenter
	 * which is immediately invoked to segment page
	 * after parsing DOM of the HTML
	 * supplied by a page provider that is constructed from the factory fields,
	 * possibly loading extra properties required by invoker,
	 * and possibly bypassing all of above invocation using caches.
	 *
	 * $title XOR $revisionId must be provided. The arguments are validated
	 * before the page provider is constructed, so invalid arguments are
	 * reported regardless of how the factory is configured.
	 *
	 * @since 0.1.10
	 * @param Title|null $title
	 * @param int|null $revisionId
	 * @return SegmentPageResponse
	 * @throws InvalidArgumentException If $title xor $revisionId is not provided.
	 * @throws LogicException If a dependency required by the page provider is
	 *  missing, or if title or revision id is unexpectedly null in the response.
	 * @throws MWException If display title, page content or revision id
	 *  was not loaded by the page provider.
	 * @throws OutdatedOrInvalidRevisionException If $revisionId differs from
	 *  the revision id reported by the page provider.
	 */
	public function segmentPage(
		?Title $title,
		?int $revisionId
	): SegmentPageResponse {
		// Fail fast: exactly one of $title and $revisionId must be set.
		if ( ( $title === null ) === ( $revisionId === null ) ) {
			throw new InvalidArgumentException( '$title xor $revisionId must be provided.' );
		}

		$pageProvider = $this->pageProviderFactory();

		$segmentPageResponse = new SegmentPageResponse();

		$revisionProperties = null;

		if ( $revisionId !== null ) {
			// Lookup title and page id given the revision id.
			$revisionProperties = $this->loadPageRevisionProperties( $pageProvider, $revisionId );
			$segmentPageResponse->setTitle( $revisionProperties->getTitle() );
			$segmentPageResponse->setPageId( $revisionProperties->getPageId() );
			$segmentPageResponse->setRevisionId( $revisionId );
		} else {
			// Set user supplied title.
			$segmentPageResponse->setTitle( $title );
		}

		// Explicitly set tags take precedence over the config settings.
		$config = $this->configFactory->makeConfig( 'wikispeech' );
		$removeTags = $this->removeTags ?? $config->get( 'WikispeechRemoveTags' );
		$segmentBreakingTags = $this->segmentBreakingTags ?? $config->get( 'WikispeechSegmentBreakingTags' );

		$segmenter = $this->segmenter ?? new StandardSegmenter();

		// The segments cache is keyed on revision id,
		// so it can only be consulted when the invoker supplied one.
		if ( $this->useSegmentsCache && $revisionId !== null ) {
			$segments = $this->cache->get(
				$this->segmentedPageCacheKeyFactory(
					$removeTags,
					$segmentBreakingTags,
					$pageProvider,
					$revisionId
				)
			);
		} else {
			$segments = false;
		}
		// False means either a cache miss or that the cache was not consulted.
		if ( $segments === false ) {
			$segmentPageResponseTitle = $segmentPageResponse->getTitle();
			if ( $segmentPageResponseTitle === null ) {
				throw new LogicException( 'Title is null!' );
			}
			$pageProvider->loadData( $segmentPageResponseTitle );

			$displayTitle = $pageProvider->getDisplayTitle();
			if ( $displayTitle === null ) {
				throw new MWException( 'Display title not loaded!' );
			}
			$pageContent = $pageProvider->getPageContent();
			if ( $pageContent === null ) {
				throw new MWException( 'Page content not loaded!' );
			}
			$providedRevisionId = $pageProvider->getRevisionId();
			if ( $providedRevisionId === null ) {
				throw new MWException( 'Revision id not loaded!' );
			}

			// The provider loads by title; reject the request if the revision
			// it reports is not the one the invoker asked for.
			if ( $revisionId !== null
				&& $revisionId !== $providedRevisionId
			) {
				throw new OutdatedOrInvalidRevisionException( 'An outdated or invalid revision id was provided' );
			}

			$cleanedText = $this->cleanHtmlDom(
				$displayTitle,
				$pageContent,
				$removeTags,
				$segmentBreakingTags
			);

			$segments = new SegmentList( $segmenter->segmentSentences( $cleanedText ) );

			if ( $this->useSegmentsCache ) {
				$this->cache->set(
					// use revision as stated by page provider, not as provided by invoker.
					$this->segmentedPageCacheKeyFactory(
						$removeTags,
						$segmentBreakingTags,
						$pageProvider,
						$providedRevisionId
					),
					$segments,
					WANObjectCache::TTL_HOUR
				);
			}
			$segmentPageResponse->setRevisionId( $providedRevisionId );
		}

		$segmentPageResponse->setSegments( $segments );

		// Revision properties are already loaded if the invoker supplied a revision id.
		if ( $this->requirePageRevisionProperties && $revisionProperties === null ) {
			$segmentPageResponseRevisionId = $segmentPageResponse->getRevisionId();
			if ( $segmentPageResponseRevisionId === null ) {
				throw new LogicException( 'Revision id is null!' );
			}
			$revisionProperties = $this->loadPageRevisionProperties(
				$pageProvider,
				$segmentPageResponseRevisionId
			);
			$segmentPageResponse->setTitle( $revisionProperties->getTitle() );
			$segmentPageResponse->setPageId( $revisionProperties->getPageId() );
		}

		return $segmentPageResponse;
	}

	/**
	 * Builds the cache key under which the segments of a page revision are stored.
	 *
	 * The key is made up of the class names of this factory and of the page
	 * provider, the revision id, both tag settings and the key components
	 * supplied by the page provider.
	 *
	 * NOTE(review): the segmenter is not part of the key, so segments produced
	 * by different segmenters would share a cache entry - confirm whether
	 * that is acceptable.
	 *
	 * @param string[] $removeTags
	 * @param string[] $segmentBreakingTags
	 * @param PageProvider $pageProvider
	 * @param int|null $revisionId
	 * @return string
	 */
	private function segmentedPageCacheKeyFactory(
		array $removeTags,
		array $segmentBreakingTags,
		PageProvider $pageProvider,
		?int $revisionId
	): string {
		$cacheKeyComponents = [
			get_class( $this ),
			get_class( $pageProvider ),
			$revisionId,
			var_export( $removeTags, true ),
			implode( '-', $segmentBreakingTags ),
			$pageProvider->getCachedSegmentsKeyComponents()
		];
		return $this->cache->makeKey(
			'Wikispeech.segments',
			...$cacheKeyComponents
		);
	}

	/**
	 * Builds the cache key under which the properties of a page revision are stored.
	 *
	 * The key is made up of the class names of this factory and of the page
	 * provider, the revision id and the key components supplied by the
	 * page provider.
	 *
	 * @param PageProvider $pageProvider
	 * @param int $revisionId
	 * @return string
	 */
	private function pageRevisionPropertiesCacheKeyFactory(
		PageProvider $pageProvider,
		int $revisionId
	): string {
		$cacheKeyComponents = [
			get_class( $this ),
			get_class( $pageProvider ),
			$revisionId,
			$pageProvider->getCachedSegmentsKeyComponents()
		];
		return $this->cache->makeKey(
			'Wikispeech.pageRevisionProperties',
			...$cacheKeyComponents
		);
	}

	/**
	 * Cleans the display title and page content using a new {@link Cleaner}
	 * configured with the supplied tags.
	 *
	 * This method exists due to need for test mocking.
	 *
	 * @see Cleaner::cleanHtmlDom()
	 * @since 0.1.10
	 * @param string $displayTitle
	 * @param string $pageContent
	 * @param string[] $removeTags
	 * @param string[] $segmentBreakingTags
	 * @return array Whatever Cleaner::cleanHtmlDom() returns.
	 * @throws MWException
	 */
	protected function cleanHtmlDom(
		string $displayTitle,
		string $pageContent,
		array $removeTags,
		array $segmentBreakingTags
	): array {
		$cleaner = new Cleaner( $removeTags, $segmentBreakingTags );
		return $cleaner->cleanHtmlDom( $displayTitle, $pageContent );
	}
}