Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
5.26% |
2 / 38 |
|
16.67% |
1 / 6 |
CRAP | |
0.00% |
0 / 1 |
| StoryContentAnalyzer | |
5.26% |
2 / 38 |
|
16.67% |
1 / 6 |
206.31 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| hasOutdatedText | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
| isOutdatedText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
| inText | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| transformText | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| getArticleText | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace MediaWiki\Extension\Wikistories; |
| 4 | |
| 5 | use MediaWiki\Page\WikiPageFactory; |
| 6 | use MediaWiki\Title\Title; |
| 7 | |
/**
 * Compares the text used in a story's frames against the current text of the
 * article the story was created from, to detect frames whose text no longer
 * appears in that article.
 */
class StoryContentAnalyzer {

	/**
	 * This sentence separator works for many but not all languages.
	 * todo: re-visit when deploying to a new wiki
	 */
	private const SENTENCE_SEPARATOR_REGEX = '/[.:;]/';

	/**
	 * @var array<string,string> Instance cache of transformed article text,
	 * indexed by the prefixed (namespace-qualified) DB key of the title
	 */
	private array $cache = [];

	/**
	 * @param WikiPageFactory $wikiPageFactory Used to load the article page
	 * whose parser output is compared with the story text
	 */
	public function __construct(
		private readonly WikiPageFactory $wikiPageFactory,
	) {
	}

	/**
	 * Check whether at least one frame of the story contains text that can no
	 * longer be found in the related article.
	 *
	 * @param StoryContent $story
	 * @return bool False when the story has no article title or the article
	 * text cannot be obtained; otherwise true as soon as one frame is outdated
	 */
	public function hasOutdatedText( StoryContent $story ): bool {
		$articleTitle = $story->getArticleTitle();
		if ( $articleTitle === null ) {
			return false;
		}

		$articleText = $this->getArticleText( $articleTitle );
		if ( $articleText === null ) {
			return false;
		}

		foreach ( $story->getFrames() as $frame ) {
			if ( $this->isOutdatedText(
				$articleText,
				$frame->text->value,
				$frame->text->fromArticle->originalText
			) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * A frame text is outdated when neither its current text nor its original
	 * text is entirely present in the article text.
	 *
	 * @param string $articleText Article text, as returned by getArticleText()
	 * @param string $currentText Text currently shown in the frame
	 * @param string $originalText Text originally taken from the article
	 * @return bool
	 */
	public function isOutdatedText( string $articleText, string $currentText, string $originalText ): bool {
		return !$this->inText( $currentText, $articleText )
			&& !$this->inText( $originalText, $articleText );
	}

	/**
	 * @param string $part Block of text containing one or more sentences
	 * originally selected from the article text. May have been manually edited
	 * by story editor.
	 * @param string $text Article text
	 * @return bool True if all the sentences in $part are present in the article text
	 */
	private function inText( string $part, string $text ): bool {
		$sentences = preg_split( self::SENTENCE_SEPARATOR_REGEX, $part, -1, PREG_SPLIT_NO_EMPTY );
		if ( $sentences === false ) {
			// preg_split() failed: fall back to matching the part as a single sentence
			$sentences = [ $part ];
		}

		foreach ( $sentences as $sentence ) {
			$sentence = trim( $sentence );
			// Empty fragments (e.g. after a trailing separator) match trivially
			if ( $sentence !== '' && !str_contains( $text, $sentence ) ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Remove unnecessary elements from the html text
	 *
	 * @param string $html
	 * @return string Text without tags, with entities decoded, whitespace runs
	 * collapsed to a single space and numeric reference markers removed
	 */
	public function transformText( string $html ): string {
		// Remove HTML tags and convert entities
		$text = html_entity_decode( strip_tags( $html ) );

		// Convert multiple spaces to single space.
		// preg_replace() returns null on error; keep the previous text in that case.
		$text = preg_replace( '/\s+/', ' ', $text ) ?? $text;

		// Remove references ([1])
		return preg_replace( '/\[\d+\]/', '', $text ) ?? $text;
	}

	/**
	 * Get the transformed text of an article, using the instance cache when
	 * the same title was already requested.
	 *
	 * @param Title $articleTitle
	 * @return string|null Null when no parser output is available for the page
	 */
	public function getArticleText( Title $articleTitle ): ?string {
		// The prefixed key includes the namespace, so pages with the same name
		// in different namespaces do not share a cache entry.
		$cacheKey = $articleTitle->getPrefixedDBkey();
		if ( isset( $this->cache[ $cacheKey ] ) ) {
			return $this->cache[ $cacheKey ];
		}

		$page = $this->wikiPageFactory->newFromTitle( $articleTitle );
		$parserOptions = $page->makeParserOptions( 'canonical' );
		$parserOutput = $page->getParserOutput( $parserOptions );

		if ( !$parserOutput ) {
			return null;
		}

		// NOTE(review): getRawText() appears to be deprecated in recent MediaWiki
		// releases in favour of getContentHolderText() - confirm against the
		// MediaWiki version this extension supports.
		$html = $parserOutput->getRawText();
		$text = $this->transformText( $html );
		$this->cache[ $cacheKey ] = $text;

		return $text;
	}

}