Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
5.26% |
2 / 38 |
|
16.67% |
1 / 6 |
CRAP | |
0.00% |
0 / 1 |
StoryContentAnalyzer | |
5.26% |
2 / 38 |
|
16.67% |
1 / 6 |
206.31 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
hasOutdatedText | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
isOutdatedText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
inText | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
transformText | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getArticleText | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\Wikistories; |
4 | |
5 | use MediaWiki\Page\WikiPageFactory; |
6 | use MediaWiki\Title\Title; |
7 | |
/**
 * Compares the text stored in a story's frames against the current text of
 * the article the story was built from, to detect frames whose text can no
 * longer be found in the article.
 */
class StoryContentAnalyzer {

	/**
	 * Characters treated as sentence boundaries when splitting frame text.
	 * This sentence separator works for many but not all languages.
	 * todo: re-visit when deploying to a new wiki
	 */
	private const SENTENCE_SEPARATOR_REGEX = '/[.:;]/';

	/** @var WikiPageFactory Used to load the article page whose text is analyzed */
	private $wikiPageFactory;

	/**
	 * @var array Instance cache of transformed article text, indexed by the
	 * namespace-prefixed DB key of the article title
	 */
	private $cache = [];

	/**
	 * @param WikiPageFactory $wikiPageFactory
	 */
	public function __construct(
		WikiPageFactory $wikiPageFactory
	) {
		$this->wikiPageFactory = $wikiPageFactory;
	}

	/**
	 * Tell whether at least one frame of the story contains outdated text.
	 *
	 * @param StoryContent $story
	 * @return bool True as soon as one frame is outdated according to
	 * isOutdatedText(). False when no frame is outdated, when the story has no
	 * article title, or when the article text cannot be obtained.
	 */
	public function hasOutdatedText( StoryContent $story ): bool {
		$articleTitle = $story->getArticleTitle();
		if ( $articleTitle === null ) {
			return false;
		}

		$articleText = $this->getArticleText( $articleTitle );
		if ( $articleText === false ) {
			// Nothing to compare against
			return false;
		}

		foreach ( $story->getFrames() as $frame ) {
			$frameText = $frame->text;
			if ( $this->isOutdatedText(
				$articleText,
				$frameText->value,
				$frameText->fromArticle->originalText
			) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * A frame text is outdated when neither its current text nor its original
	 * text can be found in the article text.
	 *
	 * @param string $articleText Article text, as returned by getArticleText()
	 * @param string $currentText Text currently stored in the frame
	 * @param string $originalText Text stored as the frame's original text
	 * @return bool
	 */
	public function isOutdatedText( string $articleText, string $currentText, string $originalText ): bool {
		return !$this->inText( $currentText, $articleText )
			&& !$this->inText( $originalText, $articleText );
	}

	/**
	 * @param string $part Block of text containing one or more sentences
	 * originally selected from the article text. May have been manually edited
	 * by story editor.
	 * @param string $text Article text
	 * @return bool True if all the sentences in $part are present in the article text
	 */
	private function inText( string $part, string $text ): bool {
		$sentences = preg_split( self::SENTENCE_SEPARATOR_REGEX, $part );
		foreach ( $sentences as $sentence ) {
			$sentence = trim( $sentence );
			if ( $sentence === '' ) {
				// A trailing separator ("Some text.") yields an empty fragment,
				// which carries no content to look for.
				continue;
			}
			if ( !str_contains( $text, $sentence ) ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Remove unnecessary elements from the html text
	 *
	 * Tags are stripped, entities are decoded, numeric reference markers
	 * such as "[1]" are removed and every run of whitespace is collapsed
	 * to a single space.
	 *
	 * @param string $html
	 * @return string
	 */
	public function transformText( string $html ): string {
		// Remove HTML tags and convert entities
		$text = html_entity_decode( strip_tags( $html ) );

		// Remove references ([1]). This must happen before whitespace is
		// collapsed: a reference surrounded by whitespace would otherwise
		// leave two consecutive spaces in the result.
		$text = preg_replace( '/\[\d+\]/', '', $text );

		// Convert multiple spaces to single space
		$text = preg_replace( '/\s+/', ' ', $text );
		return $text;
	}

	/**
	 * Get the transformed (see transformText()) text of an article.
	 * Successful results are cached for the lifetime of this instance.
	 *
	 * @param Title $articleTitle
	 * @return string|false False when no parser output is available for the page
	 */
	public function getArticleText( Title $articleTitle ) {
		// Key the cache on the namespace-prefixed DB key: the bare DB key is
		// the same for same-named pages in different namespaces, which would
		// make them share a single cache entry.
		$cacheKey = $articleTitle->getPrefixedDBkey();
		if ( isset( $this->cache[ $cacheKey ] ) ) {
			return $this->cache[ $cacheKey ];
		}

		$page = $this->wikiPageFactory->newFromTitle( $articleTitle );
		$parserOptions = $page->makeParserOptions( 'canonical' );
		$parserOutput = $page->getParserOutput( $parserOptions );

		if ( !$parserOutput ) {
			return false;
		}

		$text = $this->transformText( $parserOutput->getText() );
		$this->cache[ $cacheKey ] = $text;

		return $text;
	}

}