Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
98.68% |
75 / 76 |
|
90.00% |
9 / 10 |
CRAP | |
0.00% |
0 / 1 |
StandardSegmenter | |
98.68% |
75 / 76 |
|
90.00% |
9 / 10 |
32 | |
0.00% |
0 / 1 |
segmentSentences | |
90.91% |
10 / 11 |
|
0.00% |
0 / 1 |
5.02 | |||
addContentsToCurrentSegment | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
addContentToCurrentSegment | |
100.00% |
30 / 30 |
|
100.00% |
1 / 1 |
7 | |||
getLeadingWhitespacesLength | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getSentenceFinalOffset | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
isSentenceFinal | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
9 | |||
isSentenceEndingPunctuation | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
isUpper | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isLetter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
finishSegment | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Wikispeech\Segment; |
4 | |
5 | use RuntimeException; |
6 | |
7 | /** |
8 | * @file |
9 | * @ingroup Extensions |
10 | * @license GPL-2.0-or-later |
11 | */ |
12 | |
/**
 * Generic segmenter built for use with Swedish, English and Arabic language pages.
 *
 * Walks through cleaned page content and groups it into segments, one
 * per sentence. Segment boundaries are created by explicit
 * `SegmentBreak` items and by dots that are judged to be sentence
 * final, see `isSentenceFinal()`.
 *
 * @since 0.1.10
 */
class StandardSegmenter extends Segmenter {

	/**
	 * An array to which finished segments are added.
	 *
	 * @var Segment[]|null
	 */
	private $segments = null;

	/**
	 * The segment that is currently being built.
	 *
	 * @var Segment|null
	 */
	private $currentSegment = null;

	/**
	 * Divide a cleaned content array into segments, one for each sentence.
	 *
	 * A sentence is here defined as a sequence of tokens ending with a dot (full stop).
	 *
	 * @since 0.1.10
	 * @param SegmentContent[] $cleanedContent An array of items returned by `Cleaner::cleanHtml()`.
	 * @return Segment[] An array of segments, each containing the `CleanedText`s in that segment.
	 * @throws RuntimeException If an item is neither a `CleanedText` nor a `SegmentBreak`.
	 */
	public function segmentSentences( array $cleanedContent ): array {
		// Start from a clean state so that the instance can be reused.
		$this->segments = [];
		$this->currentSegment = new Segment();
		foreach ( $cleanedContent as $item ) {
			if ( $item instanceof CleanedText ) {
				$this->addContentsToCurrentSegment( $item );
			} elseif ( $item instanceof SegmentBreak ) {
				$this->finishSegment();
			} else {
				throw new RuntimeException( 'Unsupported instance of SegmentContent' );
			}
		}
		if ( $this->currentSegment->getContent() ) {
			// Add the last segment, unless it's empty.
			$this->finishSegment();
		}
		return $this->segments;
	}

	/**
	 * Add segment contents for a string.
	 *
	 * Looks for sentence final strings (strings which a sentence ends
	 * with). When a sentence final string is found, its sentence is
	 * added to the $currentSegment.
	 *
	 * @since 0.1.10
	 * @param CleanedText $text The text to segment.
	 */
	private function addContentsToCurrentSegment( CleanedText $text ): void {
		// The length doesn't change while looping, so only calculate it once.
		$length = mb_strlen( $text->getString() );
		$nextStartOffset = 0;
		do {
			$endOffset = $this->addContentToCurrentSegment( $text, $nextStartOffset );
			// The earliest the next segments can start is one after
			// the end of the current one.
			$nextStartOffset = $endOffset + 1;
			// Stops when at most one character remains. A sentence
			// final dot is only followed by nothing, a newline or a
			// space, so a single remaining character is whitespace.
		} while ( $nextStartOffset < $length - 1 );
	}

	/**
	 * Add a sentence, or part thereof, to a segment.
	 *
	 * Finds the next sentence by sentence final characters and adds
	 * them to the segment under construction. If no sentence final
	 * character was found, all the remaining text is added. Stores
	 * start offset when the first text of a segment is added and end
	 * offset when the last is.
	 *
	 * @since 0.1.10
	 * @param CleanedText $text The text to segment.
	 * @param int $startOffset The offset where the next sentence can
	 *  start, at the earliest. If the sentence has leading
	 *  whitespaces, this will be moved forward.
	 * @return int The offset of the last character in the
	 *  sentence. If the sentence didn't end yet, this is the last
	 *  character of $text.
	 */
	private function addContentToCurrentSegment(
		CleanedText $text,
		int $startOffset = 0
	): int {
		$string = $text->getString();
		if ( $this->currentSegment->getStartOffset() === null ) {
			// Move the start offset ahead by the number of leading
			// whitespaces. This means that whitespaces before or
			// between segments aren't included.
			$startOffset += $this->getLeadingWhitespacesLength(
				mb_substr( $string, $startOffset )
			);
		}
		// Get the offset for the next sentence final character.
		$endOffset = $this->getSentenceFinalOffset( $string, $startOffset );
		// If no sentence final character is found, add the rest of
		// the text and remember that this segment isn't ended.
		$ended = $endOffset !== null;
		if ( !$ended ) {
			$endOffset = mb_strlen( $string ) - 1;
		}
		$sentence = mb_substr(
			$string,
			$startOffset,
			$endOffset - $startOffset + 1
		);
		if ( $sentence !== '' && $sentence !== "\n" ) {
			// Don't add `CleanedText`s with the empty string or only
			// newline.
			$this->currentSegment->addContent(
				new CleanedText( $sentence, $text->getPath() )
			);
			if ( $this->currentSegment->getStartOffset() === null ) {
				// Record the start offset if this is the first text
				// added to the segment.
				$this->currentSegment->setStartOffset( $startOffset );
			}
			$this->currentSegment->setEndOffset( $endOffset );
			if ( $ended ) {
				$this->finishSegment();
			}
		}
		return $endOffset;
	}

	/**
	 * Get the number of whitespaces at the start of a string.
	 *
	 * @since 0.1.10
	 * @param string $string The string to count leading whitespaces
	 *  for.
	 * @return int The number of whitespaces at the start of $string.
	 */
	private function getLeadingWhitespacesLength( string $string ): int {
		// Measure the match directly rather than comparing lengths
		// before and after a replace. preg_replace() returns null on
		// failure, e.g. for malformed UTF-8, which would then be
		// passed on to mb_strlen().
		if ( preg_match( '/^\s+/u', $string, $matches ) === 1 ) {
			return mb_strlen( $matches[0] );
		}
		return 0;
	}

	/**
	 * Get the offset of the first sentence final character in a string.
	 *
	 * NOTE(review): Only dots are searched for here, so the question
	 * and exclamation marks accepted by `isSentenceEndingPunctuation()`
	 * are never tested through this method. This matches the sentence
	 * definition documented on `segmentSentences()`; confirm if that is
	 * intended before widening the search.
	 *
	 * @since 0.1.10
	 * @param string $string The string to look in.
	 * @param int $offset The offset to start looking from.
	 * @return int|null The offset of the first sentence final character
	 *  that was found, if any, else null.
	 */
	private function getSentenceFinalOffset(
		string $string,
		int $offset
	): ?int {
		// For every potentially sentence final character after the
		// first one, we want to start looking from the character
		// after the last one we found. For the first one however, we
		// want to start looking from the character at the offset, to
		// not miss if that is a sentence final character. To only
		// have one loop for both these cases, we need to go back one
		// for the first search.
		$offset--;
		do {
			// Find the next character that may be sentence final.
			$offset = mb_strpos( $string, '.', $offset + 1 );
			if ( $offset === false ) {
				// No character that can be sentence final was found.
				return null;
			}
		} while ( !$this->isSentenceFinal( $string, $offset ) );
		return $offset;
	}

	/**
	 * Test if a character is at the end of a sentence.
	 *
	 * Dots in abbreviations should only be counted when they also are sentence final.
	 * For example:
	 * "Monkeys, penguins etc.", but not "Monkeys e.g. baboons".
	 *
	 * A punctuation character is sentence final if it's at the end of
	 * the string, followed by a newline, followed by a space that
	 * ends the string or followed by a space and an upper case letter.
	 *
	 * @since 0.1.10
	 * @param string $string The string to check in.
	 * @param int $index The index in $string of the character to check.
	 * @return bool True if the character is sentence final, else false.
	 */
	protected function isSentenceFinal(
		string $string,
		int $index
	): bool {
		if ( !self::isSentenceEndingPunctuation( mb_substr( $string, $index, 1 ) ) ) {
			return false;
		}
		// Presence of the following characters is decided by length
		// and not by truthiness. The string "0" is falsy in PHP, so
		// truthiness checks would treat the dot in "1.0" as being at
		// the end of the string.
		$length = mb_strlen( $string );
		if ( $index + 1 >= $length ) {
			// At the end of the string.
			return true;
		}
		$nextCharacter = mb_substr( $string, $index + 1, 1 );
		if ( $nextCharacter === "\n" ) {
			// At the end of a line.
			return true;
		}
		if ( $nextCharacter !== ' ' ) {
			return false;
		}
		if ( $index + 2 >= $length ) {
			// Followed by nothing but a trailing space.
			return true;
		}
		$characterAfterNext = mb_substr( $string, $index + 2, 1 );
		return self::isLetter( $characterAfterNext ) &&
			self::isUpper( $characterAfterNext );
	}

	/**
	 * Test if a string is a sentence ending punctuation character.
	 *
	 * @since 0.1.10
	 * @param string $string
	 * @return bool If param $string is a sentence ending punctuation.
	 */
	private static function isSentenceEndingPunctuation( string $string ): bool {
		return $string === '.' ||
			$string === '?' ||
			$string === '!';
	}

	/**
	 * Test if a string is upper case.
	 *
	 * A string that is unchanged by upper casing counts as upper
	 * case, which includes strings without any cased characters.
	 *
	 * @since 0.1.10
	 * @param string $string The string to test.
	 * @return bool true if the entire string is upper case, else false.
	 */
	private static function isUpper( string $string ): bool {
		return mb_strtoupper( $string ) === $string;
	}

	/**
	 * Test if a string is an alphabetical letter of any language
	 *
	 * @since 0.1.10
	 * @param string $string The string to test.
	 * @return bool true if the entire string is an alphabetical letter, else false.
	 */
	private static function isLetter( string $string ): bool {
		// preg_match() returns 1, 0 or false; compare explicitly to
		// get a real boolean instead of relying on type coercion.
		return preg_match( '/^\p{L}$/u', $string ) === 1;
	}

	/**
	 * Add the current segment to the array of segments.
	 *
	 * The segment is only added if it has any content. Creates a new,
	 * empty segment as the new current segment.
	 *
	 * @since 0.1.10
	 */
	private function finishSegment(): void {
		if ( count( $this->currentSegment->getContent() ) ) {
			$this->currentSegment->setHash( $this->evaluateHash( $this->currentSegment ) );
			$this->segments[] = $this->currentSegment;
		}
		// Create a fresh segment to add following text to.
		$this->currentSegment = new Segment();
	}
}
282 | } |