Code Coverage for /workspace/src/extensions/Wikispeech/includes/Segment/StandardSegmenter.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	98.68% covered (success)	98.68%	75 / 76	90.00% covered (success)	90.00%	9 / 10	CRAP	0.00% covered (danger)	0.00%	0 / 1
StandardSegmenter	98.68% covered (success)	98.68%	75 / 76	90.00% covered (success)	90.00%	9 / 10	32	0.00% covered (danger)	0.00%	0 / 1
segmentSentences	90.91% covered (success)	90.91%	10 / 11	0.00% covered (danger)	0.00%	0 / 1	5.02
addContentsToCurrentSegment	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1
addContentToCurrentSegment	100.00% covered (success)	100.00%	30 / 30	100.00% covered (success)	100.00%	1 / 1	7
getLeadingWhitespacesLength	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
getSentenceFinalOffset	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
isSentenceFinal	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	9
isSentenceEndingPunctuation	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	3
isUpper	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
isLetter	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
finishSegment	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2

1	<?php
2
3	namespace MediaWiki\Wikispeech\Segment;
4
5	use RuntimeException;
6
7	/**
8	* @file
9	* @ingroup Extensions
10	* @license GPL-2.0-or-later
11	*/
12
/**
 * Generic segmenter built for use with Swedish, English and Arabic language pages.
 *
 * Consumes the cleaned content of a page and groups it into segments,
 * one per sentence. A segment may span several `CleanedText` items and
 * a single `CleanedText` may be split over several segments.
 *
 * @since 0.1.10
 */
class StandardSegmenter extends Segmenter {

	/**
	 * An array to which finished segments are added.
	 *
	 * Reset at the start of every call to `segmentSentences()`.
	 *
	 * @var Segment[]|null
	 */
	private $segments = null;

	/**
	 * The segment that is currently being built.
	 *
	 * @var Segment|null
	 */
	private $currentSegment = null;

	/**
	 * Divide a cleaned content array into segments, one for each sentence.
	 *
	 * A sentence is here defined as a sequence of tokens ending with a dot (full stop).
	 * A `SegmentBreak` item always ends the segment under construction.
	 *
	 * @since 0.1.10
	 * @param SegmentContent[] $cleanedContent An array of items returned by `Cleaner::cleanHtml()`.
	 * @return Segment[] An array of segments, each containing the `CleanedText`s in that segment.
	 * @throws RuntimeException If an item is neither a `CleanedText` nor a `SegmentBreak`.
	 */
	public function segmentSentences( array $cleanedContent ): array {
		$this->segments = [];
		$this->currentSegment = new Segment();
		foreach ( $cleanedContent as $item ) {
			if ( $item instanceof CleanedText ) {
				$this->addContentsToCurrentSegment( $item );
			} elseif ( $item instanceof SegmentBreak ) {
				$this->finishSegment();
			} else {
				throw new RuntimeException( 'Unsupported instance of SegmentContent' );
			}
		}
		if ( $this->currentSegment->getContent() ) {
			// Add the last segment, unless it's empty.
			$this->finishSegment();
		}
		return $this->segments;
	}

	/**
	 * Add segment contents for a string.
	 *
	 * Looks for sentence final strings (strings which a sentence ends
	 * with). When a sentence final string is found, its sentence is
	 * added to the $currentSegment.
	 *
	 * The text is always processed at least once. Processing is then
	 * repeated for as long as at least two characters remain after
	 * the end of the previously added part.
	 *
	 * @since 0.1.10
	 * @param CleanedText $text The text to segment.
	 */
	private function addContentsToCurrentSegment( CleanedText $text ): void {
		$lastOffset = mb_strlen( $text->getString() ) - 1;
		$nextStartOffset = 0;
		do {
			$endOffset = $this->addContentToCurrentSegment( $text, $nextStartOffset );
			// The earliest the next segments can start is one after
			// the end of the current one.
			$nextStartOffset = $endOffset + 1;
		} while ( $nextStartOffset < $lastOffset );
	}

	/**
	 * Add a sentence, or part thereof, to a segment.
	 *
	 * Finds the next sentence by sentence final characters and adds
	 * them to the segment under construction. If no sentence final
	 * character was found, all the remaining text is added. Stores
	 * start offset when the first text of a segment is added and end
	 * offset when the last is.
	 *
	 * @since 0.1.10
	 * @param CleanedText $text The text to segment.
	 * @param int $startOffset The offset where the next sentence can
	 *  start, at the earliest. If the sentence has leading
	 *  whitespaces, this will be moved forward.
	 * @return int The offset of the last character in the
	 *  sentence. If the sentence didn't end yet, this is the last
	 *  character of $text.
	 */
	private function addContentToCurrentSegment(
		CleanedText $text,
		int $startOffset = 0
	): int {
		$string = $text->getString();
		if ( $this->currentSegment->getStartOffset() === null ) {
			// Move the start offset ahead by the number of leading
			// whitespaces. This means that whitespaces before or
			// between segments aren't included. Whitespaces inside
			// an already started segment are kept.
			$startOffset += $this->getLeadingWhitespacesLength(
				mb_substr( $string, $startOffset )
			);
		}
		// Get the offset for the next sentence final character.
		$endOffset = $this->getSentenceFinalOffset( $string, $startOffset );
		// If no sentence final character is found, add the rest of
		// the text and remember that this segment isn't ended.
		$ended = true;
		if ( $endOffset === null ) {
			$endOffset = mb_strlen( $string ) - 1;
			$ended = false;
		}
		$sentence = mb_substr(
			$string,
			$startOffset,
			$endOffset - $startOffset + 1
		);
		if ( $sentence !== '' && $sentence !== "\n" ) {
			// Don't add `CleanedText`s with the empty string or only
			// newline.
			$sentenceText = new CleanedText(
				$sentence,
				$text->getPath()
			);
			$this->currentSegment->addContent( $sentenceText );
			if ( $this->currentSegment->getStartOffset() === null ) {
				// Record the start offset if this is the first text
				// added to the segment.
				$this->currentSegment->setStartOffset( $startOffset );
			}
			$this->currentSegment->setEndOffset( $endOffset );
			if ( $ended ) {
				$this->finishSegment();
			}
		}
		return $endOffset;
	}

	/**
	 * Get the number of whitespaces at the start of a string.
	 *
	 * @since 0.1.10
	 * @param string $string The string to count leading whitespaces
	 *  for.
	 * @return int The number of whitespaces at the start of $string.
	 */
	private function getLeadingWhitespacesLength( string $string ): int {
		// preg_replace() returns null when the regex engine fails, e.g.
		// on malformed UTF-8. Fall back to the untrimmed string so that
		// the whole text isn't counted as whitespace in that case.
		$trimmedString = preg_replace( '/^\s+/u', '', $string ) ?? $string;
		return mb_strlen( $string ) - mb_strlen( $trimmedString );
	}

	/**
	 * Get the offset of the first sentence final character in a string.
	 *
	 * NOTE(review): only dots are searched for here, so '?' and '!'
	 * are never tested even though isSentenceEndingPunctuation()
	 * accepts them. Confirm whether that is intentional before
	 * widening the search, since it changes how text is segmented.
	 *
	 * @since 0.1.10
	 * @param string $string The string to look in.
	 * @param int $offset The offset to start looking from. The
	 *  character at this offset is included in the search.
	 * @return int|null The offset of the first sentence final character
	 *  that was found, if any, else null.
	 */
	private function getSentenceFinalOffset(
		string $string,
		int $offset
	): ?int {
		// Find the first character that may be sentence final.
		$candidate = mb_strpos( $string, '.', $offset );
		while ( $candidate !== false ) {
			if ( $this->isSentenceFinal( $string, $candidate ) ) {
				return $candidate;
			}
			// Not sentence final, e.g. a dot within an abbreviation.
			// Keep looking from the character after it.
			$candidate = mb_strpos( $string, '.', $candidate + 1 );
		}
		// No character that can be sentence final was found.
		return null;
	}

	/**
	 * Test if a character is at the end of a sentence.
	 *
	 * Dots in abbreviations should only be counted when they also are sentence final.
	 * For example:
	 * "Monkeys, penguins etc.", but not "Monkeys e.g. baboons".
	 *
	 * @since 0.1.10
	 * @param string $string The string to check in.
	 * @param int $index The index in $string of the character to check.
	 * @return bool True if the character is sentence final, else false.
	 */
	protected function isSentenceFinal(
		string $string,
		int $index
	): bool {
		$character = mb_substr( $string, $index, 1 );
		if ( !self::isSentenceEndingPunctuation( $character ) ) {
			return false;
		}

		$length = mb_strlen( $string );
		// Null means that there is no character at that position.
		$nextCharacter = $length > $index + 1 ?
			mb_substr( $string, $index + 1, 1 ) :
			null;
		$characterAfterNext = $length > $index + 2 ?
			mb_substr( $string, $index + 2, 1 ) :
			null;

		// A dot is sentence final if it's at the end of string or line
		// or followed by a space and a capital letter.
		//
		// The checks for missing characters must compare against null.
		// A truthiness test would also match the character "0", which
		// would split e.g. "1.0" after the dot.
		if ( $nextCharacter === null || $nextCharacter === "\n" ) {
			return true;
		}
		if ( $nextCharacter !== ' ' ) {
			return false;
		}
		return $characterAfterNext === null || (
			self::isLetter( $characterAfterNext ) &&
			self::isUpper( $characterAfterNext )
		);
	}

	/**
	 * Test if a string is a punctuation character that can end a sentence.
	 *
	 * @since 0.1.10
	 * @param string $string The string to test.
	 * @return bool True if $string is exactly '.', '?' or '!', else false.
	 */
	private static function isSentenceEndingPunctuation( string $string ): bool {
		return $string === '.' ||
			$string === '?' ||
			$string === '!';
	}

	/**
	 * Test if a string is upper case.
	 *
	 * A string counts as upper case when converting it to upper case
	 * leaves it unchanged, so strings without any cased letters also
	 * pass this test.
	 *
	 * @since 0.1.10
	 * @param string $string The string to test.
	 * @return bool true if the entire string is upper case, else false.
	 */
	private static function isUpper( string $string ): bool {
		return mb_strtoupper( $string ) === $string;
	}

	/**
	 * Test if a string is an alphabetical letter of any language.
	 *
	 * @since 0.1.10
	 * @param string $string The string to test.
	 * @return bool true if the entire string is a single alphabetical
	 *  letter, else false.
	 */
	private static function isLetter( string $string ): bool {
		// preg_match() returns 1, 0 or false. Only a match counts.
		return preg_match( '/^\p{L}$/u', $string ) === 1;
	}

	/**
	 * Add the current segment to the array of segments.
	 *
	 * The segment is only added, and given a hash, if it has any
	 * content. Creates a new, empty segment as the new current segment
	 * in either case.
	 *
	 * @since 0.1.10
	 */
	private function finishSegment(): void {
		if ( count( $this->currentSegment->getContent() ) ) {
			$this->currentSegment->setHash( $this->evaluateHash( $this->currentSegment ) );
			$this->segments[] = $this->currentSegment;
		}
		// Create a fresh segment to add following text to.
		$this->currentSegment = new Segment();
	}
}