Code Coverage for /workspace/src/extensions/TextExtracts/includes/TextTruncator.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	100.00% covered (success)	100.00%	35 / 35	100.00% covered (success)	100.00%	4 / 4	CRAP	100.00% covered (success)	100.00%	1 / 1
TextTruncator	100.00% covered (success)	100.00%	35 / 35	100.00% covered (success)	100.00%	4 / 4	10	100.00% covered (success)	100.00%	1 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getFirstSentences	100.00% covered (success)	100.00%	20 / 20	100.00% covered (success)	100.00%	1 / 1	3
getFirstChars	100.00% covered (success)	100.00%	11 / 11	100.00% covered (success)	100.00%	1 / 1	4
tidy	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2

1	<?php
2
3	namespace MediaWiki\Extension\TextExtracts;
4
5	use MediaWiki\MediaWikiServices;
6
7	/**
8	* This class needs to understand HTML as well as plain text. It tries to not break HTML tags, but
9	* might break pairs of tags, leaving unclosed tags behind. We can tidy the output to fix
10	* this.
11	*
12	* @license GPL-2.0-or-later
13	*/
class TextTruncator {
	/**
	 * @var bool Whether to tidy the output
	 */
	private $useTidy;

	/**
	 * @param bool $useTidy Whether truncated output is run through the tidy service
	 */
	public function __construct( bool $useTidy ) {
		$this->useTidy = $useTidy;
	}

	/**
	 * Returns no more than the given number of sentences
	 *
	 * If no sentence terminator is found at all, the first line of the text is returned
	 * (trimmed, and without being tidied).
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedSentenceCount Maximum number of sentences to extract
	 * @return string
	 */
	public function getFirstSentences( $text, $requestedSentenceCount ) {
		if ( $requestedSentenceCount <= 0 ) {
			return '';
		}

		// Based on code from OpenSearchXml by Brion Vibber
		$endchars = [
			// regular ASCII: a full stop that does not follow an uppercase letter (so that
			// initials such as "P.J." are skipped), followed by whitespace or end of text
			'\P{Lu}\.(?=[ \n]|$)',
			'[!?](?=[ \n]|$)',
			// full-width ideographic full-stop
			'。',
			// double-width roman forms
			'．',
			'！',
			'？',
			// half-width ideographic full stop
			'｡',
		];

		// The alternatives must be joined with a bare "|". An escaped "\|" would be a
		// literal pipe character and the pattern would never match a sentence end.
		$regexp = '/(?:' . implode( '|', $endchars ) . ')+/u';
		$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

		// Covers both "no match" (0) and a PCRE failure (false)
		if ( !$res ) {
			// Just return the first line
			$lines = explode( "\n", $text, 2 );
			return trim( $lines[0] );
		}

		// Pick the terminator of the last sentence we are allowed to return
		$index = min( $requestedSentenceCount, $res ) - 1;
		[ $terminator, $offset ] = $matches[0][$index];
		// PCRE returns raw offsets, so using substr() instead of mb_substr()
		$text = substr( $text, 0, $offset ) . $terminator;

		return $this->tidy( $text );
	}

	/**
	 * Returns no more than a requested number of characters, preserving words
	 *
	 * The cut is extended to the end of the word (or HTML tag) it falls into, so the result
	 * can be slightly longer than the requested length.
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedLength Maximum number of characters to return
	 * @return string
	 */
	public function getFirstChars( $text, $requestedLength ) {
		if ( $requestedLength <= 0 ) {
			return '';
		}

		$length = mb_strlen( $text );
		if ( $length <= $requestedLength ) {
			return $text;
		}

		// Every element of this pattern is optional, so it always matches; it just might
		// match an empty string. It consumes the rest of the word the cut landed in (word
		// characters and slashes) plus an optional closing ">".
		$pattern = '/^[\w\/]*>?/su';
		preg_match( $pattern, mb_substr( $text, $requestedLength ), $m );
		// Fall back to an empty suffix if PCRE failed and left $m empty
		$truncatedText = mb_substr( $text, 0, $requestedLength ) . ( $m[0] ?? '' );
		if ( $truncatedText === $text ) {
			// Nothing was cut off, return the input untouched
			return $text;
		}

		return $this->tidy( $truncatedText );
	}

	/**
	 * Optionally tidies the text (only when enabled in the constructor), then trims it
	 *
	 * @param string $text
	 * @return string
	 */
	private function tidy( $text ) {
		if ( $this->useTidy ) {
			$text = MediaWikiServices::getInstance()->getTidy()->tidy( $text );
		}

		return trim( $text );
	}

}