Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
35 / 35 |
|
100.00% |
4 / 4 |
CRAP | |
100.00% |
1 / 1 |
TextTruncator | |
100.00% |
35 / 35 |
|
100.00% |
4 / 4 |
10 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getFirstSentences | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
3 | |||
getFirstChars | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
4 | |||
tidy | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\TextExtracts; |
4 | |
5 | use MediaWiki\MediaWikiServices; |
6 | |
/**
 * Shortens text to a requested number of sentences or characters.
 *
 * This class needs to understand HTML as well as plain text. It tries to not break HTML tags, but
 * might break pairs of tags, leaving unclosed tags behind. We can tidy the output to fix this.
 *
 * @license GPL-2.0-or-later
 */
class TextTruncator {
	/**
	 * @var bool Whether to tidy the output
	 */
	private $useTidy;

	/**
	 * @param bool $useTidy Whether truncated output should be passed through the tidy service
	 */
	public function __construct( bool $useTidy ) {
		$this->useTidy = $useTidy;
	}

	/**
	 * Returns no more than the given number of sentences
	 *
	 * If no sentence terminator can be found (this includes the case where the regular
	 * expression engine rejects the input), the trimmed first line of the text is returned
	 * instead, without tidying.
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedSentenceCount Maximum number of sentences to extract
	 * @return string Empty string when the requested count is zero or negative
	 */
	public function getFirstSentences( $text, $requestedSentenceCount ) {
		if ( $requestedSentenceCount <= 0 ) {
			return '';
		}

		// Based on code from OpenSearchXml by Brion Vibber
		$endchars = [
			// regular ASCII: a dot that does not follow an uppercase letter (so initials and
			// abbreviations like "J." do not count), or "!"/"?", each followed by a space, a
			// newline, or the end of the text
			'\P{Lu}\.(?=[ \n]|$)',
			'[!?](?=[ \n]|$)',
			// full-width ideographic full-stop
			'。',
			// double-width roman forms
			'.',
			'!',
			'?',
			// half-width ideographic full stop
			'。',
		];

		// The trailing "+" folds a run of consecutive terminators into a single sentence end
		$regexp = '/(?:' . implode( '|', $endchars ) . ')+/u';
		$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

		// Covers both "no match" (0) and "PCRE error" (false)
		if ( !$res ) {
			// Just return the first line
			$lines = explode( "\n", $text, 2 );
			return trim( $lines[0] );
		}

		// Use the last available terminator if the text has fewer sentences than requested
		$index = min( $requestedSentenceCount, $res ) - 1;
		[ $tail, $offset ] = $matches[0][$index];
		// PCRE returns raw byte offsets, so using substr() instead of mb_substr()
		$text = substr( $text, 0, $offset ) . $tail;

		return $this->tidy( $text );
	}

	/**
	 * Returns no more than a requested number of characters, preserving words
	 *
	 * The cut is extended to the right so that a word (or the rest of an HTML tag) that straddles
	 * the requested length is kept whole; the result may therefore be slightly longer than
	 * requested. Text that already fits is returned unchanged, without trimming or tidying.
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedLength Maximum number of characters to return
	 * @return string Empty string when the requested length is zero or negative
	 */
	public function getFirstChars( $text, $requestedLength ) {
		if ( $requestedLength <= 0 ) {
			return '';
		}

		if ( mb_strlen( $text ) <= $requestedLength ) {
			return $text;
		}

		// Every quantifier in this pattern is optional, so it matches any valid UTF-8 subject,
		// possibly with an empty result: the rest of the current word or tag name, plus a
		// closing ">" when there is one.
		$pattern = '/^[\w\/]*>?/su';
		preg_match( $pattern, mb_substr( $text, $requestedLength ), $m );
		// preg_match() fails and leaves $m empty when the subject is not valid UTF-8; in that
		// case cut exactly at the requested length instead of reading a missing array key.
		$truncatedText = mb_substr( $text, 0, $requestedLength ) . ( $m[0] ?? '' );
		if ( $truncatedText === $text ) {
			return $text;
		}

		return $this->tidy( $truncatedText );
	}

	/**
	 * Trims the text, first running it through the tidy service when tidying is enabled
	 *
	 * @param string $text
	 * @return string
	 */
	private function tidy( $text ) {
		if ( $this->useTidy ) {
			$text = MediaWikiServices::getInstance()->getTidy()->tidy( $text );
		}

		return trim( $text );
	}

}