MediaWiki REL1_34
TextTruncatorTest.php
Go to the documentation of this file.
<?php

namespace TextExtracts\Test;

// NOTE(review): TidyDriverBase is assumed to live in MediaWiki\Tidy for this
// release branch — confirm against core before merging.
use MediaWiki\Tidy\TidyDriverBase;
use TextExtracts\TextTruncator;

/**
 * Unit tests for TextTruncator's sentence-based and character-based truncation.
 *
 * @covers \TextExtracts\TextTruncator
 * @group TextExtracts
 */
class TextTruncatorTest extends \PHPUnit\Framework\TestCase {
	use \PHPUnit4And6Compat;

	/**
	 * Checks that getFirstSentences() returns the expected leading sentences.
	 *
	 * @dataProvider provideGetFirstSentences
	 * @param string $text Input text handed to the truncator
	 * @param int $sentences Number of sentences requested
	 * @param string $expected Exact string the truncator must return
	 */
	public function testGetFirstSentences( $text, $sentences, $expected ) {
		$truncator = new TextTruncator();
		$this->assertSame( $expected, $truncator->getFirstSentences( $text, $sentences ) );
	}

	/**
	 * Data sets for testGetFirstSentences().
	 *
	 * @return array[] Each entry is [ $text, $sentences, $expected ]
	 */
	public function provideGetFirstSentences() {
		// One very long line without any sentence terminator (see T145231 below)
		$longLine = str_repeat( 'word ', 1000000 );
		return [
			[
				'Foo is a bar. Such a smart boy. But completely useless.',
				2,
				'Foo is a bar. Such a smart boy.',
			],
			[
				'Foo is a bar. Such a smart boy. But completely useless.',
				1,
				'Foo is a bar.',
			],
			[
				'Foo is a bar. Such a smart boy.',
				2,
				'Foo is a bar. Such a smart boy.',
			],
			[
				'Foo is a bar.',
				1,
				'Foo is a bar.',
			],
			[
				'Foo is a bar.',
				2,
				'Foo is a bar.',
			],
			[
				'',
				1,
				'',
			],
			'0 sentences mean empty result' => [
				'Foo is a bar. Such a smart boy.',
				0,
				'',
			],
			"Don't explode on negative input" => [
				'Foo is a bar. Such a smart boy.',
				-1,
				'',
			],
			'More sentences requested than is available' => [
				'Foo is a bar. Such a smart boy.',
				3,
				'Foo is a bar. Such a smart boy.',
			],
			// Exclamation points too!!!
			[
				'Foo is a bar! Such a smart boy! But completely useless!',
				1,
				'Foo is a bar!',
			],
			// A tricky one
			[
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with. " .
					"Polyvinyl acetate, however, is another story.",
				1,
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
			],
			// No clear sentences
			[
				"foo\nbar\nbaz",
				2,
				'foo',
			],
			// Bug T118621
			[
				'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
				1,
				'Foo was born in 1977.',
			],
			// Bug T115795 - Test no cropping after initials
			[
				'P.J. Harvey is a singer. She is awesome!',
				1,
				'P.J. Harvey is a singer.',
			],
			// Bug T115817 - Non-breaking space is not a delimiter
			[
				html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds. ' .
					'They primarily feed on seeds.' ),
				1,
				html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
			],
			// Bug T145231 - various problems with regexes
			[
				$longLine,
				3,
				trim( $longLine ),
			],
			[
				str_repeat( 'Sentence. ', 70000 ),
				65536,
				trim( str_repeat( 'Sentence. ', 65536 ) ),
			],

			'Preserve whitespace before end character' => [
				'Aa . Bb',
				1,
				'Aa .',
			],
		];
	}

	/**
	 * Checks that getFirstChars() returns the expected leading part of the text.
	 *
	 * @dataProvider provideGetFirstChars
	 * @param string $text Input text handed to the truncator
	 * @param int $chars Number of characters requested
	 * @param string $expected Exact string the truncator must return
	 */
	public function testGetFirstChars( $text, $chars, $expected ) {
		$truncator = new TextTruncator();
		$this->assertSame( $expected, $truncator->getFirstChars( $text, $chars ) );
	}

	/**
	 * Data sets for testGetFirstChars().
	 *
	 * @return array[] Each entry is [ $text, $chars, $expected ]
	 */
	public function provideGetFirstChars() {
		$text = 'Lullzy lulz are lullzy!';
		$html = 'foo<tag>bar</tag>';
		$longText = str_repeat( 'тест ', 50000 );
		// Character 65536 falls inside the 13108th five-character repetition;
		// the expected output runs to the end of that word.
		$longTextExpected = trim( str_repeat( 'тест ', 13108 ) );

		return [
			[ $text, -8, '' ],
			[ $text, 0, '' ],
			[ $text, 100, $text ],
			[ $text, 1, 'Lullzy' ],
			[ $text, 6, 'Lullzy' ],
			// [ $text, 7, 'Lullzy' ],
			[ $text, 8, 'Lullzy lulz' ],
			// HTML processing
			[ $html, 1, 'foo' ],
			// let HTML sanitizer clean it up later
			[ $html, 4, 'foo<tag>' ],
			[ $html, 12, 'foo<tag>bar</tag>' ],
			[ $html, 13, 'foo<tag>bar</tag>' ],
			[ $html, 16, 'foo<tag>bar</tag>' ],
			[ $html, 17, 'foo<tag>bar</tag>' ],
			// T143178 - previously, characters were extracted using regexps which failed when
			// requesting 64K chars or more.
			[ $longText, 65536, $longTextExpected ],
		];
	}

	/**
	 * Checks that a tidy driver passed to the constructor is applied to the
	 * output of both getFirstSentences() and getFirstChars().
	 */
	public function testTidyIntegration() {
		// Mocked driver that wraps whatever it is given, so its use is visible in the result
		$tidy = $this->createMock( TidyDriverBase::class );
		$tidy->method( 'tidy' )
			->willReturnCallback( function ( $text ) {
				return "<tidy>$text</tidy>";
			} );
		$truncator = new TextTruncator( $tidy );

		$text = 'Aa. Bb.';
		$this->assertSame( '<tidy>Aa.</tidy>', $truncator->getFirstSentences( $text, 1 ) );
		$this->assertSame( '<tidy>Aa</tidy>', $truncator->getFirstChars( $text, 1 ) );
	}

}
Base class for HTML cleanup utilities.
@covers \TextExtracts\TextTruncator @group TextExtracts
testGetFirstSentences( $text, $sentences, $expected)
@dataProvider provideGetFirstSentences
testGetFirstChars( $text, $chars, $expected)
@dataProvider provideGetFirstChars
This class needs to understand HTML as well as plain text.
if(PHP_SAPI !=='cli' &&PHP_SAPI !=='phpdbg' $chars)