MediaWiki  1.34.0
TextTruncatorTest.php
Go to the documentation of this file.
1 <?php
2 
3 namespace TextExtracts\Test;
4 
7 
/**
 * Tests for the sentence-based and character-based truncation offered by
 * \TextExtracts\TextTruncator, with and without a tidy driver.
 *
 * @covers \TextExtracts\TextTruncator
 * @group TextExtracts
 */
class TextTruncatorTest extends \PHPUnit\Framework\TestCase {
	use \PHPUnit4And6Compat;

	/**
	 * @dataProvider provideGetFirstSentences
	 * @param string $text Input text handed to the truncator
	 * @param int $sentences Number of sentences requested
	 * @param string $expected Exact string the truncator must return
	 */
	public function testGetFirstSentences( $text, $sentences, $expected ) {
		// Fully qualified so the test does not depend on a file-level import.
		$truncator = new \TextExtracts\TextTruncator();
		$this->assertSame( $expected, $truncator->getFirstSentences( $text, $sentences ) );
	}

	/**
	 * Data sets for testGetFirstSentences().
	 *
	 * @return array[] Each entry is [ input text, requested sentence count, expected output ]
	 */
	public function provideGetFirstSentences() {
		// One very long line without any sentence terminator (see T145231 below).
		$longLine = str_repeat( 'word ', 1000000 );
		return [
			[
				'Foo is a bar. Such a smart boy. But completely useless.',
				2,
				'Foo is a bar. Such a smart boy.',
			],
			[
				'Foo is a bar. Such a smart boy. But completely useless.',
				1,
				'Foo is a bar.',
			],
			[
				'Foo is a bar. Such a smart boy.',
				2,
				'Foo is a bar. Such a smart boy.',
			],
			[
				'Foo is a bar.',
				1,
				'Foo is a bar.',
			],
			[
				'Foo is a bar.',
				2,
				'Foo is a bar.',
			],
			[
				'',
				1,
				'',
			],
			'0 sentences mean empty result' => [
				'Foo is a bar. Such a smart boy.',
				0,
				'',
			],
			"Don't explode on negative input" => [
				'Foo is a bar. Such a smart boy.',
				-1,
				'',
			],
			'More sentences requested than is available' => [
				'Foo is a bar. Such a smart boy.',
				3,
				'Foo is a bar. Such a smart boy.',
			],
			// Exclamation points too!!!
			[
				'Foo is a bar! Such a smart boy! But completely useless!',
				1,
				'Foo is a bar!',
			],
			// A tricky one
			[
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with. " .
					"Polyvinyl acetate, however, is another story.",
				1,
				"Acid phosphatase (EC 3.1.3.2) is a chemical you don't want to mess with.",
			],
			// No clear sentences
			[
				"foo\nbar\nbaz",
				2,
				'foo',
			],
			// Bug T118621
			[
				'Foo was born in 1977. He enjoys listening to Siouxsie and the Banshees.',
				1,
				'Foo was born in 1977.',
			],
			// Bug T115795 - Test no cropping after initials
			[
				'P.J. Harvey is a singer. She is awesome!',
				1,
				'P.J. Harvey is a singer.',
			],
			// Bug T115817 - Non-breaking space is not a delimiter
			[
				html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds. ' .
					'They primarily feed on seeds.' ),
				1,
				html_entity_decode( 'Pigeons (lat.&nbsp;Columbidae) are birds.' ),
			],
			// Bug T145231 - various problems with regexes
			[
				$longLine,
				3,
				trim( $longLine ),
			],
			[
				str_repeat( 'Sentence. ', 70000 ),
				65536,
				trim( str_repeat( 'Sentence. ', 65536 ) ),
			],

			'Preserve whitespace before end character' => [
				'Aa . Bb',
				1,
				'Aa .',
			],
		];
	}

	/**
	 * @dataProvider provideGetFirstChars
	 * @param string $text Input text handed to the truncator
	 * @param int $chars Number of characters requested
	 * @param string $expected Exact string the truncator must return
	 */
	public function testGetFirstChars( $text, $chars, $expected ) {
		// Fully qualified so the test does not depend on a file-level import.
		$truncator = new \TextExtracts\TextTruncator();
		$this->assertSame( $expected, $truncator->getFirstChars( $text, $chars ) );
	}

	/**
	 * Data sets for testGetFirstChars().
	 *
	 * @return array[] Each entry is [ input text, requested character count, expected output ]
	 */
	public function provideGetFirstChars() {
		$text = 'Lullzy lulz are lullzy!';
		$html = 'foo<tag>bar</tag>';
		// Multi-byte (Cyrillic) input, used for the 64K-character case below.
		$longText = str_repeat( 'тест ', 50000 );
		$longTextExpected = trim( str_repeat( 'тест ', 13108 ) );

		return [
			[ $text, -8, '' ],
			[ $text, 0, '' ],
			[ $text, 100, $text ],
			[ $text, 1, 'Lullzy' ],
			[ $text, 6, 'Lullzy' ],
			// NOTE(review): the following case is disabled; the reason is not visible here.
			// [ $text, 7, 'Lullzy' ],
			[ $text, 8, 'Lullzy lulz' ],
			// HTML processing
			[ $html, 1, 'foo' ],
			// let HTML sanitizer clean it up later
			[ $html, 4, 'foo<tag>' ],
			[ $html, 12, 'foo<tag>bar</tag>' ],
			[ $html, 13, 'foo<tag>bar</tag>' ],
			[ $html, 16, 'foo<tag>bar</tag>' ],
			[ $html, 17, 'foo<tag>bar</tag>' ],
			// T143178 - previously, characters were extracted using regexps which failed when
			// requesting 64K chars or more.
			[ $longText, 65536, $longTextExpected ],
		];
	}

	/**
	 * When a tidy driver is passed to the constructor, the results of both
	 * truncation methods are expected to be run through its tidy() method.
	 */
	public function testTidyIntegration() {
		// Mock driver that wraps whatever it receives, so its use is visible in the output.
		$tidy = $this->createMock( \MediaWiki\Tidy\TidyDriverBase::class );
		$tidy->method( 'tidy' )
			->willReturnCallback( function ( $text ) {
				return "<tidy>$text</tidy>";
			} );
		$truncator = new \TextExtracts\TextTruncator( $tidy );

		$text = 'Aa. Bb.';
		$this->assertSame( '<tidy>Aa.</tidy>', $truncator->getFirstSentences( $text, 1 ) );
		$this->assertSame( '<tidy>Aa</tidy>', $truncator->getFirstChars( $text, 1 ) );
	}

}
TextExtracts\Test\TextTruncatorTest\testGetFirstChars
testGetFirstChars( $text, $chars, $expected)
@dataProvider provideGetFirstChars
Definition: TextTruncatorTest.php:140
TextExtracts\Test
Definition: ApiQueryExtractsTest.php:3
TextExtracts\Test\TextTruncatorTest\testTidyIntegration
testTidyIntegration()
Definition: TextTruncatorTest.php:173
TextExtracts\Test\TextTruncatorTest\provideGetFirstChars
provideGetFirstChars()
Definition: TextTruncatorTest.php:145
$chars
if(PHP_SAPI !=='cli' &&PHP_SAPI !=='phpdbg') $chars
Definition: make-tables.php:8
TextExtracts\TextTruncator
This class needs to understand HTML as well as plain text.
Definition: TextTruncator.php:14
MediaWiki\Tidy\TidyDriverBase
Base class for HTML cleanup utilities.
Definition: TidyDriverBase.php:8
TextExtracts\Test\TextTruncatorTest\provideGetFirstSentences
provideGetFirstSentences()
Definition: TextTruncatorTest.php:28
TextExtracts\Test\TextTruncatorTest
@covers \TextExtracts\TextTruncator @group TextExtracts
Definition: TextTruncatorTest.php:14
TextExtracts\Test\TextTruncatorTest\testGetFirstSentences
testGetFirstSentences( $text, $sentences, $expected)
@dataProvider provideGetFirstSentences
Definition: TextTruncatorTest.php:23