MediaWiki  1.34.0
TextTruncator.php
Go to the documentation of this file.
1 <?php
2 
3 namespace TextExtracts;
4 
6 
15 
/**
 * @var TidyDriverBase|null Optional driver used by tidy() to repair the HTML of
 *  truncated text; null means no HTML cleanup is performed.
 */
private $tidyDriver;

/**
 * @param TidyDriverBase|null $tidy Optional HTML cleanup driver. When omitted,
 *  truncated text is only trimmed, never repaired.
 */
public function __construct( ?TidyDriverBase $tidy = null ) {
	// The nullable type is spelled out explicitly: relying on the "= null" default
	// to make the parameter nullable is deprecated as of PHP 8.4.
	$this->tidyDriver = $tidy;
}
27 
/**
 * Returns no more than the given number of sentences.
 *
 * Sentence ends are detected with a regular expression. If no sentence end is
 * found at all, the first line of the text is returned instead.
 *
 * @param string $text Text (plain or HTML) to take the sentences from
 * @param int $requestedSentenceCount Maximum number of sentences to return
 * @return string Empty string when $requestedSentenceCount is not positive
 */
public function getFirstSentences( $text, $requestedSentenceCount ) {
	if ( $requestedSentenceCount <= 0 ) {
		return '';
	}

	// Based on code from OpenSearchXml by Brion Vibber.
	// Non-ASCII terminators are written as \x{...} escapes (valid because the
	// pattern uses the /u modifier) so they cannot be confused with, or silently
	// normalised into, the ASCII metacharacters "." and "?".
	$endchars = [
		// regular ASCII: a full stop that does not directly follow an uppercase
		// letter (the preceding character is part of the match), or "!" / "?",
		// each followed by a space, a newline or the end of the text
		'\P{Lu}\.(?=[ \n]|$)',
		'[!?](?=[ \n]|$)',
		// full-width ideographic full stop (U+3002)
		'\x{3002}',
		// full-width roman forms: full stop (U+FF0E), exclamation mark (U+FF01),
		// question mark (U+FF1F)
		'\x{FF0E}',
		'\x{FF01}',
		'\x{FF1F}',
		// half-width ideographic full stop (U+FF61)
		'\x{FF61}',
	];

	$regexp = '/(?:' . implode( '|', $endchars ) . ')+/u';
	$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

	// $res is 0 when nothing matched and false when matching failed; both fall back
	// to the first line.
	if ( !$res ) {
		// Just return the first line
		$lines = explode( "\n", $text, 2 );
		return trim( $lines[0] );
	}

	// Pick the last sentence end we are allowed to include.
	$index = min( $requestedSentenceCount, $res ) - 1;
	list( $tail, $offset ) = $matches[0][$index];
	// PCRE returns raw byte offsets, so using substr() instead of mb_substr().
	// $offset is where the terminator match starts; appending $tail keeps it.
	$text = substr( $text, 0, $offset ) . $tail;

	return $this->tidy( $text );
}
71 
/**
 * Returns roughly the requested number of characters without cutting a word.
 *
 * The text is cut after $requestedLength characters and then extended by whatever
 * word characters, slashes and one optional ">" directly follow the cut, so a word
 * or a tag that straddles the cut is kept whole.
 *
 * @param string $text Text (plain or HTML) to truncate
 * @param int $requestedLength Number of characters to keep before extending
 * @return string Empty string when $requestedLength is not positive; the text
 *  unchanged when it is already short enough
 */
public function getFirstChars( $text, $requestedLength ) {
	if ( $requestedLength <= 0 ) {
		return '';
	}

	if ( mb_strlen( $text ) <= $requestedLength ) {
		// Nothing to cut
		return $text;
	}

	$head = mb_substr( $text, 0, $requestedLength );
	$rest = mb_substr( $text, $requestedLength );

	// Every part of this pattern is optional, so on a successful match the result
	// may simply be an empty string.
	preg_match( '/^[\w\/]*>?/su', $rest, $overflow );

	return $this->tidy( $head . $overflow[0] );
}
96 
/**
 * Cleans up truncated text: repairs the HTML if a tidy driver was provided, then
 * strips leading and trailing whitespace.
 *
 * @param string $text Possibly truncated text
 * @return string
 */
private function tidy( $text ) {
	// The driver fixes tags that truncation may have left unclosed; without a
	// driver the text is passed through untouched.
	$cleaned = $this->tidyDriver ? $this->tidyDriver->tidy( $text ) : $text;

	return trim( $cleaned );
}
109 
110 }
TextExtracts\TextTruncator\getFirstChars
getFirstChars( $text, $requestedLength)
Returns no more than a requested number of characters, preserving words.
Definition: TextTruncator.php:79
$res
$res
Definition: testCompression.php:52
TextExtracts\TextTruncator\$tidyDriver
TidyDriverBase null $tidyDriver
Definition: TextTruncator.php:19
TextExtracts\TextTruncator\__construct
__construct(TidyDriverBase $tidy=null)
Definition: TextTruncator.php:24
$matches
$matches
Definition: NoLocalSettings.php:24
$lines
$lines
Definition: router.php:61
TextExtracts\TextTruncator
This class needs to understand HTML as well as plain text.
Definition: TextTruncator.php:14
MediaWiki\Tidy\TidyDriverBase
Base class for HTML cleanup utilities.
Definition: TidyDriverBase.php:8
TextExtracts
Definition: ApiQueryExtracts.php:3
TextExtracts\TextTruncator\tidy
tidy( $text)
Definition: TextTruncator.php:101
TextExtracts\TextTruncator\getFirstSentences
getFirstSentences( $text, $requestedSentenceCount)
Returns no more than the given number of sentences.
Definition: TextTruncator.php:35