MediaWiki REL1_34
TextTruncator.php
Go to the documentation of this file.
1<?php
2
3namespace TextExtracts;
4
6
15
/**
 * @var TidyDriverBase|null Driver used to fix possibly unclosed HTML tags in
 *  truncated text, or null when no clean-up should be performed.
 */
private $tidyDriver;

/**
 * @param TidyDriverBase|null $tidy Optional clean-up driver. When null (the
 *  default), truncated text is only trimmed and never passed through a driver.
 */
public function __construct( ?TidyDriverBase $tidy = null ) {
	// The nullable type is spelled out explicitly: relying on "= null" to make
	// a typed parameter nullable is deprecated as of PHP 8.4.
	$this->tidyDriver = $tidy;
}
27
/**
 * Returns no more than the given number of sentences.
 *
 * Sentence ends are located with a regular expression. If the text contains
 * no recognisable sentence end at all, the trimmed first line of the text is
 * returned instead (without being passed through the tidy driver).
 *
 * @param string $text Text to truncate
 * @param int $requestedSentenceCount Maximum number of sentences to keep;
 *  values of zero or less yield an empty string
 * @return string
 */
public function getFirstSentences( $text, $requestedSentenceCount ) {
	if ( $requestedSentenceCount <= 0 ) {
		return '';
	}

	// Based on code from OpenSearchXml by Brion Vibber
	$endchars = [
		// regular ASCII: a full stop only counts when the character before it
		// is not an uppercase letter (presumably to skip initials such as
		// "J. Smith"), and the terminator must be followed by a space, a
		// newline or the end of the text
		'\P{Lu}\.(?=[ \n]|$)',
		'[!?](?=[ \n]|$)',
		// full-width ideographic full-stop (U+3002)
		'。',
		// double-width roman forms (U+FF0E, U+FF01, U+FF1F). These must be the
		// full-width code points: a bare ASCII "." would match any character
		// and a bare "?" would make the whole pattern invalid.
		'．',
		'！',
		'？',
		// half-width ideographic full stop (U+FF61)
		'｡',
	];

	// The trailing "+" swallows runs of terminators such as "?!" or "..."
	$regexp = '/(?:' . implode( '|', $endchars ) . ')+/u';
	$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

	// $res is 0 when nothing matched and false when matching failed
	if ( !$res ) {
		// Just return the first line
		$lines = explode( "\n", $text, 2 );
		return trim( $lines[0] );
	}

	// Pick the last requested sentence end, or the last one available
	$index = min( $requestedSentenceCount, $res ) - 1;
	// With PREG_OFFSET_CAPTURE every match is a [ matched text, byte offset ] pair
	list( $tail, $offset ) = $matches[0][$index];
	// PCRE returns raw offsets, so using substr() instead of mb_substr()
	$text = substr( $text, 0, $offset ) . $tail;

	return $this->tidy( $text );
}
71
/**
 * Returns no more than a requested number of characters, preserving words.
 *
 * Text that already fits is returned untouched. Otherwise the text is cut
 * after the requested number of characters and then extended by whatever
 * word characters and slashes directly follow the cut, plus one optional
 * ">" (presumably so that a word or an HTML tag is not split in the
 * middle). The result may therefore be longer than requested.
 *
 * @param string $text Text to truncate
 * @param int $requestedLength Number of characters to keep; values of zero
 *  or less yield an empty string
 * @return string
 */
public function getFirstChars( $text, $requestedLength ) {
	if ( $requestedLength <= 0 ) {
		return '';
	}

	if ( mb_strlen( $text ) <= $requestedLength ) {
		return $text;
	}

	$head = mb_substr( $text, 0, $requestedLength );
	$rest = mb_substr( $text, $requestedLength );

	// Every part of this pattern is optional, so it can match the empty
	// string at the start of the remainder.
	preg_match( '/^[\w\/]*>?/su', $rest, $overrun );

	return $this->tidy( $head . $overrun[0] );
}
96
/**
 * Cleans up a truncated piece of text.
 *
 * When a tidy driver was supplied, the text is passed through it first to
 * fix possibly unclosed HTML tags. The result is always trimmed.
 *
 * @param string $text Text to clean up
 * @return string
 */
private function tidy( $text ) {
	$driver = $this->tidyDriver;

	if ( !$driver ) {
		// No driver available, trimming is all that can be done
		return trim( $text );
	}

	return trim( $driver->tidy( $text ) );
}
109
110}
Base class for HTML cleanup utilities.
This class needs to understand HTML as well as plain text.
__construct(TidyDriverBase $tidy=null)
getFirstSentences( $text, $requestedSentenceCount)
Returns no more than the given number of sentences.
TidyDriverBase|null $tidyDriver
getFirstChars( $text, $requestedLength)
Returns no more than a requested number of characters, preserving words.
$lines
Definition router.php:61