Code Coverage for /workspace/src/extensions/ActiveAbstract/includes/AbstractFilter.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	6.62% covered (danger)	6.62%	9 / 136	26.67% covered (danger)	26.67%	4 / 15	CRAP	0.00% covered (danger)	0.00%	0 / 1
AbstractFilter	6.67% covered (danger)	6.67%	9 / 135	26.67% covered (danger)	26.67%	4 / 15	761.73	0.00% covered (danger)	0.00%	0 / 1
__construct	80.00% covered (warning)	80.00%	4 / 5	0.00% covered (danger)	0.00%	0 / 1	3.07
register	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
writeOpenStream	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
writeCloseStream	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
writeOpenPage	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	2
variant	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	6
writeClosePage	0.00% covered (danger)	0.00%	0 / 31	0.00% covered (danger)	0.00%	0 / 1	72
getText	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	12
extractAbstract	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
stripMarkup	0.00% covered (danger)	0.00%	0 / 17	0.00% covered (danger)	0.00%	0 / 1	2
extractStart	0.00% covered (danger)	0.00%	0 / 15	0.00% covered (danger)	0.00%	0 / 1	6
sectionLinks	0.00% covered (danger)	0.00%	0 / 17	0.00% covered (danger)	0.00%	0 / 1	6
categoryLinks	0.00% covered (danger)	0.00%	0 / 13	0.00% covered (danger)	0.00%	0 / 1	6
formatLink	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
writeRevision	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1

1	<?php
2	/**
3	* Generate XML feed for Yahoo's Active Abstracts project
4	* Plugin for dumpBackup.php; call as eg:
5	*
6	* php dumpBackup.php \
7	* --plugin=AbstractFilter \
8	* --current \
9	* --output=gzip:/dumps/abstract.xml.gz \
10	* --filter=namespace:NS_MAIN \
11	* --filter=noredirect \
12	* --filter=abstract
13	*
14	* Can optionally convert output text to a given language variant:
15	* --filter=abstract:variant=zh-cn
16	*/
17
18	namespace MediaWiki\Extension\ActiveAbstract;
19
20	use BackupDumper;
21	use ExportProgressFilter;
22	use InvalidArgumentException;
23	use MediaWiki\MediaWikiServices;
24	use MediaWiki\Parser\Sanitizer;
25	use MediaWiki\Revision\SlotRecord;
26	use MediaWiki\Title\Title;
27	use MWException;
28	use RuntimeException;
29	use stdClass;
30	use TextContent;
31	use UtfNormal\Validator;
32	use Xml;
33
/**
 * Dump filter that tosses away the regular MediaWiki export XML and generates
 * new output: a single <feed> element containing one <doc> per page, each with
 * a title, canonical URL, plain-text abstract and a list of navigation links.
 *
 * Everything that is produced is forwarded to the sink given to the
 * constructor, through the same write*() callbacks this class exposes.
 */
class AbstractFilter {

	/** Upper bound, in bytes, of the wikitext inspected when stripping markup. */
	private const MAX_SOURCE_BYTES = 4096;

	/** Upper bound, in bytes, of the abstract text that is emitted. */
	private const MAX_ABSTRACT_BYTES = 1024;

	/** Upper bound of an emitted link URL, as defined in Yahoo's .xsd. */
	private const MAX_URL_LENGTH = 1024;

	/** @var ExportProgressFilter Next stage of the dump pipeline; receives the generated XML */
	protected $sink;

	/** @var string|false Target language variant code, or false for no conversion */
	private $variant;

	/** @var Title|null Title of the page currently being written */
	protected $title;

	/** @var stdClass|null Revision row remembered by writeRevision() for the current page */
	protected $revision;

	/**
	 * @param ExportProgressFilter &$sink Object the generated XML is written to
	 * @param string $params Filter parameters; only the form "variant=<code>"
	 *  is recognised, anything else disables variant conversion
	 */
	public function __construct( &$sink, $params = '' ) {
		$this->sink =& $sink;

		// Split on the first '=' only, so the value itself may contain '='.
		$bits = explode( '=', $params, 2 );
		$this->variant = ( count( $bits ) === 2 && $bits[0] === 'variant' ) ? $bits[1] : false;
	}

	/**
	 * Register the filter function with the dump manager.
	 *
	 * Registers this class as the 'abstract' filter and NoredirectFilter as
	 * the 'noredirect' filter.
	 *
	 * @param BackupDumper $dumper
	 */
	public static function register( $dumper ) {
		$dumper->registerFilter( 'abstract', self::class );
		$dumper->registerFilter( 'noredirect', NoredirectFilter::class );
	}

	/**
	 * Open the feed. The incoming stream header is discarded.
	 *
	 * @param string $string Original stream header (ignored)
	 */
	public function writeOpenStream( $string ) {
		$this->sink->writeOpenStream( "<feed>\n" );
	}

	/**
	 * Close the feed. The incoming stream footer is discarded.
	 *
	 * @param string $string Original stream footer (ignored)
	 */
	public function writeCloseStream( $string ) {
		$this->sink->writeCloseStream( "</feed>\n" );
	}

	/**
	 * Start a <doc> element for a page and emit its title and URL.
	 *
	 * The abstract and links are written later, by writeClosePage(), once a
	 * revision has been seen.
	 *
	 * @param stdClass $page Row providing page_namespace and page_title
	 * @param string $string Original page header (ignored)
	 */
	public function writeOpenPage( $page, $string ) {
		global $wgSitename;
		$this->title = Title::makeTitle( $page->page_namespace, $page->page_title );
		$title = $wgSitename . wfMessage( 'colon-separator' )->text() . $this->title->getPrefixedText();

		$xml = "<doc>\n";
		$xml .= Xml::element( 'title', null, $this->variant( $title ) ) . "\n";
		$xml .= Xml::element( 'url', null, $this->title->getCanonicalUrl() ) . "\n";

		// add abstract and links when we have revision data...
		$this->revision = null;

		$this->sink->writeOpenPage( $page, $xml );
	}

	/**
	 * Convert text to the preferred output language variant, if set.
	 *
	 * @param string $text
	 * @return string The converted text, or $text unchanged when no variant was requested
	 */
	private function variant( $text ) {
		if ( !$this->variant ) {
			return $text;
		}

		return MediaWikiServices::getInstance()
			->getLanguageConverterFactory()
			->getLanguageConverter()
			->translate( $text, $this->variant );
	}

	/**
	 * Finish the current <doc>: emit the abstract and links (when a revision
	 * was seen for this page), close the element and reset per-page state.
	 *
	 * Failures while building the abstract or the links are logged as
	 * warnings; the page is still written, with an empty
	 * serialization-error abstract and/or whatever links were produced
	 * before the failure.
	 *
	 * @param string $string Original page footer (ignored)
	 */
	public function writeClosePage( $string ) {
		$xml = '';
		if ( $this->revision ) {
			$model = $this->title->getContentModel();
			if ( $model === CONTENT_MODEL_TEXT || $model === CONTENT_MODEL_WIKITEXT ) {
				try {
					$abstract = $this->variant( $this->extractAbstract( $this->revision ) );
					$xml .= Xml::element( 'abstract', null, $abstract ) . "\n";
				} catch ( MWException | RuntimeException $ex ) {
					$xml .= Xml::element( 'abstract', [ 'serialization-error' => '' ] ) . "\n";
					wfLogWarning( "failed to get abstract content for page " .
						$this->title->getPrefixedText() . " with id " .
						$this->revision->rev_page . "\n" );
				}
			} else {
				// Only text-like content models can be summarised.
				$xml .= Xml::element( 'abstract', [ 'not-applicable' => '' ] ) . "\n";
			}
			$xml .= "<links>\n";

			try {
				$links = $this->sectionLinks( $this->revision );
				if ( !$links ) {
					// If no TOC, they want us to fall back to categories.
					$links = $this->categoryLinks( $this->revision );
				}
				foreach ( $links as $anchor => $url ) {
					$xml .= $this->formatLink( $url, $anchor, 'nav' );
				}
			} catch ( MWException | RuntimeException $ex ) {
				wfLogWarning( "failed to get abstract links for page " .
					$this->title->getPrefixedText() . " with id " .
					$this->revision->rev_page . "\n" );
			}
			// @todo: image links

			$xml .= "</links>\n";
		}
		$xml .= "</doc>\n";
		$this->sink->writeClosePage( $xml );

		// In rare cases, link cache has the same key for some pages which
		// might be read as part of the same batch. T220424
		$linkCache = MediaWikiServices::getInstance()->getLinkCache();
		$linkCache->clearLink( $this->title );
		$this->title = null;
		$this->revision = null;
	}

	/**
	 * Get the page's textual content (main slot only).
	 *
	 * @param stdClass $rev Database row with revision data
	 * @return string The text, or '' when the content is not text or could
	 *  not be loaded (the latter is logged as a warning)
	 */
	protected function getText( $rev ) {
		try {
			$store = MediaWikiServices::getInstance()->getRevisionStore();
			$rec = $store->newRevisionFromRow( $rev );
			$content = $rec->getContent( SlotRecord::MAIN );

			if ( !$content instanceof TextContent ) {
				// This should not happen, since writeClosePage() checks the content model.
				return '';
			}

			// TODO: cache this!
			return $content->getText();
		} catch ( MWException | RuntimeException | InvalidArgumentException $ex ) {
			// fall through
		}

		wfLogWarning( "failed to get text for revid " . $rev->rev_id . "\n" );
		return '';
	}

	/**
	 * Extract an abstract from the page: markup is stripped, the start of the
	 * text is taken and the result is limited to MAX_ABSTRACT_BYTES.
	 *
	 * @param stdClass $rev Database row with revision data
	 * @return string
	 */
	protected function extractAbstract( $rev ) {
		$text = $this->getText( $rev );

		$stripped = $this->stripMarkup( $text );
		$extract = $this->extractStart( $stripped );
		// not too long pls; mb_strcut() keeps the byte limit but never splits
		// a multi-byte character, so no partial sequence reaches cleanUp().
		$clipped = mb_strcut( $extract, 0, self::MAX_ABSTRACT_BYTES, 'UTF-8' );

		return Validator::cleanUp( $clipped );
	}

	/**
	 * Strip markup to show plaintext.
	 *
	 * Only the first MAX_SOURCE_BYTES of the input are considered.
	 *
	 * @param string $text Wikitext
	 * @return string Trimmed text with character references decoded
	 */
	protected function stripMarkup( $text ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();

		// don't bother with long text... Cut on a character boundary: a
		// dangling partial UTF-8 sequence would make the /u pattern in
		// extractStart() reject the whole string.
		$text = mb_strcut( $text, 0, self::MAX_SOURCE_BYTES, 'UTF-8' );

		$text = self::stripWikiSyntax( $text, $contLang->getNsText( NS_FILE ) );
		$text = Sanitizer::decodeCharReferences( $text );

		return trim( $text );
	}

	/**
	 * Remove wiki and HTML syntax from a piece of wikitext using regular
	 * expressions only (no parser, no services).
	 *
	 * @param string $text Wikitext
	 * @param string $fileNsName Name of the file namespace, used to
	 *  recognise image links
	 * @return string Text with markup removed; not trimmed, character
	 *  references not decoded
	 */
	private static function stripWikiSyntax( $text, $fileNsName ) {
		$image = preg_quote( $fileNsName, '#' );

		// bold and italic quotes
		$text = str_replace( [ "'''", "''" ], "", $text );
		// HTML-style comments
		$text = preg_replace( '#<!--.*?-->#s', '', $text );
		// HTML-style tags
		$text = preg_replace( '#</?[a-z0-9]+.*?>#s', '', $text );
		// URL links: keep the label
		$text = preg_replace( '#\\[[a-z]+:.*? (.*?)\\]#s', '$1', $text );
		// template parameters
		$text = preg_replace( '#\\{\\{\\{.*?\\}\\}\\}#s', '', $text );
		// template calls
		$text = preg_replace( '#\\{\\{.*?\\}\\}#s', '', $text );
		// tables
		$text = preg_replace( '#\\{\\|.*?\\|\\}#s', '', $text );
		// images, including captions that contain one level of nested links
		$text = preg_replace( "#
			\\[\\[
				:?$image\\s*:
					(
						[^][]*
						\[\[
						[^][]*
						\]\]
					)*
				[^][]*
			\\]\\]#six", '', $text );
		// links: keep the label, or the target when there is no label
		$text = preg_replace( '#\\[\\[([^|\\]]*\\|)?(.*?)\\]\\]#s', '$2', $text );
		// indented lines near start are usually disambigs or notices
		$text = preg_replace( '#^:.*$#m', '', $text );

		return $text;
	}

	/**
	 * Extract the first two sentences, if detectable, from the text.
	 *
	 * @param string $text
	 * @return string The first two sentences, or the trimmed first line when
	 *  two sentence terminators cannot be found
	 */
	private function extractStart( $text ) {
		$endchars = [
			// regular ASCII
			'.', '!', '?',
			// full-width ideographic full-stop
			'。',
			// double-width roman forms
			'．', '！', '？',
			// half-width ideographic full stop
			'｡',
		];

		$endgroup = implode( '', array_map( 'preg_quote', $endchars ) );
		$end = "[$endgroup]";
		// A sentence is the shortest run of characters followed by one or
		// more terminators; '.' does not match newlines here.
		$sentence = ".*?$end+";
		$firsttwo = "/^($sentence$sentence)/u";

		$matches = [];
		if ( preg_match( $firsttwo, $text, $matches ) ) {
			return $matches[1];
		}

		$firstLine = explode( "\n", $text, 2 )[0];
		return trim( $firstLine );
	}

	/**
	 * Extract a list of TOC links.
	 *
	 * Headings are found with a regular expression (wikitext "== x ==" lines
	 * and <h1>..<h6> elements at the start of a line), not by parsing.
	 *
	 * @param stdClass $rev Database row with revision data
	 * @return string[] List of URL strings, indexed by name/title
	 *
	 * @todo FIXME extract TOC items properly
	 * @todo FIXME check for explicit __NOTOC__
	 */
	protected function sectionLinks( $rev ) {
		$parser = MediaWikiServices::getInstance()->getParser();

		$headers = [];

		$text = $this->getText( $rev );
		// With PREG_SPLIT_DELIM_CAPTURE the headings end up at the odd indexes.
		$secs = preg_split(
			'/(^=+.+?=+|^<h[1-6].*?' . '>.*?<\/h[1-6].*?' . '>)(?!\S)/mi',
			$text, -1,
			PREG_SPLIT_DELIM_CAPTURE
		);
		if ( $secs === false ) {
			// The regex engine gave up; treat the page as having no sections.
			return $headers;
		}

		$secsCount = count( $secs );
		for ( $i = 1; $i < $secsCount; $i += 2 ) {
			// drop the surrounding "==" markers of wikitext headings
			$inside = preg_replace( '/^=+\s*(.*?)\s*=+/', '$1', $secs[$i] );
			// strip internal markup and <h[1-6]>
			$stripped = $this->stripMarkup( $inside );
			$header = Validator::cleanUp( $stripped );
			$anchor = $parser->guessSectionNameFromWikiText( $header );
			$url = $this->title->getCanonicalUrl() . $anchor;
			$headers[$header] = $url;
		}

		return $headers;
	}

	/**
	 * Fetch the list of category links for this page from the categorylinks
	 * table.
	 *
	 * @param stdClass $rev Database row with revision data; its page_id is used
	 * @return string[] List of URL strings, indexed by category name
	 */
	protected function categoryLinks( $rev ) {
		$id = $rev->page_id;
		$dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
		$result = $dbr->newSelectQueryBuilder()
			->select( 'cl_to' )
			->from( 'categorylinks' )
			->where( [ 'cl_from' => $id ] )
			->caller( __METHOD__ )
			->fetchResultSet();

		$links = [];
		foreach ( $result as $row ) {
			$category = Title::makeTitle( NS_CATEGORY, $row->cl_to );
			$links[$category->getText()] = $category->getCanonicalUrl();
		}

		return $links;
	}

	/**
	 * Format a <sublink> element, like so:
	 *  <sublink linktype="nav">
	 *    <anchor>1939 Births</anchor>
	 *    <link>http://en.wikipedia.org/wiki/Category:1939_births</link>
	 *  </sublink>
	 *
	 * The anchor text is converted to the requested language variant and the
	 * URL is cut to MAX_URL_LENGTH.
	 *
	 * @param string $url
	 * @param string $anchor Human-readable link text; eg title or fragment
	 * @param string $type "nav" or "image"
	 * @return string XML fragment
	 */
	protected function formatLink( $url, $anchor, $type ) {
		return Xml::openElement( 'sublink', [ 'linktype' => $type ] ) .
			Xml::element( 'anchor', null, $this->variant( $anchor ) ) .
			Xml::element( 'link', null, substr( $url, 0, self::MAX_URL_LENGTH ) ) .
			Xml::closeElement( 'sublink' ) . "\n";
	}

	/**
	 * Remember the revision row; nothing is written here. The row is
	 * consumed and cleared by writeClosePage().
	 *
	 * @param stdClass $rev
	 * @param string $string Original revision XML (ignored)
	 */
	public function writeRevision( $rev, $string ) {
		// Only use one revision's worth of data to output
		$this->revision = $rev;
	}
}
385
// Expose the class under the un-namespaced name 'AbstractFilter' as well.
// Presumably this keeps the documented "--plugin=AbstractFilter" invocation
// and other references to the old global class name working - confirm
// before removing.
class_alias( AbstractFilter::class, 'AbstractFilter' );