Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
6.62% |
9 / 136 |
|
26.67% |
4 / 15 |
CRAP | |
0.00% |
0 / 1 |
AbstractFilter | |
6.67% |
9 / 135 |
|
26.67% |
4 / 15 |
761.73 | |
0.00% |
0 / 1 |
__construct | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
register | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
writeOpenStream | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
writeCloseStream | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
writeOpenPage | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
variant | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
writeClosePage | |
0.00% |
0 / 31 |
|
0.00% |
0 / 1 |
72 | |||
getText | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
extractAbstract | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
stripMarkup | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
2 | |||
extractStart | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
6 | |||
sectionLinks | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
6 | |||
categoryLinks | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
6 | |||
formatLink | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
writeRevision | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | /** |
3 | * Generate XML feed for Yahoo's Active Abstracts project |
4 | * Plugin for dumpBackup.php; call as eg: |
5 | * |
6 | * php dumpBackup.php \ |
7 | * --plugin=AbstractFilter \ |
8 | * --current \ |
9 | * --output=gzip:/dumps/abstract.xml.gz \ |
10 | * --filter=namespace:NS_MAIN \ |
11 | * --filter=noredirect \ |
12 | * --filter=abstract |
13 | * |
14 | * Can optionally convert output text to a given language variant: |
15 | * --filter=abstract:variant=zh-cn |
16 | */ |
17 | |
18 | namespace MediaWiki\Extension\ActiveAbstract; |
19 | |
20 | use ExportProgressFilter; |
21 | use InvalidArgumentException; |
22 | use MediaWiki\Content\TextContent; |
23 | use MediaWiki\Maintenance\BackupDumper; |
24 | use MediaWiki\MediaWikiServices; |
25 | use MediaWiki\Parser\Sanitizer; |
26 | use MediaWiki\Revision\SlotRecord; |
27 | use MediaWiki\Title\Title; |
28 | use MediaWiki\Xml\Xml; |
29 | use MWException; |
30 | use RuntimeException; |
31 | use stdClass; |
32 | use UtfNormal\Validator; |
33 | |
/**
 * Dump filter that tosses away the MediaWiki XML and generates new output:
 * a <feed> of <doc> elements, each holding the page title, URL, a short
 * plain-text abstract and a list of navigation links.
 *
 * The dump pipeline drives it through writeOpenStream(), writeOpenPage(),
 * writeRevision(), writeClosePage() and writeCloseStream(); the generated
 * fragments are forwarded to the wrapped sink.
 */
class AbstractFilter {

	/** Number of bytes of page text inspected when stripping markup. */
	private const MAX_SOURCE_BYTES = 4096;

	/** Upper bound, in bytes, for the emitted abstract. */
	private const MAX_ABSTRACT_BYTES = 1024;

	/** Upper bound, in bytes, for an emitted link URL; as defined in Yahoo's .xsd. */
	private const MAX_URL_BYTES = 1024;

	/** @var ExportProgressFilter */
	protected $sink;

	/** @var string|false Target language variant, or false for no conversion */
	private $variant;

	/** @var Title|null Title of the page currently being written */
	protected $title;

	/** @var stdClass|null Most recent revision row seen for the current page */
	protected $revision;

	/**
	 * @param ExportProgressFilter &$sink Sink that receives the generated XML
	 * @param string $params Filter parameter; only "variant=<code>" is understood
	 */
	public function __construct( &$sink, $params = '' ) {
		$this->sink =& $sink;

		// Anything that is not exactly "variant=<something>" disables conversion.
		$bits = explode( '=', $params, 2 );
		$hasVariant = count( $bits ) === 2 && $bits[0] === 'variant';
		$this->variant = $hasVariant ? $bits[1] : false;
	}

	/**
	 * Register the filter function with the dump manager
	 * @param BackupDumper $dumper
	 */
	public static function register( $dumper ) {
		$dumper->registerFilter( 'abstract', self::class );
		$dumper->registerFilter( 'noredirect', NoredirectFilter::class );
	}

	/**
	 * Open the feed. The incoming stream header is discarded.
	 * @param string $string Unused
	 */
	public function writeOpenStream( $string ) {
		$this->sink->writeOpenStream( "<feed>\n" );
	}

	/**
	 * Close the feed. The incoming stream footer is discarded.
	 * @param string $string Unused
	 */
	public function writeCloseStream( $string ) {
		$this->sink->writeCloseStream( "</feed>\n" );
	}

	/**
	 * Start a <doc> element with the page's title and canonical URL.
	 * The abstract and links are added later, in writeClosePage().
	 *
	 * @param stdClass $page Row providing page_namespace and page_title
	 * @param string $string Unused
	 */
	public function writeOpenPage( $page, $string ) {
		global $wgSitename;
		$this->title = Title::makeTitle( $page->page_namespace, $page->page_title );
		$title = $wgSitename . wfMessage( 'colon-separator' )->text() . $this->title->getPrefixedText();

		$xml = "<doc>\n";
		$xml .= Xml::element( 'title', null, $this->variant( $title ) ) . "\n";
		$xml .= Xml::element( 'url', null, $this->title->getCanonicalUrl() ) . "\n";

		// add abstract and links when we have revision data...
		$this->revision = null;

		$this->sink->writeOpenPage( $page, $xml );
	}

	/**
	 * Convert text to the preferred output language variant, if set.
	 * @param string $text
	 * @return string
	 */
	private function variant( $text ) {
		if ( !$this->variant ) {
			return $text;
		}

		return MediaWikiServices::getInstance()
			->getLanguageConverterFactory()
			->getLanguageConverter()
			->translate( $text, $this->variant );
	}

	/**
	 * Finish the current <doc>: emit the abstract and links (when a revision
	 * was seen), close the element, and reset the per-page state.
	 *
	 * @param string $string Unused
	 */
	public function writeClosePage( $string ) {
		$xml = '';
		if ( $this->revision ) {
			$model = $this->title->getContentModel();
			if ( $model === CONTENT_MODEL_TEXT || $model === CONTENT_MODEL_WIKITEXT ) {
				try {
					$xml .= Xml::element( 'abstract', null,
						$this->variant(
							$this->extractAbstract( $this->revision ) ) ) . "\n";
				} catch ( MWException | RuntimeException $ex ) {
					// Keep the dump going; mark the abstract as unavailable instead.
					$xml .= Xml::element( 'abstract', [ 'serialization-error' => '' ] ) . "\n";
					wfLogWarning( "failed to get abstract content for page " .
						$this->title->getPrefixedText() . " with id " .
						$this->revision->rev_page . "\n" );
				}
			} else {
				$xml .= Xml::element( 'abstract', [ 'not-applicable' => '' ] ) . "\n";
			}
			$xml .= "<links>\n";

			try {
				$links = $this->sectionLinks( $this->revision );
				if ( !$links ) {
					// If no TOC, they want us to fall back to categories.
					$links = $this->categoryLinks( $this->revision );
				}
				foreach ( $links as $anchor => $url ) {
					$xml .= $this->formatLink( $url, $anchor, 'nav' );
				}
			} catch ( MWException | RuntimeException $ex ) {
				wfLogWarning( "failed to get abstract links for page " .
					$this->title->getPrefixedText() . " with id " .
					$this->revision->rev_page . "\n" );
			}
			// @todo: image links

			$xml .= "</links>\n";
		}
		$xml .= "</doc>\n";
		$this->sink->writeClosePage( $xml );
		// In rare cases, link cache has the same key for some pages which
		// might be read as part of the same batch. T220424
		$linkCache = MediaWikiServices::getInstance()->getLinkCache();
		$linkCache->clearLink( $this->title );
		$this->title = null;
		$this->revision = null;
	}

	/**
	 * Get the page's textual content (main slot only).
	 *
	 * Failures are logged and reported as an empty string rather than thrown.
	 *
	 * @param stdClass $rev Database row with revision data
	 * @return string
	 */
	protected function getText( $rev ) {
		try {
			$store = MediaWikiServices::getInstance()->getRevisionStore();
			$rec = $store->newRevisionFromRow( $rev );
			$content = $rec->getContent( SlotRecord::MAIN );

			if ( !$content instanceof TextContent ) {
				// This should not happen, since writeClosePage() checks the content model.
				return '';
			}

			// TODO: cache this!
			return $content->getText();
		} catch ( MWException | RuntimeException | InvalidArgumentException $ex ) {
			// fall through
		}

		wfLogWarning( "failed to get text for revid " . $rev->rev_id . "\n" );
		return '';
	}

	/**
	 * Extract an abstract from the page
	 * @param stdClass $rev Database row with revision data
	 * @return string
	 */
	protected function extractAbstract( $rev ) {
		$text = $this->getText( $rev );

		$stripped = $this->stripMarkup( $text );
		$extract = $this->extractStart( $stripped );
		// not too long pls; clip on a character boundary so no broken
		// UTF-8 sequence is left at the end for cleanUp() to patch over
		$clipped = self::clipUtf8( $extract, self::MAX_ABSTRACT_BYTES );

		return Validator::cleanUp( $clipped );
	}

	/**
	 * Truncate a UTF-8 string to at most $maxBytes bytes without splitting a
	 * multibyte character.
	 *
	 * A plain substr() may cut in the middle of a character; the resulting
	 * invalid UTF-8 makes later /u regex matches fail outright.
	 *
	 * @param string $text
	 * @param int $maxBytes
	 * @return string
	 */
	private static function clipUtf8( $text, $maxBytes ) {
		return mb_strcut( (string)$text, 0, $maxBytes, 'UTF-8' );
	}

	/**
	 * Strip markup to show plaintext
	 *
	 * Only the first MAX_SOURCE_BYTES bytes of the input are considered.
	 *
	 * @param string $text
	 * @return string
	 */
	protected function stripMarkup( $text ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();

		// don't bother with long text...
		$text = self::clipUtf8( $text, self::MAX_SOURCE_BYTES );

		$image = preg_quote( $contLang->getNsText( NS_FILE ), '#' );
		$text = str_replace( [ "'''", "''" ], "", $text );
		// HTML-style comments
		$text = preg_replace( '#<!--.*?-->#s', '', $text );
		// HTML-style tags
		$text = preg_replace( '#</?[a-z0-9]+.*?>#s', '', $text );
		// URL links
		$text = preg_replace( '#\\[[a-z]+:.*? (.*?)\\]#s', '$1', $text );
		// template parameters
		$text = preg_replace( '#\\{\\{\\{.*?\\}\\}\\}#s', '', $text );
		// template calls
		$text = preg_replace( '#\\{\\{.*?\\}\\}#s', '', $text );
		// tables
		$text = preg_replace( '#\\{\\|.*?\\|\\}#s', '', $text );
		// images (the x modifier makes the layout whitespace insignificant)
		$text = preg_replace( "#
			\\[\\[
				:?$image\\s*:
					(
						[^][]*
						\[\[
						[^][]*
						\]\]
					)*
				[^][]*
			\\]\\]#six", '', $text );
		// links
		$text = preg_replace( '#\\[\\[([^|\\]]*\\|)?(.*?)\\]\\]#s', '$2', $text );
		// indented lines near start are usually disambigs or notices
		$text = preg_replace( '#^:.*$#m', '', $text );
		$text = Sanitizer::decodeCharReferences( $text );

		return trim( $text );
	}

	/**
	 * Extract the first two sentences, if detectable, from the text.
	 *
	 * When no two sentence terminators are found on the first line, the
	 * trimmed first line is returned instead.
	 *
	 * @param string $text
	 * @return string
	 */
	private function extractStart( $text ) {
		// Non-ASCII terminators are written as code point escapes so they
		// cannot be silently folded to their ASCII look-alikes by tooling.
		$endchars = [
			// regular ASCII
			'.', '!', '?',
			// full-width ideographic full-stop (U+3002)
			"\u{3002}",
			// double-width roman forms (U+FF0E, U+FF01, U+FF1F)
			"\u{FF0E}", "\u{FF01}", "\u{FF1F}",
			// half-width ideographic full stop (U+FF61)
			"\u{FF61}",
		];

		$quoted = array_map(
			static function ( $char ) {
				return preg_quote( $char, '/' );
			},
			$endchars
		);
		$end = '[' . implode( '', $quoted ) . ']';
		$sentence = ".*?$end+";
		$firsttwo = "/^($sentence$sentence)/u";

		$matches = [];

		if ( preg_match( $firsttwo, $text, $matches ) ) {
			return $matches[1];
		}

		$firstLine = explode( "\n", $text, 2 )[0];
		return trim( $firstLine );
	}

	/**
	 * Extract a list of TOC links
	 * @param stdClass $rev Database row with revision data
	 * @return string[] List of URL strings, indexed by name/title
	 *
	 * @todo FIXME extract TOC items properly
	 * @todo FIXME check for explicit __NOTOC__
	 */
	protected function sectionLinks( $rev ) {
		$parser = MediaWikiServices::getInstance()->getParser();

		$headers = [];

		$text = $this->getText( $rev );
		// The pattern is concatenated so the source never contains a literal
		// PHP closing tag sequence.
		$secs = preg_split(
			'/(^=+.+?=+|^<h[1-6].*?' . '>.*?<\/h[1-6].*?' . '>)(?!\S)/mi',
			$text, -1,
			PREG_SPLIT_DELIM_CAPTURE
		);

		// With PREG_SPLIT_DELIM_CAPTURE the captured headings sit at the odd indexes.
		$secsCount = count( $secs );
		for ( $i = 1; $i < $secsCount; $i += 2 ) {
			$inside = preg_replace( '/^=+\s*(.*?)\s*=+/', '$1', $secs[$i] );
			// strip internal markup and <h[1-6]>
			$stripped = $this->stripMarkup( $inside );
			$header = Validator::cleanUp( $stripped );
			$anchor = $parser->guessSectionNameFromWikiText( $header );
			$url = $this->title->getCanonicalUrl() . $anchor;
			$headers[$header] = $url;
		}

		return $headers;
	}

	/**
	 * Fetch the list of category links for this page
	 * @param stdClass $rev Database row with revision data
	 * @return string[] List of URL strings, indexed by category name
	 */
	protected function categoryLinks( $rev ) {
		$id = $rev->page_id;
		$dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
		$result = $dbr->newSelectQueryBuilder()
			->select( 'cl_to' )
			->from( 'categorylinks' )
			->where( [ 'cl_from' => $id ] )
			->caller( __METHOD__ )
			->fetchResultSet();

		$links = [];
		foreach ( $result as $row ) {
			$category = Title::makeTitle( NS_CATEGORY, $row->cl_to );
			$links[$category->getText()] = $category->getCanonicalUrl();
		}

		return $links;
	}

	/**
	 * Format a <sublink> element, like so:
	 * <sublink linktype="nav">
	 *    <anchor>1939 Births</anchor>
	 *    <link>http://en.wikipedia.org/wiki/Category:1939_births</link>
	 * </sublink>
	 *
	 * @param string $url Clipped to MAX_URL_BYTES bytes
	 * @param string $anchor Human-readable link text; eg title or fragment
	 * @param string $type "nav" or "image"
	 * @return string XML fragment
	 */
	protected function formatLink( $url, $anchor, $type ) {
		return Xml::openElement( 'sublink', [ 'linktype' => $type ] ) .
			Xml::element( 'anchor', null, $this->variant( $anchor ) ) .
			Xml::element( 'link', null, self::clipUtf8( $url, self::MAX_URL_BYTES ) ) .
			Xml::closeElement( 'sublink' ) . "\n";
	}

	/**
	 * Remember the revision row; the output for it is produced in writeClosePage().
	 *
	 * @param stdClass $rev
	 * @param string $string Unused
	 */
	public function writeRevision( $rev, $string ) {
		// Only use one revision's worth of data to output
		$this->revision = $rev;
	}
}
385 | |
// Expose the class under its un-namespaced name as well; presumably this keeps
// existing "--plugin=AbstractFilter" invocations working (verify against callers).
\class_alias( AbstractFilter::class, 'AbstractFilter' );