Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
85.90% covered (warning)
85.90%
67 / 78
44.44% covered (danger)
44.44%
4 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
BaseDump
87.01% covered (warning)
87.01%
67 / 77
44.44% covered (danger)
44.44%
4 / 9
45.86
0.00% covered (danger)
0.00%
0 / 1
 __construct
66.67% covered (warning)
66.67%
4 / 6
0.00% covered (danger)
0.00%
0 / 1
2.15
 prefetch
100.00% covered (success)
100.00%
26 / 26
100.00% covered (success)
100.00%
1 / 1
14
 debug
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 nextPage
83.33% covered (warning)
83.33%
10 / 12
0.00% covered (danger)
0.00%
0 / 1
5.12
 nextRev
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
3
 nextText
66.67% covered (warning)
66.67%
2 / 3
0.00% covered (danger)
0.00%
0 / 1
2.15
 skipTo
81.82% covered (warning)
81.82%
9 / 11
0.00% covered (danger)
0.00%
0 / 1
7.29
 nodeContents
72.73% covered (warning)
72.73%
8 / 11
0.00% covered (danger)
0.00%
0 / 1
7.99
 close
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2/**
3 * Helper class for the --prefetch option of dumpTextPass.php
4 *
5 * Copyright © 2005 Brooke Vibber <bvibber@wikimedia.org>
6 * https://www.mediawiki.org/
7 *
8 * @license GPL-2.0-or-later
9 * @file
10 * @ingroup Maintenance
11 */
12
13namespace MediaWiki\Export;
14
15use MediaWiki\Revision\SlotRecord;
16use XMLReader;
17
/**
 * Readahead helper for making large MediaWiki data dumps;
 * reads in a previous XML dump to sequentially prefetch text
 * records already normalized and decompressed.
 *
 * This can save load on the external database servers, hopefully.
 *
 * Assumes that dumps will be recorded in the canonical order:
 * - ascending by page_id
 * - ascending by rev_id within each page
 * - text contents are immutable and should not change once
 *   recorded, so the previous dump is a reliable source
 *
 * The underlying reader only moves forward, so prefetch() is expected to
 * be called with non-decreasing page and revision IDs; anything the stream
 * has already passed is reported as a miss (null).
 *
 * @ingroup Maintenance
 */
class BaseDump {
    /** @var XMLReader|null Forward-only reader over the current input file */
    protected $reader = null;
    /** @var bool True once the input is exhausted, closed, or could not be opened */
    protected $atEnd = false;
    /** @var bool True once a search for a revision ran past the end of the current page */
    protected $atPageEnd = false;
    /** @var int ID of the page the reader last entered (0 before the first page) */
    protected $lastPage = 0;
    /** @var int ID of the revision last entered on the current page (0 if none yet) */
    protected $lastRev = 0;
    /** @var string[]|null Input files that have not been opened yet, in reading order */
    protected $infiles = null;

    /**
     * @param string $infile Path of the previous dump. Several paths may be
     *  given separated by ';'; they are read one after another.
     */
    public function __construct( $infile ) {
        $this->infiles = explode( ';', $infile );
        $this->reader = new XMLReader();
        $this->openNextInfile( __METHOD__ );
    }

    /**
     * Attempts to fetch the text of a particular page revision
     * from the dump stream. May return null if the page is
     * unavailable.
     *
     * @param int $page ID number of page to read
     * @param int $rev ID number of revision to read
     * @param string $slot Role name of the slot to read
     * @return string|null Text of the requested slot, or null when the
     *  stream is exhausted, is already past the requested page/revision,
     *  or does not contain the requested slot
     */
    public function prefetch( $page, $rev, $slot = SlotRecord::MAIN ) {
        $page = intval( $page );
        $rev = intval( $rev );

        // Advance page by page until we reach (or overshoot) the wanted page.
        while ( $this->lastPage < $page && !$this->atEnd ) {
            $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
            $this->nextPage();
        }
        if ( $this->lastPage > $page || $this->atEnd ) {
            $this->debug( "BaseDump::prefetch already past page $page or failed to open/read input file, "
                . "looking for rev $rev  [$this->lastPage, $this->lastRev]" );

            return null;
        }

        // Same idea for revisions, but never leave the current page.
        while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
            $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
                . "looking for $page, $rev" );
            $this->nextRev();
        }

        // Once the end of the page has been consumed the reader is no longer
        // inside any revision of this page, so a matching $lastRev left over
        // from an earlier call must not be treated as a hit: reading on from
        // here would return text belonging to a following page.
        if ( $this->lastRev !== $rev || $this->atEnd || $this->atPageEnd ) {
            $this->debug( "BaseDump::prefetch already past rev $rev on page $page "
                . "[$this->lastPage, $this->lastRev]" );

            return null;
        }

        $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

        if ( $slot !== SlotRecord::MAIN ) {
            // Walk the <content> elements of this revision until one carries
            // the requested <role>; give up when the revision ends first.
            $lastSlot = SlotRecord::MAIN;
            while ( $lastSlot !== $slot ) {
                if ( !$this->skipTo( 'content', 'revision' ) ||
                    !$this->skipTo( 'role', 'revision' )
                ) {
                    return null;
                }
                $lastSlot = $this->nodeContents();
            }
        }

        return $this->nextText();
    }

    /**
     * Forwards a diagnostic message to wfDebug().
     *
     * @param string $str
     */
    protected function debug( $str ) {
        wfDebug( $str );
    }

    /**
     * Takes the next path off the pending input list and points the reader
     * at it, recording in $atEnd whether that worked.
     *
     * @param string $caller Method name used as prefix of the failure message
     */
    private function openNextInfile( $caller ) {
        $infile = array_shift( $this->infiles );
        if ( $this->reader->open( $infile, null, LIBXML_PARSEHUGE ) ) {
            $this->atEnd = false;
        } else {
            $this->debug( $caller . ' was unable to open xml' );
            $this->atEnd = true;
        }
    }

    /**
     * Moves the reader to the next <page> and records its ID. When the
     * current file has no further page, closes it and opens the next
     * pending input file, if there is one.
     */
    private function nextPage() {
        if ( $this->skipTo( 'page', 'mediawiki' ) ) {
            if ( $this->skipTo( 'id' ) ) {
                $this->lastPage = intval( $this->nodeContents() );
                $this->lastRev = 0;
                $this->atPageEnd = false;
            }

            return;
        }

        $this->close();
        if ( count( $this->infiles ) ) {
            $this->openNextInfile( __METHOD__ );
        }
    }

    /**
     * Moves the reader to the next <revision> of the current page and
     * records its ID; flags the end of the page when there is none.
     */
    private function nextRev() {
        if ( !$this->skipTo( 'revision' ) ) {
            $this->atPageEnd = true;

            return;
        }
        if ( $this->skipTo( 'id' ) ) {
            $this->lastRev = intval( $this->nodeContents() );
        }
    }

    /**
     * Reads the next <text> element of the current revision.
     *
     * @return string|null Null when the revision ends before a <text> is found
     */
    private function nextText() {
        if ( !$this->skipTo( 'text', 'revision' ) ) {
            return null;
        }

        return strval( $this->nodeContents() );
    }

    /**
     * Reads forward until an opening <$name> element is found.
     *
     * @param string $name Element to stop at
     * @param string $parent Element whose closing tag aborts the search
     * @return bool True when positioned on <$name>; false when </$parent>
     *  or the end of the input was reached first (the latter also closes
     *  the reader)
     */
    private function skipTo( $name, $parent = 'page' ) {
        if ( $this->atEnd ) {
            return false;
        }
        while ( $this->reader->read() ) {
            $nodeType = $this->reader->nodeType;
            $nodeName = $this->reader->name;

            if ( $nodeType === XMLReader::ELEMENT && $nodeName === $name ) {
                return true;
            }
            if ( $nodeType === XMLReader::END_ELEMENT && $nodeName === $parent ) {
                $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

                return false;
            }
        }

        // read() stopped delivering nodes: nothing more to get from this file.
        $this->close();

        return false;
    }

    /**
     * Shouldn't something like this be built-in to XMLReader?
     * Fetches text contents of the current element, assuming
     * no sub-elements or such scary things.
     *
     * @return string|null Collected text ("" for an empty element), or null
     *  when the input is exhausted before the element is closed
     */
    private function nodeContents() {
        if ( $this->atEnd ) {
            return null;
        }
        if ( $this->reader->isEmptyElement ) {
            return "";
        }
        $buffer = "";
        while ( $this->reader->read() ) {
            switch ( $this->reader->nodeType ) {
                // XMLReader::WHITESPACE nodes are not collected.
                case XMLReader::TEXT:
                case XMLReader::SIGNIFICANT_WHITESPACE:
                    $buffer .= $this->reader->value;
                    break;
                case XMLReader::END_ELEMENT:
                    // First closing tag ends the element (no sub-elements assumed).
                    return $buffer;
            }
        }

        // Input ended inside the element; the partial buffer is discarded.
        $this->close();

        return null;
    }

    /**
     * Closes the reader and marks the stream as exhausted.
     *
     * @return null
     */
    public function close() {
        $this->reader->close();
        $this->atEnd = true;

        return null;
    }
}
230
/**
 * Registers the global (un-namespaced) name 'BaseDump' as an alias of the
 * BaseDump class declared in this file's namespace.
 * NOTE(review): presumably kept so code still using the old global name
 * continues to resolve — confirm against callers before removal.
 *
 * @deprecated class alias since 1.46
 */
class_alias( BaseDump::class, 'BaseDump' );