Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 81 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
BaseDump | |
0.00% |
0 / 81 |
|
0.00% |
0 / 9 |
1806 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
prefetch | |
0.00% |
0 / 27 |
|
0.00% |
0 / 1 |
210 | |||
debug | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
nextPage | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
30 | |||
nextRev | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
nextText | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
skipTo | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
56 | |||
nodeContents | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
56 | |||
close | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | /** |
3 | * Helper class for the --prefetch option of dumpTextPass.php |
4 | * |
5 | * Copyright © 2005 Brooke Vibber <bvibber@wikimedia.org> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Maintenance |
25 | */ |
26 | |
27 | use MediaWiki\Revision\SlotRecord; |
28 | |
29 | /** |
30 | * Readahead helper for making large MediaWiki data dumps; |
31 | * reads in a previous XML dump to sequentially prefetch text |
32 | * records already normalized and decompressed. |
33 | * |
34 | * This can save load on the external database servers, hopefully. |
35 | * |
36 | * Assumes that dumps will be recorded in the canonical order: |
37 | * - ascending by page_id |
38 | * - ascending by rev_id within each page |
39 | * - text contents are immutable and should not change once |
40 | * recorded, so the previous dump is a reliable source |
41 | * |
42 | * @ingroup Maintenance |
43 | */ |
class BaseDump {
	/** @var XMLReader|null Reader over the dump file currently being scanned */
	protected $reader = null;
	/** @var bool True once the input is exhausted, could not be opened, or was closed */
	protected $atEnd = false;
	/** @var bool True once no further revision could be found in the current page */
	protected $atPageEnd = false;
	/** @var int ID of the page most recently read from the dump */
	protected $lastPage = 0;
	/** @var int ID of the revision most recently read from the current page */
	protected $lastRev = 0;
	/** @var string[]|null Dump files that have not been opened yet, in reading order */
	protected $infiles = null;

	/**
	 * Opens the first dump file of the given list.
	 *
	 * If that file cannot be opened the instance is marked as exhausted and
	 * every prefetch() call returns null.
	 *
	 * @param string $infile One or more dump file names, separated by ';'
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$this->openNextFile( __METHOD__ );
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * The reader only ever moves forward, so pages and revisions have to be
	 * requested in ascending order; anything already passed yields null.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @param string $slot Role name of the slot to read
	 * @return string|null
	 */
	public function prefetch( $page, $rev, $slot = SlotRecord::MAIN ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Advance page by page until the requested page is reached or passed.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page or failed to open/read input file, "
				. "looking for rev $rev [$this->lastPage, $this->lastRev]" );

			return null;
		}

		// Advance revision by revision inside the current page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		// Both values come from intval(), so a strict comparison is safe.
		if ( $this->lastRev !== $rev || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
				. "[$this->lastPage, $this->lastRev]" );

			return null;
		}

		$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

		if ( $slot !== SlotRecord::MAIN ) {
			// Walk the <content> elements of this revision until one carries
			// the requested <role>; give up when the revision ends first.
			$lastSlot = SlotRecord::MAIN;
			while ( $lastSlot !== $slot ) {
				if ( !$this->skipTo( 'content', 'revision' ) ||
					!$this->skipTo( 'role', 'revision' )
				) {
					return null;
				}
				$lastSlot = $this->nodeContents();
			}
		}

		return $this->nextText();
	}

	/**
	 * Forwards a diagnostic message to wfDebug().
	 *
	 * @param string $str
	 */
	protected function debug( $str ) {
		wfDebug( $str );
	}

	/**
	 * Takes the next entry off the list of pending dump files and opens it.
	 *
	 * Sets $atEnd to reflect the outcome. An empty file name (for example
	 * from a trailing ';' in the list) is handled like any other file that
	 * cannot be opened instead of being handed to XMLReader::open(), which
	 * rejects an empty URI with a ValueError on PHP 8.
	 *
	 * @param string $caller Method name used as prefix of the debug message
	 * @return bool Whether a file was opened
	 */
	private function openNextFile( $caller ) {
		$infile = array_shift( $this->infiles );
		$opened = is_string( $infile ) && $infile !== ''
			&& $this->reader->open( $infile, null, LIBXML_PARSEHUGE );
		if ( !$opened ) {
			$this->debug( $caller . ' was unable to open xml' );
		}
		$this->atEnd = !$opened;

		return $opened;
	}

	/**
	 * Moves to the next <page> element and records its ID.
	 *
	 * When the current file holds no further page, the file is closed and the
	 * next pending file (if any) is opened; the page itself is then read by
	 * the following call.
	 */
	private function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}

			return;
		}

		$this->close();
		if ( $this->infiles ) {
			$this->openNextFile( __METHOD__ );
		}
	}

	/**
	 * Moves to the next <revision> element of the current page and records
	 * its ID, or flags the page as finished when there is none.
	 */
	private function nextRev() {
		if ( !$this->skipTo( 'revision' ) ) {
			$this->atPageEnd = true;

			return;
		}
		if ( $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
		}
	}

	/**
	 * Reads the next <text> element of the current revision.
	 *
	 * @return string|null Text contents, or null if the revision ends first
	 */
	private function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}

		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward until an opening tag called $name is found.
	 *
	 * The search stops unsuccessfully at the closing tag of $parent. Running
	 * out of input also closes the reader.
	 *
	 * @param string $name Element to look for
	 * @param string $parent Element whose closing tag ends the search
	 * @return bool True if the reader is now positioned on <$name>
	 */
	private function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			$type = $this->reader->nodeType;
			$tag = $this->reader->name;
			if ( $type == XMLReader::ELEMENT && $tag == $name ) {
				return true;
			}
			if ( $type == XMLReader::END_ELEMENT && $tag == $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

				return false;
			}
		}

		// Input ran out before either tag showed up.
		$this->close();

		return false;
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return string|null Null if the input is exhausted or ends prematurely
	 */
	private function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					return $buffer;
			}
		}

		// Input ended before the closing tag of the element was seen.
		$this->close();

		return null;
	}

	/**
	 * Closes the underlying reader and marks the input as exhausted.
	 *
	 * @return null
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;

		return null;
	}
}