MediaWiki 1.39.10
BaseDump.php
Go to the documentation of this file.
1<?php
28
/**
 * Readahead helper for making large MediaWiki data dumps; reads a previous
 * XML dump sequentially so that revision text can be served from it.
 *
 * The reader only ever moves forward through the input. A request for a page
 * or revision that lies before the current position yields null.
 */
class BaseDump {
	/** @var XMLReader|null Streaming reader over the dump file currently being read */
	protected $reader = null;

	/** @var bool True once the reader has been closed or a dump file could not be opened */
	protected $atEnd = false;

	/** @var bool True once no further revision could be found inside the current page */
	protected $atPageEnd = false;

	/** @var int ID of the most recently read page */
	protected $lastPage = 0;

	/** @var int ID of the most recently read revision; reset to 0 on every new page */
	protected $lastRev = 0;

	/** @var string[]|null Dump files that have not been opened yet, in reading order */
	protected $infiles = null;

	/**
	 * @param string $infile One dump file name, or several names separated by ';',
	 *   which are read one after the other
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$this->openNextFile();
	}

	/**
	 * Attempts to fetch the text of a particular page revision from the dump stream.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @param string $slot Role name of the slot whose text is wanted
	 * @return string|null The text, or null if the revision (or slot) was not found
	 *   at or after the current position of the stream
	 */
	public function prefetch( $page, $rev, $slot = SlotRecord::MAIN ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Advance page by page until the wanted page is reached or passed.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page "
				. "looking for rev $rev [$this->lastPage, $this->lastRev]" );

			return null;
		}

		// Same for revisions, but never leave the current page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev !== $rev || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
				. "[$this->lastPage, $this->lastRev]" );

			return null;
		}

		$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

		if ( $slot !== SlotRecord::MAIN ) {
			// Walk the <content> elements of this revision until one whose
			// <role> matches the requested slot is found.
			$lastSlot = SlotRecord::MAIN;
			while ( $lastSlot !== $slot ) {
				if ( !$this->skipTo( 'content', 'revision' ) ) {
					return null;
				}
				if ( !$this->skipTo( 'role', 'revision' ) ) {
					return null;
				}
				$lastSlot = $this->nodeContents();
			}
		}

		return $this->nextText();
	}

	/**
	 * Sends a progress message to the debug log.
	 *
	 * @param string $str
	 */
	protected function debug( $str ) {
		wfDebug( $str );
	}

	/**
	 * Opens the next queued dump file on the reader.
	 *
	 * The result of XMLReader::open() is checked: when the file cannot be opened
	 * (or the queue entry is empty) the stream is marked as finished, so that no
	 * read is ever attempted on a reader that has no data loaded.
	 *
	 * @return bool True if a file was opened and reading can continue
	 */
	private function openNextFile() {
		$infile = array_shift( $this->infiles );

		if ( $infile !== null && $infile !== ''
			&& $this->reader->open( $infile, null, LIBXML_PARSEHUGE )
		) {
			$this->atEnd = false;

			return true;
		}

		$this->debug( 'BaseDump::openNextFile was unable to open xml' );
		$this->atEnd = true;

		return false;
	}

	/**
	 * Moves to the next <page> element and records its ID. When the current
	 * file has no further page, it is closed and the next queued file, if any,
	 * is opened instead.
	 */
	private function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}

			return;
		}

		$this->close();
		if ( $this->infiles ) {
			$this->openNextFile();
		}
	}

	/**
	 * Moves to the next <revision> element of the current page and records its
	 * ID, or flags the end of the page when there is none.
	 */
	private function nextRev() {
		if ( !$this->skipTo( 'revision' ) ) {
			$this->atPageEnd = true;

			return;
		}
		if ( $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
		}
	}

	/**
	 * Reads the next <text> element of the current revision.
	 *
	 * @return string|null The text, or null if the revision has no further <text>
	 */
	private function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}

		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward until an opening tag with the given name is found.
	 *
	 * @param string $name Element name to stop at
	 * @param string $parent Element name whose closing tag ends the search
	 * @return bool True if the reader now sits on <$name>; false if </$parent>
	 *   or the end of the input came first
	 */
	private function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			$type = $this->reader->nodeType;
			if ( $type === XMLReader::ELEMENT && $this->reader->name === $name ) {
				return true;
			}
			if ( $type === XMLReader::END_ELEMENT && $this->reader->name === $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

				return false;
			}
		}

		// Input exhausted (or unreadable): shut the reader down.
		$this->close();

		return false;
	}

	/**
	 * Collects the text content of the element the reader currently sits on.
	 * Reading stops at the first closing tag encountered.
	 *
	 * @return string|null The collected text ("" for an empty element), or null
	 *   if the input ended before a closing tag was seen
	 */
	private function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				// Plain XMLReader::WHITESPACE nodes are deliberately not collected.
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					return $buffer;
			}
		}

		$this->close();

		return null;
	}

	/**
	 * Closes the reader and marks the stream as finished.
	 *
	 * @return null
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;

		return null;
	}
}
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Readahead helper for making large MediaWiki data dumps; reads in a previous XML dump to sequentially ...
Definition BaseDump.php:44
bool $atPageEnd
Definition BaseDump.php:50
__construct( $infile)
Definition BaseDump.php:61
XMLReader null $reader
Definition BaseDump.php:46
string[] null $infiles
Definition BaseDump.php:56
debug( $str)
Definition BaseDump.php:124
int $lastRev
Definition BaseDump.php:54
int $lastPage
Definition BaseDump.php:52
prefetch( $page, $rev, $slot=SlotRecord::MAIN)
Attempts to fetch the text of a particular page revision from the dump stream.
Definition BaseDump.php:78
bool $atEnd
Definition BaseDump.php:48
Value object representing a content slot associated with a page revision.