MediaWiki REL1_35
BaseDump.php
Go to the documentation of this file.
1<?php
28
/**
 * Readahead helper that walks one or more previous XML dumps strictly forward
 * and hands back the text of requested revisions when they are present.
 *
 * The reader never rewinds: pages and revisions must be requested in the same
 * order in which they appear in the dump file(s). A request for something the
 * reader has already moved past simply yields null.
 */
class BaseDump {
	/** @var XMLReader|null Streaming reader over the dump file currently open */
	protected $reader = null;
	/** @var bool True once the reader has been closed and nothing more can be read */
	protected $atEnd = false;
	/** @var bool True once no further <revision> was found inside the current <page> */
	protected $atPageEnd = false;
	/** @var int ID of the most recent <page> the reader stopped on */
	protected $lastPage = 0;
	/** @var int ID of the most recent <revision> the reader stopped on */
	protected $lastRev = 0;
	/** @var string[]|null Dump files that have not been opened yet, in reading order */
	protected $infiles = null;

	/**
	 * @param string $infile Path/URI of the dump to read; several dumps may be
	 *  given as a single ';'-separated list and are read one after another.
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$this->openNextFile();
	}

	/**
	 * Attempts to fetch the text of a particular page revision from the dump stream.
	 *
	 * @param int $page ID of the page the revision belongs to
	 * @param int $rev ID of the revision
	 * @param string $slot Role name of the slot whose text is wanted
	 * @return string|null The text, or null when the revision (or slot) is not
	 *  available at or after the current position of the stream
	 */
	public function prefetch( $page, $rev, $slot = SlotRecord::MAIN ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Step forward page by page until we reach (or overshoot) the wanted page.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page "
				. "looking for rev $rev [$this->lastPage, $this->lastRev]" );

			return null;
		}

		// Same forward-only walk over the revisions of the current page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev != $rev || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
				. "[$this->lastPage, $this->lastRev]" );

			return null;
		}

		$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

		if ( $slot !== SlotRecord::MAIN ) {
			// Walk the <content> elements of this revision until one carries
			// the requested <role>; give up when the revision ends first.
			$lastSlot = SlotRecord::MAIN;
			while ( $lastSlot !== $slot ) {
				if ( !$this->skipTo( 'content', 'revision' ) ) {
					return null;
				}
				if ( !$this->skipTo( 'role', 'revision' ) ) {
					return null;
				}
				$lastSlot = $this->nodeContents();
			}
		}

		return $this->nextText();
	}

	/**
	 * Send a progress/diagnostic line to the debug log.
	 *
	 * @param string $str
	 */
	protected function debug( $str ) {
		wfDebug( $str );
	}

	/**
	 * Open the next queued dump file on the shared reader.
	 *
	 * A file that cannot be opened (or an empty entry in the list) marks the
	 * dump as exhausted, so that no read is ever attempted on a reader that
	 * has no data loaded.
	 *
	 * @return bool True when a file was opened and is ready for reading
	 */
	private function openNextFile() {
		$infile = array_shift( $this->infiles );
		if ( $infile === null || $infile === ''
			|| !$this->reader->open( $infile, null, LIBXML_PARSEHUGE )
		) {
			$this->debug( "BaseDump::openNextFile was unable to open xml '$infile'" );
			$this->atEnd = true;

			return false;
		}
		$this->atEnd = false;

		return true;
	}

	/**
	 * Advance to the next <page> and record its ID; when the current file has
	 * no further page, close it and continue with the next queued file, if any.
	 */
	private function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}

			return;
		}

		$this->close();
		if ( count( $this->infiles ) ) {
			$this->openNextFile();
		}
	}

	/**
	 * Advance to the next <revision> of the current page and record its ID,
	 * or flag the end of the page when there is none.
	 */
	private function nextRev() {
		if ( $this->skipTo( 'revision' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastRev = intval( $this->nodeContents() );
			}
		} else {
			$this->atPageEnd = true;
		}
	}

	/**
	 * Read the next <text> element inside the current revision.
	 *
	 * @return string|null Null when the revision ends before a <text> is found
	 */
	private function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}

		return strval( $this->nodeContents() );
	}

	/**
	 * Move the reader forward to the next opening element called $name.
	 *
	 * @param string $name Element to stop on
	 * @param string $parent Element whose closing tag aborts the search
	 * @return bool True when positioned on <$name>; false when </$parent> was
	 *  met first or the stream ran out (in which case the reader is closed)
	 */
	private function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			if ( $this->reader->nodeType == XMLReader::ELEMENT
				&& $this->reader->name == $name
			) {
				return true;
			}
			if ( $this->reader->nodeType == XMLReader::END_ELEMENT
				&& $this->reader->name == $parent
			) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

				return false;
			}
		}

		// Nothing left to read: release the reader and report the miss.
		$this->close();

		return false;
	}

	/**
	 * Fetch the text contents of the element the reader is positioned on.
	 *
	 * Text and significant-whitespace nodes are concatenated until the first
	 * closing tag is met; plain whitespace nodes are not collected.
	 *
	 * @return string|null Null when the stream is (or becomes) exhausted
	 */
	private function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					return $buffer;
			}
		}

		return $this->close();
	}

	/**
	 * Close the underlying reader and mark the dump as exhausted.
	 *
	 * @return null Always null, so callers can `return $this->close();`
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;

		return null;
	}
}
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Readahead helper for making large MediaWiki data dumps; reads in a previous XML dump to sequentially ...
Definition BaseDump.php:44
__construct( $infile)
Definition BaseDump.php:53
debug( $str)
Definition BaseDump.php:113
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
Definition BaseDump.php:191
skipTo( $name, $parent='page')
Definition BaseDump.php:162
prefetch( $page, $rev, $slot=SlotRecord::MAIN)
Attempts to fetch the text of a particular page revision from the dump stream.
Definition BaseDump.php:70
XMLReader $reader
Definition BaseDump.php:46
Value object representing a content slot associated with a page revision.