MediaWiki  master
BaseDump.php
Go to the documentation of this file.
1 <?php
28 
/**
 * Readahead helper for making large MediaWiki data dumps; reads a previous
 * XML dump and hands back revision text found in it.
 *
 * The underlying XMLReader only ever moves forward, so lookups have to be
 * requested in the same order in which pages and revisions appear in the
 * dump. Asking for something the reader has already passed yields null.
 *
 * Several input files may be given as one string separated by ';'; they are
 * consumed one after another.
 */
class BaseDump {
	/** @var XMLReader|null Reader positioned inside the dump file currently being scanned */
	protected $reader = null;

	/** @var bool True once the current input is closed or could not be opened */
	protected $atEnd = false;

	/** @var bool True once nextRev() found no further revision on the current page */
	protected $atPageEnd = false;

	/** @var int ID of the page most recently read from the dump */
	protected $lastPage = 0;

	/** @var int ID of the revision most recently read on the current page (0 = none yet) */
	protected $lastRev = 0;

	/** @var string[]|null Input files that have not been opened yet */
	protected $infiles = null;

	/**
	 * @param string $infile One dump file name, or several separated by ';'
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$this->openNextFile();
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. Returns null if the page, revision or slot
	 * cannot be found at or after the current read position.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @param string $slot Role name of the slot to read
	 * @return string|null
	 */
	public function prefetch( $page, $rev, $slot = SlotRecord::MAIN ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Advance page by page until we reach (or pass) the requested page.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page "
				. "looking for rev $rev [$this->lastPage, $this->lastRev]" );

			return null;
		}

		// Same again for revisions within the page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev !== $rev || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
				. "[$this->lastPage, $this->lastRev]" );

			return null;
		}

		$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

		if ( $slot !== SlotRecord::MAIN ) {
			// Walk the <content> elements of this revision until one whose
			// <role> matches the requested slot; its <text> is read below.
			$lastSlot = SlotRecord::MAIN;
			while ( $lastSlot !== $slot ) {
				if ( !$this->skipTo( 'content', 'revision' ) ) {
					return null;
				}
				if ( !$this->skipTo( 'role', 'revision' ) ) {
					return null;
				}
				$lastSlot = $this->nodeContents();
			}
		}

		return $this->nextText();
	}

	/**
	 * Sends a message to the debug log via wfDebug().
	 *
	 * @param string $str
	 */
	protected function debug( $str ) {
		wfDebug( $str );
	}

	/**
	 * Opens the next pending input file on the reader.
	 *
	 * A file that cannot be opened leaves the reader unusable, so $atEnd is
	 * set in that case to keep every later read attempt from touching it.
	 *
	 * @return bool Whether the file was opened
	 */
	private function openNextFile() {
		$infile = array_shift( $this->infiles );
		if ( $this->reader->open( $infile, null, LIBXML_PARSEHUGE ) ) {
			$this->atEnd = false;

			return true;
		}

		$this->debug( "BaseDump::openNextFile unable to open $infile" );
		$this->atEnd = true;

		return false;
	}

	/**
	 * Moves to the next <page> and records its ID. When the current file has
	 * no further page, closes it and switches to the next input file, if any;
	 * the caller is expected to call again to read that file's first page.
	 */
	private function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}

			return;
		}

		$this->close();
		if ( count( $this->infiles ) ) {
			$this->openNextFile();
		}
	}

	/**
	 * Moves to the next <revision> of the current page and records its ID,
	 * or flags the end of the page when there is none.
	 */
	private function nextRev() {
		if ( !$this->skipTo( 'revision' ) ) {
			$this->atPageEnd = true;

			return;
		}
		if ( $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
		}
	}

	/**
	 * Reads the next <text> element inside the current revision.
	 *
	 * @return string|null Null if the revision ends before a <text> is found
	 */
	private function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}

		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward until an opening tag called $name is found.
	 *
	 * Gives up when the closing tag of $parent is seen first, or when the
	 * input runs out (in which case the reader is closed as well).
	 *
	 * @param string $name Element name to look for
	 * @param string $parent Element name whose closing tag ends the search
	 * @return bool True if positioned on the wanted element
	 */
	private function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}

		while ( $this->reader->read() ) {
			$nodeType = $this->reader->nodeType;
			$nodeName = $this->reader->name;

			if ( $nodeType === XMLReader::ELEMENT && $nodeName === $name ) {
				return true;
			}
			if ( $nodeType === XMLReader::END_ELEMENT && $nodeName === $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

				return false;
			}
		}

		// Ran out of input without finding either tag.
		$this->close();

		return false;
	}

	/**
	 * Fetches the text contents of the current element.
	 *
	 * Reading stops at the first closing tag encountered, so this is only
	 * correct for elements without child elements. Only text and significant
	 * whitespace nodes are collected.
	 *
	 * @return string|null Null if the input ends before the element is closed
	 */
	private function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}

		$buffer = "";
		while ( $this->reader->read() ) {
			$nodeType = $this->reader->nodeType;
			if ( $nodeType === XMLReader::END_ELEMENT ) {
				return $buffer;
			}
			if ( $nodeType === XMLReader::TEXT
				|| $nodeType === XMLReader::SIGNIFICANT_WHITESPACE
			) {
				$buffer .= $this->reader->value;
			}
		}

		return $this->close();
	}

	/**
	 * Closes the reader and marks the input as exhausted.
	 *
	 * @return null
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;

		return null;
	}
}
BaseDump
Readahead helper for making large MediaWiki data dumps; reads in a previous XML dump to sequentially ...
Definition: BaseDump.php:44
BaseDump\nodeContents
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
Definition: BaseDump.php:202
BaseDump\skipTo
skipTo( $name, $parent='page')
Definition: BaseDump.php:173
BaseDump\$atPageEnd
bool $atPageEnd
Definition: BaseDump.php:50
BaseDump\$atEnd
bool $atEnd
Definition: BaseDump.php:48
BaseDump\$infiles
string[] null $infiles
Definition: BaseDump.php:56
BaseDump\$lastRev
int $lastRev
Definition: BaseDump.php:54
BaseDump\__construct
__construct( $infile)
Definition: BaseDump.php:61
BaseDump\nextRev
nextRev()
Definition: BaseDump.php:147
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:894
BaseDump\$lastPage
int $lastPage
Definition: BaseDump.php:52
BaseDump\$reader
XMLReader null $reader
Definition: BaseDump.php:46
BaseDump\prefetch
prefetch( $page, $rev, $slot=SlotRecord::MAIN)
Attempts to fetch the text of a particular page revision from the dump stream.
Definition: BaseDump.php:78
BaseDump\close
close()
Definition: BaseDump.php:228
BaseDump\debug
debug( $str)
Definition: BaseDump.php:124
BaseDump\nextPage
nextPage()
Definition: BaseDump.php:130
MediaWiki\Revision\SlotRecord
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
BaseDump\nextText
nextText()
Definition: BaseDump.php:160