MediaWiki  master
BaseDump.php
Go to the documentation of this file.
1 <?php
/**
 * Readahead helper for making large MediaWiki data dumps; reads in a
 * previous XML dump and hands back revision text from it on request.
 *
 * The underlying XMLReader only ever moves forward, so callers are expected
 * to ask for pages, and revisions within a page, in increasing ID order.
 * A request for something the reader has already passed is answered with
 * null rather than by rewinding.
 *
 * Several input files may be given as one string separated by ';'. They are
 * consumed one after another.
 */
class BaseDump {
	/** @var XMLReader|null Forward-only reader over the current input file */
	protected $reader = null;

	/** @var bool True once no readable input is left */
	protected $atEnd = false;

	/** @var bool True once the current page has no further revisions */
	protected $atPageEnd = false;

	/** @var int ID of the page the reader is currently inside (0 before the first) */
	protected $lastPage = 0;

	/** @var int ID of the revision most recently reached on the current page */
	protected $lastRev = 0;

	/** @var string[]|null Input files that have not been opened yet */
	protected $infiles = null;

	/**
	 * @param string $infile One file name, or several joined with ';'
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		// If nothing can be opened, mark the stream as finished so that every
		// prefetch() is a clean miss instead of a read on an unloaded reader.
		$this->atEnd = !$this->openNextFile();
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream.
	 *
	 * @param int|string $page Page ID (converted with intval())
	 * @param int|string $rev Revision ID (converted with intval())
	 * @return string|null Revision text, or null when the revision is not
	 *  available at or after the current stream position
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Advance page by page until we reach (or overshoot) the wanted one.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page "
				. "looking for rev $rev [$this->lastPage, $this->lastRev]" );

			return null;
		}

		// Same idea for revisions inside the page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev === $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

			return $this->nextText();
		}

		$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
			. "[$this->lastPage, $this->lastRev]" );

		return null;
	}

	/**
	 * Sends a progress message to the debug log.
	 *
	 * @param string $str
	 */
	public function debug( $str ) {
		wfDebug( $str . "\n" );
	}

	/**
	 * Moves to the next <page> element and records its ID. When the current
	 * file has no more pages, switches to the next input file, if any.
	 */
	public function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}

			return;
		}

		// Current file is exhausted. close() sets atEnd, which is cleared
		// again only if another file can actually be opened.
		$this->close();
		if ( $this->openNextFile() ) {
			$this->atEnd = false;
		}
	}

	/**
	 * Moves to the next <revision> element of the current page and records
	 * its ID, or flags the page as finished when there is none.
	 */
	public function nextRev() {
		if ( !$this->skipTo( 'revision' ) ) {
			$this->atPageEnd = true;

			return;
		}
		if ( $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
		}
	}

	/**
	 * Reads the <text> element of the revision the reader is currently in.
	 *
	 * The search stops at </revision>: without that bound, a revision with
	 * no <text> would make the reader run on and return whatever text node
	 * it meets next as if it were this revision's content.
	 *
	 * @return string|null Text contents, or null if the revision has no <text>
	 */
	public function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}

		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward to the next opening tag called $name.
	 *
	 * @param string $name Element name to look for
	 * @param string $parent Stop as soon as this element's end tag is seen
	 * @return bool True if positioned on <$name>; false if </$parent> or the
	 *  end of the input was reached first
	 */
	public function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			$type = $this->reader->nodeType;
			$current = $this->reader->name;

			if ( $type === XMLReader::ELEMENT && $current === $name ) {
				return true;
			}
			if ( $type === XMLReader::END_ELEMENT && $current === $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

				return false;
			}
		}

		// Ran out of input before finding either tag.
		$this->close();

		return false;
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, up to the next end tag.
	 *
	 * @return string|null Collected text, "" for an empty element, or null
	 *  if the stream is finished or ends before an end tag is seen
	 */
	public function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}

		$buffer = "";
		while ( $this->reader->read() ) {
			$type = $this->reader->nodeType;

			if ( $type === XMLReader::END_ELEMENT ) {
				return $buffer;
			}
			// Only TEXT and SIGNIFICANT_WHITESPACE nodes contribute; plain
			// WHITESPACE nodes and every other node type are skipped.
			if ( $type === XMLReader::TEXT || $type === XMLReader::SIGNIFICANT_WHITESPACE ) {
				$buffer .= $this->reader->value;
			}
		}

		$this->close();

		return null;
	}

	/**
	 * Closes the reader and marks the stream as finished.
	 *
	 * @return null Always null
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;

		return null;
	}

	/**
	 * Opens the next usable file from the pending list. Empty names (for
	 * example from a trailing ';') and files that cannot be opened are
	 * logged and skipped. Every file is opened with LIBXML_PARSEHUGE.
	 *
	 * @return bool True if a file was opened, false if the list ran out
	 */
	protected function openNextFile() {
		while ( count( $this->infiles ) ) {
			$infile = array_shift( $this->infiles );
			if ( $infile !== '' && $this->reader->open( $infile, null, LIBXML_PARSEHUGE ) ) {
				return true;
			}
			$this->debug( "BaseDump::openNextFile was unable to open xml file: $infile" );
		}

		return false;
	}
}
prefetch( $page, $rev)
Attempts to fetch the text of a particular page revision from the dump stream.
Definition: BaseDump.php:67
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
nextPage()
Definition: BaseDump.php:106
Readahead helper for making large MediaWiki data dumps; reads in a previous XML dump to sequentially ...
Definition: BaseDump.php:42
skipTo( $name, $parent='page')
Definition: BaseDump.php:152
__construct( $infile)
Definition: BaseDump.php:51
nextText()
Definition: BaseDump.php:140
XMLReader $reader
Definition: BaseDump.php:44
debug( $str)
Definition: BaseDump.php:97
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element...
Definition: BaseDump.php:182