MediaWiki  master
BaseDump.php
Go to the documentation of this file.
1 <?php
28 
/**
 * Readahead helper for making large MediaWiki data dumps; reads in a previous
 * XML dump and hands back revision text from it on request.
 *
 * The input is consumed with a forward-only XMLReader. A lookup can therefore
 * only succeed while the reader has not yet moved past the requested page and
 * revision; anything it has already passed yields null.
 */
class BaseDump {
	/** @var XMLReader|null Reader over the input file that is currently open */
	protected $reader = null;
	/** @var bool True while no input is open: exhausted, closed, or impossible to open */
	protected $atEnd = false;
	/** @var bool True once no further revision was found on the current page */
	protected $atPageEnd = false;
	/** @var int ID of the page most recently read from the input */
	protected $lastPage = 0;
	/** @var int ID of the revision most recently read; reset to 0 on every new page */
	protected $lastRev = 0;
	/** @var string[]|null Input file names that have not been opened yet */
	protected $infiles = null;

	/**
	 * @param string $infile One or more input file names separated by ';'.
	 *   The files are read one after the other, in the order given.
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$this->openNextFile();
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream.
	 *
	 * @param int|string $page ID of the page to look for (converted with intval())
	 * @param int|string $rev ID of the revision to look for (converted with intval())
	 * @param string $slot Role name of the slot whose text is wanted
	 * @return string|null The text, or null if the revision or slot could not
	 *   be found at or after the current reader position
	 */
	public function prefetch( $page, $rev, $slot = SlotRecord::MAIN ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Move forward until the requested page is reached or passed.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page "
				. "looking for rev $rev [$this->lastPage, $this->lastRev]" );

			return null;
		}

		// Same again for the revision, without leaving the current page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev !== $rev || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
				. "[$this->lastPage, $this->lastRev]" );

			return null;
		}

		$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

		if ( $slot !== SlotRecord::MAIN ) {
			// Walk over the <content> elements of this revision until the one
			// whose <role> matches the requested slot is found.
			$lastSlot = SlotRecord::MAIN;
			while ( $lastSlot !== $slot ) {
				if ( !$this->skipTo( 'content', 'revision' ) ) {
					return null;
				}
				if ( !$this->skipTo( 'role', 'revision' ) ) {
					return null;
				}
				$lastSlot = $this->nodeContents();
			}
		}

		return $this->nextText();
	}

	/**
	 * Sends a diagnostic message to the debug log.
	 *
	 * @param string $str
	 */
	protected function debug( $str ) {
		wfDebug( $str );
	}

	/**
	 * Opens the next pending input file that can be opened.
	 *
	 * Names that cannot be opened are reported through debug() and skipped, so
	 * that the reader is never left without a document while $atEnd claims
	 * otherwise. Every file is opened with LIBXML_PARSEHUGE.
	 *
	 * @return bool True if a file is open now; false if none is left, in which
	 *   case $atEnd has been set
	 */
	private function openNextFile() {
		while ( count( $this->infiles ) ) {
			$infile = array_shift( $this->infiles );
			// An empty name (for instance from a trailing ';') is not a file.
			if ( $infile !== '' && $this->reader->open( $infile, null, LIBXML_PARSEHUGE ) ) {
				$this->atEnd = false;

				return true;
			}
			$this->debug( __METHOD__ . " was unable to open '$infile'" );
		}
		$this->atEnd = true;

		return false;
	}

	/**
	 * Advances to the next <page> element and records its ID. When the current
	 * file has no further page, it is closed and the next input file, if any,
	 * is opened instead.
	 */
	private function nextPage() {
		if ( !$this->skipTo( 'page', 'mediawiki' ) ) {
			$this->close();
			$this->openNextFile();

			return;
		}
		if ( $this->skipTo( 'id' ) ) {
			$this->lastPage = intval( $this->nodeContents() );
			$this->lastRev = 0;
			$this->atPageEnd = false;
		}
	}

	/**
	 * Advances to the next <revision> element of the current page and records
	 * its ID, or flags the end of the page if there is none.
	 */
	private function nextRev() {
		if ( !$this->skipTo( 'revision' ) ) {
			$this->atPageEnd = true;

			return;
		}
		if ( $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
		}
	}

	/**
	 * Returns the contents of the next <text> element of the current revision.
	 *
	 * @return string|null Null if the revision ends before a <text> is found
	 */
	private function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}

		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward until an opening tag called $name is found.
	 *
	 * @param string $name Name of the element to stop at
	 * @param string $parent Name of the element whose closing tag ends the search
	 * @return bool True if the reader now sits on the wanted element; false if
	 *   the closing tag of $parent or the end of the input came first
	 */
	private function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			if ( $this->reader->nodeType == XMLReader::ELEMENT
				&& $this->reader->name == $name
			) {
				return true;
			}
			if ( $this->reader->nodeType == XMLReader::END_ELEMENT
				&& $this->reader->name == $parent
			) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

				return false;
			}
		}

		// Nothing left to read in this file.
		$this->close();

		return false;
	}

	/**
	 * Fetches the text contents of the element the reader currently sits on,
	 * leaving the reader on the first closing tag that follows.
	 *
	 * @return string|null Null if the input is, or becomes, exhausted
	 */
	private function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					return $buffer;
			}
		}

		return $this->close();
	}

	/**
	 * Closes the current input and marks the stream as ended.
	 *
	 * @return null
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;

		return null;
	}
}
BaseDump
Readahead helper for making large MediaWiki data dumps; reads in a previous XML dump to sequentially ...
Definition: BaseDump.php:44
BaseDump\nodeContents
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
Definition: BaseDump.php:191
BaseDump\skipTo
skipTo( $name, $parent='page')
Definition: BaseDump.php:162
BaseDump\$reader
XMLReader $reader
Definition: BaseDump.php:46
BaseDump\__construct
__construct( $infile)
Definition: BaseDump.php:53
BaseDump\nextRev
nextRev()
Definition: BaseDump.php:136
BaseDump\$atEnd
$atEnd
Definition: BaseDump.php:47
BaseDump\$infiles
$infiles
Definition: BaseDump.php:51
BaseDump\$lastPage
$lastPage
Definition: BaseDump.php:49
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:912
BaseDump\$atPageEnd
$atPageEnd
Definition: BaseDump.php:48
BaseDump\prefetch
prefetch( $page, $rev, $slot=SlotRecord::MAIN)
Attempts to fetch the text of a particular page revision from the dump stream.
Definition: BaseDump.php:70
BaseDump\$lastRev
$lastRev
Definition: BaseDump.php:50
BaseDump\close
close()
Definition: BaseDump.php:217
BaseDump\debug
debug( $str)
Definition: BaseDump.php:113
BaseDump\nextPage
nextPage()
Definition: BaseDump.php:119
Revision\SlotRecord
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:39
BaseDump\nextText
nextText()
Definition: BaseDump.php:149