MediaWiki  master
convertLinks.php
Go to the documentation of this file.
1 <?php
25 
26 require_once __DIR__ . '/Maintenance.php';
27 
36 class ConvertLinks extends Maintenance {
37  private $logPerformance;
38 
39  public function __construct() {
40  parent::__construct();
41  $this->addDescription(
42  'Convert from the old links schema (string->ID) to the new schema (ID->ID). '
43  . 'The wiki should be put into read-only mode while this script executes' );
44 
45  $this->addArg( 'logperformance', "Log performance to perfLogFilename.", false );
46  $this->addArg(
47  'perfLogFilename',
48  "Filename where performance is logged if --logperformance was set "
49  . "(defaults to 'convLinksPerf.txt').",
50  false
51  );
52  $this->addArg(
53  'keep-links-table',
54  "Don't overwrite the old links table with the new one, leave the new table at links_temp.",
55  false
56  );
57  $this->addArg(
58  'nokeys',
59  /* (What about InnoDB?) */
60  "Don't create keys, and so allow duplicates in the new links table.\n"
61  . "This gives a huge speed improvement for very large links tables which are MyISAM.",
62  false
63  );
64  }
65 
66  public function getDbType() {
67  return Maintenance::DB_ADMIN;
68  }
69 
70  public function execute() {
71  $dbw = $this->getDB( DB_MASTER );
72 
73  $type = $dbw->getType();
74  if ( $type != 'mysql' ) {
75  $this->output( "Link table conversion not necessary for $type\n" );
76 
77  return;
78  }
79 
80  # counters etc
81  $numBadLinks = $curRowsRead = 0;
82 
83  # total tuples INSERTed into links_temp
84  $totalTuplesInserted = 0;
85 
86  # whether or not to give progress reports while reading IDs from cur table
87  $reportCurReadProgress = true;
88 
89  # number of rows between progress reports
90  $curReadReportInterval = 1000;
91 
92  # whether or not to give progress reports during conversion
93  $reportLinksConvProgress = true;
94 
95  # number of rows per INSERT
96  $linksConvInsertInterval = 1000;
97 
98  $initialRowOffset = 0;
99 
100  # not used yet; highest row number from links table to process
101  # $finalRowOffset = 0;
102 
103  $this->logPerformance = $this->hasOption( 'logperformance' );
104  $perfLogFilename = $this->getArg( 1, "convLinksPerf.txt" );
105  $overwriteLinksTable = !$this->hasOption( 'keep-links-table' );
106  $noKeys = $this->hasOption( 'noKeys' );
107 
108  # --------------------------------------------------------------------
109 
110  list( $cur, $links, $links_temp, $links_backup ) =
111  $dbw->tableNamesN( 'cur', 'links', 'links_temp', 'links_backup' );
112 
113  if ( $dbw->tableExists( 'pagelinks' ) ) {
114  $this->output( "...have pagelinks; skipping old links table updates\n" );
115 
116  return;
117  }
118 
119  $res = $dbw->query( "SELECT l_from FROM $links LIMIT 1" );
120  // @phan-suppress-next-line PhanUndeclaredMethod
121  if ( $dbw->fieldType( $res, 0 ) == "int" ) {
122  $this->output( "Schema already converted\n" );
123 
124  return;
125  }
126 
127  $res = $dbw->query( "SELECT COUNT(*) AS count FROM $links" );
128  $row = $dbw->fetchObject( $res );
129  $numRows = $row->count;
130 
131  if ( $numRows == 0 ) {
132  $this->output( "Updating schema (no rows to convert)...\n" );
133  $this->createTempTable();
134  } else {
135  $fh = false;
136  if ( $this->logPerformance ) {
137  $fh = fopen( $perfLogFilename, "w" );
138  if ( !$fh ) {
139  $this->error( "Couldn't open $perfLogFilename" );
140  $this->logPerformance = false;
141  }
142  }
143  $baseTime = $startTime = microtime( true );
144  # Create a title -> cur_id map
145  $this->output( "Loading IDs from $cur table...\n" );
146  $this->performanceLog( $fh, "Reading $numRows rows from cur table...\n" );
147  $this->performanceLog( $fh, "rows read vs seconds elapsed:\n" );
148  $contentLang = MediaWikiServices::getInstance()->getContentLanguage();
149 
150  $ids = [];
151  $lastId = 0;
152  do {
153  $res = $dbw->query(
154  "SELECT cur_namespace,cur_title,cur_id FROM $cur " .
155  "WHERE cur_id > $lastId ORDER BY cur_id LIMIT 10000"
156  );
157  foreach ( $res as $row ) {
158  $title = $row->cur_title;
159  if ( $row->cur_namespace ) {
160  $title = $contentLang->getNsText( $row->cur_namespace ) . ":$title";
161  }
162  $ids[$title] = $row->cur_id;
163  $curRowsRead++;
164  if ( $reportCurReadProgress ) {
165  if ( ( $curRowsRead % $curReadReportInterval ) == 0 ) {
166  $this->performanceLog(
167  $fh,
168  $curRowsRead . " " . ( microtime( true ) - $baseTime ) . "\n"
169  );
170  $this->output( "\t$curRowsRead rows of $cur table read.\n" );
171  }
172  }
173  $lastId = $row->cur_id;
174  }
175  } while ( $res->numRows() > 0 );
176  $this->output( "Finished loading IDs.\n\n" );
177  $this->performanceLog(
178  $fh,
179  "Took " . ( microtime( true ) - $baseTime ) . " seconds to load IDs.\n\n"
180  );
181 
182  # --------------------------------------------------------------------
183 
184  # Now, step through the links table (in chunks of $linksConvInsertInterval rows),
185  # convert, and write to the new table.
186  $this->createTempTable();
187  $this->performanceLog( $fh, "Resetting timer.\n\n" );
188  $baseTime = microtime( true );
189  $this->output( "Processing $numRows rows from $links table...\n" );
190  $this->performanceLog( $fh, "Processing $numRows rows from $links table...\n" );
191  $this->performanceLog( $fh, "rows inserted vs seconds elapsed:\n" );
192 
193  for ( $rowOffset = $initialRowOffset; $rowOffset < $numRows;
194  $rowOffset += $linksConvInsertInterval
195  ) {
196  $sqlRead = "SELECT * FROM $links ";
197  $sqlRead = $dbw->limitResult( $sqlRead, $linksConvInsertInterval, $rowOffset );
198  $res = $dbw->query( $sqlRead );
199  if ( $noKeys ) {
200  $sqlWrite = [ "INSERT INTO $links_temp (l_from,l_to) VALUES " ];
201  } else {
202  $sqlWrite = [ "INSERT IGNORE INTO $links_temp (l_from,l_to) VALUES " ];
203  }
204 
205  $tuplesAdded = 0; # no tuples added to INSERT yet
206  foreach ( $res as $row ) {
207  $fromTitle = $row->l_from;
208  if ( array_key_exists( $fromTitle, $ids ) ) { # valid title
209  $from = $ids[$fromTitle];
210  $to = $row->l_to;
211  if ( $tuplesAdded != 0 ) {
212  $sqlWrite[] = ",";
213  }
214  $sqlWrite[] = "($from,$to)";
215  $tuplesAdded++;
216  } else { # invalid title
217  $numBadLinks++;
218  }
219  }
220  # $this->output( "rowOffset: $rowOffset\ttuplesAdded: "
221  # . "$tuplesAdded\tnumBadLinks: $numBadLinks\n" );
222  if ( $tuplesAdded != 0 ) {
223  if ( $reportLinksConvProgress ) {
224  $this->output( "Inserting $tuplesAdded tuples into $links_temp..." );
225  }
226  $dbw->query( implode( "", $sqlWrite ) );
227  $totalTuplesInserted += $tuplesAdded;
228  if ( $reportLinksConvProgress ) {
229  $this->output( " done. Total $totalTuplesInserted tuples inserted.\n" );
230  $this->performanceLog(
231  $fh,
232  $totalTuplesInserted . " " . ( microtime( true ) - $baseTime ) . "\n"
233  );
234  }
235  }
236  }
237  $this->output( "$totalTuplesInserted valid titles and "
238  . "$numBadLinks invalid titles were processed.\n\n" );
239  $this->performanceLog(
240  $fh,
241  "$totalTuplesInserted valid titles and $numBadLinks invalid titles were processed.\n"
242  );
243  $this->performanceLog(
244  $fh,
245  "Total execution time: " . ( microtime( true ) - $startTime ) . " seconds.\n"
246  );
247  if ( $this->logPerformance ) {
248  fclose( $fh );
249  }
250  }
251  # --------------------------------------------------------------------
252 
253  if ( $overwriteLinksTable ) {
254  # Check for existing links_backup, and delete it if it exists.
255  $this->output( "Dropping backup links table if it exists..." );
256  $dbw->query( "DROP TABLE IF EXISTS $links_backup", __METHOD__ );
257  $this->output( " done.\n" );
258 
259  # Swap in the new table, and move old links table to links_backup
260  $this->output( "Swapping tables '$links' to '$links_backup'; '$links_temp' to '$links'..." );
261  $dbw->query( "RENAME TABLE links TO $links_backup, $links_temp TO $links", __METHOD__ );
262  $this->output( " done.\n\n" );
263 
264  $this->output( "Conversion complete. The old table remains at $links_backup;\n" );
265  $this->output( "delete at your leisure.\n" );
266  } else {
267  $this->output( "Conversion complete. The converted table is at $links_temp;\n" );
268  $this->output( "the original links table is unchanged.\n" );
269  }
270  }
271 
272  private function createTempTable() {
273  $dbConn = $this->getDB( DB_MASTER );
274 
275  if ( !( $dbConn->isOpen() ) ) {
276  $this->output( "Opening connection to database failed.\n" );
277 
278  return;
279  }
280  $links_temp = $dbConn->tableName( 'links_temp' );
281 
282  $this->output( "Dropping temporary links table if it exists..." );
283  $dbConn->query( "DROP TABLE IF EXISTS $links_temp" );
284  $this->output( " done.\n" );
285 
286  $this->output( "Creating temporary links table..." );
287  if ( $this->hasOption( 'noKeys' ) ) {
288  $dbConn->query( "CREATE TABLE $links_temp ( " .
289  "l_from int(8) unsigned NOT NULL default '0', " .
290  "l_to int(8) unsigned NOT NULL default '0')" );
291  } else {
292  $dbConn->query( "CREATE TABLE $links_temp ( " .
293  "l_from int(8) unsigned NOT NULL default '0', " .
294  "l_to int(8) unsigned NOT NULL default '0', " .
295  "UNIQUE KEY l_from(l_from,l_to), " .
296  "KEY (l_to))" );
297  }
298  $this->output( " done.\n\n" );
299  }
300 
301  private function performanceLog( $fh, $text ) {
302  if ( $this->logPerformance ) {
303  fwrite( $fh, $text );
304  }
305  }
306 }
307 
308 $maintClass = ConvertLinks::class;
309 require_once RUN_MAINTENANCE_IF_MAIN;
getArg( $argId=0, $default=null)
Get an argument.
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
error( $err, $die=0)
Throw an error to the user.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: Maintenance.php:86
hasOption( $name)
Checks to see if a particular option exists.
const DB_MASTER
Definition: defines.php:26
addDescription( $text)
Set the description text.
const DB_ADMIN
Definition: Maintenance.php:93
addArg( $arg, $description, $required=true)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.