MediaWiki  1.34.0
cleanup.php
Go to the documentation of this file.
1 <?php
9 $IP = getenv( 'MW_INSTALL_PATH' );
10 if ( $IP === false ) {
11  $IP = __DIR__ . '/../../..';
12 }
13 require_once "$IP/maintenance/Maintenance.php";
14 
/**
 * An aggressive spam cleanup script.
 *
 * Checks the current revision of every page against the spam blacklist
 * regexes. Unless --dry-run is given, each matching page is reverted to its
 * most recent revision that matches none of the regexes, or blanked when
 * every revision matches.
 */
class Cleanup extends Maintenance {
	/**
	 * Declare the SpamBlacklist dependency and the --dry-run option.
	 */
	public function __construct() {
		parent::__construct();
		$this->requireExtension( 'SpamBlacklist' );
		$this->addOption( 'dry-run', 'Only do a dry run' );
	}

	/**
	 * Scan page IDs 1..MAX(page_id) and clean up (or, in a dry run, only
	 * report) every page whose current text matches a blacklist regex.
	 */
	public function execute() {
		$user = User::newSystemUser( 'Spam cleanup script', [ 'steal' => true ] );
		if ( !$user ) {
			// cleanupArticle() requires a User instance, so fail early and clearly.
			$this->fatalError( "Unable to obtain the system user, can't clean up spam" );
		}

		$sb = BaseBlacklist::getSpamBlacklist();
		$regexes = $sb->getBlacklists();
		if ( !$regexes ) {
			$this->fatalError( "Invalid regex, can't clean up spam" );
		}
		$dryRun = $this->hasOption( 'dry-run' );

		$dbr = wfGetDB( DB_REPLICA );
		$maxID = (int)$dbr->selectField( 'page', 'MAX(page_id)' );
		$reportingInterval = 100;

		// The regexes are strings; report their length in bytes.
		$this->output( "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n" );
		$this->output( "Searching for spam in $maxID pages...\n" );
		if ( $dryRun ) {
			$this->output( "Dry run only\n" );
		}

		for ( $id = 1; $id <= $maxID; $id++ ) {
			if ( $id % $reportingInterval == 0 ) {
				// "\r" keeps the progress indicator on a single console line.
				$this->output( sprintf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 ) );
			}
			$revision = Revision::loadFromPageId( $dbr, $id );
			if ( !$revision ) {
				// No page with this ID (page IDs are not necessarily contiguous).
				continue;
			}
			$text = ContentHandler::getContentText( $revision->getContent() );
			if ( !$text ) {
				continue;
			}
			foreach ( $regexes as $regex ) {
				if ( !preg_match( $regex, $text, $matches ) ) {
					continue;
				}
				$title = $revision->getTitle();
				$titleText = $title->getPrefixedText();
				if ( $dryRun ) {
					$this->output( "Found spam in [[$titleText]]\n" );
				} else {
					$this->output( "Cleaning up links to {$matches[0]} in [[$titleText]]\n" );
					// Strip the URL scheme so the edit summary does not carry a full link.
					$match = str_ireplace( [ 'http://', 'https://' ], '', $matches[0] );
					$this->cleanupArticle( $revision, $regexes, $match, $user );
				}
				// cleanupArticle() tests every regex itself, so one pass per page
				// is enough; continuing would reuse a now-stale $revision.
				break;
			}
		}
		// Just for satisfaction. Skipped when there are no pages, which would
		// otherwise divide by zero.
		if ( $maxID > 0 ) {
			$scanned = $id - 1;
			$this->output( sprintf( "%-8d %-5.2f%%\n", $scanned, $scanned / $maxID * 100 ) );
		}
	}

	/**
	 * Find the latest revision of the article that does not contain spam and revert to it.
	 * If every revision matches the blacklist, the page is blanked instead.
	 *
	 * @param Revision $rev Revision to start from; older revisions are tried in turn
	 * @param string[] $regexes Blacklist regexes to test each revision against
	 * @param string $match Matched text, used only in the edit summary
	 * @param User $user User the edit is attributed to
	 */
	private function cleanupArticle( Revision $rev, $regexes, $match, User $user ) {
		$title = $rev->getTitle();
		while ( $rev ) {
			// Extract the text once per revision rather than once per regex.
			$revText = (string)ContentHandler::getContentText( $rev->getContent() );
			$matches = false;
			foreach ( $regexes as $regex ) {
				if ( preg_match( $regex, $revText ) ) {
					$matches = true;
					break;
				}
			}
			if ( !$matches ) {
				// Didn't find any spam
				break;
			}

			$rev = $rev->getPrevious();
		}
		if ( !$rev ) {
			// Didn't find a non-spammy revision, blank the page
			$this->output( "All revisions are spam, blanking...\n" );
			$text = '';
			$comment = "All revisions matched the spam blacklist ($match), blanking";
		} else {
			// Revert to this revision
			$text = ContentHandler::getContentText( $rev->getContent() );
			$comment = "Cleaning up links to $match";
		}
		$wikiPage = new WikiPage( $title );
		$status = $wikiPage->doEditContent(
			ContentHandler::makeContent( $text, $title ), $comment,
			0, false, $user
		);
		if ( !$status->isOK() ) {
			// Report the failure instead of silently moving on to the next page.
			$this->error(
				"Failed to save [[" . $title->getPrefixedText() . "]]: " .
				$status->getWikiText( false, false, 'en' )
			);
		}
	}
}
112 
113 $maintClass = Cleanup::class;
114 require_once RUN_MAINTENANCE_IF_MAIN;
BaseBlacklist\getSpamBlacklist
static getSpamBlacklist()
Definition: BaseBlacklist.php:99
RUN_MAINTENANCE_IF_MAIN
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
Maintenance\fatalError
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Definition: Maintenance.php:504
Revision\getContent
getContent( $audience=self::FOR_PUBLIC, User $user=null)
Fetch revision content if it's available to the specified audience.
Definition: Revision.php:719
WikiPage
Class representing a MediaWiki article and history.
Definition: WikiPage.php:47
Cleanup
Definition: cleanup.php:15
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:82
$dbr
$dbr
Definition: testCompression.php:50
User\newSystemUser
static newSystemUser( $name, $options=[])
Static factory method for creation of a "system" user from username.
Definition: User.php:737
Revision
Definition: Revision.php:40
wfGetDB
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:2575
$matches
$matches
Definition: NoLocalSettings.php:24
Revision\loadFromPageId
static loadFromPageId( $db, $pageid, $id=0)
Load either the current, or a specified, revision that's attached to a given page.
Definition: Revision.php:261
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Definition: Maintenance.php:267
Maintenance\requireExtension
requireExtension( $name)
Indicate that the specified extension must be loaded before the script can run.
Definition: Maintenance.php:638
Cleanup\cleanupArticle
cleanupArticle(Revision $rev, $regexes, $match, User $user)
Find the latest revision of the article that does not contain spam and revert to it.
Definition: cleanup.php:77
$title
$title
Definition: testCompression.php:34
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
Revision\getPrevious
getPrevious()
Get previous revision for this title.
Definition: Revision.php:814
ContentHandler\makeContent
static makeContent( $text, Title $title=null, $modelId=null, $format=null)
Convenience function for creating a Content object from a given textual representation.
Definition: ContentHandler.php:135
Revision\getTitle
getTitle()
Returns the title of the page associated with this entry.
Definition: Revision.php:559
$IP
$IP
An aggressive spam cleanup script.
Definition: cleanup.php:9
Cleanup\execute
execute()
Do the actual work.
Definition: cleanup.php:22
ContentHandler\getContentText
static getContentText(Content $content=null)
Convenience function for getting flat text from a Content object.
Definition: ContentHandler.php:85
$maintClass
$maintClass
Definition: cleanup.php:113
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:453
Cleanup\__construct
__construct()
Default constructor.
Definition: cleanup.php:16
Maintenance\hasOption
hasOption( $name)
Checks to see if a particular option exists.
Definition: Maintenance.php:288
User
The User object encapsulates all of the user-specific settings (user_id, name, rights, email address, options, last login time).
Definition: User.php:51