MediaWiki REL1_34
cleanup.php
Go to the documentation of this file.
<?php
/**
 * An aggressive spam cleanup script.
 *
 * Bootstrap: locate the MediaWiki installation and load the maintenance
 * framework that the Cleanup class below extends.
 */

// Prefer an explicit MW_INSTALL_PATH from the environment; getenv() returns
// false when the variable is unset, in which case fall back to the directory
// three levels above this script.
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
require_once "$IP/maintenance/Maintenance.php";

/**
 * An aggressive spam cleanup script.
 *
 * Walks every page ID up to the current maximum, tests the text of the
 * revision loaded for that page against the spam blacklist regexes and,
 * unless --dry-run is given, reverts each matching page to its most recent
 * revision that matches none of the regexes (blanking the page if there is
 * no such revision).
 */
class Cleanup extends Maintenance {
	public function __construct() {
		parent::__construct();
		$this->requireExtension( 'SpamBlacklist' );
		$this->addOption( 'dry-run', 'Only do a dry run' );
	}

	/**
	 * Scan all pages for blacklisted content and clean up (or, with
	 * --dry-run, merely report) every page that matches.
	 */
	public function execute() {
		$user = User::newSystemUser( 'Spam cleanup script', [ 'steal' => true ] );
		if ( !$user ) {
			// cleanupArticle() requires a User instance, so stop early with a
			// readable message instead of failing on its type hint later.
			$this->fatalError( "Unable to obtain the system user, can't clean up spam" );
		}

		// NOTE(review): this assignment was missing from the listing; the class
		// name is taken from the upstream script - confirm against the extension.
		$sb = BaseBlacklist::getSpamBlacklist();
		$regexes = $sb->getBlacklists();
		if ( !$regexes ) {
			$this->fatalError( "Invalid regex, can't clean up spam" );
		}
		$dryRun = $this->hasOption( 'dry-run' );

		$dbr = wfGetDB( DB_REPLICA );
		$maxID = (int)$dbr->selectField( 'page', 'MAX(page_id)' );
		$reportingInterval = 100;

		// Each regex is a string, so its size in bytes is strlen(); count() is
		// only meaningful for arrays and Countable objects.
		$this->output( "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n" );
		$this->output( "Searching for spam in $maxID pages...\n" );
		if ( $dryRun ) {
			$this->output( "Dry run only\n" );
		}

		for ( $id = 1; $id <= $maxID; $id++ ) {
			if ( $id % $reportingInterval == 0 ) {
				// "\r" rewrites the same terminal line on every progress update.
				printf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 );
			}
			$revision = Revision::loadFromPageId( $dbr, $id );
			if ( !$revision ) {
				// No revision could be loaded for this page ID.
				continue;
			}
			$text = ContentHandler::getContentText( $revision->getContent() );
			if ( !$text ) {
				continue;
			}
			foreach ( $regexes as $regex ) {
				if ( !preg_match( $regex, $text, $matches ) ) {
					continue;
				}
				$title = $revision->getTitle();
				$titleText = $title->getPrefixedText();
				if ( $dryRun ) {
					$this->output( "Found spam in [[$titleText]]\n" );
				} else {
					$this->output( "Cleaning up links to {$matches[0]} in [[$titleText]]\n" );
					$match = str_replace( 'http://', '', $matches[0] );
					$this->cleanupArticle( $revision, $regexes, $match, $user );
				}
				// cleanupArticle() already checks every regex, so one hit per
				// page is enough; carrying on would only repeat the work
				// against the now-stale $revision.
				break;
			}
		}
		// Just for satisfaction
		$scanned = $id - 1;
		// Guard against an empty page table, where $maxID is 0.
		$percent = $maxID > 0 ? $scanned / $maxID * 100 : 100;
		printf( "%-8d %-5.2f%%\n", $scanned, $percent );
	}

	/**
	 * Find the latest revision of the article that does not contain spam
	 * and revert to it. If every revision matches, blank the page instead.
	 *
	 * @param Revision $rev Revision to start from; older revisions are reached via getPrevious()
	 * @param string[] $regexes Blacklist regexes to test each revision against
	 * @param string $match The matched text, used only in the edit summary
	 * @param User $user User the edit is attributed to
	 */
	private function cleanupArticle( Revision $rev, $regexes, $match, User $user ) {
		$title = $rev->getTitle();
		$revText = '';
		while ( $rev ) {
			// Fetch the text once per revision rather than once per regex.
			$revText = (string)ContentHandler::getContentText( $rev->getContent() );
			$matches = false;
			foreach ( $regexes as $regex ) {
				if ( preg_match( $regex, $revText ) ) {
					$matches = true;
					break;
				}
			}
			if ( !$matches ) {
				// Didn't find any spam
				break;
			}

			$rev = $rev->getPrevious();
		}
		if ( !$rev ) {
			// Didn't find a non-spammy revision, blank the page
			$this->output( "All revisions are spam, blanking...\n" );
			$text = '';
			$comment = "All revisions matched the spam blacklist ($match), blanking";
		} else {
			// Revert to this revision
			$text = $revText;
			$comment = "Cleaning up links to $match";
		}
		$wikiPage = new WikiPage( $title );
		$wikiPage->doEditContent(
			ContentHandler::makeContent( $text, $title ), $comment,
			0, false, $user
		);
	}
}

// Name the maintenance class for the framework, then include the file named
// by the RUN_MAINTENANCE_IF_MAIN constant.
$maintClass = Cleanup::class;
require_once RUN_MAINTENANCE_IF_MAIN;
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
const RUN_MAINTENANCE_IF_MAIN
static getSpamBlacklist()
__construct()
Default constructor.
Definition cleanup.php:16
cleanupArticle(Revision $rev, $regexes, $match, User $user)
Find the latest revision of the article that does not contain spam and revert to it.
Definition cleanup.php:77
execute()
Do the actual work.
Definition cleanup.php:22
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
requireExtension( $name)
Indicate that the specified extension must be loaded before the script can run.
output( $out, $channel=null)
Throw some output to the user.
hasOption( $name)
Checks to see if a particular option exists.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
getTitle()
Returns the title of the page associated with this entry.
Definition Revision.php:559
getPrevious()
Get previous revision for this title.
Definition Revision.php:814
getContent( $audience=self::FOR_PUBLIC, User $user=null)
Fetch revision content if it's available to the specified audience.
Definition Revision.php:719
The User object encapsulates all of the user-specific settings (user_id, name, rights, email address, options, last login time).
Definition User.php:51
Class representing a MediaWiki article and history.
Definition WikiPage.php:47
$IP
An aggressive spam cleanup script.
Definition cleanup.php:9
$maintClass
Definition cleanup.php:113
const DB_REPLICA
Definition defines.php:25