Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 68 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
Cleanup | |
0.00% |
0 / 62 |
|
0.00% |
0 / 3 |
420 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
132 | |||
cleanupArticle | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
72 |
1 | <?php |
2 | /** |
3 | * An aggressive spam cleanup script. |
4 | * Searches the database for matching pages, and reverts them to |
5 | * the last non-spammed revision. |
6 | * If all revisions contain spam, blanks the page |
7 | */ |
8 | |
9 | use MediaWiki\Extension\SpamBlacklist\BaseBlacklist; |
10 | use MediaWiki\MediaWikiServices; |
11 | use MediaWiki\Page\WikiPageFactory; |
12 | use MediaWiki\Revision\RevisionLookup; |
13 | use MediaWiki\Revision\RevisionRecord; |
14 | use MediaWiki\Revision\SlotRecord; |
15 | use MediaWiki\Title\Title; |
16 | use MediaWiki\Title\TitleFormatter; |
17 | use MediaWiki\User\User; |
18 | |
// Resolve the MediaWiki installation directory: use MW_INSTALL_PATH when it is
// set, otherwise fall back to three directories above this script.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP !== false ) ? $IP : __DIR__ . '/../../..';
require_once $IP . '/maintenance/Maintenance.php';
24 | |
/**
 * Maintenance script that scans the latest revision of every page for text
 * matching the spam blacklist and reverts matching pages to their most recent
 * revision that does not match, blanking the page when every revision matches.
 */
class Cleanup extends Maintenance {
	/** @var RevisionLookup */
	private $revisionLookup;
	/** @var TitleFormatter */
	private $titleFormatter;
	/** @var WikiPageFactory */
	private $wikiPageFactory;

	/**
	 * Fetch the services used by the script and declare its requirements and options.
	 */
	public function __construct() {
		parent::__construct();
		$services = MediaWikiServices::getInstance();
		$this->revisionLookup = $services->getRevisionLookup();
		$this->titleFormatter = $services->getTitleFormatter();
		$this->wikiPageFactory = $services->getWikiPageFactory();

		$this->requireExtension( 'SpamBlacklist' );
		$this->addOption( 'dry-run', 'Only do a dry run' );
	}

	/**
	 * Check the latest revision of every page ID from 1 to MAX(page_id) against
	 * the blacklist regexes. With --dry-run, matching pages are only reported;
	 * otherwise each matching page is handed to cleanupArticle().
	 */
	public function execute() {
		$user = User::newSystemUser( 'Spam cleanup script', [ 'steal' => true ] );

		$sb = BaseBlacklist::getSpamBlacklist();
		$regexes = $sb->getBlacklists();
		if ( !$regexes ) {
			$this->fatalError( "Invalid regex, can't clean up spam" );
		}
		$dryRun = $this->hasOption( 'dry-run' );

		$dbr = $this->getReplicaDB();
		$maxID = (int)$dbr->newSelectQueryBuilder()
			->select( 'MAX(page_id)' )
			->from( 'page' )
			->caller( __METHOD__ )
			->fetchField();
		$reportingInterval = 100;

		$this->output( "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n" );
		$this->output( "Searching for spam in $maxID pages...\n" );
		if ( $dryRun ) {
			$this->output( "Dry run only\n" );
		}

		for ( $id = 1; $id <= $maxID; $id++ ) {
			if ( $id % $reportingInterval === 0 ) {
				$this->printProgress( $id, $maxID, "\r" );
			}

			$revision = $this->revisionLookup->getRevisionByPageId( $id );
			if ( !$revision ) {
				// No revision was returned for this page ID
				continue;
			}

			$content = $revision->getContent( SlotRecord::MAIN );
			$text = ( $content instanceof TextContent ) ? $content->getText() : null;
			if ( !$text ) {
				// Nothing to scan: non-text content or empty text
				continue;
			}

			foreach ( $regexes as $regex ) {
				if ( !preg_match( $regex, $text, $matches ) ) {
					continue;
				}

				$titleText = $this->titleFormatter->getPrefixedText( $revision->getPageAsLinkTarget() );
				if ( $dryRun ) {
					$this->output( "Found spam in [[$titleText]]\n" );
				} else {
					$this->output( "Cleaning up links to {$matches[0]} in [[$titleText]]\n" );
					// Strip the URL scheme from the matched text before it is used in
					// the edit summary; https:// is handled as well as http://.
					$match = str_ireplace( [ 'http://', 'https://' ], '', $matches[0] );
					$this->cleanupArticle( $revision, $regexes, $match, $user );
				}

				// One hit is enough for this page: cleanupArticle() tests every regex
				// itself, so further matches would only repeat the same report/revert.
				break;
			}
		}

		// Just for satisfaction: after the loop, $id - 1 is the number of IDs visited
		$this->printProgress( $id - 1, $maxID, "\n" );
	}

	/**
	 * Print a progress line: the number of page IDs processed and the percentage done.
	 *
	 * @param int $done Number of page IDs processed so far
	 * @param int $total Highest page ID to process; may be 0 when the page table is empty
	 * @param string $lineEnd "\r" to overwrite the line on the next call, "\n" to finish it
	 */
	private function printProgress( $done, $total, $lineEnd ) {
		// Guard against an empty page table: 0 / 0 throws DivisionByZeroError on PHP 8.
		$percent = $total > 0 ? $done / $total * 100 : 100;
		printf( "%-8d %-5.2f%%" . $lineEnd, $done, $percent );
	}

	/**
	 * Find the latest revision of the article that does not contain spam and revert to it.
	 * If every revision matches, the page is blanked instead.
	 *
	 * @param RevisionRecord $rev Revision to start from; older revisions are walked from here
	 * @param array $regexes Blacklist regexes to test each revision against
	 * @param string $match Matched text, used only in the edit summary
	 * @param User $user User the edit is attributed to
	 */
	private function cleanupArticle( RevisionRecord $rev, $regexes, $match, User $user ) {
		$title = Title::newFromLinkTarget( $rev->getPageAsLinkTarget() );

		while ( $rev ) {
			$content = $rev->getContent( SlotRecord::MAIN );
			// Non-text content is tested as an empty string; passing null to
			// preg_match() is deprecated since PHP 8.1.
			$text = ( $content instanceof TextContent ) ? $content->getText() : '';

			$isSpam = false;
			foreach ( $regexes as $regex ) {
				if ( preg_match( $regex, $text ) ) {
					$isSpam = true;
					break;
				}
			}
			if ( !$isSpam ) {
				// Didn't find any spam
				break;
			}

			$rev = $this->revisionLookup->getPreviousRevision( $rev );
		}

		if ( !$rev ) {
			// Didn't find a non-spammy revision, blank the page
			$this->output( "All revisions are spam, blanking...\n" );
			$content = ContentHandler::makeContent( '', $title );
			$comment = "All revisions matched the spam blacklist ($match), blanking";
		} else {
			// Revert to this revision; fall back to empty content if its main slot is unavailable
			$content = $rev->getContent( SlotRecord::MAIN )
				?? ContentHandler::makeContent( '', $title );
			$comment = "Cleaning up links to $match";
		}

		$wikiPage = $this->wikiPageFactory->newFromTitle( $title );
		$wikiPage->doUserEditContent( $content, $user, $comment );
	}
}
136 | |
// Name this script's class in $maintClass, then include the file referenced by
// RUN_MAINTENANCE_IF_MAIN (defined outside this file; presumably it runs the
// class named in $maintClass when the script is invoked directly — confirm).
$maintClass = Cleanup::class;
require_once RUN_MAINTENANCE_IF_MAIN;