Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 68 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
Cleanup | |
0.00% |
0 / 62 |
|
0.00% |
0 / 3 |
420 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
132 | |||
cleanupArticle | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
72 |
1 | <?php |
2 | /** |
3 | * An aggressive spam cleanup script. |
4 | * Searches the database for matching pages, and reverts them to |
5 | * the last non-spammed revision. |
6 | * If all revisions contain spam, blanks the page |
7 | */ |
8 | |
9 | use MediaWiki\Content\ContentHandler; |
10 | use MediaWiki\Content\TextContent; |
11 | use MediaWiki\Extension\SpamBlacklist\BaseBlacklist; |
12 | use MediaWiki\Maintenance\Maintenance; |
13 | use MediaWiki\MediaWikiServices; |
14 | use MediaWiki\Page\WikiPageFactory; |
15 | use MediaWiki\Revision\RevisionLookup; |
16 | use MediaWiki\Revision\RevisionRecord; |
17 | use MediaWiki\Revision\SlotRecord; |
18 | use MediaWiki\Title\Title; |
19 | use MediaWiki\Title\TitleFormatter; |
20 | use MediaWiki\User\User; |
21 | |
// Locate the MediaWiki installation: use MW_INSTALL_PATH when it is set in the
// environment, otherwise fall back to the directory three levels above this file.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP !== false ) ? $IP : __DIR__ . '/../../..';

// Load the maintenance base class from that installation.
require_once "$IP/maintenance/Maintenance.php";
27 | |
/**
 * Maintenance script that scans the latest revision of every page for text
 * matching the spam blacklist and reverts matching pages to their most recent
 * revision that does not match, blanking the page if every revision matches.
 */
class Cleanup extends Maintenance {
	/** @var RevisionLookup */
	private $revisionLookup;
	/** @var TitleFormatter */
	private $titleFormatter;
	/** @var WikiPageFactory */
	private $wikiPageFactory;

	/**
	 * Fetch the required services and declare the script's requirements
	 * (the SpamBlacklist extension) and options (--dry-run).
	 */
	public function __construct() {
		parent::__construct();
		$services = MediaWikiServices::getInstance();
		$this->revisionLookup = $services->getRevisionLookup();
		$this->titleFormatter = $services->getTitleFormatter();
		$this->wikiPageFactory = $services->getWikiPageFactory();

		$this->requireExtension( 'SpamBlacklist' );
		$this->addOption( 'dry-run', 'Only do a dry run' );
	}

	/**
	 * Walk page IDs from 1 to MAX(page_id), test the main-slot text of each
	 * page's revision against the blacklist regexes, and either report the
	 * page (dry run) or clean it up.
	 */
	public function execute() {
		$user = User::newSystemUser( 'Spam cleanup script', [ 'steal' => true ] );

		$sb = BaseBlacklist::getSpamBlacklist();
		$regexes = $sb->getBlacklists();
		if ( !$regexes ) {
			$this->fatalError( "Invalid regex, can't clean up spam" );
		}
		$dryRun = $this->hasOption( 'dry-run' );

		$dbr = $this->getReplicaDB();
		// MAX() yields NULL on an empty page table; the cast turns that into 0.
		$maxID = (int)$dbr->newSelectQueryBuilder()
			->select( 'MAX(page_id)' )
			->from( 'page' )
			->caller( __METHOD__ )
			->fetchField();
		$reportingInterval = 100;

		$this->output( "Regexes are " . implode( ', ', array_map( 'strlen', $regexes ) ) . " bytes\n" );
		$this->output( "Searching for spam in $maxID pages...\n" );
		if ( $dryRun ) {
			$this->output( "Dry run only\n" );
		}

		for ( $id = 1; $id <= $maxID; $id++ ) {
			if ( $id % $reportingInterval === 0 ) {
				// $maxID >= $id >= 1 inside the loop, so this division is safe.
				printf( "%-8d %-5.2f%%\r", $id, $id / $maxID * 100 );
			}
			$revision = $this->revisionLookup->getRevisionByPageId( $id );
			if ( !$revision ) {
				// Page IDs are not contiguous; skip the gaps.
				continue;
			}
			$content = $revision->getContent( SlotRecord::MAIN );
			$text = ( $content instanceof TextContent ) ? $content->getText() : null;
			if ( !$text ) {
				continue;
			}
			foreach ( $regexes as $regex ) {
				if ( !preg_match( $regex, $text, $matches ) ) {
					continue;
				}
				$titleText = $this->titleFormatter->getPrefixedText( $revision->getPageAsLinkTarget() );
				if ( $dryRun ) {
					$this->output( "Found spam in [[$titleText]]\n" );
				} else {
					$this->output( "Cleaning up links to {$matches[0]} in [[$titleText]]\n" );
					// Strip the protocol (plain or TLS) from the matched link
					// before it is used in the edit summary.
					$match = str_replace( [ 'http://', 'https://' ], '', $matches[0] );
					$this->cleanupArticle( $revision, $regexes, $match, $user );
				}
				// cleanupArticle() checks every regex itself, so one hit is
				// enough. Continuing would re-process the same page from a
				// now-stale revision and produce duplicate edits/reports.
				break;
			}
		}
		// Just for satisfaction. Guard against an empty page table, where
		// $maxID is 0 and the division would throw DivisionByZeroError.
		$scanned = $id - 1;
		$percentDone = ( $maxID > 0 ) ? $scanned / $maxID * 100 : 100.0;
		printf( "%-8d %-5.2f%%\n", $scanned, $percentDone );
	}

	/**
	 * Find the latest revision of the article that does not contain spam and revert to it.
	 * If every revision matches, the page is blanked instead.
	 *
	 * @param RevisionRecord $rev Revision to start from; older revisions are walked from here
	 * @param array $regexes Blacklist regexes to test each revision against
	 * @param string $match Matched link text, used only in the edit summary
	 * @param User $user User the edit is attributed to
	 */
	private function cleanupArticle( RevisionRecord $rev, $regexes, $match, User $user ) {
		$title = Title::newFromLinkTarget( $rev->getPageAsLinkTarget() );
		while ( $rev ) {
			$content = $rev->getContent( SlotRecord::MAIN );
			// Non-text (or unavailable) content is treated as empty text. Passing
			// null to preg_match() is deprecated as of PHP 8.1.
			$text = ( $content instanceof TextContent ) ? $content->getText() : '';
			$isSpam = false;
			foreach ( $regexes as $regex ) {
				if ( preg_match( $regex, $text ) ) {
					$isSpam = true;
					break;
				}
			}
			if ( !$isSpam ) {
				// Didn't find any spam
				break;
			}

			$rev = $this->revisionLookup->getPreviousRevision( $rev );
		}
		if ( !$rev ) {
			// Didn't find a non-spammy revision, blank the page
			$this->output( "All revisions are spam, blanking...\n" );
			$content = ContentHandler::makeContent( '', $title );
			$comment = "All revisions matched the spam blacklist ($match), blanking";
		} else {
			// Revert to this revision; fall back to empty content if its
			// main slot content is not available.
			$content = $rev->getContent( SlotRecord::MAIN ) ?:
				ContentHandler::makeContent( '', $title );
			$comment = "Cleaning up links to $match";
		}
		$wikiPage = $this->wikiPageFactory->newFromTitle( $title );
		$status = $wikiPage->doUserEditContent( $content, $user, $comment );
		if ( !$status->isOK() ) {
			// Report failed saves instead of silently moving on.
			$this->output( "Failed to save [[" . $title->getPrefixedText() . "]]\n" );
		}
	}
}
139 | |
// Name the maintenance class defined in this file, then include the file named by
// RUN_MAINTENANCE_IF_MAIN (a constant defined outside this file).
// NOTE(review): presumably that file reads $maintClass and runs the script when
// this file is the entry point — confirm against MediaWiki's Maintenance.php.
$maintClass = Cleanup::class;
require_once RUN_MAINTENANCE_IF_MAIN;