Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 120 |
|
0.00% |
0 / 12 |
CRAP | |
0.00% |
0 / 1 |
Checker | |
0.00% |
0 / 120 |
|
0.00% |
0 / 12 |
1640 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
check | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
56 | |||
checkExisitingPage | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
checkIfRedirect | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
checkInexistentPage | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
checkPageInIndex | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
checkIndexMismatch | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
20 | |||
checkIndexedVersion | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
loadPagesFromDB | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
30 | |||
getDB | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
loadPagesFromIndex | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
sane | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Sanity; |
4 | |
5 | use ArrayObject; |
6 | use CirrusSearch\Connection; |
7 | use CirrusSearch\SearchConfig; |
8 | use CirrusSearch\Searcher; |
9 | use MediaWiki\MediaWikiServices; |
10 | use Title; |
11 | use WikiPage; |
12 | |
13 | /** |
14 | * Checks if a WikiPage's representation in search index is sane. |
15 | * |
16 | * This program is free software; you can redistribute it and/or modify |
17 | * it under the terms of the GNU General Public License as published by |
18 | * the Free Software Foundation; either version 2 of the License, or |
19 | * (at your option) any later version. |
20 | * |
21 | * This program is distributed in the hope that it will be useful, |
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
24 | * GNU General Public License for more details. |
25 | * |
26 | * You should have received a copy of the GNU General Public License along |
27 | * with this program; if not, write to the Free Software Foundation, Inc., |
28 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
29 | * http://www.gnu.org/copyleft/gpl.html |
30 | */ |
31 | |
32 | class Checker { |
/**
 * @var SearchConfig Used by check() to turn page ids into document ids.
 */
private $searchConfig;

/**
 * @var Connection Supplies the cluster name and the index suffix lookups.
 */
private $connection;

/**
 * @var Searcher Used for fetching data, so we can check the content.
 */
private $searcher;

/**
 * @var Remediator Receives every problem found so it can act on it.
 */
private $remediator;

/**
 * @var bool Should we log ids that are found to have no problems
 */
private $logSane;

/**
 * @var bool inspect WikiPage::isRedirect() instead of WikiPage::getContent()->isRedirect()
 * Faster since it does not need to fetch the content but inconsistent in some cases.
 */
private $fastRedirectCheck;

/**
 * A cache for pages loaded with loadPagesFromDB( $pageIds ). This is only
 * useful when multiple Checker instances are run to check different elastic
 * clusters. When null, loadPagesFromDB() works on a throwaway ArrayObject.
 * @var ArrayObject|null
 */
private $pageCache;

/**
 * @var callable Accepts a WikiPage argument and returns boolean true if the page
 * should be reindexed based on time since last reindex.
 */
private $isOldFn;
76 | |
/**
 * Build the checker.
 *
 * @param SearchConfig $config configuration used to build document ids
 * @param Connection $connection connection to the cluster being checked
 * @param Remediator $remediator the remediator to which to send titles
 *  that are insane
 * @param Searcher $searcher searcher to use for fetches
 * @param bool $logSane should we log sane ids
 * @param bool $fastRedirectCheck fast but inconsistent redirect check
 * @param ArrayObject|null $pageCache cache for WikiPage loaded from db; when
 *  null a fresh ArrayObject is used on every load
 * @param callable|null $isOldFn Accepts a WikiPage argument and returns boolean true if the page
 *  should be reindexed based on time since last reindex. When null, a callback that
 *  always returns false is installed, so no page is ever reported as old.
 */
public function __construct(
	SearchConfig $config,
	Connection $connection,
	Remediator $remediator,
	Searcher $searcher,
	$logSane,
	$fastRedirectCheck,
	// Explicit nullable types: relying on "= null" to make a typed parameter
	// nullable is deprecated as of PHP 8.4.
	?ArrayObject $pageCache = null,
	?callable $isOldFn = null
) {
	$this->searchConfig = $config;
	$this->connection = $connection;
	$this->remediator = $remediator;
	$this->searcher = $searcher;
	$this->logSane = $logSane;
	$this->fastRedirectCheck = $fastRedirectCheck;
	$this->pageCache = $pageCache;
	$this->isOldFn = $isOldFn ?? static function ( WikiPage $page ) {
		return false;
	};
}
111 | |
/**
 * Compare the database state and the search index state of a batch of
 * pages, report every discrepancy to the remediator and record counters
 * for the fixed, checked and old pages.
 *
 * @param int[] $pageIds ids of the pages to check
 * @return int the number of pages for which a fix was reported
 * @throws CheckerException
 */
public function check( array $pageIds ) {
	$docIds = array_map( [ $this->searchConfig, 'makeId' ], $pageIds );

	$dbPages = $this->loadPagesFromDB( $pageIds );
	$indexedDocs = $this->loadPagesFromIndex( $docIds );
	$fixedCount = 0;
	$oldCount = 0;
	foreach ( array_combine( $pageIds, $docIds ) as $pageId => $docId ) {
		// Every index hit found for this document, or nothing at all.
		$fromIndex = $indexedDocs[$docId] ?? [];

		if ( !isset( $dbPages[$pageId] ) ) {
			$updated = $this->checkInexistentPage( $docId, $pageId, $fromIndex );
		} else {
			$page = $dbPages[$pageId];
			$updated = $this->checkExisitingPage( $docId, $pageId, $page, $fromIndex );
			// Only pages that needed no fix are candidates for the "old" report.
			if ( !$updated && ( $this->isOldFn )( $page ) ) {
				$this->remediator->oldDocument( $page );
				$oldCount++;
			}
		}

		if ( $updated ) {
			$fixedCount++;
		}
	}

	$clusterName = $this->connection->getClusterName();
	$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
	$stats->updateCount( "CirrusSearch.$clusterName.sanitization.fixed", $fixedCount );
	$stats->updateCount( "CirrusSearch.$clusterName.sanitization.checked", count( $pageIds ) );
	$stats->updateCount( "CirrusSearch.$clusterName.sanitization.old", $oldCount );

	return $fixedCount;
}
153 | |
/**
 * Verify the index state of a page that exists in the database:
 *  - a redirect found in the index is reported to the remediator
 *  - a redirect absent from the index is sane
 *  - a non-redirect absent from the index is reported to the remediator
 *  - a non-redirect found in the index is verified further
 *
 * @param string $docId
 * @param int $pageId
 * @param WikiPage $page
 * @param \Elastica\Result[] $fromIndex
 * @return bool true if a modification was needed
 */
private function checkExisitingPage( $docId, $pageId, $page, array $fromIndex ) {
	$isIndexed = $fromIndex !== [];
	$isRedirect = $this->checkIfRedirect( $page );

	if ( $isRedirect && $isIndexed ) {
		$this->remediator->redirectInIndex( $page );
		return true;
	}
	if ( $isRedirect ) {
		$this->sane( $pageId, 'Redirect not in index' );
		return false;
	}
	if ( !$isIndexed ) {
		$this->remediator->pageNotInIndex( $page );
		return true;
	}

	return $this->checkPageInIndex( $docId, $pageId, $page, $fromIndex );
}
182 | |
/**
 * Tell whether the page is a redirect.
 *
 * With the fast check enabled the answer comes from WikiPage::isRedirect();
 * otherwise the page content is fetched and asked directly. A page without
 * usable content is never treated as a redirect.
 *
 * @param WikiPage $page
 * @return bool true if $page is a redirect
 */
private function checkIfRedirect( $page ) {
	if ( $this->fastRedirectCheck ) {
		return $page->isRedirect();
	}

	$content = $page->getContent();
	if ( $content == null || !is_object( $content ) ) {
		return false;
	}

	return $content->isRedirect();
}
202 | |
/**
 * Make sure a page that is absent from the database has no document left
 * in the index; every document found is reported as a ghost page.
 *
 * @param string $docId
 * @param int $pageId
 * @param \Elastica\Result[] $fromIndex
 * @return bool true if a modification was needed
 */
private function checkInexistentPage( $docId, $pageId, array $fromIndex ) {
	if ( $fromIndex === [] ) {
		$this->sane( $pageId, 'No ghost' );
		return false;
	}

	foreach ( $fromIndex as $hit ) {
		$title = Title::makeTitleSafe( $hit->namespace, $hit->title );
		if ( $title === null ) {
			// The indexed namespace/title pair does not form a valid title:
			// report the ghost under a placeholder title instead.
			$title = Title::makeTitle( NS_SPECIAL, 'Badtitle/InvalidInDBOrElastic' );
		}
		$this->remediator->ghostPageInIndex( $docId, $title );
	}

	return true;
}
225 | |
/**
 * Verify a page that is both in the database and in the index: it must
 * live in the expected index and carry the latest version. The version
 * check only runs when the index check found nothing wrong.
 *
 * @param string $docId
 * @param int $pageId
 * @param WikiPage $page
 * @param \Elastica\Result[] $fromIndex
 * @return bool true if a modification was needed
 */
private function checkPageInIndex( $docId, $pageId, WikiPage $page, array $fromIndex ) {
	$insane = $this->checkIndexMismatch( $docId, $pageId, $page, $fromIndex )
		|| $this->checkIndexedVersion( $docId, $pageId, $page, $fromIndex );

	if ( $insane ) {
		return true;
	}

	$this->sane( $pageId, 'Page in index with latest version' );
	return false;
}
248 | |
/**
 * Verify that every indexed copy of the page lives in the index whose
 * suffix matches the page's namespace; each copy found elsewhere is
 * reported to the remediator.
 *
 * @param string $docId
 * @param int $pageId
 * @param WikiPage $page
 * @param \Elastica\Result[] $fromIndex
 * @return bool true if a modification was needed
 */
private function checkIndexMismatch( $docId, $pageId, WikiPage $page, array $fromIndex ) {
	$namespace = $page->getTitle()->getNamespace();
	$expectedSuffix = $this->connection->getIndexSuffixForNamespace( $namespace );

	$mismatchFound = false;
	foreach ( $fromIndex as $hit ) {
		// The suffix is only available by parsing the name of the index.
		$actualSuffix = $this->connection->extractIndexSuffix( $hit->getIndex() );
		if ( $actualSuffix === $expectedSuffix ) {
			continue;
		}
		$this->remediator->pageInWrongIndex( $docId, $page, $actualSuffix );
		$mismatchFound = true;
	}

	return $mismatchFound;
}
280 | |
/**
 * Verify that every indexed copy of the page holds the latest revision
 * known to the database; each older copy is reported to the remediator.
 * A copy without a 'version' field is treated as version -1.
 *
 * @param string $docId
 * @param int $pageId
 * @param WikiPage $page
 * @param \Elastica\Result[] $fromIndex
 * @return bool true if a modification was needed
 */
private function checkIndexedVersion( $docId, $pageId, WikiPage $page, array $fromIndex ) {
	$latestRevision = $page->getLatest();
	$staleFound = false;

	foreach ( $fromIndex as $hit ) {
		$source = $hit->getSource();
		$indexedVersion = $source['version'] ?? -1;
		if ( $indexedVersion < $latestRevision ) {
			$suffix = $this->connection->extractIndexSuffix( $hit->getIndex() );
			$this->remediator->oldVersionInIndex( $docId, $page, $suffix );
			$staleFound = true;
		}
	}

	return $staleFound;
}
306 | |
/**
 * Load the requested pages from the database, skipping ids that are
 * already present in the page cache.
 *
 * Note that the returned array is a copy of the whole cache, so it can
 * hold pages beyond the ones requested when a shared cache was provided.
 *
 * @param int[] $pageIds
 * @return WikiPage[] wiki pages indexed by page id
 */
private function loadPagesFromDB( array $pageIds ) {
	// Without a caller-provided cache a throwaway one is created per call.
	// Creating it once in the constructor would leak memory because old
	// entries are never pruned automatically. When a cache object is
	// provided, whoever owns this Checker is responsible for cleaning it.
	$cache = $this->pageCache ?? new ArrayObject();

	$missingIds = array_diff( $pageIds, array_keys( $cache->getArrayCopy() ) );
	if ( $missingIds === [] ) {
		return $cache->getArrayCopy();
	}

	$dbr = $this->getDB();
	$pageQuery = WikiPage::getQueryInfo();

	$rows = $dbr->newSelectQueryBuilder()
		->select( $pageQuery['fields'] )
		->tables( $pageQuery['tables'] )
		->where( [ 'page_id' => $missingIds ] )
		->caller( __METHOD__ )
		->joinConds( $pageQuery['joins'] )
		->fetchResultSet();

	$wikiPageFactory = MediaWikiServices::getInstance()->getWikiPageFactory();
	foreach ( $rows as $row ) {
		$page = $wikiPageFactory->newFromRow( $row );
		$isValidTitle = Title::newFromDBkey( $page->getTitle()->getPrefixedDBkey() ) !== null;
		if ( $isValidTitle ) {
			$cache[$page->getId()] = $page;
		}
		// Rows with an invalid title are left out on purpose: only valid
		// titles are sanitized, invalid ones need a dedicated clean up.
	}

	return $cache->getArrayCopy();
}
345 | |
/**
 * Fetch the database handle used by loadPagesFromDB() to read page rows.
 *
 * Asks wfGetDB() for the DB_REPLICA connection.
 *
 * @return \Wikimedia\Rdbms\IDatabase
 */
private function getDB() {
	return wfGetDB( DB_REPLICA );
}
352 | |
/**
 * Fetch the namespace, title and version fields of the given documents
 * from the search index.
 *
 * @param string[] $docIds document ids
 * @return \Elastica\Result[][] search results grouped by the id of the
 *  returned document; one id can map to several results
 * @throws CheckerException if an error occurred
 */
private function loadPagesFromIndex( array $docIds ) {
	$sourceFields = [ 'namespace', 'title', 'version' ];
	$status = $this->searcher->get( $docIds, $sourceFields, false );
	if ( !$status->isOK() ) {
		throw new CheckerException( 'Cannot fetch ids from index' );
	}

	/** @var \Elastica\ResultSet $resultSet */
	$resultSet = $status->getValue();

	$resultsById = [];
	foreach ( $resultSet as $result ) {
		$id = $result->getId();
		$resultsById[$id][] = $result;
	}

	return $resultsById;
}
372 | |
/**
 * Print a line for a page found to have no problem, when logging of sane
 * pages is enabled.
 *
 * @param int $pageId id of the sane page
 * @param string $reason short explanation of why the page is sane
 */
private function sane( $pageId, $reason ) {
	if ( !$this->logSane ) {
		return;
	}

	printf( "%30s %10d\n", $reason, $pageId );
}
378 | } |