Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 136 |
|
0.00% |
0 / 14 |
CRAP | |
0.00% |
0 / 1 |
Checker | |
0.00% |
0 / 136 |
|
0.00% |
0 / 14 |
1892 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
2 | |||
makeIsOldClosure | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
check | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
56 | |||
getCounter | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
checkExisitingPage | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
30 | |||
checkIfRedirect | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
checkInexistentPage | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
checkPageInIndex | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
checkIndexMismatch | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
20 | |||
checkIndexedVersion | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
loadPagesFromDB | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
30 | |||
getDB | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
loadPagesFromIndex | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
sane | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Sanity; |
4 | |
5 | use ArrayObject; |
6 | use CirrusSearch\Connection; |
7 | use CirrusSearch\SearchConfig; |
8 | use CirrusSearch\Searcher; |
9 | use MediaWiki\MediaWikiServices; |
10 | use MediaWiki\Title\Title; |
11 | use Wikimedia\Stats\Metrics\CounterMetric; |
12 | use Wikimedia\Stats\Metrics\NullMetric; |
13 | use Wikimedia\Stats\StatsFactory; |
14 | use WikiPage; |
15 | |
16 | /** |
17 | * Checks if a WikiPage's representation in search index is sane. |
18 | * |
19 | * This program is free software; you can redistribute it and/or modify |
20 | * it under the terms of the GNU General Public License as published by |
21 | * the Free Software Foundation; either version 2 of the License, or |
22 | * (at your option) any later version. |
23 | * |
24 | * This program is distributed in the hope that it will be useful, |
25 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
26 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
27 | * GNU General Public License for more details. |
28 | * |
29 | * You should have received a copy of the GNU General Public License along |
30 | * with this program; if not, write to the Free Software Foundation, Inc., |
31 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
32 | * http://www.gnu.org/copyleft/gpl.html |
33 | */ |
34 | |
class Checker {
	/**
	 * @var SearchConfig Used to build index document ids from page ids.
	 */
	private $searchConfig;

	/**
	 * @var Connection Supplies the cluster name and the index suffix helpers.
	 */
	private $connection;

	/**
	 * @var Searcher Used for fetching data, so we can check the content.
	 */
	private $searcher;

	/**
	 * @var Remediator Do something with the problems we found. The constructor
	 *  wraps the supplied remediator in a CountingRemediator.
	 */
	private $remediator;

	/**
	 * @var StatsFactory Used to record stats about the process
	 */
	private StatsFactory $statsFactory;

	/**
	 * @var bool Should we log id's that are found to have no problems
	 */
	private $logSane;

	/**
	 * @var bool inspect WikiPage::isRedirect() instead of WikiPage::getContent()->isRedirect()
	 * Faster since it does not need to fetch the content but inconsistent in some cases.
	 */
	private $fastRedirectCheck;

	/**
	 * A cache for pages loaded with loadPagesFromDB( $pageIds ). This is only
	 * useful when multiple Checker are run to check different elastic clusters.
	 * @var ArrayObject|null
	 */
	private $pageCache;

	/**
	 * @var callable Accepts a WikiPage argument and returns boolean true if the page
	 *  should be reindexed based on time since last reindex.
	 */
	private $isOldFn;

	/**
	 * Build the checker.
	 * @param SearchConfig $config
	 * @param Connection $connection
	 * @param Remediator $remediator the remediator to which to send titles
	 *  that are insane
	 * @param Searcher $searcher searcher to use for fetches
	 * @param StatsFactory $statsFactory to use for recording metrics
	 * @param bool $logSane should we log sane ids
	 * @param bool $fastRedirectCheck fast but inconsistent redirect check
	 * @param ArrayObject|null $pageCache cache for WikiPage loaded from db
	 * @param callable|null $isOldFn Accepts a WikiPage argument and returns boolean true if the page
	 *  should be reindexed based on time since last reindex. When omitted no page is
	 *  ever considered old.
	 */
	public function __construct(
		SearchConfig $config,
		Connection $connection,
		Remediator $remediator,
		Searcher $searcher,
		StatsFactory $statsFactory,
		$logSane,
		$fastRedirectCheck,
		?ArrayObject $pageCache = null,
		?callable $isOldFn = null
	) {
		$this->searchConfig = $config;
		$this->connection = $connection;
		$this->statsFactory = $statsFactory;
		// Every problem forwarded to the remediator is also counted under the
		// "fixed" action, labelled with the name of the problem.
		$this->remediator = new CountingRemediator(
			$remediator,
			fn ( string $problem ) => $this->getCounter( "fixed", $problem )
		);
		$this->searcher = $searcher;
		$this->logSane = $logSane;
		$this->fastRedirectCheck = $fastRedirectCheck;
		$this->pageCache = $pageCache;
		$this->isOldFn = $isOldFn ?? static fn ( WikiPage $page ) => false;
	}

	/**
	 * Decide if a document should be reindexed based on time since last reindex
	 *
	 * Consider a page as old every $numCycles times the saneitizer loops over
	 * the same document. This ensures documents have been reindexed within the
	 * last `$numCycles * actual_loop_duration` (note that the configured
	 * duration is min_loop_duration, but in practice configuration ensures min
	 * and actual are typically the same).
	 *
	 * @param int $loopId The number of times the checker has looped over
	 *  the document set.
	 * @param int $numCycles The number of loops after which a document
	 *  is considered old. Must not be 0.
	 * @return \Closure
	 * @throws \DivisionByZeroError when $numCycles is 0
	 */
	public static function makeIsOldClosure( $loopId, $numCycles ) {
		$loopMod = $loopId % $numCycles;
		// Both sides are results of the integer modulo operator, so the strict
		// comparison cannot be affected by type juggling.
		return static fn ( \WikiPage $page ) => $page->getId() % $numCycles === $loopMod;
	}

	/**
	 * Check a batch of pages and report every inconsistency found between
	 * the database and the search index to the remediator.
	 *
	 * @param int[] $pageIds ids of the pages to check
	 * @return int the number of pages for which a problem was reported; pages
	 *  that were only flagged as old are not included
	 * @throws CheckerException if the documents cannot be fetched from the index
	 */
	public function check( array $pageIds ) {
		$docIds = array_map( [ $this->searchConfig, 'makeId' ], $pageIds );

		$pagesFromDb = $this->loadPagesFromDB( $pageIds );
		$pagesFromIndex = $this->loadPagesFromIndex( $docIds );
		$nbPagesFixed = 0;
		$nbPagesOld = 0;
		foreach ( array_combine( $pageIds, $docIds ) as $pageId => $docId ) {
			$fromIndex = $pagesFromIndex[$docId] ?? [];

			if ( isset( $pagesFromDb[$pageId] ) ) {
				$page = $pagesFromDb[$pageId];
				$updated = $this->checkExisitingPage( $docId, $pageId, $page, $fromIndex );
				// A page without any problem may still be due for a periodic reindex.
				if ( !$updated && ( $this->isOldFn )( $page ) ) {
					$this->remediator->oldDocument( $page );
					$nbPagesOld++;
				}
			} else {
				$updated = $this->checkInexistentPage( $docId, $pageId, $fromIndex );
			}
			if ( $updated ) {
				$nbPagesFixed++;
			}
		}
		$this->getCounter( "checked" )->incrementBy( count( $pageIds ) );
		// This is a duplicate of the "fixed" counter with the
		// "problem => oldDocument" label. It can be removed once
		// dashboards have transitioned away from statsd.
		$this->getCounter( "old" )->incrementBy( $nbPagesOld );

		return $nbPagesFixed;
	}

	/**
	 * Build the "sanitization_total" counter labelled for the current cluster.
	 *
	 * @param string $action value of the "action" label, also used in the statsd key
	 * @param string $problem value of the "problem" label
	 * @return CounterMetric|NullMetric
	 */
	private function getCounter( string $action, string $problem = "n/a" ) {
		$cluster = $this->connection->getClusterName();
		return $this->statsFactory->getCounter( "sanitization_total" )
			->setLabel( "problem", $problem )
			->setLabel( "search_cluster", $cluster )
			->setLabel( "action", $action )
			->copyToStatsdAt( "CirrusSearch.$cluster.sanitization.$action" );
	}

	/**
	 * Check that an existing page is properly indexed:
	 * - index it if missing in the index
	 * - delete it if it's a redirect
	 * - verify it if found in the index
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkExisitingPage( $docId, $pageId, $page, array $fromIndex ) {
		$inIndex = $fromIndex !== [];

		if ( !$this->checkIfRedirect( $page ) ) {
			if ( $inIndex ) {
				return $this->checkPageInIndex( $docId, $pageId, $page, $fromIndex );
			}
			$this->remediator->pageNotInIndex( $page );
			return true;
		}

		// From here on the page is a redirect.
		if ( !$inIndex ) {
			$this->sane( $pageId, 'Redirect not in index' );
			return false;
		}
		// Report the redirect once per index in which the document was found.
		foreach ( $fromIndex as $indexInfo ) {
			$indexSuffix = $this->connection->extractIndexSuffix( $indexInfo->getIndex() );
			$this->remediator->redirectInIndex( $docId, $page, $indexSuffix );
		}
		return true;
	}

	/**
	 * Check if the page is a redirect
	 * @param WikiPage $page
	 * @return bool true if $page is a redirect
	 */
	private function checkIfRedirect( $page ) {
		if ( $this->fastRedirectCheck ) {
			return $page->isRedirect();
		}

		// Without content there is nothing to inspect, so the page is not
		// treated as a redirect.
		$content = $page->getContent();
		return is_object( $content ) && $content->isRedirect();
	}

	/**
	 * Check that an inexistent page is not present in the index
	 * and delete it if found
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkInexistentPage( $docId, $pageId, array $fromIndex ) {
		if ( $fromIndex === [] ) {
			$this->sane( $pageId, 'No ghost' );
			return false;
		}

		foreach ( $fromIndex as $r ) {
			// Fall back to a placeholder title when the indexed namespace and
			// title do not form a valid title.
			$title = Title::makeTitleSafe( $r->namespace, $r->title ) ??
				Title::makeTitle( NS_SPECIAL, 'Badtitle/InvalidInDBOrElastic' );
			$this->remediator->ghostPageInIndex( $docId, $title );
		}
		return true;
	}

	/**
	 * Check that a page present in the db and in the index
	 * is in the correct index with the latest version.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkPageInIndex( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		// The version is only compared when the document sits in the expected index.
		$insane = $this->checkIndexMismatch( $docId, $pageId, $page, $fromIndex )
			|| $this->checkIndexedVersion( $docId, $pageId, $page, $fromIndex );

		if ( !$insane ) {
			$this->sane( $pageId, 'Page in index with latest version' );
		}

		return $insane;
	}

	/**
	 * Check that a page present in the db and in the index
	 * is properly indexed to the appropriate index by checking its
	 * namespace.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkIndexMismatch( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		$expectedSuffix = $this->connection->getIndexSuffixForNamespace(
			$page->getTitle()->getNamespace()
		);

		$foundInsanityInIndex = false;
		foreach ( $fromIndex as $indexInfo ) {
			// The index suffix can only be obtained from the index name.
			$suffix = $this->connection->extractIndexSuffix( $indexInfo->getIndex() );
			if ( $suffix !== $expectedSuffix ) {
				$this->remediator->pageInWrongIndex( $docId, $page, $suffix );
				$foundInsanityInIndex = true;
			}
		}

		return $foundInsanityInIndex;
	}

	/**
	 * Check that the indexed version of the page is the
	 * latest version in the database.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkIndexedVersion( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		$latest = $page->getLatest();
		$foundInsanityInIndex = false;
		foreach ( $fromIndex as $indexInfo ) {
			// A document without a version field is treated as older than any revision.
			$version = $indexInfo->getSource()['version'] ?? -1;
			if ( $version < $latest ) {
				$type = $this->connection->extractIndexSuffix( $indexInfo->getIndex() );
				$this->remediator->oldVersionInIndex( $docId, $page, $type );

				$foundInsanityInIndex = true;
			}
		}

		return $foundInsanityInIndex;
	}

	/**
	 * Load the requested pages from the database, skipping ids already cached.
	 *
	 * @param int[] $pageIds
	 * @return WikiPage[] wiki pages keyed by page id. This is a copy of the
	 *  whole cache, so with a shared cache it may also hold pages loaded by
	 *  earlier calls.
	 */
	private function loadPagesFromDB( array $pageIds ) {
		// If no cache object is constructed we build a new one.
		// Building it in the constructor would cause memleaks because
		// there is no automatic pruning of old entries. If a cache
		// object is provided the owner of this Checker instance must take
		// care of the cleaning.
		$cache = $this->pageCache ?? new ArrayObject();
		$pageIds = array_diff( $pageIds, array_keys( $cache->getArrayCopy() ) );
		if ( $pageIds === [] ) {
			return $cache->getArrayCopy();
		}
		$dbr = $this->getDB();
		$pageQuery = WikiPage::getQueryInfo();

		$res = $dbr->newSelectQueryBuilder()
			->select( $pageQuery['fields'] )
			->tables( $pageQuery['tables'] )
			->where( [ 'page_id' => $pageIds ] )
			->caller( __METHOD__ )
			->joinConds( $pageQuery['joins'] )
			->fetchResultSet();

		$wikiPageFactory = MediaWikiServices::getInstance()->getWikiPageFactory();
		foreach ( $res as $row ) {
			$page = $wikiPageFactory->newFromRow( $row );
			if ( Title::newFromDBkey( $page->getTitle()->getPrefixedDBkey() ) === null ) {
				// The DB may contain invalid titles, make sure we try to sanitize only valid titles
				// invalid titles like this may have to wait for a dedicated clean up action
				continue;
			}
			$cache->offsetSet( $page->getId(), $page );
		}
		return $cache->getArrayCopy();
	}

	/**
	 * @return \Wikimedia\Rdbms\IReadableDatabase
	 */
	private function getDB() {
		return MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
	}

	/**
	 * Fetch the given documents from the index and group the results by id.
	 *
	 * @param string[] $docIds document ids
	 * @return \Elastica\Result[][] search results keyed by the id of each
	 *  result (check() looks them up by document id); an id has several
	 *  entries when more than one result carries it
	 * @throws CheckerException if an error occurred
	 */
	private function loadPagesFromIndex( array $docIds ) {
		$status = $this->searcher->get( $docIds, [ 'namespace', 'title', 'version' ], false );
		if ( !$status->isOK() ) {
			throw new CheckerException( 'Cannot fetch ids from index' );
		}
		/** @var \Elastica\ResultSet $dataFromIndex */
		$dataFromIndex = $status->getValue();

		$indexedPages = [];
		foreach ( $dataFromIndex as $indexInfo ) {
			$indexedPages[$indexInfo->getId()][] = $indexInfo;
		}
		return $indexedPages;
	}

	/**
	 * Print a line for a page found to have no problem, when logging of
	 * sane ids is enabled.
	 *
	 * @param int $pageId
	 * @param string $reason short description of why the page is sane
	 */
	private function sane( $pageId, $reason ) {
		if ( $this->logSane ) {
			printf( "%30s %10d\n", $reason, $pageId );
		}
	}
}