Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 135 |
|
0.00% |
0 / 14 |
CRAP | |
0.00% |
0 / 1 |
| Checker | |
0.00% |
0 / 135 |
|
0.00% |
0 / 14 |
1980 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
2 | |||
| makeIsOldClosure | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| check | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
72 | |||
| getCounter | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| checkExisitingPage | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
30 | |||
| checkIfRedirect | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
| checkInexistentPage | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
| checkPageInIndex | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
| checkIndexMismatch | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
20 | |||
| checkIndexedVersion | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
| loadPagesFromDB | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
30 | |||
| getDB | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| loadPagesFromIndex | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
| sane | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | */ |
| 5 | |
| 6 | namespace CirrusSearch\Sanity; |
| 7 | |
| 8 | use ArrayObject; |
| 9 | use CirrusSearch\Connection; |
| 10 | use CirrusSearch\SearchConfig; |
| 11 | use CirrusSearch\Searcher; |
| 12 | use MediaWiki\MediaWikiServices; |
| 13 | use MediaWiki\Page\WikiPage; |
| 14 | use MediaWiki\Title\Title; |
| 15 | use Wikimedia\Stats\Metrics\CounterMetric; |
| 16 | use Wikimedia\Stats\Metrics\NullMetric; |
| 17 | use Wikimedia\Stats\StatsFactory; |
| 18 | |
/**
 * Compares the database view of a batch of wiki pages with the documents
 * found for them in the search index, and hands every discrepancy over to
 * a Remediator.
 */
class Checker {
	/**
	 * @var SearchConfig Used to derive document ids from page ids
	 */
	private $searchConfig;

	/**
	 * @var Connection Provides the cluster name and index suffix helpers
	 */
	private $connection;

	/**
	 * @var Searcher Used for fetching data, so we can check the content.
	 */
	private $searcher;

	/**
	 * @var Remediator Receives the problems we found. The remediator given to
	 *  the constructor is wrapped in a CountingRemediator.
	 */
	private $remediator;

	/**
	 * @var StatsFactory Used to record stats about the process
	 */
	private StatsFactory $statsFactory;

	/**
	 * @var bool Should we log ids that are found to have no problems
	 */
	private $logSane;

	/**
	 * @var bool When true inspect WikiPage::isRedirect() instead of
	 *  WikiPage::getContent()->isRedirect(). Faster since it does not need to
	 *  fetch the content but inconsistent in some cases.
	 */
	private $fastRedirectCheck;

	/**
	 * Optional cache of the pages loaded by loadPagesFromDB( $pageIds ). Only
	 * useful when several Checker instances are run against different
	 * elastic clusters.
	 * @var ArrayObject|null
	 */
	private $pageCache;

	/**
	 * @var callable Accepts a WikiPage argument and returns boolean true if the page
	 *  should be reindexed based on time since last reindex.
	 */
	private $isOldFn;

	/**
	 * Build the checker.
	 *
	 * @param SearchConfig $config
	 * @param Connection $connection
	 * @param Remediator $remediator the remediator to which problems are
	 *  reported; it is wrapped so that each reported problem also bumps a
	 *  "fixed" counter
	 * @param Searcher $searcher searcher to use for fetches
	 * @param StatsFactory $statsFactory to use for recording metrics
	 * @param bool $logSane should we log sane ids
	 * @param bool $fastRedirectCheck fast but inconsistent redirect check
	 * @param ArrayObject|null $pageCache cache for WikiPage loaded from db
	 * @param callable|null $isOldFn Accepts a WikiPage argument and returns boolean true if the page
	 *  should be reindexed based on time since last reindex. When omitted no
	 *  page is ever considered old.
	 */
	public function __construct(
		SearchConfig $config,
		Connection $connection,
		Remediator $remediator,
		Searcher $searcher,
		StatsFactory $statsFactory,
		$logSane,
		$fastRedirectCheck,
		?ArrayObject $pageCache = null,
		?callable $isOldFn = null
	) {
		$this->searchConfig = $config;
		$this->searcher = $searcher;
		$this->pageCache = $pageCache;
		$this->logSane = $logSane;
		$this->fastRedirectCheck = $fastRedirectCheck;

		// getCounter() reads the connection and the stats factory, so both
		// are assigned before the counting wrapper is created.
		$this->connection = $connection;
		$this->statsFactory = $statsFactory;
		$this->remediator = new CountingRemediator(
			$remediator,
			fn ( string $problem ) => $this->getCounter( "fixed", $problem )
		);

		$this->isOldFn = $isOldFn ?? static fn ( WikiPage $page ) => false;
	}

	/**
	 * Decide if a document should be reindexed based on time since last reindex
	 *
	 * A page is considered old once every $numCycles loops of the saneitizer
	 * over the same document: this happens on the loops where the page id
	 * and the loop id have the same remainder modulo $numCycles. This ensures
	 * documents have been reindexed within the last
	 * `$numCycles * actual_loop_duration` (note that the configured duration
	 * is min_loop_duration, but in practice configuration ensures min and
	 * actual are typically the same).
	 *
	 * @param int $loopId The number of times the checker has looped over
	 *  the document set.
	 * @param int $numCycles The number of loops after which a document
	 *  is considered old.
	 * @return \Closure
	 */
	public static function makeIsOldClosure( $loopId, $numCycles ) {
		$targetRemainder = $loopId % $numCycles;
		return static fn ( WikiPage $page ) => $page->getId() % $numCycles === $targetRemainder;
	}

	/**
	 * Check a batch of pages and report every problem found to the remediator.
	 *
	 * @param int[] $pageIds ids of the pages to check
	 * @return int the number of pages for which a problem was reported. Pages
	 *  that were only flagged as old are not included.
	 * @throws CheckerException
	 */
	public function check( array $pageIds ) {
		$docIds = array_map( [ $this->searchConfig, 'makeId' ], $pageIds );

		$pagesFromDb = $this->loadPagesFromDB( $pageIds );
		$pagesFromIndex = $this->loadPagesFromIndex( $docIds );

		$fixedCount = 0;
		$oldCount = 0;
		foreach ( array_combine( $pageIds, $docIds ) as $pageId => $docId ) {
			$indexHits = $pagesFromIndex[$docId] ?? [];
			$page = $pagesFromDb[$pageId] ?? null;

			if ( $page === null ) {
				$wasFixed = $this->checkInexistentPage( $docId, $pageId, $indexHits );
			} else {
				$wasFixed = $this->checkExisitingPage( $docId, $pageId, $page, $indexHits );
				// Pages without any problem may still be due for a periodic reindex.
				if ( !$wasFixed && ( $this->isOldFn )( $page ) && !$page->isRedirect() ) {
					$this->remediator->oldDocument( $page );
					$oldCount++;
				}
			}

			if ( $wasFixed ) {
				$fixedCount++;
			}
		}

		$this->getCounter( "checked" )->incrementBy( count( $pageIds ) );
		// Duplicates the "fixed" counter carrying the
		// "problem => oldDocument" label. It can be removed once
		// dashboards have transitioned away from statsd.
		$this->getCounter( "old" )->incrementBy( $oldCount );

		return $fixedCount;
	}

	/**
	 * Fetch the "sanitization_total" counter labelled with the problem, the
	 * name of the search cluster and the action.
	 *
	 * @param string $action
	 * @param string $problem
	 * @return CounterMetric|NullMetric
	 */
	private function getCounter( string $action, string $problem = "n/a" ) {
		$clusterName = $this->connection->getClusterName();
		$metric = $this->statsFactory->getCounter( "sanitization_total" );
		// Each setLabel() result is kept: the chain continues on whatever
		// object the previous call returned.
		$metric = $metric->setLabel( "problem", $problem );
		$metric = $metric->setLabel( "search_cluster", $clusterName );
		return $metric->setLabel( "action", $action );
	}

	/**
	 * Check that an existing page is properly indexed:
	 * - report it as missing if it is not a redirect and not in the index
	 * - report it if it is a redirect that is present in the index
	 * - verify it if it is not a redirect and found in the index
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkExisitingPage( $docId, $pageId, $page, array $fromIndex ) {
		$isIndexed = $fromIndex !== [];

		if ( !$this->checkIfRedirect( $page ) ) {
			if ( $isIndexed ) {
				return $this->checkPageInIndex( $docId, $pageId, $page, $fromIndex );
			}
			$this->remediator->pageNotInIndex( $page );
			return true;
		}

		// From here on the page is a redirect.
		if ( !$isIndexed ) {
			$this->sane( $pageId, 'Redirect not in index' );
			return false;
		}

		foreach ( $fromIndex as $hit ) {
			$suffix = $this->connection->extractIndexSuffix( $hit->getIndex() );
			$this->remediator->redirectInIndex( $docId, $page, $suffix );
		}
		return true;
	}

	/**
	 * Check if the page is a redirect, either through WikiPage::isRedirect()
	 * (fast check) or through the page content.
	 *
	 * @param WikiPage $page
	 * @return bool true if $page is a redirect
	 */
	private function checkIfRedirect( $page ) {
		if ( $this->fastRedirectCheck ) {
			return $page->isRedirect();
		}

		$content = $page->getContent();
		// Anything that is not an object (null included) is not a redirect.
		return is_object( $content ) ? $content->isRedirect() : false;
	}

	/**
	 * Check that an inexistent page is not present in the index
	 * and report every document found for it as a ghost page.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkInexistentPage( $docId, $pageId, array $fromIndex ) {
		if ( $fromIndex === [] ) {
			$this->sane( $pageId, 'No ghost' );
			return false;
		}

		foreach ( $fromIndex as $hit ) {
			$title = Title::makeTitleSafe( $hit->namespace, $hit->title );
			if ( $title === null ) {
				// No valid title could be built from the indexed
				// namespace and title, use a placeholder instead.
				$title = Title::makeTitle( NS_SPECIAL, 'Badtitle/InvalidInDBOrElastic' );
			}
			$this->remediator->ghostPageInIndex( $docId, $title );
		}
		return true;
	}

	/**
	 * Check that a page present in the db and in the index
	 * is in the correct index with the latest version.
	 *
	 * The version check only runs when no index mismatch was found.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkPageInIndex( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		$needsFix = $this->checkIndexMismatch( $docId, $page, $fromIndex )
			|| $this->checkIndexedVersion( $docId, $page, $fromIndex );

		if ( !$needsFix ) {
			$this->sane( $pageId, 'Page in index with latest version' );
		}

		return $needsFix;
	}

	/**
	 * Check that a page present in the db and in the index
	 * is properly indexed to the appropriate index by checking its
	 * namespace.
	 *
	 * @param string $docId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkIndexMismatch( $docId, WikiPage $page, array $fromIndex ) {
		$expectedSuffix = $this->connection->getIndexSuffixForNamespace(
			$page->getTitle()->getNamespace()
		);

		$mismatchFound = false;
		foreach ( $fromIndex as $hit ) {
			// The suffix of the index holding the document is taken from the index name.
			$actualSuffix = $this->connection->extractIndexSuffix( $hit->getIndex() );
			if ( $actualSuffix === $expectedSuffix ) {
				continue;
			}
			$this->remediator->pageInWrongIndex( $docId, $page, $actualSuffix );
			$mismatchFound = true;
		}

		return $mismatchFound;
	}

	/**
	 * Check that the indexed version of the page is the
	 * latest version in the database.
	 *
	 * @param string $docId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkIndexedVersion( $docId, WikiPage $page, array $fromIndex ) {
		$latestRevision = $page->getLatest();

		$staleFound = false;
		foreach ( $fromIndex as $hit ) {
			// A document without a version field is treated as version -1.
			$indexedVersion = $hit->getSource()['version'] ?? -1;
			if ( $indexedVersion < $latestRevision ) {
				$suffix = $this->connection->extractIndexSuffix( $hit->getIndex() );
				$this->remediator->oldVersionInIndex( $docId, $page, $suffix );
				$staleFound = true;
			}
		}

		return $staleFound;
	}

	/**
	 * Load the requested pages from the database, skipping the ids already
	 * present in the page cache.
	 *
	 * @param int[] $pageIds
	 * @return WikiPage[] the content of the cache keyed by page id; with a
	 *  shared cache this may include pages loaded by earlier calls
	 */
	private function loadPagesFromDB( array $pageIds ) {
		// Without a cache object provided by the owner a fresh one is built
		// on every call. Building it in the constructor would cause memleaks
		// because there is no automatic pruning of old entries. If a cache
		// object is provided the owner of this Checker instance must take
		// care of the cleaning.
		$cache = $this->pageCache ?? new ArrayObject();

		$missingIds = array_diff( $pageIds, array_keys( $cache->getArrayCopy() ) );
		if ( !$missingIds ) {
			return $cache->getArrayCopy();
		}

		$dbr = $this->getDB();
		$pageQuery = WikiPage::getQueryInfo();

		$rows = $dbr->newSelectQueryBuilder()
			->select( $pageQuery['fields'] )
			->tables( $pageQuery['tables'] )
			->where( [ 'page_id' => $missingIds ] )
			->caller( __METHOD__ )
			->joinConds( $pageQuery['joins'] )
			->fetchResultSet();

		$wikiPageFactory = MediaWikiServices::getInstance()->getWikiPageFactory();
		foreach ( $rows as $row ) {
			$page = $wikiPageFactory->newFromRow( $row );
			$prefixedDbKey = $page->getTitle()->getPrefixedDBkey();
			// The DB may contain invalid titles, make sure we try to sanitize only valid titles.
			// Invalid titles like this may have to wait for a dedicated clean up action.
			if ( Title::newFromDBkey( $prefixedDbKey ) !== null ) {
				$cache->offsetSet( $page->getId(), $page );
			}
		}

		return $cache->getArrayCopy();
	}

	/**
	 * @return \Wikimedia\Rdbms\IReadableDatabase a replica database connection
	 */
	private function getDB() {
		$connectionProvider = MediaWikiServices::getInstance()->getConnectionProvider();
		return $connectionProvider->getReplicaDatabase();
	}

	/**
	 * Fetch the namespace, title and version fields of the given documents.
	 *
	 * @param string[] $docIds document ids
	 * @return \Elastica\Result[][] search results grouped by the id of the
	 *  returned document
	 * @throws CheckerException if an error occurred
	 */
	private function loadPagesFromIndex( array $docIds ) {
		$status = $this->searcher->get( $docIds, [ 'namespace', 'title', 'version' ], false );
		if ( !$status->isOK() ) {
			throw new CheckerException( 'Cannot fetch ids from index' );
		}

		/** @var \Elastica\ResultSet $resultSet */
		$resultSet = $status->getValue();

		// The same id may be returned more than once, so every id maps to a list.
		$hitsById = [];
		foreach ( $resultSet as $hit ) {
			$hitsById[$hit->getId()][] = $hit;
		}
		return $hitsById;
	}

	/**
	 * Print the reason why a page was found sane, when logging of sane
	 * pages is enabled.
	 *
	 * @param int $pageId
	 * @param string $reason
	 */
	private function sane( int $pageId, string $reason ) {
		if ( !$this->logSane ) {
			return;
		}
		printf( "%30s %10d\n", $reason, $pageId );
	}
}