Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
82.44% |
108 / 131 |
|
42.86% |
3 / 7 |
CRAP | |
0.00% |
0 / 1 |
| CheckSanity | |
82.44% |
108 / 131 |
|
42.86% |
3 / 7 |
22.16 | |
0.00% |
0 / 1 |
| execute | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
1 | |||
| makeChecker | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
2 | |||
| makeIsOldClosure | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
| check | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| reformat | |
98.46% |
64 / 65 |
|
0.00% |
0 / 1 |
11 | |||
| getAllowedParams | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
1 | |||
| isInternal | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\Api; |
| 4 | |
| 5 | use CirrusSearch\Connection; |
| 6 | use CirrusSearch\Sanity\BufferedRemediator; |
| 7 | use CirrusSearch\Sanity\Checker; |
| 8 | use CirrusSearch\Sanity\CheckerException; |
| 9 | use CirrusSearch\Sanity\Remediator; |
| 10 | use CirrusSearch\SearchConfig; |
| 11 | use CirrusSearch\Searcher; |
| 12 | use CirrusSearch\Util; |
| 13 | use MediaWiki\Api\ApiBase; |
| 14 | use MediaWiki\WikiMap\WikiMap; |
| 15 | use Wikimedia\ParamValidator\ParamValidator; |
| 16 | use WikiMedia\ParamValidator\TypeDef\IntegerDef; |
| 17 | |
| 18 | /** |
| 19 | * Validates the sanity of the search indexes for a range of page ids |
| 20 | * |
| 21 | * Invokes the cirrus sanity checker which compares the current state of a |
| 22 | * range of page ids in the sql database against the elasticsearch indexes. |
| 23 | * Reports on issues found such as missing pages, pages that should have |
| 24 | * been deleted, and old versions in the search index. |
| 25 | * |
| 26 | * Also offers a constant rerender-over-time through the sequenceid and |
| 27 | * rerenderfrequency options. The sequenceid should be incremented each |
| 28 | * time the same set of page ids is sent to the checker. A subset of |
| 29 | * the page ids will be emitted as `oldDocument` in each batch, such that |
| 30 | * after `rerenderfrequency` increments of `sequenceid` all pages will |
| 31 | * have been rerendered. The purpose of the over-time rerender is to |
| 32 | * ensure changes to how pages are rendered make it into the search indexes |
| 33 | * within an expected timeframe. |
| 34 | * |
| 35 | * This program is free software; you can redistribute it and/or modify |
| 36 | * it under the terms of the GNU General Public License as published by |
| 37 | * the Free Software Foundation; either version 2 of the License, or |
| 38 | * (at your option) any later version. |
| 39 | * |
| 40 | * This program is distributed in the hope that it will be useful, |
| 41 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 42 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 43 | * GNU General Public License for more details. |
| 44 | * |
| 45 | * You should have received a copy of the GNU General Public License along |
| 46 | * with this program; if not, write to the Free Software Foundation, Inc., |
| 47 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
| 48 | * http://www.gnu.org/copyleft/gpl.html |
| 49 | */ |
class CheckSanity extends ApiBase {
	use ApiTrait;

	/**
	 * Run the sanity checker over the requested page id range and publish
	 * the outcome on the API result.
	 *
	 * Three top-level values are added to the result: `wikiId`,
	 * `clusterGroup` and `problems` (the reformatted list of everything the
	 * checker reported to the buffered remediator).
	 */
	public function execute() {
		$cluster = $this->getParameter( 'cluster' );
		// Start and end values are inclusive
		$start = $this->getParameter( 'from' );
		$end = $start + $this->getParameter( 'limit' ) - 1;

		// The remediator only records what the checker reports; nothing is
		// fixed here, the recorded actions are returned to the caller instead.
		$remediator = new BufferedRemediator();
		$this->check( $this->makeChecker( $cluster, $remediator ), $start, $end );
		$problems = $remediator->getActions();

		$result = $this->getResult();
		$result->addValue( null, 'wikiId', WikiMap::getCurrentWikiId() );
		$result->addValue(
			null, 'clusterGroup',
			$this->getSearchConfig()->getClusterAssignment()->getCrossClusterName() );
		$result->addValue( null, 'problems', $this->reformat( $problems ) );
	}

	/**
	 * Build the sanity checker for the given cluster.
	 *
	 * Protected rather than private, presumably so tests can substitute
	 * their own checker -- TODO confirm against the test suite.
	 *
	 * @param string $cluster Name of the cluster to check against
	 * @param Remediator $remediator Receives every problem the checker finds
	 * @return Checker
	 */
	protected function makeChecker( string $cluster, Remediator $remediator ): Checker {
		$searchConfig = $this->getSearchConfig();
		$connection = Connection::getPool( $searchConfig, $cluster );
		$searcher = new Searcher( $connection, 0, 0, $searchConfig, [], null );

		return new Checker(
			$searchConfig,
			$connection,
			$remediator,
			$searcher,
			Util::getStatsFactory(),
			false, // logSane
			false, // fastRedirectCheck
			null, // pageCache
			$this->makeIsOldClosure()
		);
	}

	/**
	 * Build the closure the checker uses to decide which pages are reported
	 * as `oldDocument`.
	 *
	 * @return \Closure|null Null when no `sequenceid` was supplied, in which
	 *  case no closure is passed to the checker.
	 */
	private function makeIsOldClosure(): ?\Closure {
		$sequenceId = $this->getParameter( 'sequenceid' );
		if ( $sequenceId === null ) {
			return null;
		}
		return Checker::makeIsOldClosure(
			$sequenceId,
			$this->getParameter( 'rerenderfrequency' )
		);
	}

	/**
	 * Feed the inclusive page id range to the checker in small batches.
	 *
	 * A CheckerException from any batch ends the request through
	 * dieWithException(); later batches are not attempted.
	 *
	 * @param Checker $checker
	 * @param int $start First page id to check (inclusive)
	 * @param int $end Last page id to check (inclusive)
	 * @param int $batchSize Number of page ids handed to the checker per call
	 */
	private function check( Checker $checker, int $start, int $end, int $batchSize = 10 ): void {
		$ranges = array_chunk( range( $start, $end ), $batchSize );
		foreach ( $ranges as $pageIds ) {
			try {
				$checker->check( $pageIds );
			} catch ( CheckerException $e ) {
				// This mostly happens when there is a transient data loading problem.
				// The request should be retried.
				$this->dieWithException( $e );
			}
		}
	}

	/**
	 * Reformat Saneitizer problems for output
	 *
	 * Intentionally only emits numeric ids to avoid responding with
	 * any user generated data. As a list of page ids and index states
	 * this shouldn't be capable of leaking information that's not already
	 * known.
	 *
	 * @param array[] $problems List of [ problem type, arguments ] pairs as
	 *  returned by BufferedRemediator::getActions()
	 * @return array[] One associative array per problem. Each carries
	 *  `indexName`, `errorType`, `pageId` and `namespaceId`;
	 *  `pageInWrongIndex` adds `wrongIndexName` and `redirectInIndex` may
	 *  add a `target` sub-array.
	 */
	private function reformat( array $problems ): array {
		$clean = [];
		$indexBaseName = $this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME );
		// Generic connection for resolving index names, its always the same everywhere
		$connection = Connection::getPool( $this->getSearchConfig() );
		foreach ( $problems as [ $problem, $args ] ) {
			switch ( $problem ) {
				case 'redirectInIndex':
					// The leading doc id argument is not needed for the output.
					[ , $page, $indexSuffix ] = $args;
					$target = $page->getRedirectTarget();
					$entry = [
						'indexName' => $connection->getIndexName( $indexBaseName, $indexSuffix ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					// Page could redirect to a Special page or even another wiki,
					// target information is only useful on pages that exist locally.
					if ( $target !== null && $target->canExist() ) {
						$targetIndexSuffix = $connection->getIndexSuffixForNamespace( $target->getNamespace() );
						$entry['target'] = [
							'pageId' => $target->getId(),
							'namespaceId' => $target->getNamespace(),
							'indexName' => $connection->getIndexName( $indexBaseName, $targetIndexSuffix ),
						];
					}
					$clean[] = $entry;
					break;

				case 'pageNotInIndex':
				case 'oldDocument':
					[ $page ] = $args;
					$indexSuffix = $connection->getIndexSuffixForNamespace( $page->getNamespace() );
					$clean[] = [
						'indexName' => $connection->getIndexName( $indexBaseName, $indexSuffix ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					break;

				case 'ghostPageInIndex':
					// No page object is available here, so the page id is
					// taken from the document id.
					[ $docId, $title ] = $args;
					$indexSuffix = $connection->getIndexSuffixForNamespace( $title->getNamespace() );
					$clean[] = [
						'indexName' => $connection->getIndexName( $indexBaseName, $indexSuffix ),
						'errorType' => $problem,
						'pageId' => (int)$docId,
						'namespaceId' => $title->getNamespace(),
					];
					break;

				case 'pageInWrongIndex':
					[ , $page, $wrongIndexSuffix ] = $args;
					// `indexName` is derived from the page's namespace, while
					// `wrongIndexName` is the index the checker reported.
					$indexSuffix = $connection->getIndexSuffixForNamespace( $page->getNamespace() );
					$clean[] = [
						'wrongIndexName' => $connection->getIndexName( $indexBaseName, $wrongIndexSuffix ),
						'indexName' => $connection->getIndexName( $indexBaseName, $indexSuffix ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					break;

				case 'oldVersionInIndex':
					// kinda random this one provides the suffix directly
					[ , $page, $indexSuffix ] = $args;
					$clean[] = [
						'indexName' => $connection->getIndexName( $indexBaseName, $indexSuffix ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					break;

				default:
					$this->dieDebug( __METHOD__, "Unknown remediation: $problem" );
			}
		}

		return $clean;
	}

	/** @inheritDoc */
	public function getAllowedParams() {
		$assignment = $this->getSearchConfig()->getClusterAssignment();
		return [
			'cluster' => [
				ParamValidator::PARAM_REQUIRED => true,
				ParamValidator::PARAM_TYPE => $assignment->getManagedClusters(),
			],
			'from' => [
				ParamValidator::PARAM_TYPE => 'integer',
				ParamValidator::PARAM_REQUIRED => true,
				IntegerDef::PARAM_MIN => 0,
			],
			'limit' => [
				ParamValidator::PARAM_DEFAULT => 100,
				ParamValidator::PARAM_TYPE => 'limit',
				IntegerDef::PARAM_MIN => 1,
				IntegerDef::PARAM_MAX => ApiBase::LIMIT_BIG1,
				IntegerDef::PARAM_MAX2 => ApiBase::LIMIT_BIG2
			],
			// The caller must increment the sequenceid each successive
			// time it invokes the sanity check for the same set of pages.
			// Pages within the batch will emit an `oldDocument` problem
			// spread over `rerenderfrequency` invocations of the api.
			// This supports a slow and constant rerender of all content,
			// ensuring the search indices stay aligned with changes to
			// indexing and rendering code.
			'sequenceid' => [
				// Providing this enables the "old document" checks
				// which provide constant re-rendering over time.
				ParamValidator::PARAM_TYPE => 'integer',
			],
			// Controls how often a page is flagged with the `oldDocument`
			// problem. If the caller scans all page ids every week, then
			// the default value of 16 would emit an `oldDocument` problem
			// for all existing pages spread over 16 weeks.
			'rerenderfrequency' => [
				ParamValidator::PARAM_DEFAULT => 16,
				ParamValidator::PARAM_TYPE => 'integer',
				IntegerDef::PARAM_MIN => 2,
			]
		];
	}

	/**
	 * Mark as internal. This isn't meant to be used by normal api users
	 * @return bool
	 */
	public function isInternal() {
		return true;
	}

}