Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
80.45% covered (warning)
80.45%
107 / 133
37.50% covered (danger)
37.50%
3 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
CheckSanity
80.45% covered (warning)
80.45%
107 / 133
37.50% covered (danger)
37.50%
3 / 8
21.70
0.00% covered (danger)
0.00%
0 / 1
 execute
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
1
 makeChecker
0.00% covered (danger)
0.00%
0 / 13
0.00% covered (danger)
0.00%
0 / 1
2
 makeIsOldClosure
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
6
 check
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
3
 reformat
98.44% covered (success)
98.44%
63 / 64
0.00% covered (danger)
0.00%
0 / 1
9
 getAllowedParams
100.00% covered (success)
100.00%
27 / 27
100.00% covered (success)
100.00%
1 / 1
1
 isInternal
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getExamplesMessages
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace CirrusSearch\Api;
4
5use ApiBase;
6use CirrusSearch\Connection;
7use CirrusSearch\Sanity\BufferedRemediator;
8use CirrusSearch\Sanity\Checker;
9use CirrusSearch\Sanity\CheckerException;
10use CirrusSearch\Sanity\Remediator;
11use CirrusSearch\SearchConfig;
12use CirrusSearch\Searcher;
13use MediaWiki\WikiMap\WikiMap;
14use Wikimedia\ParamValidator\ParamValidator;
15use WikiMedia\ParamValidator\TypeDef\IntegerDef;
16
17/**
18 * Validates the sanity of the search indexes for a range of page ids
19 *
20 * Invokes the cirrus sanity checker which compares a range of page ids
21 * current state in the sql database against the elasticsearch indexes.
22 * Reports on issues found such as missing pages, pages that should have
23 * been deleted, and old versions in the search index.
24 *
25 * Also offers a constant rerender-over-time through the sequenceid and
26 * rerenderfrequency options. The sequenceid should be incremented each
27 * time the same set of page ids is sent to the checker. A subset of
28 * the page ids will be emitted as `oldDocument` in each batch, such that
29 * after `rerenderfrequency` increments of `sequenceid` all pages will
30 * have been rerendered. The purpose of the over-time rerender is to
31 * ensure changes to how pages are rendered make it into the search indexes
32 * within an expected timeframe.
33 *
34 * This program is free software; you can redistribute it and/or modify
35 * it under the terms of the GNU General Public License as published by
36 * the Free Software Foundation; either version 2 of the License, or
37 * (at your option) any later version.
38 *
39 * This program is distributed in the hope that it will be useful,
40 * but WITHOUT ANY WARRANTY; without even the implied warranty of
41 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
42 * GNU General Public License for more details.
43 *
44 * You should have received a copy of the GNU General Public License along
45 * with this program; if not, write to the Free Software Foundation, Inc.,
46 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
47 * http://www.gnu.org/copyleft/gpl.html
48 */
class CheckSanity extends ApiBase {
    use ApiTrait;

    /**
     * Run the sanity checker over the requested page id range and report
     * the problems it found.
     *
     * Reads the `cluster`, `from` and `limit` parameters, checks page ids
     * `from` through `from + limit - 1` (both inclusive) and adds the
     * `wikiId`, `clusterGroup` and `problems` values to the api result.
     */
    public function execute() {
        $cluster = $this->getParameter( 'cluster' );
        // Start and end values are inclusive
        $start = $this->getParameter( 'from' );
        $end = $start + $this->getParameter( 'limit' ) - 1;

        // The remediator only records what the checker reports; the
        // recorded actions are read back once all batches have been checked.
        $remediator = new BufferedRemediator();
        $this->check( $this->makeChecker( $cluster, $remediator ), $start, $end );
        $problems = $remediator->getActions();

        $result = $this->getResult();
        $result->addValue( null, 'wikiId', WikiMap::getCurrentWikiId() );
        $result->addValue(
            null, 'clusterGroup',
            $this->getSearchConfig()->getClusterAssignment()->getCrossClusterName() );
        $result->addValue( null, 'problems', $this->reformat( $problems ) );
    }

    /**
     * Build the Checker that reports into the given remediator.
     *
     * @param string $cluster Name of the cluster to obtain a connection for
     * @param Remediator $remediator Receiver of the problems found
     * @return Checker
     */
    protected function makeChecker( string $cluster, Remediator $remediator ): Checker {
        $searchConfig = $this->getSearchConfig();
        $connection = Connection::getPool( $searchConfig, $cluster );
        $searcher = new Searcher( $connection, 0, 0, $searchConfig, [], null );

        return new Checker(
            $searchConfig,
            $connection,
            $remediator,
            $searcher,
            false, // logSane
            false, // fastRedirectCheck
            null, // pageCache
            $this->makeIsOldClosure()
        );
    }

    /**
     * Build the "is this document old" callback handed to the Checker.
     *
     * @return mixed|null Null when the `sequenceid` parameter was not
     *  provided, otherwise whatever Checker::makeIsOldClosure() returns for
     *  the `sequenceid` and `rerenderfrequency` parameters.
     */
    private function makeIsOldClosure() {
        $sequenceId = $this->getParameter( 'sequenceid' );
        if ( $sequenceId === null ) {
            return null;
        }
        return Checker::makeIsOldClosure(
            $sequenceId,
            $this->getParameter( 'rerenderfrequency' )
        );
    }

    /**
     * Feed the inclusive page id range to the checker in batches.
     *
     * @param Checker $checker
     * @param int $start First page id to check
     * @param int $end Last page id to check
     * @param int $batchSize Number of page ids passed to each Checker::check() call
     */
    private function check( Checker $checker, int $start, int $end, int $batchSize = 10 ): void {
        foreach ( array_chunk( range( $start, $end ), $batchSize ) as $pageIds ) {
            try {
                $checker->check( $pageIds );
            } catch ( CheckerException $e ) {
                // This mostly happens when there is a transient data loading problem.
                // The request should be retried.
                $this->dieWithException( $e );
            }
        }
    }

    /**
     * Reformat Saneitizer problems for output
     *
     * Intentionally only emits numeric ids to avoid responding with
     * any user generated data. As a list of page ids and index states
     * this shouldn't be capable of leaking information that's not already
     * known.
     *
     * @param array $problems List of `[ $problem, $args ]` pairs, where
     *  $problem is the remediation name and $args its argument list
     * @return array[] One entry per problem. Every entry carries `indexName`,
     *  `errorType`, `pageId` and `namespaceId`; `redirectInIndex` adds a
     *  `target` sub-array and `pageInWrongIndex` adds `wrongIndexName`.
     */
    private function reformat( array $problems ): array {
        $searchConfig = $this->getSearchConfig();
        $indexBaseName = $searchConfig->get( SearchConfig::INDEX_BASE_NAME );
        // Generic connection for resolving index names, it's always the same everywhere
        $connection = Connection::getPool( $searchConfig );

        // Fields shared by every problem type, in the order they are emitted.
        $describe = static function ( $problem, $pageId, $namespaceId, $indexSuffix )
            use ( $connection, $indexBaseName )
        {
            return [
                'indexName' => $connection->getIndexName( $indexBaseName, $indexSuffix ),
                'errorType' => $problem,
                'pageId' => $pageId,
                'namespaceId' => $namespaceId,
            ];
        };

        $clean = [];
        foreach ( $problems as [ $problem, $args ] ) {
            switch ( $problem ) {
                case 'redirectInIndex':
                    [ $page ] = $args;
                    $target = $page->getRedirectTarget();
                    $entry = $describe(
                        $problem,
                        $page->getId(),
                        $page->getNamespace(),
                        $connection->getIndexSuffixForNamespace( $page->getNamespace() )
                    );
                    $targetIndexSuffix = $connection->getIndexSuffixForNamespace( $target->getNamespace() );
                    $entry['target'] = [
                        'pageId' => $target->getId(),
                        'namespaceId' => $target->getNamespace(),
                        'indexName' => $connection->getIndexName( $indexBaseName, $targetIndexSuffix ),
                    ];
                    $clean[] = $entry;
                    break;

                case 'pageNotInIndex':
                case 'oldDocument':
                    [ $page ] = $args;
                    $clean[] = $describe(
                        $problem,
                        $page->getId(),
                        $page->getNamespace(),
                        $connection->getIndexSuffixForNamespace( $page->getNamespace() )
                    );
                    break;

                case 'ghostPageInIndex':
                    // No page object here: the id comes from the indexed document
                    [ $docId, $title ] = $args;
                    $clean[] = $describe(
                        $problem,
                        (int)$docId,
                        $title->getNamespace(),
                        $connection->getIndexSuffixForNamespace( $title->getNamespace() )
                    );
                    break;

                case 'pageInWrongIndex':
                    // The leading doc id argument is not needed, the page provides the id
                    [ , $page, $wrongIndexSuffix ] = $args;
                    // Array union keeps `wrongIndexName` as the first key of the entry
                    $clean[] = [
                        'wrongIndexName' => $connection->getIndexName( $indexBaseName, $wrongIndexSuffix ),
                    ] + $describe(
                        $problem,
                        $page->getId(),
                        $page->getNamespace(),
                        $connection->getIndexSuffixForNamespace( $page->getNamespace() )
                    );
                    break;

                case 'oldVersionInIndex':
                    // kinda random this one provides the suffix directly
                    [ , $page, $indexSuffix ] = $args;
                    $clean[] = $describe(
                        $problem,
                        $page->getId(),
                        $page->getNamespace(),
                        $indexSuffix
                    );
                    break;

                default:
                    $this->dieDebug( __METHOD__, "Unknown remediation: $problem" );
            }
        }

        return $clean;
    }

    /**
     * Describe the parameters accepted by this api module.
     *
     * @return array[]
     */
    public function getAllowedParams() {
        $assignment = $this->getSearchConfig()->getClusterAssignment();
        return [
            'cluster' => [
                ParamValidator::PARAM_DEFAULT => $assignment->getSearchCluster(),
                ParamValidator::PARAM_TYPE => $assignment->getAllKnownClusters(),
            ],
            'from' => [
                ParamValidator::PARAM_TYPE => 'integer',
                ParamValidator::PARAM_REQUIRED => true,
                IntegerDef::PARAM_MIN => 0,
            ],
            'limit' => [
                ParamValidator::PARAM_DEFAULT => 100,
                ParamValidator::PARAM_TYPE => 'limit',
                IntegerDef::PARAM_MIN => 1,
                IntegerDef::PARAM_MAX => ApiBase::LIMIT_BIG1,
                IntegerDef::PARAM_MAX2 => ApiBase::LIMIT_BIG2
            ],
            // The caller must increment the sequenceid each successive
            // time it invokes the sanity check for the same set of pages.
            // Pages within the batch will emit an `oldDocument` problem
            // spread over `rerenderfrequency` invocations of the api.
            // This supports a slow and constant rerender of all content,
            // ensuring the search indices stay aligned with changes to
            // indexing and rendering code.
            'sequenceid' => [
                // Providing this enables the "old document" checks
                // which provide constant re-rendering over time.
                ParamValidator::PARAM_TYPE => 'integer',
            ],
            // Controls how often a page is flagged with the `oldDocument`
            // problem. If the caller scans all page ids every week, then
            // the default value of 16 would emit an `oldDocument` problem
            // for all existing pages spread over 16 weeks.
            'rerenderfrequency' => [
                ParamValidator::PARAM_DEFAULT => 16,
                ParamValidator::PARAM_TYPE => 'integer',
                IntegerDef::PARAM_MIN => 2,
            ]
        ];
    }

    /**
     * Mark as internal. This isn't meant to be used by normal api users
     * @return bool
     */
    public function isInternal() {
        return true;
    }

    /**
     * @see ApiBase::getExamplesMessages
     * @return array
     */
    protected function getExamplesMessages() {
        return [
            'action=cirrus-sanity-check&from=0&limit=100' =>
                'apihelp-cirrus-check-sanity-example',
        ];
    }

}