Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
80.45% |
107 / 133 |
|
37.50% |
3 / 8 |
CRAP | |
0.00% |
0 / 1 |
CheckSanity | |
80.45% |
107 / 133 |
|
37.50% |
3 / 8 |
21.70 | |
0.00% |
0 / 1 |
execute | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
1 | |||
makeChecker | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
2 | |||
makeIsOldClosure | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
check | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
reformat | |
98.44% |
63 / 64 |
|
0.00% |
0 / 1 |
9 | |||
getAllowedParams | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
1 | |||
isInternal | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getExamplesMessages | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Api; |
4 | |
5 | use ApiBase; |
6 | use CirrusSearch\Connection; |
7 | use CirrusSearch\Sanity\BufferedRemediator; |
8 | use CirrusSearch\Sanity\Checker; |
9 | use CirrusSearch\Sanity\CheckerException; |
10 | use CirrusSearch\Sanity\Remediator; |
11 | use CirrusSearch\SearchConfig; |
12 | use CirrusSearch\Searcher; |
13 | use MediaWiki\WikiMap\WikiMap; |
14 | use Wikimedia\ParamValidator\ParamValidator; |
15 | use WikiMedia\ParamValidator\TypeDef\IntegerDef; |
16 | |
17 | /** |
18 | * Validates the sanity of the search indexes for a range of page ids |
19 | * |
20 | * Invokes the cirrus sanity checker, which compares the current state of a |
21 | * range of page ids in the sql database against the elasticsearch indexes. |
22 | * Reports on issues found such as missing pages, pages that should have |
23 | * been deleted, and old versions in the search index. |
24 | * |
25 | * Also offers a constant rerender-over-time through the sequenceid and |
26 | * rerenderfrequency options. The sequenceid should be incremented each |
27 | * time the same set of page ids is sent to the checker. A subset of |
28 | * the page ids will be emitted as `oldDocument` in each batch, such that |
29 | * after `rerenderfrequency` increments of `sequenceid` all pages will |
30 | * have been rerendered. The purpose of the over-time rerender is to |
31 | * ensure changes to how pages are rendered make it into the search indexes |
32 | * within an expected timeframe. |
33 | * |
34 | * This program is free software; you can redistribute it and/or modify |
35 | * it under the terms of the GNU General Public License as published by |
36 | * the Free Software Foundation; either version 2 of the License, or |
37 | * (at your option) any later version. |
38 | * |
39 | * This program is distributed in the hope that it will be useful, |
40 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
41 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
42 | * GNU General Public License for more details. |
43 | * |
44 | * You should have received a copy of the GNU General Public License along |
45 | * with this program; if not, write to the Free Software Foundation, Inc., |
46 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
47 | * http://www.gnu.org/copyleft/gpl.html |
48 | */ |
class CheckSanity extends ApiBase {
	use ApiTrait;

	/**
	 * Run the sanity checker over the requested page id range and report
	 * the problems it found.
	 *
	 * Adds the `wikiId`, `clusterGroup` and `problems` values to the api result.
	 */
	public function execute() {
		$cluster = $this->getParameter( 'cluster' );
		// Start and end values are inclusive
		$start = $this->getParameter( 'from' );
		$end = $start + $this->getParameter( 'limit' ) - 1;

		// Problems reported by the checker are buffered in the remediator
		// and read back through getActions() once the whole range is done.
		$remediator = new BufferedRemediator();
		$this->check( $this->makeChecker( $cluster, $remediator ), $start, $end );
		$problems = $remediator->getActions();

		$result = $this->getResult();
		$result->addValue( null, 'wikiId', WikiMap::getCurrentWikiId() );
		$result->addValue(
			null, 'clusterGroup',
			$this->getSearchConfig()->getClusterAssignment()->getCrossClusterName() );
		$result->addValue( null, 'problems', $this->reformat( $problems ) );
	}

	/**
	 * Build the Checker that inspects the given cluster and reports
	 * everything it finds to the provided remediator.
	 *
	 * @param string $cluster Name of the cluster to check
	 * @param Remediator $remediator Receiver of the problems found
	 * @return Checker
	 */
	protected function makeChecker( string $cluster, Remediator $remediator ): Checker {
		$searchConfig = $this->getSearchConfig();
		$connection = Connection::getPool( $searchConfig, $cluster );
		$searcher = new Searcher( $connection, 0, 0, $searchConfig, [], null );

		return new Checker(
			$searchConfig,
			$connection,
			$remediator,
			$searcher,
			false, // logSane
			false, // fastRedirectCheck
			null, // pageCache
			$this->makeIsOldClosure()
		);
	}

	/**
	 * Build the "is old" decision handed to the Checker.
	 *
	 * The `oldDocument` checks are only enabled when the caller provides
	 * a `sequenceid`.
	 *
	 * @return callable|null Null when no `sequenceid` was provided, otherwise
	 *  whatever Checker::makeIsOldClosure() builds (presumably a closure,
	 *  confirm against Checker).
	 */
	private function makeIsOldClosure() {
		$sequenceId = $this->getParameter( 'sequenceid' );
		if ( $sequenceId === null ) {
			return null;
		}
		return Checker::makeIsOldClosure(
			$sequenceId,
			$this->getParameter( 'rerenderfrequency' )
		);
	}

	/**
	 * Feed the page id range to the checker in fixed-size batches.
	 *
	 * @param Checker $checker
	 * @param int $start First page id to check, inclusive
	 * @param int $end Last page id to check, inclusive
	 * @param int $batchSize Number of page ids passed to the checker per call
	 */
	private function check( Checker $checker, int $start, int $end, int $batchSize = 10 ) {
		$ranges = array_chunk( range( $start, $end ), $batchSize );
		foreach ( $ranges as $pageIds ) {
			try {
				$checker->check( $pageIds );
			} catch ( CheckerException $e ) {
				// This mostly happens when there is a transient data loading problem.
				// The request should be retried.
				$this->dieWithException( $e );
			}
		}
	}

	/**
	 * Reformat Saneitizer problems for output
	 *
	 * Intentionally only emits numeric ids to avoid responding with
	 * any user generated data. As a list of page ids and index states
	 * this shouldn't be capable of leaking information that's not already
	 * known.
	 *
	 * @param array[] $problems List of `[ problem name, arguments ]` pairs
	 * @return array[] One entry per problem, in the order received
	 */
	private function reformat( array $problems ): array {
		$clean = [];
		$indexBaseName = $this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME );
		// Generic connection for resolving index names, it's always the same everywhere
		$connection = Connection::getPool( $this->getSearchConfig() );
		// Resolves the name of the index that holds pages of the given namespace.
		$indexNameForNamespace = static function ( $namespaceId ) use ( $connection, $indexBaseName ) {
			return $connection->getIndexName(
				$indexBaseName,
				$connection->getIndexSuffixForNamespace( $namespaceId )
			);
		};

		foreach ( $problems as [ $problem, $args ] ) {
			switch ( $problem ) {
				case 'redirectInIndex':
					[ $page ] = $args;
					$target = $page->getRedirectTarget();
					$clean[] = [
						'indexName' => $indexNameForNamespace( $page->getNamespace() ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
						'target' => [
							'pageId' => $target->getId(),
							'namespaceId' => $target->getNamespace(),
							'indexName' => $indexNameForNamespace( $target->getNamespace() ),
						],
					];
					break;

				case 'pageNotInIndex':
				case 'oldDocument':
					[ $page ] = $args;
					$clean[] = [
						'indexName' => $indexNameForNamespace( $page->getNamespace() ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					break;

				case 'ghostPageInIndex':
					// No page object is provided here, the page id is
					// taken from the document id instead.
					[ $docId, $title ] = $args;
					$clean[] = [
						'indexName' => $indexNameForNamespace( $title->getNamespace() ),
						'errorType' => $problem,
						'pageId' => (int)$docId,
						'namespaceId' => $title->getNamespace(),
					];
					break;

				case 'pageInWrongIndex':
					// The leading document id argument is not needed, skip it.
					[ , $page, $wrongIndexSuffix ] = $args;
					$clean[] = [
						'wrongIndexName' => $connection->getIndexName( $indexBaseName, $wrongIndexSuffix ),
						'indexName' => $indexNameForNamespace( $page->getNamespace() ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					break;

				case 'oldVersionInIndex':
					// kinda random this one provides the suffix directly.
					// The leading document id argument is not needed, skip it.
					[ , $page, $indexSuffix ] = $args;
					$clean[] = [
						'indexName' => $connection->getIndexName( $indexBaseName, $indexSuffix ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					break;

				default:
					$this->dieDebug( __METHOD__, "Unknown remediation: $problem" );
			}
		}

		return $clean;
	}

	/**
	 * @see ApiBase::getAllowedParams
	 * @return array
	 */
	public function getAllowedParams() {
		$assignment = $this->getSearchConfig()->getClusterAssignment();
		return [
			'cluster' => [
				ParamValidator::PARAM_DEFAULT => $assignment->getSearchCluster(),
				ParamValidator::PARAM_TYPE => $assignment->getAllKnownClusters(),
			],
			'from' => [
				ParamValidator::PARAM_TYPE => 'integer',
				ParamValidator::PARAM_REQUIRED => true,
				IntegerDef::PARAM_MIN => 0,
			],
			'limit' => [
				ParamValidator::PARAM_DEFAULT => 100,
				ParamValidator::PARAM_TYPE => 'limit',
				IntegerDef::PARAM_MIN => 1,
				IntegerDef::PARAM_MAX => ApiBase::LIMIT_BIG1,
				IntegerDef::PARAM_MAX2 => ApiBase::LIMIT_BIG2
			],
			// The caller must increment the sequenceid each successive
			// time it invokes the sanity check for the same set of pages.
			// Pages within the batch will emit an `oldDocument` problem
			// spread over `rerenderfrequency` invocations of the api.
			// This supports a slow and constant rerender of all content,
			// ensuring the search indices stay aligned with changes to
			// indexing and rendering code.
			'sequenceid' => [
				// Providing this enables the "old document" checks
				// which provide constant re-rendering over time.
				ParamValidator::PARAM_TYPE => 'integer',
			],
			// Controls how often a page is flagged with the `oldDocument`
			// problem. If the caller scans all page ids every week, then
			// the default value of 16 would emit an `oldDocument` problem
			// for all existing pages spread over 16 weeks.
			'rerenderfrequency' => [
				ParamValidator::PARAM_DEFAULT => 16,
				ParamValidator::PARAM_TYPE => 'integer',
				IntegerDef::PARAM_MIN => 2,
			]
		];
	}

	/**
	 * Mark as internal. This isn't meant to be used by normal api users
	 * @return bool
	 */
	public function isInternal() {
		return true;
	}

	/**
	 * @see ApiBase::getExamplesMessages
	 * @return array
	 */
	protected function getExamplesMessages() {
		return [
			'action=cirrus-sanity-check&from=0&limit=100' =>
				'apihelp-cirrus-check-sanity-example',
		];
	}

}