Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
80.00% |
108 / 135 |
|
37.50% |
3 / 8 |
CRAP | |
0.00% |
0 / 1 |
CheckSanity | |
80.00% |
108 / 135 |
|
37.50% |
3 / 8 |
24.53 | |
0.00% |
0 / 1 |
execute | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
1 | |||
makeChecker | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
2 | |||
makeIsOldClosure | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
check | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
reformat | |
98.46% |
64 / 65 |
|
0.00% |
0 / 1 |
11 | |||
getAllowedParams | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
1 | |||
isInternal | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getExamplesMessages | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Api; |
4 | |
5 | use CirrusSearch\Connection; |
6 | use CirrusSearch\Sanity\BufferedRemediator; |
7 | use CirrusSearch\Sanity\Checker; |
8 | use CirrusSearch\Sanity\CheckerException; |
9 | use CirrusSearch\Sanity\Remediator; |
10 | use CirrusSearch\SearchConfig; |
11 | use CirrusSearch\Searcher; |
12 | use CirrusSearch\Util; |
13 | use MediaWiki\Api\ApiBase; |
14 | use MediaWiki\WikiMap\WikiMap; |
15 | use Wikimedia\ParamValidator\ParamValidator; |
16 | use WikiMedia\ParamValidator\TypeDef\IntegerDef; |
17 | |
18 | /** |
19 | * Validates the sanity of the search indexes for a range of page ids |
20 | * |
21 | * Invokes the cirrus sanity checker, which compares the current state of a |
22 | * range of page ids in the sql database against the elasticsearch indexes. |
23 | * Reports on issues found such as missing pages, pages that should have |
24 | * been deleted, and old versions in the search index. |
25 | * |
26 | * Also offers a constant rerender-over-time through the sequenceid and |
27 | * rerenderfrequency options. The sequenceid should be incremented each |
28 | * time the same set of page ids is sent to the checker. A subset of |
29 | * the page ids will be emitted as `oldDocument` in each batch, such that |
30 | * after `rerenderfrequency` increments of `sequenceid` all pages will |
31 | * have been rerendered. The purpose of the over-time rerender is to |
32 | * ensure changes to how pages are rendered make it into the search indexes |
33 | * within an expected timeframe. |
34 | * |
35 | * This program is free software; you can redistribute it and/or modify |
36 | * it under the terms of the GNU General Public License as published by |
37 | * the Free Software Foundation; either version 2 of the License, or |
38 | * (at your option) any later version. |
39 | * |
40 | * This program is distributed in the hope that it will be useful, |
41 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
42 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
43 | * GNU General Public License for more details. |
44 | * |
45 | * You should have received a copy of the GNU General Public License along |
46 | * with this program; if not, write to the Free Software Foundation, Inc., |
47 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
48 | * http://www.gnu.org/copyleft/gpl.html |
49 | */ |
class CheckSanity extends ApiBase {
	use ApiTrait;

	/**
	 * Run the sanity checker over the requested page id range and add the
	 * wiki id, the cluster group and the list of detected problems to the
	 * api result.
	 */
	public function execute() {
		$cluster = $this->getParameter( 'cluster' );
		// The checked range is inclusive on both ends.
		$firstPageId = $this->getParameter( 'from' );
		$lastPageId = $firstPageId + $this->getParameter( 'limit' ) - 1;

		// Whatever the checker reports is collected on the remediator and
		// read back once the whole range has been checked.
		$remediator = new BufferedRemediator();
		$checker = $this->makeChecker( $cluster, $remediator );
		$this->check( $checker, $firstPageId, $lastPageId );
		$actions = $remediator->getActions();

		$output = $this->getResult();
		$output->addValue( null, 'wikiId', WikiMap::getCurrentWikiId() );
		$clusterAssignment = $this->getSearchConfig()->getClusterAssignment();
		$output->addValue( null, 'clusterGroup', $clusterAssignment->getCrossClusterName() );
		$output->addValue( null, 'problems', $this->reformat( $actions ) );
	}

	/**
	 * Build the checker used for this request.
	 *
	 * @param string $cluster Name of the cluster to open the connection against
	 * @param Remediator $remediator Receives the problems found by the checker
	 * @return Checker
	 */
	protected function makeChecker( string $cluster, Remediator $remediator ): Checker {
		$config = $this->getSearchConfig();
		$pool = Connection::getPool( $config, $cluster );

		return new Checker(
			$config,
			$pool,
			$remediator,
			new Searcher( $pool, 0, 0, $config, [], null ),
			Util::getStatsFactory(),
			false, // logSane
			false, // fastRedirectCheck
			null, // pageCache
			$this->makeIsOldClosure()
		);
	}

	/**
	 * Build the closure handed to the checker for the "old document" check.
	 *
	 * @return \Closure|null Null when the request carries no sequenceid,
	 *  otherwise whatever Checker::makeIsOldClosure() builds from the
	 *  sequenceid and rerenderfrequency parameters.
	 */
	private function makeIsOldClosure(): ?\Closure {
		$sequenceId = $this->getParameter( 'sequenceid' );

		return $sequenceId === null
			? null
			: Checker::makeIsOldClosure(
				$sequenceId,
				$this->getParameter( 'rerenderfrequency' )
			);
	}

	/**
	 * Hand every page id between $start and $end (both inclusive) to the
	 * checker, $batchSize ids at a time.
	 *
	 * @param Checker $checker
	 * @param int $start First page id to check
	 * @param int $end Last page id to check
	 * @param int $batchSize Number of page ids per Checker::check() call
	 */
	private function check( Checker $checker, int $start, int $end, int $batchSize = 10 ) {
		$allPageIds = range( $start, $end );
		foreach ( array_chunk( $allPageIds, $batchSize ) as $batch ) {
			try {
				$checker->check( $batch );
			} catch ( CheckerException $failure ) {
				// Mostly seen when loading data failed transiently; the
				// caller is expected to retry the request.
				$this->dieWithException( $failure );
			}
		}
	}

	/**
	 * Convert the problems recorded by the sanity checker into plain arrays
	 * for the api output.
	 *
	 * Only index names, the problem type and numeric page/namespace ids are
	 * emitted, on purpose, to avoid responding with any user generated data.
	 * As a list of page ids and index states this shouldn't be capable of
	 * leaking information that's not already known.
	 *
	 * @param array $problems List of [ problem name, arguments ] pairs
	 * @return array[] One entry per problem
	 */
	private function reformat( array $problems ): array {
		$config = $this->getSearchConfig();
		$baseName = $config->get( SearchConfig::INDEX_BASE_NAME );
		// Generic connection for resolving index names, it's always the same everywhere
		$pool = Connection::getPool( $config );
		$indexNameFor = static function ( $suffix ) use ( $pool, $baseName ) {
			return $pool->getIndexName( $baseName, $suffix );
		};

		$clean = [];
		foreach ( $problems as [ $problem, $args ] ) {
			switch ( $problem ) {
				case 'redirectInIndex':
					[ $docId, $page, $indexSuffix ] = $args;
					$target = $page->getRedirectTarget();
					$entry = [
						'indexName' => $indexNameFor( $indexSuffix ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					// A redirect may point at a Special page or at another
					// wiki; target details only make sense for pages that
					// can exist locally.
					if ( $target != null && $target->canExist() ) {
						$targetSuffix = $pool->getIndexSuffixForNamespace( $target->getNamespace() );
						$entry['target'] = [
							'pageId' => $target->getId(),
							'namespaceId' => $target->getNamespace(),
							'indexName' => $indexNameFor( $targetSuffix ),
						];
					}
					$clean[] = $entry;
					break;

				case 'pageNotInIndex':
				case 'oldDocument':
					[ $page ] = $args;
					$expectedSuffix = $pool->getIndexSuffixForNamespace( $page->getNamespace() );
					$clean[] = [
						'indexName' => $indexNameFor( $expectedSuffix ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					break;

				case 'ghostPageInIndex':
					[ $docId, $title ] = $args;
					$expectedSuffix = $pool->getIndexSuffixForNamespace( $title->getNamespace() );
					$clean[] = [
						'indexName' => $indexNameFor( $expectedSuffix ),
						'errorType' => $problem,
						'pageId' => (int)$docId,
						'namespaceId' => $title->getNamespace(),
					];
					break;

				case 'pageInWrongIndex':
					[ $docId, $page, $wrongIndexSuffix ] = $args;
					$expectedSuffix = $pool->getIndexSuffixForNamespace( $page->getNamespace() );
					$clean[] = [
						'wrongIndexName' => $indexNameFor( $wrongIndexSuffix ),
						'indexName' => $indexNameFor( $expectedSuffix ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					break;

				case 'oldVersionInIndex':
					// Unlike most other problems, this one carries the
					// index suffix in its arguments.
					[ $docId, $page, $indexSuffix ] = $args;
					$clean[] = [
						'indexName' => $indexNameFor( $indexSuffix ),
						'errorType' => $problem,
						'pageId' => $page->getId(),
						'namespaceId' => $page->getNamespace(),
					];
					break;

				default:
					$this->dieDebug( __METHOD__, "Unknown remediation: $problem" );
			}
		}

		return $clean;
	}

	/** @inheritDoc */
	public function getAllowedParams() {
		$assignment = $this->getSearchConfig()->getClusterAssignment();
		$params = [];

		$params['cluster'] = [
			ParamValidator::PARAM_DEFAULT => $assignment->getSearchCluster(),
			ParamValidator::PARAM_TYPE => $assignment->getAllKnownClusters(),
		];

		$params['from'] = [
			ParamValidator::PARAM_TYPE => 'integer',
			ParamValidator::PARAM_REQUIRED => true,
			IntegerDef::PARAM_MIN => 0,
		];

		$params['limit'] = [
			ParamValidator::PARAM_DEFAULT => 100,
			ParamValidator::PARAM_TYPE => 'limit',
			IntegerDef::PARAM_MIN => 1,
			IntegerDef::PARAM_MAX => ApiBase::LIMIT_BIG1,
			IntegerDef::PARAM_MAX2 => ApiBase::LIMIT_BIG2
		];

		// Callers have to bump the sequenceid every time they re-run the
		// sanity check over the same set of pages. The pages of a batch are
		// then reported as `oldDocument` spread across `rerenderfrequency`
		// invocations of the api. That gives a slow, constant rerender of
		// all content and keeps the search indices aligned with changes to
		// the indexing and rendering code.
		$params['sequenceid'] = [
			// Supplying a value turns on the "old document" checks that
			// drive the constant re-rendering over time.
			ParamValidator::PARAM_TYPE => 'integer',
		];

		// How often a page gets flagged as `oldDocument`. A caller that
		// scans every page id once a week would, with the default of 16,
		// see each existing page flagged once over the course of 16 weeks.
		$params['rerenderfrequency'] = [
			ParamValidator::PARAM_DEFAULT => 16,
			ParamValidator::PARAM_TYPE => 'integer',
			IntegerDef::PARAM_MIN => 2,
		];

		return $params;
	}

	/**
	 * This module is flagged as internal; it isn't meant for normal api users.
	 * @return bool Always true
	 */
	public function isInternal() {
		return true;
	}

	/**
	 * @see ApiBase::getExamplesMessages
	 * @return array Example query string => help message key
	 */
	protected function getExamplesMessages() {
		$examples = [];
		$examples['action=cirrus-sanity-check&from=0&limit=100'] =
			'apihelp-cirrus-check-sanity-example';

		return $examples;
	}

}