Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 125 |
|
0.00% |
0 / 13 |
CRAP | |
0.00% |
0 / 1 |
Checker | |
0.00% |
0 / 125 |
|
0.00% |
0 / 13 |
1722 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
makeIsOldClosure | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
check | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
56 | |||
checkExisitingPage | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
checkIfRedirect | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
checkInexistentPage | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
checkPageInIndex | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
checkIndexMismatch | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
20 | |||
checkIndexedVersion | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
12 | |||
loadPagesFromDB | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
30 | |||
getDB | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
loadPagesFromIndex | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
sane | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Sanity; |
4 | |
5 | use ArrayObject; |
6 | use CirrusSearch\Connection; |
7 | use CirrusSearch\SearchConfig; |
8 | use CirrusSearch\Searcher; |
9 | use MediaWiki\MediaWikiServices; |
10 | use MediaWiki\Title\Title; |
11 | use WikiPage; |
12 | |
13 | /** |
14 | * Checks if a WikiPage's representation in search index is sane. |
15 | * |
16 | * This program is free software; you can redistribute it and/or modify |
17 | * it under the terms of the GNU General Public License as published by |
18 | * the Free Software Foundation; either version 2 of the License, or |
19 | * (at your option) any later version. |
20 | * |
21 | * This program is distributed in the hope that it will be useful, |
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
24 | * GNU General Public License for more details. |
25 | * |
26 | * You should have received a copy of the GNU General Public License along |
27 | * with this program; if not, write to the Free Software Foundation, Inc., |
28 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
29 | * http://www.gnu.org/copyleft/gpl.html |
30 | */ |
31 | |
class Checker {
	/**
	 * @var SearchConfig
	 */
	private $searchConfig;

	/**
	 * @var Connection
	 */
	private $connection;

	/**
	 * @var Searcher Used for fetching data, so we can check the content.
	 */
	private $searcher;

	/**
	 * @var Remediator Do something with the problems we found
	 */
	private $remediator;

	/**
	 * @var bool Should we log ids that are found to have no problems
	 */
	private $logSane;

	/**
	 * @var bool inspect WikiPage::isRedirect() instead of WikiPage::getContent()->isRedirect()
	 * Faster since it does not need to fetch the content but inconsistent in some cases.
	 */
	private $fastRedirectCheck;

	/**
	 * A cache for pages loaded with loadPagesFromDB( $pageIds ). This is only
	 * useful when multiple Checker are run to check different elastic clusters.
	 * @var ArrayObject|null
	 */
	private $pageCache;

	/**
	 * @var callable Accepts a WikiPage argument and returns boolean true if the page
	 *  should be reindexed based on time since last reindex.
	 */
	private $isOldFn;

	/**
	 * Build the checker.
	 * @param SearchConfig $config
	 * @param Connection $connection
	 * @param Remediator $remediator the remediator to which to send titles
	 *   that are insane
	 * @param Searcher $searcher searcher to use for fetches
	 * @param bool $logSane should we log sane ids
	 * @param bool $fastRedirectCheck fast but inconsistent redirect check
	 * @param ArrayObject|null $pageCache cache for WikiPage loaded from db
	 * @param callable|null $isOldFn Accepts a WikiPage argument and returns boolean true if the page
	 *  should be reindexed based on time since last reindex. When omitted, no page
	 *  is ever considered old.
	 */
	public function __construct(
		SearchConfig $config,
		Connection $connection,
		Remediator $remediator,
		Searcher $searcher,
		$logSane,
		$fastRedirectCheck,
		?ArrayObject $pageCache = null,
		?callable $isOldFn = null
	) {
		$this->searchConfig = $config;
		$this->connection = $connection;
		$this->remediator = $remediator;
		$this->searcher = $searcher;
		$this->logSane = $logSane;
		$this->fastRedirectCheck = $fastRedirectCheck;
		$this->pageCache = $pageCache;
		$this->isOldFn = $isOldFn ?? static function ( WikiPage $page ) {
			return false;
		};
	}

	/**
	 * Decide if a document should be reindexed based on time since last reindex
	 *
	 * Consider a page as old every $numCycles times the saneitizer loops over
	 * the same document. This ensures documents have been reindexed within the
	 * last `$numCycles * actual_loop_duration` (note that the configured
	 * duration is min_loop_duration, but in practice configuration ensures min
	 * and actual are typically the same).
	 *
	 * A page is reported as old when `pageId % $numCycles` equals
	 * `$loopId % $numCycles`.
	 *
	 * @param int $loopId The number of times the checker has looped over
	 *  the document set.
	 * @param int $numCycles The number of loops after which a document
	 *  is considered old. Must be non-zero: it is used as a modulo divisor.
	 * @return \Closure
	 */
	public static function makeIsOldClosure( $loopId, $numCycles ) {
		$loopMod = $loopId % $numCycles;
		return static function ( \WikiPage $page ) use ( $numCycles, $loopMod ) {
			// Both sides are results of the integer modulo operator, so a
			// strict comparison is safe here.
			return $page->getId() % $numCycles === $loopMod;
		};
	}

	/**
	 * Check a batch of pages, comparing their state in the database with
	 * their state in the search index, and hand every discrepancy to the
	 * remediator.
	 *
	 * Pages that needed no fix but are flagged by the "is old" callback are
	 * reported to the remediator as old documents. Counters for fixed, checked
	 * and old pages are sent to statsd under the name of the current cluster.
	 *
	 * @param int[] $pageIds pages to check
	 * @return int the number of pages updated
	 * @throws CheckerException
	 */
	public function check( array $pageIds ) {
		$docIds = array_map( [ $this->searchConfig, 'makeId' ], $pageIds );

		$pagesFromDb = $this->loadPagesFromDB( $pageIds );
		$pagesFromIndex = $this->loadPagesFromIndex( $docIds );
		$nbPagesFixed = 0;
		$nbPagesOld = 0;
		foreach ( array_combine( $pageIds, $docIds ) as $pageId => $docId ) {
			// Every copy of the document found in the index, possibly none.
			$fromIndex = $pagesFromIndex[$docId] ?? [];

			if ( isset( $pagesFromDb[$pageId] ) ) {
				$page = $pagesFromDb[$pageId];
				$updated = $this->checkExisitingPage( $docId, $pageId, $page, $fromIndex );
				// Only pages that did not already need a fix are considered
				// for the periodic "old document" reindex.
				if ( !$updated && ( $this->isOldFn )( $page ) ) {
					$this->remediator->oldDocument( $page );
					$nbPagesOld++;
				}
			} else {
				$updated = $this->checkInexistentPage( $docId, $pageId, $fromIndex );
			}
			if ( $updated ) {
				$nbPagesFixed++;
			}
		}
		$clusterName = $this->connection->getClusterName();
		$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
		$stats->updateCount( "CirrusSearch.$clusterName.sanitization.fixed", $nbPagesFixed );
		$stats->updateCount( "CirrusSearch.$clusterName.sanitization.checked", count( $pageIds ) );
		$stats->updateCount( "CirrusSearch.$clusterName.sanitization.old", $nbPagesOld );
		return $nbPagesFixed;
	}

	/**
	 * Check that an existing page is properly indexed:
	 * - index it if missing in the index
	 * - delete it if it's a redirect
	 * - verify it if found in the index
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkExisitingPage( $docId, $pageId, $page, array $fromIndex ) {
		$inIndex = $fromIndex !== [];
		if ( $this->checkIfRedirect( $page ) ) {
			if ( $inIndex ) {
				$this->remediator->redirectInIndex( $page );
				return true;
			}
			$this->sane( $pageId, 'Redirect not in index' );
			return false;
		}
		if ( $inIndex ) {
			return $this->checkPageInIndex( $docId, $pageId, $page, $fromIndex );
		}
		$this->remediator->pageNotInIndex( $page );
		return true;
	}

	/**
	 * Check if the page is a redirect
	 *
	 * With the fast check enabled the answer comes from WikiPage::isRedirect();
	 * otherwise the page content is fetched and asked directly. A page whose
	 * content is not available is not treated as a redirect.
	 *
	 * @param WikiPage $page
	 * @return bool true if $page is a redirect
	 */
	private function checkIfRedirect( $page ) {
		if ( $this->fastRedirectCheck ) {
			return $page->isRedirect();
		}

		$content = $page->getContent();
		// null (or any other non-object) means there is no content to inspect.
		return is_object( $content ) && $content->isRedirect();
	}

	/**
	 * Check that an inexistent page is not present in the index
	 * and delete it if found
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkInexistentPage( $docId, $pageId, array $fromIndex ) {
		if ( $fromIndex === [] ) {
			$this->sane( $pageId, 'No ghost' );
			return false;
		}

		foreach ( $fromIndex as $r ) {
			// Fall back to a placeholder title when the indexed namespace/title
			// cannot be turned into a valid Title.
			$title = Title::makeTitleSafe( $r->namespace, $r->title ) ??
				Title::makeTitle( NS_SPECIAL, 'Badtitle/InvalidInDBOrElastic' );
			$this->remediator->ghostPageInIndex( $docId, $title );
		}
		return true;
	}

	/**
	 * Check that a page present in the db and in the index
	 * is in the correct index with the latest version.
	 *
	 * The version check is only performed when no index mismatch was found.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkPageInIndex( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		$insane = $this->checkIndexMismatch( $docId, $pageId, $page, $fromIndex );
		if ( !$insane ) {
			$insane = $this->checkIndexedVersion( $docId, $pageId, $page, $fromIndex );
		}

		if ( !$insane ) {
			$this->sane( $pageId, 'Page in index with latest version' );
		}

		return $insane;
	}

	/**
	 * Check that a page present in the db and in the index
	 * is properly indexed to the appropriate index by checking its
	 * namespace.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkIndexMismatch( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		$foundInsanityInIndex = false;
		$expectedSuffix = $this->connection->getIndexSuffixForNamespace(
			$page->getTitle()->getNamespace()
		);
		foreach ( $fromIndex as $indexInfo ) {
			// Got to grab the index type from the index name....
			$suffix = $this->connection->extractIndexSuffix( $indexInfo->getIndex() );
			if ( $suffix !== $expectedSuffix ) {
				$this->remediator->pageInWrongIndex( $docId, $page, $suffix );
				$foundInsanityInIndex = true;
			}
		}

		return $foundInsanityInIndex;
	}

	/**
	 * Check that the indexed version of the page is the
	 * latest version in the database.
	 *
	 * An indexed document without a 'version' field is treated as outdated.
	 *
	 * @param string $docId
	 * @param int $pageId
	 * @param WikiPage $page
	 * @param \Elastica\Result[] $fromIndex
	 * @return bool true if a modification was needed
	 */
	private function checkIndexedVersion( $docId, $pageId, WikiPage $page, array $fromIndex ) {
		$latest = $page->getLatest();
		$foundInsanityInIndex = false;
		foreach ( $fromIndex as $indexInfo ) {
			$version = $indexInfo->getSource()['version'] ?? -1;
			if ( $version < $latest ) {
				$type = $this->connection->extractIndexSuffix( $indexInfo->getIndex() );
				$this->remediator->oldVersionInIndex( $docId, $page, $type );

				$foundInsanityInIndex = true;
			}
		}

		return $foundInsanityInIndex;
	}

	/**
	 * Load the requested pages from the database, reusing the page cache
	 * for ids that are already known.
	 *
	 * @param int[] $pageIds
	 * @return WikiPage[] the content of the page cache (which may hold more
	 *  than the requested pages when a shared cache was provided), indexed
	 *  by page id
	 */
	private function loadPagesFromDB( array $pageIds ) {
		// If no cache object is constructed we build a new one.
		// Building it in the constructor would cause memory leaks because
		// there is no automatic pruning of old entries. If a cache
		// object is provided the owner of this Checker instance must take
		// care of the cleaning.
		$cache = $this->pageCache ?? new ArrayObject();
		$pageIds = array_diff( $pageIds, array_keys( $cache->getArrayCopy() ) );
		if ( !$pageIds ) {
			return $cache->getArrayCopy();
		}
		$dbr = $this->getDB();
		$pageQuery = WikiPage::getQueryInfo();

		$res = $dbr->newSelectQueryBuilder()
			->select( $pageQuery['fields'] )
			->tables( $pageQuery['tables'] )
			->where( [ 'page_id' => $pageIds ] )
			->caller( __METHOD__ )
			->joinConds( $pageQuery['joins'] )
			->fetchResultSet();

		$wikiPageFactory = MediaWikiServices::getInstance()->getWikiPageFactory();
		foreach ( $res as $row ) {
			$page = $wikiPageFactory->newFromRow( $row );
			if ( Title::newFromDBkey( $page->getTitle()->getPrefixedDBkey() ) === null ) {
				// The DB may contain invalid titles, make sure we try to sanitize only valid titles
				// invalid titles like this may have to wait for a dedicated clean up action
				continue;
			}
			$cache->offsetSet( $page->getId(), $page );
		}
		return $cache->getArrayCopy();
	}

	/**
	 * @return \Wikimedia\Rdbms\IReadableDatabase
	 */
	private function getDB() {
		return MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
	}

	/**
	 * Fetch the namespace, title and version of the given documents from
	 * the index.
	 *
	 * @param string[] $docIds document ids
	 * @return \Elastica\Result[][] search results grouped by document id; a
	 *  document id maps to every result returned for it
	 * @throws CheckerException if an error occurred
	 */
	private function loadPagesFromIndex( array $docIds ) {
		$status = $this->searcher->get( $docIds, [ 'namespace', 'title', 'version' ], false );
		if ( !$status->isOK() ) {
			throw new CheckerException( 'Cannot fetch ids from index' );
		}
		/** @var \Elastica\ResultSet $dataFromIndex */
		$dataFromIndex = $status->getValue();

		$indexedPages = [];
		foreach ( $dataFromIndex as $indexInfo ) {
			$indexedPages[$indexInfo->getId()][] = $indexInfo;
		}
		return $indexedPages;
	}

	/**
	 * Report a page that was found to be sane. Prints one line to standard
	 * output when sane logging is enabled, does nothing otherwise.
	 *
	 * @param int $pageId
	 * @param string $reason why the page is considered sane
	 */
	private function sane( $pageId, $reason ) {
		if ( $this->logSane ) {
			printf( "%30s %10d\n", $reason, $pageId );
		}
	}
}