Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
66.33% |
65 / 98 |
|
50.00% |
3 / 6 |
CRAP | |
0.00% |
0 / 1 |
| BuildDocument | |
66.33% |
65 / 98 |
|
50.00% |
3 / 6 |
61.11 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
| initialize | |
72.97% |
27 / 37 |
|
0.00% |
0 / 1 |
10.60 | |||
| finalize | |
56.00% |
14 / 25 |
|
0.00% |
0 / 1 |
21.31 | |||
| createBuilders | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
| canUpsert | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| initializeDoc | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\BuildDocument; |
| 4 | |
| 5 | use CirrusSearch\Connection; |
| 6 | use CirrusSearch\Search\CirrusIndexField; |
| 7 | use CirrusSearch\SearchConfig; |
| 8 | use Elastica\Document; |
| 9 | use MediaWiki\Cache\BacklinkCacheFactory; |
| 10 | use MediaWiki\Logger\LoggerFactory; |
| 11 | use MediaWiki\Page\WikiPage; |
| 12 | use MediaWiki\Page\WikiPageFactory; |
| 13 | use MediaWiki\Revision\RevisionAccessException; |
| 14 | use MediaWiki\Revision\RevisionRecord; |
| 15 | use MediaWiki\Revision\RevisionStore; |
| 16 | use MediaWiki\Revision\SlotRecord; |
| 17 | use MediaWiki\Title\TitleFactory; |
| 18 | use MediaWiki\Title\TitleFormatter; |
| 19 | use Wikimedia\Rdbms\IReadableDatabase; |
| 20 | |
/**
 * Orchestrate the process of building an elasticsearch document out of a
 * WikiPage. Document building is performed in two stages, and all properties
 * are provided by PagePropertyBuilder instances chosen by a set of provided
 * flags.
 *
 * The first stage, called initialize, sets up the basic document properties.
 * This stage is executed one time per update and the results are shared
 * between all retry attempts and clusters to be written to. The results of the
 * initialize stage may be written to the job queue, so we try to keep the size
 * of these documents reasonably small. The initialize stage supports batching
 * initialization by the PagePropertyBuilder instances.
 *
 * The second stage of document building, finalize, is called on each attempt
 * to send a document to an elasticsearch cluster. This stage loads the bulk
 * content, potentially megabytes, from mediawiki ParserOutput into the
 * documents.
 *
 * @license GPL-2.0-or-later
 */
class BuildDocument {
	/**
	 * Name of the document hint under which initializeDoc() records the flags,
	 * so that finalize() can recreate the same set of builders.
	 */
	private const HINT_FLAGS = 'BuildDocument_flags';

	// Bit field flags accepted by initialize() and replayed by finalize().
	/** No skip flags set: every property builder is used. */
	public const INDEX_EVERYTHING = 0;
	/** Allow upserting the document even though some builders were skipped. */
	public const INDEX_ON_SKIP = 1;
	/** Do not use the ParserOutputPageProperties builder. */
	public const SKIP_PARSE = 2;
	/** Do not use the RedirectsAndIncomingLinks builder. */
	public const SKIP_LINKS = 4;

	/** @var SearchConfig */
	private $config;
	/** @var Connection */
	private $connection;
	/** @var IReadableDatabase */
	private $db;
	/** @var RevisionStore */
	private $revStore;
	/** @var BacklinkCacheFactory */
	private $backlinkCacheFactory;
	/** @var DocumentSizeLimiter */
	private $documentSizeLimiter;
	/** @var TitleFormatter */
	private $titleFormatter;
	/** @var WikiPageFactory */
	private $wikiPageFactory;
	/** @var TitleFactory */
	private TitleFactory $titleFactory;

	/**
	 * @param Connection $connection Cirrus connection to read page properties from.
	 *  Its SearchConfig is also used as the configuration of this instance.
	 * @param IReadableDatabase $db Wiki database connection to read page properties from
	 * @param RevisionStore $revStore Store for retrieving revisions by id
	 * @param BacklinkCacheFactory $backlinkCacheFactory
	 * @param DocumentSizeLimiter $docSizeLimiter Applied to every document at the end of finalize()
	 * @param TitleFormatter $titleFormatter
	 * @param WikiPageFactory $wikiPageFactory Used to obtain the WikiPage of a RevisionRecord
	 * @param TitleFactory $titleFactory Used to obtain the Title of a revision's page in finalize()
	 */
	public function __construct(
		Connection $connection,
		IReadableDatabase $db,
		RevisionStore $revStore,
		BacklinkCacheFactory $backlinkCacheFactory,
		DocumentSizeLimiter $docSizeLimiter,
		TitleFormatter $titleFormatter,
		WikiPageFactory $wikiPageFactory,
		TitleFactory $titleFactory
	) {
		$this->config = $connection->getConfig();
		$this->connection = $connection;
		$this->db = $db;
		$this->revStore = $revStore;
		$this->backlinkCacheFactory = $backlinkCacheFactory;
		$this->documentSizeLimiter = $docSizeLimiter;
		$this->titleFormatter = $titleFormatter;
		$this->wikiPageFactory = $wikiPageFactory;
		$this->titleFactory = $titleFactory;
	}

	/**
	 * Run the first (initialize) stage for a batch of pages.
	 *
	 * Entries that cannot be turned into a document are logged as a warning
	 * and left out of the result: pages that do not exist, redirects (unless
	 * the configuration asks for redirect documents) and pages without a
	 * revision.
	 *
	 * @param WikiPage[]|RevisionRecord[] $pagesOrRevs List of pages to build documents for. These
	 *  pages must represent concrete pages with content. It is expected that
	 *  redirects and non-existent pages have been resolved.
	 * @param int $flags Bitfield of class constants
	 * @return Document[] List of created documents indexed by page id.
	 */
	public function initialize( array $pagesOrRevs, int $flags ): array {
		$documents = [];
		$builders = $this->createBuilders( $flags );
		$buildRedirectDocs = $this->config->buildRedirectDocuments();
		foreach ( $pagesOrRevs as $pageOrRev ) {
			if ( $pageOrRev instanceof RevisionRecord ) {
				$revision = $pageOrRev;
				$page = $this->wikiPageFactory->newFromTitle( $revision->getPage() );
				// getContent() returns null for suppressed/corrupt content; treat an
				// inaccessible main slot as not-a-redirect.
				$content = $revision->getContent( SlotRecord::MAIN );
				$isRedirect = $content && $content->isRedirect();
			} else {
				$revision = $pageOrRev->getRevisionRecord();
				$page = $pageOrRev;
				$isRedirect = $page->isRedirect();
			}

			if ( !$page->exists() ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a page that doesn\'t exist. This should be caught ' .
					"earlier but wasn't. Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				continue;
			}

			if ( $isRedirect && !$buildRedirectDocs ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a redirect. This should be caught ' .
					"earlier but wasn't. Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				// We could return the document for the redirect target, but
				// that seems a bit too magical. The document representation
				// of a redirect is nothing at all, simply skip this page.
				continue;
			}

			// Strict comparison: the value is either a RevisionRecord or null.
			if ( $revision === null ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a page that doesn\'t have a revision. This should be caught ' .
					"earlier but wasn't. Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				continue;
			}

			$documents[$page->getId()] = $this->initializeDoc( $page, $builders, $flags, $revision, $isRedirect );
		}

		// Let every builder complete whatever it batched up during initialize().
		foreach ( $builders as $builder ) {
			$builder->finishInitializeBatch();
		}

		return $documents;
	}

	/**
	 * Finalize building a page document.
	 *
	 * Called on every attempt to write the document to elasticsearch, meaning
	 * every cluster and every retry. Any bulk data that needs to be loaded
	 * should happen here.
	 *
	 * A document that carries no flags hint (i.e. one that did not come out of
	 * initialize()) is left untouched and reported as ready.
	 *
	 * @param Document $doc
	 * @param bool $enforceLatest When true the update is abandoned if the revision
	 *  recorded in the document is no longer the latest revision of its page.
	 * @param RevisionRecord|null $revision Revision matching the document's 'version'
	 *  field. When null the revision is loaded from the revision store.
	 * @return bool True when the document update can proceed
	 * @throws \RuntimeException When $revision is provided but its id differs from the
	 *  document's 'version' field.
	 * @throws BuildDocumentException Not thrown directly here; presumably raised by the
	 *  property builders while finalizing (to be confirmed against PagePropertyBuilder).
	 */
	public function finalize( Document $doc, bool $enforceLatest = true, ?RevisionRecord $revision = null ): bool {
		$flags = CirrusIndexField::getHint( $doc, self::HINT_FLAGS );
		if ( $flags === null ) {
			return true;
		}

		$docRevision = $doc->get( 'version' );
		if ( $revision !== null && $docRevision !== $revision->getId() ) {
			throw new \RuntimeException( "Revision id mismatch: {$revision->getId()} != $docRevision" );
		}

		// Pre-initialised so that it is defined even if loading the revision throws.
		$title = null;
		try {
			$revision ??= $this->revStore->getRevisionById( $docRevision );
			$title = $revision ? $this->titleFactory->castFromPageIdentity( $revision->getPage() ) : null;
		} catch ( RevisionAccessException ) {
			$revision = null;
		}
		if ( !$title || !$revision ) {
			LoggerFactory::getInstance( 'CirrusSearch' )
				->warning( 'Ignoring a page/revision that no longer exists {rev_id}',
					[ 'rev_id' => $docRevision ] );

			return false;
		}

		if ( $enforceLatest && $title->getLatestRevID() !== $docRevision ) {
			// Something has changed since the job was enqueued, this is no longer
			// a valid update.
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				'Skipping a page/revision update for revision {rev} because a new one is available',
				[ 'rev' => $docRevision ] );
			return false;
		}

		foreach ( $this->createBuilders( $flags ) as $builder ) {
			$builder->finalize( $doc, $title, $revision );
		}
		$this->documentSizeLimiter->resize( $doc );

		return true;
	}

	/**
	 * Construct PagePropertyBuilder instances suitable for provided flags
	 *
	 * DefaultPageProperties is always included; SKIP_PARSE and SKIP_LINKS
	 * each remove one of the optional builders.
	 *
	 * Visible for testing. Should be private.
	 *
	 * @param int $flags Bitfield of class constants
	 * @return PagePropertyBuilder[]
	 */
	protected function createBuilders( int $flags ): array {
		$builders = [ new DefaultPageProperties( $this->db, $this->titleFormatter ) ];
		if ( ( $flags & self::SKIP_PARSE ) === 0 ) {
			$builders[] = new ParserOutputPageProperties( $this->config );
		}
		if ( ( $flags & self::SKIP_LINKS ) === 0 ) {
			$builders[] = new RedirectsAndIncomingLinks(
				$this->connection,
				$this->backlinkCacheFactory,
				$this->titleFormatter
			);
		}
		return $builders;
	}

	/**
	 * Everything is sent as an update to prevent overwriting fields maintained in other processes
	 * like OtherIndex::updateOtherIndex.
	 *
	 * But we need a way to index documents that don't already exist. We're willing to upsert any
	 * full documents or any documents that we've been explicitly told it is ok to index when they
	 * aren't full. This is typically just done during the first phase of the initial index build.
	 * A quick note about docAsUpsert's merging behavior: It overwrites all fields provided by doc
	 * unless they are objects in both doc and the indexed source. We're ok with this because all of
	 * our fields are either regular types or lists of objects and lists are overwritten.
	 *
	 * @param int $flags Bitfield of class constants
	 * @return bool True when upsert is allowed with the provided flags
	 */
	private function canUpsert( int $flags ): bool {
		// A document is "full" when neither of the skip flags is set.
		$fullDocument = ( $flags & ( self::SKIP_PARSE | self::SKIP_LINKS ) ) === 0;
		$indexOnSkip = ( $flags & self::INDEX_ON_SKIP ) !== 0;
		return $fullDocument || $indexOnSkip;
	}

	/**
	 * Perform initial building of a page document. This is called
	 * once when starting an update and is shared between all clusters
	 * written to. This doc may be written to the jobqueue multiple
	 * times and should not contain any large values.
	 *
	 * The document records the flags as a hint for finalize(), the revision
	 * id in its 'version' field, and whether it may be sent as an upsert.
	 *
	 * @param WikiPage $page
	 * @param PagePropertyBuilder[] $builders
	 * @param int $flags
	 * @param RevisionRecord $revision
	 * @param bool $isRedirect
	 * @return Document
	 */
	private function initializeDoc(
		WikiPage $page, array $builders, int $flags, RevisionRecord $revision, bool $isRedirect
	): Document {
		$docId = $this->config->makeId( $page->getId() );
		$doc = new Document( $docId, [] );
		// allow self::finalize to recreate the same set of builders
		CirrusIndexField::setHint( $doc, self::HINT_FLAGS, $flags );
		$doc->setDocAsUpsert( $this->canUpsert( $flags ) );
		$doc->set( 'version', $revision->getId() );
		CirrusIndexField::addNoopHandler(
			$doc, 'version', 'documentVersion' );

		foreach ( $builders as $builder ) {
			$builder->initialize( $doc, $page, $revision, $isRedirect );
		}

		return $doc;
	}
}