Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
63.22% |
55 / 87 |
|
50.00% |
3 / 6 |
CRAP | |
0.00% |
0 / 1 |
BuildDocument | |
63.22% |
55 / 87 |
|
50.00% |
3 / 6 |
59.64 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
initialize | |
69.23% |
18 / 26 |
|
0.00% |
0 / 1 |
7.05 | |||
finalize | |
54.17% |
13 / 24 |
|
0.00% |
0 / 1 |
22.65 | |||
createBuilders | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
canUpsert | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
initializeDoc | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\BuildDocument; |
4 | |
5 | use CirrusSearch\Connection; |
6 | use CirrusSearch\Search\CirrusIndexField; |
7 | use CirrusSearch\SearchConfig; |
8 | use Elastica\Document; |
9 | use MediaWiki\Cache\BacklinkCacheFactory; |
10 | use MediaWiki\Logger\LoggerFactory; |
11 | use MediaWiki\Page\WikiPageFactory; |
12 | use MediaWiki\Revision\RevisionAccessException; |
13 | use MediaWiki\Revision\RevisionRecord; |
14 | use MediaWiki\Revision\RevisionStore; |
15 | use ParserCache; |
16 | use TitleFormatter; |
17 | use Wikimedia\Rdbms\IDatabase; |
18 | use WikiPage; |
19 | |
20 | /** |
21 | * Orchestrate the process of building an elasticsearch document out of a |
22 | * WikiPage. Document building is performed in two stages, and all properties |
23 | * are provided by PagePropertyBuilder instances chosen by a set of provided |
24 | * flags. |
25 | * |
26 | * The first stage, called initialize, sets up the basic document properties. |
27 | * This stage is executed one time per update and the results are shared |
28 | * between all retry attempts and clusters to be written to. The results of the |
29 | * initialize stage may be written to the job queue, so we try to keep the size |
30 | * of these documents reasonably small. The initialize stage supports batching |
31 | * initialization by the PagePropertyBuilder instances. |
32 | * |
33 | * The second stage of document building, finalize, is called on each attempt |
34 | * to send a document to an elasticsearch cluster. This stage loads the bulk |
35 | * content, potentially megabytes, from mediawiki ParserOutput into the |
36 | * documents. |
37 | * |
38 | * This program is free software; you can redistribute it and/or modify |
39 | * it under the terms of the GNU General Public License as published by |
40 | * the Free Software Foundation; either version 2 of the License, or |
41 | * (at your option) any later version. |
42 | * |
43 | * This program is distributed in the hope that it will be useful, |
44 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
45 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
46 | * GNU General Public License for more details. |
47 | * |
48 | * You should have received a copy of the GNU General Public License along |
49 | * with this program; if not, write to the Free Software Foundation, Inc., |
50 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
51 | * http://www.gnu.org/copyleft/gpl.html |
52 | */ |
class BuildDocument {
	/** Name of the document hint used to carry the flags from initialize() over to finalize(). */
	private const HINT_FLAGS = 'BuildDocument_flags';

	// Bit field flags accepted by initialize(); combine them with bitwise OR.
	// They are stored on each document as a hint and read back by finalize().

	/** No flags set: every property builder is used. */
	public const INDEX_EVERYTHING = 0;
	/** Allow upserting the document even when SKIP_PARSE or SKIP_LINKS makes it partial. */
	public const INDEX_ON_SKIP = 1;
	/** Do not use the ParserOutputPageProperties builder. */
	public const SKIP_PARSE = 2;
	/** Do not use the RedirectsAndIncomingLinks builder. */
	public const SKIP_LINKS = 4;
	/** Passed (as a bool) to ParserOutputPageProperties as its force-parse argument. */
	public const FORCE_PARSE = 8;

	/** @var SearchConfig */
	private $config;
	/** @var Connection */
	private $connection;
	/** @var IDatabase */
	private $db;
	/** @var ParserCache */
	private $parserCache;
	/** @var RevisionStore */
	private $revStore;
	/** @var BacklinkCacheFactory */
	private $backlinkCacheFactory;
	/** @var DocumentSizeLimiter */
	private $documentSizeLimiter;
	/** @var TitleFormatter */
	private $titleFormatter;
	/** @var WikiPageFactory */
	private $wikiPageFactory;

	/**
	 * The search configuration is taken from the provided connection.
	 *
	 * @param Connection $connection Cirrus connection to read page properties from
	 * @param IDatabase $db Wiki database connection to read page properties from
	 * @param ParserCache $parserCache Cache to read parser output from
	 * @param RevisionStore $revStore Store for retrieving revisions by id
	 * @param BacklinkCacheFactory $backlinkCacheFactory
	 * @param DocumentSizeLimiter $docSizeLimiter
	 * @param TitleFormatter $titleFormatter
	 * @param WikiPageFactory $wikiPageFactory
	 */
	public function __construct(
		Connection $connection,
		IDatabase $db,
		ParserCache $parserCache,
		RevisionStore $revStore,
		BacklinkCacheFactory $backlinkCacheFactory,
		DocumentSizeLimiter $docSizeLimiter,
		TitleFormatter $titleFormatter,
		WikiPageFactory $wikiPageFactory
	) {
		$this->config = $connection->getConfig();
		$this->connection = $connection;
		$this->db = $db;
		$this->parserCache = $parserCache;
		$this->revStore = $revStore;
		$this->backlinkCacheFactory = $backlinkCacheFactory;
		$this->documentSizeLimiter = $docSizeLimiter;
		$this->titleFormatter = $titleFormatter;
		$this->wikiPageFactory = $wikiPageFactory;
	}

	/**
	 * Run the initialize stage for a batch of pages or revisions.
	 *
	 * Entries whose page does not exist, or for which no revision is
	 * available, are logged as warnings and left out of the result.
	 *
	 * @param \WikiPage[]|RevisionRecord[] $pagesOrRevs List of pages to build documents for. These
	 *  pages must represent concrete pages with content. It is expected that
	 *  redirects and non-existent pages have been resolved.
	 * @param int $flags Bitfield of class constants
	 * @return \Elastica\Document[] List of created documents indexed by page id.
	 */
	public function initialize( array $pagesOrRevs, int $flags ): array {
		$documents = [];
		$builders = $this->createBuilders( $flags );
		foreach ( $pagesOrRevs as $pageOrRev ) {
			if ( $pageOrRev instanceof RevisionRecord ) {
				$revision = $pageOrRev;
				$page = $this->wikiPageFactory->newFromTitle( $revision->getPage() );
			} else {
				$revision = $pageOrRev->getRevisionRecord();
				$page = $pageOrRev;
			}
			if ( !$page->exists() ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a page that doesn\'t exist. This should be caught ' .
					"earlier but wasn't. Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				continue;
			}

			if ( $revision === null ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a page that doesn\'t have a revision. This should be caught ' .
					"earlier but wasn't. Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				continue;
			}

			$documents[$page->getId()] = $this->initializeDoc( $page, $builders, $flags, $revision );
		}

		// Signal the end of the batch to every builder, even if no document was created.
		foreach ( $builders as $builder ) {
			$builder->finishInitializeBatch();
		}

		return $documents;
	}

	/**
	 * Finalize building a page document.
	 *
	 * Called on every attempt to write the document to elasticsearch, meaning
	 * every cluster and every retry. Any bulk data that needs to be loaded
	 * should happen here.
	 *
	 * Documents that carry no flags hint (i.e. were not produced by
	 * initialize()) are left untouched and reported as ready to proceed.
	 *
	 * @param Document $doc
	 * @param bool $enforceLatest When true, refuse the update if the document's
	 *  revision is no longer the latest revision of its title
	 * @param RevisionRecord|null $revision Revision the document was built from; when
	 *  omitted it is loaded from the revision store using the document's 'version' field
	 * @return bool True when the document update can proceed
	 * @throws \RuntimeException When $revision is provided and its id differs from the
	 *  document's 'version' field
	 * @throws BuildDocumentException Presumably raised by the property builders; not visible
	 *  in this class
	 */
	public function finalize( Document $doc, bool $enforceLatest = true, ?RevisionRecord $revision = null ): bool {
		$flags = CirrusIndexField::getHint( $doc, self::HINT_FLAGS );
		if ( $flags !== null ) {
			$docRevision = $doc->get( 'version' );
			if ( $revision !== null && $docRevision !== $revision->getId() ) {
				throw new \RuntimeException( "Revision id mismatch: {$revision->getId()} != $docRevision" );
			}
			// Must be defined before the try block: if the lookup throws, the
			// assignment below never runs and the check that follows would
			// otherwise read an undefined variable.
			$title = null;
			try {
				$revision ??= $this->revStore->getRevisionById( $docRevision );
				$title = $revision ? \Title::castFromPageIdentity( $revision->getPage() ) : null;
			} catch ( RevisionAccessException $e ) {
				$revision = null;
			}
			if ( !$title || !$revision ) {
				LoggerFactory::getInstance( 'CirrusSearch' )
					->warning( 'Ignoring a page/revision that no longer exists {rev_id}',
						[ 'rev_id' => $docRevision ] );

				return false;
			}
			if ( $enforceLatest && $title->getLatestRevID() !== $docRevision ) {
				// Something has changed since the job was enqueued, this is no longer
				// a valid update.
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Skipping a page/revision update for revision {rev} because a new one is available',
					[ 'rev' => $docRevision ] );
				return false;
			}
			// Recreate the same set of builders that initialize() used for this document.
			$builders = $this->createBuilders( $flags );
			foreach ( $builders as $builder ) {
				$builder->finalize( $doc, $title, $revision );
			}
			$this->documentSizeLimiter->resize( $doc );
		}
		return true;
	}

	/**
	 * Construct PagePropertyBuilder instances suitable for provided flags
	 *
	 * DefaultPageProperties is always included; SKIP_PARSE and SKIP_LINKS
	 * each remove one of the optional builders.
	 *
	 * Visible for testing. Should be private.
	 *
	 * @param int $flags Bitfield of class constants
	 * @return PagePropertyBuilder[]
	 */
	protected function createBuilders( int $flags ): array {
		$skipLinks = $flags & self::SKIP_LINKS;
		$skipParse = $flags & self::SKIP_PARSE;
		$forceParse = $flags & self::FORCE_PARSE;
		$builders = [ new DefaultPageProperties( $this->db ) ];
		if ( !$skipParse ) {
			$builders[] = new ParserOutputPageProperties( $this->parserCache, (bool)$forceParse, $this->config );
		}
		if ( !$skipLinks ) {
			$builders[] = new RedirectsAndIncomingLinks(
				$this->connection,
				$this->backlinkCacheFactory,
				$this->titleFormatter
			);
		}
		return $builders;
	}

	/**
	 * Everything is sent as an update to prevent overwriting fields maintained in other processes
	 * like OtherIndex::updateOtherIndex.
	 *
	 * But we need a way to index documents that don't already exist. We're willing to upsert any
	 * full documents or any documents that we've been explicitly told it is ok to index when they
	 * aren't full. This is typically just done during the first phase of the initial index build.
	 * A quick note about docAsUpsert's merging behavior: It overwrites all fields provided by doc
	 * unless they are objects in both doc and the indexed source. We're ok with this because all of
	 * our fields are either regular types or lists of objects and lists are overwritten.
	 *
	 * @param int $flags Bitfield of class constants
	 * @return bool True when upsert is allowed with the provided flags
	 */
	private function canUpsert( int $flags ): bool {
		$skipParse = $flags & self::SKIP_PARSE;
		$skipLinks = $flags & self::SKIP_LINKS;
		$indexOnSkip = $flags & self::INDEX_ON_SKIP;
		// A document is "full" only when neither optional builder was skipped.
		$fullDocument = !( $skipParse || $skipLinks );
		return $fullDocument || $indexOnSkip;
	}

	/**
	 * Perform initial building of a page document. This is called
	 * once when starting an update and is shared between all clusters
	 * written to. This doc may be written to the jobqueue multiple
	 * times and should not contain any large values.
	 *
	 * @param WikiPage $page
	 * @param PagePropertyBuilder[] $builders
	 * @param int $flags
	 * @param RevisionRecord $revision
	 * @return Document
	 */
	private function initializeDoc( WikiPage $page, array $builders, int $flags, RevisionRecord $revision ): Document {
		$docId = $this->config->makeId( $page->getId() );
		$doc = new \Elastica\Document( $docId, [] );
		// allow self::finalize to recreate the same set of builders
		CirrusIndexField::setHint( $doc, self::HINT_FLAGS, $flags );
		$doc->setDocAsUpsert( $this->canUpsert( $flags ) );
		// finalize() reads this field back to locate and validate the revision.
		$doc->set( 'version', $revision->getId() );
		CirrusIndexField::addNoopHandler(
			$doc, 'version', 'documentVersion' );

		foreach ( $builders as $builder ) {
			$builder->initialize( $doc, $page, $revision );
		}

		return $doc;
	}
}