Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
66.32% |
63 / 95 |
|
50.00% |
3 / 6 |
CRAP | |
0.00% |
0 / 1 |
BuildDocument | |
66.32% |
63 / 95 |
|
50.00% |
3 / 6 |
54.86 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
initialize | |
74.29% |
26 / 35 |
|
0.00% |
0 / 1 |
7.83 | |||
finalize | |
54.17% |
13 / 24 |
|
0.00% |
0 / 1 |
22.65 | |||
createBuilders | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
canUpsert | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
initializeDoc | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\BuildDocument; |
4 | |
5 | use CirrusSearch\Connection; |
6 | use CirrusSearch\Search\CirrusIndexField; |
7 | use CirrusSearch\SearchConfig; |
8 | use Elastica\Document; |
9 | use MediaWiki\Cache\BacklinkCacheFactory; |
10 | use MediaWiki\Logger\LoggerFactory; |
11 | use MediaWiki\Page\WikiPage; |
12 | use MediaWiki\Page\WikiPageFactory; |
13 | use MediaWiki\Revision\RevisionAccessException; |
14 | use MediaWiki\Revision\RevisionRecord; |
15 | use MediaWiki\Revision\RevisionStore; |
16 | use MediaWiki\Revision\SlotRecord; |
17 | use MediaWiki\Title\TitleFactory; |
18 | use MediaWiki\Title\TitleFormatter; |
19 | use Wikimedia\Rdbms\IReadableDatabase; |
20 | |
21 | /** |
22 | * Orchestrate the process of building an elasticsearch document out of a |
23 | * WikiPage. Document building is performed in two stages, and all properties |
24 | * are provided by PagePropertyBuilder instances chosen by a set of provided |
25 | * flags. |
26 | * |
27 | * The first stage, called initialize, sets up the basic document properties. |
28 | * This stage is executed one time per update and the results are shared |
29 | * between all retry attempts and clusters to be written to. The results of the |
30 | * initialize stage may be written to the job queue, so we try to keep the size |
31 | * of these documents reasonably small. The initialize stage supports batching |
32 | * initialization by the PagePropertyBuilder instances. |
33 | * |
34 | * The second stage of document building, finalize, is called on each attempt |
35 | * to send a document to an elasticsearch cluster. This stage loads the bulk |
36 | * content, potentially megabytes, from mediawiki ParserOutput into the |
37 | * documents. |
38 | * |
39 | * This program is free software; you can redistribute it and/or modify |
40 | * it under the terms of the GNU General Public License as published by |
41 | * the Free Software Foundation; either version 2 of the License, or |
42 | * (at your option) any later version. |
43 | * |
44 | * This program is distributed in the hope that it will be useful, |
45 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
46 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
47 | * GNU General Public License for more details. |
48 | * |
49 | * You should have received a copy of the GNU General Public License along |
50 | * with this program; if not, write to the Free Software Foundation, Inc., |
51 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
52 | * http://www.gnu.org/copyleft/gpl.html |
53 | */ |
class BuildDocument {
	/**
	 * Name of the CirrusIndexField hint under which initializeDoc() stores the
	 * flags, so that finalize() can recreate the same set of builders.
	 */
	private const HINT_FLAGS = 'BuildDocument_flags';

	// Bit field flags accepted by initialize() and replayed by finalize().
	/** No flags set: every builder runs and the document may be upserted. */
	public const INDEX_EVERYTHING = 0;
	/** Allow upserting the document even when SKIP_PARSE or SKIP_LINKS is set. */
	public const INDEX_ON_SKIP = 1;
	/** Do not use the ParserOutputPageProperties builder. */
	public const SKIP_PARSE = 2;
	/** Do not use the RedirectsAndIncomingLinks builder. */
	public const SKIP_LINKS = 4;

	/** @var SearchConfig */
	private $config;
	/** @var Connection */
	private $connection;
	/** @var IReadableDatabase */
	private $db;
	/** @var RevisionStore */
	private $revStore;
	/** @var BacklinkCacheFactory */
	private $backlinkCacheFactory;
	/** @var DocumentSizeLimiter */
	private $documentSizeLimiter;
	/** @var TitleFormatter */
	private $titleFormatter;
	/** @var WikiPageFactory */
	private $wikiPageFactory;
	/** @var TitleFactory */
	private TitleFactory $titleFactory;

	/**
	 * @param Connection $connection Cirrus connection to read page properties from.
	 *  Its SearchConfig is also used as the configuration of this instance.
	 * @param IReadableDatabase $db Wiki database connection to read page properties from
	 * @param RevisionStore $revStore Store for retrieving revisions by id
	 * @param BacklinkCacheFactory $backlinkCacheFactory
	 * @param DocumentSizeLimiter $docSizeLimiter Applied to every document at the end of finalize()
	 * @param TitleFormatter $titleFormatter
	 * @param WikiPageFactory $wikiPageFactory Used to resolve the page of a RevisionRecord
	 * @param TitleFactory $titleFactory Used to resolve the title of a revision in finalize()
	 */
	public function __construct(
		Connection $connection,
		IReadableDatabase $db,
		RevisionStore $revStore,
		BacklinkCacheFactory $backlinkCacheFactory,
		DocumentSizeLimiter $docSizeLimiter,
		TitleFormatter $titleFormatter,
		WikiPageFactory $wikiPageFactory,
		TitleFactory $titleFactory
	) {
		$this->config = $connection->getConfig();
		$this->connection = $connection;
		$this->db = $db;
		$this->revStore = $revStore;
		$this->backlinkCacheFactory = $backlinkCacheFactory;
		$this->documentSizeLimiter = $docSizeLimiter;
		$this->titleFormatter = $titleFormatter;
		$this->wikiPageFactory = $wikiPageFactory;
		$this->titleFactory = $titleFactory;
	}

	/**
	 * Run the first (initialize) stage of document building for a batch of pages.
	 *
	 * Pages that do not exist, are redirects, or have no revision are logged
	 * as warnings and skipped; they produce no entry in the returned array.
	 *
	 * @param WikiPage[]|RevisionRecord[] $pagesOrRevs List of pages to build documents for. These
	 *  pages must represent concrete pages with content. It is expected that
	 *  redirects and non-existent pages have been resolved.
	 * @param int $flags Bitfield of class constants
	 * @return \Elastica\Document[] List of created documents indexed by page id.
	 */
	public function initialize( array $pagesOrRevs, int $flags ): array {
		$documents = [];
		$builders = $this->createBuilders( $flags );
		foreach ( $pagesOrRevs as $pageOrRev ) {
			if ( $pageOrRev instanceof RevisionRecord ) {
				$revision = $pageOrRev;
				$page = $this->wikiPageFactory->newFromTitle( $revision->getPage() );
				// NOTE(review): getContent() is dereferenced without a null check.
				// Presumably callers only pass revisions whose main slot is
				// readable - confirm against RevisionRecord::getContent().
				$isRedirect = $revision->getContent( SlotRecord::MAIN )->isRedirect();
			} else {
				$revision = $pageOrRev->getRevisionRecord();
				$page = $pageOrRev;
				$isRedirect = $page->isRedirect();
			}
			if ( !$page->exists() ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a page that doesn\'t exist.  This should be caught ' .
					"earlier but wasn't.  Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				continue;
			}

			if ( $isRedirect ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a redirect. This should be caught ' .
					"earlier but wasn't. Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				// We could return the document for the redirect target, but
				// that seems a bit too magical. The document representation
				// of a redirect is nothing at all, simply skip this page.
				continue;
			}

			if ( $revision === null ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a page that doesn\'t have a revision. This should be caught ' .
					"earlier but wasn't. Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				continue;
			}

			$documents[$page->getId()] = $this->initializeDoc( $page, $builders, $flags, $revision );
		}

		// Give every builder the chance to complete its batched work, even
		// when all pages of the batch were skipped.
		foreach ( $builders as $builder ) {
			$builder->finishInitializeBatch();
		}

		return $documents;
	}

	/**
	 * Finalize building a page document.
	 *
	 * Called on every attempt to write the document to elasticsearch, meaning
	 * every cluster and every retry. Any bulk data that needs to be loaded
	 * should happen here.
	 *
	 * Documents that carry no flags hint (i.e. were not created by
	 * initialize()) are left untouched and reported as ready.
	 *
	 * @param Document $doc
	 * @param bool $enforceLatest When true the update is refused if the
	 *  document version is not the latest revision of the title.
	 * @param RevisionRecord|null $revision Revision to build from. When null
	 *  it is loaded from the revision store using the document version.
	 * @return bool True when the document update can proceed
	 * @throws BuildDocumentException
	 * @throws \RuntimeException When $revision is provided but its id differs
	 *  from the version recorded in the document.
	 */
	public function finalize( Document $doc, bool $enforceLatest = true, ?RevisionRecord $revision = null ): bool {
		$flags = CirrusIndexField::getHint( $doc, self::HINT_FLAGS );
		if ( $flags === null ) {
			return true;
		}

		$docRevision = $doc->get( 'version' );
		if ( $revision !== null && $docRevision !== $revision->getId() ) {
			throw new \RuntimeException( "Revision id mismatch: {$revision->getId()} != $docRevision" );
		}

		// Must be defined before the try block: when the revision lookup
		// throws, the assignment inside the try never happens and the check
		// below would otherwise read an undefined variable.
		$title = null;
		try {
			$revision ??= $this->revStore->getRevisionById( $docRevision );
			$title = $revision ? $this->titleFactory->castFromPageIdentity( $revision->getPage() ) : null;
		} catch ( RevisionAccessException $e ) {
			$revision = null;
		}
		if ( !$title || !$revision ) {
			LoggerFactory::getInstance( 'CirrusSearch' )
				->warning( 'Ignoring a page/revision that no longer exists {rev_id}',
					[ 'rev_id' => $docRevision ] );

			return false;
		}
		if ( $enforceLatest && $title->getLatestRevID() !== $docRevision ) {
			// Something has changed since the job was enqueued, this is no longer
			// a valid update.
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				'Skipping a page/revision update for revision {rev} because a new one is available',
				[ 'rev' => $docRevision ] );
			return false;
		}

		// Recreate the builders selected at initialize time and let each one
		// load its bulk data into the document.
		$builders = $this->createBuilders( $flags );
		foreach ( $builders as $builder ) {
			$builder->finalize( $doc, $title, $revision );
		}
		$this->documentSizeLimiter->resize( $doc );

		return true;
	}

	/**
	 * Construct PagePropertyBuilder instances suitable for provided flags
	 *
	 * DefaultPageProperties is always included. ParserOutputPageProperties is
	 * left out when SKIP_PARSE is set and RedirectsAndIncomingLinks is left
	 * out when SKIP_LINKS is set.
	 *
	 * Visible for testing. Should be private.
	 *
	 * @param int $flags Bitfield of class constants
	 * @return PagePropertyBuilder[]
	 */
	protected function createBuilders( int $flags ): array {
		$skipLinks = $flags & self::SKIP_LINKS;
		$skipParse = $flags & self::SKIP_PARSE;
		$builders = [ new DefaultPageProperties( $this->db ) ];
		if ( !$skipParse ) {
			$builders[] = new ParserOutputPageProperties( $this->config );
		}
		if ( !$skipLinks ) {
			$builders[] = new RedirectsAndIncomingLinks(
				$this->connection,
				$this->backlinkCacheFactory,
				$this->titleFormatter
			);
		}
		return $builders;
	}

	/**
	 * Everything is sent as an update to prevent overwriting fields maintained in other processes
	 * like OtherIndex::updateOtherIndex.
	 *
	 * But we need a way to index documents that don't already exist.  We're willing to upsert any
	 * full documents or any documents that we've been explicitly told it is ok to index when they
	 * aren't full. This is typically just done during the first phase of the initial index build.
	 * A quick note about docAsUpsert's merging behavior:  It overwrites all fields provided by doc
	 * unless they are objects in both doc and the indexed source.  We're ok with this because all of
	 * our fields are either regular types or lists of objects and lists are overwritten.
	 *
	 * @param int $flags Bitfield of class constants
	 * @return bool True when upsert is allowed with the provided flags
	 */
	private function canUpsert( int $flags ): bool {
		$skipParse = $flags & self::SKIP_PARSE;
		$skipLinks = $flags & self::SKIP_LINKS;
		$indexOnSkip = $flags & self::INDEX_ON_SKIP;
		// A document is "full" only when neither skip flag is set.
		$fullDocument = !( $skipParse || $skipLinks );
		return $fullDocument || $indexOnSkip;
	}

	/**
	 * Perform initial building of a page document. This is called
	 * once when starting an update and is shared between all clusters
	 * written to. This doc may be written to the jobqueue multiple
	 * times and should not contain any large values.
	 *
	 * @param WikiPage $page
	 * @param PagePropertyBuilder[] $builders
	 * @param int $flags Bitfield of class constants, stored on the document as a hint
	 * @param RevisionRecord $revision Revision whose id becomes the document version
	 * @return Document
	 */
	private function initializeDoc( WikiPage $page, array $builders, int $flags, RevisionRecord $revision ): Document {
		$docId = $this->config->makeId( $page->getId() );
		$doc = new \Elastica\Document( $docId, [] );
		// allow self::finalize to recreate the same set of builders
		CirrusIndexField::setHint( $doc, self::HINT_FLAGS, $flags );
		$doc->setDocAsUpsert( $this->canUpsert( $flags ) );
		$doc->set( 'version', $revision->getId() );
		CirrusIndexField::addNoopHandler(
			$doc, 'version', 'documentVersion' );

		foreach ( $builders as $builder ) {
			$builder->initialize( $doc, $page, $revision );
		}

		return $doc;
	}
}