Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
63.53% |
54 / 85 |
|
50.00% |
3 / 6 |
CRAP | |
0.00% |
0 / 1 |
BuildDocument | |
63.53% |
54 / 85 |
|
50.00% |
3 / 6 |
58.79 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
initialize | |
69.23% |
18 / 26 |
|
0.00% |
0 / 1 |
7.05 | |||
finalize | |
54.17% |
13 / 24 |
|
0.00% |
0 / 1 |
22.65 | |||
createBuilders | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
canUpsert | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
initializeDoc | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\BuildDocument; |
4 | |
5 | use CirrusSearch\Connection; |
6 | use CirrusSearch\Search\CirrusIndexField; |
7 | use CirrusSearch\SearchConfig; |
8 | use Elastica\Document; |
9 | use MediaWiki\Cache\BacklinkCacheFactory; |
10 | use MediaWiki\Logger\LoggerFactory; |
11 | use MediaWiki\Page\WikiPageFactory; |
12 | use MediaWiki\Revision\RevisionAccessException; |
13 | use MediaWiki\Revision\RevisionRecord; |
14 | use MediaWiki\Revision\RevisionStore; |
15 | use MediaWiki\Title\Title; |
16 | use TitleFormatter; |
17 | use Wikimedia\Rdbms\IReadableDatabase; |
18 | use WikiPage; |
19 | |
20 | /** |
21 | * Orchestrate the process of building an elasticsearch document out of a |
22 | * WikiPage. Document building is performed in two stages, and all properties |
23 | * are provided by PagePropertyBuilder instances chosen by a set of provided |
24 | * flags. |
25 | * |
26 | * The first stage, called initialize, sets up the basic document properties. |
27 | * This stage is executed one time per update and the results are shared |
28 | * between all retry attempts and clusters to be written to. The results of the |
29 | * initialize stage may be written to the job queue, so we try to keep the size |
30 | * of these documents reasonably small. The initialize stage supports batching |
31 | * initialization by the PagePropertyBuilder instances. |
32 | * |
33 | * The second stage of document building, finalize, is called on each attempt |
34 | * to send a document to an elasticsearch cluster. This stage loads the bulk |
35 | * content, potentially megabytes, from mediawiki ParserOutput into the |
36 | * documents. |
37 | * |
38 | * This program is free software; you can redistribute it and/or modify |
39 | * it under the terms of the GNU General Public License as published by |
40 | * the Free Software Foundation; either version 2 of the License, or |
41 | * (at your option) any later version. |
42 | * |
43 | * This program is distributed in the hope that it will be useful, |
44 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
45 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
46 | * GNU General Public License for more details. |
47 | * |
48 | * You should have received a copy of the GNU General Public License along |
49 | * with this program; if not, write to the Free Software Foundation, Inc., |
50 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
51 | * http://www.gnu.org/copyleft/gpl.html |
52 | */ |
class BuildDocument {
	/** Name of the document hint in which initialize() records the flags for finalize(). */
	private const HINT_FLAGS = 'BuildDocument_flags';

	// Bit field values accepted as $flags by initialize(). They may be OR'ed together.
	/** No flags set: all builders are used. */
	public const INDEX_EVERYTHING = 0;
	/** Allow upserting the document even when SKIP_PARSE and/or SKIP_LINKS made it partial. */
	public const INDEX_ON_SKIP = 1;
	/** Do not use the ParserOutputPageProperties builder. */
	public const SKIP_PARSE = 2;
	/** Do not use the RedirectsAndIncomingLinks builder. */
	public const SKIP_LINKS = 4;

	/** @var SearchConfig */
	private $config;
	/** @var Connection */
	private $connection;
	/** @var IReadableDatabase */
	private $db;
	/** @var RevisionStore */
	private $revStore;
	/** @var BacklinkCacheFactory */
	private $backlinkCacheFactory;
	/** @var DocumentSizeLimiter */
	private $documentSizeLimiter;
	/** @var TitleFormatter */
	private $titleFormatter;
	/** @var WikiPageFactory */
	private $wikiPageFactory;

	/**
	 * @param Connection $connection Cirrus connection to read page properties from.
	 *  The SearchConfig used by this class is taken from this connection.
	 * @param IReadableDatabase $db Wiki database connection to read page properties from
	 * @param RevisionStore $revStore Store for retrieving revisions by id
	 * @param BacklinkCacheFactory $backlinkCacheFactory
	 * @param DocumentSizeLimiter $docSizeLimiter Applied to every document at the end of finalize()
	 * @param TitleFormatter $titleFormatter
	 * @param WikiPageFactory $wikiPageFactory Used to obtain a WikiPage when a RevisionRecord
	 *  is passed to initialize()
	 */
	public function __construct(
		Connection $connection,
		IReadableDatabase $db,
		RevisionStore $revStore,
		BacklinkCacheFactory $backlinkCacheFactory,
		DocumentSizeLimiter $docSizeLimiter,
		TitleFormatter $titleFormatter,
		WikiPageFactory $wikiPageFactory
	) {
		$this->config = $connection->getConfig();
		$this->connection = $connection;
		$this->db = $db;
		$this->revStore = $revStore;
		$this->backlinkCacheFactory = $backlinkCacheFactory;
		$this->documentSizeLimiter = $docSizeLimiter;
		$this->titleFormatter = $titleFormatter;
		$this->wikiPageFactory = $wikiPageFactory;
	}

	/**
	 * Run the first (initialize) stage for a batch of pages.
	 *
	 * Entries whose page does not exist, or for which no revision is available,
	 * are logged as warnings and left out of the result.
	 *
	 * @param \WikiPage[]|RevisionRecord[] $pagesOrRevs List of pages to build documents for. These
	 *  pages must represent concrete pages with content. It is expected that
	 *  redirects and non-existent pages have been resolved.
	 * @param int $flags Bitfield of class constants
	 * @return \Elastica\Document[] List of created documents indexed by page id.
	 */
	public function initialize( array $pagesOrRevs, int $flags ): array {
		$documents = [];
		$builders = $this->createBuilders( $flags );
		foreach ( $pagesOrRevs as $pageOrRev ) {
			if ( $pageOrRev instanceof RevisionRecord ) {
				$revision = $pageOrRev;
				$page = $this->wikiPageFactory->newFromTitle( $revision->getPage() );
			} else {
				$revision = $pageOrRev->getRevisionRecord();
				$page = $pageOrRev;
			}
			if ( !$page->exists() ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a page that doesn\'t exist.  This should be caught ' .
					"earlier but wasn't.  Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				continue;
			}

			if ( $revision === null ) {
				LoggerFactory::getInstance( 'CirrusSearch' )->warning(
					'Attempted to build a document for a page that doesn\'t have a revision. This should be caught ' .
					"earlier but wasn't.  Page: {title}",
					[ 'title' => (string)$page->getTitle() ]
				);
				continue;
			}

			$documents[$page->getId()] = $this->initializeDoc( $page, $builders, $flags, $revision );
		}

		// Give every builder the chance to complete work it batched up across the loop above.
		foreach ( $builders as $builder ) {
			$builder->finishInitializeBatch();
		}

		return $documents;
	}

	/**
	 * Finalize building a page document.
	 *
	 * Called on every attempt to write the document to elasticsearch, meaning
	 * every cluster and every retry. Any bulk data that needs to be loaded
	 * should happen here.
	 *
	 * Documents that carry no flags hint (i.e. were not produced by initialize())
	 * are left untouched and reported as ready to proceed.
	 *
	 * @param Document $doc
	 * @param bool $enforceLatest When true, refuse the update if the document's
	 *  revision is no longer the latest revision of its title.
	 * @param RevisionRecord|null $revision Revision the document was built from; when
	 *  null it is loaded from the revision store using the document's 'version' field.
	 * @return bool True when the document update can proceed
	 * @throws \RuntimeException When $revision is provided but its id differs from the
	 *  document's 'version' field.
	 * @throws BuildDocumentException
	 */
	public function finalize( Document $doc, bool $enforceLatest = true, ?RevisionRecord $revision = null ): bool {
		$flags = CirrusIndexField::getHint( $doc, self::HINT_FLAGS );
		if ( $flags === null ) {
			return true;
		}

		$docRevision = $doc->get( 'version' );
		if ( $revision !== null && $docRevision !== $revision->getId() ) {
			throw new \RuntimeException( "Revision id mismatch: {$revision->getId()} != $docRevision" );
		}

		// Must be defined before the try: when the lookup below throws, the catch
		// branch only resets $revision and $title would otherwise be undefined.
		$title = null;
		try {
			$revision ??= $this->revStore->getRevisionById( $docRevision );
			$title = $revision ? Title::castFromPageIdentity( $revision->getPage() ) : null;
		} catch ( RevisionAccessException $e ) {
			$revision = null;
		}
		if ( !$title || !$revision ) {
			LoggerFactory::getInstance( 'CirrusSearch' )
				->warning( 'Ignoring a page/revision that no longer exists {rev_id}',
					[ 'rev_id' => $docRevision ] );

			return false;
		}
		if ( $enforceLatest && $title->getLatestRevID() !== $docRevision ) {
			// Something has changed since the job was enqueued, this is no longer
			// a valid update.
			LoggerFactory::getInstance( 'CirrusSearch' )->warning(
				'Skipping a page/revision update for revision {rev} because a new one is available',
				[ 'rev' => $docRevision ] );
			return false;
		}

		// Recreate the same set of builders initialize() used, based on the stored flags.
		$builders = $this->createBuilders( $flags );
		foreach ( $builders as $builder ) {
			$builder->finalize( $doc, $title, $revision );
		}
		$this->documentSizeLimiter->resize( $doc );

		return true;
	}

	/**
	 * Construct PagePropertyBuilder instances suitable for provided flags
	 *
	 * DefaultPageProperties is always included; ParserOutputPageProperties is
	 * added unless SKIP_PARSE is set and RedirectsAndIncomingLinks is added
	 * unless SKIP_LINKS is set.
	 *
	 * Visible for testing. Should be private.
	 *
	 * @param int $flags Bitfield of class constants
	 * @return PagePropertyBuilder[]
	 */
	protected function createBuilders( int $flags ): array {
		$skipLinks = $flags & self::SKIP_LINKS;
		$skipParse = $flags & self::SKIP_PARSE;
		$builders = [ new DefaultPageProperties( $this->db ) ];
		if ( !$skipParse ) {
			$builders[] = new ParserOutputPageProperties( $this->config );
		}
		if ( !$skipLinks ) {
			$builders[] = new RedirectsAndIncomingLinks(
				$this->connection,
				$this->backlinkCacheFactory,
				$this->titleFormatter
			);
		}
		return $builders;
	}

	/**
	 * Everything is sent as an update to prevent overwriting fields maintained in other processes
	 * like OtherIndex::updateOtherIndex.
	 *
	 * But we need a way to index documents that don't already exist.  We're willing to upsert any
	 * full documents or any documents that we've been explicitly told it is ok to index when they
	 * aren't full. This is typically just done during the first phase of the initial index build.
	 * A quick note about docAsUpsert's merging behavior:  It overwrites all fields provided by doc
	 * unless they are objects in both doc and the indexed source.  We're ok with this because all of
	 * our fields are either regular types or lists of objects and lists are overwritten.
	 *
	 * @param int $flags Bitfield of class constants
	 * @return bool True when upsert is allowed with the provided flags
	 */
	private function canUpsert( int $flags ): bool {
		$skipParse = $flags & self::SKIP_PARSE;
		$skipLinks = $flags & self::SKIP_LINKS;
		$indexOnSkip = $flags & self::INDEX_ON_SKIP;
		// A document is "full" only when neither the parse nor the links stage was skipped.
		$fullDocument = !( $skipParse || $skipLinks );
		return $fullDocument || $indexOnSkip;
	}

	/**
	 * Perform initial building of a page document. This is called
	 * once when starting an update and is shared between all clusters
	 * written to. This doc may be written to the jobqueue multiple
	 * times and should not contain any large values.
	 *
	 * The document id is derived from the page id via the search config, the
	 * flags are stored as a hint on the document, and the revision id is stored
	 * in the 'version' field.
	 *
	 * @param WikiPage $page
	 * @param PagePropertyBuilder[] $builders
	 * @param int $flags
	 * @param RevisionRecord $revision
	 * @return Document
	 */
	private function initializeDoc( WikiPage $page, array $builders, int $flags, RevisionRecord $revision ): Document {
		$docId = $this->config->makeId( $page->getId() );
		$doc = new \Elastica\Document( $docId, [] );
		// allow self::finalize to recreate the same set of builders
		CirrusIndexField::setHint( $doc, self::HINT_FLAGS, $flags );
		$doc->setDocAsUpsert( $this->canUpsert( $flags ) );
		$doc->set( 'version', $revision->getId() );
		CirrusIndexField::addNoopHandler(
			$doc, 'version', 'documentVersion' );

		foreach ( $builders as $builder ) {
			$builder->initialize( $doc, $page, $revision );
		}

		return $doc;
	}
}