Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 85 |
|
0.00% |
0 / 8 |
CRAP | |
0.00% |
0 / 1 |
RedirectsAndIncomingLinks | |
0.00% |
0 / 85 |
|
0.00% |
0 / 8 |
506 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
initialize | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
30 | |||
finishInitializeBatch | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
56 | |||
finalize | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
raiseLinkCountException | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
buildCount | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
2 | |||
newLog | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
raiseResponseException | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\BuildDocument; |
4 | |
5 | use CirrusSearch\Connection; |
6 | use CirrusSearch\ElasticaErrorHandler; |
7 | use CirrusSearch\ElasticsearchIntermediary; |
8 | use CirrusSearch\Search\CirrusIndexField; |
9 | use CirrusSearch\SearchConfig; |
10 | use CirrusSearch\SearchRequestLog; |
11 | use Elastica\Document; |
12 | use Elastica\Exception\ResponseException; |
13 | use Elastica\Multi\ResultSet; |
14 | use Elastica\Multi\Search as MultiSearch; |
15 | use Elastica\Query\BoolQuery; |
16 | use Elastica\Query\Terms; |
17 | use Elastica\Search; |
18 | use MediaWiki\Cache\BacklinkCacheFactory; |
19 | use MediaWiki\Logger\LoggerFactory; |
20 | use MediaWiki\Page\PageIdentity; |
21 | use MediaWiki\Revision\RevisionRecord; |
22 | use MediaWiki\Title\Title; |
23 | use MediaWiki\Title\TitleFormatter; |
24 | use WikiPage; |
25 | |
26 | /** |
27 | * Adds redirects and incoming links to the documents. These are done together |
28 | * because one needs the other. |
29 | * |
30 | * This program is free software; you can redistribute it and/or modify |
31 | * it under the terms of the GNU General Public License as published by |
32 | * the Free Software Foundation; either version 2 of the License, or |
33 | * (at your option) any later version. |
34 | * |
35 | * This program is distributed in the hope that it will be useful, |
36 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
37 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
38 | * GNU General Public License for more details. |
39 | * |
40 | * You should have received a copy of the GNU General Public License along |
41 | * with this program; if not, write to the Free Software Foundation, Inc., |
42 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
43 | * http://www.gnu.org/copyleft/gpl.html |
44 | */ |
class RedirectsAndIncomingLinks extends ElasticsearchIntermediary implements PagePropertyBuilder {
	/**
	 * @var SearchConfig Configuration obtained from the connection given to the constructor.
	 */
	private $config;

	/**
	 * @var MultiSearch Collects one link-count query per page seen by initialize();
	 *  sent as a single request from finishInitializeBatch().
	 */
	private $linkCountMultiSearch;

	/**
	 * @var callable[] Callables expecting to receive a single argument, the total hits
	 *  of the related query added to linkCountMultiSearch. Array is in same order
	 *  as queries added to the multi-search.
	 */
	private $linkCountClosures = [];

	/**
	 * @var int[] List of page id's in current batch. Only for debug purposes.
	 */
	private $pageIds = [];

	/**
	 * @var BacklinkCacheFactory
	 */
	private $backlinkCacheFactory;

	/**
	 * @var TitleFormatter
	 */
	private $titleFormatter;

	/**
	 * @param Connection $conn
	 * @param BacklinkCacheFactory $backlinkCacheFactory
	 * @param TitleFormatter $titleFormatter
	 */
	public function __construct(
		Connection $conn,
		BacklinkCacheFactory $backlinkCacheFactory,
		TitleFormatter $titleFormatter
	) {
		parent::__construct( $conn, null, 0 );
		$this->config = $conn->getConfig();
		$this->linkCountMultiSearch = new MultiSearch( $this->connection->getClient() );
		$this->backlinkCacheFactory = $backlinkCacheFactory;
		$this->titleFormatter = $titleFormatter;
	}

	/**
	 * {@inheritDoc}
	 *
	 * Sets the 'redirect' field on $doc immediately. When incoming link counting
	 * is enabled, also queues a count query whose result is applied to $doc's
	 * 'incoming_links' field by finishInitializeBatch().
	 */
	public function initialize( Document $doc, WikiPage $page, RevisionRecord $revision ): void {
		$title = $page->getTitle();
		$this->pageIds[] = $page->getId();
		$outgoingLinksToCount = [ $title->getPrefixedDBkey() ];

		// Gather redirects to this page
		$redirectPageIdentities = $this->backlinkCacheFactory->getBacklinkCache( $title )
			->getLinkPages( 'redirect', false, false, $this->config->get( 'CirrusSearchIndexedRedirects' ) );
		$redirects = [];
		/** @var PageIdentity $redirect */
		foreach ( $redirectPageIdentities as $redirect ) {
			// If the redirect is in main OR the same namespace as the article then index it
			if ( $redirect->getNamespace() === NS_MAIN || $redirect->getNamespace() === $title->getNamespace() ) {
				$redirects[] = [
					'namespace' => $redirect->getNamespace(),
					'title' => $this->titleFormatter->getText( $redirect )
				];
				$outgoingLinksToCount[] = $this->titleFormatter->getPrefixedDBkey( $redirect );
			}
		}
		$doc->set( 'redirect', $redirects );

		if ( !$this->config->get( 'CirrusSearchEnableIncomingLinkCounting' ) ) {
			return;
		}

		// Count links
		// Incoming links is the sum of:
		// #1 Number of redirects to the page
		// #2 Number of links to the title
		// #3 Number of links to all the redirects

		// #1 we have a list of the "first" $wgCirrusSearchIndexedRedirects redirect so we just count it:
		$redirectCount = count( $redirects );

		// #2 and #3 we count the number of links to the page with Elasticsearch.
		// Since we only have $wgCirrusSearchIndexedRedirects we only count that many terms.
		$this->linkCountMultiSearch->addSearch( $this->buildCount( $outgoingLinksToCount ) );
		// The closure is appended at the same position as its search so that
		// finishInitializeBatch() can pair results with documents by index.
		$this->linkCountClosures[] = static function ( $count ) use( $doc, $redirectCount ) {
			$doc->set( 'incoming_links', $count + $redirectCount );
			CirrusIndexField::addNoopHandler( $doc, 'incoming_links', 'within 20%' );
		};
	}

	/**
	 * {@inheritDoc}
	 *
	 * Runs every queued link-count query in one multi-search request and hands
	 * each total to the matching closure. Elastica failures are logged and
	 * swallowed. Per-batch state is always cleared afterwards so that a reused
	 * instance does not re-send queries or touch documents of an earlier batch.
	 */
	public function finishInitializeBatch(): void {
		if ( !$this->linkCountClosures ) {
			// Nothing was queued (e.g. link counting disabled); only the debug
			// page id list may have accumulated.
			$this->pageIds = [];
			return;
		}
		$linkCountClosureCount = count( $this->linkCountClosures );
		try {
			$this->startNewLog( "counting links to {pageCount} pages", 'count_links', [
				'pageCount' => $linkCountClosureCount,
				'query' => $linkCountClosureCount,
			] );
			$result = $this->linkCountMultiSearch->search();

			if ( $result->count() <= 0 ) {
				// May return without throwing when no request/response is
				// available; the null check below then reports the failure.
				$this->raiseResponseException();
			}

			$foundNull = false;
			for ( $index = 0; $index < $linkCountClosureCount; $index++ ) {
				if ( $result[$index] === null ) {
					// Finish updating other docs that have results before
					// throwing the exception.
					$foundNull = true;
				} else {
					$this->linkCountClosures[ $index ]( $result[ $index ]->getTotalHits() );
				}
			}
			if ( $foundNull ) {
				$this->raiseLinkCountException( $result );
			}
			$this->success();
		} catch ( \Elastica\Exception\ExceptionInterface $e ) {
			// Note that we do not abort the update operation on failure, we simply
			// complain about it and let the remainder of the update continue. The
			// counts can simply be allowed to drift until resolved.
			$this->failure( $e );
			LoggerFactory::getInstance( 'CirrusSearchChangeFailed' )->info(
				'Links for page ids: ' . implode( ',', $this->pageIds ) );
		} finally {
			// Runs after the catch block, so the page ids are still available
			// to the log message above.
			$this->resetBatchState();
		}
	}

	/**
	 * {@inheritDoc}
	 */
	public function finalize( Document $doc, Title $title, RevisionRecord $revision ): void {
		// NOOP
	}

	/**
	 * Drop the queries, closures and page ids collected for the current batch.
	 */
	private function resetBatchState(): void {
		$this->linkCountMultiSearch->clearSearches();
		$this->linkCountClosures = [];
		$this->pageIds = [];
	}

	/**
	 * Log and throw for a multi-search response in which some per-page results
	 * are null.
	 *
	 * Declared void rather than never, but this method always throws.
	 *
	 * @param ResultSet $result
	 * @throws \Elastica\Exception\RuntimeException always
	 * @return never
	 */
	private function raiseLinkCountException( $result ): void {
		$linkCountClosureCount = count( $this->linkCountClosures );
		// Seems to happen during connection issues? Treat it the
		// same as an exception even though it wasn't thrown (why?)
		$numNulls = 0;
		for ( $i = 0; $i < $linkCountClosureCount; $i++ ) {
			if ( $result[$i] === null ) {
				$numNulls++;
			}
		}

		// Log the raw request/response until we understand how these happen
		ElasticaErrorHandler::logRequestResponse( $this->connection,
			"Received null for link count on {numNulls} out of {linkCountClosureCount} pages", [
				'numNulls' => $numNulls,
				'linkCountClosureCount' => $linkCountClosureCount,
			] );

		throw new \Elastica\Exception\RuntimeException(
			"Received null for link count on $numNulls out of $linkCountClosureCount pages" );
	}

	/**
	 * Build a Search that will count all pages that link to $titles.
	 *
	 * @param string[] $titles title in prefixedDBKey form
	 * @return Search that counts all pages that link to $titles
	 */
	private function buildCount( array $titles ): Search {
		$bool = new BoolQuery();
		$bool->addFilter( new Terms( 'outgoing_link', $titles ) );

		$indexPrefix = $this->config->get( SearchConfig::INDEX_BASE_NAME );
		$index = $this->connection->getIndex( $indexPrefix );
		$search = new Search( $index->getClient() );
		$search->addIndex( $index );
		$search->setQuery( $bool );
		// Only the total is needed: track the hit count and request zero
		// documents back.
		$search->getQuery()->setTrackTotalHits( true );
		$search->getQuery()->addParam( 'stats', 'link_count' );
		$search->getQuery()->setSize( 0 );

		return $search;
	}

	/**
	 * Create the request log entry used by startNewLog().
	 *
	 * @param string $description
	 * @param string $queryType
	 * @param array $extra
	 * @return SearchRequestLog
	 */
	protected function newLog( $description, $queryType, array $extra = [] ) {
		return new SearchRequestLog(
			$this->connection->getClient(),
			$description,
			$queryType,
			$extra
		);
	}

	/**
	 * Throw a ResponseException built from the client's last request/response.
	 *
	 * Returns normally, without throwing, when the client has no last request
	 * or no last response to report; callers must cope with that case.
	 *
	 * @throws ResponseException
	 * @return void
	 */
	private function raiseResponseException(): void {
		$client = $this->connection->getClient();
		$request = $client->getLastRequest();
		$response = $client->getLastResponse();

		if ( $request && $response ) {
			throw new ResponseException( $request, $response );
		}
	}
}