Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
12.50% |
8 / 64 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
ApiTrait | |
12.50% |
8 / 64 |
|
40.00% |
2 / 5 |
443.70 | |
0.00% |
0 / 1 |
getCirrusConnection | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getSearchConfig | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
loadDocuments | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
72 | |||
determineCirrusDocId | |
0.00% |
0 / 27 |
|
0.00% |
0 / 1 |
72 | |||
hasRedirect | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
30 | |||
getUser | n/a |
0 / 0 |
n/a |
0 / 0 |
0 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Api; |
4 | |
5 | use CirrusSearch\Connection; |
6 | use CirrusSearch\SearchConfig; |
7 | use CirrusSearch\Searcher; |
8 | use MediaWiki\Linker\LinkTarget; |
9 | use MediaWiki\MediaWikiServices; |
10 | use MediaWiki\Page\PageIdentity; |
11 | use MediaWiki\Revision\RevisionRecord; |
12 | use MediaWiki\Revision\SlotRecord; |
13 | use MediaWiki\Title\Title; |
14 | use MediaWiki\User\User; |
15 | |
/**
 * Shared helpers for CirrusSearch API modules: lazy access to the search
 * configuration and connection, plus lookup of the elasticsearch
 * document(s) that represent a given page.
 *
 * Classes using this trait must provide getUser().
 */
trait ApiTrait {
	/** @var Connection|null Lazily created by getCirrusConnection() */
	private $connection;
	/** @var SearchConfig|null Lazily loaded by getSearchConfig() */
	private $searchConfig;

	/**
	 * Get the connection, creating it from the search config on first use.
	 *
	 * @return Connection
	 */
	public function getCirrusConnection() {
		if ( $this->connection === null ) {
			$this->connection = new Connection( $this->getSearchConfig() );
		}
		return $this->connection;
	}

	/**
	 * Get the 'CirrusSearch' config, loading it from the config factory on
	 * first use.
	 *
	 * @return SearchConfig
	 */
	protected function getSearchConfig() {
		if ( $this->searchConfig === null ) {
			$this->searchConfig = MediaWikiServices::getInstance()
				->getConfigFactory()
				->makeConfig( 'CirrusSearch' );
		}
		return $this->searchConfig;
	}

	/**
	 * Load the elasticsearch document(s) that currently represent $title.
	 *
	 * Documents that do not (yet) represent the requested title are
	 * filtered out, see the inline comments below.
	 *
	 * @param PageIdentity $title
	 * @param string[]|bool $sourceFiltering source filtering to apply
	 * @return array List of arrays with the keys 'index', 'type', 'id',
	 *  'version' and 'source'. Empty when the doc id could not be determined,
	 *  the lookup failed, or no returned document matches $title.
	 */
	public function loadDocuments( PageIdentity $title, $sourceFiltering = true ) {
		[ $docId, $hasRedirects ] = $this->determineCirrusDocId( $title );
		if ( $docId === null ) {
			return [];
		}
		$title = Title::newFromPageIdentity( $title );
		// could be optimized by implementing multi-get but not
		// expecting much usage except debugging/tests.
		$searcher = new Searcher(
			$this->getCirrusConnection(),
			0,
			0,
			$this->getSearchConfig(),
			[],
			$this->getUser()
		);
		$esSources = $searcher->get( [ $docId ], $sourceFiltering );
		if ( !$esSources->isOK() ) {
			return [];
		}

		$result = [];
		foreach ( $esSources->getValue() as $esSource ) {
			$data = $esSource->getData();

			if ( $hasRedirects ) {
				// If we have followed redirects only report the
				// article dump if the redirect has been indexed. If it
				// hasn't been indexed this document does not represent
				// the original title.
				if ( !$this->hasRedirect( $data, $title ) ) {
					continue;
				}
			} elseif ( ( $data['title'] ?? null ) !== $title->getText() ) {
				// If this was not a redirect and the title doesn't match that
				// means a page was moved, but elasticsearch has not yet been
				// updated. Don't return the document that doesn't actually
				// represent the page (yet).
				//
				// The comparison is strict on purpose: a loose comparison
				// treats numeric-looking titles such as "1e3" and "1000" as
				// equal. A document without a 'title' field (for example
				// because source filtering removed it) never matches.
				continue;
			}

			$result[] = [
				'index' => $esSource->getIndex(),
				'type' => $esSource->getType(),
				'id' => $esSource->getId(),
				'version' => $esSource->getVersion(),
				'source' => $data,
			];
		}
		return $result;
	}

	/**
	 * Trace redirects to find the page id the title should be indexed to in
	 * cirrussearch. Differs from Updater::traceRedirects in that this also
	 * supports archived pages. Archive support is important for integration
	 * tests that need to know when a page that was deleted from SQL was
	 * finally removed from elasticsearch.
	 *
	 * This still fails to find the correct page id if something was moved, as
	 * that page is renamed rather than being moved to the archive. We could
	 * further complicate things by looking into move logs but not sure that
	 * is worth the complication.
	 *
	 * @param PageIdentity $title
	 * @return array Two element array containing first the cirrus doc id
	 *  the title should have been indexed into elasticsearch and second a
	 *  boolean indicating if redirects were followed. If the page would
	 *  not be indexed (for example a redirect loop, or redirect to
	 *  invalid page) the first array element will be null.
	 */
	private function determineCirrusDocId( PageIdentity $title ) {
		$hasRedirects = false;
		$seen = [];
		$now = wfTimestamp( TS_MW );
		$services = MediaWikiServices::getInstance();
		$contentHandlerFactory = $services->getContentHandlerFactory();
		$archivedRevisionLookup = $services->getArchivedRevisionLookup();
		while ( true ) {
			// Bail out on redirect loops and on overly long redirect chains.
			$keySeen = $title->getNamespace() . '|' . $title->getDBkey();
			if ( isset( $seen[$keySeen] ) || count( $seen ) > 10 ) {
				return [ null, $hasRedirects ];
			}
			$seen[$keySeen] = true;

			// To help the integration tests figure out when a deleted page has
			// been removed from the elasticsearch index we lookup the page in
			// the archive to get its page id. getPreviousRevisionRecord will
			// check both the archive and live content to return the most recent.
			$revRecord = $archivedRevisionLookup->getPreviousRevisionRecord( $title, $now );
			if ( !$revRecord ) {
				return [ null, $hasRedirects ];
			}

			$pageId = $revRecord->getPageId();
			$mainSlot = $revRecord->getSlot( SlotRecord::MAIN, RevisionRecord::RAW );
			$handler = $contentHandlerFactory->getContentHandler( $mainSlot->getModel() );
			if ( !$handler->supportsRedirects() ) {
				// Always hand back a cirrus doc id (not the raw page id) so
				// every non-null return of this method has the same format.
				return [ $this->getSearchConfig()->makeId( $pageId ), $hasRedirects ];
			}
			$content = $mainSlot->getContent();
			// getUltimateRedirectTarget() would be preferred, but it won't find
			// archive pages...
			if ( !$content->isRedirect() ) {
				return [ $this->getSearchConfig()->makeId( $pageId ), $hasRedirects ];
			}
			$redirect = $content->getRedirectTarget();
			if ( !$redirect ) {
				// TODO: Can this happen?
				return [ $this->getSearchConfig()->makeId( $pageId ), $hasRedirects ];
			}

			$hasRedirects = true;
			$title = $redirect;
		}
	}

	/**
	 * @param array $source _source document from elasticsearch
	 * @param LinkTarget $title Title to check for redirect
	 * @return bool True when $title is stored as a redirect in $source
	 */
	private function hasRedirect( array $source, LinkTarget $title ) {
		if ( !isset( $source['redirect'] ) ) {
			return false;
		}
		foreach ( $source['redirect'] as $redirect ) {
			if ( $redirect['namespace'] === $title->getNamespace()
				&& $redirect['title'] === $title->getText()
			) {
				return true;
			}
		}
		return false;
	}

	/**
	 * User on whose behalf the search backend is queried; supplied by the
	 * class using this trait.
	 *
	 * @return User
	 */
	abstract public function getUser();

}