Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
13.33% |
8 / 60 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
ApiTrait | |
13.33% |
8 / 60 |
|
40.00% |
2 / 5 |
431.85 | |
0.00% |
0 / 1 |
getCirrusConnection | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getSearchConfig | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
loadDocuments | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
72 | |||
determineCirrusDocId | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
72 | |||
hasRedirect | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
30 | |||
getUser | n/a |
0 / 0 |
n/a |
0 / 0 |
0 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Api; |
4 | |
5 | use CirrusSearch\Connection; |
6 | use CirrusSearch\SearchConfig; |
7 | use CirrusSearch\Searcher; |
8 | use MediaWiki\MediaWikiServices; |
9 | use MediaWiki\Revision\RevisionRecord; |
10 | use MediaWiki\Revision\SlotRecord; |
11 | use PageArchive; |
12 | use Title; |
13 | use User; |
14 | |
/**
 * Shared helpers for CirrusSearch API modules: lazy access to the search
 * configuration and elasticsearch connection, plus lookup of the indexed
 * document(s) that represent a given title.
 *
 * Classes using this trait must provide getUser().
 */
trait ApiTrait {
	/** @var Connection|null Lazily created by getCirrusConnection() */
	private $connection;
	/** @var SearchConfig|null Lazily created by getSearchConfig() */
	private $searchConfig;

	/**
	 * Get the connection built from this module's search config, creating
	 * it on first use and reusing it afterwards.
	 *
	 * @return Connection
	 */
	public function getCirrusConnection() {
		if ( $this->connection === null ) {
			$this->connection = new Connection( $this->getSearchConfig() );
		}
		return $this->connection;
	}

	/**
	 * Get the 'CirrusSearch' config from the config factory, creating it on
	 * first use and reusing it afterwards.
	 *
	 * @return SearchConfig
	 */
	protected function getSearchConfig() {
		if ( $this->searchConfig === null ) {
			$this->searchConfig = MediaWikiServices::getInstance()
				->getConfigFactory()
				->makeConfig( 'CirrusSearch' );
		}
		return $this->searchConfig;
	}

	/**
	 * Load the elasticsearch document(s) that currently represent $title.
	 *
	 * Documents are only reported when they can be confirmed to represent
	 * $title: either the stored title matches, or (when redirects were
	 * followed) $title is listed among the document's redirects. This means
	 * that a $sourceFiltering which excludes the `title` / `redirect` fields
	 * results in the document being skipped.
	 *
	 * @param Title $title
	 * @param string[]|bool $sourceFiltering source filtering to apply
	 * @return array[] List of arrays with the keys index, type, id, version
	 *  and source. Empty when nothing matching was found or the request
	 *  to elasticsearch was not OK.
	 */
	public function loadDocuments( Title $title, $sourceFiltering = true ) {
		[ $docId, $hasRedirects ] = $this->determineCirrusDocId( $title );
		if ( $docId === null ) {
			return [];
		}
		// could be optimized by implementing multi-get but not
		// expecting much usage except debugging/tests.
		$searcher = new Searcher(
			$this->getCirrusConnection(),
			0,
			0,
			$this->getSearchConfig(),
			[],
			$this->getUser()
		);
		$esSources = $searcher->get( [ $docId ], $sourceFiltering );
		if ( !$esSources->isOK() ) {
			return [];
		}

		$result = [];
		foreach ( $esSources->getValue() as $esSource ) {
			$data = $esSource->getData();

			if ( $hasRedirects ) {
				// If we have followed redirects only report the
				// article dump if the redirect has been indexed. If it
				// hasn't been indexed this document does not represent
				// the original title.
				if ( !$this->hasRedirect( $data, $title ) ) {
					continue;
				}
			} elseif ( ( $data['title'] ?? null ) !== $title->getText() ) {
				// If this was not a redirect and the title doesn't match that
				// means a page was moved, but elasticsearch has not yet been
				// updated. Don't return the document that doesn't actually
				// represent the page (yet).
				//
				// The comparison is strict on purpose: a loose comparison
				// treats numeric-looking titles such as "1e3" and "1000" as
				// equal. A title missing from the (filtered) source is
				// treated as a mismatch.
				continue;
			}

			$result[] = [
				'index' => $esSource->getIndex(),
				'type' => $esSource->getType(),
				'id' => $esSource->getId(),
				'version' => $esSource->getVersion(),
				'source' => $data,
			];
		}
		return $result;
	}

	/**
	 * Trace redirects to find the page id the title should be indexed to in
	 * cirrussearch. Differs from Updater::traceRedirects in that this also
	 * supports archived pages. Archive support is important for integration
	 * tests that need to know when a page that was deleted from SQL was
	 * finally removed from elasticsearch.
	 *
	 * This still fails to find the correct page id if something was moved, as
	 * that page is renamed rather than being moved to the archive. We could
	 * further complicate things by looking into move logs but not sure that
	 * is worth the complication.
	 *
	 * @param Title $title
	 * @return array Two element array containing first the cirrus doc id
	 *  the title should have been indexed into elasticsearch and second a
	 *  boolean indicating if redirects were followed. If the page would
	 *  not be indexed (for example a redirect loop, too many chained
	 *  redirects, or no revision found for the title) the first array
	 *  element will be null.
	 */
	private function determineCirrusDocId( Title $title ) {
		$hasRedirects = false;
		$seen = [];
		$now = wfTimestamp( TS_MW );
		$contentHandlerFactory = MediaWikiServices::getInstance()->getContentHandlerFactory();
		while ( true ) {
			$prefixedText = $title->getPrefixedText();
			// Bail out on redirect loops and on overly long redirect chains.
			if ( isset( $seen[$prefixedText] ) || count( $seen ) > 10 ) {
				return [ null, $hasRedirects ];
			}
			$seen[$prefixedText] = true;

			// To help the integration tests figure out when a deleted page has
			// been removed from the elasticsearch index we lookup the page in
			// the archive to get its page id. getPreviousRevisionRecord will
			// check both the archive and live content to return the most recent.
			$revRecord = ( new PageArchive( $title ) )->getPreviousRevisionRecord( $now );
			if ( !$revRecord ) {
				return [ null, $hasRedirects ];
			}

			// Always hand back the id as built by the search config so every
			// success path returns the same kind of value (a cirrus doc id)
			// rather than a mix of doc ids and raw page ids.
			$docId = $this->getSearchConfig()->makeId( $revRecord->getPageId() );

			$mainSlot = $revRecord->getSlot( SlotRecord::MAIN, RevisionRecord::RAW );
			$handler = $contentHandlerFactory->getContentHandler( $mainSlot->getModel() );
			if ( !$handler->supportsRedirects() ) {
				return [ $docId, $hasRedirects ];
			}
			$content = $mainSlot->getContent();
			// getUltimateRedirectTarget() would be preferred, but it won't find
			// archive pages...
			if ( !$content->isRedirect() ) {
				return [ $docId, $hasRedirects ];
			}
			$redirect = $content->getRedirectTarget();
			if ( !$redirect ) {
				// TODO: Can this happen?
				return [ $docId, $hasRedirects ];
			}

			$hasRedirects = true;
			$title = $redirect;
		}
	}

	/**
	 * Check whether $title is listed in the `redirect` field of an
	 * elasticsearch document. Both namespace and title text must match.
	 *
	 * @param array $source _source document from elasticsearch
	 * @param Title $title Title to check for redirect
	 * @return bool True when $title is stored as a redirect in $source
	 */
	private function hasRedirect( array $source, Title $title ) {
		if ( !isset( $source['redirect'] ) ) {
			return false;
		}
		$namespace = $title->getNamespace();
		$text = $title->getText();
		foreach ( $source['redirect'] as $redirect ) {
			if ( $redirect['namespace'] === $namespace && $redirect['title'] === $text ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * The user on whose behalf searches are performed; supplied by the
	 * class using this trait.
	 *
	 * @return User
	 */
	abstract public function getUser();

}