Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 121 |
|
0.00% |
0 / 10 |
CRAP | |
0.00% |
0 / 1 |
DumpIndex | |
0.00% |
0 / 114 |
|
0.00% |
0 / 10 |
702 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 53 |
|
0.00% |
0 / 1 |
110 | |||
write | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
writeLine | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
getIndex | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
outputIndented | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
output | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
outputProgress | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
getClient | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getTotalHits | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use CirrusSearch\Elastica\SearchAfter; |
6 | use CirrusSearch\Maintenance\Exception\IndexDumperException; |
7 | use CirrusSearch\SearchConfig; |
8 | use Elastica; |
9 | use Elastica\JSON; |
10 | use Elastica\Query; |
11 | |
12 | /** |
13 | * Dump an index to stdout |
14 | * |
15 | * This program is free software; you can redistribute it and/or modify |
16 | * it under the terms of the GNU General Public License as published by |
17 | * the Free Software Foundation; either version 2 of the License, or |
18 | * (at your option) any later version. |
19 | * |
20 | * This program is distributed in the hope that it will be useful, |
21 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | * GNU General Public License for more details. |
24 | * |
25 | * You should have received a copy of the GNU General Public License along |
26 | * with this program; if not, write to the Free Software Foundation, Inc., |
27 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
28 | * http://www.gnu.org/copyleft/gpl.html |
29 | */ |
30 | |
// Locate the MediaWiki installation: use the MW_INSTALL_PATH environment
// variable when it is set, otherwise fall back to a path relative to this file.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = $IP === false ? __DIR__ . '/../../..' : $IP;

// Load the core maintenance base class first, then the extension's own one.
require_once "$IP/maintenance/Maintenance.php";
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';
37 | |
/**
 * Dump an index from elasticsearch.
 *
 * Walks the selected index page by page (sorted on _id) and writes every
 * document to stdout as two lines in the elasticsearch bulk format: an
 * "index" action line followed by the document source. Once dumping starts,
 * status and progress messages are written to stderr so they never mix with
 * the dump itself.
 */
class DumpIndex extends Maintenance {

	/**
	 * @var string|null Suffix of the index to dump, validated in execute()
	 */
	private $indexSuffix;

	/**
	 * @var string Base name of the index, defaults to the configured index base name
	 */
	private $indexBaseName;

	/**
	 * @var string|null Explicit index identifier, the alias is used when unset
	 */
	private $indexIdentifier;

	/**
	 * @var int Size passed to the search query, i.e. number of documents
	 *  requested per page
	 */
	private $inputChunkSize = 500;

	/**
	 * @var bool When true output() writes to stderr instead of the parent's output
	 */
	private $logToStderr = false;

	/**
	 * @var int Last percentage seen by outputProgress(); starts at 0 so that
	 *  a 0% progress line is never printed
	 */
	private $lastProgressPrinted = 0;

	public function __construct() {
		parent::__construct();
		$this->addDescription( "Dump an index into a 'json' based format stdout. " .
			"This format complies to the elasticsearch bulk format and can be directly used " .
			"with a curl command like : " .
			"curl -s -XPOST localhost:9200/{index}/_bulk --data-binary @dump-file\n" .
			"Note that you need to specify the index in the URL because the bulk commands do not " .
			"contain the index name. Beware that the bulk import is not meant to import very large " .
			"files, sweet spot seems to be between 2000 and 5000 documents (see examples below)." .
			"\nThis always operates on a single cluster." .
			"\n\nExamples :\n" .
			" - Dump a general index :" .
			"\n\tdumpIndex --indexSuffix general\n" .
			" - Dump a large content index into compressed chunks of 100000 documents :" .
			"\n\tdumpIndex --indexSuffix content | split -d -a 9 -l 100000 " .
			"--filter 'gzip -c > \$FILE.txt.gz' - \"\" \n" .
			"\nYou can import the data with the following commands :\n" .
			" - Import chunks of 2000 documents :" .
			"\n\tcat dump | split -l 4000 --filter 'curl -s http://elastic:9200/{indexName}/_bulk " .
			"--data-binary @- > /dev/null'\n" .
			" - Import 3 chunks of 2000 documents in parallel :" .
			"\n\tcat dump | parallel --pipe -L 2 -N 2000 -j3 'curl -s http://elastic:9200/{indexName}/_bulk " .
			"--data-binary @- > /dev/null'" );
		$this->addOption( 'indexSuffix', 'Index to dump. Either content or general.', false, true );
		$this->addOption( 'indexType', 'BC form of --indexSuffix.', false, true );
		$this->addOption( 'baseName', 'What basename to use, ' .
			'defaults to wiki id.', false, true );
		$this->addOption( 'filter', 'Dump only the documents that match the filter query ' .
			'(queryString syntax).', false, true );
		$this->addOption( 'limit', 'Maximum number of documents to dump, 0 means no limit. Defaults to 0.',
			false, true );
		$this->addOption( 'indexIdentifier', 'Force the index identifier, use the alias otherwise.', false, true );
		$this->addOption( 'sourceFields', 'List of comma separated source fields to extract.', false, true );
	}

	/**
	 * Validate the options, then stream the matching documents to stdout.
	 *
	 * @return bool Always true; invalid options end the run through fatalError()
	 */
	public function execute() {
		$this->disablePoolCountersAndLogging();

		$this->indexSuffix = $this->getBackCompatOption( 'indexSuffix', 'indexType' );
		$this->indexBaseName = $this->getOption( 'baseName',
			$this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME ) );

		$indexSuffixes = $this->getConnection()->getAllIndexSuffixes();
		// Strict comparison so that a missing option (null) can never match loosely.
		if ( !in_array( $this->indexSuffix, $indexSuffixes, true ) ) {
			$this->fatalError( 'indexSuffix option must be one of ' .
				implode( ', ', $indexSuffixes ) );
		}

		$this->indexIdentifier = $this->getOption( 'indexIdentifier' );

		$limit = (int)$this->getOption( 'limit', 0 );

		$search = new Elastica\Search( $this->getClient() );
		$search->setQuery( $this->buildQuery( $limit ) );
		$search->addIndex( $this->getIndex() );
		$searchAfter = new SearchAfter( $search );

		$totalDocsInIndex = -1;
		$totalDocsToDump = -1;
		$docsDumped = 0;

		// From here on stdout carries the dump, so messages must go to stderr.
		$this->logToStderr = true;

		foreach ( $searchAfter as $results ) {
			if ( $totalDocsInIndex === -1 ) {
				// The totals are only known once the first page has been fetched.
				$totalDocsInIndex = $this->getTotalHits( $results );
				// Never announce more documents than the query matches, otherwise
				// the progress percentage could not reach 100.
				$totalDocsToDump = $limit > 0 ? min( $limit, $totalDocsInIndex ) : $totalDocsInIndex;
				$this->output( "Dumping $totalDocsToDump documents ($totalDocsInIndex in the index)\n" );
			}

			foreach ( $results as $result ) {
				$document = [
					'_id' => $result->getId(),
					'_type' => $result->getType(),
					'_source' => $result->getSource()
				];
				$this->write( $document );
				$docsDumped++;
				if ( $docsDumped >= $totalDocsToDump ) {
					break;
				}
			}
			$this->outputProgress( $docsDumped, $totalDocsToDump );

			// Leaving only the inner loop is not enough: without this check the next
			// page would still be fetched and one more document written per page.
			if ( $docsDumped >= $totalDocsToDump ) {
				break;
			}
		}
		$this->output( "Dump done ($docsDumped docs).\n" );

		return true;
	}

	/**
	 * Build the search request body used to page through the index.
	 *
	 * @param int $limit Maximum number of documents to dump, 0 or less means no limit
	 * @return Query
	 */
	private function buildQuery( $limit ) {
		// There is no point in requesting more documents per page than will be dumped.
		$pageSize = $limit > 0 ? min( $limit, $this->inputChunkSize ) : $this->inputChunkSize;

		$query = new Query();
		$query->setStoredFields( [ '_id', '_type', '_source' ] );
		$query->setSize( $pageSize );
		// Elasticsearch docs (mapping-id-field.html) say that sorting by _id is restricted
		// from use, but at least on 7.10.2 this works as one would expect. The _id comes
		// from fielddata, this would be more efficient with a doc-values enabled field.
		// Additionally the cluster setting 'indices.id_field_data.enabled=false' breaks
		// this.
		$query->setSort( [
			[ '_id' => 'asc' ],
		] );
		$query->setTrackTotalHits( true );

		if ( $this->hasOption( 'sourceFields' ) ) {
			// Tolerate whitespace around the commas ("title, text").
			$sourceFields = array_map( 'trim', explode( ',', $this->getOption( 'sourceFields' ) ) );
			$query->setSource( [ 'include' => $sourceFields ] );
		}

		if ( $this->hasOption( 'filter' ) ) {
			$bool = new Elastica\Query\BoolQuery();
			$bool->addFilter( new Elastica\Query\QueryString( $this->getOption( 'filter' ) ) );
			$query->setQuery( $bool );
		}

		return $query;
	}

	/**
	 * Write one document to stdout as two lines: the bulk "index" action
	 * followed by the document source.
	 *
	 * @param array $document Valid elasticsearch document to write to stdout,
	 *  with the keys '_type', '_id' and '_source'
	 * @throws IndexDumperException When stdout cannot be written to
	 */
	public function write( array $document ) {
		$indexOp = [
			'index' => [
				'_type' => $document['_type'],
				'_id' => $document['_id']
			] ];

		// We use Elastica wrapper around json_encode.
		// Depending on PHP version JSON_ESCAPE_UNICODE will be used
		$this->writeLine( JSON::stringify( $indexOp ) );
		$this->writeLine( JSON::stringify( $document['_source'] ) );
	}

	/**
	 * Write a single newline-terminated line to stdout.
	 *
	 * @param string $data Line content, without the trailing newline
	 * @throws IndexDumperException When the line could not be written completely
	 */
	private function writeLine( $data ) {
		$line = $data . "\n";
		$written = fwrite( STDOUT, $line );
		// fwrite() reports failure as false and may also report a short write;
		// either one would leave a truncated dump behind.
		if ( $written === false || $written < strlen( $line ) ) {
			throw new IndexDumperException( "Cannot write to standard output" );
		}
	}

	/**
	 * @return Elastica\Index The index to dump; addressed through the explicit
	 *  index identifier when one was given
	 */
	private function getIndex() {
		$connection = $this->getConnection();
		if ( $this->indexIdentifier ) {
			return $connection->getIndex( $this->indexBaseName, $this->indexSuffix, $this->indexIdentifier );
		}
		return $connection->getIndex( $this->indexBaseName, $this->indexSuffix );
	}

	/**
	 * Output a message prefixed with a tab.
	 *
	 * @param string $message
	 */
	public function outputIndented( $message ) {
		$this->output( "\t$message" );
	}

	/**
	 * Output a message unless quiet mode is on. While documents are being
	 * dumped the message goes to stderr, because stdout carries the dump.
	 *
	 * @param string $message
	 * @param string|null $channel Forwarded to the parent implementation when
	 *  the message is not redirected to stderr
	 */
	public function output( $message, $channel = null ) {
		if ( $this->mQuiet ) {
			return;
		}
		if ( $this->logToStderr ) {
			// We must log to stderr
			fwrite( STDERR, $message );
			return;
		}
		parent::output( $message, $channel );
	}

	/**
	 * Print the progress as a percentage. A line is printed only when the
	 * percentage changed since the previous call and is an even number.
	 *
	 * Kept public for backward compatibility.
	 *
	 * @param int $docsDumped Number of documents written so far
	 * @param int $limit Total number of documents expected to be written
	 */
	public function outputProgress( $docsDumped, $limit ) {
		// Nothing to report yet, or no meaningful total to compute a ratio from
		// (this also avoids a division by zero).
		if ( $docsDumped <= 0 || $limit <= 0 ) {
			return;
		}
		$pctDone = (int)( ( $docsDumped / $limit ) * 100 );
		if ( $this->lastProgressPrinted === $pctDone ) {
			return;
		}
		$this->lastProgressPrinted = $pctDone;
		if ( ( $pctDone % 2 ) === 0 ) {
			$this->outputIndented( "$pctDone% done...\n" );
		}
	}

	/**
	 * @return Elastica\Client
	 */
	protected function getClient() {
		return $this->getConnection()->getClient();
	}

	/**
	 * Read the total number of hits from the raw response.
	 *
	 * @param Elastica\ResultSet $results
	 * @return int
	 */
	private function getTotalHits( Elastica\ResultSet $results ) {
		// hack to support ES6, switch to getTotalHits
		// hits.total is either an object holding a "value" key or a plain number.
		$hits = $results->getResponse()->getData()['hits'] ?? [];
		$total = $hits['total'] ?? 0;
		if ( is_array( $total ) ) {
			$total = $total['value'] ?? 0;
		}
		return (int)$total;
	}

}
286 | |
// Register this script's class name in $maintClass, then include the file named
// by RUN_MAINTENANCE_IF_MAIN (presumably the core runner that executes the class
// when this file is the entry point; defined outside this file, confirm there).
$maintClass = DumpIndex::class;
require_once RUN_MAINTENANCE_IF_MAIN;