Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 35 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
DumpReader | |
0.00% |
0 / 35 |
|
0.00% |
0 / 5 |
182 | |
0.00% |
0 / 1 |
getLog | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
normalizeFilename | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
run | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
72 | |||
addJob | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\MathSearch\StackExchange; |
4 | |
5 | use MediaWiki\Logger\LoggerFactory; |
6 | use MediaWiki\MediaWikiServices; |
7 | |
/**
 * Streams a StackExchange XML dump and queues its <row> elements as
 * LineReaderJob batches.
 *
 * The dump is read with a pull parser (XMLReader); the attributes of every
 * <row> element are collected and handed to the job queue in fixed-size
 * batches.
 */
class DumpReader {

	/** Number of rows collected before a job is queued. */
	private const BATCH_SIZE = 1000;

	/**
	 * @var \XMLReader reader opened on the dump file
	 */
	private $file;
	/**
	 * @var string normalized (lower-cased, suffix-free) name of the dump file
	 */
	private $fileName;
	/** @var string directory prefix used to build the per-job error file path */
	private $errPath;
	/** @var int counter of the jobs queued so far; used to number the parts */
	private $part = 0;

	/**
	 * @return \Psr\Log\LoggerInterface logger of the 'MathSearch' channel
	 */
	private static function getLog() {
		return LoggerFactory::getInstance( 'MathSearch' );
	}

	/**
	 * @param \SplFileObject $file dump file to read
	 * @param string $errPath directory prefix for the error files of the jobs
	 * @throws \RuntimeException if the dump file cannot be resolved or opened
	 */
	public function __construct( $file, $errPath ) {
		$path = $file->getRealPath();
		$this->file = new \XMLReader();
		// open() must be invoked on the instance that run() reads from; a static
		// call would open a different (or no) reader and leave this one unusable.
		if ( $path === false || !$this->file->open( $path ) ) {
			throw new \RuntimeException(
				"Unable to open dump file '" . $file->getFilename() . "' for reading."
			);
		}
		$this->normalizeFilename( $file->getFilename() );
		$this->errPath = $errPath;
	}

	/**
	 * Stores the lower-cased part of the file name that precedes the first
	 * '.' or '_' in $this->fileName.
	 *
	 * @param string $fileName file name including extension / version suffix
	 */
	private function normalizeFilename( $fileName ) {
		// some posts file from arq20 math task were modified with additional version
		// information by appending either .V1.0 or _V1_0
		$fileparts = preg_split( "/[._]/", $fileName );
		$normalized_fn = strtolower( $fileparts[0] );
		$this->fileName = $normalized_fn;
		self::getLog()->debug( "'$fileName' is normalized to '$normalized_fn'." );
	}

	/**
	 * Reads the whole dump and queues one job per batch of rows.
	 *
	 * <row> elements without attributes are ignored; every other element is
	 * only logged. A trailing partial batch is queued after the last node,
	 * unless it is empty.
	 */
	public function run() {
		$rows = [];
		$xml = $this->file;
		while ( $xml->read() ) {
			if ( $xml->nodeType !== \XMLReader::ELEMENT ) {
				continue;
			}
			if ( $xml->name !== 'row' ) {
				self::getLog()->info( "Skip element: {line}", [ 'line' => $xml->name ] );
				continue;
			}
			if ( !$xml->hasAttributes ) {
				continue;
			}
			$attribs = [];
			while ( $xml->moveToNextAttribute() ) {
				$attribs[$xml->name] = $xml->value;
			}
			$rows[] = $attribs;
			if ( count( $rows ) >= self::BATCH_SIZE ) {
				$this->addJob( $rows );
				$rows = [];
			}
		}
		// Do not queue an empty job when the row count is an exact multiple of
		// the batch size or when the dump contained no rows at all.
		if ( $rows !== [] ) {
			$this->addJob( $rows );
		}
	}

	/**
	 * Wraps a batch of rows in a LineReaderJob and pushes it to the job queue.
	 *
	 * @param array $rows list of attribute maps (attribute name => value), one per row
	 */
	private function addJob( array $rows ) {
		$part = ++$this->part;
		// This file lives in a namespace, so the global Title class has to be
		// referenced with a leading backslash (same as \JobQueueGroup below).
		$title = \Title::newFromText( "SE reader '{$this->fileName}' part $part" );
		$job = new LineReaderJob( $title, [
			'rows' => $rows,
			'fileName' => $this->fileName,
			'errFile' => $this->errPath . "/{$this->fileName}-$part-err.xml",
		] );
		if ( method_exists( MediaWikiServices::class, 'getJobQueueGroup' ) ) {
			// MW 1.37+
			MediaWikiServices::getInstance()->getJobQueueGroup()->push( $job );
		} else {
			\JobQueueGroup::singleton()->push( $job );
		}
	}
}