Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
95.56% |
86 / 90 |
|
88.89% |
8 / 9 |
CRAP | |
0.00% |
0 / 1 |
| PrewarmParsoidParserCache | |
95.56% |
86 / 90 |
|
88.89% |
8 / 9 |
20 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
1 | |||
| getPageLookup | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| getRevisionLookup | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| getParserOutputAccess | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| getParsoidSiteConfig | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| getQueryBuilder | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
| parse | |
66.67% |
8 / 12 |
|
0.00% |
0 / 1 |
3.33 | |||
| normalizeNamespace | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| execute | |
100.00% |
48 / 48 |
|
100.00% |
1 / 1 |
10 | |||
| 1 | <?php |
| 2 | use MediaWiki\Maintenance\Maintenance; |
| 3 | use MediaWiki\Page\PageLookup; |
| 4 | use MediaWiki\Page\PageRecord; |
| 5 | use MediaWiki\Page\ParserOutputAccess; |
| 6 | use MediaWiki\Parser\ParserOptions; |
| 7 | use MediaWiki\Parser\Parsoid\Config\SiteConfig as ParsoidSiteConfig; |
| 8 | use MediaWiki\Revision\RevisionLookup; |
| 9 | use MediaWiki\Revision\RevisionRecord; |
| 10 | use MediaWiki\Revision\SlotRecord; |
| 11 | use MediaWiki\Status\Status; |
| 12 | use Wikimedia\Parsoid\Core\ClientError; |
| 13 | use Wikimedia\Parsoid\Core\ResourceLimitExceededException; |
| 14 | use Wikimedia\Rdbms\SelectQueryBuilder; |
| 15 | |
| 16 | // @codeCoverageIgnoreStart |
| 17 | require_once __DIR__ . '/Maintenance.php'; |
| 18 | // @codeCoverageIgnoreEnd |
| 19 | |
/**
 * Maintenance script for populating parser cache with parsoid output.
 *
 * Pages are walked in ascending page ID order, one batch at a time. For each
 * page whose main-slot content model is supported by Parsoid, the latest
 * revision is rendered through ParserOutputAccess with Parsoid enabled.
 *
 * @since 1.41
 *
 * @license GPL-2.0-or-later
 * @author Richika Rana
 */
class PrewarmParsoidParserCache extends Maintenance {
	/** @var int Option bits handed to ParserOutputAccess::getParserOutput(); 0 unless --force is given */
	private int $forceParse = 0;
	private ParserOutputAccess $parserOutputAccess;
	private PageLookup $pageLookup;
	private RevisionLookup $revisionLookup;
	private ParsoidSiteConfig $parsoidSiteConfig;

	/**
	 * Register the script description, its command line options and the
	 * default batch size (100).
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription(
			'Populate parser cache with parsoid output. By default, script attempt to run ' .
			'for supported content model pages (in a specified batch if provided)'
		);
		$this->addOption(
			'force',
			'Re-parse pages even if the cached entry seems up to date',
			false,
			false
		);
		$this->addOption( 'start-from', 'Start from this page ID', false, true );
		$this->addOption( 'namespace', 'Filter pages in this namespace', false, true );
		$this->setBatchSize( 100 );
	}

	/**
	 * Lazily fetch the page store from the service container.
	 *
	 * @return PageLookup
	 */
	private function getPageLookup(): PageLookup {
		$this->pageLookup ??= $this->getServiceContainer()->getPageStore();
		return $this->pageLookup;
	}

	/**
	 * Lazily fetch the revision lookup from the service container.
	 *
	 * @return RevisionLookup
	 */
	private function getRevisionLookup(): RevisionLookup {
		$this->revisionLookup ??= $this->getServiceContainer()->getRevisionLookup();
		return $this->revisionLookup;
	}

	/**
	 * Lazily fetch the ParserOutputAccess service from the service container.
	 *
	 * @return ParserOutputAccess
	 */
	private function getParserOutputAccess(): ParserOutputAccess {
		$this->parserOutputAccess ??= $this->getServiceContainer()->getParserOutputAccess();
		return $this->parserOutputAccess;
	}

	/**
	 * Lazily fetch the Parsoid site configuration from the service container.
	 *
	 * @return ParsoidSiteConfig
	 */
	private function getParsoidSiteConfig(): ParsoidSiteConfig {
		$this->parsoidSiteConfig ??= $this->getServiceContainer()->getParsoidSiteConfig();
		return $this->parsoidSiteConfig;
	}

	/**
	 * Build the base query: page IDs from the page table on a replica,
	 * ordered by ascending page ID. Callers add the filters and the limit.
	 *
	 * @return SelectQueryBuilder
	 */
	private function getQueryBuilder(): SelectQueryBuilder {
		$dbr = $this->getReplicaDB();

		return $dbr->newSelectQueryBuilder()
			->select( [ 'page_id' ] )
			->from( 'page' )
			->caller( __METHOD__ )
			->orderBy( 'page_id', SelectQueryBuilder::SORT_ASC );
	}

	/**
	 * Render one revision of a page with Parsoid through ParserOutputAccess.
	 *
	 * @param PageRecord $page
	 * @param RevisionRecord $revision
	 * @return Status The status returned by ParserOutputAccess::getParserOutput(),
	 *   or a fatal status if Parsoid raised a ClientError or a
	 *   ResourceLimitExceededException.
	 */
	private function parse(
		PageRecord $page,
		RevisionRecord $revision
	): Status {
		$popts = ParserOptions::newFromAnon();
		$popts->setUseParsoid();
		try {
			return $this->getParserOutputAccess()->getParserOutput(
				$page,
				$popts,
				$revision,
				$this->forceParse
			);
		} catch ( ClientError $e ) {
			// Report the failure as a status so that one bad page does not abort the run.
			return Status::newFatal( 'parsoid-client-error', $e->getMessage() );
		} catch ( ResourceLimitExceededException $e ) {
			return Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
		}
	}

	/**
	 * NamespaceInfo::getCanonicalIndex() requires the namespace to be in lowercase,
	 * so let's do some normalization and return its canonical index.
	 *
	 * @param string $namespace The namespace string from the command line
	 * @return ?int The canonical index of the namespace, or null if the
	 *   lookup does not recognise the name
	 */
	private function normalizeNamespace( string $namespace ): ?int {
		return $this->getServiceContainer()->getNamespaceInfo()
			->getCanonicalIndex( strtolower( $namespace ) );
	}

	/**
	 * Populate parser cache with parsoid output.
	 *
	 * @return bool
	 */
	public function execute() {
		$startFrom = (int)$this->getOption( 'start-from' );

		// We need the namespace index instead of the name to perform the query
		// on, because that's what the page table stores (in the page_namespace field).
		$namespaceIndex = null;
		$namespace = $this->getOption( 'namespace' );
		if ( $namespace !== null ) {
			$namespaceIndex = $this->normalizeNamespace( $namespace );
			if ( $namespaceIndex === null ) {
				// Without this check an unrecognised name would silently drop the
				// filter and the script would process every namespace.
				$this->fatalError( "Unknown namespace: \"$namespace\"" );
			}
		}

		if ( $this->hasOption( 'force' ) ) {
			// If --force is supplied, force a parse for supported pages or supported
			// pages in the specified batch.
			$this->forceParse = ParserOutputAccess::OPT_FORCE_PARSE;
		}

		$this->output( "\nWarming parsoid parser cache with Parsoid output...\n\n" );
		while ( true ) {
			$query = $this->getQueryBuilder();
			if ( $namespaceIndex !== null ) {
				$query = $query->where( [ 'page_namespace' => $namespaceIndex ] );
			}
			$query = $query->where( $this->getReplicaDB()->expr( 'page_id', '>=', $startFrom ) )
				->limit( $this->getBatchSize() );

			$result = $query->fetchResultSet();

			if ( !$result->numRows() ) {
				break;
			}

			// The printed upper bound is derived from the batch size, not from the
			// page IDs actually returned by the query.
			$currentBatch = $startFrom + ( $this->getBatchSize() - 1 );
			$this->output( "\n\nBatch: $startFrom - $currentBatch\n----\n" );

			// Look through pages by pageId and populate the parserCache
			foreach ( $result as $row ) {
				$page = $this->getPageLookup()->getPageById( $row->page_id );
				// Advance the cursor first so that skipped pages are not fetched again.
				$startFrom = ( (int)$row->page_id + 1 );

				if ( $page === null ) {
					$this->output( "\n[Skipped] Page ID: $row->page_id not found.\n" );
					continue;
				}

				$latestRevision = $page->getLatest();
				$revision = $this->getRevisionLookup()->getRevisionById( $latestRevision );
				if ( $revision === null ) {
					// The revision lookup may not find the revision; skip the page
					// instead of failing on a null dereference below.
					$this->output(
						"[Skipped] Latest revision not found for page ID: $row->page_id.\n"
					);
					continue;
				}

				$model = $revision->getSlot( SlotRecord::MAIN )->getModel();

				// POA will write a dummy output to PC, but we don't want that here. Just skip!
				if ( !$this->getParsoidSiteConfig()->supportsContentModel( $model ) ) {
					$this->output(
						'[Skipped] Content model "' .
						$model .
						"\" not supported for page ID: $row->page_id.\n"
					);
					continue;
				}

				$status = $this->parse( $page, $revision );
				if ( !$status->isOK() ) {
					$this->output(
						__METHOD__ .
						": Error parsing page ID: $row->page_id or writing to parser cache\n"
					);
					continue;
				}

				$this->output( "[Done] Page ID: $row->page_id ✔️\n" );
			}
			$this->waitForReplication();
		}

		$this->output( "\nDone pre-warming parsoid parser cache...\n" );

		return true;
	}
}
| 201 | |
// @codeCoverageIgnoreStart
// Script entry point: name the class defined in this file, then include the file
// that RUN_MAINTENANCE_IF_MAIN points to.
// NOTE(review): presumably that file runs $maintClass only when this script is the
// one being executed directly - confirm against Maintenance.php.
$maintClass = PrewarmParsoidParserCache::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd