Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
82.44% |
108 / 131 |
|
25.00% |
2 / 8 |
CRAP | |
0.00% |
0 / 1 |
| QueryBuildDocument | |
82.44% |
108 / 131 |
|
25.00% |
2 / 8 |
33.55 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| execute | |
81.25% |
13 / 16 |
|
0.00% |
0 / 1 |
3.06 | |||
| doExecute | |
87.34% |
69 / 79 |
|
0.00% |
0 / 1 |
16.52 | |||
| getRevisionIDs | |
90.91% |
10 / 11 |
|
0.00% |
0 / 1 |
5.02 | |||
| getAllowedParams | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
| isInternal | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| getExamplesMessages | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| markUnrenderable | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\Api; |
| 4 | |
| 5 | use CirrusSearch\BuildDocument\BuildDocument; |
| 6 | use CirrusSearch\BuildDocument\DocumentSizeLimiter; |
| 7 | use CirrusSearch\CirrusSearch; |
| 8 | use CirrusSearch\Profile\SearchProfileService; |
| 9 | use CirrusSearch\Search\CirrusIndexField; |
| 10 | use CirrusSearch\SearchConfig; |
| 11 | use MediaWiki\Api\ApiBase; |
| 12 | use MediaWiki\Api\ApiQuery; |
| 13 | use MediaWiki\Api\ApiQueryBase; |
| 14 | use MediaWiki\Api\ApiResult; |
| 15 | use MediaWiki\MediaWikiServices; |
| 16 | use MediaWiki\PoolCounter\PoolCounterWorkViaCallback; |
| 17 | use MediaWiki\Revision\SlotRecord; |
| 18 | use Wikimedia\ParamValidator\ParamValidator; |
| 19 | |
/**
 * Generate CirrusSearch document for page.
 *
 * Query module (parameter prefix "cb") that builds the CirrusSearch document
 * for every requested page or revision and adds it, together with some
 * indexing metadata, to the API result.
 *
 * @license GPL-2.0-or-later
 */
class QueryBuildDocument extends ApiQueryBase {
	use ApiTrait;

	/**
	 * @param ApiQuery $query Parent query module
	 * @param string $moduleName Name this module is registered under
	 */
	public function __construct( ApiQuery $query, string $moduleName ) {
		parent::__construct( $query, $moduleName, 'cb' );
	}

	/**
	 * Entry point: build the documents, guarded by a PoolCounter unless the
	 * request comes from the configured streaming updater user.
	 *
	 * @throws \RuntimeException When the default search engine is not CirrusSearch
	 */
	public function execute() {
		$engine = MediaWikiServices::getInstance()->getSearchEngineFactory()->create();
		if ( !( $engine instanceof CirrusSearch ) ) {
			throw new \RuntimeException( 'Could not create cirrus engine' );
		}

		$requestUserName = $this->getUser()->getName();
		$updaterUserName = $engine->getConfig()->get( 'CirrusSearchStreamingUpdaterUsername' );
		if ( $requestUserName === $updaterUserName ) {
			// Bypass poolcounter protection for the internal cirrus user
			$this->doExecute( $engine );
			return;
		}

		// Protect against too many concurrent requests.
		// A single global key is used: this API is internal and only useful for
		// manual debugging, so there is no real need for a per-user key.
		$worker = new PoolCounterWorkViaCallback(
			'CirrusSearch-QueryBuildDocument',
			'QueryBuildDocument',
			[
				'doWork' => function () use ( $engine ) {
					return $this->doExecute( $engine );
				},
				'error' => function () {
					$this->dieWithError( 'apierror-concurrency-limit' );
				},
			]
		);
		$worker->execute();
	}

	/**
	 * Build the documents for the requested pages/revisions and write them
	 * to the API result.
	 *
	 * For every successfully finalized document the following keys are added
	 * under query.pages.<pageId>: 'cirrusbuilddoc', 'cirrusbuilddoc_metadata'
	 * and 'cirrusbuilddoc_comment'. Redirects are flagged 'unrenderable',
	 * revisions whose text is not public are flagged 'texthidden' and
	 * revisions that cannot be loaded are reported under query.badrevids.
	 *
	 * @param CirrusSearch $engine Engine providing the search configuration
	 */
	public function doExecute( CirrusSearch $engine ) {
		$result = $this->getResult();
		$services = MediaWikiServices::getInstance();

		$builders = $this->getParameter( 'builders' );
		$profile = $this->getParameter( 'limiterprofile' );
		$flags = 0;
		if ( !in_array( 'content', $builders, true ) ) {
			$flags |= BuildDocument::SKIP_PARSE;
		}
		if ( !in_array( 'links', $builders, true ) ) {
			$flags |= BuildDocument::SKIP_LINKS;
		}

		// Maps page id to either a revision (revision based request) or a
		// wiki page (title/page id based request).
		$pages = [];
		$wikiPageFactory = $services->getWikiPageFactory();
		$revisionStore = $services->getRevisionStore();
		$revisionBased = false;
		if ( $this->getPageSet()->getRevisionIDs() ) {
			$revisionBased = true;
			foreach ( $this->getRevisionIDs() as $pageId => $revId ) {
				$rev = $revisionStore->getRevisionById( $revId );
				if ( $rev === null ) {
					// We cannot trust ApiPageSet to properly identify missing revisions, RevisionStore
					// might not agree with it likely because they could be using different db replicas (T370770)
					$result->addValue( 'query', 'badrevids', [
						$revId => [ 'revid' => $revId, 'missing' => true ]
					] );
				} elseif ( $rev->audienceCan( $rev::DELETED_TEXT, $rev::FOR_PUBLIC ) ) {
					// Redirects are not directly represented as searchable documents.
					// They are unrenderable.
					if ( $rev->getContent( SlotRecord::MAIN )->isRedirect() ) {
						$this->markUnrenderable( $result, $pageId );
					} else {
						$pages[$pageId] = $rev;
					}
				} else {
					// While the user might have permissions, we want to limit
					// what could possibly be indexed to that which is public.
					// For an anon this would fail deeper in the system
					// anyways, this early check mostly avoids blowing up deep
					// in the bowels.
					$result->addValue(
						[ 'query', 'pages', $pageId ],
						'texthidden', true
					);
				}
			}
		} else {
			foreach ( $this->getPageSet()->getGoodPages() as $pageId => $title ) {
				$page = $wikiPageFactory->newFromTitle( $title );
				if ( $page->isRedirect() ) {
					$this->markUnrenderable( $result, $pageId );
				} else {
					$pages[$pageId] = $page;
				}
			}
		}

		$searchConfig = $engine->getConfig();
		$connection = $this->getCirrusConnection();
		$sizeLimiter = new DocumentSizeLimiter(
			$searchConfig->getProfileService()->loadProfile(
				SearchProfileService::DOCUMENT_SIZE_LIMITER,
				SearchProfileService::CONTEXT_DEFAULT,
				$profile
			)
		);
		$builder = new BuildDocument(
			$connection,
			$this->getDB(),
			$revisionStore,
			$services->getBacklinkCacheFactory(),
			$sizeLimiter,
			$services->getTitleFormatter(),
			$wikiPageFactory,
			$services->getTitleFactory()
		);

		// Metadata shared by every document of this request.
		$baseMetadata = [];
		$clusterGroup = $searchConfig->getClusterAssignment()->getCrossClusterName();
		if ( $clusterGroup !== null ) {
			$baseMetadata['cluster_group'] = $clusterGroup;
		}
		$indexBaseName = $searchConfig->get( SearchConfig::INDEX_BASE_NAME );

		foreach ( $builder->initialize( $pages, $flags ) as $doc ) {
			$pageId = $doc->get( 'page_id' );
			$revision = $revisionBased ? $pages[$pageId] : null;
			if ( !$builder->finalize( $doc, false, $revision ) ) {
				continue;
			}
			$pagePath = [ 'query', 'pages', $pageId ];

			$result->addValue( $pagePath, 'cirrusbuilddoc', $doc->getData() );

			// Always start from the shared metadata so that 'cluster_group' is
			// reported even for documents that carry no noop hints.
			$metadata = $baseMetadata;
			$hints = CirrusIndexField::getHint( $doc, CirrusIndexField::NOOP_HINT );
			if ( $hints !== null ) {
				$metadata['noop_hints'] = $hints;
			}
			$limiterStats = CirrusIndexField::getHint( $doc, DocumentSizeLimiter::HINT_DOC_SIZE_LIMITER_STATS );
			if ( $limiterStats !== null ) {
				$metadata['size_limiter_stats'] = $limiterStats;
			}
			$metadata['index_name'] = $connection->getIndexName(
				$indexBaseName,
				$connection->getIndexSuffixForNamespace( $doc->get( 'namespace' ) )
			);

			$result->addValue( $pagePath, 'cirrusbuilddoc_metadata', $metadata );
			$result->addValue(
				$pagePath,
				'cirrusbuilddoc_comment',
				'The CirrusDoc format is meant for internal use by CirrusSearch for debugging or queries, '
				. 'it might change at any time without notice'
			);
		}
	}

	/**
	 * Reduce the requested revisions to a single revision per page.
	 *
	 * When several revisions of the same page are requested only the one with
	 * the highest revision id is kept and an API warning is emitted.
	 *
	 * @return int[] Revision id keyed by page id
	 */
	private function getRevisionIDs(): array {
		$revisionByPage = [];
		$sawDuplicate = false;
		foreach ( $this->getPageSet()->getRevisionIDs() as $revId => $pageId ) {
			if ( !isset( $revisionByPage[$pageId] ) ) {
				$revisionByPage[$pageId] = $revId;
				continue;
			}
			$sawDuplicate = true;
			if ( $revId > $revisionByPage[$pageId] ) {
				$revisionByPage[$pageId] = $revId;
			}
		}
		if ( $sawDuplicate ) {
			$this->addWarning( [ 'apiwarn-cirrus-ignore-revisions' ] );
		}
		return $revisionByPage;
	}

	/** @inheritDoc */
	public function getAllowedParams() {
		// Builders that may be requested; both are enabled by default.
		$builders = [ 'content', 'links' ];

		return [
			'builders' => [
				ParamValidator::PARAM_DEFAULT => $builders,
				ParamValidator::PARAM_ISMULTI => true,
				ParamValidator::PARAM_ALLOW_DUPLICATES => false,
				ParamValidator::PARAM_TYPE => $builders,
				ApiBase::PARAM_HELP_MSG => 'apihelp-query+cirrusbuilddoc-param-builders',
			],
			'limiterprofile' => [
				ParamValidator::PARAM_TYPE => 'string'
			],
		];
	}

	/**
	 * Mark as internal. This isn't meant to be used by normal api users
	 * @return bool
	 */
	public function isInternal() {
		return true;
	}

	/**
	 * @see ApiBase::getExamplesMessages
	 * @return array
	 */
	protected function getExamplesMessages() {
		return [
			'action=query&prop=cirrusbuilddoc&titles=Main_Page' =>
				'apihelp-query+cirrusbuilddoc-example'
		];
	}

	/**
	 * Flag a page as one no document can be built for (e.g. a redirect).
	 *
	 * @param ApiResult $result Result object to write to
	 * @param int $pageId The page to mark unrenderable
	 */
	private function markUnrenderable( ApiResult $result, int $pageId ) {
		$result->addValue(
			[ 'query', 'pages', $pageId ],
			'unrenderable', true
		);
	}

}