Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
87.06% |
74 / 85 |
|
63.64% |
7 / 11 |
CRAP | |
0.00% |
0 / 1 |
| ParserOutputPageProperties | |
87.06% |
74 / 85 |
|
63.64% |
7 / 11 |
33.08 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| initialize | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| finishInitializeBatch | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| finalize | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| finalizeReal | |
81.58% |
31 / 38 |
|
0.00% |
0 / 1 |
6.23 | |||
| extractDisplayTitle | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
10 | |||
| isSameString | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| fixAndFlagInvalidUTF8InSource | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
| flagProblem | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| truncateFileTextContent | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
4 | |||
| truncateFileContent | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\BuildDocument; |
| 4 | |
| 5 | use CirrusSearch\CirrusSearch; |
| 6 | use CirrusSearch\Search\CirrusIndexField; |
| 7 | use CirrusSearch\SearchConfig; |
| 8 | use Elastica\Document; |
| 9 | use MediaWiki\Logger\LoggerFactory; |
| 10 | use MediaWiki\MediaWikiServices; |
| 11 | use MediaWiki\Page\WikiPage; |
| 12 | use MediaWiki\Parser\ParserOutput; |
| 13 | use MediaWiki\Parser\Sanitizer; |
| 14 | use MediaWiki\Revision\BadRevisionException; |
| 15 | use MediaWiki\Revision\RevisionRecord; |
| 16 | use MediaWiki\Title\Title; |
| 17 | use Wikimedia\LightweightObjectStore\ExpirationAwareness; |
| 18 | |
/**
 * Extract searchable properties from the MediaWiki ParserOutput
 *
 * The field content is produced by the page's content handler, cached in the
 * main WAN object cache, and then copied onto the Elastica document together
 * with any indexing hints declared by the matching search index fields.
 */
class ParserOutputPageProperties implements PagePropertyBuilder {
	private SearchConfig $config;

	/**
	 * @param SearchConfig $config Configuration, read for CirrusSearchMaxFileTextLength
	 */
	public function __construct( SearchConfig $config ) {
		$this->config = $config;
	}

	/**
	 * {@inheritDoc}
	 */
	public function initialize( Document $doc, WikiPage $page, RevisionRecord $revision, bool $isRedirect ): void {
		// NOOP
	}

	/**
	 * {@inheritDoc}
	 */
	public function finishInitializeBatch(): void {
		// NOOP
	}

	/**
	 * {@inheritDoc}
	 */
	public function finalize( Document $doc, Title $title, RevisionRecord $revision ): void {
		$page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromTitle( $title );
		$this->finalizeReal( $doc, $page, new CirrusSearch(), $revision );
	}

	/**
	 * Visible for testing. Much simpler to test with all objects resolved.
	 *
	 * Fetches (or computes and caches for six hours) the search index data of
	 * the page, truncates the file_text field if configured, and sets every
	 * resulting field on the document.
	 *
	 * @param Document $doc Document to finalize
	 * @param WikiPage $page WikiPage to scope operation to
	 * @param CirrusSearch $engine SearchEngine implementation
	 * @param RevisionRecord $revision The page revision to use
	 * @throws BuildDocumentException When no ParserOutput can be obtained for the revision
	 */
	public function finalizeReal(
		Document $doc,
		WikiPage $page,
		CirrusSearch $engine,
		RevisionRecord $revision
	): void {
		$pageId = $page->getId();
		$revisionId = $revision->getId();

		$wanCache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		$cacheKey = $wanCache->makeKey(
			'CirrusSearchParserOutputPageProperties',
			$pageId,
			$revisionId,
			$page->getTouched(),
			'v2'
		);

		// We are having problems with low hit rates, but haven't been able to
		// track down why that is. Log a sample of keys so we can evaluate if
		// the problem is that $page->getTouched() is changing between
		// invocations. -- eb 2024 july 9
		if ( $pageId % 1000 === 0 ) {
			LoggerFactory::getInstance( 'CirrusSearch' )->debug(
				'Sampling of CirrusSearchParserOutputPageProperties cache keys: {cache_key}',
				[
					'cache_key' => $cacheKey,
					'revision_id' => $revisionId,
					'page_id' => $pageId,
				] );
		}

		$fieldContent = $wanCache->getWithSetCallback(
			$cacheKey,
			ExpirationAwareness::TTL_HOUR * 6,
			// Static: the callback only relies on self:: helpers, never on $this.
			static function () use ( $page, $revision, $engine ) {
				$contentHandler = $page->getContentHandler();
				// TODO: Should see if we can change content handler api to avoid
				// the WikiPage god object, but currently parser cache is still
				// tied to WikiPage as well.
				try {
					$output = $contentHandler->getParserOutputForIndexing( $page, null, $revision );
				} catch ( BadRevisionException ) {
					// The revision is corrupted in the db and has been marked as permanently missing,
					// we can't do much about it so flag that page as broken in CirrusSearch.
					return self::flagProblem( [], 'CirrusSearchBadRevision' );
				}

				if ( !$output ) {
					// @phan-suppress-next-line PhanThrowTypeAbsent
					throw new BuildDocumentException( "ParserOutput cannot be obtained." );
				}

				$fieldContent = $contentHandler->getDataForSearchIndex( $page, $output, $engine, $revision );
				$fieldContent['display_title'] = self::extractDisplayTitle( $page->getTitle(), $output );
				return self::fixAndFlagInvalidUTF8InSource( $fieldContent, $page->getId() );
			}
		);

		// Truncation happens after the cache lookup, so the cached value is
		// the untruncated field content.
		$fieldContent = $this->truncateFileContent( $fieldContent );
		$fieldDefinitions = $engine->getSearchIndexFields();
		foreach ( $fieldContent as $field => $fieldData ) {
			$doc->set( $field, $fieldData );
			// Only fields with a known definition can contribute indexing hints.
			if ( isset( $fieldDefinitions[$field] ) ) {
				$hints = $fieldDefinitions[$field]->getEngineHints( $engine );
				CirrusIndexField::addIndexingHints( $doc, $field, $hints );
			}
		}
	}

	/**
	 * Compute the display title to index, if it is worth indexing.
	 *
	 * Returns null when the parser output has no display title or when the
	 * display title (tags stripped) is the same as the page title, ignoring
	 * case and underscore/space differences.
	 *
	 * @param Title $title
	 * @param ParserOutput $output
	 * @return string|null
	 */
	private static function extractDisplayTitle( Title $title, ParserOutput $output ): ?string {
		$titleText = $title->getText();
		$titlePrefixedText = $title->getPrefixedText();

		$raw = $output->getDisplayTitle();
		if ( $raw === false ) {
			return null;
		}
		$clean = Sanitizer::stripAllTags( $raw );
		// Only index display titles that differ from the normal title
		if ( self::isSameString( $clean, $titleText ) ||
			self::isSameString( $clean, $titlePrefixedText )
		) {
			return null;
		}
		// Nothing to strip for pages in namespace 0 or when there is no
		// candidate namespace prefix at all.
		if ( $title->getNamespace() === 0 || !str_contains( $clean, ':' ) ) {
			return $clean;
		}
		// There is no official way that namespaces work in display title, it
		// is an arbitrary string. Even so some use cases, such as the
		// Translate extension, will translate the namespace as well. Here
		// `Help:foo` will have a display title of `Aide:bar`. If we were to
		// simply index as is the autocomplete and near matcher would see
		// Help:Aide:bar, which doesn't seem particularly useful.
		// The strategy here is to see if the portion before the : is a valid namespace
		// in either the language of the wiki or the language of the page. If it is
		// then we strip it from the display title.
		[ $maybeNs, $maybeDisplayTitle ] = explode( ':', $clean, 2 );
		$cleanTitle = Title::newFromText( $clean );
		if ( $cleanTitle === null ) {
			// The title is invalid, we cannot extract the ns prefix
			return $clean;
		}
		if ( $cleanTitle->getNamespace() === $title->getNamespace() ) {
			// While it doesn't really matter, $cleanTitle->getText() may
			// have had ucfirst() applied depending on settings so we
			// return the unmodified $maybeDisplayTitle.
			return $maybeDisplayTitle;
		}

		$docLang = $title->getPageLanguage();
		$nsIndex = $docLang->getNsIndex( $maybeNs );
		if ( $nsIndex !== $title->getNamespace() ) {
			// The prefix does not resolve, in the page language, to the
			// namespace of the actual page.
			// Keep the namespace in the display title.
			return $clean;
		}

		return self::isSameString( $maybeDisplayTitle, $titleText )
			? null
			: $maybeDisplayTitle;
	}

	/**
	 * Compare two strings, ignoring case and treating underscores as spaces.
	 *
	 * @param string $a
	 * @param string $b
	 * @return bool
	 */
	private static function isSameString( string $a, string $b ): bool {
		$a = mb_strtolower( strtr( $a, '_', ' ' ) );
		$b = mb_strtolower( strtr( $b, '_', ' ' ) );
		return $a === $b;
	}

	/**
	 * Find invalid UTF-8 sequence in the source text.
	 * Fix them and flag the doc with the CirrusSearchInvalidUTF8 template.
	 *
	 * Temporary solution to help investigate/fix T225200
	 *
	 * Visible for testing only
	 * @param array $fields Field content; only the 'source_text' entry is inspected
	 * @param int $pageId Page id, used only in the warning log message
	 * @return array The fields, with 'source_text' fixed and a template flag
	 *  appended when the text had to be changed; otherwise unchanged
	 */
	public static function fixAndFlagInvalidUTF8InSource( array $fields, int $pageId ): array {
		if ( isset( $fields['source_text'] ) ) {
			// Converting UTF-8 to UTF-8 yields a different string only when
			// the conversion had to alter the input.
			$fixedVersion = mb_convert_encoding( $fields['source_text'], 'UTF-8', 'UTF-8' );
			if ( $fixedVersion !== $fields['source_text'] ) {
				LoggerFactory::getInstance( 'CirrusSearch' )
					->warning( 'Fixing invalid UTF-8 sequences in source text for page id {page_id}',
						[ 'page_id' => $pageId ] );
				$fields['source_text'] = $fixedVersion;
				$fields = self::flagProblem( $fields, 'CirrusSearchInvalidUTF8' );
			}
		}
		return $fields;
	}

	/**
	 * Append the prefixed title of a template to the 'template' field so the
	 * problem is recorded in the indexed fields.
	 *
	 * @param array $fields Field content to flag
	 * @param string $template Template name, without namespace prefix
	 * @return array The fields with the template title appended to 'template'
	 */
	private static function flagProblem( array $fields, string $template ): array {
		$fields['template'][] = Title::makeTitle( NS_TEMPLATE, $template )->getPrefixedText();
		return $fields;
	}

	/**
	 * Visible for testing only
	 *
	 * Truncate the 'file_text' field when its byte length (strlen) exceeds
	 * $maxLen. The cut is done with mb_strcut.
	 *
	 * @param int $maxLen Maximum length; a negative value disables truncation
	 * @param array $fieldContent
	 * @return array
	 */
	public static function truncateFileTextContent( int $maxLen, array $fieldContent ): array {
		if ( $maxLen >= 0 && isset( $fieldContent['file_text'] ) && strlen( $fieldContent['file_text'] ) > $maxLen ) {
			$fieldContent['file_text'] = mb_strcut( $fieldContent['file_text'], 0, $maxLen );
		}

		return $fieldContent;
	}

	/**
	 * Apply the configured CirrusSearchMaxFileTextLength limit to 'file_text'.
	 *
	 * NOTE(review): `?:` maps every falsy configured value (including 0) to
	 * -1, i.e. no truncation, although truncateFileTextContent() itself would
	 * accept 0 — confirm this is the intended meaning of 0.
	 *
	 * @param array $fieldContent
	 * @return array
	 */
	private function truncateFileContent( array $fieldContent ): array {
		return self::truncateFileTextContent( $this->config->get( 'CirrusSearchMaxFileTextLength' ) ?: -1, $fieldContent );
	}
}