Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
83.95% |
68 / 81 |
|
60.00% |
6 / 10 |
CRAP | |
0.00% |
0 / 1 |
ParserOutputPageProperties | |
83.95% |
68 / 81 |
|
60.00% |
6 / 10 |
32.48 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
initialize | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
finishInitializeBatch | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
finalize | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
finalizeReal | |
75.00% |
27 / 36 |
|
0.00% |
0 / 1 |
5.39 | |||
extractDisplayTitle | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
10 | |||
isSameString | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
fixAndFlagInvalidUTF8InSource | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
truncateFileTextContent | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
4 | |||
truncateFileContent | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\BuildDocument; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use CirrusSearch\Search\CirrusIndexField; |
7 | use CirrusSearch\SearchConfig; |
8 | use Elastica\Document; |
9 | use MediaWiki\Logger\LoggerFactory; |
10 | use MediaWiki\MediaWikiServices; |
11 | use MediaWiki\Parser\ParserOutput; |
12 | use MediaWiki\Parser\Sanitizer; |
13 | use MediaWiki\Revision\RevisionRecord; |
14 | use MediaWiki\Title\Title; |
15 | use Wikimedia\LightweightObjectStore\ExpirationAwareness; |
16 | use WikiPage; |
17 | |
18 | /** |
19 | * Extract searchable properties from the MediaWiki ParserOutput |
20 | */ |
21 | class ParserOutputPageProperties implements PagePropertyBuilder { |
22 | /** @var SearchConfig */ |
23 | private $config; |
24 | |
/**
 * Keep a reference to the search configuration for later lookups.
 *
 * @param SearchConfig $config Configuration stored on this builder
 */
public function __construct( SearchConfig $config ) {
	$this->config = $config;
}
31 | |
/**
 * This builder performs no work at initialization time.
 *
 * {@inheritDoc}
 */
public function initialize( Document $doc, WikiPage $page, RevisionRecord $revision ): void {
	// Intentionally empty: nothing is written to $doc here.
}
38 | |
/**
 * This builder has no batched initialization work to complete.
 *
 * {@inheritDoc}
 */
public function finishInitializeBatch(): void {
	// Intentionally empty.
}
45 | |
/**
 * Resolve the WikiPage for $title through the WikiPageFactory service and
 * hand off to finalizeReal() with a newly constructed CirrusSearch engine.
 *
 * {@inheritDoc}
 */
public function finalize( Document $doc, Title $title, RevisionRecord $revision ): void {
	$wikiPageFactory = MediaWikiServices::getInstance()->getWikiPageFactory();
	$this->finalizeReal(
		$doc,
		$wikiPageFactory->newFromTitle( $title ),
		new CirrusSearch(),
		$revision
	);
}
53 | |
/**
 * Copy the search index fields derived from the page's ParserOutput onto
 * the document, attaching indexing hints for fields the engine defines.
 *
 * The computed field data is stored in the main WAN object cache for six
 * hours under a key built from the page id, the revision id, the page's
 * touched value and a 'v2' marker.
 *
 * Visible for testing. Much simpler to test with all objects resolved.
 *
 * @param Document $doc Document to finalize
 * @param WikiPage $page WikiPage to scope operation to
 * @param CirrusSearch $engine SearchEngine implementation
 * @param RevisionRecord $revision The page revision to use
 * @throws BuildDocumentException When no ParserOutput can be obtained
 */
public function finalizeReal(
	Document $doc,
	WikiPage $page,
	CirrusSearch $engine,
	RevisionRecord $revision
): void {
	$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
	$pageId = $page->getId();
	$revisionId = $revision->getId();
	$key = $cache->makeKey(
		'CirrusSearchParserOutputPageProperties',
		$pageId,
		$revisionId,
		$page->getTouched(),
		'v2'
	);

	// Hit rates on this cache are low and the cause has not been found
	// yet. Log the key for a sample of pages (ids divisible by 1000) so
	// we can check whether $page->getTouched() changes between
	// invocations. -- eb 2024 july 9
	if ( $pageId % 1000 === 0 ) {
		LoggerFactory::getInstance( 'CirrusSearch' )->debug(
			'Sampling of CirrusSearchParserOutputPageProperties cache keys: {cache_key}',
			[
				'cache_key' => $key,
				'revision_id' => $revisionId,
				'page_id' => $pageId,
			]
		);
	}

	// Invoked by the cache only when it has to (re)generate the value.
	$buildFields = function () use ( $page, $revision, $engine ) {
		$handler = $page->getContentHandler();
		// TODO: Should see if we can change content handler api to avoid
		// the WikiPage god object, but currently parser cache is still
		// tied to WikiPage as well.
		$parserOutput = $handler->getParserOutputForIndexing( $page, null, $revision );
		if ( !$parserOutput ) {
			throw new BuildDocumentException( "ParserOutput cannot be obtained." );
		}

		$fields = $handler->getDataForSearchIndex( $page, $parserOutput, $engine, $revision );
		$fields['display_title'] = self::extractDisplayTitle( $page->getTitle(), $parserOutput );
		return self::fixAndFlagInvalidUTF8InSource( $fields, $page->getId() );
	};

	$fields = $this->truncateFileContent(
		$cache->getWithSetCallback( $key, ExpirationAwareness::TTL_HOUR * 6, $buildFields )
	);

	$definitions = $engine->getSearchIndexFields();
	foreach ( $fields as $name => $value ) {
		$doc->set( $name, $value );
		if ( !isset( $definitions[$name] ) ) {
			// No definition for this field, so there are no hints to add.
			continue;
		}
		CirrusIndexField::addIndexingHints(
			$doc,
			$name,
			$definitions[$name]->getEngineHints( $engine )
		);
	}
}
121 | |
/**
 * Work out the display title worth indexing for a page, if any.
 *
 * @param Title $title Title of the page being indexed
 * @param ParserOutput $output Parser output holding the display title
 * @return string|null The tag-stripped display title, possibly without its
 *  namespace prefix, or null when none is set or it matches the page title
 */
private static function extractDisplayTitle( Title $title, ParserOutput $output ): ?string {
	$plainTitle = $title->getText();
	$prefixedTitle = $title->getPrefixedText();

	$displayTitle = $output->getDisplayTitle();
	if ( $displayTitle === false ) {
		return null;
	}

	$stripped = Sanitizer::stripAllTags( $displayTitle );
	// A display title equal to the regular title adds nothing to the index.
	if ( self::isSameString( $stripped, $plainTitle ) ) {
		return null;
	}
	if ( self::isSameString( $stripped, $prefixedTitle ) ) {
		return null;
	}

	$pageNs = $title->getNamespace();
	if ( $pageNs === 0 || strpos( $stripped, ':' ) === false ) {
		return $stripped;
	}

	// Display titles are arbitrary strings with no official namespace
	// handling. Still, some use cases such as the Translate extension
	// translate the namespace too: `Help:foo` may get a display title of
	// `Aide:bar`. Indexed as is, the autocomplete and near matcher would
	// see Help:Aide:bar, which doesn't seem particularly useful.
	// So check whether the part before the `:` is a valid namespace in
	// either the language of the wiki or the language of the page, and
	// strip it from the display title if it is.
	$pieces = explode( ':', $stripped, 2 );
	$prefix = $pieces[0];
	$remainder = $pieces[1];

	$parsed = Title::newFromText( $stripped );
	if ( $parsed === null ) {
		// Not a valid title, so the ns prefix cannot be extracted.
		return $stripped;
	}
	if ( $parsed->getNamespace() == $pageNs ) {
		// $parsed->getText() may have had ucfirst() applied depending on
		// settings; return the untouched remainder instead, even though
		// it doesn't really matter.
		return $remainder;
	}

	$resolvedNs = $title->getPageLanguage()->getNsIndex( $prefix );
	if ( $resolvedNs !== $pageNs ) {
		// The prefix does not resolve to the page's own namespace in the
		// page language: keep it as part of the display title.
		return $stripped;
	}

	if ( self::isSameString( $remainder, $plainTitle ) ) {
		return null;
	}
	return $remainder;
}
179 | |
/**
 * Compare two strings, ignoring case and treating underscores as spaces.
 *
 * @param string $a
 * @param string $b
 * @return bool True when both strings are identical after normalization
 */
private static function isSameString( string $a, string $b ): bool {
	$normalize = static function ( string $text ): string {
		return mb_strtolower( str_replace( '_', ' ', $text ) );
	};

	return $normalize( $a ) === $normalize( $b );
}
185 | |
/**
 * Detect invalid UTF-8 sequences in the 'source_text' entry. When the
 * re-encoded text differs from the original, log a warning, store the
 * re-encoded text and append the CirrusSearchInvalidUTF8 template title
 * to the 'template' entry.
 *
 * Temporary solution to help investigate/fix T225200
 *
 * Visible for testing only
 * @param array $fieldDefinitions Field content keyed by field name (despite
 *  the parameter name); returned unchanged when 'source_text' is not set
 * @param int $pageId Page id, used only in the log message context
 * @return array The field content, fixed and flagged if it was necessary
 */
public static function fixAndFlagInvalidUTF8InSource( array $fieldDefinitions, int $pageId ): array {
	if ( !isset( $fieldDefinitions['source_text'] ) ) {
		return $fieldDefinitions;
	}

	$sourceText = $fieldDefinitions['source_text'];
	// Converting from UTF-8 to UTF-8: any difference in the result means
	// the input contained something that had to be fixed.
	$reencoded = mb_convert_encoding( $sourceText, 'UTF-8', 'UTF-8' );
	if ( $reencoded === $sourceText ) {
		return $fieldDefinitions;
	}

	LoggerFactory::getInstance( 'CirrusSearch' )->warning(
		'Fixing invalid UTF-8 sequences in source text for page id {page_id}',
		[ 'page_id' => $pageId ]
	);
	$fieldDefinitions['source_text'] = $reencoded;
	$fieldDefinitions['template'][] =
		Title::makeTitle( NS_TEMPLATE, 'CirrusSearchInvalidUTF8' )->getPrefixedText();

	return $fieldDefinitions;
}
210 | |
/**
 * Cut the 'file_text' entry down to at most $maxLen bytes.
 *
 * The length check uses strlen() and the cut uses mb_strcut(), so the
 * limit is expressed in bytes. A negative $maxLen disables truncation.
 *
 * Visible for testing only
 * @param int $maxLen Maximum length of 'file_text'; negative for no limit
 * @param array $fieldContent Field content keyed by field name
 * @return array The field content with 'file_text' shortened when needed
 */
public static function truncateFileTextContent( int $maxLen, array $fieldContent ): array {
	if ( $maxLen < 0 || !isset( $fieldContent['file_text'] ) ) {
		return $fieldContent;
	}

	$fileText = $fieldContent['file_text'];
	if ( strlen( $fileText ) > $maxLen ) {
		$fieldContent['file_text'] = mb_strcut( $fileText, 0, $maxLen );
	}

	return $fieldContent;
}
224 | |
/**
 * Apply the configured CirrusSearchMaxFileTextLength limit to the
 * 'file_text' entry. A falsy configured value is passed on as -1.
 *
 * @param array $fieldContent Field content keyed by field name
 * @return array The field content after truncateFileTextContent()
 */
private function truncateFileContent( array $fieldContent ): array {
	$limit = $this->config->get( 'CirrusSearchMaxFileTextLength' );
	if ( !$limit ) {
		$limit = -1;
	}

	return self::truncateFileTextContent( $limit, $fieldContent );
}
232 | } |