Code Coverage for /workspace/src/extensions/CirrusSearch/includes/BuildDocument/ParserOutputPageProperties.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	83.95% covered (warning)	83.95%	68 / 81	60.00% covered (warning)	60.00%	6 / 10	CRAP	0.00% covered (danger)	0.00%	0 / 1
ParserOutputPageProperties	83.95% covered (warning)	83.95%	68 / 81	60.00% covered (warning)	60.00%	6 / 10	32.48	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
initialize	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
finishInitializeBatch	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
finalize	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2
finalizeReal	75.00% covered (warning)	75.00%	27 / 36	0.00% covered (danger)	0.00%	0 / 1	5.39
extractDisplayTitle	100.00% covered (success)	100.00%	24 / 24	100.00% covered (success)	100.00%	1 / 1	10
isSameString	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
fixAndFlagInvalidUTF8InSource	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	3
truncateFileTextContent	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	4
truncateFileContent	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2

1	<?php
2
3	namespace CirrusSearch\BuildDocument;
4
5	use CirrusSearch\CirrusSearch;
6	use CirrusSearch\Search\CirrusIndexField;
7	use CirrusSearch\SearchConfig;
8	use Elastica\Document;
9	use MediaWiki\Logger\LoggerFactory;
10	use MediaWiki\MediaWikiServices;
11	use MediaWiki\Parser\ParserOutput;
12	use MediaWiki\Parser\Sanitizer;
13	use MediaWiki\Revision\RevisionRecord;
14	use MediaWiki\Title\Title;
15	use Wikimedia\LightweightObjectStore\ExpirationAwareness;
16	use WikiPage;
17
/**
 * Extract searchable properties from the MediaWiki ParserOutput.
 *
 * The work happens in finalize(): the content handler's search index data
 * for a page revision is fetched (through the main WAN object cache),
 * complemented with a `display_title` field, sanitized, truncated and then
 * copied onto the Elastica document.
 */
class ParserOutputPageProperties implements PagePropertyBuilder {
	/** @var SearchConfig */
	private $config;

	/**
	 * @param SearchConfig $config Configuration read when truncating file text
	 */
	public function __construct( SearchConfig $config ) {
		$this->config = $config;
	}

	/**
	 * {@inheritDoc}
	 */
	public function initialize( Document $doc, WikiPage $page, RevisionRecord $revision ): void {
		// NOOP
	}

	/**
	 * {@inheritDoc}
	 */
	public function finishInitializeBatch(): void {
		// NOOP
	}

	/**
	 * {@inheritDoc}
	 */
	public function finalize( Document $doc, Title $title, RevisionRecord $revision ): void {
		// Resolve the WikiPage and search engine here so that finalizeReal()
		// can be exercised with explicitly provided objects.
		$page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromTitle( $title );
		$this->finalizeReal( $doc, $page, new CirrusSearch(), $revision );
	}

	/**
	 * Visible for testing. Much simpler to test with all objects resolved.
	 *
	 * Populates $doc with every field of the (cached) search index data and
	 * attaches the indexing hints of the fields the engine has a definition for.
	 *
	 * @param Document $doc Document to finalize
	 * @param WikiPage $page WikiPage to scope operation to
	 * @param CirrusSearch $engine SearchEngine implementation
	 * @param RevisionRecord $revision The page revision to use
	 * @throws BuildDocumentException When no ParserOutput can be obtained
	 */
	public function finalizeReal(
		Document $doc,
		WikiPage $page,
		CirrusSearch $engine,
		RevisionRecord $revision
	): void {
		$wanCache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		// The key is scoped to page id, revision id and page touched timestamp;
		// the trailing version string is part of the key as well.
		$cacheKey = $wanCache->makeKey(
			'CirrusSearchParserOutputPageProperties',
			$page->getId(),
			$revision->getId(),
			$page->getTouched(),
			'v2'
		);

		// We are having problems with low hit rates, but haven't been able to
		// track down why that is. Log a sample of keys so we can evaluate if
		// the problem is that $page->getTouched() is changing between
		// invocations. -- eb 2024 july 9
		if ( $page->getId() % 1000 === 0 ) {
			LoggerFactory::getInstance( 'CirrusSearch' )->debug(
				'Sampling of CirrusSearchParserOutputPageProperties cache keys: {cache_key}',
				[
					'cache_key' => $cacheKey,
					'revision_id' => $revision->getId(),
					'page_id' => $page->getId(),
				] );
		}

		$fieldContent = $wanCache->getWithSetCallback(
			$cacheKey,
			ExpirationAwareness::TTL_HOUR * 6,
			// The callback only relies on static helpers, no need to bind $this.
			static function () use ( $page, $revision, $engine ) {
				$contentHandler = $page->getContentHandler();
				// TODO: Should see if we can change content handler api to avoid
				// the WikiPage god object, but currently parser cache is still
				// tied to WikiPage as well.
				$output = $contentHandler->getParserOutputForIndexing( $page, null, $revision );

				if ( !$output ) {
					throw new BuildDocumentException( "ParserOutput cannot be obtained." );
				}

				$fieldContent = $contentHandler->getDataForSearchIndex( $page, $output, $engine, $revision );
				$fieldContent['display_title'] = self::extractDisplayTitle( $page->getTitle(), $output );
				return self::fixAndFlagInvalidUTF8InSource( $fieldContent, $page->getId() );
			}
		);
		// Truncation is applied on the way out of the cache, i.e. the value
		// produced by the callback above is the untruncated one.
		$fieldContent = $this->truncateFileContent( $fieldContent );
		$fieldDefinitions = $engine->getSearchIndexFields();
		foreach ( $fieldContent as $field => $fieldData ) {
			$doc->set( $field, $fieldData );
			// Fields without a definition are still indexed, just without hints.
			if ( isset( $fieldDefinitions[$field] ) ) {
				$hints = $fieldDefinitions[$field]->getEngineHints( $engine );
				CirrusIndexField::addIndexingHints( $doc, $field, $hints );
			}
		}
	}

	/**
	 * Compute the display title to index for a page.
	 *
	 * Returns null when the parser output has no display title or when the
	 * display title (tags stripped) matches the page title, with or without
	 * its namespace prefix, as decided by isSameString().
	 *
	 * @param Title $title
	 * @param ParserOutput $output
	 * @return string|null
	 */
	private static function extractDisplayTitle( Title $title, ParserOutput $output ): ?string {
		$titleText = $title->getText();
		$titlePrefixedText = $title->getPrefixedText();

		$raw = $output->getDisplayTitle();
		if ( $raw === false ) {
			return null;
		}
		$clean = Sanitizer::stripAllTags( $raw );
		// Only index display titles that differ from the normal title
		if ( self::isSameString( $clean, $titleText ) ||
			self::isSameString( $clean, $titlePrefixedText )
		) {
			return null;
		}
		// Nothing to strip for main namespace pages or when there is no
		// candidate namespace prefix at all.
		if ( $title->getNamespace() === NS_MAIN || strpos( $clean, ':' ) === false ) {
			return $clean;
		}
		// There is no official way that namespaces work in display title, it
		// is an arbitrary string. Even so some use cases, such as the
		// Translate extension, will translate the namespace as well. Here
		// `Help:foo` will have a display title of `Aide:bar`. If we were to
		// simply index as is the autocomplete and near matcher would see
		// Help:Aide:bar, which doesn't seem particularly useful.
		// The strategy here is to see if the portion before the : is a valid namespace
		// in either the language of the wiki or the language of the page. If it is
		// then we strip it from the display title.
		[ $maybeNs, $maybeDisplayTitle ] = explode( ':', $clean, 2 );
		$cleanTitle = Title::newFromText( $clean );
		if ( $cleanTitle === null ) {
			// The title is invalid, we cannot extract the ns prefix
			return $clean;
		}
		if ( $cleanTitle->getNamespace() === $title->getNamespace() ) {
			// While it doesn't really matter, $cleanTitle->getText() may
			// have had ucfirst() applied depending on settings so we
			// return the unmodified $maybeDisplayTitle.
			return $maybeDisplayTitle;
		}

		$docLang = $title->getPageLanguage();
		$nsIndex = $docLang->getNsIndex( $maybeNs );
		if ( $nsIndex !== $title->getNamespace() ) {
			// The prefix does not resolve to the namespace of the actual page
			// in the page language. Keep it in the display title.
			return $clean;
		}

		// The prefix was the page namespace in the page language; what remains
		// is only worth indexing if it differs from the plain title text.
		return self::isSameString( $maybeDisplayTitle, $titleText )
			? null
			: $maybeDisplayTitle;
	}

	/**
	 * Compare two strings ignoring case and treating `_` and space as equal.
	 *
	 * @param string $a
	 * @param string $b
	 * @return bool
	 */
	private static function isSameString( string $a, string $b ): bool {
		$a = mb_strtolower( strtr( $a, '_', ' ' ) );
		$b = mb_strtolower( strtr( $b, '_', ' ' ) );
		return $a === $b;
	}

	/**
	 * Find invalid UTF-8 sequence in the source text.
	 * Fix them and flag the doc with the CirrusSearchInvalidUTF8 template.
	 *
	 * Temporary solution to help investigate/fix T225200
	 *
	 * Visible for testing only
	 * @param array $fieldDefinitions Field content keyed by field name. Despite
	 *  its name this is the data to index (see the call in finalizeReal()), not
	 *  the field definitions; the name is kept for compatibility with callers.
	 * @param int $pageId Page id, only used in the log message
	 * @return array The field content, with `source_text` fixed and `template`
	 *  extended when invalid sequences were found
	 */
	public static function fixAndFlagInvalidUTF8InSource( array $fieldDefinitions, int $pageId ): array {
		if ( isset( $fieldDefinitions['source_text'] ) ) {
			// Converting UTF-8 to UTF-8 leaves a valid string untouched, any
			// difference means something had to be repaired.
			$fixedVersion = mb_convert_encoding( $fieldDefinitions['source_text'], 'UTF-8', 'UTF-8' );
			if ( $fixedVersion !== $fieldDefinitions['source_text'] ) {
				LoggerFactory::getInstance( 'CirrusSearch' )
					->warning( 'Fixing invalid UTF-8 sequences in source text for page id {page_id}',
						[ 'page_id' => $pageId ] );
				$fieldDefinitions['source_text'] = $fixedVersion;
				$fieldDefinitions['template'][] = Title::makeTitle( NS_TEMPLATE, 'CirrusSearchInvalidUTF8' )->getPrefixedText();
			}
		}
		return $fieldDefinitions;
	}

	/**
	 * Limit the `file_text` field to at most $maxLen bytes.
	 *
	 * Visible for testing only
	 * @param int $maxLen Maximum length in bytes, a negative value disables
	 *  the truncation
	 * @param array $fieldContent Field content keyed by field name
	 * @return array The field content with `file_text` truncated if needed
	 */
	public static function truncateFileTextContent( int $maxLen, array $fieldContent ): array {
		// strlen() and mb_strcut() both work on bytes; mb_strcut() additionally
		// avoids cutting in the middle of a multibyte character.
		if ( $maxLen >= 0 && isset( $fieldContent['file_text'] ) && strlen( $fieldContent['file_text'] ) > $maxLen ) {
			$fieldContent['file_text'] = mb_strcut( $fieldContent['file_text'], 0, $maxLen );
		}

		return $fieldContent;
	}

	/**
	 * Truncate `file_text` according to the CirrusSearchMaxFileTextLength setting.
	 *
	 * @param array $fieldContent
	 * @return array
	 */
	private function truncateFileContent( array $fieldContent ): array {
		// NOTE(review): `?:` turns any falsy setting (null but also 0) into -1,
		// i.e. no truncation, whereas truncateFileTextContent() would treat 0
		// as "empty the field". Confirm this is the intended meaning of 0.
		return self::truncateFileTextContent( $this->config->get( 'CirrusSearchMaxFileTextLength' ) ?: -1, $fieldContent );
	}
}