Code Coverage for /workspace/src/includes/deferred/SearchUpdate.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	42.65% covered (danger)	42.65%	29 / 68	40.00% covered (danger)	40.00%	2 / 5	CRAP	0.00% covered (danger)	0.00%	0 / 1
SearchUpdate	43.28% covered (danger)	43.28%	29 / 67	40.00% covered (danger)	40.00%	2 / 5	56.05	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
doUpdate	0.00% covered (danger)	0.00%	0 / 21	0.00% covered (danger)	0.00%	0 / 1	72
updateText	100.00% covered (success)	100.00%	26 / 26	100.00% covered (success)	100.00%	1 / 1	2
getLatestPage	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
getNormalizedTitle	0.00% covered (danger)	0.00%	0 / 13	0.00% covered (danger)	0.00%	0 / 1	6

1	<?php
2	/**
3	* Search index updater
4	*
5	* See deferred.txt
6	*
7	* This program is free software; you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation; either version 2 of the License, or
10	* (at your option) any later version.
11	*
12	* This program is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License along
18	* with this program; if not, write to the Free Software Foundation, Inc.,
19	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20	* http://www.gnu.org/copyleft/gpl.html
21	*
22	* @file
23	* @ingroup Search
24	*/
25
26	namespace MediaWiki\Deferred;
27
28	use Content;
29	use IDBAccessObject;
30	use MediaWiki\Logger\LoggerFactory;
31	use MediaWiki\MainConfigNames;
32	use MediaWiki\MediaWikiServices;
33	use MediaWiki\Page\ExistingPageRecord;
34	use MediaWiki\Page\PageIdentity;
35	use SearchEngine;
36
37	/**
38	* Database independent search index updater
39	*
40	* @ingroup Search
41	*/
class SearchUpdate implements DeferrableUpdate {
	/** @var int Page id being updated */
	private $id = 0;

	/** @var PageIdentity The page we're updating */
	private $page;

	/** @var Content|null Content of the page (not text) */
	private $content;

	/** @var ExistingPageRecord|null Cached result of the READ_LATEST page lookup */
	private $latestPage = null;

	/**
	 * @var bool Whether the lookup behind $latestPage has already run. Needed
	 *  because null is a valid cached result (page deleted or not found).
	 */
	private $latestPageLoaded = false;

	/**
	 * @param int $id Page id to update
	 * @param PageIdentity $page Page to update
	 * @param Content|null $c Content of the page to update.
	 */
	public function __construct( $id, $page, ?Content $c = null ) {
		$this->page = $page;
		$this->id = $id;
		$this->content = $c;
	}

	/**
	 * Perform actual update for the entry
	 *
	 * Does nothing when search updates are disabled by configuration or when
	 * no page id was given. Otherwise, for every configured search type whose
	 * engine supports 'search-update':
	 *  - the index entry is deleted if the page no longer exists,
	 *  - only the title is updated if no content was supplied,
	 *  - otherwise title and cleaned text are both written to the index.
	 */
	public function doUpdate() {
		$services = MediaWikiServices::getInstance();
		$searchEngineConfig = $services->getSearchEngineConfig();

		if ( $services->getMainConfig()->get( MainConfigNames::DisableSearchUpdate ) ) {
			LoggerFactory::getInstance( "search" )
				->debug( "Skipping update: search updates disabled by config" );
			return;
		}

		if ( !$this->id ) {
			# Kept separate from the config check so the log states the real reason
			LoggerFactory::getInstance( "search" )
				->debug( "Skipping update: no page id given" );
			return;
		}

		$seFactory = $services->getSearchEngineFactory();
		foreach ( $searchEngineConfig->getSearchTypes() as $type ) {
			$search = $seFactory->create( $type );
			if ( !$search->supports( 'search-update' ) ) {
				continue;
			}

			$normalTitle = $this->getNormalizedTitle( $search );

			if ( $this->getLatestPage() === null ) {
				# Page was deleted or never existed: drop it from the index
				$search->delete( $this->id, $normalTitle );
				continue;
			}

			if ( $this->content === null ) {
				# No content supplied: only the title can be refreshed
				$search->updateTitle( $this->id, $normalTitle );
				continue;
			}

			$text = $this->updateText( $this->content->getTextForSearchIndex(), $search );

			# Perform the actual update
			$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
		}
	}

	/**
	 * Clean text for indexing. Only really suitable for indexing in databases.
	 * If you're using a real search engine, you'll probably want to override
	 * this behavior and do something nicer with the original wikitext.
	 *
	 * The text is normalized and lowercased by the content language, then
	 * HTML-like tags and external URLs are stripped, heading text is repeated
	 * for emphasis, and every character outside the engine's legal search
	 * characters (plus '&#;') is replaced by a space.
	 *
	 * @param string $text
	 * @param SearchEngine|null $se Search engine; a new one is obtained from
	 *  the service container when omitted
	 * @return string
	 */
	public function updateText( $text, ?SearchEngine $se = null ) {
		$services = MediaWikiServices::getInstance();
		$contLang = $services->getContentLanguage();
		# Language-specific strip/conversion
		$text = $contLang->normalizeForSearch( $text );
		$se ??= $services->newSearchEngine();
		$lc = $se->legalSearchChars() . '&#;';

		# Strip HTML markup
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $contLang->lc( " " . $text . " " ) );
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		# Bracketed external links: drop bare ones, keep only the label of labelled ones
		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		# Internal image links
		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $pat2, " \\1 \\3", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		/**
		 * Handle 's, s'
		 *
		 *   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		 *   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		 *
		 * These tail-anchored regexps are very slow. The worst case comes
		 * when Japanese or Chinese text (ie, no word spacing) is written on
		 * a wiki configured for Western UTF-8 mode. The Unicode characters are
		 * expanded to hex codes and the "words" are very long paragraph-length
		 * monstrosities. On a large page the above regexps may take over 20
		 * seconds each on a 1GHz-level processor.
		 *
		 * Following are reversed versions which are consistently fast
		 * (about 3 milliseconds on 1GHz-level processor).
		 */
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		return $text;
	}

	/**
	 * Get ExistingPageRecord for the SearchUpdate $id using IDBAccessObject::READ_LATEST
	 * and ensure using the same ExistingPageRecord object if there are multiple
	 * SearchEngine types.
	 *
	 * The lookup runs at most once per instance; a null result is cached too,
	 * so every search type sees the same answer.
	 *
	 * Returns null if a page has been deleted or is not found.
	 *
	 * @return ExistingPageRecord|null
	 */
	private function getLatestPage(): ?ExistingPageRecord {
		if ( !$this->latestPageLoaded ) {
			$this->latestPage = MediaWikiServices::getInstance()->getPageStore()
				->getPageById( $this->id, IDBAccessObject::READ_LATEST );
			$this->latestPageLoaded = true;
		}

		return $this->latestPage;
	}

	/**
	 * Get a normalized string representation of a title suitable for
	 * including in a search index
	 *
	 * @param SearchEngine $search
	 * @return string A stripped-down title string ready for the search index
	 */
	private function getNormalizedTitle( SearchEngine $search ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$ns = $this->page->getNamespace();
		$title = str_replace( '_', ' ', $this->page->getDBkey() );

		$lc = $search->legalSearchChars() . '&#;';
		$t = $contLang->normalizeForSearch( $title );
		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
		$t = $contLang->lc( $t );

		# Handle 's, s'
		$t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
		$t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

		$t = preg_replace( "/\\s+/", ' ', $t );

		if ( $ns === NS_FILE ) {
			# Drop a trailing media-type word so the extension is not indexed
			$t = preg_replace( "/ (png|gif|jpg|jpeg|ogg)$/", "", $t );
		}

		return $search->normalizeText( trim( $t ) );
	}
}
220
/**
 * Registers the un-namespaced name 'SearchUpdate' as an alias of
 * MediaWiki\Deferred\SearchUpdate.
 *
 * @deprecated class alias since 1.42
 */
\class_alias( SearchUpdate::class, 'SearchUpdate' );