Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
42.65% |
29 / 68 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
SearchUpdate | |
43.28% |
29 / 67 |
|
40.00% |
2 / 5 |
56.05 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
doUpdate | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
72 | |||
updateText | |
100.00% |
26 / 26 |
|
100.00% |
1 / 1 |
2 | |||
getLatestPage | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
getNormalizedTitle | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | /** |
3 | * Search index updater |
4 | * |
5 | * See deferred.txt |
6 | * |
7 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by |
9 | * the Free Software Foundation; either version 2 of the License, or |
10 | * (at your option) any later version. |
11 | * |
12 | * This program is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | * GNU General Public License for more details. |
16 | * |
17 | * You should have received a copy of the GNU General Public License along |
18 | * with this program; if not, write to the Free Software Foundation, Inc., |
19 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
20 | * http://www.gnu.org/copyleft/gpl.html |
21 | * |
22 | * @file |
23 | * @ingroup Search |
24 | */ |
25 | |
26 | namespace MediaWiki\Deferred; |
27 | |
28 | use Content; |
29 | use IDBAccessObject; |
30 | use MediaWiki\Logger\LoggerFactory; |
31 | use MediaWiki\MainConfigNames; |
32 | use MediaWiki\MediaWikiServices; |
33 | use MediaWiki\Page\ExistingPageRecord; |
34 | use MediaWiki\Page\PageIdentity; |
35 | use SearchEngine; |
36 | |
/**
 * Database independent search index updater
 *
 * Deferrable update that pushes a single page's normalized title and, when
 * content was supplied, its cleaned-up text to every configured search engine
 * type that supports 'search-update'. If the page can no longer be found, its
 * entry is deleted from the index instead.
 *
 * @ingroup Search
 */
class SearchUpdate implements DeferrableUpdate {
	/** @var int Page id being updated */
	private $id = 0;

	/** @var PageIdentity The page we're updating */
	private $page;

	/** @var Content|null Content of the page (not text) */
	private $content;

	/** @var ExistingPageRecord|null Memoized result of the READ_LATEST page lookup */
	private $latestPage = null;

	/**
	 * @var bool Whether $latestPage has been looked up yet. Tracked separately
	 *  because null is a legitimate lookup result (page deleted / not found).
	 */
	private $latestPageLoaded = false;

	/**
	 * @param int $id Page id to update
	 * @param PageIdentity $page Page to update
	 * @param Content|null $c Content of the page to update. When null, only
	 *  the title is refreshed in the index.
	 */
	public function __construct( $id, $page, ?Content $c = null ) {
		$this->page = $page;
		$this->id = $id;
		$this->content = $c;
	}

	/**
	 * Perform actual update for the entry
	 *
	 * Does nothing (apart from a debug log line) when search updates are
	 * disabled by configuration or when no page id was given. Otherwise, for
	 * each search type supporting 'search-update':
	 *  - the index entry is deleted if the page no longer exists,
	 *  - only the title is updated if no content was supplied,
	 *  - title and text are updated otherwise.
	 */
	public function doUpdate() {
		$services = MediaWikiServices::getInstance();

		if ( $services->getMainConfig()->get( MainConfigNames::DisableSearchUpdate ) ) {
			LoggerFactory::getInstance( "search" )
				->debug( "Skipping update: search updates disabled by config" );
			return;
		}

		// Log the missing-id case on its own so it is not misreported as a
		// configuration setting.
		if ( !$this->id ) {
			LoggerFactory::getInstance( "search" )
				->debug( "Skipping update: no page id given" );
			return;
		}

		$seFactory = $services->getSearchEngineFactory();
		foreach ( $services->getSearchEngineConfig()->getSearchTypes() as $type ) {
			$search = $seFactory->create( $type );
			if ( !$search->supports( 'search-update' ) ) {
				continue;
			}

			$normalTitle = $this->getNormalizedTitle( $search );

			if ( $this->getLatestPage() === null ) {
				// Page is gone: drop it from this engine's index.
				$search->delete( $this->id, $normalTitle );
				continue;
			}

			if ( $this->content === null ) {
				// No content supplied: only the title can be refreshed.
				$search->updateTitle( $this->id, $normalTitle );
				continue;
			}

			$text = $this->updateText( $this->content->getTextForSearchIndex(), $search );

			# Perform the actual update
			$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
		}
	}

	/**
	 * Clean text for indexing. Only really suitable for indexing in databases.
	 * If you're using a real search engine, you'll probably want to override
	 * this behavior and do something nicer with the original wikitext.
	 *
	 * The text is run through the content language's normalizeForSearch() and
	 * lc(), then HTML-like tags, external URLs, image link syntax and every
	 * character outside the engine's legal search characters (plus '&#;') are
	 * replaced or removed. Section headings are repeated to emphasize them.
	 *
	 * @param string $text
	 * @param SearchEngine|null $se Search engine; when null a new default
	 *  engine is obtained from the service container
	 * @return string
	 */
	public function updateText( $text, ?SearchEngine $se = null ) {
		$services = MediaWikiServices::getInstance();
		$contLang = $services->getContentLanguage();
		# Language-specific strip/conversion
		$text = $contLang->normalizeForSearch( $text );
		$se ??= $services->newSearchEngine();
		$lc = $se->legalSearchChars() . '&#;';

		# Strip HTML markup
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $contLang->lc( " " . $text . " " ) );
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		# Bracketed external links, without and with a label
		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		# Internal image links
		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $pat2, " \\1 \\3", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		/**
		 * Handle 's, s'
		 *
		 *   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		 *   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		 *
		 * These tail-anchored regexps are very slow. The worst case comes
		 * when Japanese or Chinese text (ie, no word spacing) is written on
		 * a wiki configured for Western UTF-8 mode. The Unicode characters are
		 * expanded to hex codes and the "words" are very long paragraph-length
		 * monstrosities. On a large page the above regexps may take over 20
		 * seconds *each* on a 1GHz-level processor.
		 *
		 * Following are reversed versions which are consistently fast
		 * (about 3 milliseconds on 1GHz-level processor).
		 */
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		return $text;
	}

	/**
	 * Get ExistingPageRecord for the SearchUpdate $id using IDBAccessObject::READ_LATEST
	 * and ensure using the same ExistingPageRecord object if there are multiple
	 * SearchEngine types.
	 *
	 * The lookup runs at most once per SearchUpdate instance; a null result
	 * is remembered too, so a deleted page is not queried again for every
	 * search type.
	 *
	 * Returns null if a page has been deleted or is not found.
	 *
	 * @return ExistingPageRecord|null
	 */
	private function getLatestPage() {
		if ( !$this->latestPageLoaded ) {
			$this->latestPage = MediaWikiServices::getInstance()->getPageStore()
				->getPageById( $this->id, IDBAccessObject::READ_LATEST );
			$this->latestPageLoaded = true;
		}

		return $this->latestPage;
	}

	/**
	 * Get a normalized string representation of a title suitable for
	 * including in a search index
	 *
	 * Underscores in the DB key become spaces, characters outside the
	 * engine's legal search characters (plus '&#;') are replaced by spaces,
	 * possessive forms are expanded, whitespace is collapsed, and for pages
	 * in NS_FILE a trailing " png", " gif", " jpg", " jpeg" or " ogg" word is
	 * dropped.
	 *
	 * @param SearchEngine $search
	 * @return string A stripped-down title string ready for the search index
	 */
	private function getNormalizedTitle( SearchEngine $search ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$ns = $this->page->getNamespace();
		$title = str_replace( '_', ' ', $this->page->getDBkey() );

		$lc = $search->legalSearchChars() . '&#;';
		$t = $contLang->normalizeForSearch( $title );
		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
		$t = $contLang->lc( $t );

		# Handle 's, s'
		$t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
		$t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

		$t = preg_replace( "/\\s+/", ' ', $t );

		if ( $ns === NS_FILE ) {
			# The dot before the extension was already turned into a space above
			$t = preg_replace( "/ (png|gif|jpg|jpeg|ogg)$/", "", $t );
		}

		return $search->normalizeText( trim( $t ) );
	}
}
220 | |
221 | /** @deprecated class alias since 1.42; keeps the un-namespaced name 'SearchUpdate' usable for the namespaced class */ |
222 | class_alias( SearchUpdate::class, 'SearchUpdate' ); |