Code Coverage

                     | Lines             | Functions and Methods | CRAP  | Classes and Traits
Total                | 43.55% (27 / 62)  | 40.00% (2 / 5)        |       | 0.00% (0 / 1)
SearchUpdate         | 43.55% (27 / 62)  | 40.00% (2 / 5)        | 49.26 | 0.00% (0 / 1)
  __construct        | 100.00% (3 / 3)   | 100.00% (1 / 1)       | 1     |
  doUpdate           | 0.00% (0 / 21)    | 0.00% (0 / 1)         | 72    |
  updateText         | 100.00% (24 / 24) | 100.00% (1 / 1)       | 2     |
  getLatestPage      | 0.00% (0 / 4)     | 0.00% (0 / 1)         | 6     |
  getNormalizedTitle | 0.00% (0 / 10)    | 0.00% (0 / 1)         | 2     |
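The CRAP column is the Change Risk Anti-Patterns score that php-code-coverage attaches to each method: it weighs cyclomatic complexity against coverage, so complex but untested code floats to the top. A minimal sketch of the commonly cited formula follows; the complexity values in the comments (8 for doUpdate, 14 for the class) are inferred from the figures above rather than taken from a separate measurement.

<?php
// Sketch of the usual CRAP formula:
//   CRAP(m) = complexity(m)^2 * (1 - coverage(m))^3 + complexity(m)
// where coverage is a fraction between 0 and 1.
function crapScore( int $complexity, float $coverage ): float {
	return $complexity ** 2 * ( 1 - $coverage ) ** 3 + $complexity;
}

echo round( crapScore( 8, 0.0 ), 2 ), "\n";     // 72, matches doUpdate (0% covered)
echo round( crapScore( 2, 1.0 ), 2 ), "\n";     // 2, matches updateText (100% covered)
echo round( crapScore( 14, 0.4355 ), 2 ), "\n"; // 49.26, matches the SearchUpdate class row

By this measure, the untested doUpdate() carries most of the file's risk and is the method worth covering first. The source of the class as measured above follows.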
<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

namespace MediaWiki\Search;

use MediaWiki\Content\Content;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\ExistingPageRecord;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Title\Title;
use SearchEngine;
use Wikimedia\Rdbms\IDBAccessObject;

/**
 * Database independent search index updater
 *
 * @internal
 * @ingroup Search
 */
class SearchUpdate {
	/** @var int Page id being updated */
	private $id = 0;

	/** @var PageIdentity The page we're updating */
	private $page;

	/** @var Content|null Content of the page (not text) */
	private $content;

	/** @var ExistingPageRecord|null */
	private $latestPage = null;

	/**
	 * @param int $id Page id to update
	 * @param PageIdentity $page Page to update
	 * @param Content|null $c Content of the page to update.
	 */
	public function __construct( $id, $page, ?Content $c = null ) {
		$this->page = $page;
		$this->id = $id;
		$this->content = $c;
	}

	/**
	 * Perform actual update for the entry
	 */
	public function doUpdate() {
		$services = MediaWikiServices::getInstance();
		$searchEngineConfig = $services->getSearchEngineConfig();

		if ( $services->getMainConfig()->get( MainConfigNames::DisableSearchUpdate ) || !$this->id ) {
			LoggerFactory::getInstance( "search" )
				->debug( "Skipping update: search updates disabled by config" );
			return;
		}

		$seFactory = $services->getSearchEngineFactory();
		foreach ( $searchEngineConfig->getSearchTypes() as $type ) {
			$search = $seFactory->create( $type );
			if ( !$search->supports( 'search-update' ) ) {
				continue;
			}

			$normalTitle = $this->getNormalizedTitle( $search );

			if ( $this->getLatestPage() === null ) {
				$search->delete( $this->id, $normalTitle );
				continue;
			} elseif ( $this->content === null ) {
				$search->updateTitle( $this->id, $normalTitle );
				continue;
			}

			$text = $this->content !== null ? $this->content->getTextForSearchIndex() : '';
			$text = $this->updateText( $text, $search );

			# Perform the actual update
			$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
		}
	}

	/**
	 * Clean text for indexing. Only really suitable for indexing in databases.
	 * If you're using a real search engine, you'll probably want to override
	 * this behavior and do something nicer with the original wikitext.
	 * @param string $text
	 * @param SearchEngine|null $se Search engine
	 * @return string
	 */
	public function updateText( $text, ?SearchEngine $se = null ) {
		$services = MediaWikiServices::getInstance();
		$contLang = $services->getContentLanguage();
		# Language-specific strip/conversion
		$text = $contLang->normalizeForSearch( $text );
		$se = $se ?: $services->newSearchEngine();
		$lc = $se->legalSearchChars() . '&#;';

		# Strip HTML markup
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $contLang->lc( " " . $text . " " ) );
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		/**
		 * Handle 's, s'
		 *
		 * $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		 * $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		 *
		 * These tail-anchored regexps are very slow. The worst case comes
		 * when Japanese or Chinese text (ie, no word spacing) is written on
		 * a wiki configured for Western UTF-8 mode. The Unicode characters are
		 * expanded to hex codes and the "words" are very long paragraph-length
		 * monstrosities. On a large page the above regexps may take over 20
		 * seconds *each* on a 1GHz-level processor.
		 *
		 * Following are reversed versions which are consistently fast
		 * (about 3 milliseconds on 1GHz-level processor).
		 */
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		return $text;
	}

	/**
	 * Get ExistingPageRecord for the SearchUpdate $id using IDBAccessObject::READ_LATEST
	 * and ensure using the same ExistingPageRecord object if there are multiple
	 * SearchEngine types.
	 *
	 * Returns null if a page has been deleted or is not found.
	 *
	 * @return ExistingPageRecord|null
	 */
	private function getLatestPage() {
		if ( !$this->latestPage ) {
			$this->latestPage = MediaWikiServices::getInstance()->getPageStore()
				->getPageById( $this->id, IDBAccessObject::READ_LATEST );
		}

		return $this->latestPage;
	}

	/**
	 * Get a normalized string representation of a title suitable for
	 * including in a search index
	 *
	 * @param SearchEngine $search
	 * @return string A stripped-down title string ready for the search index
	 */
	private function getNormalizedTitle( SearchEngine $search ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$title = Title::newFromPageIdentity( $this->page )->getText();

		$lc = $search->legalSearchChars() . '&#;';
		$t = $contLang->normalizeForSearch( $title );
		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
		$t = $contLang->lc( $t );

		# Handle 's, s'
		$t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
		$t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

		$t = preg_replace( "/\\s+/", ' ', $t );

		return $search->normalizeText( trim( $t ) );
	}
}
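doUpdate(), getLatestPage() and getNormalizedTitle() are the uncovered paths in the report above; the 100% figure on updateText() suggests it is the only method exercised directly by the existing tests, for example against a stub SearchEngine. Below is a minimal, hypothetical sketch of how a caller could drive the uncovered doUpdate() path end to end. The reindexPage() helper is invented for illustration, the sketch assumes a bootstrapped MediaWiki install with $wgDisableSearchUpdate left off, and in core the update would normally be queued through DeferredUpdates rather than run inline.

<?php
// Hypothetical helper, for illustration only: reindex a single page by id.
// Exercises SearchUpdate::doUpdate(), which in turn calls getLatestPage()
// and getNormalizedTitle() for every configured search engine type.
use MediaWiki\MediaWikiServices;
use MediaWiki\Search\SearchUpdate;

function reindexPage( int $pageId ): void {
	$services = MediaWikiServices::getInstance();

	$page = $services->getPageStore()->getPageById( $pageId );
	if ( !$page ) {
		return; // nothing to reindex for a missing page
	}

	// Pass the current Content so the page text is reindexed;
	// passing null would only refresh the stored title.
	$content = $services->getWikiPageFactory()->newFromTitle( $page )->getContent();

	( new SearchUpdate( $pageId, $page, $content ) )->doUpdate();
}

Running something along these lines from an integration test would lift the three 0% methods; updateText() can continue to be unit-tested in isolation, since it takes the text and the SearchEngine directly.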