Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
42.19% |
27 / 64 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
| SearchUpdate | |
42.19% |
27 / 64 |
|
40.00% |
2 / 5 |
51.87 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| doUpdate | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
56 | |||
| updateText | |
100.00% |
24 / 24 |
|
100.00% |
1 / 1 |
2 | |||
| getLatestPage | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| getNormalizedTitle | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | * @file |
| 5 | */ |
| 6 | |
| 7 | namespace MediaWiki\Search; |
| 8 | |
| 9 | use MediaWiki\Content\Content; |
| 10 | use MediaWiki\Logger\LoggerFactory; |
| 11 | use MediaWiki\MainConfigNames; |
| 12 | use MediaWiki\MediaWikiServices; |
| 13 | use MediaWiki\Page\ExistingPageRecord; |
| 14 | use MediaWiki\Page\PageIdentity; |
| 15 | use MediaWiki\Title\Title; |
| 16 | use Wikimedia\Rdbms\IDBAccessObject; |
| 17 | |
/**
 * Database independent search index updater
 *
 * Holds one page (id, identity and optional content) and pushes its title
 * and text to every configured search engine type that reports support for
 * the 'search-update' feature.
 *
 * @internal
 * @ingroup Search
 */
class SearchUpdate {
	/** @var int Page id being updated */
	private $id = 0;

	/** @var PageIdentity The page we're updating */
	private $page;

	/** @var Content|null Content of the page (not text) */
	private $content;

	/** @var ExistingPageRecord|null Memoized result of getLatestPage() */
	private $latestPage = null;

	/**
	 * @var bool Whether getLatestPage() has already performed its lookup.
	 *  Needed because null ("page not found") is a valid, cacheable result.
	 */
	private $latestPageLoaded = false;

	/**
	 * @param int $id Page id to update
	 * @param PageIdentity $page Page to update
	 * @param Content|null $c Content of the page to update.
	 */
	public function __construct( $id, $page, ?Content $c = null ) {
		$this->page = $page;
		$this->id = $id;
		$this->content = $c;
	}

	/**
	 * Perform actual update for the entry
	 *
	 * Does nothing (apart from a debug log entry) when search updates are
	 * disabled by configuration or when no page id was given. Otherwise, for
	 * each search type that supports 'search-update':
	 *  - the entry is deleted if the page no longer exists,
	 *  - only the title is updated if no content was supplied,
	 *  - title and cleaned text are updated otherwise.
	 */
	public function doUpdate() {
		$services = MediaWikiServices::getInstance();

		if ( $services->getMainConfig()->get( MainConfigNames::DisableSearchUpdate ) ) {
			LoggerFactory::getInstance( "search" )
				->debug( "Skipping update: search updates disabled by config" );
			return;
		}

		if ( !$this->id ) {
			// Distinct message: this skip is unrelated to the configuration.
			LoggerFactory::getInstance( "search" )
				->debug( "Skipping update: no page id given" );
			return;
		}

		$searchEngineConfig = $services->getSearchEngineConfig();
		$seFactory = $services->getSearchEngineFactory();
		foreach ( $searchEngineConfig->getSearchTypes() as $type ) {
			$search = $seFactory->create( $type );
			if ( !$search->supports( 'search-update' ) ) {
				continue;
			}

			$normalTitle = $this->getNormalizedTitle( $search );

			// The lookup is memoized, so every engine type sees the same answer.
			if ( $this->getLatestPage() === null ) {
				$search->delete( $this->id, $normalTitle );
				continue;
			}
			if ( $this->content === null ) {
				$search->updateTitle( $this->id, $normalTitle );
				continue;
			}

			$text = $this->content->getTextForSearchIndex();
			$text = $this->updateText( $text, $search );

			# Perform the actual update
			$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
		}
	}

	/**
	 * Clean text for indexing. Only really suitable for indexing in databases.
	 * If you're using a real search engine, you'll probably want to override
	 * this behavior and do something nicer with the original wikitext.
	 *
	 * The text is normalized and lower-cased by the content language, then
	 * HTML-like tags and external URLs are stripped, headings are repeated,
	 * every character outside the engine's legal search characters (plus
	 * "&#;") is replaced by a space, possessive forms are expanded and wiki
	 * quote markup is removed.
	 *
	 * @param string $text
	 * @param SearchEngine|null $se Search engine; a new one is obtained from
	 *  the service container when omitted
	 * @return string
	 */
	public function updateText( $text, ?SearchEngine $se = null ) {
		$services = MediaWikiServices::getInstance();
		$contLang = $services->getContentLanguage();
		# Language-specific strip/conversion
		$text = $contLang->normalizeForSearch( $text );
		$se = $se ?? $services->newSearchEngine();
		$lc = $se->legalSearchChars() . '&#;';

		# Strip HTML markup
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $contLang->lc( " " . $text . " " ) );
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		# Bracketed external links: drop the URL, keep the label if there is one
		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		/**
		 * Handle 's, s'
		 *
		 *   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		 *   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		 *
		 * These tail-anchored regexps are very slow. The worst case comes
		 * when Japanese or Chinese text (ie, no word spacing) is written on
		 * a wiki configured for Western UTF-8 mode. The Unicode characters are
		 * expanded to hex codes and the "words" are very long paragraph-length
		 * monstrosities. On a large page the above regexps may take over 20
		 * seconds *each* on a 1GHz-level processor.
		 *
		 * Following are reversed versions which are consistently fast
		 * (about 3 milliseconds on 1GHz-level processor).
		 */
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		return $text;
	}

	/**
	 * Get ExistingPageRecord for the SearchUpdate $id using IDBAccessObject::READ_LATEST
	 * and ensure using the same ExistingPageRecord object if there are multiple
	 * SearchEngine types.
	 *
	 * The lookup is performed at most once per SearchUpdate instance; a null
	 * result is remembered too, so all SearchEngine types act on the same
	 * answer.
	 *
	 * Returns null if a page has been deleted or is not found.
	 *
	 * @return ExistingPageRecord|null
	 */
	private function getLatestPage() {
		if ( !$this->latestPageLoaded ) {
			$this->latestPage = MediaWikiServices::getInstance()->getPageStore()
				->getPageById( $this->id, IDBAccessObject::READ_LATEST );
			$this->latestPageLoaded = true;
		}

		return $this->latestPage;
	}

	/**
	 * Get a normalized string representation of a title suitable for
	 * including in a search index
	 *
	 * @param SearchEngine $search Engine whose legal search characters and
	 *  text normalization are applied
	 * @return string A stripped-down title string ready for the search index
	 */
	private function getNormalizedTitle( SearchEngine $search ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$title = Title::newFromPageIdentity( $this->page )->getText();

		$lc = $search->legalSearchChars() . '&#;';
		$t = $contLang->normalizeForSearch( $title );
		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
		$t = $contLang->lc( $t );

		if ( $this->page->getNamespace() === NS_FILE ) {
			# Index the file name both without and with its short extension
			$t = preg_replace( "/([{$lc}]+)\\.(\\w{1,4})$/", "\\1 \\1.\\2", $t );
		}

		# Handle 's, s'
		$t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
		$t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

		# Collapse the runs of whitespace introduced above
		$t = preg_replace( "/\\s+/", ' ', $t );

		return $search->normalizeText( trim( $t ) );
	}
}