MediaWiki master
SearchUpdate.php
Go to the documentation of this file.
1<?php
21namespace MediaWiki\Search;
22
31use SearchEngine;
33
/**
 * @var int ID of the page to update. Set from the constructor's $id, used for
 *  the PageStore lookup in getLatestPage(); doUpdate() bails out when it is falsy.
 */
42 private $id = 0;
43
/**
 * @var mixed The page being updated. It is handed to Title::newFromPageIdentity(),
 *  so presumably a PageIdentity - TODO confirm against callers.
 */
45 private $page;
46
/**
 * @var Content|null Content to index. When null, doUpdate() only refreshes
 *  the indexed title.
 */
48 private $content;
49
/**
 * @var mixed Result of PageStore::getPageById(), loaded lazily by getLatestPage().
 *  Stays null until a lookup finds the page (presumably an ExistingPageRecord
 *  once loaded - TODO confirm).
 */
51 private $latestPage = null;
52
/**
 * Remember which page to re-index and, optionally, the content to index.
 *
 * @param int $id Page ID; a falsy value makes doUpdate() skip the update
 * @param mixed $page Page being updated; later passed to
 *  Title::newFromPageIdentity() (presumably a PageIdentity - confirm)
 * @param Content|null $c Content to index, or null to refresh the title only
 */
public function __construct( $id, $page, ?Content $c = null ) {
	$this->id = $id;
	$this->page = $page;
	$this->content = $c;
}
63
/**
 * Perform actual update for the entry.
 *
 * Returns without touching any index when search updates are disabled by
 * configuration or when no page ID is set. Otherwise every configured search
 * type whose engine supports 'search-update' is processed in turn:
 *  - the page can no longer be found by ID: its index entry is deleted;
 *  - no content was supplied: only the indexed title is updated;
 *  - otherwise: title and cleaned text are both (re)indexed.
 */
public function doUpdate() {
	# Every service used below comes from this container
	$services = MediaWikiServices::getInstance();
	$searchEngineConfig = $services->getSearchEngineConfig();

	if ( $services->getMainConfig()->get( MainConfigNames::DisableSearchUpdate ) || !$this->id ) {
		LoggerFactory::getInstance( "search" )
			->debug( "Skipping update: search updates disabled by config" );
		return;
	}

	$seFactory = $services->getSearchEngineFactory();
	foreach ( $searchEngineConfig->getSearchTypes() as $type ) {
		$search = $seFactory->create( $type );
		if ( !$search->supports( 'search-update' ) ) {
			continue;
		}

		$normalTitle = $this->getNormalizedTitle( $search );

		if ( $this->getLatestPage() === null ) {
			$search->delete( $this->id, $normalTitle );
			continue;
		} elseif ( $this->content === null ) {
			$search->updateTitle( $this->id, $normalTitle );
			continue;
		}

		# Content is guaranteed non-null here: the null case was handled above
		$text = $this->content->getTextForSearchIndex();
		$text = $this->updateText( $text, $search );

		# Perform the actual update
		$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
	}
}
101
/**
 * Clean text for indexing.
 *
 * Applies the content language's search normalization, lower-cases the
 * text, blanks out HTML tags and external links, repeats heading text,
 * turns every run of characters outside the engine's legal search set into
 * a space, and finally rewrites possessive forms and wiki quote markup.
 *
 * @param string $text Text to prepare for the search index
 * @param SearchEngine|null $se Engine supplying the legal search characters;
 *  when not given, one is requested from the service container
 * @return string Cleaned text
 */
public function updateText( $text, ?SearchEngine $se = null ) {
	$services = MediaWikiServices::getInstance();
	$language = $services->getContentLanguage();

	# Language-specific strip/conversion
	$text = $language->normalizeForSearch( $text );

	if ( !$se ) {
		$se = $services->newSearchEngine();
	}
	$legal = $se->legalSearchChars() . '&#;';

	# Strip HTML markup (the text is padded with spaces and lower-cased first)
	$lowered = $language->lc( " " . $text . " " );
	$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/", ' ', $lowered );

	# Emphasize headings by repeating their text
	$text = preg_replace(
		"/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/",
		"\\1\\2 \\2 \\2\\3",
		$text
	);

	# Strip external URLs
	$urlChars = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
	$schemes = "http|https|ftp|mailto|news|gopher";
	$text = preg_replace(
		"/(^|[^\\[])({$schemes}):[{$urlChars}]+([^{$urlChars}]|$)/",
		"\\1 \\3",
		$text
	);

	# Strip bracketed external links; a label after the URL is kept.
	# The rules are applied one after another, in this order.
	$bracketedLinkRules = [
		"/([^\\[])\\[({$schemes}):[{$urlChars}]+]/" => "\\1 ",
		"/([^\\[])\\[({$schemes}):[{$urlChars}]+\\s+([^\\]]+)]/" => "\\1 \\3 ",
	];
	foreach ( $bracketedLinkRules as $pattern => $replacement ) {
		$text = preg_replace( $pattern, $replacement, $text );
	}

	# Handle [[game]]s
	$text = preg_replace(
		"/([^{$legal}])([{$legal}]+)]]([a-z]+)/",
		"\\1\\2 \\2\\3",
		$text
	);

	# Strip all remaining non-search characters
	$text = preg_replace( "/[^{$legal}]+/", " ", $text );

	# Handle 's and s' on the reversed string, so each pattern starts at the
	# suffix instead of ending on it (presumably for speed - confirm).
	# "word's " becomes "word word's "
	$reversed = strrev( $text );
	$text = strrev( preg_replace( "/ s'([{$legal}]+)/", " s'\\1 \\1", $reversed ) );
	# "words' " becomes "words "
	$reversed = strrev( $text );
	$text = strrev( preg_replace( "/ 's([{$legal}]+)/", " s\\1", $reversed ) );

	# Strip wiki '' and '''
	$text = preg_replace( "/''[']*/", " ", $text );

	return $text;
}
165
/**
 * Fetch the page record for $this->id, reading the latest data.
 *
 * A record that was found is kept and returned on later calls. While the
 * lookup yields nothing, each call queries the page store again.
 *
 * @return mixed Whatever PageStore::getPageById() returned; doUpdate()
 *  treats null as "page no longer exists"
 */
private function getLatestPage() {
	if ( $this->latestPage ) {
		return $this->latestPage;
	}

	$pageStore = MediaWikiServices::getInstance()->getPageStore();
	$this->latestPage = $pageStore->getPageById( $this->id, IDBAccessObject::READ_LATEST );

	return $this->latestPage;
}
183
/**
 * Build the title text that is stored in the search index for this page.
 *
 * The page title is normalized by the content language, reduced to the
 * engine's legal search characters, lower-cased, has its possessive forms
 * rewritten and its whitespace collapsed, and is then passed through the
 * engine's own text normalization.
 *
 * @param SearchEngine $search Engine providing the legal search characters
 *  and the final normalizeText() step
 * @return string Title as returned by $search->normalizeText()
 */
private function getNormalizedTitle( SearchEngine $search ) {
	$language = MediaWikiServices::getInstance()->getContentLanguage();
	$rawTitle = Title::newFromPageIdentity( $this->page )->getText();
	$legal = $search->legalSearchChars() . '&#;';

	$normalized = $language->normalizeForSearch( $rawTitle );
	$normalized = preg_replace( "/[^{$legal}]+/", ' ', $normalized );
	$normalized = $language->lc( $normalized );

	# Handle 's, s' (applied in this order)
	$possessiveRules = [
		# "word's" becomes "word word's"
		"/([{$legal}]+)'s( |$)/" => "\\1 \\1's ",
		# "words'" becomes "words"
		"/([{$legal}]+)s'( |$)/" => "\\1s ",
	];
	foreach ( $possessiveRules as $pattern => $replacement ) {
		$normalized = preg_replace( $pattern, $replacement, $normalized );
	}

	# Collapse every whitespace run into a single space
	$normalized = preg_replace( "/\\s+/", ' ', $normalized );

	return $search->normalizeText( trim( $normalized ) );
}
208}
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
const DisableSearchUpdate
Name constant for the DisableSearchUpdate setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Database independent search index updater.
doUpdate()
Perform actual update for the entry.
__construct( $id, $page, ?Content $c=null)
updateText( $text, ?SearchEngine $se=null)
Clean text for indexing.
Represents a title within MediaWiki.
Definition Title.php:78
Contain a class for special pages.
delete( $id, $title)
Delete an indexed page. Title should be pre-processed.
supports( $feature)
update( $id, $title, $text)
Create or update the search index record for the given page.
normalizeText( $string)
When overridden in a derived class, performs database-specific conversions on text to be used for searching or updating the search index.
updateTitle( $id, $title)
Update a search index record's title only.
legalSearchChars( $type=self::CHARS_ALL)
Get chars legal for search.
Base interface for representing page content.
Definition Content.php:39
Interface that deferrable updates should implement.
Data record representing a page that currently exists as an editable page on a wiki.
Interface for objects (potentially) representing an editable wiki page.
Interface for database access objects.