MediaWiki master
SearchUpdate.php
Go to the documentation of this file.
1<?php
21namespace MediaWiki\Search;
22
30use SearchEngine;
32
/** Page ID of the entry to update; doUpdate() returns early when this is falsy. */
41 private $id = 0;
42
/** Page given to the constructor; handed to Title::newFromPageIdentity() when the title is normalized. */
44 private $page;
45
/** Content passed to the constructor (?Content); null makes doUpdate() refresh only the title. */
47 private $content;
48
/** Result of the page lookup by ID, filled in on demand by getLatestPage(); null until then. */
50 private $latestPage = null;
51
/**
 * Store the page ID, the page and the optional content for a later doUpdate().
 *
 * @param mixed $id Page ID; used for the page lookup and as the search index key
 * @param mixed $page Page whose title is indexed (passed to Title::newFromPageIdentity())
 * @param Content|null $c Content to index; when null only the title is updated
 */
public function __construct( $id, $page, ?Content $c = null ) {
	$this->id = $id;
	$this->page = $page;
	$this->content = $c;
}
62
/**
 * Perform the actual search index update for this entry.
 *
 * Nothing is done when no page ID was given or when the DisableSearchUpdate
 * setting is enabled. Otherwise, for every configured search type whose
 * engine supports 'search-update':
 *  - the index record is deleted when the page can no longer be loaded by ID;
 *  - only the title is updated when no content was supplied;
 *  - otherwise the title and the cleaned-up text are both written.
 */
public function doUpdate() {
	$services = MediaWikiServices::getInstance();

	// Report the two skip reasons separately so the debug log names the real cause.
	if ( !$this->id ) {
		LoggerFactory::getInstance( "search" )
			->debug( "Skipping update: no page ID given" );
		return;
	}

	if ( $services->getMainConfig()->get( MainConfigNames::DisableSearchUpdate ) ) {
		LoggerFactory::getInstance( "search" )
			->debug( "Skipping update: search updates disabled by config" );
		return;
	}

	$searchEngineConfig = $services->getSearchEngineConfig();
	$seFactory = $services->getSearchEngineFactory();
	foreach ( $searchEngineConfig->getSearchTypes() as $type ) {
		$search = $seFactory->create( $type );
		if ( !$search->supports( 'search-update' ) ) {
			continue;
		}

		$normalTitle = $this->getNormalizedTitle( $search );

		if ( $this->getLatestPage() === null ) {
			// The page could not be loaded by ID: drop its index record.
			$search->delete( $this->id, $normalTitle );
			continue;
		}

		if ( $this->content === null ) {
			// No content supplied: refresh the title only.
			$search->updateTitle( $this->id, $normalTitle );
			continue;
		}

		// Content is guaranteed non-null here, so no empty-string fallback is needed.
		$text = $this->updateText( $this->content->getTextForSearchIndex(), $search );

		# Perform the actual update
		$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
	}
}
100
/**
 * Clean text for indexing.
 *
 * The text is normalized and lower-cased by the content language, then
 * stripped of HTML-like tags, external URLs and every character that the
 * search engine does not report as legal (plus '&', '#' and ';'). Headings,
 * link trails and possessive forms are expanded into extra index words.
 *
 * @param string $text Text to clean
 * @param SearchEngine|null $se Search engine supplying the legal search
 *  characters; when omitted one is obtained from the service container
 * @return string Cleaned-up text
 */
public function updateText( $text, ?SearchEngine $se = null ) {
	$services = MediaWikiServices::getInstance();
	$language = $services->getContentLanguage();

	# Language-specific strip/conversion
	$text = $language->normalizeForSearch( $text );

	if ( !$se ) {
		$se = $services->newSearchEngine();
	}
	$lc = $se->legalSearchChars() . '&#;';

	# Strip HTML markup; the text is padded with one space on each side and lower-cased first
	$padded = $language->lc( " " . $text . " " );
	$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/", ' ', $padded );

	# Emphasize headings: the heading text is written three times
	$text = preg_replace(
		"/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/",
		"\\1\\2 \\2 \\2\\3",
		$text
	);

	# Strip external URLs
	$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
	$protos = "http|https|ftp|mailto|news|gopher";

	// Bare URL not preceded by '['
	$bareUrl = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
	$text = preg_replace( $bareUrl, "\\1 \\3", $text );

	// Bracketed URL without a label, then bracketed URL with a label (the label is kept)
	$bracketedUrl = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
	$labelledUrl = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
	$text = preg_replace( $bracketedUrl, "\\1 ", $text );
	$text = preg_replace( $labelledUrl, "\\1 \\3 ", $text );

	# Handle [[game]]s: index both the link text and the text with its trail
	$text = preg_replace(
		"/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
		"\\1\\2 \\2\\3",
		$text
	);

	# Strip all remaining non-search characters
	$text = preg_replace( "/[^{$lc}]+/", " ", $text );

	# Possessives are matched on the byte-reversed string, so the trailing
	# "'s " and "s' " become leading " s'" and " 's" in the patterns below.
	# First pass: "word's " becomes "word word's ".
	$reversed = strrev( $text );
	$reversed = preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", $reversed );
	$text = strrev( $reversed );

	# Second pass: "words' " becomes "words ".
	$reversed = strrev( $text );
	$reversed = preg_replace( "/ 's([{$lc}]+)/", " s\\1", $reversed );
	$text = strrev( $reversed );

	# Strip wiki '' and '''
	$text = preg_replace( "/''[']*/", " ", $text );

	return $text;
}
164
/**
 * Return the page looked up by ID with READ_LATEST, remembering the result.
 *
 * Only a truthy result is reused; while the stored value is falsy, every
 * call performs the lookup again.
 *
 * @return mixed Whatever PageStore::getPageById() returned; doUpdate() treats
 *  null as "page is gone"
 */
private function getLatestPage() {
	if ( $this->latestPage ) {
		return $this->latestPage;
	}

	$pageStore = MediaWikiServices::getInstance()->getPageStore();
	$this->latestPage = $pageStore->getPageById( $this->id, IDBAccessObject::READ_LATEST );

	return $this->latestPage;
}
182
/**
 * Build the title string to store in the search index.
 *
 * The page title text is normalized by the content language, reduced to the
 * characters the search engine reports as legal (plus '&', '#' and ';'),
 * lower-cased, expanded for possessive forms, whitespace-collapsed, trimmed
 * and finally passed through the engine's own normalizeText().
 *
 * @param SearchEngine $search Engine supplying the legal characters and the
 *  final text normalization
 * @return string Normalized title
 */
private function getNormalizedTitle( SearchEngine $search ) {
	$language = MediaWikiServices::getInstance()->getContentLanguage();
	$rawTitle = Title::newFromPageIdentity( $this->page )->getText();

	$lc = $search->legalSearchChars() . '&#;';

	$normalized = $language->normalizeForSearch( $rawTitle );
	$normalized = preg_replace( "/[^{$lc}]+/", ' ', $normalized );
	$normalized = $language->lc( $normalized );

	# Handle 's, s': "word's" becomes "word word's", "words'" becomes "words"
	$normalized = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $normalized );
	$normalized = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $normalized );

	# Collapse runs of whitespace to a single space
	$normalized = preg_replace( "/\\s+/", ' ', $normalized );
	$normalized = trim( $normalized );

	return $search->normalizeText( $normalized );
}
207}
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
const DisableSearchUpdate
Name constant for the DisableSearchUpdate setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Database independent search index updater.
doUpdate()
Perform actual update for the entry.
__construct( $id, $page, ?Content $c=null)
updateText( $text, ?SearchEngine $se=null)
Clean text for indexing.
Represents a title within MediaWiki.
Definition Title.php:78
Contain a class for special pages.
delete( $id, $title)
Delete an indexed page. Title should be pre-processed.
supports( $feature)
update( $id, $title, $text)
Create or update the search index record for the given page.
normalizeText( $string)
When overridden in derived class, performs database-specific conversions on text to be used for searching or updating the search index.
updateTitle( $id, $title)
Update a search index record's title only.
legalSearchChars( $type=self::CHARS_ALL)
Get chars legal for search.
Content objects represent page content, e.g. the text to show on a page.
Definition Content.php:42
Data record representing a page that currently exists as an editable page on a wiki.
Interface for objects (potentially) representing an editable wiki page.
Interface for database access objects.