MediaWiki master
SearchUpdate.php
Go to the documentation of this file.
1<?php
7namespace MediaWiki\Search;
8
16use SearchEngine;
18
/** Page ID handed to the constructor; doUpdate() skips the update when it is falsy. */
27 private $id = 0;
28
/** Page handed to the constructor; getNormalizedTitle() reads its title text and namespace. */
30 private $page;
31
/** Optional Content handed to the constructor; when null, doUpdate() only updates the title. */
33 private $content;
34
/** Result of the READ_LATEST page lookup, stored by getLatestPage() after its first query. */
36 private $latestPage = null;
37
/**
 * Remember which page to index and, optionally, the content to index for it.
 *
 * Nothing is written to the search index here; the work happens in doUpdate().
 *
 * @param int $id ID of the page to update; a falsy value makes doUpdate() a no-op
 * @param mixed $page Page object; must be accepted by Title::newFromPageIdentity()
 *  and expose getNamespace(), as getNormalizedTitle() relies on both
 * @param Content|null $c Content to index, or null to update only the title
 */
public function __construct( $id, $page, ?Content $c = null ) {
	$this->id = $id;
	$this->content = $c;
	$this->page = $page;
}
48
/**
 * Perform the actual index update for the page.
 *
 * For every configured search type whose engine supports 'search-update':
 *  - the index entry is deleted when the page can no longer be found by ID,
 *  - only the title is updated when no content was supplied,
 *  - otherwise the cleaned and normalized text is written together with the title.
 *
 * Nothing happens when search updates are disabled by configuration or when
 * this update carries no page ID.
 */
public function doUpdate() {
	// Every service used below comes from the global service locator.
	$services = MediaWikiServices::getInstance();

	if ( $services->getMainConfig()->get( MainConfigNames::DisableSearchUpdate ) ) {
		LoggerFactory::getInstance( "search" )
			->debug( "Skipping update: search updates disabled by config" );
		return;
	}

	if ( !$this->id ) {
		// Kept apart from the config case so the debug log names the real reason.
		LoggerFactory::getInstance( "search" )
			->debug( "Skipping update: no page ID to update" );
		return;
	}

	$searchEngineConfig = $services->getSearchEngineConfig();
	$seFactory = $services->getSearchEngineFactory();

	foreach ( $searchEngineConfig->getSearchTypes() as $type ) {
		$search = $seFactory->create( $type );
		if ( !$search->supports( 'search-update' ) ) {
			continue;
		}

		$normalTitle = $this->getNormalizedTitle( $search );

		if ( $this->getLatestPage() === null ) {
			// The page lookup by ID found nothing: drop the index entry.
			$search->delete( $this->id, $normalTitle );
			continue;
		}

		if ( $this->content === null ) {
			// No content supplied: refresh the indexed title only.
			$search->updateTitle( $this->id, $normalTitle );
			continue;
		}

		$text = $this->content->getTextForSearchIndex();
		$text = $this->updateText( $text, $search );

		# Perform the actual update
		$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
	}
}
87
/**
 * Clean text for indexing.
 *
 * Applies the content language's search normalization, lower-cases the text,
 * removes HTML tags and external URLs, repeats section headings for emphasis
 * and finally replaces everything outside the search engine's legal character
 * set (plus '&', '#' and ';') with spaces.
 *
 * @param string $text Text to clean
 * @param SearchEngine|null $se Engine whose legalSearchChars() define what is kept;
 *  when null, one is obtained from the service container
 * @return string Cleaned text
 */
public function updateText( $text, ?SearchEngine $se = null ) {
	// Both the content language and the fallback engine come from the service locator.
	$services = MediaWikiServices::getInstance();
	$contLang = $services->getContentLanguage();
	# Language-specific strip/conversion
	$text = $contLang->normalizeForSearch( $text );
	$se = $se ?? $services->newSearchEngine();
	$lc = $se->legalSearchChars() . '&#;';

	# Strip HTML markup
	$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
		' ', $contLang->lc( " " . $text . " " ) );
	# Emphasize headings: "== X ==" becomes "X X X"
	$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/",
		"\\1\\2 \\2 \\2\\3", $text );

	# Strip external URLs
	$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
	$protos = "http|https|ftp|mailto|news|gopher";
	# Bare URLs, i.e. those not preceded by "["
	$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
	$text = preg_replace( $pat, "\\1 \\3", $text );

	# Bracketed links: "[url]" is dropped, "[url label]" keeps only the label
	$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
	$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
	$text = preg_replace( $p1, "\\1 ", $text );
	$text = preg_replace( $p2, "\\1 \\3 ", $text );

	# Handle [[game]]s: keep both the link target and the suffixed word
	$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
		"\\1\\2 \\2\\3", $text );

	# Strip all remaining non-search characters
	$text = preg_replace( "/[^{$lc}]+/", " ", $text );

	# Handle 's and s' on the reversed string, so each pattern starts at the
	# word's suffix (presumably to avoid a slow tail-anchored match; confirm
	# before rewriting): "word's " becomes "word word's " and "words' "
	# becomes "words ".
	$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
	$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

	# Strip wiki '' and '''
	$text = preg_replace( "/''[']*/", " ", $text );

	return $text;
}
151
/**
 * Look up the page by its ID, reading the latest data, and remember the result.
 *
 * A truthy stored result is returned without another lookup. A falsy result
 * (e.g. null when the lookup finds nothing) is not remembered, so the next
 * call queries the page store again.
 *
 * @return mixed Whatever PageStore::getPageById() returned; doUpdate() treats
 *  null as "page no longer exists"
 */
private function getLatestPage() {
	if ( $this->latestPage ) {
		return $this->latestPage;
	}

	$pageStore = MediaWikiServices::getInstance()->getPageStore();
	$this->latestPage = $pageStore->getPageById( $this->id, IDBAccessObject::READ_LATEST );

	return $this->latestPage;
}
169
/**
 * Build the form of the page title that is handed to the search engine.
 *
 * @param SearchEngine $search Engine providing the legal character set and
 *  the final normalizeText() pass
 * @return string Normalized title
 */
private function getNormalizedTitle( SearchEngine $search ) {
	$language = MediaWikiServices::getInstance()->getContentLanguage();
	$lc = $search->legalSearchChars() . '&#;';

	$titleText = Title::newFromPageIdentity( $this->page )->getText();

	// Language-specific normalization, then blank out illegal characters and lower-case.
	$normalized = $language->normalizeForSearch( $titleText );
	$normalized = preg_replace( "/[^{$lc}]+/", ' ', $normalized );
	$normalized = $language->lc( $normalized );

	$isFile = $this->page->getNamespace() === NS_FILE;
	if ( $isFile ) {
		// "name.ext" at the end becomes "name name.ext", so the bare name is indexed too.
		$normalized = preg_replace( "/([{$lc}]+)\\.(\\w{1,4})$/", "\\1 \\1.\\2", $normalized );
	}

	# Handle 's, s': "word's" becomes "word word's", "words'" becomes "words"
	$normalized = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $normalized );
	$normalized = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $normalized );

	// Collapse whitespace runs left behind by the replacements above.
	$normalized = preg_replace( "/\\s+/", ' ', $normalized );
	$normalized = trim( $normalized );

	return $search->normalizeText( $normalized );
}
198}
const NS_FILE
Definition Defines.php:57
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
const DisableSearchUpdate
Name constant for the DisableSearchUpdate setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Database independent search index updater.
doUpdate()
Perform actual update for the entry.
__construct( $id, $page, ?Content $c=null)
updateText( $text, ?SearchEngine $se=null)
Clean text for indexing.
Represents a title within MediaWiki.
Definition Title.php:69
Contain a class for special pages.
delete( $id, $title)
Delete an indexed page. Title should be pre-processed.
supports( $feature)
update( $id, $title, $text)
Create or update the search index record for the given page.
normalizeText( $string)
When overridden in derived class, performs database-specific conversions on text to be used for searching or updating the search index.
updateTitle( $id, $title)
Update a search index record's title only.
legalSearchChars( $type=self::CHARS_ALL)
Get chars legal for search.
Content objects represent page content, e.g.
Definition Content.php:28
Data record representing a page that currently exists as an editable page on a wiki.
Interface for objects (potentially) representing an editable wiki page.
Interface for database access objects.