MediaWiki master
SearchUpdate.php
Go to the documentation of this file.
1<?php
7namespace MediaWiki\Search;
8
17
/** @var int ID of the page being updated; a falsy value makes doUpdate() skip the update */
private $id = 0;

/** @var PageIdentity The page being updated (passed to Title::newFromPageIdentity()) */
private $page;

/** @var Content|null Content to index, or null when only the title is to be refreshed */
private $content;

/** @var mixed|null Page record lazily loaded by getLatestPage(); null until a record is found */
private $latestPage;
36
/**
 * Remember which page has to be (re-)indexed and with which content.
 *
 * @param int $id ID of the page to update
 * @param PageIdentity $page The page to update
 * @param Content|null $c New content of the page; when null, doUpdate()
 *  only refreshes the indexed title
 */
public function __construct( $id, $page, ?Content $c = null ) {
	$this->id = $id;
	$this->content = $c;
	$this->page = $page;
}
47
/**
 * Perform actual update for the entry.
 *
 * Nothing happens when search updates are disabled by configuration or when
 * no page ID is set. Otherwise every configured search type whose engine
 * supports 'search-update' is handled in turn:
 *  - no latest page record can be loaded: the index entry is deleted;
 *  - no content was supplied: only the indexed title is refreshed;
 *  - otherwise: the content's search text is cleaned and fully re-indexed.
 */
public function doUpdate() {
	// Every lookup below goes through the service container, so fetch it first.
	$services = MediaWikiServices::getInstance();
	$searchEngineConfig = $services->getSearchEngineConfig();

	if ( $services->getMainConfig()->get( MainConfigNames::DisableSearchUpdate ) ) {
		LoggerFactory::getInstance( "search" )
			->debug( "Skipping update: search updates disabled by config" );
		return;
	}

	// Logged separately so the debug log does not blame the configuration
	// for what is really a missing page ID.
	if ( !$this->id ) {
		LoggerFactory::getInstance( "search" )
			->debug( "Skipping update: no page ID given" );
		return;
	}

	$seFactory = $services->getSearchEngineFactory();
	foreach ( $searchEngineConfig->getSearchTypes() as $type ) {
		$search = $seFactory->create( $type );
		if ( !$search->supports( 'search-update' ) ) {
			continue;
		}

		// Normalisation depends on the engine's legal search characters,
		// so it has to be redone for every engine.
		$normalTitle = $this->getNormalizedTitle( $search );

		if ( $this->getLatestPage() === null ) {
			// No page record for this ID: remove the entry from the index.
			$search->delete( $this->id, $normalTitle );
			continue;
		}
		if ( $this->content === null ) {
			// No content to index: only the title is refreshed.
			$search->updateTitle( $this->id, $normalTitle );
			continue;
		}

		$text = $this->content->getTextForSearchIndex();
		$text = $this->updateText( $text, $search );

		# Perform the actual update
		$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
	}
}
86
/**
 * Clean text for indexing.
 *
 * Applies the content language's search normalisation, lower-cases the text,
 * strips HTML tags and external URLs, repeats heading text, removes every
 * character the search engine does not consider legal, and expands
 * possessive forms.
 *
 * @param string $text Text to clean
 * @param SearchEngine|null $se Engine whose legalSearchChars() define what is
 *  kept; when null, a new engine is obtained from the service container
 * @return string Cleaned text
 */
public function updateText( $text, ?SearchEngine $se = null ) {
	// The service container is needed for both the content language and the
	// fallback search engine.
	$services = MediaWikiServices::getInstance();
	$contLang = $services->getContentLanguage();
	# Language-specific strip/conversion
	$text = $contLang->normalizeForSearch( $text );
	$se = $se ?? $services->newSearchEngine();
	$lc = $se->legalSearchChars() . '&#;';

	# Strip HTML markup
	$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
		' ', $contLang->lc( " " . $text . " " ) );
	$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/",
		"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

	# Strip external URLs
	$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
	$protos = "http|https|ftp|mailto|news|gopher";
	$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
	$text = preg_replace( $pat, "\\1 \\3", $text );

	# Bracketed external links: drop the URL, keep the label (if any)
	$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
	$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
	$text = preg_replace( $p1, "\\1 ", $text );
	$text = preg_replace( $p2, "\\1 \\3 ", $text );

	$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
		"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

	# Strip all remaining non-search characters
	$text = preg_replace( "/[^{$lc}]+/", " ", $text );

	# Handle 's, s'
	# Both patterns run on the reversed string and the result is reversed
	# back, so "word's " becomes "word word's " and "words' " becomes
	# "words ". NOTE(review): the reversal is presumably a performance
	# measure (the pattern then starts with a literal instead of ending
	# with one) - confirm before simplifying.
	$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
	$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

	# Strip wiki '' and '''
	$text = preg_replace( "/''[']*/", " ", $text );

	return $text;
}
150
/**
 * Load the page record for $this->id, reading the latest data, and keep it
 * for subsequent calls.
 *
 * Only a truthy result is remembered: as long as the lookup yields null,
 * every call queries the page store again.
 *
 * @return mixed|null Whatever PageStore::getPageById() returned; doUpdate()
 *  treats null as "page is gone"
 */
private function getLatestPage() {
	if ( $this->latestPage ) {
		return $this->latestPage;
	}

	$pageStore = MediaWikiServices::getInstance()->getPageStore();
	$this->latestPage = $pageStore->getPageById( $this->id, IDBAccessObject::READ_LATEST );

	return $this->latestPage;
}
168
/**
 * Build the normalised form of this page's title text that is handed to
 * the search engine.
 *
 * @param SearchEngine $search Engine supplying the legal search characters
 *  and the final normalizeText() step
 * @return string
 */
private function getNormalizedTitle( SearchEngine $search ) {
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	$titleText = Title::newFromPageIdentity( $this->page )->getText();

	$lc = $search->legalSearchChars() . '&#;';

	// Language-specific normalisation, then blank out illegal characters
	// and lower-case the result.
	$normalized = preg_replace(
		"/[^{$lc}]+/",
		' ',
		$contLang->normalizeForSearch( $titleText )
	);
	$normalized = $contLang->lc( $normalized );

	// File pages: index the name without its extension in addition to the
	// full name ("foo.png" becomes "foo foo.png").
	$isFilePage = $this->page->getNamespace() === NS_FILE;
	if ( $isFilePage ) {
		$normalized = preg_replace( "/([{$lc}]+)\\.(\\w{1,4})$/", "\\1 \\1.\\2", $normalized );
	}

	# Handle 's, s'
	$normalized = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $normalized );
	$normalized = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $normalized );

	// Collapse runs of whitespace, then trim before the engine's own pass.
	$normalized = trim( preg_replace( "/\\s+/", ' ', $normalized ) );

	return $search->normalizeText( $normalized );
}
197}
const NS_FILE
Definition Defines.php:57
Create PSR-3 logger objects.
A class containing constants representing the names of configuration variables.
const DisableSearchUpdate
Name constant for the DisableSearchUpdate setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Contains a class for special pages.
Database independent search index updater.
doUpdate()
Perform actual update for the entry.
__construct( $id, $page, ?Content $c=null)
updateText( $text, ?SearchEngine $se=null)
Clean text for indexing.
Represents a title within MediaWiki.
Definition Title.php:69
Content objects represent page content, e.g.
Definition Content.php:28
Data record representing a page that currently exists as an editable page on a wiki.
Interface for objects (potentially) representing an editable wiki page.
Interface for database access objects.
Definition of a mapping for the search index field.