MediaWiki  master
SearchUpdate.php
Go to the documentation of this file.
1 <?php
30 
/**
 * Database independent search index updater.
 *
 * For every configured search type whose engine supports the
 * 'search-update' feature, this deferrable update deletes the index entry,
 * refreshes only the title, or refreshes title and text, depending on whether
 * the page still exists and whether content was supplied.
 */
class SearchUpdate implements DeferrableUpdate {
	/** @var int Page id being updated */
	private $id = 0;

	/** @var PageIdentity The page we're updating */
	private $page;

	/** @var Content|null Content of the page (not text) */
	private $content;

	/** @var ExistingPageRecord|null Memoized result of getLatestPage() */
	private $latestPage = null;

	/**
	 * @param int $id Page id to update
	 * @param PageIdentity $page Page to update
	 * @param Content|null $c Content of the page to index. Passing a string
	 *   or a boolean is deprecated since 1.34: a string is wrapped in a
	 *   TextContent, a boolean is treated as null.
	 */
	public function __construct( $id, $page, $c = null ) {
		$this->page = $page;

		$this->id = $id;
		// is_string() check is back-compat for ApprovedRevs
		if ( is_string( $c ) ) {
			wfDeprecated( __METHOD__ . " with a string for the content", '1.34' );
			$c = new TextContent( $c );
		} elseif ( is_bool( $c ) ) {
			wfDeprecated( __METHOD__ . " with a boolean for the content", '1.34' );
			$c = null;
		}
		$this->content = $c;
	}

	/**
	 * Perform actual update for the entry.
	 *
	 * Does nothing when the 'DisableSearchUpdate' setting is on or when the
	 * page id is falsy. Otherwise, for each search type that supports
	 * 'search-update':
	 *  - the entry is deleted if the page no longer exists;
	 *  - only the title is updated if no content was given;
	 *  - title and cleaned text are updated otherwise.
	 */
	public function doUpdate() {
		$services = MediaWikiServices::getInstance();
		$config = $services->getSearchEngineConfig();

		if ( $config->getConfig()->get( 'DisableSearchUpdate' ) || !$this->id ) {
			return;
		}

		$seFactory = $services->getSearchEngineFactory();
		foreach ( $config->getSearchTypes() as $type ) {
			$search = $seFactory->create( $type );
			if ( !$search->supports( 'search-update' ) ) {
				continue;
			}

			// The normalized title depends on the engine's legal characters,
			// so it has to be computed per engine.
			$normalTitle = $this->getNormalizedTitle( $search );

			if ( $this->getLatestPage() === null ) {
				$search->delete( $this->id, $normalTitle );
				continue;
			} elseif ( $this->content === null ) {
				$search->updateTitle( $this->id, $normalTitle );
				continue;
			}

			// $this->content is known to be non-null here: the null case
			// was handled (and skipped) by the branch above.
			$text = $this->content->getTextForSearchIndex();
			$text = $this->updateText( $text, $search );

			# Perform the actual update
			$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
		}
	}

	/**
	 * Clean text for indexing.
	 *
	 * Lower-cases the text, strips HTML-like tags, external URLs, image link
	 * syntax and every character the search engine does not consider legal,
	 * and duplicates some words (headings, link trails, possessives) so that
	 * they can be matched in more than one form.
	 *
	 * @param string $text
	 * @param SearchEngine|null $se Search engine to take the legal characters
	 *   from; when null, a new engine is obtained from the service container.
	 * @return string
	 */
	public function updateText( $text, ?SearchEngine $se = null ) {
		$services = MediaWikiServices::getInstance();
		$contLang = $services->getContentLanguage();
		# Language-specific strip/conversion
		$text = $contLang->normalizeForSearch( $text );
		$se = $se ?? $services->newSearchEngine();
		$lc = $se->legalSearchChars() . '&#;';

		# Strip HTML markup
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $contLang->lc( " " . $text . " " ) );
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		# Bracketed external links: drop the URL, keep the label if there is one
		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		# Internal image links
		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $pat2, " \\1 \\3", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		# Handle 's, s'
		#
		# These two replacements run on the reversed string and reverse the
		# result back. Read forwards they turn "word's " into "word word's "
		# and "words' " into "words ", like the forward patterns used in
		# getNormalizedTitle(). The reason for the reversed form is not
		# visible here; presumably it avoids slow tail-anchored matching on
		# very long "words".
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		return $text;
	}

	/**
	 * Get the ExistingPageRecord for this update's page id, read with
	 * PageStore::READ_LATEST, and reuse the same record on later calls so that
	 * every search type handled by doUpdate() sees the same object.
	 *
	 * Note that a null result is not memoized (isset() is false for null), so
	 * a page that cannot be found is looked up again on each call.
	 *
	 * @return ExistingPageRecord|null Null if no page with this id was found
	 */
	private function getLatestPage() {
		if ( !isset( $this->latestPage ) ) {
			$this->latestPage = MediaWikiServices::getInstance()->getPageStore()
				->getPageById( $this->id, PageStore::READ_LATEST );
		}

		return $this->latestPage;
	}

	/**
	 * Get a normalized string representation of a title suitable for
	 * including in a search index.
	 *
	 * @param SearchEngine $search Engine providing the legal search
	 *   characters and the final text normalization
	 * @return string A stripped-down title string ready for the search index
	 */
	private function getNormalizedTitle( SearchEngine $search ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$ns = $this->page->getNamespace();
		$title = str_replace( '_', ' ', $this->page->getDBkey() );

		$lc = $search->legalSearchChars() . '&#;';
		$t = $contLang->normalizeForSearch( $title );
		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
		$t = $contLang->lc( $t );

		# Handle 's, s'
		$t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
		$t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

		$t = preg_replace( "/\\s+/", ' ', $t );

		if ( $ns === NS_FILE ) {
			# Drop a trailing file-type word. The pattern expects a space
			# before it, presumably because the "." was already replaced by
			# the non-search-character strip above.
			$t = preg_replace( "/ (png|gif|jpg|jpeg|ogg)$/", "", $t );
		}

		return $search->normalizeText( trim( $t ) );
	}
}
Page\PageIdentity
Interface for objects (potentially) representing an editable wiki page.
Definition: PageIdentity.php:64
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:193
SearchUpdate\updateText
updateText( $text, SearchEngine $se=null)
Clean text for indexing.
Definition: SearchUpdate.php:113
SearchEngine\normalizeText
normalizeText( $string)
When overridden in derived class, performs database-specific conversions on text to be used for searc...
Definition: SearchEngine.php:253
SearchUpdate
Database independent search index updater.
Definition: SearchUpdate.php:36
SearchEngine\legalSearchChars
legalSearchChars( $type=self::CHARS_ALL)
Get chars legal for search.
Definition: SearchEngine.php:286
wfDeprecated
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
Definition: GlobalFunctions.php:997
SearchUpdate\$latestPage
ExistingPageRecord null $latestPage
Definition: SearchUpdate.php:47
SearchUpdate\$content
Content null $content
Content of the page (not text)
Definition: SearchUpdate.php:44
$title
$title
Definition: testCompression.php:38
SearchUpdate\__construct
__construct( $id, $page, $c=null)
Definition: SearchUpdate.php:54
Page\ExistingPageRecord
Data record representing a page that currently exists as an editable page on a wiki.
Definition: ExistingPageRecord.php:15
SearchUpdate\$id
int $id
Page id being updated.
Definition: SearchUpdate.php:38
TextContent
Content object implementation for representing flat text.
Definition: TextContent.php:39
SearchEngine
Contain a class for special pages.
Definition: SearchEngine.php:37
Content
Base interface for content objects.
Definition: Content.php:35
SearchUpdate\getLatestPage
getLatestPage()
Get ExistingPageRecord for the SearchUpdate $id using PageStore::READ_LATEST and ensure using the same ExistingPageRecord object if there are multiple SearchEngine types.
Definition: SearchUpdate.php:182
SearchUpdate\$page
PageIdentity $page
The page we're updating.
Definition: SearchUpdate.php:41
SearchUpdate\getNormalizedTitle
getNormalizedTitle(SearchEngine $search)
Get a normalized string representation of a title suitable for including in a search index.
Definition: SearchUpdate.php:198
$t
$t
Definition: testCompression.php:74
DeferrableUpdate
Interface that deferrable updates should implement.
Definition: DeferrableUpdate.php:11
NS_FILE
const NS_FILE
Definition: Defines.php:70
Page\PageStore
Definition: PageStore.php:29
SearchUpdate\doUpdate
doUpdate()
Perform actual update for the entry.
Definition: SearchUpdate.php:72
$type
$type
Definition: testCompression.php:52