MediaWiki  master
SearchUpdate.php
Go to the documentation of this file.
1 <?php
27 
/**
 * Deferrable update that brings the search index entry of a single page
 * in line with the page's current state.
 *
 * For every configured search type that supports 'search-update', the entry
 * is deleted (page no longer exists), has only its title refreshed (no
 * content was supplied), or is rewritten with freshly normalized text.
 */
class SearchUpdate implements DeferrableUpdate {
	/** @var int Id of the page whose index entry is updated */
	private $id = 0;

	/** @var Title Title of the page whose index entry is updated */
	private $title;

	/** @var Content|null Content to index; null means "update the title only" */
	private $content;

	/** @var WikiPage|null Result of the page lookup; null when no page has this id */
	private $page;

	/**
	 * @var bool Whether $page already holds the lookup result. A separate flag
	 *  is needed because a null result cannot be told apart from "not looked
	 *  up yet" with isset().
	 */
	private $pageLookedUp = false;

	/**
	 * @param int $id Page id to update
	 * @param Title|string $title Title of the page; passing a string is
	 *  deprecated since 1.34
	 * @param Content|string|bool|null $c Content of the page to index; passing
	 *  a string or a boolean is deprecated since 1.34
	 * @throws InvalidArgumentException If a string title cannot be parsed
	 */
	public function __construct( $id, $title, $c = null ) {
		if ( is_string( $title ) ) {
			wfDeprecated( __METHOD__ . " with a string for the title", '1.34' );
			$this->title = Title::newFromText( $title );
			if ( $this->title === null ) {
				throw new InvalidArgumentException( "Cannot construct the title: $title" );
			}
		} else {
			$this->title = $title;
		}

		$this->id = $id;
		// is_string() check is back-compat for ApprovedRevs
		if ( is_string( $c ) ) {
			wfDeprecated( __METHOD__ . " with a string for the content", '1.34' );
			$c = new TextContent( $c );
		} elseif ( is_bool( $c ) ) {
			wfDeprecated( __METHOD__ . " with a boolean for the content", '1.34' );
			$c = null;
		}
		$this->content = $c;
	}

	/**
	 * Perform the actual update for every search type that supports it.
	 *
	 * Does nothing when 'DisableSearchUpdate' is set or the page id is falsy.
	 */
	public function doUpdate() {
		$services = MediaWikiServices::getInstance();
		$config = $services->getSearchEngineConfig();

		if ( $config->getConfig()->get( 'DisableSearchUpdate' ) || !$this->id ) {
			return;
		}

		$seFactory = $services->getSearchEngineFactory();
		foreach ( $config->getSearchTypes() as $type ) {
			$search = $seFactory->create( $type );
			if ( !$search->supports( 'search-update' ) ) {
				continue;
			}

			$normalTitle = $this->getNormalizedTitle( $search );

			if ( $this->getLatestPage() === null ) {
				// The page is gone: drop its entry from the index
				$search->delete( $this->id, $normalTitle );
				continue;
			} elseif ( $this->content === null ) {
				// Nothing to index besides the title
				$search->updateTitle( $this->id, $normalTitle );
				continue;
			}

			// $this->content is known to be non-null from here on
			$text = $this->updateText( $this->content->getTextForSearchIndex(), $search );

			# Perform the actual update
			$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
		}
	}

	/**
	 * Clean text for indexing: apply language normalization, lowercase it,
	 * strip HTML tags, external URLs and characters that are not legal for
	 * the search engine, and duplicate a few tokens (headings, link trails,
	 * possessives) so that they can be matched in more than one form.
	 *
	 * @param string $text Text to clean
	 * @param SearchEngine|null $se Search engine whose legal characters are
	 *  used; when omitted, one is obtained from the service container
	 * @return string Cleaned text
	 */
	public function updateText( $text, SearchEngine $se = null ) {
		$services = MediaWikiServices::getInstance();
		$contLang = $services->getContentLanguage();
		# Language-specific strip/conversion
		$text = $contLang->normalizeForSearch( $text );
		$se = $se ?: $services->newSearchEngine();
		$lc = $se->legalSearchChars() . '&#;';

		# Strip HTML markup
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $contLang->lc( " " . $text . " " ) );
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		# Bracketed external links: drop the URL, keep the label if there is one
		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		# Internal image links
		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $pat2, " \\1 \\3", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		# Handle 's and s'. Both replacements run on the reversed string, so
		# the patterns anchor at their start rather than at their tail:
		# "word's " becomes "word word's " and "words' " becomes "words ".
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		return $text;
	}

	/**
	 * Get the page for the stored id, read with WikiPage::READ_LATEST.
	 *
	 * The lookup runs at most once per instance; a null result (no page with
	 * that id) is remembered as well, so repeated calls from the search type
	 * loop in doUpdate() do not query again.
	 *
	 * @return WikiPage|null
	 */
	private function getLatestPage() {
		if ( !$this->pageLookedUp ) {
			$this->page = WikiPage::newFromID( $this->id, WikiPage::READ_LATEST );
			$this->pageLookedUp = true;
		}

		return $this->page;
	}

	/**
	 * Get a normalized string representation of the title for the search
	 * index: language-normalized, restricted to the engine's legal search
	 * characters, lowercased, with possessives expanded and whitespace
	 * collapsed. For pages in NS_FILE a trailing file extension token is
	 * removed.
	 *
	 * @param SearchEngine $search Search engine the title is normalized for
	 * @return string Title text as returned by $search->normalizeText()
	 */
	private function getNormalizedTitle( SearchEngine $search ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$ns = $this->title->getNamespace();
		$title = $this->title->getText();

		$lc = $search->legalSearchChars() . '&#;';
		$t = $contLang->normalizeForSearch( $title );
		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
		$t = $contLang->lc( $t );

		# Handle 's, s'
		$t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
		$t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

		$t = preg_replace( "/\\s+/", ' ', $t );

		if ( $ns == NS_FILE ) {
			# The dot before the extension was replaced by a space above
			# unless it is a legal search character
			$t = preg_replace( "/ (png|gif|jpg|jpeg|ogg)$/", "", $t );
		}

		return $search->normalizeText( trim( $t ) );
	}
}
Title\newFromText
static newFromText( $text, $defaultNamespace=NS_MAIN)
Create a new Title from text, such as what one would find in a link.
Definition: Title.php:317
SearchEngine\supports
supports( $feature)
Definition: SearchEngine.php:197
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:129
SearchEngine\normalizeText
normalizeText( $string)
When overridden in derived class, performs database-specific conversions on text to be used for searching or updating search index.
Definition: SearchEngine.php:236
DeferrableUpdate\doUpdate
doUpdate()
Perform the actual work.
WikiPage
Class representing a MediaWiki article and history.
Definition: WikiPage.php:45
NS_FILE
const NS_FILE
Definition: Defines.php:66
SearchEngine\legalSearchChars
legalSearchChars( $type=self::CHARS_ALL)
Get chars legal for search.
Definition: SearchEngine.php:267
wfDeprecated
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Throws a warning that $function is deprecated.
Definition: GlobalFunctions.php:1044
$title
$title
Definition: testCompression.php:36
SearchEngine\updateTitle
updateTitle( $id, $title)
Update a search index record's title only.
Definition: SearchEngine.php:467
SearchEngine\update
update( $id, $title, $text)
Create or update the search index record for the given page.
Definition: SearchEngine.php:455
$content
$content
Definition: router.php:78
WikiPage\newFromID
static newFromID( $id, $from='fromdb')
Constructor from a page id.
Definition: WikiPage.php:178
SearchEngine\delete
delete( $id, $title)
Delete an indexed page. Title should be pre-processed.
Definition: SearchEngine.php:479
TextContent
Content object implementation for representing flat text.
Definition: TextContent.php:37
SearchEngine
Contain a class for special pages.
Definition: SearchEngine.php:34
Content
Base interface for content objects.
Definition: Content.php:34
Title
Represents a title within MediaWiki.
Definition: Title.php:42
$t
$t
Definition: testCompression.php:71
DeferrableUpdate
Interface that deferrable updates should implement.
Definition: DeferrableUpdate.php:9
$type
$type
Definition: testCompression.php:50