MediaWiki  master
SearchUpdate.php
Go to the documentation of this file.
1 <?php
27 
/**
 * Deferrable update that pushes a page's title and text to every configured
 * search engine type that supports the 'search-update' feature.
 */
class SearchUpdate implements DeferrableUpdate {
	/** @var int Page id being updated */
	private $id = 0;

	/** @var Title Title we're updating */
	private $title;

	/** @var Content|null Content of the page (not text) */
	private $content;

	/** @var WikiPage|null Memoized result of the page lookup; null when the lookup found no page */
	private $page;

	/**
	 * @var bool Whether the page lookup has already been performed. Tracked
	 *  separately because a failed lookup legitimately leaves $page as null.
	 */
	private $pageLoaded = false;

	/**
	 * @param int $id Page id to update
	 * @param Title|string $title Title of the page. Passing a string is
	 *  deprecated since 1.34; it is converted with Title::newFromText().
	 * @param Content|string|bool|null $c Content of the page to index. Strings
	 *  (wrapped in a TextContent) and booleans (treated as null) are deprecated
	 *  since 1.34.
	 * @throws InvalidArgumentException If a string title cannot be turned into a Title
	 */
	public function __construct( $id, $title, $c = null ) {
		if ( is_string( $title ) ) {
			wfDeprecated( __METHOD__ . " with a string for the title", '1.34' );
			$this->title = Title::newFromText( $title );
			if ( $this->title === null ) {
				throw new InvalidArgumentException( "Cannot construct the title: $title" );
			}
		} else {
			$this->title = $title;
		}

		$this->id = $id;
		// is_string() check is back-compat for ApprovedRevs
		if ( is_string( $c ) ) {
			wfDeprecated( __METHOD__ . " with a string for the content", '1.34' );
			$c = new TextContent( $c );
		} elseif ( is_bool( $c ) ) {
			wfDeprecated( __METHOD__ . " with a boolean for the content", '1.34' );
			$c = null;
		}
		$this->content = $c;
	}

	/**
	 * Perform actual update for the entry.
	 *
	 * Does nothing when 'DisableSearchUpdate' is set or the page id is falsy.
	 * Otherwise, for each search type that supports 'search-update':
	 *  - the entry is deleted if the page can no longer be loaded by id,
	 *  - only the title is updated if no content was supplied,
	 *  - otherwise title and cleaned text are both updated.
	 */
	public function doUpdate() {
		$services = MediaWikiServices::getInstance();
		$config = $services->getSearchEngineConfig();

		if ( $config->getConfig()->get( 'DisableSearchUpdate' ) || !$this->id ) {
			return;
		}

		$seFactory = $services->getSearchEngineFactory();
		foreach ( $config->getSearchTypes() as $type ) {
			$search = $seFactory->create( $type );
			if ( !$search->supports( 'search-update' ) ) {
				continue;
			}

			$normalTitle = $this->getNormalizedTitle( $search );

			if ( $this->getLatestPage() === null ) {
				$search->delete( $this->id, $normalTitle );
				continue;
			}
			if ( $this->content === null ) {
				$search->updateTitle( $this->id, $normalTitle );
				continue;
			}

			// Content is known to be non-null here, so no fallback text is needed.
			$text = $this->updateText( $this->content->getTextForSearchIndex(), $search );

			# Perform the actual update
			$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
		}
	}

	/**
	 * Clean text for indexing.
	 *
	 * Applies the content language's search normalization, lower-cases the
	 * text, strips HTML-like tags, external URLs and image link syntax, and
	 * finally replaces every run of characters that are not legal search
	 * characters (plus '&', '#', ';') with a single space.
	 *
	 * @param string $text Text to clean
	 * @param SearchEngine|null $se Search engine whose legal characters are
	 *  used; a new default engine is created when omitted
	 * @return string Cleaned text
	 */
	public function updateText( $text, SearchEngine $se = null ) {
		$services = MediaWikiServices::getInstance();
		$contLang = $services->getContentLanguage();
		# Language-specific strip/conversion
		$text = $contLang->normalizeForSearch( $text );
		$se = $se ?: $services->newSearchEngine();
		$lc = $se->legalSearchChars() . '&#;';

		# Strip HTML markup
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $contLang->lc( " " . $text . " " ) );
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		# Bracketed external links: drop the URL, keep any link label
		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		# Internal image links
		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $pat2, " \\1 \\3", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		# Handle 's, s'. The text is reversed so that the trailing possessive
		# can be matched at the start of the word: "word's " becomes
		# "word word's " and "words' " becomes "words ".
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		return $text;
	}

	/**
	 * Get WikiPage for the SearchUpdate $id using WikiPage::READ_LATEST,
	 * reusing the same result for every search engine type handled by
	 * doUpdate().
	 *
	 * The lookup is performed at most once per instance. A "not found" result
	 * (null) is remembered too, so a missing page does not trigger a fresh
	 * lookup for each search type.
	 *
	 * @return WikiPage|null Null when WikiPage::newFromID() finds no page
	 */
	private function getLatestPage() {
		if ( !$this->pageLoaded ) {
			$this->page = WikiPage::newFromID( $this->id, WikiPage::READ_LATEST );
			$this->pageLoaded = true;
		}

		return $this->page;
	}

	/**
	 * Get a normalized string representation of the title suitable for
	 * including in a search index.
	 *
	 * @param SearchEngine $search Engine providing the legal character set
	 *  and the final normalizeText() pass
	 * @return string Normalized title text
	 */
	private function getNormalizedTitle( SearchEngine $search ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$ns = $this->title->getNamespace();
		$title = $this->title->getText();

		$lc = $search->legalSearchChars() . '&#;';
		$t = $contLang->normalizeForSearch( $title );
		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
		$t = $contLang->lc( $t );

		# Handle 's, s'
		$t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
		$t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

		$t = preg_replace( "/\\s+/", ' ', $t );

		# For files, drop a trailing extension word (the '.' was already
		# turned into a space above if it is not a legal search character)
		if ( $ns == NS_FILE ) {
			$t = preg_replace( "/ (png|gif|jpg|jpeg|ogg)$/", "", $t );
		}

		return $search->normalizeText( trim( $t ) );
	}
}
getText()
Get the text form (spaces not underscores) of the main part.
Definition: Title.php:998
doUpdate()
Perform actual update for the entry.
legalSearchChars( $type=self::CHARS_ALL)
Get chars legal for search.
int $id
Page id being updated.
const NS_FILE
Definition: Defines.php:66
getNormalizedTitle(SearchEngine $search)
Get a normalized string representation of a title suitable for including in a search index...
WikiPage $page
static newFromID( $id, $from='fromdb')
Constructor from a page id.
Definition: WikiPage.php:180
Content|null $content
Content of the page (not text)
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Throws a warning that $function is deprecated.
Title $title
Title we're updating.
normalizeText( $string)
When overridden in derived class, performs database-specific conversions on text to be used for searc...
getLatestPage()
Get WikiPage for the SearchUpdate $id using WikiPage::READ_LATEST and ensure using the same WikiPage ...
__construct( $id, $title, $c=null)
updateText( $text, SearchEngine $se=null)
Clean text for indexing.
static newFromText( $text, $defaultNamespace=NS_MAIN)
Create a new Title from text, such as what one would find in a link.
Definition: Title.php:319