MediaWiki  master
MediaWikiPageNameNormalizer.php
Go to the documentation of this file.
1 <?php
2 
3 namespace MediaWiki\Site;
4 
5 use FormatJson;
6 use Http;
8 
36 
40  private $http;
41 
/**
 * @param Http|null $http HTTP client used for the API request made by
 *  normalizePageName(). A new Http instance is created when none is given.
 */
public function __construct( Http $http = null ) {
	// Objects are always truthy, so this only falls back when $http is null.
	$this->http = $http ?: new Http();
}
52 
/**
 * Returns the normalized form of the given page title, as reported by the
 * MediaWiki API at $apiUrl.
 *
 * An "action=query" request with redirect resolution and title conversion
 * enabled is sent to the API, and the title of the resulting page record is
 * returned.
 *
 * @param string $pageName Page title to normalize.
 * @param string $apiUrl URL of the remote API; the query parameters are
 *  appended to it.
 *
 * @return string|false The title from the API response, or false when the
 *  request fails, the response does not decode to an array, the page is
 *  flagged as missing or invalid, or no title is present in the response.
 * @throws \MWException If $pageName is not a string.
 */
public function normalizePageName( $pageName, $apiUrl ) {
	if ( !is_string( $pageName ) ) {
		throw new \MWException( '$pageName must be a string' );
	}

	// Make sure the string is normalized into NFC (due to T42017)
	// but leave the whitespace alone.
	// @see https://phabricator.wikimedia.org/T42017
	$cleanName = Validator::cleanUp( $pageName );

	// @todo options for maxlag and maxage
	// Note that maxlag will lead to a long delay before a reply is made,
	// but that maxage can avoid the extreme delay. On the other hand
	// maxage could be nice to use anyhow as it stops unnecessary requests.
	// Also consider smaxage if maxage is used.
	$url = wfAppendQuery( $apiUrl, [
		'action' => 'query',
		'prop' => 'info',
		'redirects' => true,
		'converttitles' => true,
		'format' => 'json',
		'titles' => $cleanName,
	] );

	// Call the external site.
	// @todo we need a good way to specify a timeout here.
	$ret = $this->http->get( $url, [], __METHOD__ );
	if ( $ret === false ) {
		wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
		return false;
	}

	$data = FormatJson::decode( $ret, true );
	if ( !is_array( $data ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
		return false;
	}

	$page = static::extractPageRecord( $data, $cleanName );

	// Markers that disqualify the page record, checked in this order, each
	// with the message logged when it is present.
	$rejections = [
		'missing' => "call to <$url> returned a marker for a missing page title! ",
		'invalid' => "call to <$url> returned a marker for an invalid page title! ",
	];
	foreach ( $rejections as $marker => $message ) {
		if ( isset( $page[$marker] ) ) {
			wfDebugLog( "MediaWikiSite", $message . $ret );
			return false;
		}
	}

	if ( isset( $page['title'] ) ) {
		return $page['title'];
	}

	wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
	return false;
}
141 
/**
 * Get normalization record for a given page title from an API response.
 *
 * The title is followed through the "normalized", "converted" and
 * "redirects" lists of the response (each maps a "from" title to a "to"
 * title), and the resulting title is then looked up in the "pages" list.
 *
 * @param array $externalData Decoded API response.
 * @param string $pageTitle Title that was sent to the API.
 *
 * @return array|bool The matching entry of the "pages" list, or false if
 *  the response has no "query" structure or no unambiguous match exists.
 */
private static function extractPageRecord( $externalData, $pageTitle ) {
	// Nothing can be extracted without a "query" structure.
	if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
		return false;
	}

	// Special case: if exactly one page was returned we can cheat and
	// return that single entry of the "pages" substructure directly.
	if ( isset( $externalData['query']['pages'] ) ) {
		$pages = array_values( $externalData['query']['pages'] );
		if ( count( $pages ) === 1 ) {
			return $pages[0];
		}
	}

	// Loop over the three differently named structures that are otherwise
	// similar, followed by the "pages" structure itself. The value is the
	// name of the field that is compared against the current title.
	$structs = [
		'normalized' => 'from',
		'converted' => 'from',
		'redirects' => 'from',
		'pages' => 'title'
	];
	foreach ( $structs as $listId => $fieldId ) {
		// Check if the substructure exists at all.
		if ( !isset( $externalData['query'][$listId] ) ) {
			continue;
		}

		// Filter the substructure down to the entries for the current
		// title. array_filter() preserves keys, so the result has to be
		// reindexed before the first hit can be read at offset 0.
		$collectedHits = array_values( array_filter(
			$externalData['query'][$listId],
			function ( $a ) use ( $fieldId, $pageTitle ) {
				return isset( $a[$fieldId] ) && $a[$fieldId] === $pageTitle;
			}
		) );
		$hitCount = count( $collectedHits );

		if ( $fieldId === 'title' ) {
			// On the pages structure: only a unique match is a valid result.
			return $hitCount === 1 ? $collectedHits[0] : false;
		}

		// Still looping over normalization, conversion or redirects.
		if ( $hitCount > 1 ) {
			// Ambiguous mapping, give up.
			return false;
		}
		if ( $hitCount === 1 ) {
			if ( !isset( $collectedHits[0]['to'] ) ) {
				// A mapping without a target cannot lead to a page.
				return false;
			}
			// Keep the new page title for the later rounds.
			$pageTitle = $collectedHits[0]['to'];
		}
	}

	// Reached when the response has no "pages" structure at all.
	return false;
}
213 
214 }
if( $line===false) $args
Definition: cdb.php:64
wfAppendQuery( $url, $query)
Append a query string to an existing URL, which may or may not already have query string parameters a...
static decode( $value, $assoc=false)
Decodes a JSON string.
Definition: FormatJson.php:174
Various HTTP related functions.
Definition: Http.php:29
Service for normalizing a page name using a MediaWiki api.
static extractPageRecord( $externalData, $pageTitle)
Get normalization record for a given page title from an API response.
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not...
normalizePageName( $pageName, $apiUrl)
Returns the normalized form of the given page title, using the normalization rules of the given site...