MediaWiki  master
MediaWikiPageNameNormalizer.php
Go to the documentation of this file.
1 <?php
2 
3 namespace MediaWiki\Site;
4 
5 use FormatJson;
6 use Http;
7 use InvalidArgumentException;
8 use UtfNormal\Validator;
9 
37 
// Values accepted by the $followRedirect argument of normalizePageName():
// FOLLOW_REDIRECT sends 'redirects' => true in the API query, while
// NOFOLLOW_REDIRECT sends 'redirects' => false. Any other value makes
// normalizePageName() throw an InvalidArgumentException.
38  public const FOLLOW_REDIRECT = 1;
39  public const NOFOLLOW_REDIRECT = 2;
40 
// HTTP client used by normalizePageName() to query the remote API.
// It is assigned once, in the constructor.
44  private $http;
45 
/**
 * Set up the normalizer with the HTTP client it should use.
 *
 * @param Http|null $http Client used to call the remote API. When omitted
 *  (or null) a fresh Http instance is created.
 */
public function __construct( ?Http $http = null ) {
	// The nullable type is spelled out explicitly: relying on "= null" to make
	// a typed parameter nullable is deprecated as of PHP 8.4.
	$this->http = $http ?? new Http();
}
56 
/**
 * Returns the normalized form of the given page title, using the
 * normalization rules of the given site.
 *
 * The title is sent to the API at $apiUrl as an action=query request and
 * the title of the page record found in the response is returned.
 *
 * @param string $pageName Title to normalize.
 * @param string $apiUrl URL of the API to query; the query parameters are
 *  appended to it.
 * @param int $followRedirect Either self::FOLLOW_REDIRECT or
 *  self::NOFOLLOW_REDIRECT.
 *
 * @return string|bool The normalized title, or false when the request
 *  failed, the response was not usable JSON, or the page record is marked
 *  missing/invalid or carries no title.
 * @throws \MWException If $pageName is not a string.
 * @throws InvalidArgumentException If $followRedirect is neither constant.
 */
public function normalizePageName( $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
	// Only the page name is type-checked here.
	if ( !is_string( $pageName ) ) {
		throw new \MWException( '$pageName must be a string' );
	}

	// Strict comparison on purpose: only the two class constants are valid.
	$resolveRedirects = ( $followRedirect === self::FOLLOW_REDIRECT );
	if ( !$resolveRedirects && $followRedirect !== self::NOFOLLOW_REDIRECT ) {
		throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $followRedirect );
	}

	// Make sure the string is normalized into NFC (due to T42017)
	// but do nothing to the whitespaces, that should work appropriately.
	// @see https://phabricator.wikimedia.org/T42017
	$cleanName = Validator::cleanUp( $pageName );

	// @todo options for maxlag and maxage
	// Note that maxlag will lead to a long delay before a reply is made,
	// but that maxage can avoid the extreme delay. On the other hand
	// maxage could be nice to use anyhow as it stops unnecessary requests.
	// Also consider smaxage if maxage is used.
	$url = wfAppendQuery( $apiUrl, [
		'action' => 'query',
		'prop' => 'info',
		'redirects' => $resolveRedirects,
		'converttitles' => true,
		'format' => 'json',
		'titles' => $cleanName,
	] );

	// Call the external site.
	// @todo we need a good way to specify a timeout here.
	$ret = $this->http->get( $url, [], __METHOD__ );
	if ( $ret === false ) {
		wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
		return false;
	}

	$decoded = FormatJson::decode( $ret, true );
	if ( !is_array( $decoded ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
		return false;
	}

	$record = static::extractPageRecord( $decoded, $cleanName );

	// Work out which failure applies, if any; the checks run in the same
	// order as before, so a record with several markers logs the first one.
	if ( isset( $record['missing'] ) ) {
		$failure = "call to <$url> returned a marker for a missing page title! ";
	} elseif ( isset( $record['invalid'] ) ) {
		$failure = "call to <$url> returned a marker for an invalid page title! ";
	} elseif ( !isset( $record['title'] ) ) {
		$failure = "call to <$url> did not return a page title! ";
	} else {
		return $record['title'];
	}

	wfDebugLog( "MediaWikiSite", $failure . $ret );
	return false;
}
157 
/**
 * Get normalization record for a given page title from an API response.
 *
 * When the response holds exactly one page, that page is returned directly.
 * Otherwise the title is followed through the 'normalized', 'converted' and
 * 'redirects' lists (each mapping 'from' => 'to'), and the entry of 'pages'
 * whose 'title' equals the resulting title is returned.
 *
 * @param array $externalData Decoded API response.
 * @param string $pageTitle Title to look up, as it was sent to the API.
 *
 * @return array|bool The matching page record, or false if the response is
 *  malformed, no page matches, or a lookup step is ambiguous.
 */
private static function extractPageRecord( $externalData, $pageTitle ) {
	// Nothing can be extracted without a 'query' structure.
	if ( !is_array( $externalData )
		|| !isset( $externalData['query'] )
		|| !is_array( $externalData['query'] )
	) {
		return false;
	}
	$query = $externalData['query'];

	// Shortcut: with a single returned page there is nothing to disambiguate.
	if ( isset( $query['pages'] ) && is_array( $query['pages'] ) && count( $query['pages'] ) === 1 ) {
		return array_values( $query['pages'] )[0];
	}

	// Loop over the three title-mapping lists and finally the page list;
	// apart from the name of the field to match on, they are handled alike.
	$structs = [
		'normalized' => 'from',
		'converted' => 'from',
		'redirects' => 'from',
		'pages' => 'title'
	];
	foreach ( $structs as $listId => $fieldId ) {
		// Skip sub-structures that are absent or not lists.
		if ( !isset( $query[$listId] ) || !is_array( $query[$listId] ) ) {
			continue;
		}

		// array_filter() preserves keys, so re-index the result: the single
		// match is then always at offset 0, wherever it sat in the list.
		$hits = array_values( array_filter(
			$query[$listId],
			static function ( $entry ) use ( $fieldId, $pageTitle ) {
				// Entries lacking the field simply do not match.
				return isset( $entry[$fieldId] ) && $entry[$fieldId] === $pageTitle;
			}
		) );
		$hitCount = count( $hits );

		if ( $fieldId === 'title' ) {
			// On the pages structure: only a unique match is usable.
			return $hitCount === 1 ? $hits[0] : false;
		}

		// Still on normalization, conversion or redirects.
		if ( $hitCount > 1 ) {
			// Ambiguous mapping.
			return false;
		}
		if ( $hitCount === 1 ) {
			if ( !isset( $hits[0]['to'] ) ) {
				// Malformed mapping entry: no target title to continue with.
				return false;
			}
			// Keep the new page title for the later rounds.
			$pageTitle = $hits[0]['to'];
		}
	}

	// Reached when the response has no usable 'pages' list.
	return false;
}
229 
230 }
MediaWiki\Site\MediaWikiPageNameNormalizer\__construct
__construct(Http $http=null)
Definition: MediaWikiPageNameNormalizer.php:49
MediaWiki\Site\MediaWikiPageNameNormalizer\extractPageRecord
static extractPageRecord( $externalData, $pageTitle)
Get normalization record for a given page title from an API response.
Definition: MediaWikiPageNameNormalizer.php:166
MediaWiki\Site\MediaWikiPageNameNormalizer\$http
Http $http
Definition: MediaWikiPageNameNormalizer.php:44
wfDebugLog
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Definition: GlobalFunctions.php:958
wfAppendQuery
wfAppendQuery( $url, $query)
Append a query string to an existing URL, which may or may not already have query string parameters; if it does, they are combined.
Definition: GlobalFunctions.php:422
FormatJson\decode
static decode( $value, $assoc=false)
Decodes a JSON string.
Definition: FormatJson.php:146
FormatJson
JSON formatter wrapper class.
Definition: FormatJson.php:26
$args
if( $line===false) $args
Definition: mcc.php:124
MediaWiki\Site\MediaWikiPageNameNormalizer\NOFOLLOW_REDIRECT
const NOFOLLOW_REDIRECT
Definition: MediaWikiPageNameNormalizer.php:39
MediaWiki\Site\MediaWikiPageNameNormalizer
Service for normalizing a page name using a MediaWiki api.
Definition: MediaWikiPageNameNormalizer.php:36
MediaWiki\Site\MediaWikiPageNameNormalizer\FOLLOW_REDIRECT
const FOLLOW_REDIRECT
Definition: MediaWikiPageNameNormalizer.php:38
Http
Various HTTP related functions.
Definition: Http.php:28
MediaWiki\Site
Definition: MediaWikiPageNameNormalizer.php:3
MediaWiki\Site\MediaWikiPageNameNormalizer\normalizePageName
normalizePageName( $pageName, $apiUrl, $followRedirect=self::FOLLOW_REDIRECT)
Returns the normalized form of the given page title, using the normalization rules of the given site.
Definition: MediaWikiPageNameNormalizer.php:82