MediaWiki  master
MediaWikiPageNameNormalizer.php
Go to the documentation of this file.
1 <?php
21 namespace MediaWiki\Site;
22 
23 use FormatJson;
24 use InvalidArgumentException;
27 use UtfNormal\Validator;
28 
39 
/**
 * Redirect mode for normalizePageName(): the 'redirects' API option is sent as true.
 */
public const FOLLOW_REDIRECT = 1;

/**
 * Redirect mode for normalizePageName(): the 'redirects' API option is sent as false.
 */
public const NOFOLLOW_REDIRECT = 2;

/**
 * Factory used by normalizePageName() to perform the HTTP GET request.
 *
 * @var HttpRequestFactory
 */
private $httpRequestFactory;

/**
 * @param HttpRequestFactory|null $httpRequestFactory Factory to use for HTTP
 *  requests. Any value that is not an HttpRequestFactory instance (including
 *  the default null) is replaced by the factory obtained from MediaWikiServices.
 */
public function __construct( $httpRequestFactory = null ) {
	$this->httpRequestFactory = $httpRequestFactory instanceof HttpRequestFactory
		? $httpRequestFactory
		: MediaWikiServices::getInstance()->getHttpRequestFactory();
}
57 
/**
 * Returns the normalized form of the given page title, using the
 * normalization rules of the given site.
 *
 * The title is cleaned up to NFC, sent to the action API at $apiUrl as a
 * `action=query&prop=info` request (with `converttitles` enabled), and the
 * title of the page record found in the response is returned.
 *
 * @param string $pageName Page name to normalize
 * @param string $apiUrl URL the query parameters are appended to
 * @param int $followRedirect Either self::FOLLOW_REDIRECT or self::NOFOLLOW_REDIRECT
 *
 * @return string|false The normalized title, or false if the request failed,
 *  the response was not valid JSON, the page is marked missing or invalid,
 *  or no title could be extracted from the response.
 *
 * @throws \MWException If $pageName is not a string
 * @throws InvalidArgumentException If $followRedirect is not one of the two constants
 */
public function normalizePageName( $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
	if ( !is_string( $pageName ) ) {
		throw new \MWException( '$pageName must be a string' );
	}

	if ( $followRedirect === self::FOLLOW_REDIRECT ) {
		$redirects = true;
	} elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
		$redirects = false;
	} else {
		// Arrays and objects cannot be safely concatenated into the message,
		// so fall back to the type name for anything that is not a scalar.
		$shown = is_scalar( $followRedirect ) ? (string)$followRedirect : gettype( $followRedirect );
		throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $shown );
	}

	// Make sure the string is normalized into NFC (due to T42017)
	// but do nothing to the whitespaces, that should work appropriately.
	// @see https://phabricator.wikimedia.org/T42017
	$pageName = Validator::cleanUp( $pageName );

	// Build the args for the specific call
	$args = [
		'action' => 'query',
		'prop' => 'info',
		'redirects' => $redirects,
		'converttitles' => true,
		'format' => 'json',
		'titles' => $pageName,
		// @todo options for maxlag and maxage
		// Note that maxlag will lead to a long delay before a reply is made,
		// but that maxage can avoid the extreme delay. On the other hand
		// maxage could be nice to use anyhow as it stops unnecessary requests.
		// Also consider smaxage if maxage is used.
	];

	$url = wfAppendQuery( $apiUrl, $args );

	// Go on call the external site
	// @todo we need a good way to specify a timeout here.
	$ret = $this->httpRequestFactory->get( $url, [], __METHOD__ );

	if ( $ret === null ) {
		wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
		return false;
	}

	$data = FormatJson::decode( $ret, true );

	if ( !is_array( $data ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
		return false;
	}

	// extractPageRecord() is private, so it must be resolved against this
	// class (self::) rather than through late static binding (static::).
	// It may return false; the isset() checks below then all fail and the
	// "did not return a page title" branch reports the problem.
	$page = self::extractPageRecord( $data, $pageName );

	if ( isset( $page['missing'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
			. $ret );
		return false;
	}

	if ( isset( $page['invalid'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
			. $ret );
		return false;
	}

	if ( !isset( $page['title'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
		return false;
	}

	return $page['title'];
}
158 
/**
 * Finds the page record for a given title in a decoded API query response.
 *
 * If the response contains exactly one page, that page is returned directly.
 * Otherwise the title is traced through the 'normalized', 'converted' and
 * 'redirects' lists (each mapping 'from' to 'to'), and the entry of 'pages'
 * whose 'title' equals the resulting title is returned.
 *
 * @param array $externalData Decoded response; must contain a 'query' element
 * @param string $pageTitle Title that was requested
 *
 * @return array|false The page record, or false if the response has no usable
 *  'query' structure, or if the title matches no entry or more than one entry.
 */
private static function extractPageRecord( $externalData, $pageTitle ) {
	// Make initial checks and return if prerequisites are not met.
	if ( !is_array( $externalData ) || !isset( $externalData['query'] )
		|| !is_array( $externalData['query'] )
	) {
		return false;
	}
	$query = $externalData['query'];

	// If there is a special case with only one returned page
	// we can cheat, and only return
	// the single page in the "pages" substructure.
	if ( isset( $query['pages'] ) && is_array( $query['pages'] ) && count( $query['pages'] ) === 1 ) {
		return array_values( $query['pages'] )[0];
	}

	// Loop over the three different named structures, that otherwise are similar
	$structs = [
		'normalized' => 'from',
		'converted' => 'from',
		'redirects' => 'from',
		'pages' => 'title'
	];
	foreach ( $structs as $listId => $fieldId ) {
		// Skip substructures that are absent or not lists at all.
		if ( !isset( $query[$listId] ) || !is_array( $query[$listId] ) ) {
			continue;
		}

		// Filter the substructure down to the entries for the current title.
		// array_filter() preserves the original keys, so the result has to be
		// re-indexed before the first hit can be addressed as element 0.
		$hits = array_values( array_filter(
			$query[$listId],
			static function ( $entry ) use ( $fieldId, $pageTitle ) {
				return isset( $entry[$fieldId] ) && $entry[$fieldId] === $pageTitle;
			}
		) );

		// More than one entry for the same title is ambiguous.
		if ( count( $hits ) > 1 ) {
			return false;
		}

		if ( $fieldId === 'title' ) {
			// On the pages structure: return the single hit, if any.
			return $hits[0] ?? false;
		}

		// Still looping over normalization, conversion or redirects:
		// keep the new page title for the later rounds.
		if ( $hits !== [] ) {
			if ( !isset( $hits[0]['to'] ) ) {
				return false;
			}
			$pageTitle = $hits[0]['to'];
		}
	}

	// Reached when the response has no 'pages' list.
	return false;
}
230 
231 }
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfAppendQuery( $url, $query)
Append a query string to an existing URL, which may or may not already have query string parameters.
JSON formatter wrapper class.
Definition: FormatJson.php:26
static decode( $value, $assoc=false)
Decodes a JSON string.
Definition: FormatJson.php:146
Factory creating MWHttpRequest objects.
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Service for normalizing a page name via a MediaWiki action API.
normalizePageName( $pageName, $apiUrl, $followRedirect=self::FOLLOW_REDIRECT)
Returns the normalized form of the given page title, using the normalization rules of the given site.