MediaWiki master
MediaWikiPageNameNormalizer.php
Go to the documentation of this file.
1<?php
21namespace MediaWiki\Site;
22
23use FormatJson;
24use InvalidArgumentException;
27use UtfNormal\Validator;
28
39
// Value for the $followRedirect argument of normalizePageName(): the API
// query is sent with the 'redirects' option enabled.
40 public const FOLLOW_REDIRECT = 1;
// Value for the $followRedirect argument of normalizePageName(): the API
// query is sent with the 'redirects' option disabled.
41 public const NOFOLLOW_REDIRECT = 2;
42
// HTTP request factory, set by the constructor and used by
// normalizePageName() to fetch the remote API response.
46 private $httpRequestFactory;
47
/**
 * Set up the normalizer with the HTTP request factory it should use.
 *
 * @param HttpRequestFactory|null $httpRequestFactory Factory used for the
 *  outgoing API request. Any value that is not an HttpRequestFactory
 *  instance (including the default null) is replaced by the factory
 *  obtained from the global MediaWikiServices instance.
 */
public function __construct( $httpRequestFactory = null ) {
	$this->httpRequestFactory = $httpRequestFactory instanceof HttpRequestFactory
		? $httpRequestFactory
		: MediaWikiServices::getInstance()->getHttpRequestFactory();
}
57
/**
 * Returns the normalized form of the given page title, using the
 * normalization rules of the given site.
 *
 * The title is cleaned up to NFC, then looked up with an action=query
 * request (prop=info, converttitles) against the given API URL. Failures
 * are written to the "MediaWikiSite" debug log.
 *
 * @param string $pageName Page title to normalize
 * @param string $apiUrl URL of the remote API; the query parameters are
 *  appended to it
 * @param int $followRedirect Either self::FOLLOW_REDIRECT (default) or
 *  self::NOFOLLOW_REDIRECT
 * @return string|false The title reported by the remote API, or false if
 *  the request failed, the response was not a JSON array, the page was
 *  marked missing or invalid, or no title was returned
 * @throws InvalidArgumentException If $followRedirect is neither of the
 *  two class constants
 */
public function normalizePageName( string $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
	if ( $followRedirect === self::FOLLOW_REDIRECT ) {
		$redirects = true;
	} elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
		$redirects = false;
	} else {
		// Only scalars can be concatenated safely. For arrays, objects and
		// null report the type instead, so that the caller always gets the
		// InvalidArgumentException rather than a conversion error.
		$shown = is_scalar( $followRedirect ) ? $followRedirect : gettype( $followRedirect );
		throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $shown );
	}

	// Make sure the string is normalized into NFC (due to T42017)
	// but do nothing to the whitespaces, that should work appropriately.
	// @see https://phabricator.wikimedia.org/T42017
	$pageName = Validator::cleanUp( $pageName );

	// Build the args for the specific call
	$args = [
		'action' => 'query',
		'prop' => 'info',
		'redirects' => $redirects,
		'converttitles' => true,
		'format' => 'json',
		'titles' => $pageName,
		// @todo options for maxlag and maxage
		// Note that maxlag will lead to a long delay before a reply is made,
		// but that maxage can avoid the extreme delay. On the other hand
		// maxage could be nice to use anyhow as it stops unnecessary requests.
		// Also consider smaxage if maxage is used.
	];

	$url = wfAppendQuery( $apiUrl, $args );

	// Go on call the external site
	// @todo we need a good way to specify a timeout here.
	$ret = $this->httpRequestFactory->get( $url, [], __METHOD__ );

	if ( $ret === null ) {
		wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
		return false;
	}

	$data = FormatJson::decode( $ret, true );

	if ( !is_array( $data ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
		return false;
	}

	// extractPageRecord() is private, so it must be addressed through self::
	// and not through late static binding.
	$page = self::extractPageRecord( $data, $pageName );

	if ( isset( $page['missing'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
			. $ret );
		return false;
	}

	if ( isset( $page['invalid'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
			. $ret );
		return false;
	}

	// Also covers the case where no page record could be extracted at all.
	if ( !isset( $page['title'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
		return false;
	}

	return $page['title'];
}
151
/**
 * Pick the page record for a given title out of a decoded API response.
 *
 * If the response contains exactly one page, that page is returned
 * without looking at the title. Otherwise the title is followed through
 * the "normalized", "converted" and "redirects" lists (each maps a 'from'
 * title to a 'to' title) and then looked up in "pages" by its 'title'.
 *
 * @param array $externalData Decoded response of the API query
 * @param string $pageTitle Title that was sent to the API
 * @return array|false The matching entry of query.pages, or false if the
 *  data has no usable "query" structure or the title does not resolve to
 *  exactly one page
 */
private static function extractPageRecord( $externalData, $pageTitle ) {
	// If there is a special case with only one returned page
	// we can cheat, and only return
	// the single page in the "pages" substructure.
	if ( isset( $externalData['query']['pages'] ) && is_array( $externalData['query']['pages'] ) ) {
		$pages = array_values( $externalData['query']['pages'] );
		if ( count( $pages ) === 1 ) {
			return $pages[0];
		}
	}

	// Make initial checks and return if the prerequisites are not met.
	if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
		return false;
	}

	// Loop over the three differently named structures that map one title
	// to another, and finally over the pages themselves.
	$structs = [
		'normalized' => 'from',
		'converted' => 'from',
		'redirects' => 'from',
		'pages' => 'title'
	];
	foreach ( $structs as $listId => $fieldId ) {
		// Skip the substructure if it does not exist or is not a list.
		$list = $externalData['query'][$listId] ?? null;
		if ( !is_array( $list ) ) {
			continue;
		}

		// Filter the substructure down to the entries for the current title.
		// array_filter() preserves keys, so reindex the result; otherwise
		// index 0 only exists when the hit happens to be the first entry.
		$collectedHits = array_values( array_filter(
			$list,
			static function ( $a ) use ( $fieldId, $pageTitle ) {
				return is_array( $a ) && isset( $a[$fieldId] ) && $a[$fieldId] === $pageTitle;
			}
		) );
		$hitCount = count( $collectedHits );

		if ( $fieldId === 'from' ) {
			// Still looping over normalization, conversion or redirects:
			// keep the new page title for the later rounds.
			if ( $hitCount > 1 ) {
				// Ambiguous mapping
				return false;
			}
			if ( $hitCount === 1 ) {
				if ( !isset( $collectedHits[0]['to'] ) ) {
					// A mapping without a target cannot be followed.
					return false;
				}
				$pageTitle = $collectedHits[0]['to'];
			}
			continue;
		}

		// On the pages structure: only an unambiguous hit is a result.
		return $hitCount === 1 ? $collectedHits[0] : false;
	}

	// No "pages" structure was present.
	return false;
}
223
224}
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfAppendQuery( $url, $query)
Append a query string to an existing URL, which may or may not already have query string parameters; if it does, the new parameters are combined with them.
JSON formatter wrapper class.
Factory creating MWHttpRequest objects.
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Service for normalizing a page name via a MediaWiki action API.
normalizePageName(string $pageName, $apiUrl, $followRedirect=self::FOLLOW_REDIRECT)
Returns the normalized form of the given page title, using the normalization rules of the given site.