MediaWiki REL1_37
MediaWikiPageNameNormalizer.php
Go to the documentation of this file.
1<?php
2
3namespace MediaWiki\Site;
4
5use FormatJson;
6use Http;
7use InvalidArgumentException;
8use UtfNormal\Validator;
9
37
/**
 * Value for the $followRedirect argument of normalizePageName():
 * the API request is sent with 'redirects' => true.
 */
38 public const FOLLOW_REDIRECT = 1;
/**
 * Value for the $followRedirect argument of normalizePageName():
 * the API request is sent with 'redirects' => false.
 */
39 public const NOFOLLOW_REDIRECT = 2;
40
/**
 * @var Http Helper whose get() method is used by normalizePageName()
 *  to fetch the API response.
 */
44 private $http;
45
/**
 * @param Http|null $http Helper used to perform the API request. When
 *  omitted (or null), a fresh Http instance is created.
 */
public function __construct( ?Http $http = null ) {
	// The nullable type is spelled out explicitly: deriving nullability
	// from the "= null" default alone is deprecated as of PHP 8.4.
	$this->http = $http ?? new Http();
}
56
/**
 * Returns the normalized form of the given page title, as reported by the
 * MediaWiki API reachable at $apiUrl.
 *
 * The title is sent in an action=query&prop=info request (with title
 * conversion enabled and, optionally, redirect resolution), and the title
 * of the page record found in the response is returned.
 *
 * @param string $pageName Page title to normalize
 * @param string $apiUrl URL of the remote api.php; the query parameters
 *  are appended to it
 * @param int $followRedirect Either self::FOLLOW_REDIRECT (default) or
 *  self::NOFOLLOW_REDIRECT
 *
 * @return string|false The normalized title, or false when the request
 *  failed, the response was not a JSON structure, the page was flagged as
 *  missing or invalid, or no title could be extracted from the response.
 *
 * @throws \MWException If $pageName is not a string
 * @throws InvalidArgumentException If $followRedirect is neither of the
 *  two class constants
 */
public function normalizePageName( $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
	// Only the page name is type-checked here; $apiUrl is passed on as-is.
	if ( !is_string( $pageName ) ) {
		throw new \MWException( '$pageName must be a string' );
	}

	if ( $followRedirect === self::FOLLOW_REDIRECT ) {
		$redirects = true;
	} elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
		$redirects = false;
	} else {
		// Arrays and objects cannot be safely concatenated into the
		// message, so report their type instead of their value. This keeps
		// the error path from failing before the exception is thrown.
		$shown = is_scalar( $followRedirect ) ? $followRedirect : gettype( $followRedirect );
		throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $shown );
	}

	// Make sure the string is normalized into NFC (due to T42017)
	// but do nothing to the whitespaces, that should work appropriately.
	// @see https://phabricator.wikimedia.org/T42017
	$pageName = Validator::cleanUp( $pageName );

	// Build the args for the specific call
	$args = [
		'action' => 'query',
		'prop' => 'info',
		'redirects' => $redirects,
		'converttitles' => true,
		'format' => 'json',
		'titles' => $pageName,
		// @todo options for maxlag and maxage
		// Note that maxlag will lead to a long delay before a reply is made,
		// but that maxage can avoid the extreme delay. On the other hand
		// maxage could be nice to use anyhow as it stops unnecessary requests.
		// Also consider smaxage if maxage is used.
	];

	$url = wfAppendQuery( $apiUrl, $args );

	// Call the external site.
	// @todo we need a good way to specify a timeout here.
	$ret = $this->http->get( $url, [], __METHOD__ );

	if ( $ret === false ) {
		wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
		return false;
	}

	$data = FormatJson::decode( $ret, true );

	if ( !is_array( $data ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
		return false;
	}

	// The helper is private, so it is addressed via self:: rather than
	// static:: - late static binding would break if a subclass declared
	// its own private method of the same name.
	$page = self::extractPageRecord( $data, $pageName );

	// $page is false when no record could be extracted; the isset() checks
	// below then all fall through to the "did not return a page title" case.
	if ( isset( $page['missing'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
			. $ret );
		return false;
	}

	if ( isset( $page['invalid'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
			. $ret );
		return false;
	}

	if ( !isset( $page['title'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
		return false;
	}

	return $page['title'];
}
157
/**
 * Get the page record for a given page title from a decoded API response.
 *
 * When the response holds exactly one entry under query.pages, that entry
 * is returned directly. Otherwise the title is followed through the
 * 'normalized', 'converted' and 'redirects' lists (each mapping a 'from'
 * title to a 'to' title, in that order) and the entry of query.pages whose
 * 'title' equals the resulting title is returned.
 *
 * @param array $externalData Decoded API response
 * @param string $pageTitle Title as it was sent to the API
 *
 * @return array|false The matching page record, or false if the response
 *  is malformed, or if no match or more than one match is found
 */
private static function extractPageRecord( $externalData, $pageTitle ) {
	// Nothing can be extracted without a query structure.
	if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
		return false;
	}
	$query = $externalData['query'];

	// Special case: with only one returned page there is nothing to
	// disambiguate, so that single record is the answer.
	if ( isset( $query['pages'] ) && is_array( $query['pages'] ) ) {
		$pages = array_values( $query['pages'] );
		if ( count( $pages ) === 1 ) {
			return $pages[0];
		}
	}

	// The general lookup below is only used during internal testing.
	// Loop over the differently named structures, that otherwise are similar.
	$structs = [
		'normalized' => 'from',
		'converted' => 'from',
		'redirects' => 'from',
		'pages' => 'title'
	];
	foreach ( $structs as $listId => $fieldId ) {
		// Skip substructures that are absent or not a list.
		if ( !isset( $query[$listId] ) || !is_array( $query[$listId] ) ) {
			continue;
		}

		// Keep only the entries that refer to the title being tracked.
		// array_filter() preserves the original keys, so the result is
		// re-indexed; otherwise a lone hit that was not the first entry of
		// the list could not be read back as element 0.
		$hits = array_values( array_filter(
			$query[$listId],
			static function ( $entry ) use ( $fieldId, $pageTitle ) {
				return is_array( $entry )
					&& isset( $entry[$fieldId] )
					&& $entry[$fieldId] === $pageTitle;
			}
		) );
		$hitCount = count( $hits );

		// More than one hit is ambiguous, whichever list we are in.
		if ( $hitCount > 1 ) {
			return false;
		}

		if ( $fieldId === 'title' ) {
			// On the pages structure: return the record, if there is one.
			return $hitCount === 1 ? $hits[0] : false;
		}

		// Still looping over normalization, conversion or redirects:
		// carry the new page title over to the later rounds.
		if ( $hitCount === 1 ) {
			if ( !isset( $hits[0]['to'] ) ) {
				// A mapping without a target cannot be followed.
				return false;
			}
			$pageTitle = $hits[0]['to'];
		}
	}

	// Reached only when the response has no usable 'pages' structure.
	return false;
}
229
230}
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfAppendQuery( $url, $query)
Append a query string to an existing URL, which may or may not already have query string parameters; if it does, they are combined.
JSON formatter wrapper class.
Various HTTP related functions.
Definition Http.php:28
Service for normalizing a page name using a MediaWiki api.
static extractPageRecord( $externalData, $pageTitle)
Get normalization record for a given page title from an API response.
normalizePageName( $pageName, $apiUrl, $followRedirect=self::FOLLOW_REDIRECT)
Returns the normalized form of the given page title, using the normalization rules of the given site.
if( $line===false) $args
Definition mcc.php:124