MediaWiki REL1_34
MediaWikiPageNameNormalizer.php
Go to the documentation of this file.
1<?php
2
3namespace MediaWiki\Site;
4
5use FormatJson;
6use Http;
7use UtfNormal\Validator;
8
36
/**
 * @var Http HTTP helper used by normalizePageName() to fetch the API response.
 */
private $http;

/**
 * @param Http|null $http HTTP helper to use; when omitted (or null) a fresh
 *  Http instance is created.
 */
public function __construct( Http $http = null ) {
	// Objects are always truthy, so this only falls back when null was passed.
	$this->http = $http ?: new Http();
}
52
/**
 * Returns the normalized form of the given page title, using the
 * normalization rules of the given site.
 *
 * The title is sent to the API at $apiUrl in an action=query request with
 * redirects and title conversion enabled, and the title of the resulting
 * page record is returned.
 *
 * @param string $pageName Page title to normalize.
 * @param string $apiUrl URL of the API the query parameters are appended to.
 *
 * @return string|false The title reported by the API, or false when the
 *  request failed, the reply was not decodable to an array, or the page was
 *  reported as missing/invalid or carried no title.
 * @throws \MWException If $pageName is not a string.
 */
public function normalizePageName( $pageName, $apiUrl ) {
	// Only the page name is validated here.
	if ( !is_string( $pageName ) ) {
		throw new \MWException( '$pageName must be a string' );
	}

	// Make sure the string is normalized into NFC (due to T42017)
	// but leave whitespace untouched; the remote side deals with that.
	// @see https://phabricator.wikimedia.org/T42017
	$pageName = Validator::cleanUp( $pageName );

	// Assemble the request URL for this specific call.
	$url = wfAppendQuery( $apiUrl, [
		'action' => 'query',
		'prop' => 'info',
		'redirects' => true,
		'converttitles' => true,
		'format' => 'json',
		'titles' => $pageName,
		// @todo options for maxlag and maxage
		// Note that maxlag will lead to a long delay before a reply is made,
		// but that maxage can avoid the extreme delay. On the other hand
		// maxage could be nice to use anyhow as it stops unnecessary requests.
		// Also consider smaxage if maxage is used.
	] );

	// Call the external site.
	// @todo we need a good way to specify a timeout here.
	$response = $this->http->get( $url, [], __METHOD__ );
	if ( $response === false ) {
		wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
		return false;
	}

	$decoded = FormatJson::decode( $response, true );
	if ( !is_array( $decoded ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $response );
		return false;
	}

	$record = static::extractPageRecord( $decoded, $pageName );

	// Work out which (if any) failure applies; the checks are ordered so
	// that the most specific marker wins.
	$failure = null;
	if ( isset( $record['missing'] ) ) {
		$failure = "call to <$url> returned a marker for a missing page title! ";
	} elseif ( isset( $record['invalid'] ) ) {
		$failure = "call to <$url> returned a marker for an invalid page title! ";
	} elseif ( !isset( $record['title'] ) ) {
		// Also covers extractPageRecord() having returned false.
		$failure = "call to <$url> did not return a page title! ";
	}

	if ( $failure !== null ) {
		wfDebugLog( "MediaWikiSite", $failure . $response );
		return false;
	}

	return $record['title'];
}
141
/**
 * Get normalization record for a given page title from an API response.
 *
 * When the response holds exactly one page, that page is returned directly.
 * Otherwise the title is followed through the "normalized", "converted" and
 * "redirects" lists (each mapping 'from' to 'to') and the entry of "pages"
 * whose 'title' equals the resulting title is returned.
 *
 * @param array $externalData Decoded API response.
 * @param string $pageTitle Title that was sent to the API; only needed to
 *  pick the right record when several pages were returned.
 *
 * @return array|bool The page record, or false if the response has no
 *  'query' part, a lookup is ambiguous, or no matching page is found.
 */
private static function extractPageRecord( $externalData, $pageTitle ) {
	// Bail out early if the prerequisites are not met.
	if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
		return false;
	}
	$query = $externalData['query'];

	// Special case: with only one returned page we can cheat and hand back
	// the single entry of the "pages" substructure.
	if ( isset( $query['pages'] ) && is_array( $query['pages'] ) ) {
		$pages = array_values( $query['pages'] );
		if ( count( $pages ) === 1 ) {
			return $pages[0];
		}
	}

	// The general multi-page path below is only used during internal testing,
	// as it is assumed a more optimal (and lossfree) storage.
	// Loop over the three differently named structures that otherwise are
	// similar, and finally over the pages themselves.
	$structs = [
		'normalized' => 'from',
		'converted' => 'from',
		'redirects' => 'from',
		'pages' => 'title'
	];
	foreach ( $structs as $listId => $fieldId ) {
		// Skip substructures that are absent from this response.
		if ( !isset( $query[$listId] ) || !is_array( $query[$listId] ) ) {
			continue;
		}

		// Keep only the entries referring to the title we are tracking.
		// array_filter() preserves keys, so re-index to make [0] reliable.
		$hits = array_values( array_filter(
			$query[$listId],
			function ( $entry ) use ( $fieldId, $pageTitle ) {
				return isset( $entry[$fieldId] ) && $entry[$fieldId] === $pageTitle;
			}
		) );
		$hitCount = count( $hits );

		// More than one match is ambiguous in every structure.
		if ( $hitCount > 1 ) {
			return false;
		}

		if ( $fieldId === 'title' ) {
			// On the pages structure: either we found the record or we did not.
			return $hitCount === 1 ? $hits[0] : false;
		}

		// Still on normalization, conversion or redirects: carry the new
		// page title over to the later rounds.
		if ( $hitCount === 1 ) {
			if ( !isset( $hits[0]['to'] ) ) {
				// Without a target title nothing further can match.
				return false;
			}
			$pageTitle = $hits[0]['to'];
		}
	}

	// Reached when the response has no usable "pages" structure.
	return false;
}
213
214}
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfAppendQuery( $url, $query)
Append a query string to an existing URL, which may or may not already have query string parameters.
if( $line===false) $args
Definition cdb.php:64
JSON formatter wrapper class.
Various HTTP related functions.
Definition Http.php:29
Service for normalizing a page name using a MediaWiki api.
static extractPageRecord( $externalData, $pageTitle)
Get normalization record for a given page title from an API response.
normalizePageName( $pageName, $apiUrl)
Returns the normalized form of the given page title, using the normalization rules of the given site.