MediaWiki master
MediaWikiPageNameNormalizer.php
Go to the documentation of this file.
1<?php
7namespace MediaWiki\Site;
8
9use InvalidArgumentException;
13use UtfNormal\Validator;
14
25
/** Value for normalizePageName()'s $followRedirect: ask the remote API to resolve redirects. */
26 public const FOLLOW_REDIRECT = 1;
/** Value for normalizePageName()'s $followRedirect: do not ask the remote API to resolve redirects. */
27 public const NOFOLLOW_REDIRECT = 2;
28
/** Factory whose get() method normalizePageName() uses to call the remote API; set by the constructor. */
32 private $httpRequestFactory;
33
/**
 * Store the HTTP request factory used for calls to the remote API.
 *
 * @param HttpRequestFactory|null $httpRequestFactory Factory to use. Any value that is
 *  not an HttpRequestFactory instance (including the default null) is replaced by the
 *  factory obtained from MediaWikiServices.
 */
public function __construct( $httpRequestFactory = null ) {
	$this->httpRequestFactory = $httpRequestFactory instanceof HttpRequestFactory
		? $httpRequestFactory
		: MediaWikiServices::getInstance()->getHttpRequestFactory();
}
43
/**
 * Return the normalized form of the given page title, as reported by the
 * action API of a remote MediaWiki site.
 *
 * The title is first cleaned up with UtfNormal's Validator, then sent to the
 * API at $apiUrl in an action=query&prop=info request with title conversion
 * enabled. Failures are written to the "MediaWikiSite" debug log.
 *
 * @param string $pageName Page title to normalize.
 * @param string $apiUrl URL of the remote API; the query parameters are appended to it.
 * @param int $followRedirect Either self::FOLLOW_REDIRECT (default) or
 *  self::NOFOLLOW_REDIRECT.
 *
 * @return string|false The title from the page record of the response, or false if the
 *  request failed, the response was not decodable JSON, or the page record is
 *  marked missing or invalid or has no title.
 * @throws InvalidArgumentException If $followRedirect is neither of the two constants.
 */
public function normalizePageName( string $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
	if ( $followRedirect === self::FOLLOW_REDIRECT ) {
		$redirects = true;
	} elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
		$redirects = false;
	} else {
		// Only scalars can be concatenated safely; for anything else report the type,
		// so that the caller always gets the InvalidArgumentException and never a
		// conversion warning or Error raised while building the message.
		$shown = is_scalar( $followRedirect ) ? $followRedirect : gettype( $followRedirect );
		throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $shown );
	}

	// Make sure the string is normalized into NFC (due to T42017)
	// but do nothing to the whitespaces, that should work appropriately.
	// @see https://phabricator.wikimedia.org/T42017
	$pageName = Validator::cleanUp( $pageName );

	// Build the args for the specific call
	$args = [
		'action' => 'query',
		'prop' => 'info',
		'redirects' => $redirects,
		'converttitles' => true,
		'format' => 'json',
		'titles' => $pageName,
		// @todo options for maxlag and maxage
		// Note that maxlag will lead to a long delay before a reply is made,
		// but that maxage can avoid the extreme delay. On the other hand
		// maxage could be nice to use anyhow as it stops unnecessary requests.
		// Also consider smaxage if maxage is used.
	];

	$url = wfAppendQuery( $apiUrl, $args );

	// Go on and call the external site
	// @todo we need a good way to specify a timeout here.
	$ret = $this->httpRequestFactory->get( $url, [], __METHOD__ );

	if ( $ret === null ) {
		wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
		return false;
	}

	$data = FormatJson::decode( $ret, true );

	if ( !is_array( $data ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
		return false;
	}

	// The helper is private, so bind it statically to this class; late static
	// binding would break if a subclass declared a method with the same name.
	// It yields false when no unique record exists; the isset() checks below
	// treat that like a record without a title.
	$page = self::extractPageRecord( $data, $pageName );

	if ( isset( $page['missing'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
			. $ret );
		return false;
	}

	if ( isset( $page['invalid'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
			. $ret );
		return false;
	}

	if ( !isset( $page['title'] ) ) {
		wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
		return false;
	}

	return $page['title'];
}
137
/**
 * Find the page record for a given title in a decoded action API query response.
 *
 * If the response holds exactly one page, that page is returned directly.
 * Otherwise the title is traced through the 'normalized', 'converted' and
 * 'redirects' mappings (in that order) and the resulting title is looked up
 * in the 'pages' list.
 *
 * @param array $externalData Decoded API response.
 * @param string $pageTitle Title that was sent to the API.
 *
 * @return array|false The matching entry of the 'pages' structure, or false if the
 *  response has no 'query' part or the title does not lead to exactly one page.
 */
private static function extractPageRecord( $externalData, $pageTitle ) {
	// If there is a special case with only one returned page
	// we can cheat, and only return
	// the single page in the "pages" substructure.
	if ( isset( $externalData['query']['pages'] ) && is_array( $externalData['query']['pages'] ) ) {
		$pages = array_values( $externalData['query']['pages'] );
		if ( count( $pages ) === 1 ) {
			return $pages[0];
		}
	}
	// Make initial checks and return if prerequisites are not met.
	if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
		return false;
	}
	// Loop over the three differently named mapping structures and the page
	// list; each maps to the name of the field that is compared with the title.
	$structs = [
		'normalized' => 'from',
		'converted' => 'from',
		'redirects' => 'from',
		'pages' => 'title'
	];
	foreach ( $structs as $listId => $fieldId ) {
		// Skip substructures that are absent or not lists.
		$list = $externalData['query'][$listId] ?? null;
		if ( !is_array( $list ) ) {
			continue;
		}
		// Filter the substructure down to the entries for the current title.
		// array_filter() preserves keys, so re-index to make the hits
		// addressable from 0 no matter where they sat in the list.
		$hits = array_values( array_filter(
			$list,
			static function ( $entry ) use ( $fieldId, $pageTitle ) {
				return is_array( $entry )
					&& isset( $entry[$fieldId] )
					&& $entry[$fieldId] === $pageTitle;
			}
		) );
		$hitCount = count( $hits );

		if ( $fieldId === 'title' ) {
			// On the pages structure: only a unique match is a usable result.
			return $hitCount === 1 ? $hits[0] : false;
		}

		// Still looping over normalization, conversion or redirects.
		if ( $hitCount > 1 ) {
			// Ambiguous mapping
			return false;
		}
		if ( $hitCount === 1 ) {
			// Keep the new page title for later rounds.
			$pageTitle = $hits[0]['to'];
		}
	}
	// Reached when the response has no 'pages' list.
	return false;
}
209
210}
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfAppendQuery( $url, $query)
Append a query string to an existing URL, which may or may not already have query string parameters.
Factory creating MWHttpRequest objects.
JSON formatter wrapper class.
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Service for normalizing a page name via a MediaWiki action API.
normalizePageName(string $pageName, $apiUrl, $followRedirect=self::FOLLOW_REDIRECT)
Returns the normalized form of the given page title, using the normalization rules of the given site.