Code Coverage for /workspace/src/includes/site/MediaWikiPageNameNormalizer.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	49.38% covered (danger)	49.38%	40 / 81	0.00% covered (danger)	0.00%	0 / 3	CRAP	0.00% covered (danger)	0.00%	0 / 1
MediaWikiPageNameNormalizer	49.38% covered (danger)	49.38%	40 / 81	0.00% covered (danger)	0.00%	0 / 3	121.54	0.00% covered (danger)	0.00%	0 / 1
__construct	66.67% covered (warning)	66.67%	2 / 3	0.00% covered (danger)	0.00%	0 / 1	2.15
normalizePageName	82.93% covered (warning)	82.93%	34 / 41	0.00% covered (danger)	0.00%	0 / 1	8.32
extractPageRecord	10.81% covered (danger)	10.81%	4 / 37	0.00% covered (danger)	0.00%	0 / 1	222.04

1	<?php
2	/**
3	* This program is free software; you can redistribute it and/or modify
4	* it under the terms of the GNU General Public License as published by
5	* the Free Software Foundation; either version 2 of the License, or
6	* (at your option) any later version.
7	*
8	* This program is distributed in the hope that it will be useful,
9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11	* GNU General Public License for more details.
12	*
13	* You should have received a copy of the GNU General Public License along
14	* with this program; if not, write to the Free Software Foundation, Inc.,
15	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16	* http://www.gnu.org/copyleft/gpl.html
17	*
18	* @file
19	*/
20
21	namespace MediaWiki\Site;
22
23	use InvalidArgumentException;
24	use MediaWiki\Http\HttpRequestFactory;
25	use MediaWiki\Json\FormatJson;
26	use MediaWiki\MediaWikiServices;
27	use UtfNormal\Validator;
28
/**
 * Service for normalizing a page name via a MediaWiki action API.
 *
 * The normalization is delegated to the remote wiki: a `action=query` request
 * is sent to the given API URL and the title of the page record found in the
 * reply is returned.
 *
 * @since 1.27
 * @author John Erling Blad < jeblad@gmail.com >
 * @author Daniel Kinzler
 * @author Jeroen De Dauw < jeroendedauw@gmail.com >
 * @author Marius Hoch
 */
class MediaWikiPageNameNormalizer {

	/** Resolve redirects and return the title of the redirect target. */
	public const FOLLOW_REDIRECT = 1;

	/** Do not resolve redirects. */
	public const NOFOLLOW_REDIRECT = 2;

	/**
	 * @var HttpRequestFactory
	 */
	private $httpRequestFactory;

	/**
	 * @param HttpRequestFactory|null $httpRequestFactory Factory used for the
	 *  outgoing API request. Anything that is not a HttpRequestFactory (including
	 *  null) is replaced by the factory from MediaWikiServices.
	 */
	public function __construct( $httpRequestFactory = null ) {
		if ( !$httpRequestFactory instanceof HttpRequestFactory ) {
			$httpRequestFactory = MediaWikiServices::getInstance()->getHttpRequestFactory();
		}
		$this->httpRequestFactory = $httpRequestFactory;
	}

	/**
	 * Returns the normalized form of the given page title, using the
	 * normalization rules of the given site. If $followRedirect is set to
	 * self::FOLLOW_REDIRECT (default) and the given title is a redirect, the
	 * redirect will be resolved and the redirect target is returned.
	 * Only titles of existing pages will be returned.
	 *
	 * @note This actually makes an API request to the remote site, so beware
	 *   that this function is slow and depends on an external service.
	 *
	 * @see Site::normalizePageName
	 *
	 * @since 1.27
	 * @since 1.37 Added $followRedirect
	 *
	 * @param string $pageName
	 * @param string $apiUrl
	 * @param int $followRedirect either self::FOLLOW_REDIRECT or self::NOFOLLOW_REDIRECT
	 *
	 * @return string|false The normalized form of the title,
	 *   or false to indicate an invalid title, a missing page,
	 *   or some other kind of error.
	 * @throws InvalidArgumentException If $followRedirect is neither of the two constants.
	 */
	public function normalizePageName( string $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
		if ( $followRedirect === self::FOLLOW_REDIRECT ) {
			$redirects = true;
		} elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
			$redirects = false;
		} else {
			throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $followRedirect );
		}

		// Make sure the string is normalized into NFC (due to T42017)
		// but do nothing to the whitespaces, that should work appropriately.
		// @see https://phabricator.wikimedia.org/T42017
		$pageName = Validator::cleanUp( $pageName );

		// Build the args for the specific call
		$args = [
			'action' => 'query',
			'prop' => 'info',
			'redirects' => $redirects,
			'converttitles' => true,
			'format' => 'json',
			'titles' => $pageName,
			// @todo options for maxlag and maxage
			// Note that maxlag will lead to a long delay before a reply is made,
			// but that maxage can avoid the extreme delay. On the other hand
			// maxage could be nice to use anyhow as it stops unnecessary requests.
			// Also consider smaxage if maxage is used.
		];

		$url = wfAppendQuery( $apiUrl, $args );

		// Go on call the external site
		// @todo we need a good way to specify a timeout here.
		$ret = $this->httpRequestFactory->get( $url, [], __METHOD__ );

		// A null result is treated as a failed request.
		if ( $ret === null ) {
			wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
			return false;
		}

		$data = FormatJson::decode( $ret, true );

		if ( !is_array( $data ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
			return false;
		}

		// The helper is private, so it must be resolved against this class
		// (self::) rather than the late-static-bound class (static::).
		$page = self::extractPageRecord( $data, $pageName );

		// $page is false when no record could be extracted; isset() on it is
		// simply false, so that case ends up in the "no page title" branch.
		if ( isset( $page['missing'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
				. $ret );
			return false;
		}

		if ( isset( $page['invalid'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
				. $ret );
			return false;
		}

		if ( !isset( $page['title'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
			return false;
		}

		return $page['title'];
	}

	/**
	 * Get normalization record for a given page title from an API response.
	 *
	 * When the reply contains exactly one page, that page is returned directly.
	 * Otherwise the title is followed through the 'normalized', 'converted' and
	 * 'redirects' lists (each mapping 'from' to 'to'), and the entry of 'pages'
	 * whose 'title' equals the resulting title is returned.
	 *
	 * @param array $externalData A reply from the API on a external server.
	 * @param string $pageTitle Identifies the page at the external site, needing normalization.
	 *
	 * @return array|bool A 'page' structure representing the page identified by $pageTitle,
	 *   or false if the reply is malformed or does not identify exactly one page.
	 */
	private static function extractPageRecord( $externalData, $pageTitle ) {
		// Make initial checks and return if prerequisites are not met.
		if (
			!is_array( $externalData )
			|| !isset( $externalData['query'] )
			|| !is_array( $externalData['query'] )
		) {
			return false;
		}
		$query = $externalData['query'];

		// If there is a special case with only one returned page
		// we can cheat, and only return
		// the single page in the "pages" substructure.
		if ( isset( $query['pages'] ) && is_array( $query['pages'] ) && count( $query['pages'] ) === 1 ) {
			return array_values( $query['pages'] )[0];
		}

		// This is only used during internal testing, as it is assumed
		// a more optimal (and lossfree) storage.
		// Loop over the different named structures, that otherwise are similar.
		// The order matters: title rewrites are applied before 'pages' is searched.
		$structs = [
			'normalized' => 'from',
			'converted' => 'from',
			'redirects' => 'from',
			'pages' => 'title'
		];

		foreach ( $structs as $listId => $fieldId ) {
			// Check if the substructure exists at all (and is a list we can scan).
			if ( !isset( $query[$listId] ) || !is_array( $query[$listId] ) ) {
				continue;
			}

			// Filter the substructure down to what we actually are using.
			// array_filter() preserves the original keys, so the result is
			// re-indexed to make the single hit reachable at index 0.
			$hits = array_values( array_filter(
				$query[$listId],
				static function ( $entry ) use ( $fieldId, $pageTitle ) {
					return is_array( $entry )
						&& isset( $entry[$fieldId] )
						&& $entry[$fieldId] === $pageTitle;
				}
			) );
			$hitCount = count( $hits );

			if ( $fieldId === 'title' ) {
				// On the pages structure: exactly one match is the answer,
				// none or an ambiguous match is a failure.
				return $hitCount === 1 ? $hits[0] : false;
			}

			// Still looping over normalization, conversion or redirects:
			// keep the new page title for later rounds.
			if ( $hitCount > 1 ) {
				return false;
			}
			if ( $hitCount === 1 ) {
				if ( !isset( $hits[0]['to'] ) ) {
					// A rewrite entry without a target cannot be followed.
					return false;
				}
				$pageTitle = $hits[0]['to'];
			}
		}

		// Reached when the reply has no usable 'pages' structure.
		return false;
	}

}