Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
49.38% |
40 / 81 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
MediaWikiPageNameNormalizer | |
49.38% |
40 / 81 |
|
0.00% |
0 / 3 |
121.54 | |
0.00% |
0 / 1 |
__construct | |
66.67% |
2 / 3 |
|
0.00% |
0 / 1 |
2.15 | |||
normalizePageName | |
82.93% |
34 / 41 |
|
0.00% |
0 / 1 |
8.32 | |||
extractPageRecord | |
10.81% |
4 / 37 |
|
0.00% |
0 / 1 |
222.04 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | */ |
20 | |
21 | namespace MediaWiki\Site; |
22 | |
23 | use InvalidArgumentException; |
24 | use MediaWiki\Http\HttpRequestFactory; |
25 | use MediaWiki\Json\FormatJson; |
26 | use MediaWiki\MediaWikiServices; |
27 | use UtfNormal\Validator; |
28 | |
29 | /** |
30 | * Service for normalizing a page name via a MediaWiki action API. |
31 | * |
32 | * @since 1.27 |
33 | * @author John Erling Blad < jeblad@gmail.com > |
34 | * @author Daniel Kinzler |
35 | * @author Jeroen De Dauw < jeroendedauw@gmail.com > |
36 | * @author Marius Hoch |
37 | */ |
class MediaWikiPageNameNormalizer {

	public const FOLLOW_REDIRECT = 1;
	public const NOFOLLOW_REDIRECT = 2;

	/**
	 * @var HttpRequestFactory Factory used to issue the GET request against the remote API.
	 */
	private $httpRequestFactory;

	/**
	 * @param HttpRequestFactory|null $httpRequestFactory Factory to use. When the argument is
	 *  not an HttpRequestFactory instance, the factory from MediaWikiServices is used instead.
	 */
	public function __construct( $httpRequestFactory = null ) {
		if ( !$httpRequestFactory instanceof HttpRequestFactory ) {
			$httpRequestFactory = MediaWikiServices::getInstance()->getHttpRequestFactory();
		}
		$this->httpRequestFactory = $httpRequestFactory;
	}

	/**
	 * Returns the normalized form of the given page title, using the
	 * normalization rules of the given site. If $followRedirect is set to self::FOLLOW_REDIRECT (default)
	 * and the given title is a redirect, the redirect will be resolved and
	 * the redirect target is returned.
	 * Only titles of existing pages will be returned.
	 *
	 * @note This actually makes an API request to the remote site, so beware
	 *   that this function is slow and depends on an external service.
	 *
	 * @see Site::normalizePageName
	 *
	 * @since 1.27
	 * @since 1.37 Added $followRedirect
	 *
	 * @param string $pageName
	 * @param string $apiUrl
	 * @param int $followRedirect either self::FOLLOW_REDIRECT or self::NOFOLLOW_REDIRECT
	 *
	 * @return string|false The normalized form of the title,
	 *  or false to indicate an invalid title, a missing page,
	 *  or some other kind of error.
	 * @throws InvalidArgumentException If $followRedirect is neither of the two constants.
	 */
	public function normalizePageName( string $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
		if ( $followRedirect === self::FOLLOW_REDIRECT ) {
			$redirects = true;
		} elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
			$redirects = false;
		} else {
			throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $followRedirect );
		}

		// Make sure the string is normalized into NFC (due to T42017)
		// but do nothing to the whitespaces, that should work appropriately.
		// @see https://phabricator.wikimedia.org/T42017
		$pageName = Validator::cleanUp( $pageName );

		// Build the args for the specific call
		$args = [
			'action' => 'query',
			'prop' => 'info',
			'redirects' => $redirects,
			'converttitles' => true,
			'format' => 'json',
			'titles' => $pageName,
			// @todo options for maxlag and maxage
			// Note that maxlag will lead to a long delay before a reply is made,
			// but that maxage can avoid the extreme delay. On the other hand
			// maxage could be nice to use anyhow as it stops unnecessary requests.
			// Also consider smaxage if maxage is used.
		];

		$url = wfAppendQuery( $apiUrl, $args );

		// Go on and call the external site
		// @todo we need a good way to specify a timeout here.
		$ret = $this->httpRequestFactory->get( $url, [], __METHOD__ );

		if ( $ret === null ) {
			wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
			return false;
		}

		$data = FormatJson::decode( $ret, true );

		if ( !is_array( $data ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
			return false;
		}

		// The helper is private, so bind to this class rather than using late static binding.
		$page = self::extractPageRecord( $data, $pageName );

		if ( isset( $page['missing'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
				. $ret );
			return false;
		}

		if ( isset( $page['invalid'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
				. $ret );
			return false;
		}

		// Also covers the case where no page record could be extracted at all ($page === false).
		if ( !isset( $page['title'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
			return false;
		}

		return $page['title'];
	}

	/**
	 * Get normalization record for a given page title from an API response.
	 *
	 * When the response contains exactly one page, that page is returned directly.
	 * Otherwise the title is followed through the 'normalized', 'converted' and
	 * 'redirects' mappings (in that order) and then looked up in 'pages'.
	 *
	 * @param array $externalData A reply from the API on an external server.
	 * @param string $pageTitle Identifies the page at the external site, needing normalization.
	 *
	 * @return array|false A 'page' structure representing the page identified by $pageTitle,
	 *  or false if the reply is malformed or the page cannot be identified unambiguously.
	 */
	private static function extractPageRecord( $externalData, $pageTitle ) {
		// Make initial checks and return if prerequisites are not met.
		if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
			return false;
		}
		$query = $externalData['query'];

		// If there is a special case with only one returned page
		// we can cheat, and only return
		// the single page in the "pages" substructure.
		if ( isset( $query['pages'] ) && is_array( $query['pages'] ) ) {
			$pages = array_values( $query['pages'] );
			if ( count( $pages ) === 1 ) {
				return $pages[0];
			}
		}

		// The remainder is only used during internal testing, as it is assumed
		// a more optimal (and lossfree) storage.
		// Loop over the three differently named structures, that otherwise are similar
		$structs = [
			'normalized' => 'from',
			'converted' => 'from',
			'redirects' => 'from',
			'pages' => 'title'
		];
		foreach ( $structs as $listId => $fieldId ) {
			// Check if the substructure exists at all (and is usable).
			if ( !isset( $query[$listId] ) || !is_array( $query[$listId] ) ) {
				continue;
			}
			// Filter the substructure down to what we actually are using.
			// Note: array_filter() preserves keys, so the surviving entry is
			// not necessarily at offset 0.
			$collectedHits = array_filter(
				$query[$listId],
				static function ( $a ) use ( $fieldId, $pageTitle ) {
					return is_array( $a ) && isset( $a[$fieldId] ) && $a[$fieldId] === $pageTitle;
				}
			);
			$hitCount = count( $collectedHits );

			if ( $fieldId === 'title' ) {
				// On the pages structure: exactly one match is the answer,
				// anything else (none or ambiguous) is a failure.
				return $hitCount === 1 ? reset( $collectedHits ) : false;
			}

			// Still looping over normalization, conversion or redirects:
			// keep the new page title for later rounds.
			if ( $hitCount === 0 ) {
				continue;
			}
			if ( $hitCount > 1 ) {
				return false;
			}
			$hit = reset( $collectedHits );
			if ( !isset( $hit['to'] ) ) {
				// A mapping without a target cannot be followed.
				return false;
			}
			$pageTitle = $hit['to'];
		}
		// Reached when the reply has no (usable) 'pages' structure.
		return false;
	}

}