Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
54.33% |
270 / 497 |
|
31.11% |
14 / 45 |
CRAP | |
0.00% |
0 / 1 |
LanguageConverter | |
54.44% |
270 / 496 |
|
31.11% |
14 / 45 |
3280.11 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getMainCode | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getStaticDefaultVariant | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getLanguageVariants | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getVariantsFallbacks | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
getFlags | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
6 | |||
getAdditionalFlags | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getManualLevel | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
getAdditionalManualLevel | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getDescCodeSeparator | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getDescVarSeparator | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getVariantNames | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
getVariants | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getVariantFallbacks | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getConvRuleTitle | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getPreferredVariant | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
6 | |||
getDefaultVariant | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
validateVariant | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
5 | |||
getURLVariant | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
3.02 | |||
getUserVariant | |
84.62% |
11 / 13 |
|
0.00% |
0 / 1 |
5.09 | |||
getHeaderVariant | |
64.00% |
16 / 25 |
|
0.00% |
0 / 1 |
16.65 | |||
autoConvert | |
70.27% |
52 / 74 |
|
0.00% |
0 / 1 |
28.49 | |||
translate | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
translateWithoutRomanNumbers | |
70.59% |
12 / 17 |
|
0.00% |
0 / 1 |
3.23 | |||
autoConvertToAllVariants | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
applyManualConv | |
35.71% |
5 / 14 |
|
0.00% |
0 / 1 |
20.02 | |||
convertSplitTitle | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
convertTitle | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
convertNamespace | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
2 | |||
computeNsVariantText | |
85.71% |
12 / 14 |
|
0.00% |
0 / 1 |
5.07 | |||
convert | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
convertTo | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
recursiveConvertTopLevel | |
100.00% |
26 / 26 |
|
100.00% |
1 / 1 |
6 | |||
recursiveConvertRule | |
51.28% |
20 / 39 |
|
0.00% |
0 / 1 |
24.99 | |||
findVariantLink | |
0.00% |
0 / 43 |
|
0.00% |
0 / 1 |
420 | |||
getExtraHashOptions | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
guessVariant | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
loadDefaultTables | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
loadTables | |
95.24% |
20 / 21 |
|
0.00% |
0 / 1 |
4 | |||
postLoadTables | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
reloadTables | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
parseCachedTable | |
17.19% |
11 / 64 |
|
0.00% |
0 / 1 |
296.87 | |||
markNoConversion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
convertCategoryKey | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
updateConversionTable | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
30 | |||
getVarSeparatorPattern | |
93.33% |
14 / 15 |
|
0.00% |
0 / 1 |
5.01 | |||
hasVariants | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
hasVariant | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
convertHtml | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | * @author Zhengzhu Feng <zhengzhu@gmail.com> |
20 | * @author fdcn <fdcn64@gmail.com> |
21 | * @author shinjiman <shinjiman@gmail.com> |
22 | * @author PhiLiP <philip.npc@gmail.com> |
23 | */ |
24 | |
25 | namespace MediaWiki\Language; |
26 | |
27 | use InvalidArgumentException; |
28 | use MediaWiki\Context\RequestContext; |
29 | use MediaWiki\Debug\DeprecationHelper; |
30 | use MediaWiki\HookContainer\HookRunner; |
31 | use MediaWiki\Html\Html; |
32 | use MediaWiki\Linker\LinkTarget; |
33 | use MediaWiki\Logger\LoggerFactory; |
34 | use MediaWiki\MainConfigNames; |
35 | use MediaWiki\MediaWikiServices; |
36 | use MediaWiki\Page\PageIdentity; |
37 | use MediaWiki\Parser\Parser; |
38 | use MediaWiki\Parser\Sanitizer; |
39 | use MediaWiki\Revision\RevisionRecord; |
40 | use MediaWiki\Revision\SlotRecord; |
41 | use MediaWiki\StubObject\StubUserLang; |
42 | use MediaWiki\Title\Title; |
43 | use MediaWiki\User\User; |
44 | use RuntimeException; |
45 | use StringUtils; |
46 | use UnexpectedValueException; |
47 | use Wikimedia\ObjectCache\BagOStuff; |
48 | |
49 | /** |
50 | * Base class for multi-variant language conversion. |
51 | * |
52 | * @ingroup Language |
53 | */ |
54 | abstract class LanguageConverter implements ILanguageConverter { |
55 | use DeprecationHelper; |
56 | |
57 | /** |
58 | * languages supporting variants |
59 | * @since 1.20 |
60 | * @var string[] |
61 | */ |
62 | public static $languagesWithVariants = [ |
63 | 'ban', |
64 | 'en', |
65 | 'crh', |
66 | 'gan', |
67 | 'iu', |
68 | 'ku', |
69 | 'mni', |
70 | 'sh', |
71 | 'shi', |
72 | 'sr', |
73 | 'tg', |
74 | 'tly', |
75 | 'uz', |
76 | 'wuu', |
77 | 'zgh', |
78 | 'zh', |
79 | ]; |
80 | |
81 | /** |
82 | * static default variant of languages supporting variants |
83 | * for use with DefaultOptionsLookup.php |
84 | * @since 1.40 |
85 | * @var array<string,string> |
86 | */ |
87 | public static $languagesWithStaticDefaultVariant = [ |
88 | 'ban' => 'ban', |
89 | 'en' => 'en', |
90 | 'crh' => 'crh', |
91 | 'gan' => 'gan', |
92 | 'iu' => 'iu', |
93 | 'ku' => 'ku', |
94 | 'mni' => 'mni', |
95 | 'sh' => 'sh-latn', |
96 | 'shi' => 'shi', |
97 | 'sr' => 'sr', |
98 | 'tg' => 'tg', |
99 | 'tly' => 'tly', |
100 | 'uz' => 'uz', |
101 | 'wuu' => 'wuu', |
102 | 'zgh' => 'zgh', |
103 | 'zh' => 'zh', |
104 | ]; |
105 | |
/**
 * @var bool NOTE(review): presumably set once the conversion tables have been
 *  loaded; the code that writes it is outside this excerpt -- confirm.
 */
private $mTablesLoaded = false;
/** @var ReplacementArray[] Conversion tables; translate() indexes this array by variant code. */
protected $mTables = [];
/** @var Language|StubUserLang Language object handed to the constructor. */
private $mLangObj;
/** @var string|false Value returned by getConvRuleTitle(); false until something sets it. */
private $mConvRuleTitle = false;
/** @var string|null Validated variant memoized by getURLVariant(). */
private $mURLVariant;
/** @var string|null Validated variant memoized by getUserVariant(). */
private $mUserVariant;
/** @var string|null Validated variant memoized by getHeaderVariant(). */
private $mHeaderVariant;
/**
 * @var int NOTE(review): looks like a recursion depth limit for rule conversion;
 *  its use is outside this excerpt -- confirm.
 */
private $mMaxDepth = 10;
/**
 * @var string|null NOTE(review): presumably a memoized regex built by
 *  getVarSeparatorPattern(), which is outside this excerpt -- confirm.
 */
private $mVarSeparatorPattern;

// NOTE(review): presumably a version tag stored with cached conversion tables;
// its use is outside this excerpt -- confirm before changing the value.
private const CACHE_VERSION_KEY = 'VERSION 7';
126 | |
/**
 * Create a converter bound to a language object.
 *
 * @param Language|StubUserLang $langobj Language object kept by the converter
 */
public function __construct( $langobj ) {
	$this->mLangObj = $langobj;
}
133 | |
134 | /** |
135 | * Get the language code with converter (the "main" language code). |
136 | * Page language code would be the same as the language code with converter. |
137 | * Note that this code might not be included as one of the variant languages. |
138 | * @since 1.36 |
139 | * |
140 | * @return string |
141 | */ |
142 | abstract public function getMainCode(): string; |
143 | |
/**
 * Get the static default variant.
 *
 * Used to specify the default variant form when it differs from the default
 * "unconverted/mixed-variant form". The main language code is looked up in
 * self::$languagesWithStaticDefaultVariant; when there is no entry for it,
 * the main code itself is returned.
 * @since 1.40
 *
 * @return string
 */
protected function getStaticDefaultVariant(): string {
	$mainCode = $this->getMainCode();
	if ( isset( self::$languagesWithStaticDefaultVariant[$mainCode] ) ) {
		return self::$languagesWithStaticDefaultVariant[$mainCode];
	}
	return $mainCode;
}
156 | |
157 | /** |
158 | * Get supported variants of the language. |
159 | * @since 1.36 |
160 | * |
161 | * @return array |
162 | */ |
163 | abstract protected function getLanguageVariants(): array; |
164 | |
165 | /** |
166 | * Get language variants fallbacks. |
167 | * @since 1.36 |
168 | * |
169 | * @return array |
170 | */ |
171 | abstract public function getVariantsFallbacks(): array; |
172 | |
/**
 * Get the strings that map to the flags.
 *
 * The result is the built-in flag table, overlaid with whatever
 * getAdditionalFlags() provides, plus one self-mapped entry per variant
 * returned by getVariants().
 * @since 1.36
 *
 * @return array
 */
final public function getFlags(): array {
	// Reserved for the program itself and therefore absent from this table:
	//   'S' show the converted text
	//   '+' add rules for alltext
	//   'E' the flags have an error
	$builtinFlags = [
		// add rule for convert code (all text converted)
		'A' => 'A',
		// title convert
		'T' => 'T',
		// raw content
		'R' => 'R',
		// convert description (subclass implement)
		'D' => 'D',
		// remove convert (not implement)
		'-' => '-',
		// add rule for convert code (but no display in placed code)
		'H' => 'H',
		// current variant name
		'N' => 'N',
	];

	$flags = array_merge( $builtinFlags, $this->getAdditionalFlags() );

	// Each variant code is also a flag that maps to itself.
	foreach ( $this->getVariants() as $variantCode ) {
		$flags[$variantCode] = $variantCode;
	}

	return $flags;
}
199 | |
/**
 * Extra flags merged into the table built by getFlags().
 *
 * This base implementation contributes nothing; converter implementations
 * are expected to override it when they need extra flags.
 *
 * @return array
 */
protected function getAdditionalFlags(): array {
	return [];
}
209 | |
/**
 * Get manual level limit for supported variants.
 *
 * Every variant returned by getVariants() gets an entry: the value supplied
 * by getAdditionalManualLevel() when that array has a key for the variant
 * (even if the value is null), otherwise 'bidirectional'.
 * @since 1.36
 *
 * @return array
 */
final public function getManualLevel() {
	$overrides = $this->getAdditionalManualLevel();
	$levels = [];
	foreach ( $this->getVariants() as $variantCode ) {
		$levels[$variantCode] = array_key_exists( $variantCode, $overrides )
			? $overrides[$variantCode]
			: 'bidirectional';
	}
	return $levels;
}
228 | |
/**
 * Per-variant manual level overrides consumed by getManualLevel().
 *
 * This base implementation returns an empty array, so every variant ends up
 * with the 'bidirectional' default; converter implementations are expected to
 * override it when some variants need a different level.
 * @since 1.36
 *
 * @return array Map of variant code to manual level
 */
protected function getAdditionalManualLevel(): array {
	return [];
}
239 | |
/**
 * Get the desc code separator.
 *
 * The base implementation returns ":"; a converter implementation may
 * override this method to supply a different separator.
 * @since 1.36
 *
 * @return string
 */
public function getDescCodeSeparator(): string {
	return ':';
}
250 | |
/**
 * Get the desc var separator.
 *
 * The base implementation returns ";"; a converter implementation may
 * override this method to supply a different separator.
 * @since 1.36
 *
 * @return string
 */
public function getDescVarSeparator(): string {
	return ';';
}
261 | |
/**
 * Get variant names.
 *
 * Returns whatever the LanguageNameUtils service reports from
 * getLanguageNames() when called without arguments.
 *
 * @return array
 */
public function getVariantNames(): array {
	$languageNameUtils = MediaWikiServices::getInstance()->getLanguageNameUtils();
	return $languageNameUtils->getLanguageNames();
}
272 | |
/**
 * Get the variants of this language that are currently enabled.
 *
 * This is the list from getLanguageVariants() with every entry named in the
 * DisabledVariants configuration setting removed. array_diff() preserves the
 * keys of its first argument, so the result is not necessarily a 0..n-1 list.
 *
 * @return array
 */
final public function getVariants() {
	$config = MediaWikiServices::getInstance()->getMainConfig();
	$disabled = $config->get( MainConfigNames::DisabledVariants );
	$supported = $this->getLanguageVariants();
	return array_diff( $supported, $disabled );
}
278 | |
/**
 * Look up the fallback entry for a variant.
 *
 * @param string $variant Variant code used as key into getVariantsFallbacks()
 * @return mixed The entry stored for $variant (callers in this class handle
 *  both a string and an array), or the static default variant when the map
 *  has no non-null entry for it
 */
public function getVariantFallbacks( $variant ) {
	$fallbackMap = $this->getVariantsFallbacks();
	return $fallbackMap[$variant] ?? $this->getStaticDefaultVariant();
}
282 | |
/**
 * Get the stored conversion rule title.
 *
 * @return string|false Current value of the mConvRuleTitle property, which
 *  is initialised to false
 */
public function getConvRuleTitle() {
	return $this->mConvRuleTitle;
}
286 | |
/**
 * Work out which variant should be used for the current request.
 *
 * Sources are consulted in this order: the URL variant (which the
 * GetLangPreferredVariant hook is then given a chance to look at), the user
 * preference for a safe-to-load registered context user or else the
 * Accept-Language header, then the DefaultLanguageVariant setting. The
 * outcome is validated once more and, if nothing valid remains, the static
 * default variant is returned.
 *
 * @return string
 */
public function getPreferredVariant() {
	$variant = $this->getURLVariant();

	$services = MediaWikiServices::getInstance();
	$hookRunner = new HookRunner( $services->getHookContainer() );
	// NOTE(review): the hook presumably takes $variant by reference and may
	// replace it; that is why the result is validated again below -- confirm.
	$hookRunner->onGetLangPreferredVariant( $variant );

	if ( !$variant ) {
		$user = RequestContext::getMain()->getUser();
		// NOTE: For some calls there may not be a context user or session that is safe
		// to use, see (T235360)
		// Use case: During user autocreation, UserNameUtils::isUsable is called which uses interface
		// messages for reserved usernames.
		$variant = ( $user->isSafeToLoad() && $user->isRegistered() )
			? $this->getUserVariant( $user )
			: $this->getHeaderVariant();
	}

	$configuredDefault = $services->getMainConfig()
		->get( MainConfigNames::DefaultLanguageVariant );
	if ( !$variant && $configuredDefault ) {
		$variant = $this->validateVariant( $configuredDefault );
	}

	$variant = $this->validateVariant( $variant );

	// Unlike the other get*Variant functions, this one is not memoized
	// (i.e. its return value is not cached), because new information
	// might appear during processing after the first call.
	return $variant ?? $this->getStaticDefaultVariant();
}
320 | |
/**
 * Get the default variant, ignoring any user preference.
 *
 * Uses the URL variant, then the Accept-Language header variant, then the
 * DefaultLanguageVariant setting (validated), and finally the static default
 * variant.
 *
 * @return string
 */
public function getDefaultVariant() {
	$config = MediaWikiServices::getInstance()->getMainConfig();
	$configuredDefault = $config->get( MainConfigNames::DefaultLanguageVariant );

	$variant = $this->getURLVariant();
	if ( $variant === null ) {
		$variant = $this->getHeaderVariant();
	}

	if ( !$variant && $configuredDefault ) {
		$variant = $this->validateVariant( $configuredDefault );
	}

	return $variant ?? $this->getStaticDefaultVariant();
}
333 | |
/**
 * Validate a variant code against the enabled variants of this language.
 *
 * The input is lower-cased and passed through
 * LanguageCode::replaceDeprecatedCodes() before matching. It is accepted
 * either when it equals one of the codes from getVariants(), or when it
 * equals the lower-cased BCP 47 form of one of them; in the latter case the
 * internal code is returned rather than the BCP 47 one.
 *
 * @param string|null $variant Variant code to check; may be mixed-case
 * @return string|null The matching internal variant code, or null when
 *  $variant is null or matches nothing
 */
public function validateVariant( $variant = null ) {
	if ( $variant === null ) {
		return null;
	}
	// Our internal variants are always lower-case; the variant we
	// are validating may have mixed cases.
	$variant = LanguageCode::replaceDeprecatedCodes( strtolower( $variant ) );

	// getVariants() goes through the service container and array_diff() on
	// every call, so fetch the list once for both lookups below.
	$variants = $this->getVariants();

	// Strict comparison: variant codes are strings and must match exactly,
	// without PHP's loose numeric-string equivalences.
	if ( in_array( $variant, $variants, true ) ) {
		return $variant;
	}
	// Browsers are supposed to use BCP 47 standard in the
	// Accept-Language header, but not all of our internal
	// mediawiki variant codes are BCP 47. Map BCP 47 code
	// to our internal code.
	foreach ( $variants as $v ) {
		// Case-insensitive match (BCP 47 is mixed-case)
		if ( strtolower( LanguageCode::bcp47( $v ) ) === $variant ) {
			return $v;
		}
	}
	return null;
}
356 | |
/**
 * Get the variant requested through the URL.
 *
 * Reads the 'variant' request parameter, falling back to 'uselang' when that
 * is empty, and validates the value. A truthy result is memoized in
 * mURLVariant; a null result is recomputed on the next call.
 *
 * @return string|null Validated variant, or null if the request names none
 */
public function getURLVariant() {
	if ( !$this->mURLVariant ) {
		$request = RequestContext::getMain()->getRequest();
		// see if the preference is set in the request
		$requested = $request->getText( 'variant' ) ?: $request->getVal( 'uselang' );
		$this->mURLVariant = $this->validateVariant( $requested );
	}
	return $this->mURLVariant;
}
373 | |
/**
 * Determine if the user has a variant set.
 *
 * The validated result is memoized in mUserVariant once it is truthy, and
 * the memoized value is returned for any later call regardless of which
 * user object is passed.
 *
 * @param User $user
 * @return string|null Variant if one found, null otherwise
 */
protected function getUserVariant( User $user ) {
	// This should only be called within the class after the user is known to be
	// safe to load and logged in, but check just in case.
	if ( !$user->isSafeToLoad() ) {
		return null;
	}

	if ( $this->mUserVariant ) {
		return $this->mUserVariant;
	}

	$services = MediaWikiServices::getInstance();
	if ( !$user->isRegistered() ) {
		// figure out user lang without constructing wgLang to avoid
		// infinite recursion
		$optionName = 'language';
	} elseif ( $this->getMainCode() === $services->getContentLanguage()->getCode() ) {
		// Logged-in user, converter language is the content language
		$optionName = 'variant';
	} else {
		// Logged-in user, converter language differs from the content language
		$optionName = 'variant-' . $this->getMainCode();
	}

	$stored = $services->getUserOptionsLookup()->getOption( $user, $optionName );
	$this->mUserVariant = $this->validateVariant( $stored );

	return $this->mUserVariant;
}
411 | |
/**
 * Determine the language variant from the Accept-Language header.
 *
 * Header languages are tried in the order returned by the request. The first
 * one that validates wins. If none does, the fallbacks collected for those
 * languages are tried in the same way. A truthy result is memoized in
 * mHeaderVariant.
 *
 * @return string|null Variant if one found, null otherwise
 */
protected function getHeaderVariant() {
	if ( $this->mHeaderVariant ) {
		return $this->mHeaderVariant;
	}

	// See if some supported language variant is set in the HTTP header.
	$acceptLanguages = array_keys( RequestContext::getMain()->getRequest()->getAcceptLang() );
	if ( !$acceptLanguages ) {
		return null;
	}

	$pendingFallbacks = [];
	foreach ( $acceptLanguages as $code ) {
		$this->mHeaderVariant = $this->validateVariant( $code );
		if ( $this->mHeaderVariant ) {
			return $this->mHeaderVariant;
		}

		// Not a variant itself: remember its fallbacks so they can be
		// tried once every header language has been checked directly.
		$fallbacks = $this->getVariantFallbacks( $code );
		if ( is_array( $fallbacks ) ) {
			$pendingFallbacks = array_merge( $pendingFallbacks, $fallbacks );
		} elseif (
			is_string( $fallbacks ) &&
			$fallbacks !== $this->getStaticDefaultVariant()
		) {
			$pendingFallbacks[] = $fallbacks;
		}
	}

	// No header language matched directly; process fallback languages now.
	foreach ( array_unique( $pendingFallbacks ) as $code ) {
		$this->mHeaderVariant = $this->validateVariant( $code );
		if ( $this->mHeaderVariant ) {
			break;
		}
	}

	return $this->mHeaderVariant;
}
465 | |
/**
 * Convert the plain-text parts of $text to a variant, leaving markup alone.
 *
 * The text is scanned with a regular expression that matches the pieces that
 * must not be converted (HTML tags, <code>/<script>/<pre> elements, HTML
 * entities and parser markers). The text between those pieces is collected
 * and passed to translate() in a single batch; the matched pieces are copied
 * through, except that "title" and "alt" attribute values that do not
 * contain "://" are converted via recursiveConvertTopLevel().
 *
 * The input is returned unchanged when no target variant can be determined,
 * when guessVariant() returns a truthy value for it, or when the regular
 * expression fails to match (in which case an error is logged).
 *
 * @param string $text Text to convert
 * @param string|false $toVariant Target variant code; when falsy, the result
 *  of getPreferredVariant() is used
 * @return string Converted text
 */
public function autoConvert( $text, $toVariant = false ) {
	$this->loadTables();

	if ( !$toVariant ) {
		$toVariant = $this->getPreferredVariant();
		if ( !$toVariant ) {
			return $text;
		}
	}

	if ( $this->guessVariant( $text, $toVariant ) ) {
		return $text;
	}
	/**
	 * We convert everything except:
	 * 1. HTML markups (anything between < and >)
	 * 2. HTML entities
	 * 3. placeholders created by the parser
	 * IMPORTANT: Beware of failure from pcre.backtrack_limit (T124404).
	 * Minimize the use of backtracking where possible.
	 */
	// The pattern does not depend on any argument, so it is built once and
	// kept in a function-static variable.
	static $reg;
	if ( $reg === null ) {
		$marker = '|' . Parser::MARKER_PREFIX . '[^\x7f]++\x7f';

		// this one is needed when the text is inside an HTML markup
		$htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>';

		// Optimize for the common case where these tags have
		// few or no children. Thus try and possessively get as much as
		// possible, and only engage in backtracking when we hit a '<'.

		// disable convert to variants between <code> tags
		$codefix = '<code>[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|';
		// disable conversion of <script> tags
		$scriptfix = '<script[^>]*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|';
		// disable conversion of <pre> tags
		$prefix = '<pre[^>]*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
		// The "|.*+)" at the end, is in case we missed some part of html syntax,
		// we will fail securely (hopefully) by matching the rest of the string.
		$htmlFullTag = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)|';

		$reg = '/' . $codefix . $scriptfix . $prefix . $htmlFullTag .
			'&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
	}
	$startPos = 0;
	// Text runs queued for translation, each terminated by \000
	$sourceBlob = '';
	// Matched elements that are copied through, each terminated by \000
	$literalBlob = '';

	// Guard against delimiter nulls in the input
	// (should never happen: see T159174)
	$text = str_replace( "\000", '', $text );
	$text = str_replace( "\004", '', $text );

	$markupMatches = null;
	$elementMatches = null;

	// We add a marker (\004) at the end of text, to ensure we always match the
	// entire text (Otherwise, pcre.backtrack_limit might cause silent failure)
	$textWithMarker = $text . "\004";
	while ( $startPos < strlen( $text ) ) {
		if ( preg_match( $reg, $textWithMarker, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
			$elementPos = $markupMatches[0][1];
			$element = $markupMatches[0][0];
			if ( $element === "\004" ) {
				// We hit the end.
				$elementPos = strlen( $text );
				$element = '';
			} elseif ( substr( $element, -1 ) === "\004" ) {
				// This can sometimes happen if we have
				// unclosed html tags. For example,
				// when converting a title attribute
				// during a recursive call that contains
				// a "<", e.g. <div title="&lt;">.
				$element = substr( $element, 0, -1 );
			}
		} else {
			// If we hit here, then Language Converter could be tricked
			// into doing an XSS, so we refuse to translate.
			// If expected input manages to reach this code path,
			// we should consider it a bug.
			$log = LoggerFactory::getInstance( 'languageconverter' );
			$log->error( "Hit pcre.backtrack_limit in " . __METHOD__
				. ". Disabling language conversion for this page.",
				[
					"method" => __METHOD__,
					"variant" => $toVariant,
					"startOfText" => substr( $text, 0, 500 )
				]
			);
			return $text;
		}
		// Queue the part before the markup for translation in a batch
		$sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";

		// Advance to the next position
		$startPos = $elementPos + strlen( $element );

		// Translate any alt or title attributes inside the matched element
		if ( $element !== ''
			&& preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches )
		) {
			// FIXME, this decodes entities, so if you have something
			// like <div title="foo&lt;bar"> the bar won't get
			// translated since after entity decoding it looks like
			// unclosed html and we call this method recursively
			// on attributes.
			$attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] );
			// Ensure self-closing tags stay self-closing.
			$close = substr( $elementMatches[2], -1 ) === '/' ? ' /' : '';
			$changed = false;
			foreach ( [ 'title', 'alt' ] as $attrName ) {
				if ( !isset( $attrs[$attrName] ) ) {
					continue;
				}
				$attr = $attrs[$attrName];
				// Don't convert URLs
				if ( !str_contains( $attr, '://' ) ) {
					$attr = $this->recursiveConvertTopLevel( $attr, $toVariant );
				}

				if ( $attr !== $attrs[$attrName] ) {
					$attrs[$attrName] = $attr;
					$changed = true;
				}
			}
			// Only rebuild the tag when a conversion actually changed an
			// attribute; otherwise the original markup is kept verbatim.
			if ( $changed ) {
				// @phan-suppress-next-line SecurityCheck-DoubleEscaped Explained above with decodeTagAttributes
				$element = $elementMatches[1] . Html::expandAttributes( $attrs ) .
					$close . $elementMatches[3];
			}
		}
		$literalBlob .= $element . "\000";
	}

	// Do the main translation batch
	$translatedBlob = $this->translate( $sourceBlob, $toVariant );

	// Put the output back together by interleaving translated runs with the
	// literal elements that followed them
	$translatedIter = StringUtils::explode( "\000", $translatedBlob );
	$literalIter = StringUtils::explode( "\000", $literalBlob );
	$output = '';
	while ( $translatedIter->valid() && $literalIter->valid() ) {
		$output .= $translatedIter->current();
		$output .= $literalIter->current();
		$translatedIter->next();
		$literalIter->next();
	}

	return $output;
}
617 | |
/**
 * Translate a string to a variant using that variant's conversion table.
 *
 * Text that is empty or consists only of characters removed by trim() is
 * returned untouched, without loading the conversion tables.
 *
 * @param string $text Text to convert
 * @param string $variant Variant code; used as index into the conversion tables
 * @return string Translated text
 */
public function translate( $text, $variant ) {
	// Compare against '' explicitly. A bare truthiness test would also treat
	// the text "0" as empty and skip it, although it is neither empty nor
	// whitespace-only.
	if ( trim( $text ) !== '' ) {
		$this->loadTables();
		$text = $this->mTables[$variant]->replace( $text );
	}
	return $text;
}
627 | |
/**
 * Translate text while leaving roman numerals as they are.
 *
 * The text is split on roman numerals (together with the non-word byte that
 * delimits them on either side). The pieces between the numerals are
 * translated; the numerals and their delimiters are copied through verbatim.
 *
 * @param string $text Text to convert
 * @param string $variant Variant language code
 * @return string Translated text
 * @throws RuntimeException If no conversion table exists for $variant
 */
protected function translateWithoutRomanNumbers( $text, $variant ) {
	// A single byte that is neither a word character nor in the 0x80-0xff range
	$breaks = '[^\w\x80-\xff]';

	// regexp for roman numbers
	// Lookahead assertion ensures $roman doesn't match the empty string
	$roman = '(?=[MDCLXVI])M{0,4}(C[DM]|D?C{0,3})(X[LC]|L?X{0,3})(I[VX]|V?I{0,3})';

	// A numeral counts only when it is bounded on each side by a break
	// character or by the start/end of the text.
	$alternatives = [
		'^' . $roman . '$',
		'^' . $roman . $breaks,
		$breaks . $roman . '$',
		$breaks . $roman . $breaks,
	];
	$pattern = '/' . implode( '|', $alternatives ) . '/';

	// Each piece is a [ text, byte offset ] pair
	$pieces = preg_split( $pattern, $text, -1, PREG_SPLIT_OFFSET_CAPTURE );
	$first = array_shift( $pieces );

	$this->loadTables();
	if ( !isset( $this->mTables[$variant] ) ) {
		throw new RuntimeException( "Broken variant table: "
			. implode( ',', array_keys( $this->mTables ) ) );
	}

	$result = $this->mTables[$variant]->replace( $first[0] );
	// Byte position in $text just past the last piece handled so far
	$cursor = (int)$first[1] + strlen( $first[0] );
	foreach ( $pieces as $piece ) {
		$pieceStart = (int)$piece[1];
		// Copy the separator (numeral plus delimiters) without translating it
		$result .= substr( $text, $cursor, $pieceStart - $cursor );
		$result .= $this->translate( $piece[0], $variant );
		$cursor = $pieceStart + strlen( $piece[0] );
	}

	return $result;
}
661 | |
/**
 * Translate a string into every enabled variant.
 *
 * @param string $text Text to convert
 * @return array Map of variant code (from getVariants()) to the result of
 *  translate() for that variant
 */
public function autoConvertToAllVariants( $text ) {
	$this->loadTables();

	$converted = [];
	foreach ( $this->getVariants() as $variantCode ) {
		$converted[$variantCode] = $this->translate( $text, $variantCode );
	}

	return $converted;
}
672 | |
673 | /** |
674 | &n |