Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
95.24% |
20 / 21 |
|
50.00% |
1 / 2 |
CRAP | |
0.00% |
0 / 1 |
NamespaceUnlocalizer | |
95.24% |
20 / 21 |
|
50.00% |
1 / 2 |
7 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
process | |
94.74% |
18 / 19 |
|
0.00% |
0 / 1 |
6.01 |
1 | <?php |
2 | |
3 | namespace FileImporter\Services\Wikitext; |
4 | |
5 | use MediaWiki\Title\NamespaceInfo; |
6 | |
7 | /** |
8 | * A small parser for wiki links that is able to understand namespace prefixes in a specific |
9 | * language, e.g. [[Kategorie:…]] from a German wiki, and unlocalize them to their canonical English |
10 | * form, e.g. [[Category:…]]. |
11 | * |
12 | * As of now, we intentionally do not use MediaWiki's TitleParser infrastructure for a few reasons: |
13 | * - It does too many things (most notably extracting known interwiki prefixes) we really don't care
14 | * about here. |
15 | * - We don't want to do any normalization on link elements we don't care about (basically everything
16 | * except the namespace) as these would show up as unrelated changes in the diff. |
17 | * |
18 | * @license GPL-2.0-or-later |
19 | */ |
class NamespaceUnlocalizer implements WikiLinkCleaner {

	/** Resolves a (possibly localized) namespace name to a namespace index. */
	private NamespaceNameLookup $namespaceNameLookup;
	/** Provides the canonical name for a namespace index. */
	private NamespaceInfo $namespaceInfo;

	/**
	 * @param NamespaceNameLookup $namespaceNameLookup Used to look up the namespace index of the
	 *  prefix found in a link
	 * @param NamespaceInfo $namespaceInfo Used to get the canonical name of that namespace
	 */
	public function __construct(
		NamespaceNameLookup $namespaceNameLookup,
		NamespaceInfo $namespaceInfo
	) {
		$this->namespaceNameLookup = $namespaceNameLookup;
		$this->namespaceInfo = $namespaceInfo;
	}

	/**
	 * Replaces the first prefix of a link target with the canonical name of the namespace it
	 * refers to. An optional leading colon and the surrounding horizontal whitespace captured
	 * with it are preserved; everything after the prefix is left as it is.
	 *
	 * The prefix is returned unchanged when the lookup does not yield an integer index, when it
	 * resolves to NS_MAIN or NS_PROJECT, when no canonical name is available, or when the
	 * normalized prefix already equals the canonical name.
	 *
	 * @param string $link The link target to process
	 * @return string The processed link target, or the original $link if the regular expression
	 *  could not be applied
	 */
	public function process( string $link ): string {
		$result = preg_replace_callback(
			'/^
				# Group 1 captures an optional leading colon, the extra + avoid backtracking
				(\h*+:?\h*+)
				# Ungreedy group 2 captures the first prefix
				([^\v:]+?)
				# Must be followed by a colon and something plausible
				(?=\h*+:[^\v:][^\v]*$)
			/xu',
			function ( array $matches ): string {
				[ $unchanged, $colon, $name ] = $matches;
				// Normalize to use underscores, as this is what the services require
				$name = trim( preg_replace( '/[\s\xA0_]+/u', '_', $name ), '_' );

				$namespaceId = $this->namespaceNameLookup->getIndex( $name );
				if ( !is_int( $namespaceId )
					|| $namespaceId === NS_MAIN
					// The Project namespace shouldn't be "unlocalized" because it is not localized,
					// but configured via $wgMetaNamespace or $wgSitename.
					|| $namespaceId === NS_PROJECT
				) {
					return $unchanged;
				}

				$canonicalName = $this->namespaceInfo->getCanonicalName( $namespaceId );
				if ( !is_string( $canonicalName ) || $canonicalName === $name ) {
					return $unchanged;
				}

				// Canonical names use underscores, wikitext links are written with spaces
				return $colon . str_replace( '_', ' ', $canonicalName );
			},
			$link,
			// Only the first prefix is of interest
			1
		);

		// preg_replace_callback() returns null on failure (e.g. malformed UTF-8 in combination
		// with the /u modifier). Hand back the link untouched in that case instead of violating
		// the declared string return type.
		return $result ?? $link;
	}

}