Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
95.00% |
19 / 20 |
|
50.00% |
1 / 2 |
CRAP | |
0.00% |
0 / 1 |
| NamespaceUnlocalizer | |
95.00% |
19 / 20 |
|
50.00% |
1 / 2 |
7 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| process | |
94.74% |
18 / 19 |
|
0.00% |
0 / 1 |
6.01 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace FileImporter\Services\Wikitext; |
| 4 | |
| 5 | use MediaWiki\Title\NamespaceInfo; |
| 6 | |
| 7 | /** |
| 8 | * A small parser for wiki links that is able to understand namespace prefixes in a specific |
| 9 | * language, e.g. [[Kategorie:…]] from a German wiki, and unlocalize them to their canonical English |
| 10 | * form, e.g. [[Category:…]]. |
| 11 | * |
| 12 | * As of now, we intentionally do not use MediaWiki's TitleParser infrastructure for a few reasons: |
| 13 | * - It does too many things (most notably extracting known interwiki prefixes) we really don't care |
| 14 | * about here. |
| 15 | * - We don't want to do any normalization on link elements we don't care about (basically everything |
| 16 | * except the namespace) as these would show up as unrelated changes in the diff. |
| 17 | * |
| 18 | * @license GPL-2.0-or-later |
| 19 | */ |
class NamespaceUnlocalizer implements WikiLinkCleaner {

	/**
	 * @param NamespaceNameLookup $namespaceNameLookup Used to look up the namespace index for a
	 *  namespace name found in a link
	 * @param NamespaceInfo $namespaceInfo Used to look up the canonical name for a namespace index
	 */
	public function __construct(
		private readonly NamespaceNameLookup $namespaceNameLookup,
		private readonly NamespaceInfo $namespaceInfo,
	) {
	}

	/**
	 * Replaces the first namespace prefix of a wiki link target with its canonical name.
	 *
	 * Only the prefix (and an optional leading colon with surrounding horizontal whitespace) is
	 * ever touched; everything after the prefix is returned exactly as given. The link is
	 * returned unchanged when no plausible prefix is found, when the prefix does not resolve to
	 * a namespace index, when it resolves to NS_MAIN or NS_PROJECT, or when it already is the
	 * canonical name.
	 *
	 * @param string $link Target of a wiki link, e.g. "Kategorie:Foo"
	 * @return string The link with the prefix unlocalized, or the original link
	 */
	public function process( string $link ): string {
		$result = preg_replace_callback(
			'/^
				# Group 1 captures an optional leading colon, the extra + avoid backtracking
				(\h*+:?\h*+)
				# Ungreedy group 2 captures the first prefix
				([^\v:]+?)
				# Must be followed by a colon and something plausible
				(?=\h*+:[^\v:][^\v]*$)
			/xu',
			function ( array $matches ): string {
				[ $unchanged, $colon, $name ] = $matches;
				// Normalize to use underscores, as this is what the services require
				$name = trim( preg_replace( '/[\s\xA0_]+/u', '_', $name ), '_' );

				$namespaceId = $this->namespaceNameLookup->getIndex( $name );
				if ( !is_int( $namespaceId )
					|| $namespaceId === NS_MAIN
					// The Project namespace shouldn't be "unlocalized" because it is not localized,
					// but configured via $wgMetaNamespace or $wgSitename.
					|| $namespaceId === NS_PROJECT
				) {
					return $unchanged;
				}

				$canonicalName = $this->namespaceInfo->getCanonicalName( $namespaceId );
				if ( !is_string( $canonicalName ) || $canonicalName === $name ) {
					return $unchanged;
				}

				// Canonical names use underscores, but links are written with spaces
				return $colon . str_replace( '_', ' ', $canonicalName );
			},
			$link,
			// Only the first prefix is of interest
			1
		);

		// preg_replace_callback() returns null when PCRE fails, e.g. on malformed UTF-8 in
		// combination with the "u" modifier. Returning that would violate the declared return
		// type and throw a TypeError, so leave such links untouched instead.
		return $result ?? $link;
	}

}