// mwtitle/php.rs
/*
Copyright (C) 2016, 2019 Ed Sanders
Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*!
 * Non-Unicode uppercase mapping used in MediaWiki titles.
 *
 * Some MediaWiki titles have their first code point (letter) uppercased.
 * Most code points are uppercased according to the [Unicode uppercase mapping],
 * but some have a different uppercase mapping,
 * following the `mb_strtoupper` function from PHP version 7.2 and earlier.
 * In [PHP 7.3], `mb_strtoupper` was updated to follow Unicode casemapping,
 * but MediaWiki titles still use the old uppercasing for compatibility.
 * `ALREADY_UPPERCASE` lists code points that are unchanged by uppercasing
 * and `to_uppercase` maps some code points to their non-Unicode uppercasing.
 *
 * In general, MediaWiki title casing doesn't uppercase one code point to two or three code points
 * as Unicode does with some code points (see [SpecialCasing.txt] for a full list).
 * It leaves some of these code points unchanged,
 * like ﬀ (U+FB00 LATIN SMALL LIGATURE FF),
 * whose Unicode uppercase is
 * FF (U+0046 LATIN CAPITAL LETTER F, U+0046 LATIN CAPITAL LETTER F).
 * It maps other code points to a different single code point,
 * like ᾳ (U+1FB3 GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI),
 * whose Unicode uppercase is
 * ΑΙ (U+0391 GREEK CAPITAL LETTER ALPHA, U+0399 GREEK CAPITAL LETTER IOTA)
 * but whose MediaWiki title uppercase is
 * ᾼ (U+1FBC GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI).
 *
 * MediaWiki title casing also leaves some code points unchanged
 * even when Unicode gives them a one-to-one uppercase mapping.
 * Some of these code points had uppercase mappings
 * in [UnicodeData.txt] for version 1.1 of Unicode,
 * like ⓐ (U+24D0 CIRCLED LATIN SMALL LETTER A)
 * to Ⓐ (U+24B6 CIRCLED LATIN CAPITAL LETTER A).
 * Others had uppercase mappings added in a later version of Unicode;
 * for instance, lowercase ა (U+10D0 GEORGIAN LETTER AN) began to be mapped
 * to uppercase Ა (U+1C90 GEORGIAN MTAVRULI CAPITAL LETTER AN) in Unicode 11.0 (2018-06-05).
 *
 * [PHP 7.3]: https://www.zend.com/blog/guide-to-php-73
 * [SpecialCasing.txt]: http://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
 * [UnicodeData.txt]: http://ftp.unicode.org/Public/1.1-Update/UnicodeData-1.1.5.txt
 * [Unicode uppercase mapping]: https://www.unicode.org/reports/tr44/#Casemapping
 */

/// Characters that PHP 7.2 and earlier considers to be uppercase (i.e. leaves
/// unchanged when uppercasing) but Unicode does not.
///
/// The entries are listed in ascending code point order. Entries whose glyphs
/// are easily damaged by text normalization or ligature expansion (Latin
/// ligatures, a lone combining mark, and canonically equivalent Greek letters)
/// are written as `\u{…}` escapes so that each one stays a single, distinct
/// code point regardless of how the file is edited or transported.
///
/// Taken from MediaWiki's mediawiki.Title/phpCharToUpper.json @ 58233ac5af17d
pub(crate) const ALREADY_UPPERCASE: [char; 204] = [
    // U+00DF, U+0149 (n preceded by apostrophe), U+01F0, U+0282,
    // U+0345 (combining ypogegrammeni), U+0390 and U+03B0 (Greek letters with
    // dialytika and tonos), U+0587.
    'ß', '\u{0149}', 'ǰ', 'ʂ', '\u{0345}', '\u{0390}', '\u{03B0}', 'և',
    // Georgian: U+10D0..=U+10FA and U+10FD..=U+10FF.
    'ა', 'ბ', 'გ', 'დ', 'ე', 'ვ', 'ზ', 'თ', 'ი', 'კ', 'ლ', 'მ', 'ნ', 'ო', 'პ',
    'ჟ', 'რ', 'ს', 'ტ', 'უ', 'ფ', 'ქ', 'ღ', 'ყ', 'შ', 'ჩ', 'ც', 'ძ', 'წ', 'ჭ',
    'ხ', 'ჯ', 'ჰ', 'ჱ', 'ჲ', 'ჳ', 'ჴ', 'ჵ', 'ჶ', 'ჷ', 'ჸ', 'ჹ', 'ჺ',
    'ჽ', 'ჾ', 'ჿ',
    // Latin: U+1D8E and U+1E96..=U+1E9A.
    'ᶎ', 'ẖ', 'ẗ', 'ẘ', 'ẙ', 'ẚ',
    // Greek Extended.
    'ὐ', 'ὒ', 'ὔ', 'ὖ',
    'ᾈ', 'ᾉ', 'ᾊ', 'ᾋ', 'ᾌ', 'ᾍ', 'ᾎ', 'ᾏ',
    'ᾘ', 'ᾙ', 'ᾚ', 'ᾛ', 'ᾜ', 'ᾝ', 'ᾞ', 'ᾟ',
    'ᾨ', 'ᾩ', 'ᾪ', 'ᾫ', 'ᾬ', 'ᾭ', 'ᾮ', 'ᾯ',
    'ᾲ', 'ᾴ', 'ᾶ', 'ᾷ', 'ᾼ',
    'ῂ', 'ῄ', 'ῆ', 'ῇ', 'ῌ',
    // U+1FD3 is canonically equivalent to U+0390, so it is spelled as an escape
    // to keep it from being normalized into a duplicate of the earlier entry.
    'ῒ', '\u{1FD3}', 'ῖ', 'ῗ',
    // Likewise U+1FE3 is canonically equivalent to U+03B0.
    'ῢ', '\u{1FE3}', 'ῤ', 'ῦ', 'ῧ',
    'ῲ', 'ῴ', 'ῶ', 'ῷ', 'ῼ',
    // Small Roman numerals: U+2170..=U+217F.
    'ⅰ', 'ⅱ', 'ⅲ', 'ⅳ', 'ⅴ', 'ⅵ', 'ⅶ', 'ⅷ', 'ⅸ', 'ⅹ', 'ⅺ', 'ⅻ', 'ⅼ', 'ⅽ', 'ⅾ', 'ⅿ',
    // Circled Latin small letters: U+24D0..=U+24E9.
    'ⓐ', 'ⓑ', 'ⓒ', 'ⓓ', 'ⓔ', 'ⓕ', 'ⓖ', 'ⓗ', 'ⓘ', 'ⓙ', 'ⓚ', 'ⓛ', 'ⓜ',
    'ⓝ', 'ⓞ', 'ⓟ', 'ⓠ', 'ⓡ', 'ⓢ', 'ⓣ', 'ⓤ', 'ⓥ', 'ⓦ', 'ⓧ', 'ⓨ', 'ⓩ',
    // Latin Extended-D.
    'ꞔ', 'ꞹ', 'ꞻ', 'ꞽ', 'ꞿ', 'ꟃ',
    // Latin ligatures U+FB00..=U+FB06 (ff, fi, fl, ffi, ffl, long s t, st).
    '\u{FB00}', '\u{FB01}', '\u{FB02}', '\u{FB03}', '\u{FB04}', '\u{FB05}', '\u{FB06}',
    // Armenian ligatures: U+FB13..=U+FB17.
    'ﬓ', 'ﬔ', 'ﬕ', 'ﬖ', 'ﬗ',
    // Medefaidrin small letters: U+16E60..=U+16E7F.
    '𖹠', '𖹡', '𖹢', '𖹣', '𖹤', '𖹥', '𖹦', '𖹧', '𖹨', '𖹩', '𖹪', '𖹫', '𖹬', '𖹭', '𖹮', '𖹯',
    '𖹰', '𖹱', '𖹲', '𖹳', '𖹴', '𖹵', '𖹶', '𖹷', '𖹸', '𖹹', '𖹺', '𖹻', '𖹼', '𖹽', '𖹾', '𖹿',
];

/// Maps a code point to its PHP 7.2-style uppercase when that differs from
/// the Unicode uppercase mapping.
///
/// Returns `Some(upper)` for the Greek letters with ypogegrammeni listed
/// below, each of which maps to a single code point, and `None` for every
/// other input. `None` is returned instead of falling back to
/// `input.to_uppercase()` because that yields an iterator of chars rather
/// than a single one.
///
/// Taken from MediaWiki's mediawiki.Title/phpCharToUpper.json @ 58233ac5af17d
pub(crate) fn to_uppercase(input: char) -> Option<char> {
    // Distance, in code points, from the input to its mapped uppercase form.
    let offset: u32 = match input {
        // U+1F80..=U+1F87 -> U+1F88..=U+1F8F (alpha with ypogegrammeni),
        // U+1F90..=U+1F97 -> U+1F98..=U+1F9F (eta with ypogegrammeni),
        // U+1FA0..=U+1FA7 -> U+1FA8..=U+1FAF (omega with ypogegrammeni).
        '\u{1F80}'..='\u{1F87}' | '\u{1F90}'..='\u{1F97}' | '\u{1FA0}'..='\u{1FA7}' => 8,
        // U+1FB3 -> U+1FBC, U+1FC3 -> U+1FCC, U+1FF3 -> U+1FFC.
        '\u{1FB3}' | '\u{1FC3}' | '\u{1FF3}' => 9,
        _ => return None,
    };
    char::from_u32(u32::from(input) + offset)
}