// mwtitle/php.rs
/*
Copyright (C) 2016, 2019 Ed Sanders
Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*!
* Non-Unicode uppercase mapping used in MediaWiki titles.
*
* Some MediaWiki titles have their first code point (letter) uppercased.
* Most code points are uppercased according to the [Unicode uppercase mapping],
* but some have a different uppercase mapping,
* following the `mb_strtoupper` function from PHP version 7.2 and earlier.
* In [PHP 7.3], `mb_strtoupper` was updated to follow Unicode casemapping,
* but MediaWiki titles still use the old uppercasing for compatibility.
* `ALREADY_UPPERCASE` lists code points that are unchanged by uppercasing
* and `to_uppercase` maps some code points to their non-Unicode uppercasing.
*
* In general, MediaWiki title casing doesn't uppercase one code point to two or three code points
* as Unicode does with some code points (see [SpecialCasing.txt] for a full list).
* It leaves some of these code points unchanged,
* like ﬀ (U+FB00 LATIN SMALL LIGATURE FF),
* whose Unicode uppercase is
* FF (U+0046 LATIN CAPITAL LETTER F, U+0046 LATIN CAPITAL LETTER F).
* It maps other code points to a different single code point,
* like ᾳ (U+1FB3 GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI),
* whose Unicode uppercase is
* ΑΙ (U+0391 GREEK CAPITAL LETTER ALPHA, U+0399 GREEK CAPITAL LETTER IOTA)
* but whose MediaWiki title uppercase is
* ᾼ (U+1FBC GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI).
*
* MediaWiki title casing also leaves some code points unchanged
* even when Unicode gives them a one-to-one uppercase mapping.
* Some of these code points had uppercase mappings
* in [UnicodeData.txt] for version 1.1 of Unicode,
* like ⓐ (U+24D0 CIRCLED LATIN SMALL LETTER A)
* to Ⓐ (U+24B6 CIRCLED LATIN CAPITAL LETTER A).
* Others had uppercase mappings added in a later version of Unicode;
* for instance, lowercase ა (U+10D0 GEORGIAN LETTER AN) began to be mapped
* to uppercase Ა (U+1C90 GEORGIAN MTAVRULI CAPITAL LETTER AN) in Unicode 11.0 (2018-06-05).
*
* [PHP 7.3]: https://www.zend.com/blog/guide-to-php-73
* [SpecialCasing.txt]: http://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
* [UnicodeData.txt]: http://ftp.unicode.org/Public/1.1-Update/UnicodeData-1.1.5.txt
* [Unicode uppercase mapping]: https://www.unicode.org/reports/tr44/#Casemapping
*/
/// Characters that PHP 7.2 and earlier considers to be uppercase but Unicode does not.
///
/// Uppercasing any of these code points for a MediaWiki title leaves it unchanged.
///
/// Entries are listed in ascending code point order. They are written as
/// `\u{...}` escapes rather than literal glyphs because several of them do not
/// survive Unicode normalization or ligature expansion of the source text:
/// U+0149 and U+FB00..=U+FB06 expand to more than one code point (which is not
/// a valid `char` literal), and U+1FD3 / U+1FE3 normalize to U+0390 / U+03B0,
/// which would silently turn them into duplicates of those entries.
///
/// Taken from MediaWiki's mediawiki.Title/phpCharToUpper.json @ 58233ac5af17d
pub(crate) const ALREADY_UPPERCASE: [char; 204] = [
    // Isolated Latin, combining, Greek and Armenian code points.
    '\u{00DF}', '\u{0149}', '\u{01F0}', '\u{0282}', '\u{0345}', '\u{0390}', '\u{03B0}', '\u{0587}',
    // Georgian letters U+10D0..=U+10FA.
    '\u{10D0}', '\u{10D1}', '\u{10D2}', '\u{10D3}', '\u{10D4}', '\u{10D5}', '\u{10D6}', '\u{10D7}',
    '\u{10D8}', '\u{10D9}', '\u{10DA}', '\u{10DB}', '\u{10DC}', '\u{10DD}', '\u{10DE}', '\u{10DF}',
    '\u{10E0}', '\u{10E1}', '\u{10E2}', '\u{10E3}', '\u{10E4}', '\u{10E5}', '\u{10E6}', '\u{10E7}',
    '\u{10E8}', '\u{10E9}', '\u{10EA}', '\u{10EB}', '\u{10EC}', '\u{10ED}', '\u{10EE}', '\u{10EF}',
    '\u{10F0}', '\u{10F1}', '\u{10F2}', '\u{10F3}', '\u{10F4}', '\u{10F5}', '\u{10F6}', '\u{10F7}',
    '\u{10F8}', '\u{10F9}', '\u{10FA}',
    // Georgian letters U+10FD..=U+10FF.
    '\u{10FD}', '\u{10FE}', '\u{10FF}',
    // Latin: U+1D8E, then U+1E96..=U+1E9A.
    '\u{1D8E}',
    '\u{1E96}', '\u{1E97}', '\u{1E98}', '\u{1E99}', '\u{1E9A}',
    // Greek Extended: upsilon with psili (and accents).
    '\u{1F50}', '\u{1F52}', '\u{1F54}', '\u{1F56}',
    // Greek Extended: capitals with prosgegrammeni (alpha, eta, omega rows).
    '\u{1F88}', '\u{1F89}', '\u{1F8A}', '\u{1F8B}', '\u{1F8C}', '\u{1F8D}', '\u{1F8E}', '\u{1F8F}',
    '\u{1F98}', '\u{1F99}', '\u{1F9A}', '\u{1F9B}', '\u{1F9C}', '\u{1F9D}', '\u{1F9E}', '\u{1F9F}',
    '\u{1FA8}', '\u{1FA9}', '\u{1FAA}', '\u{1FAB}', '\u{1FAC}', '\u{1FAD}', '\u{1FAE}', '\u{1FAF}',
    // Greek Extended: remaining letters in U+1FB2..=U+1FFC.
    '\u{1FB2}', '\u{1FB4}', '\u{1FB6}', '\u{1FB7}', '\u{1FBC}',
    '\u{1FC2}', '\u{1FC4}', '\u{1FC6}', '\u{1FC7}', '\u{1FCC}',
    '\u{1FD2}', '\u{1FD3}', '\u{1FD6}', '\u{1FD7}',
    '\u{1FE2}', '\u{1FE3}', '\u{1FE4}', '\u{1FE6}', '\u{1FE7}',
    '\u{1FF2}', '\u{1FF4}', '\u{1FF6}', '\u{1FF7}', '\u{1FFC}',
    // Small Roman numerals U+2170..=U+217F.
    '\u{2170}', '\u{2171}', '\u{2172}', '\u{2173}', '\u{2174}', '\u{2175}', '\u{2176}', '\u{2177}',
    '\u{2178}', '\u{2179}', '\u{217A}', '\u{217B}', '\u{217C}', '\u{217D}', '\u{217E}', '\u{217F}',
    // Circled Latin small letters U+24D0..=U+24E9.
    '\u{24D0}', '\u{24D1}', '\u{24D2}', '\u{24D3}', '\u{24D4}', '\u{24D5}', '\u{24D6}', '\u{24D7}',
    '\u{24D8}', '\u{24D9}', '\u{24DA}', '\u{24DB}', '\u{24DC}', '\u{24DD}', '\u{24DE}', '\u{24DF}',
    '\u{24E0}', '\u{24E1}', '\u{24E2}', '\u{24E3}', '\u{24E4}', '\u{24E5}', '\u{24E6}', '\u{24E7}',
    '\u{24E8}', '\u{24E9}',
    // Latin Extended-D.
    '\u{A794}', '\u{A7B9}', '\u{A7BB}', '\u{A7BD}', '\u{A7BF}', '\u{A7C3}',
    // Latin ligatures U+FB00..=U+FB06.
    '\u{FB00}', '\u{FB01}', '\u{FB02}', '\u{FB03}', '\u{FB04}', '\u{FB05}', '\u{FB06}',
    // Armenian ligatures U+FB13..=U+FB17.
    '\u{FB13}', '\u{FB14}', '\u{FB15}', '\u{FB16}', '\u{FB17}',
    // Supplementary-plane letters U+16E60..=U+16E7F.
    '\u{16E60}', '\u{16E61}', '\u{16E62}', '\u{16E63}',
    '\u{16E64}', '\u{16E65}', '\u{16E66}', '\u{16E67}',
    '\u{16E68}', '\u{16E69}', '\u{16E6A}', '\u{16E6B}',
    '\u{16E6C}', '\u{16E6D}', '\u{16E6E}', '\u{16E6F}',
    '\u{16E70}', '\u{16E71}', '\u{16E72}', '\u{16E73}',
    '\u{16E74}', '\u{16E75}', '\u{16E76}', '\u{16E77}',
    '\u{16E78}', '\u{16E79}', '\u{16E7A}', '\u{16E7B}',
    '\u{16E7C}', '\u{16E7D}', '\u{16E7E}', '\u{16E7F}',
];
/// Returns the PHP 7.2-style uppercase of `input` when it differs from the Unicode one.
///
/// Only the Greek small letters with ypogegrammeni are mapped: U+1F80..=U+1F87,
/// U+1F90..=U+1F97 and U+1FA0..=U+1FA7 go to the capital eight code points
/// higher, while U+1FB3, U+1FC3 and U+1FF3 go to the capital nine code points
/// higher (U+1FBC, U+1FCC and U+1FFC). Every other input yields `None`.
///
/// Taken from MediaWiki's mediawiki.Title/phpCharToUpper.json @ 58233ac5af17d
pub(crate) fn to_uppercase(input: char) -> Option<char> {
    // Each mapped capital sits at a fixed distance above its small letter,
    // so the table collapses to two offsets.
    let offset: u32 = match input {
        '\u{1F80}'..='\u{1F87}' | '\u{1F90}'..='\u{1F97}' | '\u{1FA0}'..='\u{1FA7}' => 8,
        '\u{1FB3}' | '\u{1FC3}' | '\u{1FF3}' => 9,
        // There is deliberately no fallback to `char::to_uppercase` here: that
        // yields an iterator of chars rather than a single one.
        _ => return None,
    };
    char::from_u32(u32::from(input) + offset)
}