mwtitle/codec.rs

/*
Copyright (C) Tim Starling
Copyright (C) Daniel Kinzler
Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>
Copyright (C) 2021 Erutuon

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
use crate::ip::sanitize_ip;
use crate::namespace::{NS_SPECIAL, NS_TALK, NS_USER, NS_USER_TALK};
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
use crate::SiteInfoResponse;
use crate::{
    php, Error, Interwiki, InterwikiSet, NamespaceAlias, NamespaceInfo,
    NamespaceMap, Result, SiteInfo, Title, NS_MAIN,
};
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
use flate2::read::GzDecoder;
use regex::bytes::Regex;
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
use std::{fs::File, io::Read, path::Path, sync::Arc};

/// The `TitleCodec` is responsible for parsing, normalizing and formatting
/// `Title`s. See the crate-level documentation for an example of how to
/// construct one.
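///
/// A minimal end-to-end sketch (the siteinfo file path is hypothetical, and
/// `from_path` requires the `utils` feature):
///
/// ```ignore
/// use std::path::Path;
///
/// // Build a codec from a cached `siteinfo-namespaces.json.gz` dump file.
/// let codec =
///     mwtitle::TitleCodec::from_path(Path::new("siteinfo-namespaces.json.gz")).unwrap();
/// // Parse user input into a structured `Title` ...
/// let title = codec.new_title("Talk:sandbox#History").unwrap();
/// // ... and format it back out, normalized; fragments are not included.
/// assert_eq!(codec.to_pretty(&title), "Talk:Sandbox");
/// ```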
#[cfg_attr(docsrs, doc(cfg(feature = "parsing")))]
#[derive(Clone, Debug)]
pub struct TitleCodec {
    namespace_map: NamespaceMap,
    interwiki_set: InterwikiSet,
    local_interwiki_set: InterwikiSet,
    main_page: String,
    lang: String,
    illegal_patterns: Regex,
}

#[test]
fn title_codec_is_send_and_sync() {
    fn assert_send_and_sync<T: Send + Sync>() {}

    assert_send_and_sync::<TitleCodec>();
}

impl TitleCodec {
    /// Create a new title by parsing the provided input.
    pub fn new_title(&self, input: &str) -> Result<Title> {
        self.secure_and_split(input, NS_MAIN)
    }

    /// Create a new title by parsing the provided input. If the title has no
    /// namespace part, then the namespace specified by `default_namespace` is
    /// used instead.
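    ///
    /// # Example
    ///
    /// A sketch, assuming `codec` was built from English Wikipedia's siteinfo
    /// and that `NS_TALK` (the talk namespace ID, 1) is in scope:
    ///
    /// ```ignore
    /// // Bare input falls back to the supplied default namespace ...
    /// let talk = codec.new_title_with_namespace("sandbox", NS_TALK).unwrap();
    /// assert_eq!(codec.to_pretty(&talk), "Talk:Sandbox");
    /// // ... but an explicit prefix in the input always wins.
    /// let user = codec.new_title_with_namespace("User:Example", NS_TALK).unwrap();
    /// assert_eq!(codec.to_pretty(&user), "User:Example");
    /// ```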
    pub fn new_title_with_namespace(
        &self,
        input: &str,
        default_namespace: i32,
    ) -> Result<Title> {
        self.secure_and_split(input, default_namespace)
    }

    /// Create a new title from the numerical namespace ID and the title
    /// portion (dbkey), usually obtained directly from the database.
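    ///
    /// # Example
    ///
    /// A sketch, assuming `codec` was built from English Wikipedia's siteinfo;
    /// the arguments mirror the `page_namespace` and `page_title` columns:
    ///
    /// ```ignore
    /// // Namespace 2 is the `User:` namespace on a default wiki.
    /// let title = codec.new_title_from_database(2, "Example/Archive_1").unwrap();
    /// assert_eq!(codec.to_pretty(&title), "User:Example/Archive 1");
    /// ```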
    pub fn new_title_from_database(
        &self,
        namespace: i32,
        dbkey: &str,
    ) -> Result<Title> {
        match self.namespace_map.get_name(namespace) {
            Some(name) => {
                if name.is_empty() {
                    // No prefixing needed
                    self.new_title(dbkey)
                } else {
                    self.new_title(&format!("{name}:{dbkey}"))
                }
            }
            None => Err(Error::UnknownNamespace(namespace)),
        }
    }

    /// Get a reference to the underlying `NamespaceMap`
    /// to get information about namespaces.
    pub fn namespace_map(&self) -> &NamespaceMap {
        &self.namespace_map
    }

    /// Get the title with namespace in pretty aka text form (spaces).
    ///
    /// Fragments will not be included.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_pretty(&self, title: &Title) -> String {
        self.namespace_map
            .to_pretty(title)
            .expect("unknown namespace")
    }

    /// Get the title with namespace in underscore aka dbkey form. This is
    /// potentially useful when you want to make a database query.
    ///
    /// Fragments will not be included.
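    ///
    /// # Example
    ///
    /// A sketch, assuming `codec` was built from English Wikipedia's siteinfo:
    ///
    /// ```ignore
    /// let title = codec.new_title("Talk:Foo bar#Baz").unwrap();
    /// // Spaces become underscores and the fragment is dropped.
    /// assert_eq!(codec.to_underscores(&title), "Talk:Foo_bar");
    /// ```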
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_underscores(&self, title: &Title) -> String {
        self.namespace_map
            .to_underscores(title)
            .expect("unknown namespace")
    }

    /// Get the title with namespace in pretty aka text form (spaces), with the
    /// fragment, if one exists, appended.
    ///
    /// # Panics
    ///
    /// This will panic if the `Title` is in a namespace that this `TitleCodec`
    /// is unaware of.
    pub fn to_pretty_with_fragment(&self, title: &Title) -> String {
        self.namespace_map
            .to_pretty_with_fragment(title)
            .expect("unknown namespace")
    }

    /// Construct a new `TitleCodec` using the given fields.
    ///
    /// In most cases it is easier to do so from one of the siteinfo methods.
    pub fn new(
        namespace_map: NamespaceMap,
        interwiki_set: InterwikiSet,
        local_interwiki_set: InterwikiSet,
        main_page: String,
        lang: String,
        legal_title_chars: String,
    ) -> Result<Self> {
        // Copied from `MediaWikiTitleCodec::getTitleInvalidRegex()`.
        // The `legal_title_chars` portion has to be changed when this lands:
        // https://phabricator.wikimedia.org/T297340
        // Titles matching this regex are treated as illegal.
        let illegal_patterns = Regex::new(&format!(
            r"(?x-u)
                # x: ignore whitespace and allow comments;
                # -u: disable code point matching
                # so that \x80-\xff match bytes 0x80-0xFF
                # (corresponding to all non-ASCII code points, U+0080-U+10FFFF)
                # rather than code points U+0080-U+00FF.
                    # Any character not allowed is forbidden...
                    [^{legal_title_chars}]

                    # URL percent encoding sequences interfere with the ability
                    # to round-trip titles -- you can't link to them consistently.
                    | %[0-9A-Fa-f]{{2}}

                    # XML/HTML character references produce similar issues.
                    | &[A-Za-z0-9\x80-\xff]+;
                ",
            // / does not need to be escaped as \/ in Rust regex.
            legal_title_chars = legal_title_chars.replace(r"\/", "/")
        ))?;

        Ok(Self {
            namespace_map,
            interwiki_set,
            local_interwiki_set,

            illegal_patterns,
            main_page,
            lang,
        })
    }

    /// Create a new `TitleCodec` getting namespaces, namespace aliases, and interwikis from iterators.
    pub fn new_from_iters<
        N: IntoIterator<Item = NamespaceInfo>,
        A: IntoIterator<Item = NamespaceAlias>,
        I: IntoIterator<Item = Interwiki>,
    >(
        namespaces: N,
        namespace_aliases: A,
        interwikis: I,
        main_page: String,
        lang: String,
        legal_title_chars: String,
    ) -> Result<Self> {
        let (interwiki_set, local_interwiki_set) =
            InterwikiSet::all_and_local_from_iter(interwikis);
        let namespace_map =
            NamespaceMap::from_namespaces_and_namespace_aliases(
                namespaces,
                namespace_aliases,
            )?;
        Self::new(
            namespace_map,
            interwiki_set,
            local_interwiki_set,
            main_page,
            lang,
            legal_title_chars,
        )
    }

    /// Creates a `TitleCodec` by parsing the contents of a JSON or GZipped JSON file.
    ///
    /// Will accept the `siteinfo-namespaces.json.gz` file from the Wikimedia dumps.
    /// If the file extension is `gz`, decompresses from the GZip format before deserializing the JSON;
    /// otherwise attempts to deserialize the file contents directly.
    #[cfg(feature = "utils")]
    #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
    pub fn from_path(path: &Path) -> Result<Self> {
        let json = if path.extension() == Some("gz".as_ref()) {
            let gz = File::open(path)
                .map_err(|source| Error::from_io("open file", source, path))?;
            let mut decoder = GzDecoder::new(gz);
            let mut decoded = String::new();
            decoder
                .read_to_string(&mut decoded)
                .map_err(|source| Error::from_io("parse GZip", source, path))?;
            decoded
        } else {
            std::fs::read_to_string(path).map_err(|source| {
                Error::from_io("read file to string", source, path)
            })?
        };
        Self::from_json_with_path(&json, Some(path))
    }

    /// Creates a `TitleCodec` by parsing the contents of a `Read` type that contains the JSON
    /// representation of a [`SiteInfoResponse`].
    #[cfg(feature = "utils")]
    #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
    pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
        Self::from_site_info(
            serde_json::from_reader::<R, SiteInfoResponse>(reader)
                .map_err(|source| Error::Json {
                    source: Arc::new(source),
                })?
                .query,
        )
    }

    /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
    #[cfg(feature = "utils")]
    #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
    pub fn from_json<S: AsRef<str>>(json: S) -> Result<Self> {
        Self::from_json_with_path(json.as_ref(), None)
    }

    /// Creates a `TitleCodec` by parsing the JSON representation of a [`SiteInfoResponse`].
    ///
    /// # Errors
    ///
    /// If this fails and `path` is `Some(_)`, gives an error message
    /// that mentions `path`.
    #[cfg(feature = "utils")]
    #[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
    fn from_json_with_path(json: &str, path: Option<&Path>) -> Result<Self> {
        Self::from_site_info(
            serde_json::from_str::<SiteInfoResponse>(json)
                .map_err(|source| {
                    let source = Arc::new(source);
                    if let Some(path) = path {
                        Error::JsonFile {
                            source,
                            path: path.into(),
                        }
                    } else {
                        Error::Json { source }
                    }
                })?
                .query,
        )
    }

    /// Create a new `TitleCodec` using the provided [`SiteInfo`].
    ///
    /// The `SiteInfo` must include a non-empty `interwiki_map` field for the
    /// resulting `TitleCodec` to correctly parse titles with interwiki
    /// prefixes; an empty `interwiki_map` is nevertheless accepted and is not
    /// an error.
    pub fn from_site_info(site_info: SiteInfo) -> Result<Self> {
        Self::new_from_iters(
            site_info.namespaces.into_values(),
            site_info.namespace_aliases,
            site_info.interwiki_map,
            site_info.general.main_page,
            site_info.general.lang,
            site_info.general.legal_title_chars,
        )
    }

    /// Equivalent of `MediaWikiTitleCodec::splitTitleString()`.
    ///
    /// Most comments are direct copies to make it easier to compare with
    /// the MediaWiki implementation.
    fn secure_and_split(
        &self,
        input: &str,
        default_namespace: i32,
    ) -> Result<Title> {
        let mut namespace = default_namespace;
        // Strip Unicode bidi override characters.
        // Clean up whitespace.
        let mut dbkey = normalize_title_chars(input);
        let mut fragment = None;
        let mut interwiki = None;
        let mut local_interwiki = false;

        // U+FFFD is the replacement character; its presence means the input
        // contained illegal UTF-8 sequences or forbidden Unicode characters.
        if dbkey.contains('\u{FFFD}') {
            return Err(Error::IllegalUtf8(input.to_string()));
        }
        // MediaWiki's own check for invalid UTF-8 is otherwise skipped,
        // because all Rust strings are guaranteed to be valid UTF-8.

        // Initial colon indicates main namespace rather than specified default
        // but should not create invalid {ns,title} pairs such as {0,Project:Foo}
        if dbkey.get(0..1) == Some(":") {
            namespace = NS_MAIN;
            // remove the colon but continue processing
            dbkey.drain(..1);
            // remove any subsequent whitespace
            trim_title_whitespace(&mut dbkey);
        }
        if dbkey.is_empty() {
            return Err(Error::Empty(input.to_string()));
        }

        fn get_nonempty_trimmed(
            s: &str,
            range_to: std::ops::RangeTo<usize>,
        ) -> Option<&str> {
            s.get(range_to)
                .filter(|p| !p.is_empty())
                .map(|s| s.trim_end_matches('_'))
        }

        // Namespace or interwiki prefix
        // `MediaWikiTitleCodec` uses a regex here, but we're going to use string
        // parsing instead.
        loop {
            if let Some(colon_pos) = dbkey.find(':') {
                if let Some(prefix) = get_nonempty_trimmed(&dbkey, ..colon_pos)
                {
                    if let Some(ns) = self.namespace_map.get_id(prefix) {
                        // Ordinary namespace
                        namespace = ns;
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);
                        // For Talk:X pages, check if X has a "namespace" prefix
                        if ns == NS_TALK {
                            if let Some(colon_pos) = dbkey.find(':') {
                                // Disallow Talk:File:x or Talk:Interwiki:x type titles ...
                                if let Some(prefix) =
                                    get_nonempty_trimmed(&dbkey, ..colon_pos)
                                {
                                    if self
                                        .namespace_map
                                        .get_id(prefix)
                                        .is_some()
                                        || self.interwiki_set.contains(prefix)
                                    {
                                        return Err(Error::TalkNamespace(
                                            input.to_string(),
                                        ));
                                    }
                                }
                            }
                        }
                    } else if self.interwiki_set.contains(prefix) {
                        // Check this using prefix before we mutably borrow dbkey
                        let is_local_interwiki =
                            self.local_interwiki_set.contains(prefix);
                        interwiki = Some(prefix.to_lowercase());
                        dbkey.drain(..colon_pos + 1);
                        trim_title_whitespace(&mut dbkey);

                        if is_local_interwiki {
                            if dbkey.is_empty() {
                                // Empty self-links should point to the Main Page, to ensure
                                // compatibility with cross-wiki transclusions and the like.
                                return Ok(self
                                    .new_title(&self.main_page)
                                    .map(|mut title| {
                                        title.local_interwiki = true;
                                        title
                                    })
                                    .unwrap_or_else(|_| {
                                        // Fallback to hardcoded "Main Page" if the configured main page
                                        // value is unparseable
                                        Title {
                                            namespace: NS_MAIN,
                                            dbkey: "Main_Page".to_string(),
                                            fragment: None,
                                            interwiki: None,
                                            local_interwiki: true,
                                        }
                                    }));
                            }
                            interwiki = None;
                            // local interwikis should behave like initial-colon links
                            local_interwiki = true;

                            // Do another namespace split...
                            continue;
                        }

                        // If there's an initial colon after the interwiki, that also
                        // resets the default namespace
                        if dbkey.starts_with(':') {
                            namespace = NS_MAIN;
                            dbkey.drain(..1);
                            trim_title_whitespace(&mut dbkey);
                        }
                    }
                }
            }
            // If there's no recognized interwiki or namespace,
            // then let the colon expression be part of the title.
            break;
        }

        if let Some((key, f)) = dbkey.split_once('#') {
            fragment = Some(f.replace('_', " "));
            let key_len = key.len(); // to satisfy borrow checker
            dbkey.truncate(key_len);
            // remove whitespace again: prevents "Foo_bar_#"
            // becoming "Foo_bar_"
            trim_title_whitespace(&mut dbkey);
        }

        // Reject illegal characters.
        if self.illegal_patterns.is_match(dbkey.as_bytes()) {
            return Err(Error::Characters(input.to_string()));
        }

        // Pages with "/./" or "/../" appearing in the URLs will often be un-
        // reachable due to the way web browsers deal with 'relative' URLs.
        // Also, they conflict with subpage syntax.  Forbid them explicitly.
        if dbkey == "."
            || dbkey == ".."
            || dbkey.starts_with("./")
            || dbkey.starts_with("../")
            || dbkey.contains("/./")
            || dbkey.contains("/../")
            || dbkey.ends_with("/.")
            || dbkey.ends_with("/..")
        {
            return Err(Error::Relative(input.to_string()));
        }

        // Magic tilde sequences? Nu-uh!
        if dbkey.contains("~~~") {
            return Err(Error::MagicTildes(input.to_string()));
        }

        // Limit the size of titles to 255 bytes. This is typically the size of the
        // underlying database field. We make an exception for special pages, which
        // don't need to be stored in the database, and may edge over 255 bytes due
        // to subpage syntax for long titles, e.g. [[Special:Block/Long name]]
        let max_length = if namespace == NS_SPECIAL { 512 } else { 255 };
        if dbkey.len() > max_length {
            return Err(Error::TooLong(input.to_string()));
        }

        // Normally, all wiki links are forced to have an initial capital letter so [[foo]]
        // and [[Foo]] point to the same place.  Don't force it for interwikis, since the
        // other site might be case-sensitive.
        if interwiki.is_none()
            && self
                .namespace_map
                .is_capitalized(namespace)
                .unwrap_or(false)
        {
            uppercase_first(&self.lang, &mut dbkey);
        }

        // Can't make a link to a namespace alone... "empty" local links can only be
        // self-links with a fragment identifier.
        // MediaWiki allows for links with just a fragment, but we won't.
        if dbkey.is_empty() && interwiki.is_none() && namespace != NS_MAIN {
            return Err(Error::Empty(input.to_string()));
        }

        if namespace == NS_USER || namespace == NS_USER_TALK {
            sanitize_ip(&mut dbkey);
        }

        // Any remaining initial :s are illegal.
        if dbkey.starts_with(':') {
            return Err(Error::LeadingColon(input.to_string()));
        }

        Ok(Title {
            namespace,
            dbkey,
            fragment,
            interwiki,
            local_interwiki,
        })
    }
}

/// Indicates whether a code point is considered whitespace when it is found in a title.
///
/// Includes all code points with the White_Space property
/// (see [PropList.txt](https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt)),
/// but excludes the control characters
/// U+0009-U+000D (tab, newline, vertical tab, form feed, carriage return)
/// and U+0085 (next line), and adds U+180E (MONGOLIAN VOWEL SEPARATOR),
/// a format character (General Category: Cf).
/// The control characters U+0009-U+000D are rejected
/// by the `illegal_patterns` regex;
/// U+0085 is accepted as a valid character.
#[rustfmt::skip]
fn is_title_whitespace(c: char) -> bool {
    matches!(
        c,
        ' ' | '_' // U+0020 SPACE, U+005F LOW LINE
            | '\u{A0}' // U+00A0 NO-BREAK SPACE
            | '\u{1680}' // U+1680 OGHAM SPACE MARK
            | '\u{180E}' // U+180E MONGOLIAN VOWEL SEPARATOR
            // U+2000-U+200A: EN QUAD, EM QUAD, EN SPACE, EM SPACE,
            // THREE-PER-EM SPACE, FOUR-PER-EM SPACE, SIX-PER-EM SPACE,
            // FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE, HAIR SPACE
            | '\u{2000}'..='\u{200A}'
            | '\u{2028}' // U+2028 LINE SEPARATOR
            | '\u{2029}' // U+2029 PARAGRAPH SEPARATOR
            | '\u{202F}' // U+202F NARROW NO-BREAK SPACE
            | '\u{205F}' // U+205F MEDIUM MATHEMATICAL SPACE
            | '\u{3000}' // U+3000 IDEOGRAPHIC SPACE
    )
}

/**
 * Indicates that a character is a directional formatting character
 * that should be removed from titles.
 *
 * MediaWiki strips some [directional formatting characters](https://www.unicode.org/reports/tr9/#Directional_Formatting_Characters) from titles:
 * U+200E and U+200F (LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK)
 * and U+202A–U+202E (LEFT-TO-RIGHT EMBEDDING, RIGHT-TO-LEFT EMBEDDING,
 * POP DIRECTIONAL FORMATTING, LEFT-TO-RIGHT OVERRIDE, RIGHT-TO-LEFT OVERRIDE).
 * All of these were introduced in Unicode 1.1 and are referred to as
 * bidi override characters in the source code
 * of `MediaWikiTitleCodec::splitTitleString()`.
 *
 * The following directional formatting characters were introduced
 * in [Unicode 6.3](https://www.unicode.org/versions/Unicode6.3.0/) (2013)
 * and are not stripped:
 * U+061C (ARABIC LETTER MARK)
 * and U+2066–U+2069 (LEFT‑TO‑RIGHT ISOLATE, RIGHT‑TO‑LEFT ISOLATE, FIRST STRONG ISOLATE, POP DIRECTIONAL ISOLATE).
 */
fn is_bidirectional_override(c: char) -> bool {
    matches!(c, '\u{200E}' | '\u{200F}' | '\u{202A}'..='\u{202E}')
}

/**
 * Normalizes characters in a title.
 *
 * Removes the banned directional formatting characters (see [`is_bidirectional_override`]),
 * strips title whitespace characters (see [`is_title_whitespace`])
 * from the beginning and end of the title,
 * and replaces sequences of one or more title whitespace characters with a single underscore.
 */
fn normalize_title_chars(title: &str) -> String {
    // This gets a lower bound on the normalized title's byte length; the
    // result will be longer if the title contains multi-byte characters or
    // any internal whitespace (which is collapsed to underscores).
    let mut out = String::with_capacity(
        title
            .chars()
            .filter(|c| {
                !(is_title_whitespace(*c) || is_bidirectional_override(*c))
            })
            .count(),
    );
    let mut prev_whitespace = false;
    for c in title.chars() {
        if is_bidirectional_override(c) {
            // A stripped character must not reset `prev_whitespace`, or the
            // whitespace around it would fail to collapse into an underscore.
            continue;
        }
        let cur_whitespace = is_title_whitespace(c);
        if !cur_whitespace {
            if prev_whitespace && !out.is_empty() {
                out.push('_');
            }
            out.push(c);
        }
        prev_whitespace = cur_whitespace;
    }
    out
}

#[test]
fn normalize_title_chars_strips_and_collapses_title_whitespace() {
    assert_eq!(normalize_title_chars(" a b"), "a_b");
    assert_eq!(normalize_title_chars("a b "), "a_b");
    assert_eq!(normalize_title_chars("a  b"), "a_b");
    assert_eq!(normalize_title_chars("a__b"), "a_b");
}

#[test]
fn normalize_title_chars_removes_directional_control_characters() {
    assert_eq!(normalize_title_chars("\u{200E}_a_b"), "a_b");
    assert_eq!(normalize_title_chars("a\u{200E}_b "), "a_b");
    assert_eq!(normalize_title_chars("a_b\u{200E}"), "a_b");
    assert_eq!(normalize_title_chars("a_\u{200E}_b"), "a_b");
}

fn trim_title_whitespace(s: &mut String) {
    let title_start = s.bytes().position(|b| b != b'_').unwrap_or(0);
    let trailing_whitespace_count =
        s.bytes().rev().position(|b| b != b'_').unwrap_or(0);
    // This `String::drain` won't panic because the `Iterator::position` call
    // returns a valid `char` boundary (every skipped byte is an ASCII '_').
    s.drain(..title_start);
    // This `String::truncate` won't panic because `s.len() - trailing_whitespace_count`
    // is a valid `char` boundary (every trailing byte counted is an ASCII '_').
    s.truncate(s.len() - trailing_whitespace_count);
}

#[test]
fn trim_title_whitespace_trims_underscores() {
    for (input, expected) in [("_a_b", "a_b"), ("a_b_", "a_b"), ("_a_b_", "a_b")] {
        let mut trimmed = input.to_string();
        trim_title_whitespace(&mut trimmed);
        assert_eq!(trimmed, expected);
    }
}

const UPPERCASE_DOTTED_I_LANGUAGES: [&str; 4] = ["az", "kaa", "kk", "tr"];

/// Functional equivalent of `Language::ucfirst()`.
///
/// This is probably not going to be identical because of different Unicode
/// versions in use, but hopefully those cases are so rare we don't hit them.
///
/// Or we could just hardcode a special mapping like MediaWiki does for
/// client-side JavaScript.
fn uppercase_first(lang: &str, input: &mut String) {
    if let Some(first) = input.chars().next() {
        // `Language::ucfirst()` has special handling for the `i` character
        // in some languages
        if first == 'i' && UPPERCASE_DOTTED_I_LANGUAGES.contains(&lang) {
            // i has len_utf8() of 1
            input.drain(..1);
            // İ has len_utf8() of 2
            input.reserve(2);
            input.insert(0, 'İ');
        } else if php::ALREADY_UPPERCASE.contains(&first) {
            // Skip, do nothing
        } else if let Some(replace) = php::to_uppercase(first) {
            input.drain(..first.len_utf8());
            input.reserve(replace.len_utf8());
            input.insert(0, replace);
        } else if !first.is_uppercase() {
            input.drain(..first.len_utf8());
            input.reserve(first.to_uppercase().map(|c| c.len_utf8()).sum());
            // `char::to_uppercase` can yield more than one character (e.g.
            // 'ŉ' uppercases to "ʼN"); insert them in order rather than all
            // at index 0, which would reverse the sequence.
            let mut insert_at = 0;
            for c in first.to_uppercase() {
                input.insert(insert_at, c);
                insert_at += c.len_utf8();
            }
        }
    }
}

#[test]
fn uppercase_first_respects_dotted_i_langs() {
    for ((lang, input), expected) in [
        (("en", "abc"), "Abc"),
        (("en", "istanbul"), "Istanbul"),
        (("tr", "istanbul"), "İstanbul"),
    ] {
        let mut capitalized = input.to_string();
        uppercase_first(lang, &mut capitalized);
        assert_eq!(capitalized, expected);
    }
}