// SPDX-License-Identifier: GPL-3.0-or-later
// Copyright (C) 2023 Kunal Mehta <legoktm@debian.org>
//! Calculate Wikipedia prose size
//!
//! This crate is a rough port of the [Wikipedia Prosesize script](https://en.wikipedia.org/wiki/Wikipedia:Prosesize)
//! that allows for counting the bytes of prose on a page rather than the wikitext
//! markup or generated HTML.
//!
//! You will most likely fetch `ImmutableWikicode` using the [parsoid](https://docs.rs/parsoid)
//! crate.
//!
//! The response from `prosesize()` provides the text-only prose size, word count and text-only
//! references size. Enabling the optional `serde-1` feature makes the size struct serializable
//! and deserializable.
//!
//! ## Contributing
//! `wikipedia_prosesize` is part of the [`mwbot-rs` project](https://www.mediawiki.org/wiki/Mwbot-rs).
//! We're always looking for new contributors; please [reach out](https://www.mediawiki.org/wiki/Mwbot-rs#Contributing)
//! if you're interested!
#![deny(clippy::all)]
#![deny(rustdoc::all)]

use parsoid::prelude::*;

/// Result of a prose size calculation, as returned by `prosesize()`.
///
/// The fields are private; read them through the accessor methods of the
/// same name. The type is a small plain-data value, so it can be copied
/// and compared freely.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "serde-1", derive(serde::Serialize, serde::Deserialize))]
pub struct ProseSize {
    /// Prose size (text only) aka "readable prose size"
    prose_size: u64,
    /// Words in prose
    word_count: u64,
    /// References (text only)
    references: u64,
}

impl ProseSize {
    /// Size, in bytes, of the text counted as "readable prose".
    ///
    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
    #[must_use]
    pub const fn prose_size(&self) -> u64 {
        self.prose_size
    }

    /// Number of words in the "readable prose".
    ///
    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
    #[must_use]
    pub const fn word_count(&self) -> u64 {
        self.word_count
    }

    /// Size, in bytes, of the references (text only).
    ///
    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#References_size>.
    #[must_use]
    pub const fn references(&self) -> u64 {
        self.references
    }
}

/// Compute prose size, word count and references size for the given HTML.
///
/// The input is converted into a `Wikicode` and passed through
/// [`remove_noncounted_elements`], so if you hand over a mutable
/// `parsoid::Wikicode` instance, the document will be modified!
///
/// * Prose size is the summed byte length of the text of every node
///   matching [`PROSE_SELECTOR`].
/// * Word count is the number of pieces obtained by splitting that text on
///   single space characters, so an empty paragraph still contributes one
///   piece and consecutive spaces contribute empty pieces.
/// * References size is the summed byte length of the text of reference
///   lists in the default group.
pub fn prosesize(html: impl Into<Wikicode>) -> ProseSize {
    // Strip everything we never want to count before measuring anything.
    let document = remove_noncounted_elements(html.into());

    // TODO: Add an option for just selecting "p" to include blockquotes, etc.
    let (prose_size, word_count) = document
        .select(PROSE_SELECTOR)
        .into_iter()
        .map(|paragraph| paragraph.text_contents())
        .fold((0u64, 0u64), |(bytes, words), text| {
            (
                bytes + text.len() as u64,
                words + text.split(' ').count() as u64,
            )
        });

    // Size of references (i.e. output of <references/>). Only the main
    // references group is wanted here; other things that are technically
    // references but not citations, like footnotes, use a named group and
    // are therefore excluded by the selector (h/t Izno).
    let mut references = 0u64;
    for list in document.select("ol.references:not([data-mw-group])") {
        references += list.text_contents().len() as u64;
    }

    ProseSize {
        prose_size,
        word_count,
        references,
    }
}

/// CSS selector for the elements whose text is counted as prose: `p`
/// elements that are direct children of a `section` element.
///
/// The selector is only meaningful once other non-counted elements have
/// already been removed (see `remove_noncounted_elements()`), which is what
/// `prosesize()` does before applying it.
pub const PROSE_SELECTOR: &str = "section > p";

/// Detach every element that is never counted towards prose size and hand
/// the same document back.
///
/// This can be run against your own Wikicode instance if you just want to
/// analyze prosesize-counted content. In CSS terms, the elements removed
/// are those matching any of:
///
/// * `style`
/// * `#coordinates`
/// * `[class*="emplate"]`
/// * `[typeof~="mw:Extension/math"]`
/// * `[typeof~="mw:Extension/ref"]`
pub fn remove_noncounted_elements(code: Wikicode) -> Wikicode {
    // Collect first, detach afterwards: removing nodes while walking the
    // tree interferes with the descendants iterator.
    let mut doomed = Vec::new();
    for candidate in code.descendants() {
        // Nodes that are not elements are never removed here.
        let element = match candidate.as_element() {
            Some(element) => element,
            None => continue,
        };
        let attrs = element.attributes.borrow();

        // style
        let is_style = &element.name.local == "style";
        // #coordinates
        let is_coordinates = attrs.get("id") == Some("coordinates");
        // [class*="emplate"]
        let is_template = attrs
            .get("class")
            .map(|class| class.contains("emplate"))
            .unwrap_or(false);
        // [typeof~="mw:Extension/math"], [typeof~="mw:Extension/ref"]
        let is_math_or_ref = attrs
            .get("typeof")
            .map(|value| {
                value
                    .split(' ')
                    .any(|token| matches!(token, "mw:Extension/math" | "mw:Extension/ref"))
            })
            .unwrap_or(false);

        if is_style || is_coordinates || is_template || is_math_or_ref {
            doomed.push(candidate.clone());
        }
    }
    for unwanted in doomed {
        unwanted.detach();
    }
    code
}

/// Stylesheet for Parsoid HTML that highlights elements counted for
/// prosesize in yellow and references in light blue.
///
/// The text is embedded into the binary at compile time, so no file access
/// happens when this is called.
pub fn parsoid_stylesheet() -> &'static str {
    // `include_str!` resolves the path relative to this source file.
    const STYLESHEET: &str = include_str!("parsoid.css");
    STYLESHEET
}

// End-to-end check of `prosesize()` against pages fetched from the English
// Wikipedia REST API. Both pages are requested by explicit revision ID so the
// fetched wikitext revision is fixed.
#[cfg(test)]
mod tests {
    use super::*;

    // Needs network access; currently skipped by default via `#[ignore]`
    // because the resulting numbers have not been reproducible (see FIXME).
    #[tokio::test]
    #[ignore] // FIXME: counts aren't stable, maybe templates throw it off?
    async fn test_prosesize() {
        let client = parsoid::Client::new(
            "https://en.wikipedia.org/w/rest.php",
            "wikipedia_prosesize crate testing",
        )
        .unwrap();
        // First case: a long, prose-heavy article.
        let html = client
            .get_revision("Douglas MacArthur", 1138947706)
            .await
            .unwrap();
        let size = prosesize(html);
        dbg!(&size);
        // The comments above each assertion record what the original
        // prosesize.js gadget reported, for comparison with the pinned value.
        // prosesize.js says 113 kB
        assert_eq!(size.prose_size(), 115_698);
        // prosesize.js says 18,679 words
        assert_eq!(size.word_count(), 18_674);
        // prosesize.js says 27 kB
        assert_eq!(size.references(), 26_149);
        // Now a math article with <math> tags
        let html = client
            .get_revision("Group (mathematics)", 1133598242)
            .await
            .unwrap();
        let size = prosesize(html);
        dbg!(&size);
        // NOTE(review): the pinned prose and word values below are far
        // smaller than the prosesize.js figures quoted next to them;
        // presumably because elements typed `mw:Extension/math` are removed
        // before counting here - confirm.
        // prosesize.js says 79 kB
        assert_eq!(size.prose_size(), 42_206);
        // prosesize.js says 34,271 words
        assert_eq!(size.word_count(), 7_163);
        // prosesize.js says 2828 B
        assert_eq!(size.references(), 2_573);
    }
}