1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
// SPDX-License-Identifier: GPL-3.0-or-later
// Copyright (C) 2023 Kunal Mehta <legoktm@debian.org>
//! Calculate Wikipedia prose size
//!
//! This crate is a rough port of the [Wikipedia Prosesize script](https://en.wikipedia.org/wiki/Wikipedia:Prosesize)
//! that allows for counting the bytes of prose on a page rather than the wikitext
//! markup or generated HTML.
//!
//! You will most likely fetch `ImmutableWikicode` using the [parsoid](https://docs.rs/parsoid)
//! crate.
//!
//! The response from `prosesize()` provides the text-only prose size, word count and text-only
//! references size. Enabling the optional `serde-1` feature makes the size struct serializable
//! and deserializable.
//!
//! ## Contributing
//! `wikipedia_prosesize` is part of the [`mwbot-rs` project](https://www.mediawiki.org/wiki/Mwbot-rs).
//! We're always looking for new contributors, please [reach out](https://www.mediawiki.org/wiki/Mwbot-rs#Contributing)
//! if you're interested!
#![deny(clippy::all)]
#![deny(rustdoc::all)]

use parsoid::prelude::*;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde-1", derive(serde::Serialize, serde::Deserialize))]
pub struct ProseSize {
    /// Prose size (text only) aka "readable prose size"
    prose_size: u64,
    /// Words in prose
    word_count: u64,
    /// References (text only)
    references: u64,
}

impl ProseSize {
    /// Size of text in "readable prose", in bytes.
    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
    #[must_use]
    pub const fn prose_size(&self) -> u64 {
        self.prose_size
    }

    /// Number of words in the "readable prose".
    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
    #[must_use]
    pub const fn word_count(&self) -> u64 {
        self.word_count
    }

    /// Size of references (text only), in bytes.
    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#References_size>.
    #[must_use]
    pub const fn references(&self) -> u64 {
        self.references
    }
}

/// Calculate the prose size for the given HTML. Note that
/// if you provide a mutable `parsoid::Wikicode` instance,
/// the document will be modified!
pub fn prosesize(html: impl Into<Wikicode>) -> ProseSize {
    // Strip elements (templates, refs, math, ...) that we don't count at all.
    let code = remove_noncounted_elements(html.into());

    // Tally prose bytes and words across every counted paragraph.
    // TODO: Add an option for just selecting "p" to include blockquotes, etc.
    let (prose_size, word_count) = code.select(PROSE_SELECTOR).into_iter().fold(
        (0u64, 0u64),
        |(bytes, words), node| {
            let text = node.text_contents();
            (
                bytes + text.len() as u64,
                words + text.split(' ').count() as u64,
            )
        },
    );

    // Calculate size of references (i.e. output of <references/>).
    // We only want the main references group here, and not other things that
    // are technically references but not citations, like footnotes. So ignore
    // anything not using the default group (h/t Izno).
    let references = code
        .select("ol.references:not([data-mw-group])")
        .into_iter()
        .map(|node| node.text_contents().len() as u64)
        .sum();

    ProseSize {
        prose_size,
        word_count,
        references,
    }
}

/// CSS selector for prose: direct `<p>` children of `<section>` elements.
/// Assumes you've already removed other non-counted elements
/// (see [`remove_noncounted_elements`]).
pub const PROSE_SELECTOR: &str = "section > p";

/// Remove elements that we absolutely don't plan on counting. This
/// can be run against your own Wikicode instance if you just want to
/// analyze prosesize-counted content.
pub fn remove_noncounted_elements(code: Wikicode) -> Wikicode {
    // In CSS terms, a node is dropped when it matches any of:
    //   style
    //   #coordinates
    //   [class*="emplate"]
    //   [typeof~="mw:Extension/math"]
    //   [typeof~="mw:Extension/ref"]
    let doomed: Vec<_> = code
        .descendants()
        .filter(|node| {
            node.as_element().map_or(false, |element| {
                // Borrow the attribute map once for all of the checks below.
                let attrs = element.attributes.borrow();
                let typeof_has = |wanted: &str| {
                    attrs
                        .get("typeof")
                        .map_or(false, |t| t.split(' ').any(|tok| tok == wanted))
                };
                &element.name.local == "style"
                    || attrs.get("id") == Some("coordinates")
                    || attrs
                        .get("class")
                        .map_or(false, |c| c.contains("emplate"))
                    || typeof_has("mw:Extension/math")
                    || typeof_has("mw:Extension/ref")
            })
        })
        // Removal is deferred because detaching while walking interferes
        // with the descendants iterator.
        .collect();
    for node in doomed {
        node.detach();
    }
    code
}

/// Get a stylesheet for Parsoid HTML that highlights elements
/// counted for prosesize in yellow and references in light blue.
/// The CSS is embedded into the binary at compile time via `include_str!`.
pub fn parsoid_stylesheet() -> &'static str {
    include_str!("parsoid.css")
}

#[cfg(test)]
mod tests {
    use super::*;

    // Live-network integration test against the en.wikipedia.org REST API,
    // pinned to specific revision IDs so the input HTML is stable. The
    // expected values intentionally differ slightly from prosesize.js (noted
    // inline) because the counting rules are a rough port, not an exact one.
    #[tokio::test]
    #[ignore] // FIXME: counts aren't stable, maybe templates throw it off?
    async fn test_prosesize() {
        let client = parsoid::Client::new(
            "https://en.wikipedia.org/w/rest.php",
            "wikipedia_prosesize crate testing",
        )
        .unwrap();
        // A long biography article: exercises the prose and reference counts.
        let html = client
            .get_revision("Douglas MacArthur", 1138947706)
            .await
            .unwrap();
        let size = prosesize(html);
        dbg!(&size);
        // prosesize.js says 113 kB
        assert_eq!(size.prose_size(), 115_698);
        // prosesize.js says 18,679 words
        assert_eq!(size.word_count(), 18_674);
        // prosesize.js says 27 kb
        assert_eq!(size.references(), 26_149);
        // Now a math article with <math> tags, which should be stripped by
        // remove_noncounted_elements() and therefore not counted as prose.
        let html = client
            .get_revision("Group (mathematics)", 1133598242)
            .await
            .unwrap();
        let size = prosesize(html);
        dbg!(&size);
        // prosesize.js says 79 kB
        assert_eq!(size.prose_size(), 42_206);
        // prosesize.js says 34,271 words
        assert_eq!(size.word_count(), 7_163);
        // prosesize.js says 2828 B
        assert_eq!(size.references(), 2_573);
    }
}