wikipedia_prosesize/
lib.rs

1// SPDX-License-Identifier: GPL-3.0-or-later
2// Copyright (C) 2023 Kunal Mehta <legoktm@debian.org>
3//! Calculate Wikipedia prose size
4//!
5//! This crate is a rough port of the [Wikipedia Prosesize script](https://en.wikipedia.org/wiki/Wikipedia:Prosesize)
6//! that allows for counting the bytes of prose on a page rather than the wikitext
7//! markup or generated HTML.
8//!
9//! You will most likely fetch `ImmutableWikicode` using the [parsoid](https://docs.rs/parsoid)
10//! crate.
11//!
12//! The response from [`prosesize()`] provides the text-only prose size, word count and text-only
13//! references size. Enabling the optional `serde-1` feature makes the size struct serializable
14//! and deserializable.
15//!
16//! ## Contributing
17//! `wikipedia_prosesize` is part of the [`mwbot-rs` project](https://www.mediawiki.org/wiki/Mwbot-rs).
//! We're always looking for new contributors; please [reach out](https://www.mediawiki.org/wiki/Mwbot-rs#Contributing)
//! if you're interested!
20#![deny(clippy::all)]
21#![deny(rustdoc::all)]
22
23use parsoid::prelude::*;
24
/// Result of a prose size calculation, as returned by [`prosesize()`].
///
/// All sizes are byte lengths of text-only content; markup is not counted.
/// The fields are private; read them through the accessor methods.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde-1", derive(serde::Serialize, serde::Deserialize))]
pub struct ProseSize {
    /// Prose size (text only), aka "readable prose size"
    prose_size: u64,
    /// Words in prose
    word_count: u64,
    /// References (text only)
    references: u64,
}
35
36impl ProseSize {
37    /// Size of text in "readable prose".
38    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
39    pub fn prose_size(&self) -> u64 {
40        self.prose_size
41    }
42
43    /// Number of words in the "readable prose".
44    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
45    pub fn word_count(&self) -> u64 {
46        self.word_count
47    }
48
49    /// Size of references (text only).
50    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#References_size>
51    pub fn references(&self) -> u64 {
52        self.references
53    }
54}
55
56/// Calculate the prose size for the given HTML. Note that
57/// if you provide a mutable `parsoid::Wikicode` instance,
58/// the document will be modified!
59pub fn prosesize(html: impl Into<Wikicode>) -> ProseSize {
60    let code = html.into();
61    let mut prose_size = 0;
62
63    // strip some stuff we don't want to count
64    let code = remove_noncounted_elements(code);
65
66    let mut word_count = 0;
67    // TODO: Add an option for just selecting "p" to include blockquotes, etc.
68    for node in code.select(PROSE_SELECTOR) {
69        let text = node.text_contents();
70        prose_size += text.len() as u64;
71        word_count += text.split_whitespace().count() as u64;
72    }
73    // Calculate size of references (i.e. output of <references/>)
74    // We only want the main references group here, and not other things that
75    // are technically references but not citations, like footnotes. So ignore
76    // anything not using the default group (h/t Izno).
77    let references = code
78        .select("ol.references:not([data-mw-group])")
79        .into_iter()
80        .map(|node| node.text_contents().len() as u64)
81        .sum();
82    ProseSize {
83        prose_size,
84        word_count,
85        references,
86    }
87}
88
/// CSS selector for the elements counted as prose: paragraphs that are
/// direct children of a `section` element.
///
/// This assumes you've already removed other non-counted elements,
/// e.g. with [`remove_noncounted_elements()`].
pub const PROSE_SELECTOR: &str = "section > p";
92
93/// Remove elements that we absolutely don't plan on counting. This
94/// can be run against your own Wikicode instance if you just want to
95/// analyze prosesize-counted content.
96pub fn remove_noncounted_elements(code: Wikicode) -> Wikicode {
97    let mut remove = vec![];
98    for node in code.descendants() {
99        if let Some(element) = node.as_element() {
100            // In CSS terms, we're removing nodes that match the
101            // following selectors:
102            // style
103            // #coordinates
104            // [class*="emplate"]
105            // [typeof~="mw:Extension/math"]
106            // [typeof~="mw:Extension/ref"]
107            if &element.name.local == "style"
108                || element.attributes.borrow().get("id") == Some("coordinates")
109                || element
110                    .attributes
111                    .borrow()
112                    .get("class")
113                    .map(|c| c.contains("emplate"))
114                    .unwrap_or(false)
115                || element
116                    .attributes
117                    .borrow()
118                    .get("typeof")
119                    .map(|t| t.split(' ').any(|t| t == "mw:Extension/math"))
120                    .unwrap_or(false)
121                || element
122                    .attributes
123                    .borrow()
124                    .get("typeof")
125                    .map(|t| t.split(' ').any(|t| t == "mw:Extension/ref"))
126                    .unwrap_or(false)
127            {
128                // We defer removal because otherwise it interferes with the
129                // descendants iterator for some reason.
130                remove.push(node.clone());
131            }
132        }
133    }
134    for node in remove {
135        node.detach();
136    }
137    code
138}
139
140/// Get a stylesheet for Parsoid HTML that highlights elements
141/// counted for prosesize in yellow and references in light blue.
142pub fn parsoid_stylesheet() -> &'static str {
143    include_str!("parsoid.css")
144}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `prosesize()` against two pinned revisions fetched from
    /// en.wikipedia.org and compares the results with hard-coded values.
    ///
    /// The test talks to the live REST API, so it needs network access.
    /// It is ignored by default because the counts have not been stable.
    #[tokio::test]
    #[ignore] // FIXME: counts aren't stable, maybe templates throw it off?
    async fn test_prosesize() {
        let client = parsoid::Client::new(
            "https://en.wikipedia.org/w/rest.php",
            "wikipedia_prosesize crate testing",
        )
        .unwrap();
        // A long, prose-heavy article at a fixed revision.
        let html = client
            .get_revision("Douglas MacArthur", 1138947706)
            .await
            .unwrap();
        let size = prosesize(html);
        dbg!(&size);
        // prosesize.js says 113 kB
        assert_eq!(size.prose_size(), 115_698);
        // prosesize.js says 18,679 words
        assert_eq!(size.word_count(), 18_674);
        // prosesize.js says 27 kB
        assert_eq!(size.references(), 26_149);
        // Now a math article with <math> tags
        // NOTE(review): the asserted values below are far smaller than the
        // prosesize.js figures quoted next to them, presumably because
        // math extension output is stripped before counting -- confirm.
        let html = client
            .get_revision("Group (mathematics)", 1133598242)
            .await
            .unwrap();
        let size = prosesize(html);
        dbg!(&size);
        // prosesize.js says 79 kB
        assert_eq!(size.prose_size(), 42_206);
        // prosesize.js says 34,271 words
        assert_eq!(size.word_count(), 7_163);
        // prosesize.js says 2828 B
        assert_eq!(size.references(), 2_573);
    }
}