wikipedia_prosesize/lib.rs
1// SPDX-License-Identifier: GPL-3.0-or-later
2// Copyright (C) 2023 Kunal Mehta <legoktm@debian.org>
3//! Calculate Wikipedia prose size
4//!
5//! This crate is a rough port of the [Wikipedia Prosesize script](https://en.wikipedia.org/wiki/Wikipedia:Prosesize)
6//! that allows for counting the bytes of prose on a page rather than the wikitext
7//! markup or generated HTML.
8//!
9//! You will most likely fetch `ImmutableWikicode` using the [parsoid](https://docs.rs/parsoid)
10//! crate.
11//!
12//! The response from [`prosesize()`] provides the text-only prose size, word count and text-only
13//! references size. Enabling the optional `serde-1` feature makes the size struct serializable
14//! and deserializable.
15//!
16//! ## Contributing
17//! `wikipedia_prosesize` is part of the [`mwbot-rs` project](https://www.mediawiki.org/wiki/Mwbot-rs).
18//! We're always looking for new contributors, please [reach out](https://www.mediawiki.org/wiki/Mwbot-rs#Contributing)
19//! if you're interested!
20#![deny(clippy::all)]
21#![deny(rustdoc::all)]
22
23use parsoid::prelude::*;
24
/// Text-only size measurements of a page, as returned by [`prosesize()`].
///
/// All sizes are byte lengths of the extracted text. The fields are private;
/// read them through the accessor methods of the same name.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde-1", derive(serde::Serialize, serde::Deserialize))]
pub struct ProseSize {
    /// Prose size (text only), aka "readable prose size"
    prose_size: u64,
    /// Words in prose
    word_count: u64,
    /// References (text only)
    references: u64,
}
35
36impl ProseSize {
37 /// Size of text in "readable prose".
38 /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
39 pub fn prose_size(&self) -> u64 {
40 self.prose_size
41 }
42
43 /// Number of words in the "readable prose".
44 /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
45 pub fn word_count(&self) -> u64 {
46 self.word_count
47 }
48
49 /// Size of references (text only).
50 /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#References_size>
51 pub fn references(&self) -> u64 {
52 self.references
53 }
54}
55
56/// Calculate the prose size for the given HTML. Note that
57/// if you provide a mutable `parsoid::Wikicode` instance,
58/// the document will be modified!
59pub fn prosesize(html: impl Into<Wikicode>) -> ProseSize {
60 let code = html.into();
61 let mut prose_size = 0;
62
63 // strip some stuff we don't want to count
64 let code = remove_noncounted_elements(code);
65
66 let mut word_count = 0;
67 // TODO: Add an option for just selecting "p" to include blockquotes, etc.
68 for node in code.select(PROSE_SELECTOR) {
69 let text = node.text_contents();
70 prose_size += text.len() as u64;
71 word_count += text.split_whitespace().count() as u64;
72 }
73 // Calculate size of references (i.e. output of <references/>)
74 // We only want the main references group here, and not other things that
75 // are technically references but not citations, like footnotes. So ignore
76 // anything not using the default group (h/t Izno).
77 let references = code
78 .select("ol.references:not([data-mw-group])")
79 .into_iter()
80 .map(|node| node.text_contents().len() as u64)
81 .sum();
82 ProseSize {
83 prose_size,
84 word_count,
85 references,
86 }
87}
88
/// CSS selector for the elements counted as prose: paragraphs that are
/// direct children of a `section`. It assumes the other non-counted
/// elements have already been removed from the document.
pub const PROSE_SELECTOR: &str = "section > p";
92
93/// Remove elements that we absolutely don't plan on counting. This
94/// can be run against your own Wikicode instance if you just want to
95/// analyze prosesize-counted content.
96pub fn remove_noncounted_elements(code: Wikicode) -> Wikicode {
97 let mut remove = vec![];
98 for node in code.descendants() {
99 if let Some(element) = node.as_element() {
100 // In CSS terms, we're removing nodes that match the
101 // following selectors:
102 // style
103 // #coordinates
104 // [class*="emplate"]
105 // [typeof~="mw:Extension/math"]
106 // [typeof~="mw:Extension/ref"]
107 if &element.name.local == "style"
108 || element.attributes.borrow().get("id") == Some("coordinates")
109 || element
110 .attributes
111 .borrow()
112 .get("class")
113 .map(|c| c.contains("emplate"))
114 .unwrap_or(false)
115 || element
116 .attributes
117 .borrow()
118 .get("typeof")
119 .map(|t| t.split(' ').any(|t| t == "mw:Extension/math"))
120 .unwrap_or(false)
121 || element
122 .attributes
123 .borrow()
124 .get("typeof")
125 .map(|t| t.split(' ').any(|t| t == "mw:Extension/ref"))
126 .unwrap_or(false)
127 {
128 // We defer removal because otherwise it interferes with the
129 // descendants iterator for some reason.
130 remove.push(node.clone());
131 }
132 }
133 }
134 for node in remove {
135 node.detach();
136 }
137 code
138}
139
140/// Get a stylesheet for Parsoid HTML that highlights elements
141/// counted for prosesize in yellow and references in light blue.
142pub fn parsoid_stylesheet() -> &'static str {
143 include_str!("parsoid.css")
144}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// Fetches two fixed revisions from English Wikipedia and compares the
    /// measured sizes against recorded values.
    #[tokio::test]
    #[ignore] // FIXME: counts aren't stable, maybe templates throw it off?
    async fn test_prosesize() {
        let client = parsoid::Client::new(
            "https://en.wikipedia.org/w/rest.php",
            "wikipedia_prosesize crate testing",
        )
        .unwrap();

        // (title, revision id, prose size, word count, references size)
        let cases = [
            // prosesize.js says 113 kB, 18,679 words and 27 kb of references
            ("Douglas MacArthur", 1138947706, 115_698, 18_674, 26_149),
            // Now a math article with <math> tags.
            // prosesize.js says 79 kB, 34,271 words and 2828 B of references
            ("Group (mathematics)", 1133598242, 42_206, 7_163, 2_573),
        ];

        for (title, revid, prose_size, word_count, references) in cases {
            let html = client.get_revision(title, revid).await.unwrap();
            let size = prosesize(html);
            dbg!(&size);
            assert_eq!(size.prose_size(), prose_size);
            assert_eq!(size.word_count(), word_count);
            assert_eq!(size.references(), references);
        }
    }
}
184}