// SPDX-License-Identifier: GPL-3.0-or-later
// Copyright (C) 2023 Kunal Mehta <legoktm@debian.org>
//! Calculate Wikipedia prose size
//!
//! This crate is a rough port of the [Wikipedia Prosesize script](https://en.wikipedia.org/wiki/Wikipedia:Prosesize)
//! that allows for counting the bytes of prose on a page rather than the wikitext
//! markup or generated HTML.
//!
//! You will most likely fetch `ImmutableWikicode` using the [parsoid](https://docs.rs/parsoid)
//! crate.
//!
//! The response from `prosesize()` provides the text-only prose size, word count and text-only
//! references size. Enabling the optional `serde-1` feature makes the size struct serializable
//! and deserializable.
//!
//! ## Contributing
//! `wikipedia_prosesize` is part of the [`mwbot-rs` project](https://www.mediawiki.org/wiki/Mwbot-rs).
//! We're always looking for new contributors, please [reach out](https://www.mediawiki.org/wiki/Mwbot-rs#Contributing)
//! if you're interested!
#![deny(clippy::all)]
#![deny(rustdoc::all)]
use parsoid::prelude::*;
/// Result of a prose size calculation, as returned by `prosesize()`.
///
/// The sizes are byte counts of text content. The fields are private; read
/// them through the accessor methods of the same names.
///
/// The type is a plain bundle of three integers, so it is cheap to copy and
/// can be compared for equality. Its default value has every count at zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde-1", derive(serde::Serialize, serde::Deserialize))]
pub struct ProseSize {
    /// Prose size (text only), aka "readable prose size"
    prose_size: u64,
    /// Words in prose
    word_count: u64,
    /// References (text only)
    references: u64,
}
impl ProseSize {
    /// Size of text in "readable prose", in bytes.
    ///
    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
    #[must_use]
    pub fn prose_size(&self) -> u64 {
        self.prose_size
    }

    /// Number of words in the "readable prose".
    ///
    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#Prose_size>.
    #[must_use]
    pub fn word_count(&self) -> u64 {
        self.word_count
    }

    /// Size of references (text only), in bytes.
    ///
    /// See <https://en.wikipedia.org/wiki/Wikipedia:Prosesize#References_size>.
    #[must_use]
    pub fn references(&self) -> u64 {
        self.references
    }
}
/// Compute the prose statistics of a Parsoid HTML document.
///
/// Accepts anything convertible into a `parsoid::Wikicode`. Beware that
/// passing a mutable `parsoid::Wikicode` instance means the document itself
/// gets modified, because non-counted elements are detached from it before
/// measuring.
///
/// The returned [`ProseSize`] holds:
/// * the summed byte length of the text of every node matching
///   [`PROSE_SELECTOR`],
/// * the word count of those same nodes, obtained by splitting each node's
///   text on the space character,
/// * the summed byte length of the text of the default-group reference lists.
pub fn prosesize(html: impl Into<Wikicode>) -> ProseSize {
    // Get rid of everything we never want to count before measuring.
    let document = remove_noncounted_elements(html.into());

    // TODO: Add an option for just selecting "p" to include blockquotes, etc.
    let (prose_size, word_count) = document
        .select(PROSE_SELECTOR)
        .into_iter()
        .map(|paragraph| paragraph.text_contents())
        .fold((0u64, 0u64), |(bytes, words), text| {
            (
                bytes + text.len() as u64,
                words + text.split(' ').count() as u64,
            )
        });

    // Size of the references, i.e. the output of <references/>. Only the
    // main (default) group is wanted: other groups are technically
    // references too, but hold things like footnotes rather than citations,
    // so any list carrying a group attribute is skipped (h/t Izno).
    let mut references = 0u64;
    for list in document.select("ol.references:not([data-mw-group])") {
        references += list.text_contents().len() as u64;
    }

    ProseSize {
        prose_size,
        word_count,
        references,
    }
}
/// CSS selector for the elements counted as prose: `<p>` elements that are
/// direct children of a `<section>`.
///
/// It assumes you've already removed other non-counted elements, e.g. with
/// `remove_noncounted_elements()`.
pub const PROSE_SELECTOR: &str = "section > p";
/// Remove elements that we absolutely don't plan on counting. This
/// can be run against your own Wikicode instance if you just want to
/// analyze prosesize-counted content.
///
/// In CSS terms, the nodes removed are those matching any of:
/// * `style`
/// * `#coordinates`
/// * `[class*="emplate"]`
/// * `[typeof~="mw:Extension/math"]`
/// * `[typeof~="mw:Extension/ref"]`
///
/// Matching nodes are detached from `code`, which is then handed back to the
/// caller.
pub fn remove_noncounted_elements(code: Wikicode) -> Wikicode {
    let mut remove = vec![];
    for node in code.descendants() {
        if let Some(element) = node.as_element() {
            // Borrow the attribute map once and reuse it for every check.
            let attrs = element.attributes.borrow();

            let is_style = &element.name.local == "style";
            let is_coordinates = attrs.get("id") == Some("coordinates");
            // `*=` is a plain substring match on the attribute value.
            let is_template = attrs
                .get("class")
                .map(|class| class.contains("emplate"))
                .unwrap_or(false);
            // `~=` matches one whole token of a whitespace-separated list,
            // so split on any ASCII whitespace rather than only on U+0020.
            let is_math_or_ref = attrs
                .get("typeof")
                .map(|types| {
                    types
                        .split_ascii_whitespace()
                        .any(|t| t == "mw:Extension/math" || t == "mw:Extension/ref")
                })
                .unwrap_or(false);

            if is_style || is_coordinates || is_template || is_math_or_ref {
                // We defer removal because detaching here interferes with the
                // descendants iterator (presumably a detached node loses the
                // links the traversal follows -- not confirmed).
                remove.push(node.clone());
            }
        }
    }
    for node in remove {
        node.detach();
    }
    code
}
/// Stylesheet for Parsoid HTML that highlights the elements counted
/// for prosesize in yellow and the references in light blue.
///
/// The CSS text is embedded into the binary at compile time.
pub fn parsoid_stylesheet() -> &'static str {
    // Contents of `parsoid.css`, resolved relative to this source file.
    const STYLESHEET: &str = include_str!("parsoid.css");
    STYLESHEET
}
// Tests for the crate. The only test talks to the live en.wikipedia.org
// REST API, so it needs network access.
#[cfg(test)]
mod tests {
use super::*;
// Fetches two pinned revisions and checks the three computed figures against
// recorded values. The figures reported by prosesize.js are quoted next to
// each assertion for comparison only; they are not what is asserted.
#[tokio::test]
#[ignore] // FIXME: counts aren't stable, maybe templates throw it off?
async fn test_prosesize() {
let client = parsoid::Client::new(
"https://en.wikipedia.org/w/rest.php",
"wikipedia_prosesize crate testing",
)
.unwrap();
// A long, mostly-prose article.
let html = client
.get_revision("Douglas MacArthur", 1138947706)
.await
.unwrap();
let size = prosesize(html);
dbg!(&size);
// prosesize.js says 113 kB
assert_eq!(size.prose_size(), 115_698);
// prosesize.js says 18,679 words
assert_eq!(size.word_count(), 18_674);
// prosesize.js says 27 kB
assert_eq!(size.references(), 26_149);
// Now a math article with <math> tags
// NOTE(review): the prosesize.js figures quoted below are far from the
// asserted values -- presumably because math markup is stripped before
// counting here; confirm before relying on either set of numbers.
let html = client
.get_revision("Group (mathematics)", 1133598242)
.await
.unwrap();
let size = prosesize(html);
dbg!(&size);
// prosesize.js says 79 kB
assert_eq!(size.prose_size(), 42_206);
// prosesize.js says 34,271 words
assert_eq!(size.word_count(), 7_163);
// prosesize.js says 2828 B
assert_eq!(size.references(), 2_573);
}
}