#
# (C) Pywikibot team, 2022-2026
#
# Distributed under the terms of the MIT license.
#
"""Object representing interface to toolforge tools.
.. version-added:: 7.7
"""
from __future__ import annotations
import collections
import pickle
import re
import urllib.parse
from http import HTTPStatus
from pathlib import Path
from typing import Any
from warnings import warn
import pywikibot
from pywikibot.tools import deprecated, deprecated_args, remove_last_args
class WikiBlameMixin:

    """Page mixin for main authorship.

    .. versionadded:: 7.7
    """

    #: Supported wikipedia site codes
    WIKIBLAME_CODES = 'als', 'bar', 'de', 'en', 'it', 'nds', 'sco'

    def _check_wh_supported(self) -> None:
        """Check if WikiHistory is supported for this page.

        :raise NotImplementedError: unsupported family, site code or
            namespace
        :raise NoPageError: the page does not exist
        """
        if self.site.family.name != 'wikipedia':
            raise NotImplementedError(
                'main_authors method is implemented for wikipedia family only')

        if (code := self.site.code) not in self.WIKIBLAME_CODES:
            raise NotImplementedError(
                f'main_authors method is not implemented for wikipedia:{code}')

        if (ns := self.namespace()) not in (0, 4, 10, 12, 14, 100):
            raise NotImplementedError(
                f'main_authors method is not implemented for {ns} namespace')

        if not self.exists():
            raise pywikibot.exceptions.NoPageError(self)

    # NOTE: first argument is the replacement method shown in the
    # deprecation warning; fixed typo 'authorsship' -> 'authorship'.
    @deprecated('authorship', since='9.3.0')
    @deprecated_args(onlynew=None)  # since 9.2.0
    def main_authors(self) -> collections.Counter[str]:
        """Retrieve the 5 topmost main authors of an article.

        Sample:

        >>> import pywikibot
        >>> site = pywikibot.Site('wikipedia:de')
        >>> page = pywikibot.Page(site, 'Project:Pywikibot')
        >>> auth = page.main_authors()
        >>> auth.most_common(1)
        [('DrTrigon', 37)]

        .. deprecated:: 9.3
           use :meth:`authorship` instead.
        .. seealso:: :meth:`authorship` for further information

        :return: Percentage of edits for each username
        :raise NotImplementedError: unsupported site or unsupported
            namespace.
        :raise NoPageError: The page does not exist.
        :raise TimeoutError: WikiHistory timeout
        """
        # authorship() yields {user: (chars, percent)}; keep only the
        # integer percentage to preserve the historic return format.
        return collections.Counter(
            {user: int(cnt) for user, (_, cnt) in self.authorship(5).items()})

    @remove_last_args(['revid', 'date'])  # since 10.1.0
    def authorship(
        self,
        n: int | None = None,
        *,
        min_chars: int = 0,
        min_pct: float = 0.0,
        max_pct_sum: float | None = None,
    ) -> dict[str, tuple[int, float]]:
        """Retrieve authorship attribution of an article.

        This method uses WikiHistory to retrieve the authors measured by
        character count.

        Sample:

        >>> import pywikibot
        >>> site = pywikibot.Site('wikipedia:en')
        >>> page = pywikibot.Page(site, 'Pywikibot')
        >>> auth = page.authorship()  # doctest: +SKIP
        >>> auth  # doctest: +SKIP
        {'1234qwer1234qwer4': (68, 100.0)}

        .. important:: Only implemented for pages in Main, Project,
           Category and Template namespaces and only wikipedias of
           :attr:`WIKIBLAME_CODES` are supported.
        .. versionadded:: 9.3
           XTools is used to retrieve authors. This method replaces
           :meth:`main_authors`.
        .. versionchanged:: 10.1
           WikiHistory is used to retrieve authors due to :phab:`T392694`.

        Here are the differences between these two implementations:

        .. tabs::

           .. tab:: WikiHistory

              .. versionadded:: 10.1

              - Implemented from version 7.7 until 9.2 (with
                :meth:`main_authors` method) and from 10.1.
              - Main, Project, Category and Template namespaces are
                supported
              - Only 'als', 'bar', 'de', 'en', 'it', 'nds' and 'sco'
                Wikipedias are supported.
              - Revision ID *revid* or revision *date* is not supported.
                Always the latest revision is used.
              - Only the most 5 authors are given.
              - No additional parsing library is required.

              .. seealso::
                 - https://wikihistory.toolforge.org
                 - https://de.wikipedia.org/wiki/WP:HT/wikihistory

           .. tab:: XTools

              .. versionremoved:: 10.1

              - Implemented from version 9.3 until 10.0.
              - Only Main namespace is supported.
              - Only 'ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id',
                'it', 'ja', 'nl', 'pl', 'pt' and 'tr' Wikipedias are
                supported.
              - Revision ID *revid* or revision *date* is supported to
                get authorship for this revision.
              - All authors can be given.
              - wikitextparser parsing library is required.

              .. seealso::
                 - https://xtools.wmcloud.org/authorship/
                 - https://www.mediawiki.org/wiki/XTools/Authorship
                 - https://www.mediawiki.org/wiki/WikiWho

        :param n: Only return the first *n* or fewer authors.
        :param min_chars: Only return authors with more than *min_chars*
            chars changes.
        :param min_pct: Only return authors with more than *min_pct*
            percentage edits.
        :param max_pct_sum: Only return authors until the percentage sum
            reached *max_pct_sum*.
        :return: Character count and percentage of edits for each
            username.
        :raise NotImplementedError: unsupported site or unsupported
            namespace.
        :raise NoPageError: The page does not exist.
        :raise TimeoutError: WikiHistory timeout
        """
        if n and n > 5:
            warn('Only the first 5 authors can be given.', stacklevel=2)

        baseurl = 'https://wikihistory.toolforge.org'
        pattern = (r'><bdi>(?P<author>.+?)</bdi></a>\s'
                   r'\((?P<percent>\d{1,3})&')
        self._check_wh_supported()

        # First try 'onlynew=1' (cached result), then force a fresh
        # computation; retry loop exits early unless WikiHistory reports
        # a timeout in its response body.
        for onlynew in (1, 0):
            url = baseurl + (f'/wiki/getauthors.php?wiki={self.site.code}wiki'
                             f'&page_id={self.pageid}&onlynew={onlynew}')
            r = pywikibot.comms.http.fetch(url)
            if r.status_code != HTTPStatus.OK:
                r.raise_for_status()

            if 'Timeout' not in r.text:
                break

            pywikibot.sleep(pywikibot.config.retry_wait)
        else:
            raise pywikibot.exceptions.TimeoutError('WikiHistory Timeout')

        length = len(self.text)
        result: list[tuple[str, int, float]] = []
        pct_sum = 0.0
        for rank, (user, cnt) in enumerate(re.findall(pattern, r.text),
                                           start=1):
            # WikiHistory only reports percentages; approximate the
            # character count from the current page text length.
            chars = length * int(cnt) // 100
            percent = float(cnt)
            # take into account that data is ordered by percentage, so
            # the first entry failing a threshold ends the collection
            if (n and rank > n) or chars < min_chars or percent < min_pct:
                break

            result.append((user, chars, percent))
            pct_sum += percent
            if max_pct_sum and pct_sum >= max_pct_sum:
                break

        return {user: (chars, percent) for user, chars, percent in result}
class WikiWhoMixin:

    """Page mixin for WikiWho authorship data with optimized pickle storage.

    WikiWho provides token-level provenance and authorship information.
    This implementation uses an optimized subdirectory structure for pickle
    caching to avoid filesystem performance issues with millions of files.

    .. versionadded:: 11.0
    """

    #: Supported WikiWho API language codes
    WIKIWHO_CODES = (
        'ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id', 'it', 'ja', 'nl', 'pl',
        'pt', 'tr', 'zh'
    )

    def _check_wikiwho_supported(self) -> None:
        """Check if WikiWho API is supported.

        .. versionadded:: 11.0

        :raise NotImplementedError: unsupported site, language, or namespace
        :raise NoPageError: page does not exist
        """
        if self.site.family.name != 'wikipedia':
            raise NotImplementedError(
                'WikiWho API is implemented for wikipedia family only')

        if (code := self.site.code) not in self.WIKIWHO_CODES:
            raise NotImplementedError(
                f'WikiWho API is not implemented for wikipedia:{code}')

        if (ns := self.namespace()) != 0:
            raise NotImplementedError(
                f'WikiWho API is not implemented for {ns} namespace')

        if not self.exists():
            raise pywikibot.exceptions.NoPageError(self)

    def _build_wikiwho_url(self, endpoint: str) -> str:
        """Build WikiWho API URL for the given endpoint.

        .. versionadded:: 11.0

        :param endpoint: API endpoint (all_content, rev_content,
            edit_persistence)
        :return: Complete API URL
        """
        article_title = self.title(with_ns=False, with_section=False)
        # safe='' also percent-encodes '/' so the title stays one path
        # segment
        encoded_title = urllib.parse.quote(article_title, safe='')
        base_url = 'https://wikiwho-api.wmcloud.org'
        url = (f'{base_url}/{self.site.code}/api/v1.0.0-beta/{endpoint}/'
               f'{encoded_title}/')
        return url

    def get_annotations(self, *, use_cache: bool = True) -> dict[str, Any]:
        """Get WikiWho annotations for article revisions.

        This method uses the public WikiWho API to get token-level
        provenance annotations showing who added each token in the article.
        Results are cached locally using pickle files with an optimized
        subdirectory structure to avoid filesystem performance issues.

        Sample:

        >>> import pywikibot
        >>> site = pywikibot.Site('wikipedia:en')
        >>> page = pywikibot.Page(site, 'Python (programming language)')
        >>> data = page.get_annotations()  # doctest: +SKIP
        >>> data['article_title']  # doctest: +SKIP
        'Python (programming language)'

        .. important:: Only implemented for main namespace pages and only
           Wikipedias of :attr:`WIKIWHO_CODES` are supported.
        .. versionadded:: 11.0
        .. seealso::
           - https://wikiwho-api.wmcloud.org
           - https://www.mediawiki.org/wiki/WikiWho

        :param use_cache: Whether to use and save cached data.
            Set to False to force a fresh API request without caching.
        :return: Dictionary containing article_title, page_id, and revisions
            with token-level annotations
        :raise NotImplementedError: unsupported site, language, or namespace
        :raise NoPageError: page does not exist
        :raise pywikibot.exceptions.ServerError: WikiWho API error
        :raise requests.exceptions.HTTPError: HTTP error from WikiWho API
        """
        self._check_wikiwho_supported()

        # Check cache first
        cache_path = self._get_wikiwho_pickle_path(
            self.site.code, self.pageid)
        if use_cache and cache_path.exists():
            with open(cache_path, 'rb') as f:
                return pickle.load(f)

        url = self._build_wikiwho_url('all_content')
        url = f'{url}?editor=true&o_rev_id=true'
        r = pywikibot.comms.http.fetch(url)
        if r.status_code != HTTPStatus.OK:
            r.raise_for_status()

        try:
            data = r.json()
        except Exception as e:
            # chain the original decoding error for easier debugging
            raise pywikibot.exceptions.ServerError(
                f'Failed to parse WikiWho API response: {e}') from e

        if 'Error' in data or 'error' in data:
            error_msg = data.get('Error') or data.get('error', 'Unknown error')
            raise pywikibot.exceptions.ServerError(
                f'WikiWho API error: {error_msg}')

        # Save to cache if caching is enabled
        if use_cache:
            cache_path.parent.mkdir(parents=True, exist_ok=True)
            with open(cache_path, 'wb') as f:
                pickle.dump(data, f, protocol=pywikibot.config.pickle_protocol)

        return data

    @staticmethod
    def _get_wikiwho_pickle_path(lang: str,
                                 page_id: int,
                                 cache_dir: str | Path | None = None) -> Path:
        """Calculate pickle file path with subdirectory structure.

        Uses subdirectories based on floor(page_id/1000) to optimize
        filesystem performance. This avoids having millions of pickle
        files in a single directory.

        Directory structure::

            cache_dir/lang/subdirectory/page_id.p

        Where subdirectory = floor(page_id / 1000) * 1000

        Examples:
            page_id 100000 → en/100000/100000.p
            page_id 100002 → en/100000/100002.p
            page_id 200005 → en/200000/200005.p

        This reduces files per directory from ~7M to ~7K for large wikis.

        .. versionadded:: 11.0

        :param lang: Language code (e.g., 'en', 'de', 'fi')
        :param page_id: Wikipedia page ID
        :param cache_dir: Custom cache directory (defaults to apicache/wikiwho)
        :return: Path object for the pickle file
        """
        # Use provided cache_dir or default to apicache/wikiwho
        if cache_dir is None:
            cache_dir = (Path(pywikibot.config.base_dir)
                         / 'apicache' / 'wikiwho')
        else:
            cache_dir = Path(cache_dir)

        # Calculate subdirectory as floor(page_id / 1000) * 1000
        subdirectory = (page_id // 1000) * 1000

        # Construct path: cache_dir/lang/subdirectory/page_id.p
        return cache_dir / lang / str(subdirectory) / f'{page_id}.p'