Source code for pagegenerators._generators

"""Page filter generators provided by the pagegenerators module."""
#
# (C) Pywikibot team, 2008-2025
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations

import calendar
import codecs
import io
import re
import typing
from collections import abc
from functools import partial
from http import HTTPStatus
from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse

from requests.exceptions import ReadTimeout

import pywikibot
from pywikibot import config, date, xmlreader
from pywikibot.backports import (
    Callable,
    Generator,
    Iterable,
    Iterator,
    Sequence,
    batched,
)
from pywikibot.comms import http
from pywikibot.exceptions import APIError, ServerError
from pywikibot.site import Namespace
from pywikibot.tools import issue_deprecation_warning
from pywikibot.tools.collections import GeneratorWrapper
from pywikibot.tools.itertools import filter_unique


if TYPE_CHECKING:
    from pywikibot.site import BaseSite, NamespaceArgType
    from pywikibot.site._namespace import SingleNamespaceType
    from pywikibot.time import Timestamp

# This is the function that will be used to de-duplicate page iterators.
_filter_unique_pages = partial(
    filter_unique, key=lambda page: '{}:{}:{}'.format(*page._cmpkey()))


def AllpagesPageGenerator(
    start: str = '!',
    namespace: SingleNamespaceType = 0,
    includeredirects: typing.Literal['only'] | bool = True,
    site: BaseSite | None = None,
    total: int | None = None,
    content: bool = False,
    *,
    filterredir: bool | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Iterate Page objects for all titles in a single namespace.

    .. deprecated:: 10.0
       The *includeredirects* parameter; use *filterredir* instead.

    .. seealso:: :meth:`APISite.allpages()
       <pywikibot.site._generators.GeneratorsMixin.allpages>`

    :param start: if provided, only generate pages >= this title
        lexically
    :param namespace: Namespace to retrieve pages from
    :param includeredirects: If False, redirects are not included. If
        equals the string 'only', only redirects are added. Otherwise
        redirects will be included. This parameter is deprecated; use
        *filterredir* instead.
    :param site: Site for generator results.
    :param total: Maximum number of pages to retrieve in total
    :param content: If True, load current version of each page
        (default False)
    :param filterredir: if True, only yield redirects; if False (and not
        None), only yield non-redirects (default: yield both).
    :return: a generator that yields Page objects
    :raises ValueError: *filterredir* as well as *includeredirects*
        parameters were given. Use *filterredir* only.
    """
    if site is None:
        site = pywikibot.Site()

    if filterredir is not None and includeredirects is not True:
        raise ValueError(
            f'filterredir parameter ({filterredir}) is used together with '
            f'outdated includeredirects parameter ({includeredirects}).'
        )

    # backward compatibility
    if includeredirects is not True:
        if not includeredirects:
            filterredir = False
        elif includeredirects == 'only':
            filterredir = True

        issue_deprecation_warning(
            f'includeredirects parameter ({includeredirects})',
            f'filterredir={filterredir}',
            since='10.0.0'
        )

    return site.allpages(start=start, namespace=namespace,
                         filterredir=filterredir, total=total,
                         content=content)
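# Usage sketch (illustrative addition, not part of the original module):
# list a handful of non-redirect main-namespace pages. The site choice
# ('wikipedia:test') and the limits are assumptions for demonstration only.
def _example_allpages_usage() -> None:
    site = pywikibot.Site('test', 'wikipedia')  # assumed demo site
    for page in AllpagesPageGenerator(start='A', namespace=0, site=site,
                                      total=5, filterredir=False):
        pywikibot.info(page.title())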
def PrefixingPageGenerator(
    prefix: str,
    namespace: SingleNamespaceType | None = None,
    includeredirects: typing.Literal['only'] | bool = True,
    site: BaseSite | None = None,
    total: int | None = None,
    content: bool = False,
    *,
    filterredir: bool | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Prefixed Page generator.

    .. deprecated:: 10.0
       The *includeredirects* parameter; use *filterredir* instead.

    :param prefix: The prefix of the pages.
    :param namespace: Namespace to retrieve pages from
    :param includeredirects: If False, redirects are not included. If
        equals the string 'only', only redirects are added. Otherwise
        redirects will be included. This parameter is deprecated; use
        *filterredir* instead.
    :param site: Site for generator results.
    :param total: Maximum number of pages to retrieve in total
    :param content: If True, load current version of each page
        (default False)
    :param filterredir: if True, only yield redirects; if False (and not
        None), only yield non-redirects (default: yield both).
    :return: a generator that yields Page objects
    :raises ValueError: *filterredir* as well as *includeredirects*
        parameters were given. Use *filterredir* only.
    """
    if site is None:
        site = pywikibot.Site()

    prefixlink = pywikibot.Link(prefix, site)
    if namespace is None:
        namespace = prefixlink.namespace
    title = prefixlink.title

    if filterredir is not None and includeredirects is not True:
        raise ValueError(
            f'filterredir parameter ({filterredir}) is used together with '
            f'outdated includeredirects parameter ({includeredirects}).'
        )

    # backward compatibility
    if includeredirects is not True:
        if not includeredirects:
            filterredir = False
        elif includeredirects == 'only':
            filterredir = True

        issue_deprecation_warning(
            f'includeredirects parameter ({includeredirects})',
            f'filterredir={filterredir}',
            since='10.0.0'
        )

    return site.allpages(prefix=title, namespace=namespace,
                         filterredir=filterredir, total=total,
                         content=content)
def LogeventsPageGenerator(logtype: str | None = None,
                           user: str | None = None,
                           site: BaseSite | None = None,
                           namespace: SingleNamespaceType | None = None,
                           total: int | None = None,
                           start: Timestamp | None = None,
                           end: Timestamp | None = None,
                           reverse: bool = False
                           ) -> Generator[pywikibot.page.Page, None, None]:
    """Generate Pages for specified modes of logevents.

    :param logtype: Mode of logs to retrieve
    :param user: User of logs retrieved
    :param site: Site for generator results
    :param namespace: Namespace to retrieve logs from
    :param total: Maximum number of pages to retrieve in total
    :param start: Timestamp to start listing from
    :param end: Timestamp to end listing at
    :param reverse: if True, start with oldest changes (default: newest)
    """
    if site is None:
        site = pywikibot.Site()
    for entry in site.logevents(total=total, logtype=logtype, user=user,
                                namespace=namespace, start=start, end=end,
                                reverse=reverse):
        try:
            yield entry.page()
        except KeyError as e:
            pywikibot.warning('LogeventsPageGenerator: failed to load page '
                              f'for {entry.data!r}; skipping')
            pywikibot.error(e)
def NewpagesPageGenerator(site: BaseSite | None = None,
                          namespaces: NamespaceArgType = (0, ),
                          total: int | None = None
                          ) -> Generator[pywikibot.page.Page, None, None]:
    """Iterate Page objects for all new titles in a single namespace.

    :param site: Site for generator results.
    :param namespaces: namespace to retrieve pages from
    :param total: Maximum number of pages to retrieve in total
    """
    # API does not (yet) have a newpages function, so this tries to duplicate
    # it by filtering the recentchanges output
    # defaults to namespace 0 because that's how Special:Newpages defaults
    if site is None:
        site = pywikibot.Site()
    return (page for page, _ in site.newpages(namespaces=namespaces,
                                              total=total, returndict=True))
def RecentChangesPageGenerator(
    site: BaseSite | None = None,
    _filter_unique: None | (Callable[[Iterable[pywikibot.Page]],
                                     Iterable[pywikibot.Page]]) = None,
    **kwargs: Any
) -> Generator[pywikibot.Page, None, None]:
    """Generate recent changes pages, including duplicates.

    For keyword parameters refer to :meth:`APISite.recentchanges()
    <pywikibot.site._generators.GeneratorsMixin.recentchanges>`.

    .. versionchanged:: 8.2
       The YieldType depends on namespace. It can be
       :class:`pywikibot.Page<pywikibot.page.Page>`,
       :class:`pywikibot.User<pywikibot.page.User>`,
       :class:`pywikibot.FilePage<pywikibot.page.FilePage>` or
       :class:`pywikibot.Category<pywikibot.page.Category>`.

    .. versionchanged:: 9.4
       Ignore :class:`pywikibot.FilePage<pywikibot.page.FilePage>` if it
       raises a :exc:`ValueError` during upcast e.g. due to an invalid
       file extension.

    :param site: Site for generator results.
    """
    def upcast(gen):
        """Upcast pywikibot.Page type."""
        for rc in gen:
            # The title in a log entry may have been suppressed
            if rc['type'] == 'log' and 'title' not in rc:
                continue

            ns = rc['ns']
            if ns == Namespace.USER:
                pageclass: type[pywikibot.Page] = pywikibot.User
            elif ns == Namespace.FILE:
                pageclass = pywikibot.FilePage
            elif ns == Namespace.CATEGORY:
                pageclass = pywikibot.Category
            else:
                pageclass = pywikibot.Page

            try:
                yield pageclass(site, rc['title'])
            except ValueError:
                if pageclass != pywikibot.FilePage:
                    raise

                pywikibot.exception()

    if site is None:
        site = pywikibot.Site()

    gen = site.recentchanges(**kwargs)
    gen.request['rcprop'] = 'title'
    gen = upcast(gen)

    if _filter_unique:
        gen = _filter_unique(gen)
    return gen
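# Usage sketch (illustrative addition): fetch recent changes restricted to
# the main namespace and de-duplicate repeated edits to the same page with
# the module-level helper. The keyword arguments are passed through to
# APISite.recentchanges(); the site, 'namespaces' and 'total' values used
# here are assumptions for demonstration.
def _example_recentchanges_usage() -> None:
    site = pywikibot.Site('test', 'wikipedia')  # assumed demo site
    gen = RecentChangesPageGenerator(site=site,
                                     _filter_unique=_filter_unique_pages,
                                     namespaces=[0], total=10)
    for page in gen:
        pywikibot.info(page.title())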
def UnconnectedPageGenerator(
    site: BaseSite | None = None,
    total: int | None = None
) -> Iterable[pywikibot.page.Page]:
    """Iterate Page objects for all unconnected pages to a Wikibase repository.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    if not site.data_repository():
        raise ValueError('The given site does not have Wikibase repository.')
    return site.unconnected_pages(total=total)
def FileLinksGenerator(
    referredFilePage: pywikibot.page.FilePage,  # noqa: N803
    total: int | None = None,
    content: bool = False
) -> Iterable[pywikibot.page.Page]:
    """Yield Pages on which referredFilePage file is displayed."""
    return referredFilePage.using_pages(total=total, content=content)
def ImagesPageGenerator(
    pageWithImages: pywikibot.page.Page,  # noqa: N803
    total: int | None = None,
    content: bool = False
) -> Iterable[pywikibot.page.Page]:
    """Yield FilePages displayed on pageWithImages."""
    return pageWithImages.imagelinks(total=total, content=content)
def InterwikiPageGenerator(page: pywikibot.page.Page
                           ) -> Generator[pywikibot.page.Page, None, None]:
    """Iterate over all interwiki (non-language) links on a page."""
    return (pywikibot.Page(link) for link in page.interwiki())
def LanguageLinksPageGenerator(page: pywikibot.page.Page,
                               total: int | None = None
                               ) -> Generator[pywikibot.page.Page, None, None]:
    """Iterate over all interwiki language links on a page."""
    return (pywikibot.Page(link) for link in page.iterlanglinks(total=total))
def CategorizedPageGenerator(category: pywikibot.page.Category,
                             recurse: int | bool = False,
                             start: str | None = None,
                             total: int | None = None,
                             content: bool = False,
                             namespaces: NamespaceArgType = None,
                             ) -> Generator[pywikibot.page.Page, None, None]:
    """Yield all pages in a specific category.

    :param recurse: if not False or 0, also iterate articles in
        subcategories. If an int, limit recursion to this number of
        levels. (Example: recurse=1 will iterate articles in first-level
        subcats, but no deeper.)
    :param start: if provided, only generate pages >= this title lexically
    :param total: iterate no more than this number of pages in total
        (at all levels)
    :param content: if True, retrieve the content of the current version
        of each page (default False)
    """
    yield from category.articles(
        content=content,
        namespaces=namespaces,
        recurse=recurse,
        startprefix=start,
        total=total,
    )
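# Usage sketch (illustrative addition): iterate the first few pages of a
# category, including one level of subcategories. The site and the category
# title are assumptions for demonstration purposes.
def _example_categorized_usage() -> None:
    site = pywikibot.Site('test', 'wikipedia')  # assumed demo site
    cat = pywikibot.Category(site, 'Category:Example')  # assumed title
    for page in CategorizedPageGenerator(cat, recurse=1, total=5,
                                         content=False):
        pywikibot.info(page.title())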
def SubCategoriesPageGenerator(category: pywikibot.page.Category,
                               recurse: int | bool = False,
                               start: str | None = None,
                               total: int | None = None,
                               content: bool = False,
                               ) -> Generator[pywikibot.page.Page, None, None]:
    """Yield all subcategories in a specific category.

    :param recurse: if not False or 0, also iterate articles in
        subcategories. If an int, limit recursion to this number of
        levels. (Example: recurse=1 will iterate articles in first-level
        subcats, but no deeper.)
    :param start: if provided, only generate pages >= this title lexically
    :param total: iterate no more than this number of pages in total
        (at all levels)
    :param content: if True, retrieve the content of the current version
        of each page (default False)
    """
    # TODO: page generator could be modified to use cmstartsortkey ...
    for s in category.subcategories(recurse=recurse,
                                    total=total, content=content):
        if start is None or s.title(with_ns=False) >= start:
            yield s
def LinkedPageGenerator(
    linkingPage: pywikibot.page.Page,  # noqa: N803
    total: int | None = None,
    content: bool = False
) -> Iterable[pywikibot.page.BasePage]:
    """Yield all pages linked from a specific page.

    See :py:obj:`page.BasePage.linkedPages` for details.

    :param linkingPage: the page that links to the pages we want
    :param total: the total number of pages to iterate
    :param content: if True, retrieve the current content of each linked page
    :return: a generator that yields Page objects of pages linked to
        linkingPage
    """
    return linkingPage.linkedPages(total=total, content=content)
def _yield_titles(f: codecs.StreamReaderWriter | io.StringIO,
                  site: pywikibot.site.BaseSite
                  ) -> Generator[pywikibot.page.Page, None, None]:
    """Yield page titles from a text stream.

    :param f: text stream object
    :param site: Site for generator results.
    :return: a generator that yields Page objects of pages with titles in
        text stream
    """
    linkmatch = None
    for linkmatch in pywikibot.link_regex.finditer(f.read()):
        # If the link is in interwiki format, the Page object may reside
        # on a different Site than the default.
        # This makes it possible to work on different wikis using a single
        # text file, but also could be dangerous because you might
        # inadvertently change pages on another wiki!
        yield pywikibot.Page(pywikibot.Link(linkmatch['title'], site))

    if linkmatch is not None:
        return

    f.seek(0)
    for title in f:
        title = title.strip()
        if '|' in title:
            title = title[:title.index('|')]
        if title:
            yield pywikibot.Page(site, title)
def TextIOPageGenerator(source: str | None = None,
                        site: BaseSite | None = None,
                        ) -> Generator[pywikibot.page.Page, None, None]:
    """Iterate pages from a list in a text file or on a webpage.

    The text source must contain page links between double-square-brackets
    or, alternatively, separated by newlines. The generator will yield each
    corresponding Page object.

    :param source: the file path or URL that should be read. If no name is
        given, the generator prompts the user.
    :param site: Site for generator results.
    """
    if source is None:
        source = pywikibot.input('Please enter the filename / URL:')
    if site is None:
        site = pywikibot.Site()
    # If source cannot be parsed as an HTTP URL, treat as local file
    if not urlparse(source).netloc:
        with codecs.open(source, 'r', config.textfile_encoding) as local_file:
            yield from _yield_titles(local_file, site)
    # Else, fetch the page (which should return text in the same format as
    # that expected in a file, i.e. pages separated by newlines or pages
    # enclosed in double brackets)
    else:
        with io.StringIO(http.fetch(source).text) as f:
            yield from _yield_titles(f, site)
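# Usage sketch (illustrative addition): read titles from a local text file,
# one [[wikilink]] or plain title per line; a URL would work the same way.
# The site and the file name 'titles.txt' are assumptions for demonstration.
def _example_textio_usage() -> None:
    site = pywikibot.Site('test', 'wikipedia')  # assumed demo site
    for page in TextIOPageGenerator('titles.txt', site=site):
        pywikibot.info(page.title())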
def PagesFromTitlesGenerator(iterable: Iterable[str],
                             site: BaseSite | None = None
                             ) -> Generator[pywikibot.page.Page, None, None]:
    """Generate pages from the titles (strings) yielded by iterable.

    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    for title in iterable:
        if not isinstance(title, str):
            break
        yield pywikibot.Page(pywikibot.Link(title, site))
def PagesFromPageidGenerator(
    pageids: Iterable[str],
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.Page]:
    """Return a page generator from pageids.

    Pages are iterated in the same order as the underlying pageids.
    Pageids are filtered and only one page is returned in case of a
    duplicate pageid.

    :param pageids: an iterable that returns pageids, or a comma-separated
        string of pageids (e.g. '945097,1483753,956608')
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()

    return site.load_pages_from_pageids(pageids)
def UserContributionsGenerator(username: str,
                               namespaces: NamespaceArgType = None,
                               site: BaseSite | None = None,
                               total: int | None = None,
                               _filter_unique: None | (Callable[
                                   [Iterable[pywikibot.page.Page]],
                                   Iterable[pywikibot.page.Page]])
                               = _filter_unique_pages
                               ) -> Iterable[pywikibot.page.Page]:
    """Yield unique pages edited by user:username.

    :param total: Maximum number of pages to retrieve in total
    :param namespaces: list of namespace numbers to fetch contribs from
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()

    user = pywikibot.User(site, username)
    if not (user.isAnonymous() or user.isRegistered()):
        pywikibot.warning(
            f'User "{user.username}" does not exist on site "{site}".')

    gen = (contrib[0] for contrib in user.contributions(
        namespaces=namespaces, total=total))
    if _filter_unique:
        return _filter_unique(gen)
    return gen
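# Usage sketch (illustrative addition): yield unique main-namespace pages
# edited by a given user. The site and the username 'Example' are
# assumptions for demonstration purposes.
def _example_usercontribs_usage() -> None:
    site = pywikibot.Site('test', 'wikipedia')  # assumed demo site
    for page in UserContributionsGenerator('Example', namespaces=[0],
                                           site=site, total=10):
        pywikibot.info(page.title())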
def NewimagesPageGenerator(total: int | None = None,
                           site: BaseSite | None = None
                           ) -> Generator[pywikibot.page.Page, None, None]:
    """New file generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return (entry.page()
            for entry in site.logevents(logtype='upload', total=total))
def WikibaseItemGenerator(gen: Iterable[pywikibot.page.Page]
                          ) -> Generator[pywikibot.page.ItemPage, None, None]:
    """A wrapper generator used to yield Wikibase items of another generator.

    :param gen: Generator to wrap.
    :return: Wrapped generator
    """
    for page in gen:
        if isinstance(page, pywikibot.ItemPage):
            yield page
        elif page.site.data_repository() == page.site:
            # These are already items, as they have a DataSite in page.site.
            # However generator is yielding Page, so convert to ItemPage.
            # FIXME: If we've already fetched content, we should retain it
            yield pywikibot.ItemPage(page.site, page.title())
        else:
            yield pywikibot.ItemPage.fromPage(page)
def AncientPagesPageGenerator(
    total: int = 100,
    site: BaseSite | None = None
) -> Generator[pywikibot.page.Page, None, None]:
    """Ancient page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return (page for page, _ in site.ancientpages(total=total))


def UnusedFilesGenerator(
    total: int | None = None,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.FilePage]:
    """Unused files generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.unusedfiles(total=total)


def WithoutInterwikiPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.Page]:
    """Page lacking interwikis generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.withoutinterwiki(total=total)


def UnCategorizedCategoryGenerator(
    total: int | None = 100,
    site: BaseSite | None = None
) -> Iterable[pywikibot.Category]:
    """Uncategorized category generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedcategories(total=total)


def UnCategorizedImageGenerator(
    total: int = 100,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.FilePage]:
    """Uncategorized file generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedimages(total=total)


def UnCategorizedPageGenerator(
    total: int = 100,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.Page]:
    """Uncategorized page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedpages(total=total)


def UnCategorizedTemplateGenerator(
    total: int = 100,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.Page]:
    """Uncategorized template generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedtemplates(total=total)


def LonelyPagesPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.Page]:
    """Lonely page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.lonelypages(total=total)


def UnwatchedPagesPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.Page]:
    """Unwatched page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.unwatchedpages(total=total)


def page_with_property_generator(
    name: str,
    total: int | None = None,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.Page]:
    """Special:PagesWithProperty page generator.

    :param name: Property name of pages to be retrieved
    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.pages_with_property(name, total=total)


def WantedPagesPageGenerator(
    total: int = 100,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.Page]:
    """Wanted page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.wantedpages(total=total)


def DeadendPagesPageGenerator(
    total: int = 100,
    site: BaseSite | None = None
) -> Iterable[pywikibot.page.Page]:
    """Dead-end page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.deadendpages(total=total)


def LongPagesPageGenerator(total: int = 100,
                           site: BaseSite | None = None
                           ) -> Generator[pywikibot.page.Page, None, None]:
    """Long page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return (page for page, _ in site.longpages(total=total))


def ShortPagesPageGenerator(total: int = 100,
                            site: BaseSite | None = None
                            ) -> Generator[pywikibot.page.Page, None, None]:
    """Short page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return (page for page, _ in site.shortpages(total=total))


def RandomPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None,
    namespaces: NamespaceArgType = None
) -> Iterable[pywikibot.page.Page]:
    """Random page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.randompages(total=total, namespaces=namespaces)


def RandomRedirectPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None,
    namespaces: NamespaceArgType = None,
) -> Iterable[pywikibot.page.Page]:
    """Random redirect generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.randompages(total=total, namespaces=namespaces,
                            redirects=True)


def LinksearchPageGenerator(
    url: str,
    namespaces: NamespaceArgType = None,
    total: int | None = None,
    site: BaseSite | None = None,
    protocol: str | None = None
) -> Iterable[pywikibot.page.Page]:
    """Yield all pages that link to a certain URL.

    :param url: The URL to search for (with or without the protocol
        prefix); this may include a '*' as a wildcard, only at the start
        of the hostname
    :param namespaces: list of namespace numbers to fetch contribs from
    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results
    :param protocol: Protocol to search for, likely http or https, http
        by default. Full list shown on Special:LinkSearch wikipage.
    """
    if site is None:
        site = pywikibot.Site()
    return site.exturlusage(url, namespaces=namespaces, protocol=protocol,
                            total=total, content=False)
def SearchPageGenerator(
    query: str,
    total: int | None = None,
    namespaces: NamespaceArgType = None,
    site: BaseSite | None = None,
    **kwargs
) -> Iterable[pywikibot.page.Page]:
    r"""Yield pages from the MediaWiki internal search engine.

    .. versionchanged:: 10.0
       Keyword arguments *content*, *sort* and *where* were added.

    .. seealso:: :meth:`site.search()
       <pywikibot.site._generators.GeneratorsMixin.search>`

    :param query: the text to search for
    :param total: Maximum number of pages to retrieve in total
    :param namespaces: search only in these namespaces (defaults to all)
    :param site: Site for generator results.
    :keyword str | None where: Where to search; value must be one of the
        given literals or None (many wikis do not support all search types)
    :keyword bool content: if True, load the current content of each
        iterated page (default False)
    :keyword sort: Set the sort order of returned results. If None is
        given, 'none' is used. Default is sort by relevance.
    """
    if site is None:
        site = pywikibot.Site()
    return site.search(query, total=total, namespaces=namespaces, **kwargs)
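# Usage sketch (illustrative addition): run a full-text search restricted to
# the main namespace, without loading page content. The site and the search
# string are assumptions; supported keyword values such as *where* depend on
# the wiki's search backend.
def _example_search_usage() -> None:
    site = pywikibot.Site('test', 'wikipedia')  # assumed demo site
    for page in SearchPageGenerator('example phrase', total=10,
                                    namespaces=[0], site=site,
                                    content=False):
        pywikibot.info(page.title())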
def LiveRCPageGenerator(site: BaseSite | None = None,
                        total: int | None = None
                        ) -> Generator[pywikibot.page.Page, None, None]:
    """Yield pages from a socket.io RC stream.

    Generates pages based on the EventStreams Server-Sent-Event (SSE) recent
    changes stream. The Page objects will have an extra property ._rcinfo
    containing the literal rc data. This can be used to e.g. filter only new
    pages. See `pywikibot.comms.eventstreams.rc_listener` for details on the
    .rcinfo format.

    :param site: site to return recent changes for
    :param total: the maximum number of changes to return
    """
    if site is None:
        site = pywikibot.Site()

    from pywikibot.comms.eventstreams import site_rc_listener

    for entry in site_rc_listener(site, total=total):
        # The title in a log entry may have been suppressed
        if 'title' not in entry and entry['type'] == 'log':
            continue
        page = pywikibot.Page(site, entry['title'], entry['namespace'])
        page._rcinfo = entry  # type: ignore[attr-defined]
        yield page
class GoogleSearchPageGenerator(GeneratorWrapper):

    """Page generator using Google search results.

    To use this generator, you need to install the googlesearch package::

        pip install googlesearch-python

    As there are concerns about Google's Terms of Service, this generator
    prints a warning for each query.

    .. seealso:: https://policies.google.com/terms

    .. versionchanged:: 7.6
       subclassed from :class:`tools.collections.GeneratorWrapper`

    .. versionchanged:: 10.1
       ``googlesearch-python`` package is needed instead of ``google``,
       see :phab:`T387618` for further information. The *total* parameter
       was added. The *query* parameter is positional only. All other
       parameters are keyword only.
    """

    def __init__(self, query: str = '', /, *,
                 site: BaseSite | None = None,
                 total: int = 10) -> None:
        """Initializer.

        :param query: the text to search for.
        :param site: Site for generator results.
        :param total: the maximum number of changes to return, default is
            10 which is also set by googlesearch package.
        """
        self.query = query or pywikibot.input(
            'Please enter the search query:')
        self.site = site or pywikibot.Site()
        self.limit = total
    @staticmethod
    def queryGoogle(query: str, /, **kwargs) -> Generator[str, None, None]:
        """Perform a query using ``googlesearch-python`` package.

        .. admonition:: Terms of Service

           The terms of service as at June 2014 give two conditions that
           may apply to use of search:

           1. Don't access [Google Services] using a method other than
              the interface and the instructions that [they] provide.
           2. Don't remove, obscure, or alter any legal notices
              displayed in or along with [Google] Services.

           Both of those issues should be managed by the
           ``googlesearch-python`` package, however Pywikibot will at
           least ensure the user sees the TOS in order to comply with
           the second condition.

           .. seealso:: https://policies.google.com/terms

        .. important:: These notes are from 2014 and have not been
           reviewed or updated since then.

        .. versionchanged:: 10.1
           *query* is positional only; the *kwargs* parameter was added.

        :param query: the text to search for.
        :param kwargs: other keyword arguments passed to ``googlesearch``
            module.
        """
        try:
            import googlesearch
        except ModuleNotFoundError:
            pywikibot.error("""\
generator GoogleSearchPageGenerator depends on package 'googlesearch-python'.
To install, please run: pip install googlesearch-python""")
            return

        pywikibot.warning('Please read http://www.google.com/accounts/TOS')
        yield from googlesearch.search(query, **kwargs)
    @property
    def generator(self) -> Generator[pywikibot.page.Page, None, None]:
        """Yield results from :meth:`queryGoogle` query.

        .. versionchanged:: 7.6
           changed from iterator method to generator property
        .. versionchanged:: 10.1
           use :meth:`site.protocol
           <pywikibot.site._basesite.BaseSite.protocol>` to get the base
           URL. Also filter duplicates.
        """
        if not self.query:
            pywikibot.warning('No query string was specified')
            return

        # restrict query to local site
        site = self.site
        local_query = f'{self.query} site:{site.hostname()}'
        base = f'{site.protocol()}://{site.hostname()}{site.articlepath}'
        pattern = re.compile(base.replace('{}', '(?P<title>.+)'))
        for url in self.queryGoogle(local_query,
                                    num_results=self.limit,
                                    unique=True):
            m = pattern.fullmatch(url)
            if not m:
                continue

            page = pywikibot.Page(pywikibot.Link(m['title'], site))
            # Google may contain links in the format:
            # https://de.wikipedia.org/wiki/en:Foobar
            if page.site == site:
                yield page
    def set_maximum_items(self, value: int, /):
        """Set the maximum number of items to be retrieved from google.

        This method is added to be used by the
        :class:`pagegenerators.GeneratorFactory` to circumvent call of
        :func:`itertools.islice` filter for this generator.

        .. versionadded:: 10.1
        """
        self.limit = value
def MySQLPageGenerator(query: str, site: BaseSite | None = None,
                       verbose: bool | None = None
                       ) -> Generator[pywikibot.page.Page, None, None]:
    """Yield a list of pages based on a MySQL query.

    The query should return two columns, page namespace and page title
    pairs from some table. An example query that yields all ns0 pages
    might look like::

        SELECT page_namespace, page_title
        FROM page
        WHERE page_namespace = 0;

    .. seealso:: :manpage:`MySQL`

    :param query: MySQL query to execute
    :param site: Site object
    :param verbose: if True, print query to be executed;
        if None, config.verbose_output will be used.
    :return: generator which yields pywikibot.Page
    """
    from pywikibot.data import mysql

    if site is None:
        site = pywikibot.Site()

    row_gen = mysql.mysql_query(query,
                                dbname=site.dbName(),
                                verbose=verbose)

    for row in row_gen:
        namespace_number, page_name = row
        page_name = page_name.decode(site.encoding())
        page = pywikibot.Page(site, page_name, ns=int(namespace_number))
        yield page
def SupersetPageGenerator(query: str,
                          site: BaseSite | None = None,
                          schema_name: str | None = None,
                          database_id: int | None = None
                          ) -> Iterator[pywikibot.page.Page]:
    """Generate pages that result from the given SQL query.

    Pages are generated using the site, determined in the following order:

    1. site retrieved using the page_wikidb column in the SQL result
    2. site given as parameter
    3. site retrieved using schema_name

    SQL columns used are:

    - page_id
    - page_namespace + page_title
    - page_wikidb

    Example SQL queries:

    .. code-block:: sql

       SELECT gil_wiki AS page_wikidb, gil_page AS page_id
       FROM globalimagelinks
       GROUP BY gil_wiki
       LIMIT 10

    OR

    .. code-block:: sql

       SELECT page_id
       FROM page
       LIMIT 10

    OR

    .. code-block:: sql

       SELECT page_namespace, page_title
       FROM page
       LIMIT 10

    .. versionadded:: 9.2

    :param query: the SQL query string.
    :param site: Site for generator results.
    :param schema_name: target superset schema name
    :param database_id: target superset database id
    """
    from pywikibot.data.superset import SupersetQuery

    # Do not pass site to superset if schema_name is defined.
    # The user may use schema_name to point to a different
    # wikimedia db on purpose and use site for
    # generating result pages.
    superset_site = None if schema_name else site

    superset = SupersetQuery(site=superset_site,
                             schema_name=schema_name,
                             database_id=database_id)
    try:
        rows = superset.query(query)
    except Exception as e:
        pywikibot.error(f'Error executing query: {query}\n{e}')
        return

    sites = {}

    # If there is no site then retrieve it using schema_name
    if not site:
        if not schema_name:
            raise TypeError('Schema name or site must be provided.')
        wikidb = re.sub('_p$', '', schema_name)
        site = pywikibot.site.APISite.fromDBName(wikidb)

    for row in rows:
        # If the page_wikidb column is in the SQL result
        # then use it to retrieve the site
        if 'page_wikidb' in row:
            # remove "_p" suffix
            wikidb = re.sub('_p$', '', row['page_wikidb'])

            # Caching sites
            if wikidb not in sites:
                try:
                    sites[wikidb] = pywikibot.site.APISite.fromDBName(wikidb)
                except ValueError:
                    msg = f'Cannot parse a site from {wikidb} for {row}.'
                    pywikibot.warning(msg)
                    continue
            site = sites[wikidb]

        # Generate page objects

        # Create page object from page_id
        if 'page_id' in row:
            page_ids = [row['page_id']]
            pages = site.load_pages_from_pageids(page_ids)
            for page in pages:
                yield page

        # Create page object from page_namespace + page_title
        elif 'page_title' in rows[0] and 'page_namespace' in rows[0]:
            page_namespace = int(row['page_namespace'])
            page_title = row['page_title']
            page = pywikibot.Page(site, page_title, ns=page_namespace)
            yield page

        else:
            raise ValueError('The SQL result is in wrong format.')
class XMLDumpPageGenerator(abc.Iterator):  # type: ignore[type-arg]

    """Xml iterator that yields Page objects.

    .. versionadded:: 7.2
       the `content` parameter

    :param filename: filename of XML dump
    :param start: skip entries below that value
    :param namespaces: namespace filter
    :param site: current site for the generator
    :param text_predicate: a callable with entry.text as parameter and
        boolean as result to indicate the generator should return the
        page or not
    :param content: If True, assign old page content to Page.text
    :ivar skipping: True if start parameter is given, else False
    :ivar parser: holds the xmlreader.XmlDump parse method
    """

    def __init__(
        self,
        filename: str,
        start: str | None = None,
        namespaces: NamespaceArgType = None,
        site: BaseSite | None = None,
        text_predicate: Callable[[str], bool] | None = None,
        content=False,
    ) -> None:
        """Initializer."""
        self.text_predicate = text_predicate
        self.content = content
        self.skipping = bool(start)

        self.start: str | None = None
        if start is not None and self.skipping:
            self.start = start.replace('_', ' ')

        self.site = site or pywikibot.Site()
        if not namespaces:
            self.namespaces = self.site.namespaces
        else:
            self.namespaces = self.site.namespaces.resolve(namespaces)
        dump = xmlreader.XmlDump(filename, on_error=pywikibot.error)
        self.parser = dump.parse()

    def __next__(self) -> pywikibot.page.Page:
        """Get next Page."""
        while True:
            entry = next(self.parser)
            if self.skipping:
                if entry.title < self.start:
                    continue
                self.skipping = False
            page = pywikibot.Page(self.site, entry.title)
            if page.namespace() not in self.namespaces:
                continue
            if not self.text_predicate or self.text_predicate(entry.text):
                if self.content:
                    page.text = entry.text
                return page
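# Usage sketch (illustrative addition): iterate pages from an XML dump,
# starting at a given title and keeping only entries whose text contains a
# marker string. The dump file name, the start title and the predicate are
# assumptions for demonstration purposes.
def _example_xmldump_usage() -> None:
    site = pywikibot.Site('test', 'wikipedia')  # assumed demo site
    gen = XMLDumpPageGenerator('dump.xml.bz2',  # assumed dump file
                               start='Example',
                               namespaces=[0],
                               site=site,
                               text_predicate=lambda text: 'stub' in text)
    for page in gen:
        pywikibot.info(page.title())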
def YearPageGenerator(start: int = 1, end: int = 2050,
                      site: BaseSite | None = None
                      ) -> Generator[pywikibot.page.Page, None, None]:
    """Year page generator.

    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    pywikibot.info(f'Starting with year {start}')
    for i in range(start, end + 1):
        if i % 100 == 0:
            pywikibot.info(f'Preparing {i}...')
        # There is no year 0
        if i != 0:
            current_year = date.formatYear(site.lang, i)
            yield pywikibot.Page(pywikibot.Link(current_year, site))
def DayPageGenerator(start_month: int = 1, end_month: int = 12,
                     site: BaseSite | None = None, year: int = 2000
                     ) -> Generator[pywikibot.page.Page, None, None]:
    """Day page generator.

    :param site: Site for generator results.
    :param year: the year used to determine month lengths (relevant for
        leap years).
    """
    if site is None:
        site = pywikibot.Site()
    lang = site.lang
    first_page = pywikibot.Page(site, date.format_date(start_month, 1, lang))
    pywikibot.info(f'Starting with {first_page.title(as_link=True)}')
    for month in range(start_month, end_month + 1):
        for day in range(1, calendar.monthrange(year, month)[1] + 1):
            yield pywikibot.Page(
                pywikibot.Link(date.format_date(month, day, lang), site))
def WikidataPageFromItemGenerator(
    gen: Iterable[pywikibot.page.ItemPage],
    site: pywikibot.site.BaseSite,
) -> Generator[pywikibot.page.Page, None, None]:
    """Generate pages from site based on sitelinks of item pages.

    :param gen: generator of :py:obj:`pywikibot.ItemPage`
    :param site: Site for generator results.
    """
    repo = site.data_repository()
    for batch in batched(gen, 50):
        req = {'ids': [item.id for item in batch],
               'sitefilter': site.dbName(),
               'action': 'wbgetentities',
               'props': 'sitelinks'}
        wbrequest = repo.simple_request(**req)
        wbdata = wbrequest.submit()
        entities = (item for item in wbdata['entities'].values()
                    if 'sitelinks' in item
                    and site.dbName() in item['sitelinks'])
        sitelinks = (item['sitelinks'][site.dbName()]['title']
                     for item in entities)
        for sitelink in sitelinks:
            yield pywikibot.Page(site, sitelink)
def WikidataSPARQLPageGenerator(query: str,
                                site: BaseSite | None = None,
                                item_name: str = 'item',
                                endpoint: str | None = None,
                                entity_url: str | None = None,
                                result_type: Any = set
                                ) -> Iterator[pywikibot.page.Page]:
    """Generate pages that result from the given SPARQL query.

    :param query: the SPARQL query string.
    :param site: Site for generator results.
    :param item_name: name of the item in the SPARQL query
    :param endpoint: SPARQL endpoint URL
    :param entity_url: URL prefix for any entities returned in a query.
    :param result_type: type of the iterable in which SPARQL results are
        stored (default set)
    """
    from pywikibot.data import sparql

    if site is None:
        site = pywikibot.Site()
    repo = site.data_repository()
    dependencies = {'endpoint': endpoint, 'entity_url': entity_url}
    if not endpoint or not entity_url:
        dependencies['repo'] = repo
    query_object = sparql.SparqlQuery(**dependencies)  # type: ignore[arg-type]
    data = query_object.get_items(query,
                                  item_name=item_name,
                                  result_type=result_type)
    entities = (repo.get_entity_for_entity_id(entity) for entity in data)
    if isinstance(site, pywikibot.site.DataSite):
        return entities

    return WikidataPageFromItemGenerator(entities, site)
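# Usage sketch (illustrative addition): generate item pages from a SPARQL
# query against the default Wikidata endpoint. The repository site and the
# query (instances of Q5, limited to 5 results) are assumptions for
# demonstration purposes.
def _example_sparql_usage() -> None:
    site = pywikibot.Site('wikidata', 'wikidata')  # assumed repository site
    query = 'SELECT ?item WHERE { ?item wdt:P31 wd:Q5 } LIMIT 5'
    for item in WikidataSPARQLPageGenerator(query, site=site):
        pywikibot.info(item.title())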
def WikibaseSearchItemPageGenerator(
    text: str,
    language: str | None = None,
    total: int | None = None,
    site: BaseSite | None = None,
) -> Generator[pywikibot.page.ItemPage, None, None]:
    """Generate pages that contain the provided text.

    :param text: Text to look for.
    :param language: Code of the language to search in. If not specified,
        value from pywikibot.config.data_lang is used.
    :param total: Maximum number of pages to retrieve in total, or None in
        case of no limit.
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    if language is None:
        language = site.lang
    repo = site.data_repository()

    data = repo.search_entities(text, language, total=total)
    return (pywikibot.ItemPage(repo, item['id']) for item in data)
class PetScanPageGenerator(GeneratorWrapper):

    """Queries PetScan to generate pages.

    .. seealso:: https://petscan.wmflabs.org/
    .. versionadded:: 3.0
    .. versionchanged:: 7.6
       subclassed from :class:`tools.collections.GeneratorWrapper`
    """

    def __init__(
        self,
        categories: Sequence[str],
        subset_combination: bool = True,
        namespaces: Iterable[int | pywikibot.site.Namespace] | None = None,
        site: BaseSite | None = None,
        extra_options: dict[Any, Any] | None = None
    ) -> None:
        """Initializer.

        :param categories: List of category names to retrieve pages from
        :param subset_combination: Combination mode. If True, returns the
            intersection of the results of the categories, else returns
            the union of the results of the categories
        :param namespaces: List of namespaces to search in (default is
            None, meaning all namespaces)
        :param site: Site to operate on (default is the default site from
            the user config)
        :param extra_options: Dictionary of extra options to use (optional)
        """
        if site is None:
            site = pywikibot.Site()

        self.site = site
        self.opts = self.buildQuery(categories, subset_combination,
                                    namespaces, extra_options)
    def buildQuery(self, categories: Sequence[str],
                   subset_combination: bool,
                   namespaces: Iterable[int | pywikibot.site.Namespace]
                   | None,
                   extra_options: dict[Any, Any] | None) -> dict[str, Any]:
        """Get the querystring options to query PetScan.

        :param categories: List of categories (as strings)
        :param subset_combination: Combination mode. If True, returns the
            intersection of the results of the categories, else returns
            the union of the results of the categories
        :param namespaces: List of namespaces to search in
        :param extra_options: Dictionary of extra options to use
        :return: Dictionary of querystring parameters to use in the query
        """
        extra_options = extra_options or {}

        query = {
            'language': self.site.code,
            'project': self.site.hostname().split('.')[-2],
            'combination': 'subset' if subset_combination else 'union',
            'categories': '\r\n'.join(categories),
            'format': 'json',
            'doit': ''
        }

        if namespaces:
            for namespace in namespaces:
                query[f'ns[{int(namespace)}]'] = 1

        query_final = query.copy()
        query_final.update(extra_options)

        return query_final
    def query(self) -> Generator[dict[str, Any], None, None]:
        """Query PetScan.

        .. versionchanged:: 7.4
           raises :class:`APIError` if query returns an error message.

        :raises ServerError: Either ReadTimeout or server status error
        :raises APIError: error response from petscan
        """
        url = 'https://petscan.wmflabs.org'

        try:
            req = http.fetch(url, params=self.opts)
        except ReadTimeout:
            raise ServerError(f'received ReadTimeout from {url}')

        server_err = HTTPStatus.INTERNAL_SERVER_ERROR
        if server_err <= req.status_code < server_err + 100:
            raise ServerError(
                f'received {req.status_code} status from {req.url}')

        data = req.json()
        if 'error' in data:
            raise APIError('Petscan', data['error'], **self.opts)

        raw_pages = data['*'][0]['a']['*']
        yield from raw_pages
    @property
    def generator(self) -> Generator[pywikibot.page.Page, None, None]:
        """Yield results from :meth:`query`.

        .. versionchanged:: 7.6
           changed from iterator method to generator property
        """
        for raw_page in self.query():
            yield pywikibot.Page(self.site, raw_page['title'],
                                 int(raw_page['namespace']))
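# Usage sketch (illustrative addition): query PetScan for pages in the
# intersection of two categories within the main namespace. The site and
# the category names are assumptions for demonstration purposes.
def _example_petscan_usage() -> None:
    site = pywikibot.Site('test', 'wikipedia')  # assumed demo site
    gen = PetScanPageGenerator(['Example category A', 'Example category B'],
                               subset_combination=True,
                               namespaces=[0],
                               site=site)
    for page in gen:
        pywikibot.info(page.title())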
class PagePilePageGenerator(GeneratorWrapper):

    """Queries PagePile to generate pages.

    .. seealso:: https://pagepile.toolforge.org/
    .. versionadded:: 9.0
    """

    def __init__(self, id: int):
        """Initializer.

        :param id: The PagePile id to query
        """
        self.opts = self.buildQuery(id)
    def buildQuery(self, id: int):
        """Get the querystring options to query PagePile.

        :param id: int
        :return: Dictionary of querystring parameters to use in the query
        """
        query = {
            'id': id,
            'action': 'get_data',
            'format': 'json',
            'doit': ''
        }

        return query
    def query(self) -> Generator[str, None, None]:
        """Query PagePile.

        :raises ServerError: Either ReadTimeout or server status error
        :raises APIError: error response from PagePile
        """
        url = 'https://pagepile.toolforge.org/api.php'
        req = http.fetch(url, params=self.opts)
        data = req.json()
        if 'error' in data:
            raise APIError('PagePile', data['error'], **self.opts)

        self.site = pywikibot.site.APISite.fromDBName(data['wiki'])
        raw_pages = data['pages']
        yield from raw_pages
    @property
    def generator(self) -> Generator[pywikibot.page.Page, None, None]:
        """Yield results from :meth:`query`."""
        for raw_page in self.query():
            page = pywikibot.Page(self.site, raw_page)
            yield page