"""Page filter generators provided by the pagegenerators module."""## (C) Pywikibot team, 2008-2024## Distributed under the terms of the MIT license.#from__future__importannotationsimportcalendarimportcodecsimportioimportreimportsysimporttypingfromcollectionsimportabcfromfunctoolsimportpartialfromhttpimportHTTPStatusfromtypingimportTYPE_CHECKING,Anyfromurllib.parseimporturlparsefromrequests.exceptionsimportReadTimeoutimportpywikibotfrompywikibotimportconfig,date,xmlreaderfrompywikibot.backportsimport(Callable,Generator,Iterable,Iterator,Sequence,batched,)frompywikibot.commsimporthttpfrompywikibot.exceptionsimportAPIError,ServerErrorfrompywikibot.siteimportNamespacefrompywikibot.toolsimportissue_deprecation_warningfrompywikibot.tools.collectionsimportGeneratorWrapperfrompywikibot.tools.itertoolsimportfilter_uniqueifTYPE_CHECKING:frompywikibot.siteimportBaseSite,NamespaceArgTypefrompywikibot.site._namespaceimportSingleNamespaceTypefrompywikibot.timeimportTimestamp# This is the function that will be used to de-duplicate page iterators._filter_unique_pages=partial(filter_unique,key=lambdapage:'{}:{}:{}'.format(*page._cmpkey()))
def AllpagesPageGenerator(
    start: str = '!',
    namespace: SingleNamespaceType = 0,
    includeredirects: typing.Literal['only'] | bool = True,
    site: BaseSite | None = None,
    total: int | None = None,
    content: bool = False,
    *,
    filterredir: bool | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Iterate Page objects for all titles in a single namespace.

    .. deprecated:: 10.0
       The *includeredirects* parameter; use *filterredir* instead.

    .. seealso:: :meth:`APISite.allpages()
       <pywikibot.site._generators.GeneratorsMixin.allpages>`

    :param start: if provided, only generate pages >= this title
        lexically
    :param namespace: Namespace to retrieve pages from
    :param includeredirects: If False, redirects are not included. If
        it equals the string 'only', only redirects are added.
        Otherwise redirects will be included. This parameter is
        deprecated; use *filterredir* instead.
    :param site: Site for generator results.
    :param total: Maximum number of pages to retrieve in total
    :param content: If True, load the current version of each page
        (default False)
    :param filterredir: if True, only yield redirects; if False (and
        not None), only yield non-redirects (default: yield both).
    :return: a generator that yields Page objects
    :raises ValueError: *filterredir* as well as *includeredirects*
        parameters were given. Use *filterredir* only.
    """
    if site is None:
        site = pywikibot.Site()

    if filterredir is not None and includeredirects is not True:
        raise ValueError(
            f'filterredir parameter ({filterredir}) is used together with '
            f'outdated includeredirects parameter ({includeredirects}).')

    # backward compatibility
    if includeredirects is not True:
        if not includeredirects:
            filterredir = False
        elif includeredirects == 'only':
            filterredir = True

        issue_deprecation_warning(
            f'includeredirects parameter ({includeredirects})',
            f'filterredir={filterredir}',
            since='10.0.0')

    return site.allpages(start=start, namespace=namespace,
                         filterredir=filterredir, total=total,
                         content=content)

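# A minimal usage sketch (illustrative only, not part of the original
# module; it assumes a default site is configured in user-config.py):
#
#     for page in AllpagesPageGenerator(start='A', total=5,
#                                       filterredir=False):
#         print(page.title())
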
def PrefixingPageGenerator(
    prefix: str,
    namespace: SingleNamespaceType | None = None,
    includeredirects: typing.Literal['only'] | bool = True,
    site: BaseSite | None = None,
    total: int | None = None,
    content: bool = False,
    *,
    filterredir: bool | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Prefixed Page generator.

    .. deprecated:: 10.0
       The *includeredirects* parameter; use *filterredir* instead.

    :param prefix: The prefix of the pages.
    :param namespace: Namespace to retrieve pages from
    :param includeredirects: If False, redirects are not included. If
        it equals the string 'only', only redirects are added.
        Otherwise redirects will be included. This parameter is
        deprecated; use *filterredir* instead.
    :param site: Site for generator results.
    :param total: Maximum number of pages to retrieve in total
    :param content: If True, load the current version of each page
        (default False)
    :param filterredir: if True, only yield redirects; if False (and
        not None), only yield non-redirects (default: yield both).
    :return: a generator that yields Page objects
    :raises ValueError: *filterredir* as well as *includeredirects*
        parameters were given. Use *filterredir* only.
    """
    if site is None:
        site = pywikibot.Site()

    prefixlink = pywikibot.Link(prefix, site)
    if namespace is None:
        namespace = prefixlink.namespace
    title = prefixlink.title

    if filterredir is not None and includeredirects is not True:
        raise ValueError(
            f'filterredir parameter ({filterredir}) is used together with '
            f'outdated includeredirects parameter ({includeredirects}).')

    # backward compatibility
    if includeredirects is not True:
        if not includeredirects:
            filterredir = False
        elif includeredirects == 'only':
            filterredir = True

        issue_deprecation_warning(
            f'includeredirects parameter ({includeredirects})',
            f'filterredir={filterredir}',
            since='10.0.0')

    return site.allpages(prefix=title, namespace=namespace,
                         filterredir=filterredir, total=total,
                         content=content)

def LogeventsPageGenerator(
    logtype: str | None = None,
    user: str | None = None,
    site: BaseSite | None = None,
    namespace: SingleNamespaceType | None = None,
    total: int | None = None,
    start: Timestamp | None = None,
    end: Timestamp | None = None,
    reverse: bool = False,
) -> Generator[pywikibot.page.Page, None, None]:
    """Generate Pages for specified modes of logevents.

    :param logtype: Mode of logs to retrieve
    :param user: User of logs retrieved
    :param site: Site for generator results
    :param namespace: Namespace to retrieve logs from
    :param total: Maximum number of pages to retrieve in total
    :param start: Timestamp to start listing from
    :param end: Timestamp to end listing at
    :param reverse: if True, start with oldest changes (default: newest)
    """
    if site is None:
        site = pywikibot.Site()
    for entry in site.logevents(total=total, logtype=logtype, user=user,
                                namespace=namespace, start=start, end=end,
                                reverse=reverse):
        try:
            yield entry.page()
        except KeyError as e:
            pywikibot.warning('LogeventsPageGenerator: failed to load page '
                              f'for {entry.data!r}; skipping')
            pywikibot.error(e)

def NewpagesPageGenerator(
    site: BaseSite | None = None,
    namespaces: NamespaceArgType = (0,),
    total: int | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Iterate Page objects for all new titles in a single namespace.

    :param site: Site for generator results.
    :param namespaces: namespace to retrieve pages from
    :param total: Maximum number of pages to retrieve in total
    """
    # API does not (yet) have a newpages function, so this tries to
    # duplicate it by filtering the recentchanges output.
    # Defaults to namespace 0 because that's how Special:Newpages defaults.
    if site is None:
        site = pywikibot.Site()
    return (page for page, _ in site.newpages(namespaces=namespaces,
                                              total=total, returndict=True))

def RecentChangesPageGenerator(
    site: BaseSite | None = None,
    _filter_unique: None | Callable[
        [Iterable[pywikibot.Page]], Iterable[pywikibot.Page]] = None,
    **kwargs: Any,
) -> Generator[pywikibot.Page, None, None]:
    """Generate recent changes pages, including duplicates.

    For keyword parameters refer :meth:`APISite.recentchanges()
    <pywikibot.site._generators.GeneratorsMixin.recentchanges>`.

    .. versionchanged:: 8.2
       The YieldType depends on namespace. It can be
       :class:`pywikibot.Page<pywikibot.page.Page>`,
       :class:`pywikibot.User<pywikibot.page.User>`,
       :class:`pywikibot.FilePage<pywikibot.page.FilePage>` or
       :class:`pywikibot.Category<pywikibot.page.Category>`.

    .. versionchanged:: 9.4
       Ignore :class:`pywikibot.FilePage<pywikibot.page.FilePage>` if it
       raises a :exc:`ValueError` during upcast, e.g. due to an invalid
       file extension.

    :param site: Site for generator results.
    """
    def upcast(gen):
        """Upcast pywikibot.Page type."""
        for rc in gen:
            # The title in a log entry may have been suppressed
            if rc['type'] == 'log' and 'title' not in rc:
                continue

            ns = rc['ns']
            if ns == Namespace.USER:
                pageclass: type[pywikibot.Page] = pywikibot.User
            elif ns == Namespace.FILE:
                pageclass = pywikibot.FilePage
            elif ns == Namespace.CATEGORY:
                pageclass = pywikibot.Category
            else:
                pageclass = pywikibot.Page

            try:
                yield pageclass(site, rc['title'])
            except ValueError:
                if pageclass != pywikibot.FilePage:
                    raise

                pywikibot.exception()

    if site is None:
        site = pywikibot.Site()

    gen = site.recentchanges(**kwargs)
    gen.request['rcprop'] = 'title'
    gen = upcast(gen)

    if _filter_unique:
        gen = _filter_unique(gen)
    return gen

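# Illustrative sketch: keyword arguments are passed through to
# APISite.recentchanges(), so e.g. restricting namespaces works as
# shown below (assumes a configured default site):
#
#     site = pywikibot.Site()
#     for page in RecentChangesPageGenerator(site=site, namespaces=[0],
#                                            total=10):
#         print(page.title())
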
def UnconnectedPageGenerator(
    site: BaseSite | None = None,
    total: int | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Iterate Page objects for all pages unconnected to a Wikibase repository.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    if not site.data_repository():
        raise ValueError('The given site does not have a Wikibase repository.')
    return site.unconnected_pages(total=total)

def FileLinksGenerator(
    referredFilePage: pywikibot.page.FilePage,  # noqa: N803
    total: int | None = None,
    content: bool = False,
) -> Iterable[pywikibot.page.Page]:
    """Yield Pages on which the referredFilePage file is displayed."""
    return referredFilePage.using_pages(total=total, content=content)

def ImagesPageGenerator(
    pageWithImages: pywikibot.page.Page,  # noqa: N803
    total: int | None = None,
    content: bool = False,
) -> Iterable[pywikibot.page.Page]:
    """Yield FilePages displayed on pageWithImages."""
    return pageWithImages.imagelinks(total=total, content=content)

def InterwikiPageGenerator(
    page: pywikibot.page.Page,
) -> Generator[pywikibot.page.Page, None, None]:
    """Iterate over all interwiki (non-language) links on a page."""
    return (pywikibot.Page(link) for link in page.interwiki())

def LanguageLinksPageGenerator(
    page: pywikibot.page.Page,
    total: int | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Iterate over all interwiki language links on a page."""
    return (pywikibot.Page(link) for link in page.iterlanglinks(total=total))

def CategorizedPageGenerator(
    category: pywikibot.page.Category,
    recurse: int | bool = False,
    start: str | None = None,
    total: int | None = None,
    content: bool = False,
    namespaces: NamespaceArgType = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Yield all pages in a specific category.

    :param recurse: if not False or 0, also iterate articles in
        subcategories. If an int, limit recursion to this number of
        levels. (Example: recurse=1 will iterate articles in first-level
        subcats, but no deeper.)
    :param start: if provided, only generate pages >= this title
        lexically
    :param total: iterate no more than this number of pages in total
        (at all levels)
    :param content: if True, retrieve the content of the current version
        of each page (default False)
    """
    yield from category.articles(
        content=content,
        namespaces=namespaces,
        recurse=recurse,
        startprefix=start,
        total=total,
    )

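# Illustrative sketch of category recursion (the category name is a
# hypothetical example):
#
#     site = pywikibot.Site()
#     cat = pywikibot.Category(site, 'Category:Physics')
#     # recurse=1 also yields articles from first-level subcategories
#     for page in CategorizedPageGenerator(cat, recurse=1, total=20):
#         print(page.title())
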
def SubCategoriesPageGenerator(
    category: pywikibot.page.Category,
    recurse: int | bool = False,
    start: str | None = None,
    total: int | None = None,
    content: bool = False,
) -> Generator[pywikibot.page.Page, None, None]:
    """Yield all subcategories in a specific category.

    :param recurse: if not False or 0, also iterate subcategories of
        subcategories. If an int, limit recursion to this number of
        levels. (Example: recurse=1 will iterate direct subcats and
        first-level sub-sub-cats, but no deeper.)
    :param start: if provided, only generate pages >= this title
        lexically
    :param total: iterate no more than this number of pages in total
        (at all levels)
    :param content: if True, retrieve the content of the current version
        of each page (default False)
    """
    # TODO: page generator could be modified to use cmstartsortkey ...
    for s in category.subcategories(recurse=recurse, total=total,
                                    content=content):
        if start is None or s.title(with_ns=False) >= start:
            yield s

def LinkedPageGenerator(
    linkingPage: pywikibot.page.Page,  # noqa: N803
    total: int | None = None,
    content: bool = False,
) -> Iterable[pywikibot.page.BasePage]:
    """Yield all pages linked from a specific page.

    See :py:obj:`page.BasePage.linkedPages` for details.

    :param linkingPage: the page that links to the pages we want
    :param total: the total number of pages to iterate
    :param content: if True, retrieve the current content of each
        linked page
    :return: a generator that yields Page objects of pages linked to
        linkingPage
    """
    return linkingPage.linkedPages(total=total, content=content)

def _yield_titles(
    f: codecs.StreamReaderWriter | io.StringIO,
    site: pywikibot.site.BaseSite,
) -> Generator[pywikibot.page.Page, None, None]:
    """Yield page titles from a text stream.

    :param f: text stream object
    :param site: Site for generator results.
    :return: a generator that yields Page objects of pages with titles
        in text stream
    """
    linkmatch = None
    for linkmatch in pywikibot.link_regex.finditer(f.read()):
        # If the link is in interwiki format, the Page object may reside
        # on a different Site than the default.
        # This makes it possible to work on different wikis using a single
        # text file, but also could be dangerous because you might
        # inadvertently change pages on another wiki!
        yield pywikibot.Page(pywikibot.Link(linkmatch['title'], site))

    if linkmatch is not None:
        return

    f.seek(0)
    for title in f:
        title = title.strip()
        if '|' in title:
            title = title[:title.index('|')]
        if title:
            yield pywikibot.Page(site, title)

def TextIOPageGenerator(
    source: str | None = None,
    site: BaseSite | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Iterate pages from a list in a text file or on a webpage.

    The text source must contain page links between double square
    brackets or, alternatively, separated by newlines. The generator
    will yield each corresponding Page object.

    :param source: the file path or URL that should be read. If no name
        is given, the generator prompts the user.
    :param site: Site for generator results.
    """
    if source is None:
        source = pywikibot.input('Please enter the filename / URL:')
    if site is None:
        site = pywikibot.Site()

    # If source cannot be parsed as an HTTP URL, treat it as a local file.
    if not urlparse(source).netloc:
        with codecs.open(source, 'r', config.textfile_encoding) as local_file:
            yield from _yield_titles(local_file, site)
    # Else, fetch the page. It should return text in the same format as
    # that expected in a file, i.e. pages separated by newlines or pages
    # enclosed in double brackets.
    else:
        with io.StringIO(http.fetch(source).text) as f:
            yield from _yield_titles(f, site)

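# Illustrative sketch ('titles.txt' is a hypothetical file containing
# either [[Wikilink]] entries or one bare title per line):
#
#     for page in TextIOPageGenerator('titles.txt'):
#         print(page.title())
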
def PagesFromTitlesGenerator(
    iterable: Iterable[str],
    site: BaseSite | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Generate pages from the titles (strings) yielded by iterable.

    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    for title in iterable:
        if not isinstance(title, str):
            break
        yield pywikibot.Page(pywikibot.Link(title, site))

def PagesFromPageidGenerator(
    pageids: Iterable[str],
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Return a page generator from pageids.

    Pages are iterated in the same order as in the underlying pageids.
    Pageids are filtered and only one page is returned in case of
    duplicate pageids.

    :param pageids: an iterable that returns pageids, or a
        comma-separated string of pageids (e.g. '945097,1483753,956608')
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.load_pages_from_pageids(pageids)

def UserContributionsGenerator(
    username: str,
    namespaces: NamespaceArgType = None,
    site: BaseSite | None = None,
    total: int | None = None,
    _filter_unique: None | Callable[
        [Iterable[pywikibot.page.Page]],
        Iterable[pywikibot.page.Page]] = _filter_unique_pages,
) -> Iterable[pywikibot.page.Page]:
    """Yield unique pages edited by user:username.

    :param total: Maximum number of pages to retrieve in total
    :param namespaces: list of namespace numbers to fetch contribs from
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()

    user = pywikibot.User(site, username)
    if not (user.isAnonymous() or user.isRegistered()):
        pywikibot.warning(
            f'User "{user.username}" does not exist on site "{site}".')

    gen = (contrib[0] for contrib in user.contributions(
        namespaces=namespaces, total=total))
    if _filter_unique:
        return _filter_unique(gen)
    return gen

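# Illustrative sketch (the username is a hypothetical example):
#
#     for page in UserContributionsGenerator('Example', namespaces=[0],
#                                            total=50):
#         print(page.title())
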
def NewimagesPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """New file generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return (entry.page()
            for entry in site.logevents(logtype='upload', total=total))

def WikibaseItemGenerator(
    gen: Iterable[pywikibot.page.Page],
) -> Generator[pywikibot.page.ItemPage, None, None]:
    """A wrapper generator used to yield Wikibase items of another generator.

    :param gen: Generator to wrap.
    :return: Wrapped generator
    """
    for page in gen:
        if isinstance(page, pywikibot.ItemPage):
            yield page
        elif page.site.data_repository() == page.site:
            # These are already items, as they have a DataSite in page.site.
            # However the generator is yielding Page, so convert to ItemPage.
            # FIXME: If we've already fetched content, we should retain it
            yield pywikibot.ItemPage(page.site, page.title())
        else:
            yield pywikibot.ItemPage.fromPage(page)

def AncientPagesPageGenerator(
    total: int = 100,
    site: BaseSite | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Ancient page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return (page for page, _ in site.ancientpages(total=total))

def UnusedFilesGenerator(
    total: int | None = None,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.FilePage]:
    """Unused files generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.unusedfiles(total=total)

def WithoutInterwikiPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Generator for pages lacking interwiki links.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.withoutinterwiki(total=total)

def UnCategorizedCategoryGenerator(
    total: int | None = 100,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.Category]:
    """Uncategorized category generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedcategories(total=total)

def UnCategorizedImageGenerator(
    total: int = 100,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.FilePage]:
    """Uncategorized file generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedimages(total=total)

def UnCategorizedPageGenerator(
    total: int = 100,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Uncategorized page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedpages(total=total)

def UnCategorizedTemplateGenerator(
    total: int = 100,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Uncategorized template generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedtemplates(total=total)

def LonelyPagesPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Lonely page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.lonelypages(total=total)

def UnwatchedPagesPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Unwatched page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.unwatchedpages(total=total)

def page_with_property_generator(
    name: str,
    total: int | None = None,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Special:PagesWithProperty page generator.

    :param name: Property name of pages to be retrieved
    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.pages_with_property(name, total=total)

def WantedPagesPageGenerator(
    total: int = 100,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Wanted page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.wantedpages(total=total)

def DeadendPagesPageGenerator(
    total: int = 100,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Dead-end page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.deadendpages(total=total)

def LongPagesPageGenerator(
    total: int = 100,
    site: BaseSite | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Long page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return (page for page, _ in site.longpages(total=total))

def ShortPagesPageGenerator(
    total: int = 100,
    site: BaseSite | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Short page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return (page for page, _ in site.shortpages(total=total))

def RandomPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None,
    namespaces: NamespaceArgType = None,
) -> Iterable[pywikibot.page.Page]:
    """Random page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.randompages(total=total, namespaces=namespaces)

def RandomRedirectPageGenerator(
    total: int | None = None,
    site: BaseSite | None = None,
    namespaces: NamespaceArgType = None,
) -> Iterable[pywikibot.page.Page]:
    """Random redirect generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.randompages(total=total, namespaces=namespaces,
                            redirects=True)

def LinksearchPageGenerator(
    url: str,
    namespaces: NamespaceArgType = None,
    total: int | None = None,
    site: BaseSite | None = None,
    protocol: str | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Yield all pages that link to a certain URL.

    :param url: The URL to search for (with or without the protocol
        prefix); this may include a '*' as a wildcard, only at the start
        of the hostname
    :param namespaces: list of namespace numbers to fetch contribs from
    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results
    :param protocol: Protocol to search for, likely http or https, http
        by default. The full list is shown on the Special:LinkSearch
        wikipage.
    """
    if site is None:
        site = pywikibot.Site()
    return site.exturlusage(url, namespaces=namespaces, protocol=protocol,
                            total=total, content=False)

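# Illustrative sketch: the leading '*' wildcard matches any subdomain,
# per the docstring above (the domain is a hypothetical example):
#
#     for page in LinksearchPageGenerator('*.example.org', total=10):
#         print(page.title())
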
def SearchPageGenerator(
    query: str,
    total: int | None = None,
    namespaces: NamespaceArgType = None,
    site: BaseSite | None = None,
) -> Iterable[pywikibot.page.Page]:
    """Yield pages from the MediaWiki internal search engine.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    return site.search(query, total=total, namespaces=namespaces)

def LiveRCPageGenerator(
    site: BaseSite | None = None,
    total: int | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Yield pages from a socket.io RC stream.

    Generates pages based on the EventStreams Server-Sent-Event (SSE)
    recent changes stream. The Page objects will have an extra property
    ._rcinfo containing the literal rc data. This can be used to e.g.
    filter only new pages. See `pywikibot.comms.eventstreams.rc_listener`
    for details on the ._rcinfo format.

    :param site: site to return recent changes for
    :param total: the maximum number of changes to return
    """
    if site is None:
        site = pywikibot.Site()

    from pywikibot.comms.eventstreams import site_rc_listener

    for entry in site_rc_listener(site, total=total):
        # The title in a log entry may have been suppressed
        if 'title' not in entry and entry['type'] == 'log':
            continue
        page = pywikibot.Page(site, entry['title'], entry['namespace'])
        page._rcinfo = entry  # type: ignore[attr-defined]
        yield page

# The following classes were ported from version 1 without revision;
# they are not tested.

class GoogleSearchPageGenerator(GeneratorWrapper):

    """Page generator using Google search results.

    To use this generator, you need to install the package 'google':

        :py:obj:`https://pypi.org/project/google`

    This package has been available since 2010, hosted on GitHub since
    2012, and provided by PyPI since 2013.

    As there are concerns about Google's Terms of Service, this
    generator prints a warning for each query.

    .. versionchanged:: 7.6
       subclassed from :class:`tools.collections.GeneratorWrapper`
    """

    def __init__(self, query: str | None = None,
                 site: BaseSite | None = None) -> None:
        """Initializer.

        :param site: Site for generator results.
        """
        self.query = query or pywikibot.input(
            'Please enter the search query:')
        if site is None:
            site = pywikibot.Site()
        self.site = site
        self._google_query = None

    @staticmethod
    def queryGoogle(query: str) -> Generator[str, None, None]:
        """Perform a query using python package 'google'.

        The terms of service as at June 2014 give two conditions that
        may apply to use of search:

        1. Don't access [Google Services] using a method other than
           the interface and the instructions that [they] provide.
        2. Don't remove, obscure, or alter any legal notices
           displayed in or along with [Google] Services.

        Both of those issues should be managed by the package 'google',
        however Pywikibot will at least ensure the user sees the TOS
        in order to comply with the second condition.
        """
        try:
            import google
        except ImportError:
            pywikibot.error('generator GoogleSearchPageGenerator '
                            "depends on package 'google'.\n"
                            'To install, please run: pip install google.')
            sys.exit(1)

        pywikibot.warning('Please read http://www.google.com/accounts/TOS')
        yield from google.search(query)

    @property
    def generator(self) -> Generator[pywikibot.page.Page, None, None]:
        """Yield results from :meth:`queryGoogle` query.

        Google contains links in the format:
        https://de.wikipedia.org/wiki/en:Foobar

        .. versionchanged:: 7.6
           changed from iterator method to generator property
        """
        # restrict query to local site
        local_query = f'{self.query} site:{self.site.hostname()}'
        base = f'http://{self.site.hostname()}{self.site.articlepath}'
        pattern = base.replace('{}', '(.+)')
        for url in self.queryGoogle(local_query):
            m = re.search(pattern, url)
            if m:
                page = pywikibot.Page(pywikibot.Link(m[1], self.site))
                if page.site == self.site:
                    yield page

def MySQLPageGenerator(
    query: str,
    site: BaseSite | None = None,
    verbose: bool | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Yield a list of pages based on a MySQL query.

    The query should return two columns, page namespace and page title
    pairs from some table. An example query that yields all ns0 pages
    might look like::

        SELECT page_namespace, page_title
        FROM page
        WHERE page_namespace = 0;

    .. seealso:: :manpage:`MySQL`

    :param query: MySQL query to execute
    :param site: Site object
    :param verbose: if True, print query to be executed; if None,
        config.verbose_output will be used.
    :return: generator which yields pywikibot.Page
    """
    from pywikibot.data import mysql

    if site is None:
        site = pywikibot.Site()

    row_gen = mysql.mysql_query(query, dbname=site.dbName(), verbose=verbose)

    for row in row_gen:
        namespace_number, page_name = row
        page_name = page_name.decode(site.encoding())
        page = pywikibot.Page(site, page_name, ns=int(namespace_number))
        yield page

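# Illustrative sketch, assuming database access is configured for the
# site (see the MySQL reference above); the query follows the docstring
# example:
#
#     query = ('SELECT page_namespace, page_title FROM page '
#              'WHERE page_namespace = 0 LIMIT 10')
#     for page in MySQLPageGenerator(query):
#         print(page.title())
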
def SupersetPageGenerator(
    query: str,
    site: BaseSite | None = None,
    schema_name: str | None = None,
    database_id: int | None = None,
) -> Iterator[pywikibot.page.Page]:
    """Generate pages that result from the given SQL query via Superset.

    Pages are generated using a site determined in the following order:

    1. site retrieved using the page_wikidb column in the SQL result
    2. site given as a parameter
    3. site retrieved using schema_name

    SQL columns used are:

    - page_id
    - page_namespace + page_title
    - page_wikidb

    Example SQL queries

    .. code-block:: sql

        SELECT gil_wiki AS page_wikidb, gil_page AS page_id
        FROM globalimagelinks
        GROUP BY gil_wiki
        LIMIT 10

    OR

    .. code-block:: sql

        SELECT page_id FROM page LIMIT 10

    OR

    .. code-block:: sql

        SELECT page_namespace, page_title FROM page LIMIT 10

    .. versionadded:: 9.2

    :param query: the SQL query string.
    :param site: Site for generator results.
    :param schema_name: target superset schema name
    :param database_id: target superset database id
    """
    from pywikibot.data.superset import SupersetQuery

    # Do not pass site to superset if schema_name is defined.
    # The user may use schema_name to point to a different
    # wikimedia db on purpose and use site for
    # generating result pages.
    superset_site = None if schema_name else site
    superset = SupersetQuery(site=superset_site,
                             schema_name=schema_name,
                             database_id=database_id)
    try:
        rows = superset.query(query)
    except Exception as e:
        pywikibot.error(f'Error executing query: {query}\n{e}')
        return

    sites = {}

    # If there is no site then retrieve it using schema_name
    if not site:
        if not schema_name:
            raise TypeError('Schema name or site must be provided.')
        wikidb = re.sub('_p$', '', schema_name)
        site = pywikibot.site.APISite.fromDBName(wikidb)

    for row in rows:
        # If there is a page_wikidb column in the SQL result,
        # use it to retrieve the site
        if 'page_wikidb' in row:
            # remove "_p" suffix
            wikidb = re.sub('_p$', '', row['page_wikidb'])

            # Caching sites
            if wikidb not in sites:
                try:
                    sites[wikidb] = pywikibot.site.APISite.fromDBName(wikidb)
                except ValueError:
                    msg = f'Cannot parse a site from {wikidb} for {row}.'
                    pywikibot.warning(msg)
                    continue
            site = sites[wikidb]

        # Generate page objects

        # Create a page object from page_id
        if 'page_id' in row:
            page_ids = [row['page_id']]
            pages = site.load_pages_from_pageids(page_ids)
            for page in pages:
                yield page

        # Create a page object from page_namespace + page_title
        elif 'page_title' in rows[0] and 'page_namespace' in rows[0]:
            page_namespace = int(row['page_namespace'])
            page_title = row['page_title']
            page = pywikibot.Page(site, page_title, ns=page_namespace)
            yield page

        else:
            raise ValueError('The SQL result is in the wrong format.')

class XMLDumpPageGenerator(abc.Iterator):  # type: ignore[type-arg]

    """Xml iterator that yields Page objects.

    .. versionadded:: 7.2
       the `content` parameter

    :param filename: filename of XML dump
    :param start: skip entries below that value
    :param namespaces: namespace filter
    :param site: current site for the generator
    :param text_predicate: a callable with entry.text as parameter and
        boolean as result to indicate whether the generator should
        return the page or not
    :param content: If True, assign old page content to Page.text
    :ivar skipping: True if start parameter is given, else False
    :ivar parser: holds the xmlreader.XmlDump parse method
    """

    def __init__(
        self,
        filename: str,
        start: str | None = None,
        namespaces: NamespaceArgType = None,
        site: BaseSite | None = None,
        text_predicate: Callable[[str], bool] | None = None,
        content=False,
    ) -> None:
        """Initializer."""
        self.text_predicate = text_predicate
        self.content = content
        self.skipping = bool(start)

        self.start: str | None = None
        if start is not None and self.skipping:
            self.start = start.replace('_', ' ')

        self.site = site or pywikibot.Site()
        if not namespaces:
            self.namespaces = self.site.namespaces
        else:
            self.namespaces = self.site.namespaces.resolve(namespaces)
        dump = xmlreader.XmlDump(filename, on_error=pywikibot.error)
        self.parser = dump.parse()

    def __next__(self) -> pywikibot.page.Page:
        """Get next Page."""
        while True:
            entry = next(self.parser)
            if self.skipping:
                if entry.title < self.start:
                    continue
                self.skipping = False
            page = pywikibot.Page(self.site, entry.title)
            if page.namespace() not in self.namespaces:
                continue
            if not self.text_predicate or self.text_predicate(entry.text):
                if self.content:
                    page.text = entry.text
                return page

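# Illustrative sketch ('dump.xml.bz2' is a hypothetical dump file and
# the predicate is an arbitrary example filter):
#
#     gen = XMLDumpPageGenerator('dump.xml.bz2', namespaces=[0],
#                                text_predicate=lambda t: 'stub' in t)
#     for page in gen:
#         print(page.title())
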
def YearPageGenerator(
    start: int = 1,
    end: int = 2050,
    site: BaseSite | None = None,
) -> Generator[pywikibot.page.Page, None, None]:
    """Year page generator.

    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    pywikibot.info(f'Starting with year {start}')
    for i in range(start, end + 1):
        if i % 100 == 0:
            pywikibot.info(f'Preparing {i}...')
        # There is no year 0
        if i != 0:
            current_year = date.formatYear(site.lang, i)
            yield pywikibot.Page(pywikibot.Link(current_year, site))

def DayPageGenerator(
    start_month: int = 1,
    end_month: int = 12,
    site: BaseSite | None = None,
    year: int = 2000,
) -> Generator[pywikibot.page.Page, None, None]:
    """Day page generator.

    :param site: Site for generator results.
    :param year: used to determine the number of days in each month;
        the default (2000) is a leap year, so 29 February is included.
    """
    if site is None:
        site = pywikibot.Site()
    lang = site.lang
    first_page = pywikibot.Page(site, date.format_date(start_month, 1, lang))
    pywikibot.info(f'Starting with {first_page.title(as_link=True)}')
    for month in range(start_month, end_month + 1):
        for day in range(1, calendar.monthrange(year, month)[1] + 1):
            yield pywikibot.Page(
                pywikibot.Link(date.format_date(month, day, lang), site))

def WikidataPageFromItemGenerator(
    gen: Iterable[pywikibot.page.ItemPage],
    site: pywikibot.site.BaseSite,
) -> Generator[pywikibot.page.Page, None, None]:
    """Generate pages from site based on sitelinks of item pages.

    :param gen: generator of :py:obj:`pywikibot.ItemPage`
    :param site: Site for generator results.
    """
    repo = site.data_repository()
    for batch in batched(gen, 50):
        req = {'ids': [item.id for item in batch],
               'sitefilter': site.dbName(),
               'action': 'wbgetentities',
               'props': 'sitelinks'}

        wbrequest = repo.simple_request(**req)
        wbdata = wbrequest.submit()
        entities = (item for item in wbdata['entities'].values()
                    if 'sitelinks' in item
                    and site.dbName() in item['sitelinks'])
        sitelinks = (item['sitelinks'][site.dbName()]['title']
                     for item in entities)
        for sitelink in sitelinks:
            yield pywikibot.Page(site, sitelink)

def WikidataSPARQLPageGenerator(
    query: str,
    site: BaseSite | None = None,
    item_name: str = 'item',
    endpoint: str | None = None,
    entity_url: str | None = None,
    result_type: Any = set,
) -> Iterator[pywikibot.page.Page]:
    """Generate pages that result from the given SPARQL query.

    :param query: the SPARQL query string.
    :param site: Site for generator results.
    :param item_name: name of the item in the SPARQL query
    :param endpoint: SPARQL endpoint URL
    :param entity_url: URL prefix for any entities returned in a query.
    :param result_type: type of the iterable in which SPARQL results
        are stored (default set)
    """
    from pywikibot.data import sparql

    if site is None:
        site = pywikibot.Site()
    repo = site.data_repository()
    dependencies = {'endpoint': endpoint, 'entity_url': entity_url}
    if not endpoint or not entity_url:
        dependencies['repo'] = repo
    query_object = sparql.SparqlQuery(**dependencies)  # type: ignore[arg-type]
    data = query_object.get_items(query,
                                  item_name=item_name,
                                  result_type=result_type)
    entities = (repo.get_entity_for_entity_id(entity) for entity in data)
    if isinstance(site, pywikibot.site.DataSite):
        return entities

    return WikidataPageFromItemGenerator(entities, site)

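# Illustrative sketch: a SPARQL query selecting items that are
# instances of house cat (Q146); the variable name matches the default
# item_name parameter:
#
#     query = 'SELECT ?item WHERE { ?item wdt:P31 wd:Q146 } LIMIT 10'
#     site = pywikibot.Site('wikidata', 'wikidata')
#     for page in WikidataSPARQLPageGenerator(query, site=site):
#         print(page.title())
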
def WikibaseSearchItemPageGenerator(
    text: str,
    language: str | None = None,
    total: int | None = None,
    site: BaseSite | None = None,
) -> Generator[pywikibot.page.ItemPage, None, None]:
    """Generate pages that contain the provided text.

    :param text: Text to look for.
    :param language: Code of the language to search in. If not
        specified, value from pywikibot.config.data_lang is used.
    :param total: Maximum number of pages to retrieve in total, or None
        in case of no limit.
    :param site: Site for generator results.
    """
    if site is None:
        site = pywikibot.Site()
    if language is None:
        language = site.lang
    repo = site.data_repository()

    data = repo.search_entities(text, language, total=total)
    return (pywikibot.ItemPage(repo, item['id']) for item in data)

class PetScanPageGenerator(GeneratorWrapper):

    """Queries PetScan to generate pages.

    .. seealso:: https://petscan.wmflabs.org/
    .. versionadded:: 3.0
    .. versionchanged:: 7.6
       subclassed from :class:`tools.collections.GeneratorWrapper`
    """

    def __init__(
        self,
        categories: Sequence[str],
        subset_combination: bool = True,
        namespaces: Iterable[int | pywikibot.site.Namespace] | None = None,
        site: BaseSite | None = None,
        extra_options: dict[Any, Any] | None = None,
    ) -> None:
        """Initializer.

        :param categories: List of category names to retrieve pages from
        :param subset_combination: Combination mode. If True, returns
            the intersection of the results of the categories, else
            returns the union of the results of the categories
        :param namespaces: List of namespaces to search in (default is
            None, meaning all namespaces)
        :param site: Site to operate on (default is the default site
            from the user config)
        :param extra_options: Dictionary of extra options to use
            (optional)
        """
        if site is None:
            site = pywikibot.Site()

        self.site = site
        self.opts = self.buildQuery(categories, subset_combination,
                                    namespaces, extra_options)

    def buildQuery(
        self,
        categories: Sequence[str],
        subset_combination: bool,
        namespaces: Iterable[int | pywikibot.site.Namespace] | None,
        extra_options: dict[Any, Any] | None,
    ) -> dict[str, Any]:
        """Get the querystring options to query PetScan.

        :param categories: List of categories (as strings)
        :param subset_combination: Combination mode. If True, returns
            the intersection of the results of the categories, else
            returns the union of the results of the categories
        :param namespaces: List of namespaces to search in
        :param extra_options: Dictionary of extra options to use
        :return: Dictionary of querystring parameters to use in the query
        """
        extra_options = extra_options or {}

        query = {
            'language': self.site.code,
            'project': self.site.hostname().split('.')[-2],
            'combination': 'subset' if subset_combination else 'union',
            'categories': '\r\n'.join(categories),
            'format': 'json',
            'doit': '',
        }

        if namespaces:
            for namespace in namespaces:
                query[f'ns[{int(namespace)}]'] = 1

        query_final = query.copy()
        query_final.update(extra_options)

        return query_final

    def query(self) -> Generator[dict[str, Any], None, None]:
        """Query PetScan.

        .. versionchanged:: 7.4
           raises :class:`APIError` if the query returns an error
           message.

        :raises ServerError: Either ReadTimeout or server status error
        :raises APIError: error response from petscan
        """
        url = 'https://petscan.wmflabs.org'

        try:
            req = http.fetch(url, params=self.opts)
        except ReadTimeout:
            raise ServerError(f'received ReadTimeout from {url}')

        server_err = HTTPStatus.INTERNAL_SERVER_ERROR
        if server_err <= req.status_code < server_err + 100:
            raise ServerError(
                f'received {req.status_code} status from {req.url}')

        data = req.json()
        if 'error' in data:
            raise APIError('Petscan', data['error'], **self.opts)

        raw_pages = data['*'][0]['a']['*']
        yield from raw_pages

    @property
    def generator(self) -> Generator[pywikibot.page.Page, None, None]:
        """Yield results from :meth:`query`.

        .. versionchanged:: 7.6
           changed from iterator method to generator property
        """
        for raw_page in self.query():
            yield pywikibot.Page(self.site, raw_page['title'],
                                 int(raw_page['namespace']))

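# Illustrative sketch (the category names are hypothetical examples);
# GeneratorWrapper subclasses can be iterated directly:
#
#     gen = PetScanPageGenerator(['Physics', 'Mathematics'],
#                                subset_combination=False)
#     for page in gen:
#         print(page.title())
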
class PagePilePageGenerator(GeneratorWrapper):

    """Queries PagePile to generate pages.

    .. seealso:: https://pagepile.toolforge.org/
    .. versionadded:: 9.0
    """

    def __init__(self, id: int):
        """Initializer.

        :param id: The PagePile id to query
        """
        self.opts = self.buildQuery(id)

    def buildQuery(self, id: int):
        """Get the querystring options to query PagePile.

        :param id: int
        :return: Dictionary of querystring parameters to use in the query
        """
        query = {
            'id': id,
            'action': 'get_data',
            'format': 'json',
            'doit': '',
        }
        return query

    def query(self) -> Generator[str, None, None]:
        """Query PagePile.

        :raises ServerError: Either ReadTimeout or server status error
        :raises APIError: error response from PagePile
        """
        url = 'https://pagepile.toolforge.org/api.php'
        req = http.fetch(url, params=self.opts)
        data = req.json()
        if 'error' in data:
            raise APIError('PagePile', data['error'], **self.opts)

        self.site = pywikibot.site.APISite.fromDBName(data['wiki'])
        raw_pages = data['pages']
        yield from raw_pages

    @property
    def generator(self) -> Generator[pywikibot.page.Page, None, None]:
        """Yield results from :meth:`query`."""
        for raw_page in self.query():
            page = pywikibot.Page(self.site, raw_page)
            yield page
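
# Illustrative sketch (the pile id is a hypothetical example); the
# target site is taken from the PagePile response, so no site is passed:
#
#     for page in PagePilePageGenerator(12345):
#         print(page.title())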