# Source code for interwiki_graph

"""Module with the Graphviz drawing calls."""
#
# (C) Pywikibot team, 2006-2022
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations

import itertools
import threading
from collections import Counter

import pywikibot
from pywikibot import config


try:
    import pydot
    PYDOT_ERROR = None
except ImportError as e:
    PYDOT_ERROR = e


class GraphSavingThread(threading.Thread):

    """Worker thread that renders a graph to file.

    Rendering a graph can take extremely long, which is why the work is
    pushed onto its own thread.

    TODO: Find out if several threads running in parallel can slow down
    the system too much. Consider adding a mechanism to kill a thread if
    it takes too long.
    """

    def __init__(self, graph: pydot.Dot, origin: pywikibot.page.Page) -> None:
        """Initializer.

        :param graph: the graph object whose ``write`` method is called
        :param origin: page used to derive the output file names
        """
        super().__init__()
        self.graph, self.origin = graph, origin

    def run(self) -> None:
        """Write the graph once per configured format.

        For every entry of ``config.interwiki_graph_formats`` a file
        below ``interwiki-graphs/`` is written and the outcome is
        reported through ``pywikibot.info``.
        """
        for suffix in config.interwiki_graph_formats:
            target = config.datafilepath(
                'interwiki-graphs/' + getFilename(self.origin, suffix))
            saved = self.graph.write(target, prog='dot', format=suffix)
            if not saved:
                pywikibot.info('Graph could not be saved as ' + target)
                continue
            pywikibot.info('Graph saved as ' + target)
class Subject:

    """Data about a page with translations on multiple wikis."""

    def __init__(self, origin: pywikibot.page.Page | None = None) -> None:
        """Initializer.

        :param origin: the page on the 'origin' wiki
        """
        # Keep a handle on the page everything started from.
        self.origin = origin

        # found_in maps each page to the list of pages on which it was
        # found. No page linking to the origin is known yet, so a given
        # origin is seeded with an empty list; without an origin the
        # mapping starts out empty.
        self.found_in: dict[pywikibot.Page, list[pywikibot.Page]] = (
            {origin: []} if origin else {})
class GraphDrawer:

    """Graphviz (dot) code creator.

    Turns the ``found_in`` mapping of a subject into a ``pydot.Dot``
    graph (one node per page, one edge per referrer) and hands it to a
    :class:`GraphSavingThread` for writing.
    """

    def __init__(self, subject: pywikibot.interwiki_graph.Subject) -> None:
        """Initializer.

        :param subject: page data to graph
        :raises ImportError: if pydot is not installed
        """
        if PYDOT_ERROR:
            msg = f'pydot is not installed: {PYDOT_ERROR}.'
            # Chain the original failure so the traceback shows its cause.
            raise ImportError(msg) from PYDOT_ERROR
        self.graph: pydot.Dot | None = None
        self.subject = subject
        # Recomputed by createGraph(); the empty default keeps addNode()
        # usable even if it is called before createGraph().
        self.octagon_sites: set[pywikibot.site.BaseSite] = set()

    @staticmethod
    def getLabel(page: pywikibot.page.Page) -> str:
        """Get label for page.

        :param page: page to create the label for
        :return: ``"<site code>:<title>"`` including the double quotes
        """
        return f'"{page.site.code}:{page.title()}"'

    def _octagon_site_set(self) -> set[pywikibot.site.BaseSite]:
        """Build a set of sites with more than one valid page.

        Only pages that exist and are not redirects are counted.
        """
        # Only track sites of normal pages
        each_site = (page.site for page in self.subject.found_in
                     if page.exists() and not page.isRedirectPage())
        return {site for site, count in Counter(each_site).items()
                if count > 1}

    def addNode(self, page: pywikibot.page.Page) -> None:
        """Add a node for page.

        The node is a white filled rectangle by default. The fill color
        becomes red for a missing page, blue for a redirect and orange
        for a disambiguation page. A page in another namespace than the
        origin gets a bold green outline, and a page whose site is in
        ``octagon_sites`` is drawn as an octagon.

        :param page: page to add to the graph
        """
        assert self.graph is not None
        node = pydot.Node(self.getLabel(page), shape='rectangle')
        node.set_URL(f'"http://{page.site.hostname()}'
                     f'{page.site.get_address(page.title(as_url=True))}"')
        node.set_style('filled')
        node.set_fillcolor('white')
        node.set_fontsize('11')
        if not page.exists():
            node.set_fillcolor('red')
        elif page.isRedirectPage():
            node.set_fillcolor('blue')
        elif page.isDisambig():
            node.set_fillcolor('orange')
        if page.namespace() != self.subject.origin.namespace():
            node.set_color('green')
            node.set_style('filled,bold')
        if page.site in self.octagon_sites:
            # mark conflict by octagonal node
            node.set_shape('octagon')
        self.graph.add_node(node)

    def addDirectedEdge(self, page: pywikibot.page.Page,
                        refPage: pywikibot.page.Page) -> None:
        """Add a directed edge from refPage to page.

        If the opposite edge already exists it is turned into a
        bidirectional one instead; an already existing identical edge
        is reported as an error and not added again.

        :param page: target page of the edge
        :param refPage: page linking to *page*; ``None`` is ignored
        """
        assert self.graph is not None
        # if page was given as a hint, referrers would be [None]
        if refPage is None:
            return

        sourceLabel = self.getLabel(refPage)
        targetLabel = self.getLabel(page)

        oppositeEdge = self.graph.get_edge(targetLabel, sourceLabel)
        if oppositeEdge:
            oppositeEdge[0].set_dir('both')
            return

        # workaround for sf.net bug 401: prevent duplicate edges
        # (it is unclear why duplicate edges occur)
        # https://sourceforge.net/p/pywikipediabot/bugs/401/
        if self.graph.get_edge(sourceLabel, targetLabel):
            pywikibot.error(
                f'Tried to create duplicate edge from {refPage} to {page}')
            # duplicate edges would be bad because then get_edge() would
            # give a list of edges, not a single edge when we handle the
            # opposite edge.
            return

        # add edge
        edge = pydot.Edge(sourceLabel, targetLabel)
        if refPage.site == page.site:
            edge.set_color('blue')
        elif not page.exists():
            # mark dead links
            edge.set_color('red')
        elif refPage.isDisambig() != page.isDisambig():
            # mark links between disambiguation and non-disambiguation
            # pages
            edge.set_color('orange')
        # a namespace mismatch overrides any color chosen above
        if refPage.namespace() != page.namespace():
            edge.set_color('green')
        self.graph.add_edge(edge)

    def saveGraphFile(self) -> None:
        """Write graphs to the data directory.

        The writing itself is done by a started
        :class:`GraphSavingThread`; this method does not wait for it.
        """
        assert self.graph is not None
        thread = GraphSavingThread(self.graph, self.subject.origin)
        thread.start()

    def createGraph(self) -> None:
        """Create graph of the interwiki links.

        For more info see https://meta.wikimedia.org/wiki/Interwiki_graphs
        """
        pywikibot.info(f'Preparing graph for {self.subject.origin.title()}')
        # create empty graph
        self.graph = pydot.Dot()
        self.octagon_sites = self._octagon_site_set()
        for page in self.subject.found_in:
            # a node for each found page
            self.addNode(page)
        # mark start node by pointing there from a black dot.
        firstLabel = self.getLabel(self.subject.origin)
        self.graph.add_node(pydot.Node('start', shape='point'))
        self.graph.add_edge(pydot.Edge('start', firstLabel))
        for page, referrers in self.subject.found_in.items():
            for refPage in referrers:
                self.addDirectedEdge(page, refPage)
        self.saveGraphFile()
def getFilename(page: pywikibot.page.Page,
                extension: str | None = None) -> str:
    """Build a file name that is unique for the given page.

    :param page: page used to create the new filename
    :param extension: file extension; nothing is appended when it is
        empty or ``None``
    :return: filename of <family>-<lang>-<page>.<ext>
    """
    site = page.site
    parts = [site.family.name, site.code, page.title(as_filename=True)]
    stem = '-'.join(parts)
    return f'{stem}.{extension}' if extension else stem