Source code for scripts.dataextend

#!/usr/bin/python3
"""Script to add properties, identifiers and sources to WikiBase items.

Usage:

    dataextend <item> [<property>[+*]] [args]

In the basic usage, where no property is specified, item is the Q-number
of the item to work on.from html import unescape

If a property (P-number, or the special value 'Wiki' or 'Data') is
specified, only the data from that identifier are added. With a '+'
after it, work starts on that identifier, then goes on to identifiers
after that (including new identifiers added while working on those
identifiers). With a '*' after it, the identifier itself is skipped, but
those coming after it (not those coming before it) are included.

The following parameters are supported:

-always    If this is supplied, the bot will not ask for permission
           after each external link has been handled.

-showonly  Only show claims for a given ItemPage. Don't try to add any
           properties

The bot will load the corresponding pages for these identifiers, and try
to the meaning of that string for the specified type of thing (for
example 'city' or 'gender'). If you want to use it, but not save it
(which can happen if the string specifies a certain value now, but might
show another value elsewhere, or if it is so specific that you're pretty
sure it won't occur a second time), you can provide the Q-number with X
rather than Q. If you do not want to use the string, you can just hit
enter, or give the special value 'XXX' which means that it will be
skipped in each subsequent run as well.

After an identifier has been worked on, there might be a list of names
that has been found, in lc:name format, where lc is a language code. You
can accept all suggested names (answer Y), none (answer N) or ask to get
asked for each name separately (answer S), the latter being the default
if you do not fill in anything.

After all identifiers have been worked on, possible descriptions in
various languages are presented, and you get to choose one. The default
is here 0, which always is the current description for that language.
Finally, for a number of identifiers text is shown that usually gives
parts of the description that are hard to parse automatically, so you
can see if there any additional pieces of data that can be added.

It is advisable to (re)load the item page that the bot has been working
on in the browser afterward, to correct any mistakes it has made, or
cases where a more precise and less precise value have both been
included.

.. versionadded:: 7.2
"""
#
# (C) Pywikibot team, 2020-2022
#
# Distributed under the terms of the MIT license.
#
import codecs
import datetime
import re
import ssl

from collections import defaultdict
from contextlib import suppress
from html import unescape
from textwrap import shorten
from typing import Tuple
from urllib.error import HTTPError, URLError
from urllib.parse import quote, unquote
from urllib.request import urlopen

import pywikibot
from pywikibot.backports import List
from pywikibot.bot import input_yn, SingleSiteBot, suggest_help
from pywikibot.data import sparql
from pywikibot.exceptions import (
    APIError,
    InvalidTitleError,
    NoPageError,
    OtherPageSaveError,
)
from pywikibot.tools.collections import DequeGenerator


[docs]class DataExtendBot(SingleSiteBot): update_options = { 'restrict': '', 'showonly': False, } """The Bot.""" QRE = re.compile(r'Q\d+$') PQRE = re.compile(r'[PQ]\d+$') def __init__(self, **kwargs): """Initializer.""" super().__init__(**kwargs) self.labels = {} self.data = defaultdict(dict) self.noname = set() self.labelfile = 'labels.txt' self.datafile = 'defaultdata.txt' self.nonamefile = 'noname.txt' self.loaddata() self.analyzertype = { 'P213': IsniAnalyzer, 'P214': ViafAnalyzer, 'P227': GndAnalyzer, 'P244': LcAuthAnalyzer, 'P245': UlanAnalyzer, 'P268': BnfAnalyzer, 'P269': SudocAnalyzer, 'P271': CiniiAnalyzer, 'P345': ImdbAnalyzer, 'P396': SbnAnalyzer, 'P409': LibrariesAustraliaAnalyzer, 'P434': MusicBrainzAnalyzer, 'P454': StructuraeAnalyzer, 'P496': OrcidAnalyzer, 'P497': CbdbAnalyzer, 'P535': FindGraveAnalyzer, 'P549': MathGenAnalyzer, 'P586': IpniAuthorsAnalyzer, # 'P590': GnisAnalyzer, <http redirect loop> 'P640': LeonoreAnalyzer, 'P648': OpenLibraryAnalyzer, 'P650': RkdArtistsAnalyzer, 'P651': BiografischPortaalAnalyzer, 'P691': NkcrAnalyzer, 'P723': DbnlAnalyzer, 'P781': SikartAnalyzer, 'P839': ImslpAnalyzer, 'P902': HdsAnalyzer, 'P906': SelibrAnalyzer, 'P950': BneAnalyzer, 'P1005': PtbnpAnalyzer, # 'P1006': NtaAnalyzer, 'P1015': BibsysAnalyzer, 'P1138': KunstindeksAnalyzer, 'P1146': IaafAnalyzer, # 'P1153': ScopusAnalyzer, <requires login> 'P1185': RodovidAnalyzer, 'P1220': IbdbAnalyzer, 'P1233': IsfdbAnalyzer, 'P1263': NndbAnalyzer, 'P1273': CanticAnalyzer, 'P1280': ConorSiAnalyzer, 'P1284': MunzingerAnalyzer, # 'P1305': SkyScraperAnalyzer, <forbidden> # <changed, content is not on page any more> # 'P1315': PeopleAustraliaAnalyzer, 'P1367': ArtUkAnalyzer, 'P1368': LnbAnalyzer, 'P1415': OxfordAnalyzer, 'P1422': SandrartAnalyzer, 'P1440': FideAnalyzer, 'P1447': SportsReferenceAnalyzer, 'P1463': PrdlAnalyzer, 'P1469': FifaAnalyzer, 'P1556': ZbmathAnalyzer, 'P1580': UBarcelonaAnalyzer, 'P1607': DialnetAnalyzer, 'P1615': ClaraAnalyzer, 'P1648': WelshBioAnalyzer, 'P1667': TgnAnalyzer, # 'P1695': NlpAnalyzer, <id doesn't work anymore> 'P1707': DaaoAnalyzer, # 'P1711': BritishMuseumAnalyzer, <does not load> 'P1741': GtaaAnalyzer, 'P1749': ParlementPolitiekAnalyzer, 'P1795': AmericanArtAnalyzer, 'P1802': EmloAnalyzer, 'P1816': NpgPersonAnalyzer, 'P1819': GenealogicsAnalyzer, 'P1838': PssBuildingAnalyzer, 'P1871': CerlAnalyzer, 'P1952': MetallumAnalyzer, 'P1953': DiscogsAnalyzer, 'P1977': ArchivesDuSpectacleAnalyzer, 'P1986': ItalianPeopleAnalyzer, 'P1988': DelargeAnalyzer, 'P2005': HalensisAnalyzer, # 'P2013': FacebookAnalyzer, <requires being logged in> 'P2016': AcademiaeGroninganaeAnalyzer, 'P2029': UlsterAnalyzer, 'P2038': ResearchGateAnalyzer, 'P2041': NgvAnalyzer, 'P2089': JukeboxAnalyzer, 'P2163': FastAnalyzer, 'P2168': SvenskFilmAnalyzer, 'P2191': NilfAnalyzer, 'P2252': NgaAnalyzer, 'P2268': OrsayAnalyzer, 'P2332': ArtHistoriansAnalyzer, 'P2340': CesarAnalyzer, 'P2342': AgorhaAnalyzer, 'P2349': StuttgartAnalyzer, 'P2372': OdisAnalyzer, 'P2381': AcademicTreeAnalyzer, 'P2383': CthsAnalyzer, 'P2446': TransfermarktAnalyzer, 'P2454': KnawAnalyzer, 'P2456': DblpAnalyzer, 'P2469': TheatricaliaAnalyzer, # 'P2533': WomenWritersAnalyzer, #fully opaque 'P2604': KinopoiskAnalyzer, 'P2605': CsfdAnalyzer, 'P2639': FilmportalAnalyzer, 'P2728': CageMatchAnalyzer, 'P2732': PerseeAnalyzer, 'P2750': PhotographersAnalyzer, 'P2753': CanadianBiographyAnalyzer, 'P2829': IWDAnalyzer, 'P2843': BenezitAnalyzer, 'P2915': EcarticoAnalyzer, 'P2940': RostochiensiumAnalyzer, 'P2941': MunksRollAnalyzer, 'P2944': PlarrAnalyzer, 'P2945': BookTradeAnalyzer, 'P2949': WikitreeAnalyzer, 'P2963': GoodreadsAnalyzer, 'P2977': LbtAnalyzer, 'P3029': NationalArchivesAnalyzer, 'P3107': LdifAnalyzer, 'P3109': PeakbaggerAnalyzer, 'P3138': OfdbAnalyzer, 'P3154': RunebergAuthorAnalyzer, 'P3159': UGentAnalyzer, 'P3283': BandcampAnalyzer, 'P3314': Chess365Analyzer, 'P3346': HkmdbAnalyzer, 'P3351': AdultFilmAnalyzer, 'P3360': NobelPrizeAnalyzer, 'P3392': SurmanAnalyzer, 'P3410': CcedAnalyzer, 'P3413': LeopoldinaAnalyzer, 'P3429': EnlightenmentAnalyzer, 'P3430': SnacAnalyzer, 'P3630': BabelioAnalyzer, 'P3782': ArtnetAnalyzer, 'P3786': DanskefilmAnalyzer, 'P3788': BnaAnalyzer, 'P3790': AnimeConsAnalyzer, 'P3829': PublonsAnalyzer, 'P3844': SynchronkarteiAnalyzer, 'P3924': TrackFieldFemaleAnalyzer, 'P3925': TrackFieldMaleAnalyzer, 'P4124': WhosWhoFranceAnalyzer, 'P4145': AthenaeumAnalyzer, 'P4158': AutoresArAnalyzer, 'P4206': FoihAnalyzer, 'P4228': EoasAnalyzer, # 'P4293': PM20Analyzer, <content in frame with unclear url> 'P4399': ItauAnalyzer, 'P4432': AKLAnalyzer, 'P4459': SpanishBiographyAnalyzer, 'P4548': CommonwealthGamesAnalyzer, 'P4585': AccademiaCruscaAnalyzer, 'P4629': OnlineBooksAnalyzer, 'P4657': NumbersAnalyzer, 'P4663': DacsAnalyzer, 'P4666': CinemagiaAnalyzer, 'P4687': PeintresBelgesAnalyzer, 'P4749': AuteursLuxembourgAnalyzer, 'P4759': LuminousAnalyzer, 'P4769': GameFaqsAnalyzer, 'P4823': AmericanBiographyAnalyzer, 'P4872': GeprisAnalyzer, 'P4887': WebumeniaAnalyzer, 'P4927': InvaluableAnalyzer, 'P4929': AinmAnalyzer, 'P4985': TmdbAnalyzer, 'P5034': LibraryKoreaAnalyzer, 'P5068': KunstenpuntAnalyzer, 'P5239': ArtistsCanadaAnalyzer, 'P5240': RollDaBeatsAnalyzer, 'P5246': PornhubAnalyzer, 'P5267': YoupornAnalyzer, 'P5273': NelsonAtkinsAnalyzer, 'P5329': ArmbAnalyzer, 'P5359': OperoneAnalyzer, 'P5361': BnbAnalyzer, 'P5365': InternetBookAnalyzer, 'P5375': BiuSanteAnalyzer, 'P5394': PoetsWritersAnalyzer, 'P5308': ScottishArchitectsAnalyzer, 'P5357': SFAnalyzer, 'P5368': NatGeoCanadaAnalyzer, 'P5370': EntomologistAnalyzer, 'P5408': FantasticFictionAnalyzer, 'P5415': WhonameditAnalyzer, 'P5421': TradingCardAnalyzer, 'P5491': BedethequeAnalyzer, 'P5492': Edit16Analyzer, 'P5504': RismAnalyzer, 'P5534': OmdbAnalyzer, 'P5540': RedTubeAnalyzer, 'P5570': NoosfereAnalyzer, 'P5597': ArtcyclopediaAnalyzer, 'P5645': AcademieFrancaiseAnalyzer, 'P5731': AngelicumAnalyzer, 'P5739': PuscAnalyzer, 'P5747': CwaAnalyzer, 'P5794': IgdbAnalyzer, 'P5819': MathOlympAnalyzer, 'P5882': MuziekwebAnalyzer, 'P6127': LetterboxdAnalyzer, 'P6167': BritishExecutionsAnalyzer, 'P6188': BdfaAnalyzer, 'P6194': AustrianBiographicalAnalyzer, 'P6231': BdelAnalyzer, 'P6295': ArticArtistAnalyzer, 'P6517': WhoSampledAnalyzer, 'P6575': AcademieRouenAnalyzer, 'P6578': MutualAnalyzer, 'P6594': GuggenheimAnalyzer, 'P6770': SnsaAnalyzer, 'P6815': UvaAlbumAnalyzer, 'P6821': AlvinAnalyzer, 'P6844': AbartAnalyzer, 'P6873': IntraTextAnalyzer, 'P7032': RepertoriumAnalyzer, 'P7293': PlwabnAnalyzer, 'P7796': BewebAnalyzer, 'P7902': DeutscheBiographieAnalyzer, 'P8287': WorldsWithoutEndAnalyzer, 'P8696': BelgianPhotographerAnalyzer, 'P8795': AlkindiAnalyzer, 'P8848': ConorAlAnalyzer, 'P8849': ConorBgAnalyzer, 'P8851': ConorSrAnalyzer, 'P8914': ZobodatAnalyzer, 'P9017': OxfordMedievalAnalyzer, # 'P9046': AdSAnalyzer, hard to analyze JavaScript 'P9113': PatrinumAnalyzer, 'P9430': JwaAnalyzer, 'fomu.atomis.be': FotomuseumAnalyzer, 'catalogo.bn.gov.ar': BibliotecaNacionalAnalyzer, 'www.brooklynmuseum.org': BrooklynMuseumAnalyzer, 'www.vondel.humanities.uva.nl': OnstageAnalyzer, 'www.ias.edu': IasAnalyzer, 'kunstaspekte.art': KunstaspekteAnalyzer, 'www.nationaltrustcollections.org.uk': NationalTrustAnalyzer, 'www.oxfordartonline.com': BenezitUrlAnalyzer, 'exhibitions.univie.ac.at': UnivieAnalyzer, 'weber-gesamtausgabe.de': WeberAnalyzer, 'Wiki': WikiAnalyzer, 'Data': BacklinkAnalyzer, 'www.deutsche-biographie.de': DeutscheBiographieAnalyzer, }
[docs] def label(self, title): if title.startswith('!date!'): return self.showtime(self.createdateclaim(title[6:])) if title.startswith('!q!'): return title[3:] if not self.PQRE.match(title): return title if title in self.labels: return self.labels[title] item = self.page(title) try: labels = item.get()['labels'] except NoPageError: labels = {} for lang in ['en', 'nl', 'de', 'fr', 'es', 'it', 'af', 'nds', 'li', 'vls', 'zea', 'fy', 'no', 'sv', 'da', 'pt', 'ro', 'pl', 'cs', 'sk', 'hr', 'et', 'fi', 'lt', 'lv', 'tr', 'cy']: if lang in labels: try: label = labels[lang]['value'] except TypeError: label = labels[lang] break else: label = title self.labels[title] = label return label
[docs] def loaddata(self): """Read data from files.""" param = {'mode': 'r', 'encoding': 'utf-8'} with suppress(IOError), codecs.open(self.labelfile, **param) as f: for line in f.readlines(): key, value = line.strip().split(':', 1) self.labels[key] = value with suppress(IOError), codecs.open(self.datafile, **param) as f: for line in f.readlines(): parts = line.strip().split(':') # assume len(parts) > 1 dtype, *keys, value = parts key = ':'.join(keys) self.data[dtype][key] = value with suppress(IOError), codecs.open(self.nonamefile, **param) as f: self.noname = {line.strip() for line in f.readlines()}
[docs] def teardown(self) -> None: """Save data to files.""" param = {'mode': 'w', 'encoding': 'utf-8'} with codecs.open(self.labelfile, **param) as f: for item in self.labels: f.write('{}:{}\n'.format(item, self.labels[item])) with codecs.open(self.datafile, **param) as f: for dtype in self.data: for key in self.data[dtype]: f.write('{}:{}:{}\n'.format(dtype, key, self.data[dtype][key])) with codecs.open(self.nonamefile, **param) as f: for noname in self.noname: f.write('{}\n'.format(noname))
[docs] def page(self, title): """Dispatch title and return the appropriate Page object.""" title = title.rsplit(':', 1)[-1] if title.startswith('Q'): return pywikibot.ItemPage(self.site, title) if title.startswith('P'): return pywikibot.PropertyPage(self.site, title) raise ValueError('Invalid title {}'.format(title))
[docs] @staticmethod def showtime(time): if time is None: return 'unknown' result = str(time.year) if time.precision < 9: result = 'ca. ' + result if time.precision >= 10: result = '{}-{}'.format(time.month, result) if time.precision >= 11: result = '{}-{}'.format(time.day, result) if time.precision >= 12: result = '{} {}'.format(result, time.hour) if time.precision >= 13: result = '{}:{}'.format(result, time.minute) if time.precision >= 14: result = '{}:{}'.format(result, time.second) return result
[docs] def showclaims(self, claims): pywikibot.output('Current information:') for prop in claims: for claim in claims[prop]: if claim.type == 'wikibase-item': if claim.getTarget() is None: pywikibot.output('{}: unknown' .format(self.label(prop))) else: pywikibot.output( '{}: {}' .format(self.label(prop), self.label(claim.getTarget().title()))) elif claim.type == 'time': pywikibot.output('{}: {}' .format(self.label(prop), self.showtime(claim.getTarget()))) elif claim.type in ['external-id', 'commonsMedia']: pywikibot.output('{}: {}'.format(self.label(prop), claim.getTarget())) elif claim.type == 'quantity': pywikibot.output( '{}: {} {}' .format(self.label(prop), claim.getTarget().amount, self.label( claim.getTarget().unit.split('/')[-1]))) else: pywikibot.output('Unknown type {} for property {}' .format(claim.type, self.label(prop)))
MONTHNUMBER = { '1': 1, '01': 1, 'i': 1, '2': 2, '02': 2, 'ii': 2, '3': 3, '03': 3, 'iii': 3, '4': 4, '04': 4, 'iv': 4, '5': 5, '05': 5, 'v': 5, '6': 6, '06': 6, 'vi': 6, '7': 7, '07': 7, 'vii': 7, '8': 8, '08': 8, 'viii': 8, '9': 9, '09': 9, 'ix': 9, '10': 10, 'x': 10, '11': 11, 'xi': 11, '12': 12, 'xii': 12, 'january': 1, 'jan': 1, 'february': 2, 'feb': 2, 'febr': 2, 'march': 3, 'mar': 3, 'april': 4, 'apr': 4, 'may': 5, 'june': 6, 'jun': 6, 'july': 7, 'jul': 7, 'august': 8, 'aug': 8, 'september': 9, 'sep': 9, 'sept': 9, 'october': 10, 'oct': 10, 'november': 11, 'nov': 11, 'december': 12, 'dec': 12, 'gennaio': 1, 'gen': 1, 'genn': 1, 'febbraio': 2, 'febb': 2, 'febbr': 2, 'marzo': 3, 'marz': 3, 'aprile': 4, 'maggio': 5, 'mag': 5, 'magg': 5, 'giugno': 6, 'giu': 6, 'luglio': 7, 'lug': 7, 'lugl': 7, 'agosto': 8, 'ago': 8, 'agost': 8, 'ag': 8, 'settembre': 9, 'set': 9, 'sett': 9, 'ottobre': 10, 'ott': 10, 'otto': 10, 'novembre': 11, 'dicembre': 12, 'dic': 12, 'januari': 1, 'februari': 2, 'maart': 3, 'maa': 3, 'mrt': 3, 'mei': 5, 'juni': 6, 'juli': 7, 'augustus': 8, 'oktober': 10, 'okt': 10, 'janvier': 1, 'février': 2, 'fevrier': 2, 'fév': 2, 'fev': 2, 'f\\xe9vrier': 2, 'mars': 3, 'avril': 4, 'avr': 4, 'mai': 5, 'juin': 6, 'juillet': 7, 'août': 8, 'aout': 8, 'aoû': 8, 'aou': 8, 'septembre': 9, 'octobre': 10, 'décembre': 12, 'déc': 12, 'januar': 1, 'jänner': 1, 'februar': 2, 'märz': 3, 'm\\xe4rz': 3, 'dezember': 12, 'dez': 12, 'eanáir': 1, 'eanair': 1, 'feabhra': 2, 'márta': 3, 'marta': 3, 'aibreán': 4, 'aibrean': 4, 'bealtaine': 5, 'meitheamh': 6, 'iúil': 7, 'iuil': 7, 'lúnasa': 8, 'lunasa': 8, 'meán fómhair': 9, 'mean fomhair': 9, 'deireadh fómhair': 10, 'deireadh fomhair': 10, 'samhain': 11, 'nollaig': 12, 'styczeń': 1, 'stycznia': 1, 'luty': 2, 'lutego': 2, 'marzec': 3, 'marca': 3, 'kwiecień': 4, 'kwietnia': 4, 'maj': 5, 'maja': 5, 'czerwiec': 6, 'czerwca': 6, 'lipiec': 7, 'lipca': 7, 'sierpień': 8, 'sierpnia': 8, 'wrzesień': 9, 'września': 9, 'październik': 10, 'października': 10, 'listopad': 11, 'listopada': 11, 'grudzień': 12, 'grudnia': 12, 'enero': 1, 'febrero': 2, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12, 'gener': 1, 'febrer': 2, 'març': 3, 'maig': 5, 'juny': 6, 'juliol': 7, 'setembre': 9, 'desembre': 12, }
[docs] def createdateclaim(self, text): text = text.strip() year = None month = None day = None m = re.search(r'[{\|](\d{4})\|(\d+)\|(\d+)[\|}]', text) if m: year = int(m.group(1)) month = int(m.group(2)) day = int(m.group(3)) if re.match(r'\d{,4}(?:年頃|\.)?$', text): year = int(text) month = None day = None if re.match(r'(?:1\d{3}|20[01]\d)[01]\d[0123]\d$', text): year = int(text[:4]) month = int(text[4:6]) day = int(text[6:]) if re.match(r'\d{4}-\d{2}$', text): year = int(text[:4]) month = int(text[-2:]) m = re.match(r'(\d{1,2})[-/](\d{4})', text) if m: year = int(m.group(2)) month = int(m.group(1)) m = re.match(r'(\d+)[-./|](\d{1,2})[-./|](\d{1,2})$', text) if m: year = int(m.group(1)) month = int(m.group(2)) day = int(m.group(3)) m = re.match( r'(\d{1,2})[-./|]\s*(\d{1,2})[-./|]\s*(\d{3,4})\.?$', text) if m: year = int(m.group(3)) month = int(m.group(2)) day = int(m.group(1)) m = re.match(r'(\d{1,2})[-./\s]([iIvVxX]+)[-./\s](\d{4})$', text) if m: year = int(m.group(3)) try: month = self.MONTHNUMBER[m.group(2).lower()] except KeyError: raise ValueError("Don't know month {}".format(m.group(2))) day = int(m.group(1)) m = re.match(r"(\d+)(?:\.|er|eme|ème)?[\s.]\s*(?:d'|d[aei] )?" r'([^\s.]{2,})\.?[\s.]\s*(\d+)$', text) if m: year = int(m.group(3)) try: month = self.MONTHNUMBER[m.group(2).lower()] except KeyError: raise ValueError("Don't know month {}".format(m.group(2))) day = int(m.group(1)) m = re.match(r'(\d{4})\.?[\s.]\s*([^\s.]{3,})\.?[\s.]\s*(\d+)$', text) if m: year = int(m.group(1)) try: month = self.MONTHNUMBER[m.group(2).lower()] except KeyError: raise ValueError("Don't know month {}".format(m.group(2))) day = int(m.group(3)) m = re.match(r"(\d+) (?:de |d')?(\w+[a-z]\w+) de (\d+)", text) if m: year = int(m.group(3)) try: month = self.MONTHNUMBER[m.group(2).lower()] except KeyError: raise ValueError("Don't know month {}".format(m.group(2))) day = int(m.group(1)) m = re.match(r'(\w*[a-zA-Z]\w*)\.? (\d+)$', text) if m: year = int(m.group(2)) try: month = self.MONTHNUMBER[m.group(1).lower()] except KeyError: raise ValueError("Don't know month {}".format(m.group(1))) m = re.match(r'(\w+)\.? (\d{1,2})(?:st|nd|rd|th)?\.?\s*,\s*(\d{3,4})$', text) if m: year = int(m.group(3)) try: month = self.MONTHNUMBER[m.group(1).lower()] except KeyError: raise ValueError("Don't know month {}".format(m.group(1))) day = int(m.group(2)) m = re.match(r'(\d{4}),? (\d{1,2}) (\w+)', text) if m: year = int(m.group(1)) try: month = self.MONTHNUMBER[m.group(3).lower()] except KeyError: raise ValueError("Don't know month {}".format(m.group(1))) day = int(m.group(2)) m = re.match(r'(\d+)年(\d+)月(\d+)日', text) if m: year = int(m.group(1)) month = int(m.group(2)) day = int(m.group(3)) m = re.match(r'(\d+)年$', text) if m: year = int(m.group(1)) if day == 0: day = None if day is None and month == 0: month = None if month and month > 12: raise ValueError('Date seems to have an invalid month number {}' .format(month)) if day and day > 31: raise ValueError('Date seems to have an invalid day number {}' .format(day)) if not year: raise ValueError("Can't interpret date {}".format(text)) return pywikibot.WbTime(year=year, month=month, day=day, precision=9 if month is None else 10 if day is None else 11)
QUANTITYTYPE = { 'meter': 'Q11573', 'metre': 'Q11573', 'm': 'Q11573', 'meters': 'Q11573', 'metres': 'Q11573', 'м': 'Q11573', 'centimeter': 'Q174728', 'centimetre': 'Q174728', 'cm': 'Q174728', 'foot': 'Q3710', 'feet': 'Q3710', 'ft': 'Q3710', 'mile': 'Q253276', 'mi': 'Q253276', 'kilometer': 'Q828224', 'kilometre': 'Q828224', 'km': 'Q828224', 'minute': 'Q7727', 'minutes': 'Q7727', 'min': 'Q7727', 'minuten': 'Q7727', 'second': 'Q11574', 's': 'Q11574', 'kilogram': 'Q11570', 'kg': 'Q11570', 'lb': 'Q100995', 'lbs': 'Q100995', 'pond': 'Q100995', }
[docs] def createquantityclaim(self, text): m = re.match(r'(\d+(?:\.\d+)?)\s*([a-z]\w*)', text.replace(',', '.')) amount = m.group(1) name = m.group(2).lower() return pywikibot.WbQuantity(amount, unit=pywikibot.ItemPage( self.site, self.QUANTITYTYPE[name]), site=self.site)
[docs] def treat(self, item) -> None: """Process the ItemPage.""" item.get() claims = item.claims self.showclaims(claims) if self.opt.showonly: return longtexts = [] newdescriptions = defaultdict(set) updatedclaims = {prop: claims[prop] for prop in claims} dorestrict = True continueafterrestrict = False restrict_end = self.opt.restrict and self.opt.restrict[-1] if restrict_end in ('+', '*'): self.opt.restrict = self.opt.restrict[:-1] continueafterrestrict = True if restrict_end == '*': dorestrict = False unidentifiedprops = [] failedprops = [] claims['Wiki'] = [Quasiclaim(page.title(force_interwiki=True, as_link=True)[2:-2]) for page in item.iterlinks()] claims['Data'] = [Quasiclaim(item.title())] propstodo = DequeGenerator(claims) propsdone = set() for prop in propstodo: descriptions = item.descriptions labels = item.labels aliases = item.aliases # This can happen after reloading if prop not in claims.keys(): continue if self.opt.restrict: if prop != self.opt.restrict: continue if continueafterrestrict: self.opt.restrict = '' if not dorestrict: continue for mainclaim in claims[prop]: if mainclaim.type != 'external-id' and prop != 'P973': continue identifier = mainclaim.getTarget() try: analyzertype = self.analyzertype[identifier.split('/')[2] if prop == 'P973' else prop] except KeyError: unidentifiedprops.append(prop) continue analyzer = analyzertype(identifier, self.data, item.title(), self) newclaims = analyzer.findclaims() or [] if newclaims is None: failedprops.append(prop) newclaims = [] if not self.opt.always: pywikibot.output('Found here:') for claim in newclaims: try: pywikibot.output( '{}: {}'.format(self.label(claim[0]), self.label(claim[1]))) except ValueError: newclaims = [nclaim for nclaim in newclaims if nclaim != claim] if self.opt.always or input_yn('Save this?', default=True): for claim in newclaims: if claim[0] in updatedclaims \ and self.isinclaims(claim[1], updatedclaims[claim[0]]): if claim[2]: source = None if claim[2].dbid: id_ = 'P143' if claim[2].iswiki else 'P248' source = pywikibot.Claim(self.site, id_) source.setTarget( pywikibot.ItemPage(self.site, claim[2].dbid)) id_ = 'P4656' if claim[2].iswiki else 'P854' url = pywikibot.Claim(self.site, id_) if claim[2].sparqlquery: url.setTarget(pywikibot.ItemPage( self.site, claim[1]).full_url()) else: url.setTarget(claim[2].url) if claim[2].iswiki or claim[2].isurl: iddata = None else: iddata = pywikibot.Claim(self.site, prop) iddata.setTarget(identifier) if url is None: date = None else: date = pywikibot.Claim(self.site, 'P813') date.setTarget( self.createdateclaim( min(datetime.datetime.now() .strftime('%Y-%m-%d'), datetime.datetime.utcnow() .strftime('%Y-%m-%d')))) if not analyzer.showurl: url = None sourceparts = [source, url, iddata, date] sourcedata = [sourcepart for sourcepart in sourceparts if sourcepart is not None] pywikibot.output('Sourcing {}: {}' .format(self.label(claim[0]), self.label(claim[1]))) # probably means the sourcing is already there with suppress(APIError): updatedclaims[claim[0]][self.getlocnumber( claim[1], updatedclaims[claim[0]])].addSources( sourcedata) else: if claim[0] not in propsdone: propstodo.append(claim[0]) createdclaim = pywikibot.Claim(self.site, claim[0]) if self.QRE.match(claim[1]): createdclaim.setTarget(pywikibot.ItemPage( self.site, claim[1])) elif claim[1].startswith('!date!'): try: target = self.createdateclaim(claim[1][6:]) except ValueError as ex: pywikibot.output( 'Unable to analyze date "{}" for {}: {}' .format(claim[1][6:], self.label(claim[0]), ex)) pywikibot.input('Press enter to continue') target = None if target is None: continue createdclaim.setTarget(target) elif claim[1].startswith('!q!'): target = self.createquantityclaim( claim[1][3:].strip()) if target is None: continue createdclaim.setTarget(target) elif claim[1].startswith('!i!'): createdclaim.setTarget( pywikibot.page.FilePage(self.site, claim[1][3:])) else: createdclaim.setTarget(claim[1]) pywikibot.output('Adding {}: {}' .format(self.label(claim[0]), self.label(claim[1]))) try: item.addClaim(createdclaim) except OtherPageSaveError as ex: if claim[1].startswith('!i!'): pywikibot.output( 'Unable to save image {}: {}' .format(claim[1][3:], ex)) continue raise if claim[0] in updatedclaims: updatedclaims[claim[0]].append(createdclaim) else: updatedclaims[claim[0]] = [createdclaim] if claim[2]: if claim[2].dbid: if claim[2].iswiki: source = pywikibot.Claim(self.site, 'P143') else: source = pywikibot.Claim(self.site, 'P248') source.setTarget( pywikibot.ItemPage(self.site, claim[2].dbid)) else: source = None if claim[2].iswiki: url = pywikibot.Claim(self.site, 'P4656') else: url = pywikibot.Claim(self.site, 'P854') if claim[2].sparqlquery: url.setTarget( pywikibot.ItemPage( self.site, claim[1]).full_url()) else: url.setTarget(claim[2].url) if claim[2].iswiki or claim[2].isurl: iddata = None else: iddata = pywikibot.Claim(self.site, prop) iddata.setTarget(identifier) if url is None: date = None else: date = pywikibot.Claim( self.site, 'P813') date.setTarget(self.createdateclaim( min(datetime.datetime.now().strftime( '%Y-%m-%d'), datetime.datetime.utcnow().strftime('%Y-%m-%d')))) if not analyzer.showurl: url = None sourcedata = [source, url, iddata, date] sourcedata = [sourcepart for sourcepart in sourcedata if sourcepart is not None] pywikibot.output('Sourcing {}: {}' .format(self.label(claim[0]), self.label(claim[1]))) try: createdclaim.addSources( [s for s in sourcedata if s is not None]) except AttributeError: try: updatedclaims[claim[0]][ self.getlocnumber( claim[1], updatedclaims[claim[0]]) ].addSources(sourcedata) except AttributeError: if prop not in propsdone: propstodo.append(prop) pywikibot.output('Sourcing failed') for language, description in analyzer.getdescriptions(): newdescriptions[language].add( shorten(description.rstrip('.'), width=249, placeholder='...')) newnames = analyzer.getnames() newlabels, newaliases = self.definelabels( labels, aliases, newnames) if newlabels: item.editLabels(newlabels) if newaliases: item.editAliases(newaliases) if newlabels or newaliases: item.get(force=True) claims = item.claims claims['Wiki'] = [Quasiclaim(page.title( force_interwiki=True, as_link=True)[2:-2]) for page in item.iterlinks()] claims['Data'] = [Quasiclaim(item.title())] descriptions = item.descriptions labels = item.labels aliases = item.aliases if analyzer.longtext(): longtexts.append((analyzer.dbname, analyzer.longtext())) propsdone.add(prop) item.get(force=True) claims = item.claims claims['Wiki'] = [Quasiclaim(page.title(force_interwiki=True, as_link=True)[2:-2]) for page in item.iterlinks()] claims['Data'] = [Quasiclaim(item.title())] editdescriptions = {} for language in newdescriptions.keys(): newdescription = self.definedescription( language, descriptions.get(language), newdescriptions.get(language)) if newdescription: editdescriptions[language] = newdescription if editdescriptions: item.editDescriptions(editdescriptions) for prop in unidentifiedprops: pywikibot.output('Unknown external {} ({})' .format(prop, self.label(prop))) for prop in failedprops: pywikibot.output('External failed to load: {} ({})' .format(prop, self.label(prop))) if longtexts: if unidentifiedprops or failedprops: pywikibot.input('Press Enter to continue') pywikibot.output('== longtexts ==') for longtext in longtexts: pywikibot.output('\n== {} =='.format(longtext[0])) pywikibot.output(longtext[1]) pywikibot.input('(press enter)')
[docs] @staticmethod def definedescription(language, existingdescription, suggestions): possibilities = [existingdescription] + list(suggestions) pywikibot.output('\nSelect a description for language {}:' .format(language)) pywikibot.output('Default is to keep the old value (0)') for i, pos in enumerate(possibilities): if pos is None: pywikibot.output('{}: No description'.format(i)) else: pywikibot.output('{}: {}'.format(i, pos)) answer = pywikibot.input('Which one to choose? ') try: answer = int(answer) except ValueError: answer = 0 if answer: return possibilities[answer] return None
[docs] def definelabels(self, existinglabels, existingaliases, newnames): realnewnames = defaultdict(list) anythingfound = False for (language, name) in newnames: name = name.strip() if name.lower() == (existinglabels.get(language) or '').lower() \ or name.lower() in (n.lower() for n in existingaliases.get(language, [])): continue if name not in realnewnames[language] and name not in self.noname: realnewnames[language].append(name) anythingfound = True if anythingfound: pywikibot.output(' ') pywikibot.output('New names found:') for language in realnewnames.keys(): for name in realnewnames[language]: pywikibot.output('{}: {}'.format(language, name)) result = pywikibot.input('Add these names? (y/n/[S]elect/x) ') if not result or result[0].upper() not in 'YNX': chosennewnames = defaultdict(list) for language in realnewnames.keys(): for name in realnewnames[language]: result = pywikibot.input( '{}: {} - '.format(language, name)) if (not result) or result[0].upper() == 'Y': chosennewnames[language].append(name) elif result[0].upper() == 'X': self.noname.add(name) realnewnames = chosennewnames result = 'Y' if result[0].upper() == 'X': for language in realnewnames.keys(): for name in realnewnames[language]: self.noname.add(name) elif result[0].upper() != 'N': returnvalue = [{}, {}] for language in realnewnames: if language in existinglabels: returnvalue[1][language] = existingaliases.get( language, []) + realnewnames[language] else: returnvalue[0][language] = realnewnames[language][0] if realnewnames[language]: returnvalue[1][language] = existingaliases.get( language, []) + realnewnames[language][1:] return returnvalue return [{}, {}]
[docs] def isclaim(self, value, claim): try: if value.startswith('!date!'): value = value[6:] if value.startswith('!q!'): value = re.search(r'\d+(?:\.\d+)?', value).group(0) elif value.startswith('!i!'): value = value[3:].strip() if str(claim.getTarget()) == value: return True if claim.type == 'wikibase-item' \ and claim.getTarget().title() == value: return True if claim.type == 'commonsMedia' \ and claim.getTarget().title().split( ':', 1)[1].replace('_', ' ') == value.replace('_', ' '): return True if claim.type == 'time' \ and self.showtime(claim.getTarget()) == self.showtime( self.createdateclaim(value)): return True except (ValueError, AttributeError): return False
[docs] def isinclaims(self, value, claims): return any(self.isclaim(value, claim) for claim in claims)
[docs] def getlocnumber(self, value, claims): for pair in zip(range(len(claims)), claims): if self.isclaim(value, pair[1]): return pair[0] raise ValueError
[docs]class Quasiclaim: def __init__(self, title): """Initializer.""" self._target = title @property def type(self): return 'external-id'
[docs] def getTarget(self): # noqa: N802 """Return the target value of this QuasiClaim.""" return self._target
[docs]class Analyzer: TAGRE = re.compile('<[^<>]*>') SCRIPTRE = re.compile('(?s)<script.*?</script>') def __init__(self, id, data=None, item=None, bot=None): """Initializer.""" self.id = id self.data = defaultdict(dict) if data is None else data self.dbname = None self.urlbase = None self.urlbase2 = None self.urlbase3 = self.urlbase4 = None self.showurl = True self.dbid = None self.dbitem = None self.dbproperty = None self.hrtre = None self.language = 'en' self.escapeunicode = False self.escapehtml = False self.escapeurl = False self.item = item self.iswiki = False self.sparqlquery = None self.isurl = False self.skipfirst = False self.bot = bot self.setup() self.site = pywikibot.Site().data_repository()
[docs] def setup(self): """To be used for putting data into subclasses."""
@property def url(self): usedurl = self.urlbase if usedurl is None: if not self.sparqlquery: pywikibot.output('\n### Skipping {} ({}) ###' .format(self.dbname, self.dbproperty)) return None return usedurl.format(id=quote(self.id)) @property def alturl(self): if self.urlbase2: return self.urlbase2.format(id=quote(self.id)) return None @property def extraurls(self) -> List[str]: if not self.urlbase3: return [] if self.urlbase4: return [self.urlbase3.format(id=quote(self.id)), self.urlbase4.format(id=quote(self.id))] return [self.urlbase3.format(id=quote(self.id))]
[docs] @staticmethod def commastrip(term): term = re.sub(r'(?:\s|&nbsp;)+', ' ', term) term = term.strip().strip(',').rstrip('.').strip() term = term.split('(')[0] if ',' in term: if term.split(',')[1].strip().lower() in ['jr', 'sr']: term = term + '.' else: if term.strip()[-1] != term.strip()[-1].lower(): term = term.strip() + '.' term = term.split(',', 1)[1] + ' ' + term.split(',', 1)[0] term = re.sub(r'\s*-\s*', '-', term) return unescape(term).strip()
[docs] def getdata(self, dtype, text, ask=True): text = text.strip('. ').lower().replace('\\n', ' ').replace( '\n', ' ').replace('%20', ' ').strip() text = re.sub(' +', ' ', text) if not text: return None if dtype in self.data and text in self.data[dtype]: if self.data[dtype][text] == 'XXX': return None return self.data[dtype][text] if not ask: return None pywikibot.output("Trying to get a {} out of '{}'".format(dtype, text)) answer = pywikibot.input( 'Type Qnnn to let it point to Qnnn from now on,\n' 'Xnnn to let it point to Qnnn only now,\n' 'XXX to never use it, or nothing to not use it now') if answer.startswith('Q'): self.data[dtype][text] = answer elif answer.upper() == 'XXX': self.data[dtype][text] = 'XXX' answer = None elif answer.startswith('X'): answer = 'Q' + answer[1:] else: answer = None return answer
[docs] def findclaims(self): if not self.id: return self.html = '' if not self.url and not self.sparqlquery: return newclaims = [] pywikibot.output() pagerequest = None if not self.skipfirst: try: pywikibot.output('Getting {}'.format(self.url)) if 'https' in self.url: context = ssl._create_unverified_context() pagerequest = urlopen(self.url, context=context) else: pagerequest = urlopen(self.url) except (HTTPError, URLError, ConnectionResetError): if self.urlbase2: self.urlbase = self.urlbase2 pywikibot.output('Getting {}'.format(self.url)) if 'https' in self.url: context = ssl._create_unverified_context() pagerequest = urlopen(self.url, context=context) else: pagerequest = urlopen(self.url) else: pywikibot.output('Unable to load {}'.format(self.url)) return [] except UnicodeEncodeError: pywikibot.output('Unable to receive page {} - not unicode?' .format(self.url)) pagerequest = None self.html = '' if pagerequest: pagebytes = pagerequest.read() try: self.html = pagebytes.decode('utf-8') except UnicodeDecodeError: self.html = str(pagebytes) for extraurl in self.extraurls: try: pywikibot.output('Getting {}'.format(extraurl)) if 'https' in self.url: context = ssl._create_unverified_context() pagerequest = urlopen(extraurl, context=context) else: pagerequest = urlopen(extraurl) except (HTTPError, URLError, UnicodeEncodeError): pywikibot.output('Unable to receive altpage') else: pagebytes = pagerequest.read() try: self.html = self.html + '\n' + pagebytes.decode('utf-8') except UnicodeDecodeError: self.html = self.html + '\n' + str(pagebytes) if self.sparqlquery: self.html = str(sparql.SparqlQuery().select(self.sparqlquery)) if not self.html: return if self.escapeunicode: self.html = self.html.encode().decode('unicode-escape') if self.escapehtml: self.html = unescape(self.html) if self.escapeurl: self.html = unquote(self.html) self.html = self.prepare(self.html) pywikibot.output('\n=== {} ({}) ===='.format(self.dbname, self.dbproperty)) if self.hrtre: match = re.compile('(?s)' + self.hrtre).search(self.html) if match: text = match.group(1) text = text.replace('\\n', '\n') text = text.replace('\\t', '\t') text = text.replace('\\r', '\n') text = text.replace('\r', '\n') text = text.replace('\t', ' ') oldtext = '' while oldtext != text: oldtext = text text = self.SCRIPTRE.sub('', text) oldtext = '' while oldtext != text: oldtext = text text = self.TAGRE.sub(' ', text) while '&nbsp;' in text: text = text.replace('&nbsp;', ' ') while ' ' in text: text = text.replace(' ', ' ') while '\n ' in text: text = text.replace('\n ', '\n') while '\n\n' in text: text = text.replace('\n\n', '\n') text = text.strip() pywikibot.output(text) pywikibot.output('-' * (len(self.dbname) + 8)) for (function, prop) in [ (self.findinstanceof, 'P31'), (self.findfirstname, 'P735'), (self.findlastname, 'P734'), ]: result = function(self.html) if result: newclaims.append((prop, result.strip(), None)) for (function, prop) in [ (self.findcountries, 'P17'), (self.findspouses, 'P26'), (self.findpartners, 'P451'), (self.findworkplaces, 'P937'), (self.findresidences, 'P551'), (self.findoccupations, 'P106'), (self.findworkfields, 'P101'), (self.findpositions, 'P39'), (self.findtitles, 'P97'), (self.findemployers, 'P108'), (self.findranks, 'P410'), (self.findschools, 'P69'), (self.findethnicities, 'P172'), (self.findcrimes, 'P1399'), (self.findcomposers, 'P86'), (self.findmoviedirectors, 'P57'), (self.findartdirectors, 'P3174'), (self.findscreenwriters, 'P58'), (self.findproducers, 'P162'), (self.finddirectorsphotography, 'P344'), (self.findmovieeditors, 'P1040'), (self.findproductiondesigners, 'P2554'), (self.findsounddesigners, 'P5028'), (self.findcostumedesigners, 'P2515'), (self.findmakeupartists, 'P4805'), (self.findarchitects, 'P84'), (self.findgenres, 'P136'), (self.findengines, 'P408'), (self.findgamemodes, 'P404'), (self.findcast, 'P161'), (self.findmaterials, 'P186'), (self.finddevelopers, 'P178'), (self.findpublishers, 'P123'), (self.findprodcoms, 'P272'), (self.finddistcoms, 'P750'), (self.findoriglanguages, 'P364'), (self.findcolors, 'P462'), (self.findlanguagesspoken, 'P1412'), (self.findlanguages, 'P407'), (self.findnativelanguages, 'P103'), (self.findpseudonyms, 'P742'), (self.findparts, 'P527'), (self.findpartofs, 'P361'), (self.findinstruments, 'P1303'), (self.findlabels, 'P264'), (self.findsports, 'P641'), (self.findawards, 'P166'), (self.findnominations, 'P1411'), (self.findmemberships, 'P463'), (self.findsportteams, 'P54'), (self.findparties, 'P102'), (self.findbranches, 'P241'), (self.findconflicts, 'P607'), (self.findteampositions, 'P413'), (self.findpolitical, 'P1142'), (self.findstudents, 'P802'), (self.finddocstudents, 'P185'), (self.findteachers, 'P1066'), (self.findadvisors, 'P184'), (self.findinfluences, 'P737'), (self.finddegrees, 'P512'), (self.findmajors, 'P812'), (self.findparticipations, 'P1344'), (self.findnationalities, 'P27'), (self.findsportcountries, 'P1532'), (self.findreligions, 'P140'), (self.findchildren, 'P40'), (self.findsiblings, 'P3373'), (self.findkins, 'P1038'), (self.findincollections, 'P6379'), (self.findinworks, 'P1441'), (self.findmovements, 'P135'), (self.findorigcountries, 'P495'), (self.findwebpages, 'P973'), (self.findsources, 'P1343'), (self.findchoriginplaces, 'P1321'), (self.findpatronof, 'P2925'), (self.findnotableworks, 'P800'), (self.findparticipantins, 'P1344'), (self.findplatforms, 'P400'), (self.findfranchises, 'P8345'), (self.findvoices, 'P412'), ]: results = function(self.html) or [] for result in results: if result is not None and str(result).strip() and result != self.item: newclaims.append((prop, result.replace('\n', ' '), self)) for (function, prop) in [ (self.findfirstnames, 'P735'), (self.findlastnames, 'P734'), ]: results = function(self.html) or [] for result in results: if result is not None and str(result).strip() \ and result != self.item: newclaims.append((prop, result.replace('\n', ' '), None)) for (function, prop) in [ (self.findcountry, 'P17'), (self.findgender, 'P21'), (self.findfather, 'P22'), (self.findmother, 'P25'), (self.findreligion, 'P140'), (self.findadminloc, 'P131'), (self.findlocation, 'P276'), (self.findformationlocation, 'P740'), (self.findbirthplace, 'P19'), (self.finddeathplace, 'P20'), (self.findmannerdeath, 'P1196'), (self.findcausedeath, 'P509'), (self.findburialplace, 'P119'), (self.findorigcountry, 'P495'), (self.findnationality, 'P27'), (self.findethnicity, 'P172'), (self.findorientation, 'P91'), (self.findaddress, 'P969'), (self.findhaircolor, 'P1884'), (self.finduse, 'P366'), (self.findmountainrange, 'P4552'), (self.findviaf, 'P214'), (self.findrelorder, 'P611'), (self.findtwitter, 'P2002'), (self.findfacebook, 'P2013'), (self.findfacebookpage, 'P4003'), (self.findchoriginplace, 'P1321'), (self.findwebsite, 'P856'), (self.findvoice, 'P412'), (self.findfamily, 'P53'), (self.findgens, 'P5025'), (self.findchesstitle, 'P2962'), (self.findfeastday, 'P841'), (self.findbloodtype, 'P1853'), (self.findeyecolor, 'P1340'), ]: result = function(self.html) if result and not ( prop == 'P856' and 'wikipedia.org' in result or prop in ['P2013', 'P4003'] and result == 'pages'): newclaims.append((prop, result.strip(), self)) for (function, prop) in [ (self.findbirthdate, 'P569'), (self.finddeathdate, 'P570'), (self.findbaptismdate, 'P1636'), (self.findburialdate, 'P4602'), (self.findinception, 'P571'), (self.findpremiere, 'P1191'), (self.finddissolution, 'P576'), (self.findpubdate, 'P577'), (self.findfloruitstart, 'P2031'), (self.findfloruitend, 'P2032'), ]: result = function(self.html) if result: result = result.strip() if '?' not in result and re.search(r'\d{3}', result): newclaims.append((prop, '!date!' + result, self)) for (function, prop) in [ (self.findpubdates, 'P577'), ]: results = function(self.html) or [] for result in results: result = result.strip() if '?' not in result and re.search(r'\d{3}', result): newclaims.append((prop, '!date!' + result, self)) for function in [self.findfloruit]: result = function(self.html) if result: result = result.strip().lstrip('(').rstrip(')') result = result.replace('–', '-').replace('‑', '-') if '-' in result: (start, end) = [r.strip() for r in result.split('-', 1)] if start == end: newclaims.append(('P1317', '!date!' + start, self)) else: newclaims.append(('P2031', '!date!' + start, self)) newclaims.append(('P2032', '!date!' + end, self)) else: newclaims.append( ('P1317', '!date!' + result.strip(), self)) for (function, prop) in [ (self.findfloorsabove, 'P1101'), (self.findfloorsbelow, 'P1139'), ]: result = function(self.html) if result: newclaims.append((prop, str(int(result)), self)) for (function, prop) in [ (self.findheights, 'P2048'), (self.findweights, 'P2067'), (self.findelevations, 'P2044'), (self.finddurations, 'P2047'), (self.findprominences, 'P2660'), (self.findisolations, 'P2659'), ]: results = function(self.html) or [] for result in results: if result and result.strip(): newclaims.append((prop, '!q!' + result, self)) for (function, prop) in [ (self.findimage, 'P18'), (self.findcoatarms, 'P94'), (self.findsignature, 'P109'), (self.findlogo, 'P154'), ]: result = function(self.html) if result: result = re.sub('(<.*?>)', '', result) result = result.split('>')[-1] if len(result.strip()) > 2 and '.' in result: newclaims.append((prop, '!i!' + result, self)) result = self.findisni(self.html) if result: m = re.search(r'(\d{4})\s*(\d{4})\s*(\d{4})\s*(\w{4})', result) if m: newclaims.append(('P213', '{} {} {} {}' .format(*m.groups()), self)) for (prop, result) in self.findmixedrefs(self.html) or []: if result is not None: result = result.strip() if prop in ['P1309', 'P1255']: result = result.replace('vtls', '') elif prop == 'P1368': result = result.split('-')[-1] elif prop == 'P409': result = result.strip().lstrip('0') elif prop == 'P396' and '\\' not in result: result = result.replace('%5C', '\\') if '\\' not in result: m = re.match(r'^(.*?)(\d+)', result) result = 'IT\\ICCU\\{}\\{}'.format(*m.groups()) if result: newclaims.append((prop, result, self)) pywikibot.output() for (function, prop) in [ (self.findcoords, 'coordinates'), ]: result = function(self.html) if result: pywikibot.output('Please add yourself: {} - {}' .format(prop, result)) return newclaims
[docs] def prepare(self, html): return html
[docs] @staticmethod def singlespace(text): text = text.replace('\n', ' ') while ' ' in text: text = text.replace(' ', ' ') return text.strip()
[docs] def getdescriptions(self): return [(self.language, self.singlespace(unescape(self.TAGRE.sub(' ', x)))) for x in self.finddescriptions(self.html) or [] if x] \ + [(language, self.singlespace(unescape(self.TAGRE.sub(' ', x)))) for (language, x) in self.findlanguagedescriptions(self.html) or [] if x]
[docs] def longtext(self): result = self.TAGRE.sub(' ', self.findlongtext(self.html) or '') result = result.replace('\t', '\n').replace('\r', '') while ' ' in result: result = result.replace(' ', ' ') if '\n ' in result: result = result.replace('\n ', '\n') if ' \n' in result: result = result.replace(' \n', '\n') while '\n\n' in result: result = result.replace('\n\n', '\n') return result.strip()
[docs] def findlongtext(self, html): return None
[docs] def finddescriptions(self, html): return [self.finddescription(html)]
[docs] def findlanguagedescriptions(self, html): return None
[docs] def finddescription(self, html): return None
[docs] def getlanguage(self, code): if not code: return self.language translation = { 'cz': 'cs', 'hbo': 'he', 'simple': 'en', 'be-tarask': 'be-x-old', 'nb': 'no', } if code in translation: return translation[code] if code[-1] in '123456789': return self.getlanguage(code[:-1]) return code.replace('_', '-')
[docs] def findwikipedianames(self, html): links = self.findallbyre( r'//(\w+\.wikipedia\.org/wiki/[^\'"<>\s]+)', html) return [(self.getlanguage(link.split('.')[0]), unescape(unquote(link.split('/')[-1].replace( '_', ' '))).split('(')[0]) for link in links]
[docs] def getnames(self): return [(self.language, (self.commastrip(term))) for term in self.findnames(self.html) or [] if term and term.strip()] \ + [(self.getlanguage(language), self.commastrip(term)) for (language, term) in self.findlanguagenames(self.html) or [] if term and term.strip()] \ + self.findwikipedianames(self.html)
[docs] def findnames(self, html): return None
[docs] def findlanguagenames(self, html): return None
[docs] def findinstanceof(self, html): return None
[docs] def findgender(self, html): return None
[docs] def findfather(self, html): return None
[docs] def findmother(self, html): return None
[docs] def findspouses(self, html): return None
[docs] def findpartners(self, html): return None
[docs] def findreligion(self, html): return None
[docs] def findreligions(self, html): return None
[docs] def findchildren(self, html): return None
[docs] def findsiblings(self, html): return None
[docs] def findkins(self, html): return None
[docs] def findcountry(self, html): return None
[docs] def findcountries(self, html): return None
[docs] def findorigcountry(self, html): return None
[docs] def findadminloc(self, html): return None
[docs] def findlocation(self, html): return None
[docs] def findformationlocation(self, html): return None
[docs] def findbirthplace(self, html): return None
[docs] def finddeathplace(self, html): return None
[docs] def findburialplace(self, html): return None
[docs] def findmannerdeath(self, html): return None
[docs] def findcausedeath(self, html): return None
[docs] def findchoriginplace(self, html): return None
[docs] def findchoriginplaces(self, html): return None
[docs] def findworkplaces(self, html): return None
[docs] def findresidences(self, html): return None
[docs] def findnationality(self, html): return None
[docs] def findethnicity(self, html): return None
[docs] def findethnicities(self, html): return None
[docs] def findorientation(self, html): return None
[docs] def findnationalities(self, html): return None
[docs] def findsportcountries(self, html): return None
[docs] def findfirstname(self, html): return None
[docs] def findlastname(self, html): return None
[docs] def findfirstnames(self, html): return None
[docs] def findlastnames(self, html): return None
[docs] def findaddress(self, html): return None
[docs] def findhaircolor(self, html): return None
[docs] def findcoords(self, html): return None
[docs] def findbirthdate(self, html): return None
[docs] def finddeathdate(self, html): return None
[docs] def findbaptismdate(self, html): return None
[docs] def findburialdate(self, html): return None
[docs] def findinception(self, html): return None
[docs] def findpremiere(self, html): return None
[docs] def finddissolution(self, html): return None
[docs] def findpubdate(self, html): return None
[docs] def findpubdates(self, html): return None
[docs] def findfloruit(self, html): return None
[docs] def findfloruitstart(self, html): return None
[docs] def findfloruitend(self, html): return None
[docs] def findheights(self, html): return [self.findheight(html)]
[docs] def findheight(self, html): return None
[docs] def findweights(self, html): return [self.findweight(html)]
[docs] def findweight(self, html): return None
[docs] def findoccupations(self, html): return None
[docs] def findworkfields(self, html): return None
[docs] def findpositions(self, html): return None
[docs] def findtitles(self, html): return None
[docs] def findemployers(self, html): return None
[docs] def findranks(self, html): return None
[docs] def findschools(self, html): return None
[docs] def findcrimes(self, html): return None
[docs] def findmoviedirectors(self, html): return None
[docs] def findartdirectors(self, html): return None
[docs] def findscreenwriters(self, html): return None
[docs] def findproducers(self, html): return None
[docs] def finddirectorsphotography(self, html): return None
[docs] def findmovieeditors(self, html): return None
[docs] def findproductiondesigners(self, html): return None
[docs] def findsounddesigners(self, html): return None
[docs] def findcostumedesigners(self, html): return None
[docs] def findmakeupartists(self, html): return None
[docs] def findcomposers(self, html): return None
[docs] def findarchitects(self, html): return None
[docs] def findgenres(self, html): return None
[docs] def findengines(self, html): return None
[docs] def findgamemodes(self, html): return None
[docs] def findcast(self, html): return None
[docs] def findmaterials(self, html): return None
[docs] def finddevelopers(self, html): return None
[docs] def findpublishers(self, html): return None
[docs] def findprodcoms(self, html): return None
[docs] def finddistcoms(self, html): return None
[docs] def findoriglanguages(self, html): return None
[docs] def finddurations(self, html): return None
[docs] def findprominences(self, html): return None
[docs] def findisolations(self, html): return None
[docs] def findlanguagesspoken(self, html): return None
[docs] def findlanguages(self, html): return None
[docs] def findnativelanguages(self, html): return None
[docs] def findcolors(self, html): return None
[docs] def finduse(self, html): return None
[docs] def findfloorsabove(self, html): return None
[docs] def findfloorsbelow(self, html): return None
[docs] def findelevations(self, html): return None
[docs] def findmountainrange(self, html): return None
[docs] def findrelorder(self, html): return None
[docs] def findwebsite(self, html): return None
[docs] def findwebpages(self, html): return None
[docs] def findsources(self, html): return None
[docs] def findvoice(self, html): return None
[docs] def findvoices(self, html): return None
[docs] def findfamily(self, html): return None
[docs] def findgens(self, html): return None
[docs] def findpseudonyms(self, html): return None
[docs] def findparts(self, html): return None
[docs] def findpartofs(self, html): return None
[docs] def findinstruments(self, html): return None
[docs] def findlabels(self, html): return None
[docs] def findsports(self, html): return None
[docs] def findawards(self, html): return None
[docs] def findnominations(self, html): return None
[docs] def findmemberships(self, html): return None
[docs] def findsportteams(self, html): return None
[docs] def findparties(self, html): return None
[docs] def findbranches(self, html): return None
[docs] def findconflicts(self, html): return None
[docs] def findteampositions(self, html): return None
[docs] def findpolitical(self, html): return None
[docs] def findstudents(self, html): return None
[docs] def finddocstudents(self, html): return None
[docs] def findteachers(self, html): return None
[docs] def findadvisors(self, html): return None
[docs] def findinfluences(self, html): return None
[docs] def finddegrees(self, html): return None
[docs] def findmajors(self, html): return None
[docs] def findparticipations(self, html): return None
[docs] def findviaf(self, html): return None
[docs] def findisni(self, html): return None
[docs] def findtwitter(self, html): return None
[docs] def findfacebook(self, html): return None
[docs] def findfacebookpage(self, html): return None
[docs] def findincollections(self, html): return None
[docs] def findinworks(self, html): return None
[docs] def findmovements(self, html): return None
[docs] def findorigcountries(self, html): return None
[docs] def findchesstitle(self, html): return None
[docs] def findfeastday(self, html): return None
[docs] def findbloodtype(self, html): return None
[docs] def findeyecolor(self, html): return None
[docs] def findpatronof(self, html): return None
[docs] def findnotableworks(self, html): return None
[docs] def findparticipantins(self, html): return None
[docs] def findplatforms(self, html): return None
[docs] def findfranchises(self, html): return None
[docs] def findimage(self, html): return None
[docs] def findcoatarms(self, html): return None
[docs] def findsignature(self, html): return None
[docs] def findmixedrefs(self, html): return None
[docs] def finddefaultmixedrefs(self, html, includesocial=True): defaultmixedrefs = [ ('P214', self.findbyre(r'viaf.org/(?:viaf/)?(\d+)', html)), ('P227', self.findbyre(r'd-nb\.info/(?:gnd/)?([\d\-xX]+)', html)), ('P244', self.findbyre( r'id\.loc\.gov/authorities/\w+/(\w+)', html)), ('P244', self.findbyre(r'https?://lccn\.loc\.gov/(\w+)', html)), ('P245', self.findbyre( r'https?://www.getty.edu/[^"\'\s]+subjectid=(\w+)', html)), ('P245', self.findbyre(r'getty.edu/page/ulan/(\w+)', html)), ('P268', self.findbyre( r'https?://catalogue.bnf.fr/ark./\d+/(?:cb)?(\w+)', html)), ('P268', self.findbyre(r'data\.bnf\.fr/ark:/\d+/cb(\w+)', html)), ('P269', self.findbyre(r'https?://\w+.idref.fr/(\w+)', html)), ('P345', self.findbyre(r'https?://www.imdb.com/\w+/(\w+)', html)), ('P349', self.findbyre( r'https?://id.ndl.go.jp/auth/[^"\'\s]+/(\w+)', html)), ('P396', self.findbyre( r'opac\.sbn\.it/opacsbn/opac/[^<>\'"\s]+\?bid=([^\s\'"<>]+)', html)), ('P409', self.findbyre( r'https?://nla.gov.au/anbd.aut-an(\w+)', html)), ('P434', self.findbyre( r'https?://musicbrainz.org/\w+/([\w\-]+)', html)), ('P496', self.findbyre(r'https?://orcid.org/([\d\-]+)', html)), ('P535', self.findbyre( r'https?://www.findagrave.com/memorial/(\w+)', html)), ('P535', self.findbyre( r'https?://www.findagrave.com/cgi-bin/fg.cgi\?[^<>"\']*id=(\w+)', html)), ('P549', self.findbyre( r'genealogy.math.ndsu.nodak.edu/id.php\?id=(\w+)', html)), ('P650', self.findbyre( r'https?://rkd.nl(?:/\w+)?/explore/artists/(\w+)', html)), ('P651', self.findbyre( r'biografischportaal\.nl/persoon/(\w+)', html)), ('P723', self.findbyre( r'dbnl\.(?:nl|org)/auteurs/auteur.php\?id=(\w+)', html)), ('P723', self.findbyre( r'data.bibliotheken.nl/id/dbnla/(\w+)', html)), ('P866', self.findbyre(r'perlentaucher.de/autor/([\w\-]+)', html)), ('P902', self.findbyre( r'hls-dhs-dss.ch/textes/\w/[A-Z]?(\d+)\.php', html)), ('P906', self.findbyre( r'libris.kb.se/(?:resource/)?auth/(\w+)', html)), ('P950', self.findbyre( r'catalogo.bne.es/[^"\'\s]+authority.id=(\w+)', html)), ('P1006', self.findbyre( r'data.bibliotheken.nl/id/thes/p(\d+X?)', html)), ('P1047', self.findbyre( r'catholic-hierarchy.org/\w+/b(.+?)\.html', html)), ('P1220', self.findbyre(r'//ibdb.com/person.php\?id=(\d+)', html)), ('P1233', self.findbyre( r'https?://www.isfdb.org/cgi-bin/ea.cgi\?(\d+)', html)), ('P1415', self.findbyre( r'doi\.org/\d+\.\d+/ref:odnb/(\d+)', html)), ('P1417', self.findbyre( r'https://www.britannica.com/([\w\-/]+)', html)), ('P1422', self.findbyre(r'ta.sandrartnet/-person-(\w+)', html)), ('P1563', self.findbyre( r'https?://www-history.mcs.st-andrews.ac.uk/Biographies/([^\'"<>\s]+)', html)), ('P1728', self.findbyre( r'https?://www.allmusic.com/artist/[\w\-]*?(mn/d+)', html)), ('P1749', self.findbyre( r'https?://www.parlement(?:airdocumentatiecentrum)?.(?:com|nl)/id/(\w+)', html)), ('P1788', self.findbyre( r'huygens.knaw.nl/vrouwenlexicon/lemmata/data/([^"\'<>\s]+)', html)), ('P1802', self.findbyre( r'https?://emlo.bodleian.ox.ac.uk/profile/person/([\w\-]+)', html)), ('P1842', self.findbyre( r'https?://gameo.org/index.php\?title=([^\'"\s]+)', html)), ('P1871', self.findbyre( r'https?://(?:data|thesaurus).cerl.org/(?:thesaurus|record)/(\w+)', html)), ('P1871', self.findbyre( r'thesaurus.cerl.org/cgi-bin/record.pl\?rid=(\w+)', html)), ('P1902', self.findbyre( r'https?://open.spotify.com/artist/(\w+)', html)), ('P1907', self.findbyre( r'https?://adb.anu.edu.au/biography/([\w\-]+)', html)), ('P1938', self.findbyre( r'https?://www.gutenberg.org/ebooks/author/(\d+)', html)), ('P1953', self.findbyre( r'https?://www.discogs.com/(\w+/)?artist/(\d+)', html)), ('P1986', self.findbyre( r'treccani.it/enciclopedia/([\w\-_]+)_\(Dizionario-Biografico\)', html)), ('P2016', self.findbyre( r'hoogleraren\.ub\.rug\.nl/hoogleraren/(\w+)', html)), ('P2038', self.findbyre( r'https?://www.researchgate.net/profile/([^\'"<>\s\?]+)', html)), ('P2163', self.findbyre(r'id\.worldcat\.org/fast/(\d+)', html)), ('P2332', self.findbyre(r'/arthistorians\.info/(\w+)', html)), ('P2372', self.findbyre(r'odis\.be/lnk/([\w_]+)', html)), ('P2373', self.findbyre( r'https?://genius.com/artists/([^\s\'"]*)', html)), ('P2397', self.findbyre(r'youtube\.com/channel/([\w\-_]+)', html)), ('P2454', self.findbyre( r'https?://www.dwc.knaw.nl/[^\'"\s]+=(\w+)', html)), ('P2456', self.findbyre( r'https?://dblp.uni-trier.de/pid/([\w/]+)', html)), ('P2469', self.findbyre(r'theatricalia.com/person/(\w+)', html)), ('P2639', (self.findbyre( r'filmportal.de/person/(\w+)', html) or '').lower() or None), ('P2722', self.findbyre(r'deezer.com/artist/(\w+)', html)), ('P2799', self.findbyre( r'cervantesvirtual.com/person/(\d+)', html)), ('P2850', self.findbyre( r'https?://itunes.apple.com(?:/\w{2})?/(?:id)?(\d+)', html)), ('P2909', self.findbyre( r'https?://www.secondhandsongs.com/artist/(\w+)', html)), ('P2915', self.findbyre( r'vondel.humanities.uva.nl/ecartico/persons/(\d+)', html)), ('P2941', self.findbyre( r'munksroll.rcplondon.ac.uk/Biography/Details/(\d+)', html)), ('P2949', self.findbyre( r'www\.wikitree\.com/wiki/(\w+-\d+)', html)), ('P2963', self.findbyre( r'goodreads\.com/author/show/(\d+)', html)), ('P2969', self.findbyre(r'goodreads\.com/book/show/(\d+)', html)), ('P3040', self.findbyre( r'https?://soundcloud.com/([\w\-]+)', html)), ('P3192', self.findbyre( r'https?://www.last.fm/music/([^\'"\s]+)', html)), ('P3217', self.findbyre( r'https?://sok.riksarkivet.se/sbl/Presentation.aspx\?id=(\d+)', html)), ('P3217', self.findbyre( r'https?://sok.riksarkivet.se/sbl/artikel/(\d+)', html)), ('P3241', self.findbyre( r'https?://www.newadvent.org/cathen/(\w+)\.htm', html)), ('P3265', self.findbyre( r'https?://myspace.com/([\w\-_/]+)', html)), ('P3365', self.findbyre( r'treccani.it/enciclopedia/([\w\-_]+)', html)), ('P3368', self.findbyre( r'https?://prabook.com/web/[^/<>"\']+/(\d+)', html)), ('P3368', self.findbyre( r'prabook.com/web/person-view.html\?profileId=(\d+)', html)), ('P3435', self.findbyre(r'vgmdb\.net/artist/(\w+)', html)), ('P3478', self.findbyre(r'songkick\.com/artists/(\w+)', html)), ('P3630', self.findbyre( r'https?://www.babelio.com/auteur/[^<>\'"\s]+/(\d+)', html)), ('P3854', self.findbyre( r'soundtrackcollector.com/\w+/(\w+)', html)), ('P4013', self.findbyre(r'https?://giphy.com/(\w+)', html)), ('P4073', self.findbyre(r'(\w+)\.wikia\.com', html)), ('P4198', self.findbyre( r'play.google.com/store/music/artist\?id=(\w+)', html)), ('P4223', self.findbyre( r'treccani.it/enciclopedia/([\w\-_]+)_\(Enciclopedia-Italiana\)', html)), ('P4228', self.findbyre( r'www.eoas.info/biogs/([^\s]+)\.html', html)), ('P4228', self.findbyre( r'www.eoas.info%2Fbiogs%2F([^\s]+)\.html', html)), ('P4252', self.findbyre( r'www.mathnet.ru/[\w/\.]+\?.*?personid=(\w+)', html)), ('P4862', self.findbyre( r'https?://www.amazon.com/[\w\-]*/e/(\w+)', html)), ('P5357', self.findbyre( r'sf-encyclopedia.com/entry/([\w_]+)', html)), ('P5404', self.findbyre( r'rateyourmusic.com/artist/([^\'"<>\s]+)', html)), ('P5431', self.findbyre( r'https?://www.setlist.fm/setlists/[\w\-]*?(\w+).html', html)), ('P5570', self.findbyre( r'www.noosfere.org/[\w\./]+\?numauteur=(\w+)', html)), ('P5882', self.findbyre( r'www\.muziekweb\.nl/\w+/(\w+)', html)), ('P5924', self.findbyre( r'lyrics.wikia.com/wiki/([^\'"<>\s]*)', html)), ('P6194', self.findbyre( r'biographien\.ac.\at/oebl/oebl_\w/[^\s\.]+\.', html)), ('P6517', self.findbyre( r'whosampled.com/([^\'"<>/\s]+)', html)), ('P6594', self.findbyre( r'gf\.org/fellows/all-fellows/([\w\-]+)', html)), ('P7032', self.findbyre( r'historici.nl/Onderzoek/Projecten/Repertorium/app/personen/(\d+)', html)), ('P7032', self.findbyre( r'repertoriumambtsdragersambtenaren1428-1861/app/personen/(\d+)', html)), ('P7195', self.findbyre( r'https?://www.bandsintown.com/\w+/(\d+)', html)), ('P7545', self.findbyre( r'https?://www.askart.com/artist/[\w_]*/(\d+)/', html)), ('P7620', self.findbyre( r'treccani.it/enciclopedia/([\w\-]+)_\(Enciclopedia_dei_Papi\)', html)), ('P7902', self.findbyre( r'www.deutsche-biographie.de/pnd(\w+)\.html', html)), ('P8034', self.findbyre( r'viaf.org/viaf/sourceID/BAV\|(\w+)', html)), ('P9029', self.findbyre( r'viceversalitterature\.ch/author/(\d+)', html)), ] if includesocial: defaultmixedrefs += [ ('P2002', self.findbyre( r'https?://(?:www\.)?twitter.com/#?(\w+)', html)), ('P2003', self.findbyre( r'https?://(?:\w+\.)?instagram.com/([^/\s\'"]{2,})', html)), ('P2013', self.findbyre( r'https?://www.facebook.com/(?:pg/)?([^/\s\'"<>\?]+)', html)), ('P2847', self.findbyre( r'https?://plus.google.com/(\+?\w+)', html)), ('P2850', self.findbyre( r'https?://itunes.apple.com/(?:\w+/)?artist/(?:\w*/)?[a-z]{0,2}(\d{3,})', html)), ('P3258', self.findbyre( r'https?://([\w\-]+)\.livejournal.com', html)), ('P3258', self.findbyre( r'https?://users\.livejournal.com/(\w+)', html)), ('P3265', self.findbyre( r'https?://www.myspace.com/([\w\-]+)', html)), ('P3283', self.findbyre( r'https?://([^/"\']+)\.bandcamp.com', html)), ('P4003', self.findbyre( r'https?://www.facebook.com/pages/([^\s\'"<>\?]+)', html)), ('P4175', self.findbyre( r'https://www.patreon.com/([\w\-]+)', html)), ('P6634', self.findbyre( r'\.linkedin\.com/in/([\w\-]+)', html)), ] result = [pair for pair in defaultmixedrefs if pair[0] != self.dbproperty] isniresult = re.search( r'isni\.org/isni/(\d{4})(\d{4})(\d{4})(\w{4})', html) if isniresult: result.append(('P213', '{} {} {} {}'.format(*isniresult.groups()))) commonsresult = self.findbyre( r'commons\.wikimedia\.org/wiki/\w+:([^\'"<>\s]+)', html) if commonsresult: result += [('P18', '!i!' + commonsresult)] return [r for r in result if r[1] and not (r[0] == 'P2002' and r[1] == 'intent') and not (r[0] == 'P2013' and r[1].startswith('pages')) and not (r[0] == 'P2013' and r[1] in ['pg', 'plugins', 'sharer']) and not (r[0] == 'P214' and r[1].lower() == 'sourceid') and not (r[0] == 'P3258' and r[1].lower() in ['users', 'comunity', 'www']) and r[1].lower() != 'search' and not (r[0] == 'P3365' and ('(Dizionario_Biografico)' in r[1] or '(Enciclopedia-Italiana)' in r[1] or '(Enciclopedia-dei-Papi)' in r[1])) and not (r[0] == 'P2013' and '.php' in r[1])]
[docs] def findbyre(self, regex, html, dtype=None, skips=None, alt=None) -> str: if not skips: skips = [] if not alt: alt = [] m = re.search(regex, html) if not m: return None if dtype: alt = [dtype] + alt for alttype in alt: if self.getdata(alttype, m.group(1), ask=False) \ and self.getdata(alttype, m.group(1), ask=False) != 'XXX': return self.getdata(alttype, m.group(1), ask=False) for skip in skips: if self.getdata(skip, m.group(1), ask=False) \ and self.getdata(skip, m.group(1), ask=False) != 'XXX': return None if dtype: return self.getdata(dtype, m.group(1)) return m.group(1)
[docs] def findallbyre(self, regex, html, dtype=None, skips=None, alt=None) -> list: if not skips: skips = [] if not alt: alt = [] if dtype: alt = [dtype] + alt matches = re.findall(regex, html) result = set() for match in matches: doskip = False for alttype in alt: if self.getdata(alttype, match, ask=False) and self.getdata(alttype, match, ask=False) != 'XXX': result.add(self.getdata(alttype, match, ask=False)) doskip = True break for skip in skips: if self.getdata(skip, match, ask=False) and self.getdata(skip, match, ask=False) != 'XXX': doskip = True if doskip: continue if dtype: newresult = self.getdata(dtype, match) if newresult: result.add(newresult) else: result.add(match) return list(result)
[docs]class IsniAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P213' self.dbid = 'Q423048' self.dbname = 'International Standard Name Identifier' self.id = self.id.replace(' ', '') self.id = self.id[:4] + '+' + self.id[4:8] + '+' + self.id[8:12] + '+' + self.id[12:] self.urlbase = 'http://www.isni.org/{id}' self.urlbase3 = 'https://isni.oclc.org/DB=1.2/CMD?ACT=SRCH&IKT=8006&TRM=ISN%3A{id}&TERMS_OF_USE_AGREED=Y&terms_of_use_agree=send' self.skipfirst = True self.hrtre = '(<span class="rec.mat.long">.*?</span>)Sources' self.isperson = False self.language = 'en'
@property def url(self): # TODO: check whether this is right or needed return 'http://www.isni.org/{id}'.format(id=self.id).replace(' ', '')
[docs] def findlanguagenames(self, html): # TODO: check whether this is right or needed section = self.findbyre(r'(?s)>Name</td></tr>(.*?)</tr>', html) if section: return [('en', name) for name in self.findallbyre(r'(?s)<span>(.*?)(?:\([^{}<>]*\))?\s*</span>', section)]
[docs] def getvalues(self, field, html, dtype=None) -> List[str]: section = self.findbyre('(?s)<td class="rec_lable"><div><span>%s:.*?<td class="rec_title">(.*?)</td>', html) if section: return self.findallbyre('<span>(.*?)<', html, dtype) return []
[docs] def findnames(self, html): return [self.findbyre(r'([^\(]*)', name) for name in self.getvalues('Name', html)]
[docs] def finddescriptions(self, html): return [self.findbyre(r'\((.*?)\)', name) for name in self.getvalues('Name', html)]
[docs] def findinstanceof(self, html): result = self.findbyre(r'<span class="rec.mat.long"><img alt="(.*?)"', html, 'instanceof') self.isperson = result == 'Q5' return result
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html, includesocial=False)
[docs] def findoccupations(self, html): if self.isperson: return self.getvalues('Creation role', html, 'occupation')
[docs] def findbirthdate(self, html): if self.isperson: dates = self.getvalues('Dates', html) if dates: return self.findbyre(r'(.*?)-', dates[0])
[docs] def finddeathdate(self, html): if self.isperson: dates = self.getvalues('Dates', html) if dates: return self.findbyre(r'-(.*?)', dates[0])
[docs]class ViafAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P214' self.dbid = 'Q54919' self.dbname = 'Virtual International Authority File' self.urlbase = 'https://viaf.org/viaf/{id}/' self.hrtre = '(<ns1:Document.*?)<ns1:history>' self.language = 'en' self.escapehtml = True self.sourcelanguage = { 'DNB': 'de', 'LC': 'en', 'JPG': 'en', 'SUDOC': 'fr', 'NDL': 'ja', 'NLA': 'en', 'NKC': 'cs', 'SELIBR': 'sv', 'NLI': 'he', 'BNE': 'es', 'PTBNP': 'pt', 'NTA': 'nl', 'BIBSYS': 'nb', 'BAV': 'en', 'NUKAT': 'pl', 'BNC': 'ca', 'EGAXA': 'en', 'LNB': 'lv', 'NSK': 'hr', 'LAC': 'en', 'NLP': 'pl', 'BNCHL': 'es', 'N6I': 'en', 'FAST': 'en', 'RERO': 'fr', 'B2Q': 'fr', 'DBC': 'da', 'BLBNB': 'pt', 'KRNLK': 'ko', 'ISNI': 'en', 'BNF': 'fr', 'DE663': 'de', 'WKP': 'en', 'VLACC': 'nl', 'ERRR': 'et', 'NII': 'ja', 'BNL': 'fr', 'SWNL': 'fr', 'NLR': 'ru', 'ICCU': 'it', 'LNL': 'ar', 'W2Z': 'nb', 'LIH': 'lt', 'UIY': 'is', 'CAOONL': 'en', 'SIMACOB': 'sl', 'CYT': 'zh', 'SZ': 'de', 'PLWABN': 'pl', 'NLB': 'en', 'SKMASNL': 'sk', 'ARBABN': 'es', 'J9U': 'he', 'GRATEVE': 'el', }
[docs] def getid(self, name, html): result = self.findbyre(r'>{}\|([^<>]+)'.format(name), html) if result: return result.replace(' ', '') else: return None
[docs] def findlanguagenames(self, html): languagenames = set() for section in self.findallbyre(r'(?s)<ns1:x\d+>(.*?)</ns1:x\d+>', html): for name in self.findallbyre(r'<ns1:subfield code="a">(.*?)<', section): for source in self.findallbyre(r'<ns1:s>(.*?)<', section): languagenames.add((self.sourcelanguage[source], name)) names = [name[1] for name in languagenames] for name in self.findallbyre(r'<ns1:subfield code="a">(.*?)<', html): if name not in names: languagenames.add(('en', name)) return languagenames
[docs] def findlanguagedescriptions(self, html): result = set() for section in self.findallbyre(r'(?s)<ns1:x\d+>(.*?)</ns1:x\d+>', html): for name in self.findallbyre(r'<ns1:subfield code="c">(.*?)<', section): for source in self.findallbyre(r'<ns1:s>(.*?)<', section): result.add((self.sourcelanguage[source], name)) names = [name[1] for name in result] for name in self.findallbyre(r'<ns1:subfield code="c">(.*?)<', html): if name not in names: result.add(('en', name)) return result
[docs] def findgender(self, html): return self.findbyre(r'<ns1:gender>([^<>]+)</ns1:gender>', html, 'gender')
[docs] def findnationalities(self, html): section = self.findbyre(r'<ns1:nationalityOfEntity>(.*?)</ns1:nationalityOfEntity>', html) if section: return self.findallbyre(r'<ns1:text>([^<>]+)</ns1:text>', section, 'country') else: return None
[docs] def findlanguagesspoken(self, html): section = self.findbyre(r'<ns1:languageOfEntity>(.*?)</ns1:languageOfEntity>', html) if section: return self.findallbyre(r'<ns1:text>([^<>]+)</ns1:text>', section, 'language') else: return None
[docs] def findoccupations(self, html): sections = self.findallbyre(r'<ns1:occupation>(.*?)</ns1:occupation>', html) section = '\n'.join(sections) return self.findallbyre(r'<ns1:text>(.*?)</ns1:text>', section, 'occupation')
[docs] def findworkfields(self, html): sections = self.findallbyre(r'<ns1:fieldOfActivity>(.*?)</ns1:fieldOfActivity>', html) section = '\n'.join(sections) return self.findallbyre(r'<ns1:text>(.*?)</ns1:text>', section, 'subject')
[docs] def findmixedrefs(self, html): result = [ ('P214', self.findbyre(r'<ns0:directto>(\d+)<', html)), ('P227', self.getid('DNB', html)), ('P244', self.getid('LC', html)), ('P245', self.getid('JPG', html)), ('P269', self.getid('SUDOC', html)), ('P271', self.getid('NII', html)), ('P349', self.getid('NDL', html)), ('P396', self.getid('ICCU', html)), ('P409', self.getid('NLA', html)), ('P691', self.getid('NKC', html)), ('P906', self.getid('SELIBR', html)), ('P949', self.getid('NLI', html)), ('P950', self.getid('BNE', html)), ('P1005', self.getid('PTBNP', html)), ('P1006', self.getid('NTA', html)), ('P1015', self.getid('BIBSYS', html)), ('P1017', self.getid('BAV', html)), ('P1207', self.getid('NUKAT', html)), ('P1255', self.getid('SWNL', html)), ('P1273', self.getid('BNC', html)), ('P1309', self.getid('EGAXA', html)), ('P1368', self.getid('LNB', html)), ('P1375', self.getid('NSK', html)), ('P1670', self.getid('LAC', html)), ('P1695', (self.getid('NLP', html) or '').upper() or None), # ('P1946', self.getid('N6I', html)), #obsolete ('P2163', self.getid('FAST', html)), # ('P3065', self.getid('RERO', html)), ('P3280', self.getid('B2Q', html)), ('P3348', self.getid('GRATEVE', html)), ('P3846', self.getid('DBC', html)), ('P4619', self.getid('BLBNB', html)), ('P5034', self.getid('KRNLK', html)), ('P5504', self.getid('DE663', html)), ('P7293', self.getid('PLWABN', html)), ('P7369', (self.getid('BNCHL', html) or '')[-9:] or None), ('P8034', (self.getid('BAV', html) or '').replace('_', '/') or None), ('P268', self.findbyre(r'"http://catalogue.bnf.fr/ark:/\d+/cb(\w+)"', html)), ('P1566', self.findbyre(r'"http://www.geonames.org/(\w+)"', html)), ] iccu = self.getid('ICCU', html) if iccu: result += [('P396', r'IT\ICCU\{}\{}'.format(iccu[:4], iccu[4:]))] result += self.finddefaultmixedrefs(html) return result
[docs] def findisni(self, html): return self.getid('ISNI', html)
[docs] def findnotableworks(self, html): works = self.findallbyre(r'<ns1:work>(.*?)</ns1:work>', html) works = [(len(re.findall('(<ns1:s>)', work)), work) for work in works] works.sort(reverse=True) works = works[:5] works = [work for work in works if work[0] > 2] return [self.findbyre(r'<ns1:title>(.*?)<', work[1], 'work') for work in works]
[docs]class GndAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P227' self.dbid = 'Q36578' self.dbname = 'Gemeinsame Normdatei' self.urlbase = 'https://portal.dnb.de/opac.htm?method=simpleSearch&cqlMode=true&query=nid%3D{id}' self.hrtre = '(<table id="fullRecordTable".*?/table>)' self.language = 'de' self.escapehtml = True
[docs] def finddescriptions(self, html): return [ self.findbyre(r'(?s)<strong>Weitere Angaben</strong>.*?<td[^<>]*>(.*?)</td>', html), self.findbyre(r'(?s)<strong>Systematik</strong>.*?<td[^<>]*>\s*[^\s]+(.*?)</td>', html), self.findbyre(r'(?s)<strong>Beruf\(e\)</strong>.*?<td[^<>]*>(.*?)</td>', html), ]
[docs] def findlongtext(self, html): return re.sub(r'\s', ' ', self.findbyre(r'(?s)(<table id="fullRecordTable" .*?</table>)', html) or ''). \ replace('<tr>', '\n')
[docs] def findnames(self, html): result = [] section = self.findbyre(r'(?s)<strong>Sachbegriff</strong>.*?(<td.*?>(.*?)</td>)', html) if section: result += self.findallbyre(r'>([^<>\(]*)', section) section = self.findbyre(r'(?s)<strong>Person</strong>.*?(<td.*?>(.*?)</td>)', html) if section: result += self.findallbyre(r'>([^<>\(]*)', section) section = self.findbyre(r'(?s)<strong>Synonyme</strong>.*?(<td.*?>(.*?)</td>)', html) if section: result += self.findallbyre(r'>([^<>\(]*)', section) section = self.findbyre(r'(?s)<strong>Andere Namen</strong>.*?(<td.*?>(.*?)</td>)', html) if section: result += self.findallbyre(r'>([^<>\(]*)', section) return result
[docs] def findinstanceof(self, html): result = self.findbyre(r'(?s)<strong>Typ</strong>.*?<td.*?>(.*?)(?:\(|</)', html, 'instanceof') if not result and '<strong>Person</strong>' in html: result = 'Q5' self.isperson = result == 'Q5' return result
[docs] def findbirthdate(self, html): return self.findbyre(r'(?s)Lebensdaten:([^<>]*?)-', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'(?s)Lebensdaten:[^<>]*?-([^<>\(\)]*)', html)
[docs] def findnationalities(self, html): if self.isperson: section = self.findbyre(r'(?s)<strong>Land</strong>.*?<td.*?>(.*?)</td>', html) if section: return self.findallbyre(r'([\w\s]+)\(', section, 'country')
[docs] def findcountries(self, html): if not self.isperson: section = self.findbyre(r'(?s)<strong>Land</strong>.*?<td.*?>(.*?)</td>', html) if section: return self.findallbyre(r'([\w\s]+)\(', section, 'country')
[docs] def findbirthplace(self, html): return self.findbyre(r'(?s)Geburtsort:\s*(?:<[^<>]*>)?([^<>&]*)', html, 'city') or\ self.findbyre(r'(?s)([\s\w]+)\(Geburtsort\)', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre(r'(?s)Sterbeort:\s*(?:<[^<>]*>)?([^<>&]*)', html, 'city')
[docs] def findworkplaces(self, html): return (self.findallbyre(r'(?s)Wirkungsort:\s*(?:<[^<>]*>)?([^<>]*)\(\d{3}', html, 'city') or self.findallbyre(r'(?s)Wirkungsort:\s*(?:<[^<>]*>)?([^<>]*)', html, 'city')) \ + self.findallbyre(r'(?s)([\s\w]+)\(Wirkungsort\)', html, 'city')
[docs] def findoccupations(self, html): result = [] sectionfound = False for sectionname in [r'Beruf\(e\)', r'Funktion\(en\)', 'Weitere Angaben']: if sectionname == 'Weitere Angaben' and sectionfound: continue section = self.findbyre(r'(?s)<strong>{}</strong>(.*?)</tr>' .format(sectionname), html) if section: sectionfound = True result += self.findallbyre(r'(?s)[>;,]([^<>;,]*)', section, 'occupation') return result
[docs] def findgender(self, html): return self.findbyre(r'(?s)<strong>Geschlecht</strong>.*?>([^<>]+)</td', html, 'gender')
[docs] def findinstruments(self, html): section = self.findbyre(r'(?s)<strong>Instrumente.*?<td[^<>]*>(.*?)</td>', html) if section: section = self.TAGRE.sub('', section) section = re.sub(r'(?s)(\([^()]*\))', ';', section) return self.findallbyre(r'(?s)([\s\w]+)', section, 'instrument')
[docs] def findvoice(self, html): section = self.findbyre(r'(?s)<strong>Instrumente.*?<td[^<>]*>(.*?)</td>', html) if not section: return None if '(' in section: return self.findbyre(r'(?s)([\s\w]+)\(', section, 'voice') return self.findbyre(r'(?s)([\s\w]+)', section, 'voice')
[docs] def findlanguagesspoken(self, html): if self.isperson: section = self.findbyre(r'(?s)<strong>Sprache.*?<td[^<>]*>(.*?)</td>', html) if section: return self.findallbyre(r'([^{});]*)\(', section, 'language')
[docs] def finddegrees(self, html): section = self.findbyre(r'(?s)Akademischer Grad.*?<td[^<>]*>(.*?)</td>', html) if section: return self.findallbyre(r'([^<>]+)', section, 'degree')
[docs] def findsiblings(self, html): section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html) if section: return self.findallbyre(r'(?s)([^<>]*)(?:</a> )?\((?:Bruder|Schwester)\)', section, 'person')
[docs] def findspouses(self, html): section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html) if section: return self.findallbyre(r'(?s)([^<>]*)(?:</a> )?\((?:Ehemann|Ehefrau)\)', section, 'person')
[docs] def findchildren(self, html): section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html) if section: return self.findallbyre(r'(?s)([^<>]*)(?:</a> )?\((?:Sohn|Tochter)\)', section, 'person')
[docs] def findfather(self, html): section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html) if section: return self.findbyre(r'(?s)([^<>]*)(?:</a> )?\(Vater\)', section, 'person')
[docs] def findmother(self, html): section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html) if section: return self.findbyre(r'(?s)([^<>]*)(?:</a> )?\(Mutter\)', section, 'person')
[docs] def findpseudonyms(self, html): section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html) if section: return [self.findbyre(r'Pseudonym: <a[^<>]*>(.*?)<', section)]
[docs] def findwebsite(self, html): return self.findbyre(r'Homepage[^<>]*<a[^<>]*href="(.*?)"', html)
[docs] def findwebpages(self, html): return self.findallbyre(r'Internet[^<>]*<a[^<>]*href="(.*?)"', html)
[docs] def findworkfields(self, html): result = self.findallbyre(r'(?s)Fachgebiet:(.*?)<', html, 'subject') sections = self.findallbyre(r'(?s)<strong>Thematischer Bezug</strong>.*?(<td.*?</td>)', html) for section in sections: subjects = self.findallbyre(r'>([^<>]*)<', section) for subject in subjects: if ':' in subject: result += self.findallbyre(r'([\w\s]+)', subject[subject.find(':') + 1:], 'subject') else: result += self.findallbyre(r'(.+)', subject, 'subject') return result
[docs] def findemployers(self, html): section = self.findbyre(r'(?s)<strong>Beziehungen zu Organisationen</strong>.*?(<td.*?</td>)', html) if section: return self.findallbyre(r'(?s)[>;]([^<>;]*)[<;]', section, 'employer', alt=['university']) return self.findallbyre(r'Tätig an (?:d\w\w )?([^<>;]*)', html, 'employer', alt=['university'])
[docs] def findsources(self, html): section = self.findbyre(r'(?s)<strong>Quelle</strong>.*?<td[^<>]*(>.*?<)/td>', html) if section: subsections = self.findallbyre(r'>([^<>]*)<', section) result = [] for subsection in subsections: result += self.findallbyre(r'([^;]+)', subsection, 'source') return result
[docs] def findmemberships(self, html): section = self.findbyre(r'(?s)<strong>Beziehungen zu Organisationen</strong>.*?(<td.*?</td>)', html) if section: return self.findallbyre(r'>([^<>]*)</a>', section, 'organization', skips=['religious order', 'employer', 'university'])
[docs] def findrelorder(self, html): section = self.findbyre(r'(?s)<strong>Beziehungen zu Organisationen</strong>.*?(<td.*?</td>)', html) if section: return self.findbyre(r'>([^<>]*)</a>', section, 'religious order', skips=['organization', 'employer', 'university'])
[docs] def findfloruit(self, html): return self.findbyre(r'(?s)Wirkungsdaten:(.*?)<', html)
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html)
[docs]class LcAuthAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P244' self.dbid = 'Q13219454' self.dbname = 'Library of Congress Authorities' # self.urlbase = None self.hrtre = '(<h1.*?)<h3>(?:Editorial Notes|Change Notes|Sources|Alternate Formats)' self.language = 'en' self.escapehtml = True
@property def url(self): if self.isperson: return 'http://id.loc.gov/authorities/names/{id}.html'.format( id=self.id) if self.id.startswith('s'): return 'http://id.loc.gov/authorities/subjects/{id}.html'.format( id=self.id) return None @property def isperson(self): return self.id.startswith('n')
[docs] def findinstanceof(self, html): return self.findbyre(r'MADS/RDF ([^<>]+)', html, 'instanceof')
[docs] def findnames(self, html): section = self.findbyre(r'(?s)<h3>Variants</h3><ul[^<>]*>(.*?)</ul>', html) if section: result = self.findallbyre(r'>([^<>]*)?(?:,[\s\d\-]+)<', section) else: result = [] return result + \ self.findallbyre(r'skos:prefLabel">(.*?)(?:</|, \d)', html) + \ self.findallbyre(r'skosxl:literalForm">(.*?)(?:<|, \d)', html)
[docs] def finddescriptions(self, html): result = [self.findbyre(r'<title>([^<>]*)-', html)] section = self.findbyre(r'(?s)<h3>Sources</h3>(.*?)</ul>', html) if section: result += self.findallbyre(r'\(([^<>]*?)\)', section) return result
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<h3>Sources</h3>(.*?)</ul>', html)
[docs] def findfirstname(self, html): if self.isperson: return self.findbyre(r'<h1[^<>]*>[^<>]*?,\s*(\w*)', html, 'firstname')
[docs] def findlastname(self, html): if self.isperson: return self.findbyre(r'h1[^<>]*>([^<>]*?),', html, 'lastname')
[docs] def findbirthdate(self, html): result = self.findbyre(r'<li><h3>Birth Date</h3><ul[^<>]*>(\d{8})<', html) if result: return '{}-{}-{}'.format(result[6:], result[4:6], result[:4]) result = self.findbyre(r'(?s)Birth Date</h3><.*?>(?:\(.*?\))?([^<>]*?)</ul>', html) or \ self.findbyre(r'[\s\(]b\.\s+([\w\-/]+)', html) or \ self.findbyre(r'skos:prefLabel">[^<>]*, (\d+)-', html) if result and '[' not in result: m = re.match(r'(\d+)[/\-](\d+)[/\-](\d+)', result) if m: result = '{}-{}-{}'.format( m.group(2), m.group(1), m.group(3) if len(m.group(3)) > 2 else '19' + m.group(3) ) return result
[docs] def finddeathdate(self, html): result = self.findbyre(r'<li><h3>Death Date</h3><ul[^<>]*>(\d{8})<', html) if result: return '{}-{}-{}'.format(result[6:], result[4:6], result[:4]) result = self.findbyre(r'(?s)Death Date</h3><.*?>(?:\(.*?\))?([^<>]*?)</ul>', html) or \ self.findbyre(r'skos:prefLabel">[^<>]*, \d+-(\d+)', html) if result and '[' not in result: m = re.match(r'(\d+)[/\-](\d+)[/\-](\d+)', result) if m: result = '{}-{}-{}'.format( m.group(2), m.group(1), m.group(3) if len(m.group(3)) > 2 else '19' + m.group(3) ) return result
[docs] def findbirthplace(self, html): return self.findbyre( r'(?s)Birth Place</h3><.*?>(?:\([^<>]*\))?([^<>]+)\s*(?:\([^<>]*\))?\s*</?[au]', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre( r'(?s)Death Place</h3><.*?>(?:\([^<>]*\))?([^<>]+)\s*(?:\([^<>]*\))?\s*</?[au]', html, 'city')
[docs] def findgender(self, html): return self.findbyre(r'(?s)Gender</h3><.*?>([^<>]*)(?:<[^<>]*>|\s)*</ul>', html, 'gender')
[docs] def findoccupations(self, html): section = self.findbyre(r'(?s)Occupation</h3>(.*?)<h3', html) if section: return self.findallbyre(r'>([^<>]+)</a>', section, 'occupation')
[docs] def findrelorder(self, html): section = self.findbyre(r'(?s)Affiliation</h3>.*?(<ul.*?</ul>)', html) if section: for result in self.findallbyre(r'>([^<>]+)</a', section, 'religious order', skips=['employer', 'university']): if result: return result
[docs] def findemployers(self, html): section = self.findbyre(r'(?s)Affiliation</h3>.*?(<ul.*?</ul>)', html) if section: return self.findallbyre(r'>([^<>]+)</a', section, 'employer', alt=['university'])
[docs] def findlanguagesspoken(self, html): if self.isperson: sections = self.findallbyre(r'(?s)Associated Language[^<>]*</h3>.*?(<ul.*?</ul>)', html) result = [] for section in sections: result += self.findallbyre(r'>([^<>]+)</a', section, 'language') return result
[docs] def findworkfields(self, html): section = self.findbyre(r'(?s)Field of Activity</h3>.*?(<ul.*?</ul>)', html) if section: return self.findallbyre(r'>([^<>]+)</a', section, 'subject')
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html, includesocial=False)
[docs]class UlanAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P245' self.dbid = 'Q2494649' self.dbname = 'ULAN' self.urlbase = 'https://www.getty.edu/vow/ULANFullDisplay?find=&role=&nation=&subjectid={id}' self.hrtre = '(Record Type:.*?)Sources and Contributors:' self.language = 'en'
[docs] def finddescription(self, html): return self.findbyre(r'(?s)<SPAN CLASS=page>.*?</B>\s*\((.*?)\)', html)
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<B>Note:\s*</B>(.*?)</', html)
[docs] def findnames(self, html): section = self.findbyre(r'<B>Names:</B>.*<TR>(.*?)</TABLE>', html) if section: return self.findallbyre(r'<B>(.*?)<', section)
[docs] def findinstanceof(self, html): result = self.findbyre(r'Record Type:.*?>(.*?)<', html, 'instanceof') self.isperson = result == 'Q5' return result
[docs] def findlastname(self, html): if self.isperson: return self.findbyre(r'(?s)<SPAN CLASS=page><B>([^<>]*?),', html, 'lastname')
[docs] def findfirstname(self, html): if self.isperson: return self.findbyre(r'(?s)<SPAN CLASS=page><B>[^<>]*?,\s*([\w\-]+)', html, 'firstname')
[docs] def findnationality(self, html): if self.isperson: return self.findbyre(r'(?s)Nationalities:.*<SPAN CLASS=page>([^<>]*)\(', html, 'country')
[docs] def country(self, html): if not self.isperson: return self.findbyre(r'(?s)Nationalities:.*<SPAN CLASS=page>([^<>]*)\(', html, 'country')
[docs] def findoccupations(self, html): if self.isperson: section = self.findbyre(r'(?s)>Roles:<.*?<TR>(.*?)</TABLE>', html) if section: return self.findallbyre(r'>([^<>\(\)]+)[<\(]', section, 'occupation')
[docs] def findgender(self, html): return self.findbyre(r'Gender:<.*?>(.*?)<', html, 'gender')
[docs] def findbirthplace(self, html): return self.findbyre(r'Born:.*?>([^<>]*)\(', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre(r'Died:.*?>([^<>]*)\(', html, 'city')
[docs] def findlocation(self, html): if not self.isperson: return self.findbyre(r'location:.*?<A.*?>([^<>]*)\(', html, 'city')
[docs] def findbirthdate(self, html): if self.isperson: result = self.findbyre(r'</B>\s*\([^<>]*,([^<>]*)-', html) if result and 'ctive' not in result: return result
[docs] def finddeathdate(self, html): if self.isperson: part = self.findbyre(r'</B>\s*\([^<>]*,([^<>]*-[^<>\)]*)', html) if part and 'ctive' not in part: return self.findbyre(r'-([^<>\)]*)', part)
[docs] def findworkplaces(self, html): return self.findallbyre(r'>active:(?:\s|&nbsp;|<[^<>]*>)*([^<>]*)\(', html, 'city')
[docs] def findchildren(self, html): return self.findallbyre(r'(?s)>parent of.*?<A[^<>]*>(.*?)<', html, 'person')
[docs] def findfather(self, html): result = self.findallbyre(r'(?s)>child of.*?<A[^<>]*>(.*?)<', html, 'male-person') if result: return result[0]
[docs] def findmother(self, html): result = self.findallbyre(r'(?s)>child of.*?<A[^<>]*>(.*?)<', html, 'female-person') if result: return result[0]
[docs] def findsiblings(self, html): return self.findallbyre(r'(?s)>sibling of.*?<A[^<>]*>(.*?)<', html, 'person')
[docs] def findstudents(self, html): return self.findallbyre(r'(?s)>teacher of.*?<A[^<>]*>(.*?)<', html, 'artist')
[docs] def findteachers(self, html): return self.findallbyre(r'(?s)>sibling of.*?<A[^<>]*>(.*?)<', html, 'artist')
[docs]class BnfAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P268' self.dbid = 'Q19938912' self.dbname = 'Bibliothèque nationale de France' self.urlbase = 'http://catalogue.bnf.fr/ark:/12148/cb{id}' self.hrtre = '(<div class="notice" id="ident">.*?)<div class="notice line"' self.language = 'fr' self.escapehtml = True
[docs] def finddescriptions(self, html): return self.findallbyre(r'<meta name="DC.subject" lang="fre" content="(.*?)"', html)
[docs] def findnames(self, html): return self.findallbyre(r'<span class="gras">(.*?)<', html)
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<div[^<>]*"description">(.*?)</div>', html)
[docs] def findinstanceof(self, html): self.isperson = 'Notice de personne' in html if self.isperson: return 'Q5' # else return self.findbyre(r'(?s)Type de[^<>]+:.*?>([^<>]*)</', html, 'instanceof')
[docs] def findnationality(self, html): if self.isperson: return self.findbyre(r'(?s)Pays[^<>]*:.*?<span.*?>(.*?)</', html, 'country')
[docs] def findcountry(self, html): if not self.isperson: return self.findbyre(r'(?s)Pays[^<>]*:.*?<span.*?>(.*?)</', html, 'country')
[docs] def findlanguagesspoken(self, html): if self.isperson: result = [] section = self.findbyre(r'(?s)Langue\(s\).*?(<.*?>)\s*</div>', html) if section: section = section.replace('ancien ', 'ancien###') section = self.TAGRE.sub(' ', section) section = section.replace('###', ' ') result = self.findallbyre(r'([\w\s&;]{3,})', section, 'language') result += self.findallbyre(r'aussi(?: écrit)? en ([\w]+)', html, 'language') result += self.findallbyre(r'aussi(?: écrit)? en [\w\s]+ et en ([\w]+)', html, 'language') result += self.findallbyre(r'[tT]radu(?:cteur|it) du (.+?) en ', html, 'language') result += self.findallbyre(r'[tT]radu(?:cteur|it) .+? en ([\w\s]+)', html, 'language') return result
[docs] def findgender(self, html): return self.findbyre('(?s)Sexe[^<>]+:.*?<span.*?>(.*?)</', html, 'gender')
[docs] def findbirthdate(self, html): section = self.findbyre(r'(?s)Naissance.*?(<.*?>)\s*</div>', html) if section: result = self.findbyre(r'>([^<>]+?),', section) or self.findbyre(r'>([^<>]+?)</', section) if result and '..' not in result and re.search(r'\d{4}', result): return result return None
[docs] def findbirthplace(self, html): section = self.findbyre(r'(?s)Naissance.*?(<.*?>)\s*</div>', html) if section: result = self.findbyre(',([^<>]+)<', section, 'city') if not result: result = self.findbyre(r'Née? à ([\w\s]+)', html, 'city') return result
[docs] def finddeathdate(self, html): section = self.findbyre(r'(?s)Mort[^<>]*:.*?(<.*?>)\s*</div>', html) if section: result = self.findbyre(r'>([^<>]+?),', section) or self.findbyre(r'>([^<>]+?)</', section) if result and re.search(r'\d{4}', result): return result
[docs] def finddeathplace(self, html): section = self.findbyre(r'(?s)Mort[^<>]*:.*?(<.*?>)\s*</div>', html) if section: return self.findbyre(r',([^<>]+)<', section, 'city')
[docs] def findisni(self, html): return self.findbyre(r'ISNI ([\d\s]*)', html) or self.findbyre(r'isni/(\w+)', html)
[docs] def findoccupations(self, html): section = self.findbyre(r'(?s)"description">\s*<span[^<>]*>(.*?)</span>', html) if section: result = [] texts = [] for subsection in section.split(' et '): texts += self.findallbyre(r'(\w[\-\s\w&\']+)', subsection) for text in texts[:8]: result.append(self.findbyre(r'(.+)', text, 'occupation')) return result
[docs] def findworkfields(self, html): return self.findallbyre(r"[Pp]rofesseur d[eu']([\w\s]+)? [àa]u?x? ", html, 'subject') + \ self.findallbyre(r"[Ss]pécialiste d[eu']s?([\w\s]+) [àa]u?x? ", html, 'subject') + \ self.findallbyre(r'[Ss]pécialisée? en ([\w\s]+) [àa]u?x? ', html, 'subject') + \ self.findallbyre(r"[Pp]rofesseur d[eu']([\w\s]+)", html, 'subject') + \ self.findallbyre(r"[Ss]pécialiste d[eu']s?([\w\s]+)", html, 'subject') + \ self.findallbyre(r'[Ss]pécialisée? en ([\w\s]+)', html, 'subject')
[docs] def findemployers(self, html): sections = self.findallbyre(r'En poste\s*:(.*?)[\(<]', html) result = [] for section in sections: result += self.findallbyre(r'([^;]*)', section, 'employer', alt=['university']) return result
[docs]class SudocAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P269' self.dbid = 'Q47757534' self.dbname = 'SUDOC' self.urlbase = 'https://www.idref.fr/{id}' self.hrtre = '(<div id="editzone">.*?)<p>Informations sur la notice</p>' self.language = 'fr' self.escapehtml = True
[docs] def finddescriptions(self, html): return self.findallbyre(r'(?s)Notice de type</span>.*?([^<>]*)</span>', html) \ + self.findallbyre(r'(?s)<span class="detail_label">Note publique d\'information.*?"detail_value">(.*?)<', html)
[docs] def findnames(self, html): result = [] section = self.findbyre(r"(?s)<p>Point d'accès autorisé</p>(.*)<p>", html) if section: result += self.findallbyre(r'(?s)<b>(.*?)[\(<]', section) section = self.findbyre(r"(?s)<p>Variantes de point d'accès</p>(.*)<p>", html) if section: result += self.findallbyre(r'(?s)<b>(.*?)[\(<]', section) return result
[docs] def findlongtext(self, html): return '\n'.join(self.findallbyre(r'(?s)<span class="detail_value">(.*?)</span>', html))
[docs] def findinstanceof(self, html): return self.findbyre(r'(?s)Notice de type</span>.*?([^<>]*)</span>', html, 'instanceof')
[docs] def findlanguagesspoken(self, html): result = self.findallbyre("Traducteur de l['ea](.*?)vers", html, 'language') +\ self.findallbyre("Traducteur de .*? vers l['ea](.*?)<", html, 'language') section = self.findbyre(r'(?s)<span id="Langues" class="DataCoded">(.*?)</span>', html) if section: result += self.findallbyre(r'([\w\s\(\)]+)', section, 'language') return result
[docs] def findnationality(self, html): return self.findbyre(r'(?s)<span id="PaysISO3166" class="DataCoded">(.*?)</span>', html, 'country')
[docs] def findbirthdate(self, html): result = self.findbyre(r'(?s)Date de naissance[^<>]*</b><span[^<>]*>([^<>]*)<', html) if result: return ''.join([char for char in result if char in '0123456789-/'])
[docs] def finddeathdate(self, html): result = self.findbyre(r'(?s)Date de mort[^<>]*</b><span[^<>]*>([^<>]*)<', html) if result: return ''.join([char for char in result if char in '0123456789-/'])
[docs] def findgender(self, html): return self.findbyre(r'<span id="Z120_sexe" class="DataCoded">(.*?)</span>', html, 'gender')
[docs] def findisni(self, html): return self.findbyre(r'http://isni.org/isni/(\w+)', html)
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html)
[docs] def findbirthplace(self, html): return self.findbyre(r'ieu de naissance.? (.*?)[\.<>]', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre(r'ieu de décès.? (.*?)[\.<>]', html, 'city')
[docs] def findoccupations(self, html): sections = self.findallbyre(r'(?s)<div class="detail_chaqueNoteBio">.*?<span class="detail_value">(.*?)<', html) texts = [] for section in sections: for sectionpart in section.split(' et '): texts += self.findallbyre(r'([^\.,;]+)', sectionpart) return [self.findbyre(r'(.+)', text.strip().lstrip('-'), 'occupation') for text in texts[:8]]
[docs]class CiniiAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P271' self.dbid = 'Q10726338' self.dbname = 'CiNii' self.urlbase = 'https://ci.nii.ac.jp/author/{id}' self.hrtre = '(<div class="itemheading authordata">.*?)<div class="resultlist">' self.language = 'ja'
[docs] def findnames(self, html): section = self.findbyre(r'(?s)<h1[^<>]>(.*?)</h1>', html) or '' return self.findallbyre(r'(?s)<span>(.*?)(?:, b\. \d+)?\s*</span>', section) +\ self.findallbyre(r'"seefm">(.*?)(?:, b\. \d+)?\s*[<\((]', html)
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def findfirstname(self, html): return self.findbyre(r'(?s)<h1[^<>]*>[^<>]*<span>[^<>]*?,\s*([\w\-]+)', html, 'firstname')
[docs] def findlastname(self, html): return self.findbyre(r'(?s)<h1[^<>]*>[^<>]*<span>([^<>]+?),', html, 'lastname')
[docs] def findbirthdate(self, html): return self.findbyre(r', b\. (\d+)', html)
[docs]class ImdbAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P345' self.dbid = 'Q37312' self.dbname = 'Internet Movie Database' self.urlbase = None if self.isfilm: self.hrtre = '(<h1.*?)<h2>Frequently Asked Questions' elif self.isperson: self.hrtre = '(<h1.*?</table>)' self.language = 'en' self.escapeurl = True
@property def url(self): if self.isfilm: return 'https://www.imdb.com/title/{id}/'.format(id=self.id) if self.isperson: return 'https://www.imdb.com/name/{id}/'.format(id=self.id) return None @property def isfilm(self): return self.id.startswith('tt') @property def isperson(self): return self.id.startswith('nm')
[docs] def finddescription(self, html): result = self.findbyre(r'<meta name="description" content="(.*?)"', html) if result: return '.'.join(result.split('.')[:2])
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<div class="inline">(.*?)<', html)
[docs] def findnames(self, html): return [(self.findbyre(r'\'og:title\' content="(.*)"', html) or '').replace(' - IMDb', '')]
[docs] def findinstanceof(self, html): if self.isfilm: return 'Q11424' if self.isperson: return 'Q5' return None
[docs] def findorigcountry(self, html): if self.isfilm: return self.findbyre(r'(?s)Country:.*?>([^<>]+)</a>', html, 'country')
[docs] def findpubdate(self, html): if self.isfilm: return self.findbyre(r'span id="titleYear">\(\s*(?:<[^<>]*>)?(.*?)</', html)
[docs] def findmoviedirectors(self, html): section = self.findbyre(r'(?s)>Director:(<.*?</div>)', html) if section: return self.findallbyre(r'"name">([^<>]*)</span>', section, 'filmmaker')
[docs] def findscreenwriters(self, html): section = self.findbyre(r'(?s)>Writer:(<.*?</div>)', html) if section: return self.findallbyre(r'"name">([^<>]*)</span>', section, 'filmmaker')
[docs] def findcast(self, html): section = self.findbyre(r'(?s)>Credited cast:(<.*?</table>)', html) if section: return self.findallbyre(r'"name">([^<>]*)</span>', section, 'actor')
[docs] def findprodcoms(self, html): section = self.findbyre(r'(?s)>Production Co:(<.*?</div>)', html) if section: return self.findallbyre(r'"name">([^<>]*)</span>', section, 'filmcompany')
[docs] def findgenres(self, html): section = self.findbyre(r'(?s)>Genres:(<.*?</div>)', html) if section: return self.findallbyre(r'(?s)>([^<>]*)</a>', section, 'film-genre', alt=['genre'])
[docs] def findoriglanguages(self, html): section = self.findbyre(r'(?s)>Language:(<.*?</div>)', html) if section: return self.findallbyre(r'(?s)>([^<>]*)</a>', section, 'language')
[docs] def finddurations(self, html): section = self.findbyre(r'(?s)>Runtime:(<.*?</div>)', html) if section: return [self.findbyre(r'(?s)>([^<>]*)</time>', section)]
[docs] def findcolors(self, html): result = self.findbyre(r'(?s)>Color:.*?>([^<>]+)</a>', html, 'film-color') if result: return [result]
[docs] def findoccupations(self, html): section = self.findbyre(r'(?s)"jobTitle": (".*?"|\[.*?\])', html) if section: occupations = self.findallbyre(r'"(.*?)"', section, 'film-occupation', alt=['occupation']) return ['Q2526255' if result == 'Q3455803' else result for result in occupations]
[docs] def findbirthdate(self, html): return self.findbyre(r'"birthDate": "(.*?)"', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'"deathDate": "(.*?)"', html)
[docs] def findbirthplace(self, html): return self.findbyre(r'birth_place=(.*?)[&"]', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre(r'death_place=(.*?)[&"]', html, 'city')
[docs]class SbnAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P396' self.dbid = None self.dbname = 'SBN' self.urlbase = 'http://opac.sbn.it/opacsbn/opac/iccu/scheda_authority.jsp?bid={id}' self.hrtre = '(<tbody>.*?</tbody>)' self.language = 'it'
[docs] def findnames(self, html): result = [self.findbyre(r'(?s)Nome autore.*?<a .*?>(.*?)[<&\(]', html)] section = self.findbyre(r'(?s)Forme varianti.*?(<.*?)</tr>', html) if section: result += self.findallbyre(r'(?s)>([^<>]*)</div>', section) return result
[docs] def finddescription(self, html): return self.findbyre(r'(?s)Nota informativa.*?"detail_value">(.*?)<', html)
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)Nota informativa.*?"detail_value">(.*?)<', html)
[docs] def findinstanceof(self, html): return self.findbyre(r'(?s)Tipo autore.*?detail_value">(.*?)</td>', html, 'instanceof')
[docs] def findbirthdate(self, html): return self.findbyre(r'(?s)Datazione\s*</td>\s*<td[^<>]*>(?:[^<>]*,)?([^<>]*?)-', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'(?s)Datazione\s*</td>\s*<td[^<>]*>[^<>]*-(.*?)<', html)
[docs] def findoccupations(self, html): section = self.findbyre( r'(?s)Nota informativa.*?detail_value">([^<>]*?)\.', html) if not section: return None if ',' in section or ';' in section: return self.findallbyre(r'([^,;]+)', section, 'occupation') return self.findallbyre(r'(\w{3,})', section, 'occupation')
[docs] def findbirthplace(self, html): return self.findbyre(r'Nato ad? ([^<>]+) e morto', html, 'city') or \ self.findbyre(r'Nato ad? ([^<>]+?)[,\(\.]', html, 'city') or \ self.findbyre(r'Nato e morto ad? ([^<>,\(\.]+)', html, 'city') or \ self.findbyre(r'Nato ad? ([^<>\.]+)', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre(r'[mM]orto ad? ([^<>\.\(]+) nel', html, 'city') or \ self.findbyre(r'[mM]orto ad? ([^<>\.\(]+)', html, 'city')
[docs] def findlanguagesspoken(self, html): section = self.findbyre(r'Lingua.*?detail_value">(.*?)<', html) if section: return self.findallbyre(r'(\w{3,})', section, 'language')
[docs] def findisni(self, html): return self.findbyre(r'http://isni.org/isni/(\w+)', html)
[docs] def findrelorder(self, html): section = self.findbyre(r'(?s)Nota informativa.*?detail_value">([^<>]*?)\.', html) or '' if 'gesuita' in section.lower(): return 'Q36380' return None
[docs]class LibrariesAustraliaAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P409' self.dbid = None self.dbname = 'National Library of Australia' self.urlbase = 'https://librariesaustralia.nla.gov.au/search/display?dbid=auth&id={id}' self.hrtre = '<!--Record summary-->(.*?)<!--Record summary-->' self.language = 'en'
[docs] def finddescription(self, html): return self.findbyre(r'(?s)Heading:.*?">([^<>])*</a>', html)
[docs] def findnames(self, html): result = self.findallbyre(r'(?s)<title>([^<>]*?)(?:<|\(|\s-\s)', html) return [','.join(r.split(',')[:2]) for r in result]
[docs] def findbirthdate(self, html): result = self.findbyre(r'(?s)<dt>Birth:</dt>.*?<li>(.*?)-?</li>', html) if result: if 'approx' not in result and 'active' not in result: return result else: section = self.findbyre(r'(?s)<dt>Heading:</dt>.*?>([^<>]*)</a', html) if section and 'approx' not in section and 'active' not in section: result = self.findbyre(r',([^,]*)-', section) return result if result else section return None
[docs] def findbirthplace(self, html): result = self.findbyre(r'(?s)<dt>Birth:</dt>(?:\s|<[^<>]*>)*<li>[^<>]*</li>\s*<li>(.*?)</li>', html) if result: return self.getdata('city', result) return None
[docs] def finddeathdate(self, html): result = self.findbyre(r'(?s)<dt>Death:</dt>.*?<li>(.*?)</li>', html) if result: if 'approx' not in result: return result else: section = self.findbyre(r'(?s)<dt>Heading:</dt>.*?>([^<>]*)-?</a', html) if section: result = self.findbyre(r'-([^,\-]*)', section) if result and 'approx' not in result: return result
[docs] def findfirstname(self, html): section = self.findbyre(r'(?s)<dt>Heading:</dt>.*?>([^<>]*)</a', html) pywikibot.output(section) if section: return self.findbyre(r',\s*(\w+)', section, 'firstname')
[docs] def findlastname(self, html): section = self.findbyre(r'(?s)<dt>Heading:</dt>.*?>([^<>]*)</a', html) if section: return self.findbyre(r'([^,]*),', section, 'lastname')
[docs] def finddeathplace(self, html): result = self.findbyre(r'(?s)<dt>Death:</dt>(?:\s|<[^<>]*>)*<li>[^<>]*</li>\s*<li>(.*?)</li>', html) if result: return self.getdata('city', result)
[docs] def findoccupations(self, html): section = self.findbyre(r'(?s)<dt>Occupations:</dt>.*?<li>(.*?)</li>', html) if section: return self.findallbyre(r'(\w+)', section, 'occupation')
[docs] def findmixedrefs(self, html): result = self.findbyre(r'(?s)<dt>LC number:</dt>.*?<li>(.*?)</li>', html) if result: result = result.replace(' ', '') results = self.findallbyre(r'[a-z]+\d+', result) return [('P244', result) for result in results]
[docs]class MusicBrainzAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P434' self.dbid = 'Q19832969' self.dbname = 'MusicBrainz' self.urlbase = 'https://musicbrainz.org/artist/{id}' self.urlbase3 = 'https://musicbrainz.org/artist/{id}/relationships' self.hrtre = '(<h2 class="artist-information">.*?)<div id="footer">' self.language = 'en'
[docs] def finddescription(self, html): return self.findbyre(r'<div class="wikipedia-extract-body wikipedia-extract-collapse"><p>(.+?)</p>', html)
[docs] def findnames(self, html): return self.findallbyre(r'(?s)<dd class="sort-name">(.*?)</dd>', html)
[docs] def findinstanceof(self, html): result = self.findbyre(r'<dd class="type">(.*?)</dd>', html, 'instanceof') self.isperson = result == 'Q5' return result
[docs] def findinception(self, html): return self.findbyre(r'(?s)<dt>Founded:</dt>.*?<dd[^<>]*>(.*?)[<\(]', html)
[docs] def finddissolution(self, html): return self.findbyre(r'(?s)<dt>Dissolved:</dt>.*?<dd[^<>]*>(.*?)[<\(]', html)
[docs] def findformationlocation(self, html): if not self.isperson: return self.findbyre(r'(?s)<dt>Founded in:</dt>.*?<bdi>(\w+)', html, 'city') \ or self.findbyre(r'(?s)<dt>Founded in:</dt>.*?<bdi>(.*?)</bdi>', html, 'city') \ or self.findbyre(r'(?s)<dt>Area:</dt>.*?<bdi>(.*?)</bdi>', html, 'city')
[docs] def findorigcountry(self, html): if not self.isperson: return self.findbyre(r'(?s)<dt>Area:</dt>.*?<bdi>(.*?)</bdi>', html, 'country')
[docs] def findnationality(self, html): if self.isperson: return self.findbyre(r'(?s)<dt>Area:</dt>.*?<bdi>(.*?)</bdi>', html, 'country')
[docs] def findisni(self, html): return self.findbyre(r'/isni/(\w+)', html)
[docs] def findviaf(self, html): return self.findbyre(r'"https://viaf.org/viaf/(\w+)/?"', html)
[docs] def findwebsite(self, html): return self.findbyre(r'(?s)<th>offici.le website:.*?<bdi>(.*?)<', html) or \ self.findbyre(r'<li class="home-favicon"><a href="(.*?)">', html)
[docs] def findtwitter(self, html): return self.findbyre(r'<li class="twitter-favicon"><a href="[^"]*">@([^<>]*)</a>', html)
[docs] def findfacebook(self, html): return self.findbyre(r'<li class="facebook-favicon"><a href="https://www.facebook.com/([^/"]+)/?">', html)
[docs] def findgender(self, html): return self.findbyre(r'class="gender">(.*?)</', html, 'gender')
[docs] def findbirthdate(self, html): return self.findbyre(r'(?s)<dt>Born:</dt>.*?<dd[^<>]*>(.*?)[<\(]', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'(?s)<dt>Died:</dt>.*?<dd[^<>]*>(.*?)[<\(]', html)
[docs] def findbirthplace(self, html): section = self.findbyre(r'(?s)<dt>Born in:</dt>\s*(<dd.*?</dd>)', html) if section: return self.getdata('city', self.TAGRE.sub('', section))
[docs] def finddeathplace(self, html): section = self.findbyre(r'(?s)<dt>Died in:</dt>\s*(<dd.*?</dd>)', html) if section: return self.getdata('city', self.TAGRE.sub('', section)) section = self.findbyre(r'(?s)<h2>Genres</h2>(.*?)<h\d', html) if section: return self.findallbyre('>(.*?)<', section, 'music-genre', alt=['genre']) return None
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html, includesocial=False) + \ [('P4862', self.findbyre(r'<li class="amazon-favicon"><a href="[^"]*amazon[^"\?]*/(B\w+)[\?"]', html))] +\ [('P3453', result) for result in self.findallbyre(r'<dd class="ipi-code">(.*?)</dd>', html)]
[docs]class StructuraeAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P454' self.dbid = 'Q1061861' self.dbname = 'Structurae' self.urlbase = 'http://en.structurae.de/structures/data/index.cfm?ID={id}' self.hrtre = '(<h1.*?)Participants</h2>' self.language = 'en'
[docs] def finddescription(self, html): return self.findbyre(r'<meta name="Description" content="(.*?)"', html)
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<div class="js-acordion-body" id="notes">\s*<p>(.*?)</div>', html)
[docs] def findlanguagenames(self, html): return [(m[0], m[1].replace('-', ' ')) for m in re.findall(r'(?s)"alternate"[^<>]*hreflang="(\w+)"[^<>]*/([^<>"]*)">', html)]
[docs] def findnames(self, html): return [self.findbyre(r'(?s)<h1[^<>]*>(.*?)<', html), self.findbyre(r'(?s)Name in [^<>]*</th>[^<>]*<td>(.*?)<', html), ]
[docs] def findinstanceof(self, html): return 'Q41176'
[docs] def findinception(self, html): return self.findbyre(r'(?s)<th>Completion.*?>([^<>]+)</a>', html)
[docs] def finduse(self, html): return self.findbyre(r'(?s)Function / usage:.*?>([^<>]+)</a>', html, 'function')
[docs] def findlocation(self, html): return self.findbyre(r"(?s)itemprop='containedInPlace'.*?<strong>(.*?)</", html, 'city')
[docs] def findcountry(self, html): return self.findbyre(r"itemprop='containedInPlace'.*>([^<>]+)</span>", html, 'country')
[docs] def findaddress(self, html): return self.findbyre(r'itemprop="address">([^<>]+)</', html)
[docs] def findcoords(self, html): lat = self.findbyre(r'itemprop="latitude" content="(.*?)"', html) lon = self.findbyre(r'itemprop="longitude" content="(.*?)"', html) if lat and lon: return '{} {}'.format(lat, lon)
[docs] def findheights(self, html): return [self.findbyre(r'(?s)<td>height</td>.*<td>(.*?)</td>', html)]
[docs] def findfloorsabove(self, html): return self.findbyre(r'(?s)<td>number of floors \(above ground\)</td>.*<td>(.*?)</td>', html)
[docs]class SelibrAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P906' self.dbid = 'Q1798125' self.dbname = 'LIBRIS' self.urlbase = 'https://libris.kb.se/auth/{id}' # self.urlbase = None self.hrtre = '(.*)' self.language = 'en'
[docs] def finddescription(self, html): return self.findbyre(r'<h1>(.*?)</', html)
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<div class="bio">(.*?)</div>', html)
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def findviaf(self, html): return self.findbyre(r'http://viaf.org/viaf/(\w+)', html)
[docs] def findnames(self, html): return self.findallbyre(r'(?s)<h1[^<>]*>[^<>]*:([^<>]*?)[,<]', html)
[docs]class BneAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P950' self.dbid = None self.dbname = 'Biblioteca Nacional de España' self.urlbase = 'http://datos.bne.es/persona/{id}.html' self.hrtre = '(<h1.*?)<h3>Descarga en otros formatos' self.language = 'es'
[docs] def findnames(self, html): return self.findallbyre('<h3>(.*?)<', html)
[docs] def finddescriptions(self, html): return [ self.findbyre(r'"og:description" content="([^"]+),', html), self.findbyre(r'"og:description" content="Descubre ([^"]+),', html), self.findbyre(r'"og:description" content="([^"]+)"', html), self.findbyre(r'"og:title" content="(.+?)"', html), self.findbyre(r'(?s)class="bio">.*?<p>(.*?)</p>', html), ]
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<table class="table table-condensed table-responsive">(.*?)</table>', html)
[docs] def findlastname(self, html): return self.findbyre(r'<h1>([^<>]+),', html, 'lastname')
[docs] def findfirstname(self, html): return self.findbyre(r'<h1>[^<>]+,\s*([\w\-]+)', html, 'firstname')
[docs] def findbirthdate(self, html): result = self.findbyre(r'(?s)Año de nacimiento:\s*<span>(.*?)<', html) or \ self.findbyre(r'<h1>[^<>]+\((?:n\.\s*)?([^\)<>-]+?)[–\-\)]', html) if result and 'fl.' not in result and not result.strip().startswith('m.') and '1' in result: return result
[docs] def finddeathdate(self, html): result = self.findbyre(r'(?s)Año de fallecimiento:\s*<span>(.*?)<', html) if result: return result preresult = self.findbyre(r'<h1>(.*?)</h1>', html) if preresult and 'fl.' not in preresult: return self.findbyre(r'<h1>[^<>]+\([^<>]+[–\-]([^<>]+\d{4}[^<>]+)\)', html)
[docs] def findbirthplace(self, html): return self.findbyre(r'(?s)Lugar de nacimiento:\s*<span>(.*?)<', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre(r'(?s)Lugar de fallecimiento:\s*<span>(.*?)<', html, 'city')
[docs] def findviaf(self, html): return self.findbyre(r'"http://viaf.org/viaf/(\w+)/?"', html)
[docs] def findisni(self, html): return self.findbyre(r'"http://isni-url.oclc.nl/isni/(\w+)"', html)
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html)
[docs] def findoccupations(self, html): section = self.findbyre(r'(?s)<h4>Categoría profesional:(.*?)</h4>', html) if section: return self.findallbyre(r'([^<>,]*)', section, 'occupation') return None
[docs] def findworkfields(self, html): section = self.findbyre(r'(?s)<h4>Campo de actividad:(.*?)</h4>', html) if section: return self.findallbyre(r'([^<>,]*)', section, 'subject') return None
[docs] def findlanguagesspoken(self, html): section = self.findbyre(r'(?s)<h4>>Lengua:(.*?)</h4>', html) if section: return self.findallbyre(r'([^<>,])*', section, 'subject') return None
[docs]class OrcidAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P496' self.dbid = None self.dbname = 'Orcid' self.urlbase = 'https://orcid.org/{id}' self.language = 'en' self.hrtre = r'(<div class="workspace-section">.*?)</i>\s*Works\('
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def findnames(self, html): return self.findallbyre(r'(?s)"(?:full|other)-name">(.*?)<', html)
[docs] def finddescriptions(self, html): return [ self.findbyre(r'(?s)<div class="bio-content">(.*?)<', html), self.findbyre(r'(?s)<div class="bio-content">(.*?)</div>', html) ]
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<div class="bio-content">(.*?)</div>', html)
[docs] def findnationalities(self, html): return self.findallbyre(r'"country">(.*?)<', html, 'country')
[docs] def findschools(self, html): pywikibot.output('Check education and affiliations by hand!')
[docs]class CbdbAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P497' self.dbid = 'Q13407958' self.dbname = 'China Biographical Database' self.urlbase = 'https://cbdb.fas.harvard.edu/cbdbapi/person.php?id={id}' self.language = 'zh' self.hrtre = r'(<table style="font-size:smaller">.*?)<hr>'
[docs] def findlanguagenames(self, html): return [ ('en', self.findbyre(r'<b>索引/中文/英文名稱</b>:[^<>]*/([^<>]*)<', html)), ('zh', self.findbyre(r'<b>索引/中文/英文名稱</b>:[^<>]*?/([^<>]*)/', html)) ]
[docs] def findbirthdate(self, html): return self.findbyre(r'(?s)<b>生年</b>[^<>]*\(([^<>]*?)\)', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'(?s)<b>卒年</b>[^<>]*\(([^<>]*?)\)', html)
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)註.*?<td>(.*?)</td>', html)
[docs] def findnationalities(self, html): return [ self.findbyre(r'(?s)<b>生年</b>:\s*(.)', html, 'dynasty') or self.findbyre(r'(?s)<b>生年</b>:\s*(..)', html, 'dynasty'), self.findbyre(r'(?s)<b>卒年</b>:\s*(.)', html, 'dynasty') or self.findbyre(r'(?s)<b>卒年</b>:\s*(..)', html, 'dynasty') ]
[docs]class FindGraveAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P535' self.dbid = 'Q63056' self.dbname = 'Find a Grave' self.urlbase = 'https://www.findagrave.com/memorial/{id}' self.language = 'en' self.hrtre = r'(<h1.*?</table>)'
[docs] def getvalue(self, name, html, category=None): return self.findbyre(r'{}: "(.*?)"'.format(name), html, category)
[docs] def findnames(self, html): return [self.getvalue('shareTitle', html)]
[docs] def findlongtext(self, html): return self.findbyre(r'(?s) id="fullBio">(.*?)<', html)
[docs] def finddeathdate(self, html): return self.getvalue('deathDate', html) or \ self.findbyre(r'"deathDate">(.*?)<', html) or \ self.getvalue('deathYear', html)
[docs] def findburialplace(self, html): return self.getvalue('cemeteryName', html, 'cemetary') or \ self.getvalue('cemeteryCityName', html, 'city') or \ self.getvalue('locationName', html, 'city')
[docs] def findfirstname(self, html): return self.getvalue('firstName', html, 'firstname')
[docs] def findlastname(self, html): return self.getvalue('lastName', html, 'lastname')
[docs] def findbirthdate(self, html): return self.findbyre(r'"birthDate">(.*?)<', html) or \ self.getvalue('birthYear', html)
[docs] def findbirthplace(self, html): return self.findbyre(r'"birthPlace">(.*?)<', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre(r'"deathPlace">(.*?)<', html, 'city')
[docs] def findfather(self, html): result = self.getvalue('fatherName', html, 'person') if result: return result section = self.findbyre(r'(?s)>Ouders</b>(.*?)</ul>', html) if section: result = self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', section, 'male-person') result = [r for r in result if r] if result: return result[0]
[docs] def findmother(self, html): result = self.getvalue('motherName', html, 'person') if result: return result section = self.findbyre(r'(?s)>Ouders</b>(.*?)</ul>', html) if section: result = self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', section, 'female-person') result = [r for r in result if r] if result: return result[0]
[docs] def findspouses(self, html): result = self.findallbyre(r'sp\d+Name: "(.*?)"', html, 'person') if result: return result section = self.findbyre(r'(?s)>Partners</b>(.*?)</ul>', html) if section: return self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', section, 'person')
[docs] def findsiblings(self, html): section = self.findbyre(r'(?s)>Broer[^<>]*zus[^<>]*</b>(.*?)</ul>', html) if section: return self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', section, 'person')
[docs] def findchildren(self, html): section = self.findbyre(r'(?s)>Kinderen</b>(.*?)</ul>', html) if section: return self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', section, 'person')
[docs]class IpniAuthorsAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P586' self.dbid = 'Q922063' self.dbname = 'International Plant Names Index' self.urlbase = 'http://www.ipni.org/ipni/idAuthorSearch.do?id={id}' self.language = 'en' self.hrtre = '</h2>(.*?)<p>View the'
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def findnames(self, html): result = self.findallbyre(r'(?s)<h3>(.*?)[\(<]', html) section = self.findbyre(r'(?s)<h4>Alternative Names:\s*</h4(>.*?<)h/d', html) if section: result += self.findallbyre(r'(?)>([^<>]*)<', section) return result
[docs] def findlastname(self, html): return self.findbyre(r'(?s)<h3>([^<>]*?),', html, 'lastname')
[docs] def findfirstname(self, html): return self.findbyre(r'(?s)<h3>[^<>]*,\s*([\w\-]+)', html, 'firstname')
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<h4>Comment:\s*</h4>(.*?)<h\d', html)
[docs] def findbirthdate(self, html): return self.findbyre(r'(?s)<h3>[^<>]*\((\d+)-', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'(?s)<h3>[^<>]*\([^<>]*?-(\d+)\)', html)
[docs] def findmixedrefs(self, html): return [('P428', self.findbyre(r'(?s)<h4>Standard Form:\s*</h4>\s*<p>(.*?)<', html))]
[docs] def findworkfields(self, html): section = self.findbyre(r'(?s)<h4>Area of Interest:\s*</h4>\s*<p>(.*?)</p>', html) if section: return self.findallbyre(r'([^,]*)', section, 'subject')
[docs] def findsources(self, html): section = self.findbyre(r'(?s)<h4>Information Source:</h4>\s*<p>(.*?)</p>', html) if section: return self.findallbyre(r'([^,]*)', section, 'source')
[docs] def findnationalities(self, html): section = self.findbyre(r'(?s)<h4>Countries:\s*</h4>(.*?)(?:<h|<p>View)', html) if section: return self.findallbyre(r'(?s)>(.*?)<', section, 'country')
[docs]class GnisAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P590' self.dbid = None self.dbname = 'GNIS' self.urlbase = 'https://geonames.usgs.gov/apex/f?p=gnispq:3:::NO::P3_FID:{id}' self.language = 'en'
[docs] def findnames(self, html): return self.findbyre(r'Name:</td><td[^<>]*>(.*?)<', html)
[docs] def findinstanceof(self, html): return self.findbyre(r'Class:</td><td[^<>]*>(.*?)[<\(]', html, 'instanceof')
[docs] def findelevations(self, html): return [ self.findbyre(r'Elevation:</td><td[^<>]*>(\d+)/', html) + ' feet', self.findbyre(r'Elevation:</td><td[^<>]*>\d+/(\d+)', html) + ' m' ]
[docs] def findadminloc(self, html): return self.findbyre(r'"COUNTY_NAME">(.*?)<', html, 'county') or \ self.findbyre(r'"STATE_NAME">(.*?)<', html, 'county')
[docs] def findcountry(self, html): return 'Q30'
[docs] def findcoords(self, html): lat = self.findbyre(r'"LAT">(.*?)<', html) lon = self.findbyre(r'"LONGI">(.*?)<', html) if lat and lon: return '{} {}'.format(lat, lon)
[docs]class MathGenAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P549' self.dbid = 'Q829984' self.dbname = 'Mathematics Genealogy Project' self.urlbase = 'https://www.genealogy.math.ndsu.nodak.edu/id.php?id={id}' self.hrtre = '(<h2.*?)We welcome any additional information.' self.language = 'en' self.escapehtml = True
[docs] def findnames(self, html): return [self.findbyre(r'(?s)<h2[^<>]*>(.*?)<', html)]
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def finddegrees(self, html): return self.findallbyre(r'(?s)>\s*(Ph\.D\.)\s*<', html, 'degree')
[docs] def findschools(self, html): return self.findallbyre(r'(?s)>\s*Ph\.D\.\s*<[^<>]*>(.*?)<', html, 'university')
[docs] def findadvisors(self, html): return self.findallbyre(r'(?s)Advisor[^<>]*:[^<>]*<[^<>]*>(.*?)<', html, 'scientist')
[docs] def finddocstudents(self, html): section = self.findbyre(r'(?s)Students:.*?<table[^<>]*>(.*?)</table>', html) if not section: section = self.findbyre(r'(?s)<th>Descendants</th>(.*?)</table>', html) if section: return self.findallbyre(r'(?s)<a[^<>]*>(.*?)<', section, 'scientist')
[docs]class LeonoreAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P640' self.dbid = 'Q2886420' self.dbname = 'Léonore' self.urlbase = 'http://www2.culture.gouv.fr/public/mistral/leonore_fr?ACTION=CHERCHER&FIELD_1=COTE&VALUE_1={id}' self.hrtre = '(<TABLE VALIGN=TOP .*?</TABLE>)' self.language = 'fr' self._results = None self.escapeunicode = True
[docs] def getvalue(self, field, html, dtype=None): return self.findbyre( r'(?s)>\s*{}\s*<.*?<TD[^<>]*>(?:<[^<>]*>|\s)*([^<>]+)</' .format(field), html, dtype)
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def findnames(self, html): try: return [self.getvalue('Prénoms', html) + ' ' + self.getvalue('Nom', html).title()] except TypeError: return []
[docs] def findlastname(self, html): return self.getvalue('Nom', html, 'lastname')
[docs] def findfirstname(self, html): return self.getvalue('Prénoms', html, 'firstname')
[docs] def findbirthdate(self, html): return self.getvalue('Date de naissance', html)
[docs] def findbirthplace(self, html): return self.getvalue('Lieu de naissance', html, 'city')
[docs] def findgender(self, html): return self.getvalue('Sexe', html, 'gender')
[docs]class OpenLibraryAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P648' self.dbid = 'Q1201876' self.dbname = 'Open Library' self.urlbase = 'https://openlibrary.org/works/{id}' self.hrtre = '(<h1.*?)</div>' self.language = 'en'
[docs] def finddescription(self, html): return self.findbyre(r'description" content="(.*?)"', html)
[docs] def findnames(self, html): return self.findallbyre(r'<title>([^<>]*)\|', html) +\ self.findallbyre('itemprop="name">(.*?)<', html)
[docs] def findlongtext(self, html): return self.findbyre(r'<div id="contentBody">(.*?)<div class="clearfix">', html)
[docs] def findinstanceof(self, html): return self.findbyre('og:type" content="(.*?)"', html, 'instanceof')
[docs] def findbirthdate(self, html): return self.findbyre('<span itemprop="birthDate">(.*?)<', html)
[docs] def finddeathdate(self, html): return self.findbyre('<span itemprop="deathDate">(.*?)<', html)
[docs]class RkdArtistsAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P650' self.dbid = 'Q17299517' self.dbname = 'RKDartists' self.urlbase = 'https://rkd.nl/nl/explore/artists/{id}' self.hrtre = '(<div class="fieldGroup.*?)<script>' self.language = 'nl' self.escapehtml = True
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def finddescription(self, html): return self.findbyre(r'"og:description" content="(.*?)"', html)
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<div class="left">(.*?)<dt>Permalink</dt>', html)
[docs] def findnames(self, html): return [self.findbyre(r'(?s)itemprop="name">(.*?)<', html)] + \ [self.findbyre(r'(?s)<h2[^<>]*>(.*?)<', html)] + \ self.findallbyre(r'itemprop="alternateName">(.*?)<', html)
[docs] def findgender(self, html): return self.findbyre(r'(?s)itemprop="gender">(.*?)<', html, 'gender')
[docs] def findoccupations(self, html): section = self.findbyre(r'(?s)Kwalificaties\s*</dt>.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'">([^<>]+)</span>', section, 'occupation')
[docs] def findbirthplace(self, html): return self.findbyre(r'itemprop="birthPlace">([^<>]*),', html, 'city') or \ self.findbyre(r'itemprop="birthPlace">([^<>]*)<', html, 'city')
[docs] def findbirthdate(self, html): return self.findbyre(r'itemprop="birthDate">([^<>]*?)[</]', html)
[docs] def finddeathplace(self, html): return self.findbyre(r'itemprop="deathPlace">([^<>]*),', html, 'city') or \ self.findbyre(r'itemprop="deathPlace">([^<>]*)<', html, 'city')
[docs] def finddeathdate(self, html): return self.findbyre(r'itemprop="deathDate">([^<>]*?)[</]', html)
[docs] def findworkplaces(self, html): section = self.findbyre(r'(?s)Werkzaam in.*?<ul>(.*?)</ul>', html) if section: return self.findallbyre(r'>([^<>]+)</a>', section, 'city')
[docs] def findstudents(self, html): section = self.findbyre(r'(?s)Leraar van.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'>([^<>]*)</span>', section, 'artist')
[docs] def findteachers(self, html): section = self.findbyre(r'(?s)Leerling van.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'>([^<>]*)</span>', section, 'artist')
[docs] def findinfluences(self, html): section = self.findbyre(r'(?s)Be.nvloed door.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'>([^<>]*)</span>', section, 'artist')
[docs] def findschools(self, html): section = self.findbyre(r'(?s)<dt>\s*Opleiding\s*</dt>.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'>([^<>]+)</a>', section, 'university')
[docs] def findnationalities(self, html): return self.findallbyre(r'itemprop="nationality">(.*?)<', html, 'country')
[docs] def findgenres(self, html): return self.findallbyre(r'Onderwerpen\s*<em>(.*?)<', html, 'art-genre', alt=['genre'])
[docs] def findmovements(self, html): return self.findallbyre(r'Stroming\s*<em>(.*?)<', html, 'movement')
[docs] def findsiblings(self, html): return self.findallbyre(r'[bB]roer van ([^<>]*)', html, 'person') + \ self.findallbyre(r'[zZ]us(?:ter)? van ([^<>]*)', html, 'person')
[docs] def findfather(self, html): return self.findbyre(r'[zZ]oon van ([^<>]*)', html, 'male-person', skips=['female-person']) or \ self.findbyre(r'[dD]ochter van ([^<>]*)', html, 'male-person', skips=['female-person'])
[docs] def findmother(self, html): return self.findbyre(r'[zZ]oon van ([^<>]*)', html, 'female-person', skips=['male-person']) or \ self.findbyre(r'[dD]ochter van ([^<>]*)', html, 'female-person', skips=['male-person'])
[docs] def findmemberships(self, html): return self.findallbyre(r'Lid van[^<>]*<em>(.*?)<', html, 'organization')
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html, includesocial=False)
[docs] def findfloruit(self, html): return self.findbyre(r'(?s)<dt>\s*Werkzame periode\s*</dt>\s*<dd>(.*?)<', html)
[docs]class BiografischPortaalAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P651' self.dbid = 'Q1868372' self.dbname = 'Biografisch Portaal' self.urlbase = 'http://www.biografischportaal.nl/persoon/{id}' self.hrtre = '(<h1.*)<h2' self.language = 'nl'
[docs] def finddescription(self, html): return self.findbyre(r'(?s)<th>(geboren.*?)</table>', html)
[docs] def findnames(self, html): result = [self.findbyre(r'(?s)<title>(.*?)<', html)] section = self.findbyre(r'(?s)<th>alternatieve namen</th>(.*?)</tr>', html) if section: result += self.findallbyre('<li>(.*?)<', section) return result
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<div class="levensbeschrijvingen">(.*?)<!-- content end', html)
[docs] def findbirthplace(self, html): return self.findbyre(r'(?s)<th>geboren</th>[^<>]*<td>[^<>]*<span><br\s*/>([^<>]*)<', html, 'city')
[docs] def findbirthdate(self, html): result = self.findbyre(r'(?s)<th>geboren</th>[^<>]*<td>(.*?)<', html) if result and 'tussen' not in result: return result
[docs] def finddeathplace(self, html): return self.findbyre(r'(?s)<th>gestorven</th>[^<>]*<td>[^<>]*<span><br\s*/>([^<>]*)<', html, 'city')
[docs] def finddeathdate(self, html): result = self.findbyre(r'(?s)<th>gestorven</th>[^<>]*<td>(.*?)<', html) if result and 'tussen' not in result: return result
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html)
[docs] def findgender(self, html): return self.findbyre(r'(?s)<th>sekse</th>.*?<li>(.*?)<', html, 'gender')
[docs] def findsources(self, html): return self.findallbyre(r'(?s)<a class="external_link open_in_new_window"[^<>]*>(.*?)<', html, 'source')
[docs]class NkcrAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P691' self.dbid = 'Q13550863' self.dbname = 'NKC' self.urlbase = 'https://aleph.nkp.cz/F/?func=find-c&local_base=aut&ccl_term=ica={id}' self.language = 'cs' self.hrtre = '(<table width=100%>.*?)<script language='
[docs] def prepare(self, html): return html.replace('&nbsp;', ' ')
[docs] def getvalue(self, field, html, dtype=None): return self.findbyre(r'(?s)<td[^<>]*>\s*{}\s*</td>\s*<td[^<>]*>(?:<[^<>]*>)*(.*?)<'.format(field), html, dtype)
[docs] def findlongtext(self, html): return self.getvalue(r'Biogr\./Hist\. .daje', html)
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def findnames(self, html): result = [ self.getvalue('Z.hlav.', html), self.getvalue('Pseudonym', html) ] return [','.join(r.split(',')[:-1]) for r in result if r]
[docs] def finddescription(self, html): return self.getvalue(r'Biogr\./Hist\. .daje', html)
[docs] def findnationality(self, html): section = self.getvalue('Související zem.', html) or\ self.getvalue(r'Biogr\./Hist\. .daje', html) if section: return self.findbyre(r'(\w+)', section, 'country') return None
[docs] def findbirthdate(self, html): return self.findbyre(r'[Nn]arozena? ([\d\.\s]*\d)', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'[Zz]em.ela? ([\d\.\s]*\d)', html)
[docs] def findbirthplace(self, html): return self.findbyre(r'[Nn]arozena? [\d\.\s]* v ([\w\s]*)', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre(r'[Zz]em.ela [\d\.\s]* v ([\w\s]*)', html, 'city')
[docs] def findoccupations(self, html): section = self.getvalue(r'Biogr\./Hist\. .daje', html) if section: if 'special' in section: section = section[:section.find('special')] parts = section.split(' a ') result = [] for part in parts: result += self.findallbyre(r'([^\,\.;]*)', part, 'occupation') return result return None
[docs] def findrelorder(self, html): return self.getvalue(r'Související org\.', html, 'religious order')
[docs] def findlanguagesspoken(self, html): section = self.getvalue('Jazyk', html) if section: return self.findallbyre(r'([^;]+)', section, 'language')
[docs] def findworkfields(self, html): results = [] for regex in [ r'[oO]dborník v (.*?)[\.<]', r'[sS]pecial\w* (?:se )?(?:v|na) (.*?)[\.<]', r'[zZ]abývá se (.*?)[\.<]', r'Zaměřuje se na (.*?)[\.<]', r'[oO]boru (.*?)[\.<]', r'[zZ]aměřený na (.*?)[\.<]', ]: sections = self.findallbyre(regex, html) for section in sections: parts = section.split(' a ') for part in parts: if part.startswith('v '): part = part[2:] results += self.findallbyre(r'([\w\s]+)', part.replace(' v ', ' '), 'subject') return results
[docs]class DbnlAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P723' self.dbid = 'Q2451336' self.dbname = 'DBNL' self.urlbase = 'http://www.dbnl.org/auteurs/auteur.php?id={id}' self.language = 'nl' self.hrtre = '(<p><span class="label">.*?)<form class="mainsearchform"' self.escapehtml = True
[docs] def findnames(self, html): return [ self.findbyre(r'<title>(.*?)[&<·]', html), self.findbyre(r'"naam">(?:<[^<>]*>)*([^<>]+)<', html), self.findbyre(r'href="#naam">(.*?)<', html), ]
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<article[^<>]*>(.*?)</article>', html)
[docs] def findbirthdate(self, html): return self.findbyre(r'>geboren(?:<[^<>]*>)*<i>(.*?)<', html)
[docs] def findbirthplace(self, html): return self.findbyre(r'>geboren.*? te (?:<[^<>]*>)*([^<>]+)<', html, 'city')
[docs] def finddeathdate(self, html): return self.findbyre(r'>overleden(?:<[^<>]*>)*<i>(.*?)<', html)
[docs] def findburialdate(self, html): result = self.findbyre(r'(\d+ \w+ \(begraven\) \d+)', html) if result: return result.replace('(begraven) ', '') return None
[docs] def finddeathplace(self, html): return self.findbyre(r'>overleden<.*?> te (?:<[^<>]*>)*([^<>]+)<', html, 'city')
[docs] def findwebpages(self, html): result = [] section = self.findbyre(r'(?s)<section id="websites">.*?<table>(.*?)</table>', html) if section: result += self.findallbyre(r'>([^<>]*)</a>', section) section = self.findbyre(r'(?s)<h\d[^<>]*>Biografie[^<>]*(<.*?)</table>', html) if section: results = self.findallbyre(r'<a href="(.*?)"', section) result += ['https://www.dbnl.org/' + result.lstrip('/') for result in results] return result
[docs] def findsources(self, html): section = self.findbyre(r'(?s)<h\d[^<>]*>Biografie[^<>]*(<.*?)</table>', html) if section: return self.findallbyre(r'>([^<>]*)</a>', section, 'source')
[docs]class SikartAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P781' self.dbid = 'Q683543' self.dbname = 'SIKART' self.urlbase = 'http://www.sikart.ch/KuenstlerInnen.aspx?id={id}' self.language = 'de' self.hrtre = '<!-- content_start -->(.*?)<!-- content_end -->' self.escapehtml = True
[docs] def getvalue(self, field, html, dtype=None): return self.findbyre(r'(?s)>{}<.*?<div[^<>]*>(.*?)<' .format(field), html, dtype)
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def findnames(self, html): return [ self.findbyre(r'<title>([^<>]+?)-', html), self.findbyre(r'<h1>(.*?)<', html) ]
[docs] def finddescriptions(self, html): return [ self.getvalue('Vitazeile', html), self.getvalue('Vitazeile', html).split('.')[0] ]
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<!-- content_start -->(.*)<!-- content_end -->', html)
[docs] def findlastname(self, html): return self.findbyre(r'token.lastname=(\w+)', html, 'lastname')
[docs] def findfirstname(self, html): return self.findbyre(r'token.firstname=([\w\-]+)', html, 'firstname')
[docs] def findbirthdate(self, html): dates = self.getvalue('Lebensdaten', html) if dates: return self.findbyre(r'\*\s*([\d\.]+)', dates)
[docs] def findbirthplace(self, html): dates = self.getvalue('Lebensdaten', html) if dates: return self.findbyre(r'\*\s*[\d\.]+\s*(.*?),', dates, 'city')
[docs] def finddeathdate(self, html): dates = self.getvalue('Lebensdaten', html) if dates: return self.findbyre(r'†(?:\s|&nbsp;)*([\d\.]+)', dates)
[docs] def finddeathplace(self, html): dates = self.getvalue('Lebensdaten', html) if dates: return self.findbyre(r'†(?:\s|&nbsp;)*[\d\.]+(.*)', dates, 'city')
[docs] def findchoriginplaces(self, html): section = self.getvalue('Bürgerort', html) if section: return self.findallbyre(r'([\w\s\-]+)', section, 'city')
[docs] def findnationality(self, html): return self.getvalue('Staatszugehörigkeit', html, 'country')
[docs] def findoccupations(self, html): section = self.getvalue('Vitazeile', html) if section: result = [] splitter = 'et' if ' et ' in section else 'und' for subsection in section.split('.')[0].split(' {} ' .format(splitter)): result += self.findallbyre(r'([\w\s]+)', subsection, 'occupation') return result
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html)
[docs]class ImslpAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P839' self.dbid = 'Q523660' self.dbname = 'International Music Score Library Project' self.urlbase = 'https://imslp.org/wiki/{id}' self.isperson = self.id.startswith('Category:') self.hrtre = r'(<h\d.*?)<h2' self.language = 'nl'
[docs] def findinstanceof(self, html): if self.isperson: return 'Q5' raise NotImplementedError # analysis only made for persons
[docs] def findbirthdate(self, html): return self.findbyre(r'</h2>\(([^<>]*?)—', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'</h2>\([^<>]*?—([^<>]*?)\)', html)
[docs] def findlanguagenames(self, html): result = [('nl', x) for x in self.findallbyre(r'<h2>\s*<span[^<>]*>(.*?)</span>', html)] section = self.findbyre(r'Andere Namen/Transliteraties:(.*?)<', html) if section: parts = section.split(',') for part in parts: subparts = self.findallbyre(r'((?:[^,\(]|\([^\(\)]*\))*)', part) for subpart in subparts: if '(' in subpart: result += [(lang.strip(), subpart[:subpart.find('(')]) for lang in self.findbyre(r'\(.*?)\)', subpart).split(',')] else: result.append(('nl', subpart)) section = self.findbyre(r'Aliassen:(.*)', html) if section: parts = self.findallbyre(r'(<span.*?/span>', section) for part in parts: result += [(language.strip(), self.findbyre(r'>([^<>]*)</span>', part)) for language in self.findbyre(r'<span title="(.*?)">', part).split(',')] return result
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html)
[docs]class HdsAnalyzer(Analyzer):
[docs] def setup(self): self.id = '{:06d}'.format(int(self.id)) self.dbproperty = 'P902' self.dbid = 'Q642074' self.dbname = 'Historical Dictionary of Switzerland' self.urlbase = 'https://hls-dhs-dss.ch/de/articles/{id}/' self.hrtre = '(<h1.*?<!-- noindex -->)' self.language = 'de' self.escapeunicode = True
[docs] def finddescription(self, html): return self.findbyre(r'property="og:description" content="(.*?)"', html)
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)(<h1.*?<!-- noindex -->)', html)
[docs] def findnames(self, html): return [self.findbyre(r'(?s)<title>(.*?)<', html)]
[docs] def findfirstname(self, html): return self.findbyre(r'<span itemprop="givenName">(.*?)</span>', html, 'firstname')
[docs] def findlastname(self, html): return self.findbyre(r'<span itemprop="familyName">(.*?)</span>', html, 'lastname')
[docs] def findbirthdate(self, html): return self.findbyre(r'<span itemprop="birthDate">(.*?)</span>', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'<span itemprop="deathDate">(.*?)</span>', html)
[docs] def findbirthplace(self, html): return self.findbyre(r'<img alt="geboren"[^<>]*>\s*[^\s]*\s*([\w\s-]*)', html, 'city')
[docs] def finddeathplace(self, html): return self.findbyre(r'<img alt="gestorben"[^<>]*>\s*[^\s]*\s*([\w\s-]*)', html, 'city')
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html, includesocial=False)
[docs]class NtaAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P1006' self.dbid = None self.dbname = 'NTA' self.urlbase = 'http://data.bibliotheken.nl/doc/thes/p{id}' self.hrtre = '(<h1.*?)<div id="bnodes">' self.language = 'nl'
[docs] def finddescription(self, html): return self.findbyre(r'<h1><span>(.*?)<', html)
[docs] def findnames(self, html): result = [self.findbyre(r'(?s)<title>(.*?)<', html)] section = self.findbyre(r'(?s)alternateName</span>(.*?)<label', html) if section: result += self.findallbyre(r'(?s)<div class="fixed">(.*?)[&<]', html) return result
[docs] def findinstanceof(self, html): return self.findbyre(r'http://schema.org/(.*?)[&"\']', html, 'instanceof')
[docs] def finddeathdate(self, html): return self.findbyre(r'(?s)<span>deathDate</span>.*?<span.*?>(.*?)[&<]', html)
[docs] def findbirthdate(self, html): return self.findbyre(r'(?s)<span>birthDate</span>.*?<span.*?>(.*?)[&<]', html)
[docs] def findfirstname(self, html): return self.findbyre(r'(?s)<span>givenName</span>.*?<span.*?>(.*?)[&<]', html, 'firstname')
[docs] def findlastname(self, html): return self.findbyre(r'(?s)<span>familyName</span>.*?<span.*?>(.*?)[&<]', html, 'lastname')
[docs] def findviaf(self, html): return self.findbyre(r'http://viaf.org/viaf/(\d+)', html)
[docs]class PtbnpAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P1005' self.dbid = None self.dbname = 'Biblioteca Nacional de Portugal' self.urlbase = 'http://urn.bn.pt/nca/unimarc-authorities/txt?id={id}' self.hrtre = '(.*)' self.language = 'pt' self.escapehtml = True
[docs] def findnames(self, html): return [self.TAGRE.sub(' ', text).replace('$b', '') for text in self.findallbyre(r'>[24]00<.*?\$a(.*?\$b.*?)(?:<br>|\$|$)', html)]
[docs] def finddescription(self, html): return self.findbyre(r'>830<.*?\$a.*?</font>([^<>]*)', html)
[docs] def findnationality(self, html): return self.findbyre(r'>102<.*?\$a(?:<[^<>]*>)*([^<>]+)', html, 'country')
[docs] def findlongtext(self, html): return '\n'.join(self.findallbyre(r'>830<.*?\$a.*?</font>([^<>]*)', html))
[docs] def findbirthdate(self, html): result = self.findbyre(r'>200<.*?\$f.*?</font>([^<>]*)-', html) if result and 'ca ' not in result and 'fl.' not in result: return result
[docs] def finddeathdate(self, html): result = self.findbyre(r'>200<.*?\$f.*?</font>[^<>]*-([^<>,]*)', html) if result and 'ca ' not in result and 'fl.' not in result: return result
[docs] def findfirstname(self, html): return self.findbyre(r'>200<.*?\$b</b></font>([^<>]*?),?\s*<', html, 'firstname')
[docs] def findlastname(self, html): return self.findbyre(r'>200<.*?\$a</b></font>(.*?),?<', html, 'lastname')
[docs]class BibsysAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P1015' self.dbid = None self.dbname = 'BIBSYS' self.urlbase = 'https://authority.bibsys.no/authority/rest/authorities/html/{id}' self.hrtre = '(<body>.*)' self.language = 'en'
[docs] def findnames(self, html): return self.findallbyre(r'<td>[^<>]*name[^<>]*</td><td>([^<>]*)</td>', html)
[docs] def findinstanceof(self, html): return self.findbyre(r'<td>Authority type</td><td>(.*?)</td>', html, 'instanceof')
[docs] def findisni(self, html): return self.findbyre(r'<td>isni</td><td>(.*?)</td>', html)
[docs] def findviaf(self, html): return self.findbyre(r'http://viaf.org/viaf/(\w+)', html) or \ self.findbyre(r'<td>viaf</td><td>(.*?)</td>', html)
[docs] def findfirstname(self, html): return self.findbyre(r'<td>Personal name</td><td>[^<>]*,\s*(\w+)', html, 'firstname')
[docs] def findlastname(self, html): return self.findbyre(r'<td>Personal name</td><td>([^<>]*),', html, 'lastname')
[docs] def findbirthdate(self, html): return self.findbyre(r'<td>Dates associated with a name</td><td>([^<>]*)-', html)
[docs] def finddeathdate(self, html): return self.findbyre(r'<td>Dates associated with a name</td><td>[^<>]*-([^<>]*)', html)
[docs]class KunstindeksAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P1138' self.dbid = 'Q3362041' self.dbname = 'Kunstindeks Danmark' self.urlbase = 'https://www.kulturarv.dk/kid/VisKunstner.do?kunstnerId={id}' self.urlbase3 = 'https://www.kulturarv.dk/kid/SoegKunstnerVaerker.do?kunstnerId={id}&hitsPerPage=1000' self.hrtre = 'Information from Kunstindeks Danmark</h2>(.*?)</table>' self.language = 'da'
[docs] def findnames(self, html): return [ self.findbyre(r':([^<>]*)</h1>', html), self.findbyre(r'Name:\s*</b>(.*?)<', html) ]
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)(<h1>.*?)<td class="right\d', html)
[docs] def findlastname(self, html): return self.findbyre(r'(?s)<b>Name: </b>([^<>]*),', html, 'lastname')
[docs] def findfirstname(self, html): return self.findbyre(r'(?s)<b>Name: </b>[^<>]*,\s*([\w\-]+)', html, 'firstname')
[docs] def findbirthplace(self, html): return self.findbyre(r'(?s)<b>Born: </b>([^<>]*),', html, 'city')
[docs] def findbirthdate(self, html): return self.findbyre(r'(?s)<b>Born: </b>[^<>]*?([\d\-]+)\s*<', html)
[docs] def finddeathplace(self, html): return self.findbyre(r'(?s)<b>Died: </b>([^<>]*),', html, 'city')
[docs] def finddeathdate(self, html): return self.findbyre(r'(?s)<b>Died: </b>[^<>]*?([\d\-]+)\s*<', html)
[docs] def findoccupations(self, html): section = self.findbyre(r'(?s)Occupation: </b>(.*?)<', html) if section: return self.findallbyre(r'([\s\w]+)', section, 'occupation')
[docs] def findgender(self, html): return self.findbyre(r'(?s)Sex: </b>(.*?)<', html, 'gender')
[docs] def findnationality(self, html): return self.findbyre(r'(?s)Nationality: </b>(.*?)<', html, 'country')
[docs] def findincollections(self, html): return self.findallbyre(r'museumId=[^<>]*>(.*?)<', html, 'museum')
[docs]class IaafAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P1146' self.dbid = None self.dbname = 'IAAF' self.urlbase = 'https://www.iaaf.org/athletes/athlete={id}' self.hrtre = '(<div class="row offset.*? <div class="clearfix">)' self.language = 'en'
[docs] def findnames(self, html): return [self.findbyre(r'(?s)<h1>(.*?)<', html)]
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<div class="modal-body athletepopup">(.*?)</script>', html)
[docs] def instanceof(self, html): return 'Q5'
[docs] def findoccupations(self, html): return ['Q11513337']
[docs] def findsports(self, html): return ['Q542']
[docs] def findnationality(self, html): return self.findbyre(r'(?s)COUNTRY.*?>([^<>]*)</span>', html, 'country')
[docs] def findbirthdate(self, html): return self.findbyre(r'(?s)DATE OF BIRTH\s*<br\s*/>(.*?)<', html)
[docs]class ScopusAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P1153' self.dbid = 'Q371467' self.dbname = 'Scopus' self.urlbase = 'https://www.scopus.com/authid/detail.uri?authorId={id}' self.hrtre = '(<h2.*?)<h4' self.language = 'en'
[docs] def findinstanceof(self, html): return 'Q5'
[docs] def findnames(self, html): result = self.findallbyre(r'name="authorPreferredName" value="(.*?)"', html) section = self.findbyre(r'(?s)(<div id="otherNameFormatBadges".*?</div>)', html) if section: result += self.findallbyre(r'>(.*?)<', section) return result
[docs] def findworkfields(self, html): section = self.findbyre(r'(?s)(<div id="subjectAreaBadges".*?</div>)', html) if section: return self.findallbyre(r'>(.*?)<', section, 'subject')
[docs] def findmixedrefs(self, html): return self.finddefaultmixedrefs(html)
[docs] def findemployers(self, html): section = self.findbyre(r'(?s)<div class="authAffilcityCounty">(.*?)</div>', html) if section: return self.findallbyre(r'>([^<>]*)</span>', section, 'employer', alt=['university'])
[docs] def findworkplaces(self, html): section = self.findbyre(r'(?s)<div class="authAffilcityCounty">(.*?)</div>', html) if section: return self.findallbyre(r'(?s)>,([^<>],[^<>]*)<', section.replace('\n', ' '), 'city')
[docs]class RodovidAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P1185' self.dbid = 'Q649227' self.dbname = 'Rodovid' self.urlbase = 'https://en.rodovid.org/wk/Person:{id}' self.hrtre = '<table class="persondata">(.*?<h2>.*?)<h2' self.language = 'en' self.escapehtml = True
[docs] def findnames(self, html): return [ self.findbyre(r'<title>(.*?)(?: [bd]\. |<)', html), self.findbyre(r'<h1[^<>]*>(.*?)(?: [bd]\. |<)', html), self.findbyre(r'(?s)<b>Full name[^<>]*</b>\s*</td><td>(.*?)<', html) ]
[docs] def findlongtext(self, html): return self.findbyre(r'(?s)<span class="mw-headline">Notes</span></h2>\s*<p>(.*?)<h\d', html)
[docs] def findbirthdate(self, html): section = self.findbyre(r'(?s)Events</span></h2>(.*?)<h2', html) return self.findbyre(r'<b>([^<>]*)</b>birth:', section)
[docs] def findbirthplace(self, html): section = self.findbyre(r'(?s)Events</span></h2>(.*?)<h2', html) return self.findbyre(r'>birth: <[^<>]*>(.*?)<', section, 'city')
[docs] def finddeathdate(self, html): section = self.findbyre(r'(?s)Events</span></h2>(.*?)<h2', html) return self.findbyre(r'<b>([^<>]*)</b>death:', section)
[docs] def finddeathplace(self, html): section = self.findbyre(r'(?s)Events</span></h2>(.*?)<h2', html) return self.findbyre(r'death: <[^<>]*>(.*?)<', section, 'city')
[docs] def findchildren(self, html): section = self.findbyre(r'(?s)Events</span></h2>(.*?)<h2', html) return self.findallbyre(r"child birth:.*?Person:\d+'>(.*?)<", section, 'person')
[docs] def findspouses(self, html): section = self.findbyre(r'(?s)Events</span></h2>(.*?)<h2', html) return self.findallbyre(r"marriage</a>.*?Person:\d+'>(.*?)<", section, 'person')
[docs] def findfamily(self, html): section = self.findbyre(r'(?s)<b>Lineage\s*</b>(.*?)</tr>', html) if section: return self.findbyre(r'>([^<>]*)</a>', section, 'family')
[docs] def findgender(self, html): return self.findbyre(r'(?s)Sex\s*</b>\s*</td><td>(.*?)<', html, 'gender')
[docs] def findfather(self, html): section = self.findbyre(r'(?s)<b>Parents</b>(.*?)</tr>', html) if section: return self.findbyre(r"♂.*?Person:\d+'>(.*?)<", section, 'person')
[docs] def findmother(self, html): section = self.findbyre(r'(?s)<b>Parents</b>(.*?)</tr>', html) if section: return self.findbyre(r"♀.*?Person:\d+'>(.*?)<", section, 'person')
[docs] def findreligions(self, html): return self.findallbyre(r'(?s)religion:\s*<.*?>([^<>]+)<.*?></p>', html, 'religion')
[docs] def findtitles(self, html): section = self.findbyre(r'(?s)Events</span></h2>(.*?)<h2', html) return self.findallbyre(r'title:.*?<a[^<>]*>(.*?)<', section, 'title')
[docs]class IbdbAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P1220' self.dbid = 'Q31964' self.dbname = 'IBDB' self.urlbase = 'https://www.ibdb.com/person.php?id={id}' self.hrtre = '(<h1>.*?)<div class="dottedLine">' self.language = 'en'
[docs] def finddescription(self, html): return self.findbyre(r'<meta name="description" content="(.*?)"', html)
[docs] def findnames(self, html): section = self.findbyre(r'(?s)<b>Also Known As</b>\s*</div>\s*<div[^<>]*>(.*?)</div>', html) if section: result = self.findallbyre(r'([^\[\]<>]*?)[\[<]', section) else: result = [] return result + [ self.findbyre(r'<title>([^<>]*?) – ', html) ]
[docs] def findlongtext(self, html): parts = self.findallbyre(r'"personDescription"[^<>]*>(.*?)<', html) if parts: return ' '.join(parts)
[docs] def findoccupations(self, html): section = self.findbyre(r'(?s)<div class="s12 wrapper tag-block-compact extramarg">(.*?)</div>', html) if section: return self.findallbyre(r'>([^<>]*)<', section, 'theater-occupation', alt=['occupation'])
[docs] def findbirthdate(self, html): return self.findbyre(r'(?s)<div class="xt-lable">Born</div>\s*<div class="xt-main-title">(.*?)</div>', html)
[docs] def findbirthplace(self, html): return self.findbyre( r'(?s)<div class="xt-lable">Born</div>\s*<div class="xt-main-title">' r'[^<>]*</div>\s*<div class="xt-main-moreinfo">(.*?)</div>', html, 'city')
[docs] def finddeathdate(self, html): return self.findbyre(r'(?s)<div class="xt-lable">Died</div>\s*<div class="xt-main-title">(.*?)</div>', html)
[docs] def finddeathplace(self, html): return self.findbyre( r'(?s)<div class="xt-lable">Died</div>\s*<div class="xt-main-title">[^<>]*</div>' r'\s*<div class="xt-main-moreinfo">(.*?)</div>', html, 'city')
[docs] def findgender(self, html): return self.findbyre(r'(?s)<div class="xt-lable">Gender</div>\s*<div class="xt-main-title">(.*?)</div>', html, 'gender')
[docs] def findawards(self, html): section = self.findbyre(r'(?s)<div id="awards".*?>(.*?)</table>', html) if section: parts = self.findallbyre(r'(?s)(<tr><th.*?</tr>\s*<tr>.*?</tr>)', section) result = [] for part in parts: if '[nominee]' not in part: result.append(self.findbyre(r'<th[^<>]*>(.*?)<', section, 'award')) return result