Source code for scripts.dataextend

#!/usr/bin/env python3
"""Script to add properties, identifiers and sources to WikiBase items.

Usage:

    dataextend <item> [<property>[+*]] [args]

In the basic usage, where no property is specified, item is the Q-number
of the item to work on.

If a property (P-number, or the special value 'Wiki' or 'Data') is
specified, only the data from that identifier are added. With a '+'
after it, work starts on that identifier, then goes on to identifiers
after that (including new identifiers added while working on those
identifiers). With a '*' after it, the identifier itself is skipped, but
those coming after it (not those coming before it) are included.

The following parameters are supported:

-always    If this is supplied, the bot will not ask for permission
           after each external link has been handled.

-showonly  Only show claims for a given ItemPage. Don't try to add any
           properties

The bot will load the corresponding pages for these identifiers, and try
to extract data from them. For a string it does not know yet, it asks you
for the meaning of that string for the specified type of thing (for
example 'city' or 'gender'). If you want to use it, but not save it
(which can happen if the string specifies a certain value now, but might
show another value elsewhere, or if it is so specific that you're pretty
sure it won't occur a second time), you can provide the Q-number with X
rather than Q. If you do not want to use the string, you can just hit
enter, or give the special value 'XXX' which means that it will be
skipped in each subsequent run as well.

After an identifier has been worked on, there might be a list of names
that has been found, in lc:name format, where lc is a language code. You
can accept all suggested names (answer Y), none (answer N) or ask to get
asked for each name separately (answer S), the latter being the default
if you do not fill in anything.

After all identifiers have been worked on, possible descriptions in
various languages are presented, and you get to choose one. The default
is here 0, which always is the current description for that language.
Finally, for a number of identifiers text is shown that usually gives
parts of the description that are hard to parse automatically, so you
can see if there are any additional pieces of data that can be added.

It is advisable to (re)load the item page that the bot has been working
on in the browser afterward, to correct any mistakes it has made, or
cases where a more precise and less precise value have both been
included.

.. versionadded:: 7.2
"""
#
# (C) Pywikibot team, 2020-2023
#
# Distributed under the terms of the MIT license.
#
import codecs
import datetime
import re
from collections import defaultdict
from contextlib import suppress
from html import unescape
from textwrap import shorten
from typing import Optional
from urllib.parse import quote, unquote

import pywikibot
from pywikibot.backports import List, Tuple
from pywikibot.bot import SingleSiteBot, input_yn, suggest_help
from pywikibot.comms import http
from pywikibot.data import sparql
from pywikibot.exceptions import (
    APIError,
    InvalidTitleError,
    NoPageError,
    OtherPageSaveError,
    ServerError,
)
from pywikibot.tools.collections import DequeGenerator


class DataExtendBot(SingleSiteBot):

    """The Bot.

    Extends a Wikibase item with claims, names and descriptions that
    analyzer classes extract for the item's identifiers.
    """

    # Extra bot options: 'restrict' limits the work to one identifier
    # (optionally suffixed with '+' or '*', see treat()), 'showonly'
    # makes treat() stop after showing the current claims.
    update_options = {
        'restrict': '',
        'showonly': False,
    }

    QRE = re.compile(r'Q\d+')
    PQRE = re.compile(r'[PQ]\d+')

    def __init__(self, **kwargs):
        """Initializer.

        Sets up the label cache, the stored answers and the set of
        rejected names, loads them from their files and registers the
        analyzer class to use for each identifier.
        """
        super().__init__(**kwargs)
        self.labels = {}
        self.data = defaultdict(dict)
        self.noname = set()
        self.labelfile = 'labels.txt'
        self.datafile = 'defaultdata.txt'
        self.nonamefile = 'noname.txt'
        self.loaddata()
        # Keys are property ids, the host name of a 'P973' URL, or the
        # pseudo properties 'Wiki' and 'Data' that treat() adds.
        self.analyzertype = {
            'P213': IsniAnalyzer,
            'P214': ViafAnalyzer,
            'P227': GndAnalyzer,
            'P244': LcAuthAnalyzer,
            'P245': UlanAnalyzer,
            'P268': BnfAnalyzer,
            'P269': SudocAnalyzer,
            'P271': CiniiAnalyzer,
            'P345': ImdbAnalyzer,
            'P396': SbnAnalyzer,
            'P409': LibrariesAustraliaAnalyzer,
            'P434': MusicBrainzAnalyzer,
            'P454': StructuraeAnalyzer,
            'P496': OrcidAnalyzer,
            'P497': CbdbAnalyzer,
            'P535': FindGraveAnalyzer,
            'P549': MathGenAnalyzer,
            'P586': IpniAuthorsAnalyzer,
            # 'P590': GnisAnalyzer, <http redirect loop>
            'P640': LeonoreAnalyzer,
            'P648': OpenLibraryAnalyzer,
            'P650': RkdArtistsAnalyzer,
            'P651': BiografischPortaalAnalyzer,
            'P691': NkcrAnalyzer,
            'P723': DbnlAnalyzer,
            'P781': SikartAnalyzer,
            'P839': ImslpAnalyzer,
            'P902': HdsAnalyzer,
            'P906': SelibrAnalyzer,
            'P950': BneAnalyzer,
            'P1005': PtbnpAnalyzer,
            # 'P1006': NtaAnalyzer,
            'P1015': BibsysAnalyzer,
            'P1138': KunstindeksAnalyzer,
            'P1146': IaafAnalyzer,
            # 'P1153': ScopusAnalyzer, <requires login>
            'P1185': RodovidAnalyzer,
            'P1220': IbdbAnalyzer,
            'P1233': IsfdbAnalyzer,
            'P1263': NndbAnalyzer,
            'P1273': CanticAnalyzer,
            'P1280': ConorSiAnalyzer,
            'P1284': MunzingerAnalyzer,
            # 'P1305': SkyScraperAnalyzer, <forbidden>
            # <changed, content is not on page any more>
            # 'P1315': PeopleAustraliaAnalyzer,
            'P1367': ArtUkAnalyzer,
            'P1368': LnbAnalyzer,
            'P1415': OxfordAnalyzer,
            'P1422': SandrartAnalyzer,
            'P1440': FideAnalyzer,
            'P1447': SportsReferenceAnalyzer,
            'P1463': PrdlAnalyzer,
            'P1469': FifaAnalyzer,
            'P1556': ZbmathAnalyzer,
            'P1580': UBarcelonaAnalyzer,
            'P1607': DialnetAnalyzer,
            'P1615': ClaraAnalyzer,
            'P1648': WelshBioAnalyzer,
            'P1667': TgnAnalyzer,
            # 'P1695': NlpAnalyzer, <id doesn't work anymore>
            'P1707': DaaoAnalyzer,
            # 'P1711': BritishMuseumAnalyzer, <does not load>
            'P1741': GtaaAnalyzer,
            'P1749': ParlementPolitiekAnalyzer,
            'P1795': AmericanArtAnalyzer,
            'P1802': EmloAnalyzer,
            'P1816': NpgPersonAnalyzer,
            'P1819': GenealogicsAnalyzer,
            'P1838': PssBuildingAnalyzer,
            'P1871': CerlAnalyzer,
            'P1952': MetallumAnalyzer,
            'P1953': DiscogsAnalyzer,
            'P1977': ArchivesDuSpectacleAnalyzer,
            'P1986': ItalianPeopleAnalyzer,
            'P1988': DelargeAnalyzer,
            'P2005': HalensisAnalyzer,
            # 'P2013': FacebookAnalyzer, <requires being logged in>
            'P2016': AcademiaeGroninganaeAnalyzer,
            'P2029': UlsterAnalyzer,
            'P2038': ResearchGateAnalyzer,
            'P2041': NgvAnalyzer,
            'P2089': JukeboxAnalyzer,
            'P2163': FastAnalyzer,
            'P2168': SvenskFilmAnalyzer,
            'P2191': NilfAnalyzer,
            'P2252': NgaAnalyzer,
            'P2268': OrsayAnalyzer,
            'P2332': ArtHistoriansAnalyzer,
            'P2340': CesarAnalyzer,
            'P2342': AgorhaAnalyzer,
            'P2349': StuttgartAnalyzer,
            'P2372': OdisAnalyzer,
            'P2381': AcademicTreeAnalyzer,
            'P2383': CthsAnalyzer,
            'P2446': TransfermarktAnalyzer,
            'P2454': KnawAnalyzer,
            'P2456': DblpAnalyzer,
            'P2469': TheatricaliaAnalyzer,
            # 'P2533': WomenWritersAnalyzer, #fully opaque
            'P2604': KinopoiskAnalyzer,
            'P2605': CsfdAnalyzer,
            'P2639': FilmportalAnalyzer,
            'P2728': CageMatchAnalyzer,
            'P2732': PerseeAnalyzer,
            'P2750': PhotographersAnalyzer,
            'P2753': CanadianBiographyAnalyzer,
            'P2829': IWDAnalyzer,
            'P2843': BenezitAnalyzer,
            'P2915': EcarticoAnalyzer,
            'P2940': RostochiensiumAnalyzer,
            'P2941': MunksRollAnalyzer,
            'P2944': PlarrAnalyzer,
            'P2945': BookTradeAnalyzer,
            'P2949': WikitreeAnalyzer,
            'P2963': GoodreadsAnalyzer,
            'P2977': LbtAnalyzer,
            'P3029': NationalArchivesAnalyzer,
            'P3107': LdifAnalyzer,
            'P3109': PeakbaggerAnalyzer,
            'P3138': OfdbAnalyzer,
            'P3154': RunebergAuthorAnalyzer,
            'P3159': UGentAnalyzer,
            'P3283': BandcampAnalyzer,
            'P3314': Chess365Analyzer,
            'P3346': HkmdbAnalyzer,
            'P3351': AdultFilmAnalyzer,
            'P3360': NobelPrizeAnalyzer,
            'P3392': SurmanAnalyzer,
            'P3410': CcedAnalyzer,
            'P3413': LeopoldinaAnalyzer,
            'P3429': EnlightenmentAnalyzer,
            'P3430': SnacAnalyzer,
            'P3630': BabelioAnalyzer,
            'P3782': ArtnetAnalyzer,
            'P3786': DanskefilmAnalyzer,
            'P3788': BnaAnalyzer,
            'P3790': AnimeConsAnalyzer,
            'P3829': PublonsAnalyzer,
            'P3844': SynchronkarteiAnalyzer,
            'P3924': TrackFieldFemaleAnalyzer,
            'P3925': TrackFieldMaleAnalyzer,
            'P4124': WhosWhoFranceAnalyzer,
            'P4145': AthenaeumAnalyzer,
            'P4158': AutoresArAnalyzer,
            'P4206': FoihAnalyzer,
            'P4228': EoasAnalyzer,
            # 'P4293': PM20Analyzer, <content in frame with unclear url>
            'P4399': ItauAnalyzer,
            'P4432': AKLAnalyzer,
            'P4459': SpanishBiographyAnalyzer,
            'P4548': CommonwealthGamesAnalyzer,
            'P4585': AccademiaCruscaAnalyzer,
            'P4629': OnlineBooksAnalyzer,
            'P4657': NumbersAnalyzer,
            'P4663': DacsAnalyzer,
            'P4666': CinemagiaAnalyzer,
            'P4687': PeintresBelgesAnalyzer,
            'P4749': AuteursLuxembourgAnalyzer,
            'P4759': LuminousAnalyzer,
            'P4769': GameFaqsAnalyzer,
            'P4823': AmericanBiographyAnalyzer,
            'P4872': GeprisAnalyzer,
            'P4887': WebumeniaAnalyzer,
            'P4927': InvaluableAnalyzer,
            'P4929': AinmAnalyzer,
            'P4985': TmdbAnalyzer,
            'P5034': LibraryKoreaAnalyzer,
            'P5068': KunstenpuntAnalyzer,
            'P5239': ArtistsCanadaAnalyzer,
            'P5240': RollDaBeatsAnalyzer,
            'P5246': PornhubAnalyzer,
            'P5267': YoupornAnalyzer,
            'P5273': NelsonAtkinsAnalyzer,
            'P5329': ArmbAnalyzer,
            'P5359': OperoneAnalyzer,
            'P5361': BnbAnalyzer,
            'P5365': InternetBookAnalyzer,
            'P5375': BiuSanteAnalyzer,
            'P5394': PoetsWritersAnalyzer,
            'P5308': ScottishArchitectsAnalyzer,
            'P5357': SFAnalyzer,
            'P5368': NatGeoCanadaAnalyzer,
            'P5370': EntomologistAnalyzer,
            'P5408': FantasticFictionAnalyzer,
            'P5415': WhonameditAnalyzer,
            'P5421': TradingCardAnalyzer,
            'P5491': BedethequeAnalyzer,
            'P5492': Edit16Analyzer,
            'P5504': RismAnalyzer,
            'P5534': OmdbAnalyzer,
            'P5540': RedTubeAnalyzer,
            'P5570': NoosfereAnalyzer,
            'P5597': ArtcyclopediaAnalyzer,
            'P5645': AcademieFrancaiseAnalyzer,
            'P5731': AngelicumAnalyzer,
            'P5739': PuscAnalyzer,
            'P5747': CwaAnalyzer,
            'P5794': IgdbAnalyzer,
            'P5819': MathOlympAnalyzer,
            'P5882': MuziekwebAnalyzer,
            'P6127': LetterboxdAnalyzer,
            'P6167': BritishExecutionsAnalyzer,
            'P6188': BdfaAnalyzer,
            'P6194': AustrianBiographicalAnalyzer,
            'P6231': BdelAnalyzer,
            'P6295': ArticArtistAnalyzer,
            'P6517': WhoSampledAnalyzer,
            'P6575': AcademieRouenAnalyzer,
            'P6578': MutualAnalyzer,
            'P6594': GuggenheimAnalyzer,
            'P6770': SnsaAnalyzer,
            'P6815': UvaAlbumAnalyzer,
            'P6821': AlvinAnalyzer,
            'P6844': AbartAnalyzer,
            'P6873': IntraTextAnalyzer,
            'P7032': RepertoriumAnalyzer,
            'P7293': PlwabnAnalyzer,
            'P7796': BewebAnalyzer,
            'P7902': DeutscheBiographieAnalyzer,
            'P8287': WorldsWithoutEndAnalyzer,
            'P8696': BelgianPhotographerAnalyzer,
            'P8795': AlkindiAnalyzer,
            'P8848': ConorAlAnalyzer,
            'P8849': ConorBgAnalyzer,
            'P8851': ConorSrAnalyzer,
            'P8914': ZobodatAnalyzer,
            'P9017': OxfordMedievalAnalyzer,
            # 'P9046': AdSAnalyzer, hard to analyze JavaScript
            'P9113': PatrinumAnalyzer,
            'P9430': JwaAnalyzer,
            'fomu.atomis.be': FotomuseumAnalyzer,
            'catalogo.bn.gov.ar': BibliotecaNacionalAnalyzer,
            'www.brooklynmuseum.org': BrooklynMuseumAnalyzer,
            'www.vondel.humanities.uva.nl': OnstageAnalyzer,
            'www.ias.edu': IasAnalyzer,
            'kunstaspekte.art': KunstaspekteAnalyzer,
            'www.nationaltrustcollections.org.uk': NationalTrustAnalyzer,
            'www.oxfordartonline.com': BenezitUrlAnalyzer,
            'exhibitions.univie.ac.at': UnivieAnalyzer,
            'weber-gesamtausgabe.de': WeberAnalyzer,
            'Wiki': WikiAnalyzer,
            'Data': BacklinkAnalyzer,
            'www.deutsche-biographie.de': DeutscheBiographieAnalyzer,
        }
def label(self, title):
    """Return a readable text for a claim value or entity id.

    ``!date!`` values are rendered through :meth:`showtime`, ``!q!``
    values are returned without their prefix, and anything that is not
    a P- or Q-number is returned unchanged. For entity ids the label is
    taken from the cache, or else from the entity page (first language
    of a fixed preference list that has one, falling back to the id
    itself) and then cached.
    """
    if title.startswith('!date!'):
        return self.showtime(self.createdateclaim(title[6:]))
    if title.startswith('!q!'):
        return title[3:]
    if self.PQRE.fullmatch(title) is None:
        return title

    try:
        return self.labels[title]
    except KeyError:
        pass

    entity = self.page(title)
    try:
        known = entity.get()['labels']
    except NoPageError:
        known = {}

    preferred = ('en', 'nl', 'de', 'fr', 'es', 'it', 'af', 'nds', 'li',
                 'vls', 'zea', 'fy', 'no', 'sv', 'da', 'pt', 'ro', 'pl',
                 'cs', 'sk', 'hr', 'et', 'fi', 'lt', 'lv', 'tr', 'cy')
    found = next((lang for lang in preferred if lang in known), None)
    if found is None:
        text = title
    else:
        entry = known[found]
        try:
            text = entry['value']
        except TypeError:
            # the label is stored as a plain string
            text = entry

    self.labels[title] = text
    return text
def loaddata(self):
    """Read data from files.

    Fills :attr:`labels` from ``labelfile`` (``key:label`` lines),
    :attr:`data` from ``datafile`` (``type:key:value`` lines, where the
    key itself may contain colons) and :attr:`noname` from
    ``nonamefile`` (one name per line). A file that cannot be opened is
    skipped; blank or malformed lines are ignored instead of aborting
    the start of the bot.
    """
    param = {'mode': 'r', 'encoding': 'utf-8'}

    with suppress(IOError), codecs.open(self.labelfile, **param) as f:
        for line in f:
            key, sep, value = line.strip().partition(':')
            if not sep:
                continue  # blank line or line without separator
            self.labels[key] = value

    with suppress(IOError), codecs.open(self.datafile, **param) as f:
        for line in f:
            parts = line.strip().split(':')
            if len(parts) < 2:
                continue  # blank line or line without separator
            dtype, *keys, value = parts
            self.data[dtype][':'.join(keys)] = value

    with suppress(IOError), codecs.open(self.nonamefile, **param) as f:
        self.noname = {line.strip() for line in f}
[docs] def teardown(self) -> None: """Save data to files.""" param = {'mode': 'w', 'encoding': 'utf-8'} with codecs.open(self.labelfile, **param) as f: for item in self.labels: f.write(f'{item}:{self.labels[item]}\n') with codecs.open(self.datafile, **param) as f: for dtype in self.data: for key in self.data[dtype]: f.write('{}:{}:{}\n'.format(dtype, key, self.data[dtype][key])) with codecs.open(self.nonamefile, **param) as f: for noname in self.noname: f.write(f'{noname}\n')
def page(self, title):
    """Dispatch title and return the appropriate Page object.

    Only the part after the last colon is looked at: a title starting
    with ``Q`` gives an ItemPage, one starting with ``P`` a
    PropertyPage.

    :raises ValueError: the title starts with neither ``Q`` nor ``P``
    """
    title = title.rpartition(':')[2]
    prefix = title[:1]
    if prefix == 'Q':
        return pywikibot.ItemPage(self.site, title)
    if prefix == 'P':
        return pywikibot.PropertyPage(self.site, title)
    raise ValueError(f'Invalid title {title}')
@staticmethod
def showtime(time):
    """Return a short text for a time value, as detailed as its precision.

    ``None`` gives ``'unknown'``. A precision below 9 gives
    ``'ca. <year>'``; from precision 10 on the month, from 11 the day
    is put in front of the year, and from 12, 13 and 14 on the hour,
    minute and second are appended.
    """
    if time is None:
        return 'unknown'

    precision = time.precision
    shown = str(time.year)
    if precision < 9:
        return 'ca. ' + shown

    for level, field in ((10, 'month'), (11, 'day')):
        if precision >= level:
            shown = f'{getattr(time, field)}-{shown}'
    for level, field, sep in ((12, 'hour', ' '), (13, 'minute', ':'),
                              (14, 'second', ':')):
        if precision >= level:
            shown = f'{shown}{sep}{getattr(time, field)}'
    return shown
def showclaims(self, claims):
    """Print every claim in *claims* as ``<property label>: <value>``.

    Item, time, external-id, commonsMedia and quantity claims are
    shown; for any other claim type a notice with the type is printed.
    """
    pywikibot.info('Current information:')
    for prop in claims:
        for claim in claims[prop]:
            kind = claim.type
            if kind == 'wikibase-item':
                if claim.getTarget() is None:
                    pywikibot.info(f'{self.label(prop)}: unknown')
                    continue
                name = self.label(prop)
                value = self.label(claim.getTarget().title())
                pywikibot.info(f'{name}: {value}')
            elif kind == 'time':
                name = self.label(prop)
                value = self.showtime(claim.getTarget())
                pywikibot.info(f'{name}: {value}')
            elif kind in ['external-id', 'commonsMedia']:
                name = self.label(prop)
                pywikibot.info(f'{name}: {claim.getTarget()}')
            elif kind == 'quantity':
                name = self.label(prop)
                amount = claim.getTarget().amount
                # the unit is a URL-like string; its last path part is
                # what gets looked up as a label
                unit = self.label(claim.getTarget().unit.split('/')[-1])
                pywikibot.info(f'{name}: {amount} {unit}')
            else:
                pywikibot.info(
                    f'Unknown type {kind} for property {self.label(prop)}')
# Maps a lower-cased month token to its month number (1-12). Tokens are
# decimal numbers with or without leading zero, lower-case roman
# numerals, and month names and abbreviations in several languages.
# createdateclaim() looks tokens up here after calling .lower() on them.
MONTHNUMBER = {
    '1': 1, '01': 1, 'i': 1,
    '2': 2, '02': 2, 'ii': 2,
    '3': 3, '03': 3, 'iii': 3,
    '4': 4, '04': 4, 'iv': 4,
    '5': 5, '05': 5, 'v': 5,
    '6': 6, '06': 6, 'vi': 6,
    '7': 7, '07': 7, 'vii': 7,
    '8': 8, '08': 8, 'viii': 8,
    '9': 9, '09': 9, 'ix': 9,
    '10': 10, 'x': 10,
    '11': 11, 'xi': 11,
    '12': 12, 'xii': 12,
    'january': 1, 'jan': 1,
    'february': 2, 'feb': 2, 'febr': 2,
    'march': 3, 'mar': 3,
    'april': 4, 'apr': 4,
    'may': 5,
    'june': 6, 'jun': 6,
    'july': 7, 'jul': 7,
    'august': 8, 'aug': 8,
    'september': 9, 'sep': 9, 'sept': 9,
    'october': 10, 'oct': 10,
    'november': 11, 'nov': 11,
    'december': 12, 'dec': 12,
    'gennaio': 1, 'gen': 1, 'genn': 1,
    'febbraio': 2, 'febb': 2, 'febbr': 2,
    'marzo': 3, 'marz': 3,
    'aprile': 4,
    'maggio': 5, 'mag': 5, 'magg': 5,
    'giugno': 6, 'giu': 6,
    'luglio': 7, 'lug': 7, 'lugl': 7,
    'agosto': 8, 'ago': 8, 'agost': 8, 'ag': 8,
    'settembre': 9, 'set': 9, 'sett': 9,
    'ottobre': 10, 'ott': 10, 'otto': 10,
    'novembre': 11,
    'dicembre': 12, 'dic': 12,
    'januari': 1,
    'februari': 2,
    'maart': 3, 'maa': 3, 'mrt': 3,
    'mei': 5,
    'juni': 6,
    'juli': 7,
    'augustus': 8,
    'oktober': 10, 'okt': 10,
    'janvier': 1,
    # NOTE(review): 'f\\xe9vrier' (and 'm\\xe4rz' below) contain a
    # literal backslash; presumably they match page text whose
    # escapes were not decoded - confirm.
    'février': 2, 'fevrier': 2, 'fév': 2, 'fev': 2, 'f\\xe9vrier': 2,
    'mars': 3,
    'avril': 4, 'avr': 4,
    'mai': 5,
    'juin': 6,
    'juillet': 7,
    'août': 8, 'aout': 8, 'aoû': 8, 'aou': 8,
    'septembre': 9,
    'octobre': 10,
    'décembre': 12, 'déc': 12,
    'januar': 1, 'jänner': 1,
    'februar': 2,
    'märz': 3, 'm\\xe4rz': 3,
    'dezember': 12, 'dez': 12,
    'eanáir': 1, 'eanair': 1,
    'feabhra': 2,
    'márta': 3, 'marta': 3,
    'aibreán': 4, 'aibrean': 4,
    'bealtaine': 5,
    'meitheamh': 6,
    'iúil': 7, 'iuil': 7,
    'lúnasa': 8, 'lunasa': 8,
    'meán fómhair': 9, 'mean fomhair': 9,
    'deireadh fómhair': 10, 'deireadh fomhair': 10,
    'samhain': 11,
    'nollaig': 12,
    'styczeń': 1, 'stycznia': 1,
    'luty': 2, 'lutego': 2,
    'marzec': 3, 'marca': 3,
    'kwiecień': 4, 'kwietnia': 4,
    'maj': 5, 'maja': 5,
    'czerwiec': 6, 'czerwca': 6,
    'lipiec': 7, 'lipca': 7,
    'sierpień': 8, 'sierpnia': 8,
    'wrzesień': 9, 'września': 9,
    'październik': 10, 'października': 10,
    'listopad': 11, 'listopada': 11,
    'grudzień': 12, 'grudnia': 12,
    'enero': 1,
    'febrero': 2,
    'abril': 4,
    'mayo': 5,
    'junio': 6,
    'julio': 7,
    'septiembre': 9,
    'octubre': 10,
    'noviembre': 11,
    'diciembre': 12,
    'gener': 1,
    'febrer': 2,
    'març': 3,
    'maig': 5,
    'juny': 6,
    'juliol': 7,
    'setembre': 9,
    'desembre': 12,
}
def createdateclaim(self, text):
    """Parse a date given as text and return it as a WbTime.

    A series of patterns (numeric dates in several orders, dates with a
    month name or roman numeral looked up in :attr:`MONTHNUMBER`,
    template-like ``|yyyy|mm|dd|`` and ``年/月/日`` forms) is tried in a
    fixed order; a later matching pattern overrides an earlier one.
    The precision of the result is 9 (year), 10 (month) or 11 (day),
    depending on which parts were found.

    :param text: the date as found on a page
    :raises ValueError: the month name is unknown, the month or day
        number is out of range, or no year could be found
    """
    def monthnumber(name):
        """Return the month number for a month token."""
        try:
            return self.MONTHNUMBER[name.lower()]
        except KeyError:
            raise ValueError(f"Don't know month {name}") from None

    text = text.strip()
    year = None
    month = None
    day = None

    m = re.search(r'[{\|](\d{4})\|(\d+)\|(\d+)[\|}]', text)
    if m:
        year = int(m[1])
        month = int(m[2])
        day = int(m[3])

    m = re.fullmatch(r'(\d{1,4})(?:年頃|\.)?', text)
    if m:
        year = int(m[1])
        month = None
        day = None

    if re.fullmatch(r'(?:1\d{3}|20[01]\d)[01]\d[0123]\d', text):
        year = int(text[:4])
        month = int(text[4:6])
        day = int(text[6:])

    if re.fullmatch(r'\d{4}-\d{2}', text):
        year = int(text[:4])
        month = int(text[-2:])

    m = re.match(r'(\d{1,2})[-/](\d{4})', text)
    if m:
        year = int(m[2])
        month = int(m[1])

    m = re.fullmatch(r'(\d+)[-./|](\d{1,2})[-./|](\d{1,2})', text)
    if m:
        year = int(m[1])
        month = int(m[2])
        day = int(m[3])

    m = re.fullmatch(
        r'(\d{1,2})[-./|]\s*(\d{1,2})[-./|]\s*(\d{3,4})\.?', text)
    if m:
        year = int(m[3])
        month = int(m[2])
        day = int(m[1])

    m = re.fullmatch(r'(\d{1,2})[-./\s]([iIvVxX]+)[-./\s](\d{4})', text)
    if m:
        year = int(m[3])
        month = monthnumber(m[2])
        day = int(m[1])

    m = re.fullmatch(r"(\d+)(?:\.|er|eme|ème)?[\s.]\s*(?:d'|d[aei] )?"
                     r'([^\s.]{2,})\.?[\s.]\s*(\d+)', text)
    if m:
        year = int(m[3])
        month = monthnumber(m[2])
        day = int(m[1])

    m = re.fullmatch(
        r'(\d{4})\.?[\s.]\s*([^\s.]{3,})\.?[\s.]\s*(\d+)', text)
    if m:
        year = int(m[1])
        month = monthnumber(m[2])
        day = int(m[3])

    m = re.match(r"(\d+) (?:de |d')?(\w+[a-z]\w+) de (\d+)", text)
    if m:
        year = int(m[3])
        month = monthnumber(m[2])
        day = int(m[1])

    m = re.fullmatch(r'(\w*[a-zA-Z]\w*)\.? (\d+)', text)
    if m:
        year = int(m[2])
        month = monthnumber(m[1])

    m = re.fullmatch(
        r'(\w+)\.? (\d{1,2})(?:st|nd|rd|th)?\.?\s*,\s*(\d{3,4})', text)
    if m:
        year = int(m[3])
        month = monthnumber(m[1])
        day = int(m[2])

    m = re.match(r'(\d{4}),? (\d{1,2}) (\w+)', text)
    if m:
        year = int(m[1])
        # the month token is group 3 here; group 1 is the year
        month = monthnumber(m[3])
        day = int(m[2])

    m = re.match(r'(\d+)年(\d+)月(\d+)日', text)
    if m:
        year = int(m[1])
        month = int(m[2])
        day = int(m[3])

    m = re.fullmatch(r'(\d+)年', text)
    if m:
        year = int(m[1])

    # a zero day or month means "not given"
    if day == 0:
        day = None
    if day is None and month == 0:
        month = None

    if month and month > 12:
        raise ValueError(
            f'Date seems to have an invalid month number {month}')
    if day and day > 31:
        raise ValueError(
            f'Date seems to have an invalid day number {day}')
    if not year:
        raise ValueError(f"Can't interpret date {text}")

    return pywikibot.WbTime(
        year=year, month=month, day=day,
        precision=9 if month is None else 10 if day is None else 11)
# Maps a lower-cased unit name or abbreviation to the Q-number that
# createquantityclaim() passes on as the unit item of the quantity.
# NOTE(review): the 'м' key cannot be reached through the '[a-z]' unit
# pattern used in createquantityclaim() - confirm whether that is
# intended.
QUANTITYTYPE = {
    'meter': 'Q11573', 'metre': 'Q11573', 'm': 'Q11573',
    'meters': 'Q11573', 'metres': 'Q11573', 'м': 'Q11573',
    'centimeter': 'Q174728', 'centimetre': 'Q174728', 'cm': 'Q174728',
    'foot': 'Q3710', 'feet': 'Q3710', 'ft': 'Q3710',
    'mile': 'Q253276', 'mi': 'Q253276',
    'kilometer': 'Q828224', 'kilometre': 'Q828224', 'km': 'Q828224',
    'minute': 'Q7727', 'minutes': 'Q7727', 'min': 'Q7727',
    'minuten': 'Q7727',
    'second': 'Q11574', 's': 'Q11574',
    'kilogram': 'Q11570', 'kg': 'Q11570',
    'lb': 'Q100995', 'lbs': 'Q100995', 'pond': 'Q100995',
}
def createquantityclaim(self, text):
    """Parse ``<number> <unit>`` text and return it as a WbQuantity.

    A decimal comma is accepted. The unit is looked up in
    :attr:`QUANTITYTYPE`.

    :param text: the quantity as found on a page
    :return: the quantity, or None if *text* does not start with a
        number followed by a unit, or if the unit is not known
    """
    m = re.match(r'(\d+(?:\.\d+)?)\s*([a-z]\w*)', text.replace(',', '.'))
    if m is None:
        return None

    unit = self.QUANTITYTYPE.get(m[2].lower())
    if unit is None:
        return None

    return pywikibot.WbQuantity(
        m[1], unit=pywikibot.ItemPage(self.site, unit), site=self.site)
def treat(self, item) -> None:
    """Process the ItemPage.

    Shows the current claims and, unless the ``showonly`` option is
    set, walks through the item's identifiers. For each identifier
    that has an analyzer registered in :attr:`analyzertype` the claims
    found by the analyzer are shown and, after confirmation (or always
    with the ``always`` option), either added to the item or, if the
    value is already present, given an additional reference. Names and
    descriptions found are offered afterwards.

    Properties of newly added claims are appended to the work queue,
    so identifiers found along the way get analyzed too.

    :param item: the item to work on
    """
    def build_sources(claim, prop, identifier, analyzer):
        """Return the reference claims for *claim*.

        ``claim[2]`` is the analyzer the value comes from. Property
        meanings per Wikidata: P143 imported from Wikimedia project,
        P248 stated in, P4656 Wikimedia import URL, P854 reference
        URL, P813 retrieved.
        """
        origin = claim[2]

        source = None
        if origin.dbid:
            source = pywikibot.Claim(
                self.site, 'P143' if origin.iswiki else 'P248')
            source.setTarget(pywikibot.ItemPage(self.site, origin.dbid))

        url = pywikibot.Claim(
            self.site, 'P4656' if origin.iswiki else 'P854')
        if origin.sparqlquery:
            url.setTarget(
                pywikibot.ItemPage(self.site, claim[1]).full_url())
        else:
            url.setTarget(origin.url)

        if origin.iswiki or origin.isurl:
            iddata = None
        else:
            iddata = pywikibot.Claim(self.site, prop)
            iddata.setTarget(identifier)

        # the earlier of the local and the UTC date is used
        date = pywikibot.Claim(self.site, 'P813')
        date.setTarget(self.createdateclaim(
            min(datetime.datetime.now().strftime('%Y-%m-%d'),
                datetime.datetime.utcnow().strftime('%Y-%m-%d'))))

        if not analyzer.showurl:
            url = None

        return [part for part in (source, url, iddata, date)
                if part is not None]

    def pseudoclaims():
        """Return the 'Wiki' and 'Data' entries for the claims dict."""
        return (
            [Quasiclaim(page.title(force_interwiki=True,
                                   as_link=True)[2:-2])
             for page in item.iterlinks()],
            [Quasiclaim(item.title())],
        )

    item.get()
    claims = item.claims
    self.showclaims(claims)
    if self.opt.showonly:
        return

    longtexts = []
    newdescriptions = defaultdict(set)
    updatedclaims = {prop: claims[prop] for prop in claims}
    dorestrict = True
    continueafterrestrict = False
    restrict_end = self.opt.restrict and self.opt.restrict[-1]
    if restrict_end in ('+', '*'):
        self.opt.restrict = self.opt.restrict[:-1]
        continueafterrestrict = True

    if restrict_end == '*':
        dorestrict = False

    unidentifiedprops = []
    failedprops = []
    claims['Wiki'], claims['Data'] = pseudoclaims()
    propstodo = DequeGenerator(claims)
    propsdone = set()

    for prop in propstodo:
        descriptions = item.descriptions
        labels = item.labels
        aliases = item.aliases

        # This can happen after reloading
        if prop not in claims.keys():
            continue

        if self.opt.restrict:
            if prop != self.opt.restrict:
                continue
            if continueafterrestrict:
                self.opt.restrict = ''
            if not dorestrict:
                continue

        for mainclaim in claims[prop]:
            if mainclaim.type != 'external-id' and prop != 'P973':
                continue

            identifier = mainclaim.getTarget()
            try:
                # for P973 the analyzer is chosen by the URL's host name
                analyzertype = self.analyzertype[
                    identifier.split('/')[2] if prop == 'P973' else prop]
            except KeyError:
                unidentifiedprops.append(prop)
                continue

            analyzer = analyzertype(identifier, self.data,
                                    item.title(), self)
            newclaims = analyzer.findclaims()
            if newclaims is None:
                failedprops.append(prop)
                newclaims = []

            if not self.opt.always:
                pywikibot.info('Found here:')
                for claim in newclaims:
                    try:
                        pywikibot.info(
                            '{}: {}'.format(self.label(claim[0]),
                                            self.label(claim[1])))
                    except ValueError:
                        # a value that cannot be shown is dropped
                        newclaims = [nclaim for nclaim in newclaims
                                     if nclaim != claim]

            if self.opt.always or input_yn('Save this?', default=True):
                for claim in newclaims:
                    if claim[0] in updatedclaims \
                       and self.isinclaims(claim[1],
                                           updatedclaims[claim[0]]):
                        # value already present: only add a reference
                        if claim[2]:
                            sourcedata = build_sources(
                                claim, prop, identifier, analyzer)
                            pywikibot.info(
                                'Sourcing {}: {}'
                                .format(self.label(claim[0]),
                                        self.label(claim[1])))

                            # probably means the sourcing is already there
                            with suppress(APIError):
                                updatedclaims[claim[0]][
                                    self.getlocnumber(
                                        claim[1], updatedclaims[claim[0]])
                                ].addSources(sourcedata)
                    else:
                        if claim[0] not in propsdone:
                            propstodo.append(claim[0])

                        createdclaim = pywikibot.Claim(self.site, claim[0])

                        if self.QRE.fullmatch(claim[1]):
                            createdclaim.setTarget(
                                pywikibot.ItemPage(self.site, claim[1]))
                        elif claim[1].startswith('!date!'):
                            try:
                                target = self.createdateclaim(claim[1][6:])
                            except ValueError as ex:
                                pywikibot.info(
                                    'Unable to analyze date "{}" for {}: {}'
                                    .format(claim[1][6:],
                                            self.label(claim[0]), ex))
                                pywikibot.input('Press enter to continue')
                                target = None

                            if target is None:
                                continue

                            createdclaim.setTarget(target)
                        elif claim[1].startswith('!q!'):
                            target = self.createquantityclaim(
                                claim[1][3:].strip())
                            if target is None:
                                continue

                            createdclaim.setTarget(target)
                        elif claim[1].startswith('!i!'):
                            createdclaim.setTarget(
                                pywikibot.page.FilePage(self.site,
                                                        claim[1][3:]))
                        else:
                            createdclaim.setTarget(claim[1])

                        pywikibot.info('Adding {}: {}'
                                       .format(self.label(claim[0]),
                                               self.label(claim[1])))

                        try:
                            item.addClaim(createdclaim)
                        except OtherPageSaveError as ex:
                            if claim[1].startswith('!i!'):
                                pywikibot.info(
                                    'Unable to save image {}: {}'
                                    .format(claim[1][3:], ex))
                                continue

                            raise

                        if claim[0] in updatedclaims:
                            updatedclaims[claim[0]].append(createdclaim)
                        else:
                            updatedclaims[claim[0]] = [createdclaim]

                        if claim[2]:
                            sourcedata = build_sources(
                                claim, prop, identifier, analyzer)
                            pywikibot.info(
                                'Sourcing {}: {}'
                                .format(self.label(claim[0]),
                                        self.label(claim[1])))

                            try:
                                createdclaim.addSources(sourcedata)
                            except AttributeError:
                                try:
                                    updatedclaims[claim[0]][
                                        self.getlocnumber(
                                            claim[1],
                                            updatedclaims[claim[0]])
                                    ].addSources(sourcedata)
                                except AttributeError:
                                    if prop not in propsdone:
                                        propstodo.append(prop)
                                    pywikibot.info('Sourcing failed')

            for language, description in analyzer.getdescriptions():
                newdescriptions[language].add(
                    shorten(description.rstrip('.'), width=249,
                            placeholder='...'))

            newnames = analyzer.getnames()
            newlabels, newaliases = self.definelabels(
                labels, aliases, newnames)
            if newlabels:
                item.editLabels(newlabels)
            if newaliases:
                item.editAliases(newaliases)
            if newlabels or newaliases:
                # reload, so the next analyzer sees the saved names
                item.get(force=True)
                claims = item.claims
                claims['Wiki'], claims['Data'] = pseudoclaims()
                descriptions = item.descriptions
                labels = item.labels
                aliases = item.aliases

            if analyzer.longtext():
                longtexts.append((analyzer.dbname, analyzer.longtext()))

        propsdone.add(prop)
        item.get(force=True)
        claims = item.claims
        claims['Wiki'], claims['Data'] = pseudoclaims()

    editdescriptions = {}
    for language in newdescriptions.keys():
        newdescription = self.definedescription(
            language, descriptions.get(language),
            newdescriptions.get(language))
        if newdescription:
            editdescriptions[language] = newdescription

    if editdescriptions:
        item.editDescriptions(editdescriptions)

    for prop in unidentifiedprops:
        pywikibot.info('Unknown external {} ({})'
                       .format(prop, self.label(prop)))

    for prop in failedprops:
        pywikibot.info('External failed to load: {} ({})'
                       .format(prop, self.label(prop)))

    if longtexts:
        if unidentifiedprops or failedprops:
            pywikibot.input('Press Enter to continue')

        pywikibot.info('== longtexts ==')
        for longtext in longtexts:
            pywikibot.info(f'\n== {longtext[0]} ==\n{longtext[1]}')
            pywikibot.input('(press enter)')
@staticmethod
def definedescription(language, existingdescription, suggestions):
    """Let the user choose a description for *language*.

    The existing description is listed as number 0, followed by the
    suggestions.

    :param language: language code the description is for
    :param existingdescription: current description, or None
    :param suggestions: iterable of suggested descriptions
    :return: the chosen suggestion, or None to keep the current
        description (answer 0, empty or non-numeric input, or a
        number that is not in the list)
    """
    possibilities = [existingdescription] + list(suggestions)

    pywikibot.info(f'\nSelect a description for language {language}:')
    pywikibot.info('Default is to keep the old value (0)')
    for i, pos in enumerate(possibilities):
        if pos is None:
            pywikibot.info(f'{i}: No description')
        else:
            pywikibot.info(f'{i}: {pos}')

    answer = pywikibot.input('Which one to choose? ')
    try:
        answer = int(answer)
    except ValueError:
        answer = 0

    # Numbers outside the list (including negative ones, which would
    # otherwise index from the end) keep the old value.
    if 0 < answer < len(possibilities):
        return possibilities[answer]
    return None
def definelabels(self, existinglabels, existingaliases, newnames):
    """Ask which of the names found should be added to the item.

    Names that equal the label or an alias of their language
    (ignoring case), that were rejected before (:attr:`noname`) or
    that occur twice are dropped. The user can accept all remaining
    names (Y), none (N), reject them for good (X) or decide per name.

    :param existinglabels: current labels by language
    :param existingaliases: current aliases by language
    :param newnames: iterable of (language, name) pairs
    :return: list of two dicts: new labels by language and the
        complete new alias lists by language
    """
    realnewnames = defaultdict(list)
    for language, name in newnames:
        name = name.strip()
        lowered = name.lower()
        if lowered == (existinglabels.get(language) or '').lower() \
           or lowered in (n.lower()
                          for n in existingaliases.get(language, [])):
            continue

        # .get() avoids creating an empty list for a language that
        # has no acceptable name
        if name in self.noname or name in realnewnames.get(language, ()):
            continue

        realnewnames[language].append(name)

    if not realnewnames:
        return [{}, {}]

    pywikibot.info('\nNew names found:')
    for language, names in realnewnames.items():
        for name in names:
            pywikibot.info(f'{language}: {name}')

    result = pywikibot.input('Add these names? (y/n/[S]elect/x) ')
    if not result or result[0].upper() not in 'YNX':
        chosennewnames = defaultdict(list)
        for language, names in realnewnames.items():
            for name in names:
                result = pywikibot.input(f'{language}: {name} - ')
                if not result or result[0].upper() == 'Y':
                    chosennewnames[language].append(name)
                elif result[0].upper() == 'X':
                    self.noname.add(name)

        realnewnames = chosennewnames
        result = 'Y'

    choice = result[0].upper()
    if choice == 'X':
        for names in realnewnames.values():
            self.noname.update(names)
        return [{}, {}]

    if choice == 'N':
        return [{}, {}]

    newlabels = {}
    newaliases = {}
    for language, names in realnewnames.items():
        if not names:
            continue

        known = existingaliases.get(language, [])
        if language in existinglabels:
            newaliases[language] = known + names
        else:
            # no label yet: the first name becomes the label
            newlabels[language] = names[0]
            newaliases[language] = known + names[1:]

    return [newlabels, newaliases]
def isclaim(self, value, claim):
    """Return whether *claim* already states *value*.

    *value* may carry a ``!date!``, ``!q!`` or ``!i!`` prefix. For a
    ``!q!`` value only the first number in it is compared. Apart from
    a plain text comparison with the claim's target, item claims are
    compared by title, commonsMedia claims by file name (underscores
    and spaces being equal) and time claims by their
    :meth:`showtime` text.

    :param value: the value as delivered by an analyzer
    :param claim: the claim to compare with
    :return: True if the claim has this value, else False (also if
        the value cannot be interpreted)
    """
    try:
        if value.startswith('!date!'):
            value = value[6:]

        if value.startswith('!q!'):
            number = re.search(r'\d+(?:\.\d+)?', value)
            if number is None:
                return False
            value = number[0]
        elif value.startswith('!i!'):
            value = value[3:].strip()

        if str(claim.getTarget()) == value:
            return True

        if claim.type == 'wikibase-item' \
           and claim.getTarget().title() == value:
            return True

        if claim.type == 'commonsMedia' \
           and claim.getTarget().title().split(
               ':', 1)[1].replace('_', ' ') == value.replace('_', ' '):
            return True

        if claim.type == 'time' \
           and self.showtime(claim.getTarget()) == self.showtime(
               self.createdateclaim(value)):
            return True

    except (ValueError, AttributeError, IndexError):
        # uninterpretable date, missing target or a file title
        # without namespace
        return False

    return False
def isinclaims(self, value, claims):
    """Return whether one of *claims* states *value*, see isclaim()."""
    for claim in claims:
        if self.isclaim(value, claim):
            return True
    return False
def getlocnumber(self, value, claims):
    """Return the index of the first of *claims* that states *value*.

    :raises ValueError: none of the claims has this value
    """
    for index, claim in enumerate(claims):
        if self.isclaim(value, claim):
            return index
    raise ValueError(f'No claim with value {value} found')
class Quasiclaim:

    """Minimal stand-in for a claim, holding just a target value.

    It always reports the type 'external-id'.
    """

    def __init__(self, title):
        """Initializer.

        :param title: the value to be returned by :meth:`getTarget`
        """
        self._target = title

    def _type(self):
        return 'external-id'

    type = property(_type, doc='The claim type, always external-id.')
    del _type

    def getTarget(self):  # noqa: N802
        """Return the target value of this QuasiClaim."""
        return self._target
class Analyzer:

    # matches one markup tag
    TAGRE = re.compile('<[^<>]*>')
    # matches a script element including its content
    SCRIPTRE = re.compile('(?s)<script.*?</script>')

    def __init__(self, id, data=None, item=None, bot=None):
        """Initializer.

        Assigns the default settings, then calls :meth:`setup` and
        finally sets ``site`` to the data repository of the default
        site.

        :param id: the identifier value to analyze
        :param data: mapping of stored answers by data type; a new
            empty one is used if None
        :param item: stored as ``self.item``
        :param bot: stored as ``self.bot``
        """
        self.id = id
        if data is None:
            self.data = defaultdict(dict)
        else:
            self.data = data

        # description of the database and of the urls to fetch
        self.dbname = None
        self.urlbase = None
        self.urlbase2 = None
        self.urlbase3 = self.urlbase4 = None
        self.showurl = True
        self.dbid = None
        self.dbitem = None
        self.dbproperty = None
        self.hrtre = None
        self.language = 'en'

        # how the page text is to be unescaped
        self.escapeunicode = False
        self.escapehtml = False
        self.escapeurl = False

        self.item = item
        self.iswiki = False
        self.sparqlquery = None
        self.isurl = False
        self.skipfirst = False
        self.bot = bot

        self.setup()
        self.site = pywikibot.Site().data_repository()
def setup(self):
    """To be used for putting data into subclasses.

    Called by the initializer after the default attribute values have
    been assigned and before ``self.site`` is set. The base
    implementation does nothing.
    """
@property
def url(self):
    """Primary URL for this identifier, or None without ``urlbase``."""
    template = self.urlbase
    if template is not None:
        return template.format(id=quote(self.id))
    # No URL template: only report the skip if there is no SPARQL
    # query to use instead.
    if not self.sparqlquery:
        pywikibot.info(
            f'\n### Skipping {self.dbname} ({self.dbproperty}) ###')
    return None

@property
def alturl(self):
    """URL built from ``urlbase2``, or None if that is not set."""
    if not self.urlbase2:
        return None
    return self.urlbase2.format(id=quote(self.id))

@property
def extraurls(self) -> List[str]:
    """URLs built from ``urlbase3`` and, if set, ``urlbase4``.

    ``urlbase4`` is only used when ``urlbase3`` is set as well.
    """
    if not self.urlbase3:
        return []
    templates = [self.urlbase3]
    if self.urlbase4:
        templates.append(self.urlbase4)
    return [template.format(id=quote(self.id)) for template in templates]
[docs] @staticmethod def commastrip(term): term = re.sub(r'(?:\s|&nbsp;)+', ' ', term) term = term.strip().strip(',').rstrip('.').strip() term = term.split('(')[0] if ',' in term: if term.split(',')[1].strip().lower() in ['jr', 'sr']: term += '.' else: if term.strip()[-1] != term.strip()[-1].lower(): term = term.strip() + '.' term = term.split(',', 1)[1] + ' ' + term.split(',', 1)[0] term = re.sub(r'\s*-\s*', '-', term) return unescape(term).strip()
def getdata(self, dtype, text, ask=True):
    """Translate a found string into an item id for type *dtype*.

    :param dtype: kind of thing the string stands for; first key into
        ``self.data``
    :param text: the string found on the page
    :param ask: whether to ask the user when the string is unknown
    :return: the stored or entered Q-id; None for empty text, for
        strings marked 'XXX', for unknown strings with ``ask=False``
        and when the user gives no usable answer
    """
    key = text.strip('. ').lower()
    for clutter in ('\\n', '\n', '%20'):
        key = key.replace(clutter, ' ')
    key = re.sub(' +', ' ', key.strip())
    if not key:
        return None

    if dtype in self.data and key in self.data[dtype]:
        stored = self.data[dtype][key]
        return None if stored == 'XXX' else stored

    if not ask:
        return None

    pywikibot.info(f"Trying to get a {dtype} out of '{key}'")
    answer = pywikibot.input(
        'Type Qnnn to let it point to Qnnn from now on,\n'
        'Xnnn to let it point to Qnnn only now,\n'
        'XXX to never use it, or nothing to not use it now')
    if answer.startswith('Q'):
        # remember the answer for later lookups
        self.data[dtype][key] = answer
        return answer
    if answer.upper() == 'XXX':
        # remember that this string must never be used
        self.data[dtype][key] = 'XXX'
        return None
    if answer.startswith('X'):
        # use it this one time only, without storing it
        return 'Q' + answer[1:]
    return None
def findclaims(self) -> List[Tuple[str, str, Optional['Analyzer']]]:
    """Load the page(s) for this identifier and collect possible claims.

    The loaded text is kept in ``self.html``. Every ``find...`` method
    is called with that text and each usable result becomes one entry
    of the returned list.

    :return: list of ``(property, value, analyzer)`` tuples. Dates are
        prefixed with ``!date!``, quantities with ``!q!`` and file
        names with ``!i!``. The third element is this analyzer, or
        None for instance-of and first/last name claims. The list is
        empty when there is nothing to load or loading failed.
    """
    if not self.id or not (self.url or self.sparqlquery):
        return []

    self.html = ''
    newclaims = []
    pywikibot.info()
    pagerequest = None
    # Try the primary URL template first and fall back to the
    # alternative one; templates that are not set are left out.
    urlbases = [base for base in (self.urlbase, self.urlbase2) if base]
    if not self.skipfirst and urlbases:
        for base in urlbases:
            # the url property formats whatever urlbase currently holds
            self.urlbase = base
            pywikibot.info(f'Getting {self.url}')
            with suppress(ServerError, ConnectionError):
                pagerequest = http.fetch(self.url)
                break
        else:
            pywikibot.info(f'Unable to load {self.url}')
            return []

    if pagerequest:
        self.html = pagerequest.text

    for extraurl in self.extraurls:
        pywikibot.info(f'Getting {extraurl}')
        try:
            pagerequest = http.fetch(extraurl)
        except (ServerError, ConnectionError):
            pywikibot.info('Unable to receive altpage')
        else:
            self.html += '\n' + pagerequest.text

    if self.sparqlquery:
        # a query result replaces anything loaded before
        self.html = str(sparql.SparqlQuery().select(self.sparqlquery))

    if not self.html:
        return []

    if self.escapeunicode:
        self.html = self.html.encode().decode('unicode-escape')
    if self.escapehtml:
        self.html = unescape(self.html)
    if self.escapeurl:
        self.html = unquote(self.html)
    self.html = self.prepare(self.html)

    pywikibot.info(f'\n=== {self.dbname} ({self.dbproperty}) ====')
    if self.hrtre:
        # show the human readable part of the page to the user
        match = re.compile('(?s)' + self.hrtre).search(self.html)
        if match:
            text = match[1]
            text = text.replace('\\n', '\n')
            text = text.replace('\\t', '\t')
            text = text.replace('\\r', '\n')
            text = text.replace('\r', '\n')
            text = text.replace('\t', ' ')
            # repeat until stable: removing one element can expose
            # another one
            oldtext = ''
            while oldtext != text:
                oldtext = text
                text = self.SCRIPTRE.sub('', text)
            oldtext = ''
            while oldtext != text:
                oldtext = text
                text = self.TAGRE.sub(' ', text)
            text = text.replace('&nbsp;', ' ')
            text = re.sub(' {2,}', ' ', text)
            text = re.sub('\n +', '\n', text)
            text = re.sub('\n{2,}', '\n', text)
            text = text.strip()
            pywikibot.info(text)
    # NOTE(review): SOURCE's collapsed layout does not show whether this
    # separator belongs inside the hrtre block; it is placed to match the
    # unconditional header line above - confirm against upstream.
    pywikibot.info('-' * (len(self.dbname) + 8))

    # single values that are stored without a reference to this source
    for (function, prop) in [
        (self.findinstanceof, 'P31'),
        (self.findfirstname, 'P735'),
        (self.findlastname, 'P734'),
    ]:
        result = function(self.html)
        if result:
            newclaims.append((prop, result.strip(), None))

    # multiple values per property
    for (function, prop) in [
        (self.findcountries, 'P17'),
        (self.findspouses, 'P26'),
        (self.findpartners, 'P451'),
        (self.findworkplaces, 'P937'),
        (self.findresidences, 'P551'),
        (self.findoccupations, 'P106'),
        (self.findworkfields, 'P101'),
        (self.findpositions, 'P39'),
        (self.findtitles, 'P97'),
        (self.findemployers, 'P108'),
        (self.findranks, 'P410'),
        (self.findschools, 'P69'),
        (self.findethnicities, 'P172'),
        (self.findcrimes, 'P1399'),
        (self.findcomposers, 'P86'),
        (self.findmoviedirectors, 'P57'),
        (self.findartdirectors, 'P3174'),
        (self.findscreenwriters, 'P58'),
        (self.findproducers, 'P162'),
        (self.finddirectorsphotography, 'P344'),
        (self.findmovieeditors, 'P1040'),
        (self.findproductiondesigners, 'P2554'),
        (self.findsounddesigners, 'P5028'),
        (self.findcostumedesigners, 'P2515'),
        (self.findmakeupartists, 'P4805'),
        (self.findarchitects, 'P84'),
        (self.findgenres, 'P136'),
        (self.findengines, 'P408'),
        (self.findgamemodes, 'P404'),
        (self.findcast, 'P161'),
        (self.findmaterials, 'P186'),
        (self.finddevelopers, 'P178'),
        (self.findpublishers, 'P123'),
        (self.findprodcoms, 'P272'),
        (self.finddistcoms, 'P750'),
        (self.findoriglanguages, 'P364'),
        (self.findcolors, 'P462'),
        (self.findlanguagesspoken, 'P1412'),
        (self.findlanguages, 'P407'),
        (self.findnativelanguages, 'P103'),
        (self.findpseudonyms, 'P742'),
        (self.findparts, 'P527'),
        (self.findpartofs, 'P361'),
        (self.findinstruments, 'P1303'),
        (self.findlabels, 'P264'),
        (self.findsports, 'P641'),
        (self.findawards, 'P166'),
        (self.findnominations, 'P1411'),
        (self.findmemberships, 'P463'),
        (self.findsportteams, 'P54'),
        (self.findparties, 'P102'),
        (self.findbranches, 'P241'),
        (self.findconflicts, 'P607'),
        (self.findteampositions, 'P413'),
        (self.findpolitical, 'P1142'),
        (self.findstudents, 'P802'),
        (self.finddocstudents, 'P185'),
        (self.findteachers, 'P1066'),
        (self.findadvisors, 'P184'),
        (self.findinfluences, 'P737'),
        (self.finddegrees, 'P512'),
        (self.findmajors, 'P812'),
        (self.findparticipations, 'P1344'),
        (self.findnationalities, 'P27'),
        (self.findsportcountries, 'P1532'),
        (self.findreligions, 'P140'),
        (self.findchildren, 'P40'),
        (self.findsiblings, 'P3373'),
        (self.findkins, 'P1038'),
        (self.findincollections, 'P6379'),
        (self.findinworks, 'P1441'),
        (self.findmovements, 'P135'),
        (self.findorigcountries, 'P495'),
        (self.findwebpages, 'P973'),
        (self.findsources, 'P1343'),
        (self.findchoriginplaces, 'P1321'),
        (self.findpatronof, 'P2925'),
        (self.findnotableworks, 'P800'),
        (self.findparticipantins, 'P1344'),
        (self.findplatforms, 'P400'),
        (self.findfranchises, 'P8345'),
        (self.findvoices, 'P412'),
    ]:
        results = function(self.html) or []
        for result in results:
            # an item is never a value of one of its own properties
            if result is not None and str(result).strip() \
               and result != self.item:
                newclaims.append((prop, result.replace('\n', ' '), self))

    # multiple values, stored without a reference to this source
    for (function, prop) in [
        (self.findfirstnames, 'P735'),
        (self.findlastnames, 'P734'),
    ]:
        results = function(self.html) or []
        for result in results:
            if result is not None and str(result).strip() \
               and result != self.item:
                newclaims.append((prop, result.replace('\n', ' '), None))

    # single values
    for (function, prop) in [
        (self.findcountry, 'P17'),
        (self.findgender, 'P21'),
        (self.findfather, 'P22'),
        (self.findmother, 'P25'),
        (self.findreligion, 'P140'),
        (self.findadminloc, 'P131'),
        (self.findlocation, 'P276'),
        (self.findformationlocation, 'P740'),
        (self.findbirthplace, 'P19'),
        (self.finddeathplace, 'P20'),
        (self.findmannerdeath, 'P1196'),
        (self.findcausedeath, 'P509'),
        (self.findburialplace, 'P119'),
        (self.findorigcountry, 'P495'),
        (self.findnationality, 'P27'),
        (self.findethnicity, 'P172'),
        (self.findorientation, 'P91'),
        (self.findaddress, 'P969'),
        (self.findhaircolor, 'P1884'),
        (self.finduse, 'P366'),
        (self.findmountainrange, 'P4552'),
        (self.findviaf, 'P214'),
        (self.findrelorder, 'P611'),
        (self.findtwitter, 'P2002'),
        (self.findfacebook, 'P2013'),
        (self.findfacebookpage, 'P4003'),
        (self.findchoriginplace, 'P1321'),
        (self.findwebsite, 'P856'),
        (self.findvoice, 'P412'),
        (self.findfamily, 'P53'),
        (self.findgens, 'P5025'),
        (self.findchesstitle, 'P2962'),
        (self.findfeastday, 'P841'),
        (self.findbloodtype, 'P1853'),
        (self.findeyecolor, 'P1340'),
    ]:
        result = function(self.html)
        # skip Wikipedia as official website and the bare 'pages'
        # path element as a Facebook id
        if result and not (
            prop == 'P856' and 'wikipedia.org' in result
            or prop in ['P2013', 'P4003'] and result == 'pages'
        ):
            newclaims.append((prop, result.strip(), self))

    # single dates; uncertain ones and those without a year are skipped
    for (function, prop) in [
        (self.findbirthdate, 'P569'),
        (self.finddeathdate, 'P570'),
        (self.findbaptismdate, 'P1636'),
        (self.findburialdate, 'P4602'),
        (self.findinception, 'P571'),
        (self.findpremiere, 'P1191'),
        (self.finddissolution, 'P576'),
        (self.findpubdate, 'P577'),
        (self.findfloruitstart, 'P2031'),
        (self.findfloruitend, 'P2032'),
    ]:
        result = function(self.html)
        if result:
            result = result.strip()
            if '?' not in result and re.search(r'\d{3}', result):
                newclaims.append((prop, '!date!' + result, self))

    # multiple dates
    for (function, prop) in [
        (self.findpubdates, 'P577'),
    ]:
        results = function(self.html) or []
        for result in results:
            result = result.strip()
            if '?' not in result and re.search(r'\d{3}', result):
                newclaims.append((prop, '!date!' + result, self))

    # floruit: either a single date or a 'start-end' range
    for function in [self.findfloruit]:
        result = function(self.html)
        if result:
            result = result.strip().lstrip('(').rstrip(')')
            result = result.replace('–', '-').replace('‑', '-')
            if '-' in result:
                (start, end) = (r.strip() for r in result.split('-', 1))
                if start == end:
                    newclaims.append(('P1317', '!date!' + start, self))
                else:
                    newclaims.append(('P2031', '!date!' + start, self))
                    newclaims.append(('P2032', '!date!' + end, self))
            else:
                newclaims.append(
                    ('P1317', '!date!' + result.strip(), self))

    # integer values
    for (function, prop) in [
        (self.findfloorsabove, 'P1101'),
        (self.findfloorsbelow, 'P1139'),
    ]:
        result = function(self.html)
        if result:
            newclaims.append((prop, str(int(result)), self))

    # quantities
    for (function, prop) in [
        (self.findheights, 'P2048'),
        (self.findweights, 'P2067'),
        (self.findelevations, 'P2044'),
        (self.finddurations, 'P2047'),
        (self.findprominences, 'P2660'),
        (self.findisolations, 'P2659'),
    ]:
        results = function(self.html) or []
        for result in results:
            if result and result.strip():
                newclaims.append((prop, '!q!' + result, self))

    # media files
    for (function, prop) in [
        (self.findimage, 'P18'),
        (self.findcoatarms, 'P94'),
        (self.findsignature, 'P109'),
        (self.findlogo, 'P154'),
    ]:
        result = function(self.html)
        if result:
            result = re.sub('(<.*?>)', '', result)
            result = result.split('>')[-1]
            # a file name needs an extension
            if len(result.strip()) > 2 and '.' in result:
                newclaims.append((prop, '!i!' + result, self))

    result = self.findisni(self.html)
    if result:
        m = re.search(r'(\d{4})\s*(\d{4})\s*(\d{4})\s*(\w{4})', result)
        if m:
            newclaims.append(('P213', '{} {} {} {}'
                              .format(*m.groups()), self))

    # identifiers of other databases, with per-property clean-up
    for (prop, result) in self.findmixedrefs(self.html) or []:
        if result is not None:
            result = result.strip()
            if prop in ['P1309', 'P1255']:
                result = result.replace('vtls', '')
            elif prop == 'P1368':
                result = result.split('-')[-1]
            elif prop == 'P409':
                result = result.strip().lstrip('0')
            elif prop == 'P396' and '\\' not in result:
                result = result.replace('%5C', '\\')
                if '\\' not in result:
                    m = re.match(r'^(.*?)(\d+)', result)
                    # without digits the id cannot be reformatted
                    if m:
                        result = 'IT\\ICCU\\{}\\{}'.format(*m.groups())
            if result:
                newclaims.append((prop, result, self))

    pywikibot.info()
    # coordinates are only shown; they are not added as a claim
    for (function, prop) in [
        (self.findcoords, 'coordinates'),
    ]:
        result = function(self.html)
        if result:
            pywikibot.info(f'Please add yourself: {prop} - {result}')
    return newclaims
def prepare(self, html: str):
    """Hook to adjust the loaded text before it is searched.

    This implementation gives the text back unchanged.
    """
    return html
[docs] @staticmethod def singlespace(text): text = text.replace('\n', ' ') while ' ' in text: text = text.replace(' ', ' ') return text.strip()
def getdescriptions(self):
    """Return ``(language, description)`` pairs found in ``self.html``.

    Descriptions from ``finddescriptions`` get the analyzer's own
    language, those from ``findlanguagedescriptions`` come with their
    language. Tags are replaced by spaces, entities are unescaped and
    whitespace is reduced; empty descriptions are left out.
    """
    def tidy(raw):
        return self.singlespace(unescape(self.TAGRE.sub(' ', raw)))

    descriptions = []
    for raw in self.finddescriptions(self.html) or []:
        if raw:
            descriptions.append((self.language, tidy(raw)))
    for language, raw in self.findlanguagedescriptions(self.html) or []:
        if raw:
            descriptions.append((language, tidy(raw)))
    return descriptions
def longtext(self):
    """Return the long text of the page as plain, compact text.

    Tags in the result of ``findlongtext`` are replaced by spaces,
    tabs become line breaks, carriage returns are dropped, and runs of
    spaces and empty lines are collapsed.
    """
    result = self.TAGRE.sub(' ', self.findlongtext(self.html) or '')
    result = result.replace('\t', '\n').replace('\r', '')
    # An explicit "two or more spaces" pattern instead of a
    # replace-until-stable loop: it cannot loop forever.
    result = re.sub(' {2,}', ' ', result)
    # at most one space is left on either side of a line break now
    result = result.replace('\n ', '\n')
    result = result.replace(' \n', '\n')
    result = re.sub('\n{2,}', '\n', result)
    return result.strip()
def finddescriptions(self, html: str):
    """Return a one-element list with the result of ``finddescription``."""
    description = self.finddescription(html)
    return [description]
def getlanguage(self, code):
    """Map a language code found on a page to the code used here.

    An empty code gives the analyzer's own language. Codes in the
    translation table are replaced. Trailing digits 1-9 are dropped
    one at a time, re-checking after each. Anything else is returned
    with underscores turned into hyphens.
    """
    translation = {
        'cz': 'cs',
        'hbo': 'he',
        'simple': 'en',
        'be-tarask': 'be-x-old',
        'nb': 'no',
    }
    while code:
        if code in translation:
            return translation[code]
        if code[-1] not in '123456789':
            return code.replace('_', '-')
        code = code[:-1]
    return self.language
def findwikipedianames(self, html: str):
    """Derive ``(language, name)`` pairs from Wikipedia links in *html*.

    The language comes from the subdomain, the name from the page
    title: underscores become spaces, URL quoting and HTML entities
    are decoded and everything from the first '(' on is dropped.
    """
    names = []
    for link in self.findallbyre(
            r'//(\w+\.wikipedia\.org/wiki/[^\'"<>\s]+)', html):
        language = self.getlanguage(link.split('.')[0])
        title = link.split('/')[-1].replace('_', ' ')
        title = unescape(unquote(title))
        names.append((language, title.split('(')[0]))
    return names
def getnames(self):
    """Collect ``(language, name)`` pairs found in ``self.html``.

    Names from ``findnames`` get the analyzer's own language, those
    from ``findlanguagenames`` get their mapped language; both are
    normalized with ``commastrip`` and blank ones are left out. The
    names derived from Wikipedia links are added at the end.
    """
    names = []
    for term in self.findnames(self.html) or []:
        if term and term.strip():
            names.append((self.language, self.commastrip(term)))
    for language, term in self.findlanguagenames(self.html) or []:
        if term and term.strip():
            names.append(
                (self.getlanguage(language), self.commastrip(term)))
    return names + self.findwikipedianames(self.html)
def __getattr__(self, name): """Return None if the function is not defined in subclass.""" prefix = 'find' funcnames = { 'address', 'adminloc', 'advisors', 'architects', 'artdirectors', 'awards', 'baptismdate', 'birthdate', 'birthplace', 'bloodtype', 'branches', 'burialdate', 'burialplace', 'cast', 'causedeath', 'chesstitle', 'children', 'choriginplace', 'choriginplaces', 'coatarms', 'colors', 'composers', 'conflicts', 'coords', 'costumedesigners', 'countries', 'country', 'crimes', 'deathdate', 'deathplace', 'degrees', 'description', 'developers', 'directorsphotography', 'dissolution', 'distcoms', 'docstudents', 'durations', 'elevations', 'employers', 'engines', 'ethnicities', 'ethnicity', 'eyecolor', 'facebook', 'facebookpage', 'family', 'father', 'feastday', 'firstname', 'firstnames', 'floorsabove', 'floorsbelow', 'floruit', 'floruitend', 'floruitstart', 'formationlocation', 'franchises', 'gamemodes', 'gender', 'genres', 'gens', 'haircolor', 'height', 'heights', 'image', 'inception', 'incollections', 'influences', 'instanceof', 'instruments', 'inworks', 'isni', 'isolations', 'kins', 'labels', 'languagedescriptions', 'languagenames', 'languages', 'languagesspoken', 'lastname', 'lastnames', 'location', 'logo', 'longtext', 'majors', 'makeupartists', 'mannerdeath', 'materials', 'memberships', 'mixedrefs', 'mother', 'mountainrange', 'movements', 'moviedirectors', 'movieeditors', 'nationalities', 'nationality', 'nativelanguages', 'nominations', 'notableworks', 'occupations', 'orientation', 'origcountries', 'origcountry', 'origlanguages', 'participantins', 'participations', 'parties', 'partners', 'partofs', 'parts', 'patronof', 'platforms', 'political', 'positions', 'premiere', 'prodcoms', 'producers', 'productiondesigners', 'prominences', 'pseudonyms', 'pubdate', 'pubdates', 'publishers', 'ranks', 'religion', 'religions', 'relorder', 'residences', 'schools', 'screenwriters', 'siblings', 'signature', 'sounddesigners', 'sources', 'sportcountries', 'sports', 'sportteams', 
'spouses', 'students', 'teachers', 'teampositions', 'titles', 'twitter', 'use', 'viaf', 'voice', 'voices', 'webpages', 'website', 'weight', 'weights', 'workfields', 'workplaces', } pre, sep, post = name.partition(prefix) if not pre and sep == prefix and post in funcnames: return lambda html: None if not pre and sep == prefix and post == 'names': return lambda html: [] return super().__getattribute__(name)
def finddefaultmixedrefs(self, html, includesocial=True):
    """Find identifiers of other databases by their URLs in *html*.

    :param html: the text to search
    :param includesocial: whether to look for social media and similar
        account links as well
    :return: list of ``(property, value)`` pairs. The analyzer's own
        property and a number of known false positives are left out.
    """
    # Each pattern captures the identifier as group 1.
    patterns = [
        ('P214', r'viaf.org/(?:viaf/)?(\d+)'),
        ('P227', r'd-nb\.info/(?:gnd/)?([\d\-xX]+)'),
        ('P244', r'id\.loc\.gov/authorities/\w+/(\w+)'),
        ('P244', r'https?://lccn\.loc\.gov/(\w+)'),
        ('P245', r'https?://www.getty.edu/[^"\'\s]+subjectid=(\w+)'),
        ('P245', r'getty.edu/page/ulan/(\w+)'),
        ('P268', r'https?://catalogue.bnf.fr/ark./\d+/(?:cb)?(\w+)'),
        ('P268', r'data\.bnf\.fr/ark:/\d+/cb(\w+)'),
        ('P269', r'https?://\w+.idref.fr/(\w+)'),
        ('P345', r'https?://www.imdb.com/\w+/(\w+)'),
        ('P349', r'https?://id.ndl.go.jp/auth/[^"\'\s]+/(\w+)'),
        ('P396',
         r'opac\.sbn\.it/opacsbn/opac/[^<>\'"\s]+\?bid=([^\s\'"<>]+)'),
        ('P409', r'https?://nla.gov.au/anbd.aut-an(\w+)'),
        ('P434', r'https?://musicbrainz.org/\w+/([\w\-]+)'),
        ('P496', r'https?://orcid.org/([\d\-]+)'),
        ('P535', r'https?://www.findagrave.com/memorial/(\w+)'),
        ('P535',
         r'https?://www.findagrave.com/cgi-bin/fg.cgi\?[^<>"\']*id=(\w+)'),
        ('P549', r'genealogy.math.ndsu.nodak.edu/id.php\?id=(\w+)'),
        ('P650', r'https?://rkd.nl(?:/\w+)?/explore/artists/(\w+)'),
        ('P651', r'biografischportaal\.nl/persoon/(\w+)'),
        ('P723', r'dbnl\.(?:nl|org)/auteurs/auteur.php\?id=(\w+)'),
        ('P723', r'data.bibliotheken.nl/id/dbnla/(\w+)'),
        ('P866', r'perlentaucher.de/autor/([\w\-]+)'),
        ('P902', r'hls-dhs-dss.ch/textes/\w/[A-Z]?(\d+)\.php'),
        ('P906', r'libris.kb.se/(?:resource/)?auth/(\w+)'),
        ('P950', r'catalogo.bne.es/[^"\'\s]+authority.id=(\w+)'),
        ('P1006', r'data.bibliotheken.nl/id/thes/p(\d+X?)'),
        ('P1047', r'catholic-hierarchy.org/\w+/b(.+?)\.html'),
        ('P1220', r'//ibdb.com/person.php\?id=(\d+)'),
        ('P1233', r'https?://www.isfdb.org/cgi-bin/ea.cgi\?(\d+)'),
        ('P1415', r'doi\.org/\d+\.\d+/ref:odnb/(\d+)'),
        ('P1417', r'https://www.britannica.com/([\w\-/]+)'),
        ('P1422', r'ta.sandrartnet/-person-(\w+)'),
        ('P1563',
         r'https?://www-history.mcs.st-andrews.ac.uk/Biographies/([^\'"<>\s]+)'),
        # digits after 'mn'; the pattern used to read 'mn/d+'
        ('P1728', r'https?://www.allmusic.com/artist/[\w\-]*?(mn\d+)'),
        ('P1749',
         r'https?://www.parlement(?:airdocumentatiecentrum)?.(?:com|nl)/id/(\w+)'),
        ('P1788',
         r'huygens.knaw.nl/vrouwenlexicon/lemmata/data/([^"\'<>\s]+)'),
        ('P1802',
         r'https?://emlo.bodleian.ox.ac.uk/profile/person/([\w\-]+)'),
        ('P1842', r'https?://gameo.org/index.php\?title=([^\'"\s]+)'),
        ('P1871',
         r'https?://(?:data|thesaurus).cerl.org/(?:thesaurus|record)/(\w+)'),
        ('P1871', r'thesaurus.cerl.org/cgi-bin/record.pl\?rid=(\w+)'),
        ('P1902', r'https?://open.spotify.com/artist/(\w+)'),
        ('P1907', r'https?://adb.anu.edu.au/biography/([\w\-]+)'),
        ('P1938', r'https?://www.gutenberg.org/ebooks/author/(\d+)'),
        # the optional path prefix must not be a capturing group, or it
        # is returned in place of the id
        ('P1953', r'https?://www.discogs.com/(?:\w+/)?artist/(\d+)'),
        ('P1986',
         r'treccani.it/enciclopedia/([\w\-_]+)_\(Dizionario-Biografico\)'),
        ('P2016', r'hoogleraren\.ub\.rug\.nl/hoogleraren/(\w+)'),
        ('P2038', r'https?://www.researchgate.net/profile/([^\'"<>\s\?]+)'),
        ('P2163', r'id\.worldcat\.org/fast/(\d+)'),
        ('P2332', r'/arthistorians\.info/(\w+)'),
        ('P2372', r'odis\.be/lnk/([\w_]+)'),
        ('P2373', r'https?://genius.com/artists/([^\s\'"]*)'),
        ('P2397', r'youtube\.com/channel/([\w\-_]+)'),
        ('P2454', r'https?://www.dwc.knaw.nl/[^\'"\s]+=(\w+)'),
        ('P2456', r'https?://dblp.uni-trier.de/pid/([\w/]+)'),
        ('P2469', r'theatricalia.com/person/(\w+)'),
        ('P2639', r'filmportal.de/person/(\w+)'),
        ('P2722', r'deezer.com/artist/(\w+)'),
        ('P2799', r'cervantesvirtual.com/person/(\d+)'),
        ('P2850', r'https?://itunes.apple.com(?:/\w{2})?/(?:id)?(\d+)'),
        ('P2909', r'https?://www.secondhandsongs.com/artist/(\w+)'),
        ('P2915', r'vondel.humanities.uva.nl/ecartico/persons/(\d+)'),
        ('P2941', r'munksroll.rcplondon.ac.uk/Biography/Details/(\d+)'),
        ('P2949', r'www\.wikitree\.com/wiki/(\w+-\d+)'),
        ('P2963', r'goodreads\.com/author/show/(\d+)'),
        ('P2969', r'goodreads\.com/book/show/(\d+)'),
        ('P3040', r'https?://soundcloud.com/([\w\-]+)'),
        ('P3192', r'https?://www.last.fm/music/([^\'"\s]+)'),
        ('P3217',
         r'https?://sok.riksarkivet.se/sbl/Presentation.aspx\?id=(\d+)'),
        ('P3217', r'https?://sok.riksarkivet.se/sbl/artikel/(\d+)'),
        ('P3241', r'https?://www.newadvent.org/cathen/(\w+)\.htm'),
        ('P3265', r'https?://myspace.com/([\w\-_/]+)'),
        ('P3365', r'treccani.it/enciclopedia/([\w\-_]+)'),
        ('P3368', r'https?://prabook.com/web/[^/<>"\']+/(\d+)'),
        ('P3368', r'prabook.com/web/person-view.html\?profileId=(\d+)'),
        ('P3435', r'vgmdb\.net/artist/(\w+)'),
        ('P3478', r'songkick\.com/artists/(\w+)'),
        ('P3630', r'https?://www.babelio.com/auteur/[^<>\'"\s]+/(\d+)'),
        ('P3854', r'soundtrackcollector.com/\w+/(\w+)'),
        ('P4013', r'https?://giphy.com/(\w+)'),
        ('P4073', r'(\w+)\.wikia\.com'),
        ('P4198', r'play.google.com/store/music/artist\?id=(\w+)'),
        ('P4223',
         r'treccani.it/enciclopedia/([\w\-_]+)_\(Enciclopedia-Italiana\)'),
        ('P4228', r'www.eoas.info/biogs/([^\s]+)\.html'),
        ('P4228', r'www.eoas.info%2Fbiogs%2F([^\s]+)\.html'),
        ('P4252', r'www.mathnet.ru/[\w/\.]+\?.*?personid=(\w+)'),
        ('P4862', r'https?://www.amazon.com/[\w\-]*/e/(\w+)'),
        ('P5357', r'sf-encyclopedia.com/entry/([\w_]+)'),
        ('P5404', r'rateyourmusic.com/artist/([^\'"<>\s]+)'),
        ('P5431', r'https?://www.setlist.fm/setlists/[\w\-]*?(\w+).html'),
        ('P5570', r'www.noosfere.org/[\w\./]+\?numauteur=(\w+)'),
        ('P5882', r'www\.muziekweb\.nl/\w+/(\w+)'),
        ('P5924', r'lyrics.wikia.com/wiki/([^\'"<>\s]*)'),
        # the id needs a capturing group; without one every match made
        # findbyre fail on the missing group 1
        ('P6194', r'biographien\.ac\.at/oebl/oebl_\w/([^\s\.]+)\.'),
        ('P6517', r'whosampled.com/([^\'"<>/\s]+)'),
        ('P6594', r'gf\.org/fellows/all-fellows/([\w\-]+)'),
        ('P7032',
         r'historici.nl/Onderzoek/Projecten/Repertorium/app/personen/(\d+)'),
        ('P7032',
         r'repertoriumambtsdragersambtenaren1428-1861/app/personen/(\d+)'),
        ('P7195', r'https?://www.bandsintown.com/\w+/(\d+)'),
        ('P7545', r'https?://www.askart.com/artist/[\w_]*/(\d+)/'),
        ('P7620',
         r'treccani.it/enciclopedia/([\w\-]+)_\(Enciclopedia_dei_Papi\)'),
        ('P7902', r'www.deutsche-biographie.de/pnd(\w+)\.html'),
        ('P8034', r'viaf.org/viaf/sourceID/BAV\|(\w+)'),
        ('P9029', r'viceversalitterature\.ch/author/(\d+)'),
    ]
    if includesocial:
        patterns += [
            ('P2002', r'https?://(?:www\.)?twitter.com/#?(\w+)'),
            ('P2003', r'https?://(?:\w+\.)?instagram.com/([^/\s\'"]{2,})'),
            ('P2013', r'https?://www.facebook.com/(?:pg/)?([^/\s\'"<>\?]+)'),
            ('P2847', r'https?://plus.google.com/(\+?\w+)'),
            ('P2850',
             r'https?://itunes.apple.com/(?:\w+/)?artist/(?:\w*/)?[a-z]{0,2}(\d{3,})'),
            ('P3258', r'https?://([\w\-]+)\.livejournal.com'),
            ('P3258', r'https?://users\.livejournal.com/(\w+)'),
            ('P3265', r'https?://www.myspace.com/([\w\-]+)'),
            ('P3283', r'https?://([^/"\']+)\.bandcamp.com'),
            ('P4003', r'https?://www.facebook.com/pages/([^\s\'"<>\?]+)'),
            ('P4175', r'https://www.patreon.com/([\w\-]+)'),
            ('P6634', r'\.linkedin\.com/in/([\w\-]+)'),
        ]

    result = []
    for prop, regex in patterns:
        value = self.findbyre(regex, html)
        if prop == 'P2639':
            # these ids are stored in lower case
            value = (value or '').lower() or None
        # the database itself is not a reference for its own id
        if prop != self.dbproperty:
            result.append((prop, value))

    isniresult = re.search(
        r'isni\.org/isni/(\d{4})(\d{4})(\d{4})(\w{4})', html)
    if isniresult:
        result.append(('P213', '{} {} {} {}'.format(*isniresult.groups())))
    commonsresult = self.findbyre(
        r'commons\.wikimedia\.org/wiki/\w+:([^\'"<>\s]+)', html)
    if commonsresult:
        result += [('P18', '!i!' + commonsresult)]

    # drop patterns that did not match and path elements that are
    # known not to be identifiers
    return [r for r in result if r[1]
            and not (r[0] == 'P2002' and r[1] == 'intent')
            and not (r[0] == 'P2013' and r[1].startswith('pages'))
            and not (r[0] == 'P2013'
                     and r[1] in ['pg', 'plugins', 'sharer'])
            and not (r[0] == 'P214' and r[1].lower() == 'sourceid')
            and not (r[0] == 'P3258'
                     and r[1].lower() in ['users', 'comunity', 'www'])
            and r[1].lower() != 'search'
            and not (r[0] == 'P3365'
                     and ('(Dizionario_Biografico)' in r[1]
                          or '(Enciclopedia-Italiana)' in r[1]
                          or '(Enciclopedia-dei-Papi)' in r[1]))
            and not (r[0] == 'P2013' and '.php' in r[1])]
def findbyre(self, regex, html, dtype=None, skips=None, alt=None) -> str:
    """Search *html* for *regex* and return group 1, possibly translated.

    :param regex: pattern with the wanted text as group 1
    :param html: the text to search
    :param dtype: if given, the found string is translated through
        ``getdata`` for this type (which may ask the user)
    :param skips: types for which a known translation of the found
        string means that nothing is returned
    :param alt: further types whose known translation is accepted
    :return: the translation or the found string; None without a match
        or when a skip type applies
    """
    found = re.search(regex, html)
    if not found:
        return None
    captured = found[1]

    # dtype itself is the first type to look for a known answer
    lookups = list(alt) if alt else []
    if dtype:
        lookups = [dtype] + lookups

    for kind in lookups:
        known = self.getdata(kind, captured, ask=False)
        if known and known != 'XXX':
            return known

    for kind in skips or []:
        known = self.getdata(kind, captured, ask=False)
        if known and known != 'XXX':
            return None

    if not dtype:
        return captured
    return self.getdata(dtype, captured)
def findallbyre(self, regex, html, dtype=None, skips=None,
                alt=None) -> List[str]:
    """Find all matches of *regex* in *html*, possibly translated.

    Works like ``findbyre`` for every match. Duplicates are removed;
    the order of the result is not defined.

    :param regex: pattern for ``re.findall``
    :param html: the text to search
    :param dtype: if given, matches are translated through ``getdata``
        for this type (which may ask the user); matches without a
        translation are left out
    :param skips: types for which a known translation of a match means
        that the match is left out
    :param alt: further types whose known translation is accepted
    """
    lookups = list(alt) if alt else []
    if dtype:
        lookups = [dtype] + lookups
    blockers = skips or []

    def known(kind, text):
        answer = self.getdata(kind, text, ask=False)
        return answer if answer and answer != 'XXX' else None

    collected = set()
    for hit in re.findall(regex, html):
        translated = None
        for kind in lookups:
            translated = known(kind, hit)
            if translated is not None:
                break
        if translated is not None:
            collected.add(translated)
            continue
        if any(known(kind, hit) is not None for kind in blockers):
            continue
        if dtype:
            answer = self.getdata(dtype, hit)
            if answer:
                collected.add(answer)
        else:
            collected.add(hit)
    return list(collected)
class IsniAnalyzer(Analyzer):

    """Analyzer for International Standard Name Identifier (P213)."""

    def setup(self):
        """Set the database data and the URLs for ISNI."""
        self.dbproperty = 'P213'
        self.dbid = 'Q423048'
        self.dbname = 'International Standard Name Identifier'
        # the id is used in the URLs as four '+'-separated groups
        compact = self.id.replace(' ', '')
        self.id = '+'.join(
            [compact[:4], compact[4:8], compact[8:12], compact[12:]])
        self.urlbase = 'http://www.isni.org/{id}'
        self.urlbase3 = 'https://isni.oclc.org/DB=1.2/CMD?ACT=SRCH&IKT=8006&TRM=ISN%3A{id}&TERMS_OF_USE_AGREED=Y&terms_of_use_agree=send'
        # only the extra URL is loaded, not the primary one
        self.skipfirst = True
        self.hrtre = '(<span class="rec.mat.long">.*?</span>)Sources'
        self.isperson = False
        self.language = 'en'

    @property
    def url(self):
        """Return the isni.org URL for this id."""
        # TODO: check whether this is right or needed
        return f'http://www.isni.org/{self.id}'.replace(' ', '')

    def findlanguagenames(self, html: str):
        """Return the names in the 'Name' table section, as English."""
        # TODO: check whether this is right or needed
        section = self.findbyre(r'(?s)>Name</td></tr>(.*?)</tr>', html)
        if not section:
            return []
        return [('en', name) for name in self.findallbyre(
            r'(?s)<span>(.*?)(?:\([^{}<>]*\))?\s*</span>', section)]

    def getvalues(self, field, html, dtype=None) -> List[str]:
        """Return the values listed for the record field *field*.

        :param field: label of the record field, without the colon
        :param html: the text to search
        :param dtype: passed on to ``findallbyre`` to translate values
        """
        # 'rec_lable' is the spelling the pattern has always matched.
        # The field name is made part of the pattern and only the
        # matching section is searched, so that every field yields its
        # own values.
        section = self.findbyre(
            '(?s)<td class="rec_lable"><div><span>' + re.escape(field)
            + ':.*?<td class="rec_title">(.*?)</td>', html)
        if section:
            return self.findallbyre('<span>(.*?)<', section, dtype)
        return []

    def findnames(self, html) -> List[str]:
        """Return the 'Name' values up to the first '('."""
        return [self.findbyre(r'([^\(]*)', name)
                for name in self.getvalues('Name', html)]

    def finddescriptions(self, html: str):
        """Return the parenthesized parts of the 'Name' values."""
        return [self.findbyre(r'\((.*?)\)', name)
                for name in self.getvalues('Name', html)]

    def findinstanceof(self, html: str):
        """Return the record type; also records whether it is a person."""
        result = self.findbyre(
            r'<span class="rec.mat.long"><img alt="(.*?)"', html,
            'instanceof')
        self.isperson = result == 'Q5'
        return result

    def findmixedrefs(self, html: str):
        """Return the default references, without social media."""
        return self.finddefaultmixedrefs(html, includesocial=False)

    def findoccupations(self, html: str):
        """Return the 'Creation role' values, for persons only."""
        if self.isperson:
            return self.getvalues('Creation role', html, 'occupation')
        return None

    def findbirthdate(self, html: str):
        """Return the part before the '-' of the first 'Dates' value."""
        if self.isperson:
            dates = self.getvalues('Dates', html)
            if dates:
                return self.findbyre(r'(.*?)-', dates[0])
        return None

    def finddeathdate(self, html: str):
        """Return the part after the '-' of the first 'Dates' value."""
        if self.isperson:
            dates = self.getvalues('Dates', html)
            if dates:
                # greedy: a lazy group at the end of the pattern always
                # matches the empty string
                return self.findbyre(r'-(.*)', dates[0])
        return None
class ViafAnalyzer(Analyzer):
    """Analyzer for Virtual International Authority File (P214) records."""

    def setup(self):
        """Set the database constants and the source-to-language table."""
        self.dbproperty = 'P214'
        self.dbid = 'Q54919'
        self.dbname = 'Virtual International Authority File'
        self.urlbase = 'https://viaf.org/viaf/{id}/'
        self.hrtre = '(<ns1:Document.*?)<ns1:history>'
        self.language = 'en'
        self.escapehtml = True
        # Language code assigned to names/descriptions per contributing
        # source code found in <ns1:s> elements.
        self.sourcelanguage = {
            'DNB': 'de',
            'LC': 'en',
            'JPG': 'en',
            'SUDOC': 'fr',
            'NDL': 'ja',
            'NLA': 'en',
            'NKC': 'cs',
            'SELIBR': 'sv',
            'NLI': 'he',
            'BNE': 'es',
            'PTBNP': 'pt',
            'NTA': 'nl',
            'BIBSYS': 'nb',
            'BAV': 'en',
            'NUKAT': 'pl',
            'BNC': 'ca',
            'EGAXA': 'en',
            'LNB': 'lv',
            'NSK': 'hr',
            'LAC': 'en',
            'NLP': 'pl',
            'BNCHL': 'es',
            'N6I': 'en',
            'FAST': 'en',
            'RERO': 'fr',
            'B2Q': 'fr',
            'DBC': 'da',
            'BLBNB': 'pt',
            'KRNLK': 'ko',
            'ISNI': 'en',
            'BNF': 'fr',
            'DE663': 'de',
            'WKP': 'en',
            'VLACC': 'nl',
            'ERRR': 'et',
            'NII': 'ja',
            'BNL': 'fr',
            'SWNL': 'fr',
            'NLR': 'ru',
            'ICCU': 'it',
            'LNL': 'ar',
            'W2Z': 'nb',
            'LIH': 'lt',
            'UIY': 'is',
            'CAOONL': 'en',
            'SIMACOB': 'sl',
            'CYT': 'zh',
            'SZ': 'de',
            'PLWABN': 'pl',
            'NLB': 'en',
            'SKMASNL': 'sk',
            'ARBABN': 'es',
            'J9U': 'he',
            'GRATEVE': 'el',
        }

    def getid(self, name, html):
        """Return the identifier following ``>name|`` with spaces removed.

        Returns ``None`` when the source *name* does not occur.
        """
        result = self.findbyre(fr'>{name}\|([^<>]+)', html)
        if result:
            return result.replace(' ', '')
        return None

    def _findlanguagesubfields(self, code, html):
        """Return a set of (language, text) pairs for subfields *code*.

        Inside every ``<ns1:xNNN>`` section each subfield text is paired
        with the language of every source listed in that section. Texts
        that did not get any language that way are added as English.
        """
        pattern = fr'<ns1:subfield code="{code}">(.*?)<'
        result = set()
        for section in self.findallbyre(r'(?s)<ns1:x\d+>(.*?)</ns1:x\d+>', html):
            sources = self.findallbyre(r'<ns1:s>(.*?)<', section)
            for text in self.findallbyre(pattern, section):
                for source in sources:
                    # Sources missing from the table are skipped instead
                    # of raising KeyError; such texts still end up in the
                    # English fallback below.
                    language = self.sourcelanguage.get(source)
                    if language:
                        result.add((language, text))
        seen = {text for _, text in result}
        for text in self.findallbyre(pattern, html):
            if text not in seen:
                result.add(('en', text))
        return result

    def findlanguagenames(self, html: str):
        """Return (language, name) pairs taken from the "a" subfields."""
        return self._findlanguagesubfields('a', html)

    def findlanguagedescriptions(self, html: str):
        """Return (language, description) pairs taken from the "c" subfields."""
        return self._findlanguagesubfields('c', html)

    def findgender(self, html: str):
        """Return the gender from the ``<ns1:gender>`` element."""
        return self.findbyre(r'<ns1:gender>([^<>]+)</ns1:gender>', html, 'gender')

    def findnationalities(self, html: str):
        """Return the countries listed under ``<ns1:nationalityOfEntity>``."""
        section = self.findbyre(r'<ns1:nationalityOfEntity>(.*?)</ns1:nationalityOfEntity>', html)
        if section:
            return self.findallbyre(r'<ns1:text>([^<>]+)</ns1:text>', section, 'country')
        return None

    def findlanguagesspoken(self, html: str):
        """Return the languages listed under ``<ns1:languageOfEntity>``."""
        section = self.findbyre(r'<ns1:languageOfEntity>(.*?)</ns1:languageOfEntity>', html)
        if section:
            return self.findallbyre(r'<ns1:text>([^<>]+)</ns1:text>', section, 'language')
        return None

    def findoccupations(self, html: str):
        """Return the occupations from all ``<ns1:occupation>`` elements."""
        sections = self.findallbyre(r'<ns1:occupation>(.*?)</ns1:occupation>', html)
        section = '\n'.join(sections)
        return self.findallbyre(r'<ns1:text>(.*?)</ns1:text>', section, 'occupation')

    def findworkfields(self, html: str):
        """Return the subjects from all ``<ns1:fieldOfActivity>`` elements."""
        sections = self.findallbyre(r'<ns1:fieldOfActivity>(.*?)</ns1:fieldOfActivity>', html)
        section = '\n'.join(sections)
        return self.findallbyre(r'<ns1:text>(.*?)</ns1:text>', section, 'subject')

    def findmixedrefs(self, html: str):
        """Return (property, identifier) pairs for the linked authority files."""
        result = [
            ('P214', self.findbyre(r'<ns0:directto>(\d+)<', html)),
            ('P227', self.getid('DNB', html)),
            ('P244', self.getid('LC', html)),
            ('P245', self.getid('JPG', html)),
            ('P269', self.getid('SUDOC', html)),
            ('P271', self.getid('NII', html)),
            ('P349', self.getid('NDL', html)),
            ('P396', self.getid('ICCU', html)),
            ('P409', self.getid('NLA', html)),
            ('P691', self.getid('NKC', html)),
            ('P906', self.getid('SELIBR', html)),
            ('P949', self.getid('NLI', html)),
            ('P950', self.getid('BNE', html)),
            ('P1005', self.getid('PTBNP', html)),
            ('P1006', self.getid('NTA', html)),
            ('P1015', self.getid('BIBSYS', html)),
            ('P1017', self.getid('BAV', html)),
            ('P1207', self.getid('NUKAT', html)),
            ('P1255', self.getid('SWNL', html)),
            ('P1273', self.getid('BNC', html)),
            ('P1309', self.getid('EGAXA', html)),
            ('P1368', self.getid('LNB', html)),
            ('P1375', self.getid('NSK', html)),
            ('P1670', self.getid('LAC', html)),
            ('P1695', (self.getid('NLP', html) or '').upper() or None),
            # ('P1946', self.getid('N6I', html)), #obsolete
            ('P2163', self.getid('FAST', html)),
            # ('P3065', self.getid('RERO', html)),
            ('P3280', self.getid('B2Q', html)),
            ('P3348', self.getid('GRATEVE', html)),
            ('P3846', self.getid('DBC', html)),
            ('P4619', self.getid('BLBNB', html)),
            ('P5034', self.getid('KRNLK', html)),
            ('P5504', self.getid('DE663', html)),
            ('P7293', self.getid('PLWABN', html)),
            ('P7369', (self.getid('BNCHL', html) or '')[-9:] or None),
            ('P8034', (self.getid('BAV', html) or '').replace('_', '/') or None),
            ('P268', self.findbyre(r'"http://catalogue.bnf.fr/ark:/\d+/cb(\w+)"', html)),
            ('P1566', self.findbyre(r'"http://www.geonames.org/(\w+)"', html)),
        ]
        iccu = self.getid('ICCU', html)
        if iccu:
            result += [('P396', fr'IT\ICCU\{iccu[:4]}\{iccu[4:]}')]
        result += self.finddefaultmixedrefs(html)
        return result

    def findisni(self, html: str):
        """Return the ISNI identifier listed among the sources."""
        return self.getid('ISNI', html)

    def findnotableworks(self, html: str):
        """Return up to five works that are attested by more than two sources.

        Works are ranked by the number of ``<ns1:s>`` elements they contain.
        """
        works = self.findallbyre(r'<ns1:work>(.*?)</ns1:work>', html)
        ranked = sorted(
            ((len(re.findall('(<ns1:s>)', work)), work) for work in works),
            reverse=True)
        return [self.findbyre(r'<ns1:title>(.*?)<', work, 'work')
                for count, work in ranked[:5] if count > 2]
class GndAnalyzer(Analyzer):
    """Analyzer for Gemeinsame Normdatei (P227) records."""

    def setup(self):
        """Set the database constants."""
        self.dbproperty = 'P227'
        self.dbid = 'Q36578'
        self.dbname = 'Gemeinsame Normdatei'
        self.urlbase = 'https://portal.dnb.de/opac.htm?method=simpleSearch&cqlMode=true&query=nid%3D{id}'
        self.hrtre = '(<table id="fullRecordTable".*?/table>)'
        self.language = 'de'
        self.escapehtml = True

    def finddescriptions(self, html: str):
        """Return the "Weitere Angaben", "Systematik" and "Beruf(e)" cells."""
        patterns = (
            r'(?s)<strong>Weitere Angaben</strong>.*?<td[^<>]*>(.*?)</td>',
            r'(?s)<strong>Systematik</strong>.*?<td[^<>]*>\s*[^\s]+(.*?)</td>',
            r'(?s)<strong>Beruf\(e\)</strong>.*?<td[^<>]*>(.*?)</td>',
        )
        return [self.findbyre(pattern, html) for pattern in patterns]

    def findlongtext(self, html: str):
        """Return the full record table with whitespace flattened.

        Every whitespace character becomes a space and every ``<tr>``
        becomes a newline.
        """
        table = self.findbyre(r'(?s)(<table id="fullRecordTable" .*?</table>)', html) or ''
        flattened = re.sub(r'\s', ' ', table)
        return flattened.replace('<tr>', '\n')

    def findnames(self, html) -> List[str]:
        """Return the names found in the subject, person and synonym rows."""
        patterns = (
            r'(?s)<strong>Sachbegriff</strong>.*?(<td.*?>(.*?)</td>)',
            r'(?s)<strong>Person</strong>.*?(<td.*?>(.*?)</td>)',
            r'(?s)<strong>Synonyme</strong>.*?(<td.*?>(.*?)</td>)',
            r'(?s)<strong>Andere Namen</strong>.*?(<td.*?>(.*?)</td>)',
        )
        names = []
        for pattern in patterns:
            cell = self.findbyre(pattern, html)
            if cell:
                names += self.findallbyre(r'>([^<>\(]*)', cell)
        return names

    def findinstanceof(self, html: str):
        """Return the record type and remember whether it is a person.

        Falls back to 'Q5' when no type resolves but a "Person" row exists.
        """
        result = self.findbyre(r'(?s)<strong>Typ</strong>.*?<td.*?>(.*?)(?:\(|</)', html, 'instanceof')
        if not result and '<strong>Person</strong>' in html:
            result = 'Q5'
        self.isperson = result == 'Q5'
        return result

    def findbirthdate(self, html: str):
        """Return the part of "Lebensdaten" before the hyphen."""
        return self.findbyre(r'(?s)Lebensdaten:([^<>]*?)-', html)

    def finddeathdate(self, html: str):
        """Return the part of "Lebensdaten" after the hyphen."""
        return self.findbyre(r'(?s)Lebensdaten:[^<>]*?-([^<>\(\)]*)', html)

    def findnationalities(self, html: str):
        """Return the countries of the "Land" row (persons only)."""
        if not self.isperson:
            return None
        section = self.findbyre(r'(?s)<strong>Land</strong>.*?<td.*?>(.*?)</td>', html)
        if not section:
            return None
        return self.findallbyre(r'([\w\s]+)\(', section, 'country')

    def findcountries(self, html: str):
        """Return the countries of the "Land" row (non-persons only)."""
        if self.isperson:
            return None
        section = self.findbyre(r'(?s)<strong>Land</strong>.*?<td.*?>(.*?)</td>', html)
        if not section:
            return None
        return self.findallbyre(r'([\w\s]+)\(', section, 'country')

    def findbirthplace(self, html: str):
        """Return the place given as "Geburtsort"."""
        place = self.findbyre(r'(?s)Geburtsort:\s*(?:<[^<>]*>)?([^<>&]*)', html, 'city')
        if place:
            return place
        return self.findbyre(r'(?s)([\s\w]+)\(Geburtsort\)', html, 'city')

    def finddeathplace(self, html: str):
        """Return the place given as "Sterbeort"."""
        return self.findbyre(r'(?s)Sterbeort:\s*(?:<[^<>]*>)?([^<>&]*)', html, 'city')

    def findworkplaces(self, html: str):
        """Return the places given as "Wirkungsort"."""
        labelled = self.findallbyre(
            r'(?s)Wirkungsort:\s*(?:<[^<>]*>)?([^<>]*)\(\d{3}', html, 'city')
        if not labelled:
            labelled = self.findallbyre(
                r'(?s)Wirkungsort:\s*(?:<[^<>]*>)?([^<>]*)', html, 'city')
        return labelled + self.findallbyre(r'(?s)([\s\w]+)\(Wirkungsort\)', html, 'city')

    def findoccupations(self, html: str):
        """Return occupations from the profession and function rows.

        "Weitere Angaben" is only consulted when neither of the other
        two rows was found.
        """
        result = []
        sectionfound = False
        for sectionname in [r'Beruf\(e\)', r'Funktion\(en\)', 'Weitere Angaben']:
            if sectionfound and sectionname == 'Weitere Angaben':
                continue
            section = self.findbyre(
                r'(?s)<strong>{}</strong>(.*?)</tr>'.format(sectionname), html)
            if not section:
                continue
            sectionfound = True
            result += self.findallbyre(r'(?s)[>;,]([^<>;,]*)', section, 'occupation')
        return result

    def findgender(self, html: str):
        """Return the gender from the "Geschlecht" row."""
        return self.findbyre(r'(?s)<strong>Geschlecht</strong>.*?>([^<>]+)</td', html, 'gender')

    def findinstruments(self, html: str):
        """Return the instruments from the "Instrumente" row."""
        section = self.findbyre(r'(?s)<strong>Instrumente.*?<td[^<>]*>(.*?)</td>', html)
        if not section:
            return None
        # Drop tags, then turn parenthesized remarks into separators.
        section = self.TAGRE.sub('', section)
        section = re.sub(r'(?s)(\([^()]*\))', ';', section)
        return self.findallbyre(r'(?s)([\s\w]+)', section, 'instrument')

    def findvoice(self, html: str):
        """Return the voice type from the "Instrumente" row."""
        section = self.findbyre(r'(?s)<strong>Instrumente.*?<td[^<>]*>(.*?)</td>', html)
        if not section:
            return None
        pattern = r'(?s)([\s\w]+)\(' if '(' in section else r'(?s)([\s\w]+)'
        return self.findbyre(pattern, section, 'voice')

    def findlanguagesspoken(self, html: str):
        """Return the languages from the "Sprache" row (persons only)."""
        if not self.isperson:
            return None
        section = self.findbyre(r'(?s)<strong>Sprache.*?<td[^<>]*>(.*?)</td>', html)
        if not section:
            return None
        return self.findallbyre(r'([^{});]*)\(', section, 'language')

    def finddegrees(self, html: str):
        """Return the degrees from the "Akademischer Grad" row."""
        section = self.findbyre(r'(?s)Akademischer Grad.*?<td[^<>]*>(.*?)</td>', html)
        if not section:
            return None
        return self.findallbyre(r'([^<>]+)', section, 'degree')

    def findsiblings(self, html: str):
        """Return persons marked "Bruder" or "Schwester"."""
        section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html)
        if not section:
            return None
        return self.findallbyre(r'(?s)([^<>]*)(?:</a> )?\((?:Bruder|Schwester)\)', section, 'person')

    def findspouses(self, html: str):
        """Return persons marked "Ehemann" or "Ehefrau"."""
        section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html)
        if not section:
            return None
        return self.findallbyre(r'(?s)([^<>]*)(?:</a> )?\((?:Ehemann|Ehefrau)\)', section, 'person')

    def findchildren(self, html: str):
        """Return persons marked "Sohn" or "Tochter"."""
        section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html)
        if not section:
            return None
        return self.findallbyre(r'(?s)([^<>]*)(?:</a> )?\((?:Sohn|Tochter)\)', section, 'person')

    def findfather(self, html: str):
        """Return the person marked "Vater"."""
        section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html)
        if not section:
            return None
        return self.findbyre(r'(?s)([^<>]*)(?:</a> )?\(Vater\)', section, 'person')

    def findmother(self, html: str):
        """Return the person marked "Mutter"."""
        section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html)
        if not section:
            return None
        return self.findbyre(r'(?s)([^<>]*)(?:</a> )?\(Mutter\)', section, 'person')

    def findpseudonyms(self, html: str):
        """Return a one-element list with the linked "Pseudonym" entry."""
        section = self.findbyre(r'(?s)<strong>Beziehungen zu Personen</strong>.*?(<td.*?</td>)', html)
        if not section:
            return None
        return [self.findbyre(r'Pseudonym: <a[^<>]*>(.*?)<', section)]

    def findwebsite(self, html: str):
        """Return the link target following "Homepage"."""
        return self.findbyre(r'Homepage[^<>]*<a[^<>]*href="(.*?)"', html)

    def findwebpages(self, html: str):
        """Return the link targets following "Internet"."""
        return self.findallbyre(r'Internet[^<>]*<a[^<>]*href="(.*?)"', html)

    def findworkfields(self, html: str):
        """Return subjects from "Fachgebiet" and "Thematischer Bezug"."""
        result = self.findallbyre(r'(?s)Fachgebiet:(.*?)<', html, 'subject')
        cells = self.findallbyre(r'(?s)<strong>Thematischer Bezug</strong>.*?(<td.*?</td>)', html)
        for cell in cells:
            for subject in self.findallbyre(r'>([^<>]*)<', cell):
                if ':' in subject:
                    # Only the part after the first colon names the subject.
                    tail = subject[subject.find(':') + 1:]
                    result += self.findallbyre(r'([\w\s]+)', tail, 'subject')
                else:
                    result += self.findallbyre(r'(.+)', subject, 'subject')
        return result

    def findemployers(self, html: str):
        """Return employers from the organization row or "Tätig an" text."""
        section = self.findbyre(r'(?s)<strong>Beziehungen zu Organisationen</strong>.*?(<td.*?</td>)', html)
        if not section:
            return self.findallbyre(r'Tätig an (?:d\w\w )?([^<>;]*)', html, 'employer', alt=['university'])
        return self.findallbyre(r'(?s)[>;]([^<>;]*)[<;]', section, 'employer', alt=['university'])

    def findsources(self, html: str):
        """Return the semicolon-separated entries of the "Quelle" row."""
        section = self.findbyre(r'(?s)<strong>Quelle</strong>.*?<td[^<>]*(>.*?<)/td>', html)
        if not section:
            return None
        result = []
        for part in self.findallbyre(r'>([^<>]*)<', section):
            result += self.findallbyre(r'([^;]+)', part, 'source')
        return result

    def findmemberships(self, html: str):
        """Return linked organizations that are not orders or employers."""
        section = self.findbyre(r'(?s)<strong>Beziehungen zu Organisationen</strong>.*?(<td.*?</td>)', html)
        if not section:
            return None
        return self.findallbyre(r'>([^<>]*)</a>', section, 'organization',
                                skips=['religious order', 'employer', 'university'])

    def findrelorder(self, html: str):
        """Return the first linked organization as a religious order."""
        section = self.findbyre(r'(?s)<strong>Beziehungen zu Organisationen</strong>.*?(<td.*?</td>)', html)
        if not section:
            return None
        return self.findbyre(r'>([^<>]*)</a>', section, 'religious order',
                             skips=['organization', 'employer', 'university'])

    def findfloruit(self, html: str):
        """Return the text following "Wirkungsdaten:"."""
        return self.findbyre(r'(?s)Wirkungsdaten:(.*?)<', html)

    def findmixedrefs(self, html: str):
        """Return the default mixed references."""
        return self.finddefaultmixedrefs(html)
class LcAuthAnalyzer(Analyzer):
    """Analyzer for Library of Congress Authorities (P244) records."""

    def setup(self):
        """Set the database constants."""
        self.dbproperty = 'P244'
        self.dbid = 'Q13219454'
        self.dbname = 'Library of Congress Authorities'
        # self.urlbase = None
        self.hrtre = '(<h1.*?)<h3>(?:Editorial Notes|Change Notes|Sources|Alternate Formats)'
        self.language = 'en'
        self.escapehtml = True

    @property
    def url(self):
        """Return the names or subjects URL, depending on the id prefix.

        Returns ``None`` for identifiers starting with neither 'n' nor 's'.
        """
        if self.isperson:
            return 'http://id.loc.gov/authorities/names/{id}.html'.format(
                id=self.id)
        if self.id.startswith('s'):
            return 'http://id.loc.gov/authorities/subjects/{id}.html'.format(
                id=self.id)
        return None

    @property
    def isperson(self):
        """Return whether the identifier starts with 'n'."""
        return self.id.startswith('n')

    def findinstanceof(self, html: str):
        """Return the type named after "MADS/RDF"."""
        return self.findbyre(r'MADS/RDF ([^<>]+)', html, 'instanceof')

    def findnames(self, html) -> List[str]:
        """Return variant names, preferred labels and literal forms."""
        variants = self.findbyre(
            r'(?s)<h3>Variants</h3><ul[^<>]*>(.*?)</ul>', html)
        result = []
        if variants:
            result = self.findallbyre(r'>([^<>]*)?(?:,[\s\d\-]+)<', variants)
        preferred = self.findallbyre(r'skos:prefLabel">(.*?)(?:</|, \d)', html)
        literal = self.findallbyre(r'skosxl:literalForm">(.*?)(?:<|, \d)', html)
        return result + preferred + literal

    def finddescriptions(self, html: str):
        """Return the page title and parenthesized parts of "Sources"."""
        result = [self.findbyre(r'<title>([^<>]*)-', html)]
        sources = self.findbyre(r'(?s)<h3>Sources</h3>(.*?)</ul>', html)
        if sources:
            result += self.findallbyre(r'\(([^<>]*?)\)', sources)
        return result

    def findlongtext(self, html: str):
        """Return the content of the "Sources" list."""
        return self.findbyre(r'(?s)<h3>Sources</h3>(.*?)</ul>', html)

    def findfirstname(self, html: str):
        """Return the word after the first comma in the heading (names only)."""
        if not self.isperson:
            return None
        return self.findbyre(r'<h1[^<>]*>[^<>]*?,\s*(\w*)', html, 'firstname')

    def findlastname(self, html: str):
        """Return the heading text before the first comma (names only)."""
        if not self.isperson:
            return None
        return self.findbyre(r'h1[^<>]*>([^<>]*?),', html, 'lastname')

    def findbirthdate(self, html: str):
        """Return the birth date.

        An eight-digit value is re-ordered as last two digits, middle two
        digits, first four digits. Otherwise a three-part numeric date has
        its first two parts swapped and a one- or two-digit third part
        prefixed with '19'. Values containing '[' are rejected.
        """
        compact = self.findbyre(r'<li><h3>Birth Date</h3><ul[^<>]*>(\d{8})<', html)
        if compact:
            return f'{compact[6:]}-{compact[4:6]}-{compact[:4]}'

        result = self.findbyre(r'(?s)Birth Date</h3><.*?>(?:\(.*?\))?([^<>]*?)</ul>', html)
        if not result:
            result = self.findbyre(r'[\s\(]b\.\s+([\w\-/]+)', html)
        if not result:
            result = self.findbyre(r'skos:prefLabel">[^<>]*, (\d+)-', html)

        if not result or '[' in result:
            return None
        m = re.match(r'(\d+)[/\-](\d+)[/\-](\d+)', result)
        if m:
            year = m[3] if len(m[3]) > 2 else '19' + m[3]
            result = '{}-{}-{}'.format(m[2], m[1], year)
        return result

    def finddeathdate(self, html: str):
        """Return the death date, normalized the same way as the birth date."""
        compact = self.findbyre(r'<li><h3>Death Date</h3><ul[^<>]*>(\d{8})<', html)
        if compact:
            return f'{compact[6:]}-{compact[4:6]}-{compact[:4]}'

        result = self.findbyre(r'(?s)Death Date</h3><.*?>(?:\(.*?\))?([^<>]*?)</ul>', html)
        if not result:
            result = self.findbyre(r'skos:prefLabel">[^<>]*, \d+-(\d+)', html)

        if not result or '[' in result:
            return None
        m = re.match(r'(\d+)[/\-](\d+)[/\-](\d+)', result)
        if m:
            year = m[3] if len(m[3]) > 2 else '19' + m[3]
            result = '{}-{}-{}'.format(m[2], m[1], year)
        return result

    def findbirthplace(self, html: str):
        """Return the place listed under "Birth Place"."""
        return self.findbyre(
            r'(?s)Birth Place</h3><.*?>(?:\([^<>]*\))?([^<>]+)\s*(?:\([^<>]*\))?\s*</?[au]',
            html, 'city')

    def finddeathplace(self, html: str):
        """Return the place listed under "Death Place"."""
        return self.findbyre(
            r'(?s)Death Place</h3><.*?>(?:\([^<>]*\))?([^<>]+)\s*(?:\([^<>]*\))?\s*</?[au]',
            html, 'city')

    def findgender(self, html: str):
        """Return the value listed under "Gender"."""
        return self.findbyre(r'(?s)Gender</h3><.*?>([^<>]*)(?:<[^<>]*>|\s)*</ul>', html, 'gender')

    def findoccupations(self, html: str):
        """Return the linked entries listed under "Occupation"."""
        section = self.findbyre(r'(?s)Occupation</h3>(.*?)<h3', html)
        if not section:
            return None
        return self.findallbyre(r'>([^<>]+)</a>', section, 'occupation')

    def findrelorder(self, html: str):
        """Return the first affiliation that resolves to a religious order."""
        section = self.findbyre(r'(?s)Affiliation</h3>.*?(<ul.*?</ul>)', html)
        if not section:
            return None
        orders = self.findallbyre(r'>([^<>]+)</a', section, 'religious order',
                                  skips=['employer', 'university'])
        for order in orders:
            if order:
                return order
        return None

    def findemployers(self, html: str):
        """Return the affiliations as employers."""
        section = self.findbyre(r'(?s)Affiliation</h3>.*?(<ul.*?</ul>)', html)
        if not section:
            return None
        return self.findallbyre(r'>([^<>]+)</a', section, 'employer', alt=['university'])

    def findlanguagesspoken(self, html: str):
        """Return the "Associated Language" entries (names only)."""
        if not self.isperson:
            return None
        result = []
        for section in self.findallbyre(
                r'(?s)Associated Language[^<>]*</h3>.*?(<ul.*?</ul>)', html):
            result += self.findallbyre(r'>([^<>]+)</a', section, 'language')
        return result

    def findworkfields(self, html: str):
        """Return the linked entries listed under "Field of Activity"."""
        section = self.findbyre(r'(?s)Field of Activity</h3>.*?(<ul.*?</ul>)', html)
        if not section:
            return None
        return self.findallbyre(r'>([^<>]+)</a', section, 'subject')

    def findmixedrefs(self, html: str):
        """Return the default mixed references, without social media."""
        return self.finddefaultmixedrefs(html, includesocial=False)
class UlanAnalyzer(Analyzer):
    """Analyzer for ULAN (P245) records."""

    def setup(self):
        """Set the database constants."""
        self.dbproperty = 'P245'
        self.dbid = 'Q2494649'
        self.dbname = 'ULAN'
        self.urlbase = 'https://www.getty.edu/vow/ULANFullDisplay?find=&role=&nation=&subjectid={id}'
        self.hrtre = '(Record Type:.*?)Sources and Contributors:'
        self.language = 'en'

    def finddescription(self, html: str):
        """Return the parenthesized text following the bold heading."""
        return self.findbyre(r'(?s)<SPAN CLASS=page>.*?</B>\s*\((.*?)\)', html)

    def findlongtext(self, html: str):
        """Return the text following the "Note:" label."""
        return self.findbyre(r'(?s)<B>Note:\s*</B>(.*?)</', html)

    def findnames(self, html) -> List[str]:
        """Return the bold entries of the "Names:" table."""
        section = self.findbyre(r'<B>Names:</B>.*<TR>(.*?)</TABLE>', html)
        if section:
            return self.findallbyre(r'<B>(.*?)<', section)
        return []

    def findinstanceof(self, html: str):
        """Return the record type and remember whether it is a person."""
        result = self.findbyre(r'Record Type:.*?>(.*?)<', html, 'instanceof')
        self.isperson = result == 'Q5'
        return result

    def findlastname(self, html: str):
        """Return the heading text before the first comma (persons only)."""
        if self.isperson:
            return self.findbyre(r'(?s)<SPAN CLASS=page><B>([^<>]*?),', html, 'lastname')

    def findfirstname(self, html: str):
        """Return the word after the first comma in the heading (persons only)."""
        if self.isperson:
            return self.findbyre(r'(?s)<SPAN CLASS=page><B>[^<>]*?,\s*([\w\-]+)', html, 'firstname')

    def findnationality(self, html: str):
        """Return the country listed under "Nationalities:" (persons only)."""
        if self.isperson:
            return self.findbyre(r'(?s)Nationalities:.*<SPAN CLASS=page>([^<>]*)\(', html, 'country')

    def findcountry(self, html: str):
        """Return the country listed under "Nationalities:" (non-persons only).

        Named with the ``find`` prefix like every other extractor in this
        class and like ``BnfAnalyzer.findcountry``.
        """
        if not self.isperson:
            return self.findbyre(r'(?s)Nationalities:.*<SPAN CLASS=page>([^<>]*)\(', html, 'country')

    # Backward-compatible alias: the method was previously only available
    # under this name, which lacks the ``find`` prefix.
    country = findcountry

    def findoccupations(self, html: str):
        """Return the entries of the "Roles:" table (persons only)."""
        if self.isperson:
            section = self.findbyre(r'(?s)>Roles:<.*?<TR>(.*?)</TABLE>', html)
            if section:
                return self.findallbyre(r'>([^<>\(\)]+)[<\(]', section, 'occupation')

    def findgender(self, html: str):
        """Return the value following "Gender:"."""
        return self.findbyre(r'Gender:<.*?>(.*?)<', html, 'gender')

    def findbirthplace(self, html: str):
        """Return the place following "Born:"."""
        return self.findbyre(r'Born:.*?>([^<>]*)\(', html, 'city')

    def finddeathplace(self, html: str):
        """Return the place following "Died:"."""
        return self.findbyre(r'Died:.*?>([^<>]*)\(', html, 'city')

    def findlocation(self, html: str):
        """Return the linked place following "location:" (non-persons only)."""
        if not self.isperson:
            return self.findbyre(r'location:.*?<A.*?>([^<>]*)\(', html, 'city')

    def findbirthdate(self, html: str):
        """Return the start of the date range in the heading (persons only).

        Values containing 'ctive' are ignored.
        """
        if self.isperson:
            result = self.findbyre(r'</B>\s*\([^<>]*,([^<>]*)-', html)
            if result and 'ctive' not in result:
                return result

    def finddeathdate(self, html: str):
        """Return the end of the date range in the heading (persons only).

        Ranges containing 'ctive' are ignored.
        """
        if self.isperson:
            part = self.findbyre(r'</B>\s*\([^<>]*,([^<>]*-[^<>\)]*)', html)
            if part and 'ctive' not in part:
                return self.findbyre(r'-([^<>\)]*)', part)

    def findworkplaces(self, html: str):
        """Return the places following ">active:"."""
        return self.findallbyre(r'>active:(?:\s|&nbsp;|<[^<>]*>)*([^<>]*)\(', html, 'city')

    def findchildren(self, html: str):
        """Return the persons linked after "parent of"."""
        return self.findallbyre(r'(?s)>parent of.*?<A[^<>]*>(.*?)<', html, 'person')

    def findfather(self, html: str):
        """Return the first "child of" link that resolves to a male person."""
        result = self.findallbyre(r'(?s)>child of.*?<A[^<>]*>(.*?)<', html, 'male-person')
        if result:
            return result[0]

    def findmother(self, html: str):
        """Return the first "child of" link that resolves to a female person."""
        result = self.findallbyre(r'(?s)>child of.*?<A[^<>]*>(.*?)<', html, 'female-person')
        if result:
            return result[0]

    def findsiblings(self, html: str):
        """Return the persons linked after "sibling of"."""
        return self.findallbyre(r'(?s)>sibling of.*?<A[^<>]*>(.*?)<', html, 'person')

    def findstudents(self, html: str):
        """Return the artists linked after "teacher of"."""
        return self.findallbyre(r'(?s)>teacher of.*?<A[^<>]*>(.*?)<', html, 'artist')

    def findteachers(self, html: str):
        """Return the artists linked after "student of"."""
        # The inverse of "teacher of"; matching "sibling of" here would
        # report the subject's siblings as teachers.
        return self.findallbyre(r'(?s)>student of.*?<A[^<>]*>(.*?)<', html, 'artist')
class BnfAnalyzer(Analyzer):
    """Analyzer for Bibliothèque nationale de France (P268) records."""

    def setup(self):
        """Set the database constants."""
        self.dbproperty = 'P268'
        self.dbid = 'Q19938912'
        self.dbname = 'Bibliothèque nationale de France'
        self.urlbase = 'http://catalogue.bnf.fr/ark:/12148/cb{id}'
        self.hrtre = '(<div class="notice" id="ident">.*?)<div class="notice line"'
        self.language = 'fr'
        self.escapehtml = True

    def finddescriptions(self, html: str):
        """Return the French ``DC.subject`` meta values."""
        return self.findallbyre(r'<meta name="DC.subject" lang="fre" content="(.*?)"', html)

    def findnames(self, html) -> List[str]:
        """Return the texts of the ``gras`` spans."""
        return self.findallbyre(r'<span class="gras">(.*?)<', html)

    def findlongtext(self, html: str):
        """Return the content of the description block."""
        return self.findbyre(r'(?s)<div[^<>]*"description">(.*?)</div>', html)

    def findinstanceof(self, html: str):
        """Return 'Q5' for person records, else the resolved record type.

        Also remembers in ``self.isperson`` whether this is a person record.
        """
        self.isperson = 'Notice de personne' in html
        if self.isperson:
            return 'Q5'
        # else
        return self.findbyre(r'(?s)Type de[^<>]+:.*?>([^<>]*)</', html, 'instanceof')

    def findnationality(self, html: str):
        """Return the country of the "Pays" field (persons only)."""
        if self.isperson:
            return self.findbyre(r'(?s)Pays[^<>]*:.*?<span.*?>(.*?)</', html, 'country')

    def findcountry(self, html: str):
        """Return the country of the "Pays" field (non-persons only)."""
        if not self.isperson:
            return self.findbyre(r'(?s)Pays[^<>]*:.*?<span.*?>(.*?)</', html, 'country')

    def findlanguagesspoken(self, html: str):
        """Return languages from the "Langue(s)" field and free text (persons only)."""
        if self.isperson:
            result = []
            section = self.findbyre(r'(?s)Langue\(s\).*?(<.*?>)\s*</div>', html)
            if section:
                # Protect the space in "ancien <language>" while tags are
                # replaced by spaces, then restore it.
                section = section.replace('ancien ', 'ancien###')
                section = self.TAGRE.sub(' ', section)
                section = section.replace('###', ' ')
                result = self.findallbyre(r'([\w\s&;]{3,})', section, 'language')
            result += self.findallbyre(r'aussi(?: écrit)? en ([\w]+)', html, 'language')
            result += self.findallbyre(r'aussi(?: écrit)? en [\w\s]+ et en ([\w]+)', html, 'language')
            result += self.findallbyre(r'[tT]radu(?:cteur|it) du (.+?) en ', html, 'language')
            result += self.findallbyre(r'[tT]radu(?:cteur|it) .+? en ([\w\s]+)', html, 'language')
            return result

    def findgender(self, html: str):
        """Return the value of the "Sexe" field."""
        return self.findbyre('(?s)Sexe[^<>]+:.*?<span.*?>(.*?)</', html, 'gender')

    def findbirthdate(self, html: str):
        """Return the date part of the "Naissance" field.

        Only values containing four consecutive digits and no '..' are
        accepted.
        """
        section = self.findbyre(r'(?s)Naissance.*?(<.*?>)\s*</div>', html)
        if section:
            result = self.findbyre(r'>([^<>]+?),', section) or self.findbyre(r'>([^<>]+?)</', section)
            if result and '..' not in result and re.search(r'\d{4}', result):
                return result
        return None

    def findbirthplace(self, html: str):
        """Return the place part of the "Naissance" field.

        Falls back to a "Né(e) à ..." phrase anywhere in the page when the
        field is missing or has no place part.
        """
        # Start from None so the fallback below is reachable even when
        # the page has no "Naissance" section at all.
        result = None
        section = self.findbyre(r'(?s)Naissance.*?(<.*?>)\s*</div>', html)
        if section:
            result = self.findbyre(',([^<>]+)<', section, 'city')
        if not result:
            result = self.findbyre(r'Née? à ([\w\s]+)', html, 'city')
        return result

    def finddeathdate(self, html: str):
        """Return the date part of the "Mort" field.

        Only values containing four consecutive digits are accepted.
        """
        section = self.findbyre(r'(?s)Mort[^<>]*:.*?(<.*?>)\s*</div>', html)
        if section:
            result = self.findbyre(r'>([^<>]+?),', section) or self.findbyre(r'>([^<>]+?)</', section)
            if result and re.search(r'\d{4}', result):
                return result

    def finddeathplace(self, html: str):
        """Return the place part of the "Mort" field."""
        section = self.findbyre(r'(?s)Mort[^<>]*:.*?(<.*?>)\s*</div>', html)
        if section:
            return self.findbyre(r',([^<>]+)<', section, 'city')

    def findisni(self, html: str):
        """Return the ISNI from an "ISNI ..." text or an ``isni/`` link."""
        return self.findbyre(r'ISNI ([\d\s]*)', html) or self.findbyre(r'isni/(\w+)', html)

    def findoccupations(self, html: str):
        """Return occupations from the first span of the description.

        The text is split on ' et '; at most the first eight fragments
        are resolved.
        """
        section = self.findbyre(r'(?s)"description">\s*<span[^<>]*>(.*?)</span>', html)
        if section:
            texts = []
            for subsection in section.split(' et '):
                texts += self.findallbyre(r'(\w[\-\s\w&\']+)', subsection)
            return [self.findbyre(r'(.+)', text, 'occupation') for text in texts[:8]]
        return None

    def findworkfields(self, html: str):
        """Return subjects named in "Professeur de"/"Spécialiste de" phrases."""
        return self.findallbyre(r"[Pp]rofesseur d[eu']([\w\s]+)? [àa]u?x? ", html, 'subject') + \
            self.findallbyre(r"[Ss]pécialiste d[eu']s?([\w\s]+) [àa]u?x? ", html, 'subject') + \
            self.findallbyre(r'[Ss]pécialisée? en ([\w\s]+) [àa]u?x? ', html, 'subject') + \
            self.findallbyre(r"[Pp]rofesseur d[eu']([\w\s]+)", html, 'subject') + \
            self.findallbyre(r"[Ss]pécialiste d[eu']s?([\w\s]+)", html, 'subject') + \
            self.findallbyre(r'[Ss]pécialisée? en ([\w\s]+)', html, 'subject')

    def findemployers(self, html: str):
        """Return the semicolon-separated employers after "En poste :"."""
        sections = self.findallbyre(r'En poste\s*:(.*?)[\(<]', html)
        result = []
        for section in sections:
            result += self.findallbyre(r'([^;]*)', section, 'employer', alt=['university'])
        return result
class SudocAnalyzer(Analyzer):
    """Analyzer for SUDOC (P269) records."""

    def setup(self):
        """Set the database constants."""
        self.dbproperty = 'P269'
        self.dbid = 'Q47757534'
        self.dbname = 'SUDOC'
        self.urlbase = 'https://www.idref.fr/{id}'
        self.hrtre = '(<div id="editzone">.*?)<p>Informations sur la notice</p>'
        self.language = 'fr'
        self.escapehtml = True

    def finddescriptions(self, html: str):
        """Return the record type and the public information notes."""
        recordtypes = self.findallbyre(r'(?s)Notice de type</span>.*?([^<>]*)</span>', html)
        notes = self.findallbyre(
            r'(?s)<span class="detail_label">Note publique d\'information.*?"detail_value">(.*?)<', html)
        return recordtypes + notes

    def findnames(self, html) -> List[str]:
        """Return the authorized and variant access points."""
        headings = (
            r"(?s)<p>Point d'accès autorisé</p>(.*)<p>",
            r"(?s)<p>Variantes de point d'accès</p>(.*)<p>",
        )
        names = []
        for heading in headings:
            block = self.findbyre(heading, html)
            if block:
                names += self.findallbyre(r'(?s)<b>(.*?)[\(<]', block)
        return names

    def findlongtext(self, html: str):
        """Return all ``detail_value`` spans joined by newlines."""
        values = self.findallbyre(r'(?s)<span class="detail_value">(.*?)</span>', html)
        return '\n'.join(values)

    def findinstanceof(self, html: str):
        """Return the resolved record type."""
        return self.findbyre(r'(?s)Notice de type</span>.*?([^<>]*)</span>', html, 'instanceof')

    def findlanguagesspoken(self, html: str):
        """Return languages from "Traducteur de" notes and the "Langues" field."""
        result = self.findallbyre("Traducteur de l['ea](.*?)vers", html, 'language')
        result = result + self.findallbyre("Traducteur de .*? vers l['ea](.*?)<", html, 'language')
        coded = self.findbyre(r'(?s)<span id="Langues" class="DataCoded">(.*?)</span>', html)
        if coded:
            result += self.findallbyre(r'([\w\s\(\)]+)', coded, 'language')
        return result

    def findnationality(self, html: str):
        """Return the country from the ``PaysISO3166`` field."""
        return self.findbyre(r'(?s)<span id="PaysISO3166" class="DataCoded">(.*?)</span>', html, 'country')

    def findbirthdate(self, html: str):
        """Return the birth date reduced to digits, '-' and '/'."""
        raw = self.findbyre(r'(?s)Date de naissance[^<>]*</b><span[^<>]*>([^<>]*)<', html)
        if not raw:
            return None
        return ''.join(char for char in raw if char in '0123456789-/')

    def finddeathdate(self, html: str):
        """Return the death date reduced to digits, '-' and '/'."""
        raw = self.findbyre(r'(?s)Date de mort[^<>]*</b><span[^<>]*>([^<>]*)<', html)
        if not raw:
            return None
        return ''.join(char for char in raw if char in '0123456789-/')

    def findgender(self, html: str):
        """Return the gender from the ``Z120_sexe`` field."""
        return self.findbyre(r'<span id="Z120_sexe" class="DataCoded">(.*?)</span>', html, 'gender')

    def findisni(self, html: str):
        """Return the identifier from an isni.org link."""
        return self.findbyre(r'http://isni.org/isni/(\w+)', html)

    def findmixedrefs(self, html: str):
        """Return the default mixed references."""
        return self.finddefaultmixedrefs(html)

    def findbirthplace(self, html: str):
        """Return the place following "ieu de naissance"."""
        return self.findbyre(r'ieu de naissance.? (.*?)[\.<>]', html, 'city')

    def finddeathplace(self, html: str):
        """Return the place following "ieu de décès"."""
        return self.findbyre(r'ieu de décès.? (.*?)[\.<>]', html, 'city')

    def findoccupations(self, html: str):
        """Return occupations from the biographical notes.

        Notes are split on ' et ' and on '.', ',' and ';'; at most the
        first eight fragments are resolved.
        """
        notes = self.findallbyre(
            r'(?s)<div class="detail_chaqueNoteBio">.*?<span class="detail_value">(.*?)<', html)
        fragments = []
        for note in notes:
            for part in note.split(' et '):
                fragments += self.findallbyre(r'([^\.,;]+)', part)
        occupations = []
        for fragment in fragments[:8]:
            cleaned = fragment.strip().lstrip('-')
            occupations.append(self.findbyre(r'(.+)', cleaned, 'occupation'))
        return occupations
class CiniiAnalyzer(Analyzer):
    """Analyzer for CiNii (P271) author records."""

    def setup(self):
        """Set the database constants."""
        self.dbproperty = 'P271'
        self.dbid = 'Q10726338'
        self.dbname = 'CiNii'
        self.urlbase = 'https://ci.nii.ac.jp/author/{id}'
        self.hrtre = '(<div class="itemheading authordata">.*?)<div class="resultlist">'
        self.language = 'ja'

    def findnames(self, html) -> List[str]:
        """Return names from the heading spans and the ``seefm`` entries.

        A trailing ", b. <year>" is not part of the returned names.
        """
        # '[^<>]*' accepts an <h1> tag with any number of attribute
        # characters, matching the pattern used by findfirstname and
        # findlastname; '[^<>]' alone required exactly one.
        section = self.findbyre(r'(?s)<h1[^<>]*>(.*?)</h1>', html) or ''
        return (
            self.findallbyre(
                r'(?s)<span>(.*?)(?:, b\. \d+)?\s*</span>', section)
            + self.findallbyre(r'"seefm">(.*?)(?:, b\. \d+)?\s*[<\((]', html)
        )

    def findinstanceof(self, html: str):
        """Return 'Q5': every record is treated as a person."""
        return 'Q5'

    def findfirstname(self, html: str):
        """Return the word after the first comma in the heading span."""
        return self.findbyre(r'(?s)<h1[^<>]*>[^<>]*<span>[^<>]*?,\s*([\w\-]+)', html, 'firstname')

    def findlastname(self, html: str):
        """Return the heading span text before the first comma."""
        return self.findbyre(r'(?s)<h1[^<>]*>[^<>]*<span>([^<>]+?),', html, 'lastname')

    def findbirthdate(self, html: str):
        """Return the digits following ", b. "."""
        return self.findbyre(r', b\. (\d+)', html)
class ImdbAnalyzer(Analyzer):
    """Analyzer for Internet Movie Database pages (property P345).

    The identifier prefix decides the page type: ids starting with 'tt'
    are handled as films, ids starting with 'nm' as persons.
    """

    def setup(self):
        """Set the database identifiers and the type-dependent excerpt regex."""
        self.dbproperty = 'P345'
        self.dbid = 'Q37312'
        self.dbname = 'Internet Movie Database'
        # The URL is produced by the ``url`` property, not from a template.
        self.urlbase = None
        if self.isfilm:
            self.hrtre = '(<h1.*?)<h2>Frequently Asked Questions'
        elif self.isperson:
            self.hrtre = '(<h1.*?</table>)'
        self.language = 'en'
        self.escapeurl = True

    @property
    def url(self):
        """Return the title or name URL for the id, or None for other ids."""
        if self.isfilm:
            page_kind = 'title'
        elif self.isperson:
            page_kind = 'name'
        else:
            return None
        return f'https://www.imdb.com/{page_kind}/{self.id}/'

    @property
    def isfilm(self):
        """Whether the id starts with 'tt'."""
        return self.id.startswith('tt')

    @property
    def isperson(self):
        """Whether the id starts with 'nm'."""
        return self.id.startswith('nm')

    def finddescription(self, html: str):
        """Return the part of the meta description before its second '.'."""
        meta = self.findbyre(r'<meta name="description" content="(.*?)"', html)
        if not meta:
            return None
        leading_parts = meta.split('.')[:2]
        return '.'.join(leading_parts)

    def findlongtext(self, html: str):
        """Return the text at the start of the first "inline" div."""
        return self.findbyre(r'(?s)<div class="inline">(.*?)<', html)

    def findnames(self, html: str):
        """Return the og:title content with ' - IMDb' removed, as a list."""
        title = self.findbyre(r'\'og:title\' content="(.*)"', html) or ''
        cleaned = title.replace(' - IMDb', '')
        return [cleaned]

    def findinstanceof(self, html: str):
        """Return 'Q11424' for film ids, 'Q5' for person ids, else None."""
        if self.isfilm:
            return 'Q11424'
        if self.isperson:
            return 'Q5'
        return None

    def findorigcountry(self, html: str):
        """Look up the linked "Country:" value; only done for films."""
        if not self.isfilm:
            return None
        return self.findbyre(r'(?s)Country:.*?>([^<>]+)</a>', html, 'country')

    def findpubdate(self, html: str):
        """Return the content of the "titleYear" span; only done for films."""
        if not self.isfilm:
            return None
        return self.findbyre(r'span id="titleYear">\(\s*(?:<[^<>]*>)?(.*?)</', html)

    def findmoviedirectors(self, html: str):
        """Look up the names in the "Director:" block as 'filmmaker'."""
        block = self.findbyre(r'(?s)>Director:(<.*?</div>)', html)
        if not block:
            return None
        return self.findallbyre(r'"name">([^<>]*)</span>', block, 'filmmaker')

    def findscreenwriters(self, html: str):
        """Look up the names in the "Writer:" block as 'filmmaker'."""
        block = self.findbyre(r'(?s)>Writer:(<.*?</div>)', html)
        if not block:
            return None
        return self.findallbyre(r'"name">([^<>]*)</span>', block, 'filmmaker')

    def findcast(self, html: str):
        """Look up the names in the "Credited cast:" table as 'actor'."""
        block = self.findbyre(r'(?s)>Credited cast:(<.*?</table>)', html)
        if not block:
            return None
        return self.findallbyre(r'"name">([^<>]*)</span>', block, 'actor')

    def findprodcoms(self, html: str):
        """Look up the names in the "Production Co:" block as 'filmcompany'."""
        block = self.findbyre(r'(?s)>Production Co:(<.*?</div>)', html)
        if not block:
            return None
        return self.findallbyre(r'"name">([^<>]*)</span>', block, 'filmcompany')

    def findgenres(self, html: str):
        """Look up the links in the "Genres:" block as 'film-genre'."""
        block = self.findbyre(r'(?s)>Genres:(<.*?</div>)', html)
        if not block:
            return None
        return self.findallbyre(r'(?s)>([^<>]*)</a>', block, 'film-genre', alt=['genre'])

    def findoriglanguages(self, html: str):
        """Look up the links in the "Language:" block as 'language'."""
        block = self.findbyre(r'(?s)>Language:(<.*?</div>)', html)
        if not block:
            return None
        return self.findallbyre(r'(?s)>([^<>]*)</a>', block, 'language')

    def finddurations(self, html: str):
        """Return the <time> content of the "Runtime:" block as a list."""
        block = self.findbyre(r'(?s)>Runtime:(<.*?</div>)', html)
        if not block:
            return None
        runtime = self.findbyre(r'(?s)>([^<>]*)</time>', block)
        return [runtime]

    def findcolors(self, html: str):
        """Look up the linked "Color:" value as 'film-color', as a list."""
        color = self.findbyre(r'(?s)>Color:.*?>([^<>]+)</a>', html, 'film-color')
        if not color:
            return None
        return [color]

    def findoccupations(self, html: str):
        """Look up the "jobTitle" values as 'film-occupation'.

        A looked-up value of 'Q3455803' is replaced by 'Q2526255'.
        """
        jobs = self.findbyre(r'(?s)"jobTitle": (".*?"|\[.*?\])', html)
        if not jobs:
            return None
        found = self.findallbyre(r'"(.*?)"', jobs, 'film-occupation', alt=['occupation'])
        occupations = []
        for item in found:
            if item == 'Q3455803':
                occupations.append('Q2526255')
            else:
                occupations.append(item)
        return occupations

    def findbirthdate(self, html: str):
        """Return the "birthDate" value from the page."""
        return self.findbyre(r'"birthDate": "(.*?)"', html)

    def finddeathdate(self, html: str):
        """Return the "deathDate" value from the page."""
        return self.findbyre(r'"deathDate": "(.*?)"', html)

    def findbirthplace(self, html: str):
        """Look up the ``birth_place=`` value as a 'city'."""
        return self.findbyre(r'birth_place=(.*?)[&"]', html, 'city')

    def finddeathplace(self, html: str):
        """Look up the ``death_place=`` value as a 'city'."""
        return self.findbyre(r'death_place=(.*?)[&"]', html, 'city')
class SbnAnalyzer(Analyzer):
    """Analyzer for SBN authority records (property P396)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P396'
        self.dbid = None
        self.dbname = 'SBN'
        self.urlbase = 'http://opac.sbn.it/opacsbn/opac/iccu/scheda_authority.jsp?bid={id}'
        self.hrtre = '(<tbody>.*?</tbody>)'
        self.language = 'it'

    def findnames(self, html) -> List[str]:
        """Return the "Nome autore" value plus any "Forme varianti" entries."""
        names = [self.findbyre(r'(?s)Nome autore.*?<a .*?>(.*?)[<&\(]', html)]
        variants = self.findbyre(r'(?s)Forme varianti.*?(<.*?)</tr>', html)
        if variants:
            names += self.findallbyre(r'(?s)>([^<>]*)</div>', variants)
        return names

    def finddescription(self, html: str):
        """Return the "Nota informativa" detail value."""
        return self.findbyre(r'(?s)Nota informativa.*?"detail_value">(.*?)<', html)

    def findlongtext(self, html: str):
        """Return the "Nota informativa" detail value."""
        return self.findbyre(r'(?s)Nota informativa.*?"detail_value">(.*?)<', html)

    def findinstanceof(self, html: str):
        """Look up the "Tipo autore" detail value as 'instanceof'."""
        return self.findbyre(r'(?s)Tipo autore.*?detail_value">(.*?)</td>', html, 'instanceof')

    def findbirthdate(self, html: str):
        """Return the part of the "Datazione" cell before the '-'."""
        return self.findbyre(r'(?s)Datazione\s*</td>\s*<td[^<>]*>(?:[^<>]*,)?([^<>]*?)-', html)

    def finddeathdate(self, html: str):
        """Return the part of the "Datazione" cell after the '-'."""
        return self.findbyre(r'(?s)Datazione\s*</td>\s*<td[^<>]*>[^<>]*-(.*?)<', html)

    def findoccupations(self, html: str):
        """Look up occupations from the first sentence of the info note.

        If the sentence contains ',' or ';' it is split on those;
        otherwise every word of at least three characters is looked up.
        """
        sentence = self.findbyre(
            r'(?s)Nota informativa.*?detail_value">([^<>]*?)\.', html)
        if not sentence:
            return None
        has_separator = any(sep in sentence for sep in (',', ';'))
        pattern = r'([^,;]+)' if has_separator else r'(\w{3,})'
        return self.findallbyre(pattern, sentence, 'occupation')

    def findbirthplace(self, html: str):
        """Look up the place after "Nato a(d)" as a 'city'.

        The patterns are tried in order, stopping at the first hit.
        """
        patterns = (
            r'Nato ad? ([^<>]+) e morto',
            r'Nato ad? ([^<>]+?)[,\(\.]',
            r'Nato e morto ad? ([^<>,\(\.]+)',
            r'Nato ad? ([^<>\.]+)',
        )
        place = None
        for pattern in patterns:
            place = self.findbyre(pattern, html, 'city')
            if place:
                break
        return place

    def finddeathplace(self, html: str):
        """Look up the place after "morto a(d)" as a 'city'."""
        place = self.findbyre(r'[mM]orto ad? ([^<>\.\(]+) nel', html, 'city')
        if not place:
            place = self.findbyre(r'[mM]orto ad? ([^<>\.\(]+)', html, 'city')
        return place

    def findlanguagesspoken(self, html: str):
        """Look up words of 3+ characters in the "Lingua" value as 'language'."""
        languages = self.findbyre(r'Lingua.*?detail_value">(.*?)<', html)
        if not languages:
            return None
        return self.findallbyre(r'(\w{3,})', languages, 'language')

    def findisni(self, html: str):
        """Return the identifier that follows an isni.org link."""
        return self.findbyre(r'http://isni.org/isni/(\w+)', html)

    def findrelorder(self, html: str):
        """Return 'Q36380' if the info note's first sentence has 'gesuita'."""
        sentence = self.findbyre(r'(?s)Nota informativa.*?detail_value">([^<>]*?)\.', html) or ''
        return 'Q36380' if 'gesuita' in sentence.lower() else None
class LibrariesAustraliaAnalyzer(Analyzer):
    """Analyzer for Libraries Australia authority records (property P409)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P409'
        self.dbid = None
        self.dbname = 'National Library of Australia'
        self.urlbase = 'https://librariesaustralia.nla.gov.au/search/display?dbid=auth&id={id}'
        self.hrtre = '<!--Record summary-->(.*?)<!--Record summary-->'
        self.language = 'en'

    def finddescription(self, html: str):
        """Return the link text of the "Heading:" entry."""
        # The quantifier belongs inside the group: ``([^<>])*`` repeats the
        # group and so captures only the final character of the heading.
        return self.findbyre(r'(?s)Heading:.*?">([^<>]*)</a>', html)

    def findnames(self, html) -> List[str]:
        """Return the page titles, cut off after their second comma part."""
        result = self.findallbyre(r'(?s)<title>([^<>]*?)(?:<|\(|\s-\s)', html)
        return [','.join(r.split(',')[:2]) for r in result]

    def findbirthdate(self, html: str):
        """Return the birth date from "Birth:" or, failing that, the heading.

        Values containing 'approx' or 'active' are rejected. The heading
        is only consulted when there is no "Birth:" entry at all.
        """
        result = self.findbyre(r'(?s)<dt>Birth:</dt>.*?<li>(.*?)-?</li>', html)
        if result:
            if 'approx' not in result and 'active' not in result:
                return result
        else:
            section = self.findbyre(r'(?s)<dt>Heading:</dt>.*?>([^<>]*)</a', html)
            if section and 'approx' not in section and 'active' not in section:
                # Prefer the part between the last comma and the dash;
                # fall back to the whole heading text.
                result = self.findbyre(r',([^,]*)-', section)
                return result if result else section
        return None

    def findbirthplace(self, html: str):
        """Look up the second "Birth:" list item as a 'city'."""
        result = self.findbyre(r'(?s)<dt>Birth:</dt>(?:\s|<[^<>]*>)*<li>[^<>]*</li>\s*<li>(.*?)</li>', html)
        if result:
            return self.getdata('city', result)
        return None

    def finddeathdate(self, html: str):
        """Return the death date from "Death:" or, failing that, the heading.

        Values containing 'approx' are rejected. The heading is only
        consulted when there is no "Death:" entry at all.
        """
        result = self.findbyre(r'(?s)<dt>Death:</dt>.*?<li>(.*?)</li>', html)
        if result:
            if 'approx' not in result:
                return result
        else:
            section = self.findbyre(r'(?s)<dt>Heading:</dt>.*?>([^<>]*)-?</a', html)
            if section:
                result = self.findbyre(r'-([^,\-]*)', section)
                if result and 'approx' not in result:
                    return result
        return None

    def findfirstname(self, html: str):
        """Look up the word after the comma in the heading as 'firstname'."""
        section = self.findbyre(r'(?s)<dt>Heading:</dt>.*?>([^<>]*)</a', html)
        # NOTE(review): looks like leftover debug output - confirm before
        # removing, since it prints the heading on every run.
        pywikibot.info(section)
        if section:
            return self.findbyre(r',\s*(\w+)', section, 'firstname')
        return None

    def findlastname(self, html: str):
        """Look up the text before the comma in the heading as 'lastname'."""
        section = self.findbyre(r'(?s)<dt>Heading:</dt>.*?>([^<>]*)</a', html)
        if section:
            return self.findbyre(r'([^,]*),', section, 'lastname')
        return None

    def finddeathplace(self, html: str):
        """Look up the second "Death:" list item as a 'city'."""
        result = self.findbyre(r'(?s)<dt>Death:</dt>(?:\s|<[^<>]*>)*<li>[^<>]*</li>\s*<li>(.*?)</li>', html)
        if result:
            return self.getdata('city', result)
        return None

    def findoccupations(self, html: str):
        """Look up every word of the first "Occupations:" item."""
        section = self.findbyre(r'(?s)<dt>Occupations:</dt>.*?<li>(.*?)</li>', html)
        if section:
            return self.findallbyre(r'(\w+)', section, 'occupation')
        return None

    def findmixedrefs(self, html: str):
        """Return ('P244', id) pairs taken from the "LC number:" entry."""
        result = self.findbyre(r'(?s)<dt>LC number:</dt>.*?<li>(.*?)</li>', html)
        if result:
            # Spaces are removed before the letters+digits ids are extracted.
            result = result.replace(' ', '')
            results = self.findallbyre(r'[a-z]+\d+', result)
            return [('P244', lc_id) for lc_id in results]
        return None
class MusicBrainzAnalyzer(Analyzer):
    """Analyzer for MusicBrainz artist pages (property P434)."""

    def setup(self):
        """Set the database identifiers, URL templates and page language."""
        self.dbproperty = 'P434'
        self.dbid = 'Q19832969'
        self.dbname = 'MusicBrainz'
        self.urlbase = 'https://musicbrainz.org/artist/{id}'
        self.urlbase3 = 'https://musicbrainz.org/artist/{id}/relationships'
        self.hrtre = '(<h2 class="artist-information">.*?)<div id="footer">'
        self.language = 'en'

    def finddescription(self, html: str):
        """Return the first paragraph of the Wikipedia extract block."""
        return self.findbyre(r'<div class="wikipedia-extract-body wikipedia-extract-collapse"><p>(.+?)</p>', html)

    def findnames(self, html) -> List[str]:
        """Return the contents of all "sort-name" entries."""
        return self.findallbyre(r'(?s)<dd class="sort-name">(.*?)</dd>', html)

    def findinstanceof(self, html: str):
        """Look up the "type" entry and record whether it is a person.

        Sets ``self.isperson``, which findformationlocation,
        findorigcountry and findnationality read afterwards.
        """
        result = self.findbyre(r'<dd class="type">(.*?)</dd>', html, 'instanceof')
        self.isperson = result == 'Q5'
        return result

    def findinception(self, html: str):
        """Return the "Founded:" value."""
        return self.findbyre(r'(?s)<dt>Founded:</dt>.*?<dd[^<>]*>(.*?)[<\(]', html)

    def finddissolution(self, html: str):
        """Return the "Dissolved:" value."""
        return self.findbyre(r'(?s)<dt>Dissolved:</dt>.*?<dd[^<>]*>(.*?)[<\(]', html)

    def findformationlocation(self, html: str):
        """Look up "Founded in:" (or "Area:") as a 'city'; non-persons only."""
        if not self.isperson:
            return self.findbyre(r'(?s)<dt>Founded in:</dt>.*?<bdi>(\w+)', html, 'city') \
                or self.findbyre(r'(?s)<dt>Founded in:</dt>.*?<bdi>(.*?)</bdi>', html, 'city') \
                or self.findbyre(r'(?s)<dt>Area:</dt>.*?<bdi>(.*?)</bdi>', html, 'city')
        return None

    def findorigcountry(self, html: str):
        """Look up "Area:" as a 'country'; non-persons only."""
        if not self.isperson:
            return self.findbyre(r'(?s)<dt>Area:</dt>.*?<bdi>(.*?)</bdi>', html, 'country')
        return None

    def findnationality(self, html: str):
        """Look up "Area:" as a 'country'; persons only."""
        if self.isperson:
            return self.findbyre(r'(?s)<dt>Area:</dt>.*?<bdi>(.*?)</bdi>', html, 'country')
        return None

    def findisni(self, html: str):
        """Return the identifier that follows ``/isni/`` in the page."""
        return self.findbyre(r'/isni/(\w+)', html)

    def findviaf(self, html: str):
        """Return the identifier from a quoted viaf.org link."""
        return self.findbyre(r'"https://viaf.org/viaf/(\w+)/?"', html)

    def findwebsite(self, html: str):
        """Return the official website, or else the "home-favicon" link."""
        return self.findbyre(r'(?s)<th>offici.le website:.*?<bdi>(.*?)<', html) or \
            self.findbyre(r'<li class="home-favicon"><a href="(.*?)">', html)

    def findtwitter(self, html: str):
        """Return the handle shown after '@' in the "twitter-favicon" link."""
        return self.findbyre(r'<li class="twitter-favicon"><a href="[^"]*">@([^<>]*)</a>', html)

    def findfacebook(self, html: str):
        """Return the path component of the "facebook-favicon" link."""
        return self.findbyre(r'<li class="facebook-favicon"><a href="https://www.facebook.com/([^/"]+)/?">', html)

    def findgender(self, html: str):
        """Look up the "gender" element as 'gender'."""
        return self.findbyre(r'class="gender">(.*?)</', html, 'gender')

    def findbirthdate(self, html: str):
        """Return the "Born:" value."""
        return self.findbyre(r'(?s)<dt>Born:</dt>.*?<dd[^<>]*>(.*?)[<\(]', html)

    def finddeathdate(self, html: str):
        """Return the "Died:" value."""
        return self.findbyre(r'(?s)<dt>Died:</dt>.*?<dd[^<>]*>(.*?)[<\(]', html)

    def findbirthplace(self, html: str):
        """Look up the tag-stripped "Born in:" entry as a 'city'."""
        section = self.findbyre(r'(?s)<dt>Born in:</dt>\s*(<dd.*?</dd>)', html)
        if section:
            return self.getdata('city', self.TAGRE.sub('', section))
        return None

    def finddeathplace(self, html: str):
        """Look up the tag-stripped "Died in:" entry as a 'city'."""
        section = self.findbyre(r'(?s)<dt>Died in:</dt>\s*(<dd.*?</dd>)', html)
        if section:
            return self.getdata('city', self.TAGRE.sub('', section))
        # The genre lookup used to be fused onto the end of this method,
        # so a page without "Died in:" could return a list of genres as
        # the death place. It now lives in findgenres.
        return None

    def findgenres(self, html: str):
        """Look up the entries of the "Genres" section as 'music-genre'."""
        section = self.findbyre(r'(?s)<h2>Genres</h2>(.*?)<h\d', html)
        if section:
            return self.findallbyre('>(.*?)<', section, 'music-genre', alt=['genre'])
        return None

    def findmixedrefs(self, html: str):
        """Return default references plus Amazon (P4862) and IPI (P3453) ids.

        Social-media references are excluded from the default set.
        """
        return self.finddefaultmixedrefs(html, includesocial=False) + \
            [('P4862', self.findbyre(r'<li class="amazon-favicon"><a href="[^"]*amazon[^"\?]*/(B\w+)[\?"]', html))] + \
            [('P3453', result) for result in self.findallbyre(r'<dd class="ipi-code">(.*?)</dd>', html)]
class StructuraeAnalyzer(Analyzer):
    """Analyzer for Structurae structure pages (property P454)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P454'
        self.dbid = 'Q1061861'
        self.dbname = 'Structurae'
        self.urlbase = 'http://en.structurae.de/structures/data/index.cfm?ID={id}'
        self.hrtre = '(<h1.*?)Participants</h2>'
        self.language = 'en'

    def finddescription(self, html: str):
        """Return the content of the "Description" meta tag."""
        return self.findbyre(r'<meta name="Description" content="(.*?)"', html)

    def findlongtext(self, html: str):
        """Return the first paragraph block of the "notes" section."""
        return self.findbyre(r'(?s)<div class="js-acordion-body" id="notes">\s*<p>(.*?)</div>', html)

    def findlanguagenames(self, html: str):
        """Return (language, name) pairs from the "alternate" links.

        The name is the last path segment of the link with '-' replaced
        by a space.
        """
        alternates = re.findall(r'(?s)"alternate"[^<>]*hreflang="(\w+)"[^<>]*/([^<>"]*)">', html)
        return [(lang, slug.replace('-', ' ')) for lang, slug in alternates]

    def findnames(self, html) -> List[str]:
        """Return the <h1> heading and the "Name in ..." table value."""
        heading = self.findbyre(r'(?s)<h1[^<>]*>(.*?)<', html)
        local_name = self.findbyre(r'(?s)Name in [^<>]*</th>[^<>]*<td>(.*?)<', html)
        return [heading, local_name]

    def findinstanceof(self, html: str):
        """Return the fixed item 'Q41176' for every entry."""
        return 'Q41176'

    def findinception(self, html: str):
        """Return the linked "Completion" value."""
        return self.findbyre(r'(?s)<th>Completion.*?>([^<>]+)</a>', html)

    def finduse(self, html: str):
        """Look up the linked "Function / usage:" value as 'function'."""
        return self.findbyre(r'(?s)Function / usage:.*?>([^<>]+)</a>', html, 'function')

    def findlocation(self, html: str):
        """Look up the <strong> part of 'containedInPlace' as a 'city'."""
        return self.findbyre(r"(?s)itemprop='containedInPlace'.*?<strong>(.*?)</", html, 'city')

    def findcountry(self, html: str):
        """Look up the last span on the 'containedInPlace' line as 'country'."""
        return self.findbyre(r"itemprop='containedInPlace'.*>([^<>]+)</span>", html, 'country')

    def findaddress(self, html: str):
        """Return the "address" itemprop text."""
        return self.findbyre(r'itemprop="address">([^<>]+)</', html)

    def findcoords(self, html: str):
        """Return "<latitude> <longitude>" when both values are present."""
        latitude = self.findbyre(r'itemprop="latitude" content="(.*?)"', html)
        longitude = self.findbyre(r'itemprop="longitude" content="(.*?)"', html)
        if not (latitude and longitude):
            return None
        return f'{latitude} {longitude}'

    def findheights(self, html: str):
        """Return the "height" table value wrapped in a list."""
        height = self.findbyre(r'(?s)<td>height</td>.*<td>(.*?)</td>', html)
        return [height]

    def findfloorsabove(self, html: str):
        """Return the "number of floors (above ground)" table value."""
        return self.findbyre(r'(?s)<td>number of floors \(above ground\)</td>.*<td>(.*?)</td>', html)
class SelibrAnalyzer(Analyzer):
    """Analyzer for LIBRIS authority records (property P906)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P906'
        self.dbid = 'Q1798125'
        self.dbname = 'LIBRIS'
        self.urlbase = 'https://libris.kb.se/auth/{id}'
        # self.urlbase = None
        self.hrtre = '(.*)'
        self.language = 'en'

    def finddescription(self, html: str):
        """Return the text of the first plain <h1> heading."""
        heading_pattern = r'<h1>(.*?)</'
        return self.findbyre(heading_pattern, html)

    def findlongtext(self, html: str):
        """Return the content of the "bio" div."""
        bio_pattern = r'(?s)<div class="bio">(.*?)</div>'
        return self.findbyre(bio_pattern, html)

    def findinstanceof(self, html: str):
        """Return the fixed item 'Q5' for every entry."""
        return 'Q5'

    def findviaf(self, html: str):
        """Return the identifier that follows a viaf.org link."""
        viaf_pattern = r'http://viaf.org/viaf/(\w+)'
        return self.findbyre(viaf_pattern, html)

    def findnames(self, html) -> List[str]:
        """Return the heading text between ':' and the next ',' or tag."""
        name_pattern = r'(?s)<h1[^<>]*>[^<>]*:([^<>]*?)[,<]'
        return self.findallbyre(name_pattern, html)
class BneAnalyzer(Analyzer):
    """Analyzer for Biblioteca Nacional de España person pages (P950)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P950'
        self.dbid = None
        self.dbname = 'Biblioteca Nacional de España'
        self.urlbase = 'http://datos.bne.es/persona/{id}.html'
        self.hrtre = '(<h1.*?)<h3>Descarga en otros formatos'
        self.language = 'es'

    def findnames(self, html) -> List[str]:
        """Return the text of every <h3> heading."""
        return self.findallbyre('<h3>(.*?)<', html)

    def finddescriptions(self, html: str):
        """Return candidate descriptions from the og: meta tags and the bio."""
        return [
            self.findbyre(r'"og:description" content="([^"]+),', html),
            self.findbyre(r'"og:description" content="Descubre ([^"]+),', html),
            self.findbyre(r'"og:description" content="([^"]+)"', html),
            self.findbyre(r'"og:title" content="(.+?)"', html),
            self.findbyre(r'(?s)class="bio">.*?<p>(.*?)</p>', html),
        ]

    def findlongtext(self, html: str):
        """Return the content of the condensed data table."""
        return self.findbyre(r'(?s)<table class="table table-condensed table-responsive">(.*?)</table>', html)

    def findlastname(self, html: str):
        """Look up the <h1> text before the comma as 'lastname'."""
        return self.findbyre(r'<h1>([^<>]+),', html, 'lastname')

    def findfirstname(self, html: str):
        """Look up the word after the comma in <h1> as 'firstname'."""
        return self.findbyre(r'<h1>[^<>]+,\s*([\w\-]+)', html, 'firstname')

    def findbirthdate(self, html: str):
        """Return the birth year from its own field or from the <h1> dates.

        A value is only accepted if it lacks 'fl.', does not start with
        'm.' and contains the digit '1'.
        """
        result = self.findbyre(r'(?s)Año de nacimiento:\s*<span>(.*?)<', html) or \
            self.findbyre(r'<h1>[^<>]+\((?:n\.\s*)?([^\)<>-]+?)[–\-\)]', html)
        if result and 'fl.' not in result and not result.strip().startswith('m.') and '1' in result:
            return result
        return None

    def finddeathdate(self, html: str):
        """Return the death year from its own field or from the <h1> dates.

        The heading is skipped when it contains 'fl.'.
        """
        result = self.findbyre(r'(?s)Año de fallecimiento:\s*<span>(.*?)<', html)
        if result:
            return result
        preresult = self.findbyre(r'<h1>(.*?)</h1>', html)
        if preresult and 'fl.' not in preresult:
            return self.findbyre(r'<h1>[^<>]+\([^<>]+[–\-]([^<>]+\d{4}[^<>]+)\)', html)
        return None

    def findbirthplace(self, html: str):
        """Look up "Lugar de nacimiento:" as a 'city'."""
        return self.findbyre(r'(?s)Lugar de nacimiento:\s*<span>(.*?)<', html, 'city')

    def finddeathplace(self, html: str):
        """Look up "Lugar de fallecimiento:" as a 'city'."""
        return self.findbyre(r'(?s)Lugar de fallecimiento:\s*<span>(.*?)<', html, 'city')

    def findviaf(self, html: str):
        """Return the identifier from a quoted viaf.org link."""
        return self.findbyre(r'"http://viaf.org/viaf/(\w+)/?"', html)

    def findisni(self, html: str):
        """Return the identifier from a quoted isni-url.oclc.nl link."""
        return self.findbyre(r'"http://isni-url.oclc.nl/isni/(\w+)"', html)

    def findmixedrefs(self, html: str):
        """Hand *html* to ``finddefaultmixedrefs`` and return its result."""
        return self.finddefaultmixedrefs(html)

    def findoccupations(self, html: str):
        """Look up the comma-separated "Categoría profesional:" entries."""
        section = self.findbyre(r'(?s)<h4>Categoría profesional:(.*?)</h4>', html)
        if section:
            return self.findallbyre(r'([^<>,]*)', section, 'occupation')
        return None

    def findworkfields(self, html: str):
        """Look up the comma-separated "Campo de actividad:" entries."""
        section = self.findbyre(r'(?s)<h4>Campo de actividad:(.*?)</h4>', html)
        if section:
            return self.findallbyre(r'([^<>,]*)', section, 'subject')
        return None

    def findlanguagesspoken(self, html: str):
        """Look up the comma-separated "Lengua:" entries as 'language'."""
        # Three fixes, bringing this in line with the two methods above:
        # the stray second '>' after <h4> is dropped, the quantifier moves
        # inside the group (``([^<>,])*`` captured a single character), and
        # the values are looked up as 'language' rather than 'subject',
        # as the other findlanguagesspoken implementation in this file does.
        section = self.findbyre(r'(?s)<h4>Lengua:(.*?)</h4>', html)
        if section:
            return self.findallbyre(r'([^<>,]*)', section, 'language')
        return None
class OrcidAnalyzer(Analyzer):
    """Analyzer for ORCID profile pages (property P496)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P496'
        self.dbid = None
        self.dbname = 'Orcid'
        self.urlbase = 'https://orcid.org/{id}'
        self.language = 'en'
        self.hrtre = r'(<div class="workspace-section">.*?)</i>\s*Works\('

    def findinstanceof(self, html: str):
        """Return the fixed item 'Q5' for every entry."""
        return 'Q5'

    def findnames(self, html) -> List[str]:
        """Return the "full-name" and "other-name" element texts."""
        name_pattern = r'(?s)"(?:full|other)-name">(.*?)<'
        return self.findallbyre(name_pattern, html)

    def finddescriptions(self, html: str):
        """Return the bio text up to its first tag and up to its closing div."""
        up_to_first_tag = self.findbyre(r'(?s)<div class="bio-content">(.*?)<', html)
        whole_bio = self.findbyre(r'(?s)<div class="bio-content">(.*?)</div>', html)
        return [up_to_first_tag, whole_bio]

    def findlongtext(self, html: str):
        """Return the content of the "bio-content" div."""
        bio_pattern = r'(?s)<div class="bio-content">(.*?)</div>'
        return self.findbyre(bio_pattern, html)

    def findnationalities(self, html: str):
        """Look up every "country" element as 'country'."""
        country_pattern = r'"country">(.*?)<'
        return self.findallbyre(country_pattern, html, 'country')

    def findschools(self, html: str):
        """Only print a reminder to check education by hand; return None."""
        reminder = 'Check education and affiliations by hand!'
        pywikibot.info(reminder)
class CbdbAnalyzer(Analyzer):
    """Analyzer for China Biographical Database records (property P497)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P497'
        self.dbid = 'Q13407958'
        self.dbname = 'China Biographical Database'
        self.urlbase = 'https://cbdb.fas.harvard.edu/cbdbapi/person.php?id={id}'
        self.language = 'zh'
        self.hrtre = r'(<table style="font-size:smaller">.*?)<hr>'

    def findlanguagenames(self, html: str):
        """Return ('en', name) and ('zh', name) pairs from the name line."""
        english = self.findbyre(r'<b>索引/中文/英文名稱</b>:[^<>]*/([^<>]*)<', html)
        chinese = self.findbyre(r'<b>索引/中文/英文名稱</b>:[^<>]*?/([^<>]*)/', html)
        return [('en', english), ('zh', chinese)]

    def findbirthdate(self, html: str):
        """Return the parenthesised value after the 生年 label."""
        birth_pattern = r'(?s)<b>生年</b>[^<>]*\(([^<>]*?)\)'
        return self.findbyre(birth_pattern, html)

    def finddeathdate(self, html: str):
        """Return the parenthesised value after the 卒年 label."""
        death_pattern = r'(?s)<b>卒年</b>[^<>]*\(([^<>]*?)\)'
        return self.findbyre(death_pattern, html)

    def findlongtext(self, html: str):
        """Return the first table cell after the 註 label."""
        note_pattern = r'(?s)註.*?<td>(.*?)</td>'
        return self.findbyre(note_pattern, html)

    def findnationalities(self, html: str):
        """Look up the first one or two characters after each year label.

        For both the 生年 and the 卒年 label, a one-character 'dynasty'
        lookup is tried first, then a two-character one.
        """
        at_birth = self.findbyre(r'(?s)<b>生年</b>:\s*(.)', html, 'dynasty')
        if not at_birth:
            at_birth = self.findbyre(r'(?s)<b>生年</b>:\s*(..)', html, 'dynasty')
        at_death = self.findbyre(r'(?s)<b>卒年</b>:\s*(.)', html, 'dynasty')
        if not at_death:
            at_death = self.findbyre(r'(?s)<b>卒年</b>:\s*(..)', html, 'dynasty')
        return [at_birth, at_death]
class FindGraveAnalyzer(Analyzer):
    """Analyzer for Find a Grave memorial pages (property P535)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P535'
        self.dbid = 'Q63056'
        self.dbname = 'Find a Grave'
        self.urlbase = 'https://www.findagrave.com/memorial/{id}'
        self.language = 'en'
        self.hrtre = r'(<h1.*?</table>)'

    def getvalue(self, name, html, category=None):
        """Look up the quoted value of a ``name: "value"`` pair in *html*."""
        pair_pattern = fr'{name}: "(.*?)"'
        return self.findbyre(pair_pattern, html, category)

    def findnames(self, html) -> List[str]:
        """Return the 'shareTitle' value wrapped in a list."""
        title = self.getvalue('shareTitle', html)
        return [title]

    def findlongtext(self, html: str):
        """Return the start of the "fullBio" element."""
        return self.findbyre(r'(?s) id="fullBio">(.*?)<', html)

    def finddeathdate(self, html: str):
        """Return the first of: deathDate value, deathDate element, deathYear."""
        date = self.getvalue('deathDate', html)
        if not date:
            date = self.findbyre(r'"deathDate">(.*?)<', html)
        if not date:
            date = self.getvalue('deathYear', html)
        return date

    def findburialplace(self, html: str):
        """Look up the cemetery, then its city, then the location name."""
        candidates = (
            ('cemeteryName', 'cemetary'),
            ('cemeteryCityName', 'city'),
            ('locationName', 'city'),
        )
        place = None
        for key, category in candidates:
            place = self.getvalue(key, html, category)
            if place:
                break
        return place

    def findfirstname(self, html: str):
        """Look up the 'firstName' value as 'firstname'."""
        return self.getvalue('firstName', html, 'firstname')

    def findlastname(self, html: str):
        """Look up the 'lastName' value as 'lastname'."""
        return self.getvalue('lastName', html, 'lastname')

    def findbirthdate(self, html: str):
        """Return the "birthDate" element text, or else the birthYear value."""
        date = self.findbyre(r'"birthDate">(.*?)<', html)
        if not date:
            date = self.getvalue('birthYear', html)
        return date

    def findbirthplace(self, html: str):
        """Look up the "birthPlace" element as a 'city'."""
        return self.findbyre(r'"birthPlace">(.*?)<', html, 'city')

    def finddeathplace(self, html: str):
        """Look up the "deathPlace" element as a 'city'."""
        return self.findbyre(r'"deathPlace">(.*?)<', html, 'city')

    def findfather(self, html: str):
        """Look up 'fatherName', or the first male in the "Ouders" list."""
        named = self.getvalue('fatherName', html, 'person')
        if named:
            return named
        parents = self.findbyre(r'(?s)>Ouders</b>(.*?)</ul>', html)
        if not parents:
            return None
        matches = self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', parents, 'male-person')
        # First non-empty lookup result, or None when there is none.
        return next((match for match in matches if match), None)

    def findmother(self, html: str):
        """Look up 'motherName', or the first female in the "Ouders" list."""
        named = self.getvalue('motherName', html, 'person')
        if named:
            return named
        parents = self.findbyre(r'(?s)>Ouders</b>(.*?)</ul>', html)
        if not parents:
            return None
        matches = self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', parents, 'female-person')
        # First non-empty lookup result, or None when there is none.
        return next((match for match in matches if match), None)

    def findspouses(self, html: str):
        """Look up the spNName values, or else the "Partners" list entries."""
        named = self.findallbyre(r'sp\d+Name: "(.*?)"', html, 'person')
        if named:
            return named
        partners = self.findbyre(r'(?s)>Partners</b>(.*?)</ul>', html)
        if not partners:
            return None
        return self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', partners, 'person')

    def findsiblings(self, html: str):
        """Look up the entries of the "Broer...zus" list as 'person'."""
        siblings = self.findbyre(r'(?s)>Broer[^<>]*zus[^<>]*</b>(.*?)</ul>', html)
        if not siblings:
            return None
        return self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', siblings, 'person')

    def findchildren(self, html: str):
        """Look up the entries of the "Kinderen" list as 'person'."""
        children = self.findbyre(r'(?s)>Kinderen</b>(.*?)</ul>', html)
        if not children:
            return None
        return self.findallbyre(r'(?s)<h4[^<>]*>(.*?)</h4>', children, 'person')
class IpniAuthorsAnalyzer(Analyzer):
    """Analyzer for International Plant Names Index authors (P586)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P586'
        self.dbid = 'Q922063'
        self.dbname = 'International Plant Names Index'
        self.urlbase = 'http://www.ipni.org/ipni/idAuthorSearch.do?id={id}'
        self.language = 'en'
        self.hrtre = '</h2>(.*?)<p>View the'

    def findinstanceof(self, html: str):
        """Return the fixed item 'Q5' for every entry."""
        return 'Q5'

    def findnames(self, html) -> List[str]:
        """Return the <h3> names plus any "Alternative Names:" entries."""
        result = self.findallbyre(r'(?s)<h3>(.*?)[\(<]', html)
        # The section ends at the next heading tag. The former pattern
        # ended in the literal text 'h/d' instead of 'h\d', so it could
        # not match a following <h4>/<h3> heading.
        section = self.findbyre(
            r'(?s)<h4>Alternative Names:\s*</h4(>.*?<)h\d', html)
        if section:
            # '(?)' is not a valid regular expression; the dot-all flag
            # '(?s)' used throughout this class is what was meant.
            result += self.findallbyre(r'(?s)>([^<>]*)<', section)
        return result

    def findlastname(self, html: str):
        """Look up the <h3> text before the comma as 'lastname'."""
        return self.findbyre(r'(?s)<h3>([^<>]*?),', html, 'lastname')

    def findfirstname(self, html: str):
        """Look up the word after the comma in <h3> as 'firstname'."""
        return self.findbyre(r'(?s)<h3>[^<>]*,\s*([\w\-]+)', html, 'firstname')

    def findlongtext(self, html: str):
        """Return the "Comment:" section up to the next heading."""
        return self.findbyre(r'(?s)<h4>Comment:\s*</h4>(.*?)<h\d', html)

    def findbirthdate(self, html: str):
        """Return the digits after '(' and before '-' in the <h3> heading."""
        return self.findbyre(r'(?s)<h3>[^<>]*\((\d+)-', html)

    def finddeathdate(self, html: str):
        """Return the digits after '-' and before ')' in the <h3> heading."""
        return self.findbyre(r'(?s)<h3>[^<>]*\([^<>]*?-(\d+)\)', html)

    def findmixedrefs(self, html: str):
        """Return the "Standard Form:" value as a single ('P428', id) pair."""
        return [('P428', self.findbyre(r'(?s)<h4>Standard Form:\s*</h4>\s*<p>(.*?)<', html))]

    def findworkfields(self, html: str):
        """Look up the comma-separated "Area of Interest:" entries."""
        section = self.findbyre(r'(?s)<h4>Area of Interest:\s*</h4>\s*<p>(.*?)</p>', html)
        if section:
            return self.findallbyre(r'([^,]*)', section, 'subject')
        return None

    def findsources(self, html: str):
        """Look up the comma-separated "Information Source:" entries."""
        section = self.findbyre(r'(?s)<h4>Information Source:</h4>\s*<p>(.*?)</p>', html)
        if section:
            return self.findallbyre(r'([^,]*)', section, 'source')
        return None

    def findnationalities(self, html: str):
        """Look up the entries of the "Countries:" section as 'country'."""
        section = self.findbyre(r'(?s)<h4>Countries:\s*</h4>(.*?)(?:<h|<p>View)', html)
        if section:
            return self.findallbyre(r'(?s)>(.*?)<', section, 'country')
        return None
class GnisAnalyzer(Analyzer):
    """Analyzer for GNIS feature pages (property P590)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P590'
        self.dbid = None
        self.dbname = 'GNIS'
        self.urlbase = 'https://geonames.usgs.gov/apex/f?p=gnispq:3:::NO::P3_FID:{id}'
        self.language = 'en'

    def findnames(self, html) -> List[str]:
        """Return the "Name:" table value as a list (empty if not found)."""
        # The annotation promises a list, but the bare findbyre result
        # was returned, so callers got a single value instead.
        name = self.findbyre(r'Name:</td><td[^<>]*>(.*?)<', html)
        return [name] if name else []

    def findinstanceof(self, html: str):
        """Look up the "Class:" table value as 'instanceof'."""
        return self.findbyre(
            r'Class:</td><td[^<>]*>(.*?)[<\(]', html, 'instanceof')

    def findelevations(self, html: str):
        """Return the elevation as '<n> feet' and '<n> m' strings.

        The "Elevation:" cell is read as ``<feet>/<metres>``. A part that
        is not found is left out instead of raising on ``None + str``.
        """
        feet = self.findbyre(r'Elevation:</td><td[^<>]*>(\d+)/', html)
        metres = self.findbyre(r'Elevation:</td><td[^<>]*>\d+/(\d+)', html)
        elevations = []
        if feet:
            elevations.append(feet + ' feet')
        if metres:
            elevations.append(metres + ' m')
        return elevations

    def findadminloc(self, html: str):
        """Look up "COUNTY_NAME", or else "STATE_NAME", as 'county'."""
        return self.findbyre(r'"COUNTY_NAME">(.*?)<', html, 'county') or \
            self.findbyre(r'"STATE_NAME">(.*?)<', html, 'county')

    def findcountry(self, html: str):
        """Return the fixed item 'Q30' for every entry."""
        return 'Q30'

    def findcoords(self, html: str):
        """Return "<LAT> <LONGI>" when both values are present."""
        lat = self.findbyre(r'"LAT">(.*?)<', html)
        lon = self.findbyre(r'"LONGI">(.*?)<', html)
        if lat and lon:
            return f'{lat} {lon}'
        return None
class MathGenAnalyzer(Analyzer):
    """Analyzer for Mathematics Genealogy Project pages (property P549)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P549'
        self.dbid = 'Q829984'
        self.dbname = 'Mathematics Genealogy Project'
        self.urlbase = 'https://www.genealogy.math.ndsu.nodak.edu/id.php?id={id}'
        self.hrtre = '(<h2.*?)We welcome any additional information.'
        self.language = 'en'
        self.escapehtml = True

    def findnames(self, html) -> List[str]:
        """Return the text of the first <h2> heading wrapped in a list."""
        heading = self.findbyre(r'(?s)<h2[^<>]*>(.*?)<', html)
        return [heading]

    def findinstanceof(self, html: str):
        """Return the fixed item 'Q5' for every entry."""
        return 'Q5'

    def finddegrees(self, html: str):
        """Look up every "Ph.D." element text as 'degree'."""
        degree_pattern = r'(?s)>\s*(Ph\.D\.)\s*<'
        return self.findallbyre(degree_pattern, html, 'degree')

    def findschools(self, html: str):
        """Look up the element following each "Ph.D." as 'university'."""
        school_pattern = r'(?s)>\s*Ph\.D\.\s*<[^<>]*>(.*?)<'
        return self.findallbyre(school_pattern, html, 'university')

    def findadvisors(self, html: str):
        """Look up the element following each "Advisor...:" as 'scientist'."""
        advisor_pattern = r'(?s)Advisor[^<>]*:[^<>]*<[^<>]*>(.*?)<'
        return self.findallbyre(advisor_pattern, html, 'scientist')

    def finddocstudents(self, html: str):
        """Look up the links in the students table as 'scientist'.

        The "Students:" table is used if present, otherwise the table
        after the "Descendants" header.
        """
        table = (
            self.findbyre(r'(?s)Students:.*?<table[^<>]*>(.*?)</table>', html)
            or self.findbyre(r'(?s)<th>Descendants</th>(.*?)</table>', html)
        )
        if not table:
            return None
        return self.findallbyre(r'(?s)<a[^<>]*>(.*?)<', table, 'scientist')
class LeonoreAnalyzer(Analyzer):
    """Analyzer for Léonore database records (property P640)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P640'
        self.dbid = 'Q2886420'
        self.dbname = 'Léonore'
        self.urlbase = 'http://www2.culture.gouv.fr/public/mistral/leonore_fr?ACTION=CHERCHER&FIELD_1=COTE&VALUE_1={id}'
        self.hrtre = '(<TABLE VALIGN=TOP .*?</TABLE>)'
        self.language = 'fr'
        self._results = None
        self.escapeunicode = True

    def getvalue(self, field, html, dtype=None):
        """Look up the text of the first <TD> cell following label *field*."""
        return self.findbyre(
            r'(?s)>\s*{}\s*<.*?<TD[^<>]*>(?:<[^<>]*>|\s)*([^<>]+)</'
            .format(field), html, dtype)

    def findinstanceof(self, html: str):
        """Return the fixed item 'Q5' for every entry."""
        return 'Q5'

    def findnames(self, html) -> List[str]:
        """Return "<Prénoms> <Nom in title case>" as a one-element list.

        An empty list is returned when either field is missing.
        """
        # Explicit checks replace the former ``except TypeError``: that
        # covered a missing 'Prénoms' (None + str), but a missing 'Nom'
        # raised an uncaught AttributeError from ``None.title()``.
        firstnames = self.getvalue('Prénoms', html)
        if firstnames is None:
            return []
        lastname = self.getvalue('Nom', html)
        if lastname is None:
            return []
        return [firstnames + ' ' + lastname.title()]

    def findlastname(self, html: str):
        """Look up the 'Nom' field as 'lastname'."""
        return self.getvalue('Nom', html, 'lastname')

    def findfirstname(self, html: str):
        """Look up the 'Prénoms' field as 'firstname'."""
        return self.getvalue('Prénoms', html, 'firstname')

    def findbirthdate(self, html: str):
        """Return the 'Date de naissance' field."""
        return self.getvalue('Date de naissance', html)

    def findbirthplace(self, html: str):
        """Look up the 'Lieu de naissance' field as a 'city'."""
        return self.getvalue('Lieu de naissance', html, 'city')

    def findgender(self, html: str):
        """Look up the 'Sexe' field as 'gender'."""
        return self.getvalue('Sexe', html, 'gender')
class OpenLibraryAnalyzer(Analyzer):
    """Analyzer for Open Library pages (property P648)."""

    def setup(self):
        """Set the database identifiers, URL template and page language."""
        self.dbproperty = 'P648'
        self.dbid = 'Q1201876'
        self.dbname = 'Open Library'
        self.urlbase = 'https://openlibrary.org/works/{id}'
        self.hrtre = '(<h1.*?)</div>'
        self.language = 'en'

    def finddescription(self, html: str):
        """Return the content of the description meta tag."""
        description_pattern = r'description" content="(.*?)"'
        return self.findbyre(description_pattern, html)

    def findnames(self, html) -> List[str]:
        """Return the <title> text before '|' plus all "name" itemprops."""
        from_title = self.findallbyre(r'<title>([^<>]*)\|', html)
        from_itemprop = self.findallbyre('itemprop="name">(.*?)<', html)
        return from_title + from_itemprop

    def findlongtext(self, html: str):
        """Return the "contentBody" div up to the "clearfix" div."""
        body_pattern = r'<div id="contentBody">(.*?)<div class="clearfix">'
        return self.findbyre(body_pattern, html)

    def findinstanceof(self, html: str):
        """Look up the og:type content as 'instanceof'."""
        type_pattern = 'og:type" content="(.*?)"'
        return self.findbyre(type_pattern, html, 'instanceof')

    def findbirthdate(self, html: str):
        """Return the "birthDate" itemprop text."""
        birth_pattern = '<span itemprop="birthDate">(.*?)<'
        return self.findbyre(birth_pattern, html)

    def finddeathdate(self, html: str):
        """Return the "deathDate" itemprop text."""
        death_pattern = '<span itemprop="deathDate">(.*?)<'
        return self.findbyre(death_pattern, html)
[docs]class RkdArtistsAnalyzer(Analyzer):
[docs] def setup(self): self.dbproperty = 'P650' self.dbid = 'Q17299517' self.dbname = 'RKDartists' self.urlbase = 'https://rkd.nl/nl/explore/artists/{id}' self.hrtre = '(<div class="fieldGroup.*?)<script>' self.language = 'nl' self.escapehtml = True
[docs] def findinstanceof(self, html: str): return 'Q5'
[docs] def finddescription(self, html: str): return self.findbyre(r'"og:description" content="(.*?)"', html)
[docs] def findlongtext(self, html: str): return self.findbyre(r'(?s)<div class="left">(.*?)<dt>Permalink</dt>', html)
[docs] def findnames(self, html) -> List[str]: return [ self.findbyre(r'(?s)itemprop="name">(.*?)<', html), self.findbyre(r'(?s)<h2[^<>]*>(.*?)<', html) ] + self.findallbyre(r'itemprop="alternateName">(.*?)<', html)
[docs] def findgender(self, html: str): return self.findbyre(r'(?s)itemprop="gender">(.*?)<', html, 'gender')
[docs] def findoccupations(self, html: str): section = self.findbyre(r'(?s)Kwalificaties\s*</dt>.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'">([^<>]+)</span>', section, 'occupation')
[docs] def findbirthplace(self, html: str): return self.findbyre(r'itemprop="birthPlace">([^<>]*),', html, 'city') or \ self.findbyre(r'itemprop="birthPlace">([^<>]*)<', html, 'city')
[docs] def findbirthdate(self, html: str): return self.findbyre(r'itemprop="birthDate">([^<>]*?)[</]', html)
[docs] def finddeathplace(self, html: str): return self.findbyre(r'itemprop="deathPlace">([^<>]*),', html, 'city') or \ self.findbyre(r'itemprop="deathPlace">([^<>]*)<', html, 'city')
[docs] def finddeathdate(self, html: str): return self.findbyre(r'itemprop="deathDate">([^<>]*?)[</]', html)
[docs] def findworkplaces(self, html: str): section = self.findbyre(r'(?s)Werkzaam in.*?<ul>(.*?)</ul>', html) if section: return self.findallbyre(r'>([^<>]+)</a>', section, 'city')
[docs] def findstudents(self, html: str): section = self.findbyre(r'(?s)Leraar van.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'>([^<>]*)</span>', section, 'artist')
[docs] def findteachers(self, html: str): section = self.findbyre(r'(?s)Leerling van.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'>([^<>]*)</span>', section, 'artist')
[docs] def findinfluences(self, html: str): section = self.findbyre(r'(?s)Be.nvloed door.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'>([^<>]*)</span>', section, 'artist')
[docs] def findschools(self, html: str): section = self.findbyre(r'(?s)<dt>\s*Opleiding\s*</dt>.*?<dd>(.*?)</dd>', html) if section: return self.findallbyre(r'>([^<>]+)</a>', section, 'university')
[docs] def findnationalities(self, html: str): return self.findallbyre(r'itemprop="nationality">(.*?)<', html, 'country')
[docs] def findgenres(self, html: str): return self.findallbyre(r'Onderwerpen\s*<em>(.*?)<', html, 'art-genre', alt=['genre'])
[docs] def findmovements(self, html: str): return self.findallbyre(r'Stroming\s*<em>(.*?)<', html, 'movement')
[docs] def findsiblings(self, html: str): return self.findallbyre(r'[bB]roer van ([^<>]*)', html, 'person') + \ self.findallbyre(r'[zZ]us(?:ter)? van ([^<>]*)', html, 'person')
[docs] def findfather(self, html: str): return self.findbyre(r'[zZ]oon van ([^<>]*)', html, 'male-person', skips=['female-person']) or \ self.findbyre(r'[dD]ochter van ([^<>]*)', html, 'male-person', skips=['female-person'])
[docs] def findmother(self, html: str): return self.findbyre(r'[zZ]oon van ([^<>]*)', html, 'female-person', skips=['male-person']) or \ self.findbyre(r'[dD]ochter van ([^<>]*)', html, 'female-person', skips=['male-person'])
[docs] def findmemberships(self, html: str): return self.findallbyre(r'Lid van[^<>]*<em>(.*?)<', html, 'organization')
[docs] def findmixedrefs(self, html: str): return self.finddefaultmixedrefs(html, includesocial=False)
def findfloruit(self, html: str):
    """Return the text of the 'Werkzame periode' entry, if present."""
    pattern = r'(?s)<dt>\s*Werkzame periode\s*</dt>\s*<dd>(.*?)<'
    return self.findbyre(pattern, html)
class BiografischPortaalAnalyzer(Analyzer):
    """Analyzer for Biografisch Portaal person pages (property P651)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbname = 'Biografisch Portaal'
        self.dbid = 'Q1868372'
        self.dbproperty = 'P651'
        self.language = 'nl'
        self.urlbase = 'http://www.biografischportaal.nl/persoon/{id}'
        self.hrtre = '(<h1.*)<h2'

    def finddescription(self, html: str):
        """Return the table content starting at the 'geboren' header."""
        pattern = r'(?s)<th>(geboren.*?)</table>'
        return self.findbyre(pattern, html)

    def findnames(self, html) -> List[str]:
        """Return the page title plus any listed alternative names."""
        names = [self.findbyre(r'(?s)<title>(.*?)<', html)]
        alternatives = self.findbyre(
            r'(?s)<th>alternatieve namen</th>(.*?)</tr>', html)
        if alternatives:
            names.extend(self.findallbyre('<li>(.*?)<', alternatives))
        return names

    def findlongtext(self, html: str):
        """Return the content of the 'levensbeschrijvingen' div."""
        pattern = (r'(?s)<div class="levensbeschrijvingen">'
                   r'(.*?)<!-- content end')
        return self.findbyre(pattern, html)

    def findbirthplace(self, html: str):
        """Return the place from the 'geboren' row ('city' type)."""
        pattern = (r'(?s)<th>geboren</th>[^<>]*<td>[^<>]*'
                   r'<span><br\s*/>([^<>]*)<')
        return self.findbyre(pattern, html, 'city')

    def findbirthdate(self, html: str):
        """Return the 'geboren' date unless it contains 'tussen'."""
        born = self.findbyre(r'(?s)<th>geboren</th>[^<>]*<td>(.*?)<', html)
        if not born or 'tussen' in born:
            return None
        return born

    def finddeathplace(self, html: str):
        """Return the place from the 'gestorven' row ('city' type)."""
        pattern = (r'(?s)<th>gestorven</th>[^<>]*<td>[^<>]*'
                   r'<span><br\s*/>([^<>]*)<')
        return self.findbyre(pattern, html, 'city')

    def finddeathdate(self, html: str):
        """Return the 'gestorven' date unless it contains 'tussen'."""
        died = self.findbyre(r'(?s)<th>gestorven</th>[^<>]*<td>(.*?)<', html)
        if not died or 'tussen' in died:
            return None
        return died

    def findmixedrefs(self, html: str):
        """Return the default mixed references found in the page."""
        mixed_refs = self.finddefaultmixedrefs(html)
        return mixed_refs

    def findgender(self, html: str):
        """Return the first list item of the 'sekse' row ('gender' type)."""
        pattern = r'(?s)<th>sekse</th>.*?<li>(.*?)<'
        return self.findbyre(pattern, html, 'gender')

    def findsources(self, html: str):
        """Return the texts of all external links ('source' type)."""
        pattern = (r'(?s)<a class="external_link open_in_new_window"'
                   r'[^<>]*>(.*?)<')
        return self.findallbyre(pattern, html, 'source')
class NkcrAnalyzer(Analyzer):
    """Analyzer for NKC authority records (property P691)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbproperty = 'P691'
        self.dbid = 'Q13550863'
        self.dbname = 'NKC'
        self.urlbase = 'https://aleph.nkp.cz/F/?func=find-c&local_base=aut&ccl_term=ica={id}'
        self.language = 'cs'
        self.hrtre = '(<table width=100%>.*?)<script language='

    def prepare(self, html: str):
        """Return *html* with non-breaking space entities made plain."""
        return html.replace('&nbsp;', ' ')

    def getvalue(self, field, html, dtype=None):
        """Return the value cell that follows the label cell *field*.

        *field* is inserted into a regular expression unescaped, so it
        may itself contain regex syntax.
        """
        return self.findbyre(
            fr'(?s)<td[^<>]*>\s*{field}\s*</td>\s*<td[^<>]*>(?:<[^<>]*>)*(.*?)<',
            html, dtype)

    def findlongtext(self, html: str):
        """Return the 'Biogr./Hist. .daje' field."""
        return self.getvalue(r'Biogr\./Hist\. .daje', html)

    def findinstanceof(self, html: str):
        """Return 'Q5' unconditionally."""
        return 'Q5'

    def findnames(self, html) -> List[str]:
        """Return heading and pseudonym with the last comma part removed."""
        result = [
            self.getvalue('Z.hlav.', html),
            self.getvalue('Pseudonym', html),
        ]
        # Everything after the final comma is dropped from each value.
        return [','.join(r.split(',')[:-1]) for r in result if r]

    def finddescription(self, html: str):
        """Return the 'Biogr./Hist. .daje' field."""
        return self.getvalue(r'Biogr\./Hist\. .daje', html)

    def findnationality(self, html: str):
        """Return the first word of the country or biography field.

        The word is looked up with the 'country' type. Returns None when
        neither field is present.
        """
        section = self.getvalue('Související zem.', html) or \
            self.getvalue(r'Biogr\./Hist\. .daje', html)
        if section:
            return self.findbyre(r'(\w+)', section, 'country')
        return None

    def findbirthdate(self, html: str):
        """Return the date following 'narozen' / 'narozena'."""
        return self.findbyre(r'[Nn]arozena? ([\d\.\s]*\d)', html)

    def finddeathdate(self, html: str):
        """Return the date following 'zem.el' / 'zem.ela'."""
        return self.findbyre(r'[Zz]em.ela? ([\d\.\s]*\d)', html)

    def findbirthplace(self, html: str):
        """Return the place following 'narozen(a) <date> v'."""
        return self.findbyre(r'[Nn]arozena? [\d\.\s]* v ([\w\s]*)', html, 'city')

    def finddeathplace(self, html: str):
        """Return the place following 'zem.el(a) <date> v'.

        The trailing 'a' is optional, matching finddeathdate, so that
        both verb forms are recognised.
        """
        return self.findbyre(r'[Zz]em.ela? [\d\.\s]* v ([\w\s]*)', html, 'city')

    def findoccupations(self, html: str):
        """Return the occupations named in the biography field.

        Text from the first occurrence of 'special' onwards is ignored;
        the remainder is split on ' a ' and on ',', '.' and ';'.
        Returns None when the field is missing.
        """
        section = self.getvalue(r'Biogr\./Hist\. .daje', html)
        if section:
            if 'special' in section:
                section = section[:section.find('special')]
            parts = section.split(' a ')
            result = []
            for part in parts:
                result += self.findallbyre(r'([^\,\.;]*)', part, 'occupation')
            return result
        return None

    def findrelorder(self, html: str):
        """Return the related organisation as 'religious order'."""
        return self.getvalue(r'Související org\.', html, 'religious order')

    def findlanguagesspoken(self, html: str):
        """Return the ';'-separated entries of the 'Jazyk' field.

        Returns None when the field is missing.
        """
        section = self.getvalue('Jazyk', html)
        if section:
            return self.findallbyre(r'([^;]+)', section, 'language')
        return None

    def findworkfields(self, html: str):
        """Return the subjects following a set of Czech key phrases."""
        results = []
        for regex in [
            r'[oO]dborník v (.*?)[\.<]',
            r'[sS]pecial\w* (?:se )?(?:v|na) (.*?)[\.<]',
            r'[zZ]abývá se (.*?)[\.<]',
            r'Zaměřuje se na (.*?)[\.<]',
            r'[oO]boru (.*?)[\.<]',
            r'[zZ]aměřený na (.*?)[\.<]',
        ]:
            sections = self.findallbyre(regex, html)
            for section in sections:
                parts = section.split(' a ')
                for part in parts:
                    # Drop a leading 'v ' and any ' v ' inside the part.
                    if part.startswith('v '):
                        part = part[2:]
                    results += self.findallbyre(
                        r'([\w\s]+)', part.replace(' v ', ' '), 'subject')
        return results
class DbnlAnalyzer(Analyzer):
    """Analyzer for DBNL author pages (property P723)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbname = 'DBNL'
        self.dbid = 'Q2451336'
        self.dbproperty = 'P723'
        self.language = 'nl'
        self.urlbase = 'http://www.dbnl.org/auteurs/auteur.php?id={id}'
        self.hrtre = '(<p><span class="label">.*?)<form class="mainsearchform"'
        self.escapehtml = True

    def findnames(self, html) -> List[str]:
        """Return name candidates from the title and the 'naam' elements."""
        from_title = self.findbyre(r'<title>(.*?)[&<·]', html)
        from_class = self.findbyre(r'"naam">(?:<[^<>]*>)*([^<>]+)<', html)
        from_anchor = self.findbyre(r'href="#naam">(.*?)<', html)
        return [from_title, from_class, from_anchor]

    def findlongtext(self, html: str):
        """Return the content of the article element."""
        pattern = r'(?s)<article[^<>]*>(.*?)</article>'
        return self.findbyre(pattern, html)

    def findbirthdate(self, html: str):
        """Return the italic text following 'geboren'."""
        pattern = r'>geboren(?:<[^<>]*>)*<i>(.*?)<'
        return self.findbyre(pattern, html)

    def findbirthplace(self, html: str):
        """Return the text after ' te ' in the 'geboren' entry ('city')."""
        pattern = r'>geboren.*? te (?:<[^<>]*>)*([^<>]+)<'
        return self.findbyre(pattern, html, 'city')

    def finddeathdate(self, html: str):
        """Return the italic text following 'overleden'."""
        pattern = r'>overleden(?:<[^<>]*>)*<i>(.*?)<'
        return self.findbyre(pattern, html)

    def findburialdate(self, html: str):
        """Return a '(begraven)' date with that marker removed."""
        burial = self.findbyre(r'(\d+ \w+ \(begraven\) \d+)', html)
        if not burial:
            return None
        return burial.replace('(begraven) ', '')

    def finddeathplace(self, html: str):
        """Return the text after ' te ' in the 'overleden' entry ('city')."""
        pattern = r'>overleden<.*?> te (?:<[^<>]*>)*([^<>]+)<'
        return self.findbyre(pattern, html, 'city')

    def findwebpages(self, html: str):
        """Return website link texts plus absolute biography links.

        Link targets from the 'Biografie' table are prefixed with
        'https://www.dbnl.org/' after stripping their leading slashes.
        """
        pages = []
        websites = self.findbyre(
            r'(?s)<section id="websites">.*?<table>(.*?)</table>', html)
        if websites:
            pages.extend(self.findallbyre(r'>([^<>]*)</a>', websites))
        biographies = self.findbyre(
            r'(?s)<h\d[^<>]*>Biografie[^<>]*(<.*?)</table>', html)
        if biographies:
            for target in self.findallbyre(r'<a href="(.*?)"', biographies):
                pages.append('https://www.dbnl.org/' + target.lstrip('/'))
        return pages

    def findsources(self, html: str):
        """Return the link texts of the 'Biografie' table ('source' type).

        Returns None when the table is not found.
        """
        biographies = self.findbyre(
            r'(?s)<h\d[^<>]*>Biografie[^<>]*(<.*?)</table>', html)
        if not biographies:
            return None
        return self.findallbyre(r'>([^<>]*)</a>', biographies, 'source')
class SikartAnalyzer(Analyzer):
    """Analyzer for SIKART artist pages (property P781)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbproperty = 'P781'
        self.dbid = 'Q683543'
        self.dbname = 'SIKART'
        self.urlbase = 'http://www.sikart.ch/KuenstlerInnen.aspx?id={id}'
        self.language = 'de'
        self.hrtre = '<!-- content_start -->(.*?)<!-- content_end -->'
        self.escapehtml = True

    def getvalue(self, field, html, dtype=None):
        """Return the content of the first div following label *field*."""
        return self.findbyre(r'(?s)>{}<.*?<div[^<>]*>(.*?)<'
                             .format(field), html, dtype)

    def findinstanceof(self, html: str):
        """Return 'Q5' unconditionally."""
        return 'Q5'

    def findnames(self, html) -> List[str]:
        """Return name candidates from the title and the first heading."""
        return [
            self.findbyre(r'<title>([^<>]+?)-', html),
            self.findbyre(r'<h1>(.*?)<', html),
        ]

    def finddescriptions(self, html: str):
        """Return the 'Vitazeile' text and its part before the first '.'.

        Returns an empty list when the page has no 'Vitazeile' value;
        previously that case raised AttributeError on ``None.split``.
        """
        vita = self.getvalue('Vitazeile', html)
        if not vita:
            return []
        return [vita, vita.split('.')[0]]

    def findlongtext(self, html: str):
        """Return everything between the content start and end markers."""
        return self.findbyre(
            r'(?s)<!-- content_start -->(.*)<!-- content_end -->', html)

    def findlastname(self, html: str):
        """Return the 'token.lastname' value ('lastname' type)."""
        return self.findbyre(r'token.lastname=(\w+)', html, 'lastname')

    def findfirstname(self, html: str):
        """Return the 'token.firstname' value ('firstname' type)."""
        return self.findbyre(r'token.firstname=([\w\-]+)', html, 'firstname')

    def findbirthdate(self, html: str):
        """Return the date following '*' in 'Lebensdaten', if any."""
        dates = self.getvalue('Lebensdaten', html)
        if dates:
            return self.findbyre(r'\*\s*([\d\.]+)', dates)
        return None

    def findbirthplace(self, html: str):
        """Return the text between the birth date and the next comma."""
        dates = self.getvalue('Lebensdaten', html)
        if dates:
            return self.findbyre(r'\*\s*[\d\.]+\s*(.*?),', dates, 'city')
        return None

    def finddeathdate(self, html: str):
        """Return the date following '†' in 'Lebensdaten', if any."""
        dates = self.getvalue('Lebensdaten', html)
        if dates:
            return self.findbyre(r'†(?:\s|&nbsp;)*([\d\.]+)', dates)
        return None

    def finddeathplace(self, html: str):
        """Return the text following the death date ('city' type)."""
        dates = self.getvalue('Lebensdaten', html)
        if dates:
            return self.findbyre(r'†(?:\s|&nbsp;)*[\d\.]+(.*)', dates, 'city')
        return None

    def findchoriginplaces(self, html: str):
        """Return the places listed under 'Bürgerort' ('city' type)."""
        section = self.getvalue('Bürgerort', html)
        if section:
            return self.findallbyre(r'([\w\s\-]+)', section, 'city')
        return None

    def findnationality(self, html: str):
        """Return the 'Staatszugehörigkeit' value ('country' type)."""
        return self.getvalue('Staatszugehörigkeit', html, 'country')

    def findoccupations(self, html: str):
        """Return the occupations named in the first 'Vitazeile' sentence.

        The sentence is split on ' et ' when that occurs in the text and
        on ' und ' otherwise. Returns None without a 'Vitazeile' value.
        """
        section = self.getvalue('Vitazeile', html)
        if not section:
            return None
        splitter = 'et' if ' et ' in section else 'und'
        result = []
        for subsection in section.split('.')[0].split(' {} '.format(splitter)):
            result += self.findallbyre(r'([\w\s]+)', subsection, 'occupation')
        return result

    def findmixedrefs(self, html: str):
        """Return the default mixed references found in the page."""
        return self.finddefaultmixedrefs(html)
class ImslpAnalyzer(Analyzer):
    """Analyzer for IMSLP wiki pages (property P839)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbproperty = 'P839'
        self.dbid = 'Q523660'
        self.dbname = 'International Music Score Library Project'
        self.urlbase = 'https://imslp.org/wiki/{id}'
        # Ids starting with 'Category:' are treated as persons.
        self.isperson = self.id.startswith('Category:')
        self.hrtre = r'(<h\d.*?)<h2'
        self.language = 'nl'

    def findinstanceof(self, html: str):
        """Return 'Q5' for persons.

        :raises NotImplementedError: if the id is not a 'Category:' id
        """
        if self.isperson:
            return 'Q5'
        raise NotImplementedError  # analysis only made for persons

    def findbirthdate(self, html: str):
        """Return the text before the dash in the heading's parentheses."""
        return self.findbyre(r'</h2>\(([^<>]*?)—', html)

    def finddeathdate(self, html: str):
        """Return the text after the dash in the heading's parentheses."""
        return self.findbyre(r'</h2>\([^<>]*?—([^<>]*?)\)', html)

    def findlanguagenames(self, html: str):
        """Return (language, name) pairs found on the page.

        Names come from the h2 headings (as 'nl'), from the
        'Andere Namen/Transliteraties' line and from the 'Aliassen'
        spans. A name followed by a parenthesised, comma-separated list
        is emitted once per entry of that list.
        """
        result = [('nl', x) for x in self.findallbyre(
            r'<h2>\s*<span[^<>]*>(.*?)</span>', html)]

        section = self.findbyre(r'Andere Namen/Transliteraties:(.*?)<', html)
        if section:
            for part in section.split(','):
                subparts = self.findallbyre(
                    r'((?:[^,\(]|\([^\(\)]*\))*)', part)
                for subpart in subparts:
                    if '(' not in subpart:
                        result.append(('nl', subpart))
                        continue
                    # The original pattern r'\(.*?)\)' had an unbalanced
                    # parenthesis and could not be compiled.
                    langs = self.findbyre(r'\((.*?)\)', subpart)
                    if not langs:
                        continue
                    name = subpart[:subpart.find('(')]
                    result += [(lang.strip(), name)
                               for lang in langs.split(',')]

        section = self.findbyre(r'Aliassen:(.*)', html)
        if section:
            # The original pattern lacked the closing parenthesis of its
            # capture group and could not be compiled.
            for part in self.findallbyre(r'(<span.*?/span>)', section):
                langs = self.findbyre(r'<span title="(.*?)">', part)
                if not langs:
                    # No title attribute: no language to attach.
                    continue
                name = self.findbyre(r'>([^<>]*)</span>', part)
                result += [(language.strip(), name)
                           for language in langs.split(',')]
        return result

    def findmixedrefs(self, html: str):
        """Return the default mixed references found in the page."""
        return self.finddefaultmixedrefs(html)
class HdsAnalyzer(Analyzer):
    """Analyzer for Historical Dictionary of Switzerland (P902)."""

    def setup(self):
        """Zero-pad the id to six digits and set the database details."""
        padded_id = f'{int(self.id):06d}'
        self.id = padded_id
        self.dbname = 'Historical Dictionary of Switzerland'
        self.dbid = 'Q642074'
        self.dbproperty = 'P902'
        self.language = 'de'
        self.urlbase = 'https://hls-dhs-dss.ch/de/articles/{id}/'
        self.hrtre = '(<h1.*?<!-- noindex -->)'
        self.escapeunicode = True

    def finddescription(self, html: str):
        """Return the content of the og:description property."""
        pattern = r'property="og:description" content="(.*?)"'
        return self.findbyre(pattern, html)

    def findlongtext(self, html: str):
        """Return the page from the first h1 up to the noindex marker."""
        pattern = r'(?s)(<h1.*?<!-- noindex -->)'
        return self.findbyre(pattern, html)

    def findnames(self, html) -> List[str]:
        """Return a one-element list holding the page title."""
        title = self.findbyre(r'(?s)<title>(.*?)<', html)
        return [title]

    def findfirstname(self, html: str):
        """Return the givenName span content ('firstname' type)."""
        pattern = r'<span itemprop="givenName">(.*?)</span>'
        return self.findbyre(pattern, html, 'firstname')

    def findlastname(self, html: str):
        """Return the familyName span content ('lastname' type)."""
        pattern = r'<span itemprop="familyName">(.*?)</span>'
        return self.findbyre(pattern, html, 'lastname')

    def findbirthdate(self, html: str):
        """Return the birthDate span content."""
        pattern = r'<span itemprop="birthDate">(.*?)</span>'
        return self.findbyre(pattern, html)

    def finddeathdate(self, html: str):
        """Return the deathDate span content."""
        pattern = r'<span itemprop="deathDate">(.*?)</span>'
        return self.findbyre(pattern, html)

    def findbirthplace(self, html: str):
        """Return the words after the 'geboren' image and one token."""
        pattern = r'<img alt="geboren"[^<>]*>\s*[^\s]*\s*([\w\s-]*)'
        return self.findbyre(pattern, html, 'city')

    def finddeathplace(self, html: str):
        """Return the words after the 'gestorben' image and one token."""
        pattern = r'<img alt="gestorben"[^<>]*>\s*[^\s]*\s*([\w\s-]*)'
        return self.findbyre(pattern, html, 'city')

    def findmixedrefs(self, html: str):
        """Return the default mixed references, with includesocial=False."""
        mixed_refs = self.finddefaultmixedrefs(html, includesocial=False)
        return mixed_refs
class NtaAnalyzer(Analyzer):
    """Analyzer for NTA thesaurus pages (property P1006)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbproperty = 'P1006'
        self.dbid = None
        self.dbname = 'NTA'
        self.urlbase = 'http://data.bibliotheken.nl/doc/thes/p{id}'
        self.hrtre = '(<h1.*?)<div id="bnodes">'
        self.language = 'nl'

    def finddescription(self, html: str):
        """Return the text of the span inside the first h1."""
        return self.findbyre(r'<h1><span>(.*?)<', html)

    def findnames(self, html) -> List[str]:
        """Return the page title plus the alternateName values.

        Alternate names are read from the alternateName section only.
        The previous code extracted that section but then scanned the
        whole page, so every 'fixed' div on the page was returned as a
        name.
        """
        result = [self.findbyre(r'(?s)<title>(.*?)<', html)]
        section = self.findbyre(r'(?s)alternateName</span>(.*?)<label', html)
        if section:
            result += self.findallbyre(
                r'(?s)<div class="fixed">(.*?)[&<]', section)
        return result

    def findinstanceof(self, html: str):
        """Return the first schema.org type name ('instanceof' type)."""
        return self.findbyre(r'http://schema.org/(.*?)[&"\']', html, 'instanceof')

    def finddeathdate(self, html: str):
        """Return the value of the deathDate entry."""
        return self.findbyre(
            r'(?s)<span>deathDate</span>.*?<span.*?>(.*?)[&<]', html)

    def findbirthdate(self, html: str):
        """Return the value of the birthDate entry."""
        return self.findbyre(
            r'(?s)<span>birthDate</span>.*?<span.*?>(.*?)[&<]', html)

    def findfirstname(self, html: str):
        """Return the value of the givenName entry ('firstname' type)."""
        return self.findbyre(
            r'(?s)<span>givenName</span>.*?<span.*?>(.*?)[&<]', html,
            'firstname')

    def findlastname(self, html: str):
        """Return the value of the familyName entry ('lastname' type)."""
        return self.findbyre(
            r'(?s)<span>familyName</span>.*?<span.*?>(.*?)[&<]', html,
            'lastname')

    def findviaf(self, html: str):
        """Return the numeric id of the first viaf.org link."""
        return self.findbyre(r'http://viaf.org/viaf/(\d+)', html)
class PtbnpAnalyzer(Analyzer):
    """Analyzer for Biblioteca Nacional de Portugal records (P1005)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbname = 'Biblioteca Nacional de Portugal'
        self.dbid = None
        self.dbproperty = 'P1005'
        self.language = 'pt'
        self.urlbase = 'http://urn.bn.pt/nca/unimarc-authorities/txt?id={id}'
        self.hrtre = '(.*)'
        self.escapehtml = True

    def findnames(self, html) -> List[str]:
        """Return names from the 200 and 400 fields.

        Tags are replaced by a space via ``self.TAGRE`` and the '$b'
        markers are removed.
        """
        raw_names = self.findallbyre(
            r'>[24]00<.*?\$a(.*?\$b.*?)(?:<br>|\$|$)', html)
        names = []
        for raw in raw_names:
            untagged = self.TAGRE.sub(' ', raw)
            names.append(untagged.replace('$b', ''))
        return names

    def finddescription(self, html: str):
        """Return the first '$a' text of an 830 field."""
        pattern = r'>830<.*?\$a.*?</font>([^<>]*)'
        return self.findbyre(pattern, html)

    def findnationality(self, html: str):
        """Return the '$a' text of the 102 field ('country' type)."""
        pattern = r'>102<.*?\$a(?:<[^<>]*>)*([^<>]+)'
        return self.findbyre(pattern, html, 'country')

    def findlongtext(self, html: str):
        """Return all 830 field texts joined by newlines."""
        notes = self.findallbyre(r'>830<.*?\$a.*?</font>([^<>]*)', html)
        return '\n'.join(notes)

    def findbirthdate(self, html: str):
        """Return the part before '-' of the 200 '$f' subfield.

        Values containing 'ca ' or 'fl.' are rejected (None).
        """
        born = self.findbyre(r'>200<.*?\$f.*?</font>([^<>]*)-', html)
        if not born or 'ca ' in born or 'fl.' in born:
            return None
        return born

    def finddeathdate(self, html: str):
        """Return the part after '-' of the 200 '$f' subfield.

        Values containing 'ca ' or 'fl.' are rejected (None).
        """
        died = self.findbyre(
            r'>200<.*?\$f.*?</font>[^<>]*-([^<>,]*)', html)
        if not died or 'ca ' in died or 'fl.' in died:
            return None
        return died

    def findfirstname(self, html: str):
        """Return the 200 '$b' subfield ('firstname' type)."""
        pattern = r'>200<.*?\$b</b></font>([^<>]*?),?\s*<'
        return self.findbyre(pattern, html, 'firstname')

    def findlastname(self, html: str):
        """Return the 200 '$a' subfield ('lastname' type)."""
        pattern = r'>200<.*?\$a</b></font>(.*?),?<'
        return self.findbyre(pattern, html, 'lastname')
class BibsysAnalyzer(Analyzer):
    """Analyzer for BIBSYS authority records (property P1015)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbname = 'BIBSYS'
        self.dbid = None
        self.dbproperty = 'P1015'
        self.language = 'en'
        self.urlbase = 'https://authority.bibsys.no/authority/rest/authorities/html/{id}'
        self.hrtre = '(<body>.*)'

    def findnames(self, html) -> List[str]:
        """Return the values of all rows whose label contains 'name'."""
        pattern = r'<td>[^<>]*name[^<>]*</td><td>([^<>]*)</td>'
        return self.findallbyre(pattern, html)

    def findinstanceof(self, html: str):
        """Return the 'Authority type' value ('instanceof' type)."""
        pattern = r'<td>Authority type</td><td>(.*?)</td>'
        return self.findbyre(pattern, html, 'instanceof')

    def findisni(self, html: str):
        """Return the value of the 'isni' row."""
        pattern = r'<td>isni</td><td>(.*?)</td>'
        return self.findbyre(pattern, html)

    def findviaf(self, html: str):
        """Return the id of a viaf.org link, else the 'viaf' row value."""
        from_link = self.findbyre(r'http://viaf.org/viaf/(\w+)', html)
        if from_link:
            return from_link
        return self.findbyre(r'<td>viaf</td><td>(.*?)</td>', html)

    def findfirstname(self, html: str):
        """Return the word after the comma in 'Personal name'."""
        pattern = r'<td>Personal name</td><td>[^<>]*,\s*(\w+)'
        return self.findbyre(pattern, html, 'firstname')

    def findlastname(self, html: str):
        """Return the text before the last comma in 'Personal name'."""
        pattern = r'<td>Personal name</td><td>([^<>]*),'
        return self.findbyre(pattern, html, 'lastname')

    def findbirthdate(self, html: str):
        """Return the part before the last '-' of the associated dates."""
        pattern = r'<td>Dates associated with a name</td><td>([^<>]*)-'
        return self.findbyre(pattern, html)

    def finddeathdate(self, html: str):
        """Return the part after the last '-' of the associated dates."""
        pattern = r'<td>Dates associated with a name</td><td>[^<>]*-([^<>]*)'
        return self.findbyre(pattern, html)
class KunstindeksAnalyzer(Analyzer):
    """Analyzer for Kunstindeks Danmark artist pages (property P1138)."""

    def setup(self):
        """Set database identifiers, URL patterns and page language."""
        self.dbname = 'Kunstindeks Danmark'
        self.dbid = 'Q3362041'
        self.dbproperty = 'P1138'
        self.language = 'da'
        self.urlbase = 'https://www.kulturarv.dk/kid/VisKunstner.do?kunstnerId={id}'
        self.urlbase3 = 'https://www.kulturarv.dk/kid/SoegKunstnerVaerker.do?kunstnerId={id}&hitsPerPage=1000'
        self.hrtre = 'Information from Kunstindeks Danmark</h2>(.*?)</table>'

    def findnames(self, html) -> List[str]:
        """Return name candidates from the heading and the 'Name:' row."""
        from_heading = self.findbyre(r':([^<>]*)</h1>', html)
        from_row = self.findbyre(r'Name:\s*</b>(.*?)<', html)
        return [from_heading, from_row]

    def findlongtext(self, html: str):
        """Return the page from the h1 up to the 'right' table cell."""
        pattern = r'(?s)(<h1>.*?)<td class="right\d'
        return self.findbyre(pattern, html)

    def findlastname(self, html: str):
        """Return the 'Name:' text before its last comma ('lastname')."""
        pattern = r'(?s)<b>Name: </b>([^<>]*),'
        return self.findbyre(pattern, html, 'lastname')

    def findfirstname(self, html: str):
        """Return the first word after the comma in 'Name:'."""
        pattern = r'(?s)<b>Name: </b>[^<>]*,\s*([\w\-]+)'
        return self.findbyre(pattern, html, 'firstname')

    def findbirthplace(self, html: str):
        """Return the 'Born:' text before its last comma ('city' type)."""
        pattern = r'(?s)<b>Born: </b>([^<>]*),'
        return self.findbyre(pattern, html, 'city')

    def findbirthdate(self, html: str):
        """Return the digits-and-dashes run ending the 'Born:' value."""
        pattern = r'(?s)<b>Born: </b>[^<>]*?([\d\-]+)\s*<'
        return self.findbyre(pattern, html)

    def finddeathplace(self, html: str):
        """Return the 'Died:' text before its last comma ('city' type)."""
        pattern = r'(?s)<b>Died: </b>([^<>]*),'
        return self.findbyre(pattern, html, 'city')

    def finddeathdate(self, html: str):
        """Return the digits-and-dashes run ending the 'Died:' value."""
        pattern = r'(?s)<b>Died: </b>[^<>]*?([\d\-]+)\s*<'
        return self.findbyre(pattern, html)

    def findoccupations(self, html: str):
        """Return the word groups of the 'Occupation:' value.

        Returns None when the page has no such value.
        """
        occupation_text = self.findbyre(r'(?s)Occupation: </b>(.*?)<', html)
        if not occupation_text:
            return None
        return self.findallbyre(r'([\s\w]+)', occupation_text, 'occupation')

    def findgender(self, html: str):
        """Return the 'Sex:' value ('gender' type)."""
        pattern = r'(?s)Sex: </b>(.*?)<'
        return self.findbyre(pattern, html, 'gender')

    def findnationality(self, html: str):
        """Return the 'Nationality:' value ('country' type)."""
        pattern = r'(?s)Nationality: </b>(.*?)<'
        return self.findbyre(pattern, html, 'country')

    def findincollections(self, html: str):
        """Return the texts of all links carrying a museumId ('museum')."""
        pattern = r'museumId=[^<>]*>(.*?)<'
        return self.findallbyre(pattern, html, 'museum')
class IaafAnalyzer(Analyzer):
    """Analyzer for IAAF athlete pages (property P1146)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbproperty = 'P1146'
        self.dbid = None
        self.dbname = 'IAAF'
        self.urlbase = 'https://www.iaaf.org/athletes/athlete={id}'
        self.hrtre = '(<div class="row offset.*? <div class="clearfix">)'
        self.language = 'en'

    def findnames(self, html) -> List[str]:
        """Return a one-element list holding the first h1 text."""
        return [self.findbyre(r'(?s)<h1>(.*?)<', html)]

    def findlongtext(self, html: str):
        """Return the content of the athlete popup body."""
        return self.findbyre(
            r'(?s)<div class="modal-body athletepopup">(.*?)</script>', html)

    def findinstanceof(self, html: str):
        """Return 'Q5' unconditionally.

        Uses the ``findinstanceof`` name that the other analyzers in
        this file define; the method was previously only available
        under the name ``instanceof``.
        """
        return 'Q5'

    def instanceof(self, html: str):
        """Backward-compatible alias of :meth:`findinstanceof`."""
        return self.findinstanceof(html)

    def findoccupations(self, html: str):
        """Return the fixed occupation list ['Q11513337']."""
        return ['Q11513337']

    def findsports(self, html: str):
        """Return the fixed sport list ['Q542']."""
        return ['Q542']

    def findnationality(self, html: str):
        """Return the span text following 'COUNTRY' ('country' type)."""
        return self.findbyre(r'(?s)COUNTRY.*?>([^<>]*)</span>', html, 'country')

    def findbirthdate(self, html: str):
        """Return the text following the 'DATE OF BIRTH' line break."""
        return self.findbyre(r'(?s)DATE OF BIRTH\s*<br\s*/>(.*?)<', html)
class ScopusAnalyzer(Analyzer):
    """Analyzer for Scopus author pages (property P1153)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbname = 'Scopus'
        self.dbid = 'Q371467'
        self.dbproperty = 'P1153'
        self.language = 'en'
        self.urlbase = 'https://www.scopus.com/authid/detail.uri?authorId={id}'
        self.hrtre = '(<h2.*?)<h4'

    def findinstanceof(self, html: str):
        """Return 'Q5' unconditionally."""
        return 'Q5'

    def findnames(self, html) -> List[str]:
        """Return the preferred names plus the other name formats."""
        names = self.findallbyre(
            r'name="authorPreferredName" value="(.*?)"', html)
        other_formats = self.findbyre(
            r'(?s)(<div id="otherNameFormatBadges".*?</div>)', html)
        if other_formats:
            names += self.findallbyre(r'>(.*?)<', other_formats)
        return names

    def findworkfields(self, html: str):
        """Return the texts inside the subject area badges ('subject').

        Returns None when the badge container is not found.
        """
        badges = self.findbyre(
            r'(?s)(<div id="subjectAreaBadges".*?</div>)', html)
        if not badges:
            return None
        return self.findallbyre(r'>(.*?)<', badges, 'subject')

    def findmixedrefs(self, html: str):
        """Return the default mixed references found in the page."""
        mixed_refs = self.finddefaultmixedrefs(html)
        return mixed_refs

    def findemployers(self, html: str):
        """Return the span texts of the affiliation block.

        The texts are extracted with the 'employer' type and
        'university' as alternative. Returns None without such a block.
        """
        affiliation = self.findbyre(
            r'(?s)<div class="authAffilcityCounty">(.*?)</div>', html)
        if not affiliation:
            return None
        return self.findallbyre(r'>([^<>]*)</span>', affiliation,
                                'employer', alt=['university'])

    def findworkplaces(self, html: str):
        """Return the comma-led text runs of the affiliation block.

        Newlines in the block are replaced by spaces before matching;
        matches are extracted with the 'city' type. Returns None without
        such a block.
        """
        affiliation = self.findbyre(
            r'(?s)<div class="authAffilcityCounty">(.*?)</div>', html)
        if not affiliation:
            return None
        flattened = affiliation.replace('\n', ' ')
        return self.findallbyre(r'(?s)>,([^<>],[^<>]*)<', flattened, 'city')
class RodovidAnalyzer(Analyzer):
    """Analyzer for Rodovid person pages (property P1185)."""

    def setup(self):
        """Set database identifiers, URL pattern and page language."""
        self.dbproperty = 'P1185'
        self.dbid = 'Q649227'
        self.dbname = 'Rodovid'
        self.urlbase = 'https://en.rodovid.org/wk/Person:{id}'
        self.hrtre = '<table class="persondata">(.*?<h2>.*?)<h2'
        self.language = 'en'
        self.escapehtml = True

    def _findevents(self, html: str):
        """Return the content of the 'Events' section, or None if absent."""
        return self.findbyre(r'(?s)Events</span></h2>(.*?)<h2', html)

    def findnames(self, html) -> List[str]:
        """Return name candidates from title, heading and 'Full name'."""
        return [
            self.findbyre(r'<title>(.*?)(?: [bd]\. |<)', html),
            self.findbyre(r'<h1[^<>]*>(.*?)(?: [bd]\. |<)', html),
            self.findbyre(
                r'(?s)<b>Full name[^<>]*</b>\s*</td><td>(.*?)<', html),
        ]

    def findlongtext(self, html: str):
        """Return the first paragraphs of the 'Notes' section."""
        return self.findbyre(
            r'(?s)<span class="mw-headline">Notes</span></h2>\s*<p>(.*?)<h\d',
            html)

    def findbirthdate(self, html: str):
        """Return the bold text preceding 'birth:' in the events.

        Returns None when the page has no 'Events' section.
        """
        section = self._findevents(html)
        if not section:
            return None
        return self.findbyre(r'<b>([^<>]*)</b>birth:', section)

    def findbirthplace(self, html: str):
        """Return the element text following 'birth:' ('city' type).

        Returns None when the page has no 'Events' section.
        """
        section = self._findevents(html)
        if not section:
            return None
        return self.findbyre(r'>birth: <[^<>]*>(.*?)<', section, 'city')

    def finddeathdate(self, html: str):
        """Return the bold text preceding 'death:' in the events.

        Returns None when the page has no 'Events' section.
        """
        section = self._findevents(html)
        if not section:
            return None
        return self.findbyre(r'<b>([^<>]*)</b>death:', section)

    def finddeathplace(self, html: str):
        """Return the element text following 'death:' ('city' type).

        Returns None when the page has no 'Events' section.
        """
        section = self._findevents(html)
        if not section:
            return None
        return self.findbyre(r'death: <[^<>]*>(.*?)<', section, 'city')

    def findchildren(self, html: str):
        """Return the persons linked from 'child birth:' events.

        Returns an empty list when the page has no 'Events' section.
        """
        section = self._findevents(html)
        if not section:
            return []
        return self.findallbyre(
            r"child birth:.*?Person:\d+'>(.*?)<", section, 'person')

    def findspouses(self, html: str):
        """Return the persons linked from 'marriage' events.

        Returns an empty list when the page has no 'Events' section.
        """
        section = self._findevents(html)
        if not section:
            return []
        return self.findallbyre(
            r"marriage</a>.*?Person:\d+'>(.*?)<", section, 'person')

    def findfamily(self, html: str):
        """Return the first link text of the 'Lineage' row ('family')."""
        section = self.findbyre(r'(?s)<b>Lineage\s*</b>(.*?)</tr>', html)
        if section:
            return self.findbyre(r'>([^<>]*)</a>', section, 'family')
        return None

    def findgender(self, html: str):
        """Return the value of the 'Sex' row ('gender' type)."""
        return self.findbyre(r'(?s)Sex\s*</b>\s*</td><td>(.*?)<', html, 'gender')

    def findfather(self, html: str):
        """Return the person linked after '♂' in the 'Parents' row."""
        section = self.findbyre(r'(?s)<b>Parents</b>(.*?)</tr>', html)
        if section:
            return self.findbyre(r"♂.*?Person:\d+'>(.*?)<", section, 'person')
        return None

    def findmother(self, html: str):
        """Return the person linked after '♀' in the 'Parents' row."""
        section = self.findbyre(r'(?s)<b>Parents</b>(.*?)</tr>', html)
        if section:
            return self.findbyre(r"♀.*?Person:\d+'>(.*?)<", section, 'person')
        return None

    def findreligions(self, html: str):
        """Return the element texts following 'religion:' ('religion')."""
        return self.findallbyre(
            r'(?s)religion:\s*<.*?>([^<>]+)<.*?></p>', html, 'religion')

    def findtitles(self, html: str):
        """Return the link texts following 'title:' in the events.

        Returns an empty list when the page has no 'Events' section.
        """
        section = self._findevents(html)
        if not section:
            return []
        return self.findallbyre(r'title:.*?<a[^<>]*>(.*?)<', section, 'title')
[docs]class IbdbAnalyzer(Analyzer):
def setup(self):
    """Set database identifiers, URL pattern and page language."""
    self.dbname = 'IBDB'
    self.dbid = 'Q31964'
    self.dbproperty = 'P1220'
    self.language = 'en'
    self.urlbase = 'https://www.ibdb.com/person.php?id={id}'
    self.hrtre = '(<h1>.*?)<div class="dottedLine">'
def finddescription(self, html: str):
    """Return the content of the 'description' meta tag."""
    pattern = r'<meta name="description" content="(.*?)"'
    return self.findbyre(pattern, html)
def findnames(self, html) -> List[str]:
    """Return the 'Also Known As' names followed by the title name."""
    aliases_block = self.findbyre(
        r'(?s)<b>Also Known As</b>\s*</div>\s*<div[^<>]*>(.*?)</div>', html)
    names = (self.findallbyre(r'([^\[\]<>]*?)[\[<]', aliases_block)
             if aliases_block else [])
    title_name = self.findbyre(r'<title>([^<>]*?) – ', html)
    return names + [title_name]
def findlongtext(self, html: str):
    """Return all 'personDescription' texts joined by single spaces.

    Returns None when no such text is found.
    """
    fragments = self.findallbyre(r'"personDescription"[^<>]*>(.*?)<', html)
    if not fragments:
        return None
    return ' '.join(fragments)
def findoccupations(self, html: str):
    """Return the element texts of the compact tag block.

    The texts are extracted with the 'theater-occupation' type and
    'occupation' as alternative. Returns None without such a block.
    """
    tag_block = self.findbyre(
        r'(?s)<div class="s12 wrapper tag-block-compact extramarg">(.*?)</div>',
        html)
    if not tag_block:
        return None
    return self.findallbyre(r'>([^<>]*)<', tag_block,
                            'theater-occupation', alt=['occupation'])
def findbirthdate(self, html: str):
    """Return the main title of the 'Born' entry."""
    pattern = (r'(?s)<div class="xt-lable">Born</div>'
               r'\s*<div class="xt-main-title">(.*?)</div>')
    return self.findbyre(pattern, html)
def findbirthplace(self, html: str):
    """Return the 'moreinfo' text of the 'Born' entry ('city' type)."""
    pattern = (
        r'(?s)<div class="xt-lable">Born</div>\s*<div class="xt-main-title">'
        r'[^<>]*</div>\s*<div class="xt-main-moreinfo">(.*?)</div>'
    )
    return self.findbyre(pattern, html, 'city')
def finddeathdate(self, html: str):
    """Return the main title of the 'Died' entry."""
    pattern = (r'(?s)<div class="xt-lable">Died</div>'
               r'\s*<div class="xt-main-title">(.*?)</div>')
    return self.findbyre(pattern, html)
def finddeathplace(self, html: str):
    """Return the 'moreinfo' text of the 'Died' entry ('city' type)."""
    pattern = (
        r'(?s)<div class="xt-lable">Died</div>\s*<div class="xt-main-title">[^<>]*</div>'
        r'\s*<div class="xt-main-moreinfo">(.*?)</div>'
    )
    return self.findbyre(pattern, html, 'city')
def findgender(self, html: str):
    """Return the main title of the 'Gender' entry ('gender' type)."""
    pattern = (r'(?s)<div class="xt-lable">Gender</div>'
               r'\s*<div class="xt-main-title">(.*?)</div>')
    return self.findbyre(pattern, html, 'gender')
def findawards(self, html: str):
    """Return the awards whose table entry is not marked '[nominee]'.

    The awards table is split into header-row/detail-row pairs and the
    award name is read from the header cell of each pair. Returns None
    when the page has no awards table.
    """
    section = self.findbyre(r'(?s)<div id="awards".*?>(.*?)</table>', html)
    if not section:
        return None
    parts = self.findallbyre(r'(?s)(<tr><th.*?</tr>\s*<tr>.*?</tr>)', section)
    result = []
    for part in parts:
        if '[nominee]' not in part:
            # Read the name from this entry; searching the whole section
            # would return the first award for every entry.
            result.append(self.findbyre(r'<th[^<>]*>(.*?)<', part, 'award'))
    return result
def findnominations(self, html: str):
    """Return the awards whose table entry is marked '[nominee]'.

    The awards table is split into header-row/detail-row pairs and the
    award name is read from the header cell of each pair. Returns None
    when the page has no awards table.
    """
    section = self.findbyre(r'(?s)<div id="awards".*?>(.*?)</table>', html)
    if not section:
        return None
    parts = self.findallbyre(r'(?s)(<tr><th.*?</tr>\s*<tr>.*?</tr>)', section)
    result = []
    for part in parts:
        if '[nominee]' in part:
            # Read the name from this entry; searching the whole section
            # would return the first award for every entry.
            result.append(self.findbyre(r'<th[^<>]*>(.*?)<', part, 'award'))
    return result
def findspouses(self, html: str):
    """Return the names following 'Wife of' / 'Husband of' ('person')."""
    pattern = r'(?s)(?:Wife|Husband) of(?:<[^<>]*>|\s)*(.*?)<'
    return self.findallbyre(pattern, html, 'person')
[docs] def findpartners(self, html: str): return self