Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Update the monuments database from a text file, some wiki page(s) or sparql.

Usage:
# loop through all countries
python update_database.py

# work on specific country-lang
python update_database.py -countrycode:XX -langcode:YY
"""
check_integer, check_lat_with_lon, check_wikidata, checkLat, checkLon ) CH1903Converter, extract_elements_from_template_param, extractWikilink, int_to_european_digits, remove_commons_category_prefix ) close_database_connection, connect_to_monuments_database )
"""Run a named check.""" return check_integer(field_value, monument_key, source_page) else: countryconfig.get('table'), check))
"""Convert a field.""" if field.get('conv') == 'extractWikilink': return extractWikilink(contents.get(field.get('source'))) elif field.get('conv') == 'remove_commons_category_prefix': return remove_commons_category_prefix( contents.get(field.get('source'))) elif field.get('conv') == 'generateRegistrantUrl' and \ countryconfig.get('registrantUrlBase'): return countryconfig.get('registrantUrlBase') % ( contents.get(field.get('source')),) elif field.get('conv') == 'to_default_numeral': return int_to_european_digits( contents.get(field.get('source'))) elif field.get('conv') == 'CH1903ToLat': (lat, lon) = CH1903Converter( contents.get('CH1903_X'), contents.get('CH1903_Y')) return lat elif field.get('conv') == 'CH1903ToLon': (lat, lon) = CH1903Converter( contents.get('CH1903_X'), contents.get('CH1903_Y')) return lon elif field.get('conv') == 'generateRegistrantUrl-sv-ship' and \ countryconfig.get('registrantUrlBase'): idurl = contents.get(field.get('source')).replace(' ', '') if not idurl.startswith('wiki'): return countryconfig.get('registrantUrlBase') % idurl else: return '' elif field.get('conv') == 'es-ct-fop': pano = contents.get(field.get('source')) if pano == 'dp': return 'pd' elif pano == 'sí': return 'FoP' elif pano == 'no': return 'noFoP' else: return '' elif field.get('conv') == 'generateRegistrantUrl-wlpa-es-ct' and \ countryconfig.get('registrantUrlBase'): idurl_p = contents.get(field.get('source')).split('/') if len(idurl_p) == 2 and idurl_p[0] == 'bcn': return countryconfig.get('registrantUrlBase') % (idurl_p[1],) else: return contents.get(field.get('source')) elif field.get('conv') == 'il-fop': fop = contents.get(field.get('source')) if fop == 'PD': return 'pd' elif fop == 'YES': return 'FoP' elif fop == 'NO': return 'noFoP' else: return '' elif field.get('conv') == 'fi-fop': dyear = contents.get(field.get('source')) cyear = datetime.datetime.now().year try: dyear = int(dyear) if (dyear + 70) < cyear: return 'pd' else: return 'noFoP' except ValueError: 
return 'noFoP' else: raise pywikibot.Error( 'Un-defined converter in config for {1}: {2}'.format( countryconfig.get('table'), field.get('conv')))
""" Outputs a list of any unknown fields as a wikitext table.
The table contains the name and frequency of the field and a sample of source pages where this field was encountered.
@param countryconfig: the configurations for the dataset being processed. @param unknown_fields: dict of discovered fields with each value being a Counter for how frequently the field is encountered per page. @return: dict summarising the usages """ site, 'Commons:Monuments database/Unknown fields/{0}'.format( countryconfig.get('table')))
# People can add a /header template with more info
else: 'Field': key, 'Count': sum(counter.values()), 'Sources': format_source_field(counter, site) })
page, summary.format(len(unknown_fields)), text)
'report_page': page, 'config': countryconfig, 'total_fields': len(unknown_fields), 'total_pages': len(pages_with_fields), 'total_usages': total_usages }
""" Format a list of source pages to fit in the statistics field.
@param sources: set of pywikibot.Page objects @param site: the site to which the output should be written (commons) @param sample_size: the number of source pages to output """ as_link=True, with_ns=False, insite=site) else: source_page.title( as_link=True, with_ns=False, insite=site), source_count )
source_page): """Update a single monument in the source database."""
# Source is the first field
# Do some conversions here else:
# check data countryconfig, source_page): field_value = '' # throw away input if check fails
countryconfig.get('table'), '`, `'.join(fieldnames), ('%s, ' * len(fieldnames)).rstrip(', '))
# print query % tuple(fieldvalues)
# FIXME : Disable for now because print throws UnicodeEncodeErrors # if len(w) == 1: # print w[-1].message, ' when running ', query % tuple(fieldvalues)
# print contents # print 'updating!' # time.sleep(5)
""" Get the defaults for the row templates.
Return all fields that seem to be valid. Ignore other fields. """
# Check first that field is not empty # Is it in the fields list?
"""Generator of monument database data from sparql query results.""" for result_item in query_result: yield process_monument_wikidata(result_item, params)
"""Process a single instance of a wikidata sparql result.""" # convert pywikibot.data.sparql.Literal to string literals = ('id', 'commonscat', 'address') for key in literals: if result[key]: result[key] = result[key].value
if result['image']: result['image'] = urllib.parse.unquote( result['image'].value).split('/')[-1]
if result['itemLabel']: result['name'] = result['itemLabel'].value
if result['adminLabel']: result['admin'] = result['adminLabel'].value
if result['monument_article']: result['monument_article'], _site = common.get_page_from_url( result['monument_article'].value)
result['source'] = result['item'].value result['wd_item'] = result['item'].getID()
if result['coordinate']: # ignore any unknown_value/some_value if 'Point(' in result['coordinate'].value: result['lon'], result['lat'] = result['coordinate'].value[ len('Point('):-1].split(' ')
# remove params that may not be NULL non_null_params = set(param_order) - set(('lat', 'lon')) for key in non_null_params: if key in result and not result[key]: del result[key]
return tuple([result.get(key, '') for key in param_order])
# NOTE(review): garbled extraction — fragment of the row-template
# processor for a single monument: fills missing fields from header
# defaults, escapes values for mysql, records unknown fields, and calls
# update_monument only when the (possibly tuple-valued) primkey is fully
# present. The `def` line (ending `header_defaults, unknown_fields):`),
# the field loop headers and the NoPrimkeyException raise are missing —
# confirm against the original source.
header_defaults, unknown_fields): """Process a single instance of a monument row template."""
# Get all the fields # Add the source of information (permalink) contents[field.get('source')] = header_defaults.get( field.get('source')) else:
# Check first that field is not empty # Is it in the fields list? # Load it with Big fucking escape hack. Stupid mysql lib # Do this somewhere else.replace("'", "\\'") else: # FIXME: Include more information where it went wrong 'Found unknown field on page {0} : ({1}: {2})'.format( title, field, value), _logger) # time.sleep(5)
# If we truncate we don't have to check for primkey (it's a made up one) update_monument( contents, source, countryconfig, conn, cursor, source_page) # Check if the primkey is a tuple and if all parts are present all_keys = True for partkey in countryconfig.get('primkey'): if not contents.get(lookup_source_field(partkey, countryconfig)): all_keys = False if all_keys: update_monument( contents, source, countryconfig, conn, cursor, source_page) # Check if the primkey is filled. This only works for a single primkey, # not a tuple countryconfig)): contents, source, countryconfig, conn, cursor, source_page) else:
"""Lookup the source field of a destination."""
# NOTE(review): garbled extraction — fragment of process_page: iterates
# the monument row templates on a page, counts NoPrimkeyException
# occurrences, maintains a commonscat REPLACE query for a companion
# template, and warns about missing primkeys. The `def` line (ending
# `unknown_fields=None):`), the template loop and the return are missing
# — confirm against the original source.
unknown_fields=None): """ Process text containing one or more instances of the monument row template.
Also makes a record of any unexpected fields. """
# print template # print params params, source, countryconfig, conn, cursor, page, header_defaults, unknown_fields) except NoPrimkeyException: primkey_exceptions += 1 # time.sleep(5) query = ( """REPLACE INTO commonscat (site, title, commonscat) """ """VALUES (%s, %s, %s)""") cursor.execute( query, (countryconfig.get('lang'), page.title(True), params[0]))
# output missing primkey warning pywikibot.warning('{0:d} primkey(s) missing on {1} ({2})'.format( primkey_exceptions, page.title(True), countryconfig.get('table')))
"""Process all the monuments of one country.""" else: countryconfig, conn, cursor, full_update, days_back)
"""Process all the monuments of one country using row templates.""" site = pywikibot.Site(countryconfig.get('lang'), countryconfig.get('project')) row_template = pywikibot.Page( site, '{0}:{1}'.format(site.namespace(10), countryconfig.get('rowTemplate')))
trans_gen = pagegenerators.ReferringPageGenerator( row_template, onlyTemplateInclusion=True) filtered_gen = pagegenerators.NamespaceFilterPageGenerator( trans_gen, countryconfig.get('namespaces'), site=site)
if countryconfig.get('truncate') or full_update: # Some countries are always truncated, otherwise only do it when # requested. query = """TRUNCATE table `{0}`""".format(countryconfig.get('table')) cursor.execute(query) generator = pagegenerators.PreloadingGenerator(filtered_gen) # FIXME : Truncate the table else: # Preloading first because the whole page needs to be fetched to get # the time pregenerator = pagegenerators.PreloadingGenerator(filtered_gen) begintime = datetime.datetime.utcnow() + \ datetime.timedelta(days=0 - days_back) generator = pagegenerators.EdittimeFilterPageGenerator( pregenerator, begintime=begintime)
unknown_fields = {}
for page in generator: if page.exists() and not page.isRedirectPage(): # Do some checking unknown_fields = process_page( page, page.permalink(percent_encoded=False), countryconfig, conn, cursor, unknown_fields=unknown_fields)
return unknown_fields_statistics(countryconfig, unknown_fields)
"""Fetch the SPARQL template for a wikidata config.""" filename = 'wikidata_query.sparql' with open(os.path.join(get_template_dir(), filename), 'r') as f: sparql = f.read() return sparql
"""Fetch the SQL template for a wikidata config.""" return os.path.join( os.path.dirname(os.path.abspath(__file__)), 'template')
"""Process all the monuments of one country using sparql.""" sparql_select = countryconfig.get('sparql') sparql_template = load_wikidata_template_sparql()
sparql_query = sparql_template % dict( select_statement=sparql_select, lang=countryconfig.get('lang'), project=countryconfig.get('project') )
sq = pywikibot.data.sparql.SparqlQuery() try: query_result = sq.select(sparql_query, full_data=True) except (Timeout, ConnectionError): # timeout on https may end up being interpreted as a ConnectionError pywikibot.output('Sparql endpoint being slow, giving it a moment...') time.sleep(10) query_result = sq.select(sparql_query, full_data=True)
pywikibot.output('Sparql query successful with {0} results'.format( len(query_result)))
# todo: check and log duplicate ids manually params = ['monument_article', 'name', 'source', 'admin', 'image', 'lon', 'wd_item', 'lat', 'address', 'commonscat', 'id']
query = "REPLACE INTO `{0}` (`{1}`) VALUES ({2})".format( countryconfig.get('table'), '`, `'.join(params), ('%s, ' * len(params)).rstrip(', '))
batch_size = 100 i = 0 for result_chunk in [query_result[i:i + batch_size] for i in range(0, len(query_result), batch_size)]: with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') cursor.executemany( query, monument_wikidata_generator(result_chunk, params)) conn.commit()
pywikibot.output('Finished processing {0} results'.format( min(len(query_result), i + batch_size)))
"""Output the overall results for unknown fields as a nice wikitable.""" site, 'Commons:Monuments database/Unknown fields/Statistics')
('code', 'country'), ('lang', None), ('total_fields', 'Total fields'), ('total_usages', 'Total usage of fields'), ('total_pages', 'Total pages containing fields'), ('report_page', 'Report page'), ('row_template', 'Row template'), ('header_template', 'Header template') ]) title_column, list([col for col in title_column if col.startswith('total')])) # sparql harvests don't generate statistics
countryconfig.get('lang'), countryconfig.get('project', 'wikipedia'), countryconfig.get('rowTemplate'), site) countryconfig.get('lang'), countryconfig.get('project', 'wikipedia'), countryconfig.get('headerTemplate'), site) as_link=True, with_ns=False, insite=site)
'code': countryconfig.get('country'), 'lang': countryconfig.get('lang'), 'total_fields': row.get('total_fields'), 'total_usages': row.get('total_usages'), 'total_pages': row.get('total_pages'), 'report_page': report_page, 'row_template': row_template, 'header_template': header_template })
'Updating unknown fields statistics. Total of {total_fields} ' 'unknown fields used {total_usages} times on {total_pages} different ' 'pages.'.format(**table.get_sum()))
"""The main loop.""" # First find out what to work on
countrycode = '' lang = '' full_update = True skip_wd = False days_back = 2 # Default 2 days. Runs every night so can miss one night. conn = None cursor = None (conn, cursor) = connect_to_monuments_database()
for arg in pywikibot.handleArgs(): option, sep, value = arg.partition(':') if option == '-countrycode': countrycode = value elif option == '-langcode': lang = value elif option == '-daysback': days_back = int(value) elif option == '-fullupdate': # does nothing since already default full_update = True elif option == '-skip_wd': skip_wd = True else: raise Exception( 'Bad parameters. Expected "-countrycode", "-langcode", ' '"-daysback", "-fullupdate", "-skip_wd" or pywikibot args. ' 'Found "{}"'.format(option))
if countrycode and lang: if not mconfig.countries.get((countrycode, lang)): pywikibot.warning( 'I have no config for countrycode "{0}" ' 'in language "{1}"'.format( countrycode, lang)) return False
pywikibot.log( 'Working on countrycode "{0}" in language "{1}"'.format( countrycode, lang)) try: countryconfig = mconfig.countries.get((countrycode, lang)) process_country(countryconfig, conn, cursor, full_update, days_back) except Exception as e: pywikibot.error( 'Unknown error occurred when processing country ' '{0} in lang {1}\n{2}'.format(countrycode, lang, str(e))) elif countrycode or lang: raise Exception('The "countrycode" and "langcode" arguments must ' 'be used together.') else: statistics = [] for (countrycode, lang), countryconfig in mconfig.filtered_countries( skip_wd=skip_wd): pywikibot.log( 'Working on countrycode "{0}" in language "{1}"'.format( countrycode, lang)) try: statistics.append( process_country(countryconfig, conn, cursor, full_update, days_back)) except Exception as e: pywikibot.error( 'Unknown error occurred when processing country ' '{0} in lang {1}\n{2}'.format(countrycode, lang, str(e))) continue make_statistics(statistics)
close_database_connection(conn, cursor)
pywikibot.log('Start of %s' % __file__) main() |