Source code for scripts.flickrripper

# -*- coding: utf-8 -*-
A tool to transfer flickr photos to Wikimedia Commons.

The following parameters are supported:

 -group_id:         specify group ID of the pool
 -photoset_id:      specify a photoset id
 -user_id:          give the user id of the flickr user
 -start_id:         the photo id to start with
 -end_id:           the photo id to end with
 -tags:             a tag to filter photo items (only one is supported)
 -flickrreview      add a flickr review template to the description
 -reviewer:         specify the reviewer
 -override:         override text for licence
 -addcategory:      specify a category
 -removecategories  remove all categories
 -autonomous        run bot in autonomous mode
# (C) Pywikibot team, 2009-2020
# Distributed under the terms of the MIT license.
from __future__ import absolute_import, division, unicode_literals

import base64
import hashlib
import io
import re

import pywikibot
from pywikibot import config, textlib
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from import PY2

    from pywikibot.userinterfaces.gui import Tkdialog
except ImportError as _tk_error:
    Tkdialog = _tk_error

if not PY2:
    from urllib.parse import urlencode
    from urllib import urlencode

    import flickrapi  # see:
except ImportError as e:
    flickrapi = e

# Maps each Flickr license id to whether photos under that license are
# accepted by this script; the trailing comments name the license.
flickr_allowed_license = {
    0: False,  # All Rights Reserved
    1: False,  # Creative Commons Attribution-NonCommercial-ShareAlike License
    2: False,  # Creative Commons Attribution-NonCommercial License
    3: False,  # Creative Commons Attribution-NonCommercial-NoDerivs License
    4: True,   # Creative Commons Attribution License
    5: True,   # Creative Commons Attribution-ShareAlike License
    6: False,  # Creative Commons Attribution-NoDerivs License
    7: True,   # No known copyright restrictions
    8: True,   # United States Government Work
    9: True,   # Public Domain Dedication (CC0)
    10: True,  # Public Domain Mark
}

def getPhoto(flickr, photo_id):
    """
    Get the photo info and the photo sizes so we can use these later on.

    The two API calls are repeated every 30 seconds for as long as the
    Flickr API raises a FlickrError.

    @param flickr: Flickr API client; must provide photos_getInfo and
        photos_getSizes
    @param photo_id: id of the photo to look up
    @return: tuple (photoInfo, photoSizes) holding both API responses
    """
    while True:
        try:
            photoInfo = flickr.photos_getInfo(photo_id=photo_id)
            photoSizes = flickr.photos_getSizes(photo_id=photo_id)
        except flickrapi.exceptions.FlickrError:
            pywikibot.output('Flickr api problem, sleeping')
            pywikibot.sleep(30)
        else:
            return photoInfo, photoSizes
def isAllowedLicense(photoInfo):
    """
    Check if the image contains the right license.

    The license id is read from the 'license' attribute of the 'photo'
    element and looked up in flickr_allowed_license. An id that is not
    listed there is treated as not allowed instead of raising KeyError.

    @param photoInfo: photo info element as returned by getPhoto
    @return: True if the license is accepted
    @rtype: bool
    """
    photo_license = photoInfo.find('photo').attrib['license']
    return flickr_allowed_license.get(int(photo_license), False)
def getPhotoUrl(photoSizes):
    """Return the 'source' url of the last listed size.

    The sizes are assumed to be listed with the largest image last. An
    empty string is returned when no size is listed at all.
    """
    sources = [entry.attrib['source']
               for entry in photoSizes.find('sizes').findall('size')]
    if not sources:
        return ''
    return sources[-1]
def downloadPhoto(photoUrl):
    """
    Fetch the photo and wrap its raw bytes in an io.BytesIO object.

    No exception handling is done here; errors from fetch propagate.

    @param photoUrl: url of the image to download
    @rtype: io.BytesIO
    """
    raw_bytes = fetch(photoUrl).raw
    stream = io.BytesIO(raw_bytes)
    return stream
def findDuplicateImages(photo, site=None):
    """Look up files on the wiki that have the same content as photo.

    The SHA1 digest of the photo bytes is base16 encoded and handed to
    site.getFilesFromAnHash; whatever that returns is passed back.

    @param photo: Photo
    @type photo: io.BytesIO
    @param site: Site to search for duplicates.
        Defaults to using Wikimedia Commons if not supplied.
    @type site: site object or None
    """
    site = site or pywikibot.Site('commons', 'commons')
    digest = hashlib.sha1(photo.getvalue()).digest()
    encoded_digest = base64.b16encode(digest)
    return site.getFilesFromAnHash(encoded_digest)
def getTags(photoInfo, raw=False):
    """Collect the lower-cased tags of a photo.

    @param photoInfo: photo info element holding photo/tags/tag entries
    @param raw: take the name from the 'raw' attribute of each tag
        instead of the tag's text
    @type raw: bool
    @return: list of lower-cased tag names
    """
    names = []
    for element in photoInfo.find('photo').find('tags').findall('tag'):
        name = element.attrib['raw'] if raw else element.text
        names.append(name.lower())
    return names
def getFlinfoDescription(photo_id):
    """
    Get the description of a photo from the Flinfo tool.

    The photo id is sent url-encoded together with raw=on and the text of
    the response is returned unchanged.

    TODO: Add exception handling, try a couple of times

    @param photo_id: id of the Flickr photo
    @return: text of the Flinfo response
    """
    parameters = urlencode({'id': photo_id, 'raw': 'on'})
    # NOTE(review): the endpoint was lost from this copy of the file (an
    # empty format string raises TypeError). It is restored here from the
    # upstream script - confirm it is still the live Flinfo address.
    return fetch(
        'http://wikipedia.ramselehof.de/flinfo.php?%s' % parameters).text
def getFilename(photoInfo, site=None, project='Flickr', photo_url=None):
    """Build a good filename for the upload based on the username and title.

    The title is taken from the photo title, else from the (possibly
    truncated) description, else from the photo id. A counter is appended
    until no file page with that name exists, to prevent collisions.

    @param photoInfo: photo info element as returned by getPhoto
    @param site: site to check for existing files; defaults to Wikimedia
        Commons
    @param project: name put between title and username
    @param photo_url: url whose extension is used when the photo info
        carries no 'originalformat'
    @return: the filename to upload to
    """
    if not site:
        site = pywikibot.Site('commons', 'commons')
    photo = photoInfo.find('photo')
    username = cleanUpTitle(photo.find('owner').attrib['username'])

    title = photo.find('title').text
    if title:
        title = cleanUpTitle(title)

    if not title:
        description = photo.find('description').text
        if description:
            # find the max length for a mw title
            maxBytes = 240 - len(project.encode('utf-8')) \
                - len(username.encode('utf-8'))
            descBytes = len(description.encode('utf-8'))
            if descBytes > maxBytes:
                # maybe we cut more than needed, anyway we do it
                items = max(min(len(description), maxBytes // 4),
                            len(description) - descBytes + maxBytes)
                description = description[:items]
            title = cleanUpTitle(description)

    if not title:
        # Use the id of the photo as last resort; this also covers a
        # description that is empty after clean up.
        title = photo.attrib['id']

    # 'originalformat' may be absent; .get() lets the url fallback below
    # run instead of failing with KeyError.
    fileformat = photo.attrib.get('originalformat')
    if not fileformat and photo_url:
        _, fileformat = photo_url.rsplit('.', 1)

    filename = '{} - {} - {}.{}'.format(title, project, username, fileformat)
    i = 1
    while pywikibot.FilePage(site, filename).exists():
        filename = '{} - {} - {} ({}).{}'.format(title, project, username,
                                                 i, fileformat)
        i += 1
    return filename
def cleanUpTitle(title):
    """Turn free text into a title MediaWiki is likely to accept.

    The text is stripped, run through a fixed sequence of regex
    substitutions, and finally spaces become underscores with leading and
    trailing underscores removed.
    """
    # Applied strictly in this order; later rules rely on earlier ones.
    substitutions = (
        (r'[<{\[]', '('),
        (r'[>}\]]', ')'),
        (r'[ _]?\(!\)', ''),
        (',:[ _]', ', '),
        ('[;:][ _]', ', '),
        (r'[\t\n ]+', ' '),
        (r'[\r\n ]+', ' '),
        ('[\n]+', ''),
        ('[?!]([.\"]|$)', r'\1'),
        ('[&#%?!]', '^'),
        ('[;]', ','),
        (r'[/+\\:]', '-'),
        ('--+', '-'),
        ('[,|]+', ','),
        ('[-,^]([.]|$)', r'\1'),
    )
    cleaned = title.strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.replace(' ', '_').strip('_')
def buildDescription(flinfoDescription='', flickrreview=False, reviewer='',
                     override='', addCategory='', removeCategories=False):
    """Build the final description for the image.

    The description is based on the info from flickrinfo and improved.

    @param flinfoDescription: description text to start from
    @param flickrreview: fill in the {{flickrreview}} template (only done
        when a reviewer is given and no override is used)
    @param reviewer: name put into the {{flickrreview}} template
    @param override: text inserted below the license heading; license and
        review templates found in the description are removed
    @param addCategory: category to append; replaces '{{subst:unc}}'
    @param removeCategories: strip all category links first
    @return: the description with '\\r\\n' normalised to '\\n'
    """
    description = flinfoDescription

    # use template {{Taken on}}; a description without a '|Date=' line is
    # left alone instead of failing on the missing match
    date_match = re.search(r'\|Date=(.*)\n', description)
    datetaken = date_match.group(1) if date_match else ''
    if datetaken:
        date_line = '|Date={{Taken on|%s}}\n' % datetaken
        # A function replacement keeps backslashes in the date text from
        # being interpreted as regex escapes.
        description = re.sub(r'\|Date=.*\n', lambda _match: date_line,
                             description)

    if removeCategories:
        description = textlib.removeCategoryLinks(
            description, pywikibot.Site('commons', 'commons'))

    if override:
        description = description.replace('{{cc-by-sa-2.0}}\n', '')
        description = description.replace('{{cc-by-2.0}}\n', '')
        description = description.replace('{{flickrreview}}\n', '')
        description = description.replace(
            '{{copyvio|Flickr, licensed as "All Rights Reserved" which is not '
            'a free license --~~~~}}\n',
            '')
        description = description.replace('=={{int:license}}==',
                                          '=={{int:license}}==\n' + override)
    elif flickrreview and reviewer:
        description = description.replace(
            '{{flickrreview}}',
            '{{flickrreview|%s|'
            '{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-'
            '{{subst:CURRENTDAY2}}}}' % reviewer)

    if '{{subst:unc}}' not in description:
        # Request category check
        description += '\n{{subst:chc}}\n'
    if addCategory:
        description = description.replace('{{subst:unc}}\n', '')
        description += '\n[[Category:{}]]\n'.format(addCategory)
    description = description.replace('\r\n', '\n')
    return description
def processPhoto(flickr, photo_id='', flickrreview=False, reviewer='',
                 override='', addCategory='', removeCategories=False,
                 autonomous=False):
    """Process a single Flickr photo.

    For each image:
      * Check the license
      * Check if it isn't already on Commons
      * Build suggested filename
        * Check for name collision and maybe alter it
      * Pull description from Flinfo
      * Show image and description to user
        * Add a nice hotcat lookalike for the adding of categories
        * Filter the categories
      * Upload the image

    @param flickr: Flickr API client passed on to getPhoto
    @param photo_id: id of the photo; nothing is done when it is empty
    @param flickrreview: passed on to buildDescription
    @param reviewer: passed on to buildDescription
    @param override: license override text; also lets photos with a
        license that is not allowed through
    @param addCategory: passed on to buildDescription
    @param removeCategories: passed on to buildDescription
    @param autonomous: upload without showing the Tk dialog
    @return: 1 if an upload was started, otherwise 0
    @rtype: int
    """
    if not photo_id:
        # Without an id there is no photo info to work with.
        return 0

    pywikibot.output(str(photo_id))
    photoInfo, photoSizes = getPhoto(flickr, photo_id)
    if not (isAllowedLicense(photoInfo) or override):
        pywikibot.output('Invalid license')
        return 0

    # Get the url of the largest photo
    photoUrl = getPhotoUrl(photoSizes)
    # Should download the photo only once
    photo = downloadPhoto(photoUrl)

    # Don't upload duplicate images, should add override option
    duplicates = findDuplicateImages(photo)
    if duplicates:
        pywikibot.output('Found duplicate image at {}'
                         .format(duplicates.pop()))
        return 0

    filename = getFilename(photoInfo, photo_url=photoUrl)
    flinfoDescription = getFlinfoDescription(photo_id)
    if 'Blacklisted user' in flinfoDescription:
        pywikibot.warning('Blacklisted user found:\n' + flinfoDescription)
        return 0

    photoDescription = buildDescription(flinfoDescription,
                                        flickrreview, reviewer,
                                        override, addCategory,
                                        removeCategories)

    # Autonomous defaults; only a successful dialog replaces them.
    newPhotoDescription = photoDescription
    newFilename = filename
    skip = False
    if not autonomous:
        if isinstance(Tkdialog, ImportError):
            pywikibot.warning('Switching to autonomous mode because GUI '
                              'interface cannot be used')
            pywikibot.warning(Tkdialog)
        else:
            try:
                (newPhotoDescription, newFilename, skip) = Tkdialog(
                    photoDescription, photo, filename).show_dialog()
            except ImportError as e:
                pywikibot.warning(e)
                pywikibot.warning('Switching to autonomous mode.')

    if skip:
        return 0

    # Do the actual upload
    # Would be nice to check before I upload if the file is already at
    # Commons. Not that important for this program, but maybe for
    # derived programs
    bot = UploadRobot(photoUrl,
                      description=newPhotoDescription,
                      use_filename=newFilename,
                      keep_filename=True,
                      verify_description=False)
    bot.upload_file()
    return 1
def getPhotos(flickr, user_id='', group_id='', photoset_id='', start_id='',
              end_id='', tags=''):
    """Loop over a set of Flickr photos and yield their ids.

    The set is a group pool (group_id, optionally narrowed by user_id and
    tags), a photoset (photoset_id) or the public photos of a user
    (user_id), checked in that order. Nothing is yielded when none of the
    three is given.

    Ids are yielded from start_id on (from the first photo when start_id
    is empty) and iteration stops before end_id is yielded. A page request
    that raises a FlickrError is repeated after 30 seconds.

    @param flickr: Flickr API client
    @return: generator of photo ids
    """
    if group_id:
        def get_page(page):
            return flickr.groups_pools_getPhotos(
                group_id=group_id, user_id=user_id, tags=tags,
                per_page='100', page=page).find('photos')
    elif photoset_id:
        def get_page(page):
            return flickr.photosets_getPhotos(
                photoset_id=photoset_id, per_page='100',
                page=page).find('photoset')
    elif user_id:
        def get_page(page):
            return flickr.people_getPublicPhotos(
                user_id=user_id, per_page='100', page=page).find('photos')
    else:
        return

    found_start_id = not start_id
    # First get the total number of pages
    pages = int(get_page('1').attrib['pages'])

    for page in range(1, pages + 1):
        # Only the request is retried. An empty page ends the retry loop
        # too, so a page without photos cannot spin forever.
        while True:
            try:
                page_photos = list(get_page(page))
            except flickrapi.exceptions.FlickrError:
                pywikibot.output('Flickr api problem, sleeping')
                pywikibot.sleep(30)
            else:
                break

        for photo in page_photos:
            current_id = photo.attrib['id']
            if current_id == start_id:
                found_start_id = True
            if not found_start_id:
                continue
            if current_id == end_id:
                pywikibot.output('Found end_id')
                return
            yield current_id
def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Options that take a value prompt for it when given without one. Photos
    are only processed when a user_id, group_id or photoset_id was given,
    the flickrapi package could be imported and an api key is configured.

    @param args: command line arguments
    @type args: str
    """
    local_args = pywikibot.handle_args(args)

    # Options that take a value, with the prompt used when none is given.
    prompts = {
        '-group_id': 'What is the group_id of the pool?',
        '-photoset_id': 'What is the photoset_id?',
        '-user_id': 'What is the user_id of the flickr user?',
        '-start_id': 'What is the id of the photo you want to start at?',
        '-end_id': 'What is the id of the photo you want to end at?',
        '-tags': 'What is the tag you want to filter out (currently only '
                 'one supported)?',
        '-reviewer': 'Who is the reviewer?',
        # Should be renamed to overrideLicense or something like that
        '-override': 'What is the override text?',
        '-addcategory': 'What category do you want to add?',
    }
    values = dict.fromkeys(prompts, '')
    removeCategories = False
    autonomous = False
    totalPhotos = 0
    uploadedPhotos = 0

    # Do we mark the images as reviewed right away?
    flickrreview = config.flickr['review'] or False

    # Set the Flickr reviewer
    if config.flickr['reviewer']:
        values['-reviewer'] = config.flickr['reviewer']
    elif 'commons' in config.usernames['commons']:
        values['-reviewer'] = config.usernames['commons']['commons']

    for arg in local_args:
        if arg == '-flickrreview':
            flickrreview = True
        elif arg == '-removecategories':
            removeCategories = True
        elif arg == '-autonomous':
            autonomous = True
        else:
            # No option name is a prefix of another, so order is irrelevant.
            for option, prompt in prompts.items():
                if arg.startswith(option):
                    if len(arg) == len(option):
                        values[option] = pywikibot.input(prompt)
                    else:
                        # skip the separator following the option name
                        values[option] = arg[len(option) + 1:]
                    break

    user_id = values['-user_id']
    group_id = values['-group_id']
    photoset_id = values['-photoset_id']

    # NOTE(review): the two suggest_help calls and the help url were lost
    # from this copy of the file and are restored from the upstream script
    # - confirm against the pywikibot version in use.
    if isinstance(flickrapi, ImportError):
        pywikibot.bot.suggest_help(missing_dependencies=('flickrapi',))
    elif not config.flickr['api_key']:
        additional_text = (
            'Flickr api key not found! Get yourself an api key\n'
            'Any flickr user can get a key at\n'
            'https://www.flickr.com/services/api/keys/apply/')
        pywikibot.bot.suggest_help(additional_text=additional_text)
    elif user_id or group_id or photoset_id:
        if 'api_secret' in config.flickr and config.flickr['api_secret']:
            flickr = flickrapi.FlickrAPI(config.flickr['api_key'],
                                         config.flickr['api_secret'])
        else:
            pywikibot.output('Accessing public content only')
            flickr = flickrapi.FlickrAPI(config.flickr['api_key'])

        for photo_id in getPhotos(flickr, user_id, group_id, photoset_id,
                                  values['-start_id'], values['-end_id'],
                                  values['-tags']):
            uploadedPhotos += processPhoto(flickr, photo_id, flickrreview,
                                           values['-reviewer'],
                                           values['-override'],
                                           values['-addcategory'],
                                           removeCategories, autonomous)
            totalPhotos += 1

    # NOTE(review): assumed to be printed on every path - confirm.
    pywikibot.output('Finished running')
    pywikibot.output('Total photos: ' + str(totalPhotos))
    pywikibot.output('Uploaded photos: ' + str(uploadedPhotos))
# Run the bot only when this file is executed as a script.
if __name__ == '__main__':
    main()