Source code for pywikibot.data.api._paraminfo

"""Object representing API parameter information."""
#
# (C) Pywikibot team, 2014-2022
#
# Distributed under the terms of the MIT license.
#
from collections.abc import Container, Sized
from typing import Any, Optional, Union

import pywikibot
from pywikibot import config
from pywikibot.backports import Dict, removeprefix
from pywikibot.tools.itertools import itergroup


__all__ = ['ParamInfo']


class ParamInfo(Sized, Container):

    """
    API parameter information data object.

    Provides cache aware fetching of parameter information.

    It does not support the format modules.
    """

    # Top-level keys of a paraminfo response that may carry module data.
    paraminfo_keys = frozenset(['modules', 'querymodules', 'formatmodules',
                                'mainmodule', 'pagesetmodule'])

    # Modules requested through a dedicated '<name>module' parameter when
    # the 'modules' only syntax is not used.
    root_modules = frozenset(['main', 'pageset'])

    # Modules which are always loaded by _init().
    init_modules = frozenset(['main', 'paraminfo'])

    def __init__(
        self,
        site,
        preloaded_modules=None,
        modules_only_mode=None
    ) -> None:
        """
        Initializer.

        :param site: site object used to issue the paraminfo requests
        :param preloaded_modules: API modules to preload
        :type preloaded_modules: set of string
        :param modules_only_mode: use the 'modules' only syntax for API request
        :type modules_only_mode: bool or None to only use default, which True
            if the site is 1.25wmf4+
        """
        self.site = site

        # Keys are module names, values are the raw responses from the server.
        self._paraminfo = {}

        # Cached data.
        self._prefixes = {}
        self._prefix_map = {}
        self._with_limits = None

        self._action_modules = frozenset()  # top level modules
        self._modules = {}  # filled in _init() (and enlarged in fetch)
        self._limit = None

        self.preloaded_modules = self.init_modules
        if preloaded_modules:
            # frozenset |= set rebinds to a new frozenset; the class
            # attribute init_modules is not modified.
            self.preloaded_modules |= set(preloaded_modules)

        self.modules_only_mode = modules_only_mode
        if self.modules_only_mode:
            self.paraminfo_keys = frozenset(['modules'])

    def _add_submodules(self, name, modules) -> None:
        """Add the modules to the internal cache or check if equal.

        :param name: name of the parent module; must not contain '+'
        :param modules: iterable of submodule names of that parent
        """
        # The current implementation here doesn't support submodules inside of
        # submodules, because that would require to fetch all modules when only
        # the names of them were requested
        assert '+' not in name
        modules = frozenset(modules)
        if name == 'main':
            # The main module behaves differently as it has no prefix
            if self._action_modules:
                assert modules == self._action_modules
            else:
                self._action_modules = modules
        elif name in self._modules:
            assert modules == self._modules[name]
        else:
            self._modules[name] = modules

    def _init(self) -> None:
        """Load the initial modules and the list of query modules.

        Does nothing if the query submodules are already known.

        :raises RuntimeError: the 'querymodules' parameter of the paraminfo
            module is missing or provides no 'limit'
        """
        assert ('query' in self._modules) is ('main' in self._paraminfo)
        if 'query' in self._modules:
            return
        mw_ver = self.site.mw_version

        # The paraminfo api deprecated the old request syntax of
        # querymodules='info'; to avoid warnings sites with 1.25wmf4+
        # must only use 'modules' parameter.
        if self.modules_only_mode is None:
            self.modules_only_mode = mw_ver >= '1.25wmf4'
            if self.modules_only_mode:
                self.paraminfo_keys = frozenset(['modules'])

        # Assume that by v1.26, it will be desirable to prefetch 'query'
        if mw_ver > '1.26':
            self.preloaded_modules |= {'query'}

        self._fetch(self.preloaded_modules)

        main_modules_param = self.parameter('main', 'action')
        assert main_modules_param
        assert 'type' in main_modules_param
        assert isinstance(main_modules_param['type'], list)
        assert self._action_modules == set(main_modules_param['type'])

        # While deprecated with warning in 1.25, paraminfo param 'querymodules'
        # provides a list of all query modules. This will likely be removed
        # from the API in the future, in which case the fallback is the use
        # the same data available in the paraminfo for query.
        query_modules_param = self.parameter('paraminfo', 'querymodules')

        # parameter() returns None for an unknown parameter, so test for that
        # before the membership test to fail with the intended RuntimeError
        # instead of an opaque TypeError.
        if not query_modules_param or 'limit' not in query_modules_param:
            raise RuntimeError('"limit" not found in query modules')
        self._limit = query_modules_param['limit']

        if 'type' in query_modules_param:
            # 'type' is the list of modules
            self._add_submodules('query', query_modules_param['type'])

        if 'query' not in self._modules:
            assert 'query' not in self._paraminfo
            self._fetch({'query'})
        assert 'query' in self._modules

    def _emulate_pageset(self) -> None:
        """Emulate the pageset module, which existed until MW 1.24."""
        # pageset isn't a module in the new system, so it is emulated, with
        # the paraminfo from the query module.
        assert 'query' in self._paraminfo

        self._paraminfo['pageset'] = {
            'name': 'pageset',
            'path': 'pageset',
            'classname': 'ApiPageSet',
            'prefix': '',
            'readrights': '',
            'helpurls': [],
            'parameters': self._paraminfo['query']['parameters']
        }

    @staticmethod
    def _modules_to_set(modules) -> set:
        """Return modules as a set.

        A string is split at '|'; any other iterable is converted directly.

        :type modules: iterable or str
        """
        if isinstance(modules, str):
            return set(modules.split('|'))
        return set(modules)

    def fetch(self, modules) -> None:
        """
        Fetch paraminfo for multiple modules.

        No exception is raised when paraminfo for a module does not exist.
        Use __getitem__ to cause an exception if a module does not exist.

        :param modules: API modules to load
        :type modules: iterable or str
        """
        if 'main' not in self._paraminfo:
            # The first request should be 'paraminfo', so that
            # query modules can be prefixed with 'query+'
            self._init()

        modules = self._modules_to_set(modules)

        if self._action_modules:
            # The query module may be added before the action modules have been
            if 'query' in self._modules:
                # It does fetch() while initializing, and this method can't be
                # called before it's initialized.
                modules = self._normalize_modules(modules)
            else:
                # We do know the valid action modules and require a subset
                assert not modules - self._action_modules - self.root_modules

        self._fetch(modules)

    def _fetch(self, modules: Union[set, frozenset]) -> None:
        """
        Fetch paraminfo for multiple modules without initializing beforehand.

        Modules which are already cached are skipped. The set passed in is
        not modified.

        :param modules: API modules to load and which haven't been loaded yet.
        """
        def drain_failed():
            """Yield every queued failed module as its own batch."""
            for failed_module in failed_modules:
                yield [failed_module]
            del failed_modules[:]

        def module_generator():
            """A generator yielding batches of modules."""
            for batch in itergroup(sorted(modules), self._limit):
                yield from drain_failed()
                yield batch
            # Modules which failed in the last (or only) batch would never
            # be re-requested without this final drain.
            yield from drain_failed()

        # Rebind instead of '-=' so a caller's set is never mutated.
        modules = modules - set(self._paraminfo)
        if not modules:
            return

        assert 'query' in self._modules or 'paraminfo' not in self._paraminfo

        # If something went wrong in a batch it can add each module to the
        # batch and the generator will on the next iteration yield each module
        # separately
        failed_modules = []

        # This can be further optimised, by grouping them in more stable
        # subsets, which are unlikely to change. i.e. first request core
        # modules which have been a stable part of the API for a long time.
        # Also detecting extension based modules may help.
        # Also, when self.modules_only_mode is disabled, both modules and
        # querymodules may each be filled with self._limit items, doubling the
        # number of modules that may be processed in a single batch.
        for module_batch in module_generator():
            if self.modules_only_mode and 'pageset' in module_batch:
                pywikibot.debug('paraminfo fetch: removed pageset')
                module_batch.remove('pageset')
                # If this occurred during initialisation,
                # also record it in the preloaded_modules.
                # (at least so tests know an extra load was intentional)
                if 'query' not in self._paraminfo:
                    pywikibot.debug('paraminfo batch: added query')
                    module_batch.append('query')
                    self.preloaded_modules |= {'query'}

            params = {
                'action': 'paraminfo',
            }

            if self.modules_only_mode:
                params['modules'] = module_batch
            else:
                params['modules'] = [mod for mod in module_batch
                                     if not mod.startswith('query+')
                                     and mod not in self.root_modules]
                params['querymodules'] = [removeprefix(mod, 'query+')
                                          for mod in module_batch
                                          if mod.startswith('query+')]

                for mod in set(module_batch) & self.root_modules:
                    params[mod + 'module'] = 1

            # Request need ParamInfo to determine use_get
            request = self.site._request(expiry=config.API_config_expiry,
                                         use_get=True,
                                         parameters=params)
            result = request.submit()

            normalized_result = self.normalize_paraminfo(result)
            # normalize_paraminfo() marks ambiguous paths with False
            for path in list(normalized_result):
                if normalized_result[path] is False:
                    del normalized_result[path]

            # Sometimes the name/path of the module is not actually the name
            # which was requested, so we need to manually determine which
            # (wrongly named) module uses which actual name. See also T105478
            missing_modules = [m for m in module_batch
                               if m not in normalized_result]
            if len(missing_modules) == 1 and len(normalized_result) == 1:
                # Okay it's possible to recover
                normalized_result = next(iter(normalized_result.values()))
                pywikibot.warning('The module "{0[name]}" ("{0[path]}") '
                                  'was returned as path even though "{1}" '
                                  'was requested'.format(normalized_result,
                                                         missing_modules[0]))
                normalized_result['path'] = missing_modules[0]
                # The name is the last component of the path, e.g. 'info'
                # for 'query+info' and 'main' for 'main'.
                normalized_result['name'] = \
                    missing_modules[0].rpartition('+')[2]
                normalized_result = {missing_modules[0]: normalized_result}
            elif len(module_batch) > 1 and missing_modules:
                # Rerequest the missing ones separately
                pywikibot.log('Inconsistency in batch "{}"; rerequest '
                              'separately'.format(missing_modules))
                failed_modules.extend(missing_modules)

            # Remove all modules which weren't requested, we can't be sure that
            # they are valid
            for path in list(normalized_result):
                if path not in module_batch:
                    del normalized_result[path]

            self._paraminfo.update(normalized_result)
            for mod in normalized_result.values():
                self._generate_submodules(mod['path'])

        if 'pageset' in modules and 'pageset' not in self._paraminfo:
            self._emulate_pageset()

    def _generate_submodules(self, module) -> None:
        """Check and generate submodules for the given module.

        :param module: path of a module already present in the cache
        """
        parameters = self._paraminfo[module].get('parameters', [])
        submodules = set()
        # Advanced submodule info added to MW API in df80f1ea
        if self.site.mw_version >= '1.26wmf9':
            # This is supplying submodules even if they aren't submodules
            # of the given module so skip those
            for param in parameters:
                if module == 'main' and param['name'] == 'format' \
                   or 'submodules' not in param:
                    continue

                for submodule in param['submodules'].values():
                    if '+' in submodule:
                        parent, child = submodule.rsplit('+', 1)
                    else:
                        parent, child = 'main', submodule
                    if parent == module:
                        submodules.add(child)
        else:
            # Boolean submodule info added to MW API in afa153ae
            if self.site.mw_version < '1.24wmf18':
                if module == 'main':
                    params = {'action'}
                elif module == 'query':
                    params = {'prop', 'list', 'meta'}
                else:
                    params = set()
                for param in parameters:
                    if param['name'] in params:
                        param['submodules'] = ''

            for param in parameters:
                # Do not add format modules
                if 'submodules' in param \
                   and (module != 'main' or param['name'] != 'format'):
                    submodules |= set(param['type'])

        if submodules:
            self._add_submodules(module, submodules)
        if module == 'query':
            # Previously also modules from generator were used as query
            # modules, but verify that those are just a subset of the
            # prop/list/meta modules. There is no sanity check as this
            # needs to be revisited if query has no generator parameter
            for param in parameters:
                if param['name'] == 'generator':
                    break
            else:
                param = {}
            assert param['name'] == 'generator' \
                and submodules >= set(param['type'])

    def _normalize_modules(self, modules) -> set:
        """Add query+ to any query module name not also in action modules."""
        # Users will supply the wrong type, and expect it to work.
        modules = self._modules_to_set(modules)

        assert self._action_modules

        return {'query+' + mod
                if '+' not in mod and mod in self.query_modules
                and mod not in self._action_modules
                else mod
                for mod in modules}

    def normalize_modules(self, modules) -> set:
        """
        Convert the modules into module paths.

        Add query+ to any query module name not also in action modules.

        :param modules: module names or paths
        :type modules: iterable or str
        :return: The modules converted into a module paths
        """
        self._init()
        return self._normalize_modules(modules)

    @classmethod
    def normalize_paraminfo(cls, data):
        """
        Convert both old and new API JSON into a new-ish data structure.

        For duplicate paths, the value will be False.

        :param data: API response containing a 'paraminfo' mapping
        :return: mapping of module path to the module data; the module data
            get 'name' and 'path' entries added in place if they are missing
        """
        result_data = {}
        for paraminfo_key, modules_data in data['paraminfo'].items():
            if not modules_data:
                continue

            if paraminfo_key[:-len('module')] in cls.root_modules:
                # 'mainmodule' / 'pagesetmodule' hold a single module
                modules_data = [modules_data]
            elif not paraminfo_key.endswith('modules'):
                continue

            for mod_data in modules_data:
                if 'missing' in mod_data:
                    continue

                name = mod_data.get('name')
                php_class = mod_data.get('classname')

                if not name and php_class:
                    name = removeprefix(php_class, 'Api').lower()
                    if name not in ('main', 'pageset'):
                        pywikibot.warning('Unknown paraminfo module "{}"'
                                          .format(php_class))
                        name = '<unknown>:' + php_class

                    mod_data['name'] = name

                if 'path' not in mod_data:
                    # query modules often contain 'ApiQuery' and have a suffix.
                    # 'ApiQuery' alone is the action 'query'
                    if ('querytype' in mod_data
                        or php_class and len(php_class) > 8
                            and 'ApiQuery' in php_class):
                        mod_data['path'] = 'query+' + name
                    else:
                        mod_data['path'] = name

                path = mod_data['path']

                if path in result_data:
                    # Only warn first time
                    if result_data[path] is not False:
                        pywikibot.warning('Path "{}" is ambiguous.'
                                          .format(path))
                    else:
                        pywikibot.log(f'Found another path "{path}"')
                    result_data[path] = False
                else:
                    result_data[path] = mod_data

        return result_data

    def __getitem__(self, key):
        """
        Return a paraminfo module for the module path, caching it.

        Use the module path, such as 'query+x', to obtain the paraminfo for
        submodule 'x' in the query module.

        If the key does not include a '+' and is not present in the top level
        of the API, it will fallback to looking for the key 'query+x'.

        :raises KeyError: no paraminfo is available for the key
        """
        self.fetch({key})
        if key in self._paraminfo:
            return self._paraminfo[key]
        if '+' not in key:
            return self._paraminfo['query+' + key]
        raise KeyError(key)

    def __contains__(self, key) -> bool:
        """Return whether the key is valid."""
        try:
            self[key]
            return True
        except KeyError:
            return False

    def __len__(self) -> int:
        """Return number of cached modules."""
        return len(self._paraminfo)

    def parameter(
        self,
        module: str,
        param_name: str
    ) -> Optional[Dict[str, Any]]:
        """
        Get details about one modules parameter.

        Returns None if the parameter does not exist.

        :param module: API module name
        :param param_name: parameter name in the module
        :return: metadata that describes how the parameter may be used
        :raises ValueError: no paraminfo is available for the module
        :raises RuntimeError: the parameter is listed more than once
        """
        # TODO: the 'description' field of each parameter is not in the default
        # output of v1.25, and can't removed from previous API versions.
        # There should be an option to remove this verbose data from the cached
        # version, for earlier versions of the API, and/or extract any useful
        # data and discard the entire received paraminfo structure. There are
        # also params which are common to many modules, such as those provided
        # by the ApiPageSet php class: titles, pageids, redirects, etc.
        try:
            # Keep the name in 'module'; the messages below must show the
            # module name and not the whole paraminfo dict.
            module_info = self[module]
        except KeyError as err:
            raise ValueError(
                f"paraminfo for '{module}' not loaded") from err

        try:
            params = module_info['parameters']
        except KeyError:
            pywikibot.warning(f"module '{module}' has no parameters")
            return None

        param_data = [param for param in params
                      if param['name'] == param_name]

        if not param_data:
            return None

        if len(param_data) != 1:
            raise RuntimeError(
                'parameter data length is eiter empty or not unique.\n{}'
                .format(param_data))
        return param_data[0]

    @property
    def module_paths(self):
        """Set of all modules using their paths."""
        return self._module_set(True)

    # As soon as modules() is removed, module_paths and _module_set can be
    # combined, so don't add any code between these two methods.
    def _module_set(self, path):
        """Return all action modules and all known submodules.

        :param path: whether submodules are given by path instead of name
        """
        # Load the submodules of all action modules available
        self.fetch(self.action_modules)
        modules = set(self.action_modules)
        for parent_module in self._modules:
            submodules = self.submodules(parent_module, path)
            assert not submodules & modules or not path
            modules |= submodules
        return modules

    @property
    def action_modules(self):
        """Set of all action modules."""
        self._init()
        return self._action_modules

    @property
    def query_modules(self):
        """Set of all query module names without query+ path prefix."""
        return self.submodules('query')

    def submodules(self, name: str, path: bool = False) -> set:
        """
        Set of all submodules.

        :param name: The name of the parent module.
        :param path: Whether the path and not the name is returned.
        :return: The names or paths of the submodules.
        :raises KeyError: no submodules are known for the parent module
        """
        if name not in self._modules:
            self.fetch([name])
        submodules = self._modules[name]
        if path:
            submodules = self._prefix_submodules(submodules, name)
        return submodules

    @staticmethod
    def _prefix_submodules(modules, prefix):
        """Prefix submodules with path."""
        return {f'{prefix}+{mod}' for mod in modules}

    @property
    def prefix_map(self):
        """
        Mapping of module to its prefix for all modules with a prefix.

        This loads paraminfo for all modules.
        """
        if not self._prefix_map:
            self._prefix_map = {module: prefix
                                for module, prefix
                                in self.attributes('prefix').items()
                                if prefix}
        # Hand out a copy so callers cannot alter the cached mapping.
        return self._prefix_map.copy()

    def attributes(self, attribute: str, modules: Optional[set] = None):
        """
        Mapping of modules with an attribute to the attribute value.

        It will include all modules which have that attribute set, also if that
        attribute is empty or set to False.

        :param attribute: attribute name
        :param modules: modules to include. If None (default), it'll load all
            modules including all submodules using the paths.
        :rtype: dict using modules as keys
        """
        if modules is None:
            modules = self.module_paths

        self.fetch(modules)

        result = {}
        for mod in modules:
            # Look the module up once; every lookup goes through fetch().
            info = self[mod]
            if attribute in info:
                result[mod] = info[attribute]
        return result