Puppet Class: profile::analytics::refinery::job::import_mediawiki_dumps

Defined in:
modules/profile/manifests/analytics/refinery/job/import_mediawiki_dumps.pp

Overview

SPDX-License-Identifier: Apache-2.0

Class profile::analytics::refinery::job::import_mediawiki_dumps

Schedules daily imports of siteinfo-namespaces jsondumps, pages-meta-history xmldumps and pages-meta-current xmldumps into Hadoop (HDFS).

NOTE: This class assumes that the xmldatadumps folder, under which the public dumps can be found, is mounted under /mnt/data.

Parameters:

  • ensure_timers (Wmflib::Ensure) (defaults to: lookup('profile::analytics::refinery::job::import_mediawiki_dumps::ensure_timers', { 'default_value' => 'present' }))


8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'modules/profile/manifests/analytics/refinery/job/import_mediawiki_dumps.pp', line 8

class profile::analytics::refinery::job::import_mediawiki_dumps (
    Wmflib::Ensure $ensure_timers = lookup('profile::analytics::refinery::job::import_mediawiki_dumps::ensure_timers', { 'default_value' => 'present' }),
) {

    # One import_mediawiki_dumps_config resource is declared per dump type.
    # The `ensure` attribute shared by all of them is set once through the
    # per-expression `default` body, so $ensure_timers applies to every
    # declaration below.
    profile::analytics::refinery::job::import_mediawiki_dumps_config {
        default:
            ensure => $ensure_timers;

        # siteinfo-namespaces jsondumps, timer fires at 02:00.
        'refinery-import-siteinfo-dumps':
            dump_type         => 'siteinfo-namespaces',
            log_file_name     => 'import_siteinfo_dumps.log',
            timer_description => 'Schedules daily an incremental import of the current month of siteinfo-namespaces jsondumps into HDFS',
            timer_interval    => '*-*-* 02:00:00';

        # pages-meta-history xmldumps, timer fires at 03:00.
        # Note: wikidatawiki is skipped to speed-up data availability (T357859)
        'refinery-import-page-history-dumps':
            dump_type         => 'pages-meta-history',
            log_file_name     => 'import_pages_history_dumps.log',
            skip_list         => 'wikidatawiki',
            timer_description => 'Schedules daily an incremental import of the current month of pages-meta-history xmldumps into HDFS',
            timer_interval    => '*-*-* 03:00:00';

        # pages-meta-current xmldumps, timer fires at 05:00.
        'refinery-import-page-current-dumps':
            dump_type         => 'pages-meta-current',
            log_file_name     => 'import_pages_current_dumps.log',
            timer_description => 'Schedules daily an incremental import of the current month of pages-meta-current xmldumps into HDFS',
            timer_interval    => '*-*-* 05:00:00';
    }

}