Puppet Class: profile::analytics::refinery::job::sqoop_mediawiki

Defined in:
modules/profile/manifests/analytics/refinery/job/sqoop_mediawiki.pp

Overview

Class profile::analytics::refinery::job::sqoop_mediawiki

Schedules sqoop to import MediaWiki databases into Hadoop monthly. NOTE: This requires that role::analytics_cluster::mysql_password has been included somewhere, so that /user/analytics/mysql-analytics-labsdb-client-pw.txt and /user/analytics/mysql-analytics-research-client-pw.txt exist in HDFS. (We can't require it here, since it must be included only once, on a different node.)

Parameters:

  • use_kerberos (Any) (defaults to: lookup('profile::analytics::refinery::job::sqoop_mediawiki::use_kerberos', { 'default_value' => false }))
  • ensure_timers (Any) (defaults to: lookup('profile::analytics::refinery::job::sqoop_mediawiki::ensure_timers', { 'default_value' => 'present' }))


8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'modules/profile/manifests/analytics/refinery/job/sqoop_mediawiki.pp', line 8

class profile::analytics::refinery::job::sqoop_mediawiki (
    # Forwarded unchanged to every kerberos::systemd_timer declared in this class.
    $use_kerberos = lookup('profile::analytics::refinery::job::sqoop_mediawiki::use_kerberos', { 'default_value' => false }),
    # 'ensure' value applied to all wrapper scripts and all timers declared in this class.
    $ensure_timers = lookup('profile::analytics::refinery::job::sqoop_mediawiki::ensure_timers', { 'default_value' => 'present' }),
){
    require profile::analytics::refinery

    include passwords::mysql::analytics_labsdb
    include passwords::mysql::research

    # NOTE(review): most variables below are not referenced again in this class;
    # presumably they are read by the ERB templates rendered further down, so
    # their names must stay stable -- confirm against the templates before renaming.

    $refinery_path              = $profile::analytics::refinery::path

    # Shell snippet that appends refinery's python directory to PYTHONPATH,
    # kept in one place so the job commands do not have to repeat it.
    $env = "export PYTHONPATH=\${PYTHONPATH}:${refinery_path}/python"

    # HDFS output locations, one per data source.
    $output_directory_labs      = '/wmf/data/raw/mediawiki/tables'
    $output_directory_private   = '/wmf/data/raw/mediawiki_private/tables'

    # CSV lists of wikis to sqoop, one per data source.
    $wiki_file_labs             = '/mnt/hdfs/wmf/refinery/current/static_data/mediawiki/grouped_wikis/labs_grouped_wikis.csv'
    $wiki_file_private          = '/mnt/hdfs/wmf/refinery/current/static_data/mediawiki/grouped_wikis/prod_grouped_wikis.csv'

    # Most tables are sqooped out of labsdb so that the data is pre-sanitized.
    $labs_db_user               = $passwords::mysql::analytics_labsdb::user
    $labs_log_file              = "${profile::analytics::refinery::log_dir}/sqoop-mediawiki.log"

    # Anything private is sqooped out of analytics-store.
    $private_db_user            = $passwords::mysql::research::user
    $private_log_file           = "${profile::analytics::refinery::log_dir}/sqoop-mediawiki-private.log"

    # Sqoops from production replicas get their own log.
    $production_log_file        = "${profile::analytics::refinery::log_dir}/sqoop-mediawiki-production.log"

    # Password files; rendered elsewhere by role::analytics_cluster::mysql_password.
    $db_password_labs           = '/user/analytics/mysql-analytics-labsdb-client-pw.txt'
    $db_password_private        = '/user/analytics/mysql-analytics-research-client-pw.txt'

    # Number of parallel processors to use when sqooping (querying MySQL).
    $num_processors             = 10

    # Number of sqoop mappers: one setting for jobs fetching data since the
    # beginning of wiki times, one for jobs fetching a single month.
    $num_mappers_all_times      = 64
    $num_mappers_one_month      = 4

    # Yarn queue in which the sqoop jobs run.
    $yarn_queue                 = 'production'

    ############################################################################
    # Wrapper scripts. All share ownership, mode and ensure state, so those
    # attributes are set once through the per-expression defaults.
    #
    # - refinery-sqoop-mediawiki / refinery-sqoop-mediawiki-production:
    #   entire-tables sqoops (templates use num_mappers_all_times).
    # - refinery-sqoop-whole-mediawiki: wrapper running the entire-tables sqoop
    #   from labsdb followed by the entire-tables sqoop from analytics-store;
    #   it requires the two scripts above.
    # - refinery-sqoop-mediawiki-private: 1 month of tables from
    #   analytics-store, expected to last less than 2 hours (template uses
    #   num_mappers_one_month). Tables: cu_changes

    file {
        default:
            ensure => $ensure_timers,
            owner  => 'analytics',
            group  => 'analytics',
            mode   => '0550',
        ;
        '/usr/local/bin/refinery-sqoop-mediawiki':
            content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki.sh.erb'),
        ;
        '/usr/local/bin/refinery-sqoop-mediawiki-production':
            content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-production.sh.erb'),
        ;
        '/usr/local/bin/refinery-sqoop-whole-mediawiki':
            content => template('profile/analytics/refinery/job/refinery-sqoop-whole-mediawiki.sh.erb'),
            require => [
                File['/usr/local/bin/refinery-sqoop-mediawiki'],
                File['/usr/local/bin/refinery-sqoop-mediawiki-production'],
            ],
        ;
        '/usr/local/bin/refinery-sqoop-mediawiki-private':
            content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-private.sh.erb'),
        ;
    }

    ############################################################################
    # Timers. Both run as the 'analytics' user; each one requires the wrapper
    # script it executes. The whole-mediawiki import fires on the 1st of the
    # month, the private import on the 2nd.

    kerberos::systemd_timer {
        default:
            ensure       => $ensure_timers,
            user         => 'analytics',
            use_kerberos => $use_kerberos,
        ;
        'refinery-sqoop-whole-mediawiki':
            description => 'Schedules sqoop to import whole MediaWiki databases into Hadoop monthly.',
            command     => '/usr/local/bin/refinery-sqoop-whole-mediawiki',
            interval    => '*-*-01 00:00:00',
            require     => File['/usr/local/bin/refinery-sqoop-whole-mediawiki'],
        ;
        'refinery-sqoop-mediawiki-private':
            description => 'Schedules sqoop to import MediaWiki databases (containing PII data) into Hadoop monthly.',
            command     => '/usr/local/bin/refinery-sqoop-mediawiki-private',
            interval    => '*-*-02 00:00:00',
            require     => File['/usr/local/bin/refinery-sqoop-mediawiki-private'],
        ;
    }
}