Puppet Class: profile::analytics::refinery::job::sqoop_mediawiki

Defined in:
modules/profile/manifests/analytics/refinery/job/sqoop_mediawiki.pp

Overview

SPDX-License-Identifier: Apache-2.0

Class profile::analytics::refinery::job::sqoop_mediawiki

Schedules sqoop to import MediaWiki databases into Hadoop monthly and daily. NOTE: This requires that role::analytics_cluster::mysql_password has been included somewhere, so that the MySQL client password files used by this class (e.g. /user/analytics/mysql-analytics-research-client-pw.txt) exist in HDFS. (We can't require it here, since it must be included only once, on a different node.)

Parameters:

  • ensure_timers (Wmflib::Ensure) (defaults to: lookup('profile::analytics::refinery::job::sqoop_mediawiki::ensure_timers', { 'default_value' => 'present' }))


9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'modules/profile/manifests/analytics/refinery/job/sqoop_mediawiki.pp', line 9

class profile::analytics::refinery::job::sqoop_mediawiki (
    Wmflib::Ensure $ensure_timers = lookup('profile::analytics::refinery::job::sqoop_mediawiki::ensure_timers', { 'default_value' => 'present' }),
){
    # $ensure_timers is applied to every wrapper script and every systemd timer
    # declared below, so a single value adds or removes all of the sqoop jobs.
    # The /tmp/sqoop-jars directory is always ensured, independently of it.
    require ::profile::analytics::refinery

    include ::passwords::mysql::analytics_labsdb
    include ::passwords::mysql::research

    # NOTE(review): most of the variables below are not referenced again in this
    # manifest; presumably they are read by the ERB templates rendered further
    # down -- confirm against the templates before renaming or removing any.
    $refinery_path              = $profile::analytics::refinery::path

    # Shortcut var to DRY up the job commands (adds refinery's python dir to PYTHONPATH).
    $env = "export PYTHONPATH=\${PYTHONPATH}:${refinery_path}/python"

    $output_directory_labs      = '/wmf/data/raw/mediawiki/tables'
    $output_directory_private   = '/wmf/data/raw/mediawiki_private/tables'
    $wiki_file                  = '/mnt/hdfs/wmf/refinery/current/static_data/mediawiki/grouped_wikis/grouped_wikis.csv'
    # We sqoop most tables out of clouddb so that data is pre-sanitized.
    $labs_db_user               = $::passwords::mysql::analytics_labsdb::user
    $labs_log_file              = "${::profile::analytics::refinery::log_dir}/sqoop-mediawiki.log"
    # Sqoop anything private out of analytics-store
    $private_db_user            = $::passwords::mysql::research::user
    $private_log_file           = "${::profile::analytics::refinery::log_dir}/sqoop-mediawiki-private.log"
    # Separate logs for sqoops from production replicas
    $production_log_file        = "${::profile::analytics::refinery::log_dir}/sqoop-mediawiki-production.log"
    $production_daily_log_file  = "${::profile::analytics::refinery::log_dir}/sqoop-mediawiki-production-daily.log"
    $wikifunctions_production_log_file = "${::profile::analytics::refinery::log_dir}/sqoop-wikifunctions-production.log"
    # These are rendered elsewhere by role::analytics_cluster::mysql_password.
    $db_password_labs           = '/user/analytics/mysql-analytics-labsdb-client-pw.txt'
    $db_password_private        = '/user/analytics/mysql-analytics-research-client-pw.txt'
    # number of parallel processors to use when sqooping (querying MySQL)
    $num_processors             = 10
    # number of sqoop mappers to use for jobs getting data
    # since the beginning of wiki times or since 1 month
    $num_mappers_all_times      = 64
    $num_mappers_one_month      = 4
    # Yarn queue to run sqoop jobs in: production
    $yarn_queue                 = 'production'

    ############################################################################
    # Template uses num_mappers_all_times

    # sqoop tables needed by the mediawiki history data pipeline, from cloud replicas
    file { '/usr/local/bin/refinery-sqoop-mediawiki-history':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-history.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # sqoop tables not needed by the mediawiki history data pipeline, from cloud replicas
    file { '/usr/local/bin/refinery-sqoop-mediawiki-not-history':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-not-history.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # sqoop from analytics-store replicas the tables needed by mediawiki-history
    # Tables not available on cloud replicas for privacy reasons
    file { '/usr/local/bin/refinery-sqoop-mediawiki-production-history':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-production-history.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # sqoop from analytics-store replicas the tables not needed by mediawiki-history
    # Tables not available on cloud replicas for privacy reasons
    file { '/usr/local/bin/refinery-sqoop-mediawiki-production-not-history':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-production-not-history.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # Wrapper script for the monthly whole-MediaWiki import. It is ordered after
    # the four per-source scripts above via the require below.
    file { '/usr/local/bin/refinery-sqoop-whole-mediawiki':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-whole-mediawiki.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
        require => File[
            '/usr/local/bin/refinery-sqoop-mediawiki-history',
            '/usr/local/bin/refinery-sqoop-mediawiki-not-history',
            '/usr/local/bin/refinery-sqoop-mediawiki-production-history',
            '/usr/local/bin/refinery-sqoop-mediawiki-production-not-history'
            ],
    }

    # Used to store sqoop-generated jar that is rebuilt at each script run
    file { '/tmp/sqoop-jars':
        ensure => directory,
        mode   => '0755',
        owner  => 'analytics',
        group  => 'analytics',
    }

    # Runs on the 1st of every month at 00:00.
    kerberos::systemd_timer { 'refinery-sqoop-whole-mediawiki':
        ensure      => $ensure_timers,
        description => 'Schedules sqoop to import whole MediaWiki databases into Hadoop monthly.',
        command     => '/usr/local/bin/refinery-sqoop-whole-mediawiki',
        interval    => '*-*-01 00:00:00',
        user        => 'analytics',
        require     => [File['/usr/local/bin/refinery-sqoop-whole-mediawiki'], File['/tmp/sqoop-jars']],
    }

    ############################################################################
    # 1 month of tables from analytics-store, expected to last less than 2 hours
    # Template uses num_mappers_one_month
    # Tables: cu_changes

    file { '/usr/local/bin/refinery-sqoop-mediawiki-private':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-private.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # Runs on the 2nd of every month at 00:00.
    kerberos::systemd_timer { 'refinery-sqoop-mediawiki-private':
        ensure      => $ensure_timers,
        description => 'Schedules sqoop to import MediaWiki databases (containing PII data) into Hadoop monthly.',
        command     => '/usr/local/bin/refinery-sqoop-mediawiki-private',
        interval    => '*-*-02 00:00:00',
        user        => 'analytics',
        require     => [File['/usr/local/bin/refinery-sqoop-mediawiki-private'], File['/tmp/sqoop-jars']],
    }

    ############################################################################
    # daily sqoop of all data in some small tables.  Expected to last an hour or
    # two on most runs and not use up too many resources.
    # Template uses num_mappers_one_month
    # Tables: discussiontools_subscription

    file { '/usr/local/bin/refinery-sqoop-mediawiki-production-daily':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-production-daily.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # Runs every day at 05:00. The timer depends on its wrapper script, like the
    # monthly timers above, so it is never managed before the script it executes.
    kerberos::systemd_timer { 'refinery-sqoop-mediawiki-production-daily':
        ensure      => $ensure_timers,
        description => 'Schedules sqoop to import one-off MediaWiki tables into Hadoop daily.',
        command     => '/usr/local/bin/refinery-sqoop-mediawiki-production-daily',
        interval    => '*-*-* 05:00:00',
        user        => 'analytics',
        require     => [File['/usr/local/bin/refinery-sqoop-mediawiki-production-daily'], File['/tmp/sqoop-jars']],
    }

    ############################################################################
    # daily sqoop of data in WikiLambda tables. (WikiLambda is a MediaWiki
    # extension and a component of Wikifunctions.)  Expected to not use up too
    # many resources for the foreseeable future.
    # Template uses num_mappers_one_month
    # Tables: wikilambda_zobject_labels,wikilambda_zobject_function_join

    file { '/usr/local/bin/refinery-sqoop-wikifunctions-production':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-wikifunctions-production.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # Runs every day at 05:00. The timer depends on its wrapper script, like the
    # monthly timers above, so it is never managed before the script it executes.
    kerberos::systemd_timer { 'refinery-sqoop-wikifunctions-production':
        ensure      => $ensure_timers,
        description => 'Schedules sqoop to import WikiLambda tables into Hadoop daily.',
        command     => '/usr/local/bin/refinery-sqoop-wikifunctions-production',
        interval    => '*-*-* 05:00:00',
        user        => 'analytics',
        require     => [File['/usr/local/bin/refinery-sqoop-wikifunctions-production'], File['/tmp/sqoop-jars']],
    }
}