Puppet Class: profile::analytics::refinery::job::sqoop_mediawiki
- Defined in:
- modules/profile/manifests/analytics/refinery/job/sqoop_mediawiki.pp
Overview
SPDX-License-Identifier: Apache-2.0
Class profile::analytics::refinery::job::sqoop_mediawiki
Schedules sqoop to import MediaWiki databases into Hadoop monthly and daily. NOTE: This requires that role::analytics_cluster::mysql_password has been included somewhere, so that the MySQL password files read by these jobs (e.g. /user/analytics/mysql-analytics-research-client-pw.txt) exist in HDFS. (We can't require it here, since it needs to be included only once, on a different node.)
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# File 'modules/profile/manifests/analytics/refinery/job/sqoop_mediawiki.pp', line 9
class profile::analytics::refinery::job::sqoop_mediawiki (
    # Applied as `ensure` to every wrapper script and systemd timer declared
    # below; looked up from hiera and defaults to 'present'.
    Wmflib::Ensure $ensure_timers = lookup('profile::analytics::refinery::job::sqoop_mediawiki::ensure_timers', { 'default_value' => 'present' }),
){
    require ::profile::analytics::refinery
    include ::passwords::mysql::analytics_labsdb
    include ::passwords::mysql::research

    # NOTE(review): most of the variables below are not referenced anywhere
    # else in this manifest. ERB templates rendered with template() can read
    # class-scope variables, so they are presumably consumed by the wrapper
    # script templates -- confirm against the templates before renaming or
    # removing any of them.

    $refinery_path = $profile::analytics::refinery::path

    # Shortcut var to DRY up the job commands (historically cron commands;
    # this class now only declares systemd timers).
    $env = "export PYTHONPATH=\${PYTHONPATH}:${refinery_path}/python"

    $output_directory_labs    = '/wmf/data/raw/mediawiki/tables'
    $output_directory_private = '/wmf/data/raw/mediawiki_private/tables'
    $wiki_file                = '/mnt/hdfs/wmf/refinery/current/static_data/mediawiki/grouped_wikis/grouped_wikis.csv'

    # We sqoop most tables out of clouddb so that data is pre-sanitized.
    $labs_db_user  = $::passwords::mysql::analytics_labsdb::user
    $labs_log_file = "${::profile::analytics::refinery::log_dir}/sqoop-mediawiki.log"

    # Sqoop anything private out of analytics-store
    $private_db_user  = $::passwords::mysql::research::user
    $private_log_file = "${::profile::analytics::refinery::log_dir}/sqoop-mediawiki-private.log"

    # Separate logs for sqoops from production replicas
    $production_log_file               = "${::profile::analytics::refinery::log_dir}/sqoop-mediawiki-production.log"
    $production_daily_log_file         = "${::profile::analytics::refinery::log_dir}/sqoop-mediawiki-production-daily.log"
    $wikifunctions_production_log_file = "${::profile::analytics::refinery::log_dir}/sqoop-wikifunctions-production.log"

    # These are rendered elsewhere by role::analytics_cluster::mysql_password.
    $db_password_labs    = '/user/analytics/mysql-analytics-labsdb-client-pw.txt'
    $db_password_private = '/user/analytics/mysql-analytics-research-client-pw.txt'

    # number of parallel processors to use when sqooping (querying MySQL)
    $num_processors = 10

    # number of sqoop mappers to use for jobs getting data
    # since the beginning of wiki times or since 1 month
    $num_mappers_all_times = 64
    $num_mappers_one_month = 4

    # Yarn queue to run sqoop jobs in: production
    $yarn_queue = 'production'

    ############################################################################
    # Monthly sqoop of whole MediaWiki databases.
    # Template uses num_mappers_all_times

    # sqoop tables needed by the mediawiki history data pipeline, from cloud replicas
    file { '/usr/local/bin/refinery-sqoop-mediawiki-history':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-history.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # sqoop tables not needed by the mediawiki history data pipeline, from cloud replicas
    file { '/usr/local/bin/refinery-sqoop-mediawiki-not-history':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-not-history.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # sqoop from analytics-store replicas, tables not available on cloud replicas for privacy reasons
    file { '/usr/local/bin/refinery-sqoop-mediawiki-production':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-production.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # Entry point run by the monthly timer. It is ordered after the three
    # scripts above via `require`.
    file { '/usr/local/bin/refinery-sqoop-whole-mediawiki':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-whole-mediawiki.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
        require => File[
            '/usr/local/bin/refinery-sqoop-mediawiki-history',
            '/usr/local/bin/refinery-sqoop-mediawiki-not-history',
            '/usr/local/bin/refinery-sqoop-mediawiki-production'
        ],
    }

    # Used to store sqoop-generated jar that is rebuilt at each script run.
    # Always managed as a directory, independently of $ensure_timers.
    file { '/tmp/sqoop-jars':
        ensure => directory,
        mode   => '0755',
        owner  => 'analytics',
        group  => 'analytics',
    }

    # Fires on the 1st of every month at 00:00:00.
    kerberos::systemd_timer { 'refinery-sqoop-whole-mediawiki':
        ensure      => $ensure_timers,
        description => 'Schedules sqoop to import whole MediaWiki databases into Hadoop monthly.',
        command     => '/usr/local/bin/refinery-sqoop-whole-mediawiki',
        interval    => '*-*-01 00:00:00',
        user        => 'analytics',
        require     => [File['/usr/local/bin/refinery-sqoop-whole-mediawiki'], File['/tmp/sqoop-jars']],
    }

    ############################################################################
    # 1 month of tables from analytics-store, expected to last less than 2 hours
    # Template uses num_mappers_one_month
    # Tables: cu_changes
    file { '/usr/local/bin/refinery-sqoop-mediawiki-private':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-private.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # Fires on the 2nd of every month at 00:00:00.
    kerberos::systemd_timer { 'refinery-sqoop-mediawiki-private':
        ensure      => $ensure_timers,
        description => 'Schedules sqoop to import MediaWiki databases (containing PII data) into Hadoop monthly.',
        command     => '/usr/local/bin/refinery-sqoop-mediawiki-private',
        interval    => '*-*-02 00:00:00',
        user        => 'analytics',
        require     => [File['/usr/local/bin/refinery-sqoop-mediawiki-private'], File['/tmp/sqoop-jars']],
    }

    ############################################################################
    # daily sqoop of all data in some small tables. Expected to last an hour or
    # two on most runs and not use up too many resources.
    # Template uses num_mappers_one_month
    # Tables: discussiontools_subscription
    file { '/usr/local/bin/refinery-sqoop-mediawiki-production-daily':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-mediawiki-production-daily.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # Fires every day at 05:00:00. The timer requires its own script (like the
    # monthly timers do) so that Puppet never sets up the timer before the
    # command it runs has been managed.
    kerberos::systemd_timer { 'refinery-sqoop-mediawiki-production-daily':
        ensure      => $ensure_timers,
        description => 'Schedules sqoop to import one-off MediaWiki tables into Hadoop daily.',
        command     => '/usr/local/bin/refinery-sqoop-mediawiki-production-daily',
        interval    => '*-*-* 05:00:00',
        user        => 'analytics',
        require     => [File['/usr/local/bin/refinery-sqoop-mediawiki-production-daily'], File['/tmp/sqoop-jars']],
    }

    ############################################################################
    # daily sqoop of data in WikiLambda tables. (WikiLambda is a MediaWiki
    # extension and a component of Wikifunctions.) Expected to not use up too
    # many resources for the foreseeable future.
    # Template uses num_mappers_one_month
    # Tables: wikilambda_zobject_labels,wikilambda_zobject_function_join
    file { '/usr/local/bin/refinery-sqoop-wikifunctions-production':
        ensure  => $ensure_timers,
        content => template('profile/analytics/refinery/job/refinery-sqoop-wikifunctions-production.sh.erb'),
        mode    => '0550',
        owner   => 'analytics',
        group   => 'analytics',
    }

    # Fires every day at 05:00:00, the same time as the production-daily
    # timer above. Requires its own script for the same ordering reason.
    kerberos::systemd_timer { 'refinery-sqoop-wikifunctions-production':
        ensure      => $ensure_timers,
        description => 'Schedules sqoop to import WikiLambda tables into Hadoop daily.',
        command     => '/usr/local/bin/refinery-sqoop-wikifunctions-production',
        interval    => '*-*-* 05:00:00',
        user        => 'analytics',
        require     => [File['/usr/local/bin/refinery-sqoop-wikifunctions-production'], File['/tmp/sqoop-jars']],
    }
}
|