Puppet Class: profile::analytics::airflow

Defined in:
modules/profile/manifests/analytics/airflow.pp

Overview

Class profile::analytics::airflow

Set up an apache-airflow instance to coordinate tasks in the analytics cluster.

Parameters:

  • service_user (String) (defaults to: lookup('profile::analytics::airflow::service_user'))
  • service_group (String) (defaults to: lookup('profile::analytics::airflow::service_group'))
  • webserver_port (Stdlib::Port) (defaults to: lookup('profile::analytics::airflow::webserver_port'))
  • mysql_host (Stdlib::Fqdn) (defaults to: lookup('profile::analytics::airflow::mysql_host'))
  • db_name (String) (defaults to: lookup('profile::analytics::airflow::db_name'))
  • deploy_target (String) (defaults to: lookup('profile::analytics::airflow::deploy_target'))
  • deploy_target_plugins (String) (defaults to: lookup('profile::analytics::airflow::deploy_target_plugins'))
  • deploy_user (String) (defaults to: lookup('profile::analytics::airflow::deploy_user'))
  • deploy_dir (Stdlib::Unixpath) (defaults to: lookup('profile::analytics::airflow::deploy_dir'))
  • airflow_dir (Stdlib::Unixpath) (defaults to: lookup('profile::analytics::airflow::airflow_dir'))
  • log_dir (Stdlib::Unixpath) (defaults to: lookup('profile::analytics::airflow::log_dir'))
  • run_dir (Stdlib::Unixpath) (defaults to: lookup('profile::analytics::airflow::run_dir'))
  • conf_dir (Stdlib::Unixpath) (defaults to: lookup('profile::analytics::airflow::conf_dir'))
  • conf_file (String) (defaults to: lookup('profile::analytics::airflow::conf_file'))
  • db_user (String) (defaults to: lookup('profile::analytics::airflow::db_user'))
  • db_password (String) (defaults to: lookup('profile::analytics::airflow::db_password'))


6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'modules/profile/manifests/analytics/airflow.pp', line 6

# @summary Set up an apache-airflow instance to coordinate tasks in the analytics cluster.
#
# Declares the Python packages, the airflow wrapper script, two scap
# deployment targets, the airflow configuration file, the log/run
# directories, a daily log cleanup timer and the webserver, scheduler and
# kerberos systemd services. Every parameter defaults to the hiera key of the
# same name.
#
# @param service_user
#   Owner of $log_dir and $run_dir, and the user that $deploy_user may run
#   'airflow variables' as through sudo.
# @param service_group
#   Group of the configuration file and of $log_dir / $run_dir; also handed to
#   statistics::mysql_credentials. The configuration file requires
#   Group[$service_group], so that resource has to be declared elsewhere.
# @param webserver_port
#   Not referenced in this manifest; presumably read by the ERB templates -
#   TODO confirm.
# @param mysql_host
#   Host part of the SQLAlchemy connection URL.
# @param db_name
#   Database name part of the SQLAlchemy connection URL.
# @param deploy_target
#   Title of the scap::target carrying the upstream airflow code and its
#   dependencies.
# @param deploy_target_plugins
#   Title of the scap::target carrying the dags and plugins.
# @param deploy_user
#   deploy_user passed to both scap targets. When it differs from
#   $service_user a sudo rule is added for it.
# @param deploy_dir
#   Not referenced in this manifest; presumably read by the ERB templates -
#   TODO confirm.
# @param airflow_dir
#   Not referenced in this manifest; presumably read by the ERB templates -
#   TODO confirm.
# @param log_dir
#   Directory created for the daemons, owned by $service_user.
# @param run_dir
#   Directory created for the daemons, owned by $service_user.
# @param conf_dir
#   Root-owned directory that holds the configuration file.
# @param conf_file
#   File name of the configuration file inside $conf_dir.
# @param db_user
#   User part of the SQLAlchemy connection URL.
# @param db_password
#   Password part of the SQLAlchemy connection URL.
class profile::analytics::airflow(
    String $service_user          = lookup('profile::analytics::airflow::service_user'),
    String $service_group         = lookup('profile::analytics::airflow::service_group'),
    Stdlib::Port $webserver_port  = lookup('profile::analytics::airflow::webserver_port'),
    Stdlib::Fqdn $mysql_host      = lookup('profile::analytics::airflow::mysql_host'),
    String $db_name               = lookup('profile::analytics::airflow::db_name'),
    String $deploy_target         = lookup('profile::analytics::airflow::deploy_target'),
    String $deploy_target_plugins = lookup('profile::analytics::airflow::deploy_target_plugins'),
    String $deploy_user           = lookup('profile::analytics::airflow::deploy_user'),
    Stdlib::Unixpath $deploy_dir  = lookup('profile::analytics::airflow::deploy_dir'),
    Stdlib::Unixpath $airflow_dir = lookup('profile::analytics::airflow::airflow_dir'),
    Stdlib::Unixpath $log_dir     = lookup('profile::analytics::airflow::log_dir'),
    Stdlib::Unixpath $run_dir     = lookup('profile::analytics::airflow::run_dir'),
    Stdlib::Unixpath $conf_dir    = lookup('profile::analytics::airflow::conf_dir'),
    String $conf_file             = lookup('profile::analytics::airflow::conf_file'),
    String $db_user               = lookup('profile::analytics::airflow::db_user'),
    String $db_password           = lookup('profile::analytics::airflow::db_password'),
) {
    # Paths that are referenced by more than one resource below.
    # NOTE(review): the ERB templates rendered by this class are not visible
    # here and may read $airflow_wrapper by name - do not rename it.
    $airflow_wrapper     = '/usr/local/bin/airflow'
    $log_cleaner         = '/usr/local/bin/airflow-clean-log-dirs'
    $config_path         = "${conf_dir}/${conf_file}"

    # Everything a daemon unit needs on disk before it is set up.
    $daemon_requirements = File[$log_dir, $run_dir, $config_path, $airflow_wrapper]

    ensure_packages(['python3', 'python3-virtualenv', 'virtualenv', 'python3-pip', 'python3-mysqldb'])

    # Wrapper that runs the airflow command in the right context.
    file { $airflow_wrapper:
        ensure  => present,
        content => template('profile/analytics/airflow/airflow.sh.erb'),
        mode    => '0555',
        owner   => 'root',
        group   => 'root',
    }

    # Upstream airflow code + dependencies.
    scap::target { $deploy_target:
        deploy_user => $deploy_user,
    }

    # Dags + plugins.
    scap::target { $deploy_target_plugins:
        deploy_user => $deploy_user,
    }

    # Only needed when scap deploys as somebody other than the service user:
    # lets the deploy user load revision controlled variables as the service
    # user. The command path is the same path as $airflow_wrapper.
    unless $deploy_user == $service_user {
        sudo::user { "scap_${deploy_user}_${service_user}":
            user       => $deploy_user,
            privileges => ["ALL=(${service_user}) NOPASSWD: /usr/local/bin/airflow variables *"],
        }
    }

    file { $conf_dir:
        ensure => 'directory',
        mode   => '0755',
        owner  => 'root',
        group  => 'root',
    }

    # Not used by any resource in this manifest; presumably consumed by
    # airflow.cfg.erb - keep the name and keep it ahead of that template.
    # NOTE(review): the credentials are interpolated as-is, which assumes they
    # contain no URL-reserved characters - confirm.
    $sql_alchemy_conn = "mysql://${db_user}:${db_password}@${mysql_host}/${db_name}?ssl_ca=/etc/ssl/certs/wmf-ca-certificates.crt"

    # Root-owned and group-readable only: read access is limited because the
    # file stores passwords.
    file { $config_path:
        ensure  => present,
        content => template('profile/analytics/airflow/airflow.cfg.erb'),
        mode    => '0440',
        owner   => 'root',
        group   => $service_group,
        require => Group[$service_group],
    }

    # Locations the daemons write to.
    file { [$log_dir, $run_dir]:
        ensure => 'directory',
        mode   => '0755',
        owner  => $service_user,
        group  => $service_group,
    }

    # Log cleanup script, run daily by the timer below.
    file { $log_cleaner:
        content => template('profile/analytics/airflow/airflow-clean-log-dirs.erb'),
        owner   => 'root',
        group   => 'root',
        mode    => '0550',
    }

    systemd::timer::job { 'airflow_clean_log_dirs':
        description => 'Delete Airflow log dirs/files after 30 days',
        user        => 'root',
        command     => $log_cleaner,
        interval    => {
            'start'    => 'OnCalendar',
            # Every day at 3:00
            'interval' => '*-*-* 03:00:00',
        },
        require     => File[$log_cleaner],
    }

    # The three daemons share the same on-disk prerequisites; each one is
    # also registered with profile::auto_restarts::service.
    systemd::service { 'airflow-webserver':
        content => template('profile/analytics/airflow/webserver.service.erb'),
        require => $daemon_requirements,
    }

    profile::auto_restarts::service { 'airflow-webserver': }

    systemd::service { 'airflow-scheduler':
        content => template('profile/analytics/airflow/scheduler.service.erb'),
        require => $daemon_requirements,
    }

    profile::auto_restarts::service { 'airflow-scheduler': }

    systemd::service { 'airflow-kerberos':
        content => template('profile/analytics/airflow/kerberos.service.erb'),
        require => $daemon_requirements,
    }

    profile::auto_restarts::service { 'airflow-kerberos': }

    # Include analytics mediawiki sql replica credentials at
    # /etc/mysql/conf.d/analytics-research-client.cnf. This is only readable to
    # users in analytics-privatedata-users group, $service_user must be
    # externally configured as a member of this group.
    statistics::mysql_credentials { $service_group:
        group => $service_group,
    }
}