Defined Type: dumps::web::fetches::analytics::job

Defined in:
modules/dumps/manifests/web/fetches/analytics/job.pp

Overview

Define dumps::web::fetches::analytics::job

Regularly copies files from $hdfs_source to $local_destination. Uses hdfs-rsync, systemd timers and Kerberos.

Parameters

hdfs_source

HDFS Source directory to pull data from.

local_destination

Destination directory on local filesystem to put data into.

interval

Systemd interval that the timer will use.

user

User running the Systemd timer.

delete

Add the --delete option to the hdfs-rsync command if true.

exclude

Add --exclude "$value" to the hdfs-rsync command if not undef.

use_kerberos

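Authenticate via Kerberos before executing the systemd timer.

ignore_missing_source

If true, check that the HDFS source directory exists before copying; when it is missing, print a message instead of letting hdfs-rsync fail.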

ensure

Ensure status of systemd timer.

Parameters:

  • hdfs_source (String)
  • local_destination (String)
  • interval (String)
  • user (String)
  • delete (Boolean) (defaults to: true)
  • use_kerberos (Boolean) (defaults to: false)
  • ignore_missing_source (Boolean) (defaults to: false)
  • ensure (Wmflib::Ensure) (defaults to: present)
  • exclude (Optional[String]) (defaults to: undef)


33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'modules/dumps/manifests/web/fetches/analytics/job.pp', line 33

define dumps::web::fetches::analytics::job(
    String $hdfs_source,
    String $local_destination,
    String $interval,
    String $user,
    Boolean $delete = true,
    Boolean $use_kerberos = false,
    Boolean $ignore_missing_source = false,
    Wmflib::Ensure $ensure = present,
    Optional[String] $exclude = undef,
) {
    # Several jobs may share one destination directory, so only declare it
    # if no other resource has done so already (avoids duplicate declarations).
    if !defined(File[$local_destination]) {
        file { $local_destination:
            ensure => 'directory',
            owner  => $user,
            group  => 'root',
        }
    }

    $delete_option = $delete ? {
        true    => '--delete',
        default => ''
    }

    # Quotes around the exclude value are on purpose to force
    # to parse it as a single value
    $exclude_option = $exclude ? {
        undef   => '',
        default => " --exclude \"${exclude}\""
    }

    $rsync_command = "/usr/local/bin/hdfs-rsync -r -t ${delete_option}${exclude_option} --chmod=go-w hdfs://${hdfs_source} file://${local_destination}"
    $ignore_msg = "Ignoring missing hdfs source hdfs://${hdfs_source}"
    $head = "#!/bin/bash\n"

    # If $ignore_missing_source is enabled, add a check that prevents
    # hdfs-rsync from failing when the source directory is missing.
    #
    # An explicit if/else is used rather than `check && rsync || echo`:
    # with the latter, a genuine hdfs-rsync failure would also fall through
    # to the echo, be misreported as a missing source and exit 0. With
    # if/else the script exits with hdfs-rsync's own status when the source
    # exists, so real failures still fail the systemd unit.
    $source_check = "hdfs dfs -ls -d hdfs://${hdfs_source} > /dev/null 2>&1"
    $guarded_command = "if ${source_check}; then\n    ${rsync_command}\nelse\n    echo \"${ignore_msg}\"\nfi\n"

    $script_content = $ignore_missing_source ? {
        true    => "${head}${guarded_command}",
        default => "${head}${rsync_command}"
    }

    # Wrapper script executed by the timer below.
    file { "/usr/local/bin/rsync-analytics-${title}":
        ensure  => $ensure,
        content => $script_content,
        mode    => '0550',
        owner   => $user,
        group   => 'root',
    }

    # NOTE(review): $ensure is applied to the script above but is not forwarded
    # to the timer. If kerberos::systemd_timer accepts an ensure parameter,
    # pass it here so that ensure => absent also removes the timer - confirm
    # against that define before changing.
    kerberos::systemd_timer { "analytics-dumps-fetch-${title}":
        description  => "Copy ${title} files from Hadoop HDFS.",
        command      => "/usr/local/bin/rsync-analytics-${title}",
        interval     => $interval,
        user         => $user,
        use_kerberos => $use_kerberos,
        require      => File["/usr/local/bin/rsync-analytics-${title}"],
    }
}