Defined Type: dumps::web::fetches::analytics::job

Defined in:
modules/dumps/manifests/web/fetches/analytics/job.pp

Overview

Define dumps::web::fetches::analytics::job

Regularly copies files from $hdfs_source to $local_destination. Uses hdfs-rsync, systemd timers and Kerberos.

Parameters

hdfs_source

HDFS Source directory to pull data from.

local_destination

Destination directory on local filesystem to put data into.

interval

Systemd interval that the timer will use.

user

User running the Systemd timer.

delete

Add the --delete option to the hdfs-rsync command if true.

exclude

Add --exclude "$value" to the hdfs-rsync command if not undef.

ensure

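Ensure status of systemd timer.

ignore_missing_source

If true, the generated script first checks that the HDFS source exists and prints a message instead of failing when it is missing.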

Parameters:

  • hdfs_source (String)
  • local_destination (String)
  • interval (String)
  • user (String)
  • delete (Boolean) (defaults to: true)
  • ignore_missing_source (Boolean) (defaults to: false)
  • ensure (Wmflib::Ensure) (defaults to: present)
  • exclude (Optional[String]) (defaults to: undef)


29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'modules/dumps/manifests/web/fetches/analytics/job.pp', line 29

define dumps::web::fetches::analytics::job(
    String $hdfs_source,
    String $local_destination,
    String $interval,
    String $user,
    Boolean $delete = true,
    Boolean $ignore_missing_source = false,
    Wmflib::Ensure $ensure = present,
    Optional[String] $exclude = undef,
) {
    # Installs a small bash wrapper around hdfs-rsync that copies
    # hdfs://$hdfs_source into file://$local_destination, and schedules it
    # through kerberos::systemd_timer with the given $interval and $user.

    # The destination directory may be shared by several jobs, so only
    # declare it when no other resource has done so already.
    if !defined(File[$local_destination]) {
        file { $local_destination:
            ensure => 'directory',
            owner  => $user,
            group  => 'root',
        }
    }

    $delete_option = $delete ? {
        true    => '--delete',
        default => '',
    }

    # Quotes around the exclude value are on purpose to force
    # to parse it as a single value
    $exclude_option = $exclude ? {
        undef   => '',
        default => " --exclude \"${exclude}\"",
    }

    $script_path = "/usr/local/bin/rsync-analytics-${title}"
    $rsync_command = "/usr/local/bin/hdfs-rsync -r -t ${delete_option}${exclude_option} --perms --chmod D755,F644 hdfs://${hdfs_source} file://${local_destination}"
    $ignore_msg = "Ignoring missing hdfs source hdfs://${hdfs_source}"
    $head = "#!/bin/bash\n"

    # If $ignore_missing_source is enabled, add a check that prevents
    # hdfs-rsync to fail when the source directory is missing.
    # An explicit if/else is used instead of `check && rsync || echo`:
    # with the latter, a failure of hdfs-rsync itself would also trigger
    # the echo, report a misleading "missing source" message and make the
    # script exit 0, hiding real copy failures from systemd.
    $script_content = $ignore_missing_source ? {
        true    => "${head}if hdfs dfs -ls -d hdfs://${hdfs_source} > /dev/null 2>&1; then\n    ${rsync_command}\nelse\n    echo \"${ignore_msg}\"\nfi\n",
        default => "${head}${rsync_command}",
    }

    file { $script_path:
        ensure  => $ensure,
        content => $script_content,
        mode    => '0550',
        owner   => $user,
        group   => 'root',
    }

    # Propagate $ensure to the timer as well, so that ensure => absent does
    # not leave behind a timer pointing at a removed script.
    # NOTE(review): relies on kerberos::systemd_timer exposing an `ensure`
    # parameter - confirm against its definition.
    kerberos::systemd_timer { "analytics-dumps-fetch-${title}":
        ensure      => $ensure,
        description => "Copy ${title} files from Hadoop HDFS.",
        command     => $script_path,
        interval    => $interval,
        user        => $user,
        require     => File[$script_path],
    }
}