Puppet Class: dumps::web::fetches::stats

Defined in:
modules/dumps/manifests/web/fetches/stats.pp

Overview

Configures recurring hdfs-rsync jobs that copy analytics datasets (mediacounts, pageviews, unique devices, clickstream, mediawiki history dumps, geoeditors, pageview complete, and commons impact metrics) from the HDFS archive to the local miscellaneous datasets directory.

Parameters:

  • src_hdfs (Any) (defaults to: undef)
  • miscdatasetsdir (Any) (defaults to: undef)
  • user (Any) (defaults to: undef)
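
A minimal declaration sketch, assuming hypothetical values; the HDFS location, local directory, and user below are placeholders for illustration only, not the values used in production:

  class { 'dumps::web::fetches::stats':
      src_hdfs        => 'hdfs:///wmf/data/archive',   # placeholder HDFS path
      miscdatasetsdir => '/srv/dumps/public/other',    # placeholder local dir
      user            => 'dumpsgen',                   # placeholder rsync user
  }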


# File 'modules/dumps/manifests/web/fetches/stats.pp', line 1

class dumps::web::fetches::stats(
    $src_hdfs = undef,
    $miscdatasetsdir = undef,
    $user = undef,
) {
    # Each of these jobs has a readme.html file rendered by dumps::web::html.
    # We need to make sure the rsync --delete does not delete these files
    # which are put in place on the local destination host by puppet.
    Hdfs_tools::Hdfs_rsync_job {
        exclude => 'readme.html'
    }
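    # Note: the capitalised reference above is a Puppet resource default, so every
    # hdfs_tools::hdfs_rsync_job declared in this class inherits exclude => 'readme.html'.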

    # Copies over mediacounts files from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'mediacounts':
        hdfs_source       => "${src_hdfs}/mediacounts/",
        local_destination => "${miscdatasetsdir}/mediacounts/",
        interval          => '*-*-* *:41:00',
        user              => $user,
    }
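    # The interval is a systemd calendar expression (here: hourly at minute 41),
    # presumably used by hdfs_rsync_job to schedule the sync via a systemd timer.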

    # Copies over files with pageview statistics per page and project,
    # using the current definition of pageviews, from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'pageview':
        hdfs_source       => "${src_hdfs}/{pageview,projectview}/legacy/hourly/",
        local_destination => "${miscdatasetsdir}/pageviews/",
        interval          => '*-*-* *:51:00',
        user              => $user,
    }

    # Copies over files with unique devices statistics per project,
    # using the last access cookie method, from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'unique_devices':
        hdfs_source       => "${src_hdfs}/unique_devices/",
        local_destination => "${miscdatasetsdir}/unique_devices/",
        interval          => '*-*-* *:31:00',
        user              => $user,
    }

    # Copies over clickstream files from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'clickstream':
        hdfs_source       => "${src_hdfs}/clickstream/",
        local_destination => "${miscdatasetsdir}/clickstream/",
        interval          => '*-*-* *:04:00',
        user              => $user,
    }

    # Copies over mediawiki history dumps from HDFS archive
    # Copying only the last 2 dumps explicitly;
    # --delete will take care of deleting old ones.
    # Date portions of the command are extracted into variables for reuse.
    $date1_cmd = "\$(/bin/date --date=\"\$(/bin/date +%Y-%m-15) -1 month\" +\"%Y-%m\")"
    $date2_cmd = "\$(/bin/date --date=\"\$(/bin/date +%Y-%m-15) -2 month\" +\"%Y-%m\")"
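    # The "\$(" escaping keeps the command substitutions literal in the catalog, so
    # the date arithmetic is presumably evaluated by the shell when the job runs.
    # For example, a run in June 2024 yields "2024-05" for date1 and "2024-04" for
    # date2; anchoring on day 15 avoids end-of-month edge cases when subtracting months.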
    hdfs_tools::hdfs_rsync_job { 'mediawiki_history_dumps':
        hdfs_source           => "${src_hdfs}/mediawiki/history/{${date1_cmd},${date2_cmd}}",
        local_destination     => "${miscdatasetsdir}/mediawiki_history/",
        interval              => '*-*-* 05:00:00',
        user                  => $user,
        ignore_missing_source => true,
    }

    # Copies over geoeditors dumps from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'geoeditors_dumps':
        hdfs_source       => "${src_hdfs}/geoeditors/public/",
        local_destination => "${miscdatasetsdir}/geoeditors/",
        interval          => '*-*-* 06:00:00',
        user              => $user,
    }

    # Copies over pageview complete daily dumps from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'pageview_complete_dumps':
        hdfs_source       => "${src_hdfs}/pageview/complete/",
        local_destination => "${miscdatasetsdir}/pageview_complete/",
        interval          => '*-*-* 05:00:00',
        user              => $user,
    }

    # Copies over commons impact metrics dumps from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'commons_impact_metrics':
        hdfs_source       => "${src_hdfs}/commons/",
        local_destination => "${miscdatasetsdir}/commons_impact_metrics/",
        interval          => '*-*-* 06:00:00',
        user              => $user,
    }
}