Puppet Class: dumps::web::fetches::stats

Defined in:
modules/dumps/manifests/web/fetches/stats.pp

Overview

Class: dumps::web::fetches::stats

Parameters:

src_hdfs_archive

Archive directory to source from. These datasets are meant for archival, with the intent that they never be deleted.

src_hdfs_exports

Exports directory to source from. These datasets are temporary file exports, with the intent to keep just the last N.

miscdatasetsdir

The local destination to sync datasets to.

user

The unix user to perform the sync as.

Parameters:

  • src_hdfs_archive (Any) (defaults to: undef)
  • src_hdfs_exports (Any) (defaults to: undef)
  • miscdatasetsdir (Any) (defaults to: undef)
  • user (Any) (defaults to: undef)


17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'modules/dumps/manifests/web/fetches/stats.pp', line 17

# Class: dumps::web::fetches::stats
#
# Declares periodic hdfs_tools::hdfs_rsync_job resources that copy
# analytics statistics datasets out of HDFS to a local destination
# directory on this host.
#
# @param src_hdfs_archive
#   HDFS archive directory to source from. Datasets under this path are
#   meant for archival, with the intent that they never be deleted.
# @param src_hdfs_exports
#   HDFS exports directory to source from. Datasets under this path are
#   temporary file exports; only the last N are kept.
# @param miscdatasetsdir
#   The local destination directory to sync datasets into.
# @param user
#   The unix user to perform the sync as.
class dumps::web::fetches::stats(
    $src_hdfs_archive = undef,
    $src_hdfs_exports = undef,
    $miscdatasetsdir = undef,
    $user = undef,
) {
    # Each of these jobs have a readme.html file rendered by dumps::web::html.
    # We need to make sure the rsync --delete does not delete these files
    # which are put in place on the local destination host by puppet.
    # NOTE: this is a resource-default declaration — the exclude applies to
    # every hdfs_tools::hdfs_rsync_job declared in this class's scope.
    Hdfs_tools::Hdfs_rsync_job {
        exclude => 'readme.html'
    }

    # Copies over mediacounts files from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'mediacounts':
        hdfs_source       => "${src_hdfs_archive}/mediacounts/",
        local_destination => "${miscdatasetsdir}/mediacounts/",
        interval          => '*-*-* *:41:00',
        user              => $user,
    }

    # Copies over files with pageview statistics per page and project,
    # using the current definition of pageviews, from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'pageview':
        hdfs_source       => "${src_hdfs_archive}/{pageview,projectview}/legacy/hourly/",
        local_destination => "${miscdatasetsdir}/pageviews/",
        interval          => '*-*-* *:51:00',
        user              => $user,
    }

    # Copies over files with unique devices statistics per project,
    # using the last access cookie method, from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'unique_devices':
        hdfs_source       => "${src_hdfs_archive}/unique_devices/",
        local_destination => "${miscdatasetsdir}/unique_devices/",
        interval          => '*-*-* *:31:00',
        user              => $user,
    }

    # Copies over clickstream files from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'clickstream':
        hdfs_source       => "${src_hdfs_archive}/clickstream/",
        local_destination => "${miscdatasetsdir}/clickstream/",
        interval          => '*-*-* *:04:00',
        user              => $user,
    }

    # Copies over mediawiki history dumps from HDFS archive
    # Copying only the last 2 dumps explicitly
    # --delete will take care of deleting old ones
    # Dates portions of the command are extracted as variables for reusability
    # (the escaped $(...) substitutions are evaluated by the shell at sync
    # time, not by puppet, yielding the previous two YYYY-MM month strings)
    $date1_cmd = "\$(/bin/date --date=\"\$(/bin/date +%Y-%m-15) -1 month\" +\"%Y-%m\")"
    $date2_cmd = "\$(/bin/date --date=\"\$(/bin/date +%Y-%m-15) -2 month\" +\"%Y-%m\")"
    hdfs_tools::hdfs_rsync_job { 'mediawiki_history_dumps':
        hdfs_source           => "${src_hdfs_archive}/mediawiki/history/{${date1_cmd},${date2_cmd}}",
        local_destination     => "${miscdatasetsdir}/mediawiki_history/",
        interval              => '*-*-* 05:00:00',
        user                  => $user,
        ignore_missing_source => true,
    }

    # Copies over geoeditors dumps from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'geoeditors_dumps':
        hdfs_source       => "${src_hdfs_archive}/geoeditors/public/",
        local_destination => "${miscdatasetsdir}/geoeditors/",
        interval          => '*-*-* 06:00:00',
        user              => $user,
    }

    # Copies over pageview complete daily dumps from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'pageview_complete_dumps':
        hdfs_source       => "${src_hdfs_archive}/pageview/complete/",
        local_destination => "${miscdatasetsdir}/pageview_complete/",
        interval          => '*-*-* 05:00:00',
        user              => $user,
    }

    # Copies over commons impact metrics dumps from HDFS archive
    hdfs_tools::hdfs_rsync_job { 'commons_impact_metrics':
        hdfs_source       => "${src_hdfs_archive}/commons/",
        local_destination => "${miscdatasetsdir}/commons_impact_metrics/",
        interval          => '*-*-* 06:00:00',
        user              => $user,
    }

    # Copies over cirrus index dumps from the HDFS exports directory
    # (note: src_hdfs_exports, not src_hdfs_archive — only the last N
    # of these exports are kept upstream)
    # NOTE(review): local_destination here lacks the trailing slash used by
    # every other job above — confirm the intended rsync semantics.
    hdfs_tools::hdfs_rsync_job { 'cirrus_index_dumps':
        hdfs_source       => "${src_hdfs_exports}/cirrus-search-index/",
        local_destination => "${miscdatasetsdir}/cirrus_search_index",
        interval          => '*-*-* 05:00:00',
        user              => $user,
    }
}