Puppet Class: profile::reportupdater::jobs

Defined in:
modules/profile/manifests/reportupdater/jobs.pp

Overview

Class profile::reportupdater::jobs

Installs reportupdater jobs that run on Hadoop/Hive. This profile should only be included in a single role.

This requires that a Hadoop client is installed and that the statistics compute role is applied, because its published_path is used here.



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'modules/profile/manifests/reportupdater/jobs.pp', line 7

class profile::reportupdater::jobs {

    # Hadoop packages and the analytics cluster client profile are pulled in
    # as hard dependencies of this class.
    require ::profile::analytics::cluster::packages::hadoop
    require ::profile::analytics::cluster::client

    # Root directory handed to reportupdater; also used below to locate the
    # output directory and the per-job config files.
    $base_path = '/srv/reportupdater'

    # Install reportupdater itself, running as 'analytics': the Analytics
    # 'system' user, which has access to the required files in Hadoop
    # (reportupdater launches Hadoop jobs from here).
    class { 'reportupdater':
        user      => 'analytics',
        base_path => $base_path,
    }

    # Publish the periodic job output. The 'periodic' directory sits under
    # published_path, so its content is synced to
    # analytics.wikimedia.org/published/datasets/periodic/reports;
    # 'reports' inside it is a symlink to reportupdater's output directory.
    $periodic_dir = "${::statistics::compute::published_path}/datasets/periodic"

    file {
        $periodic_dir:
            ensure => 'directory',
            owner  => 'root',
            group  => 'wikidev',
            mode   => '0775';

        "${periodic_dir}/reports":
            ensure  => 'link',
            target  => "${base_path}/output",
            require => Class['reportupdater'];
    }

    # Kerberos-enabled jobs. The first one, 'browser', creates browser
    # reports on the hive db.
    reportupdater::job {
        'browser':
            output_dir   => 'metrics/browser',
            use_kerberos => true;

        'interlanguage':
            output_dir   => 'metrics/interlanguage',
            use_kerberos => true;

        'pingback':
            output_dir   => 'metrics/pingback',
            use_kerberos => true;

        'reference-previews':
            output_dir   => 'metrics/reference-previews',
            use_kerberos => true;

        'wmcs':
            output_dir   => 'metrics/wmcs',
            use_kerberos => true;

        'structured-data':
            output_dir   => 'metrics/structured-data',
            use_kerberos => true;
    }

    # Various jobs executed by reportupdater that create several reports on
    # the mysql research db.
    #
    # Note on the two published_cx2_translations entries: these jobs used to
    # live on stat1007 (hive based) and on stat1006 (mysql based). They now
    # carry different job names, but they deliberately share one output
    # directory so that downstream rsync jobs can collect and merge the data.
    reportupdater::job {
        'flow-beta-features':
            output_dir => 'metrics/beta-feature-enables';

        'edit-beta-features':
            output_dir => 'metrics/beta-feature-enables';

        'language':
            output_dir => 'metrics/beta-feature-enables';

        'published_cx2_translations':
            config_file  => "${base_path}/jobs/reportupdater-queries/published_cx2_translations/config-hive.yaml",
            output_dir   => 'metrics/published_cx2_translations',
            use_kerberos => true;

        'published_cx2_translations_mysql':
            config_file => "${base_path}/jobs/reportupdater-queries/published_cx2_translations/config-mysql.yaml",
            output_dir  => 'metrics/published_cx2_translations',
            query_dir   => 'published_cx2_translations',
            interval    => '*-*-* *:30:00';

        'mt_engines':
            output_dir => 'metrics/mt_engines';

        'cx':
            output_dir => 'metrics/cx';

        'ee':
            output_dir => 'metrics/echo';

        'ee-beta-features':
            output_dir => 'metrics/beta-feature-enables';

        'page-creation':
            output_dir => 'metrics/page-creation';
    }
}