Puppet Class: profile::webperf::processors

Defined in:
modules/profile/manifests/webperf/processors.pp

Overview

Class: profile::webperf::processors

Provision the webperf data processors. These consume from Kafka (including EventLogging) and produce metrics to StatsD and Graphite.

Contact: performance-team@wikimedia.org

See also: <https://wikitech.wikimedia.org/wiki/Webperf>

Services:

  • statsv

  • navtiming

  • coal

Parameters:

  • statsd (Any) (defaults to: hiera('statsd'))
  • graphite_host (Any) (defaults to: hiera('graphite_host'))
  • prometheus_nodes (Any) (defaults to: hiera('prometheus_nodes'))

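A minimal sketch of how this profile is typically wired up: a role class includes it, and Hiera supplies the three lookups above. The role name and the Hiera values shown here are illustrative assumptions, not taken from this page or from production data.

# Hypothetical role applying this profile; production uses its own role class.
class role::webperf_processors_example {
    include ::profile::webperf::processors
}

# Hiera keys the profile looks up, with example (non-authoritative) values:
#   statsd:           'statsd.example.wmnet:8125'   # a "host:port" string; the class splits it
#   graphite_host:    'graphite-in.example.wmnet'
#   prometheus_nodes:
#     - 'prometheus1001.example.wmnet'
#     - 'prometheus1002.example.wmnet'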

# File 'modules/profile/manifests/webperf/processors.pp', line 15

class profile::webperf::processors(
    $statsd = hiera('statsd'),
    $graphite_host = hiera('graphite_host'),
    $prometheus_nodes = hiera('prometheus_nodes')
) {
    $statsd_parts = split($statsd, ':')
    $statsd_host = $statsd_parts[0]
    $statsd_port = 0 + $statsd_parts[1]

    # statsv is on main kafka, not analytics or jumbo kafka.
    # Note that at any given time, all statsv varnishkafka producers are
    # configured to send to only one kafka cluster (usually main-eqiad).
    # statsv in an inactive datacenter will not process any messages, as
    # varnishkafka will not produce any messages to that DC's kafka cluster.
    # This is configured by the value of the hiera param
    # profile::cache::kafka::statsv::kafka_cluster_name when the statsv varnishkafka
    # profile is included (as of this writing on text caches).
    $kafka_main_config = kafka_config('main')
    $kafka_main_brokers = $kafka_main_config['brokers']['string']
    # Consume statsd metrics from Kafka and emit them to statsd.
    class { '::webperf::statsv':
        kafka_brokers     => $kafka_main_brokers,
        kafka_api_version => $kafka_main_config['api_version'],
        statsd_host       => $statsd_host,
        statsd_port       => $statsd_port,
    }

    # EventLogging is on the jumbo kafka. Unlike the main one, this
    # is not yet mirrored to other data centers, so for prod,
    # assume eqiad.
    $kafka_config  = kafka_config('jumbo', 'eqiad')
    $kafka_brokers = $kafka_config['brokers']['string']

    # Aggregate client-side latency measurements collected via the
    # NavigationTiming MediaWiki extension and send them to Graphite.
    # See <https://www.mediawiki.org/wiki/Extension:NavigationTiming>
    class { '::webperf::navtiming':
        kafka_brokers => $kafka_brokers,
        statsd_host   => $statsd_host,
        statsd_port   => $statsd_port,
    }

    # navtiming exports Prometheus metrics on port 9230.
    if $::realm == 'labs' {
        $ferm_srange = '$LABS_NETWORKS'
    } else {
        $prometheus_ferm_nodes = join($prometheus_nodes, ' ')
        $ferm_srange = "(@resolve((${prometheus_ferm_nodes})) @resolve((${prometheus_ferm_nodes}), AAAA))"
    }
    ferm::service { 'prometheus-navtiming-exporter':
        proto  => 'tcp',
        port   => '9230',
        srange => $ferm_srange,
    }

    monitoring::check_prometheus { 'webperf-navtiming-latest-handled':
        description     => 'too long since latest timing beacon',
        query           => 'time() - min(webperf_latest_handled_time_seconds)',
        method          => 'gt',
        warning         => 900,   # 15 minutes; <60 seconds is normal
        critical        => 86400, # 1 day
        prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops",
        contact_group   => 'team-performance',
        dashboard_links => ['https://grafana.wikimedia.org/d/000000143/navigation-timing'],
    }

    monitoring::check_prometheus { 'webperf-navtiming-error-rate':
        description     => 'high navtiming exception rate',
        query           => 'rate(webperf_errors[5m])', # Python exceptions per second
        method          => 'gt',
        warning         => 0.1, # 0 is normal
        critical        => 1,
        prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops",
        contact_group   => 'team-performance',
        dashboard_links => ['https://grafana.wikimedia.org/d/000000143/navigation-timing'],
    }

    monitoring::check_prometheus { 'webperf-navtiming-invalid-message-rate':
        description     => 'high navtiming invalid event rate',
        query           => 'sum(rate(webperf_navtiming_invalid_events[5m]))', # discards per second, across all groups
        method          => 'gt',
        warning         => 1, # ~0.2-0.5 is normal
        critical        => 5,
        prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops",
        contact_group   => 'team-performance',
        dashboard_links => ['https://grafana.wikimedia.org/d/000000143/navigation-timing'],
    }

    # Make a valid target for coal, and set up what's needed for the consumer
    # Consumes from the jumbo-eqiad cluster, just like navtiming
    class { '::coal::processor':
        kafka_brokers => $kafka_brokers,
        graphite_host => $graphite_host,
    }
}