Puppet Class: labstore::monitoring::interfaces

Defined in:
modules/labstore/manifests/monitoring/interfaces.pp

Overview

Sets up Prometheus-backed Icinga checks for a labstore host: outgoing and incoming network saturation on the monitored interface, persistent high iowait across all CPUs, and a catch-all alert on the 1-minute load average.

Parameters:

  • monitor_iface (Any) (defaults to: 'eth0')
  • contact_groups (Any) (defaults to: 'wmcs-team,admins')
  • int_throughput_warn (Any) (defaults to: 93750000)
  • int_throughput_crit (Any) (defaults to: 106250000)
  • load_warn (Any) (defaults to: $::processorcount * 0.75)
  • load_crit (Any) (defaults to: $::processorcount * 1.25)

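A minimal usage sketch (the interface name and threshold values below are hypothetical, shown only to illustrate overriding the defaults):

class { 'labstore::monitoring::interfaces':
    monitor_iface       => 'eno1',
    int_throughput_warn => 117187500, # 937.5 Mbps expressed in bytes/s
    int_throughput_crit => 122500000, # 980 Mbps expressed in bytes/s
}

With no overrides, a plain include labstore::monitoring::interfaces (or the equivalent Hiera automatic parameter lookup) applies the defaults listed above.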

# File 'modules/labstore/manifests/monitoring/interfaces.pp', line 11

class labstore::monitoring::interfaces(
    $monitor_iface = 'eth0',
    $contact_groups='wmcs-team,admins',
    $int_throughput_warn = 93750000,  # 750Mbps
    $int_throughput_crit = 106250000, # 850Mbps
    $load_warn = $::processorcount * 0.75,
    $load_crit = $::processorcount * 1.25,
) {

    # In minutes, how long icinga will wait before considering HARD state, see also T188624
    $retries = 10

    monitoring::check_prometheus { 'network_out_saturated':
        description     => 'Outgoing network saturation',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/labs-monitoring'],
        query           => "sum(irate(node_network_transmit_bytes_total{instance=\"${::hostname}:9100\",device=\"${monitor_iface}\"}[5m]))",
        warning         => $int_throughput_warn,
        critical        => $int_throughput_crit,
        retries         => $retries,
        method          => 'ge',
        contact_group   => $contact_groups,
        notes_link      => 'https://wikitech.wikimedia.org/wiki/Portal:Data_Services/Admin/Labstore',
        prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops",
    }

    monitoring::check_prometheus { 'network_in_saturated':
        description     => 'Incoming network saturation',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/labs-monitoring'],
        query           => "sum(irate(node_network_receive_bytes_total{instance=\"${::hostname}:9100\",device=\"${monitor_iface}\"}[5m]))",
        warning         => $int_throughput_warn,
        critical        => $int_throughput_crit,
        retries         => $retries,
        method          => 'ge',
        contact_group   => $contact_groups,
        notes_link      => 'https://wikitech.wikimedia.org/wiki/Portal:Data_Services/Admin/Labstore',
        prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops",
    }

    monitoring::check_prometheus { 'high_iowait_stalling':
        description     => 'Persistent high iowait',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/labs-monitoring'],
        # iowait % across all CPUs
        query           => "100 * sum(irate(node_cpu_seconds_total{instance=\"${::hostname}:9100\",mode=\"iowait\"}[5m])) / scalar(count(node_cpu_seconds_total{mode=\"idle\",instance=\"${::hostname}:9100\"}))",
        warning         => 5,
        critical        => 10,
        retries         => $retries,
        method          => 'ge',
        contact_group   => $contact_groups,
        notes_link      => 'https://wikitech.wikimedia.org/wiki/Portal:Data_Services/Admin/Labstore',
        prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops",
    }

    # Monitor for high load consistently, is a 'catchall'
    monitoring::check_prometheus { 'high_load':
        description     => 'High 1m load average',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/labs-monitoring'],
        query           => "quantile_over_time(.85, node_load1{instance=\"${::hostname}:9100\"}[10m])",
        warning         => $load_warn,
        critical        => $load_crit,
        retries         => $retries,
        method          => 'ge',
        contact_group   => 'wmcs-team-email,wmcs-bots',
        notes_link      => 'https://wikitech.wikimedia.org/wiki/Portal:Data_Services/Admin/Labstore',
        prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops",
    }
}
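
The network checks compare a byte rate (irate over node_network_transmit_bytes_total and node_network_receive_bytes_total) against thresholds that are therefore in bytes per second; the 750 Mbps / 850 Mbps comments in the source translate as in this sketch:

# Sketch only: deriving the default thresholds from a bit rate in Mbps.
# 750 Mbps = 750,000,000 bits/s / 8 = 93,750,000  bytes/s (int_throughput_warn)
# 850 Mbps = 850,000,000 bits/s / 8 = 106,250,000 bytes/s (int_throughput_crit)
$warn_mbps           = 750
$crit_mbps           = 850
$int_throughput_warn = $warn_mbps * 1000000 / 8  # => 93750000
$int_throughput_crit = $crit_mbps * 1000000 / 8  # => 106250000

The load defaults scale with the host: on a machine where $::processorcount is 8, load_warn evaluates to 6.0 and load_crit to 10.0.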