Puppet Class: labstore::monitoring::interfaces

Defined in:
modules/labstore/manifests/monitoring/interfaces.pp

Overview

Parameters:

  • monitor_iface (String) (defaults to: 'eth0')
  • contact_groups (String) (defaults to: 'wmcs-team,admins')
  • int_throughput_warn (Integer) (defaults to: 937500000)


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'modules/labstore/manifests/monitoring/interfaces.pp', line 11

# @summary Declares the Prometheus-backed checks for a labstore host: network
#   saturation in each direction on one interface, plus persistent high iowait.
#
# @param monitor_iface
#   Device name matched against the `device` label in both network queries.
# @param contact_groups
#   Passed as `contact_group` to every check declared by this class.
# @param int_throughput_warn
#   Warning threshold shared by both network checks. The queries take irate()
#   of byte counters, so the value is in bytes per second (default: 7.5Gbps).
class labstore::monitoring::interfaces(
    String  $monitor_iface       = 'eth0',
    String  $contact_groups      = 'wmcs-team,admins',
    Integer $int_throughput_warn = 937500000,  # 7.5Gbps
){

    # How long, in minutes, icinga waits before it considers the state HARD
    # (see also T188624).
    $retries = 10

    # Critical threshold for both network checks, in bytes per second (850Gbps).
    # Absurdly large on purpose, to avoid pages.
    $int_throughput_crit = 106250000000

    # All three checks share the attributes in the `default` body; each named
    # body below only adds what is specific to that check.
    monitoring::check_prometheus {
        default:
            dashboard_links => ['https://grafana.wikimedia.org/d/000000568/wmcs-dumps-general-view'],
            retries         => $retries,
            method          => 'ge',
            contact_group   => $contact_groups,
            notes_link      => 'https://wikitech.wikimedia.org/wiki/Portal:Data_Services/Admin/Shared_storage#Dumps',
            prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops";

        'network_out_saturated':
            description => 'Outgoing network saturation',
            # Transmit rate on the monitored device of this host
            query       => "sum(irate(node_network_transmit_bytes_total{instance=\"${::hostname}:9100\",device=\"${monitor_iface}\"}[5m]))",
            warning     => $int_throughput_warn,
            critical    => $int_throughput_crit;

        'network_in_saturated':
            description => 'Incoming network saturation',
            # Receive rate on the monitored device of this host
            query       => "sum(irate(node_network_receive_bytes_total{instance=\"${::hostname}:9100\",device=\"${monitor_iface}\"}[5m]))",
            warning     => $int_throughput_warn,
            critical    => $int_throughput_crit;

        'high_iowait_stalling':
            description => 'Persistent high iowait',
            # iowait % across all CPUs: summed iowait rate divided by the
            # number of per-CPU "idle" series for this host
            query       => "100 * sum(irate(node_cpu_seconds_total{instance=\"${::hostname}:9100\",mode=\"iowait\"}[5m])) / scalar(count(node_cpu_seconds_total{mode=\"idle\",instance=\"${::hostname}:9100\"}))",
            warning     => 5,
            # Big number to avoid pages
            critical    => 10000;
    }
}