Puppet Class: labstore::monitoring::interfaces

Defined in:
modules/labstore/manifests/monitoring/interfaces.pp

Overview

Parameters:

  • monitor_iface (Any) (defaults to: 'eth0')
  • contact_groups (Any) (defaults to: 'wmcs-team,admins')
  • int_throughput_warn (Any) (defaults to: '93750000')
  • int_throughput_crit (Any) (defaults to: '106250000')
  • load_warn (Any) (defaults to: $::processorcount * 0.75)
  • load_crit (Any) (defaults to: $::processorcount * 1.25)


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'modules/labstore/manifests/monitoring/interfaces.pp', line 11

# Declares graphite threshold checks for a labstore host: outgoing and
# incoming traffic on one network interface, CPU iowait, and 1-minute load
# average. Every check reads metrics under "servers.${::hostname}." and
# notifies the same contact groups.
#
# @param monitor_iface
#   Name of the network interface whose tx_byte / rx_byte metrics are checked.
# @param contact_groups
#   Value passed as contact_group to every check declared here.
# @param int_throughput_warn
#   Warning threshold for both network checks (default noted as 750Mbps).
# @param int_throughput_crit
#   Critical threshold for both network checks (default noted as 850Mbps).
# @param load_warn
#   Warning threshold for the load average check; defaults to 0.75 times
#   the processor count fact.
# @param load_crit
#   Critical threshold for the load average check; defaults to 1.25 times
#   the processor count fact.
class labstore::monitoring::interfaces(
    $monitor_iface       = 'eth0',
    $contact_groups      = 'wmcs-team,admins',
    $int_throughput_warn = '93750000',  # 750Mbps
    $int_throughput_crit = '106250000', # 850Mbps
    $load_warn           = $::processorcount * 0.75,
    $load_crit           = $::processorcount * 1.25,
) {

    # Evaluation window shared by every check below; see T188624.
    # Kept in one place so the checks cannot drift apart.
    $interval = '10min'

    # All checks link to the same Grafana dashboard.
    $dashboard_links = ['https://grafana.wikimedia.org/dashboard/db/labs-monitoring']

    monitoring::graphite_threshold { 'network_out_saturated':
        description     => 'Outgoing network saturation',
        dashboard_links => $dashboard_links,
        metric          => "servers.${::hostname}.network.${monitor_iface}.tx_byte",
        from            => $interval,
        warning         => $int_throughput_warn,
        critical        => $int_throughput_crit,
        percentage      => '10',        # smooth over peaks
        contact_group   => $contact_groups,
    }

    monitoring::graphite_threshold { 'network_in_saturated':
        description     => 'Incoming network saturation',
        dashboard_links => $dashboard_links,
        metric          => "servers.${::hostname}.network.${monitor_iface}.rx_byte",
        from            => $interval,
        warning         => $int_throughput_warn,
        critical        => $int_throughput_crit,
        percentage      => '10',        # smooth over peaks
        contact_group   => $contact_groups,
    }

    monitoring::graphite_threshold { 'high_iowait_stalling':
        description     => 'Persistent high iowait',
        dashboard_links => $dashboard_links,
        metric          => "servers.${::hostname}.cpu.total.iowait",
        from            => $interval,
        warning         => '40', # Based off looking at history of metric
        critical        => '60',
        percentage      => '50', # Ignore small spikes
        contact_group   => $contact_groups,
    }

    # Monitor for high load consistently, is a 'catchall'
    monitoring::graphite_threshold { 'high_load':
        description     => 'High load average',
        dashboard_links => $dashboard_links,
        metric          => "servers.${::hostname}.loadavg.01",
        from            => $interval,
        warning         => $load_warn,
        critical        => $load_crit,
        percentage      => '85', # Don't freak out on spikes
        contact_group   => $contact_groups,
    }
}