Defined Type: prometheus::blackbox::check::icmp

Defined in:
modules/prometheus/manifests/blackbox/check/icmp.pp

Summary

resource to configure icmp checks for a specific service

Overview

SPDX-License-Identifier: Apache-2.0

Parameters:

  • instance_label (Stdlib::Fqdn) (defaults to: $facts['networking']['hostname'])

    The host part of the 'instance' label to use

  • ip4 (Stdlib::IP::Address::V4::Nosubnet) (defaults to: $facts['networking']['ip'])

    The IPv4 address to connect to

  • ip6 (Stdlib::IP::Address::V6::Nosubnet) (defaults to: $facts['networking']['ip6'])

    The IPv6 address to connect to

  • ip_families (Array[Enum['ip4', 'ip6']]) (defaults to: ['ip4', 'ip6'])

    The IP families to probe ('ip4' and/or 'ip6'); one probe module and one target are generated per family

  • team (String[1]) (defaults to: 'sre')

    the WMF team to alert

  • severity (Prometheus::Alert::Severity) (defaults to: 'critical')

    The severity of the alert

  • timeout (Pattern[/\d+[ms]/]) (defaults to: '3s')

    the probe timeout

  • prometheus_instance (Prometheus::Blackbox::Check::Instance) (defaults to: 'ops')

    prometheus instance to deploy to, defaults to 'ops'

  • site (Wmflib::Sites) (defaults to: $::site)


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'modules/prometheus/manifests/blackbox/check/icmp.pp', line 11

define prometheus::blackbox::check::icmp (
    Stdlib::Fqdn                          $instance_label      = $facts['networking']['hostname'],
    Stdlib::IP::Address::V4::Nosubnet     $ip4                 = $facts['networking']['ip'],
    Stdlib::IP::Address::V6::Nosubnet     $ip6                 = $facts['networking']['ip6'],
    Array[Enum['ip4', 'ip6']]             $ip_families         = ['ip4', 'ip6'],
    String[1]                             $team                = 'sre',
    Prometheus::Alert::Severity           $severity            = 'critical',
    Pattern[/\d+[ms]/]                    $timeout             = '3s',
    Wmflib::Sites                         $site                = $::site,  # lint:ignore:top_scope_facts
    Prometheus::Blackbox::Check::Instance $prometheus_instance = 'ops',
) {
    # Builds three pieces of configuration for an ICMP blackbox check and
    # passes each one to wmflib::resource::export:
    #   1. blackbox exporter probe modules (one per entry in $ip_families),
    #   2. a Prometheus alerting rule file ('ProbeDown'),
    #   3. a targets file fragment listing what to probe.

    # Replace every non-word character in the title with '_' so it can be
    # embedded in module names and in the alert rule file name.
    $safe_title = $title.regsubst('\W', '_', 'G')
    $alert_title = "alerts_${safe_title}.yml"
    $target_file = "/srv/prometheus/${prometheus_instance}/targets/probes-custom_puppet-icmp.yaml"

    # All export tags are scoped by $site and $prometheus_instance.
    # The $site parameter is used here (rather than the top-scope $::site) so
    # that an explicitly passed site actually takes effect; with the default
    # value the resulting tags are unchanged.
    # NOTE(review): presumably the importing Prometheus host selects on these
    # tags - confirm against the importer before overriding $site.
    $tag_prefix = "prometheus::blackbox::check::icmp::${site}::${prometheus_instance}"

    # Settings shared by every generated module; each module additionally pins
    # its own 'preferred_ip_protocol', and fallback to the other family is off.
    $icmp_module_params = {
        'ip_protocol_fallback' => false,
    }

    # One module per IP family, named "icmp_<safe_title>_<family>".
    $module_config = {
        'modules' => Hash($ip_families.map |$family| {
            [ "icmp_${safe_title}_${family}",
              {
                  'prober'  => 'icmp',
                  'timeout' => $timeout,
                  'icmp'    => $icmp_module_params + { 'preferred_ip_protocol' => $family },
              }
            ]
        }),
    }

    # One target entry per IP family, pointing at $ip4 or $ip6 respectively and
    # labelled with the matching module name.
    $target_config = $ip_families.map |$family| {
        $address = ($family == 'ip4').bool2str($ip4, $ip6)
        $data = {
            'labels' => {
                'address' => $address,
                'family'  => $family,
                'module'  => "icmp_${safe_title}_${family}",
            },
            'targets' => ["${instance_label}:0@${address}"],
        }
        $data
    }

    # Only the 'page' severity gets the ' #page' suffix in the alert summary.
    $page_text = $severity ? {
        'page'  => ' #page',
        default => '',
    }

    # Alert when the 1-minute average of probe_success (as a percentage) for
    # any of this check's modules stays below 75 for 2 minutes.
    # The previous no-op filter of undef entries on this literal, single-entry
    # list has been dropped.
    $alert_config = {
        'groups' => [
          {
            'name'  => 'puppet_probes',
            'rules' => [{
                'alert'       => 'ProbeDown',
                'expr'        => "avg_over_time(probe_success{module=~'icmp_${safe_title}_.*'}[1m]) * 100 < 75",
                'for'         => '2m',
                'labels'      => {
                    'team'     => $team,
                    'severity' => $severity,
                },
                'annotations' => {
                    'description' => '{{ $labels.instance }} failed when probed by {{ $labels.module }} from {{ $externalLabels.site }}. Availability is {{ $value }}%.',
                    'summary'     => "Service {{ \$labels.instance }} has failed probes ({{ \$labels.module }})${page_text}",
                    'dashboard'   => 'https://grafana.wikimedia.org/d/O0nHhdhnz/network-probes-overview?var-job={{ $labels.job }}&var-module=All',
                    'logs'        => 'https://logstash.wikimedia.org/app/dashboards#/view/f3e709c0-a5f8-11ec-bf8e-43f1807d5bc2?_g=(filters:!((query:(match_phrase:(service.name:{{ $labels.module }})))))',
                    'runbook'     => 'https://wikitech.wikimedia.org/wiki/Network_monitoring#ProbeDown',
                },
            }],
          },
        ],
    }

    $module_params = {
        'content' => $module_config.wmflib::to_yaml,
        'tag'     => "${tag_prefix}::module",
    }
    $alert_rule_params = {
        'instance' => $prometheus_instance,
        'content'  => $alert_config.wmflib::to_yaml,
        'tag'      => "${tag_prefix}::alert",
    }
    $target_frag_params = {
        'ensure'  => 'file',
        'content' => $target_config.wmflib::to_yaml,
        'tag'     => "${tag_prefix}::target",
    }

    # The module resource is exported under the sanitised title itself.
    wmflib::resource::export('prometheus::blackbox::module', $safe_title, $title, $module_params)
    wmflib::resource::export('prometheus::rule', $alert_title, $title, $alert_rule_params)
    wmflib::resource::export('file', $target_file, $title, $target_frag_params)
}