Defined Type: prometheus::blackbox::check::icmp
- Defined in:
- modules/prometheus/manifests/blackbox/check/icmp.pp
Summary
Resource to configure ICMP checks for a specific service.
Overview
SPDX-License-Identifier: Apache-2.0
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# File 'modules/prometheus/manifests/blackbox/check/icmp.pp', line 11
# @summary resource to configure icmp checks for a specific service
#
# For every requested IP family this define builds a blackbox-exporter ICMP
# module and a probe target, plus a single `ProbeDown` alerting rule covering
# those modules. The three resulting resources (module, alert rule, targets
# file) are not realised locally: they are handed to
# `wmflib::resource::export`, tagged with the site and Prometheus instance.
#
# @param instance_label
#   Label placed in front of the probed address in the target string
#   (`<instance_label>:0@<address>`).
# @param ip4
#   IPv4 address to probe when 'ip4' is listed in `ip_families`.
# @param ip6
#   IPv6 address to probe when 'ip6' is listed in `ip_families`.
# @param ip_families
#   IP families to generate a module and a target for.
# @param team
#   Value of the `team` label on the alert.
# @param severity
#   Value of the `severity` label on the alert. When set to 'page' the alert
#   summary is suffixed with ' #page'.
# @param timeout
#   Probe timeout written into each generated module.
#   NOTE(review): the pattern is unanchored, so it only checks that the value
#   contains a digit run followed by 'm' or 's' -- confirm whether stricter
#   validation is wanted.
# @param site
#   Site used to build the tags of the exported resources.
# @param prometheus_instance
#   Prometheus instance the check belongs to; used in the targets file path,
#   in the export tags and as the `instance` of the exported rule.
define prometheus::blackbox::check::icmp (
    Stdlib::Fqdn                          $instance_label      = $facts['networking']['hostname'],
    Stdlib::IP::Address::V4::Nosubnet     $ip4                 = $facts['networking']['ip'],
    Stdlib::IP::Address::V6::Nosubnet     $ip6                 = $facts['networking']['ip6'],
    Array[Enum['ip4', 'ip6']]             $ip_families         = ['ip4', 'ip6'],
    String[1]                             $team                = 'sre',
    Prometheus::Alert::Severity           $severity            = 'critical',
    Pattern[/\d+[ms]/]                    $timeout             = '3s',
    Wmflib::Sites                         $site                = $::site,  # lint:ignore:top_scope_facts
    Prometheus::Blackbox::Check::Instance $prometheus_instance = 'ops',
) {
    # Resource titles may contain arbitrary characters; module and file names
    # derived from them are restricted to word characters.
    $safe_title = $title.regsubst('\W', '_', 'G')
    $module_title = $safe_title
    $alert_title = "alerts_${safe_title}.yml"
    $target_file = "/srv/prometheus/${prometheus_instance}/targets/probes-custom_puppet-icmp.yaml"

    # Common prefix of the tags attached to the exported resources. It is built
    # from the $site parameter (which defaults to the local site) so that an
    # explicitly passed site is honoured rather than silently ignored.
    $tag_prefix = "prometheus::blackbox::check::icmp::${site}::${prometheus_instance}"

    $icmp_module_params = {
        'ip_protocol_fallback' => false,
    }

    # One blackbox module per IP family, named icmp_<safe_title>_<family>.
    $module_config = {
        'modules' => Hash($ip_families.map |$family| {
            [
                "icmp_${safe_title}_${family}",
                {
                    'prober'  => 'icmp',
                    'timeout' => $timeout,
                    'icmp'    => $icmp_module_params + { 'preferred_ip_protocol' => $family },
                },
            ]
        }),
    }

    # One target entry per IP family, pointing at the matching module.
    $target_config = $ip_families.map |$family| {
        $address = $family ? {
            'ip4'   => $ip4,
            default => $ip6,
        }
        {
            'labels'  => {
                'address' => $address,
                'family'  => $family,
                'module'  => "icmp_${safe_title}_${family}",
            },
            'targets' => ["${instance_label}:0@${address}"],
        }
    }

    $page_text = $severity ? {
        'page'  => ' #page',
        default => '',
    }

    $alert_config = {
        'groups' => [
            {
                'name'  => 'puppet_probes',
                'rules' => [
                    {
                        'alert'       => 'ProbeDown',
                        # Match only the modules generated above. A trailing
                        # '.*' would also match the modules of any other check
                        # whose title merely starts with this one (e.g. 'foo'
                        # vs. 'foo_bar').
                        'expr'        => "avg_over_time(probe_success{module=~'icmp_${safe_title}_ip[46]'}[1m]) * 100 < 75",
                        'for'         => '2m',
                        'labels'      => {
                            'team'     => $team,
                            'severity' => $severity,
                        },
                        'annotations' => {
                            'description' => '{{ $labels.instance }} failed when probed by {{ $labels.module }} from {{ $externalLabels.site }}. Availability is {{ $value }}%.',
                            'summary'     => "Service {{ \$labels.instance }} has failed probes ({{ \$labels.module }})${page_text}",
                            'dashboard'   => 'https://grafana.wikimedia.org/d/O0nHhdhnz/network-probes-overview?var-job={{ $labels.job }}&var-module=All',
                            'logs'        => 'https://logstash.wikimedia.org/app/dashboards#/view/f3e709c0-a5f8-11ec-bf8e-43f1807d5bc2?_g=(filters:!((query:(match_phrase:(service.name:{{ $labels.module }})))))',
                            'runbook'     => 'https://wikitech.wikimedia.org/wiki/Network_monitoring#ProbeDown',
                        },
                    },
                ],
            },
        ],
    }

    $module_params = {
        'content' => $module_config.wmflib::to_yaml,
        'tag'     => "${tag_prefix}::module",
    }
    $alert_rule_params = {
        'instance' => $prometheus_instance,
        'content'  => $alert_config.wmflib::to_yaml,
        'tag'      => "${tag_prefix}::alert",
    }
    $target_frag_params = {
        'ensure'  => 'file',
        'content' => $target_config.wmflib::to_yaml,
        'tag'     => "${tag_prefix}::target",
    }

    wmflib::resource::export('prometheus::blackbox::module', $module_title, $title, $module_params)
    wmflib::resource::export('prometheus::rule', $alert_title, $title, $alert_rule_params)
    wmflib::resource::export('file', $target_file, $title, $target_frag_params)
}
|