Defined Type: prometheus::blackbox::check::tcp

Defined in:
modules/prometheus/manifests/blackbox/check/tcp.pp

Summary

resource to configure tcp checks for a specific service

Overview

SPDX-License-Identifier: Apache-2.0

Parameters:

  • server_name (Stdlib::Fqdn) (defaults to: $facts['networking']['hostname'])

    the server name to use (during TLS)

  • instance_label (Stdlib::Fqdn) (defaults to: $facts['networking']['hostname'])

    the host part of the 'instance' label to use

  • ip4 (Stdlib::IP::Address::V4::Nosubnet) (defaults to: $facts['networking']['ip'])

    The IP address to connect to

  • ip6 (Stdlib::IP::Address::V6::Nosubnet) (defaults to: $facts['networking']['ip6'])

    The IP6 address to connect to

  • ip_families (Array[Enum['ip4', 'ip6']]) (defaults to: ['ip4', 'ip6'])

    indicate support for ipv4 and/or ipv6

  • team (String[1]) (defaults to: 'sre')

    the WMF team to alert

  • severity (Prometheus::Alert::Severity) (defaults to: 'critical')

    The severity of the alert

  • port (Stdlib::Port) (defaults to: 443)

    the port to run a specific check on

  • force_tls (Boolean) (defaults to: false)

    if true, force TLS; otherwise TLS is used only when the port is 443

  • certificate_expiry_days (Integer[1,120]) (defaults to: 10)

    alert when the certificate will expire in fewer than this many days

  • timeout (Pattern[/\d+[ms]/]) (defaults to: '3s')

    the probe timeout

  • use_client_auth (Boolean) (defaults to: false)

    use client authentication

  • client_auth_cert (Stdlib::Unixpath) (defaults to: '/etc/prometheus/ssl/cert.pem')

    path to the client auth certificate to use. Please note this file must exist on the monitoring server, not on the server being monitored

  • client_auth_key (Stdlib::Unixpath) (defaults to: '/etc/prometheus/ssl/server.key')

    path to the client auth key to use. Please note this file must exist on the monitoring server, not on the server being monitored

  • prometheus_instance (Prometheus::Blackbox::Check::Instance) (defaults to: 'ops')

    prometheus instance to deploy to, defaults to 'ops'

  • alert_after (Pattern[/\d+[mh]/]) (defaults to: '2m')

    the time to wait between first expression hit and ProbeDown firing, defaults to 2m

  • site (Wmflib::Sites) (defaults to: $::site)
  • query_response (Prometheus::Blackbox::Query_response) (defaults to: undef)
  • probe_runbook (String[1]) (defaults to: 'https://wikitech.wikimedia.org/wiki/TLS/Runbook#{{ $labels.instance }}')

    the runbook URL set as the 'runbook' annotation on the generated alerts


23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'modules/prometheus/manifests/blackbox/check/tcp.pp', line 23

define prometheus::blackbox::check::tcp (
    Stdlib::Fqdn                            $server_name             = $facts['networking']['hostname'],
    Stdlib::Fqdn                            $instance_label          = $facts['networking']['hostname'],
    Stdlib::IP::Address::V4::Nosubnet       $ip4                     = $facts['networking']['ip'],
    Stdlib::IP::Address::V6::Nosubnet       $ip6                     = $facts['networking']['ip6'],
    Array[Enum['ip4', 'ip6']]               $ip_families             = ['ip4', 'ip6'],
    String[1]                               $team                    = 'sre',
    Prometheus::Alert::Severity             $severity                = 'critical',
    Stdlib::Port                            $port                    = 443,
    Boolean                                 $force_tls               = false,
    Integer[1,120]                          $certificate_expiry_days = 10,
    Pattern[/\d+[ms]/]                      $timeout                 = '3s',
    Boolean                                 $use_client_auth         = false,
    # puppet agent certs exported in profile::prometheus::blackbox_exporter
    Stdlib::Unixpath                        $client_auth_cert        = '/etc/prometheus/ssl/cert.pem',
    Stdlib::Unixpath                        $client_auth_key         = '/etc/prometheus/ssl/server.key',
    Wmflib::Sites                           $site                    = $::site,  # lint:ignore:top_scope_facts
    Prometheus::Blackbox::Check::Instance   $prometheus_instance     = 'ops',
    Prometheus::Blackbox::Query_response    $query_response          = undef,
    Pattern[/\d+[mh]/]                      $alert_after             = '2m',
    String[1]                               $probe_runbook           = 'https://wikitech.wikimedia.org/wiki/TLS/Runbook#{{ $labels.instance }}',
) {
    # Exports three resources per check: a blackbox exporter module definition,
    # a Prometheus alerting rule file and a probe target file.

    # TLS is used when explicitly forced or when probing the standard HTTPS port.
    $use_tls = ($force_tls or $port == 443)

    # The resource title with every non-word character replaced by '_', so it
    # can be embedded in module names, file names and PromQL label matchers.
    $safe_title = $title.regsubst('\W', '_', 'G')
    $module_title = $safe_title
    $alert_title = "alerts_${safe_title}.yml"
    $target_file = "/srv/prometheus/${prometheus_instance}/targets/probes-custom_puppet-tcp.yaml"

    # Prefix shared by the tags of all exported resources. This honours the
    # $site parameter (which defaults to $::site) so that an explicit override
    # is reflected in the tags instead of being silently ignored.
    $tag_prefix = "prometheus::blackbox::check::tcp::${site}::${prometheus_instance}"

    $client_auth_config = $use_client_auth ? {
        false   => {},
        default => {'cert_file' => $client_auth_cert, 'key_file' => $client_auth_key},
    }
    $tls_config = $use_tls ? {
        false   => {},
        default => {'server_name' => $server_name} + $client_auth_config,
    }

    # Settings common to every address family. The filter keeps Boolean values
    # (including false) and drops any other entry that is undef or empty, so an
    # unset $query_response or an empty tls_config is omitted from the output.
    # NOTE(review): assumes Prometheus::Blackbox::Query_response follows the
    # blackbox exporter's tcp 'query_response' schema -- confirm against the type.
    $tcp_module_params = {
        'ip_protocol_fallback' => false,
        'tls'                  => $use_tls,
        'tls_config'           => $tls_config,
        'query_response'       => $query_response,
    }.filter |$key, $value| { $value =~ Boolean or ($value =~ NotUndef and !$value.empty) }

    # One exporter module per requested address family, named
    # tcp_<safe_title>_<family>.
    $module_config = {
        'modules' => Hash($ip_families.map |$family| {
            [ "tcp_${safe_title}_${family}",
              {
                  'prober'  => 'tcp',
                  'timeout' => $timeout,
                  'tcp'     => $tcp_module_params + { 'preferred_ip_protocol' => $family },
              }
            ]
        }),
    }

    # One target entry per requested address family, pointing at the matching
    # module and the address of that family.
    $target_config = $ip_families.map |$family| {
        $address = $family ? {
            'ip4'   => $ip4,
            default => $ip6,
        }
        $data = {
            'labels' => {
                'address' => $address,
                'family'  => $family,
                'module'  => "tcp_${safe_title}_${family}",
            },
            'targets' => ["${instance_label}:${port}@[${address}]:${port}"],
        }
        $data
    }

    # Suffix appended to the ProbeDown summary when the severity is 'page'.
    $page_text = $severity ? {
        'page'  => ' #page',
        default => '',
    }

    # Deploy similar (but same alert name, so deduplication works) alerts to
    # the ones found in alerts.git/team-sre/probes.yaml. See also that file for more
    # information especially when making changes.
    # The difference here is the customisation in terms of team/severity and which exporter module to "hook" into

    # The certificate expiry alert only exists when the probe uses TLS.
    if $use_tls {
        $tls_alert = {
            'name'  => 'ssl_expire',
            'rules' => [{
                'alert'       => 'CertAlmostExpired',
                'expr'        => "probe_ssl_earliest_cert_expiry{module=~'tcp_${safe_title}_.*'} - time() < (${certificate_expiry_days} * 86400)",
                'for'         => '3h',
                'labels'      => {
                    'team'     => $team,
                    'severity' => $severity,
                },
                'annotations' => {
                    'description' => 'The certificate presented by service {{ $labels.instance }} is going to expire in {{ $value | humanizeDuration }}',
                    'summary'     => 'Certificate for service {{ $labels.instance }} is about to expire',
                    'dashboard'   => 'https://grafana.wikimedia.org/d/K1dRhGCnz/probes-tls-dashboard',
                    'runbook'     => $probe_runbook,
                },
            }],
        }
    } else {
        $tls_alert = undef
    }

    $alert_config = {
        'groups' => [
          $tls_alert,
          {
            'name'  => 'puppet_probes',
            'rules' => [{
                'alert'       => 'ProbeDown',
                'expr'        => "avg_over_time(probe_success{module=~'tcp_${safe_title}_.*'}[1m]) * 100 < 75",
                'for'         => $alert_after,
                'labels'      => {
                    'team'     => $team,
                    'severity' => $severity,
                },
                'annotations' => {
                    'description' => '{{ $labels.instance }} failed when probed by {{ $labels.module }} from {{ $externalLabels.site }}. Availability is {{ $value }}%.',
                    'summary'     => "Service {{ \$labels.instance }} has failed probes ({{ \$labels.module }})${page_text}",
                    'dashboard'   => 'https://grafana.wikimedia.org/d/O0nHhdhnz/network-probes-overview?var-job={{ $labels.job }}&var-module=All',
                    'logs'        => 'https://logstash.wikimedia.org/app/dashboards#/view/f3e709c0-a5f8-11ec-bf8e-43f1807d5bc2?_g=(filters:!((query:(match_phrase:(service.name:{{ $labels.module }})))))',
                    'runbook'     => $probe_runbook,
                },
            }],
          },
        # Drop the TLS group placeholder when the probe does not use TLS.
        ].filter |$alert| { $alert != undef },
    }

    $module_params = {
        'content' => $module_config.wmflib::to_yaml,
        'tag'     => "${tag_prefix}::module",
    }
    $alert_rule_params = {
        'instance' => $prometheus_instance,
        'content'  => $alert_config.wmflib::to_yaml,
        'tag'      => "${tag_prefix}::alert",
    }
    $target_frag_params = {
        'ensure'  => 'file',
        'content' => $target_config.wmflib::to_yaml,
        'tag'     => "${tag_prefix}::target",
    }

    wmflib::resource::export('prometheus::blackbox::module', $module_title, $title, $module_params)
    wmflib::resource::export('prometheus::rule', $alert_title, $title, $alert_rule_params)
    wmflib::resource::export('file', $target_file, $title, $target_frag_params)
}