Defined Type: profile::trafficserver::monitoring

Defined in:
modules/profile/manifests/trafficserver/monitoring.pp

Overview

Parameters:

  • paths (Trafficserver::Paths)
  • port (Stdlib::Port)
  • prometheus_exporter_port (Wmflib::UserIpPort)
  • inbound_tls (Optional[Trafficserver::Inbound_TLS_settings]) (defaults to: undef)
  • default_instance (Boolean) (defaults to: false)
  • acme_chief (Boolean) (defaults to: false)
  • disable_config_check (Boolean) (defaults to: false)
  • instance_name (String) (defaults to: 'backend')
  • user (String) (defaults to: 'trafficserver')


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'modules/profile/manifests/trafficserver/monitoring.pp', line 1

# @summary Declares the monitoring resources for one Apache Traffic Server instance:
#   a Prometheus exporter, process checks, an HTTP(S) check of the stats endpoint,
#   a configuration status check, optional TLS certificate / OCSP freshness checks
#   and a Prometheus-based restart-count alert.
#
# @param paths
#   Path settings of the instance. The keys 'records', 'prefix' and 'bindir' are read here.
# @param port
#   Port used to build the local stats endpoint URL, the `--httpport` argument expected
#   in the traffic_server process check, and the HTTP/TLS check commands.
# @param prometheus_exporter_port
#   Passed as `listen_port` to the exporter and expected as `--port` in the exporter
#   process check.
# @param inbound_tls
#   When set, the stats endpoint and the HTTP check use https, one check per
#   certificate and algorithm is declared, and OCSP freshness checks are added
#   when its 'do_ocsp' entry equals 1.
# @param default_instance
#   Selects the process-check command lines without `--run-root` and passes only the
#   records path to the configuration status check.
# @param acme_chief
#   Controls whether the acme-chief OCSP freshness check is present or absent.
# @param disable_config_check
#   When true the configuration status check is declared with ensure => absent.
# @param instance_name
#   Used in resource titles, descriptions, the Prometheus `layer` label, and to
#   reference the Trafficserver::Instance resource the checks require.
# @param user
#   Passed as `sudo_user` to the configuration status check script.
define profile::trafficserver::monitoring(
    Trafficserver::Paths $paths,
    Stdlib::Port $port,
    Wmflib::UserIpPort $prometheus_exporter_port,
    Optional[Trafficserver::Inbound_TLS_settings] $inbound_tls = undef,
    Boolean $default_instance = false,
    Boolean $acme_chief = false,
    Boolean $disable_config_check = false,
    String $instance_name = 'backend',
    String $user = 'trafficserver',
){
    # Values shared by several of the resources declared below.
    $ats_notes_url = 'https://wikitech.wikimedia.org/wiki/Apache_Traffic_Server'
    $instance_ref = Trafficserver::Instance[$instance_name]
    $exporter_title = "trafficserver_exporter_${instance_name}"
    $exporter_ref = Prometheus::Trafficserver_exporter[$exporter_title]

    # The stats endpoint and its check command only differ by the URL scheme.
    if $inbound_tls {
        $scheme = 'https'
    } else {
        $scheme = 'http'
    }
    $endpoint = "${scheme}://127.0.0.1:${port}/_stats"
    $traffic_manager_http_check = "check_${scheme}_hostheader_port_url"

    # Expected process command lines differ between the default instance and
    # instances started with an explicit run-root.
    $traffic_manager_nrpe_command = $default_instance ? {
        true    => '/usr/lib/nagios/plugins/check_procs -c 1:1 -a "/usr/bin/traffic_manager --nosyslog"',
        default => "/usr/lib/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/traffic_manager --run-root=${paths['prefix']} --nosyslog'",
    }
    $traffic_server_nrpe_command = $default_instance ? {
        true    => "/usr/lib/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/traffic_server -M --httpport ${port}'",
        default => "/usr/lib/nagios/plugins/check_procs -c 1:1 -a '${paths['bindir']}/traffic_server -M --run-root=${paths['prefix']}/runroot.yaml --httpport ${port}'",
    }
    $check_trafficserver_config_status_args = $default_instance ? {
        true    => $paths['records'],
        default => "${paths['records']} ${paths['prefix']}",
    }

    $config_check_ensure = $disable_config_check ? {
        true    => 'absent',
        default => 'present',
    }

    prometheus::trafficserver_exporter { $exporter_title:
        instance_name          => $instance_name,
        endpoint               => $endpoint,
        listen_port            => $prometheus_exporter_port,
        verify_ssl_certificate => false,
        require                => $instance_ref,
    }

    # Process checks: traffic_manager, traffic_server and the exporter itself.
    nrpe::monitor_service {
        default:
            require   => $instance_ref,
            notes_url => $ats_notes_url;

        "traffic_manager_${instance_name}":
            description  => "Ensure traffic_manager is running for instance ${instance_name}",
            nrpe_command => $traffic_manager_nrpe_command;

        "traffic_server_${instance_name}":
            description  => "Ensure traffic_server is running for instance ${instance_name}",
            nrpe_command => $traffic_server_nrpe_command;

        "trafficserver_exporter_${instance_name}":
            description  => "Ensure trafficserver_exporter is running for instance ${instance_name}",
            nrpe_command => "/usr/lib/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/python3 /usr/bin/prometheus-trafficserver-exporter --no-procstats --no-ssl-verification --endpoint ${endpoint} --port ${prometheus_exporter_port}'",
            require      => $exporter_ref;
    }

    monitoring::service { "traffic_manager_${instance_name}_check_http":
        description   => "Ensure traffic_manager binds on ${port} and responds to HTTP requests",
        check_command => "${traffic_manager_http_check}!localhost!${port}!/_stats",
        require       => $exporter_ref,
        notes_url     => $ats_notes_url,
    }

    profile::trafficserver::nrpe_monitor_script { "check_trafficserver_${instance_name}_config_status":
        ensure    => $config_check_ensure,
        sudo_user => $user,
        checkname => 'check_trafficserver_config_status',
        args      => $check_trafficserver_config_status_args,
        require   => $instance_ref,
    }

    if $inbound_tls {
        # The OCSP setting is the same for every certificate, so the base check
        # command is picked once, outside the loop.
        $ocsp_enabled = $inbound_tls['do_ocsp'] == 1
        $tls_base_check = $ocsp_enabled ? {
            true    => 'check_ssl_ats_ocsp',
            default => 'check_ssl_ats',
        }

        $inbound_tls['certificates'].each |Trafficserver::TLS_certificate $certificate| {
            # Only certificates carrying all the data needed by the check are monitored.
            if $certificate['common_name'] and $certificate['sni'] and $certificate['warning_threshold'] and $certificate['critical_threshold'] {
                if $certificate['default'] {
                    $check_suffix = '_default'
                } else {
                    $check_suffix = ''
                }
                $tls_check = "${tls_base_check}${check_suffix}"
                $common_name = $certificate['common_name']
                $thresholds = "${certificate['warning_threshold']}!${certificate['critical_threshold']}"
                $sni_list = $certificate['sni'].join(',')

                # One check per certificate and key algorithm.
                ['ECDSA', 'RSA'].each |String $algorithm| {
                    monitoring::service { "trafficserver_${instance_name}_https_${common_name}_${algorithm}":
                        description   => "ats-${instance_name} HTTPS ${common_name} ${algorithm}",
                        check_command => "${tls_check}!${thresholds}!${common_name}!${sni_list}!${port}!${algorithm}",
                        notes_url     => 'https://wikitech.wikimedia.org/wiki/HTTPS',
                    }
                }
            }
        }

        if $ocsp_enabled {
            $fresh_files_plugin = '/usr/lib/nagios/plugins/check-fresh-files-in-dir.py'
            $ocsp_freshness_args = '-c 259500 -w 173100 -d /var/cache/ocsp -g "*.ocsp"'
            $ocsp_freshness_args_acme_chief = '-c 518400 -w 432000 -d /etc/acmecerts -g "*/live/*.ocsp"'
            $acme_chief_ensure = $acme_chief ? {
                true    => 'present',
                default => 'absent',
            }

            nrpe::monitor_service {
                default:
                    require   => File[$fresh_files_plugin],
                    notes_url => 'https://wikitech.wikimedia.org/wiki/HTTPS/Unified_Certificates';

                "trafficserver_${instance_name}_ocsp_freshness":
                    description  => 'Freshness of OCSP Stapling files (ATS-TLS)',
                    nrpe_command => "${fresh_files_plugin} ${ocsp_freshness_args}";

                "trafficserver_${instance_name}_ocsp_freshness_acme_chief":
                    ensure       => $acme_chief_ensure,
                    description  => 'Freshness of OCSP Stapling files (ATS-TLS acme-chief)',
                    nrpe_command => "${fresh_files_plugin} ${ocsp_freshness_args_acme_chief}";
            }
        }
    }

    $restart_count_labels = "instance=~\"${::hostname}:.*\",layer=\"${instance_name}\""
    $dashboard_params = "orgId=1&var-site=${::site} prometheus/ops&var-instance=${::hostname}&var-layer=${instance_name}"

    # A restart count of 1 is the normal state; both thresholds fire once it reaches 2.
    monitoring::check_prometheus { "trafficserver_${instance_name}_restart_count":
        description     => "traffic_server ${instance_name} process restarted",
        dashboard_links => ["https://grafana.wikimedia.org/d/6uhkG6OZk/ats-instance-drilldown?${dashboard_params}"],
        query           => "scalar(trafficserver_restart_count{${restart_count_labels}})",
        method          => 'ge',
        warning         => 2,
        critical        => 2,
        prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops",
        notes_link      => $ats_notes_url,
    }
}