Defined Type: monitoring::alerts::traffic_drop

Defined in:
modules/monitoring/manifests/alerts/traffic_drop.pp

Overview

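Computes the current rate of Varnish GET requests to the text cluster as a percentage of the rate 30 minutes ago. Thresholds are compared with the `le` method: warning at 70, critical at 60 (i.e. when traffic has fallen to 70% / 60% of its earlier level, not when it has dropped by that much).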

Parameters:

  • site (Any)


4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'modules/monitoring/manifests/alerts/traffic_drop.pp', line 4

# @summary Declares a Prometheus check comparing current varnish-text GET traffic with 30 minutes ago.
#
# The check value is the current GET request rate expressed as a percentage of
# the rate 30 minutes earlier, restricted to the given site.
#
# @param site
#   Value matched against the `site` label of the Varnish request-rate series;
#   also interpolated into the check description.
define monitoring::alerts::traffic_drop(
  $site,
  ) {
    # Current GET request rate for the varnish-text job at this site (numerator).
    $rate_now = "sum(job_method_status:varnish_requests:rate5m{method=\"GET\",job=\"varnish-text\", site=\"${site}\"})"

    # The same rate as it was 30 minutes ago (denominator).
    $rate_before = "sum(job_method_status:varnish_requests:rate5m{method=\"GET\",job=\"varnish-text\", site=\"${site}\"} offset 30m)"

    # Absolute-traffic guard: a bare ratio is very sensitive to small variations
    # when absolute traffic is low, so the ratio is only kept when the rate
    # 30 minutes ago was above 15000.
    # Kept as its own string because its matcher list is spelled without the
    # space before `site`, and the emitted query must stay byte-identical.
    $rate_guard = "sum(job_method_status:varnish_requests:rate5m{method=\"GET\",job=\"varnish-text\",site=\"${site}\"} offset 30m)"

    monitoring::check_prometheus { $title:
        description     => "Varnish traffic drop between 30min ago and now at ${site}",
        query           => "${rate_now} * 100 / ${rate_before} and ${rate_guard} > 15000",
        prometheus_url  => 'http://prometheus.svc.eqiad.wmnet/global',
        # NOTE(review): 'le' presumably trips a threshold when the value is
        # less than or equal to it -- confirm against monitoring::check_prometheus.
        method          => 'le',
        warning         => 70,
        critical        => 60,
        retries         => 2,
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/varnish-http-requests?panelId=6&fullscreen&orgId=1'],
        notes_link      => 'https://wikitech.wikimedia.org/wiki/Varnish#Diagnosing_Varnish_alerts',
    }
}