Puppet Class: profile::mediawiki::alerts

Defined in:
modules/profile/manifests/mediawiki/alerts.pp

Overview

Class: profile::mediawiki::alerts

Installs Icinga alerts based on Prometheus metrics. NOTE: include this class from only one host; Icinga generates a separate set of alerts for every host that includes this class.



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'modules/profile/manifests/mediawiki/alerts.pp', line 7

class profile::mediawiki::alerts {
  # Per-site, per-cluster checks. Each one queries the Prometheus instance of
  # the site being iterated over, not the one local to the including host.
  ['eqiad', 'codfw'].each |String $dc| {
    $prometheus_url = "http://prometheus.svc.${dc}.wmnet/ops"

    ['appserver', 'api_appserver'].each |String $group| {
      # Pieces of the application-servers RED dashboard link that are common
      # to every check declared for this site/cluster pair.
      $dashboard_base = 'https://grafana.wikimedia.org/d/RIA1lzDZk/application-servers-red-dashboard'
      $dashboard_args = "fullscreen&orgId=1&from=now-3h&to=now&var-datasource=${dc} prometheus/ops&var-cluster=${group}"

      # Latency thresholds (in seconds) keyed by HTTP method. The suffix is
      # the last component of the resource title.
      $latency_thresholds = {
        'GET'  => { 'title_suffix' => 'get',  'warning' => 0.35, 'critical' => 0.4 },
        'POST' => { 'title_suffix' => 'post', 'warning' => 1.4,  'critical' => 2.0 },
      }

      $latency_thresholds.each |String $verb, Hash $limits| {
        $suffix = $limits['title_suffix']

        monitoring::check_prometheus { "mediawiki_http_requests_${group}_${dc}_${suffix}":
          description     => "High average ${verb} latency for mw requests on ${group} in ${dc}",
          # The trailing "> 0" comparison filters out NaN values.
          query           => "cluster_code_method_handler:mediawiki_http_requests_duration:avg2m{cluster=\"${group}\",method=\"${verb}\",code=~\"2..\"} > 0",
          prometheus_url  => $prometheus_url,
          retries         => 2,
          method          => 'gt',
          warning         => $limits['warning'],
          critical        => $limits['critical'],
          dashboard_links => ["${dashboard_base}?panelId=9&${dashboard_args}&var-method=${verb}"],
        }
      }

      # The query yields the ratio idle / total of PHP-FPM workers, and the
      # check fires when that ratio drops below the thresholds.
      monitoring::check_prometheus { "mediawiki_workers_saturation_${group}_${dc}":
        description     => "Not enough idle PHP-FPM workers for Mediawiki ${group} at ${dc}",
        query           => "sum (phpfpm_statustext_processes{cluster=\"${group}\",state=\"idle\"}) / sum (phpfpm_statustext_processes{cluster=\"${group}\"})",
        prometheus_url  => $prometheus_url,
        retries         => 2,
        method          => 'lt',
        warning         => 0.5,
        critical        => 0.3,
        notes_link      => 'https://bit.ly/wmf-fpmsat',
        dashboard_links => ["${dashboard_base}?panelId=54&${dashboard_args}"],
        nagios_critical => true,
      }
    }
  }

  ### Logstash-based MediaWiki alerts. No iteration over sites is needed here:
  ### Logstash in codfw/eqiad reads logs from every other site's kafka, so the
  ### metrics derived from it are effectively global and icinga in each site
  ### only has to query its local prometheus instance.
  monitoring::check_prometheus {
    # Attributes shared by both Logstash-based checks.
    default:
      prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops",
      retries        => 2,
      method         => 'gt',
    ;

    # Memcached error rate as seen from MediaWiki. A high rate is commonly a
    # sign of a failing nutcracker instance, which can be tracked down via
    # https://logstash.wikimedia.org/#/dashboard/elasticsearch/memcached
    # The nominal error rate in production is <150/min.
    'mediawiki-memcached-threshold':
      description     => 'MediaWiki memcached error rate',
      query           => 'sum(rate(logstash_mediawiki_events_total{channel="memcached", level="ERROR"}[4m])) * 60',
      warning         => 1000,
      critical        => 5000,
      notes_link      => 'https://wikitech.wikimedia.org/wiki/Memcached',
      dashboard_links => ["https://grafana.wikimedia.org/d/000000438/mediawiki-alerts?panelId=1&fullscreen&orgId=1&var-datasource=${::site} prometheus/ops"],
    ;

    # MediaWiki fatals and exceptions.
    'mediawiki-error-rate':
      description     => 'MediaWiki exceptions and fatals per minute',
      query           => 'sum(rate(logstash_mediawiki_events_total{channel=~"(fatal|exception)",level="ERROR"}[4m])) without (channel, instance) * 60',
      warning         => 50,
      critical        => 100,
      notes_link      => 'https://wikitech.wikimedia.org/wiki/Application_servers',
      dashboard_links => ["https://grafana.wikimedia.org/d/000000438/mediawiki-alerts?panelId=2&fullscreen&orgId=1&var-datasource=${::site} prometheus/ops"],
    ;
  }
}