Puppet Class: profile::graphite::alerts

Defined in:
modules/profile/manifests/graphite/alerts.pp

Overview

Class: profile::graphite::alerts

Installs Icinga alerts on Graphite metrics. NOTE: this class should be included from only one host; Icinga generates a separate copy of these alerts for every host that includes it (see the usage sketch below).

Parameters:

  • graphite_url (Any) (defaults to: hiera('graphite_url'))

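A minimal usage sketch, assuming a hypothetical role class name (per the note above, the profile should be included from exactly one host):

# Hypothetical role; include the profile from a single role/host only,
# otherwise Icinga defines a duplicate set of these alerts per host.
class role::graphite::alerting {
    include ::profile::graphite::alerts
}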

# File 'modules/profile/manifests/graphite/alerts.pp', line 7

class profile::graphite::alerts($graphite_url = hiera('graphite_url')) {

    include ::graphite::monitoring::graphite

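    # Resource default: every monitoring::graphite_threshold declared in
    # this class inherits this graphite_url unless it overrides it.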
    Monitoring::Graphite_threshold {
        graphite_url => $graphite_url
    }

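    # Kafka cluster configuration for the 'analytics' cluster, looked up
    # via the kafka_config() function (unused in the alerts shown here).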
    $kafka_config = kafka_config('analytics')

    # Eventlogging
    #   Warn/alert if the DB insert rate of EventLogging data drops dramatically.
    #   Since the MySQL consumer sits at the bottom of the pipeline, this
    #   metric is a good proxy for whether events are flowing through the
    #   Kafka pipeline.
    monitoring::graphite_threshold { 'eventlogging_overall_inserted_rate':
        description     => 'EventLogging overall insertion rate from MySQL consumer',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/eventlogging?panelId=12&fullscreen&orgId=1'],
        metric          => 'movingAverage(eventlogging.overall.inserted.rate, "10min")',
        warning         => 50,
        critical        => 10,
        percentage      => 20, # At least 3 of the (25 - 10) = 15 readings
        from            => '25min',
        until           => '10min',
        contact_group   => 'analytics',
        under           => true,
    }

    # Monitor memcached error rate from MediaWiki. This is commonly a sign of
    # a failing nutcracker instance that can be tracked down via
    # https://logstash.wikimedia.org/#/dashboard/elasticsearch/memcached
    monitoring::graphite_threshold { 'mediawiki-memcached-threshold':
        description     => 'MediaWiki memcached error rate',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/mediawiki-graphite-alerts?orgId=1&panelId=1&fullscreen'],
        metric          => 'transformNull(logstash.rate.mediawiki.memcached.ERROR.sum, 0)',
        # Nominal error rate in production is <150/min
        warning         => 1000,
        critical        => 5000,
        from            => '5min',
        percentage      => 40,
    }

    # Monitor MediaWiki fatals and exceptions.
    monitoring::graphite_threshold { 'mediawiki_error_rate':
        description     => 'MediaWiki exceptions and fatals per minute',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/mediawiki-graphite-alerts?orgId=1&panelId=2&fullscreen'],
        metric          => 'transformNull(sumSeries(logstash.rate.mediawiki.fatal.ERROR.sum, logstash.rate.mediawiki.exception.ERROR.sum), 0)',
        warning         => 25,
        critical        => 50,
        from            => '10min',
        percentage      => 70,
    }

    # Monitor MediaWiki session failures
    # See https://grafana.wikimedia.org/dashboard/db/edit-count
    monitoring::graphite_threshold { 'mediawiki_session_loss':
        description     => 'MediaWiki edit session loss',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/edit-count?panelId=13&fullscreen&orgId=1'],
        metric          => 'transformNull(scale(consolidateBy(MediaWiki.edit.failures.session_loss.rate, "max"), 60), 0)',
        warning         => 10,
        critical        => 50,
        from            => '15min',
        percentage      => 30,
    }

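    # Monitor MediaWiki edit failures due to bad tokens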
    monitoring::graphite_threshold { 'mediawiki_bad_token':
        description     => 'MediaWiki edit failure due to bad token',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/edit-count?panelId=13&fullscreen&orgId=1'],
        metric          => 'transformNull(scale(consolidateBy(MediaWiki.edit.failures.bad_token.rate, "max"), 60), 0)',
        warning         => 10,
        critical        => 50,
        from            => '15min',
        percentage      => 30,
    }

    # Monitor MediaWiki CentralAuth central login errors
    monitoring::graphite_threshold { 'mediawiki_centralauth_errors':
        description     => 'MediaWiki centralauth errors',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/mediawiki-graphite-alerts?orgId=1&panelId=3&fullscreen'],
        metric          => 'transformNull(sumSeries(MediaWiki.centralauth.centrallogin_errors.*.rate), 0)',
        warning         => 0.5,
        critical        => 1,
        from            => '15min',
        percentage      => 30,
    }

    # Monitor EventBus 4xx and 5xx HTTP response rate.
    monitoring::graphite_threshold { 'eventbus_http_error_rate':
        description     => 'EventBus HTTP Error Rate (4xx + 5xx)',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/eventbus?panelId=1&fullscreen&orgId=1'],
        metric          => 'transformNull(sumSeries(eventbus.counters.eventlogging.service.EventHandler.POST.[45]*.rate))',
        # If > 50% of the datapoints over the last 10 minutes are over the thresholds, alert.
        warning         => 1,
        critical        => 10,
        from            => '10min',
        percentage      => 50,
    }

    # Percent difference in the number of thumbs between eqiad and codfw
    monitoring::graphite_threshold { 'swift_thumbs_eqiad_codfw_diff':
        description     => 'Number of thumbnails in eqiad greater than codfw',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/file/swift.json'],
        metric          => 'keepLastValue(divideSeries(swift.eqiad-prod.containers.mw-media.thumb.objects, swift.codfw-prod.containers.mw-media.thumb.objects))',
        warning         => 1.02,
        critical        => 1.05,
        from            => '90min',
        percentage      => 80,
        check_interval  => 30,
    }
    monitoring::graphite_threshold { 'swift_thumbs_codfw_eqiad_diff':
        description     => 'Number of thumbnails in codfw greater than eqiad',
        dashboard_links => ['https://grafana.wikimedia.org/dashboard/file/swift.json'],
        metric          => 'keepLastValue(divideSeries(swift.codfw-prod.containers.mw-media.thumb.objects, swift.eqiad-prod.containers.mw-media.thumb.objects))',
        warning         => 1.02,
        critical        => 1.05,
        from            => '90min',
        percentage      => 80,
        check_interval  => 30,
    }
}
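
For reference, a hedged reading of the recurring monitoring::graphite_threshold parameters, shown on a hypothetical alert. The title, metric path, and contact group below are illustrative only, and the semantics are inferred from the inline comments in the file above; the authoritative behaviour is defined by the monitoring::graphite_threshold defined type itself.

monitoring::graphite_threshold { 'example_threshold':          # hypothetical alert name
    description    => 'Illustration of the common parameters',
    metric         => 'transformNull(example.metric.rate, 0)', # hypothetical Graphite target
    warning        => 10,       # WARNING once the value crosses 10
    critical       => 50,       # CRITICAL once the value crosses 50
    from           => '25min',  # inspect datapoints starting 25 minutes ago...
    until          => '10min',  # ...up to 10 minutes ago (cf. the eventlogging alert)
    percentage     => 30,       # share of datapoints in the window that must breach before alerting
    under          => false,    # when true, alert on values that fall BELOW the thresholds instead
    check_interval => 30,       # how often the check runs (cf. the swift alerts above)
    contact_group  => 'admins', # hypothetical Icinga contact group
}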