Puppet Class: profile::graphite::alerts

Defined in:
modules/profile/manifests/graphite/alerts.pp

Overview

SPDX-License-Identifier: Apache-2.0

Class: profile::graphite::alerts

Install Icinga alerts on Graphite metrics. NOTE: include this class on only one host; Icinga generates a separate set of alerts for every host that includes it.

Parameters:

  • graphite_url (Stdlib::HTTPUrl) (defaults to: lookup('graphite_url'))


8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'modules/profile/manifests/graphite/alerts.pp', line 8

class profile::graphite::alerts(
    Stdlib::HTTPUrl $graphite_url = lookup('graphite_url')
) {

    # Pass the same Graphite endpoint on to the graphite::monitoring::graphite class.
    class { 'graphite::monitoring::graphite':
        graphite_url => $graphite_url,
    }

    # Graphite series feeding the account-creation check (T146090).
    $account_failures = 'MediaWiki.authmanager.accountcreation.*.failure.*.sum'
    $account_success = 'MediaWiki.authmanager.accountcreation.*.success.sum'

    monitoring::graphite_threshold {
        # Attributes shared by both checks declared in this expression.
        # A per-expression default is used (rather than a class-wide resource
        # default) so nothing outside this expression is affected.
        default:
            graphite_url => $graphite_url,
            from         => '15min',
            percentage   => 30,
            notes_link   => 'https://wikitech.wikimedia.org/wiki/Application_servers';

        # MediaWiki CentralAuth central-login failures: summed failure rate
        # across all matching series, with nulls treated as 0.
        'mediawiki_centralauth_errors':
            description     => 'MediaWiki centralauth errors',
            dashboard_links => ['https://grafana.wikimedia.org/d/000000438/mediawiki-alerts?panelId=3&fullscreen&orgId=1'],
            metric          => 'transformNull(sumSeries(MediaWiki.authmanager.centrallogin.*.failure.*.rate), 0)',
            warning         => 0.5,
            critical        => 1;

        # MediaWiki account creation: failures expressed as a percentage of
        # all attempts (successes + failures).
        'mediawiki_accountcreation_errors':
            description     => 'MediaWiki account creation errors',
            dashboard_links => ['https://grafana.wikimedia.org/d/000000438/mediawiki-exceptions-alerts?orgId=1&forceLogin&viewPanel=23'],
            metric          => "asPercent( sumSeries(${account_failures}), sumSeries(${account_success}, ${account_failures}) )",
            warning         => 90,
            critical        => 100;
    }
}