Puppet Class: icinga::monitor::elasticsearch::cirrus_cluster_checks

Defined in:
modules/icinga/manifests/monitor/elasticsearch/cirrus_cluster_checks.pp

Overview

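Class icinga::monitor::elasticsearch::cirrus_cluster_checks declares the Icinga hosts and checks for the CirrusSearch Elasticsearch clusters: per-site (eqiad, codfw) host, Elasticsearch and update-rate checks, plus global pool-counter-rejection and Saneitizer fix-rate checks.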

Parameters:

  • shard_size_warning (Integer)
  • shard_size_critical (Integer)
  • threshold (String)
  • timeout (Integer)


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'modules/icinga/manifests/monitor/elasticsearch/cirrus_cluster_checks.pp', line 2

# @summary Icinga hosts and checks for the CirrusSearch Elasticsearch clusters.
#
# For each site in $sites this declares a virtual monitoring::host for the
# search service name, the base_checks and cirrus_checks defined types against
# every port in $ports, and a Graphite threshold on the CirrusSearch update
# rate. Two more Graphite thresholds (pool counter rejections and Saneitizer
# fix rate) are declared once, outside the per-site loop.
#
# @param shard_size_warning
#   Passed through unchanged to icinga::monitor::elasticsearch::base_checks.
# @param shard_size_critical
#   Passed through unchanged to icinga::monitor::elasticsearch::base_checks.
# @param threshold
#   Passed through unchanged to icinga::monitor::elasticsearch::base_checks.
# @param timeout
#   Passed through unchanged to icinga::monitor::elasticsearch::base_checks.
#   NOTE(review): unit is not visible here - confirm against base_checks.
class icinga::monitor::elasticsearch::cirrus_cluster_checks(
    Integer $shard_size_warning,
    Integer $shard_size_critical,
    String $threshold,
    Integer $timeout,
){
    $ports = [9243, 9443, 9643]
    $sites = ['eqiad', 'codfw']
    $scheme = 'https'

    $sites.each |$datacenter| {
        $search_host = "search.svc.${datacenter}.wmnet"

        # The Icinga host for the search endpoint has to be declared here:
        # service::catalog used to generate it from its 'monitoring' section,
        # but that integration is now deprecated.
        # See also https://phabricator.wikimedia.org/T291946
        @monitoring::host { $search_host:
            group         => 'lvs',
            contact_group => 'admins',
            critical      => false,
            ip_address    => ipresolve($search_host, 4),
        }

        # Base Elasticsearch checks, using the thresholds given to this class.
        icinga::monitor::elasticsearch::base_checks { $search_host:
            host                => $search_host,
            ports               => $ports,
            scheme              => $scheme,
            threshold           => $threshold,
            timeout             => $timeout,
            shard_size_warning  => $shard_size_warning,
            shard_size_critical => $shard_size_critical,
        }

        # CirrusSearch-specific checks on the same endpoint and ports.
        icinga::monitor::elasticsearch::cirrus_checks { $search_host:
            host   => $search_host,
            ports  => $ports,
            scheme => $scheme,
        }

        # Update rate averaged over the last 60 minutes. A shorter window would
        # be preferable, but T224425 makes a shorter window too noisy.
        # FIXME: reduce moving average to 10 minutes once T224425 is fixed.
        monitoring::graphite_threshold { "mediawiki_cirrus_update_rate_${datacenter}":
            description     => "MediaWiki CirrusSearch update rate - ${datacenter}",
            host            => $search_host,
            metric          => "movingAverage(transformNull(MediaWiki.CirrusSearch.${datacenter}.updates.all.sent.rate),\"60minutes\")",
            # NOTE(review): presumably 'under' makes the check fire when the
            # value drops below the thresholds - confirm in graphite_threshold.
            under           => true,
            warning         => 80,
            critical        => 50,
            contact_group   => 'admins,team-discovery',
            dashboard_links => ['https://grafana.wikimedia.org/d/JLK3I_siz/elasticsearch-indexing?panelId=44&fullscreen&orgId=1'],
            notes_link      => 'https://wikitech.wikimedia.org/wiki/Search#No_updates',
        }
    }

    # Pool counter rejections ("Search is currently too busy") - T262694
    monitoring::graphite_threshold { 'mediawiki_cirrus_pool_counter_rejections_rate':
        description     => 'MediaWiki CirrusSearch pool counter rejections rate',
        metric          => 'aliasByNode(sum(movingAverage(consolidateBy(transformNull(MediaWiki.CirrusSearch.poolCounter.*.failureMs.sample_rate, 0), "max"), "5minutes")), 1, 2)',
        warning         => 500,
        critical        => 1000,
        contact_group   => 'admins,team-discovery',
        dashboard_links => ['https://grafana.wikimedia.org/d/qrOStmdGk/elasticsearch-pool-counters?viewPanel=4&orgId=1'],
        notes_link      => 'https://wikitech.wikimedia.org/wiki/Search#Pool_Counter_rejections_(search_is_currently_too_busy)',
    }

    # Saneitizer (background repair process) fixing an unusually large number
    # of documents, summed per week - T295365
    monitoring::graphite_threshold { 'mediawiki_cirrussearch_indices_high_fix_rate':
        description     => 'MediaWiki CirrusSearch Saneitizer Weekly Fix Rate',
        metric          => 'smartSummarize(transformNull(MediaWiki.CirrusSearch.{eqiad,codfw,cloudelastic}.sanitization.fixed.sum, 0), "1wk", "sum")',
        warning         => 100000,
        critical        => 250000,
        contact_group   => 'admins,team-discovery',
        dashboard_links => ['https://grafana.wikimedia.org/d/JLK3I_siz/elasticsearch-indexing?viewPanel=35&orgId=1&from=now-6M&to=now'],
        notes_link      => 'https://wikitech.wikimedia.org/wiki/Search#Saneitizer_(background_repair_process)',
    }
}