Defined Type: monitoring::alerts::kafka_topic_throughput

Defined in:
modules/monitoring/manifests/alerts/kafka_topic_throughput.pp

Overview

Define: monitoring::alerts::kafka_topic_throughput. To be declared on the monitoring host. This will alert if the message rate throughput of a topic (or the sum of several topics) on a Kafka cluster exceeds or drops below the given threshold.

Parameters:

kafka_cluster_name

Name of Kafka cluster in kafka_clusters hiera.

topic

Topic (regex allowed) to check throughput of. If multiple topics are matched, this will check the sum of their message rate.

warning

warning threshold

critical

critical threshold

method

Default: ge

period

Default: 15m

dashboard_links

Default: undef

prometheus_url

Default: "http://prometheus.svc.${::site}.wmnet/ops"

nagios_critical

Default: false

contact_group

Default: admins

ensure

Default: present

Parameters:

  • kafka_cluster_name (Any)
  • topic (Any)
  • warning (Any)
  • critical (Any)
  • method (Any) (defaults to: 'ge')
  • period (Any) (defaults to: '15m')
  • dashboard_links (Any) (defaults to: undef)
  • prometheus_url (Any) (defaults to: "http://prometheus.svc.${::site}.wmnet/ops")
  • nagios_critical (Any) (defaults to: false)
  • contact_group (Any) (defaults to: 'admins')
  • ensure (Wmflib::Ensure) (defaults to: present)


31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'modules/monitoring/manifests/alerts/kafka_topic_throughput.pp', line 31

define monitoring::alerts::kafka_topic_throughput (
    $kafka_cluster_name,
    $topic,
    $warning,
    $critical,
    $method                 = 'ge',
    $period                 = '15m',
    $dashboard_links        = undef,
    $prometheus_url         = "http://prometheus.svc.${::site}.wmnet/ops",
    $nagios_critical        = false,
    $contact_group          = 'admins',
    Wmflib::Ensure $ensure  = present,
) {
    # Alert if the message rate for the matched topics is outside of the given threshold.
    monitoring::check_prometheus { "kafka_topic_throughput_${title}":
        ensure          => $ensure,
        description     => "Kafka topic throughput alert for ${title} in cluster ${kafka_cluster_name} for topic(s) ${topic}.  Message rate should be ${method} (${warning}, ${critical}).",
        dashboard_links => $dashboard_links,
        # Examine the rate in the $quantile percentile over the last $period.
        query           => "scalar(sum(rate(kafka_server_BrokerTopicMetrics_MessagesIn_total{kafka_cluster=\"${kafka_cluster_name}\",topic=~\"${topic}\"}[${period}])))",
        method          => $method,
        warning         => $warning,
        critical        => $critical,
        prometheus_url  => $prometheus_url,
        nagios_critical => $nagios_critical,
        contact_group   => $contact_group,
    }
}