Defined Type: profile::kafka::mirror::alerts

Defined in:
modules/profile/manifests/kafka/mirror/alerts.pp

Overview

SPDX-License-Identifier: Apache-2.0

Define profile::kafka::mirror::alerts

Installs check_prometheus jobs to alert for MirrorMaker throughput and dropped messages.

Dropped messages will generate a warning at greater than 100 and critical at greater than 1000 dropped messages in the last $monitoring_period.

mirror_name

This must match the title of a declared confluent::kafka::mirror::instance. Default: $title

topic_blacklist

Regex of topics to exclude from lag monitoring. Default: undef

monitoring_period

Prometheus range period to monitor. Default: 30m.

warning_throughput

Alert warning if average consume or produce throughput (msgs/sec) drops below this. Default: 100

critical_throughput

Alert critical if average consume or produce throughput (msgs/sec) drops below this. Default: 0

warning_lag

Alert warning if max consumer lag in the last 10 minutes is above this. Default: 10000

critical_lag

Alert critical if max consumer lag in the last 10 minutes is above this. Default: 100000

contact_group

Default: admins

nagios_critical

Default: false

prometheus_url

Prometheus URL endpoint containing metrics for MirrorMaker. Default: "http://prometheus.svc.${::site}.wmnet/ops"

source_prometheus_url

Prometheus URL endpoint containing metrics for the source Kafka cluster, including lag metrics from burrow, etc. Default: "http://prometheus.svc.${::site}.wmnet/ops"

Parameters:

  • mirror_name (Any) (defaults to: $title)
  • topic_blacklist (Any) (defaults to: undef)
  • monitoring_period (Any) (defaults to: '30m')
  • warning_throughput (Any) (defaults to: 100)
  • critical_throughput (Any) (defaults to: 0)
  • warning_lag (Any) (defaults to: 10000)
  • critical_lag (Any) (defaults to: 100000)
  • contact_group (Any) (defaults to: 'admins')
  • nagios_critical (Any) (defaults to: false)
  • prometheus_url (Any) (defaults to: "http://prometheus.svc.${::site}.wmnet/ops")
  • source_prometheus_url (Any) (defaults to: "http://prometheus.svc.${::site}.wmnet/ops")


50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'modules/profile/manifests/kafka/mirror/alerts.pp', line 50

define profile::kafka::mirror::alerts(
    $mirror_name           = $title,
    $topic_blacklist       = undef,
    $monitoring_period     = '30m',
    $warning_throughput    = 100,
    $critical_throughput   = 0,
    $warning_lag           = 10000,
    $critical_lag          = 100000,
    $contact_group         = 'admins',
    $nagios_critical       = false,
    $prometheus_url        = "http://prometheus.svc.${::site}.wmnet/ops",
    $source_prometheus_url = "http://prometheus.svc.${::site}.wmnet/ops",
) {
    # Declares four monitoring::check_prometheus resources for one MirrorMaker
    # instance: consume rate, produce rate, dropped messages and max consumer lag.
    # The first three query $prometheus_url; the lag check queries
    # $source_prometheus_url.

    # Extract grafana datasources from the prometheus URLs for the dashboard url.
    # A URL like http://prometheus.svc.<site>.wmnet/<instance> becomes
    # '<site> prometheus/<instance>'; a URL that does not match the pattern is
    # passed through unchanged by regsubst.
    $grafana_datasource     = regsubst($prometheus_url,        '^.+prometheus\.svc\.(.+)\.wmnet/(.+)$', '\1 prometheus/\2')
    $grafana_lag_datasource = regsubst($source_prometheus_url, '^.+prometheus\.svc\.(.+)\.wmnet/(.+)$', '\1 prometheus/\2')
    $dashboard_url          = "https://grafana.wikimedia.org/d/000000521/kafka-mirrormaker?var-datasource=${grafana_datasource}&var-lag_datasource=${grafana_lag_datasource}&var-mirror_name=${mirror_name}"

    # Set check_prometheus defaults.
    # These are throughput-oriented ('le' with the throughput thresholds); checks
    # that alert on a value being too high override method/warning/critical below.
    Monitoring::Check_prometheus {
        # Most metrics are for MirrorMaker, so default to its $prometheus_url.
        prometheus_url  => $prometheus_url,
        method          => 'le',
        warning         => $warning_throughput,
        critical        => $critical_throughput,
        nagios_critical => $nagios_critical,
        contact_group   => $contact_group,
        dashboard_links => [$dashboard_url],
    }

    monitoring::check_prometheus { "kafka-mirror-${mirror_name}-consume_rate":
        description => "Kafka MirrorMaker ${mirror_name} average message consume rate in last ${monitoring_period}",
        query       => "scalar(sum(avg_over_time(kafka_consumer_consumer_fetch_manager_metrics_all_topics_records_consumed_rate{mirror_name=\"${mirror_name}\"} [${monitoring_period}])))",
        notes_link  => 'https://wikitech.wikimedia.org/wiki/Kafka/Administration#MirrorMaker',
    }

    monitoring::check_prometheus { "kafka-mirror-${mirror_name}-produce_rate":
        description => "Kafka MirrorMaker ${mirror_name} average message produce rate in last ${monitoring_period}",
        query       => "scalar(sum(avg_over_time(kafka_producer_producer_metrics_record_send_rate{mirror_name=\"${mirror_name}\"} [${monitoring_period}])))",
        notes_link  => 'https://wikitech.wikimedia.org/wiki/Kafka/Administration',
    }

    monitoring::check_prometheus { "kafka-mirror-${mirror_name}-dropped_messages":
        description => "Kafka MirrorMaker ${mirror_name} dropped message count in last ${monitoring_period}",
        query       => "scalar(sum(increase(kafka_tools_MirrorMaker_MirrorMaker_numDroppedMessages{mirror_name=\"${mirror_name}\"} [${monitoring_period}])))",
        method      => 'gt',
        # numDroppedMessages here doesn't really mean that messages were lost.
        # abort.on.send.failure defaults to true, so any MirrorMaker process that encounters
        # this will die before committing the offset for any dropped messages.  This will
        # cause these messages to be reconsumed and produced again by another MirrorMaker process.
        # https://github.com/apache/kafka/blob/trunk/core/src/main/scala/kafka/tools/MirrorMaker.scala#L741-L747
        # We alert on this, but are lenient about them.
        warning     => 100,
        critical    => 1000,
        notes_link  => 'https://wikitech.wikimedia.org/wiki/Kafka/Administration',
    }

    # Alert on max consumer lag in last $lag_check_period minutes.
    #
    # The change-prop topics are currently not replicated but due to previous tests,
    # the commits/offsets registered for those within the mirror maker consumer
    # group were not deleted from Kafka. They still end up in the Burrow's metrics
    # for the mirror maker consumer group, showing a constant lag that triggers the alarm.
    $lag_check_period = '10'

    # Only add the topic exclusion matcher when a blacklist regex was actually
    # supplied. The variable (not a bareword, which is always truthy) must be
    # tested, and an empty string is treated the same as undef.
    # NOTE(review): the '\\!' presumably escapes '!' for the monitoring command
    # line -- confirm before changing it.
    if $topic_blacklist and $topic_blacklist != '' {
        $cgroup_lag_query = "scalar(max(max_over_time(kafka_burrow_partition_lag{group=\"kafka-mirror-${mirror_name}\",topic\\!~\"${topic_blacklist}\"} [${lag_check_period}m])))"
    } else {
        $cgroup_lag_query = "scalar(max(max_over_time(kafka_burrow_partition_lag{group=\"kafka-mirror-${mirror_name}\"} [${lag_check_period}m])))"
    }
    monitoring::check_prometheus { "kafka-mirror-${mirror_name}-consumer_max_lag":
        description    => "Kafka MirrorMaker ${mirror_name} max lag in last ${lag_check_period} minutes",
        # This metric does not have the mirror_name label, so we target it in the group instead.
        query          => $cgroup_lag_query,
        method         => 'gt',
        warning        => $warning_lag,
        critical       => $critical_lag,
        retry_interval => 10,
        retries        => 3,
        # Lag metrics live with the source cluster, not with MirrorMaker.
        prometheus_url => $source_prometheus_url,
        notes_link     => 'https://wikitech.wikimedia.org/wiki/Kafka/Administration',
    }
}