33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
# File 'modules/profile/manifests/monitoring.pp', line 33
# @summary Declares the monitoring and nrpe classes, a set of NRPE plugins, the
#   disk space check and, depending on facts, hardware-related classes.
#
# @param hardware_monitoring
#   Passed as `ensure` to ipmi::monitor. The raid class is only declared when
#   this equals 'present' and $raid_check is true.
# @param contact_group
#   Passed through to the monitoring class.
# @param cluster
#   Passed through to the monitoring class.
# @param is_critical
#   Not referenced anywhere in this class body.
# @param nrpe_check_disk_options
#   Appended to the check_disk command line of the disk_space check.
# @param nrpe_check_disk_critical
#   Used as the `critical` value of the disk_space check.
# @param raid_check
#   Whether the raid class may be declared (see $hardware_monitoring).
# @param raid_check_interval
#   Passed to the raid class as `check_interval`.
# @param raid_retry_interval
#   Passed to the raid class as `retry_interval`.
# @param notifications_enabled
#   Passed through to the monitoring class.
# @param do_paging
#   Passed through to the monitoring class.
# @param nagios_group
#   Passed through to the monitoring class.
# @param services
#   Passed through to the monitoring class.
# @param hosts
#   Passed through to the monitoring class.
# @param monitoring_hosts
#   Joined with ',' and handed to the nrpe class as `allowed_hosts`.
# @param raid_write_cache_policy
#   Passed to the raid class as `write_cache_policy`.
class profile::monitoring (
    Wmflib::Ensure                              $hardware_monitoring      = lookup('profile::monitoring::hardware_monitoring'),
    # TODO: make this an array
    String                                      $contact_group            = lookup('profile::monitoring::contact_group'),
    String                                      $cluster                  = lookup('profile::monitoring::cluster'),
    Boolean                                     $is_critical              = lookup('profile::monitoring::is_critical'),
    String                                      $nrpe_check_disk_options  = lookup('profile::monitoring::nrpe_check_disk_options'),
    Boolean                                     $nrpe_check_disk_critical = lookup('profile::monitoring::nrpe_check_disk_critical'),
    Boolean                                     $raid_check               = lookup('profile::monitoring::raid_check'),
    Integer                                     $raid_check_interval      = lookup('profile::monitoring::raid_check_interval'),
    Integer                                     $raid_retry_interval      = lookup('profile::monitoring::raid_retry_interval'),
    Boolean                                     $notifications_enabled    = lookup('profile::monitoring::notifications_enabled'),
    Boolean                                     $do_paging                = lookup('profile::monitoring::do_paging'),
    String                                      $nagios_group             = lookup('profile::monitoring::nagios_group'),
    Hash                                        $services                 = lookup('profile::monitoring::services'),
    Hash                                        $hosts                    = lookup('profile::monitoring::hosts'),
    Array[Stdlib::Host]                         $monitoring_hosts         = lookup('profile::monitoring::monitoring_hosts'),
    Optional[Enum['WriteThrough', 'WriteBack']] $raid_write_cache_policy  = lookup('profile::monitoring::raid_write_cache_policy')
) {
    # RAID checks: only when hardware monitoring is present and RAID checking is on
    if $hardware_monitoring == 'present' and $raid_check {
        class { 'raid':
            write_cache_policy => $raid_write_cache_policy,
            check_interval     => $raid_check_interval,
            retry_interval     => $raid_retry_interval,
        }
    }

    class { 'monitoring':
        contact_group         => $contact_group,
        nagios_group          => $nagios_group,
        cluster               => $cluster,
        notifications_enabled => $notifications_enabled,
        do_paging             => $do_paging,
        hosts                 => $hosts,
        services              => $services,
    }

    # nrpe takes its allowed hosts as a single comma-separated string
    class { 'nrpe':
        allowed_hosts => join($monitoring_hosts, ','),
    }
    # the nrpe class installs monitoring-plugins-* which creates the following directory
    contain nrpe # lint:ignore:wmf_styleguide

    # Plugins shipped from this profile's files directory
    nrpe::plugin { 'check_sysctl':
        source => 'puppet:///modules/profile/monitoring/check_sysctl',
    }
    nrpe::plugin { 'check_established_connections':
        source => 'puppet:///modules/profile/monitoring/check_established_connections.sh',
    }
    nrpe::plugin { 'check_fresh_files_in_dir':
        source => 'puppet:///modules/profile/monitoring/check_fresh_files_in_dir.py',
    }
    nrpe::plugin { 'check_newest_file_age':
        source => 'puppet:///modules/profile/monitoring/check_newest_file_age.sh',
    }

    # Make sure these paths are absent.
    # NOTE(review): presumably leftovers from an earlier plugin location - confirm.
    file { [
        '/usr/lib/nagios/plugins/check_sysctl',
        '/usr/lib/nagios/plugins/check_established_connections',
        '/usr/lib/nagios/plugins/check-fresh-files-in-dir.py',
    ]:
        ensure => absent,
    }

    # Disk space check, run through NRPE with the configured check_disk options
    nrpe::monitor_service { 'disk_space':
        description     => 'Disk space',
        critical        => $nrpe_check_disk_critical,
        nrpe_command    => "/usr/lib/nagios/plugins/check_disk ${nrpe_check_disk_options}",
        notes_url       => 'https://wikitech.wikimedia.org/wiki/Monitoring/Disk_space',
        dashboard_links => ["https://grafana.wikimedia.org/d/000000377/host-overview?var-server=${facts['hostname']}&var-datasource=${::site} prometheus/ops"],
        check_interval  => 20,
        retry_interval  => 5,
    }

    # The check_systemd_state plugin is declared absent
    nrpe::plugin { 'check_systemd_state':
        ensure => absent,
    }

    # Classes that are skipped when the is_virtual fact is truthy
    unless $facts['is_virtual'] {
        include profile::prometheus::nic_saturation_exporter
        class { 'prometheus::node_nic_firmware': }

        # Skip the Intel microcode class when the first CPU model string mentions AMD
        unless $facts['processors']['models'][0] =~ /AMD/ {
            class { 'prometheus::node_intel_microcode': }
        }
    }

    # IPMI monitoring only where the has_ipmi fact is truthy
    if $facts['has_ipmi'] {
        class { 'ipmi::monitor':
            ensure => $hardware_monitoring,
        }
    }
}
|