4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
# File 'modules/profile/manifests/ntp.pp', line 4
# @summary Sets up this host as an NTP server that peers with the other NTP
#   servers, and declares the matching firewall rule and monitoring checks.
#
# @param monitoring_hosts
#   Hosts handed to ntp::daemon as its query_acl.
# @param ntp_peers
#   NTP peer FQDNs, keyed by site name.
class profile::ntp (
    Array[Stdlib::Host]                      $monitoring_hosts = lookup('monitoring_hosts'),
    Hash[Wmflib::Sites, Array[Stdlib::Fqdn]] $ntp_peers        = lookup('ntp_peers'),
) {
    include network::constants

    # Needed by the check that watches for changes to the ntp.conf file.
    ensure_packages(['python3-pystemd'])

    # Every peer at every site, as one flat list.
    $wmf_all_peers = $ntp_peers.values.flatten

    # Per-site settings. Three values depend on whether this is a core site:
    #
    # * $wmf_servers_plus_self - the full list of peer servers applicable at
    #   this site (which, for any given server, still includes the server
    #   itself at this point).
    #
    # * $minsane - "tos minsane <N>": how many acceptably-working pool
    #   servers + peers we must be syncing with before we consider
    #   *ourselves* a reliable source for others. The exact numbers are open
    #   to bikeshedding, but the default of 1 is lower than we'd like, while
    #   too high a value can break time sync in otherwise-survivable
    #   scenarios. The cores have more local peers between them and better
    #   reliability in general, so they tolerate a slightly higher number
    #   than the edges.
    #
    # * $orphan - "tos orphan <stratum>": when no internet servers are
    #   reachable, our servers operate as an orphaned peer island and keep
    #   some kind of stable sync with each other. Without it, if all of our
    #   global servers lost their upstreams, within a few minutes no time
    #   syncing would be happening at all ("peer" only protects against
    #   *some* servers losing upstreams, not all). A plausible trigger would
    #   be a global screwup of pool.ntp.org DNS ops. The cores are set to do
    #   the orphan job.
    case $::site {
        'eqiad', 'codfw': {
            # Core sites peer with all global peers at all sites.
            $wmf_servers_plus_self = $wmf_all_peers
            $minsane               = 3
            $orphan                = 12
        }
        default: {
            # Edge sites only peer with the core DCs and with themselves.
            $wmf_servers_plus_self = flatten([
                $ntp_peers['eqiad'],
                $ntp_peers['codfw'],
                $ntp_peers[$::site],
            ])
            $minsane               = 2
            $orphan                = 13
        }
    }

    # A server cannot peer with itself, so drop our own FQDN from the list.
    $wmf_servers = $wmf_servers_plus_self.delete($facts['networking']['fqdn'])

    # Country zone of pool.ntp.org to use as upstream for this site.
    $pool_zone = $::site ? {
        'esams' => 'nl',
        'eqsin' => 'sg',
        'drmrs' => 'fr',
        'magru' => 'br',
        default => 'us',
    }
    $wmf_server_upstream_pools = ["0.${pool_zone}.pool.ntp.org"]

    # maxclock has to be the sum of:
    #  * the number of servers in $wmf_servers for this host
    #  * the number (4) we want to use from the "pool" DNS lookup
    #  * one extra for the dummy "0.X.pool.ntp.org" entry
    $maxclock = $wmf_servers.length + 4 + 1

    # Build the time ACL from the "external networks" list, plus 10.0.0.0/8.
    # production_networks is not usable here because it would also bring in
    # 127.0.0.0/8 and ::1/128.
    $time_acl = $network::constants::external_networks + ['10.0.0.0/8']

    ntp::daemon { 'server':
        servers      => $wmf_servers,
        pools        => $wmf_server_upstream_pools,
        time_acl     => $time_acl,
        extra_config => "tos minsane ${minsane} orphan ${orphan} maxclock ${maxclock}",
        query_acl    => $monitoring_hosts,
    }

    ferm::service { 'ntp':
        proto  => 'udp',
        port   => 123,
        srange => '($PRODUCTION_NETWORKS $FRACK_NETWORKS $MGMT_NETWORKS $NETWORK_INFRA)',
    }

    monitoring::service { 'ntp peers':
        description    => 'NTP peers and stratum check',
        # Both intervals are in minutes.
        check_interval => 5,
        retry_interval => 5,
        # The last two arguments are the stratum thresholds: -W 5, -C 10.
        check_command  => 'check_ntp_peer_and_stratum!0.05!0.1!5!10',
        notes_url      => 'https://wikitech.wikimedia.org/wiki/NTP',
    }

    nrpe::plugin { 'check_ntp_service':
        source => 'puppet:///modules/profile/monitoring/check_service_restart.py',
    }

    # Map of service unit => configuration file; one restart check is
    # declared per entry.
    $services_to_check = {
        'ntpsec.service' => '/etc/ntpsec/ntp.conf',
    }

    $services_to_check.each |$service, $conf_file| {
        nrpe::monitor_service { "check_service_restart_${service}":
            description    => "Check if ${service} has been restarted after ${conf_file} was changed",
            nrpe_command   => "/usr/local/lib/nagios/plugins/check_ntp_service --service ${service} --file ${conf_file} --critical 2",
            sudo_user      => 'root',
            # Intervals in minutes: check every 60, retry every 30.
            check_interval => 60,
            retry_interval => 30,
            notes_url      => 'https://wikitech.wikimedia.org/wiki/NTP#Monitoring',
        }
    }
}
|