Puppet Class: base::sysctl

Defined in:
modules/base/manifests/sysctl.pp

Summary

base class to configure sysctl settings

Overview

Parameters:

  • unprivileged_userns_clone (Boolean) (defaults to: false)

    enable kernel.unprivileged_userns_clone



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'modules/base/manifests/sysctl.pp', line 3

class base::sysctl (
    Boolean $unprivileged_userns_clone = false,
) {
    # Systemctl hardening settings. We set them ourselves so we can purge /etc/sysctl.d.
    sysctl::parameters { 'ubuntu defaults':
        values   => {
            # 10-console-messages.conf
            'kernel.printk'                   => [ 4, 4, 1, 7 ],

            # 10-kernel-hardening.conf
            'kernel.kptr_restrict'            => 1,

            # 10-network-security.conf
            'net.ipv4.conf.default.rp_filter' => 1,
            'net.ipv4.conf.all.rp_filter'     => 1,
            'net.ipv4.tcp_syncookies'         => 1,

            # 10-ptrace.conf
            'kernel.yama.ptrace_scope'        => 1,

            # 10-link-restrictions.conf
            'fs.protected_hardlinks'          => 1,
            'fs.protected_symlinks'           => 1,

            # 10-zeropage.conf
            'vm.mmap_min_addr'                => 65536,

            # skip 10-ipv6-privacy.conf
            # skip 10-magic-sysrq.conf
        },
        priority => 10,
    }

    sysctl::parameters { 'wikimedia base':
        values   => {
            # Increase TCP max buffer size
            'net.core.rmem_max'                => 16777216,
            'net.core.wmem_max'                => 16777216,

            # Increase Linux auto-tuning TCP buffer limits
            # Values represent min, default, & max num. of bytes to use.
            'net.ipv4.tcp_rmem'                => [ 4096, 87380, 16777216 ],
            'net.ipv4.tcp_wmem'                => [ 4096, 65536, 16777216 ],

            # Don't cache ssthresh from previous connection
            'net.ipv4.tcp_no_metrics_save'     => 1,
            'net.core.netdev_max_backlog'      => 2500,

            # Increase the queue size of new TCP connections
            'net.core.somaxconn'               => 1024,
            'net.ipv4.tcp_max_syn_backlog'     => 4096,

            # Swapping makes things too slow and should be done rarely
            # 0 = only swap in OOM conditions (it does NOT disable swap.)
            'vm.swappiness'                    => 0,
            'net.ipv4.tcp_keepalive_time'      => 300,
            'net.ipv4.tcp_keepalive_intvl'     => 1,
            'net.ipv4.tcp_keepalive_probes'    => 2,

            # Default IPv6 route table max_size is too small for the modern
            # Internet.  It's tempting to set this only for public-facing
            # caches and LVS, but when surveying hosts there are non-obvious
            # cases with significant route table sizes (other independent
            # public services with v6, recdns servers, etc..), and even the
            # public ones with smaller live tables would be subject to easier
            # DoS without this setting.  I think it's better to simply set it
            # globally for all our hosts and be done with worrying about it so
            # much.  This value seems to be an order of magnitude+ above what
            # our tier-1 cache nodes currently need, and not cause any issues
            # from being oversized, but this may need adjustment later, unless
            # future kernels fix the issue completely as they did with ipv4.
            'net.ipv6.route.max_size'          => 131072,

            # Mitigate side-channel from challenge acks, at least until most
            # public servers are on kernel 4.7+ or have a backported fix.
            # Refs:
            # CVE-2016-5696
            # http://www.cs.ucr.edu/~zhiyunq/pub/sec16_TCP_pure_offpath.pdf
            # http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=75ff39ccc1bd5d3c455b6822ab09e533c551f758
            'net.ipv4.tcp_challenge_ack_limit' => 987654321,
        },
        priority => 60,
    }

    # unprivileged bpf is a feature introduced in Linux 4.4: https://lwn.net/Articles/660331/
    # We don't need it and it widens the attacks surface for local privilege escalation
    # significantly, so we're disabling it by enabling kernel.unprivileged_bpf_disabled
    if (versioncmp($::kernelversion, '4.4') >= 0) {
        sysctl::parameters { 'disable_unprivileged_bpf':
            values => {
            'kernel.unprivileged_bpf_disabled' => '1',
            },
        }
    }

    if (versioncmp($::kernelversion, '5.10') >= 0) {
        # Up to Buster Debian disabled unprivileged user namespaces in the default kernel config
        # This changed in Bullseye mostly to allow Chromium and Firefox to setup sandboxing via namespaces
        # But for a server deployment like ours, we have no use for it and it widens the attack surface,
        # so we disable it. Apply this to kernels starting with 5.10 (where it was enabled in Debian)
        sysctl::parameters { 'unprivileged_userns_clone':
            values => {
                'kernel.unprivileged_userns_clone' => $unprivileged_userns_clone.bool2str('1', '0'),
            },
        }

        # Kernels prior to v5.10.54 had this value set to 3600 as a defense
        # mechanism against faulty middle boxes which do not support TCP
        # fast open. The value was changed to 0 in v5.10.54. Unfortunately,
        # we do have faulty middle boxes in our path and the new value of 0
        # results in our mail queues backing up, as connections to Google's
        # mail servers sometimes timeout. This setting should not be
        # reverted without monitoring our mail queues. We are applying it
        # globally since it may be the source of other problems and there
        # is little downside for our workloads.
        # https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=v5.10.96&id=164294d09c47b9a6c6160b08c43d74ae93c82758
        sysctl::parameters { 'fastopen':
            values   => { 'net.ipv4.tcp_fastopen_blackhole_timeout_sec' => 3600 },
        }
    }

    # BBR congestion control (T147569)
    # https://lwn.net/Articles/701165/
    #
    # The BBR TCP congestion control algorithm is based on Bottleneck
    # Bandwidth, i.e. the estimated bandwidth of the slowest link, and
    # Round-Trip Time to control outgoing traffic. Other algorithms such as
    # CUBIC (default on Linux since 2.6.19) and Reno are instead based on
    # packet loss.
    #
    # To send out data at the proper rate, BBR uses the tc-fq packet scheduler
    # instead of the TCP congestion window.
    #
    # It has been added to Linux in version 4.9.
    $use_bbr = lookup('bbr_congestion_control', {'default_value' => false})
    if ($use_bbr) and (versioncmp($::kernelversion, '4.9') >= 0) {
        sysctl::parameters { 'tcp_bbr':
            values => {
                'net.core.default_qdisc'          => 'fq',
                'net.ipv4.tcp_congestion_control' => 'bbr',
            },
        }
    }
    # The security fix for CVE-2019-11479 introduced a new sysctl setting which clamps
    # the lower value for the advertised MSS. The Linux patch retains the formerly
    # hardcoded default of 48 for backwards compatibility reasons. We're setting it to
    # 538 which is the minimum MTU for IPv4 minus the size of the headers, which should
    # allow all legitimate traffic while avoiding the resource exhaustion.
    #
    # All the kernels which have CVE-2019-11479 backported, also have the fixes for
    # CVE-2019-11477 and CVE-2019-11478 applied. Re-enable TCP selective acknowledments
    # on all hosts which have been rebooted to that kernel and keep it disabled for all
    # servers still running an unfixed kernel.
    if $facts.has_key('kernel_details') and
        $facts['kernel_details']['sysctl_settings']['net.ipv4.tcp_min_snd_mss'] {
        sysctl::parameters{'tcp_min_snd_mss':
            values  => {
                'net.ipv4.tcp_min_snd_mss' => '538',
                'net.ipv4.tcp_sack'        => 1,
            },
        }
    } else {
        sysctl::parameters{'disable_tcp_sack':
            values  => { 'net.ipv4.tcp_sack' => 0 },
        }
    }
}