Puppet Class: cacheproxy::performance

Defined in:
modules/cacheproxy/manifests/performance.pp

Overview

Class cacheproxy::performance

This class contains production-specific performance hacks. These should have zero functional effect; they are merely system-level tweaks to support heavy load/traffic.



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'modules/cacheproxy/manifests/performance.pp', line 6

class cacheproxy::performance {

    # Primary network interface and the kernel driver backing it; both come
    # from facts and drive all of the NIC-level tuning further down.
    $iface_primary = $facts['interface_primary']
    $nic_driver = $facts['net_driver'][$iface_primary]['driver']

    # Bump min_free_kbytes to ensure network buffers are available quickly
    #   without having to evict cache on the spot
    vm::min_free_kbytes { 'cache':
        pct => 2,
        min => 131072,
        max => 2097152,
    }

    # Hash table slots for the TCP metrics cache; see the
    # net.ipv4.tcp_no_metrics_save notes further down.
    grub::bootparam { 'tcpmhash_entries':
        value => 65536,
    }

    # flush vm more steadily in the background. helps avoid large performance
    #   spikes related to flushing out disk write cache.
    sysctl::parameters { 'cache_role_vm_settings':
        values => {
            'vm.dirty_ratio'            => 40,  # default 20
            'vm.dirty_background_ratio' => 5,   # default 10
            'vm.dirty_expire_centisecs' => 500, # default 3000
        },
    }

    # Turn on scsi_mod.use_blk_mq at boot.  Our newest nvme drives don't need
    # this (blk_mq is implicit in the nvme driver, which doesn't use the scsi
    # layer), but I stumbled on it while looking at them, and it's apparently a
    # good idea on scsi SSDs as well, so this is mainly for our SSD nodes.
    # Be careful about copypasta of this to other hardware which might have
    # at least some rotational disks, as there have been past regressions and
    # this can only be turned on for all of the scsi layer, not by-device.
    # It will eventually be the default, probably in 4.19 or later.
    grub::bootparam { 'scsi_mod.use_blk_mq': value => 'y' }

    # Larger TX queue len for 10Gbps+
    # ($name inside a class body is the class name, used here purely as a
    # resource title.)
    interface::txqueuelen { $name:
        interface => $iface_primary,
        len       => 10000,
    }

    # Max out ring buffers, seems to eliminate the spurious drops under heavy traffic.
    # A selector with no matching entry aborts catalog compilation anyway; the
    # explicit default keeps that behavior but names the driver and interface
    # so the failure is actionable when new hardware shows up.
    $ring_size = $nic_driver ? {
        'bnx2x'   => 4078,
        'bnxt_en' => 2047,
        'bnx2'    => 255, # only cp1008, this is defaults and we don't care much
        default   => fail("cacheproxy::performance: no ring size defined for NIC driver '${nic_driver}' on interface '${iface_primary}'"),
    }

    interface::ring { "${name} rxring":
        interface => $iface_primary,
        setting   => 'rx',
        value     => $ring_size,
    }

    # The TX ring is only raised on bnxt_en; other drivers keep their TX default.
    if $nic_driver == 'bnxt_en' {
        interface::ring { "${name} txring":
            interface => $iface_primary,
            setting   => 'tx',
            value     => $ring_size,
        }
    }

    # Disable LRO to avoid merging important headers for flow control and such
    interface::offload { "${iface_primary}-lro":
        interface => $iface_primary,
        setting   => 'lro',
        value     => 'off',
    }

    # Disable ethernet PAUSE behavior, dropping is better than buffering (in reasonable cases!)
    interface::noflow { $iface_primary: }

    # RPS/RSS to spread network i/o evenly.  Note this enables FQ as well,
    # which must be enabled before turning on BBR congestion control below
    interface::rps { 'primary':
        interface => $iface_primary,
        qdisc     => 'fq flow_limit 300 buckets 8192 maxrate 256mbit',
        before    => Sysctl::Parameters['cache proxy network tuning'],
    }

    # Network tuning for high-load HTTP caches
    sysctl::parameters { 'cache proxy network tuning':
        values => {
            # Increase the number of ephemeral ports
            'net.ipv4.ip_local_port_range'       => [ 4001, 65534 ],

            # All prod caches are 10GbE, standard recommendation is 300K for 10G
            # and 30K for 1G.  Our inbound traffic max is closer to 1G levels,
            # since we have 10G LVS splitting traffic to ~8 or more hosts for
            # high-traffic clusters.  Still, we should double the 1GbE numbers
            # at least, just in case of bursts and inequality, etc.  If low,
            # will see drops in col 2 of /proc/net/softnet_stat
            'net.core.netdev_max_backlog'        => 60000,

            # budget: Similar to the above, default 300, and is the #packets
            # handled per NAPI polling cycle across all interfaces.  You can see
            # effects of this being too low in col 3 of /proc/net/softnet_stat.
            # Caches show some small numbers there, so, experimenting with
            # raising this a bit for now
            'net.core.netdev_budget'             => 1024,

            # Default:1 - setting this to zero defers timestamping until after
            # RPS.  It's more efficient this way, but timestamp doesn't account
            # for any tiny delays in queueing before RPS, which I don't think is
            # an issue in our case.
            'net.core.netdev_tstamp_prequeue'    => 0,

            # Our rate of incoming SYN on heaviest cp hosts peaks around
            # 1-2K/sec.  For somaxconn, the SYN numbers should be multiplied
            # out for a few seconds of headroom (bursts, and userspace delays)
            # and then perhaps doubled again to handle the influx of depooling
            # large datacenters.  Note somaxconn is just a parameter limit, the
            # application still needs to set this explicitly (within the
            # limit).
            'net.core.somaxconn'                 => 16384,

            # Our active connection concurrency peaks in the ~100K-200K range
            # per cp host (e.g. text esams as shown in ipvsadm).  For
            # max_syn_backlog, we probably want a small multiple of peak
            # concurrency (maybe even just ~1x), as well as (again) dc failover
            # and/or cp host depool headroom.
            'net.ipv4.tcp_max_syn_backlog'       => 524288,

            # Building on the metrics above - tw_buckets should be somewhere
            # close to the concurrency/syn_backlog sort of level as well so that
            # we properly timewait connections when necessary.  Note that
            # tw_reuse moderates the localhost<->localhost timewaits.
            # max_orphans should be close to the same value, I think, as most of
            # the lingering TW will be orphans.
            'net.ipv4.tcp_max_tw_buckets'        => 524288,
            'net.ipv4.tcp_max_orphans'           => 524288,

            # tcp_tw_(reuse|recycle): both are off by default
            # http://vincent.bernat.im/en/blog/2014-tcp-time-wait-state-linux.html
            #    _recycle is dangerous: it violates RFCs, and probably breaks
            # clients when many clients are behind a single NAT gateway, and
            # affects the recycling of TIME_WAIT slots for both incoming and
            # outgoing connections.
            #    _reuse is not-so-dangerous: it only affects outgoing
            # connections, and looks at timestamp and other state information to
            # guarantee that the reuse doesn't cause issues within reasonable
            # constraints.
            #    This helps prevent TIME_WAIT issues for our $localip<->$localip
            # connections from nginx to varnish-fe:80 - some of our caches reach
            # connection volume/rate spikes where this is a real issue.
            'net.ipv4.tcp_tw_reuse'              => 1,

            # FIN_WAIT_2 orphan time, def 60.  Reducing this reduces wasted
            # sockets and memory, and there's no good reason to set it higher
            # than roughly the maximum reasonable client RTT in our case.
            'net.ipv4.tcp_fin_timeout'           => 3,

            # Defaults are synack:5 and syn:6.  These control retries on SYN
            # (outbound) and SYNACK (inbound) before giving up on connection
            # establishment.  The defaults with the normal backoff timers can
            # leave not-yet-connected sockets lingering for unacceptably-long
            # times (1-2 minutes).  Aside from waste, that's also a potential
            # DoS vector we'd rather not have.  The "2" value drops the maximum
            # time windows down to ~7 seconds.
            'net.ipv4.tcp_synack_retries'        => 2,
            'net.ipv4.tcp_syn_retries'           => 2,

            # tcp_slow_start_after_idle: SSR resets the congestion window of
            # connections that have gone idle, which means it has a tendency to
            # reset the congestion window of HTTP keepalive and HTTP/2
            # connections, which are characterized by short bursts of activity
            # separated by long idle times.
            'net.ipv4.tcp_slow_start_after_idle' => 0,

            # tcp_notsent_lowat: Default is -1 (unset).  The default behavior is
            # to keep the socket writeable until the whole socket buffer fills.
            # With this set, even if there's buffer space, the kernel doesn't
            # notify of writeability (e.g. via epoll()) until the amount of
            # unsent data (as opposed to unacked) in the socket buffer is less
            # than this value.  This reduces local buffer bloat on our server's
            # sending side, which may help with HTTP/2 prioritization.  The
            # magic value for tuning is debatable, but arguably even setting a
            # conservative (higher) value here is better than not setting it
            # at all, in almost all cases for any kind of TCP traffic.  ~128K
            # seems to be a common recommendation for something close-ish to
            # optimal for internet-facing things.
            'net.ipv4.tcp_notsent_lowat'         => 131072,

            # EXPERIMENTAL!
            # TCP autocorking exists and defaults on from 3.14 onwards.  The
            # idea is that some applications that should be doing a better job
            # of local buffering or manual TCP_CORK aren't, and the kernel
            # detects the common patterns for this and auto-corks for them
            # (doesn't immediately send a small write, instead waits a bit to
            # see if it can coalesce it with another).  Netstat counters for
            # autocorking are running up at a huge rate (ballpark near our reqs
            # or SYNs rate), which implies this is happening commonly to nginx
            # outbound traffic.  My theory is this is probably a net loss and
            # nginx and/or openssl know what they're doing and we'd benefit from
            # the writes going out immediately and not autocorking...
            'net.ipv4.tcp_autocorking'           => 0,

            # EXPERIMENTAL!
            # no_metrics_save: default 0.  Most tuning advice on the internet
            # says set it to 1, our own base-level sysctls for all systems also
            # set it to 1.  I think it's possible this advice is outdated and
            # harmful.  The rationale for no_metrics_save is that if there's
            # congestion/loss, congestion algorithms will cut down the cwnd of
            # the active connection very aggressively, and are very slow at
            # recovering from even small bursts of loss, and metrics cache will
            # carry this over to new connections after a temporary loss burst
            # that's already ended.  However, Linux 3.2+ implements PRR (RFC
            # 6937), which mitigates these issues and allows faster/fuller
            # recovery from loss bursts.  That should reduce the downsides of
            # saving metrics significantly, and the upsides have always been a
            # win because we remember (for an hour) past RTT, ssthresh, cwnd,
            # etc, which often allow better initial connection conditions.
            # Kernel boot param 'tcpmhash_entries' sets hash table slots for
            # this.
            'net.ipv4.tcp_no_metrics_save'       => 0,

            # BBR congestion control.  This *requires* fq qdisc to work
            # properly at this time (kernel 4.9).  We're setting the default
            # qdisc here so that we at least get an un-tuned FQ initially
            # before interface-rps kicks in on bootup.  interface::rps above
            # sets the tuned mq+fq setup properly, and should execute before
            # these sysctl settings when being applied at runtime.
            'net.core.default_qdisc'             => 'fq',
            'net.ipv4.tcp_congestion_control'    => 'bbr',

            # Attempt IPv4 PMTU detection (with improved baseline assumption of
            # 1024) when ICMP black hole detected.  This may fix some
            # minority-case clients using tunnels + blackhole paths, where there
            # is no other good recourse.
            'net.ipv4.tcp_mtu_probing'           => 1,
            'net.ipv4.tcp_base_mss'              => 1024,

        },
    }
}