Puppet Class: openstack::nova::compute::monitor

Defined in:
modules/openstack/manifests/nova/compute/monitor.pp

Overview

The 'nova compute' service does the actual VM management within nova.

wiki.openstack.org/wiki/Nova

Parameters:

  • active (Any)
  • verify_instances (Any) (defaults to: false)
  • contact_groups (Any) (defaults to: 'wmcs-bots,admins')


4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'modules/openstack/manifests/nova/compute/monitor.pp', line 4

class openstack::nova::compute::monitor(
    $active,
    $verify_instances=false,
    $contact_groups='wmcs-bots,admins',
){
    # Parameters:
    #   $active           - when truthy the nova-compute process checks are
    #                       ensured 'present', otherwise 'absent'.
    #   $verify_instances - when truthy together with $active, additionally
    #                       declare the kvm process check at the bottom.
    #   $contact_groups   - handed to every check below as contact_group.

    # The checks want an ensure string rather than a bool, so translate here.
    if ! $active {
        $ensure = 'absent'
    }
    else {
        $ensure = 'present'
    }

    # Pieces shared by the nrpe checks declared below.
    $check_procs          = '/usr/lib/nagios/plugins/check_procs'
    $nova_compute_pattern = '^/usr/bin/pytho[n].* /usr/bin/nova-compute'
    $troubleshooting_url  = 'https://wikitech.wikimedia.org/wiki/Portal:Cloud_VPS/Admin/Troubleshooting'

    # Installed unconditionally; it does not follow $active.
    file { '/usr/local/lib/nagios/plugins/check_ssl_certfile':
        ensure => 'present',
        source => 'puppet:///modules/nagios_common/check_commands/check_ssl_certfile',
        mode   => '0755',
        owner  => 'root',
        group  => 'root',
    }

    # Upper bound on nova-compute parent processes.
    #
    # More than one long-lived nova-compute parent has been seen when puppet
    # staggers a restart; nova then gets very confused by dueling processes
    # pulling from rabbitmq and potentially executing conflicting instructions.
    # Two processes can legitimately exist for a moment during a graceful
    # restart, so the check insists on exactly 1 but is given a generous
    # number of retries before alerting.
    #
    # The odd-looking [n] in the pattern is an attempt to keep check_procs
    # from counting itself.
    #  https://serverfault.com/questions/359958/nagios-nrpe-check-procs-wrong-return-value
    nrpe::monitor_service { 'ensure_single_nova_compute_proc':
        ensure        => $ensure,
        description   => 'nova-compute proc maximum',
        nrpe_command  => "${check_procs} -c 1:1 -p 1 --ereg-argument-array '${nova_compute_pattern}'",
        retries       => 5,
        contact_group => $contact_groups,
        notes_url     => $troubleshooting_url,
    }

    # Lower bound on nova-compute processes.
    #
    # Labvirts have been known to fully reboot in <=4 minutes, and instances
    # /do not/ come back up as started automatically, so a missing or
    # unreachable nova-compute process has to alert fairly quickly (hence a
    # single retry).  Up to 2 processes are tolerated here to cover a graceful
    # transition; if that state persists, the maximum check above alerts.
    nrpe::monitor_service { 'ensure_nova_compute_running':
        ensure        => $ensure,
        description   => 'nova-compute proc minimum',
        nrpe_command  => "${check_procs} -c 1:2 --ereg-argument-array '${nova_compute_pattern}'",
        retries       => 1,
        contact_group => $contact_groups,
        notes_url     => $troubleshooting_url,
    }

    if $active and $verify_instances {

        $kvmbinary = 'qemu-system-x86_64'

        # A stopped nova-compute process means control plane messaging is no
        # longer being handled (covered by the checks above).  This check
        # instead makes sure at least one instance, even a token administrative
        # one, is actually running: after a hypervisor reboot it comes up
        # without running instances even once nova-compute has been fully
        # restored.
        #
        # Consequently every active hypervisor needs a token administrative
        # instance running on it as a canary:
        # OS_PROJECT_ID=testlabs openstack server create \
        # --flavor 2 --image <image-id> --availability-zone host:<hypervisor> <instance>
        nrpe::monitor_service { 'ensure_running_kvm_instances':
            ensure        => $ensure,
            description   => 'ensure kvm processes are running',
            nrpe_command  => "${check_procs} -c 1:120 --ereg-argument-array ${kvmbinary}",
            retries       => 2,
            contact_group => $contact_groups,
            notes_url     => $troubleshooting_url,
        }
    }
}