Puppet Class: ceph::osds

Defined in:
modules/ceph/manifests/osds.pp

Overview

SPDX-License-Identifier: Apache-2.0

Parameters:

  • fsid (String)
  • mon_hosts (Hash[String, Hash])
  • discrete_bluestore_device (Boolean) (defaults to: false)
  • osd_hosts (Optional[Hash[String, Hash]]) (defaults to: undef)
  • absent_osds (Optional[Array[String]]) (defaults to: undef)
  • excluded_slots (Optional[Array[String]]) (defaults to: undef)
  • bluestore_device_name (Optional[String]) (defaults to: undef)


(Line-number gutter from the rendered source listing, covering lines 2–147 of modules/ceph/manifests/osds.pp; the source itself follows below.)
# File 'modules/ceph/manifests/osds.pp', line 2

# Class: ceph::osds
#
# Tunes the data disks of a Ceph OSD host (disables the on-disk write cache,
# sets the I/O scheduler), optionally partitions a discrete NVMe device for
# bluestore dbs, and declares one ceph::osd resource for every populated
# drive slot reported by the ceph_disks fact.
#
# Parameters:
#  [*fsid*]                      Ceph cluster fsid passed on to each ceph::osd.
#  [*mon_hosts*]                 Map of monitor hosts (not referenced directly
#                                by this class).
#  [*discrete_bluestore_device*] When true, place the bluestore db of each
#                                HDD-backed OSD on a named partition of
#                                $bluestore_device_name.
#  [*osd_hosts*]                 Optional map of OSD hosts (not referenced
#                                directly by this class).
#  [*absent_osds*]               OSD labels (cXeYsZ) to temporarily absent,
#                                e.g. while a failed drive is replaced.
#  [*excluded_slots*]            Slots to ignore permanently, such as those
#                                holding the operating system disks.
#  [*bluestore_device_name*]     NVMe block device (/dev/nvmeXnY) to carve up
#                                for bluestore db partitions.
class ceph::osds (
    String                        $fsid,
    Hash[String, Hash]            $mon_hosts,
    Boolean                       $discrete_bluestore_device = false,
    Optional[Hash[String, Hash]]  $osd_hosts                 = undef,
    Optional[Array[String]]       $absent_osds               = undef,
    Optional[Array[String]]       $excluded_slots            = undef,
    Optional[String]              $bluestore_device_name     = undef,
) {
    # Keyrings and the base ceph configuration must be in place before any
    # OSD resources in this class are realised.
    Ceph::Auth::Keyring['admin'] -> Class['ceph::osds']
    Ceph::Auth::Keyring['bootstrap-osd'] -> Class['ceph::osds']
    Ceph::Auth::Keyring["osd.${facts['hostname']}"] -> Class['ceph::osds']
    Class['ceph::config'] -> Class['ceph::osds']

    ensure_packages(['ceph-osd','ceph-volume','hdparm'])

    # Disable the write cache on devices using the SCSI disk driver.
    # The match is anchored so that only /dev/sd* devices are selected
    # (the previous pattern 'sd*' was unanchored and, as a regex, matched
    # any device name containing an 's').
    $facts['disk_type'].filter | $disk | { $disk[0] =~ /^sd/ }.each |$disk, $type| {
        # Unset write cache
        exec { "Disable write cache on device /dev/${disk}":
            # 0->disable, 1->enable
            command => "hdparm -W 0 /dev/${disk}",
            user    => 'root',
            # 'egrep' is deprecated; 'grep -E' is the supported spelling.
            unless  => "hdparm -W /dev/${disk} | grep write-caching | grep -E '(not supported|off)'",
            path    => ['/usr/sbin', '/usr/bin'],
        }

        # Set io scheduler on disks
        # hdd -> mq-deadline
        # ssd/nvme -> none
        if ($type == 'ssd') {
            $disk_io_scheduler = 'none'
        } elsif ($type == 'hdd') {
            $disk_io_scheduler = 'mq-deadline'
        } else {
            fail("${type} for /dev/${disk} is currently not managed")
        }

        # The device names /dev/sd* may be volatile, but if they change this
        # will detect it and refresh the sysfsutils service on the first
        # puppet run after boot.
        sysfs::parameters { "scheduler_${disk}":
            priority => 90,
            values   => {
                "block/${disk}/queue/scheduler" => $disk_io_scheduler,
            },
        }
    }

    # Create a new hash with the populated slots from all controllers,
    # excluding any that are in $excluded_slots. This mechanism is intended
    # to avoid adding an OSD for the operating system disks.
    #
    # n.b. The ceph_disks fact is not available until after the first puppet
    # run, so this conditional defers management of the OSDs until the second
    # puppet run. This is a temporary measure to fix reimages.
    if $facts['ceph_disks'] {
        $storage_disks = $facts['ceph_disks'].values.map | $controller | {
            $controller['disks']
        }.reduce | $memo, $disk | {
            $memo + $disk
        }.filter | $slot | {
            ! ($slot[0] in $excluded_slots)
        }
    } else {
        $storage_disks = {}
    }

    # Optional support for creating bluestore partitions on a named NVMe device
    if ( $discrete_bluestore_device and $bluestore_device_name =~ '\/dev\/nvme[0-9]*n[0-9]*' ) {
        ensure_packages(['parted'])

        # Set gpt partition table
        exec { "Create gpt label on ${bluestore_device_name}":
            command => "parted -s -a optimal ${bluestore_device_name} mklabel gpt",
            user    => 'root',
            unless  => "parted -s ${bluestore_device_name} print|grep \"Partition Table: gpt\"",
            path    => ['/usr/sbin', '/usr/bin'],
        }

        # Filter the list of storage disks to obtain the HDDs that will host
        # an OSD, then partition the given device equally between them.
        $hdd_storage_disks = $storage_disks.values.filter | $disk | { $disk['medium'] == 'HDD' }

        if ( $hdd_storage_disks.length > 0 ) {
            $percent_partition = 100 / $hdd_storage_disks.length
        }

        # The loop below never runs when $hdd_storage_disks is empty, so
        # $percent_partition is always defined when it is referenced here.
        $hdd_storage_disks.each |$index, $hdd_disk| {
            $start_partition = 0 + $index * $percent_partition
            $end_partition = ($index +1) * $percent_partition
            $hdd_disk_label = "c${hdd_disk['controller']}e${hdd_disk['enclosure']}s${hdd_disk['slot']}"

            exec { "Create partition db.${hdd_disk_label} on ${bluestore_device_name}":
                command => "parted -s -a optimal ${bluestore_device_name} mkpart db.${hdd_disk_label} ext4 ${start_partition}% ${end_partition}%",
                user    => 'root',
                unless  => "parted -s ${bluestore_device_name} print|grep db.${hdd_disk_label}",
                path    => ['/usr/sbin', '/usr/bin'],
            }
        }
    }

    # Create the OSD devices - we use the WWN here because it will always
    # refer to the same drive. It is not safe to depend on the device name
    # /dev/sd* remaining the same across reboots.
    $storage_disks.each |$slot_id, $disk| {
        # Construct a name for the osd based on its controller, enclosure,
        # and slot values.
        $osd_label = "c${disk['controller']}e${disk['enclosure']}s${disk['slot']}"

        # If this is a hard drive and discrete bluestore partitions are in
        # use, then use its named partition for the bluestore db.
        if ($disk['medium'] == 'HDD') and $discrete_bluestore_device {
            $bluestore_db = "/dev/disk/by-partlabel/db.${osd_label}"
        } else {
            $bluestore_db = undef
        }

        # For a SATA disk the WWN reported by the perccli64 tool matches that
        # reported by the kernel in /dev/disk/by-id/wwn-0x*.
        # For a SAS hard drive we need to add three to the WWN to obtain the
        # LUN; for a SAS solid-state drive we add one to obtain the first SAS
        # port. To handle this we convert the WWN to an integer, add zero,
        # one, or three, then convert it back to lowercase hexadecimal.
        # (The selector below deliberately has no default: an unexpected
        # medium fails the compile rather than creating a mislabelled OSD.)
        $sas_disk = bool2num($disk['interface'] == 'SAS')
        $wwn_bitshift = $disk['medium'] ? {
            'SSD' => $sas_disk,
            'HDD' => $sas_disk * 3,
        }
        $wwid = String.new(Integer.new("0x${disk['wwn']}")+$wwn_bitshift,'%#x')

        # This device name will always be a symlink from the disk with this
        # WWN to its current /dev/sd* name, as managed by udev. The links are
        # always in lower case, whereas the WWN reported by the perccli64
        # tool is in upper case.
        $device = "/dev/disk/by-id/wwn-${wwid}"

        # Use the medium in the ceph_disks fact to inform the ceph-volume
        # tool of its device class at the time of OSD creation.
        $device_class = $disk['medium'].downcase

        # Check whether the current disk is marked for removal. This supports
        # replacement of failed OSDs by temporarily absenting them, as
        # opposed to $excluded_slots which permanently ignores certain slots
        # such as those used for holding the O/S.
        $ensure_osd = ($osd_label in $absent_osds).bool2str('absent', 'present')

        ceph::osd { $osd_label:
            ensure       => $ensure_osd,
            fsid         => $fsid,
            device       => $device,
            device_class => $device_class,
            bluestore_db => $bluestore_db,
        }
    }
}