January 09, 2022

Categories: braindump

Adding a QNAP TS-473A to my Ceph Nautilus Cluster

Table of Contents

I added a QNAP TS-473A to my Red Hat Ceph Storage 4 (Nautilus) cluster.

part of my Ceph Dashboard after adding the QNAP TS-473A to the cluster

This is my braindump.

Install Red Hat Enterprise Linux 8.5

After initial bringup and a first install of RHEL 8.5, I re-installed RHEL 8.5 using the following kickstart file to use it as a node in my Ceph cluster.

Since I could not get the node to PXE boot, I simply put all I needed in the first install’s grub setup; cf. the end of my Ansible Playbook qnap-ryzen-general-setup-rhel8.yml, further down in this post.

kickstart file RHEL85-QNAP-TS-473A-ks.cfg (click to expand).

#version=RHEL8

# avoid using half arsed names like sda, sdb, etc
# TS-473A User Guide, page 10, says
#   top is M.2 SSD slot 1
# lower is M.2 SSD slot 2
# Disks bays are numbered starting from 1, bay furthest away from the power button.
# for PCIe slots, the user guide says top is slot 1, bottom is slot 2
#
# NVMe slot 1 /dev/disk/by-path/pci-0000:03:00.0-nvme-1 (the top slot, contains a Samsung 980 500GB)
# NVMe slot 2 /dev/disk/by-path/pci-0000:04:00.0-nvme-1 (the bottom slot, contains a Crucial P2 2TB)
# HDD bay 1   /dev/disk/by-path/pci-0000:07:00.0-ata-1  (bay furthest away from the power button)
# HDD bay 2   /dev/disk/by-path/pci-0000:07:00.0-ata-2
# HDD bay 3   /dev/disk/by-path/pci-0000:09:00.0-ata-1
# HDD bay 4   /dev/disk/by-path/pci-0000:09:00.0-ata-2  (bay closest to the power button)

# reboot after installation is complete?
reboot

# OS is installed to the 500GB Samsung NVMe, that is in _M.2 SSD slot 1_, the top slot.
# all other storage is left untouched, ceph-ansible will deal with that
ignoredisk --only-use=/dev/disk/by-path/pci-0000:03:00.0-nvme-1

# Partition clearing information
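# note that OS goes on a small portion of the device in bay 1, the rest will be allocated to Ceph in a separate VG.
# so kickstarting with the below clearpart line will nuke the Ceph bits on SSD !!!
# NOTE(review): the two lines above look carried over from another layout; the partitioning
# section further down gives this whole NVMe (M.2 slot 1) to the OS, so confirm and update.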
clearpart --all --initlabel --drives=/dev/disk/by-path/pci-0000:03:00.0-nvme-1

# Use graphical install
graphical

# Keyboard layouts
keyboard --vckeymap=us --xlayouts='us'

# System language
lang en_US.UTF-8

# Network information all switch ports have the respective VLAN as native
# 2.5 Gig on-board 1 ('access' network)
network  --bootproto=dhcp --device=enp6s0                --ipv6=auto --activate
# 2.5 Gig on-board 2 (will go on 'storage' via ansible)
network  --bootproto=dhcp --device=enp5s0   --onboot=off --ipv6=auto --no-activate
# 10 Gig on PCIe (will go on 'ceph' via ansible)
network  --bootproto=dhcp --device=enp2s0f0 --onboot=off --ipv6=auto --no-activate
# 10 Gig on PCIe slot 2 (PCIe 3.0 x4), currently unused
network  --bootproto=dhcp --device=enp2s0f1 --onboot=off --ipv6=auto --no-activate
# hostname will be set via ansible
network  --hostname=localhost.localdomain

# Use network installation
url --url="ftp://fileserver.internal.pcfe.net/pub/redhat/RHEL/RHEL-8.5/Server/x86_64/os/BaseOS"
repo --name="AppStream" --baseurl=ftp://fileserver.internal.pcfe.net/pub/redhat/RHEL/RHEL-8.5/Server/x86_64/os/AppStream

# Root password
rootpw --iscrypted [REDACTED]

# Run the Setup Agent on first boot?
firstboot --disable

# Do not configure the X Window System
skipx

# System services
services --enabled="chronyd"

# Intended system purpose
syspurpose --role="Red Hat Enterprise Linux Server" --sla="Self-Support" --usage="Development/Test"

# System timezone
timezone Europe/Berlin --isUtc --ntpservers=[REDACTED]

# Ansible user
user --groups=wheel --name=ansible --password=[REDACTED] --iscrypted --gecos="ansible"

# Disk partitioning information
# the 500GB Samsung NVMe in slot 1 will be fully used for the OS
# the   2TB Crucial NVMe in slot 2 and the HDDs in slots 1 through 4
# will be fed to ceph-ansible as devices
# c.f. https://docs.ceph.com/ceph-ansible/master/osds/scenarios.html
part /boot        --fstype="ext4"  --ondisk=/dev/disk/by-path/pci-0000:03:00.0-nvme-1 --size=1024
part /boot/efi    --fstype="efi"   --ondisk=/dev/disk/by-path/pci-0000:03:00.0-nvme-1 --size=512    --fsoptions="umask=0077,shortname=winnt"
part pv.01        --fstype="lvmpv" --ondisk=/dev/disk/by-path/pci-0000:03:00.0-nvme-1 --size=61440  --grow
volgroup VG_OS_NVMe1 --pesize=4096 pv.01
logvol /                   --fstype="xfs"  --size=4096  --name=LV_root       --vgname=VG_OS_NVMe1
logvol swap                --fstype="swap" --size=4096  --name=LV_swap       --vgname=VG_OS_NVMe1
logvol /var                --fstype="xfs"  --size=5120  --name=LV_var        --vgname=VG_OS_NVMe1
logvol /var/log            --fstype="xfs"  --size=4096  --name=LV_var_log    --vgname=VG_OS_NVMe1
logvol /var/crash          --fstype="xfs"  --size=70000 --name=LV_var_crash  --vgname=VG_OS_NVMe1
logvol /var/lib/containers --fstype="xfs"  --size=8192  --name=LV_containers --vgname=VG_OS_NVMe1
logvol /home               --fstype="xfs"  --size=1024  --name=LV_home       --vgname=VG_OS_NVMe1

%packages
@^minimal-environment
chrony
kexec-tools

%end

%addon com_redhat_kdump --enable --reserve-mb='auto'

%end

%anaconda
pwpolicy root --minlen=6 --minquality=1 --notstrict --nochanges --notempty
pwpolicy user --minlen=6 --minquality=1 --notstrict --nochanges --emptyok
pwpolicy luks --minlen=6 --minquality=1 --notstrict --nochanges --notempty
%end

%post --log=/root/ks-post.log
# Post-installation script (log file: /root/ks-post.log, see --log above).
# It sets up root's ssh key, disables the on-board USB stick, installs the
# check-mk agent, renames the ifcfg files and disables rhgb.

# dump pcfe's ssh key to the root user
# obviously change this to your own pubkey unless you want to grant me root access
# -p: do not fail if the directory already exists
mkdir -p /root/.ssh
chown root:root /root/.ssh
chmod 700 /root/.ssh
# quoted delimiter: the key is written literally, without shell expansion
cat <<'EOF' >>/root/.ssh/authorized_keys
[REDACTED]
EOF
chown root:root /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys
# relabel the freshly created directory as well as the key file
restorecon -R /root/.ssh

# quoted delimiter: the udev rule is written literally, without shell expansion
cat <<'EOF' >>/etc/udev/rules.d/75-disable-5GB-on-board-stick.rules
# The on-board 5GB stick should be disabled
# I currently have no use for it and leaving it untouched allows a reset to the shipped state
# by choosing the USB stick as boot target during POST
# c.f. https://projectgus.com/2014/09/blacklisting-a-single-usb-device-from-linux/
SUBSYSTEM=="usb", ATTRS{idVendor}=="1005", ATTRS{idProduct}=="b155", ATTR{authorized}="0"
EOF
chown root:root /etc/udev/rules.d/75-disable-5GB-on-board-stick.rules
chmod 644 /etc/udev/rules.d/75-disable-5GB-on-board-stick.rules
restorecon /etc/udev/rules.d/75-disable-5GB-on-board-stick.rules

# Since Ceph and EPEL should not be mixed,
# pull check-mk-agent from my monitoring server (checkmk Raw edition)
# only claim success in motd when the install actually worked
if dnf -y install http://check-mk.internal.pcfe.net/HouseNet/check_mk/agents/check-mk-agent-2.0.0p15-1.noarch.rpm; then
    echo "check-mk-agent installed from monitoring server" >> /etc/motd
else
    echo "WARNING: installing check-mk-agent from monitoring server failed" >&2
fi

# seems I can NOT specify a connection name for the network setup
# https://pykickstart.readthedocs.io/en/latest/kickstart-docs.html#network
# so just remove the line setting the name "System enp…" and then move the files
# fedora.linux_system_roles.network will do the rest later
# I do this because I find speaking names so much more pleasant
# delete the whole NAME= line; merely stripping the word NAME would leave a stray ="…" line behind
sed --in-place '/^NAME=/d' /etc/sysconfig/network-scripts/ifcfg-enp*
mv /etc/sysconfig/network-scripts/ifcfg-enp6s0 /etc/sysconfig/network-scripts/ifcfg-2.5G_1
mv /etc/sysconfig/network-scripts/ifcfg-enp5s0 /etc/sysconfig/network-scripts/ifcfg-2.5G_2
mv /etc/sysconfig/network-scripts/ifcfg-enp2s0f0 /etc/sysconfig/network-scripts/ifcfg-10G_1
mv /etc/sysconfig/network-scripts/ifcfg-enp2s0f1 /etc/sysconfig/network-scripts/ifcfg-10G_2

# disable Red Hat graphical boot (rhgb)
# NOTE(review): this only edits the defaults file; presumably it takes effect
# once grub2-mkconfig is run — confirm
sed --in-place 's/ rhgb//g' /etc/default/grub

echo "kickstarted at $(date) for RHEL 8.5 on QNAP TS-473A" >> /etc/motd

%end

Prep, with Ansible, for addition to Ceph Cluster

Ansible Inventory Entries

In the hosts file used by my workstation

[QNAP_Ryzen_boxes]
ts-473a-01                  ansible_user=ansible

In the hosts file used by my ceph-ansible control node

[osds]
f5-422-0[1:4].storage.pcfe.net
ts-473a-01.storage.pcfe.net

group_vars/QNAP_Ryzen_boxes.yml (click to expand).

---
user_owner: pcfe
ansible_user: ansible
common_timezone: Europe/Berlin

Sets some variables used by my role pcfe.user_owner.

inventories/host_vars/ts-473a-01.yml (click to expand).

network_connections:
  - name: "System 2.5G_1"
    type: ethernet
    interface_name: "enp6s0"
    zone: "public"
    state: up
    persistent_state: present
    ip:
      dhcp4:      no
      auto6:      yes
      gateway4:   192.168.50.254
      dns:        192.168.50.248
      dns_search: internal.pcfe.net
      address:    192.168.50.185/24

  - name: "System 2.5G_2"
    type: "ethernet"
    interface_name: "enp5s0"
    zone: "public"
    state: up
    persistent_state: present
    ip:
      dhcp4:      no
      auto6:      yes
      dns_search: storage.pcfe.net
      address:    192.168.40.185/24
      route_append_only: yes

  - name: "System 10G_1"
    type: "ethernet"
    mtu: 9000
    interface_name: "enp2s0f0"
    zone: "public"
    state: up
    persistent_state: present
    ip:
      dhcp4:      no
      auto6:      yes
      dns_search: ceph.pcfe.net
      address:    192.168.30.185/24
      route_append_only: yes

# There is no need to muck around with the osd_memory_target setting on the QNAP TS-473A, it has 64 GiB RAM.

# avoid using half arsed names like sda, sdb, etc
# mapping is:
# NVMe slot 1 /dev/disk/by-path/pci-0000:03:00.0-nvme-1 (the top slot, contains a Samsung 980 500GB)
# NVMe slot 2 /dev/disk/by-path/pci-0000:04:00.0-nvme-1 (the bottom slot, contains a Crucial P2 2TB)
# HDD bay 1   /dev/disk/by-path/pci-0000:07:00.0-ata-1  (bay furthest away from the power button)
# HDD bay 2   /dev/disk/by-path/pci-0000:07:00.0-ata-2
# HDD bay 3   /dev/disk/by-path/pci-0000:09:00.0-ata-1
# HDD bay 4   /dev/disk/by-path/pci-0000:09:00.0-ata-2  (bay closest to the power button)
dmcrypt: True
osd_objectstore:  bluestore
osd_scenario:     lvm
devices:
  - /dev/disk/by-path/pci-0000:04:00.0-nvme-1
  - /dev/disk/by-path/pci-0000:07:00.0-ata-1
  - /dev/disk/by-path/pci-0000:07:00.0-ata-2
  - /dev/disk/by-path/pci-0000:09:00.0-ata-1
  - /dev/disk/by-path/pci-0000:09:00.0-ata-2

tuned_profile:  powersave

Sets variables used to configure

network connections
ceph-ansible
tuned

Initial Setup

Ansible Playbook qnap-ryzen-initial-setup-rhel8.yml (click to expand).

---
# sets up a RHEL 8 minimal install to be ready for ceph-ansible
#
# the playbook runs as root (see ansible_user line below), preps the node for normal ansible operations with the ansible user and ensures access to Red Hat subscriptions is set up.
# Expect the redhat_subscription task to take more than one but less than five minutes
#
# this is for my home setup, not for production!
- hosts:
  - QNAP_Ryzen_boxes

  become: false

  roles:
    - pcfe.user_owner
    - pcfe.basic-security-setup
    - pcfe.housenet

  # no need for double indirect if you are OK with checking in ak details into git
  # this is OK to do if you use an in-house Satellite server and your security policies allow it
  # this is not a good idea if you register your systems directly to redhat.com and cannot guarantee that your git remains private
  vars_files:
    - "vars/subscription-manager-autoattach-ak-secrets.yml"
  vars:
    ansible_user: root
    user_owner: ansible
    common_timezone: Europe/Berlin
    rhsm_activationkey: "{{ vaulted_rhsm_activationkey }}"
    rhsm_org_id: "{{ vaulted_rhsm_org_id }}"
    rhsm_pool_ids: "{{ vaulted_rhsm_pool_ids }}"

  pre_tasks:
    # work around identical UUID on each host as per https://theforeman.org/plugins/katello/nightly/troubleshooting/content_hosts.html
    # The TerraMaster F5-422 boxes all have the same system-uuid :-(
    # and if I look at `dmidecode -s system-uuid` that looks surprisingly regular. I'll know for sure if I  ever purchase another of these
    # RHSM does not like that, so override to avoid both
    # "HTTP error (409 - Conflict): Request failed due to concurrent modification, please re-try.\n"
    # and all 4 boxes overriding each other in insights inventory
    - name: "RHSM | ensure uuid override is derived from fqdn, QNAP TS-473A all seem to have, like all F5-422 I own, the same system-uuid in DMI"
      copy:
        dest: "/etc/rhsm/facts/uuid_override.facts"
        owner: "root"
        group: "root"
        mode: 0644
        content: |
          {"dmi.system.uuid": "{{ ansible_fqdn | to_uuid }}"}          


    # https://access.redhat.com/documentation/en-us/red_hat_ceph_storage/4/html-single/installation_guide/index#enabling-the-red-hat-ceph-storage-repositories-install
    - name: "RHSM on the RHEL8 boxes"
      block:
        - name: "RHSM | ensure system is registered with my activation key"
          redhat_subscription:
            activationkey:  "{{ rhsm_activationkey }}"
            org_id:         "{{ rhsm_org_id }}"
            syspurpose:
              usage: "Development/Test"
              role: ""
              service_level_agreement: "Self-Support"
          tags: do_subsmgr_register
        # n.b. those are the prerequisites, all repos off except BaseOS and AppStream
        # the repos for MON, OSD, … will be handled in another playbook. (which no longer disables all and does not have the delay that using redhat_subscription task brings)
        - name: "RHSM | disable all repositories, next task will enable needed repos"
          rhsm_repository:
            name: '*'
            state: disabled
        - name: "RHSM | ensure RHEL8 BaseOS repos needed for Ceph are enabled"
          rhsm_repository:
            name:
              - rhel-8-for-x86_64-baseos-rpms
              - rhel-8-for-x86_64-appstream-rpms
            state: enabled
      when: ansible_distribution == "RedHat" and ansible_distribution_major_version == "8"
      tags: do_subsmgr_all

  tasks:
    # !!!
    #
    # if I ever enable EPEL, then I MUST exclude
    # - ansible
    # - ceph
    # in the EPEL repo files to ensure no newer versions of those packages are pulled in from EPEL
    #
    # exclude = *ceph* nfs-ganesha-rgw rbd-mirror *ansible*
    #
    # !!!
    #
    # https://access.redhat.com/documentation/en-us/red_hat_ceph_storage/4/html-single/installation_guide/index#enabling-the-red-hat-ceph-storage-repositories-install
    - name: "REPOS | ensure EPEL is disabled"
      yum_repository:
        name: epel
        state: absent

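    # enable time sync, RHSM operations will fail on too large time delta
    # NOTE(review): these are `tasks:`, so they run after the RHSM `pre_tasks:` above;
    # this relies on chronyd already being enabled by the kickstart — confirm that is intended
    # note that this uses chronyd, not ntpd.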
    - name: "CHRONYD | ensure chrony is installed"
      package:
        name:       chrony
        state:      present
    - name:         "CHRONYD | ensure chrony-wait is enabled"
      service:
        name:       chrony-wait
        enabled:    true
    - name:         "CHRONYD | ensure chronyd is enabled and running"
      service:
        name:       chronyd
        enabled:    true
        state:      started
    
    # enable persistent journal
    # https://access.redhat.com/solutions/696893 instructs to simply mkdir as root, so drop the owner, group and mode lines
    - name: "JOURNAL | ensure persistent logging for the systemd journal is possible"
      file:
        path: /var/log/journal
        state: directory

    # 2.10. Enabling Password-less SSH for Ansible
    # Writes a sudoers drop-in granting {{ user_owner }} passwordless sudo for all commands.
    - name: "SUDO | enable passwordless sudo for user {{ user_owner }}"
      copy:
        dest: '/etc/sudoers.d/{{ user_owner }}'
        content: |
          {{ user_owner }}   ALL=NOPASSWD:   ALL          
        owner: root
        group: root
        # quoted so the mode is always read as an octal string
        mode: "0440"
        # syntax-check the candidate file before it is moved into place;
        # a broken file in /etc/sudoers.d can make sudo unusable on the node
        validate: /usr/sbin/visudo -cf %s

    # Ensure the ansible user can NOT log in with password
    - name: "Ensure the ansible user can NOT log in with password"
      user:
        name: '{{ user_owner }}'
        password_lock: True

    # Install prerequisites
    # https://access.redhat.com/documentation/en-us/red_hat_ceph_storage/4/html/installation_guide/requirements-for-installing-rhcs#enabling-the-red-hat-ceph-storage-repositories-install
    - name: "package | ensure prerequisites needed in addition to minimal install are present"
      package:
        name:
          - yum-utils
          - vim
        state: present

That Playbook can take a moment to run because I have to do rhsm operations. It:

ensures the node is registered with subscription-manager
ensures prerequisites from the Red Hat Ceph Storage 4 installation guide are fulfilled
ensures I have persistent journal logging
ensures the ansible user, that ceph-ansible will connect as, is set up as needed

General Setup

Ansible Playbook qnap-ryzen-general-setup-rhel8.yml (click to expand).

---
- hosts:
  - QNAP_Ryzen_boxes
  become: true
  roles:
    - fedora.linux_system_roles.network
    - pcfe.user_owner
    - pcfe.basic-security-setup
    - pcfe.housenet
    - pcfe.comfort
    # - pcfe.checkmk

  handlers:
    - name: grub2-mkconfig | run
      command: grub2-mkconfig -o /boot/efi/EFI/redhat/grub.cfg

  tasks:
    # Install some tools
    - name: "PACKAGE | tool installation"
      package:
        name:
          - pciutils
          - usbutils
          - nvme-cli
          - fio
          - powertop
          - tuned
          - tuned-utils
          - numactl
          - mailx
          - teamd
          - NetworkManager-team
          - iperf3
          - tcpdump
          - hwloc
          - hwloc-gui
        state: present
        update_cache: no

    # linux-system-roles.network sets static network config (from host_vars)
    # but I want the static hostname nailed down too
    - name: "set hostname"
      hostname:
        name:          "{{ ansible_fqdn }}"
        use:           systemd

    # FIXME: should also find a module to do `hostnamectl set-chassis server`

    # enable watchdog
    # it's a
    # Dec 19 15:09:08 ts-473a-01.internal.pcfe.net kernel: sp5100_tco: SP5100/SB800 TCO WatchDog Timer Driver
    # Dec 19 15:09:08 ts-473a-01.internal.pcfe.net kernel: sp5100-tco sp5100-tco: Using 0xfeb00000 for watchdog MMIO address
    # and modinfo says
    # parm:           heartbeat:Watchdog heartbeat in seconds. (default=60) (int)
    # parm:           nowayout:Watchdog cannot be stopped once started. (default=0) (bool)
    - name: "WATCHDOG | ensure kernel module sp5100_tco has correct options configured"
      lineinfile:
        path:         /etc/modprobe.d/sp5100_tco.conf
        create:       true
        regexp:       '^options '
        insertafter:  '^#options'
        line:         'options sp5100_tco nowayout=0'

    # configure both watchdog.service and systemd watchdog, but only use the latter
    - name: "WATCHDOG | ensure watchdog package is installed"
      package:
        name:         watchdog
        state:        present
        update_cache: no
    - name: "WATCHDOG | ensure correct watchdog-device is used by watchdog.service"
      lineinfile:
        path:         /etc/watchdog.conf
        regexp:       '^watchdog-device'
        insertafter:  '^#watchdog-device'
        line:         'watchdog-device = /dev/watchdog0'
    - name: "WATCHDOG | ensure timeout is set to 30 seconds for watchdog.service"
      lineinfile:
        path:         /etc/watchdog.conf
        regexp:       '^watchdog-timeout'
        insertafter:  '^#watchdog-timeout'
        line:         'watchdog-timeout = 30'
    # Using systemd watchdog rather than watchdog.service
    - name: "WATCHDOG | ensure watchdog.service is disabled"
      systemd:
        name:         watchdog.service
        state:        stopped
        enabled:      false
    # configure systemd watchdog
    # c.f. http://0pointer.de/blog/projects/watchdog.html
    - name: "WATCHDOG | ensure systemd watchdog is enabled"
      lineinfile:
        path:         /etc/systemd/system.conf
        regexp:       '^RuntimeWatchdogSec'
        insertafter:  'EOF'
        line:         'RuntimeWatchdogSec=30'
    - name: "WATCHDOG | ensure systemd shutdown watchdog is enabled"
      lineinfile:
        path:         /etc/systemd/system.conf
        regexp:       '^ShutdownWatchdogSec'
        insertafter:  'EOF'
        line:         'ShutdownWatchdogSec=30'

    # install and enable rngd
    - name: "RNGD | ensure rng-tools package is installed"
      package:
        name:         rng-tools
        state:        present
        update_cache: no
    - name: "RNGD | ensure rngd.service is enabled and started"
      systemd:
        name:         rngd.service
        state:        started
        enabled:      true

    # ensure tuned is set up as I wish
    - name: "TUNED | ensure tuned.service is enabled and running"
      systemd:
        name:           tuned.service
        state:          started
        enabled:        true
    - name: "TUNED | check which tuned profile is active"
      command:        tuned-adm active
      register:       tuned_active_profile
      ignore_errors:  yes
      changed_when:   no
    - name: "TUNED | activate tuned profile {{ tuned_profile }}"
      command:        "tuned-adm profile {{ tuned_profile }}"
      when:           not tuned_active_profile.stdout is search('Current active profile:' ~ ' ' ~ tuned_profile)

    # install cockpit, but disabled for now
    - name: "COCKPIT | ensure packages for https://cockpit-project.org/ are installed"
      package:
        name:
          - cockpit
          - cockpit-selinux
          - cockpit-kdump
          - cockpit-system
        state: present
        update_cache: no
    - name: "COCKPIT | ensure cockpit.socket is stopped and disabled"
      systemd:
        name:       cockpit.socket
        state:      stopped
        enabled:    False
    - name: "COCKPIT | ensure firewalld forbids service cockpit in zone public"
      firewalld:
        service:    cockpit
        zone:       public
        permanent:  True
        state:      disabled
        immediate:  True

    # enable kdump.service since kickstart now creates a sufficiently large /var/crash
    # alternatively, you could set up netdump
    - name: "Ensure kdump.service is enabled and started"
      systemd:
        name:         kdump.service
        state:        started
        enabled:      True

    # podman
    - name: "PACKAGE | ensure podman is installed"
      package:
        name:
          - podman
          - podman-docker
        state: present
        update_cache: no

    # setroubleshoot, see also https://danwalsh.livejournal.com/20931.html
    - name: "PACKAGE | ensure setroubleshoot for headless server is installed"
      package:
        name:
          - setroubleshoot-server
          - setroubleshoot-plugins
        state: present
        update_cache: no

    - name: "MONITORING | ensure packages for monitoring are installed"
      package:
        name:
          - smartmontools
          - hdparm
          - check-mk-agent
          - lm_sensors
        state: present
        update_cache: no

    - name: "MONITORING | ensure firewalld permits 6556 in zone public for check-mk-agent"
      firewalld:
        port:       6556/tcp
        permanent:  True
        state:      enabled
        immediate:  True
        zone:       public
    - name: "MONITORING | ensure tarsnap cache is in fileinfo"
      lineinfile:
        path: /etc/check_mk/fileinfo.cfg
        line: "/usr/local/tarsnap-cache/cache"
        create: yes
    - name: "MONITORING | ensure entropy_avail plugin for Check_MK is present"
      template:
        src:        templates/check-mk-agent-plugin-entropy_avail.j2
        dest:       /usr/lib/check_mk_agent/plugins/entropy_avail
        mode:       0755
        group:      root
        owner:      root
    - name: "MONITORING | plugins from running CEE instance"
      get_url:
        url: "http://check-mk.internal.pcfe.net/HouseNet/check_mk/agents/plugins/{{ item }}"
        dest: "/usr/lib/check_mk_agent/plugins/{{ item }}"
        mode: "0755"
      loop:
        - smart
        - lvm
    - name: "MONITORING | ensure check_mk.socket is started and enabled"
      systemd:
        name:       check_mk.socket
        state:      started
        enabled:    True

    - name: "Ensure powertop autotune service runs once at boot"
      systemd:
        name:       powertop
        state:      stopped
        enabled:    True


    # I admit, the regexp is a search engine hit
    # maybe using grubby(8) would be more readable
    # - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/managing_monitoring_and_updating_the_kernel/configuring-kernel-command-line-parameters_managing-monitoring-and-updating-the-kernel#what-is-grubby_configuring-kernel-command-line-parameters
    # - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/system_administrators_guide/sec-Making_Persistent_Changes_to_a_GRUB_2_Menu_Using_the_grubby_Tool
    - name: "GRUB | ensure console blanking is disabled in defaults file plus handler"
      lineinfile:
        state: present
        dest: /etc/default/grub
        backrefs: yes
        regexp: '^(GRUB_CMDLINE_LINUX=(?!.* consoleblank)\"[^\"]+)(\".*)'
        line: '\1 consoleblank=0\2'
      notify: grub2-mkconfig | run

    # Since I do not manage to get these TS-473A to PXE boot, add an entry into grub
    # so that I can kickstart the box after this without fiddling with a USB stick
    - name: "GRUB | ensure initrd for RHEL 8.5 kickstart is present"
      get_url:
        url: "ftp://fileserver.internal.pcfe.net/pub/redhat/RHEL/RHEL-8.5/Server/x86_64/os/images/pxeboot/initrd.img"
        dest: "/boot/initrd-kickstart-rhel85.img"
        mode: "0600"
    - name: "GRUB | ensure kernel for RHEL 8.5 kickstart is present"
      get_url:
        url: "ftp://fileserver.internal.pcfe.net/pub/redhat/RHEL/RHEL-8.5/Server/x86_64/os/images/pxeboot/vmlinuz"
        dest: "/boot/vmlinuz-kickstart-rhel85"
        mode: "0755"
    - name: "GRUB | ensure kickstarting RHEL 8.5 entry is present"
      copy:
        dest: "/etc/grub.d/11_RHEL85_kickstart"
        owner: "root"
        group: "root"
        mode: 0755
        content: |
          #!/bin/sh
          exec tail -n +3 $0
          # This file provides an easy way to add custom menu entries.  Simply type the
          # menu entries you want to add after this comment.  Be careful not to change
          # the 'exec tail' line above.
          menuentry "WARNING Kickstart this box with RHEL 8.5 as a TS-473A ceph node WARNING" {
              linuxefi /vmlinuz-kickstart-rhel85 ip=dhcp inst.repo=ftp://fileserver.internal.pcfe.net/pub/redhat/RHEL/RHEL-8.5/Server/x86_64/os inst.ks=ftp://fileserver.internal.pcfe.net/pub/kickstart/RHEL85-QNAP-TS-473A-ks.cfg
              initrdefi /initrd-kickstart-rhel85.img
          }          
      notify: grub2-mkconfig | run

    # upgrade the box
    - name: "package | ensure all updates are applied"
      package:
        update_cache: yes
        name: '*'
        state: latest
      tags: apply_errata

That Playbook:

ensures my network connections are set up to my liking and Ceph needs (fedora.linux_system_roles.network)
ensures I have a user account on the node (pcfe.user_owner)
ensures SELinux is in enforcing mode and password authentication for ssh is disabled (pcfe.basic-security-setup)
ensures my user account is set up to my liking (pcfe.comfort)
ensures some tools I like to use are installed
ensures the hostname is set
ensures the hardware watchdog is set up
ensures rngd is active
ensures my chosen tuned profile is active
ensures Cockpit is installed
for now ensures Cockpit is disabled
ensures kdump is active
ensures setroubleshoot is set up for headless operation
ensures I can monitor the node with my Checkmk server
ensures powertop --auto-tune runs once at boot
ensures I can kickstart the node from grub
ensures all updates are applied

Copy over host_… and group_vars

Since for ceph-ansible use I need the below, I simply copy over my host_vars and group_vars shown at the start of this post to the host I use ceph-ansible on.

dmcrypt: True
osd_objectstore: bluestore
osd_scenario: lvm
devices:
  - /dev/disk/by-path/pci-0000:04:00.0-nvme-1
  - /dev/disk/by-path/pci-0000:07:00.0-ata-1
  - /dev/disk/by-path/pci-0000:07:00.0-ata-2
  - /dev/disk/by-path/pci-0000:09:00.0-ata-1
  - /dev/disk/by-path/pci-0000:09:00.0-ata-2

Add to Inventory used by ceph-ansible

For today, I only put OSDs on it; I’ll move some daemons from the F5-422 nodes to this way more powerful node later.

[osds]
f5-422-0[1:4].storage.pcfe.net
ts-473a-01.storage.pcfe.net

Run ceph-ansible

As expected, that was completely hassle-free,

[ansible@ceph-ansible ~]$ cd /usr/share/ceph-ansible
[ansible@ceph-ansible ceph-ansible]$ ansible-playbook site-container.yml
[...]

Cluster State after Expansion

[root@f5-422-01 ~]# podman exec --interactive --tty ceph-mon-f5-422-01 ceph df
RAW STORAGE:
    CLASS     SIZE       AVAIL      USED        RAW USED     %RAW USED 
    hdd       32 TiB     26 TiB     5.2 TiB      5.3 TiB         16.60 
    TOTAL     32 TiB     26 TiB     5.2 TiB      5.3 TiB         16.60 
 
POOLS:
    POOL                           ID     STORED      OBJECTS     USED        %USED     MAX AVAIL 
    cephfs_data                     1     300 GiB       1.14M     1.1 TiB      4.41       7.6 TiB 
    cephfs_metadata                 2     783 MiB     182.05k     1.2 GiB         0       7.6 TiB 
    .rgw.root                       3     2.4 KiB           6     1.1 MiB         0       7.6 TiB 
    default.rgw.control             4         0 B           8         0 B         0       7.6 TiB 
    default.rgw.meta                5     5.0 KiB          28     4.5 MiB         0       7.6 TiB 
    default.rgw.log                 6     3.5 KiB         208     6.2 MiB         0       7.6 TiB 
    libvirt                        10     5.4 GiB       1.44k      16 GiB      0.07       7.6 TiB 
    device_health_metrics          11      49 MiB          17      49 MiB         0       7.6 TiB 
    rbd                            12       126 B           3     192 KiB         0       7.6 TiB 
    default.rgw.buckets.index      13      76 KiB          55      76 KiB         0       7.6 TiB 
    default.rgw.buckets.data       14      14 GiB       3.84k      41 GiB      0.18       7.6 TiB 
    default.rgw.buckets.non-ec     15     1.5 KiB          58      11 MiB         0       7.6 TiB 
    ocs_rbd                        17     158 GiB      41.91k     474 GiB      1.99       7.6 TiB 
    cinder                         18        19 B           1     192 KiB         0       7.6 TiB 
    proxmox_rbd                    19      61 GiB      34.15k     189 GiB      0.80       7.6 TiB

[root@f5-422-01 ~]# podman exec ceph-mon-f5-422-01 ceph osd tree
ID  CLASS WEIGHT   TYPE NAME           STATUS REWEIGHT PRI-AFF 
 -1       31.68657 root default                                
 -9        4.09348     host f5-422-01                          
  2   hdd  1.97089         osd.2           up  1.00000 1.00000 
  6   hdd  1.06129         osd.6           up  1.00000 1.00000 
 10   hdd  1.06129         osd.10          up  1.00000 1.00000 
 -7        5.00307     host f5-422-02                          
  3   hdd  1.97089         osd.3           up  1.00000 1.00000 
  7   hdd  1.97089         osd.7           up  1.00000 1.00000 
 11   hdd  1.06129         osd.11          up  1.00000 1.00000 
 -5        2.12259     host f5-422-03                          
  4   hdd  1.06129         osd.4           up  1.00000 1.00000 
  8   hdd  1.06129         osd.8           up  1.00000 1.00000 
 -3        4.09348     host f5-422-04                          
  1   hdd  1.97089         osd.1           up  1.00000 1.00000 
  5   hdd  1.06129         osd.5           up  1.00000 1.00000 
  9   hdd  1.06129         osd.9           up  1.00000 1.00000 
-11       16.37396     host ts-473a-01                         
  0   hdd  4.09349         osd.0           up  1.00000 1.00000 
 12   hdd  4.09349         osd.12          up  1.00000 1.00000 
 13   hdd  4.09349         osd.13          up  1.00000 1.00000 
 14   hdd  4.09349         osd.14          up  1.00000 1.00000