Add NVMe storage auto-setup, sops secrets, fix SSH permissions

- setup-node-storage service auto-partitions NVMe for containerd/longhorn
- Root password encrypted with sops/age, decrypted during build
- Fix SSH host key permissions (0600) so sshd actually starts
- Disable SSH socket activation for reliable boot
- Add OPERATIONS.md with runbook
- Makefile tracks source dependencies
This commit is contained in:
2026-02-06 00:58:38 +01:00
parent 258d1ecc60
commit 3f191d8f93
7 changed files with 643 additions and 6 deletions

195
files/setup-node-storage Normal file
View File

@@ -0,0 +1,195 @@
#!/bin/bash
# Setup local NVMe storage for K3s node
# Runs at boot via systemd service
#
# Logic:
# - No NVMe: exit cleanly
# - No partition table: auto-format (new drive)
# - Has our labels: mount and exit (already configured)
# - Has other partitions: prompt with 120s timeout (safety)
# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# --- Target disk -----------------------------------------------------------
DEVICE='/dev/nvme0n1'

# --- Partition layout ------------------------------------------------------
# The first partition ends at CONTAINERD_SIZE; the second takes the rest.
CONTAINERD_SIZE='75GiB'
CONTAINERD_LABEL='containerd'
LONGHORN_LABEL='longhorn'

# --- Mount points ----------------------------------------------------------
CONTAINERD_MOUNT='/var/lib/containerd'
LONGHORN_MOUNT='/var/lib/longhorn'

# --- Misc ------------------------------------------------------------------
# Name of the marker file written into each freshly formatted filesystem.
MARKER_FILE='.netboot-storage'
# Seconds to wait for console confirmation before leaving the drive alone.
PROMPT_TIMEOUT=120
# Colors for console output. The escape sequences are stored literally and
# expanded at print time (printf %b here, echo -e elsewhere in the script).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# Log helpers: print a coloured "[storage]" line to stdout and mirror the
# message to the system log through logger(1).
#
# Arguments: the message words (joined with spaces).
# Returns:   always 0. Mirroring to the system log is best-effort: the script
#            runs under `set -e`, so a missing or failing logger must not
#            abort storage setup from inside a mere log call.
#
# The message is printed with %s, so backslashes in it are emitted verbatim
# rather than being interpreted as escape sequences.
log() {
  printf '%b[storage]%b %s\n' "$GREEN" "$NC" "$*"
  logger -t setup-node-storage "$*" || true
}

warn() {
  printf '%b[storage]%b %s\n' "$YELLOW" "$NC" "$*"
  logger -t setup-node-storage -p warning "$*" || true
}

error() {
  printf '%b[storage]%b %s\n' "$RED" "$NC" "$*"
  logger -t setup-node-storage -p err "$*" || true
}
# Nothing to do on nodes that have no block device at $DEVICE.
if [[ ! -b "$DEVICE" ]]; then
  log "No NVMe device found at $DEVICE - skipping storage setup"
  exit 0
fi

# Size in decimal gigabytes; used only for the log line below.
DEVICE_SIZE=$(lsblk -b -d -n -o SIZE "$DEVICE" | awk '{printf "%.0fGB", $1/1000000000}')
log "Found NVMe: $DEVICE ($DEVICE_SIZE)"

# Derive the partition device names. The kernel inserts a 'p' separator
# before the partition number whenever the disk name ends in a digit
# (nvme0n1 -> nvme0n1p1, mmcblk0 -> mmcblk0p1) and appends the number
# directly otherwise (sda -> sda1). Testing for the trailing digit instead of
# the substring "nvme" keeps this correct if DEVICE is pointed elsewhere.
if [[ "$DEVICE" =~ [0-9]$ ]]; then
  PART1="${DEVICE}p1"
  PART2="${DEVICE}p2"
else
  PART1="${DEVICE}1"
  PART2="${DEVICE}2"
fi
# Mount the two labelled filesystems on their mount points.
#
# Globals:  CONTAINERD_MOUNT, LONGHORN_MOUNT, CONTAINERD_LABEL,
#           LONGHORN_LABEL (all read)
# Returns:  0 when both mount points are mounted (a target that is already a
#           mount point is left alone), 1 as soon as one mount fails.
mount_storage() {
  local target

  log "Mounting existing storage..."
  mkdir -p "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"

  if mountpoint -q "$CONTAINERD_MOUNT"; then
    : # already mounted
  elif ! mount -L "$CONTAINERD_LABEL" "$CONTAINERD_MOUNT"; then
    error "Failed to mount containerd partition"
    return 1
  fi

  if mountpoint -q "$LONGHORN_MOUNT"; then
    : # already mounted
  elif ! mount -L "$LONGHORN_LABEL" "$LONGHORN_MOUNT"; then
    error "Failed to mount longhorn partition"
    return 1
  fi

  # Report the size column of df for each mount point.
  log "Storage mounted:"
  for target in "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"; do
    log " $target: $(df -h "$target" | tail -1 | awk '{print $2}')"
  done
  return 0
}
# Wipe $DEVICE, create a two-partition GPT layout, format both partitions as
# ext4, mount them and drop a marker file into each.
#
# DESTRUCTIVE: everything on $DEVICE is lost.
#
# Globals:  DEVICE, PART1, PART2, CONTAINERD_SIZE, CONTAINERD_LABEL,
#           LONGHORN_LABEL, CONTAINERD_MOUNT, LONGHORN_MOUNT, MARKER_FILE
#           (all read)
# Exits:    1 if the partition devices do not appear after re-reading the
#           partition table.
format_storage() {
  local mount_point

  log "Partitioning $DEVICE..."
  wipefs -af "$DEVICE"
  parted -s "$DEVICE" mklabel gpt
  parted -s "$DEVICE" mkpart primary ext4 1MiB "$CONTAINERD_SIZE"
  parted -s "$DEVICE" mkpart primary ext4 "$CONTAINERD_SIZE" 100%

  # Tell kernel to re-read partition table and wait for udev
  partprobe "$DEVICE"
  udevadm settle --timeout=10

  # Verify partitions appeared
  if [[ ! -b "$PART1" || ! -b "$PART2" ]]; then
    error "Partitions not found after partprobe: $PART1, $PART2"
    exit 1
  fi

  # -F: wipefs above only clears signatures on the whole-disk device, so a
  # filesystem that used to live inside the old partitions can still be
  # detected at the start of the new ones. Without -F, mke2fs asks for
  # confirmation in that case when stdin/stdout are terminals, which is how
  # the boot-time service runs this script.
  log "Formatting ${PART1} as ext4 (containerd, ${CONTAINERD_SIZE})..."
  mkfs.ext4 -F -L "$CONTAINERD_LABEL" -q "$PART1"
  log "Formatting ${PART2} as ext4 (longhorn, remaining)..."
  mkfs.ext4 -F -L "$LONGHORN_LABEL" -q "$PART2"

  # Mount the new partitions
  mkdir -p "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"
  mount "$PART1" "$CONTAINERD_MOUNT"
  mount "$PART2" "$LONGHORN_MOUNT"

  # Create marker files with metadata. The here-document body and its
  # terminator must stay at column 0.
  for mount_point in "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"; do
    cat > "${mount_point}/${MARKER_FILE}" <<EOF
# Netboot storage marker - DO NOT DELETE
formatted_date=$(date -Iseconds)
formatted_by=setup-node-storage
hostname=$(hostname)
device=$DEVICE
EOF
  done

  log "Storage formatted and mounted successfully"
  log " $CONTAINERD_MOUNT: $(df -h "$CONTAINERD_MOUNT" | tail -1 | awk '{print $2}')"
  log " $LONGHORN_MOUNT: $(df -h "$LONGHORN_MOUNT" | tail -1 | awk '{print $2}')"
}
# Decide whether $DEVICE already carries something that must not be
# overwritten without confirmation.
#
# Checks, in order:
#   1. blkid PTTYPE  - a partition table (gpt, dos, ...)
#   2. blkid TYPE    - a signature directly on the whole disk (filesystem,
#                      LUKS, LVM PV, md member, ...). Such disks have no
#                      partition table and parted does not recognise most of
#                      them, so without this check they would be classified
#                      as empty and formatted with no prompt.
#   3. parted print  - fallback; succeeds when parted can read a disk label
#
# Globals:  DEVICE (read)
# Returns:  0 if a partition table or any known signature is found,
#           1 if the device looks blank.
has_partition_table() {
  local pttype fstype

  # blkid exits non-zero when the requested tag is absent; treat that as
  # "empty" so the function is also safe to call outside an `if` under set -e.
  pttype=$(blkid -o value -s PTTYPE "$DEVICE" 2>/dev/null) || pttype=""
  if [[ -n "$pttype" ]]; then
    return 0 # has partition table
  fi

  fstype=$(blkid -o value -s TYPE "$DEVICE" 2>/dev/null) || fstype=""
  if [[ -n "$fstype" ]]; then
    return 0 # whole-disk signature, treat as in use
  fi

  # Fallback: check if parted can read it
  if parted -s "$DEVICE" print &>/dev/null; then
    return 0 # has partition table
  fi

  return 1 # nothing recognisable on the device
}
# A drive without any partition table is considered brand new and is
# formatted without asking; the script ends here in that case.
if has_partition_table; then
  : # something is already on the drive - later checks decide what to do
else
  log "Empty drive detected (no partition table) - auto-formatting..."
  format_storage
  exit 0
fi
# The drive is partitioned: find out whether the layout is ours. Both
# filesystem labels must resolve; the marker file is an additional hint that
# only changes the log message, not the outcome.
#
# NOTE(review): if both labels exist but the probe mount fails, control falls
# through to the interactive format prompt, whose warning states that no
# netboot labels were found - confirm that this is the intended behaviour.
if blkid -L "$CONTAINERD_LABEL" &>/dev/null && blkid -L "$LONGHORN_LABEL" &>/dev/null; then
  # Probe on a throw-away mount point so no stray mount is left behind.
  TEMP_MOUNT=$(mktemp -d)
  if mount -L "$CONTAINERD_LABEL" "$TEMP_MOUNT" 2>/dev/null; then
    if [[ -f "${TEMP_MOUNT}/${MARKER_FILE}" ]]; then
      marker_state="present"
    else
      marker_state="absent"
    fi

    umount "$TEMP_MOUNT"
    rmdir "$TEMP_MOUNT"

    if [[ "$marker_state" == "present" ]]; then
      log "Storage already configured (found labels and marker)"
    else
      # Labels match but the marker is missing - still treated as ours.
      warn "Found labels but no marker file - assuming configured"
    fi
    mount_storage
    exit 0
  fi
  # Probe mount failed: remove the temporary directory, ignoring errors.
  rmdir "$TEMP_MOUNT" 2>/dev/null || true
fi
# The drive holds partitions that are not recognised as ours, so it may
# contain data. Show the layout and require an explicit confirmation on the
# console; doing nothing leaves the drive untouched.
warn "NVMe has existing partitions but no netboot labels."
warn "This drive may contain important data!"
echo ""
lsblk "$DEVICE"
echo ""

# Prompt on console with timeout
echo -e "${CYAN}========================================${NC}"
echo -e "${CYAN} Press ENTER within ${PROMPT_TIMEOUT}s to format ${NC}"
echo -e "${CYAN} Or wait to skip (safe default) ${NC}"
echo -e "${CYAN}========================================${NC}"
echo ""

# -r keeps read from treating a trailing backslash as a line continuation, so
# a single ENTER always ends the prompt. The typed text itself is irrelevant
# and is left in bash's default REPLY variable. read returns non-zero both on
# timeout and when no input is available (EOF); either way nothing is
# formatted.
if read -r -t "$PROMPT_TIMEOUT" -p "Format $DEVICE? [press ENTER to confirm] "; then
  echo ""
  warn "Formatting in 5 seconds... Ctrl+C to abort"
  sleep 5
  format_storage
else
  echo ""
  warn "Timeout - skipping storage setup (drive left untouched)"
  warn "To format manually, reboot and press ENTER when prompted"
  exit 0
fi

View File

@@ -0,0 +1,26 @@
[Unit]
Description=Setup local NVMe storage for K3s
Documentation=file:///usr/local/bin/setup-node-storage
# Run early, after devices are available but before container services
# NOTE(review): ordering after systemd-udevd.service does not by itself wait
# for device enumeration to finish - confirm the NVMe is reliably present by
# the time this unit starts, since the script exits successfully when the
# device is missing.
After=local-fs.target systemd-udevd.service
Before=containerd.service
# Only run if not already mounted
ConditionPathIsMountPoint=!/var/lib/containerd

[Service]
Type=oneshot
ExecStart=/usr/local/bin/setup-node-storage
RemainAfterExit=yes
# Console access for interactive prompt
StandardInput=tty
TTYPath=/dev/tty1
TTYReset=yes
# The script may wait 120s for confirmation plus a 5s grace period before it
# even starts partitioning and running two mkfs passes. A 180s limit left
# under a minute for that work, and hitting the limit mid-format would kill
# the script with the drive half-initialised. Allow 5 minutes instead.
TimeoutStartSec=300

[Install]
WantedBy=multi-user.target