Add NVMe storage auto-setup, sops secrets, fix SSH permissions
- setup-node-storage service auto-partitions NVMe for containerd/longhorn
- Root password encrypted with sops/age, decrypted during build
- Fix SSH host key permissions (0600) so sshd actually starts
- Disable SSH socket activation for reliable boot
- Add OPERATIONS.md with runbook
- Makefile tracks source dependencies
This commit is contained in:
195
files/setup-node-storage
Normal file
195
files/setup-node-storage
Normal file
@@ -0,0 +1,195 @@
|
||||
#!/bin/bash
# Setup local NVMe storage for K3s node
# Runs at boot via systemd service
#
# Logic:
# - No NVMe: exit cleanly
# - No partition table: auto-format (new drive)
# - Has our labels: mount and exit (already configured)
# - Has other partitions: prompt with 120s timeout (safety)
|
||||
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Target disk and the layout created on it. The first partition (containerd)
# ends at CONTAINERD_SIZE; the second (longhorn) takes the rest of the disk.
readonly DEVICE="/dev/nvme0n1"
readonly CONTAINERD_SIZE="75GiB"
readonly CONTAINERD_LABEL="containerd"
readonly LONGHORN_LABEL="longhorn"
readonly CONTAINERD_MOUNT="/var/lib/containerd"
readonly LONGHORN_MOUNT="/var/lib/longhorn"

# Name of the marker file written to the root of each filesystem this script formats.
readonly MARKER_FILE=".netboot-storage"

# Seconds to wait for console confirmation before leaving a foreign drive untouched.
readonly PROMPT_TIMEOUT=120

# Colors for console output (escape sequences, expanded by echo -e / printf %b)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'
|
||||
|
||||
# Log to both console and journald
|
||||
# Print an informational message on the console (green tag) and mirror it to
# syslog/journald under the tag "setup-node-storage".
# Arguments: message words, joined with spaces.
# The message is printed verbatim (backslashes are not interpreted).
log() {
  printf '%b[storage]%b %s\n' "${GREEN}" "${NC}" "$*"
  # Journal mirroring is best-effort: a missing or failing logger must not
  # abort the script under `set -e`, so its status and stderr are discarded.
  logger -t setup-node-storage "$*" 2>/dev/null || true
}
|
||||
# Print a warning on the console (yellow tag) and mirror it to syslog/journald
# at priority "warning" under the tag "setup-node-storage".
# Arguments: message words, joined with spaces.
# The message is printed verbatim (backslashes are not interpreted).
warn() {
  printf '%b[storage]%b %s\n' "${YELLOW}" "${NC}" "$*"
  # Journal mirroring is best-effort: a missing or failing logger must not
  # abort the script under `set -e`, so its status and stderr are discarded.
  logger -t setup-node-storage -p warning "$*" 2>/dev/null || true
}
|
||||
# Print an error on the console (red tag) and mirror it to syslog/journald
# at priority "err" under the tag "setup-node-storage".
# Arguments: message words, joined with spaces.
# The message is printed verbatim (backslashes are not interpreted).
# Does not exit; callers decide how to react.
error() {
  printf '%b[storage]%b %s\n' "${RED}" "${NC}" "$*"
  # Journal mirroring is best-effort: a missing or failing logger must not
  # abort the script under `set -e`, so its status and stderr are discarded.
  logger -t setup-node-storage -p err "$*" 2>/dev/null || true
}
|
||||
|
||||
# Check if NVMe exists. A node without the drive is a valid configuration,
# so this is a clean exit rather than an error.
if [[ ! -b "$DEVICE" ]]; then
  log "No NVMe device found at $DEVICE - skipping storage setup"
  exit 0
fi
|
||||
|
||||
# Size in decimal gigabytes, used only for the log line below. Failing to
# query it is cosmetic, so it must not abort setup under `set -e`/`pipefail`.
DEVICE_SIZE=$(lsblk -b -d -n -o SIZE "$DEVICE" | awk '{printf "%.0fGB", $1/1000000000}') \
  || DEVICE_SIZE=""
log "Found NVMe: $DEVICE (${DEVICE_SIZE:-unknown size})"
|
||||
|
||||
# Derive partition device names. The kernel inserts a 'p' between the disk
# name and the partition number when the disk name ends in a digit
# (nvme0n1 -> nvme0n1p1, mmcblk0 -> mmcblk0p1) and appends the number
# directly otherwise (sda -> sda1).
if [[ "$DEVICE" =~ [0-9]$ ]]; then
  PART1="${DEVICE}p1"
  PART2="${DEVICE}p2"
else
  PART1="${DEVICE}1"
  PART2="${DEVICE}2"
fi
|
||||
|
||||
# Mount one labelled filesystem unless its mount point is already in use.
# Arguments: $1 - filesystem label, $2 - mount point, $3 - name used in the error message
# Returns:   0 if mounted (or already mounted), 1 if the mount failed
_mount_labelled() {
  local label=$1 target=$2 name=$3

  if mountpoint -q "$target"; then
    return 0
  fi
  if ! mount -L "$label" "$target"; then
    error "Failed to mount $name partition"
    return 1
  fi
}

# Function to mount existing storage.
# Mounts the containerd and longhorn filesystems by label onto their mount
# points, creating the directories first, then logs the size of each.
# Globals: CONTAINERD_LABEL, LONGHORN_LABEL, CONTAINERD_MOUNT, LONGHORN_MOUNT (read)
# Returns: 0 when both are mounted, 1 as soon as one mount fails
mount_storage() {
  local target

  log "Mounting existing storage..."

  mkdir -p "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"

  _mount_labelled "$CONTAINERD_LABEL" "$CONTAINERD_MOUNT" containerd || return 1
  _mount_labelled "$LONGHORN_LABEL" "$LONGHORN_MOUNT" longhorn || return 1

  log "Storage mounted:"
  for target in "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"; do
    # Second column of the last df line is the filesystem size.
    log "  $target: $(df -h "$target" | tail -n 1 | awk '{print $2}')"
  done
  return 0
}
|
||||
|
||||
# Function to format the drive.
# DESTRUCTIVE: wipes $DEVICE, creates a GPT with two partitions (containerd up
# to CONTAINERD_SIZE, longhorn on the remainder), formats both as ext4 with
# the configured labels, mounts them and writes a marker file to each.
# Globals: DEVICE, PART1, PART2, CONTAINERD_SIZE, CONTAINERD_LABEL,
#          LONGHORN_LABEL, CONTAINERD_MOUNT, LONGHORN_MOUNT, MARKER_FILE (read)
# Exits:   with status 1 if the partition device nodes never appear
format_storage() {
  local mount_point

  log "Partitioning $DEVICE..."

  wipefs -af "$DEVICE"
  parted -s "$DEVICE" mklabel gpt
  parted -s "$DEVICE" mkpart primary ext4 1MiB "$CONTAINERD_SIZE"
  parted -s "$DEVICE" mkpart primary ext4 "$CONTAINERD_SIZE" 100%

  # Tell kernel to re-read partition table and wait for udev.
  partprobe "$DEVICE"
  # The disk is already wiped at this point, so a settle timeout must not
  # abort the script; the check below decides whether we can continue.
  udevadm settle --timeout=10 || true

  # Give the partition nodes a few more seconds to show up on slow udev.
  for _ in 1 2 3 4 5; do
    if [[ -b "$PART1" && -b "$PART2" ]]; then
      break
    fi
    sleep 1
  done

  # Verify partitions appeared
  if [[ ! -b "$PART1" || ! -b "$PART2" ]]; then
    error "Partitions not found after partprobe: $PART1, $PART2"
    exit 1
  fi

  # wipefs above only clears signatures on the whole-disk device. A stale
  # filesystem signature inside the new partition would make mkfs.ext4 ask
  # for confirmation on a tty, so -F is passed to keep this non-interactive.
  log "Formatting ${PART1} as ext4 (containerd, ${CONTAINERD_SIZE})..."
  mkfs.ext4 -F -L "$CONTAINERD_LABEL" -q "$PART1"

  log "Formatting ${PART2} as ext4 (longhorn, remaining)..."
  mkfs.ext4 -F -L "$LONGHORN_LABEL" -q "$PART2"

  # Mount the new partitions
  mkdir -p "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"
  mount "$PART1" "$CONTAINERD_MOUNT"
  mount "$PART2" "$LONGHORN_MOUNT"

  # Create marker files with metadata
  for mount_point in "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"; do
    cat > "${mount_point}/${MARKER_FILE}" <<EOF
# Netboot storage marker - DO NOT DELETE
formatted_date=$(date -Iseconds)
formatted_by=setup-node-storage
hostname=$(hostname)
device=$DEVICE
EOF
  done

  log "Storage formatted and mounted successfully"
  for mount_point in "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"; do
    # Second column of the last df line is the filesystem size.
    log "  $mount_point: $(df -h "$mount_point" | tail -n 1 | awk '{print $2}')"
  done
}
|
||||
|
||||
# Check for partition table on $DEVICE.
# Method 1: blkid reports a PTTYPE value when it recognises a partition table.
# Method 2 (fallback): `parted print` exits successfully when it can read a
#   disk label; only its exit status is used, its output is discarded.
# Globals: DEVICE (read)
# Returns: 0 if a partition table was detected, 1 otherwise
has_partition_table() {
  local pttype

  # blkid exits non-zero when the tag is absent; treat that as "empty" so
  # the function also works when called outside a conditional under `set -e`.
  pttype=$(blkid -o value -s PTTYPE "$DEVICE" 2>/dev/null) || pttype=""
  if [[ -n "$pttype" ]]; then
    return 0 # has partition table
  fi

  # Fallback: check if parted can read it
  if parted -s "$DEVICE" print &>/dev/null; then
    return 0 # has partition table
  fi

  return 1 # no partition table
}
|
||||
|
||||
if ! has_partition_table; then
  # No partition table - this is a fresh drive, auto-format without prompting.
  log "Empty drive detected (no partition table) - auto-formatting..."
  format_storage
  exit 0
fi
|
||||
|
||||
# Has partition table - check if it's ours: both filesystem labels must exist.
if blkid -L "$CONTAINERD_LABEL" &>/dev/null && blkid -L "$LONGHORN_LABEL" &>/dev/null; then
  # Check for marker file (belt and suspenders).
  # Use a temporary mount so the check does not leave a dangling mount.
  TEMP_MOUNT=$(mktemp -d)
  if mount -L "$CONTAINERD_LABEL" "$TEMP_MOUNT" 2>/dev/null; then
    if [[ -f "${TEMP_MOUNT}/${MARKER_FILE}" ]]; then
      marker_state="present"
    else
      marker_state="missing"
    fi

    # Cleanup is best-effort: a failed unmount must not stop the real mount.
    if umount "$TEMP_MOUNT"; then
      rmdir "$TEMP_MOUNT" || true
    else
      warn "Could not unmount temporary mount point $TEMP_MOUNT"
    fi

    if [[ "$marker_state" == "present" ]]; then
      log "Storage already configured (found labels and marker)"
    else
      # Has our labels but no marker - probably ours, mount it
      warn "Found labels but no marker file - assuming configured"
    fi
    mount_storage
    exit 0
  fi

  # Labels exist but the filesystem could not be mounted for verification;
  # say so before falling through to the interactive prompt.
  rmdir "$TEMP_MOUNT" 2>/dev/null || true
  warn "Found storage labels but could not mount the containerd partition to verify it"
fi
|
||||
|
||||
# Has partitions but not ours - this could contain data!
warn "NVMe has existing partitions but no netboot labels."
warn "This drive may contain important data!"
echo ""
# Informational listing only; a failure here must not abort the prompt.
lsblk "$DEVICE" || true
echo ""

# Prompt on console with timeout
echo -e "${CYAN}========================================${NC}"
echo -e "${CYAN}  Press ENTER within ${PROMPT_TIMEOUT}s to format   ${NC}"
echo -e "${CYAN}  Or wait to skip (safe default)        ${NC}"
echo -e "${CYAN}========================================${NC}"
echo ""

# read fails on timeout and on EOF (no console attached); both mean "skip".
response=""
if ! read -r -t "$PROMPT_TIMEOUT" -p "Format $DEVICE? [press ENTER to confirm] " response; then
  echo ""
  warn "Timeout - skipping storage setup (drive left untouched)"
  warn "To format manually, reboot and press ENTER when prompted"
  exit 0
fi

# Only a bare ENTER (or an explicit yes) confirms the destructive format;
# anything else, such as "n", leaves the drive untouched.
case "$response" in
  "" | [yY] | [yY][eE][sS]) ;;
  *)
    echo ""
    warn "Answer '$response' is not a confirmation - skipping storage setup (drive left untouched)"
    warn "To format manually, reboot and press ENTER when prompted"
    exit 0
    ;;
esac

echo ""
warn "Formatting in 5 seconds... Ctrl+C to abort"
sleep 5
format_storage
|
||||
Reference in New Issue
Block a user