Add K3s agent setup with NVMe-backed persistent storage

Bind-mount K3s agent data, node identity, and kubelet dirs from
NVMe so container image cache and node registration survive reboots
on the diskless netboot nodes. Includes K3s binary download, agent
systemd service, DHCP hostname resolution, and open-iscsi for
Longhorn iSCSI support.
This commit is contained in:
2026-03-01 19:11:12 +01:00
parent 3f191d8f93
commit 492cc8abbc
8 changed files with 250 additions and 3 deletions

View File

@@ -130,7 +130,8 @@ apt-get install -y \
conntrack \
socat \
ethtool \
nfs-common
nfs-common \
open-iscsi
# Container runtime prerequisites
apt-get install -y \
@@ -156,8 +157,9 @@ rm -rf /var/lib/apt/lists/*
rm -rf /tmp/*
rm -rf /var/tmp/*
# Configure hostname (will be overridden by netplan)
echo "k3s-node" > /etc/hostname
# Don't set static hostname - let DHCP provide it via networkd
# Empty /etc/hostname allows transient hostname from DHCP
echo "" > /etc/hostname
# Configure network with netplan
cat > /etc/netplan/01-netcfg.yaml <<EOF
@@ -284,6 +286,34 @@ mkdir -p "$BUILD_DIR/rootfs/etc/systemd/system/multi-user.target.wants"
ln -sf /etc/systemd/system/setup-node-storage.service \
"$BUILD_DIR/rootfs/etc/systemd/system/multi-user.target.wants/setup-node-storage.service"
# Install DHCP hostname service
# Ships the helper script plus its unit into the image and enables the unit
# by linking it into multi-user.target.wants.
echo "Installing DHCP hostname service..."
cp "$FILES_DIR/set-hostname-from-dhcp" \
  "$BUILD_DIR/rootfs/usr/local/bin/"
chmod +x \
  "$BUILD_DIR/rootfs/usr/local/bin/set-hostname-from-dhcp"
cp "$FILES_DIR/set-hostname-from-dhcp.service" \
  "$BUILD_DIR/rootfs/etc/systemd/system/"
# The link target is the path as seen from inside the booted image.
ln -sf \
  /etc/systemd/system/set-hostname-from-dhcp.service \
  "$BUILD_DIR/rootfs/etc/systemd/system/multi-user.target.wants/set-hostname-from-dhcp.service"
# Download and install K3s binary
# The pinned release is fetched from GitHub and checked against the SHA-256
# published with the same release before it is made executable. This catches
# truncated or corrupted downloads; it does not protect against a compromised
# release origin.
# NOTE(review): the asset named "k3s" and "sha256sum-amd64.txt" are assumed to
# be the amd64 artifacts - adjust both if the nodes use another architecture.
echo "Downloading K3s binary..."
K3S_VERSION="v1.34.3+k3s1"
K3S_RELEASE_URL="https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}"
K3S_BIN="$BUILD_DIR/rootfs/usr/local/bin/k3s"
# -S keeps curl quiet on success but prints the reason when it fails.
if ! curl -fsSL --retry 3 "${K3S_RELEASE_URL}/k3s" -o "$K3S_BIN"; then
  echo "ERROR: failed to download K3s $K3S_VERSION" >&2
  exit 1
fi
K3S_SHA256_EXPECTED=$(curl -fsSL --retry 3 "${K3S_RELEASE_URL}/sha256sum-amd64.txt" \
  | awk '$2 == "k3s" || $2 == "*k3s" { print $1; exit }') || K3S_SHA256_EXPECTED=""
K3S_SHA256_ACTUAL=$(sha256sum "$K3S_BIN" | awk '{ print $1 }')
if [ -z "$K3S_SHA256_EXPECTED" ] || [ "$K3S_SHA256_EXPECTED" != "$K3S_SHA256_ACTUAL" ]; then
  echo "ERROR: K3s $K3S_VERSION checksum verification failed" >&2
  echo "  expected: ${K3S_SHA256_EXPECTED:-<unavailable>}" >&2
  echo "  actual:   $K3S_SHA256_ACTUAL" >&2
  # Do not leave an unverified binary inside the image.
  rm -f -- "$K3S_BIN"
  exit 1
fi
chmod +x "$K3S_BIN"
echo "K3s $K3S_VERSION installed"
# Install K3s agent service
echo "Installing K3s agent service..."
# Create the K3s directories in the image first so they exist at boot
# (the node storage setup bind-mounts NVMe directories over
# /etc/rancher/node and /var/lib/rancher/k3s/agent at runtime).
mkdir -p "$BUILD_DIR/rootfs/etc/rancher/k3s"
mkdir -p "$BUILD_DIR/rootfs/etc/rancher/node"
mkdir -p "$BUILD_DIR/rootfs/var/lib/rancher/k3s/agent"
cp "$FILES_DIR/k3s-agent.service" "$BUILD_DIR/rootfs/etc/systemd/system/"
cp "$FILES_DIR/k3s-agent.env" "$BUILD_DIR/rootfs/etc/rancher/k3s/"
# The env file carries the cluster join token, so restrict it to its owner.
# systemd reads EnvironmentFile= itself, so the service is unaffected.
# NOTE(review): assumes the image is built as root so the owner is root.
chmod 600 "$BUILD_DIR/rootfs/etc/rancher/k3s/k3s-agent.env"
# Enable the service
ln -sf /etc/systemd/system/k3s-agent.service \
  "$BUILD_DIR/rootfs/etc/systemd/system/multi-user.target.wants/k3s-agent.service"
# Build initramfs while /proc, /sys and /dev are still mounted
echo "Building custom netboot initramfs..."
KERNEL_VERSION=$(ls -1 $BUILD_DIR/rootfs/boot/vmlinuz-* | sed 's|.*/vmlinuz-||' | head -1)

4
files/k3s-agent.env Normal file
View File

@@ -0,0 +1,4 @@
# K3s agent configuration
# Server URL and token for cluster join
# Loaded by k3s-agent.service through EnvironmentFile=.
# NOTE(review): K3S_TOKEN is a cluster join secret stored here in plain text
# and therefore kept in version control - confirm this is intended; consider
# rotating it and providing it outside the repository.
K3S_URL="https://192.168.100.1:6443"
K3S_TOKEN="K106e2ea6914f7a019d1222c1fdd19c5065978377364701f60eb1f2a585e8c3924b::server:0a15c4d7a13df65b066f5b8eff710ecd"

25
files/k3s-agent.service Normal file
View File

@@ -0,0 +1,25 @@
# systemd unit that runs the K3s agent on a netboot node.
[Unit]
Description=Lightweight Kubernetes (K3s Agent)
Documentation=https://k3s.io
# Start only after the network is online, node storage is set up and the
# hostname has been taken from DHCP.
After=network-online.target setup-node-storage.service set-hostname-from-dhcp.service
Wants=network-online.target
# Hard dependencies: the agent is not started if either of these units fails.
Requires=setup-node-storage.service set-hostname-from-dhcp.service
[Service]
# k3s signals readiness to systemd itself.
Type=notify
# Leading "-" makes a missing env file non-fatal; it supplies K3S_URL/K3S_TOKEN.
EnvironmentFile=-/etc/rancher/k3s/k3s-agent.env
# Load kernel modules before the agent starts.
# NOTE(review): without a leading "-" a modprobe failure prevents the agent
# from starting - confirm that is intended.
ExecStartPre=/sbin/modprobe br_netfilter
ExecStartPre=/sbin/modprobe overlay
ExecStart=/usr/local/bin/k3s agent
# On stop only the main k3s process is killed, not the rest of the cgroup.
KillMode=process
Delegate=yes
# Raised resource limits for the agent and its children.
LimitNOFILE=1048576
LimitNPROC=infinity
LimitCORE=infinity
TasksMax=infinity
# 0 disables the start timeout.
TimeoutStartSec=0
# Always restart the agent, 5 s after it exits.
Restart=always
RestartSec=5s
[Install]
WantedBy=multi-user.target

78
files/k3s-join Normal file
View File

@@ -0,0 +1,78 @@
#!/bin/bash
# K3s agent join script for netboot nodes
# Fetches token from server and starts k3s agent
#
# Runs at boot via k3s-join.service
set -euo pipefail

# Cluster endpoints. These are constants for the whole script, so they are
# marked readonly to catch accidental reassignment.
readonly K3S_SERVER="192.168.100.1"
readonly K3S_URL="https://${K3S_SERVER}:6443"
# NOTE(review): the join token is fetched over plain HTTP - confirm the
# provisioning network is trusted.
readonly TOKEN_URL="http://${K3S_SERVER}:8800/k3s-token"

# Retry policy shared by the network and token wait loops:
# up to MAX_RETRIES attempts, RETRY_DELAY seconds apart.
readonly MAX_RETRIES=30
readonly RETRY_DELAY=10

# Colors for console output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'
# Logging helpers: print a colored, tagged line to the console and mirror the
# plain message to syslog via logger (tag k3s-join; warn/error set priority).
# The console line goes to stderr so that status messages never end up in the
# captured stdout of a function such as fetch_token.
log() {
  echo -e "${GREEN}[k3s-join]${NC} $*" >&2
  logger -t k3s-join "$*"
}
warn() {
  echo -e "${YELLOW}[k3s-join]${NC} $*" >&2
  logger -t k3s-join -p warning "$*"
}
error() {
  echo -e "${RED}[k3s-join]${NC} $*" >&2
  logger -t k3s-join -p err "$*"
}
# Block until the K3s server answers a ping.
# Globals:  K3S_SERVER, MAX_RETRIES, RETRY_DELAY (read)
# Returns:  non-zero (1) once MAX_RETRIES pings in a row have failed
wait_for_network() {
  local attempts=0
  until ping -c1 -W1 "$K3S_SERVER" &>/dev/null; do
    attempts=$((attempts + 1))
    # Give up once the retry budget is used up.
    if [ "$attempts" -ge "$MAX_RETRIES" ]; then
      error "Network not available after $MAX_RETRIES attempts"
      return 1
    fi
    warn "Waiting for network... ($attempts/$MAX_RETRIES)"
    sleep "$RETRY_DELAY"
  done
  log "Network is up"
}
# Fetch join token from server
# Polls TOKEN_URL until it returns a non-empty body.
# Globals:  TOKEN_URL, MAX_RETRIES, RETRY_DELAY (read)
# Outputs:  the token, and nothing else, on stdout
# Returns:  1 when no token was obtained after MAX_RETRIES attempts
fetch_token() {
  local count=0
  local token=""
  while [ -z "$token" ]; do
    # curl failures are expected while the server is still coming up; the
    # empty result is what drives the retry.
    token=$(curl -sf "$TOKEN_URL" 2>/dev/null || true)
    if [ -z "$token" ]; then
      count=$((count + 1))
      # Callers capture this function's stdout as the token, so every status
      # message is forced to stderr regardless of where the helpers write.
      if [ "$count" -ge "$MAX_RETRIES" ]; then
        error "Failed to fetch token after $MAX_RETRIES attempts" >&2
        return 1
      fi
      warn "Waiting for token... ($count/$MAX_RETRIES)" >&2
      sleep "$RETRY_DELAY"
    fi
  done
  printf '%s\n' "$token"
}
# Main
# Wait for the server, obtain the join token, then replace this process with
# the K3s agent.
log "Starting K3s agent join process"
wait_for_network
log "Fetching join token from $TOKEN_URL"
# Test the call directly: under `set -e` a bare assignment would abort the
# script before the error below could be reported.
if ! K3S_TOKEN=$(fetch_token) || [ -z "$K3S_TOKEN" ]; then
  error "Failed to get token, exiting"
  exit 1
fi
log "Token acquired"
log "Starting K3s agent (server: $K3S_URL)"
# Hand server URL and token to k3s through its K3S_URL / K3S_TOKEN environment
# variables instead of argv, so the secret does not show up in `ps` output.
export K3S_URL K3S_TOKEN
exec /usr/local/bin/k3s agent \
  --node-name="$(hostname)"

26
files/k3s-join.service Normal file
View File

@@ -0,0 +1,26 @@
# systemd unit that runs /usr/local/bin/k3s-join at boot.
[Unit]
Description=K3s Agent Join Service
Documentation=file:///usr/local/bin/k3s-join
# Run after network and storage are ready
# NOTE(review): the script passes $(hostname) as the node name, but this unit
# is not ordered after set-hostname-from-dhcp.service - confirm.
After=network-online.target setup-node-storage.service
Wants=network-online.target
# NOTE(review): Requires= does not imply ordering and there is no matching
# After=containerd.service; also verify that a containerd.service unit exists
# on these nodes.
Requires=containerd.service
[Service]
Type=exec
ExecStart=/usr/local/bin/k3s-join
# Retry 30 s after a non-zero exit.
Restart=on-failure
RestartSec=30
# Environment
Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
# Resource limits (these raise limits; they do not restrict the service)
LimitNOFILE=1048576
LimitNPROC=infinity
LimitCORE=infinity
TasksMax=infinity
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,28 @@
#!/bin/bash
# Set hostname from DHCP lease
# Runs before k3s-agent to ensure proper node name
#
# Polls the systemd-networkd lease files once per second for up to MAX_WAIT
# seconds and applies the first HOSTNAME= value found. Exits 0 even when no
# hostname is found, so the current hostname is simply left in place.
set -euo pipefail

# Print a tagged line and mirror the message to syslog.
log() { echo "[hostname] $*"; logger -t set-hostname "$*"; }

# Wait for DHCP lease
MAX_WAIT=60
for ((attempt = 1; attempt <= MAX_WAIT; attempt++)); do
  # Check for lease files from systemd-networkd
  for lease in /run/systemd/netif/leases/*; do
    # An unmatched glob stays literal; skip anything that is not a file.
    [ -f "$lease" ] || continue
    # Use a private variable: HOSTNAME is maintained by bash itself.
    # -m1 keeps only the first match; a lease without the key is not an error.
    dhcp_hostname=$(grep -m1 -oP '^HOSTNAME=\K.*' "$lease" 2>/dev/null || true)
    if [ -n "$dhcp_hostname" ]; then
      log "Found hostname in DHCP lease: $dhcp_hostname"
      # "--" stops option parsing in case the lease value starts with "-".
      hostnamectl set-hostname -- "$dhcp_hostname"
      log "Hostname set to: $(hostname)"
      exit 0
    fi
  done
  sleep 1
done

log "Warning: No DHCP hostname found after ${MAX_WAIT}s, using default"
exit 0

View File

@@ -0,0 +1,15 @@
# systemd unit that runs the DHCP hostname helper once per boot.
[Unit]
Description=Set hostname from DHCP lease
Documentation=file:///usr/local/bin/set-hostname-from-dhcp
After=network-online.target systemd-networkd.service
Wants=network-online.target
# Must finish before the K3s agent starts.
Before=k3s-agent.service
[Service]
# One-shot helper; RemainAfterExit keeps the unit active after the script exits.
Type=oneshot
ExecStart=/usr/local/bin/set-hostname-from-dhcp
RemainAfterExit=yes
# Longer than the helper's own 60 s polling window (MAX_WAIT in the script).
TimeoutStartSec=90
[Install]
WantedBy=multi-user.target

View File

@@ -69,12 +69,50 @@ mount_storage() {
}
fi
# K3s persistence: bind mount agent data and node identity from NVMe
# This allows the node to survive reboots without re-registering
setup_k3s_persistence
log "Storage mounted:"
log " $CONTAINERD_MOUNT: $(df -h "$CONTAINERD_MOUNT" | tail -1 | awk '{print $2}')"
log " $LONGHORN_MOUNT: $(df -h "$LONGHORN_MOUNT" | tail -1 | awk '{print $2}')"
return 0
}
# Bind-mount one NVMe-backed directory over a K3s path.
# Arguments: $1 - backing directory on NVMe, $2 - mount target
# Returns:   0 when the target is (already) mounted, 1 on mkdir/mount failure
_k3s_bind_mount() {
  local data_dir=$1
  local target=$2
  if ! mkdir -p "$data_dir" "$target"; then
    log " $target: could not create directories"
    return 1
  fi
  # Already mounted (e.g. a repeated call): nothing to do.
  if mountpoint -q "$target"; then
    return 0
  fi
  if ! mount --bind "$data_dir" "$target"; then
    log " $target: bind mount FAILED"
    return 1
  fi
  log " $target: bind mount to NVMe"
}

# Setup K3s persistence directories
# Bind mounts NVMe directories to k3s paths so node identity survives reboots
# Globals: CONTAINERD_MOUNT (read); K3S_AGENT, K3S_AGENT_DATA, K3S_NODE,
#          K3S_NODE_DATA, KUBELET_DIR, KUBELET_DATA (written)
# Returns: 0 when all three paths are mounted, 1 if any of them failed
#          (every mount is still attempted).
setup_k3s_persistence() {
  local rc=0

  # K3s agent data (containerd, kubelet certs, etc.)
  # Uses overlayfs internally, so must be on real filesystem, not overlay
  K3S_AGENT="/var/lib/rancher/k3s/agent"
  K3S_AGENT_DATA="$CONTAINERD_MOUNT/k3s-agent"
  _k3s_bind_mount "$K3S_AGENT_DATA" "$K3S_AGENT" || rc=1

  # K3s node identity (password file)
  # Must persist across reboots or node will be rejected
  K3S_NODE="/etc/rancher/node"
  K3S_NODE_DATA="$CONTAINERD_MOUNT/k3s-node"
  _k3s_bind_mount "$K3S_NODE_DATA" "$K3S_NODE" || rc=1

  # Kubelet data (pod volumes, projected tokens, etc.)
  # Must be on NVMe so kubelet reports real disk capacity, not the 2G tmpfs overlay
  KUBELET_DIR="/var/lib/kubelet"
  KUBELET_DATA="$CONTAINERD_MOUNT/kubelet"
  _k3s_bind_mount "$KUBELET_DATA" "$KUBELET_DIR" || rc=1

  return "$rc"
}
# Function to format the drive
format_storage() {
log "Partitioning $DEVICE..."
@@ -116,6 +154,9 @@ device=$DEVICE
EOF
done
# K3s persistence: bind mount agent data and node identity from NVMe
setup_k3s_persistence
log "Storage formatted and mounted successfully"
log " $CONTAINERD_MOUNT: $(df -h "$CONTAINERD_MOUNT" | tail -1 | awk '{print $2}')"
log " $LONGHORN_MOUNT: $(df -h "$LONGHORN_MOUNT" | tail -1 | awk '{print $2}')"