diff --git a/build-image.sh b/build-image.sh
index 69572fc..ce7b057 100755
--- a/build-image.sh
+++ b/build-image.sh
@@ -130,7 +130,8 @@ apt-get install -y \
     conntrack \
     socat \
     ethtool \
-    nfs-common
+    nfs-common \
+    open-iscsi
 
 # Container runtime prerequisites
 apt-get install -y \
@@ -156,8 +157,9 @@ rm -rf /var/lib/apt/lists/*
 rm -rf /tmp/*
 rm -rf /var/tmp/*
 
-# Configure hostname (will be overridden by netplan)
-echo "k3s-node" > /etc/hostname
+# Don't set static hostname - let DHCP provide it via networkd
+# Empty /etc/hostname allows transient hostname from DHCP
+: > /etc/hostname
 
 # Configure network with netplan
 cat > /etc/netplan/01-netcfg.yaml </dev/null; do
+        count=$((count + 1))
+        if [ $count -ge $MAX_RETRIES ]; then
+            error "Network not available after $MAX_RETRIES attempts"
+            return 1
+        fi
+        warn "Waiting for network... ($count/$MAX_RETRIES)"
+        sleep $RETRY_DELAY
+    done
+    log "Network is up"
+}
+
+# Fetch join token from server.
+# Polls $TOKEN_URL until it returns a non-empty body, giving up after
+# $MAX_RETRIES failed attempts spaced $RETRY_DELAY seconds apart.
+# Outputs: the token on stdout. Diagnostics are forced to stderr because the
+#          caller captures this function's stdout with $(fetch_token).
+# Returns: 0 on success, 1 once the retry budget is exhausted.
+fetch_token() {
+    local count=0
+    local token=""
+
+    while [ -z "$token" ]; do
+        # --max-time bounds each attempt so a stalled connection cannot hang the loop
+        token=$(curl -sf --max-time 10 "$TOKEN_URL" 2>/dev/null || true)
+        if [ -z "$token" ]; then
+            count=$((count + 1))
+            if [ "$count" -ge "$MAX_RETRIES" ]; then
+                error "Failed to fetch token after $MAX_RETRIES attempts" >&2
+                return 1
+            fi
+            warn "Waiting for token... ($count/$MAX_RETRIES)" >&2
+            sleep "$RETRY_DELAY"
+        fi
+    done
+
+    echo "$token"
+}
+
+# Main
+log "Starting K3s agent join process"
+
+wait_for_network
+
+log "Fetching join token from $TOKEN_URL"
+# Test the call itself so the failure message is reached even under `set -e`
+if ! K3S_TOKEN=$(fetch_token) || [ -z "$K3S_TOKEN" ]; then
+    error "Failed to get token, exiting"
+    exit 1
+fi
+log "Token acquired"
+
+log "Starting K3s agent (server: $K3S_URL)"
+# Hand the token over via the environment (k3s reads $K3S_TOKEN) so the
+# secret does not appear in the process argument list.
+export K3S_TOKEN
+exec /usr/local/bin/k3s agent \
+    --server="$K3S_URL" \
+    --node-name="$(hostname)"
diff --git a/files/k3s-join.service b/files/k3s-join.service
new file mode 100644
index 0000000..01e7e71
--- /dev/null
+++ b/files/k3s-join.service
@@ -0,0 +1,26 @@
+[Unit]
+Description=K3s Agent Join Service
+Documentation=file:///usr/local/bin/k3s-join
+
+# Run after network, storage, hostname setup and the container runtime are ready
+After=network-online.target setup-node-storage.service set-hostname-from-dhcp.service containerd.service
+Wants=network-online.target
+Requires=containerd.service
+
+[Service]
+Type=exec
+ExecStart=/usr/local/bin/k3s-join
+Restart=on-failure
+RestartSec=30
+
+# Environment
+Environment="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+# Resource limits
+LimitNOFILE=1048576
+LimitNPROC=infinity
+LimitCORE=infinity
+TasksMax=infinity
+
+[Install]
+WantedBy=multi-user.target
diff --git a/files/set-hostname-from-dhcp b/files/set-hostname-from-dhcp
new file mode 100644
index 0000000..75c5c1e
--- /dev/null
+++ b/files/set-hostname-from-dhcp
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Set hostname from DHCP lease
+# Runs before the k3s agent units (k3s-agent / k3s-join) to ensure proper node name
+
+set -euo pipefail
+
+log() { echo "[hostname] $*"; logger -t set-hostname "$*"; }
+
+# Wait for DHCP lease
+MAX_WAIT=60
+for ((attempt = 1; attempt <= MAX_WAIT; attempt++)); do
+    # Check for lease files from systemd-networkd
+    for lease in /run/systemd/netif/leases/*; do
+        if [ -f "$lease" ]; then
+            # Not stored in HOSTNAME: bash maintains that variable itself
+            dhcp_hostname=$(grep -m1 -oP '^HOSTNAME=\K.*' "$lease" 2>/dev/null || true)
+            if [ -n "$dhcp_hostname" ]; then
+                log "Found hostname in DHCP lease: $dhcp_hostname"
+                hostnamectl set-hostname "$dhcp_hostname"
+                log "Hostname set to: $(hostname)"
+                exit 0
+            fi
+        fi
+    done
+    sleep 1
+done
+
+log "Warning: No DHCP hostname found after ${MAX_WAIT}s, using default"
+exit 0
diff --git a/files/set-hostname-from-dhcp.service b/files/set-hostname-from-dhcp.service
new file mode 100644
index 0000000..8a8021c
--- /dev/null
+++ b/files/set-hostname-from-dhcp.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=Set hostname from DHCP lease
+Documentation=file:///usr/local/bin/set-hostname-from-dhcp
+After=network-online.target systemd-networkd.service
+Wants=network-online.target
+# k3s-join passes $(hostname) as --node-name, so it must start after this unit as well
+Before=k3s-agent.service k3s-join.service
+
+[Service]
+Type=oneshot
+ExecStart=/usr/local/bin/set-hostname-from-dhcp
+RemainAfterExit=yes
+TimeoutStartSec=90
+
+[Install]
+WantedBy=multi-user.target
diff --git a/files/setup-node-storage b/files/setup-node-storage
index e69fda7..55e3bf9 100644
--- a/files/setup-node-storage
+++ b/files/setup-node-storage
@@ -69,12 +69,51 @@ mount_storage() {
         }
     fi
 
+    # K3s persistence: bind mount agent data and node identity from NVMe
+    # This allows the node to survive reboots without re-registering
+    setup_k3s_persistence
+
     log "Storage mounted:"
     log " $CONTAINERD_MOUNT: $(df -h "$CONTAINERD_MOUNT" | tail -1 | awk '{print $2}')"
     log " $LONGHORN_MOUNT: $(df -h "$LONGHORN_MOUNT" | tail -1 | awk '{print $2}')"
     return 0
 }
 
+# Setup K3s persistence directories
+# Bind mounts NVMe directories to k3s paths so node identity survives reboots
+# Globals: CONTAINERD_MOUNT (read); safe to call repeatedly (skips existing mountpoints)
+setup_k3s_persistence() {
+    # K3s agent data (containerd, kubelet certs, etc.)
+    # Uses overlayfs internally, so must be on real filesystem, not overlay
+    local k3s_agent="/var/lib/rancher/k3s/agent"
+    local k3s_agent_data="$CONTAINERD_MOUNT/k3s-agent"
+    mkdir -p "$k3s_agent_data" "$k3s_agent"
+    if ! mountpoint -q "$k3s_agent"; then
+        mount --bind "$k3s_agent_data" "$k3s_agent"
+        log " $k3s_agent: bind mount to NVMe"
+    fi
+
+    # K3s node identity (password file)
+    # Must persist across reboots or node will be rejected
+    local k3s_node="/etc/rancher/node"
+    local k3s_node_data="$CONTAINERD_MOUNT/k3s-node"
+    mkdir -p "$k3s_node_data" "$k3s_node"
+    if ! mountpoint -q "$k3s_node"; then
+        mount --bind "$k3s_node_data" "$k3s_node"
+        log " $k3s_node: bind mount to NVMe"
+    fi
+
+    # Kubelet data (pod volumes, projected tokens, etc.)
+    # Must be on NVMe so kubelet reports real disk capacity, not the 2G tmpfs overlay
+    local kubelet_dir="/var/lib/kubelet"
+    local kubelet_data="$CONTAINERD_MOUNT/kubelet"
+    mkdir -p "$kubelet_data" "$kubelet_dir"
+    if ! mountpoint -q "$kubelet_dir"; then
+        mount --bind "$kubelet_data" "$kubelet_dir"
+        log " $kubelet_dir: bind mount to NVMe"
+    fi
+}
+
 # Function to format the drive
 format_storage() {
     log "Partitioning $DEVICE..."
@@ -116,6 +155,9 @@ device=$DEVICE
 EOF
     done
 
+    # K3s persistence: bind mount agent data and node identity from NVMe
+    setup_k3s_persistence
+
     log "Storage formatted and mounted successfully"
     log " $CONTAINERD_MOUNT: $(df -h "$CONTAINERD_MOUNT" | tail -1 | awk '{print $2}')"
     log " $LONGHORN_MOUNT: $(df -h "$LONGHORN_MOUNT" | tail -1 | awk '{print $2}')"