Add NVMe storage auto-setup, sops secrets, fix SSH permissions
- setup-node-storage service auto-partitions NVMe for containerd/longhorn - Root password encrypted with sops/age, decrypted during build - Fix SSH host key permissions (0600) so sshd actually starts - Disable SSH socket activation for reliable boot - Add OPERATIONS.md with runbook - Makefile tracks source dependencies
This commit is contained in:
3
.sops.yaml
Normal file
3
.sops.yaml
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# sops creation rules: every YAML file under secrets/ is encrypted to the age
# recipient below. The recipient must be the same key that
# secrets/netboot.sops.yaml was encrypted to, otherwise newly created secrets
# cannot be decrypted with the existing private key.
creation_rules:
  - path_regex: secrets/.*\.yaml$
    age: age1gausnystsln7fpenw7arw7x79xe22z697jnauj38npy0usayqqxqc7td2y
|
||||||
17
Makefile
17
Makefile
@@ -1,9 +1,19 @@
|
|||||||
.PHONY: build deploy clean help
|
# Command-style targets that do not correspond to files. "build" is only an
# alias for the image artifact, so it must be phony too; otherwise a file or
# directory named "build" could shadow it.
.PHONY: build deploy clean help check-nas all
|
||||||
|
|
||||||
NAS_HOST=phoenix
|
NAS_HOST=phoenix
|
||||||
NAS_PATH=/srv/netboot
|
NAS_PATH=/srv/netboot
|
||||||
SCRIPT_DIR=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
|
SCRIPT_DIR=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
|
||||||
|
|
||||||
|
# Source files that trigger a rebuild.
#
# The wildcards below also match sub-directories (e.g. initramfs/<dir>).
# A directory must not be a normal prerequisite: its mtime changes whenever
# an entry is created or removed inside it (including editor temp files),
# which would trigger a full image rebuild. Directories are therefore
# detected with the "<path>/." wildcard idiom and filtered out.
# Trade-off: deleting a source file no longer forces a rebuild by itself.
BUILD_SOURCE_CANDIDATES := $(SCRIPT_DIR)/build-image.sh \
	$(wildcard $(SCRIPT_DIR)/initramfs/*) \
	$(wildcard $(SCRIPT_DIR)/initramfs/*/*) \
	$(wildcard $(SCRIPT_DIR)/files/*) \
	$(wildcard $(SCRIPT_DIR)/secrets/*.yaml)
BUILD_SOURCE_DIRS := $(patsubst %/.,%,$(wildcard $(addsuffix /.,$(BUILD_SOURCE_CANDIDATES))))
BUILD_SOURCES := $(filter-out $(BUILD_SOURCE_DIRS),$(BUILD_SOURCE_CANDIDATES))
|
||||||
|
|
||||||
|
# Build artifact (used as target for dependency tracking)
|
||||||
|
BUILD_ARTIFACT := $(SCRIPT_DIR)/http/filesystem.squashfs
|
||||||
|
|
||||||
help:
|
help:
|
||||||
@echo "Netboot image build and deployment"
|
@echo "Netboot image build and deployment"
|
||||||
@echo ""
|
@echo ""
|
||||||
@@ -23,7 +33,8 @@ check-nas:
|
|||||||
@echo "Checking NAS connectivity..."
|
@echo "Checking NAS connectivity..."
|
||||||
@ping -c 1 $(NAS_HOST) > /dev/null 2>&1 && echo "✓ NAS is reachable" || (echo "✗ Cannot reach $(NAS_HOST)"; exit 1)
|
@ping -c 1 $(NAS_HOST) > /dev/null 2>&1 && echo "✓ NAS is reachable" || (echo "✗ Cannot reach $(NAS_HOST)"; exit 1)
|
||||||
|
|
||||||
build:
|
# Build depends on source files - only rebuilds if sources changed
|
||||||
|
$(BUILD_ARTIFACT): $(BUILD_SOURCES)
|
||||||
@echo "Building netboot image..."
|
@echo "Building netboot image..."
|
||||||
@echo "This will take 15-30 minutes..."
|
@echo "This will take 15-30 minutes..."
|
||||||
sudo $(SCRIPT_DIR)/build-image.sh
|
sudo $(SCRIPT_DIR)/build-image.sh
|
||||||
@@ -32,6 +43,8 @@ build:
|
|||||||
@echo "Artifacts ready in $(SCRIPT_DIR)/http/"
|
@echo "Artifacts ready in $(SCRIPT_DIR)/http/"
|
||||||
@du -sh $(SCRIPT_DIR)/http/*
|
@du -sh $(SCRIPT_DIR)/http/*
|
||||||
|
|
||||||
|
build: $(BUILD_ARTIFACT)
|
||||||
|
|
||||||
deploy: check-nas
|
deploy: check-nas
|
||||||
@echo "Deploying to NAS ($(NAS_HOST):$(NAS_PATH))..."
|
@echo "Deploying to NAS ($(NAS_HOST):$(NAS_PATH))..."
|
||||||
@echo "Syncing http/ directory..."
|
@echo "Syncing http/ directory..."
|
||||||
|
|||||||
333
OPERATIONS.md
Normal file
333
OPERATIONS.md
Normal file
@@ -0,0 +1,333 @@
|
|||||||
|
# Netboot Operations Guide
|
||||||
|
|
||||||
|
This document covers day-to-day operations for the netboot K3s cluster system.
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build new image (15-30 min, requires sudo)
|
||||||
|
cd /home/lindahl/git/netboot
|
||||||
|
sudo ./build-image.sh
|
||||||
|
make deploy
|
||||||
|
|
||||||
|
# Rebuild initramfs only (faster, ~2 min)
|
||||||
|
sudo ./rebuild-initramfs.sh
|
||||||
|
make deploy
|
||||||
|
|
||||||
|
# SSH to a node
|
||||||
|
ssh root@192.168.100.51
|
||||||
|
|
||||||
|
# Check node storage
|
||||||
|
ssh root@192.168.100.51 "lsblk && df -h /var/lib/containerd /var/lib/longhorn"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture Overview
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────┐ HTTP (8800) ┌──────────────────┐
|
||||||
|
│ Phoenix NAS │◄────────────────────►│ K3s Nodes │
|
||||||
|
│ 192.168.100.1 │ │ 192.168.100.5x │
|
||||||
|
├─────────────────┤ ├──────────────────┤
|
||||||
|
│ /srv/netboot/ │ │ RAM (overlay) │
|
||||||
|
│ http/ │ │ └─ / (root) │
|
||||||
|
│ vmlinuz │ │ NVMe (persistent)│
|
||||||
|
│ initrd-netboot.img │ ├─ containerd │
|
||||||
|
│ filesystem.squashfs │ └─ longhorn │
|
||||||
|
│ boot.ipxe │ └──────────────────┘
|
||||||
|
└─────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Boot sequence:**
|
||||||
|
1. Node PXE boots → loads iPXE
|
||||||
|
2. iPXE fetches `boot.ipxe` from phoenix
|
||||||
|
3. Downloads kernel + initramfs
|
||||||
|
4. Initramfs downloads squashfs root over HTTP
|
||||||
|
5. Mounts squashfs read-only with tmpfs overlay
|
||||||
|
6. `setup-node-storage.service` partitions/mounts local NVMe
|
||||||
|
7. System starts, K3s joins cluster
|
||||||
|
|
||||||
|
## Building Images
|
||||||
|
|
||||||
|
### Full Build
|
||||||
|
|
||||||
|
Builds everything from scratch: debootstrap, packages, initramfs, squashfs.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /home/lindahl/git/netboot
|
||||||
|
sudo ./build-image.sh
|
||||||
|
make deploy
|
||||||
|
```
|
||||||
|
|
||||||
|
**Time:** 15-30 minutes
|
||||||
|
**When to use:** Package changes, kernel updates, major configuration changes
|
||||||
|
|
||||||
|
### Initramfs-Only Rebuild
|
||||||
|
|
||||||
|
Faster rebuild when only changing boot/network logic.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo ./rebuild-initramfs.sh
|
||||||
|
make deploy
|
||||||
|
```
|
||||||
|
|
||||||
|
**Time:** ~2 minutes
|
||||||
|
**When to use:** Changes to `initramfs/` scripts or hooks
|
||||||
|
|
||||||
|
### Verify Build
|
||||||
|
|
||||||
|
Check that all components are present and valid:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./verify-image.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Secret Management
|
||||||
|
|
||||||
|
Secrets are encrypted with [sops](https://github.com/getsops/sops) using age encryption. The age key lives on phoenix.
|
||||||
|
|
||||||
|
### Encrypted Files
|
||||||
|
|
||||||
|
| File | Contents |
|
||||||
|
|------|----------|
|
||||||
|
| `secrets/netboot.sops.yaml` | Root password hash for console login |
|
||||||
|
|
||||||
|
### Viewing Secrets
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# From any machine with SSH access to phoenix
|
||||||
|
cat secrets/netboot.sops.yaml | ssh phoenix "sops -d --input-type yaml --output-type yaml /dev/stdin"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Updating Root Password
|
||||||
|
|
||||||
|
1. Generate new password hash:
|
||||||
|
```bash
|
||||||
|
ssh phoenix "echo 'newpassword' | openssl passwd -6 -stdin"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Update the encrypted file:
|
||||||
|
```bash
|
||||||
|
ssh phoenix "cd /path/to/netboot && sops secrets/netboot.sops.yaml"
|
||||||
|
# Edit root_password_hash value, save
|
||||||
|
```
|
||||||
|
|
||||||
|
Or recreate entirely:
|
||||||
|
```bash
|
||||||
|
NEW_HASH=$(ssh phoenix "echo 'newpassword' | openssl passwd -6 -stdin")
|
||||||
|
ssh phoenix "echo 'root_password_hash: \"$NEW_HASH\"' | sops --input-type yaml --output-type yaml -e --age age1gausnystsln7fpenw7arw7x79xe22z697jnauj38npy0usayqqxqc7td2y /dev/stdin" > secrets/netboot.sops.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Rebuild and deploy:
|
||||||
|
```bash
|
||||||
|
sudo ./build-image.sh
|
||||||
|
make deploy
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Reboot nodes to pick up new password
|
||||||
|
|
||||||
|
### Adding New Secrets
|
||||||
|
|
||||||
|
Edit `.sops.yaml` to add new file patterns, then create encrypted files on phoenix:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh phoenix "sops secrets/newfile.sops.yaml"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Node Storage Setup
|
||||||
|
|
||||||
|
Local NVMe is automatically partitioned on first boot by `setup-node-storage.service`.
|
||||||
|
|
||||||
|
### Partition Layout
|
||||||
|
|
||||||
|
| Partition | Size | Label | Mount Point | Purpose |
|
||||||
|
|-----------|------|-------|-------------|---------|
|
||||||
|
| nvme0n1p1 | 75GiB | containerd | /var/lib/containerd | Container images |
|
||||||
|
| nvme0n1p2 | Remaining | longhorn | /var/lib/longhorn | Distributed storage |
|
||||||
|
|
||||||
|
### Automatic Behavior
|
||||||
|
|
||||||
|
| Drive State | Action |
|
||||||
|
|-------------|--------|
|
||||||
|
| No partition table | Auto-format (no prompt) |
|
||||||
|
| Has our labels (containerd/longhorn) | Mount silently |
|
||||||
|
| Has unknown partitions | Prompt on tty1, 120s timeout, skip if no response |
|
||||||
|
|
||||||
|
### Manual Intervention
|
||||||
|
|
||||||
|
If a node has an unknown drive and you want to format it:
|
||||||
|
|
||||||
|
1. Connect to physical console (tty1)
|
||||||
|
2. Reboot the node
|
||||||
|
3. Press ENTER when prompted (within 120 seconds)
|
||||||
|
4. Wait 5 seconds (abort window)
|
||||||
|
5. Drive is formatted and mounted
|
||||||
|
|
||||||
|
### Checking Storage Status
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On node
|
||||||
|
journalctl -u setup-node-storage
|
||||||
|
cat /var/lib/containerd/.netboot-storage # marker file with metadata
|
||||||
|
lsblk /dev/nvme0n1
|
||||||
|
df -h /var/lib/containerd /var/lib/longhorn
|
||||||
|
```
|
||||||
|
|
||||||
|
## SSH Access
|
||||||
|
|
||||||
|
### Authorized Keys
|
||||||
|
|
||||||
|
Keys are baked into the image at build time. Current keys:
|
||||||
|
|
||||||
|
| Key | Source |
|
||||||
|
|-----|--------|
|
||||||
|
| `ssh-ed25519 AAAAC3...y1J` | lindahl@lindahl-Legion-5-Pro-16ACH6H |
|
||||||
|
| `ssh-ed25519 AAAA...0tX` | lindahl@phoenix.home |
|
||||||
|
|
||||||
|
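To add/remove keys, edit the `authorized_keys` heredoc (terminated by `SSHKEY`) in `build-image.sh`. Search for `SSHKEY` rather than relying on a fixed line number, since the line range shifts whenever the script changes.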
|
||||||
|
|
||||||
|
### Console Access
|
||||||
|
|
||||||
|
Root password is set for physical console login only. SSH remains pubkey-only.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Physical console or IPMI
|
||||||
|
login: root
|
||||||
|
Password: <from secrets/netboot.sops.yaml>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Node Won't Boot
|
||||||
|
|
||||||
|
1. Check phoenix HTTP server:
|
||||||
|
```bash
|
||||||
|
ssh phoenix "curl -I http://localhost:8800/boot.ipxe"
|
||||||
|
ssh phoenix "ls -lh /srv/netboot/http/"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check nginx is running:
|
||||||
|
```bash
|
||||||
|
ssh phoenix "systemctl status nginx"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Verify image integrity:
|
||||||
|
```bash
|
||||||
|
./verify-image.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Node Boots But No Network
|
||||||
|
|
||||||
|
1. Check if initramfs has network driver:
|
||||||
|
```bash
|
||||||
|
lsinitramfs http/initrd-netboot.img | grep -E "r8169|r8125"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check kernel cmdline includes `ip=dhcp`:
|
||||||
|
```bash
|
||||||
|
cat http/boot.ipxe
|
||||||
|
```
|
||||||
|
|
||||||
|
### Storage Not Mounting
|
||||||
|
|
||||||
|
1. Check service status:
|
||||||
|
```bash
|
||||||
|
ssh root@node "systemctl status setup-node-storage"
|
||||||
|
ssh root@node "journalctl -u setup-node-storage"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check if NVMe exists:
|
||||||
|
```bash
|
||||||
|
ssh root@node "lsblk"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Check labels:
|
||||||
|
```bash
|
||||||
|
ssh root@node "blkid -L containerd && blkid -L longhorn"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Overlay Filling Up
|
||||||
|
|
||||||
|
The root overlay is only 2GB. If it fills:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check what's using space
|
||||||
|
ssh root@node "du -sh /var/* | sort -h"
|
||||||
|
|
||||||
|
# Temporary files should go to NVMe or tmpfs mounts
|
||||||
|
# /tmp, /var/tmp, /var/log are separate tmpfs
|
||||||
|
```
|
||||||
|
|
||||||
|
## File Reference
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `build-image.sh` | Main build script |
|
||||||
|
| `rebuild-initramfs.sh` | Quick initramfs rebuild |
|
||||||
|
| `verify-image.sh` | Validate built image |
|
||||||
|
| `Makefile` | Build/deploy automation |
|
||||||
|
| `initramfs/` | Custom initramfs config for mkinitramfs |
|
||||||
|
| `initramfs/scripts/netboot` | HTTP root download and overlay mount |
|
||||||
|
| `files/setup-node-storage` | NVMe partitioning script |
|
||||||
|
| `files/setup-node-storage.service` | Systemd unit for storage setup |
|
||||||
|
| `secrets/netboot.sops.yaml` | Encrypted root password |
|
||||||
|
| `.sops.yaml` | Sops encryption config |
|
||||||
|
| `http/boot.ipxe` | iPXE boot configuration |
|
||||||
|
|
||||||
|
## Network Configuration
|
||||||
|
|
||||||
|
### IP Address Layout
|
||||||
|
|
||||||
|
| Range | Purpose |
|
||||||
|
|-------|---------|
|
||||||
|
| .1 | phoenix (gateway, DHCP, HTTP) |
|
||||||
|
| .2-.19 | Reserved (future infrastructure) |
|
||||||
|
| .20-.29 | Infrastructure devices |
|
||||||
|
| .50-.59 | Static K3s nodes |
|
||||||
|
| .60-.100 | Dynamic DHCP pool |
|
||||||
|
|
||||||
|
### Static Assignments
|
||||||
|
|
||||||
|
| Host | IP | MAC | Role |
|
||||||
|
|------|-----|-----|------|
|
||||||
|
| phoenix | 192.168.100.1 | - | NAS, HTTP server, DHCP |
|
||||||
|
| usw-flex-2 | 192.168.100.21 | 94:2a:6f:4c:fc:72 | Managed switch |
|
||||||
|
| k3s-node-01 | 192.168.100.51 | 78:55:36:04:e7:c8 | K3s worker |
|
||||||
|
| k3s-node-02 | 192.168.100.52 | 78:55:36:04:e7:1d | K3s worker |
|
||||||
|
|
||||||
|
HTTP server: `http://192.168.100.1:8800/`
|
||||||
|
|
||||||
|
### DHCP Reservations
|
||||||
|
|
||||||
|
Static IP assignments are configured in `/etc/dnsmasq.d/pxe-netboot.conf` on phoenix:
|
||||||
|
|
||||||
|
```
|
||||||
|
dhcp-range=192.168.100.60,192.168.100.100,12h
|
||||||
|
|
||||||
|
# Static DHCP reservations for K3s nodes
|
||||||
|
dhcp-host=78:55:36:04:e7:c8,192.168.100.51,k3s-node-01
|
||||||
|
dhcp-host=78:55:36:04:e7:1d,192.168.100.52,k3s-node-02
|
||||||
|
|
||||||
|
# Infrastructure
|
||||||
|
dhcp-host=94:2a:6f:4c:fc:72,192.168.100.21,usw-flex-2
|
||||||
|
```
|
||||||
|
|
||||||
|
To add a new node:
|
||||||
|
|
||||||
|
1. Boot the node once to get its MAC (check leases):
|
||||||
|
```bash
|
||||||
|
ssh phoenix "cat /var/lib/misc/dnsmasq.leases"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Add reservation:
|
||||||
|
```bash
|
||||||
|
ssh phoenix "sudo tee -a /etc/dnsmasq.d/pxe-netboot.conf << EOF
|
||||||
|
dhcp-host=XX:XX:XX:XX:XX:XX,192.168.100.5X,k3s-node-0X
|
||||||
|
EOF"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Restart dnsmasq:
|
||||||
|
```bash
|
||||||
|
ssh phoenix "sudo systemctl restart dnsmasq"
|
||||||
|
```
|
||||||
|
|
||||||
|
To change the boot server IP, edit `http/boot.ipxe` and `initramfs/scripts/netboot`.
|
||||||
@@ -14,6 +14,22 @@ VERSION=$(date +%Y%m%d-%H%M)
|
|||||||
|
|
||||||
echo "Building netboot image version $VERSION"
|
echo "Building netboot image version $VERSION"
|
||||||
|
|
||||||
|
# Decrypt secrets on phoenix (requires SSH access as the invoking user, not root).
# On any failure ROOT_PW_HASH falls back to "*", which disables console login.
echo "Decrypting secrets from phoenix..."
SECRETS_FILE="$SCRIPT_DIR/secrets/netboot.sops.yaml"
SUDO_USER_HOME=$(getent passwd "${SUDO_USER:-$USER}" | cut -d: -f6)
ROOT_PW_HASH=""
if [ -f "$SECRETS_FILE" ]; then
    # Run SSH as the original user (not root) to use their SSH keys.
    # The encrypted file is fed on stdin via a redirect, so the path never has
    # to be embedded in a nested shell command string.
    DECRYPTED_SECRETS=$(sudo -u "${SUDO_USER:-$USER}" \
        ssh phoenix 'sops -d --input-type yaml --output-type yaml /dev/stdin' \
        < "$SECRETS_FILE") || DECRYPTED_SECRETS=""

    # Take the value of the root_password_hash key and strip any YAML quoting;
    # quote characters are not expected inside the hash itself.
    ROOT_PW_HASH=$(printf '%s\n' "$DECRYPTED_SECRETS" \
        | sed -n 's/^root_password_hash:[[:space:]]*//p' \
        | head -n 1 \
        | tr -d "\"'")
    unset DECRYPTED_SECRETS

    if [ -z "$ROOT_PW_HASH" ]; then
        echo "WARNING: Failed to decrypt root password, console login will be disabled"
        ROOT_PW_HASH="*"
    fi
else
    echo "WARNING: No secrets file found at $SECRETS_FILE, console login will be disabled"
    ROOT_PW_HASH="*"
fi
|
||||||
|
|
||||||
# Clean previous build - unmount any stray mounts first
|
# Clean previous build - unmount any stray mounts first
|
||||||
if [ -d "$BUILD_DIR/rootfs" ]; then
|
if [ -d "$BUILD_DIR/rootfs" ]; then
|
||||||
echo "Cleaning up previous build mounts..."
|
echo "Cleaning up previous build mounts..."
|
||||||
@@ -40,6 +56,17 @@ debootstrap --arch=amd64 --variant=minbase --components=main,universe,multiverse
|
|||||||
noble $BUILD_DIR/rootfs \
|
noble $BUILD_DIR/rootfs \
|
||||||
http://archive.ubuntu.com/ubuntu
|
http://archive.ubuntu.com/ubuntu
|
||||||
|
|
||||||
|
# Write root password hash to a temp file for the chroot setup script to read.
# Use /root/ not /tmp/ because systemd installation may mount tmpfs over /tmp.
# The file is created owner-only so the hash is never readable by other users
# while the build is running.
mkdir -p "$BUILD_DIR/rootfs/root"
PW_HASH_FILE="$BUILD_DIR/rootfs/root/.pw_hash"
( umask 077; : > "$PW_HASH_FILE" )
chmod 600 "$PW_HASH_FILE"
if [ -n "${ROOT_PW_HASH:-}" ] && [ "$ROOT_PW_HASH" != "*" ]; then
    printf '%s\n' "$ROOT_PW_HASH" > "$PW_HASH_FILE"
    echo "Root password hash written to rootfs"
else
    # "*" is not a valid hash, so password login stays disabled
    echo "*" > "$PW_HASH_FILE"
    echo "WARNING: No valid password hash, console login will be disabled"
fi
|
||||||
|
|
||||||
# Chroot and configure
|
# Chroot and configure
|
||||||
cat << 'CHROOT_SCRIPT' > $BUILD_DIR/rootfs/setup.sh
|
cat << 'CHROOT_SCRIPT' > $BUILD_DIR/rootfs/setup.sh
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
@@ -118,7 +145,10 @@ apt-get install -y \
|
|||||||
less \
|
less \
|
||||||
rsync \
|
rsync \
|
||||||
git \
|
git \
|
||||||
squashfs-tools
|
squashfs-tools \
|
||||||
|
parted \
|
||||||
|
fdisk \
|
||||||
|
gdisk
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
apt-get clean
|
apt-get clean
|
||||||
@@ -148,11 +178,19 @@ EOF
|
|||||||
systemctl enable systemd-networkd
|
systemctl enable systemd-networkd
|
||||||
systemctl enable systemd-resolved
|
systemctl enable systemd-resolved
|
||||||
|
|
||||||
# Configure SSH
|
# Configure SSH - disable socket activation, use traditional daemon
|
||||||
sed -i 's/#PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config
|
sed -i 's/#PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config
|
||||||
sed -i 's/#PubkeyAuthentication.*/PubkeyAuthentication yes/' /etc/ssh/sshd_config
|
sed -i 's/#PubkeyAuthentication.*/PubkeyAuthentication yes/' /etc/ssh/sshd_config
|
||||||
|
# Disable socket activation (Ubuntu 24.04 default) and use traditional sshd
|
||||||
|
systemctl disable ssh.socket 2>/dev/null || true
|
||||||
|
rm -f /etc/systemd/system/ssh.service.requires/ssh.socket 2>/dev/null || true
|
||||||
|
rm -f /etc/systemd/system/sockets.target.wants/ssh.socket 2>/dev/null || true
|
||||||
systemctl enable ssh
|
systemctl enable ssh
|
||||||
|
|
||||||
|
# Fix SSH host key permissions (must be 0600 for private keys, sshd refuses otherwise)
|
||||||
|
chmod 600 /etc/ssh/ssh_host_*_key
|
||||||
|
chmod 644 /etc/ssh/ssh_host_*_key.pub
|
||||||
|
|
||||||
# Create SSH directory for root
|
# Create SSH directory for root
|
||||||
mkdir -p /root/.ssh
|
mkdir -p /root/.ssh
|
||||||
chmod 700 /root/.ssh
|
chmod 700 /root/.ssh
|
||||||
@@ -165,8 +203,10 @@ SSHKEY
|
|||||||
|
|
||||||
chmod 600 /root/.ssh/authorized_keys
|
chmod 600 /root/.ssh/authorized_keys
|
||||||
|
|
||||||
# Disable password authentication completely
|
# Set root password from decrypted hash (for console login only)
|
||||||
echo "root:*" | chpasswd -e
|
# Read the hash staged under /root/.pw_hash. If the file is missing or empty,
# fall back to "*" (no valid password): passing an empty field to
# "chpasswd -e" would write an empty password entry, which may allow a
# password-less root login on the console.
ROOT_PW_HASH=$(cat /root/.pw_hash 2>/dev/null || true)
echo "root:${ROOT_PW_HASH:-*}" | chpasswd -e
# Do not ship the staged hash file in the image
rm -f /root/.pw_hash
|
||||||
|
|
||||||
# Configure tmpfs mounts for ephemeral data
|
# Configure tmpfs mounts for ephemeral data
|
||||||
cat >> /etc/fstab <<FSTAB
|
cat >> /etc/fstab <<FSTAB
|
||||||
@@ -233,6 +273,17 @@ cp "$INITRAMFS_CONFIG/modules" "$BUILD_DIR/rootfs/etc/initramfs-tools/"
|
|||||||
cp -r "$INITRAMFS_CONFIG/hooks/"* "$BUILD_DIR/rootfs/usr/share/initramfs-tools/hooks/"
|
cp -r "$INITRAMFS_CONFIG/hooks/"* "$BUILD_DIR/rootfs/usr/share/initramfs-tools/hooks/"
|
||||||
cp -r "$INITRAMFS_CONFIG/scripts/"* "$BUILD_DIR/rootfs/usr/share/initramfs-tools/scripts/"
|
cp -r "$INITRAMFS_CONFIG/scripts/"* "$BUILD_DIR/rootfs/usr/share/initramfs-tools/scripts/"
|
||||||
|
|
||||||
|
# Install the node storage setup script and its systemd unit into the rootfs
echo "Installing node storage setup service..."
FILES_DIR="$SCRIPT_DIR/files"
STORAGE_UNIT="setup-node-storage.service"
ROOTFS_SYSTEMD_DIR="$BUILD_DIR/rootfs/etc/systemd/system"

cp "$FILES_DIR/setup-node-storage" "$BUILD_DIR/rootfs/usr/local/bin/"
chmod +x "$BUILD_DIR/rootfs/usr/local/bin/setup-node-storage"
cp "$FILES_DIR/$STORAGE_UNIT" "$ROOTFS_SYSTEMD_DIR/"

# Enable the unit by hand: systemctl cannot be run here, so create the
# multi-user.target.wants symlink ourselves. The link target is the path as
# seen from inside the booted image.
mkdir -p "$ROOTFS_SYSTEMD_DIR/multi-user.target.wants"
ln -sf "/etc/systemd/system/$STORAGE_UNIT" \
    "$ROOTFS_SYSTEMD_DIR/multi-user.target.wants/$STORAGE_UNIT"
|
||||||
|
|
||||||
# Build initramfs while /proc/sys/dev are still mounted
|
# Build initramfs while /proc/sys/dev are still mounted
|
||||||
echo "Building custom netboot initramfs..."
|
echo "Building custom netboot initramfs..."
|
||||||
KERNEL_VERSION=$(ls -1 $BUILD_DIR/rootfs/boot/vmlinuz-* | sed 's|.*/vmlinuz-||' | head -1)
|
KERNEL_VERSION=$(ls -1 $BUILD_DIR/rootfs/boot/vmlinuz-* | sed 's|.*/vmlinuz-||' | head -1)
|
||||||
|
|||||||
195
files/setup-node-storage
Normal file
195
files/setup-node-storage
Normal file
@@ -0,0 +1,195 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Setup local NVMe storage for K3s node
|
||||||
|
# Runs at boot via systemd service
|
||||||
|
#
|
||||||
|
# Logic:
|
||||||
|
# - No NVMe: exit cleanly
|
||||||
|
# - No partition table: auto-format (new drive)
|
||||||
|
# - Has our labels: mount and exit (already configured)
|
||||||
|
# - Has other partitions: prompt with 120s timeout (safety)
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
DEVICE="/dev/nvme0n1"
|
||||||
|
CONTAINERD_SIZE="75GiB"
|
||||||
|
CONTAINERD_LABEL="containerd"
|
||||||
|
LONGHORN_LABEL="longhorn"
|
||||||
|
CONTAINERD_MOUNT="/var/lib/containerd"
|
||||||
|
LONGHORN_MOUNT="/var/lib/longhorn"
|
||||||
|
MARKER_FILE=".netboot-storage"
|
||||||
|
PROMPT_TIMEOUT=120
|
||||||
|
|
||||||
|
# Colors for console output
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
CYAN='\033[0;36m'
|
||||||
|
NC='\033[0m'
|
||||||
|
|
||||||
|
# Log to both console and journald
|
||||||
|
log() { echo -e "${GREEN}[storage]${NC} $*"; logger -t setup-node-storage "$*"; }
|
||||||
|
warn() { echo -e "${YELLOW}[storage]${NC} $*"; logger -t setup-node-storage -p warning "$*"; }
|
||||||
|
error() { echo -e "${RED}[storage]${NC} $*"; logger -t setup-node-storage -p err "$*"; }
|
||||||
|
|
||||||
|
# Nothing to do on nodes without the expected block device
[ -b "$DEVICE" ] || {
    log "No NVMe device found at $DEVICE - skipping storage setup"
    exit 0
}

# Report the drive size in decimal gigabytes
DEVICE_SIZE=$(lsblk -b -d -n -o SIZE "$DEVICE" | awk '{printf "%.0fGB", $1/1000000000}')
log "Found NVMe: $DEVICE ($DEVICE_SIZE)"

# Derive partition device names: NVMe devices insert a 'p' before the
# partition number, other devices append the number directly
case "$DEVICE" in
    *nvme*) PART_PREFIX="${DEVICE}p" ;;
    *)      PART_PREFIX="${DEVICE}" ;;
esac
PART1="${PART_PREFIX}1"
PART2="${PART_PREFIX}2"
|
||||||
|
|
||||||
|
# Mount the already-formatted partitions (looked up by filesystem label) on
# their mount points. A mount point that is already mounted is left alone.
# Returns 0 on success, 1 if either mount fails.
mount_storage() {
    local target

    log "Mounting existing storage..."

    mkdir -p "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"

    if mountpoint -q "$CONTAINERD_MOUNT"; then
        : # already mounted
    elif ! mount -L "$CONTAINERD_LABEL" "$CONTAINERD_MOUNT"; then
        error "Failed to mount containerd partition"
        return 1
    fi

    if mountpoint -q "$LONGHORN_MOUNT"; then
        : # already mounted
    elif ! mount -L "$LONGHORN_LABEL" "$LONGHORN_MOUNT"; then
        error "Failed to mount longhorn partition"
        return 1
    fi

    # Report the total size of each mounted filesystem
    log "Storage mounted:"
    for target in "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"; do
        log " $target: $(df -h "$target" | tail -1 | awk '{print $2}')"
    done
    return 0
}
|
||||||
|
|
||||||
|
# Function to format the drive.
# DESTRUCTIVE: wipes $DEVICE, creates a GPT label with two partitions
# (containerd: 1MiB..$CONTAINERD_SIZE, longhorn: the remainder), formats both
# as ext4 with their labels, mounts them and drops a marker file on each.
# Exits the script with status 1 if the partitions do not appear.
format_storage() {
    local mount_point

    log "Partitioning $DEVICE..."

    wipefs -af "$DEVICE"
    parted -s "$DEVICE" mklabel gpt
    parted -s "$DEVICE" mkpart primary ext4 1MiB "$CONTAINERD_SIZE"
    parted -s "$DEVICE" mkpart primary ext4 "$CONTAINERD_SIZE" 100%

    # Tell kernel to re-read partition table and wait for udev
    partprobe "$DEVICE"
    udevadm settle --timeout=10

    # Verify partitions appeared
    if [ ! -b "$PART1" ] || [ ! -b "$PART2" ]; then
        error "Partitions not found after partprobe: $PART1, $PART2"
        exit 1
    fi

    # -F: wipefs above only clears signatures on the whole-disk device, so an
    # old filesystem signature can remain inside the new partition's range.
    # Without -F, mkfs.ext4 may stop and ask for confirmation when attached
    # to a terminal, which would stall this unattended boot step.
    log "Formatting ${PART1} as ext4 (containerd, ${CONTAINERD_SIZE})..."
    mkfs.ext4 -F -L "$CONTAINERD_LABEL" -q "$PART1"

    log "Formatting ${PART2} as ext4 (longhorn, remaining)..."
    mkfs.ext4 -F -L "$LONGHORN_LABEL" -q "$PART2"

    # Mount the new partitions
    mkdir -p "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"
    mount "$PART1" "$CONTAINERD_MOUNT"
    mount "$PART2" "$LONGHORN_MOUNT"

    # Create marker files with metadata
    for mount_point in "$CONTAINERD_MOUNT" "$LONGHORN_MOUNT"; do
        cat > "${mount_point}/${MARKER_FILE}" <<EOF
# Netboot storage marker - DO NOT DELETE
formatted_date=$(date -Iseconds)
formatted_by=setup-node-storage
hostname=$(hostname)
device=$DEVICE
EOF
    done

    log "Storage formatted and mounted successfully"
    log " $CONTAINERD_MOUNT: $(df -h "$CONTAINERD_MOUNT" | tail -1 | awk '{print $2}')"
    log " $LONGHORN_MOUNT: $(df -h "$LONGHORN_MOUNT" | tail -1 | awk '{print $2}')"
}
|
||||||
|
|
||||||
|
# Check whether the drive already carries a partition table or any other
# recognisable signature, i.e. whether it must NOT be auto-formatted.
# Method 1: blkid returns a non-empty PTTYPE for partitioned drives
# Method 2: blkid low-level probe returns a TYPE when something (filesystem,
#           LUKS container, LVM PV, ...) sits directly on the whole device;
#           such a drive has no partition table but still holds data
# Method 3: parted can read a disk label (fallback)
# Returns 0 if anything was found, 1 if the drive looks blank.
has_partition_table() {
    local pttype sigtype

    # blkid exits non-zero when it finds nothing; that is not an error here
    pttype=$(blkid -o value -s PTTYPE "$DEVICE" 2>/dev/null) || true
    if [ -n "$pttype" ]; then
        return 0 # has partition table
    fi

    sigtype=$(blkid -p -o value -s TYPE "$DEVICE" 2>/dev/null) || true
    if [ -n "$sigtype" ]; then
        return 0 # whole-disk signature: treat as in use, never auto-format
    fi

    # Fallback: check if parted can read it
    if parted -s "$DEVICE" print &>/dev/null; then
        return 0 # has partition table
    fi

    return 1 # no partition table, no known signature
}
|
||||||
|
|
||||||
|
# --- Decision flow -----------------------------------------------------------

# True when both of our filesystem labels can be resolved by blkid
labels_present() {
    blkid -L "$CONTAINERD_LABEL" &>/dev/null && blkid -L "$LONGHORN_LABEL" &>/dev/null
}

# Case 1: blank drive - format without asking
if ! has_partition_table; then
    log "Empty drive detected (no partition table) - auto-formatting..."
    format_storage
    exit 0
fi

# Case 2: drive carries our labels - mount it. The marker file is only an
# extra sanity check; its absence is reported but does not block mounting.
if labels_present; then
    # Probe the marker through a throw-away mount point so that no mount is
    # left behind
    probe_dir=$(mktemp -d)
    if mount -L "$CONTAINERD_LABEL" "$probe_dir" 2>/dev/null; then
        marker_found=no
        if [ -f "${probe_dir}/${MARKER_FILE}" ]; then
            marker_found=yes
        fi
        umount "$probe_dir"
        rmdir "$probe_dir"

        if [ "$marker_found" = yes ]; then
            log "Storage already configured (found labels and marker)"
        else
            warn "Found labels but no marker file - assuming configured"
        fi
        mount_storage
        exit 0
    fi
    rmdir "$probe_dir" 2>/dev/null || true
fi

# Case 3: unknown contents - never format without explicit confirmation
warn "NVMe has existing partitions but no netboot labels."
warn "This drive may contain important data!"
echo ""
lsblk "$DEVICE"
echo ""

# Confirmation banner
for banner_line in \
    "========================================" \
    " Press ENTER within ${PROMPT_TIMEOUT}s to format " \
    " Or wait to skip (safe default) " \
    "========================================"; do
    echo -e "${CYAN}${banner_line}${NC}"
done
echo ""

# read fails on timeout: leave the drive untouched
if ! read -t "$PROMPT_TIMEOUT" -p "Format $DEVICE? [press ENTER to confirm] " answer; then
    echo ""
    warn "Timeout - skipping storage setup (drive left untouched)"
    warn "To format manually, reboot and press ENTER when prompted"
    exit 0
fi

# Confirmed: short grace period, then format
echo ""
warn "Formatting in 5 seconds... Ctrl+C to abort"
sleep 5
format_storage
|
||||||
26
files/setup-node-storage.service
Normal file
26
files/setup-node-storage.service
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
[Unit]
Description=Setup local NVMe storage for K3s
Documentation=file:///usr/local/bin/setup-node-storage

# Run early, after devices are available but before container services
After=local-fs.target systemd-udevd.service
Before=containerd.service

# Only run if not already mounted
ConditionPathIsMountPoint=!/var/lib/containerd

[Service]
Type=oneshot
ExecStart=/usr/local/bin/setup-node-storage
RemainAfterExit=yes

# Console access for interactive prompt.
# StandardInput=tty alone attaches only stdin to the terminal; stdout/stderr
# would follow the manager default (journal unless reconfigured), so the
# confirmation prompt would not be shown on tty1. Attach them to the TTY too.
# The script also logs through logger(1), so messages still reach the journal.
StandardInput=tty
StandardOutput=tty
StandardError=tty
TTYPath=/dev/tty1
TTYReset=yes

# Generous timeout for user interaction (3 minutes)
TimeoutStartSec=180

[Install]
WantedBy=multi-user.target
|
||||||
16
secrets/netboot.sops.yaml
Normal file
16
secrets/netboot.sops.yaml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
root_password_hash: ENC[AES256_GCM,data:Oc1Kpg1S3NSG4dDoe0AiDmdWe4wdz9zSMn/WlTvURz3u62HcF9ddZh3yKbsXdc19WbGj/ZJa+MFzucgCg6ChT5OG2k4S+JuAVvRaNmB54XSjyIL2vDkambq8Pt4rg5rVxfv5H6uEd5IWUg==,iv:fO72qW/8JIWGubbfjZYsfhjL3XUq/7RbohGPd1avS+8=,tag:nXP7w2b49iYAcnWxM4WFlA==,type:str]
|
||||||
|
sops:
|
||||||
|
age:
|
||||||
|
- recipient: age1gausnystsln7fpenw7arw7x79xe22z697jnauj38npy0usayqqxqc7td2y
|
||||||
|
enc: |
|
||||||
|
-----BEGIN AGE ENCRYPTED FILE-----
|
||||||
|
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBDS1VrWkNZTmswYlRrVXAv
|
||||||
|
ZC9FemRVWkc2bzlVL1BuQm9FaDlPVmVuVFZvCnUyb2xnaDdwQ3BsVkNmY0NxZktp
|
||||||
|
Zk9qSlZVZk16UUhhOHdGRFN1Zno1V3cKLS0tIHV6YXE1bFBHZjMyVVdMbVZEMXlW
|
||||||
|
YTN1RnJ3SjRkN21MYmhQK0hZZFB5Sk0KfxfMPUdJjZq/JDOE87oD2XBpQebvy0a5
|
||||||
|
IAI5tdpEzNP6tF4oqunmh15fPc61Q0C/5ev+uz0QyHhTlTI13lYpGg==
|
||||||
|
-----END AGE ENCRYPTED FILE-----
|
||||||
|
lastmodified: "2026-02-05T20:16:15Z"
|
||||||
|
mac: ENC[AES256_GCM,data:mTCLM3t35mMv9nLQHba65Gq3yAWnY4UKUDHEncMF22RnZKiVDaTMAV6tiaKGu7hHXdDu9fU/E7wPomR8pirGf6pJBUWxCflCe3Q3ZGK9/Aw3guz5ZD34H9nMaCjXME59r1rQdQdQlWP5aW4o+kqfD/bukFpW1HUY0YT8g8fqCpw=,iv:bG1M8Ghuc8JkMNQfODZ1FkMI/8Qs217xlN5ihDnz7hs=,tag:gCScQi1YYXFH4Xo/8Wq5+g==,type:str]
|
||||||
|
unencrypted_suffix: _unencrypted
|
||||||
|
version: 3.11.0
|
||||||
Reference in New Issue
Block a user