Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions scripts/lib/runtime.sh
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,47 @@ check_local_provider_health() {
;;
esac
}

# ── Kubelet conflict detection ────────────────────────────────────
# Probe the host for an already-running Kubernetes node agent.
#
# Checks, in order:
#   1. a process named exactly kubelet, kubelite or k3s (pgrep -x)
#   2. `microk8s status` output containing "microk8s is running"
#   3. an active k3s or k3s-agent systemd unit
#
# Globals:  KUBELET_CONFLICT_DETAIL (written) - human-readable description
#           of the first match, or "" when nothing matched.
# Returns:  0 if a conflicting kubelet is detected, 1 otherwise.
# See: https://github.com/NVIDIA/NemoClaw/issues/431
detect_kubelet_conflict() {
  local proc unit status_out

  KUBELET_CONFLICT_DETAIL=""

  for proc in kubelet kubelite k3s; do
    if pgrep -x "$proc" >/dev/null 2>&1; then
      KUBELET_CONFLICT_DETAIL="kubelet process detected"
      return 0
    fi
  done

  if command -v microk8s >/dev/null 2>&1; then
    # Capture the output instead of piping into `grep -q`: callers run with
    # `set -o pipefail`, where a non-zero exit from `microk8s status` (for
    # example SIGPIPE once grep stops reading) would hide a positive match.
    status_out="$(microk8s status 2>/dev/null)" || true
    case "$status_out" in
      *"microk8s is running"*)
        KUBELET_CONFLICT_DETAIL="MicroK8s is running"
        return 0
        ;;
    esac
  fi

  for unit in k3s k3s-agent; do
    if systemctl is-active --quiet "$unit" 2>/dev/null; then
      KUBELET_CONFLICT_DETAIL="k3s service is active"
      return 0
    fi
  done

  return 1
}

# Print the standard multi-line warning about a conflicting host Kubernetes.
#
# Arguments: $1 - description of the conflict (optional; defaults to
#                 KUBELET_CONFLICT_DETAIL, or empty if that is unset too)
# Outputs:   one warn call per message line, in order.
warn_kubelet_conflict() {
  local detail="${1:-${KUBELET_CONFLICT_DETAIL:-}}"
  local line

  for line in \
    "⚠️ Conflicting Kubernetes detected: $detail" \
    "" \
    "The gateway runs k3s inside Docker with cgroupns=host, which will" \
    "conflict with the host kubelet over /sys/fs/cgroup/kubepods." \
    "This causes all pods to enter CrashLoopBackOff." \
    "" \
    "Options:" \
    " 1. Stop the host Kubernetes first:" \
    " sudo microk8s stop # for MicroK8s" \
    " sudo systemctl stop k3s # for k3s" \
    " sudo systemctl stop kubelet # for kubeadm" \
    " 2. Continue anyway (gateway will likely fail)"; do
    warn "$line"
  done
}
117 changes: 107 additions & 10 deletions scripts/setup-spark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,20 @@
#
# NemoClaw setup for DGX Spark devices.
#
# Ensures the current user is in the docker group so NemoClaw can
# manage containers without sudo.
# Spark ships Ubuntu 24.04 (cgroup v2) + Docker 28.x but no k3s.
# OpenShell's gateway starts k3s inside a Docker container, which
# needs cgroup host namespace access. This script configures Docker
# for that.
#
# Usage:
# sudo nemoclaw setup-spark
# # or directly:
# sudo bash scripts/setup-spark.sh
# # or via curl:
# curl -fsSL https://raw.githubusercontent.com/NVIDIA/NemoClaw/main/scripts/setup-spark.sh | sudo bash
#
# What it does:
# 1. Adds current user to docker group (avoids sudo for everything else)
# 2. Configures Docker daemon for cgroupns=host (k3s-in-Docker on cgroup v2)
# 3. Restarts Docker

set -euo pipefail

Expand All @@ -29,6 +33,10 @@ fail() {
exit 1
}

# Locate the shared runtime helpers that live next to this script.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
if [ -r "$SCRIPT_DIR/lib/runtime.sh" ]; then
  # shellcheck source=lib/runtime.sh
  source "$SCRIPT_DIR/lib/runtime.sh"
else
  # When the script is piped into bash (the documented "curl ... | sudo bash"
  # usage) there is no checkout beside it, so lib/runtime.sh cannot be
  # sourced. An unconditional `source` would abort the whole run under
  # `set -e`; instead, skip the optional kubelet conflict check loudly.
  # NOTE(review): only the two helpers used in the visible part of this
  # script get fallbacks; confirm nothing else from runtime.sh is needed.
  printf '%s\n' "warning: $SCRIPT_DIR/lib/runtime.sh not found; skipping host Kubernetes conflict check." >&2
  detect_kubelet_conflict() { return 1; }
  warn_kubelet_conflict() { :; }
fi

# ── Pre-flight checks ─────────────────────────────────────────────

if [ "$(uname -s)" != "Linux" ]; then
Expand All @@ -55,15 +63,104 @@ if [ -n "$REAL_USER" ]; then
else
info "Adding '$REAL_USER' to docker group..."
usermod -aG docker "$REAL_USER"
DOCKER_GROUP_ADDED=true
info "Added. Group will take effect on next login (or use 'newgrp docker')."
fi
fi

# ── 2. Next steps ─────────────────────────────────────────────────
# ── 1b. Check for conflicting Kubernetes installations ────────────
#
# If another kubelet is running on the host, cgroupns=host causes
# cgroup path conflicts → CrashLoopBackOff.
# See: https://github.com/NVIDIA/NemoClaw/issues/431

echo ""
if [ "${DOCKER_GROUP_ADDED:-}" = true ]; then
warn "Docker group was just added. You must open a new terminal (or run 'newgrp docker') before continuing."
if detect_kubelet_conflict; then
warn_kubelet_conflict "$KUBELET_CONFLICT_DETAIL"
warn ""

if [ -t 0 ]; then
if ! read -rp "Continue anyway? [y/N] " reply; then
fail "Aborted (no input). Stop the conflicting Kubernetes service and retry."
fi
if [[ ! "$reply" =~ ^[Yy] ]]; then
fail "Aborted. Stop the conflicting Kubernetes service and retry."
fi
else
fail "Conflicting Kubernetes detected. Stop it first or run interactively to override."
fi
fi

# ── 2. Docker cgroup namespace ────────────────────────────────────
#
# Spark runs cgroup v2 (Ubuntu 24.04). OpenShell's gateway embeds
# k3s in a Docker container, which needs --cgroupns=host to manage
# cgroup hierarchies. Without this, kubelet fails with:
# "openat2 /sys/fs/cgroup/kubepods/pids.max: no"
#
# Setting default-cgroupns-mode=host in daemon.json makes all
# containers use the host cgroup namespace. This is safe — it's
# the Docker default on cgroup v1 hosts anyway.

DAEMON_JSON="/etc/docker/daemon.json"
NEEDS_RESTART=false

if [ -f "$DAEMON_JSON" ]; then
  command -v python3 >/dev/null 2>&1 \
    || fail "python3 is required to update $DAEMON_JSON."

  # Parse the existing config once, before touching it. An empty file counts
  # as "{}"; anything else that is not a JSON object aborts here so the
  # user's Docker settings are never silently replaced. The path is passed
  # as an argument rather than spliced into the Python source.
  if ! CURRENT_MODE=$(python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    raw = f.read()
cfg = json.loads(raw) if raw.strip() else {}
if not isinstance(cfg, dict):
    raise SystemExit("top-level value is not a JSON object")
print(cfg.get("default-cgroupns-mode", ""))
' "$DAEMON_JSON"); then
    fail "Cannot parse $DAEMON_JSON as a JSON object; refusing to overwrite it. Fix the file and re-run."
  fi

  if [ "$CURRENT_MODE" = "host" ]; then
    info "Docker daemon already configured for cgroupns=host"
  else
    if [ -n "$CURRENT_MODE" ]; then
      info "Updating Docker daemon cgroupns mode to 'host'..."
    else
      info "Adding cgroupns=host to Docker daemon config..."
    fi
    # Merge the key into the existing object, keeping every other setting.
    python3 -c '
import json, sys
path = sys.argv[1]
with open(path) as f:
    raw = f.read()
cfg = json.loads(raw) if raw.strip() else {}
cfg["default-cgroupns-mode"] = "host"
with open(path, "w") as f:
    json.dump(cfg, f, indent=2)
' "$DAEMON_JSON"
    NEEDS_RESTART=true
  fi
else
  # No config yet: create one holding only the cgroupns setting.
  info "Creating Docker daemon config with cgroupns=host..."
  mkdir -p "$(dirname "$DAEMON_JSON")"
  echo '{ "default-cgroupns-mode": "host" }' >"$DAEMON_JSON"
  NEEDS_RESTART=true
fi

# ── 3. Restart Docker if needed ───────────────────────────────────

if [ "$NEEDS_RESTART" = true ]; then
  info "Restarting Docker daemon..."
  systemctl restart docker
  # Poll until the daemon answers again: at most 10 probes, 2 seconds apart,
  # giving up immediately after the 10th failed probe.
  i=1
  until docker info >/dev/null 2>&1; do
    if [ "$i" -ge 10 ]; then
      fail "Docker didn't come back after restart. Check 'systemctl status docker'."
    fi
    sleep 2
    i=$((i + 1))
  done
  info "Docker restarted with cgroupns=host"
fi

# ── 4. Run normal setup ──────────────────────────────────────────

echo ""
info "DGX Spark Docker configuration complete."
info ""