From 69d18cc4948f69be3b7b05b01e85029e91259b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torbj=C3=B8rn=20Lindahl?= Date: Wed, 4 Feb 2026 19:10:03 +0100 Subject: [PATCH] fix: session stability improvements - Fix docker client initialization bug in app.py (context manager was closing client) - Add restart_session() method to preserve session IDs during container restarts - Add 60-second startup grace period before health checking new sessions - Fix _stop_container and _get_container_info to use docker_service API consistently - Disable mDNS in Dockerfile to prevent Bonjour service name conflicts - Remove old container before restart to free port bindings --- Dockerfile | 4 +- docker-compose.yml | 2 + session-manager/app.py | 10 ++-- session-manager/container_health.py | 83 ++++++++++++++++++----------- session-manager/session_manager.py | 80 +++++++++++++++++++++++++++ 5 files changed, 138 insertions(+), 41 deletions(-) diff --git a/Dockerfile b/Dockerfile index 60d2170..e817e7e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,5 +28,5 @@ EXPOSE 8080 # Set environment variables ENV PYTHONPATH=/app -# Start OpenCode server (OPENCODE_API_KEY passed via environment) -CMD ["/bin/bash", "-c", "source /root/.bashrc && opencode serve --hostname 0.0.0.0 --port 8080 --mdns"] \ No newline at end of file +# Start OpenCode server (mDNS disabled to prevent conflicts between containers) +CMD ["/bin/bash", "-c", "source /root/.bashrc && opencode serve --hostname 0.0.0.0 --port 8080"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 37dff52..6fdd644 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,6 +30,8 @@ services: # Host configuration - DOCKER_HOST_IP=${DOCKER_HOST_IP:-host.docker.internal} - DOCKER_TLS_PORT=${DOCKER_TLS_PORT:-2376} + # Disable database storage (use in-memory) + - USE_DATABASE_STORAGE=false networks: - lovdata-network restart: unless-stopped diff --git a/session-manager/app.py b/session-manager/app.py index fcd2d56..1d8f1e6 100644 --- a/session-manager/app.py +++ b/session-manager/app.py @@ -5,7 +5,6 @@ from fastapi import FastAPI from config import USE_ASYNC_DOCKER, USE_DATABASE_STORAGE from session_manager import session_manager -from async_docker_client import get_async_docker_client from http_pool import init_http_pool, shutdown_http_pool from database import init_database, shutdown_database, run_migrations from container_health import ( @@ -43,12 +42,9 @@ async def lifespan(app: FastAPI): session_manager._load_sessions_from_file() try: - docker_client = None - if USE_ASYNC_DOCKER: - async with get_async_docker_client() as client: - docker_client = client._docker if hasattr(client, "_docker") else None - else: - docker_client = session_manager.docker_service + # Use the session manager's docker_service for health monitoring + # This ensures the docker client stays alive for the lifetime of the application + docker_client = session_manager.docker_service await start_container_health_monitoring(session_manager, docker_client) logger.info("Container health monitoring started") diff --git a/session-manager/container_health.py b/session-manager/container_health.py index 4382310..6aac57d 100644 --- a/session-manager/container_health.py +++ b/session-manager/container_health.py @@ -129,23 +129,30 @@ class ContainerHealthMonitor: """Main monitoring loop.""" while self._monitoring: try: - await self._perform_health_checks() + await self._check_all_containers() await self._cleanup_old_history() except Exception as e: logger.error("Error in health monitoring loop", extra={"error": str(e)}) await asyncio.sleep(self.check_interval) - async def _perform_health_checks(self): + async def _check_all_containers(self): """Perform health checks on all running containers.""" if not self.session_manager: return - # Get all running sessions + from datetime import datetime, timedelta + + # Startup grace period - don't check containers that started recently + startup_grace_period = timedelta(seconds=60) + now = datetime.now() + + # Get all running sessions that are past the startup grace period running_sessions = [ session for session in self.session_manager.sessions.values() - if session.status == "running" + if session.status == "running" + and (now - session.created_at) > startup_grace_period ] if not running_sessions: @@ -263,23 +270,30 @@ class ContainerHealthMonitor: async def _get_container_info(self, container_id: str) -> Optional[Dict[str, Any]]: """Get container information from Docker.""" try: - if self.docker_client: - # Try async Docker client first - container = await self.docker_client.get_container(container_id) - if hasattr(container, "_container"): - return await container._container.show() - elif hasattr(container, "show"): - return await container.show() - else: - # Fallback to sync client if available - if ( - hasattr(self.session_manager, "docker_client") - and self.session_manager.docker_client - ): - container = self.session_manager.docker_client.containers.get( - container_id - ) - return container.attrs + # Use session_manager.docker_service for consistent container access + if ( + self.session_manager + and hasattr(self.session_manager, "docker_service") + and self.session_manager.docker_service + ): + container_info = await self.session_manager.docker_service.get_container_info(container_id) + if container_info: + # Convert ContainerInfo to dict format expected by health check + return { + "State": { + "Status": container_info.status, + "Health": {"Status": container_info.health_status} if container_info.health_status else {} + } + } + elif self.docker_client and hasattr(self.docker_client, "get_container_info"): + container_info = await self.docker_client.get_container_info(container_id) + if container_info: + return { + "State": { + "Status": container_info.status, + "Health": {"Status": container_info.health_status} if container_info.health_status else {} + } + } except Exception as e: logger.debug( f"Failed to get container info for {container_id}", @@ -384,8 +398,8 @@ class ContainerHealthMonitor: # Trigger container restart through session manager if self.session_manager: - # Create new container for the session - await self.session_manager.create_session() + # Restart container for the SAME session (preserves session_id) + await self.session_manager.restart_session(session_id) logger.info( "Container restart initiated", extra={ @@ -418,17 +432,22 @@ class ContainerHealthMonitor: async def _stop_container(self, container_id: str): """Stop a container.""" try: - if self.docker_client: - container = await self.docker_client.get_container(container_id) - await self.docker_client.stop_container(container, timeout=10) - elif ( - hasattr(self.session_manager, "docker_client") - and self.session_manager.docker_client + # Use session_manager.docker_service for container operations + # docker_service.stop_container takes container_id as a string + if ( + self.session_manager + and hasattr(self.session_manager, "docker_service") + and self.session_manager.docker_service ): - container = self.session_manager.docker_client.containers.get( - container_id + await self.session_manager.docker_service.stop_container(container_id, timeout=10) + elif self.docker_client and hasattr(self.docker_client, "stop_container"): + # If docker_client is docker_service, use it directly + await self.docker_client.stop_container(container_id, timeout=10) + else: + logger.warning( + "No docker client available to stop container", + extra={"container_id": container_id}, ) - container.stop(timeout=10) except Exception as e: logger.warning( "Failed to stop container during restart", diff --git a/session-manager/session_manager.py b/session-manager/session_manager.py index 825be70..f6d2c9a 100644 --- a/session-manager/session_manager.py +++ b/session-manager/session_manager.py @@ -367,6 +367,86 @@ class SessionManager: async def list_sessions(self) -> List[SessionData]: return list(self.sessions.values()) + async def restart_session(self, session_id: str) -> Optional[SessionData]: + """Restart a session's container while preserving the session ID. + + Unlike create_session(), this reuses the existing session data + and only creates a new container, maintaining session ID continuity. + This method removes the old container to free up the port. + """ + session = await self.get_session(session_id) + if not session: + logger.error( + "Cannot restart session: not found", + extra={"session_id": session_id}, + ) + return None + + old_container_id = session.container_id + logger.info( + "Restarting session container", + extra={"session_id": session_id, "old_container_id": old_container_id}, + ) + + # Stop and remove old container to free up the port + if old_container_id and self.docker_service: + try: + logger.info( + "Stopping old container for restart", + extra={"session_id": session_id, "container_id": old_container_id}, + ) + await self.docker_service.stop_container(old_container_id) + except Exception as e: + logger.warning( + "Failed to stop old container (may already be stopped)", + extra={"session_id": session_id, "container_id": old_container_id, "error": str(e)}, + ) + + try: + logger.info( + "Removing old container for restart", + extra={"session_id": session_id, "container_id": old_container_id}, + ) + await self.docker_service.remove_container(old_container_id, force=True) + except Exception as e: + logger.warning( + "Failed to remove old container", + extra={"session_id": session_id, "container_id": old_container_id, "error": str(e)}, + ) + + # Generate new container name for the restart + new_container_name = f"opencode-{session_id}-{uuid.uuid4().hex[:8]}" + session.container_name = new_container_name + session.container_id = None # Clear old container_id + session.status = "starting" + + # Update session in store before starting container + self.sessions[session_id] = session + + if USE_DATABASE_STORAGE: + try: + await SessionModel.update_session( + session_id, + { + "container_name": new_container_name, + "container_id": None, + "status": "starting", + }, + ) + except Exception as e: + logger.error( + "Failed to update session in database during restart", + extra={"session_id": session_id, "error": str(e)}, + ) + + # Start new container for this session + if USE_ASYNC_DOCKER: + asyncio.create_task(self._start_container_async(session)) + else: + asyncio.create_task(self._start_container_sync(session)) + + return session + async def list_containers_async(self, all: bool = False) -> List: return await self.docker_service.list_containers(all=all)