fix: session stability improvements

- Fix docker client initialization bug in app.py (context manager was closing client)
- Add restart_session() method to preserve session IDs during container restarts
- Add 60-second startup grace period before health checking new sessions
- Fix _stop_container and _get_container_info to use docker_service API consistently
- Disable mDNS in Dockerfile to prevent Bonjour service name conflicts
- Remove old container before restart to free port bindings
This commit is contained in:
2026-02-04 19:10:03 +01:00
parent 05aa70c4af
commit 69d18cc494
5 changed files with 138 additions and 41 deletions

View File

@@ -28,5 +28,5 @@ EXPOSE 8080
# Set environment variables
ENV PYTHONPATH=/app
# Start OpenCode server (OPENCODE_API_KEY passed via environment)
CMD ["/bin/bash", "-c", "source /root/.bashrc && opencode serve --hostname 0.0.0.0 --port 8080 --mdns"]
# Start OpenCode server (mDNS disabled to prevent conflicts between containers)
CMD ["/bin/bash", "-c", "source /root/.bashrc && opencode serve --hostname 0.0.0.0 --port 8080"]

View File

@@ -30,6 +30,8 @@ services:
# Host configuration
- DOCKER_HOST_IP=${DOCKER_HOST_IP:-host.docker.internal}
- DOCKER_TLS_PORT=${DOCKER_TLS_PORT:-2376}
# Disable database storage (use in-memory)
- USE_DATABASE_STORAGE=false
networks:
- lovdata-network
restart: unless-stopped

View File

@@ -5,7 +5,6 @@ from fastapi import FastAPI
from config import USE_ASYNC_DOCKER, USE_DATABASE_STORAGE
from session_manager import session_manager
from async_docker_client import get_async_docker_client
from http_pool import init_http_pool, shutdown_http_pool
from database import init_database, shutdown_database, run_migrations
from container_health import (
@@ -43,11 +42,8 @@ async def lifespan(app: FastAPI):
session_manager._load_sessions_from_file()
try:
docker_client = None
if USE_ASYNC_DOCKER:
async with get_async_docker_client() as client:
docker_client = client._docker if hasattr(client, "_docker") else None
else:
# Use the session manager's docker_service for health monitoring
# This ensures the docker client stays alive for the lifetime of the application
docker_client = session_manager.docker_service
await start_container_health_monitoring(session_manager, docker_client)

View File

@@ -129,23 +129,30 @@ class ContainerHealthMonitor:
"""Main monitoring loop."""
while self._monitoring:
try:
await self._perform_health_checks()
await self._check_all_containers()
await self._cleanup_old_history()
except Exception as e:
logger.error("Error in health monitoring loop", extra={"error": str(e)})
await asyncio.sleep(self.check_interval)
async def _perform_health_checks(self):
async def _check_all_containers(self):
"""Perform health checks on all running containers."""
if not self.session_manager:
return
# Get all running sessions
from datetime import datetime, timedelta
# Startup grace period - don't check containers that started recently
startup_grace_period = timedelta(seconds=60)
now = datetime.now()
# Get all running sessions that are past the startup grace period
running_sessions = [
session
for session in self.session_manager.sessions.values()
if session.status == "running"
and (now - session.created_at) > startup_grace_period
]
if not running_sessions:
@@ -263,23 +270,30 @@ class ContainerHealthMonitor:
async def _get_container_info(self, container_id: str) -> Optional[Dict[str, Any]]:
"""Get container information from Docker."""
try:
if self.docker_client:
# Try async Docker client first
container = await self.docker_client.get_container(container_id)
if hasattr(container, "_container"):
return await container._container.show()
elif hasattr(container, "show"):
return await container.show()
else:
# Fallback to sync client if available
# Use session_manager.docker_service for consistent container access
if (
hasattr(self.session_manager, "docker_client")
and self.session_manager.docker_client
self.session_manager
and hasattr(self.session_manager, "docker_service")
and self.session_manager.docker_service
):
container = self.session_manager.docker_client.containers.get(
container_id
)
return container.attrs
container_info = await self.session_manager.docker_service.get_container_info(container_id)
if container_info:
# Convert ContainerInfo to dict format expected by health check
return {
"State": {
"Status": container_info.status,
"Health": {"Status": container_info.health_status} if container_info.health_status else {}
}
}
elif self.docker_client and hasattr(self.docker_client, "get_container_info"):
container_info = await self.docker_client.get_container_info(container_id)
if container_info:
return {
"State": {
"Status": container_info.status,
"Health": {"Status": container_info.health_status} if container_info.health_status else {}
}
}
except Exception as e:
logger.debug(
f"Failed to get container info for {container_id}",
@@ -384,8 +398,8 @@ class ContainerHealthMonitor:
# Trigger container restart through session manager
if self.session_manager:
# Create new container for the session
await self.session_manager.create_session()
# Restart container for the SAME session (preserves session_id)
await self.session_manager.restart_session(session_id)
logger.info(
"Container restart initiated",
extra={
@@ -418,17 +432,22 @@ class ContainerHealthMonitor:
async def _stop_container(self, container_id: str):
"""Stop a container."""
try:
if self.docker_client:
container = await self.docker_client.get_container(container_id)
await self.docker_client.stop_container(container, timeout=10)
elif (
hasattr(self.session_manager, "docker_client")
and self.session_manager.docker_client
# Use session_manager.docker_service for container operations
# docker_service.stop_container takes container_id as a string
if (
self.session_manager
and hasattr(self.session_manager, "docker_service")
and self.session_manager.docker_service
):
container = self.session_manager.docker_client.containers.get(
container_id
await self.session_manager.docker_service.stop_container(container_id, timeout=10)
elif self.docker_client and hasattr(self.docker_client, "stop_container"):
# If docker_client is docker_service, use it directly
await self.docker_client.stop_container(container_id, timeout=10)
else:
logger.warning(
"No docker client available to stop container",
extra={"container_id": container_id},
)
container.stop(timeout=10)
except Exception as e:
logger.warning(
"Failed to stop container during restart",

View File

@@ -367,6 +367,86 @@ class SessionManager:
async def list_sessions(self) -> List[SessionData]:
return list(self.sessions.values())
async def restart_session(self, session_id: str) -> Optional[SessionData]:
"""Restart a session's container while preserving the session ID.
Unlike create_session(), this reuses the existing session data
and only creates a new container, maintaining session ID continuity.
This method removes the old container to free up the port.
"""
session = await self.get_session(session_id)
if not session:
logger.error(
"Cannot restart session: not found",
extra={"session_id": session_id},
)
return None
old_container_id = session.container_id
logger.info(
"Restarting session container",
extra={"session_id": session_id, "old_container_id": old_container_id},
)
# Stop and remove old container to free up the port
if old_container_id and self.docker_service:
try:
logger.info(
"Stopping old container for restart",
extra={"session_id": session_id, "container_id": old_container_id},
)
await self.docker_service.stop_container(old_container_id)
except Exception as e:
logger.warning(
"Failed to stop old container (may already be stopped)",
extra={"session_id": session_id, "container_id": old_container_id, "error": str(e)},
)
try:
logger.info(
"Removing old container for restart",
extra={"session_id": session_id, "container_id": old_container_id},
)
await self.docker_service.remove_container(old_container_id, force=True)
except Exception as e:
logger.warning(
"Failed to remove old container",
extra={"session_id": session_id, "container_id": old_container_id, "error": str(e)},
)
# Generate new container name for the restart
new_container_name = f"opencode-{session_id}-{uuid.uuid4().hex[:8]}"
session.container_name = new_container_name
session.container_id = None # Clear old container_id
session.status = "starting"
# Update session in store before starting container
self.sessions[session_id] = session
if USE_DATABASE_STORAGE:
try:
await SessionModel.update_session(
session_id,
{
"container_name": new_container_name,
"container_id": None,
"status": "starting",
},
)
except Exception as e:
logger.error(
"Failed to update session in database during restart",
extra={"session_id": session_id, "error": str(e)},
)
# Start new container for this session
if USE_ASYNC_DOCKER:
asyncio.create_task(self._start_container_async(session))
else:
asyncio.create_task(self._start_container_sync(session))
return session
async def list_containers_async(self, all: bool = False) -> List:
return await self.docker_service.list_containers(all=all)