fix: session stability improvements
- Fix docker client initialization bug in app.py (context manager was closing client) - Add restart_session() method to preserve session IDs during container restarts - Add 60-second startup grace period before health checking new sessions - Fix _stop_container and _get_container_info to use docker_service API consistently - Disable mDNS in Dockerfile to prevent Bonjour service name conflicts - Remove old container before restart to free port bindings
This commit is contained in:
@@ -28,5 +28,5 @@ EXPOSE 8080
|
|||||||
# Set environment variables
|
# Set environment variables
|
||||||
ENV PYTHONPATH=/app
|
ENV PYTHONPATH=/app
|
||||||
|
|
||||||
# Start OpenCode server (OPENCODE_API_KEY passed via environment)
|
# Start OpenCode server (mDNS disabled to prevent conflicts between containers)
|
||||||
CMD ["/bin/bash", "-c", "source /root/.bashrc && opencode serve --hostname 0.0.0.0 --port 8080 --mdns"]
|
CMD ["/bin/bash", "-c", "source /root/.bashrc && opencode serve --hostname 0.0.0.0 --port 8080"]
|
||||||
@@ -30,6 +30,8 @@ services:
|
|||||||
# Host configuration
|
# Host configuration
|
||||||
- DOCKER_HOST_IP=${DOCKER_HOST_IP:-host.docker.internal}
|
- DOCKER_HOST_IP=${DOCKER_HOST_IP:-host.docker.internal}
|
||||||
- DOCKER_TLS_PORT=${DOCKER_TLS_PORT:-2376}
|
- DOCKER_TLS_PORT=${DOCKER_TLS_PORT:-2376}
|
||||||
|
# Disable database storage (use in-memory)
|
||||||
|
- USE_DATABASE_STORAGE=false
|
||||||
networks:
|
networks:
|
||||||
- lovdata-network
|
- lovdata-network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ from fastapi import FastAPI
|
|||||||
|
|
||||||
from config import USE_ASYNC_DOCKER, USE_DATABASE_STORAGE
|
from config import USE_ASYNC_DOCKER, USE_DATABASE_STORAGE
|
||||||
from session_manager import session_manager
|
from session_manager import session_manager
|
||||||
from async_docker_client import get_async_docker_client
|
|
||||||
from http_pool import init_http_pool, shutdown_http_pool
|
from http_pool import init_http_pool, shutdown_http_pool
|
||||||
from database import init_database, shutdown_database, run_migrations
|
from database import init_database, shutdown_database, run_migrations
|
||||||
from container_health import (
|
from container_health import (
|
||||||
@@ -43,12 +42,9 @@ async def lifespan(app: FastAPI):
|
|||||||
session_manager._load_sessions_from_file()
|
session_manager._load_sessions_from_file()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
docker_client = None
|
# Use the session manager's docker_service for health monitoring
|
||||||
if USE_ASYNC_DOCKER:
|
# This ensures the docker client stays alive for the lifetime of the application
|
||||||
async with get_async_docker_client() as client:
|
docker_client = session_manager.docker_service
|
||||||
docker_client = client._docker if hasattr(client, "_docker") else None
|
|
||||||
else:
|
|
||||||
docker_client = session_manager.docker_service
|
|
||||||
|
|
||||||
await start_container_health_monitoring(session_manager, docker_client)
|
await start_container_health_monitoring(session_manager, docker_client)
|
||||||
logger.info("Container health monitoring started")
|
logger.info("Container health monitoring started")
|
||||||
|
|||||||
@@ -129,23 +129,30 @@ class ContainerHealthMonitor:
|
|||||||
"""Main monitoring loop."""
|
"""Main monitoring loop."""
|
||||||
while self._monitoring:
|
while self._monitoring:
|
||||||
try:
|
try:
|
||||||
await self._perform_health_checks()
|
await self._check_all_containers()
|
||||||
await self._cleanup_old_history()
|
await self._cleanup_old_history()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Error in health monitoring loop", extra={"error": str(e)})
|
logger.error("Error in health monitoring loop", extra={"error": str(e)})
|
||||||
|
|
||||||
await asyncio.sleep(self.check_interval)
|
await asyncio.sleep(self.check_interval)
|
||||||
|
|
||||||
async def _perform_health_checks(self):
|
async def _check_all_containers(self):
|
||||||
"""Perform health checks on all running containers."""
|
"""Perform health checks on all running containers."""
|
||||||
if not self.session_manager:
|
if not self.session_manager:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Get all running sessions
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
# Startup grace period - don't check containers that started recently
|
||||||
|
startup_grace_period = timedelta(seconds=60)
|
||||||
|
now = datetime.now()
|
||||||
|
|
||||||
|
# Get all running sessions that are past the startup grace period
|
||||||
running_sessions = [
|
running_sessions = [
|
||||||
session
|
session
|
||||||
for session in self.session_manager.sessions.values()
|
for session in self.session_manager.sessions.values()
|
||||||
if session.status == "running"
|
if session.status == "running"
|
||||||
|
and (now - session.created_at) > startup_grace_period
|
||||||
]
|
]
|
||||||
|
|
||||||
if not running_sessions:
|
if not running_sessions:
|
||||||
@@ -263,23 +270,30 @@ class ContainerHealthMonitor:
|
|||||||
async def _get_container_info(self, container_id: str) -> Optional[Dict[str, Any]]:
|
async def _get_container_info(self, container_id: str) -> Optional[Dict[str, Any]]:
|
||||||
"""Get container information from Docker."""
|
"""Get container information from Docker."""
|
||||||
try:
|
try:
|
||||||
if self.docker_client:
|
# Use session_manager.docker_service for consistent container access
|
||||||
# Try async Docker client first
|
if (
|
||||||
container = await self.docker_client.get_container(container_id)
|
self.session_manager
|
||||||
if hasattr(container, "_container"):
|
and hasattr(self.session_manager, "docker_service")
|
||||||
return await container._container.show()
|
and self.session_manager.docker_service
|
||||||
elif hasattr(container, "show"):
|
):
|
||||||
return await container.show()
|
container_info = await self.session_manager.docker_service.get_container_info(container_id)
|
||||||
else:
|
if container_info:
|
||||||
# Fallback to sync client if available
|
# Convert ContainerInfo to dict format expected by health check
|
||||||
if (
|
return {
|
||||||
hasattr(self.session_manager, "docker_client")
|
"State": {
|
||||||
and self.session_manager.docker_client
|
"Status": container_info.status,
|
||||||
):
|
"Health": {"Status": container_info.health_status} if container_info.health_status else {}
|
||||||
container = self.session_manager.docker_client.containers.get(
|
}
|
||||||
container_id
|
}
|
||||||
)
|
elif self.docker_client and hasattr(self.docker_client, "get_container_info"):
|
||||||
return container.attrs
|
container_info = await self.docker_client.get_container_info(container_id)
|
||||||
|
if container_info:
|
||||||
|
return {
|
||||||
|
"State": {
|
||||||
|
"Status": container_info.status,
|
||||||
|
"Health": {"Status": container_info.health_status} if container_info.health_status else {}
|
||||||
|
}
|
||||||
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Failed to get container info for {container_id}",
|
f"Failed to get container info for {container_id}",
|
||||||
@@ -384,8 +398,8 @@ class ContainerHealthMonitor:
|
|||||||
|
|
||||||
# Trigger container restart through session manager
|
# Trigger container restart through session manager
|
||||||
if self.session_manager:
|
if self.session_manager:
|
||||||
# Create new container for the session
|
# Restart container for the SAME session (preserves session_id)
|
||||||
await self.session_manager.create_session()
|
await self.session_manager.restart_session(session_id)
|
||||||
logger.info(
|
logger.info(
|
||||||
"Container restart initiated",
|
"Container restart initiated",
|
||||||
extra={
|
extra={
|
||||||
@@ -418,17 +432,22 @@ class ContainerHealthMonitor:
|
|||||||
async def _stop_container(self, container_id: str):
|
async def _stop_container(self, container_id: str):
|
||||||
"""Stop a container."""
|
"""Stop a container."""
|
||||||
try:
|
try:
|
||||||
if self.docker_client:
|
# Use session_manager.docker_service for container operations
|
||||||
container = await self.docker_client.get_container(container_id)
|
# docker_service.stop_container takes container_id as a string
|
||||||
await self.docker_client.stop_container(container, timeout=10)
|
if (
|
||||||
elif (
|
self.session_manager
|
||||||
hasattr(self.session_manager, "docker_client")
|
and hasattr(self.session_manager, "docker_service")
|
||||||
and self.session_manager.docker_client
|
and self.session_manager.docker_service
|
||||||
):
|
):
|
||||||
container = self.session_manager.docker_client.containers.get(
|
await self.session_manager.docker_service.stop_container(container_id, timeout=10)
|
||||||
container_id
|
elif self.docker_client and hasattr(self.docker_client, "stop_container"):
|
||||||
|
# If docker_client is docker_service, use it directly
|
||||||
|
await self.docker_client.stop_container(container_id, timeout=10)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"No docker client available to stop container",
|
||||||
|
extra={"container_id": container_id},
|
||||||
)
|
)
|
||||||
container.stop(timeout=10)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Failed to stop container during restart",
|
"Failed to stop container during restart",
|
||||||
|
|||||||
@@ -367,6 +367,86 @@ class SessionManager:
|
|||||||
async def list_sessions(self) -> List[SessionData]:
|
async def list_sessions(self) -> List[SessionData]:
|
||||||
return list(self.sessions.values())
|
return list(self.sessions.values())
|
||||||
|
|
||||||
|
async def restart_session(self, session_id: str) -> Optional[SessionData]:
|
||||||
|
"""Restart a session's container while preserving the session ID.
|
||||||
|
|
||||||
|
Unlike create_session(), this reuses the existing session data
|
||||||
|
and only creates a new container, maintaining session ID continuity.
|
||||||
|
This method removes the old container to free up the port.
|
||||||
|
"""
|
||||||
|
session = await self.get_session(session_id)
|
||||||
|
if not session:
|
||||||
|
logger.error(
|
||||||
|
"Cannot restart session: not found",
|
||||||
|
extra={"session_id": session_id},
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
old_container_id = session.container_id
|
||||||
|
logger.info(
|
||||||
|
"Restarting session container",
|
||||||
|
extra={"session_id": session_id, "old_container_id": old_container_id},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Stop and remove old container to free up the port
|
||||||
|
if old_container_id and self.docker_service:
|
||||||
|
try:
|
||||||
|
logger.info(
|
||||||
|
"Stopping old container for restart",
|
||||||
|
extra={"session_id": session_id, "container_id": old_container_id},
|
||||||
|
)
|
||||||
|
await self.docker_service.stop_container(old_container_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
"Failed to stop old container (may already be stopped)",
|
||||||
|
extra={"session_id": session_id, "container_id": old_container_id, "error": str(e)},
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
logger.info(
|
||||||
|
"Removing old container for restart",
|
||||||
|
extra={"session_id": session_id, "container_id": old_container_id},
|
||||||
|
)
|
||||||
|
await self.docker_service.remove_container(old_container_id, force=True)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
"Failed to remove old container",
|
||||||
|
extra={"session_id": session_id, "container_id": old_container_id, "error": str(e)},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate new container name for the restart
|
||||||
|
new_container_name = f"opencode-{session_id}-{uuid.uuid4().hex[:8]}"
|
||||||
|
session.container_name = new_container_name
|
||||||
|
session.container_id = None # Clear old container_id
|
||||||
|
session.status = "starting"
|
||||||
|
|
||||||
|
# Update session in store before starting container
|
||||||
|
self.sessions[session_id] = session
|
||||||
|
|
||||||
|
if USE_DATABASE_STORAGE:
|
||||||
|
try:
|
||||||
|
await SessionModel.update_session(
|
||||||
|
session_id,
|
||||||
|
{
|
||||||
|
"container_name": new_container_name,
|
||||||
|
"container_id": None,
|
||||||
|
"status": "starting",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
"Failed to update session in database during restart",
|
||||||
|
extra={"session_id": session_id, "error": str(e)},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Start new container for this session
|
||||||
|
if USE_ASYNC_DOCKER:
|
||||||
|
asyncio.create_task(self._start_container_async(session))
|
||||||
|
else:
|
||||||
|
asyncio.create_task(self._start_container_sync(session))
|
||||||
|
|
||||||
|
return session
|
||||||
|
|
||||||
async def list_containers_async(self, all: bool = False) -> List:
|
async def list_containers_async(self, all: bool = False) -> List:
|
||||||
return await self.docker_service.list_containers(all=all)
|
return await self.docker_service.list_containers(all=all)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user