fix: session stability improvements

- Fix docker client initialization bug in app.py (context manager was closing client)
- Add restart_session() method to preserve session IDs during container restarts
- Add a 60-second startup grace period before health checks run on newly created sessions
- Fix _stop_container and _get_container_info to use docker_service API consistently
- Disable mDNS in Dockerfile to prevent Bonjour service name conflicts
- Remove old container before restart to free port bindings
This commit is contained in:
2026-02-04 19:10:03 +01:00
parent 05aa70c4af
commit 69d18cc494
5 changed files with 138 additions and 41 deletions

View File

@@ -129,23 +129,30 @@ class ContainerHealthMonitor:
"""Main monitoring loop."""
while self._monitoring:
try:
await self._perform_health_checks()
await self._check_all_containers()
await self._cleanup_old_history()
except Exception as e:
logger.error("Error in health monitoring loop", extra={"error": str(e)})
await asyncio.sleep(self.check_interval)
async def _perform_health_checks(self):
async def _check_all_containers(self):
"""Perform health checks on all running containers."""
if not self.session_manager:
return
# Get all running sessions
from datetime import datetime, timedelta
# Startup grace period - don't check containers that started recently
startup_grace_period = timedelta(seconds=60)
now = datetime.now()
# Get all running sessions that are past the startup grace period
running_sessions = [
session
for session in self.session_manager.sessions.values()
if session.status == "running"
if session.status == "running"
and (now - session.created_at) > startup_grace_period
]
if not running_sessions:
@@ -263,23 +270,30 @@ class ContainerHealthMonitor:
async def _get_container_info(self, container_id: str) -> Optional[Dict[str, Any]]:
"""Get container information from Docker."""
try:
if self.docker_client:
# Try async Docker client first
container = await self.docker_client.get_container(container_id)
if hasattr(container, "_container"):
return await container._container.show()
elif hasattr(container, "show"):
return await container.show()
else:
# Fallback to sync client if available
if (
hasattr(self.session_manager, "docker_client")
and self.session_manager.docker_client
):
container = self.session_manager.docker_client.containers.get(
container_id
)
return container.attrs
# Use session_manager.docker_service for consistent container access
if (
self.session_manager
and hasattr(self.session_manager, "docker_service")
and self.session_manager.docker_service
):
container_info = await self.session_manager.docker_service.get_container_info(container_id)
if container_info:
# Convert ContainerInfo to dict format expected by health check
return {
"State": {
"Status": container_info.status,
"Health": {"Status": container_info.health_status} if container_info.health_status else {}
}
}
elif self.docker_client and hasattr(self.docker_client, "get_container_info"):
container_info = await self.docker_client.get_container_info(container_id)
if container_info:
return {
"State": {
"Status": container_info.status,
"Health": {"Status": container_info.health_status} if container_info.health_status else {}
}
}
except Exception as e:
logger.debug(
f"Failed to get container info for {container_id}",
@@ -384,8 +398,8 @@ class ContainerHealthMonitor:
# Trigger container restart through session manager
if self.session_manager:
# Create new container for the session
await self.session_manager.create_session()
# Restart container for the SAME session (preserves session_id)
await self.session_manager.restart_session(session_id)
logger.info(
"Container restart initiated",
extra={
@@ -418,17 +432,22 @@ class ContainerHealthMonitor:
async def _stop_container(self, container_id: str):
"""Stop a container."""
try:
if self.docker_client:
container = await self.docker_client.get_container(container_id)
await self.docker_client.stop_container(container, timeout=10)
elif (
hasattr(self.session_manager, "docker_client")
and self.session_manager.docker_client
# Use session_manager.docker_service for container operations
# docker_service.stop_container takes container_id as a string
if (
self.session_manager
and hasattr(self.session_manager, "docker_service")
and self.session_manager.docker_service
):
container = self.session_manager.docker_client.containers.get(
container_id
await self.session_manager.docker_service.stop_container(container_id, timeout=10)
elif self.docker_client and hasattr(self.docker_client, "stop_container"):
# Fall back to docker_client when it exposes a stop_container method that accepts a container_id
await self.docker_client.stop_container(container_id, timeout=10)
else:
logger.warning(
"No docker client available to stop container",
extra={"container_id": container_id},
)
container.stop(timeout=10)
except Exception as e:
logger.warning(
"Failed to stop container during restart",