fix: session stability improvements
- Fix docker client initialization bug in app.py (the context manager was closing the client)
- Add restart_session() method to preserve session IDs during container restarts
- Add a 60-second startup grace period before health checking new sessions
- Fix _stop_container and _get_container_info to use the docker_service API consistently
- Disable mDNS in the Dockerfile to prevent Bonjour service name conflicts
- Remove the old container before restart to free its port bindings
This commit is contained in:
@@ -129,23 +129,30 @@ class ContainerHealthMonitor:
|
||||
"""Main monitoring loop."""
|
||||
while self._monitoring:
|
||||
try:
|
||||
await self._perform_health_checks()
|
||||
await self._check_all_containers()
|
||||
await self._cleanup_old_history()
|
||||
except Exception as e:
|
||||
logger.error("Error in health monitoring loop", extra={"error": str(e)})
|
||||
|
||||
await asyncio.sleep(self.check_interval)
|
||||
|
||||
async def _perform_health_checks(self):
|
||||
async def _check_all_containers(self):
|
||||
"""Perform health checks on all running containers."""
|
||||
if not self.session_manager:
|
||||
return
|
||||
|
||||
# Get all running sessions
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Startup grace period - don't check containers that started recently
|
||||
startup_grace_period = timedelta(seconds=60)
|
||||
now = datetime.now()
|
||||
|
||||
# Get all running sessions that are past the startup grace period
|
||||
running_sessions = [
|
||||
session
|
||||
for session in self.session_manager.sessions.values()
|
||||
if session.status == "running"
|
||||
if session.status == "running"
|
||||
and (now - session.created_at) > startup_grace_period
|
||||
]
|
||||
|
||||
if not running_sessions:
|
||||
@@ -263,23 +270,30 @@ class ContainerHealthMonitor:
|
||||
async def _get_container_info(self, container_id: str) -> Optional[Dict[str, Any]]:
|
||||
"""Get container information from Docker."""
|
||||
try:
|
||||
if self.docker_client:
|
||||
# Try async Docker client first
|
||||
container = await self.docker_client.get_container(container_id)
|
||||
if hasattr(container, "_container"):
|
||||
return await container._container.show()
|
||||
elif hasattr(container, "show"):
|
||||
return await container.show()
|
||||
else:
|
||||
# Fallback to sync client if available
|
||||
if (
|
||||
hasattr(self.session_manager, "docker_client")
|
||||
and self.session_manager.docker_client
|
||||
):
|
||||
container = self.session_manager.docker_client.containers.get(
|
||||
container_id
|
||||
)
|
||||
return container.attrs
|
||||
# Use session_manager.docker_service for consistent container access
|
||||
if (
|
||||
self.session_manager
|
||||
and hasattr(self.session_manager, "docker_service")
|
||||
and self.session_manager.docker_service
|
||||
):
|
||||
container_info = await self.session_manager.docker_service.get_container_info(container_id)
|
||||
if container_info:
|
||||
# Convert ContainerInfo to dict format expected by health check
|
||||
return {
|
||||
"State": {
|
||||
"Status": container_info.status,
|
||||
"Health": {"Status": container_info.health_status} if container_info.health_status else {}
|
||||
}
|
||||
}
|
||||
elif self.docker_client and hasattr(self.docker_client, "get_container_info"):
|
||||
container_info = await self.docker_client.get_container_info(container_id)
|
||||
if container_info:
|
||||
return {
|
||||
"State": {
|
||||
"Status": container_info.status,
|
||||
"Health": {"Status": container_info.health_status} if container_info.health_status else {}
|
||||
}
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug(
|
||||
f"Failed to get container info for {container_id}",
|
||||
@@ -384,8 +398,8 @@ class ContainerHealthMonitor:
|
||||
|
||||
# Trigger container restart through session manager
|
||||
if self.session_manager:
|
||||
# Create new container for the session
|
||||
await self.session_manager.create_session()
|
||||
# Restart container for the SAME session (preserves session_id)
|
||||
await self.session_manager.restart_session(session_id)
|
||||
logger.info(
|
||||
"Container restart initiated",
|
||||
extra={
|
||||
@@ -418,17 +432,22 @@ class ContainerHealthMonitor:
|
||||
async def _stop_container(self, container_id: str):
|
||||
"""Stop a container."""
|
||||
try:
|
||||
if self.docker_client:
|
||||
container = await self.docker_client.get_container(container_id)
|
||||
await self.docker_client.stop_container(container, timeout=10)
|
||||
elif (
|
||||
hasattr(self.session_manager, "docker_client")
|
||||
and self.session_manager.docker_client
|
||||
# Use session_manager.docker_service for container operations
|
||||
# docker_service.stop_container takes container_id as a string
|
||||
if (
|
||||
self.session_manager
|
||||
and hasattr(self.session_manager, "docker_service")
|
||||
and self.session_manager.docker_service
|
||||
):
|
||||
container = self.session_manager.docker_client.containers.get(
|
||||
container_id
|
||||
await self.session_manager.docker_service.stop_container(container_id, timeout=10)
|
||||
elif self.docker_client and hasattr(self.docker_client, "stop_container"):
|
||||
# If docker_client is docker_service, use it directly
|
||||
await self.docker_client.stop_container(container_id, timeout=10)
|
||||
else:
|
||||
logger.warning(
|
||||
"No docker client available to stop container",
|
||||
extra={"container_id": container_id},
|
||||
)
|
||||
container.stop(timeout=10)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Failed to stop container during restart",
|
||||
|
||||
Reference in New Issue
Block a user