fixed all remaining issues with the session manager
This commit is contained in:
574
session-manager/container_health.py
Normal file
574
session-manager/container_health.py
Normal file
@@ -0,0 +1,574 @@
|
||||
"""
Container Health Monitoring System

Provides active monitoring of Docker containers with automatic failure detection,
recovery mechanisms, and integration with session management and alerting systems.
"""
|
||||
|
||||
import asyncio
import logging
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

from logging_config import get_logger, log_performance, log_security_event
|
||||
|
||||
# Module-level logger, obtained from logging_config.get_logger and named after
# this module.
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ContainerStatus(Enum):
    """Health states a monitored container can be reported in.

    The string values are what ``HealthCheckResult.to_dict`` emits for the
    ``status`` field.
    """

    # Container is running and (if a Docker health check exists) reports healthy.
    HEALTHY = "healthy"
    # Container is running but its Docker health check is not passing.
    UNHEALTHY = "unhealthy"
    # Restart-in-progress state.
    RESTARTING = "restarting"
    # Container is missing or not in the "running" state.
    FAILED = "failed"
    # State could not be determined (no container ID, or the check raised).
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
class HealthCheckResult:
    """Result of a single container health check.

    Attributes:
        session_id: Identifier of the session that owns the container.
        container_id: Identifier of the checked container.
        status: Outcome of the check.
        response_time: Value supplied by the caller (the monitor passes
            elapsed milliseconds), or ``None`` when not measured.
        error_message: Description of the failure, if any.
        metadata: Extra details about the check; never ``None``.
        timestamp: Naive UTC time at which the result object was created.
    """

    def __init__(
        self,
        session_id: str,
        container_id: str,
        status: ContainerStatus,
        response_time: Optional[float] = None,
        error_message: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        self.session_id = session_id
        self.container_id = container_id
        self.status = status
        self.response_time = response_time
        self.error_message = error_message
        self.metadata = metadata or {}
        # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
        # "now" and drop the tzinfo so the value stays naive UTC, exactly as
        # before; code comparing it with other naive datetimes keeps working.
        self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None)

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(session_id={self.session_id!r}, "
            f"container_id={self.container_id!r}, status={self.status!r}, "
            f"response_time={self.response_time!r}, "
            f"error_message={self.error_message!r})"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for logging/serialization.

        Returns:
            A dict with the result's fields; ``status`` is reduced to its
            enum value and ``timestamp`` to an ISO-8601 string.
        """
        return {
            "session_id": self.session_id,
            "container_id": self.container_id,
            "status": self.status.value,
            "response_time": self.response_time,
            "error_message": self.error_message,
            "metadata": self.metadata,
            "timestamp": self.timestamp.isoformat(),
        }
|
||||
|
||||
|
||||
class ContainerHealthMonitor:
    """Monitors Docker container health and handles automatic recovery.

    A background asyncio task periodically inspects the container of every
    session whose status is ``"running"``, records the outcome per session and
    triggers a restart once the last ``failure_threshold`` checks have all
    failed. Once ``max_restart_attempts`` restarts have been used up, the
    session is marked as failed instead.

    The session manager and (optionally) an async Docker client are injected
    through :meth:`set_dependencies`; until then the monitor does nothing.
    """

    # Baseline number of results retained per session. The effective cap is
    # raised to ``failure_threshold`` when that is larger, otherwise the
    # restart window could never fill up.
    _MAX_HISTORY_PER_SESSION = 10

    def __init__(
        self,
        check_interval: int = 30,  # seconds
        health_timeout: float = 10.0,  # seconds
        max_restart_attempts: int = 3,
        restart_delay: int = 5,  # seconds
        failure_threshold: int = 3,  # consecutive failures before restart
    ):
        """Create a monitor.

        Args:
            check_interval: Seconds to sleep between monitoring rounds.
            health_timeout: Seconds allowed for fetching one container's info.
            max_restart_attempts: Restarts allowed before the session is
                marked as failed.
            restart_delay: Seconds to wait between stopping a container and
                asking the session manager for a replacement.
            failure_threshold: Number of most recent checks that must all be
                non-healthy before a restart is triggered.
        """
        self.check_interval = check_interval
        self.health_timeout = health_timeout
        self.max_restart_attempts = max_restart_attempts
        self.restart_delay = restart_delay
        self.failure_threshold = failure_threshold

        # Monitoring state
        self._monitoring = False
        self._task: Optional[asyncio.Task] = None
        self._health_history: Dict[str, List[HealthCheckResult]] = {}
        self._restart_counts: Dict[str, int] = {}

        # Dependencies (injected)
        self.session_manager = None
        self.docker_client = None

        logger.info(
            "Container health monitor initialized",
            extra={
                "check_interval": check_interval,
                "health_timeout": health_timeout,
                "max_restart_attempts": max_restart_attempts,
            },
        )

    def set_dependencies(self, session_manager, docker_client):
        """Set the session manager and Docker client used for monitoring."""
        self.session_manager = session_manager
        self.docker_client = docker_client

    async def start_monitoring(self):
        """Start the health monitoring loop as a background task.

        Calling this while monitoring is already active only logs a warning.
        """
        if self._monitoring:
            logger.warning("Health monitoring already running")
            return

        self._monitoring = True
        self._task = asyncio.create_task(self._monitoring_loop())
        logger.info("Container health monitoring started")

    async def stop_monitoring(self):
        """Stop the health monitoring loop and wait for its task to finish."""
        if not self._monitoring:
            return

        self._monitoring = False
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            finally:
                # Drop the reference so a finished task is not kept around.
                self._task = None

        logger.info("Container health monitoring stopped")

    async def _monitoring_loop(self):
        """Run health checks and history cleanup until monitoring is stopped."""
        while self._monitoring:
            try:
                await self._perform_health_checks()
                await self._cleanup_old_history()
            except Exception as e:
                # Top-level boundary: one bad round must not kill the loop.
                logger.error("Error in health monitoring loop", extra={"error": str(e)})

            await asyncio.sleep(self.check_interval)

    async def _perform_health_checks(self):
        """Perform health checks on all running containers."""
        # Compare with None rather than truthiness so that a session manager
        # object that happens to be falsy is still used.
        if self.session_manager is None:
            return

        # Get all running sessions
        running_sessions = [
            session
            for session in self.session_manager.sessions.values()
            if session.status == "running"
        ]
        if not running_sessions:
            return

        logger.debug(f"Checking health of {len(running_sessions)} running containers")

        # Perform health checks concurrently; gather() keeps result order
        # aligned with running_sessions.
        results = await asyncio.gather(
            *(self._check_container_health(session) for session in running_sessions),
            return_exceptions=True,
        )

        # Process results
        for session, result in zip(running_sessions, results):
            if isinstance(result, Exception):
                logger.error(
                    "Health check failed",
                    extra={
                        "session_id": session.session_id,
                        "container_id": session.container_id,
                        "error": str(result),
                    },
                )
                continue

            await self._process_health_result(result)

    async def _check_container_health(self, session) -> HealthCheckResult:
        """Check the health of a single session's container.

        Args:
            session: Object exposing ``session_id`` and ``container_id``.

        Returns:
            A HealthCheckResult. UNKNOWN is returned when there is no
            container ID, the lookup times out, or an unexpected error
            occurs. FAILED is returned when no container info is available
            or the container is not in the "running" state. UNHEALTHY is
            returned when Docker's own health check reports "unhealthy" or
            "starting". Any other running container is HEALTHY.
        """
        loop = asyncio.get_running_loop()
        started = loop.time()

        def elapsed_ms() -> float:
            return (loop.time() - started) * 1000

        try:
            # Check if container exists and is running
            if not session.container_id:
                return HealthCheckResult(
                    session.session_id,
                    session.container_id or "unknown",
                    ContainerStatus.UNKNOWN,
                    error_message="No container ID",
                )

            # Get container status, bounded by the configured timeout so one
            # unresponsive lookup cannot stall the whole round.
            # NOTE(review): the timeout cannot interrupt the synchronous
            # fallback path in _get_container_info, which blocks the loop.
            container_info = await asyncio.wait_for(
                self._get_container_info(session.container_id),
                timeout=self.health_timeout,
            )
            if not container_info:
                # NOTE(review): _get_container_info also returns None when the
                # Docker call raised, so transient API errors land here too.
                return HealthCheckResult(
                    session.session_id,
                    session.container_id,
                    ContainerStatus.FAILED,
                    error_message="Container not found",
                )

            # Check container state
            state = container_info.get("State", {})
            status = state.get("Status", "unknown")
            if status != "running":
                return HealthCheckResult(
                    session.session_id,
                    session.container_id,
                    ContainerStatus.FAILED,
                    error_message=f"Container status: {status}",
                )

            # Check health status if available
            health = state.get("Health", {})
            if health:
                health_status = health.get("Status", "unknown")
                if health_status == "healthy":
                    return HealthCheckResult(
                        session.session_id,
                        session.container_id,
                        ContainerStatus.HEALTHY,
                        response_time=elapsed_ms(),
                        metadata={
                            "docker_status": status,
                            "health_status": health_status,
                        },
                    )
                if health_status in ("unhealthy", "starting"):
                    return HealthCheckResult(
                        session.session_id,
                        session.container_id,
                        ContainerStatus.UNHEALTHY,
                        error_message=f"Health check: {health_status}",
                        metadata={
                            "docker_status": status,
                            "health_status": health_status,
                        },
                    )

            # If no health check configured (or its status is not one of the
            # values handled above), consider running containers healthy.
            return HealthCheckResult(
                session.session_id,
                session.container_id,
                ContainerStatus.HEALTHY,
                response_time=elapsed_ms(),
                metadata={"docker_status": status},
            )

        except asyncio.TimeoutError:
            # str(TimeoutError()) is empty, so give the result a real message.
            return HealthCheckResult(
                session.session_id,
                session.container_id or "unknown",
                ContainerStatus.UNKNOWN,
                response_time=elapsed_ms(),
                error_message=f"Health check timed out after {self.health_timeout}s",
            )
        except Exception as e:
            return HealthCheckResult(
                session.session_id,
                session.container_id or "unknown",
                ContainerStatus.UNKNOWN,
                response_time=elapsed_ms(),
                error_message=str(e),
            )

    async def _get_container_info(self, container_id: str) -> Optional[Dict[str, Any]]:
        """Get container information from Docker.

        Uses the injected async Docker client when present; otherwise falls
        back to a synchronous client on the session manager, if it has one.

        Returns:
            The container's inspect data, or ``None`` when no client is
            available, the container object exposes no known accessor, or the
            lookup raised (the error is logged at debug level).
        """
        try:
            if self.docker_client:
                # Try async Docker client first
                container = await self.docker_client.get_container(container_id)
                if hasattr(container, "_container"):
                    return await container._container.show()
                elif hasattr(container, "show"):
                    return await container.show()
            else:
                # Fallback to sync client if available
                if (
                    hasattr(self.session_manager, "docker_client")
                    and self.session_manager.docker_client
                ):
                    container = self.session_manager.docker_client.containers.get(
                        container_id
                    )
                    return container.attrs
        except Exception as e:
            logger.debug(
                f"Failed to get container info for {container_id}",
                extra={"error": str(e)},
            )

        return None

    async def _process_health_result(self, result: HealthCheckResult):
        """Record a health check result, log it, and trigger recovery if needed."""
        # Store result in history
        history = self._health_history.setdefault(result.session_id, [])
        history.append(result)

        # Keep only recent history. The cap never drops below
        # failure_threshold, otherwise the restart window could never fill.
        keep = max(self._MAX_HISTORY_PER_SESSION, self.failure_threshold)
        if len(history) > keep:
            del history[:-keep]

        # Log result
        log_extra = result.to_dict()
        if result.status == ContainerStatus.HEALTHY:
            logger.debug("Container health check passed", extra=log_extra)
        elif result.status == ContainerStatus.UNHEALTHY:
            logger.warning("Container health check failed", extra=log_extra)
        elif result.status in [ContainerStatus.FAILED, ContainerStatus.UNKNOWN]:
            logger.error("Container health check critical", extra=log_extra)

        # Check if restart is needed
        await self._check_restart_needed(result)

    async def _check_restart_needed(self, result: HealthCheckResult):
        """Restart the container when the most recent checks have all failed.

        A healthy result resets the session's restart counter. Otherwise the
        last ``failure_threshold`` results are inspected and a restart is
        requested when every one of them is UNHEALTHY, FAILED or UNKNOWN.
        """
        if result.status == ContainerStatus.HEALTHY:
            # Reset restart count on successful health check
            if result.session_id in self._restart_counts:
                self._restart_counts[result.session_id] = 0
            return

        # Count recent failures
        # NOTE(review): failures recorded before a restart stay in this
        # window, so the first failed check after a restart can trigger
        # another restart immediately - confirm whether that is intended.
        failure_states = (
            ContainerStatus.UNHEALTHY,
            ContainerStatus.FAILED,
            ContainerStatus.UNKNOWN,
        )
        recent_results = self._health_history.get(result.session_id, [])
        recent_failures = sum(
            1
            for r in recent_results[-self.failure_threshold :]
            if r.status in failure_states
        )

        if recent_failures >= self.failure_threshold:
            await self._restart_container(result.session_id, result.container_id)

    async def _restart_container(self, session_id: str, container_id: str):
        """Restart a failed container, or fail the session at the restart limit.

        Errors raised while restarting are logged and not propagated.
        """
        # Check restart limit
        restart_count = self._restart_counts.get(session_id, 0)
        if restart_count >= self.max_restart_attempts:
            logger.error(
                "Container restart limit exceeded",
                extra={
                    "session_id": session_id,
                    "container_id": container_id,
                    "restart_attempts": restart_count,
                },
            )
            # Mark session as failed
            await self._mark_session_failed(
                session_id, f"Restart limit exceeded ({restart_count} attempts)"
            )
            return

        logger.info(
            "Attempting container restart",
            extra={
                "session_id": session_id,
                "container_id": container_id,
                "restart_attempt": restart_count + 1,
            },
        )

        try:
            # Stop the container
            await self._stop_container(container_id)

            # Wait before restart
            await asyncio.sleep(self.restart_delay)

            # Start new container for the session
            session = await self.session_manager.get_session(session_id)
            if session:
                # Update restart count
                self._restart_counts[session_id] = restart_count + 1

                # Mark as restarting
                await self._update_session_status(session_id, "restarting")

                # Trigger container restart through session manager
                if self.session_manager:
                    # NOTE(review): create_session() is called without any
                    # reference to session_id, so this appears to create a
                    # new session rather than a new container for this one.
                    # Confirm against the session manager's API.
                    await self.session_manager.create_session()
                    logger.info(
                        "Container restart initiated",
                        extra={
                            "session_id": session_id,
                            "restart_attempt": restart_count + 1,
                        },
                    )

                # Log security event
                log_security_event(
                    "container_restart",
                    "warning",
                    {
                        "session_id": session_id,
                        "container_id": container_id,
                        "reason": "health_check_failure",
                    },
                )

        except Exception as e:
            logger.error(
                "Container restart failed",
                extra={
                    "session_id": session_id,
                    "container_id": container_id,
                    "error": str(e),
                },
            )

    async def _stop_container(self, container_id: str):
        """Stop a container, best effort; failures are logged as warnings."""
        try:
            if self.docker_client:
                container = await self.docker_client.get_container(container_id)
                await self.docker_client.stop_container(container, timeout=10)
            elif (
                hasattr(self.session_manager, "docker_client")
                and self.session_manager.docker_client
            ):
                container = self.session_manager.docker_client.containers.get(
                    container_id
                )
                container.stop(timeout=10)
        except Exception as e:
            logger.warning(
                "Failed to stop container during restart",
                extra={"container_id": container_id, "error": str(e)},
            )

    async def _update_session_status(self, session_id: str, status: str):
        """Set a session's status in memory and, if enabled, in the database."""
        if self.session_manager is None:
            return

        session = self.session_manager.sessions.get(session_id)
        if not session:
            return

        session.status = status

        # Update in database if using database storage
        if (
            hasattr(self.session_manager, "USE_DATABASE_STORAGE")
            and self.session_manager.USE_DATABASE_STORAGE
        ):
            try:
                # Imported lazily, as in the original code.
                from database import SessionModel

                await SessionModel.update_session(session_id, {"status": status})
            except Exception as e:
                logger.warning(
                    "Failed to update session status in database",
                    extra={"session_id": session_id, "error": str(e)},
                )

    async def _mark_session_failed(self, session_id: str, reason: str):
        """Mark a session as permanently failed and record why."""
        await self._update_session_status(session_id, "failed")

        logger.error(
            "Session marked as failed",
            extra={"session_id": session_id, "reason": reason},
        )

        # Log security event
        log_security_event(
            "session_failure", "error", {"session_id": session_id, "reason": reason}
        )

    async def _cleanup_old_history(self):
        """Drop health check results older than one hour and empty histories."""
        # Naive UTC "now", matching HealthCheckResult.timestamp; replaces the
        # deprecated datetime.utcnow().
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        cutoff_time = now_utc - timedelta(hours=1)  # Keep last hour

        # Iterate over a snapshot of the keys because entries may be deleted.
        for session_id in list(self._health_history):
            fresh = [
                result
                for result in self._health_history[session_id]
                if result.timestamp > cutoff_time
            ]
            if fresh:
                self._health_history[session_id] = fresh
            else:
                # Remove empty histories
                del self._health_history[session_id]

    def get_health_stats(self, session_id: Optional[str] = None) -> Dict[str, Any]:
        """Get health monitoring statistics.

        Args:
            session_id: When given and known, per-session details are added
                under the key ``f"session_{session_id}"``.

        Returns:
            A dict with global monitoring statistics and, optionally, the
            per-session breakdown.
        """
        stats = {
            "monitoring_active": self._monitoring,
            "check_interval": self.check_interval,
            "total_sessions_monitored": len(self._health_history),
            # A session counts as failing if any of its last 5 checks was
            # not healthy.
            "sessions_with_failures": sum(
                1
                for history in self._health_history.values()
                if any(r.status != ContainerStatus.HEALTHY for r in history[-5:])
            ),
            "restart_counts": dict(self._restart_counts),
        }

        if session_id and session_id in self._health_history:
            recent_results = self._health_history[session_id][
                -self._MAX_HISTORY_PER_SESSION :
            ]
            healthy_checks = sum(
                1 for r in recent_results if r.status == ContainerStatus.HEALTHY
            )
            # Test against None, not truthiness, so a legitimate 0.0
            # measurement is still included in the average.
            timings = [
                r.response_time for r in recent_results if r.response_time is not None
            ]
            stats[f"session_{session_id}"] = {
                "total_checks": len(recent_results),
                "healthy_checks": healthy_checks,
                "failed_checks": len(recent_results) - healthy_checks,
                "average_response_time": (
                    sum(timings) / len(timings) if timings else 0.0
                ),
                "last_check": recent_results[-1].to_dict() if recent_results else None,
            }

        return stats

    def get_health_history(
        self, session_id: str, limit: int = 50
    ) -> List[Dict[str, Any]]:
        """Get health check history for a session.

        Args:
            session_id: Session whose history is requested.
            limit: Maximum number of most recent results to return. Values
                of zero or less yield an empty list.

        Returns:
            The most recent results, oldest first, as dictionaries. Empty
            when the session is unknown.
        """
        # Guard explicitly: history[-0:] would return the whole list.
        if limit <= 0:
            return []

        history = self._health_history.get(session_id)
        if not history:
            return []

        return [result.to_dict() for result in history[-limit:]]
|
||||
|
||||
|
||||
# Global health monitor instance, created at import time with the default
# ContainerHealthMonitor settings. Its session manager and Docker client are
# unset until set_dependencies() is called.
_container_health_monitor = ContainerHealthMonitor()
|
||||
|
||||
|
||||
def get_container_health_monitor() -> ContainerHealthMonitor:
    """Return the module-wide ContainerHealthMonitor singleton.

    Every call yields the same instance that was created when this module
    was imported.
    """
    return _container_health_monitor
|
||||
|
||||
|
||||
async def start_container_health_monitoring(session_manager=None, docker_client=None):
    """Start container health monitoring on the global monitor.

    Args:
        session_manager: Session manager to inject. When ``None`` the
            monitor's current dependencies are left untouched.
        docker_client: Docker client injected together with the session
            manager; ignored when ``session_manager`` is ``None``.
    """
    monitor = get_container_health_monitor()
    # Compare with None instead of relying on truthiness: a valid session
    # manager object that evaluates falsy would otherwise never be injected,
    # leaving the monitor with nothing to check.
    if session_manager is not None:
        monitor.set_dependencies(session_manager, docker_client)
    await monitor.start_monitoring()
|
||||
|
||||
|
||||
async def stop_container_health_monitoring():
    """Stop the monitoring loop of the global health monitor."""
    await get_container_health_monitor().stop_monitoring()
|
||||
|
||||
|
||||
def get_container_health_stats(session_id: Optional[str] = None) -> Dict[str, Any]:
    """Return health statistics from the global monitor.

    ``session_id`` is forwarded unchanged to
    ``ContainerHealthMonitor.get_health_stats``.
    """
    return get_container_health_monitor().get_health_stats(session_id)
|
||||
|
||||
|
||||
def get_container_health_history(
    session_id: str, limit: int = 50
) -> List[Dict[str, Any]]:
    """Return a session's health check history from the global monitor.

    Both arguments are forwarded unchanged to
    ``ContainerHealthMonitor.get_health_history``.
    """
    return get_container_health_monitor().get_health_history(session_id, limit)
|
||||
Reference in New Issue
Block a user