fixed all remaining issues with the session manager

2026-01-18 23:28:49 +01:00
parent 0243cfc250
commit 2f5464e1d2
11 changed files with 4040 additions and 101 deletions
--- a/session-manager/container_health.py
+++ b/session-manager/container_health.py
@@ -0,0 +1,574 @@
+"""
+Container Health Monitoring System
+
+Provides active monitoring of Docker containers with automatic failure detection,
+recovery mechanisms, and integration with session management and alerting systems.
+"""
+
+import asyncio
+import logging
+from typing import Dict, List, Optional, Tuple, Any
+from datetime import datetime, timedelta
+from enum import Enum
+
+from logging_config import get_logger, log_performance, log_security_event
+
+logger = get_logger(__name__)
+
+
+class ContainerStatus(Enum):
+    """Health states a monitored container can be reported in."""
+
+    # Running, and either passing its Docker health check or having none.
+    HEALTHY = "healthy"
+    # Running, but the Docker health check reports "unhealthy" or "starting".
+    UNHEALTHY = "unhealthy"
+    RESTARTING = "restarting"
+    # Container could not be found, or its Docker state is not "running".
+    FAILED = "failed"
+    # No container ID was available, or the probe itself raised.
+    UNKNOWN = "unknown"
+
+
+class HealthCheckResult:
+    """Outcome of a single health probe against a session's container.
+
+    Attributes:
+        session_id: Session the probed container belongs to.
+        container_id: Identifier of the probed container.
+        status: Resulting ``ContainerStatus`` member.
+        response_time: Probe duration, or ``None`` when it was not measured.
+        error_message: Failure description, or ``None``.
+        metadata: Extra details; an empty dict when none were supplied.
+        timestamp: Naive UTC time at which this object was constructed.
+    """
+
+    def __init__(
+        self,
+        session_id: str,
+        container_id: str,
+        status: ContainerStatus,
+        response_time: Optional[float] = None,
+        error_message: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ):
+        self.session_id, self.container_id = session_id, container_id
+        self.status = status
+        self.response_time, self.error_message = response_time, error_message
+        # A falsy ``metadata`` (None or empty) is replaced by a fresh dict.
+        self.metadata = metadata if metadata else {}
+        self.timestamp = datetime.utcnow()
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict view suitable for logging or serialization.
+
+        The status is emitted as its string value and the timestamp in ISO
+        format; every other attribute is passed through unchanged.
+        """
+        payload: Dict[str, Any] = {
+            "session_id": self.session_id,
+            "container_id": self.container_id,
+        }
+        payload["status"] = self.status.value
+        for passthrough in ("response_time", "error_message", "metadata"):
+            payload[passthrough] = getattr(self, passthrough)
+        payload["timestamp"] = self.timestamp.isoformat()
+        return payload
+
+
+class ContainerHealthMonitor:
+    """Monitors Docker container health and handles automatic recovery."""
+
+    def __init__(
+        self,
+        check_interval: int = 30,  # seconds
+        health_timeout: float = 10.0,  # seconds
+        max_restart_attempts: int = 3,
+        restart_delay: int = 5,  # seconds
+        failure_threshold: int = 3,  # consecutive failures before restart
+    ):
+        """Store the tunables and create empty monitoring state.
+
+        Nothing is monitored until ``start_monitoring`` is awaited.
+
+        Args:
+            check_interval: Seconds the monitoring loop sleeps between passes.
+            health_timeout: Stored on the instance; not referenced by the
+                methods visible in this module.
+            max_restart_attempts: Restarts allowed per session before the
+                session is marked failed.
+            restart_delay: Seconds to wait between stopping a container and
+                requesting its replacement.
+            failure_threshold: Number of most recent checks that must all be
+                non-healthy before a restart is attempted.
+        """
+        # Collaborators are wired in later through ``set_dependencies``.
+        self.session_manager = None
+        self.docker_client = None
+
+        # Runtime bookkeeping.
+        self._monitoring = False
+        self._task: Optional[asyncio.Task] = None
+        self._health_history: Dict[str, List[HealthCheckResult]] = {}
+        self._restart_counts: Dict[str, int] = {}
+
+        # Tunables.
+        self.check_interval = check_interval
+        self.health_timeout = health_timeout
+        self.max_restart_attempts = max_restart_attempts
+        self.restart_delay = restart_delay
+        self.failure_threshold = failure_threshold
+
+        startup_details = {
+            "check_interval": check_interval,
+            "health_timeout": health_timeout,
+            "max_restart_attempts": max_restart_attempts,
+        }
+        logger.info("Container health monitor initialized", extra=startup_details)
+
+    def set_dependencies(self, session_manager, docker_client):
+        """Attach the session manager and Docker client used by the monitor."""
+        self.session_manager, self.docker_client = session_manager, docker_client
+
+    async def start_monitoring(self):
+        """Launch the background monitoring task unless it is already running."""
+        if not self._monitoring:
+            self._monitoring = True
+            self._task = asyncio.create_task(self._monitoring_loop())
+            logger.info("Container health monitoring started")
+            return
+
+        logger.warning("Health monitoring already running")
+
+    async def stop_monitoring(self):
+        """Cancel the background monitoring task and wait for it to finish.
+
+        Does nothing when monitoring is not active.
+        """
+        if self._monitoring:
+            self._monitoring = False
+            background = self._task
+            if background:
+                background.cancel()
+                try:
+                    await background
+                except asyncio.CancelledError:
+                    # Expected: the task was cancelled on the line above.
+                    pass
+
+            logger.info("Container health monitoring stopped")
+
+    async def _monitoring_loop(self):
+        """Run health checks and history cleanup until monitoring is stopped.
+
+        Each pass performs the health checks, prunes old history, and then
+        sleeps for ``check_interval`` seconds. Errors raised by a pass are
+        logged and the loop continues; cancellation is always propagated so
+        that ``stop_monitoring`` can end the task promptly.
+        """
+        while self._monitoring:
+            try:
+                await self._perform_health_checks()
+                await self._cleanup_old_history()
+            except asyncio.CancelledError:
+                # On interpreters where CancelledError derives from Exception
+                # the broad handler below would otherwise swallow it.
+                raise
+            except Exception as e:
+                logger.error(
+                    "Error in health monitoring loop",
+                    extra={"error": str(e)},
+                    exc_info=True,  # keep the traceback for diagnosis
+                )
+
+            await asyncio.sleep(self.check_interval)
+
+    async def _perform_health_checks(self):
+        """Probe every session whose status is "running" and process the results.
+
+        Probes run concurrently; results are then processed one at a time in
+        session order. A probe that raised is logged and skipped.
+        """
+        manager = self.session_manager
+        if not manager:
+            return
+
+        running_sessions = []
+        for candidate in manager.sessions.values():
+            if candidate.status == "running":
+                running_sessions.append(candidate)
+
+        if not running_sessions:
+            return
+
+        logger.debug(f"Checking health of {len(running_sessions)} running containers")
+
+        # return_exceptions=True keeps one failing probe from aborting the rest.
+        outcomes = await asyncio.gather(
+            *(self._check_container_health(session) for session in running_sessions),
+            return_exceptions=True,
+        )
+
+        for session, outcome in zip(running_sessions, outcomes):
+            if not isinstance(outcome, Exception):
+                await self._process_health_result(outcome)
+                continue
+
+            logger.error(
+                "Health check failed",
+                extra={
+                    "session_id": session.session_id,
+                    "container_id": session.container_id,
+                    "error": str(outcome),
+                },
+            )
+
+    async def _check_container_health(self, session) -> HealthCheckResult:
+        """Probe one session's container and classify its health.
+
+        Classification:
+            * no container ID                         -> UNKNOWN
+            * container info unavailable              -> FAILED
+            * Docker state other than "running"       -> FAILED
+            * health status "unhealthy" or "starting" -> UNHEALTHY
+            * anything else while running             -> HEALTHY
+            * any exception during the probe          -> UNKNOWN
+
+        ``response_time`` is reported in milliseconds and is only set on the
+        HEALTHY results and on the exception path.
+        """
+        clock = asyncio.get_event_loop().time
+        started = clock()
+
+        def elapsed_ms() -> float:
+            return (clock() - started) * 1000
+
+        try:
+            container_id = session.container_id
+            if not container_id:
+                return HealthCheckResult(
+                    session.session_id,
+                    container_id or "unknown",
+                    ContainerStatus.UNKNOWN,
+                    error_message="No container ID",
+                )
+
+            info = await self._get_container_info(container_id)
+            if not info:
+                return HealthCheckResult(
+                    session.session_id,
+                    container_id,
+                    ContainerStatus.FAILED,
+                    error_message="Container not found",
+                )
+
+            state = info.get("State", {})
+            status = state.get("Status", "unknown")
+            if status != "running":
+                return HealthCheckResult(
+                    session.session_id,
+                    container_id,
+                    ContainerStatus.FAILED,
+                    error_message=f"Container status: {status}",
+                )
+
+            # ``None`` stands for "no health section reported for the container".
+            health = state.get("Health", {})
+            health_status = health.get("Status", "unknown") if health else None
+
+            if health_status in ("unhealthy", "starting"):
+                return HealthCheckResult(
+                    session.session_id,
+                    container_id,
+                    ContainerStatus.UNHEALTHY,
+                    error_message=f"Health check: {health_status}",
+                    metadata={
+                        "docker_status": status,
+                        "health_status": health_status,
+                    },
+                )
+
+            # Running containers without a health section, or with a health
+            # status not handled above, are treated as healthy.
+            details = {"docker_status": status}
+            if health_status == "healthy":
+                details["health_status"] = health_status
+            return HealthCheckResult(
+                session.session_id,
+                container_id,
+                ContainerStatus.HEALTHY,
+                response_time=elapsed_ms(),
+                metadata=details,
+            )
+
+        except Exception as e:
+            return HealthCheckResult(
+                session.session_id,
+                session.container_id or "unknown",
+                ContainerStatus.UNKNOWN,
+                response_time=elapsed_ms(),
+                error_message=str(e),
+            )
+
+    async def _get_container_info(self, container_id: str) -> Optional[Dict[str, Any]]:
+        """Fetch the Docker inspect data for ``container_id``.
+
+        The injected async ``docker_client`` is preferred. When it is not set,
+        the synchronous client exposed by the session manager is used instead;
+        that blocking lookup runs in the default executor so the event loop
+        stays responsive.
+
+        Returns:
+            The container's info mapping, or ``None`` when no client is
+            available, the container object exposes no ``show`` coroutine, or
+            the lookup raised (the error is logged at debug level).
+        """
+        try:
+            if self.docker_client:
+                # Async Docker client path.
+                container = await self.docker_client.get_container(container_id)
+                if hasattr(container, "_container"):
+                    return await container._container.show()
+                if hasattr(container, "show"):
+                    return await container.show()
+                return None
+
+            # Fallback: synchronous client owned by the session manager.
+            sync_client = getattr(self.session_manager, "docker_client", None)
+            if sync_client:
+                loop = asyncio.get_running_loop()
+                container = await loop.run_in_executor(
+                    None, sync_client.containers.get, container_id
+                )
+                return container.attrs
+        except Exception as e:
+            logger.debug(
+                f"Failed to get container info for {container_id}",
+                extra={"error": str(e)},
+            )
+
+        return None
+
+    async def _process_health_result(self, result: HealthCheckResult):
+        """Record a probe result, log it, and evaluate whether to restart.
+
+        At most the ten most recent results are retained per session.
+        """
+        session_key = result.session_id
+        history = self._health_history.setdefault(session_key, [])
+        history.append(result)
+
+        # Bound the per-session history to the latest ten entries.
+        if len(history) > 10:
+            self._health_history[session_key] = history[-10:]
+
+        # Severity of the log line follows the reported status.
+        log_extra = result.to_dict()
+        reporters = {
+            ContainerStatus.HEALTHY: (logger.debug, "Container health check passed"),
+            ContainerStatus.UNHEALTHY: (
+                logger.warning,
+                "Container health check failed",
+            ),
+            ContainerStatus.FAILED: (logger.error, "Container health check critical"),
+            ContainerStatus.UNKNOWN: (logger.error, "Container health check critical"),
+        }
+        reporter = reporters.get(result.status)
+        if reporter is not None:
+            emit, message = reporter
+            emit(message, extra=log_extra)
+
+        await self._check_restart_needed(result)
+
+    async def _check_restart_needed(self, result: HealthCheckResult):
+        """Trigger a restart once the latest checks have all been non-healthy.
+
+        A healthy result resets the session's restart counter (if it has one).
+        Otherwise the last ``failure_threshold`` history entries are inspected
+        and a restart is requested when every one of them is a failure.
+        """
+        session_key = result.session_id
+
+        if result.status == ContainerStatus.HEALTHY:
+            if session_key in self._restart_counts:
+                self._restart_counts[session_key] = 0
+            return
+
+        failing_states = [
+            ContainerStatus.UNHEALTHY,
+            ContainerStatus.FAILED,
+            ContainerStatus.UNKNOWN,
+        ]
+        window = self._health_history.get(session_key, [])[-self.failure_threshold :]
+        failure_total = len([entry for entry in window if entry.status in failing_states])
+
+        if failure_total < self.failure_threshold:
+            return
+
+        await self._restart_container(session_key, result.container_id)
+
+    async def _restart_container(self, session_id: str, container_id: str):
+        """Restart a failed container, giving up once the retry budget is spent.
+
+        When the session has already used ``max_restart_attempts`` restarts it
+        is marked as failed instead. Otherwise the container is stopped, the
+        method waits ``restart_delay`` seconds, and a replacement is requested
+        from the session manager. Any exception raised along the way is logged
+        and swallowed so the monitoring loop keeps running.
+
+        Args:
+            session_id: Session whose container should be restarted.
+            container_id: Container to stop before the replacement is created.
+        """
+        restart_count = self._restart_counts.get(session_id, 0)
+        if restart_count >= self.max_restart_attempts:
+            logger.error(
+                "Container restart limit exceeded",
+                extra={
+                    "session_id": session_id,
+                    "container_id": container_id,
+                    "restart_attempts": restart_count,
+                },
+            )
+            await self._mark_session_failed(
+                session_id, f"Restart limit exceeded ({restart_count} attempts)"
+            )
+            return
+
+        attempt = restart_count + 1
+        logger.info(
+            "Attempting container restart",
+            extra={
+                "session_id": session_id,
+                "container_id": container_id,
+                "restart_attempt": attempt,
+            },
+        )
+
+        try:
+            await self._stop_container(container_id)
+
+            # Pause between stopping the old container and creating the new one.
+            await asyncio.sleep(self.restart_delay)
+
+            session = await self.session_manager.get_session(session_id)
+            if not session:
+                # The container has already been stopped at this point, so make
+                # the abandoned restart visible instead of returning silently.
+                logger.warning(
+                    "Session not found; container restart skipped",
+                    extra={"session_id": session_id, "container_id": container_id},
+                )
+                return
+
+            self._restart_counts[session_id] = attempt
+            await self._update_session_status(session_id, "restarting")
+
+            # NOTE(review): create_session() is called without any reference to
+            # session_id - confirm it replaces this session's container rather
+            # than creating an unrelated session.
+            await self.session_manager.create_session()
+            logger.info(
+                "Container restart initiated",
+                extra={
+                    "session_id": session_id,
+                    "restart_attempt": attempt,
+                },
+            )
+
+            log_security_event(
+                "container_restart",
+                "warning",
+                {
+                    "session_id": session_id,
+                    "container_id": container_id,
+                    "reason": "health_check_failure",
+                },
+            )
+
+        except Exception as e:
+            logger.error(
+                "Container restart failed",
+                extra={
+                    "session_id": session_id,
+                    "container_id": container_id,
+                    "error": str(e),
+                },
+            )
+
+    async def _stop_container(self, container_id: str):
+        """Stop ``container_id`` with a 10-second timeout, best effort.
+
+        The injected async ``docker_client`` is preferred. When it is not set,
+        the session manager's synchronous client is used; its blocking calls
+        run in the default executor so a slow stop does not stall the event
+        loop. Failures are logged as warnings and never raised.
+        """
+        try:
+            if self.docker_client:
+                container = await self.docker_client.get_container(container_id)
+                await self.docker_client.stop_container(container, timeout=10)
+                return
+
+            sync_client = getattr(self.session_manager, "docker_client", None)
+            if sync_client:
+                loop = asyncio.get_running_loop()
+                container = await loop.run_in_executor(
+                    None, sync_client.containers.get, container_id
+                )
+                # run_in_executor forwards positional arguments only, so the
+                # keyword argument is bound with a lambda.
+                await loop.run_in_executor(None, lambda: container.stop(timeout=10))
+        except Exception as e:
+            logger.warning(
+                "Failed to stop container during restart",
+                extra={"container_id": container_id, "error": str(e)},
+            )
+
+    async def _update_session_status(self, session_id: str, status: str):
+        """Set a session's status in memory and, when enabled, in the database.
+
+        Does nothing when no session manager is attached or the session is
+        unknown. A failed database write is logged and otherwise ignored.
+        """
+        manager = self.session_manager
+        if not manager:
+            return
+
+        session = manager.sessions.get(session_id)
+        if not session:
+            return
+
+        session.status = status
+
+        # Persistence is opt-in through the manager's USE_DATABASE_STORAGE flag.
+        if not getattr(manager, "USE_DATABASE_STORAGE", False):
+            return
+
+        try:
+            from database import SessionModel
+
+            await SessionModel.update_session(session_id, {"status": status})
+        except Exception as e:
+            logger.warning(
+                "Failed to update session status in database",
+                extra={"session_id": session_id, "error": str(e)},
+            )
+
+    async def _mark_session_failed(self, session_id: str, reason: str):
+        """Flag the session as "failed" and report why.
+
+        The status change is followed by an error log entry and a
+        ``session_failure`` security event carrying the same details.
+        """
+        await self._update_session_status(session_id, "failed")
+
+        details = {"session_id": session_id, "reason": reason}
+        logger.error("Session marked as failed", extra=details)
+
+        # The security event receives its own copy of the details.
+        log_security_event("session_failure", "error", dict(details))
+
+    async def _cleanup_old_history(self):
+        """Drop health results older than one hour and forget emptied sessions."""
+        oldest_allowed = datetime.utcnow() - timedelta(hours=1)
+
+        # Iterate over a snapshot because entries may be deleted along the way.
+        for session_id, entries in list(self._health_history.items()):
+            still_fresh = [
+                entry for entry in entries if entry.timestamp > oldest_allowed
+            ]
+            if still_fresh:
+                self._health_history[session_id] = still_fresh
+            else:
+                del self._health_history[session_id]
+
+    def get_health_stats(self, session_id: Optional[str] = None) -> Dict[str, Any]:
+        """Summarize monitoring state, optionally with per-session detail.
+
+        Args:
+            session_id: When given and present in the history, a
+                ``session_<id>`` entry covering the last ten checks is added.
+
+        Returns:
+            A dict with the monitor-wide counters plus the optional
+            per-session entry. ``average_response_time`` averages every check
+            that recorded a response time (including ``0.0``) and is ``0.0``
+            when none did.
+        """
+        # A session counts as failing if any of its last five checks was not
+        # healthy.
+        sessions_with_failures = sum(
+            1
+            for history in self._health_history.values()
+            if any(r.status != ContainerStatus.HEALTHY for r in history[-5:])
+        )
+        stats = {
+            "monitoring_active": self._monitoring,
+            "check_interval": self.check_interval,
+            "total_sessions_monitored": len(self._health_history),
+            "sessions_with_failures": sessions_with_failures,
+            "restart_counts": dict(self._restart_counts),
+        }
+
+        if session_id and session_id in self._health_history:
+            recent_results = self._health_history[session_id][-10:]  # Last 10 checks
+            healthy_checks = sum(
+                1 for r in recent_results if r.status == ContainerStatus.HEALTHY
+            )
+            # Compare against None so a measured 0.0 still counts as a sample.
+            timings = [
+                r.response_time
+                for r in recent_results
+                if r.response_time is not None
+            ]
+            stats[f"session_{session_id}"] = {
+                "total_checks": len(recent_results),
+                "healthy_checks": healthy_checks,
+                "failed_checks": len(recent_results) - healthy_checks,
+                "average_response_time": (
+                    sum(timings) / len(timings) if timings else 0.0
+                ),
+                "last_check": recent_results[-1].to_dict() if recent_results else None,
+            }
+
+        return stats
+
+    def get_health_history(
+        self, session_id: str, limit: int = 50
+    ) -> List[Dict[str, Any]]:
+        """Return up to ``limit`` of the most recent results for a session.
+
+        Args:
+            session_id: Session whose history is requested.
+            limit: Maximum number of entries to return; zero or a negative
+                value yields an empty list.
+
+        Returns:
+            The serialized results, oldest first, or an empty list when the
+            session has no recorded history.
+        """
+        history = self._health_history.get(session_id)
+        # ``history[-0:]`` would return the whole list, so non-positive limits
+        # are handled explicitly.
+        if not history or limit <= 0:
+            return []
+
+        return [result.to_dict() for result in history[-limit:]]
+
+
+# Module-level singleton handed out by get_container_health_monitor(); it is
+# created at import time with the default ContainerHealthMonitor settings.
+_container_health_monitor = ContainerHealthMonitor()
+
+
+def get_container_health_monitor() -> ContainerHealthMonitor:
+    """Return the module-wide ``ContainerHealthMonitor`` singleton."""
+    return _container_health_monitor
+
+
+async def start_container_health_monitoring(session_manager=None, docker_client=None):
+    """Wire up the shared monitor's dependencies and start its loop.
+
+    Args:
+        session_manager: Session manager to monitor. When ``None`` the
+            monitor's existing dependencies are left untouched.
+        docker_client: Optional Docker client, stored together with
+            ``session_manager``.
+    """
+    monitor = get_container_health_monitor()
+    # Compare against None rather than relying on truthiness, so a manager
+    # object that evaluates falsy is still attached.
+    if session_manager is not None:
+        monitor.set_dependencies(session_manager, docker_client)
+    await monitor.start_monitoring()
+
+
+async def stop_container_health_monitoring():
+    """Stop the shared monitor's background loop."""
+    await get_container_health_monitor().stop_monitoring()
+
+
+def get_container_health_stats(session_id: Optional[str] = None) -> Dict[str, Any]:
+    """Return health statistics from the shared monitor.
+
+    ``session_id`` is forwarded unchanged to ``get_health_stats``.
+    """
+    return get_container_health_monitor().get_health_stats(session_id)
+
+
+def get_container_health_history(
+    session_id: str, limit: int = 50
+) -> List[Dict[str, Any]]:
+    """Return a session's health check history from the shared monitor.
+
+    Both arguments are forwarded unchanged to ``get_health_history``.
+    """
+    return get_container_health_monitor().get_health_history(session_id, limit)