Files
lovdata-chat/session-manager/routes/health.py
2026-02-03 00:36:22 +01:00

150 lines
4.6 KiB
Python

from datetime import datetime
from fastapi import APIRouter, HTTPException
from config import (
CONTAINER_MEMORY_LIMIT,
CONTAINER_CPU_QUOTA,
CONTAINER_CPU_PERIOD,
MAX_CONCURRENT_SESSIONS,
USE_ASYNC_DOCKER,
USE_DATABASE_STORAGE,
)
from session_manager import session_manager
from host_ip_detector import async_get_host_ip
from resource_manager import check_system_resources
from http_pool import get_connection_pool_stats
from database import get_database_stats
from container_health import get_container_health_stats, get_container_health_history
from logging_config import get_logger
# Module-level logger obtained from the project's logging_config helper.
logger = get_logger(__name__)
# Router that collects the health endpoints defined in this module under
# the "health" tag.
router = APIRouter(tags=["health"])
@router.get("/health/container")
async def get_container_health():
    """Return container health statistics.

    Calls ``get_container_health_stats()`` without arguments and hands its
    result back to the client unchanged.
    """
    return get_container_health_stats()
@router.get("/health/container/{session_id}")
async def get_session_container_health(session_id: str):
    """Return container health details for one session.

    Args:
        session_id: Session identifier taken from the URL path.

    Returns:
        A dict holding the session id, the session's ``container_id``, the
        entry stored under ``session_<session_id>`` in the stats mapping
        (an empty dict when that key is absent), and the health history
        fetched with ``limit=20``.

    Raises:
        HTTPException: With status 404 when the session manager yields a
            falsy session for ``session_id``.
    """
    found_session = await session_manager.get_session(session_id)
    if not found_session:
        raise HTTPException(status_code=404, detail="Session not found")

    # Stats are fetched before history, matching the original call order.
    all_stats = get_container_health_stats(session_id)
    past_entries = get_container_health_history(session_id, limit=20)

    # The stats mapping is looked up by a "session_"-prefixed key.
    stats_key = f"session_{session_id}"
    payload = {
        "session_id": session_id,
        "container_id": found_session.container_id,
        "stats": all_stats.get(stats_key, {}),
        "recent_history": past_entries,
    }
    return payload
@router.get("/health")
async def health_check():
    """Aggregate the individual health probes into one status report.

    Each probe (Docker ping, host IP detection, system resources, HTTP
    connection pool, optional database stats, container health stats) runs
    inside its own ``try`` block, so a failing probe is logged as a warning
    and recorded in the payload instead of aborting the request.

    Overall status:
        * ``"unhealthy"`` - a critical resource alert exists, or Docker,
          host IP detection, or the HTTP pool is not OK.
        * ``"degraded"`` - none of the above, but resource alerts exist.
        * ``"healthy"`` - otherwise.

    Returns:
        A dict with the overall status, per-probe results, configured
        resource limits, the number of sessions whose status is
        ``"running"``, and an ISO-formatted timestamp. The ``"database"``
        and ``"container_health"`` keys are added only when their data is
        non-empty.
    """
    # --- Docker: the flag keeps its False default when the ping raises. ---
    docker_ok = False
    try:
        docker_ok = await session_manager.docker_service.ping()
    except Exception as e:
        logger.warning(f"Docker health check failed: {e}")

    # --- Host IP: only flagged OK once detection returned without error. ---
    detected_host_ip = None
    host_ip_ok = False
    try:
        detected_host_ip = await async_get_host_ip()
    except Exception as e:
        logger.warning(f"Host IP detection failed: {e}")
    else:
        host_ip_ok = True

    # --- System resources ---
    try:
        resource_status = check_system_resources()
    except Exception as e:
        failure = str(e)
        logger.warning("Resource monitoring failed", extra={"error": failure})
        resource_status = {"error": failure}

    # --- HTTP connection pool ---
    try:
        http_pool_stats = await get_connection_pool_stats()
    except Exception as e:
        failure = str(e)
        logger.warning("HTTP pool stats failed", extra={"error": failure})
        http_pool_stats = {"error": failure}

    # --- Database: probed only when database storage is enabled. ---
    database_status = {}
    if USE_DATABASE_STORAGE:
        try:
            database_status = await get_database_stats()
        except Exception as e:
            failure = str(e)
            logger.warning("Database stats failed", extra={"error": failure})
            database_status = {"status": "error", "error": failure}

    # --- Container health ---
    try:
        container_health_stats = get_container_health_stats()
    except Exception as e:
        failure = str(e)
        logger.warning("Container health stats failed", extra={"error": failure})
        container_health_stats = {"error": failure}

    # The resource probe may hand back something that is not a dict; in that
    # case alerts and system resources fall back to empty containers.
    resources_is_dict = isinstance(resource_status, dict)
    if resources_is_dict:
        resource_alerts = resource_status.get("alerts", [])
        system_resources = resource_status.get("system_resources", {})
    else:
        resource_alerts = []
        system_resources = {}

    critical_alerts = [
        alert
        for alert in resource_alerts
        if isinstance(alert, dict) and alert.get("level") == "critical"
    ]

    http_healthy = (
        isinstance(http_pool_stats, dict)
        and http_pool_stats.get("status") == "healthy"
    )

    # Start from the optimistic verdict and downgrade as needed.
    status = "healthy"
    core_ok = docker_ok and host_ip_ok and http_healthy
    if critical_alerts or not core_ok:
        status = "unhealthy"
    elif resource_alerts:
        status = "degraded"

    running_count = sum(
        1 for s in session_manager.sessions.values() if s.status == "running"
    )

    health_data = {
        "status": status,
        "docker": docker_ok,
        "docker_mode": "async" if USE_ASYNC_DOCKER else "sync",
        "host_ip_detection": host_ip_ok,
        "detected_host_ip": detected_host_ip,
        "http_connection_pool": http_pool_stats,
        "storage_backend": "database" if USE_DATABASE_STORAGE else "json_file",
        "active_sessions": running_count,
        "resource_limits": {
            "memory_limit": CONTAINER_MEMORY_LIMIT,
            "cpu_quota": CONTAINER_CPU_QUOTA,
            "cpu_period": CONTAINER_CPU_PERIOD,
            "max_concurrent_sessions": MAX_CONCURRENT_SESSIONS,
        },
        "system_resources": system_resources,
        "resource_alerts": resource_alerts,
        "timestamp": datetime.now().isoformat(),
    }

    # Optional sections are attached only when they carry data.
    if USE_DATABASE_STORAGE and database_status:
        health_data["database"] = database_status
    if container_health_stats:
        health_data["container_health"] = container_health_stats

    return health_data