connected zen
This commit is contained in:
149
session-manager/routes/health.py
Normal file
149
session-manager/routes/health.py
Normal file
@@ -0,0 +1,149 @@
|
||||
from datetime import datetime
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
from config import (
|
||||
CONTAINER_MEMORY_LIMIT,
|
||||
CONTAINER_CPU_QUOTA,
|
||||
CONTAINER_CPU_PERIOD,
|
||||
MAX_CONCURRENT_SESSIONS,
|
||||
USE_ASYNC_DOCKER,
|
||||
USE_DATABASE_STORAGE,
|
||||
)
|
||||
from session_manager import session_manager
|
||||
from host_ip_detector import async_get_host_ip
|
||||
from resource_manager import check_system_resources
|
||||
from http_pool import get_connection_pool_stats
|
||||
from database import get_database_stats
|
||||
from container_health import get_container_health_stats, get_container_health_history
|
||||
from logging_config import get_logger
|
||||
|
||||
# Module-level logger, obtained through the project's logging_config helper.
logger = get_logger(__name__)
|
||||
|
||||
# Router on which this module's health endpoints are registered ("health" tag).
router = APIRouter(tags=["health"])
|
||||
|
||||
|
||||
@router.get("/health/container")
async def get_container_health():
    """Return the container health statistics.

    Thin endpoint wrapper: the response body is whatever
    ``get_container_health_stats()`` returns when called without arguments.
    """
    return get_container_health_stats()
|
||||
|
||||
|
||||
@router.get("/health/container/{session_id}")
async def get_session_container_health(session_id: str):
    """Return container health details for a single session.

    Args:
        session_id: Identifier of the session, taken from the URL path.

    Returns:
        A dict with the session id, the session's ``container_id``, the
        entry stored under ``"session_<session_id>"`` in the health stats
        (an empty dict when that key is absent) and the health history
        requested with ``limit=20``.

    Raises:
        HTTPException: 404 when the session manager returns a falsy value
            for ``session_id``.
    """
    found = await session_manager.get_session(session_id)
    if not found:
        raise HTTPException(status_code=404, detail="Session not found")

    all_stats = get_container_health_stats(session_id)
    recent = get_container_health_history(session_id, limit=20)

    # The stats mapping is keyed by "session_<id>", not by the bare id.
    stats_key = f"session_{session_id}"
    payload = {
        "session_id": session_id,
        "container_id": found.container_id,
        "stats": all_stats.get(stats_key, {}),
        "recent_history": recent,
    }
    return payload
|
||||
|
||||
|
||||
@router.get("/health")
async def health_check():
    """Report the overall health of the service.

    Each probe (Docker ping, host IP detection, system resources, HTTP
    connection pool, optional database stats, container health stats) runs
    on its own; a probe that raises is logged as a warning and recorded in
    the payload instead of failing the request.

    Status rules:
        * ``"unhealthy"`` - a critical resource alert exists, or Docker,
          host IP detection or the HTTP pool is not OK.
        * ``"degraded"`` - otherwise, when any resource alert exists.
        * ``"healthy"`` - otherwise.

    Returns:
        dict: the health payload. The ``"database"`` key is present only
        when database storage is enabled and stats were produced; the
        ``"container_health"`` key only when container stats are non-empty.
    """
    # --- Probe: Docker daemon -------------------------------------------
    try:
        docker_up = await session_manager.docker_service.ping()
    except Exception as err:
        logger.warning(f"Docker health check failed: {err}")
        docker_up = False

    # --- Probe: host IP detection ---------------------------------------
    try:
        host_ip = await async_get_host_ip()
    except Exception as err:
        logger.warning(f"Host IP detection failed: {err}")
        host_ip = None
        host_ip_found = False
    else:
        host_ip_found = True

    # --- Probe: system resources ----------------------------------------
    try:
        resources = check_system_resources()
    except Exception as err:
        logger.warning("Resource monitoring failed", extra={"error": str(err)})
        resources = {"error": str(err)}

    # --- Probe: HTTP connection pool ------------------------------------
    try:
        pool_stats = await get_connection_pool_stats()
    except Exception as err:
        logger.warning("HTTP pool stats failed", extra={"error": str(err)})
        pool_stats = {"error": str(err)}

    # --- Probe: database (only when it is the configured backend) -------
    db_stats = {}
    if USE_DATABASE_STORAGE:
        try:
            db_stats = await get_database_stats()
        except Exception as err:
            logger.warning("Database stats failed", extra={"error": str(err)})
            db_stats = {"status": "error", "error": str(err)}

    # --- Probe: container health ----------------------------------------
    try:
        container_stats = get_container_health_stats()
    except Exception as err:
        logger.warning("Container health stats failed", extra={"error": str(err)})
        container_stats = {"error": str(err)}

    # --- Derive alerts and the overall status ---------------------------
    if isinstance(resources, dict):
        alerts = resources.get("alerts", [])
        system_resources = resources.get("system_resources", {})
    else:
        alerts = []
        system_resources = {}

    has_critical = any(
        isinstance(alert, dict) and alert.get("level") == "critical"
        for alert in alerts
    )
    pool_healthy = (
        isinstance(pool_stats, dict) and pool_stats.get("status") == "healthy"
    )

    if has_critical or not docker_up or not host_ip_found or not pool_healthy:
        overall = "unhealthy"
    elif alerts:
        overall = "degraded"
    else:
        overall = "healthy"

    running_sessions = sum(
        1 for sess in session_manager.sessions.values() if sess.status == "running"
    )

    # --- Assemble the payload (key order is part of the response) -------
    report = {
        "status": overall,
        "docker": docker_up,
        "docker_mode": "async" if USE_ASYNC_DOCKER else "sync",
        "host_ip_detection": host_ip_found,
        "detected_host_ip": host_ip,
        "http_connection_pool": pool_stats,
        "storage_backend": "database" if USE_DATABASE_STORAGE else "json_file",
        "active_sessions": running_sessions,
        "resource_limits": {
            "memory_limit": CONTAINER_MEMORY_LIMIT,
            "cpu_quota": CONTAINER_CPU_QUOTA,
            "cpu_period": CONTAINER_CPU_PERIOD,
            "max_concurrent_sessions": MAX_CONCURRENT_SESSIONS,
        },
        "system_resources": system_resources,
        "resource_alerts": alerts,
        "timestamp": datetime.now().isoformat(),
    }

    # Optional sections are appended only when they carry data.
    if USE_DATABASE_STORAGE and db_stats:
        report["database"] = db_stats
    if container_stats:
        report["container_health"] = container_stats

    return report
|
||||
Reference in New Issue
Block a user