Files
lovdata-chat/session-manager/session_manager.py
Torbjørn Lindahl 69d18cc494 fix: session stability improvements
- Fix docker client initialization bug in app.py (context manager was closing client)
- Add restart_session() method to preserve session IDs during container restarts
- Add 60-second startup grace period before health checking new sessions
- Fix _stop_container and _get_container_info to use docker_service API consistently
- Disable mDNS in Dockerfile to prevent Bonjour service name conflicts
- Remove old container before restart to free port bindings
2026-02-04 19:10:03 +01:00

493 lines
19 KiB
Python

import os
import uuid
import json
import asyncio
import shutil
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Optional, List
from fastapi import HTTPException
from config import (
SESSIONS_DIR,
SESSIONS_FILE,
CONTAINER_IMAGE,
MAX_CONCURRENT_SESSIONS,
SESSION_TIMEOUT_MINUTES,
USE_ASYNC_DOCKER,
USE_DATABASE_STORAGE,
)
from models import SessionData
from docker_service import DockerService
from database import SessionModel
from resource_manager import get_resource_limits, should_throttle_sessions
from session_auth import generate_session_auth_token, cleanup_expired_auth_tokens
from logging_config import get_logger
logger = get_logger(__name__)
class SessionManager:
def __init__(self, docker_service: Optional[DockerService] = None):
    """Initialize session storage, the Docker backend and health monitoring.

    Args:
        docker_service: Optional pre-built DockerService (handy for
            injection in tests); when omitted a new one is constructed
            honoring USE_ASYNC_DOCKER.
    """
    self.docker_service = (
        docker_service if docker_service else DockerService(use_async=USE_ASYNC_DOCKER)
    )
    # The in-memory map always starts empty; JSON mode eagerly restores
    # it from disk, database mode defers loading to an async path.
    self.sessions: Dict[str, SessionData] = {}
    if USE_DATABASE_STORAGE:
        logger.info("Session storage initialized", extra={"backend": "database"})
    else:
        self._load_sessions_from_file()
        logger.info("Session storage initialized", extra={"backend": "json_file"})
    # Local import — presumably to avoid a circular import at module load
    # time; confirm before hoisting to the top of the file.
    from container_health import get_container_health_monitor
    self.health_monitor = get_container_health_monitor()
    logger.info(
        "SessionManager initialized",
        extra={
            "docker_service_type": type(self.docker_service).__name__,
            "storage_backend": "database" if USE_DATABASE_STORAGE else "json_file",
        },
    )
def _load_sessions_from_file(self):
    """Restore persisted sessions from SESSIONS_FILE, tolerating corruption.

    Timestamps are stored as ISO strings (see _save_sessions' use of
    ``default=str``) and parsed back into datetimes here. Any corrupt or
    partially written file resets the in-memory store to empty instead of
    propagating the error.
    """
    if not SESSIONS_FILE.exists():
        return
    try:
        with open(SESSIONS_FILE, "r") as f:
            data = json.load(f)
        for session_id, session_dict in data.items():
            session_dict["created_at"] = datetime.fromisoformat(
                session_dict["created_at"]
            )
            session_dict["last_accessed"] = datetime.fromisoformat(
                session_dict["last_accessed"]
            )
            self.sessions[session_id] = SessionData(**session_dict)
        logger.info(
            "Sessions loaded from JSON file",
            extra={"count": len(self.sessions)},
        )
    except (json.JSONDecodeError, KeyError, ValueError, TypeError) as e:
        # ValueError covers malformed ISO timestamps from fromisoformat and
        # TypeError covers bad field types — both previously escaped this
        # handler and crashed SessionManager.__init__ on a corrupt file.
        logger.warning("Could not load sessions file", extra={"error": str(e)})
        self.sessions = {}
async def _load_sessions_from_database(self):
    """Rebuild the in-memory session map from active rows in the database.

    Only sessions still alive ("running") or mid-setup ("creating") are
    restored; on any database failure the map is reset to empty.
    """
    try:
        rows = []
        for status in ("running", "creating"):
            rows.extend(await SessionModel.get_sessions_by_status(status))
        self.sessions = {row["session_id"]: SessionData(**row) for row in rows}
        logger.info(
            "Sessions loaded from database", extra={"count": len(self.sessions)}
        )
    except Exception as e:
        logger.error(
            "Failed to load sessions from database", extra={"error": str(e)}
        )
        self.sessions = {}
def _save_sessions(self):
    """Persist the in-memory session map to SESSIONS_FILE as JSON.

    Writes to a sibling temp file and atomically replaces the target, so a
    crash mid-write cannot leave a truncated/corrupt sessions file (which
    the loader would then have to discard entirely).
    """
    # parents=True so a missing parent directory does not abort the save.
    SESSIONS_DIR.mkdir(parents=True, exist_ok=True)
    data = {session_id: session.dict() for session_id, session in self.sessions.items()}
    tmp_path = SESSIONS_FILE.with_suffix(".tmp")
    with open(tmp_path, "w") as f:
        # default=str serializes the datetime fields as ISO strings,
        # which _load_sessions_from_file parses back.
        json.dump(data, f, indent=2, default=str)
    os.replace(tmp_path, SESSIONS_FILE)
def _generate_session_id(self) -> str:
return str(uuid.uuid4()).replace("-", "")[:16]
def _get_available_port(self) -> int:
used_ports = {s.port for s in self.sessions.values() if s.port}
port = 8081
while port in used_ports:
port += 1
return port
def _check_container_limits(self) -> bool:
active_sessions = sum(
1 for s in self.sessions.values() if s.status in ["creating", "running"]
)
return active_sessions < MAX_CONCURRENT_SESSIONS
async def _async_check_container_limits(self) -> bool:
    """Async facade over _check_container_limits.

    Exists so create_session can await a single code path when
    USE_ASYNC_DOCKER is enabled; the check itself is synchronous.
    """
    return self._check_container_limits()
async def create_session(self) -> SessionData:
    """Allocate a new session and start its container in the background.

    Returns the SessionData immediately with status "creating"; the
    container is started by a fire-and-forget task which later flips the
    status to "running" (or "error").

    Raises:
        HTTPException: 429 when MAX_CONCURRENT_SESSIONS is reached,
            503 when system resources are constrained.
    """
    # Enforce the concurrent-session cap before allocating anything.
    if USE_ASYNC_DOCKER:
        limits_ok = await self._async_check_container_limits()
    else:
        limits_ok = self._check_container_limits()
    if not limits_ok:
        raise HTTPException(
            status_code=429,
            detail=f"Maximum concurrent sessions ({MAX_CONCURRENT_SESSIONS}) reached",
        )
    should_throttle, reason = should_throttle_sessions()
    if should_throttle:
        raise HTTPException(
            status_code=503,
            detail=f"System resource constraints prevent new sessions: {reason}",
        )
    session_id = self._generate_session_id()
    container_name = f"opencode-{session_id}"
    host_dir = str(SESSIONS_DIR / session_id)
    port = self._get_available_port()
    # Per-session host directory; bind-mounted into the container later.
    Path(host_dir).mkdir(parents=True, exist_ok=True)
    auth_token = generate_session_auth_token(session_id)
    session = SessionData(
        session_id=session_id,
        container_name=container_name,
        host_dir=host_dir,
        port=port,
        auth_token=auth_token,
        created_at=datetime.now(),
        last_accessed=datetime.now(),
        status="creating",
    )
    self.sessions[session_id] = session
    if USE_DATABASE_STORAGE:
        # Best-effort persistence: a DB failure is logged but does not
        # abort session creation.
        try:
            await SessionModel.create_session(
                {
                    "session_id": session_id,
                    "container_name": container_name,
                    "host_dir": host_dir,
                    "port": port,
                    "auth_token": auth_token,
                    "status": "creating",
                }
            )
            logger.info(
                "Session created in database", extra={"session_id": session_id}
            )
        except Exception as e:
            logger.error(
                "Failed to create session in database",
                extra={"session_id": session_id, "error": str(e)},
            )
    # Fire-and-forget: container startup happens off the request path.
    if USE_ASYNC_DOCKER:
        asyncio.create_task(self._start_container_async(session))
    else:
        asyncio.create_task(self._start_container_sync(session))
    # NOTE(review): JSON-file mode does not call _save_sessions() here, so
    # the new session is only persisted on some later save — confirm intended.
    return session
async def _start_container_async(self, session: SessionData):
    """Create and start the Docker container backing *session* (async path).

    On success records the container id and sets session.status to
    "running"; on any failure sets status to "error" and persists the
    JSON session store.
    """
    try:
        resource_limits = get_resource_limits()
        logger.info(
            f"Starting container {session.container_name} with resource limits: "
            f"memory={resource_limits.memory_limit}, cpu_quota={resource_limits.cpu_quota}"
        )
        container_info = await self.docker_service.create_container(
            name=session.container_name,
            image=CONTAINER_IMAGE,
            # Session workspace mounted read-write into the container.
            volumes={session.host_dir: {"bind": "/app/somedir", "mode": "rw"}},
            # Container port 8080 published on the session's host port.
            ports={"8080": session.port},
            # API keys and session identity are forwarded from the manager's
            # environment into the container.
            environment={
                "MCP_SERVER": os.getenv("MCP_SERVER", ""),
                "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""),
                "ANTHROPIC_API_KEY": os.getenv("ANTHROPIC_API_KEY", ""),
                "GOOGLE_API_KEY": os.getenv("GOOGLE_API_KEY", ""),
                "OPENCODE_API_KEY": os.getenv("ZEN_API_KEY", ""),
                "SESSION_AUTH_TOKEN": session.auth_token or "",
                "SESSION_ID": session.session_id,
            },
            network_mode="bridge",
            mem_limit=resource_limits.memory_limit,
            cpu_quota=resource_limits.cpu_quota,
            cpu_period=resource_limits.cpu_period,
            # Hardened scratch space: no exec/suid, bounded size.
            tmpfs={
                "/tmp": "rw,noexec,nosuid,size=100m",
                "/var/tmp": "rw,noexec,nosuid,size=50m",
            },
        )
        await self.docker_service.start_container(container_info.container_id)
        session.container_id = container_info.container_id
        session.status = "running"
        self.sessions[session.session_id] = session
        if USE_DATABASE_STORAGE:
            # Best-effort DB sync; failure is logged, not fatal.
            try:
                await SessionModel.update_session(
                    session.session_id,
                    {
                        "container_id": container_info.container_id,
                        "status": "running",
                    },
                )
            except Exception as e:
                logger.error(
                    "Failed to update session in database",
                    extra={"session_id": session.session_id, "error": str(e)},
                )
        logger.info(
            "Container started successfully",
            extra={
                "session_id": session.session_id,
                "container_name": session.container_name,
                "container_id": container_info.container_id,
                "port": session.port,
            },
        )
    except Exception as e:
        session.status = "error"
        # Persists the error state to the JSON store (runs in DB mode too).
        self._save_sessions()
        logger.error(f"Failed to start container {session.container_name}: {e}")
async def _start_container_sync(self, session: SessionData):
    """Create the Docker container backing *session* (sync-docker path).

    Mirrors _start_container_async's container spec; on success records
    the container id and sets session.status to "running", on failure
    sets "error" and persists the JSON store.
    """
    try:
        resource_limits = get_resource_limits()
        logger.info(
            f"Starting container {session.container_name} with resource limits: "
            f"memory={resource_limits.memory_limit}, cpu_quota={resource_limits.cpu_quota}"
        )
        container_info = await self.docker_service.create_container(
            name=session.container_name,
            image=CONTAINER_IMAGE,
            # Session workspace mounted read-write into the container.
            volumes={session.host_dir: {"bind": "/app/somedir", "mode": "rw"}},
            # Container port 8080 published on the session's host port.
            ports={"8080": session.port},
            environment={
                "MCP_SERVER": os.getenv("MCP_SERVER", ""),
                "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", ""),
                "ANTHROPIC_API_KEY": os.getenv("ANTHROPIC_API_KEY", ""),
                "GOOGLE_API_KEY": os.getenv("GOOGLE_API_KEY", ""),
                "OPENCODE_API_KEY": os.getenv("ZEN_API_KEY", ""),
                "SESSION_AUTH_TOKEN": session.auth_token or "",
                "SESSION_ID": session.session_id,
            },
            network_mode="bridge",
            mem_limit=resource_limits.memory_limit,
            cpu_quota=resource_limits.cpu_quota,
            cpu_period=resource_limits.cpu_period,
            # Hardened scratch space: no exec/suid, bounded size.
            tmpfs={
                "/tmp": "rw,noexec,nosuid,size=100m",
                "/var/tmp": "rw,noexec,nosuid,size=50m",
            },
        )
        # NOTE(review): unlike _start_container_async there is no
        # start_container() call here before the status flips to "running".
        # Confirm the sync DockerService.create_container starts the
        # container itself; otherwise this leaves a created-but-stopped
        # container marked "running".
        session.container_id = container_info.container_id
        session.status = "running"
        self.sessions[session.session_id] = session
        if USE_DATABASE_STORAGE:
            # Best-effort DB sync; failure is logged, not fatal.
            try:
                await SessionModel.update_session(
                    session.session_id,
                    {
                        "container_id": container_info.container_id,
                        "status": "running",
                    },
                )
            except Exception as e:
                logger.error(
                    "Failed to update session in database",
                    extra={"session_id": session.session_id, "error": str(e)},
                )
        logger.info(
            "Container started successfully",
            extra={
                "session_id": session.session_id,
                "container_name": session.container_name,
                "container_id": container_info.container_id,
                "port": session.port,
            },
        )
    except Exception as e:
        session.status = "error"
        # Persists the error state to the JSON store (runs in DB mode too).
        self._save_sessions()
        logger.error(f"Failed to start container {session.container_name}: {e}")
async def get_session(self, session_id: str) -> Optional[SessionData]:
    """Return the session for *session_id*, or None when unknown.

    Checks the in-memory map first and refreshes last_accessed on a hit;
    in database mode, falls back to a DB lookup and caches the row in
    memory for subsequent calls.
    """
    session = self.sessions.get(session_id)
    if session:
        session.last_accessed = datetime.now()
        if USE_DATABASE_STORAGE:
            # Best-effort touch of the DB row; a failure only warns.
            try:
                await SessionModel.update_session(
                    session_id, {"last_accessed": datetime.now()}
                )
            except Exception as e:
                logger.warning(
                    "Failed to update session access time in database",
                    extra={"session_id": session_id, "error": str(e)},
                )
        return session
    if USE_DATABASE_STORAGE:
        try:
            db_session = await SessionModel.get_session(session_id)
            if db_session:
                # Cache the DB row in memory for subsequent lookups.
                # NOTE(review): last_accessed is not refreshed on this
                # fallback path, unlike the in-memory hit above — confirm.
                session_data = SessionData(**db_session)
                self.sessions[session_id] = session_data
                logger.debug(
                    "Session loaded from database", extra={"session_id": session_id}
                )
                return session_data
        except Exception as e:
            logger.error(
                "Failed to load session from database",
                extra={"session_id": session_id, "error": str(e)},
            )
    return None
async def list_sessions(self) -> List[SessionData]:
return list(self.sessions.values())
async def restart_session(self, session_id: str) -> Optional[SessionData]:
    """Restart a session's container while preserving the session ID.

    Unlike create_session(), this reuses the existing session data
    and only creates a new container, maintaining session ID continuity.
    This method removes the old container to free up the port.
    Returns the updated SessionData, or None when the session is unknown.
    """
    session = await self.get_session(session_id)
    if not session:
        logger.error(
            "Cannot restart session: not found",
            extra={"session_id": session_id},
        )
        return None
    old_container_id = session.container_id
    logger.info(
        "Restarting session container",
        extra={"session_id": session_id, "old_container_id": old_container_id},
    )
    # Stop and remove old container to free up the port
    if old_container_id and self.docker_service:
        try:
            logger.info(
                "Stopping old container for restart",
                extra={"session_id": session_id, "container_id": old_container_id},
            )
            await self.docker_service.stop_container(old_container_id)
        except Exception as e:
            # Best-effort: the container may already be gone.
            logger.warning(
                "Failed to stop old container (may already be stopped)",
                extra={"session_id": session_id, "container_id": old_container_id, "error": str(e)},
            )
        try:
            logger.info(
                "Removing old container for restart",
                extra={"session_id": session_id, "container_id": old_container_id},
            )
            await self.docker_service.remove_container(old_container_id, force=True)
        except Exception as e:
            logger.warning(
                "Failed to remove old container",
                extra={"session_id": session_id, "container_id": old_container_id, "error": str(e)},
            )
    # Generate new container name for the restart; the random suffix avoids
    # a name collision with the old, possibly not-yet-removed container.
    new_container_name = f"opencode-{session_id}-{uuid.uuid4().hex[:8]}"
    session.container_name = new_container_name
    session.container_id = None  # Clear old container_id
    # NOTE(review): "starting" is not among the states counted by
    # _check_container_limits ("creating"/"running") nor among those
    # reloaded by _load_sessions_from_database — confirm this is intended.
    session.status = "starting"
    # Update session in store before starting container
    self.sessions[session_id] = session
    if USE_DATABASE_STORAGE:
        # Best-effort DB sync; failure is logged, not fatal.
        try:
            await SessionModel.update_session(
                session_id,
                {
                    "container_name": new_container_name,
                    "container_id": None,
                    "status": "starting",
                },
            )
        except Exception as e:
            logger.error(
                "Failed to update session in database during restart",
                extra={"session_id": session_id, "error": str(e)},
            )
    # Start new container for this session
    if USE_ASYNC_DOCKER:
        asyncio.create_task(self._start_container_async(session))
    else:
        asyncio.create_task(self._start_container_sync(session))
    return session
async def list_containers_async(self, all: bool = False) -> List:
    """Proxy to DockerService.list_containers.

    Args:
        all: When True, include non-running containers too (mirrors the
            Docker API flag; the name shadows the builtin ``all`` but is
            kept for interface compatibility).
    """
    return await self.docker_service.list_containers(all=all)
async def cleanup_expired_sessions(self):
    """Tear down sessions idle longer than SESSION_TIMEOUT_MINUTES.

    For each expired session: stops and removes its container, deletes its
    host directory, and (in database mode) marks the DB row "expired" so a
    manager restart does not resurrect it via _load_sessions_from_database,
    which only reloads "running"/"creating" rows. Also purges expired
    auth tokens. All teardown steps are best-effort and logged.
    """
    now = datetime.now()
    cutoff = timedelta(minutes=SESSION_TIMEOUT_MINUTES)
    expired_sessions = []
    for session_id, session in self.sessions.items():
        if now - session.last_accessed <= cutoff:
            continue
        expired_sessions.append(session_id)
        try:
            await self.docker_service.stop_container(
                session.container_name, timeout=10
            )
            await self.docker_service.remove_container(session.container_name)
            logger.info(f"Cleaned up container {session.container_name}")
        except Exception as e:
            logger.error(
                f"Error cleaning up container {session.container_name}: {e}"
            )
        try:
            shutil.rmtree(session.host_dir)
            logger.info(f"Removed session directory {session.host_dir}")
        except OSError as e:
            logger.error(
                f"Error removing session directory {session.host_dir}: {e}"
            )
        if USE_DATABASE_STORAGE:
            # Fix: previously the DB row was left in its old status, so an
            # expired session came back as "running" after a restart.
            try:
                await SessionModel.update_session(
                    session_id, {"status": "expired"}
                )
            except Exception as e:
                logger.warning(
                    "Failed to mark session expired in database",
                    extra={"session_id": session_id, "error": str(e)},
                )
    # Drop from the in-memory map only after teardown attempts, and never
    # while iterating self.sessions above.
    for session_id in expired_sessions:
        del self.sessions[session_id]
    if expired_sessions:
        self._save_sessions()
        logger.info(f"Cleaned up {len(expired_sessions)} expired sessions")
    expired_tokens = cleanup_expired_auth_tokens()
    if expired_tokens > 0:
        logger.info(f"Cleaned up {expired_tokens} expired authentication tokens")
session_manager = SessionManager()