Vision support via images/PDFs, etc., that can be passed on to other models as part of analysis, additional context, and similar workflows.

Image processing pipeline added
OpenAI GPT-4.1 support
Chat tool prompt enhancement
Lint and code quality improvements
This commit is contained in:
Fahad
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions

View File

@@ -142,6 +142,7 @@ class ConversationTurn(BaseModel):
content: The actual message content/response
timestamp: ISO timestamp when this turn was created
files: List of file paths referenced in this specific turn
images: List of image paths referenced in this specific turn
tool_name: Which tool generated this turn (for cross-tool tracking)
model_provider: Provider used (e.g., "google", "openai")
model_name: Specific model used (e.g., "gemini-2.5-flash-preview-05-20", "o3-mini")
@@ -152,6 +153,7 @@ class ConversationTurn(BaseModel):
content: str
timestamp: str
files: Optional[list[str]] = None # Files referenced in this turn
images: Optional[list[str]] = None # Images referenced in this turn
tool_name: Optional[str] = None # Tool used for this turn
model_provider: Optional[str] = None # Model provider (google, openai, etc)
model_name: Optional[str] = None # Specific model used
@@ -300,6 +302,7 @@ def add_turn(
role: str,
content: str,
files: Optional[list[str]] = None,
images: Optional[list[str]] = None,
tool_name: Optional[str] = None,
model_provider: Optional[str] = None,
model_name: Optional[str] = None,
@@ -318,6 +321,7 @@ def add_turn(
role: "user" (Claude) or "assistant" (Gemini/O3/etc)
content: The actual message/response content
files: Optional list of files referenced in this turn
images: Optional list of images referenced in this turn
tool_name: Name of the tool adding this turn (for attribution)
model_provider: Provider used (e.g., "google", "openai")
model_name: Specific model used (e.g., "gemini-2.5-flash-preview-05-20", "o3-mini")
@@ -335,6 +339,7 @@ def add_turn(
- Refreshes thread TTL to configured timeout on successful update
- Turn limits prevent runaway conversations
- File references are preserved for cross-tool access with atomic ordering
- Image references are preserved for cross-tool visual context
- Model information enables cross-provider conversations
"""
logger.debug(f"[FLOW] Adding {role} turn to {thread_id} ({tool_name})")
@@ -355,6 +360,7 @@ def add_turn(
content=content,
timestamp=datetime.now(timezone.utc).isoformat(),
files=files, # Preserved for cross-tool file context
images=images, # Preserved for cross-tool visual context
tool_name=tool_name, # Track which tool generated this turn
model_provider=model_provider, # Track model provider
model_name=model_name, # Track specific model
@@ -489,6 +495,78 @@ def get_conversation_file_list(context: ThreadContext) -> list[str]:
return file_list
def get_conversation_image_list(context: ThreadContext) -> list[str]:
    """
    Collect all unique image paths from conversation turns, newest reference first.

    Mirrors the prioritization logic of get_conversation_file_list() so that images
    and files are handled consistently across conversation turns. The turns are
    walked from newest to oldest; the first time an image path is encountered it is
    kept, and any older reference to the same path is discarded.

    Algorithm:
        1. Iterate turns in reverse (index len-1 down to 0).
        2. Within a turn, keep the order of turn.images as-is.
        3. Append a path only on first sighting (newest reference wins).
        4. Older duplicates are skipped.

    Example:
        Turn 1: images = ["diagram.png", "flow.jpg"]
        Turn 2: images = ["error.png"]
        Turn 3: images = ["diagram.png", "updated.png"]  # diagram.png repeats
        Result: ["diagram.png", "updated.png", "error.png", "flow.jpg"]
        (the Turn 3 reference to diagram.png wins over Turn 1's)

    Args:
        context: ThreadContext whose turns are scanned for image references.

    Returns:
        list[str]: Unique image paths ordered by newest reference first;
            empty list when there are no turns or no images.

    Performance:
        O(n*m) time (n turns, m average images per turn), O(i) space for i
        unique images; a set gives O(1) duplicate detection.
    """
    turns = context.turns
    if not turns:
        logger.debug("[IMAGES] No turns found, returning empty image list")
        return []

    already_seen: set[str] = set()
    collected: list[str] = []
    logger.debug(f"[IMAGES] Collecting images from {len(turns)} turns (newest first)")

    # Reverse iteration is the core of newest-first prioritization: newer turns
    # are visited before older ones, so a duplicate seen later is always the
    # older reference and can be safely skipped.
    for turn_index in reversed(range(len(turns))):
        current_turn = turns[turn_index]
        if not current_turn.images:
            continue
        logger.debug(f"[IMAGES] Turn {turn_index + 1} has {len(current_turn.images)} images: {current_turn.images}")
        for image_path in current_turn.images:
            if image_path in already_seen:
                # An equal path was already taken from a newer turn.
                logger.debug(f"[IMAGES] Skipping duplicate image: {image_path} (newer version already included)")
                continue
            already_seen.add(image_path)
            collected.append(image_path)
            logger.debug(f"[IMAGES] Added new image: {image_path} (from turn {turn_index + 1})")

    logger.debug(f"[IMAGES] Final image list ({len(collected)}): {collected}")
    return collected
def _plan_file_inclusion_by_size(all_files: list[str], max_file_tokens: int) -> tuple[list[str], list[str], int]:
"""
Plan which files to include based on size constraints.

View File

@@ -88,8 +88,9 @@ TEXT_DATA = {
".lock", # Lock files
}
# Image file extensions
IMAGES = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp", ".ico", ".tiff", ".tif"}
# Image file extensions - limited to what AI models actually support
# Based on OpenAI and Gemini supported formats: PNG, JPEG, GIF, WebP
IMAGES = {".jpg", ".jpeg", ".png", ".gif", ".webp"}
# Binary executable and library extensions
BINARIES = {
@@ -240,3 +241,30 @@ def get_token_estimation_ratio(file_path: str) -> float:
extension = Path(file_path).suffix.lower()
return TOKEN_ESTIMATION_RATIOS.get(extension, 3.5) # Conservative default
# MIME type mappings for image files - limited to what AI models actually support
# Based on OpenAI and Gemini supported formats: PNG, JPEG, GIF, WebP
IMAGE_MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".gif": "image/gif",
    ".webp": "image/webp",
}


def get_image_mime_type(extension: str) -> str:
    """
    Look up the MIME type for an image file extension.

    Args:
        extension: File extension, with or without a leading dot, any case.

    Returns:
        The matching MIME type string; "image/jpeg" when the extension is
        not one of the supported image formats.
    """
    # Normalize to a lowercase, dot-prefixed key before the table lookup.
    normalized = extension.lower()
    if not normalized.startswith("."):
        normalized = f".{normalized}"
    return IMAGE_MIME_TYPES.get(normalized, "image/jpeg")

View File

@@ -48,6 +48,36 @@ from .file_types import BINARY_EXTENSIONS, CODE_EXTENSIONS, IMAGE_EXTENSIONS, TE
from .security_config import CONTAINER_WORKSPACE, EXCLUDED_DIRS, MCP_SIGNATURE_FILES, SECURITY_ROOT, WORKSPACE_ROOT
from .token_utils import DEFAULT_CONTEXT_WINDOW, estimate_tokens
def _is_builtin_custom_models_config(path_str: str) -> bool:
    """
    Determine whether a path refers to the server's built-in custom_models.json.

    Only the server's internal config matches; a user-specified
    CUSTOM_MODELS_CONFIG_PATH never does. The built-in config is identified by
    resolving the candidate path and comparing it against the config file under
    the server's conf directory.

    Args:
        path_str: Candidate path to check.

    Returns:
        True only when path_str resolves to the server's built-in
        conf/custom_models.json; False otherwise (including on any
        path-resolution failure).
    """
    try:
        candidate = Path(path_str).resolve()
        # Server root is two levels up from this module
        # (utils/file_utils.py -> server_root), so the built-in config lives
        # at <server_root>/conf/custom_models.json. Resolving both sides makes
        # relative and absolute spellings of the same file compare equal.
        builtin_config = (Path(__file__).parent.parent / "conf" / "custom_models.json").resolve()
        return candidate == builtin_config
    except Exception:
        # Best-effort guard: if the path cannot be resolved at all, it cannot
        # be our built-in config.
        return False
logger = logging.getLogger(__name__)
@@ -271,7 +301,8 @@ def translate_path_for_environment(path_str: str) -> str:
tools and utilities throughout the codebase. It handles:
1. Docker host-to-container path translation (host paths -> /workspace/...)
2. Direct mode (no translation needed)
3. Security validation and error handling
3. Internal server files (conf/custom_models.json)
4. Security validation and error handling
Docker Path Translation Logic:
- Input: /Users/john/project/src/file.py (host path from Claude)
@@ -284,6 +315,10 @@ def translate_path_for_environment(path_str: str) -> str:
Returns:
Translated path appropriate for the current environment
"""
# Handle built-in server config file - no translation needed
if _is_builtin_custom_models_config(path_str):
return path_str
if not WORKSPACE_ROOT or not WORKSPACE_ROOT.strip() or not CONTAINER_WORKSPACE.exists():
# Not in the configured Docker environment, no translation needed
return path_str