Vision support via images/PDFs, etc., that can be passed on to other models as part of analysis, additional context, and similar workflows.

Image processing pipeline added
OpenAI GPT-4.1 support
Chat tool prompt enhancement
Lint and code quality improvements
This commit is contained in:
Fahad
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions

View File

@@ -142,6 +142,7 @@ class ConversationTurn(BaseModel):
content: The actual message content/response
timestamp: ISO timestamp when this turn was created
files: List of file paths referenced in this specific turn
images: List of image paths referenced in this specific turn
tool_name: Which tool generated this turn (for cross-tool tracking)
model_provider: Provider used (e.g., "google", "openai")
model_name: Specific model used (e.g., "gemini-2.5-flash-preview-05-20", "o3-mini")
@@ -152,6 +153,7 @@ class ConversationTurn(BaseModel):
content: str
timestamp: str
files: Optional[list[str]] = None # Files referenced in this turn
images: Optional[list[str]] = None # Images referenced in this turn
tool_name: Optional[str] = None # Tool used for this turn
model_provider: Optional[str] = None # Model provider (google, openai, etc)
model_name: Optional[str] = None # Specific model used
@@ -300,6 +302,7 @@ def add_turn(
role: str,
content: str,
files: Optional[list[str]] = None,
images: Optional[list[str]] = None,
tool_name: Optional[str] = None,
model_provider: Optional[str] = None,
model_name: Optional[str] = None,
@@ -318,6 +321,7 @@ def add_turn(
role: "user" (Claude) or "assistant" (Gemini/O3/etc)
content: The actual message/response content
files: Optional list of files referenced in this turn
images: Optional list of images referenced in this turn
tool_name: Name of the tool adding this turn (for attribution)
model_provider: Provider used (e.g., "google", "openai")
model_name: Specific model used (e.g., "gemini-2.5-flash-preview-05-20", "o3-mini")
@@ -335,6 +339,7 @@ def add_turn(
- Refreshes thread TTL to configured timeout on successful update
- Turn limits prevent runaway conversations
- File references are preserved for cross-tool access with atomic ordering
- Image references are preserved for cross-tool visual context
- Model information enables cross-provider conversations
"""
logger.debug(f"[FLOW] Adding {role} turn to {thread_id} ({tool_name})")
@@ -355,6 +360,7 @@ def add_turn(
content=content,
timestamp=datetime.now(timezone.utc).isoformat(),
files=files, # Preserved for cross-tool file context
images=images, # Preserved for cross-tool visual context
tool_name=tool_name, # Track which tool generated this turn
model_provider=model_provider, # Track model provider
model_name=model_name, # Track specific model
@@ -489,6 +495,78 @@ def get_conversation_file_list(context: ThreadContext) -> list[str]:
return file_list
def get_conversation_image_list(context: ThreadContext) -> list[str]:
    """
    Collect all unique image paths from conversation turns, newest reference first.

    Mirrors the prioritization logic of get_conversation_file_list() so that images
    and files are handled consistently across conversation turns. The turns are
    walked from newest to oldest; the first time an image path is encountered it is
    kept, and any older reference to the same path is discarded.

    Algorithm:
        1. Iterate turns in reverse (index len-1 down to 0).
        2. Within a turn, keep the order of turn.images as-is.
        3. Append a path only on first sighting (newest reference wins).
        4. Older duplicates are skipped.

    Example:
        Turn 1: images = ["diagram.png", "flow.jpg"]
        Turn 2: images = ["error.png"]
        Turn 3: images = ["diagram.png", "updated.png"]  # diagram.png repeats
        Result: ["diagram.png", "updated.png", "error.png", "flow.jpg"]
        (the Turn 3 reference to diagram.png wins over Turn 1's)

    Args:
        context: ThreadContext whose turns are scanned for image references.

    Returns:
        list[str]: Unique image paths ordered by newest reference first;
            empty list when there are no turns or no images.

    Performance:
        O(n*m) time (n turns, m average images per turn), O(i) space for i
        unique images; a set gives O(1) duplicate detection.
    """
    turns = context.turns
    if not turns:
        logger.debug("[IMAGES] No turns found, returning empty image list")
        return []

    already_seen: set[str] = set()
    collected: list[str] = []
    logger.debug(f"[IMAGES] Collecting images from {len(turns)} turns (newest first)")

    # Reverse iteration is the core of newest-first prioritization: newer turns
    # are visited before older ones, so a duplicate seen later is always the
    # older reference and can be safely skipped.
    for turn_index in reversed(range(len(turns))):
        current_turn = turns[turn_index]
        if not current_turn.images:
            continue
        logger.debug(f"[IMAGES] Turn {turn_index + 1} has {len(current_turn.images)} images: {current_turn.images}")
        for image_path in current_turn.images:
            if image_path in already_seen:
                # An equal path was already taken from a newer turn.
                logger.debug(f"[IMAGES] Skipping duplicate image: {image_path} (newer version already included)")
                continue
            already_seen.add(image_path)
            collected.append(image_path)
            logger.debug(f"[IMAGES] Added new image: {image_path} (from turn {turn_index + 1})")

    logger.debug(f"[IMAGES] Final image list ({len(collected)}): {collected}")
    return collected
def _plan_file_inclusion_by_size(all_files: list[str], max_file_tokens: int) -> tuple[list[str], list[str], int]:
"""
Plan which files to include based on size constraints.

View File

@@ -88,8 +88,9 @@ TEXT_DATA = {
".lock", # Lock files
}
# Image file extensions
IMAGES = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp", ".ico", ".tiff", ".tif"}
# Image file extensions - limited to what AI models actually support
# Based on OpenAI and Gemini supported formats: PNG, JPEG, GIF, WebP
IMAGES = {".jpg", ".jpeg", ".png", ".gif", ".webp"}
# Binary executable and library extensions
BINARIES = {
@@ -240,3 +241,30 @@ def get_token_estimation_ratio(file_path: str) -> float:
extension = Path(file_path).suffix.lower()
return TOKEN_ESTIMATION_RATIOS.get(extension, 3.5) # Conservative default
# MIME type mappings for image files - limited to what AI models actually support
# Based on OpenAI and Gemini supported formats: PNG, JPEG, GIF, WebP
IMAGE_MIME_TYPES = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".gif": "image/gif",
    ".webp": "image/webp",
}


def get_image_mime_type(extension: str) -> str:
    """
    Look up the MIME type for an image file extension.

    Args:
        extension: File extension, with or without a leading dot, any case.

    Returns:
        The matching MIME type string; "image/jpeg" when the extension is
        not one of the supported image formats.
    """
    # Normalize to a lowercase, dot-prefixed key before the table lookup.
    normalized = extension.lower()
    if not normalized.startswith("."):
        normalized = f".{normalized}"
    return IMAGE_MIME_TYPES.get(normalized, "image/jpeg")

View File

@@ -48,6 +48,36 @@ from .file_types import BINARY_EXTENSIONS, CODE_EXTENSIONS, IMAGE_EXTENSIONS, TE
from .security_config import CONTAINER_WORKSPACE, EXCLUDED_DIRS, MCP_SIGNATURE_FILES, SECURITY_ROOT, WORKSPACE_ROOT
from .token_utils import DEFAULT_CONTEXT_WINDOW, estimate_tokens
def _is_builtin_custom_models_config(path_str: str) -> bool:
    """
    Determine whether a path refers to the server's built-in custom_models.json.

    Only the server's internal config matches; a user-specified
    CUSTOM_MODELS_CONFIG_PATH never does. The built-in config is identified by
    resolving the candidate path and comparing it against the config file under
    the server's conf directory.

    Args:
        path_str: Candidate path to check.

    Returns:
        True only when path_str resolves to the server's built-in
        conf/custom_models.json; False otherwise (including on any
        path-resolution failure).
    """
    try:
        candidate = Path(path_str).resolve()
        # Server root is two levels up from this module
        # (utils/file_utils.py -> server_root), so the built-in config lives
        # at <server_root>/conf/custom_models.json. Resolving both sides makes
        # relative and absolute spellings of the same file compare equal.
        builtin_config = (Path(__file__).parent.parent / "conf" / "custom_models.json").resolve()
        return candidate == builtin_config
    except Exception:
        # Best-effort guard: if the path cannot be resolved at all, it cannot
        # be our built-in config.
        return False
logger = logging.getLogger(__name__)
@@ -271,7 +301,8 @@ def translate_path_for_environment(path_str: str) -> str:
tools and utilities throughout the codebase. It handles:
1. Docker host-to-container path translation (host paths -> /workspace/...)
2. Direct mode (no translation needed)
3. Security validation and error handling
3. Internal server files (conf/custom_models.json)
4. Security validation and error handling
Docker Path Translation Logic:
- Input: /Users/john/project/src/file.py (host path from Claude)
@@ -284,6 +315,10 @@ def translate_path_for_environment(path_str: str) -> str:
Returns:
Translated path appropriate for the current environment
"""
# Handle built-in server config file - no translation needed
if _is_builtin_custom_models_config(path_str):
return path_str
if not WORKSPACE_ROOT or not WORKSPACE_ROOT.strip() or not CONTAINER_WORKSPACE.exists():
# Not in the configured Docker environment, no translation needed
return path_str