Vision support: images, PDFs, etc. can be passed on to other models as part of analysis, as additional context, etc.

Image processing pipeline added; OpenAI GPT-4.1 support; chat tool prompt enhancement; lint and code quality improvements.
2025-06-16 13:14:53 +04:00
parent d498e9854b
commit 97fa6781cf
26 changed files with 1328 additions and 52 deletions
--- a/simulator_tests/__init__.py
+++ b/simulator_tests/__init__.py
@@ -24,6 +24,7 @@ from .test_redis_validation import RedisValidationTest
 from .test_refactor_validation import RefactorValidationTest
 from .test_testgen_validation import TestGenValidationTest
 from .test_token_allocation_validation import TokenAllocationValidationTest
+from .test_vision_capability import VisionCapabilityTest
 from .test_xai_models import XAIModelsTest

 # Test registry for dynamic loading
@@ -45,6 +46,7 @@ TEST_REGISTRY = {
    "testgen_validation": TestGenValidationTest,
    "refactor_validation": RefactorValidationTest,
    "conversation_chain_validation": ConversationChainValidationTest,
+    "vision_capability": VisionCapabilityTest,
    "xai_models": XAIModelsTest,
    # "o3_pro_expensive": O3ProExpensiveTest,  # COMMENTED OUT - too expensive to run by default
 }
@@ -69,6 +71,7 @@ __all__ = [
    "TestGenValidationTest",
    "RefactorValidationTest",
    "ConversationChainValidationTest",
+    "VisionCapabilityTest",
    "XAIModelsTest",
    "TEST_REGISTRY",
 ]
--- a/simulator_tests/test_vision_capability.py
+++ b/simulator_tests/test_vision_capability.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+"""
+Vision Capability Test
+
+Tests vision capability with the chat tool using O3 model:
+- Test file path image (PNG triangle)
+- Test base64 data URL image
+- Use chat tool with O3 model to analyze the images
+- Verify the model correctly identifies shapes
+"""
+
+import base64
+import os
+
+from .base_test import BaseSimulatorTest
+
+
+class VisionCapabilityTest(BaseSimulatorTest):
+    """Test vision capability with chat tool and O3 model.
+
+    The same triangle image is sent to the chat tool twice, first as a file
+    path and then as a base64 data URL. The test passes only when both replies
+    mention "triangle" and contain none of the known failure phrases.
+    """
+
+    # Lower-case substrings treated as evidence that the model never received
+    # the image or that the call failed; matched against the lower-cased reply.
+    _ERROR_PHRASES = (
+        "don't have access",
+        "cannot see",
+        "no image",
+        "clarification_required",
+        "image you're referring to",
+        "supply the image",
+        "error",
+    )
+
+    # Identical prompt for both image transports so their replies are comparable.
+    _SHAPE_PROMPT = "What shape do you see in this image? Please be specific and only mention the shape name."
+
+    @property
+    def test_name(self) -> str:
+        """Short identifier of this test."""
+        return "vision_capability"
+
+    @property
+    def test_description(self) -> str:
+        """One-line human-readable description of this test."""
+        return "Vision capability test with chat tool and O3 model"
+
+    def get_triangle_png_path(self) -> str:
+        """Return the absolute path to tests/triangle.png.
+
+        The current working directory is searched first; if the image is not
+        there, the parent of this file's directory is tried, so the lookup no
+        longer depends on the test being launched from the project root.
+
+        Raises:
+            FileNotFoundError: If the image is in none of the searched locations.
+        """
+        search_roots = (
+            os.getcwd(),
+            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        )
+
+        searched = []
+        for root in search_roots:
+            candidate = os.path.abspath(os.path.join(root, "tests", "triangle.png"))
+            if candidate in searched:
+                # The two roots coincide when cwd is already this package's parent.
+                continue
+            searched.append(candidate)
+
+            if os.path.exists(candidate):
+                self.logger.debug(f"Using triangle PNG at host path: {candidate}")
+                return candidate
+
+        raise FileNotFoundError(f"triangle.png not found at {' or '.join(searched)}")
+
+    def create_base64_triangle_data_url(self) -> str:
+        """Return tests/triangle.png encoded as a data:image/png;base64 URL.
+
+        Raises:
+            FileNotFoundError: Propagated from get_triangle_png_path().
+        """
+        triangle_path = self.get_triangle_png_path()
+
+        with open(triangle_path, "rb") as f:
+            image_data = base64.b64encode(f.read()).decode()
+
+        data_url = f"data:image/png;base64,{image_data}"
+        self.logger.debug(f"Created base64 data URL with {len(image_data)} characters")
+        return data_url
+
+    def _response_identifies_triangle(self, response, label: str) -> bool:
+        """Validate one chat reply and log the outcome.
+
+        Args:
+            response: Text returned by the chat tool; falsy when no reply arrived.
+            label: How the image was supplied ("file path" or "base64"); only
+                used to word the log messages.
+
+        Returns:
+            True if the reply has no failure phrase and mentions "triangle".
+        """
+        if not response:
+            self.logger.error(f"Failed to get response from O3 model for {label} test")
+            return False
+
+        # Check for error indicators first
+        response_lower = response.lower()
+        if any(phrase in response_lower for phrase in self._ERROR_PHRASES):
+            self.logger.error(f"  ❌ O3 model cannot access {label} image. Response: {response[:300]}...")
+            return False
+
+        if "triangle" not in response_lower:
+            self.logger.error(f"  ❌ O3 did not identify triangle in {label} test. Response: {response[:200]}...")
+            return False
+
+        self.logger.info(f"  ✅ O3 correctly identified {label} image as triangle")
+        return True
+
+    def run_test(self) -> bool:
+        """Test vision capability with O3 model.
+
+        Returns:
+            True when both the file path image and the base64 image are
+            identified as a triangle; False when a check fails or an exception
+            is raised. A missing continuation reply only produces a warning.
+        """
+        try:
+            self.logger.info("Test: Vision capability with O3 model")
+
+            # Test 1: File path image
+            self.logger.info("  1.1: Testing file path image (PNG triangle)")
+            triangle_path = self.get_triangle_png_path()
+            self.logger.info(f"  ✅ Using triangle PNG at: {triangle_path}")
+
+            response1, continuation_id = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": self._SHAPE_PROMPT,
+                    "images": [triangle_path],
+                    "model": "o3",
+                },
+            )
+
+            if not self._response_identifies_triangle(response1, "file path"):
+                return False
+
+            # Test 2: Base64 data URL image
+            self.logger.info("  1.2: Testing base64 data URL image")
+            data_url = self.create_base64_triangle_data_url()
+
+            response2, _ = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": self._SHAPE_PROMPT,
+                    "images": [data_url],
+                    "model": "o3",
+                },
+            )
+
+            if not self._response_identifies_triangle(response2, "base64"):
+                return False
+
+            # Optional: Test continuation with same image
+            if continuation_id:
+                self.logger.info("  1.3: Testing continuation with same image")
+                response3, _ = self.call_mcp_tool(
+                    "chat",
+                    {
+                        "prompt": "What color is this triangle?",
+                        "images": [triangle_path],  # Same image should be deduplicated
+                        "continuation_id": continuation_id,
+                        "model": "o3",
+                    },
+                )
+
+                if response3:
+                    self.logger.info("  ✅ Continuation also working correctly")
+                else:
+                    self.logger.warning("  ⚠️  Continuation response not received")
+
+            self.logger.info("  ✅ Vision capability test completed successfully")
+            return True
+
+        except Exception as e:
+            self.logger.error(f"Vision capability test failed: {e}")
+            return False