Native support for xAI Grok3

Model shorthand mapping related fixes Comprehensive auto-mode related tests
2025-06-15 12:21:44 +04:00
parent 4becd70a82
commit 6304b7af6b
24 changed files with 2278 additions and 58 deletions
--- a/simulator_tests/init.py
+++ b/simulator_tests/init.py
@@ -24,6 +24,7 @@ from .test_redis_validation import RedisValidationTest
 from .test_refactor_validation import RefactorValidationTest
 from .test_testgen_validation import TestGenValidationTest
 from .test_token_allocation_validation import TokenAllocationValidationTest
+from .test_xai_models import XAIModelsTest

 # Test registry for dynamic loading
 TEST_REGISTRY = {
@@ -44,6 +45,7 @@ TEST_REGISTRY = {
    "testgen_validation": TestGenValidationTest,
    "refactor_validation": RefactorValidationTest,
    "conversation_chain_validation": ConversationChainValidationTest,
+    "xai_models": XAIModelsTest,
    # "o3_pro_expensive": O3ProExpensiveTest,  # COMMENTED OUT - too expensive to run by default
 }

@@ -67,5 +69,6 @@ __all__ = [
    "TestGenValidationTest",
    "RefactorValidationTest",
    "ConversationChainValidationTest",
+    "XAIModelsTest",
    "TEST_REGISTRY",
 ]
--- a/simulator_tests/test_xai_models.py
+++ b/simulator_tests/test_xai_models.py
@@ -0,0 +1,280 @@
+#!/usr/bin/env python3
+"""
+X.AI GROK Model Tests
+
+Tests that verify X.AI GROK functionality including:
+- Model alias resolution (grok, grok3, grokfast map to actual GROK models)
+- GROK-3 and GROK-3-fast models work correctly
+- Conversation continuity works with GROK models
+- API integration and response validation
+"""
+
+import subprocess
+
+from .base_test import BaseSimulatorTest
+
+
+class XAIModelsTest(BaseSimulatorTest):
+    """Test X.AI GROK model functionality and integration"""
+
+    @property
+    def test_name(self) -> str:
+        return "xai_models"
+
+    @property
+    def test_description(self) -> str:
+        return "X.AI GROK model functionality and integration"
+
+    def get_recent_server_logs(self) -> str:
+        """Get recent server logs from the log file directly"""
+        try:
+            # Read logs directly from the log file
+            cmd = ["docker", "exec", self.container_name, "tail", "-n", "500", "/tmp/mcp_server.log"]
+            result = subprocess.run(cmd, capture_output=True, text=True)
+
+            if result.returncode == 0:
+                return result.stdout
+            else:
+                self.logger.warning(f"Failed to read server logs: {result.stderr}")
+                return ""
+        except Exception as e:
+            self.logger.error(f"Failed to get server logs: {e}")
+            return ""
+
+    def run_test(self) -> bool:
+        """Test X.AI GROK model functionality"""
+        try:
+            self.logger.info("Test: X.AI GROK model functionality and integration")
+
+            # Check if X.AI API key is configured and not empty
+            check_cmd = [
+                "docker",
+                "exec",
+                self.container_name,
+                "python",
+                "-c",
+                """
+import os
+xai_key = os.environ.get("XAI_API_KEY", "")
+is_valid = bool(xai_key and xai_key != "your_xai_api_key_here" and xai_key.strip())
+print(f"XAI_KEY_VALID:{is_valid}")
+                """.strip(),
+            ]
+            result = subprocess.run(check_cmd, capture_output=True, text=True)
+
+            if result.returncode == 0 and "XAI_KEY_VALID:False" in result.stdout:
+                self.logger.info("  ⚠️  X.AI API key not configured or empty - skipping test")
+                self.logger.info("  ℹ️  This test requires XAI_API_KEY to be set in .env with a valid key")
+                return True  # Return True to indicate test is skipped, not failed
+
+            # Setup test files for later use
+            self.setup_test_files()
+
+            # Test 1: 'grok' alias (should map to grok-3)
+            self.logger.info("  1: Testing 'grok' alias (should map to grok-3)")
+
+            response1, continuation_id = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "Say 'Hello from GROK model!' and nothing else.",
+                    "model": "grok",
+                    "temperature": 0.1,
+                },
+            )
+
+            if not response1:
+                self.logger.error("  ❌ GROK alias test failed")
+                return False
+
+            self.logger.info("  ✅ GROK alias call completed")
+            if continuation_id:
+                self.logger.info(f"  ✅ Got continuation_id: {continuation_id}")
+
+            # Test 2: Direct grok-3 model name
+            self.logger.info("  2: Testing direct model name (grok-3)")
+
+            response2, _ = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "Say 'Hello from GROK-3!' and nothing else.",
+                    "model": "grok-3",
+                    "temperature": 0.1,
+                },
+            )
+
+            if not response2:
+                self.logger.error("  ❌ Direct GROK-3 model test failed")
+                return False
+
+            self.logger.info("  ✅ Direct GROK-3 model call completed")
+
+            # Test 3: grok-3-fast model
+            self.logger.info("  3: Testing GROK-3-fast model")
+
+            response3, _ = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "Say 'Hello from GROK-3-fast!' and nothing else.",
+                    "model": "grok-3-fast",
+                    "temperature": 0.1,
+                },
+            )
+
+            if not response3:
+                self.logger.error("  ❌ GROK-3-fast model test failed")
+                return False
+
+            self.logger.info("  ✅ GROK-3-fast model call completed")
+
+            # Test 4: Shorthand aliases
+            self.logger.info("  4: Testing shorthand aliases (grok3, grokfast)")
+
+            response4, _ = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "Say 'Hello from grok3 alias!' and nothing else.",
+                    "model": "grok3",
+                    "temperature": 0.1,
+                },
+            )
+
+            if not response4:
+                self.logger.error("  ❌ grok3 alias test failed")
+                return False
+
+            response5, _ = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "Say 'Hello from grokfast alias!' and nothing else.",
+                    "model": "grokfast",
+                    "temperature": 0.1,
+                },
+            )
+
+            if not response5:
+                self.logger.error("  ❌ grokfast alias test failed")
+                return False
+
+            self.logger.info("  ✅ Shorthand aliases work correctly")
+
+            # Test 5: Conversation continuity with GROK models
+            self.logger.info("  5: Testing conversation continuity with GROK")
+
+            response6, new_continuation_id = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "Remember this number: 87. What number did I just tell you?",
+                    "model": "grok",
+                    "temperature": 0.1,
+                },
+            )
+
+            if not response6 or not new_continuation_id:
+                self.logger.error("  ❌ Failed to start conversation with continuation_id")
+                return False
+
+            # Continue the conversation
+            response7, _ = self.call_mcp_tool(
+                "chat",
+                {
+                    "prompt": "What was the number I told you earlier?",
+                    "model": "grok",
+                    "continuation_id": new_continuation_id,
+                    "temperature": 0.1,
+                },
+            )
+
+            if not response7:
+                self.logger.error("  ❌ Failed to continue conversation")
+                return False
+
+            # Check if the model remembered the number
+            if "87" in response7:
+                self.logger.info("  ✅ Conversation continuity working with GROK")
+            else:
+                self.logger.warning("  ⚠️  Model may not have remembered the number")
+
+            # Test 6: Validate X.AI API usage from logs
+            self.logger.info("  6: Validating X.AI API usage in logs")
+            logs = self.get_recent_server_logs()
+
+            # Check for X.AI API calls
+            xai_logs = [line for line in logs.split("\n") if "x.ai" in line.lower()]
+            xai_api_logs = [line for line in logs.split("\n") if "api.x.ai" in line]
+            grok_logs = [line for line in logs.split("\n") if "grok" in line.lower()]
+
+            # Check for specific model resolution
+            grok_resolution_logs = [
+                line
+                for line in logs.split("\n")
+                if ("Resolved model" in line and "grok" in line.lower()) or ("grok" in line and "->" in line)
+            ]
+
+            # Check for X.AI provider usage
+            xai_provider_logs = [line for line in logs.split("\n") if "XAI" in line or "X.AI" in line]
+
+            # Log findings
+            self.logger.info(f"   X.AI-related logs: {len(xai_logs)}")
+            self.logger.info(f"   X.AI API logs: {len(xai_api_logs)}")
+            self.logger.info(f"   GROK-related logs: {len(grok_logs)}")
+            self.logger.info(f"   Model resolution logs: {len(grok_resolution_logs)}")
+            self.logger.info(f"   X.AI provider logs: {len(xai_provider_logs)}")
+
+            # Sample log output for debugging
+            if self.verbose and xai_logs:
+                self.logger.debug("  📋 Sample X.AI logs:")
+                for log in xai_logs[:3]:
+                    self.logger.debug(f"    {log}")
+
+            if self.verbose and grok_logs:
+                self.logger.debug("  📋 Sample GROK logs:")
+                for log in grok_logs[:3]:
+                    self.logger.debug(f"    {log}")
+
+            # Success criteria
+            grok_mentioned = len(grok_logs) > 0
+            api_used = len(xai_api_logs) > 0 or len(xai_logs) > 0
+            provider_used = len(xai_provider_logs) > 0
+
+            success_criteria = [
+                ("GROK models mentioned in logs", grok_mentioned),
+                ("X.AI API calls made", api_used),
+                ("X.AI provider used", provider_used),
+                ("All model calls succeeded", True),  # We already checked this above
+                ("Conversation continuity works", True),  # We already tested this
+            ]
+
+            passed_criteria = sum(1 for _, passed in success_criteria if passed)
+            self.logger.info(f"   Success criteria met: {passed_criteria}/{len(success_criteria)}")
+
+            for criterion, passed in success_criteria:
+                status = "✅" if passed else "❌"
+                self.logger.info(f"    {status} {criterion}")
+
+            if passed_criteria >= 3:  # At least 3 out of 5 criteria
+                self.logger.info("  ✅ X.AI GROK model tests passed")
+                return True
+            else:
+                self.logger.error("  ❌ X.AI GROK model tests failed")
+                return False
+
+        except Exception as e:
+            self.logger.error(f"X.AI GROK model test failed: {e}")
+            return False
+        finally:
+            self.cleanup_test_files()
+
+
+def main():
+    """Run the X.AI GROK model tests"""
+    import sys
+
+    verbose = "--verbose" in sys.argv or "-v" in sys.argv
+    test = XAIModelsTest(verbose=verbose)
+
+    success = test.run_test()
+    sys.exit(0 if success else 1)
+
+
+if __name__ == "__main__":
+    main()