feat!: Full code can now be generated by an external model and shared with the AI tool (Claude Code / Codex, etc.)!

Model definitions now support a new `allow_code_generation` flag, intended only for higher-reasoning models such as GPT-5-Pro and Gemini 2.5 Pro.

When `true`, the `chat` tool can request that the external model generate a full implementation, update, or set of instructions, and then share that output with the calling agent.

This effectively lets us use more powerful models such as GPT-5-Pro to generate code or entire implementations on our behalf (models that are otherwise API-only or available only via the $200 Pro plan inside the ChatGPT app).
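
For illustration, a minimal sketch of how such a capability flag might be declared. The `allow_code_generation` field name comes from this commit; the dataclass structure and the model entries below are assumptions, not the project's actual schema:

# Hypothetical sketch only: the flag name is from this commit, everything else is assumed.
from dataclasses import dataclass

@dataclass
class ModelCapabilities:
    model_name: str
    allow_code_generation: bool = False  # opt in only for high-reasoning models

GPT_5_PRO = ModelCapabilities(model_name="gpt-5-pro", allow_code_generation=True)
GEMINI_2_5_PRO = ModelCapabilities(model_name="gemini-2.5-pro", allow_code_generation=True)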
Fahad
2025-10-07 18:49:13 +04:00
parent 04f7ce5b03
commit ece8a5ebed
29 changed files with 1008 additions and 122 deletions


@@ -56,7 +56,12 @@ class TestLargePromptHandling:
async def test_chat_large_prompt_detection(self, large_prompt):
"""Test that chat tool detects large prompts."""
tool = ChatTool()
result = await tool.execute({"prompt": large_prompt})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": large_prompt, "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
assert len(result) == 1
assert isinstance(result[0], TextContent)
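
The mkdtemp / try / finally pattern above recurs in every updated test below. As a usage note, the same setup could be factored into a pytest fixture; this is only a sketch of an alternative, not code from this commit:

import shutil
import tempfile

import pytest

@pytest.fixture
def working_directory():
    # Isolated directory handed to the chat tool, removed after each test.
    temp_dir = tempfile.mkdtemp()
    yield temp_dir
    shutil.rmtree(temp_dir, ignore_errors=True)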
@@ -73,9 +78,16 @@ class TestLargePromptHandling:
"""Test that chat tool works normally with regular prompts."""
tool = ChatTool()
temp_dir = tempfile.mkdtemp()
# This test runs in the test environment which uses dummy keys
# The chat tool will return an error for dummy keys, which is expected
result = await tool.execute({"prompt": normal_prompt, "model": "gemini-2.5-flash"})
try:
result = await tool.execute(
{"prompt": normal_prompt, "model": "gemini-2.5-flash", "working_directory": temp_dir}
)
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
assert len(result) == 1
output = json.loads(result[0].text)
@@ -105,7 +117,14 @@ class TestLargePromptHandling:
try:
# This test runs in the test environment which uses dummy keys
# The chat tool will return an error for dummy keys, which is expected
result = await tool.execute({"prompt": "", "files": [temp_prompt_file], "model": "gemini-2.5-flash"})
result = await tool.execute(
{
"prompt": "",
"files": [temp_prompt_file],
"model": "gemini-2.5-flash",
"working_directory": temp_dir,
}
)
assert len(result) == 1
output = json.loads(result[0].text)
@@ -261,7 +280,13 @@ class TestLargePromptHandling:
mock_prepare_files.return_value = ("File content", [other_file])
# Use a small prompt to avoid triggering size limit
await tool.execute({"prompt": "Test prompt", "files": [temp_prompt_file, other_file]})
await tool.execute(
{
"prompt": "Test prompt",
"files": [temp_prompt_file, other_file],
"working_directory": os.path.dirname(temp_prompt_file),
}
)
# Verify handle_prompt_file was called with the original files list
mock_handle_prompt.assert_called_once_with([temp_prompt_file, other_file])
@@ -295,7 +320,11 @@ class TestLargePromptHandling:
mock_get_provider.return_value = mock_provider
# With the fix, this should now pass because we check at MCP transport boundary before adding internal content
result = await tool.execute({"prompt": exact_prompt})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": exact_prompt, "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
output = json.loads(result[0].text)
assert output["status"] != "resend_prompt"
@@ -305,7 +334,11 @@ class TestLargePromptHandling:
tool = ChatTool()
over_prompt = "x" * (MCP_PROMPT_SIZE_LIMIT + 1)
result = await tool.execute({"prompt": over_prompt})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": over_prompt, "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
output = json.loads(result[0].text)
assert output["status"] == "resend_prompt"
@@ -326,7 +359,11 @@ class TestLargePromptHandling:
)
mock_get_provider.return_value = mock_provider
result = await tool.execute({"prompt": ""})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": "", "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
output = json.loads(result[0].text)
assert output["status"] != "resend_prompt"
@@ -362,7 +399,11 @@ class TestLargePromptHandling:
mock_model_context_class.return_value = mock_model_context
# Should continue with empty prompt when file can't be read
result = await tool.execute({"prompt": "", "files": [bad_file]})
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": "", "files": [bad_file], "working_directory": temp_dir})
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
output = json.loads(result[0].text)
assert output["status"] != "resend_prompt"
@@ -408,6 +449,7 @@ class TestLargePromptHandling:
"prompt": "Summarize the design decisions",
"files": [str(large_file)],
"model": "flash",
"working_directory": str(tmp_path),
"_model_context": dummy_context,
}
)
@@ -424,6 +466,7 @@ class TestLargePromptHandling:
This test verifies that even if our internal prompt (with system prompts, history, etc.)
exceeds MCP_PROMPT_SIZE_LIMIT, it should still work as long as the user's input is small.
"""
tool = ChatTool()
# Small user input that should pass MCP boundary check
@@ -432,62 +475,57 @@ class TestLargePromptHandling:
# Mock a huge conversation history that would exceed MCP limits if incorrectly checked
huge_history = "x" * (MCP_PROMPT_SIZE_LIMIT * 2) # 100K chars = way over 50K limit
with (
patch.object(tool, "get_model_provider") as mock_get_provider,
patch("utils.model_context.ModelContext") as mock_model_context_class,
):
from tests.mock_helpers import create_mock_provider
temp_dir = tempfile.mkdtemp()
original_prepare_prompt = tool.prepare_prompt
mock_provider = create_mock_provider(model_name="flash")
mock_get_provider.return_value = mock_provider
try:
with (
patch.object(tool, "get_model_provider") as mock_get_provider,
patch("utils.model_context.ModelContext") as mock_model_context_class,
):
from tests.mock_helpers import create_mock_provider
from utils.model_context import TokenAllocation
# Mock ModelContext to avoid the comparison issue
from utils.model_context import TokenAllocation
mock_provider = create_mock_provider(model_name="flash")
mock_get_provider.return_value = mock_provider
mock_model_context = MagicMock()
mock_model_context.model_name = "flash"
mock_model_context.provider = mock_provider
mock_model_context.calculate_token_allocation.return_value = TokenAllocation(
total_tokens=1_048_576,
content_tokens=838_861,
response_tokens=209_715,
file_tokens=335_544,
history_tokens=335_544,
)
mock_model_context_class.return_value = mock_model_context
mock_model_context = MagicMock()
mock_model_context.model_name = "flash"
mock_model_context.provider = mock_provider
mock_model_context.calculate_token_allocation.return_value = TokenAllocation(
total_tokens=1_048_576,
content_tokens=838_861,
response_tokens=209_715,
file_tokens=335_544,
history_tokens=335_544,
)
mock_model_context_class.return_value = mock_model_context
# Mock the prepare_prompt to simulate huge internal context
original_prepare_prompt = tool.prepare_prompt
async def mock_prepare_prompt(request):
normal_prompt = await original_prepare_prompt(request)
huge_internal_prompt = f"{normal_prompt}\n\n=== HUGE INTERNAL CONTEXT ===\n{huge_history}"
assert len(huge_internal_prompt) > MCP_PROMPT_SIZE_LIMIT
return huge_internal_prompt
async def mock_prepare_prompt(request):
# Call original to get normal processing
normal_prompt = await original_prepare_prompt(request)
# Add huge internal context (simulating large history, system prompts, files)
huge_internal_prompt = f"{normal_prompt}\n\n=== HUGE INTERNAL CONTEXT ===\n{huge_history}"
tool.prepare_prompt = mock_prepare_prompt
# Verify the huge internal prompt would exceed MCP limits if incorrectly checked
assert len(huge_internal_prompt) > MCP_PROMPT_SIZE_LIMIT
result = await tool.execute(
{"prompt": small_user_prompt, "model": "flash", "working_directory": temp_dir}
)
output = json.loads(result[0].text)
return huge_internal_prompt
assert output["status"] != "resend_prompt"
tool.prepare_prompt = mock_prepare_prompt
mock_provider.generate_content.assert_called_once()
call_kwargs = mock_provider.generate_content.call_args[1]
actual_prompt = call_kwargs.get("prompt")
# This should succeed because we only check user input at MCP boundary
result = await tool.execute({"prompt": small_user_prompt, "model": "flash"})
output = json.loads(result[0].text)
# Should succeed even though internal context is huge
assert output["status"] != "resend_prompt"
# Verify the model was actually called with the huge prompt
mock_provider.generate_content.assert_called_once()
call_kwargs = mock_provider.generate_content.call_args[1]
actual_prompt = call_kwargs.get("prompt")
# Verify internal prompt was huge (proving we don't limit internal processing)
assert len(actual_prompt) > MCP_PROMPT_SIZE_LIMIT
assert huge_history in actual_prompt
assert small_user_prompt in actual_prompt
assert len(actual_prompt) > MCP_PROMPT_SIZE_LIMIT
assert huge_history in actual_prompt
assert small_user_prompt in actual_prompt
finally:
tool.prepare_prompt = original_prepare_prompt
shutil.rmtree(temp_dir, ignore_errors=True)
@pytest.mark.asyncio
async def test_mcp_boundary_vs_internal_processing_distinction(self):
@@ -500,27 +538,37 @@ class TestLargePromptHandling:
# Test case 1: Large user input should fail at MCP boundary
large_user_input = "x" * (MCP_PROMPT_SIZE_LIMIT + 1000)
result = await tool.execute({"prompt": large_user_input, "model": "flash"})
output = json.loads(result[0].text)
assert output["status"] == "resend_prompt" # Should fail
assert "too large for MCP's token limits" in output["content"]
temp_dir = tempfile.mkdtemp()
try:
result = await tool.execute({"prompt": large_user_input, "model": "flash", "working_directory": temp_dir})
output = json.loads(result[0].text)
assert output["status"] == "resend_prompt" # Should fail
assert "too large for MCP's token limits" in output["content"]
# Test case 2: Small user input should succeed even with huge internal processing
small_user_input = "Hello"
# Test case 2: Small user input should succeed even with huge internal processing
small_user_input = "Hello"
# This test runs in the test environment which uses dummy keys
# The chat tool will return an error for dummy keys, which is expected
result = await tool.execute({"prompt": small_user_input, "model": "gemini-2.5-flash"})
output = json.loads(result[0].text)
# This test runs in the test environment which uses dummy keys
# The chat tool will return an error for dummy keys, which is expected
result = await tool.execute(
{
"prompt": small_user_input,
"model": "gemini-2.5-flash",
"working_directory": temp_dir,
}
)
output = json.loads(result[0].text)
# The test will fail with dummy API keys, which is expected behavior
# We're mainly testing that the tool processes small prompts correctly without size errors
if output["status"] == "error":
# If it's an API error, that's fine - we're testing prompt handling, not API calls
assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"]
else:
# If somehow it succeeds (e.g., with mocked provider), check the response
assert output["status"] != "resend_prompt"
# The test will fail with dummy API keys, which is expected behavior
# We're mainly testing that the tool processes small prompts correctly without size errors
if output["status"] == "error":
# If it's an API error, that's fine - we're testing prompt handling, not API calls
assert "API" in output["content"] or "key" in output["content"] or "authentication" in output["content"]
else:
# If somehow it succeeds (e.g., with mocked provider), check the response
assert output["status"] != "resend_prompt"
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
@pytest.mark.asyncio
async def test_continuation_with_huge_conversation_history(self):
@@ -548,6 +596,8 @@ class TestLargePromptHandling:
# Ensure the history exceeds MCP limits
assert len(huge_conversation_history) > MCP_PROMPT_SIZE_LIMIT
temp_dir = tempfile.mkdtemp()
with (
patch.object(tool, "get_model_provider") as mock_get_provider,
patch("utils.model_context.ModelContext") as mock_model_context_class,
@@ -579,6 +629,7 @@ class TestLargePromptHandling:
"prompt": f"{huge_conversation_history}\n\n=== CURRENT REQUEST ===\n{small_continuation_prompt}",
"model": "flash",
"continuation_id": "test_thread_123",
"working_directory": temp_dir,
}
# Mock the conversation history embedding to simulate server.py behavior
@@ -628,6 +679,7 @@ class TestLargePromptHandling:
finally:
# Restore original execute method
tool.__class__.execute = original_execute
shutil.rmtree(temp_dir, ignore_errors=True)
if __name__ == "__main__":