GPT-5, GPT-5-mini support

Improvements to model name resolution. Improved instructions for multi-step workflows when continuation is available. Improved instructions for chat tool. Improved preferred model resolution; moved code from registry -> each provider. Updated tests.
2025-08-08 08:51:34 +05:00
parent 9a4791cb06
commit 1a8ec2e12f
30 changed files with 792 additions and 483 deletions
--- a/tools/chat.py
+++ b/tools/chat.py
@@ -23,6 +23,9 @@ from .simple.base import SimpleTool
 CHAT_FIELD_DESCRIPTIONS = {
    "prompt": (
        "You MUST provide a thorough, expressive question or share an idea with as much context as possible. "
+        "IMPORTANT: When referring to code, use the files parameter to pass relevant files and only use the prompt to refer to "
+        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
+        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
        "Remember: you're talking to an assistant who has deep expertise and can provide nuanced insights. Include your "
        "current thinking, specific challenges, background context, what you've already tried, and what "
        "kind of response would be most helpful. The more context and detail you provide, the more "
--- a/tools/codereview.py
+++ b/tools/codereview.py
@@ -45,6 +45,9 @@ CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
        "and ways to reduce complexity while maintaining functionality. Map out the codebase structure, understand "
        "the business logic, and identify areas requiring deeper analysis. In all later steps, continue exploring "
        "with precision: trace dependencies, verify assumptions, and adapt your understanding as you uncover more evidence."
+        " IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
+        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
+        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
    ),
    "step_number": (
        "The index of the current step in the code review sequence, beginning at 1. Each step should build upon or "
@@ -52,11 +55,13 @@ CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
    ),
    "total_steps": (
        "Your current estimate for how many steps will be needed to complete the code review. "
-        "Adjust as new findings emerge."
+        "Adjust as new findings emerge. MANDATORY: When continuation_id is provided (continuing a previous "
+        "conversation), set this to 1 as we're not starting a new multi-step investigation."
    ),
    "next_step_required": (
        "Set to true if you plan to continue the investigation with another step. False means you believe the "
-        "code review analysis is complete and ready for expert validation."
+        "code review analysis is complete and ready for expert validation. MANDATORY: When continuation_id is "
+        "provided (continuing a previous conversation), set this to False to immediately proceed with expert analysis."
    ),
    "findings": (
        "Summarize everything discovered in this step about the code being reviewed. Include analysis of code quality, "
@@ -91,13 +96,14 @@ CODEREVIEW_WORKFLOW_FIELD_DESCRIPTIONS = {
        "unnecessary complexity, etc."
    ),
    "confidence": (
-        "Indicate your current confidence in the code review assessment. Use: 'exploring' (starting analysis), 'low' "
-        "(early investigation), 'medium' (some evidence gathered), 'high' (strong evidence), "
-        "'very_high' (very strong evidence), 'almost_certain' (nearly complete review), 'certain' (100% confidence - "
-        "code review is thoroughly complete and all significant issues are identified with no need for external model validation). "
-        "Do NOT use 'certain' unless the code review is comprehensively complete, use 'very_high' or 'almost_certain' instead if not 100% sure. "
-        "Using 'certain' means you have complete confidence locally and prevents external model validation. Also do "
-        "NOT set confidence to 'certain' if the user has strongly requested that external review must be performed."
+        "Indicate your current confidence in the assessment. Use: 'exploring' (starting analysis), 'low' (early "
+        "investigation), 'medium' (some evidence gathered), 'high' (strong evidence), "
+        "'very_high' (very strong evidence), 'almost_certain' (nearly complete review), 'certain' (200% confidence - "
+        "code review is thoroughly complete and all significant issues are identified with no need for external model validation). "
+        "Do NOT use 'certain' unless the code review is comprehensively complete, use 'very_high' or 'almost_certain' "
+        "instead if not 200% sure. "
+        "Using 'certain' means you have complete confidence locally and prevents external model validation. Also "
+        "do NOT set confidence to 'certain' if the user has strongly requested that external validation MUST be performed."
    ),
    "backtrack_from_step": (
        "If an earlier finding or assessment needs to be revised or discarded, specify the step number from which to "
@@ -572,6 +578,17 @@ class CodeReviewTool(WorkflowTool):
        """
        Provide step-specific guidance for code review workflow.
        """
+        # Check if this is a continuation - if so, skip workflow and go to expert analysis
+        continuation_id = self.get_request_continuation_id(request)
+        if continuation_id:
+            return {
+                "next_steps": (
+                    "Continuing previous conversation. The expert analysis will now be performed based on the "
+                    "accumulated context from the previous conversation. The analysis will build upon the prior "
+                    "findings without repeating the investigation steps."
+                )
+            }
+
        # Generate the next steps instruction based on required actions
        required_actions = self.get_required_actions(step_number, confidence, request.findings, request.total_steps)

--- a/tools/debug.py
+++ b/tools/debug.py
@@ -45,6 +45,9 @@ DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
        "could cause instability. In concurrent systems, watch for race conditions, shared state, or timing "
        "dependencies. In all later steps, continue exploring with precision: trace deeper dependencies, verify "
        "hypotheses, and adapt your understanding as you uncover more evidence."
+        " IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
+        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
+        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
    ),
    "step_number": (
        "The index of the current step in the investigation sequence, beginning at 1. Each step should build upon or "
@@ -52,11 +55,13 @@ DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
    ),
    "total_steps": (
        "Your current estimate for how many steps will be needed to complete the investigation. "
-        "Adjust as new findings emerge."
+        "Adjust as new findings emerge. IMPORTANT: When continuation_id is provided (continuing a previous "
+        "conversation), set this to 1 as we're not starting a new multi-step investigation."
    ),
    "next_step_required": (
        "Set to true if you plan to continue the investigation with another step. False means you believe the root "
-        "cause is known or the investigation is complete."
+        "cause is known or the investigation is complete. IMPORTANT: When continuation_id is "
+        "provided (continuing a previous conversation), set this to False to immediately proceed with expert analysis."
    ),
    "findings": (
        "Summarize everything discovered in this step. Include new clues, unexpected behavior, evidence from code or "
@@ -92,10 +97,10 @@ DEBUG_INVESTIGATION_FIELD_DESCRIPTIONS = {
    "confidence": (
        "Indicate your current confidence in the hypothesis. Use: 'exploring' (starting out), 'low' (early idea), "
        "'medium' (some supporting evidence), 'high' (strong evidence), 'very_high' (very strong evidence), "
-        "'almost_certain' (nearly confirmed), 'certain' (100% confidence - root cause and minimal fix are both "
+        "'almost_certain' (nearly confirmed), 'certain' (200% confidence - root cause and minimal fix are both "
        "confirmed locally with no need for external model validation). Do NOT use 'certain' unless the issue can be "
-        "fully resolved with a fix, use 'very_high' or 'almost_certain' instead when not 100% sure. Using 'certain' "
-        "means you have complete confidence locally and prevents external model validation. Also do "
+        "fully resolved with a fix, use 'very_high' or 'almost_certain' instead when not 200% sure. Using 'certain' "
+        "means you have ABSOLUTE confidence locally and prevents external model validation. Also do "
        "NOT set confidence to 'certain' if the user has strongly requested that external validation MUST be performed."
    ),
    "backtrack_from_step": (
--- a/tools/listmodels.py
+++ b/tools/listmodels.py
@@ -225,7 +225,7 @@ class ListModelsTool(BaseTool):
                output_lines.append(f"**Error loading models**: {str(e)}")
        else:
            output_lines.append("**Status**: Not configured (set OPENROUTER_API_KEY)")
-            output_lines.append("**Note**: Provides access to GPT-4, O3, Mistral, and many more")
+            output_lines.append("**Note**: Provides access to GPT-5, O3, Mistral, and many more")

        output_lines.append("")

@@ -295,7 +295,7 @@ class ListModelsTool(BaseTool):

        # Add usage tips
        output_lines.append("\n**Usage Tips**:")
-        output_lines.append("- Use model aliases (e.g., 'flash', 'o3', 'opus') for convenience")
+        output_lines.append("- Use model aliases (e.g., 'flash', 'gpt5', 'opus') for convenience")
        output_lines.append("- In auto mode, the CLI Agent will select the best model for each task")
        output_lines.append("- Custom models are only available when CUSTOM_API_URL is set")
        output_lines.append("- OpenRouter provides access to many cloud models with one API key")
--- a/tools/precommit.py
+++ b/tools/precommit.py
@@ -42,6 +42,9 @@ PRECOMMIT_WORKFLOW_FIELD_DESCRIPTIONS = {
        "performance impacts, and maintainability concerns. Map out changed files, understand the business logic, "
        "and identify areas requiring deeper analysis. In all later steps, continue exploring with precision: "
        "trace dependencies, verify hypotheses, and adapt your understanding as you uncover more evidence."
+        " IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
+        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
+        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
    ),
    "step_number": (
        "The index of the current step in the pre-commit investigation sequence, beginning at 1. Each step should "
@@ -49,11 +52,13 @@ PRECOMMIT_WORKFLOW_FIELD_DESCRIPTIONS = {
    ),
    "total_steps": (
        "Your current estimate for how many steps will be needed to complete the pre-commit investigation. "
-        "Adjust as new findings emerge."
+        "Adjust as new findings emerge. IMPORTANT: When continuation_id is provided (continuing a previous "
+        "conversation), set this to 1 as we're not starting a new multi-step investigation."
    ),
    "next_step_required": (
        "Set to true if you plan to continue the investigation with another step. False means you believe the "
-        "pre-commit analysis is complete and ready for expert validation."
+        "pre-commit analysis is complete and ready for expert validation. IMPORTANT: When continuation_id is "
+        "provided (continuing a previous conversation), set this to False to immediately proceed with expert analysis."
    ),
    "findings": (
        "Summarize everything discovered in this step about the changes being committed. Include analysis of git diffs, "
@@ -87,9 +92,10 @@ PRECOMMIT_WORKFLOW_FIELD_DESCRIPTIONS = {
    "confidence": (
        "Indicate your current confidence in the assessment. Use: 'exploring' (starting analysis), 'low' (early "
        "investigation), 'medium' (some evidence gathered), 'high' (strong evidence), "
-        "'very_high' (very strong evidence), 'almost_certain' (nearly complete validation), 'certain' (100% confidence - "
+        "'very_high' (very strong evidence), 'almost_certain' (nearly complete validation), 'certain' (200% confidence - "
        "analysis is complete and all issues are identified with no need for external model validation). "
-        "Do NOT use 'certain' unless the pre-commit validation is thoroughly complete, use 'very_high' or 'almost_certain' instead if not 100% sure. "
+        "Do NOT use 'certain' unless the pre-commit validation is thoroughly complete, use 'very_high' or 'almost_certain' "
+        "instead if not 200% sure. "
        "Using 'certain' means you have complete confidence locally and prevents external model validation. Also "
        "do NOT set confidence to 'certain' if the user has strongly requested that external validation MUST be performed."
    ),
@@ -584,6 +590,17 @@ class PrecommitTool(WorkflowTool):
        """
        Provide step-specific guidance for precommit workflow.
        """
+        # Check if this is a continuation - if so, skip workflow and go to expert analysis
+        continuation_id = self.get_request_continuation_id(request)
+        if continuation_id:
+            return {
+                "next_steps": (
+                    "Continuing previous conversation. The expert analysis will now be performed based on the "
+                    "accumulated context from the previous conversation. The analysis will build upon the prior "
+                    "findings without repeating the investigation steps."
+                )
+            }
+
        # Generate the next steps instruction based on required actions
        required_actions = self.get_required_actions(step_number, confidence, request.findings, request.total_steps)

--- a/tools/refactor.py
+++ b/tools/refactor.py
@@ -44,6 +44,9 @@ REFACTOR_FIELD_DESCRIPTIONS = {
        "structure, understand the business logic, and identify areas requiring refactoring. In all later steps, continue "
        "exploring with precision: trace dependencies, verify assumptions, and adapt your understanding as you uncover "
        "more refactoring opportunities."
+        " IMPORTANT: When referring to code, use the relevant_files parameter to pass relevant files and only use the prompt to refer to "
+        "function / method names or very small code snippets if absolutely necessary to explain the issue. Do NOT "
+        "pass large code snippets in the prompt as this is exclusively reserved for descriptive text only. "
    ),
    "step_number": (
        "The index of the current step in the refactoring investigation sequence, beginning at 1. Each step should "
--- a/tools/workflow/base.py
+++ b/tools/workflow/base.py
@@ -390,6 +390,23 @@ class WorkflowTool(BaseTool, BaseWorkflowMixin):
        """Get status for skipped expert analysis. Override for tool-specific status."""
        return "skipped_by_tool_design"

+    def is_continuation_workflow(self, request) -> bool:
+        """
+        Report whether the request continues an earlier conversation.
+
+        A request carrying a continuation_id typically resumes a previous
+        conversation, so the workflow should proceed straight to expert analysis
+        instead of launching a fresh multi-step investigation.
+
+        Args:
+            request: The workflow request object
+
+        Returns:
+            True when the request's continuation_id is truthy, False otherwise
+        """
+        # bool() collapses any falsy ID (e.g. None or an empty string) to False.
+        return bool(self.get_request_continuation_id(request))
+
    # Abstract methods that must be implemented by specific workflow tools
    # (These are inherited from BaseWorkflowMixin and must be implemented)

--- a/tools/workflow/workflow_mixin.py
+++ b/tools/workflow/workflow_mixin.py
@@ -663,13 +663,13 @@ class BaseWorkflowMixin(ABC):
                self._current_model_name = None
                self._model_context = None

+            # Handle continuation
+            continuation_id = request.continuation_id
+
            # Adjust total steps if needed
            if request.step_number > request.total_steps:
                request.total_steps = request.step_number

-            # Handle continuation
-            continuation_id = request.continuation_id
-
            # Create thread for first step
            if not continuation_id and request.step_number == 1:
                clean_args = {k: v for k, v in arguments.items() if k not in ["_model_context", "_resolved_model_name"]}