diff --git a/server.py b/server.py
index 9241940..fcf54a9 100644
--- a/server.py
+++ b/server.py
@@ -117,8 +117,6 @@
 try:
     # Also keep a size-based rotation as backup (100MB max per file)
     # This prevents any single day's log from growing too large
-    from logging.handlers import RotatingFileHandler
-
     size_handler = RotatingFileHandler(
         "/tmp/mcp_server_overflow.log", maxBytes=100 * 1024 * 1024, backupCount=3  # 100MB
     )
diff --git a/systemprompts/codereview_prompt.py b/systemprompts/codereview_prompt.py
index bc2cfb0..8dc87cd 100644
--- a/systemprompts/codereview_prompt.py
+++ b/systemprompts/codereview_prompt.py
@@ -64,5 +64,12 @@ After listing issues, add:
 • **Top 3 priority fixes** (quick bullets)
 • **Positive aspects** worth retaining
 
+IF SCOPE TOO LARGE FOR FOCUSED REVIEW
+If the codebase is too large or complex to review effectively in a single response, you MUST request Claude to
+provide smaller, more focused subsets for review. Respond ONLY with this JSON format (and nothing else):
+{"status": "focused_review_required",
+ "reason": "<brief explanation of why the scope is too large>",
+ "suggestion": "<suggested smaller subset of files or modules to review first>"}
+
 Remember: If required information is missing, use the clarification JSON above instead of guessing.
 """
diff --git a/systemprompts/testgen_prompt.py b/systemprompts/testgen_prompt.py
index 14057bc..ede67b7 100644
--- a/systemprompts/testgen_prompt.py
+++ b/systemprompts/testgen_prompt.py
@@ -24,7 +24,8 @@ test idioms from the code snapshot provided.
    that are directly involved (network, DB, file-system, IPC).
 3. **Adversarial Thinker** – enumerates realistic failures, boundary conditions, race conditions, and misuse
    patterns that historically break similar systems.
-4. **Risk Prioritizer** – ranks findings by production impact and likelihood; discards speculative or out-of-scope cases.
+4. **Risk Prioritizer** – ranks findings by production impact and likelihood; discards speculative or
+out-of-scope cases.
 5. **Test Scaffolder** – produces deterministic, isolated tests that follow the *project's* conventions
    (assert style, fixture layout, naming, any mocking strategy, language and tooling etc).
 
@@ -41,6 +42,7 @@ pure functions).
 - Surface concurrency hazards with stress or fuzz tests when the language/runtime supports them.
 - Focus on realistic failure modes that actually occur in production
 - Remain within scope of language, framework, project. Do not over-step. Do not add unnecessary dependencies.
+- No bogus or fake tests that appear to pass without actually exercising the code under test
 
 EDGE-CASE TAXONOMY (REAL-WORLD, HIGH-VALUE)
 - **Data Shape Issues**: `null` / `undefined`, zero-length, surrogate-pair emojis, malformed UTF-8, mixed EOLs.
@@ -93,8 +95,27 @@ it but do not approach or offer refactoring ideas.
 
 DELIVERABLE
 Return only the artefacts (analysis summary, coverage plan, and generated tests) that fit the detected framework
 and code / project layout.
-No extra commentary, no generic boilerplate.
-Must comment and document logic, test reason / hypothesis in delivered code
+Group related tests but separate them into files where this is the convention and most suitable for the project at hand.
+Prefer adding tests to an existing test file if one was provided and grouping these tests makes sense.
+Must document logic, test reason/hypothesis in delivered code.
+MUST NOT add any additional information, introduction, or summaries around generated code. Deliver only the essentials
+relevant to the test.
+
+IF ADDITIONAL TEST CASES ARE REQUIRED
+If you determine that comprehensive test coverage requires generating multiple test files or a large number of
+test cases for each file that would risk exceeding context limits, you MUST follow this structured approach:
+
+1. **Generate Essential Tests First**: Create only the most critical and high-impact tests (typically 3-5 key test
+   cases covering the most important paths and failure modes). Clearly state the file these tests belong to, even if
+   these should be added to an existing test file.
+
+2. **Request Continuation**: You MUST end your message with the following, added in JSON format (and nothing
+   more after this). This will list the pending tests and their respective files (even if they belong to the same or
+   an existing test file), as this will be used for the next follow-up test generation request.
+{"status": "more_tests_required",
+"pending_tests": "test_name (file_name), another_test_name (file_name)"}
+
+This approach ensures comprehensive test coverage while maintaining quality and avoiding context overflow.
 
 Remember: your value is catching the hard bugs—not inflating coverage numbers.
 """
diff --git a/tests/test_special_status_parsing.py b/tests/test_special_status_parsing.py
new file mode 100644
index 0000000..b46445d
--- /dev/null
+++ b/tests/test_special_status_parsing.py
@@ -0,0 +1,263 @@
+"""
+Tests for special status parsing in the base tool
+"""
+
+from pydantic import BaseModel
+
+from tools.base import BaseTool
+
+
+class MockRequest(BaseModel):
+    """Mock request for testing"""
+
+    test_field: str = "test"
+
+
+class TestTool(BaseTool):
+    """Minimal test tool implementation"""
+
+    def get_name(self) -> str:
+        return "test_tool"
+
+    def get_description(self) -> str:
+        return "Test tool for special status parsing"
+
+    def get_input_schema(self) -> dict:
+        return {"type": "object", "properties": {}}
+
+    def get_system_prompt(self) -> str:
+        return "Test prompt"
+
+    def get_request_model(self):
+        return MockRequest
+
+    async def prepare_prompt(self, request) -> str:
+        return "test prompt"
+
+
+class TestSpecialStatusParsing:
+    """Test special status parsing functionality"""
+
+    def setup_method(self):
+        """Setup test tool and request"""
+        self.tool = TestTool()
+        self.request = MockRequest()
+
+    def test_full_codereview_required_parsing(self):
+        """Test parsing of full_codereview_required status"""
+        response_json = '{"status": "full_codereview_required", "reason": "Codebase too large for quick review"}'
+
+        result = self.tool._parse_response(response_json, self.request)
+
+        assert result.status == "full_codereview_required"
+        assert result.content_type == "json"
+        assert "reason" in result.content
+
+    def test_full_codereview_required_without_reason(self):
+        """Test parsing of full_codereview_required without optional reason"""
+        response_json = '{"status": "full_codereview_required"}'
+
+        result = self.tool._parse_response(response_json, self.request)
+
+        assert result.status == "full_codereview_required"
+        assert result.content_type == "json"
+
+    def test_test_sample_needed_parsing(self):
+        """Test parsing of test_sample_needed status"""
+        response_json = '{"status": "test_sample_needed", "reason": "Cannot determine test framework"}'
+
+        result = self.tool._parse_response(response_json, self.request)
+
+        assert result.status == "test_sample_needed"
+        assert result.content_type == "json"
+        assert "reason" in result.content
+
+    def test_more_tests_required_parsing(self):
+        """Test parsing of more_tests_required status"""
+        response_json = (
+            '{"status": "more_tests_required", "pending_tests": "test_auth (test_auth.py), test_login (test_user.py)"}'
+        )
+
+        result = self.tool._parse_response(response_json, self.request)
+
+        assert result.status == "more_tests_required"
+        assert result.content_type == "json"
+        assert "pending_tests" in result.content
+
+    def test_clarification_required_still_works(self):
+        """Test that existing clarification_required still works"""
+        response_json = (
+            '{"status": "clarification_required", "question": "What files need review?", "files_needed": ["src/"]}'
+        )
+
+        result = self.tool._parse_response(response_json, self.request)
+
+        assert result.status == "clarification_required"
+        assert result.content_type == "json"
+        assert "question" in result.content
+
+    def test_invalid_status_payload(self):
+        """Test that invalid payloads for known statuses are handled gracefully"""
+        # Missing required field 'reason' for test_sample_needed
+        response_json = '{"status": "test_sample_needed"}'
+
+        result = self.tool._parse_response(response_json, self.request)
+
+        # Should fall back to normal processing since validation failed
+        assert result.status in ["success", "continuation_available"]
+
+    def test_unknown_status_ignored(self):
+        """Test that unknown status types are ignored and treated as normal responses"""
+        response_json = '{"status": "unknown_status", "data": "some data"}'
+
+        result = self.tool._parse_response(response_json, self.request)
+
+        # Should be treated as normal response
+        assert result.status in ["success", "continuation_available"]
+
+    def test_normal_response_unchanged(self):
+        """Test that normal text responses are handled normally"""
+        response_text = "This is a normal response with some analysis."
+
+        result = self.tool._parse_response(response_text, self.request)
+
+        # Should be processed as normal response
+        assert result.status in ["success", "continuation_available"]
+        assert response_text in result.content
+
+    def test_malformed_json_handled(self):
+        """Test that malformed JSON is handled gracefully"""
+        response_text = '{"status": "clarification_required", "question": "incomplete json'
+
+        result = self.tool._parse_response(response_text, self.request)
+
+        # Should fall back to normal processing
+        assert result.status in ["success", "continuation_available"]
+
+    def test_metadata_preserved(self):
+        """Test that model metadata is preserved in special status responses"""
+        response_json = '{"status": "full_codereview_required", "reason": "Too complex"}'
+        model_info = {"model_name": "test-model", "provider": "test-provider"}
+
+        result = self.tool._parse_response(response_json, self.request, model_info)
+
+        assert result.status == "full_codereview_required"
+        assert result.metadata["model_used"] == "test-model"
+        assert "original_request" in result.metadata
+
+    def test_more_tests_required_detailed(self):
+        """Test more_tests_required with detailed pending_tests parameter"""
+        # Test the exact format expected by testgen prompt
+        pending_tests = "test_authentication_edge_cases (test_auth.py), test_password_validation_complex (test_auth.py), test_user_registration_flow (test_user.py)"
+        response_json = f'{{"status": "more_tests_required", "pending_tests": "{pending_tests}"}}'
+
+        result = self.tool._parse_response(response_json, self.request)
+
+        assert result.status == "more_tests_required"
+        assert result.content_type == "json"
+
+        # Verify the content contains the validated, parsed data
+        import json
+
+        parsed_content = json.loads(result.content)
parsed_content["status"] == "more_tests_required" + assert parsed_content["pending_tests"] == pending_tests + + # Verify Claude would receive the pending_tests parameter correctly + assert "test_authentication_edge_cases (test_auth.py)" in parsed_content["pending_tests"] + assert "test_password_validation_complex (test_auth.py)" in parsed_content["pending_tests"] + assert "test_user_registration_flow (test_user.py)" in parsed_content["pending_tests"] + + def test_more_tests_required_missing_pending_tests(self): + """Test that more_tests_required without required pending_tests field fails validation""" + response_json = '{"status": "more_tests_required"}' + + result = self.tool._parse_response(response_json, self.request) + + # Should fall back to normal processing since validation failed (missing required field) + assert result.status in ["success", "continuation_available"] + assert result.content_type != "json" + + def test_test_sample_needed_missing_reason(self): + """Test that test_sample_needed without required reason field fails validation""" + response_json = '{"status": "test_sample_needed"}' + + result = self.tool._parse_response(response_json, self.request) + + # Should fall back to normal processing since validation failed (missing required field) + assert result.status in ["success", "continuation_available"] + assert result.content_type != "json" + + def test_special_status_json_format_preserved(self): + """Test that special status responses preserve exact JSON format for Claude""" + test_cases = [ + { + "input": '{"status": "clarification_required", "question": "What framework to use?", "files_needed": ["tests/"]}', + "expected_fields": ["status", "question", "files_needed"], + }, + { + "input": '{"status": "full_codereview_required", "reason": "Codebase too large"}', + "expected_fields": ["status", "reason"], + }, + { + "input": '{"status": "test_sample_needed", "reason": "Cannot determine test framework"}', + "expected_fields": ["status", "reason"], + }, + { + "input": '{"status": "more_tests_required", "pending_tests": "test_auth (test_auth.py), test_login (test_user.py)"}', + "expected_fields": ["status", "pending_tests"], + }, + ] + + for test_case in test_cases: + result = self.tool._parse_response(test_case["input"], self.request) + + # Verify status is correctly detected + import json + + input_data = json.loads(test_case["input"]) + assert result.status == input_data["status"] + assert result.content_type == "json" + + # Verify all expected fields are preserved in the response + parsed_content = json.loads(result.content) + for field in test_case["expected_fields"]: + assert field in parsed_content, f"Field {field} missing from {input_data['status']} response" + assert ( + parsed_content[field] == input_data[field] + ), f"Field {field} value mismatch in {input_data['status']} response" + + def test_focused_review_required_parsing(self): + """Test that focused_review_required status is parsed correctly""" + import json + + json_response = { + "status": "focused_review_required", + "reason": "Codebase too large for single review", + "suggestion": "Review authentication module (auth.py, login.py)", + } + + result = self.tool._parse_response(json.dumps(json_response), self.request) + + assert result.status == "focused_review_required" + assert result.content_type == "json" + parsed_content = json.loads(result.content) + assert parsed_content["status"] == "focused_review_required" + assert parsed_content["reason"] == "Codebase too large for single review" + assert 
parsed_content["suggestion"] == "Review authentication module (auth.py, login.py)" + + def test_focused_review_required_missing_suggestion(self): + """Test that focused_review_required fails validation without suggestion""" + import json + + json_response = { + "status": "focused_review_required", + "reason": "Codebase too large", + # Missing required suggestion field + } + + result = self.tool._parse_response(json.dumps(json_response), self.request) + + # Should fall back to normal response since validation failed + assert result.status == "success" + assert result.content_type == "text" diff --git a/tests/test_testgen.py b/tests/test_testgen.py index b2ff70b..fd97b29 100644 --- a/tests/test_testgen.py +++ b/tests/test_testgen.py @@ -284,7 +284,7 @@ class TestComprehensive(unittest.TestCase): # Check formatting includes new action-oriented next steps assert raw_response in formatted - assert "IMMEDIATE ACTION REQUIRED" in formatted + assert "IMMEDIATE NEXT ACTION" in formatted assert "ULTRATHINK" in formatted assert "CREATE" in formatted assert "VALIDATE BY EXECUTION" in formatted diff --git a/tools/base.py b/tools/base.py index 0e6d89d..d418b7a 100644 --- a/tools/base.py +++ b/tools/base.py @@ -37,7 +37,7 @@ from utils.conversation_memory import ( ) from utils.file_utils import read_file_content, read_files, translate_path_for_environment -from .models import ClarificationRequest, ContinuationOffer, ToolOutput +from .models import SPECIAL_STATUS_MODELS, ContinuationOffer, ToolOutput logger = logging.getLogger(__name__) @@ -1183,31 +1183,43 @@ When recommending searches, be specific about what information you need and why logger = logging.getLogger(f"tools.{self.name}") try: - # Try to parse as JSON to check for clarification requests + # Try to parse as JSON to check for special status requests potential_json = json.loads(raw_text.strip()) - if isinstance(potential_json, dict) and potential_json.get("status") == "clarification_required": - # Validate the clarification request structure - clarification = ClarificationRequest(**potential_json) - logger.debug(f"{self.name} tool requested clarification: {clarification.question}") - # Extract model information for metadata - metadata = { - "original_request": (request.model_dump() if hasattr(request, "model_dump") else str(request)) - } - if model_info: - model_name = model_info.get("model_name") - if model_name: - metadata["model_used"] = model_name + if isinstance(potential_json, dict) and "status" in potential_json: + status_key = potential_json.get("status") + status_model = SPECIAL_STATUS_MODELS.get(status_key) - return ToolOutput( - status="clarification_required", - content=clarification.model_dump_json(), - content_type="json", - metadata=metadata, - ) + if status_model: + try: + # Use Pydantic for robust validation of the special status + parsed_status = status_model.model_validate(potential_json) + logger.debug(f"{self.name} tool detected special status: {status_key}") + + # Extract model information for metadata + metadata = { + "original_request": ( + request.model_dump() if hasattr(request, "model_dump") else str(request) + ) + } + if model_info: + model_name = model_info.get("model_name") + if model_name: + metadata["model_used"] = model_name + + return ToolOutput( + status=status_key, + content=parsed_status.model_dump_json(), + content_type="json", + metadata=metadata, + ) + + except Exception as e: + # Invalid payload for known status, log warning and continue as normal response + logger.warning(f"Invalid {status_key} 
+                        logger.warning(f"Invalid {status_key} payload: {e}")
 
         except (json.JSONDecodeError, ValueError, TypeError):
-            # Not a JSON clarification request, treat as normal response
+            # Not a JSON special status request, treat as normal response
             pass
 
         # Normal text response - format using tool-specific formatting
diff --git a/tools/models.py b/tools/models.py
index 146ecd8..bfc9847 100644
--- a/tools/models.py
+++ b/tools/models.py
@@ -36,6 +36,10 @@ class ToolOutput(BaseModel):
         "success",
         "error",
         "clarification_required",
+        "full_codereview_required",
+        "focused_review_required",
+        "test_sample_needed",
+        "more_tests_required",
         "resend_prompt",
         "continuation_available",
     ] = "success"
@@ -50,6 +54,7 @@ class ToolOutput(BaseModel):
 class ClarificationRequest(BaseModel):
     """Request for additional context or clarification"""
 
+    status: Literal["clarification_required"] = "clarification_required"
     question: str = Field(..., description="Question to ask Claude for more context")
     files_needed: Optional[list[str]] = Field(
         default_factory=list, description="Specific files that are needed for analysis"
     )
@@ -60,6 +65,48 @@
     )
 
 
+class FullCodereviewRequired(BaseModel):
+    """Request for full code review when scope is too large for quick review"""
+
+    status: Literal["full_codereview_required"] = "full_codereview_required"
+    important: Optional[str] = Field(None, description="Important message about escalation")
+    reason: Optional[str] = Field(None, description="Reason why full review is needed")
+
+
+class FocusedReviewRequired(BaseModel):
+    """Request for Claude to provide smaller, focused subsets of code for review"""
+
+    status: Literal["focused_review_required"] = "focused_review_required"
+    reason: str = Field(..., description="Why the current scope is too large for effective review")
+    suggestion: str = Field(
+        ..., description="Suggested approach for breaking down the review into smaller, focused parts"
+    )
+
+
+class TestSampleNeeded(BaseModel):
+    """Request for additional test samples to determine testing framework"""
+
+    status: Literal["test_sample_needed"] = "test_sample_needed"
+    reason: str = Field(..., description="Reason why additional test samples are required")
+
+
+class MoreTestsRequired(BaseModel):
+    """Request for continuation to generate additional tests"""
+
+    status: Literal["more_tests_required"] = "more_tests_required"
+    pending_tests: str = Field(..., description="List of pending tests to be generated")
+
+
+# Registry mapping status strings to their corresponding Pydantic models
+SPECIAL_STATUS_MODELS = {
+    "clarification_required": ClarificationRequest,
+    "full_codereview_required": FullCodereviewRequired,
+    "focused_review_required": FocusedReviewRequired,
+    "test_sample_needed": TestSampleNeeded,
+    "more_tests_required": MoreTestsRequired,
+}
+
+
 class DiagnosticHypothesis(BaseModel):
     """A debugging hypothesis with context and next steps"""
 
diff --git a/tools/testgen.py b/tools/testgen.py
index 428662c..f3648eb 100644
--- a/tools/testgen.py
+++ b/tools/testgen.py
@@ -117,7 +117,10 @@
                 },
                 "continuation_id": {
                     "type": "string",
-                    "description": "Thread continuation ID for multi-turn conversations. Can be used to continue conversations across different tools. Only provide this if continuing a previous conversation thread.",
+                    "description": (
+                        "Thread continuation ID for multi-turn conversations. Can be used to continue conversations "
+                        "across different tools. Only provide this if continuing a previous conversation thread."
+                    ),
                 },
             },
             "required": ["files", "prompt"] + (["model"] if self.is_effective_auto_mode() else []),
@@ -436,7 +439,7 @@
 
 ---
 
-# IMMEDIATE ACTION REQUIRED
+# IMMEDIATE NEXT ACTION
 
 Claude, you are now in EXECUTION MODE. Take immediate action:
 
@@ -472,5 +475,6 @@ After creating each test file, show the user:
 ## Step 5: MOVE TO NEXT ACTION
 
 Once tests are confirmed working, immediately proceed to the next logical step for the project.
-**CRITICAL**: Do NOT stop after generating - you MUST create, validate, run, and confirm the tests work.
-Take full ownership of the testing implementation and move to your next work."""
+**CRITICAL**: Do NOT stop after generating - you MUST create, validate, run, and confirm the tests work. Take full
+ownership of the testing implementation and move to your next work. If you were supplied a more_tests_required request
+in the response above, you MUST honor it."""
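
Reviewer note (not part of the diff): below is a minimal standalone sketch of how the registry-driven special-status parsing introduced in tools/models.py and tools/base.py is intended to behave. It assumes the layout above (tools.models exporting SPECIAL_STATUS_MODELS, Pydantic v2 models with model_validate/model_dump_json) and mirrors the _parse_response flow without the ToolOutput and metadata plumbing; the helper name classify_special_status is illustrative only.

import json

from pydantic import ValidationError

from tools.models import SPECIAL_STATUS_MODELS


def classify_special_status(raw_text: str):
    """Return (status, validated_json) for a known special status, else (None, None)."""
    try:
        payload = json.loads(raw_text.strip())
    except (json.JSONDecodeError, ValueError, TypeError):
        return None, None  # plain text falls through to the normal response path

    if not isinstance(payload, dict) or "status" not in payload:
        return None, None

    status_model = SPECIAL_STATUS_MODELS.get(payload.get("status"))
    if status_model is None:
        return None, None  # unknown status values are treated as normal responses

    try:
        parsed = status_model.model_validate(payload)
    except ValidationError:
        return None, None  # missing required fields: fall back to normal processing

    return payload["status"], parsed.model_dump_json()


# Example: the continuation handshake described in the testgen prompt above
raw = '{"status": "more_tests_required", "pending_tests": "test_auth (test_auth.py)"}'
status, content = classify_special_status(raw)
assert status == "more_tests_required"

As in the new tests, an unknown status or a payload that fails validation is deliberately not an error: the caller simply treats the text as a normal response.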