Docs added to show how a new provider is added.
Docs added to show how a new tool is created. All tools should add line numbers to code so that models can reference them if needed. Enabled line numbering for code for all tools to use. Added tests to validate that line numbering is not added to git diffs.
This commit is contained in:
@@ -11,9 +11,11 @@ from .test_content_validation import ContentValidationTest
|
||||
from .test_conversation_chain_validation import ConversationChainValidationTest
|
||||
from .test_cross_tool_comprehensive import CrossToolComprehensiveTest
|
||||
from .test_cross_tool_continuation import CrossToolContinuationTest
|
||||
from .test_line_number_validation import LineNumberValidationTest
|
||||
from .test_logs_validation import LogsValidationTest
|
||||
from .test_model_thinking_config import TestModelThinkingConfig
|
||||
from .test_o3_model_selection import O3ModelSelectionTest
|
||||
from .test_o3_pro_expensive import O3ProExpensiveTest
|
||||
from .test_ollama_custom_url import OllamaCustomUrlTest
|
||||
from .test_openrouter_fallback import OpenRouterFallbackTest
|
||||
from .test_openrouter_models import OpenRouterModelsTest
|
||||
@@ -30,6 +32,7 @@ TEST_REGISTRY = {
|
||||
"per_tool_deduplication": PerToolDeduplicationTest,
|
||||
"cross_tool_continuation": CrossToolContinuationTest,
|
||||
"cross_tool_comprehensive": CrossToolComprehensiveTest,
|
||||
"line_number_validation": LineNumberValidationTest,
|
||||
"logs_validation": LogsValidationTest,
|
||||
"redis_validation": RedisValidationTest,
|
||||
"model_thinking_config": TestModelThinkingConfig,
|
||||
@@ -41,6 +44,7 @@ TEST_REGISTRY = {
|
||||
"testgen_validation": TestGenValidationTest,
|
||||
"refactor_validation": RefactorValidationTest,
|
||||
"conversation_chain_validation": ConversationChainValidationTest,
|
||||
# "o3_pro_expensive": O3ProExpensiveTest, # COMMENTED OUT - too expensive to run by default
|
||||
}
|
||||
|
||||
__all__ = [
|
||||
@@ -50,10 +54,12 @@ __all__ = [
|
||||
"PerToolDeduplicationTest",
|
||||
"CrossToolContinuationTest",
|
||||
"CrossToolComprehensiveTest",
|
||||
"LineNumberValidationTest",
|
||||
"LogsValidationTest",
|
||||
"RedisValidationTest",
|
||||
"TestModelThinkingConfig",
|
||||
"O3ModelSelectionTest",
|
||||
"O3ProExpensiveTest",
|
||||
"OllamaCustomUrlTest",
|
||||
"OpenRouterFallbackTest",
|
||||
"OpenRouterModelsTest",
|
||||
|
||||
177
simulator_tests/test_line_number_validation.py
Normal file
177
simulator_tests/test_line_number_validation.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""
|
||||
Test to validate line number handling across different tools
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
|
||||
class LineNumberValidationTest(BaseSimulatorTest):
    """Validate line number handling in the chat, analyze, and refactor tools.

    The test writes a small Python file whose inline comments record their own
    line numbers, sends it to each tool, and checks that the responses refer
    to the expected lines. Only a failed tool request (empty response) or an
    unexpected exception fails the test; imprecise model answers are logged as
    warnings because model output is not deterministic.
    """

    @property
    def test_name(self) -> str:
        """Return the identifier of this test."""
        return "line_number_validation"

    @property
    def test_description(self) -> str:
        """Return a short human-readable description of this test."""
        return "Line number handling validation across tools"

    def run_test(self) -> bool:
        """Exercise line number handling in the chat, analyze and refactor tools.

        Returns:
            True when every tool request returned content, False when a tool
            request returned nothing or an exception was raised.
        """
        # Local import keeps this block self-contained without touching the
        # module-level import section.
        import re

        try:
            self.logger.info("Test: Line number handling validation")

            # Setup test files
            self.setup_test_files()

            # Create a test file with known content. The "Line N" comments
            # document where each element sits so the prompts below can refer
            # to exact line numbers.
            test_file_content = '''# Example code with specific elements
def calculate_total(items):
    """Calculate total with tax"""
    subtotal = 0
    tax_rate = 0.08  # Line 5 - tax_rate defined

    for item in items:  # Line 7 - loop starts
        if item.price > 0:
            subtotal += item.price

    tax_amount = subtotal * tax_rate  # Line 11
    return subtotal + tax_amount

def validate_data(data):
    """Validate input data"""  # Line 15
    required_fields = ["name", "email", "age"]  # Line 16

    for field in required_fields:
        if field not in data:
            raise ValueError(f"Missing field: {field}")

    return True  # Line 22
'''

            test_file_path = os.path.join(self.test_dir, "line_test.py")
            with open(test_file_path, "w", encoding="utf-8") as f:
                f.write(test_file_content)

            self.logger.info(f"Created test file: {test_file_path}")

            # Test 1: Chat tool asking about specific line
            self.logger.info(" 1.1: Testing chat tool with line number question")
            content, _ = self.call_mcp_tool(
                "chat",
                {
                    "prompt": "Where is tax_rate defined in this file? Please tell me the exact line number.",
                    "files": [test_file_path],
                    "model": "flash",
                },
            )

            if not content:
                self.logger.error(" ❌ Chat tool request failed")
                return False

            # Word boundaries stop "line 50" or "line 51" from counting as a
            # mention of line 5.
            if re.search(r"\bline\s+5\b", content, re.IGNORECASE):
                self.logger.info(" ✅ Chat tool correctly identified tax_rate at line 5")
            else:
                self.logger.warning(f" ⚠️ Chat tool response didn't mention line 5: {content[:200]}...")

            # Test 2: Analyze tool with line number reference
            self.logger.info(" 1.2: Testing analyze tool with line number analysis")
            content, _ = self.call_mcp_tool(
                "analyze",
                {
                    "prompt": "What happens between lines 7-11 in this code? Focus on the loop logic.",
                    "files": [test_file_path],
                    "model": "flash",
                },
            )

            if not content:
                self.logger.error(" ❌ Analyze tool request failed")
                return False

            # Check if the response references the loop
            lowered = content.lower()
            if any(term in lowered for term in ("loop", "iterate", "line 7", "lines 7")):
                self.logger.info(" ✅ Analyze tool correctly analyzed the specified line range")
            else:
                self.logger.warning(" ⚠️ Analyze tool response unclear about line range")

            # Test 3: Refactor tool with line number precision
            self.logger.info(" 1.3: Testing refactor tool line number precision")
            content, _ = self.call_mcp_tool(
                "refactor",
                {
                    "prompt": "Analyze this code for refactoring opportunities",
                    "files": [test_file_path],
                    "refactor_type": "codesmells",
                    "model": "flash",
                },
            )

            if not content:
                self.logger.error(" ❌ Refactor tool request failed")
                return False

            try:
                # Parse the JSON response
                refactor_result = json.loads(content)
            except json.JSONDecodeError:
                refactor_result = None
                self.logger.warning(" ⚠️ Refactor tool response not valid JSON")

            if isinstance(refactor_result, dict) and refactor_result.get("status") == "refactor_analysis_complete":
                # Ignore malformed (non-dict) entries instead of crashing on them.
                opportunities = [
                    opp for opp in refactor_result.get("refactor_opportunities") or [] if isinstance(opp, dict)
                ]
                if not opportunities:
                    self.logger.info(" ℹ️ No refactoring opportunities found (code might be too clean)")
                elif any(
                    opp.get("start_line") is not None and opp.get("end_line") is not None for opp in opportunities
                ):
                    self.logger.info(" ✅ Refactor tool provided precise line number references")
                    # Log some examples
                    for opp in opportunities[:2]:
                        if opp.get("start_line"):
                            # .get() everywhere: an entry may carry start_line
                            # without end_line or issue.
                            issue = str(opp.get("issue") or "")[:50]
                            self.logger.info(
                                f" - Issue at lines {opp.get('start_line')}-{opp.get('end_line')}: {issue}..."
                            )
                else:
                    self.logger.warning(" ⚠️ Refactor tool response missing line numbers")
            elif refactor_result is not None:
                # Valid JSON but not the expected completion payload; report it
                # rather than skipping silently.
                self.logger.warning(" ⚠️ Refactor tool returned an unexpected response status")

            # Test 4: Validate log patterns
            self.logger.info(" 1.4: Validating line number processing in logs")

            # Get logs from container
            result = self.run_command(
                ["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
            )

            logs = ""
            if result.returncode == 0:
                logs = result.stdout.decode()

            # Check for line number formatting patterns
            line_number_patterns = [
                "Line numbers for",
                "enabled",
                "│",  # The line number separator
                "line number",
            ]

            found_patterns = sum(1 for pattern in line_number_patterns if pattern in logs)

            self.logger.info(f" Found {found_patterns}/{len(line_number_patterns)} line number patterns in logs")

            if found_patterns >= 2:
                self.logger.info(" ✅ Line number processing confirmed in logs")
            else:
                self.logger.warning(" ⚠️ Limited line number processing evidence in logs")

            self.logger.info(" ✅ Line number validation test completed successfully")
            return True

        except Exception as e:
            # Top-level boundary of the test: report any failure as a failed run.
            self.logger.error(f"Line number validation test failed: {type(e).__name__}: {e}")
            return False
        finally:
            # Remove the files created by setup_test_files(), mirroring the
            # cleanup done by RefactorValidationTest.
            self.cleanup_test_files()
|
||||
@@ -9,6 +9,7 @@ Tests the refactor tool with a simple code smell example to validate:
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from .base_test import BaseSimulatorTest
|
||||
|
||||
|
||||
@@ -32,7 +33,7 @@ class RefactorValidationTest(BaseSimulatorTest):
|
||||
self.setup_test_files()
|
||||
|
||||
# Create a simple Python file with obvious code smells
|
||||
code_with_smells = '''# Code with obvious smells for testing
|
||||
code_with_smells = """# Code with obvious smells for testing
|
||||
def process_data(data):
|
||||
# Code smell: Magic number
|
||||
if len(data) > 42:
|
||||
@@ -57,22 +58,22 @@ def handle_everything(user_input, config, database):
|
||||
if not user_input:
|
||||
print("Error: No input") # Code smell: print instead of logging
|
||||
return
|
||||
|
||||
|
||||
# Processing
|
||||
processed = user_input.strip().lower()
|
||||
|
||||
|
||||
# Database operation
|
||||
connection = database.connect()
|
||||
data = connection.query("SELECT * FROM users") # Code smell: SQL in code
|
||||
|
||||
|
||||
# Business logic mixed with data access
|
||||
valid_users = []
|
||||
for row in data:
|
||||
if row[2] == processed: # Code smell: Magic index
|
||||
valid_users.append(row)
|
||||
|
||||
|
||||
return valid_users
|
||||
'''
|
||||
"""
|
||||
|
||||
# Create test file
|
||||
test_file = self.create_additional_test_file("smelly_code.py", code_with_smells)
|
||||
@@ -88,7 +89,7 @@ def handle_everything(user_input, config, database):
|
||||
"refactor_type": "codesmells",
|
||||
"model": "flash",
|
||||
"thinking_mode": "low", # Keep it fast for testing
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
if not response:
|
||||
@@ -96,14 +97,14 @@ def handle_everything(user_input, config, database):
|
||||
return False
|
||||
|
||||
self.logger.info(" ✅ Got refactor response")
|
||||
|
||||
|
||||
# Parse response to check for line references
|
||||
try:
|
||||
response_data = json.loads(response)
|
||||
|
||||
|
||||
# Debug: log the response structure
|
||||
self.logger.debug(f"Response keys: {list(response_data.keys())}")
|
||||
|
||||
|
||||
# Extract the actual content if it's wrapped
|
||||
if "content" in response_data:
|
||||
# The actual refactoring data is in the content field
|
||||
@@ -114,93 +115,91 @@ def handle_everything(user_input, config, database):
|
||||
if content.endswith("```"):
|
||||
content = content[:-3] # Remove ```
|
||||
content = content.strip()
|
||||
|
||||
|
||||
# Find the end of the JSON object - handle truncated responses
|
||||
# Count braces to find where the JSON ends
|
||||
brace_count = 0
|
||||
json_end = -1
|
||||
in_string = False
|
||||
escape_next = False
|
||||
|
||||
|
||||
for i, char in enumerate(content):
|
||||
if escape_next:
|
||||
escape_next = False
|
||||
continue
|
||||
if char == '\\':
|
||||
if char == "\\":
|
||||
escape_next = True
|
||||
continue
|
||||
if char == '"' and not escape_next:
|
||||
in_string = not in_string
|
||||
if not in_string:
|
||||
if char == '{':
|
||||
if char == "{":
|
||||
brace_count += 1
|
||||
elif char == '}':
|
||||
elif char == "}":
|
||||
brace_count -= 1
|
||||
if brace_count == 0:
|
||||
json_end = i + 1
|
||||
break
|
||||
|
||||
|
||||
if json_end > 0:
|
||||
content = content[:json_end]
|
||||
|
||||
|
||||
# Parse the inner JSON
|
||||
inner_data = json.loads(content)
|
||||
self.logger.debug(f"Inner data keys: {list(inner_data.keys())}")
|
||||
else:
|
||||
inner_data = response_data
|
||||
|
||||
|
||||
# Check that we got refactoring suggestions (might be called refactor_opportunities)
|
||||
refactorings_key = None
|
||||
for key in ["refactorings", "refactor_opportunities"]:
|
||||
if key in inner_data:
|
||||
refactorings_key = key
|
||||
break
|
||||
|
||||
|
||||
if not refactorings_key:
|
||||
self.logger.error("No refactorings found in response")
|
||||
self.logger.error(f"Response structure: {json.dumps(inner_data, indent=2)[:500]}...")
|
||||
return False
|
||||
|
||||
|
||||
refactorings = inner_data[refactorings_key]
|
||||
if not isinstance(refactorings, list) or len(refactorings) == 0:
|
||||
self.logger.error("Empty refactorings list")
|
||||
return False
|
||||
|
||||
|
||||
# Validate that we have line references for code smells
|
||||
# Flash model typically detects these issues:
|
||||
# - Lines 4-18: process_data function (magic number, nested loops, duplicate code)
|
||||
# - Lines 11-14: duplicate code blocks
|
||||
# - Lines 21-40: handle_everything god function
|
||||
expected_line_ranges = [
|
||||
(4, 18), # process_data function issues
|
||||
(11, 14), # duplicate code
|
||||
(21, 40), # god function
|
||||
]
|
||||
|
||||
|
||||
self.logger.debug(f"Refactorings found: {len(refactorings)}")
|
||||
for i, ref in enumerate(refactorings[:3]): # Log first 3
|
||||
self.logger.debug(f"Refactoring {i}: start_line={ref.get('start_line')}, end_line={ref.get('end_line')}, type={ref.get('type')}")
|
||||
|
||||
self.logger.debug(
|
||||
f"Refactoring {i}: start_line={ref.get('start_line')}, end_line={ref.get('end_line')}, type={ref.get('type')}"
|
||||
)
|
||||
|
||||
found_references = []
|
||||
for refactoring in refactorings:
|
||||
# Check for line numbers in various fields
|
||||
start_line = refactoring.get("start_line")
|
||||
end_line = refactoring.get("end_line")
|
||||
location = refactoring.get("location", "")
|
||||
|
||||
|
||||
# Add found line numbers
|
||||
if start_line:
|
||||
found_references.append(f"line {start_line}")
|
||||
if end_line and end_line != start_line:
|
||||
found_references.append(f"line {end_line}")
|
||||
|
||||
|
||||
# Also extract from location string
|
||||
import re
|
||||
line_matches = re.findall(r'line[s]?\s+(\d+)', location.lower())
|
||||
|
||||
line_matches = re.findall(r"line[s]?\s+(\d+)", location.lower())
|
||||
found_references.extend([f"line {num}" for num in line_matches])
|
||||
|
||||
|
||||
self.logger.info(f" 📍 Found line references: {found_references}")
|
||||
|
||||
|
||||
# Check that flash found the expected refactoring areas
|
||||
found_ranges = []
|
||||
for refactoring in refactorings:
|
||||
@@ -208,71 +207,70 @@ def handle_everything(user_input, config, database):
|
||||
end = refactoring.get("end_line")
|
||||
if start and end:
|
||||
found_ranges.append((start, end))
|
||||
|
||||
|
||||
self.logger.info(f" 📍 Found refactoring ranges: {found_ranges}")
|
||||
|
||||
|
||||
# Verify we found issues in the main problem areas
|
||||
# Check if we have issues detected in process_data function area (lines 2-18)
|
||||
process_data_issues = [r for r in found_ranges if r[0] >= 2 and r[1] <= 18]
|
||||
# Check if we have issues detected in handle_everything function area (lines 21-40)
|
||||
god_function_issues = [r for r in found_ranges if r[0] >= 21 and r[1] <= 40]
|
||||
|
||||
|
||||
self.logger.info(f" 📍 Issues in process_data area (lines 2-18): {len(process_data_issues)}")
|
||||
self.logger.info(f" 📍 Issues in handle_everything area (lines 21-40): {len(god_function_issues)}")
|
||||
|
||||
|
||||
if len(process_data_issues) >= 1 and len(god_function_issues) >= 1:
|
||||
self.logger.info(f" ✅ Flash correctly identified code smells in both major areas")
|
||||
self.logger.info(" ✅ Flash correctly identified code smells in both major areas")
|
||||
self.logger.info(f" ✅ Found {len(refactorings)} total refactoring opportunities")
|
||||
|
||||
|
||||
# Verify we have reasonable number of total issues
|
||||
if len(refactorings) >= 3:
|
||||
self.logger.info(f" ✅ Refactoring analysis validation passed")
|
||||
self.logger.info(" ✅ Refactoring analysis validation passed")
|
||||
else:
|
||||
self.logger.warning(f" ⚠️ Only {len(refactorings)} refactorings found (expected >= 3)")
|
||||
else:
|
||||
self.logger.error(f" ❌ Flash didn't find enough issues in expected areas")
|
||||
self.logger.error(" ❌ Flash didn't find enough issues in expected areas")
|
||||
self.logger.error(f" - process_data area: found {len(process_data_issues)}, expected >= 1")
|
||||
self.logger.error(f" - handle_everything area: found {len(god_function_issues)}, expected >= 1")
|
||||
return False
|
||||
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
self.logger.error(f"Failed to parse refactor response as JSON: {e}")
|
||||
return False
|
||||
|
||||
|
||||
# Validate logs
|
||||
self.logger.info(" 📋 Validating execution logs...")
|
||||
|
||||
|
||||
# Get server logs from the actual log file inside the container
|
||||
result = self.run_command(
|
||||
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"],
|
||||
capture_output=True
|
||||
["docker", "exec", self.container_name, "tail", "-500", "/tmp/mcp_server.log"], capture_output=True
|
||||
)
|
||||
|
||||
|
||||
if result.returncode == 0:
|
||||
logs = result.stdout.decode() + result.stderr.decode()
|
||||
|
||||
|
||||
# Look for refactor tool execution patterns
|
||||
refactor_patterns = [
|
||||
"[REFACTOR]",
|
||||
"refactor tool",
|
||||
"codesmells",
|
||||
"Token budget",
|
||||
"Code files embedded successfully"
|
||||
"Code files embedded successfully",
|
||||
]
|
||||
|
||||
|
||||
patterns_found = 0
|
||||
for pattern in refactor_patterns:
|
||||
if pattern in logs:
|
||||
patterns_found += 1
|
||||
self.logger.debug(f" ✅ Found log pattern: {pattern}")
|
||||
|
||||
|
||||
if patterns_found >= 3:
|
||||
self.logger.info(f" ✅ Log validation passed ({patterns_found}/{len(refactor_patterns)} patterns)")
|
||||
else:
|
||||
self.logger.warning(f" ⚠️ Only found {patterns_found}/{len(refactor_patterns)} log patterns")
|
||||
else:
|
||||
self.logger.warning(" ⚠️ Could not retrieve Docker logs")
|
||||
|
||||
|
||||
self.logger.info(" ✅ Refactor tool validation completed successfully")
|
||||
return True
|
||||
|
||||
@@ -280,4 +278,4 @@ def handle_everything(user_input, config, database):
|
||||
self.logger.error(f"Refactor validation test failed: {e}")
|
||||
return False
|
||||
finally:
|
||||
self.cleanup_test_files()
|
||||
self.cleanup_test_files()
|
||||
|
||||
Reference in New Issue
Block a user