diff --git a/tests/test_e2e_issue_533_duplicate_validation.py b/tests/test_e2e_issue_533_duplicate_validation.py
new file mode 100644
index 000000000..11d5dbdd2
--- /dev/null
+++ b/tests/test_e2e_issue_533_duplicate_validation.py
@@ -0,0 +1,414 @@
+"""
+E2E tests for Issue #533: Orchestrator should validate LLM duplicate detection output.
+
+These E2E tests differ from the unit tests in test_issue_533_duplicate_validation.py:
+- Unit tests mock load_prompt_template and test orchestrator logic in isolation
+- E2E tests use REAL prompt loading via load_prompt_template (not mocked)
+- E2E tests exercise the full pipeline: prompt file → preprocess → format → orchestrator
+
+Bug: The orchestrator at line 441 of agentic_bug_orchestrator.py blindly trusts
+the LLM's duplicate detection output without validating that the original issue
+is actually resolved. When the LLM fails to follow prompt instructions and outputs
+"Duplicate of #520" even though #520 is OPEN, the orchestrator incorrectly triggers
+a hard stop and closes the issue.
+
+Real-world scenario (issue #530/#520):
+- User files issue #530 about a bug
+- LLM outputs "Duplicate of #520" without verifying #520's status
+- Issue #520 is still OPEN (unresolved)
+- Orchestrator blindly trusts LLM and stops workflow, closing #530
+- User had to manually reopen #530
+
+Root cause:
+The hard stop at line 441 checks for "Duplicate of #" but doesn't validate:
+    if step_num == 1 and "Duplicate of #" in output:
+        msg = f"Stopped at Step 1: Issue is a duplicate. {output.strip()}"
+        return False, msg, total_cost, last_model_used, changed_files
+
+Fix:
+The orchestrator should validate the original issue's state using `gh issue view`
+before triggering the hard stop. If the original issue is still OPEN (unresolved),
+the orchestrator should log a warning and continue the workflow.
+
+This is a regression of issue #469, which fixed the prompts but didn't add
+orchestrator-level validation as defense-in-depth against LLM instruction-following failures.
+
+Test Strategy:
+- Test 1: LLM outputs duplicate of UNRESOLVED issue → orchestrator validates & continues
+- Test 2: LLM outputs duplicate of RESOLVED issue → hard stop works correctly (regression)
+- Test 3: Exact scenario from issue #533 using real issue numbers #530/#520
+"""
+
+import os
+import re
+import subprocess
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from pdd.agentic_bug_orchestrator import run_agentic_bug_orchestrator
+
+# Project root: the worktree (or repo root) containing prompts/
+_PROJECT_ROOT = Path(__file__).resolve().parent.parent
+
+
+@pytest.fixture(autouse=True)
+def set_pdd_path_to_project_root():
+    """Ensure PDD_PATH points to the project root so load_prompt_template
+    picks up the prompts/ directory from this worktree, not an external install."""
+    old = os.environ.get("PDD_PATH")
+    os.environ["PDD_PATH"] = str(_PROJECT_ROOT)
+    yield
+    if old is not None:
+        os.environ["PDD_PATH"] = old
+    elif "PDD_PATH" in os.environ:
+        del os.environ["PDD_PATH"]
+
+
+@pytest.fixture
+def mock_git_repo(tmp_path):
+    """Create a minimal git repository for testing the orchestrator."""
+    repo_path = tmp_path / "test_repo"
+    repo_path.mkdir()
+
+    subprocess.run(
+        ["git", "init", "-b", "main"], cwd=repo_path,
+        check=True, capture_output=True
+    )
+    subprocess.run(
+        ["git", "config", "user.email", "test@test.com"],
+        cwd=repo_path, check=True
+    )
+    subprocess.run(
+        ["git", "config", "user.name", "Test User"],
+        cwd=repo_path, check=True
+    )
+
+    (repo_path / "README.md").write_text("# Test Repository\n")
+    subprocess.run(["git", "add", "."], cwd=repo_path, check=True)
+    subprocess.run(
+        ["git", "commit", "-m", "Initial commit"],
+        cwd=repo_path, check=True, capture_output=True
+    )
+
+    return repo_path
+
+
+class TestIssue533DuplicateValidationE2E:
+    """
+    E2E tests for Issue #533: Orchestrator should validate duplicate detection.
+
+    These tests exercise the real prompt loading, preprocessing, and formatting
+    pipeline — only the LLM execution layer (run_agentic_task), worktree setup
+    (_setup_worktree), console output, and subprocess.run (gh/git) are mocked.
+    """
+
+    def test_llm_outputs_duplicate_of_unresolved_issue_workflow_continues(self, mock_git_repo):
+        """
+        E2E Test: When the LLM fails to follow prompt instructions and outputs
+        "Duplicate of #520" even though #520 is OPEN (unresolved), the orchestrator
+        should validate the original issue's state and continue the workflow.
+
+        This is the PRIMARY BUG SCENARIO from issue #533.
+
+        This exercises the full code path:
+        1. Real load_prompt_template (loads actual prompt file from disk)
+        2. Real preprocess() and format() (expands includes, substitutes vars)
+        3. Real orchestrator loop logic (step iteration, hard-stop checks)
+        4. Mocked LLM (returns output where it failed to follow prompt instructions)
+        5. Mocked worktree setup (avoids real git operations)
+
+        The mock LLM simulates what happened in the real bug: the LLM output
+        "Duplicate of #520" WITHOUT properly checking that #520 was resolved.
+        The orchestrator should catch this error and validate the issue state.
+
+        EXPECTED BEHAVIOR (after fix):
+        - Orchestrator detects "Duplicate of #" in LLM output
+        - Orchestrator calls `gh issue view 520` to validate
+        - Orchestrator sees #520 is OPEN (unresolved)
+        - Orchestrator logs warning about LLM failing to follow instructions
+        - Orchestrator continues workflow to Step 2
+
+        BUGGY BEHAVIOR (before fix):
+        - Orchestrator detects "Duplicate of #" in LLM output
+        - Orchestrator immediately triggers hard stop
+        - Workflow stops at Step 1, issue closed as duplicate
+        - User has to manually reopen the issue
+        """
+        mock_worktree = mock_git_repo / ".pdd" / "worktrees" / "fix-issue-533"
+        mock_worktree.mkdir(parents=True, exist_ok=True)
+
+        steps_executed = []
+
+        def mock_run_agentic_task(instruction, cwd, verbose, quiet, timeout, label, max_retries):
+            """Mock LLM that fails to follow prompt instructions."""
+            match = re.search(r"step(\d+(?:_\d+)?)", label)
+            if match:
+                steps_executed.append(label)
+
+            if label == "step1":  # exact match: a substring test would also fire for e.g. "step10"
+                # LLM FAILS to follow the prompt: it outputs "Duplicate of #520"
+                # without properly checking that #520 is OPEN (unresolved).
+                # This simulates the real-world bug from issue #533.
+                return (
+                    True,
+                    "## Step 1: Duplicate Check\n\n"
+                    "**Status:** Duplicate of #520\n\n"
+                    "### Search Performed\n"
+                    "- Searched for: pdd bug agents closes duplicated issues\n"
+                    "- Issues reviewed: 5\n\n"
+                    "### Findings\n"
+                    "Found issue #520 which reports the exact same problem. "
+                    "This is a duplicate.\n\n"
+                    "---",
+                    0.01,
+                    "mock-model",
+                )
+
+            if label == "step7":
+                return (True, "Generated unit test\nFILES_CREATED: test_fix.py", 0.01, "mock-model")
+
+            return (True, f"Mock output for {label}", 0.01, "mock-model")
+
+        with patch("pdd.agentic_bug_orchestrator.run_agentic_task", side_effect=mock_run_agentic_task), \
+             patch("pdd.agentic_bug_orchestrator.console"), \
+             patch("pdd.agentic_bug_orchestrator._setup_worktree", return_value=(mock_worktree, None)), \
+             patch("subprocess.run") as mock_subprocess:
+
+            # Mock gh issue view to return that #520 is OPEN (unresolved)
+            def subprocess_side_effect(*args, **kwargs):
+                cmd = args[0] if args else kwargs.get('args', [])
+                if isinstance(cmd, list) and 'gh' in cmd and 'issue' in cmd and 'view' in cmd:
+                    # Return mock JSON showing issue #520 is OPEN
+                    mock_result = subprocess.CompletedProcess(
+                        args=cmd,
+                        returncode=0,
+                        stdout='{"number": 520, "state": "OPEN", "title": "Bug with pdd"}\n',
+                        stderr=''
+                    )
+                    return mock_result
+                # For git commands, return success
+                return subprocess.CompletedProcess(
+                    args=cmd, returncode=0, stdout='', stderr=''
+                )
+
+            mock_subprocess.side_effect = subprocess_side_effect
+
+            success, message, cost, model, files = run_agentic_bug_orchestrator(
+                issue_url="https://github.com/promptdriven/pdd/issues/530",
+                issue_content="PDD bug agents still closes duplicated issues that are not resolved",
+                repo_owner="promptdriven",
+                repo_name="pdd",
+                issue_number=530,
+                issue_author="jiaminc-cmu",
+                issue_title="pdd bug agents still closes duplicated issues that are not resolved",
+                cwd=mock_git_repo,
+                verbose=False,
+                quiet=True,
+                use_github_state=False,
+            )
+
+        # The workflow should continue past Step 1 despite LLM outputting "Duplicate of #"
+        # because the orchestrator validates that #520 is OPEN (unresolved)
+        assert success is True, (
+            f"BUG DETECTED (Issue #533): Workflow should continue when LLM outputs "
+            f"duplicate of UNRESOLVED issue. The orchestrator should validate that "
+            f"#520 is OPEN and not trigger hard stop. Instead got: success={success}, "
+            f"msg={message}"
+        )
+        assert "step1" in steps_executed, "Step 1 should have executed"
+        assert len(steps_executed) == 11, (
+            f"All 11 steps should execute when the original duplicate is unresolved. "
+            f"The orchestrator should validate issue state before stopping. "
+            f"Got {len(steps_executed)} steps: {steps_executed}"
+        )
+
+    def test_llm_outputs_duplicate_of_resolved_issue_workflow_stops(self, mock_git_repo):
+        """
+        E2E Regression Test: When the LLM correctly identifies a CLOSED (resolved)
+        duplicate and outputs "Duplicate of #520", the workflow should hard-stop
+        at Step 1.
+
+        This ensures the fix for #533 doesn't break the valid duplicate detection path.
+
+        EXPECTED BEHAVIOR:
+        - LLM outputs "Duplicate of #520"
+        - Orchestrator validates with `gh issue view 520`
+        - Issue #520 is CLOSED (resolved)
+        - Orchestrator triggers hard stop
+        - Workflow stops at Step 1 (correct behavior)
+        """
+        mock_worktree = mock_git_repo / ".pdd" / "worktrees" / "fix-issue-533"
+        mock_worktree.mkdir(parents=True, exist_ok=True)
+
+        steps_executed = []
+
+        def mock_run_agentic_task(instruction, cwd, verbose, quiet, timeout, label, max_retries):
+            """Mock LLM that correctly identifies a resolved duplicate."""
+            match = re.search(r"step(\d+(?:_\d+)?)", label)
+            if match:
+                steps_executed.append(label)
+
+            if label == "step1":  # exact match: a substring test would also fire for e.g. "step10"
+                # LLM correctly identifies a resolved duplicate
+                return (
+                    True,
+                    "## Step 1: Duplicate Check\n\n"
+                    "**Status:** Duplicate of #520\n\n"
+                    "### Search Performed\n"
+                    "- Searched for: pdd bug agents closes duplicated issues\n"
+                    "- Issues reviewed: 5\n\n"
+                    "### Findings\n"
+                    "Issue #520 was resolved in PR #525. This is a duplicate.\n\n"
+                    "---",
+                    0.01,
+                    "mock-model",
+                )
+
+            return (True, f"Mock output for {label}", 0.01, "mock-model")
+
+        with patch("pdd.agentic_bug_orchestrator.run_agentic_task", side_effect=mock_run_agentic_task), \
+             patch("pdd.agentic_bug_orchestrator.console"), \
+             patch("pdd.agentic_bug_orchestrator._setup_worktree", return_value=(mock_worktree, None)), \
+             patch("subprocess.run") as mock_subprocess:
+
+            # Mock gh issue view to return that #520 is CLOSED (resolved)
+            def subprocess_side_effect(*args, **kwargs):
+                cmd = args[0] if args else kwargs.get('args', [])
+                if isinstance(cmd, list) and 'gh' in cmd and 'issue' in cmd and 'view' in cmd:
+                    # Return mock JSON showing issue #520 is CLOSED
+                    mock_result = subprocess.CompletedProcess(
+                        args=cmd,
+                        returncode=0,
+                        stdout='{"number": 520, "state": "CLOSED", "title": "Bug with pdd"}\n',
+                        stderr=''
+                    )
+                    return mock_result
+                # For git commands, return success
+                return subprocess.CompletedProcess(
+                    args=cmd, returncode=0, stdout='', stderr=''
+                )
+
+            mock_subprocess.side_effect = subprocess_side_effect
+
+            success, message, cost, model, files = run_agentic_bug_orchestrator(
+                issue_url="https://github.com/promptdriven/pdd/issues/530",
+                issue_content="PDD bug agents still closes duplicated issues that are not resolved",
+                repo_owner="promptdriven",
+                repo_name="pdd",
+                issue_number=530,
+                issue_author="jiaminc-cmu",
+                issue_title="pdd bug agents still closes duplicated issues that are not resolved",
+                cwd=mock_git_repo,
+                verbose=False,
+                quiet=True,
+                use_github_state=False,
+            )
+
+        # Workflow should stop at Step 1 — this is CORRECT behavior for resolved duplicates
+        assert success is False, (
+            "Workflow should stop for resolved duplicates. This is the correct behavior."
+        )
+        assert "Stopped at Step 1" in message, (
+            f"Message should indicate Step 1 hard stop. Got: {message}"
+        )
+        assert "duplicate" in message.lower(), (
+            f"Message should mention duplicate. Got: {message}"
+        )
+        assert len(steps_executed) == 1, (
+            f"Only Step 1 should execute for a resolved duplicate. "
+            f"Got: {steps_executed}"
+        )
+
+    def test_exact_scenario_issue_533_with_real_issue_numbers(self, mock_git_repo):
+        """
+        E2E Test: Exact scenario from issue #533 using real issue numbers.
+
+        Real-world events:
+        - User filed issue #530 about a bug
+        - LLM Step 1 output: "Duplicate of #520"
+        - Issue #520 was OPEN (unresolved)
+        - Orchestrator stopped workflow and closed #530
+        - User had to manually reopen #530
+
+        This test verifies the orchestrator should validate that #520 is OPEN
+        and continue the workflow instead of blindly trusting the LLM.
+        """
+        mock_worktree = mock_git_repo / ".pdd" / "worktrees" / "fix-issue-530"
+        mock_worktree.mkdir(parents=True, exist_ok=True)
+
+        steps_executed = []
+
+        def mock_run_agentic_task(instruction, cwd, verbose, quiet, timeout, label, max_retries):
+            """Mock LLM that reproduces the exact output from issue #533."""
+            match = re.search(r"step(\d+(?:_\d+)?)", label)
+            if match:
+                steps_executed.append(label)
+
+            if label == "step1":  # exact match: a substring test would also fire for e.g. "step10"
+                # Reproduce the exact LLM output that caused issue #533
+                return (
+                    True,
+                    "## Step 1: Duplicate Check\n\n"
+                    "**Status:** Duplicate of #520\n\n"
+                    "This issue appears to be a duplicate of #520.\n",
+                    0.01,
+                    "mock-model",
+                )
+
+            if label == "step7":
+                return (True, "Generated unit test\nFILES_CREATED: test_fix.py", 0.01, "mock-model")
+
+            return (True, f"Mock output for {label}", 0.01, "mock-model")
+
+        with patch("pdd.agentic_bug_orchestrator.run_agentic_task", side_effect=mock_run_agentic_task), \
+             patch("pdd.agentic_bug_orchestrator.console"), \
+             patch("pdd.agentic_bug_orchestrator._setup_worktree", return_value=(mock_worktree, None)), \
+             patch("subprocess.run") as mock_subprocess:
+
+            # Mock gh issue view to return that #520 is OPEN (matching real scenario)
+            def subprocess_side_effect(*args, **kwargs):
+                cmd = args[0] if args else kwargs.get('args', [])
+                if isinstance(cmd, list) and 'gh' in cmd and 'issue' in cmd and 'view' in cmd and '520' in str(cmd):
+                    # Return mock JSON showing issue #520 is OPEN (unresolved)
+                    mock_result = subprocess.CompletedProcess(
+                        args=cmd,
+                        returncode=0,
+                        stdout='{"number": 520, "state": "OPEN", "title": "pdd fails to use the latest version of claude 3.7 sonnet"}\n',
+                        stderr=''
+                    )
+                    return mock_result
+                # For git commands, return success
+                return subprocess.CompletedProcess(
+                    args=cmd, returncode=0, stdout='', stderr=''
+                )
+
+            mock_subprocess.side_effect = subprocess_side_effect
+
+            success, message, cost, model, files = run_agentic_bug_orchestrator(
+                issue_url="https://github.com/promptdriven/pdd/issues/530",
+                issue_content="For example, this one:https://github.com/promptdriven/pdd/issues/520; I had to manually reopen it",
+                repo_owner="promptdriven",
+                repo_name="pdd",
+                issue_number=530,
+                issue_author="jiaminc-cmu",
+                issue_title="pdd bug agents still closes duplicated issues that are not resolved",
+                cwd=mock_git_repo,
+                verbose=False,
+                quiet=True,
+                use_github_state=False,
+            )
+
+        # This is the exact scenario from issue #533 - workflow should continue
+        assert success is True, (
+            f"BUG DETECTED (Issue #533 - Exact Scenario): This reproduces the exact "
+            f"bug reported in issue #533 where issue #530 was incorrectly closed as "
+            f"a duplicate of the still-OPEN issue #520. The orchestrator should "
+            f"validate that #520 is OPEN and continue the workflow. "
+            f"Instead got: success={success}, msg={message}"
+        )
+        assert len(steps_executed) == 11, (
+            f"All 11 steps should execute. Issue #530 should NOT be closed as a "
+            f"duplicate when #520 is still OPEN. Got {len(steps_executed)} steps: {steps_executed}"
+        )
diff --git a/tests/test_issue_533_duplicate_validation.py b/tests/test_issue_533_duplicate_validation.py
new file mode 100644
index 000000000..eeddfe0a5
--- /dev/null
+++ b/tests/test_issue_533_duplicate_validation.py
@@ -0,0 +1,479 @@
+"""
+Tests for Issue #533: Orchestrator should validate LLM duplicate detection output.
+
+Bug: The orchestrator at line 441 of agentic_bug_orchestrator.py blindly trusts
+the LLM's duplicate detection output without validating that the original issue
+is actually resolved. When the LLM fails to follow prompt instructions and outputs
+"Duplicate of #520" even though #520 is OPEN, the orchestrator incorrectly triggers
+a hard stop and closes the issue.
+
+Root cause:
+The hard stop at line 441 checks for the string "Duplicate of #" but doesn't
+validate the original issue's state:
+    if step_num == 1 and "Duplicate of #" in output:
+        msg = f"Stopped at Step 1: Issue is a duplicate. {output.strip()}"
+        return False, msg, total_cost, last_model_used, changed_files
+
+Fix:
+The orchestrator should validate the original issue's state using `gh issue view`
+before triggering the hard stop. If the original issue is still OPEN (unresolved),
+the orchestrator should log a warning and continue the workflow instead of stopping.
+
+These tests verify:
+1. When LLM outputs duplicate of UNRESOLVED issue → orchestrator validates & continues
+2. When LLM outputs duplicate of RESOLVED issue → hard stop works correctly
+3. Edge cases: invalid issue numbers, network errors, various output formats
+4. Fail-safe behavior: errors default to letting workflow continue
+
+This is a regression of issue #469, which fixed the prompts but didn't add
+orchestrator-level validation as defense-in-depth.
+"""
+
+import pytest
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+import subprocess
+
+from pdd.agentic_bug_orchestrator import run_agentic_bug_orchestrator
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def bug_mock_dependencies(tmp_path):
+    """Mocks for the bug orchestrator."""
+    mock_worktree_path = tmp_path / ".pdd" / "worktrees" / "fix-issue-533"
+    mock_worktree_path.mkdir(parents=True, exist_ok=True)
+
+    with patch("pdd.agentic_bug_orchestrator.run_agentic_task") as mock_run, \
+         patch("pdd.agentic_bug_orchestrator.load_prompt_template") as mock_load, \
+         patch("pdd.agentic_bug_orchestrator.console") as mock_console, \
+         patch("pdd.agentic_bug_orchestrator._setup_worktree") as mock_worktree:
+
+        mock_run.return_value = (True, "Step output", 0.1, "gpt-4")
+        mock_load.return_value = "Prompt for {issue_number}"
+        mock_worktree.return_value = (mock_worktree_path, None)
+
+        yield mock_run, mock_load, mock_console, mock_worktree
+
+
+@pytest.fixture
+def bug_default_args(tmp_path):
+    """Default arguments for the bug orchestrator."""
+    return {
+        "issue_url": "http://github.com/owner/repo/issues/533",
+        "issue_content": "Bug description",
+        "repo_owner": "owner",
+        "repo_name": "repo",
+        "issue_number": 533,
+        "issue_author": "user",
+        "issue_title": "Bug Title",
+        "cwd": tmp_path,
+        "verbose": False,
+        "quiet": True,
+        "use_github_state": False,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Test 1: LLM outputs duplicate of UNRESOLVED issue → workflow should continue
+# ---------------------------------------------------------------------------
+
+def test_bug_orchestrator_llm_outputs_duplicate_of_unresolved_issue(
+    bug_mock_dependencies, bug_default_args
+):
+    """
+    PRIMARY BUG SCENARIO: When the LLM fails to follow prompt instructions and
+    outputs "Duplicate of #520" even though #520 is OPEN (unresolved), the
+    orchestrator should validate the original issue's state and continue the
+    workflow instead of incorrectly stopping.
+
+    This test simulates the exact scenario from issue #533/#530/#520:
+    - Issue #530 reports a bug
+    - LLM outputs "Duplicate of #520" (failing to check resolution status)
+    - Issue #520 is still OPEN (unresolved)
+    - Expected: Orchestrator validates, logs warning, continues to Step 2
+    - Buggy behavior: Hard stop at Step 1, issue closed as duplicate
+
+    This test will FAIL on the current buggy code (before fix) because line 441
+    doesn't validate the original issue's state.
+    """
+    mock_run, _, _, _ = bug_mock_dependencies
+
+    # Simulate LLM failing to follow instructions: outputs duplicate without
+    # checking that the original issue is still OPEN
+    llm_output_duplicate_unresolved = (
+        "## Step 1: Duplicate Check\n\n"
+        "**Status:** Duplicate of #520\n\n"
+        "### Findings\n"
+        "This issue has the same symptoms as #520."
+    )
+
+    def side_effect(*args, **kwargs):
+        label = kwargs.get("label", "")
+        if label == "step1":
+            return (True, llm_output_duplicate_unresolved, 0.1, "gpt-4")
+        if label == "step7":
+            return (True, "Generated test\nFILES_CREATED: test_fix.py", 0.1, "gpt-4")
+        return (True, f"Output for {label}", 0.1, "gpt-4")
+
+    mock_run.side_effect = side_effect
+
+    # Mock gh issue view to return OPEN state for issue #520
+    with patch("subprocess.run") as mock_subprocess:
+        # gh issue view #520 should return OPEN state
+        mock_result = MagicMock()
+        mock_result.returncode = 0
+        mock_result.stdout = "state: OPEN\ntitle: Original Issue\n"
+        mock_subprocess.return_value = mock_result
+
+        success, msg, cost, model, files = run_agentic_bug_orchestrator(
+            **bug_default_args
+        )
+
+    # CRITICAL ASSERTION: Workflow should continue (not stop at Step 1)
+    # The buggy code will FAIL this assertion because it triggers hard stop
+    # without validating the original issue's state
+    assert success is True, (
+        f"Workflow should continue when LLM outputs duplicate of UNRESOLVED issue. "
+        f"The orchestrator should validate that #520 is OPEN and not trigger hard stop. "
+        f"Instead got: success={success}, msg={msg}"
+    )
+
+    # Verify workflow completed all steps (not stopped at Step 1)
+    assert mock_run.call_count == 11, (
+        f"All 11 steps should execute when original issue is unresolved. "
+        f"Got {mock_run.call_count} steps instead."
+    )
+
+    assert "Investigation complete" in msg, (
+        f"Expected completion message, got: {msg}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test 2: LLM outputs duplicate of RESOLVED issue → hard stop (regression test)
+# ---------------------------------------------------------------------------
+
+def test_bug_orchestrator_llm_outputs_duplicate_of_resolved_issue(
+    bug_mock_dependencies, bug_default_args
+):
+    """
+    REGRESSION TEST: When the LLM outputs "Duplicate of #520" and #520 is
+    actually CLOSED (resolved), the orchestrator should trigger the hard stop.
+    This is the correct behavior and should not be broken by the fix.
+
+    This ensures we don't break valid duplicate detection when fixing the bug.
+    """
+    mock_run, _, _, _ = bug_mock_dependencies
+
+    # LLM correctly outputs duplicate of a resolved issue
+    llm_output_duplicate_resolved = (
+        "## Step 1: Duplicate Check\n\n"
+        "**Status:** Duplicate of #520\n\n"
+        "### Findings\n"
+        "Issue #520 was resolved in PR #525. This is a duplicate."
+    )
+    mock_run.return_value = (True, llm_output_duplicate_resolved, 0.05, "claude")
+
+    # Mock gh issue view to return CLOSED state for issue #520
+    with patch("subprocess.run") as mock_subprocess:
+        mock_result = MagicMock()
+        mock_result.returncode = 0
+        mock_result.stdout = "state: CLOSED\ntitle: Original Issue\n"
+        mock_subprocess.return_value = mock_result
+
+        success, msg, cost, _, _ = run_agentic_bug_orchestrator(**bug_default_args)
+
+    # Hard stop should be triggered (correct duplicate detection)
+    assert success is False, (
+        f"Workflow should stop when original issue is CLOSED. Got success={success}"
+    )
+    assert "Stopped at Step 1" in msg
+    assert "duplicate" in msg.lower()
+    assert mock_run.call_count == 1
+    assert cost == 0.05
+
+
+# ---------------------------------------------------------------------------
+# Test 3: Various duplicate output formats
+# ---------------------------------------------------------------------------
+
+class TestDuplicateOutputFormats:
+    """
+    Test that the orchestrator correctly extracts issue numbers from various
+    LLM output formats and validates them.
+    """
+
+    @pytest.mark.parametrize(
+        "llm_output,expected_issue_num",
+        [
+            # Standard format
+            ("Duplicate of #520", "520"),
+            # With context
+            ("Duplicate of #520 (resolved in PR #525)", "520"),
+            # Multiple issue mentions (should extract first after "Duplicate of")
+            ("Related to #100, but Duplicate of #520", "520"),
+            # With markdown
+            ("**Status:** Duplicate of #520\n\nClosing.", "520"),
+        ],
+        ids=[
+            "standard_format",
+            "with_pr_reference",
+            "multiple_issues",
+            "with_markdown",
+        ],
+    )
+    def test_duplicate_extraction_formats(
+        self, bug_mock_dependencies, bug_default_args, llm_output, expected_issue_num
+    ):
+        """
+        Verify the orchestrator correctly extracts issue numbers from various
+        output formats and validates them with gh issue view.
+        """
+        mock_run, _, _, _ = bug_mock_dependencies
+        mock_run.return_value = (True, llm_output, 0.05, "claude")
+
+        # Mock gh issue view to return OPEN state
+        with patch("subprocess.run") as mock_subprocess:
+            mock_result = MagicMock()
+            mock_result.returncode = 0
+            mock_result.stdout = "state: OPEN\ntitle: Issue\n"
+            mock_subprocess.return_value = mock_result
+
+            success, msg, _, _, _ = run_agentic_bug_orchestrator(**bug_default_args)
+
+        # Verify gh issue view was called with correct issue number
+        assert mock_subprocess.called, (
+            "orchestrator should call gh issue view to validate"
+        )
+        # Find the gh call(s); the command may be passed positionally or as args=...
+        gh_calls = [
+            c for c in mock_subprocess.call_args_list
+            if list((c[0][0] if c[0] else c[1].get("args", []))[:2]) == ["gh", "issue"]
+        ]
+        assert len(gh_calls) > 0, "Should have called gh issue view"
+        # Extract issue number from the call
+        assert expected_issue_num in str(gh_calls[0]), (
+            f"Should check issue #{expected_issue_num}, calls: {gh_calls}"
+        )
+
+        # Since original is OPEN, workflow should continue
+        assert success is True, f"Should continue when original is OPEN, got: {msg}"
+
+
+# ---------------------------------------------------------------------------
+# Test 4: Invalid issue number handling
+# ---------------------------------------------------------------------------
+
+def test_bug_orchestrator_invalid_issue_number_failsafe(
+    bug_mock_dependencies, bug_default_args
+):
+    """
+    FAIL-SAFE TEST: When the LLM outputs a duplicate of a non-existent issue
+    (e.g., #99999), gh issue view will fail. The orchestrator should treat
+    this as "unresolved" (fail-safe) and let the workflow continue rather
+    than crashing or incorrectly stopping.
+    """
+    mock_run, _, _, _ = bug_mock_dependencies
+
+    llm_output_invalid = "Duplicate of #99999"
+
+    def side_effect(*args, **kwargs):
+        label = kwargs.get("label", "")
+        if label == "step1":
+            return (True, llm_output_invalid, 0.1, "gpt-4")
+        if label == "step7":
+            return (True, "Generated test\nFILES_CREATED: test_fix.py", 0.1, "gpt-4")
+        return (True, f"Output for {label}", 0.1, "gpt-4")
+
+    mock_run.side_effect = side_effect
+
+    # Mock gh issue view to return error (issue not found)
+    with patch("subprocess.run") as mock_subprocess:
+        mock_result = MagicMock()
+        mock_result.returncode = 1  # Error
+        mock_result.stderr = "issue not found"
+        mock_subprocess.return_value = mock_result
+
+        success, msg, _, _, _ = run_agentic_bug_orchestrator(**bug_default_args)
+
+    # Fail-safe: should continue workflow (treat as unresolved)
+    assert success is True, (
+        f"When gh issue view fails, orchestrator should fail-safe and continue. "
+        f"Got: success={success}, msg={msg}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test 5: Network/CLI error handling
+# ---------------------------------------------------------------------------
+
+def test_bug_orchestrator_gh_cli_error_failsafe(
+    bug_mock_dependencies, bug_default_args
+):
+    """
+    FAIL-SAFE TEST: When gh CLI fails (network error, timeout, rate limit),
+    the orchestrator should log the error and treat the issue as unresolved
+    (fail-safe), allowing the workflow to continue.
+    """
+    mock_run, _, _, _ = bug_mock_dependencies
+
+    llm_output = "Duplicate of #520"
+
+    def side_effect(*args, **kwargs):
+        label = kwargs.get("label", "")
+        if label == "step1":
+            return (True, llm_output, 0.1, "gpt-4")
+        if label == "step7":
+            return (True, "Generated test\nFILES_CREATED: test_fix.py", 0.1, "gpt-4")
+        return (True, f"Output for {label}", 0.1, "gpt-4")
+
+    mock_run.side_effect = side_effect
+
+    # Mock subprocess.run to handle both git commands and gh commands
+    with patch("subprocess.run") as mock_subprocess:
+        def subprocess_side_effect(cmd, *args, **kwargs):
+            # Let git commands work normally (needed for _get_git_root)
+            if cmd[0] == "git":
+                mock_result = MagicMock()
+                mock_result.returncode = 0
+                mock_result.stdout = str(bug_default_args["cwd"])
+                return mock_result
+            # Make gh commands raise timeout
+            elif cmd[0] == "gh":
+                raise subprocess.TimeoutExpired("gh", 5)
+            # Default
+            mock_result = MagicMock()
+            mock_result.returncode = 0
+            return mock_result
+
+        mock_subprocess.side_effect = subprocess_side_effect
+
+        success, msg, _, _, _ = run_agentic_bug_orchestrator(**bug_default_args)
+
+    # Fail-safe: should continue workflow
+    assert success is True, (
+        f"When gh CLI fails/times out, orchestrator should fail-safe and continue. "
+        f"Got: success={success}, msg={msg}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test 6: Regression test - exact scenario from issue #533
+# ---------------------------------------------------------------------------
+
+def test_bug_orchestrator_issue_533_exact_scenario(
+    bug_mock_dependencies, bug_default_args
+):
+    """
+    REGRESSION TEST FOR ISSUE #533: Simulate the exact scenario that triggered
+    the bug report:
+    - User filed issue #530
+    - LLM output "Duplicate of #520" without checking resolution
+    - Issue #520 was still OPEN (created 2026-02-14, never resolved)
+    - Bug: Orchestrator closed #530 as duplicate
+    - Fix: Orchestrator should validate #520 is OPEN and continue workflow
+
+    This test uses the actual issue numbers from the incident.
+    """
+    mock_run, _, _, _ = bug_mock_dependencies
+
+    # Update args to simulate issue #530
+    bug_default_args["issue_number"] = 530
+    bug_default_args["issue_url"] = "http://github.com/owner/repo/issues/530"
+
+    # LLM output that triggered the bug (claims #520 is duplicate)
+    llm_output_530 = (
+        "## Step 1: Duplicate Check\n\n"
+        "Duplicate of #520\n\n"
+        "This issue appears to be the same as #520."
+    )
+
+    def side_effect(*args, **kwargs):
+        label = kwargs.get("label", "")
+        if label == "step1":
+            return (True, llm_output_530, 0.1, "gpt-4")
+        if label == "step7":
+            return (True, "Generated test\nFILES_CREATED: test_fix.py", 0.1, "gpt-4")
+        return (True, f"Output for {label}", 0.1, "gpt-4")
+
+    mock_run.side_effect = side_effect
+
+    # Mock gh issue view for #520 - it's still OPEN (unresolved)
+    with patch("subprocess.run") as mock_subprocess:
+        mock_result = MagicMock()
+        mock_result.returncode = 0
+        mock_result.stdout = "state: OPEN\ntitle: Original bug\ncreatedAt: 2026-02-14\n"
+        mock_subprocess.return_value = mock_result
+
+        success, msg, _, _, _ = run_agentic_bug_orchestrator(**bug_default_args)
+
+    # The fix should prevent the buggy hard stop
+    assert success is True, (
+        f"Issue #530 should NOT have been closed as duplicate of unresolved #520. "
+        f"This is the exact bug from issue #533. Got: success={success}, msg={msg}"
+    )
+    assert "Investigation complete" in msg
+
+
+# ---------------------------------------------------------------------------
+# Test 7: No false positives - conversational mentions shouldn't validate
+# ---------------------------------------------------------------------------
+
+def test_bug_orchestrator_no_validation_without_duplicate_marker(
+    bug_mock_dependencies, bug_default_args
+):
+    """
+    Verify that the orchestrator only validates when the LLM output contains
+    the exact "Duplicate of #" marker. Conversational mentions of related
+    issues should not trigger validation.
+
+    This ensures the fix doesn't add unnecessary overhead for outputs that
+    mention related issues without claiming they are duplicates.
+    """
+    mock_run, _, _, _ = bug_mock_dependencies
+
+    # Output mentions related issue but doesn't claim duplicate
+    llm_output_no_duplicate = (
+        "## Step 1: Duplicate Check\n\n"
+        "**Status:** No duplicates found\n\n"
+        "### Findings\n"
+        "Found related issue #520, but it has different symptoms. "
+        "Proceeding with investigation."
+    )
+
+    def side_effect(*args, **kwargs):
+        label = kwargs.get("label", "")
+        if label == "step1":
+            return (True, llm_output_no_duplicate, 0.1, "gpt-4")
+        if label == "step7":
+            return (True, "Generated test\nFILES_CREATED: test_fix.py", 0.1, "gpt-4")
+        return (True, f"Output for {label}", 0.1, "gpt-4")
+
+    mock_run.side_effect = side_effect
+
+    # Mock subprocess to track if gh issue view is called
+    with patch("subprocess.run") as mock_subprocess:
+        mock_result = MagicMock()
+        mock_result.returncode = 0
+        mock_result.stdout = "state: OPEN\n"
+        mock_subprocess.return_value = mock_result
+
+        success, msg, _, _, _ = run_agentic_bug_orchestrator(**bug_default_args)
+
+    # Workflow should continue
+    assert success is True, f"Workflow should continue, got: {msg}"
+
+    # gh issue view should NOT have been called (no duplicate marker)
+    # Note: This checks that we don't add unnecessary validation overhead
+    # when the LLM correctly follows instructions and doesn't claim a duplicate
+    gh_calls = [
+        c for c in mock_subprocess.call_args_list
+        if list((c[0][0] if c[0] else c[1].get("args", []))[:2]) == ["gh", "issue"]
+    ]
+    assert len(gh_calls) == 0, (
+        "Should not call gh issue view when no 'Duplicate of #' marker present"
+    )