From 69421a12830d53f096540e93c13b6bd066142065 Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Mon, 1 Jun 2026 11:06:32 -0700 Subject: [PATCH] Fix packet low-confidence residuals --- CHANGELOG.md | 4 +- .../support_refund_agent/expected/packet.html | 2 +- .../support_refund_agent/expected/packet.json | 4 +- .../support_refund_agent/expected/packet.md | 2 +- src/agents_shipgate/packet/builder.py | 2 +- tests/test_evidence_packet.py | 156 +++++++++++++++++- 6 files changed, 163 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e21bebf4..ccda50db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## Unreleased +## 0.11.0 - 2026-05-31 + - **Verifier adoption-loop release prep.** Public docs and discovery metadata now lead with the verify-first adoption path, pinned `v0.11.0` snippets, verifier artifacts, merge verdicts, `fix_task`, and explicit Action merge-policy @@ -10,8 +12,6 @@ `agents-shipgate feedback export` command plus `docs/feedback-schema.v0.1.json` for redacted design-partner feedback loops. -## 0.11.0 - 2026-05-31 - - **Verifier PR comment v2 + additive Action outputs.** The GitHub Action now defaults to the verifier workflow (`verify_mode: verify`) and the capability-review PR comment (`pr_comment_style: capability-review`) for the diff --git a/samples/support_refund_agent/expected/packet.html b/samples/support_refund_agent/expected/packet.html index 098e9f04..9d95f946 100644 --- a/samples/support_refund_agent/expected/packet.html +++ b/samples/support_refund_agent/expected/packet.html @@ -26,4 +26,4 @@ .status-missing { color: #7f1d1d; } .status-informational { color: #555; } .meta { color: #555; font-size: 0.92rem; } -

Release Evidence Packet

Project: support-refund-agent · Agent: refund-assistant · Environment: production_like
Run id: agents_shipgate_ebb71d7248235cc3 · Generated at: 2026-01-01T00:00:00+00:00 · Packet schema: 0.6

This packet is a reviewer-shaped synthesis of a static Agents Shipgate scan. See §10 for what the packet does not prove.

§1 Release decision — BLOCKED

CI gate behavior (informational)

Blockers

Review items

§1A Evidence matrix — compact review summary

DomainEvidence presentEvidence sourceConfidenceMissing controlsBlocking findingsReview items
Inventorypartialtool_inventory; tool_surface; +2 morehighSHIP-INVENTORY-WILDCARD-TOOLS on wildcard_mcp_tools.*: Wildcard tool exposure declaredSHIP-INVENTORY-WILDCARD-TOOLS (high)
Schemapartialtool_surface_facts.tools[].hashes; findings[]mixedSHIP-SCHEMA-MISSING-BOUNDS on stripe.create_refund: stripe.create_refund.amount has no maximum bound; SHIP-SCHEMA-BROAD-FREE-TEXT on zendesk.update_ticket: zendesk.update_ticket accepts broad free-form action input; +2 moreSHIP-SCHEMA-MISSING-BOUNDS (high); SHIP-SCHEMA-BROAD-FREE-TEXT (high); +2 more
Authpartialtool_surface_facts.scopes; tool_inventory[].auth_scopes; +1 moremixedSHIP-AUTH-MANIFEST-BROAD-SCOPE: Manifest declares broad permission scopes; SHIP-AUTH-SCOPE-COVERAGE-MISSING on shopify.cancel_order: shopify.cancel_order requires scopes not declared in the manifest; +3 moreSHIP-AUTH-MANIFEST-BROAD-SCOPE (high); SHIP-AUTH-SCOPE-COVERAGE-MISSING (high); +3 more
Approvalpartialtool_surface_facts.controls[kind=approval_policy]; findings[]highSHIP-POLICY-APPROVAL-MISSING on stripe.create_refund: stripe.create_refund lacks a declared approval policySHIP-POLICY-APPROVAL-MISSING (critical)
Confirmationpartialtool_surface_facts.controls[kind=confirmation_policy]; findings[]highSHIP-POLICY-CONFIRMATION-MISSING on stripe.create_refund: stripe.create_refund lacks a declared confirmation policy; SHIP-POLICY-CONFIRMATION-MISSING on gmail.send_customer_email: gmail.send_customer_email lacks a declared confirmation policySHIP-POLICY-CONFIRMATION-MISSING (high); SHIP-POLICY-CONFIRMATION-MISSING (high)
Idempotencypartialtool_surface_facts.controls[kind=idempotency_evidence]; action_surface_facts.actions[].safeguards.idempotency; +1 moremixedSHIP-SIDEFX-IDEMPOTENCY-MISSING on stripe.create_refund: stripe.create_refund lacks idempotency evidence; SHIP-SIDEFX-IDEMPOTENCY-MISSING on gmail.send_customer_email: gmail.send_customer_email lacks idempotency evidenceSHIP-SIDEFX-IDEMPOTENCY-MISSING (critical)SHIP-SIDEFX-IDEMPOTENCY-MISSING (high)
Side effectspartialtool_inventory[].risk_tags; action_surface_facts.actions[].effect; +1 moremixedSHIP-SCHEMA-BROAD-FREE-TEXT on zendesk.update_ticket: zendesk.update_ticket accepts broad free-form action input; SHIP-SCHEMA-BROAD-FREE-TEXT on gmail.send_customer_email: gmail.send_customer_email accepts broad free-form action input; +5 moreSHIP-POLICY-APPROVAL-MISSING (critical); SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical)SHIP-SCHEMA-BROAD-FREE-TEXT (high); SHIP-SCHEMA-BROAD-FREE-TEXT (high); +3 more
Memory isolationnot_declaredunknown
Human-in-the-loop evidencenot_declaredunknown
Prompt/scope alignmentpartialdeclared_intentions; misalignments; +2 moremediumSHIP-SCOPE-PROHIBITED-TOOL-PRESENT on stripe.create_refund: stripe.create_refund appears to overlap with a prohibited action; SHIP-SCOPE-PROHIBITED-TOOL-PRESENT on gmail.send_customer_email: gmail.send_customer_email appears to overlap with a prohibited actionSHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high); SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high)
Retry/timeoutnot_declaredunknown
Baseline debtinformationalunknown
Action-surface policycoveredaction_surface_facts.actionsmedium

§2 Capability ↔ Intent diff — missing

Declared

Observed tools

Divergences

§3 High-risk tool surface — partial

Total tools: 8 · High-risk: 3

ToolSourceRisk tagsApprovalIdempotency
gmail.send_customer_emailmcpcustomer_communication, external_writenono
shopify.cancel_orderopenapidestructive, writeyesyes
stripe.create_refundopenapiexternal_write, financial_action, writenono

§3A Tool-surface diff — not declared

Status: disabled — No --diff-from report or v0.3 baseline snapshot was provided.
Base: none

§3B Action-surface diff — not declared

Status: disabled — No action-surface comparison source was provided.
Base: none

§4 Approval policy coverage — partial

ToolDeclaredSourceGap finding(s)
shopify.cancel_orderyespolicies
stripe.create_refundnofp_f092940f62fbb012

Gap findings

§5 Idempotency / retry risk — partial

Retry policy: not declared

ToolDeclaredSourceGap finding(s)
gmail.send_customer_emailnofp_0f8aaa912d589cf0
shopify.cancel_orderyespolicies
stripe.create_refundnofp_dac8011e14c53777

Gap findings

§6 Scope coverage — missing

Declared scopes

ScopeDeclaredUsed by tools
gmail:sendnogmail.send_customer_email
shopify:orders:writenoshopify.cancel_order
stripe:*yes
stripe:refunds:writeyesstripe.create_refund
support:kb:readnosupport.search_kb
zendesk:tickets:readyes
zendesk:tickets:writeyeszendesk.update_ticket

Unused declared scopes

Used by tools but not declared

Gap findings

§7 Memory isolation — not declared

Manifest does not declare a memory isolation policy. The current manifest schema (v0.1) has no agent.memory field. See §10 for the residual review item.

§8 Human-in-the-loop evidence — covered

Approval-required tools

Confirmation-required tools

§9 Required dynamic scenarios — partial

§10 What this packet did NOT prove

Agents Shipgate is an advisory tool: the deterministic merge gate for AI-generated agent capability changes, run as a local-first, static Tool-Use Readiness review. The packet below is derived from a scan; it does not, by itself, prove the following properties:

Per-run residuals

+

Release Evidence Packet

Project: support-refund-agent · Agent: refund-assistant · Environment: production_like
Run id: agents_shipgate_ebb71d7248235cc3 · Generated at: 2026-01-01T00:00:00+00:00 · Packet schema: 0.6

This packet is a reviewer-shaped synthesis of a static Agents Shipgate scan. See §10 for what the packet does not prove.

§1 Release decision — BLOCKED

CI gate behavior (informational)

Blockers

Review items

§1A Evidence matrix — compact review summary

DomainEvidence presentEvidence sourceConfidenceMissing controlsBlocking findingsReview items
Inventorypartialtool_inventory; tool_surface; +2 morehighSHIP-INVENTORY-WILDCARD-TOOLS on wildcard_mcp_tools.*: Wildcard tool exposure declaredSHIP-INVENTORY-WILDCARD-TOOLS (high)
Schemapartialtool_surface_facts.tools[].hashes; findings[]mixedSHIP-SCHEMA-MISSING-BOUNDS on stripe.create_refund: stripe.create_refund.amount has no maximum bound; SHIP-SCHEMA-BROAD-FREE-TEXT on zendesk.update_ticket: zendesk.update_ticket accepts broad free-form action input; +2 moreSHIP-SCHEMA-MISSING-BOUNDS (high); SHIP-SCHEMA-BROAD-FREE-TEXT (high); +2 more
Authpartialtool_surface_facts.scopes; tool_inventory[].auth_scopes; +1 moremixedSHIP-AUTH-MANIFEST-BROAD-SCOPE: Manifest declares broad permission scopes; SHIP-AUTH-SCOPE-COVERAGE-MISSING on shopify.cancel_order: shopify.cancel_order requires scopes not declared in the manifest; +3 moreSHIP-AUTH-MANIFEST-BROAD-SCOPE (high); SHIP-AUTH-SCOPE-COVERAGE-MISSING (high); +3 more
Approvalpartialtool_surface_facts.controls[kind=approval_policy]; findings[]highSHIP-POLICY-APPROVAL-MISSING on stripe.create_refund: stripe.create_refund lacks a declared approval policySHIP-POLICY-APPROVAL-MISSING (critical)
Confirmationpartialtool_surface_facts.controls[kind=confirmation_policy]; findings[]highSHIP-POLICY-CONFIRMATION-MISSING on stripe.create_refund: stripe.create_refund lacks a declared confirmation policy; SHIP-POLICY-CONFIRMATION-MISSING on gmail.send_customer_email: gmail.send_customer_email lacks a declared confirmation policySHIP-POLICY-CONFIRMATION-MISSING (high); SHIP-POLICY-CONFIRMATION-MISSING (high)
Idempotencypartialtool_surface_facts.controls[kind=idempotency_evidence]; action_surface_facts.actions[].safeguards.idempotency; +1 moremixedSHIP-SIDEFX-IDEMPOTENCY-MISSING on stripe.create_refund: stripe.create_refund lacks idempotency evidence; SHIP-SIDEFX-IDEMPOTENCY-MISSING on gmail.send_customer_email: gmail.send_customer_email lacks idempotency evidenceSHIP-SIDEFX-IDEMPOTENCY-MISSING (critical)SHIP-SIDEFX-IDEMPOTENCY-MISSING (high)
Side effectspartialtool_inventory[].risk_tags; action_surface_facts.actions[].effect; +1 moremixedSHIP-SCHEMA-BROAD-FREE-TEXT on zendesk.update_ticket: zendesk.update_ticket accepts broad free-form action input; SHIP-SCHEMA-BROAD-FREE-TEXT on gmail.send_customer_email: gmail.send_customer_email accepts broad free-form action input; +5 moreSHIP-POLICY-APPROVAL-MISSING (critical); SHIP-SIDEFX-IDEMPOTENCY-MISSING (critical)SHIP-SCHEMA-BROAD-FREE-TEXT (high); SHIP-SCHEMA-BROAD-FREE-TEXT (high); +3 more
Memory isolationnot_declaredunknown
Human-in-the-loop evidencenot_declaredunknown
Prompt/scope alignmentpartialdeclared_intentions; misalignments; +2 moremediumSHIP-SCOPE-PROHIBITED-TOOL-PRESENT on stripe.create_refund: stripe.create_refund appears to overlap with a prohibited action; SHIP-SCOPE-PROHIBITED-TOOL-PRESENT on gmail.send_customer_email: gmail.send_customer_email appears to overlap with a prohibited actionSHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high); SHIP-SCOPE-PROHIBITED-TOOL-PRESENT (high)
Retry/timeoutnot_declaredunknown
Baseline debtinformationalunknown
Action-surface policycoveredaction_surface_facts.actionsmedium

§2 Capability ↔ Intent diff — missing

Declared

Observed tools

Divergences

§3 High-risk tool surface — partial

Total tools: 8 · High-risk: 3

ToolSourceRisk tagsApprovalIdempotency
gmail.send_customer_emailmcpcustomer_communication, external_writenono
shopify.cancel_orderopenapidestructive, writeyesyes
stripe.create_refundopenapiexternal_write, financial_action, writenono

§3A Tool-surface diff — not declared

Status: disabled — No --diff-from report or v0.3 baseline snapshot was provided.
Base: none

§3B Action-surface diff — not declared

Status: disabled — No action-surface comparison source was provided.
Base: none

§4 Approval policy coverage — partial

ToolDeclaredSourceGap finding(s)
shopify.cancel_orderyespolicies
stripe.create_refundnofp_f092940f62fbb012

Gap findings

§5 Idempotency / retry risk — partial

Retry policy: not declared

ToolDeclaredSourceGap finding(s)
gmail.send_customer_emailnofp_0f8aaa912d589cf0
shopify.cancel_orderyespolicies
stripe.create_refundnofp_dac8011e14c53777

Gap findings

§6 Scope coverage — missing

Declared scopes

ScopeDeclaredUsed by tools
gmail:sendnogmail.send_customer_email
shopify:orders:writenoshopify.cancel_order
stripe:*yes
stripe:refunds:writeyesstripe.create_refund
support:kb:readnosupport.search_kb
zendesk:tickets:readyes
zendesk:tickets:writeyeszendesk.update_ticket

Unused declared scopes

Used by tools but not declared

Gap findings

§7 Memory isolation — not declared

Manifest does not declare a memory isolation policy. The current manifest schema (v0.1) has no agent.memory field. See §10 for the residual review item.

§8 Human-in-the-loop evidence — covered

Approval-required tools

Confirmation-required tools

§9 Required dynamic scenarios — partial

§10 What this packet did NOT prove

Agents Shipgate is an advisory tool: the deterministic merge gate for AI-generated agent capability changes, run as a local-first, static Tool-Use Readiness review. The packet below is derived from a scan; it does not, by itself, prove the following properties:

Per-run residuals

diff --git a/samples/support_refund_agent/expected/packet.json b/samples/support_refund_agent/expected/packet.json index a9557958..19c8cd70 100644 --- a/samples/support_refund_agent/expected/packet.json +++ b/samples/support_refund_agent/expected/packet.json @@ -1293,7 +1293,9 @@ "6 active finding(s) came from heuristic provenance (keyword_heuristic=6, regex_heuristic=0); review the finding evidence before acting." ], "headline": "Agents Shipgate is an advisory tool: the deterministic merge gate for AI-generated agent capability changes, run as a local-first, static Tool-Use Readiness review. The packet below is derived from a scan; it does not, by itself, prove the following properties:", - "low_confidence_tools": [], + "low_confidence_tools": [ + "send_email_preview" + ], "source_warnings": [ "MCP source declares wildcard tool exposure" ], diff --git a/samples/support_refund_agent/expected/packet.md b/samples/support_refund_agent/expected/packet.md index 07665195..aeedcf6f 100644 --- a/samples/support_refund_agent/expected/packet.md +++ b/samples/support_refund_agent/expected/packet.md @@ -234,7 +234,7 @@ Agents Shipgate is an advisory tool: the deterministic merge gate for AI-generat - Source warnings: - MCP source declares wildcard tool exposure -- Low-confidence tool extractions: none +- Low-confidence tool extractions: `send\_email\_preview` - Suppressed findings in effect: none - Memory isolation is not modeled by the v0.1 manifest schema; no static evidence is available. - 6 active finding\(s\) came from heuristic provenance \(keyword\_heuristic=6, regex\_heuristic=0\); review the finding evidence before acting. diff --git a/src/agents_shipgate/packet/builder.py b/src/agents_shipgate/packet/builder.py index c52712f0..37215a9a 100644 --- a/src/agents_shipgate/packet/builder.py +++ b/src/agents_shipgate/packet/builder.py @@ -1080,7 +1080,7 @@ def _build_not_proven( ) -> NotProvenSection: suppressed_ids = sorted(f.id for f in findings if f.suppressed and f.id) low_confidence_tools = sorted( - tool.name for tool in tools if tool.extraction_confidence == "low" + tool.name for tool in tools if tool.extraction_confidence != "high" ) additional = [ "Memory isolation is not modeled by the v0.1 manifest schema; " diff --git a/tests/test_evidence_packet.py b/tests/test_evidence_packet.py index 9a6113c3..9f820075 100644 --- a/tests/test_evidence_packet.py +++ b/tests/test_evidence_packet.py @@ -21,9 +21,11 @@ import pytest from typer.testing import CliRunner +from agents_shipgate.ci.release_decision import build_release_decision from agents_shipgate.cli.main import app from agents_shipgate.cli.scan import run_scan from agents_shipgate.core.disclaimers import HITL_RUNTIME_CONTROL_DISCLAIMER +from agents_shipgate.core.domain import Tool from agents_shipgate.packet import ( EvidencePacket, PacketSchemaError, @@ -38,7 +40,12 @@ PACKET_NON_PROOF_HEADLINE, ) from agents_shipgate.packet.evidence_matrix import build_evidence_matrix -from agents_shipgate.schemas.report import Finding +from agents_shipgate.schemas.report import ( + Finding, + ReadinessReport, + ReportSummary, + ToolSurfaceSummary, +) SAMPLE_CONFIG = Path("samples/support_refund_agent/shipgate.yaml") EXPECTED_DIR = Path("samples/support_refund_agent/expected") @@ -49,6 +56,78 @@ GENERATED_AT = "2026-01-01T00:00:00+00:00" +def _minimal_packet_with_not_proven( + section, + *, + low_confidence_tool_count: int = 0, +) -> EvidencePacket: + from agents_shipgate.schemas.packet import ( + ApprovalCoverageSection, + CapabilityIntentDiff, + DynamicScenariosSection, + HighRiskSurfaceSection, + HumanInTheLoopEvidence, + IdempotencyRiskSection, + MemoryIsolationStatus, + ReleaseDecisionSection, + ScopeCoverageSection, + ) + from agents_shipgate.schemas.report import ( + BaselineDelta, + EvidenceCoverageDecision, + FailPolicy, + ) + + decision = ReleaseDecisionSection( + decision="insufficient_evidence" if low_confidence_tool_count else "passed", + verdict="INSUFFICIENT EVIDENCE" if low_confidence_tool_count else "PASSED", + reason="Evidence coverage below threshold.", + evidence_coverage=EvidenceCoverageDecision( + level="static", + human_review_recommended=low_confidence_tool_count > 0, + source_warning_count=0, + low_confidence_tool_count=low_confidence_tool_count, + ), + baseline_delta=BaselineDelta(enabled=False), + fail_policy=FailPolicy( + ci_mode="advisory", + fail_on=[], + new_findings_only=False, + would_fail_ci=False, + exit_code=0, + ), + ) + return EvidencePacket( + generated_at=GENERATED_AT, + run_id="r", + project={"name": "p"}, + agent={"name": "a"}, + environment={"target": "local"}, + release_decision=decision, + capability_intent=CapabilityIntentDiff( + status="not_declared", + declared_purpose=[], + prohibited_actions=[], + observed_tools=[], + rows=[], + divergence_findings=[], + ), + high_risk_surface=HighRiskSurfaceSection( + status="informational", + total_tools=0, + high_risk_count=0, + tools=[], + ), + approval_coverage=ApprovalCoverageSection(status="informational"), + idempotency_risk=IdempotencyRiskSection(status="informational"), + scope_coverage=ScopeCoverageSection(status="informational"), + memory_isolation=MemoryIsolationStatus(), + human_in_the_loop=HumanInTheLoopEvidence(status="not_declared"), + dynamic_scenarios=DynamicScenariosSection(status="informational"), + not_proven=section, + ) + + def _scan_with_packet(tmp_path: Path) -> tuple[Path, EvidencePacket]: """Run scan against the support_refund_agent fixture and return ``(out_dir, parsed_packet)``.""" @@ -182,6 +261,81 @@ def test_not_proven_residuals_include_non_static_provenance(): assert "external policy packs" in residuals +def test_not_proven_low_confidence_residuals_match_release_decision_count(): + tools = [ + Tool( + id="high", + name="high_confidence_inventory", + source_type="mcp", + extraction_confidence="high", + ), + Tool( + id="medium", + name="medium_confidence_sdk", + source_type="sdk_function", + extraction_confidence="medium", + ), + Tool( + id="low", + name="low_confidence_sdk", + source_type="sdk_function", + extraction_confidence="low", + ), + ] + + section = _build_not_proven([], source_warnings=[], tools=tools) + + assert section.low_confidence_tools == [ + "low_confidence_sdk", + "medium_confidence_sdk", + ] + report = ReadinessReport( + run_id="r", + project={"name": "p"}, + agent={"name": "a"}, + environment={"target": "local"}, + summary=ReportSummary( + status="human_review_recommended", + critical_count=0, + high_count=0, + medium_count=0, + human_review_recommended=True, + evidence_coverage="static", + ), + tool_surface=ToolSurfaceSummary( + total_tools=len(tools), + high_risk_tools=0, + ), + findings=[], + source_warnings=[], + ) + decision = build_release_decision( + report=report, + tools=tools, + ci_mode="advisory", + fail_on=None, + new_findings_only=False, + ) + + assert decision.evidence_coverage.low_confidence_tool_count == len( + section.low_confidence_tools + ) + + packet = _minimal_packet_with_not_proven( + section, + low_confidence_tool_count=decision.evidence_coverage.low_confidence_tool_count, + ) + md = render_packet_markdown(packet) + html = render_packet_html(packet) + + assert "Low-confidence tool extractions: none" not in md + assert "Low-confidence tool extractions: none" not in html + assert "`medium\\_confidence\\_sdk`" in md + assert "medium_confidence_sdk" in html + assert "high_confidence_inventory" not in md + assert "high_confidence_inventory" not in html + + def test_evidence_matrix_uses_release_decision_only_for_blocking_and_review(): payload = { "release_decision": {