From 5cddde139fa1c8930e7121bf44ade26d75f1647b Mon Sep 17 00:00:00 2001 From: cafitac Date: Wed, 13 May 2026 16:26:29 +0900 Subject: [PATCH] feat: add reviewed episode promotion diagnostics --- .../current-progress-and-next-steps.md | 22 +++---- .dev/status/current-handoff.md | 18 +++--- .dev/status/next-agent-memory-action.md | 24 ++++---- src/agent_memory/api/cli.py | 57 ++++++++++++++++++- tests/test_cli.py | 38 ++++++++++++- tests/test_roadmap_contract.py | 3 +- 6 files changed, 125 insertions(+), 37 deletions(-) diff --git a/.dev/roadmap/memory-consolidation/current-progress-and-next-steps.md b/.dev/roadmap/memory-consolidation/current-progress-and-next-steps.md index 82d518c..bf5439b 100644 --- a/.dev/roadmap/memory-consolidation/current-progress-and-next-steps.md +++ b/.dev/roadmap/memory-consolidation/current-progress-and-next-steps.md @@ -1,11 +1,11 @@ # Memory Consolidation Current Progress and Next Steps Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-05-13 15:48 KST +Last updated: 2026-05-13 16:32 KST ## v0.1.152 released runtime checkpoint and next runway -This document is the restartable checkpoint after the v0.1.152 release/runtime rollout: 50-task expanded retrieval fixture gate, 75 checked-in retrieval eval tasks across the fixture directory, per-candidate collapse proof artifact persistence/replay with supersession-chain evidence, one fresh non-idempotent narrow live reviewed-candidate promotion, copy/live-safe explicit approval corridor evidence, v0.1.152 `personal-oss` Hermes hook rollout, released named ranking policy/shadow-compare diagnostics, approval-gated config-only default-ranking migrate/rollback mechanics, and 50-task live-Hermes-DB representative shadow corpus evidence while keeping `conservative_legacy` as the live default. +This document is the restartable checkpoint after the v0.1.152 release/runtime rollout: 50-task expanded retrieval fixture gate, 75 checked-in retrieval eval tasks across the fixture directory, per-candidate collapse proof artifact persistence/replay with supersession-chain evidence, one fresh non-idempotent narrow live reviewed-candidate fact promotion, one guarded live reviewed procedure/episode promotion pair, copy/live-safe explicit approval corridor evidence, v0.1.152 `personal-oss` Hermes hook rollout, released named ranking policy/shadow-compare diagnostics, approval-gated config-only default-ranking migrate/rollback mechanics, and 50-task live-Hermes-DB representative fact plus mixed fact/procedure/episode shadow corpus evidence while keeping `conservative_legacy` as the live default. Current verified release state: @@ -20,28 +20,28 @@ Current verified release state: Fresh diagnostics: - `g4-linkage-gap-diagnose-v0138-fresh.json`: decision `fresh_trace_linkage_gap_not_detected`. -- `/Users/reddit/.agent-memory/reports/default-ranking-v0152-shadow/fresh-epoch-since-v0152.json`: still blocks epoch-wide automation on `epoch_empty_retrieval_outcome_metadata_gap_classified`. +- `/Users/reddit/.agent-memory/reports/default-ranking-v0152-shadow/fresh-epoch-since-v0152-with-metadata-gap-diagnostic.json`: still blocks epoch-wide automation on `low_epoch_observation_trace_coverage` and `epoch_empty_retrieval_outcome_metadata_gap_classified`; metadata-gap drilldown reports `dominant_blocker=classified_legacy_missing_outcome`, `classified_missing_outcome_count=6`, and `unresolved_adapter_payload_gap_count=0`. - `/tmp/agent-memory-apply-corridor-v0150/`: copy/live-safe explicit approval corridor smoke passed without unintended durable-memory mutation; live apply was idempotent. - `/tmp/agent-memory-telemetry-reset-decision/copy-apply.json`: copy telemetry reset passed with protected durable memory tables unchanged; live telemetry reset remains blocked. -- 50-task expanded retrieval source fixture gate exists, the checked-in fixture directory evaluates at 75/75 pass, and the live-Hermes-DB representative 50-task fact corpus passes with zero shadow regressions/no durable mutation. The checked-in expanded fixture is still not directly replayable against the tiny live DB because project-M1 references are absent; default ranking remains unchanged until a separate explicit default-rollout decision. +- 50-task expanded retrieval source fixture gate exists, the checked-in fixture directory evaluates at 75/75 pass, and live-Hermes-DB representative 50-task fact and mixed fact/procedure/episode corpora pass with zero shadow regressions/no durable ranking mutation. The checked-in expanded fixture is still not directly replayable against the tiny live DB because project-M1 references are absent; default ranking remains unchanged until a separate explicit default-rollout decision. - Collapse proof artifacts can be persisted/replayed and can reach `satisfied` with reviewed supersession-chain/relation evidence, but collapse/delete apply remains disabled. Progress estimate: -- Overall north-star: 76-78%. -- Substrate/evidence plumbing: about 86%. -- Safe automatic mutation/promotion: about 64-68%. -- Remaining work: about 22-24% overall. +- Overall north-star: 78-80%. +- Substrate/evidence plumbing: about 87%. +- Safe automatic mutation/promotion: about 66-70%. +- Remaining work: about 20-22% overall. Current interpretation: -Fresh v0.1.152 evidence and merged G5a-G5i plus default-ranking migration mechanics are healthy enough to continue the brain-like reviewed-candidate runway. The current runway has completed the expanded retrieval source fixture gate, stronger read-only opt-in ranking comparison, supersession-chain collapse proof evidence, one fresh guarded live reviewed-candidate promotion, the explicit default-ranking opt-in-to-default migration design, released named ranking policy diagnostics plus approval-gated config-only migrate/rollback mechanics, and representative live-Hermes-DB shadow evidence preserving `conservative_legacy`. Broad G4/background apply remains blocked. Current next work is to broaden live shadow fixture coverage beyond facts into procedure/episode surfaces, continue telemetry/fresh-epoch reconciliation, and only then consider explicit operator-approved default ranking migration. +Fresh v0.1.152 evidence and merged G5a-G5i plus default-ranking migration mechanics are healthy enough to continue the brain-like reviewed-candidate runway. The current runway has completed the expanded retrieval source fixture gate, stronger read-only opt-in ranking comparison, supersession-chain collapse proof evidence, one fresh guarded live reviewed-candidate fact promotion, one guarded live reviewed procedure/episode promotion pair, the explicit default-ranking opt-in-to-default migration design, released named ranking policy diagnostics plus approval-gated config-only migrate/rollback mechanics, and representative live-Hermes-DB fact plus mixed shadow evidence preserving `conservative_legacy`. Broad G4/background apply remains blocked. Current next work is to improve fresh-epoch telemetry coverage and reduce classified legacy missing-outcome rows through metadata-rich dogfooding before any explicit operator-approved default ranking migration. Recommended sequence from here: 1. Keep live default ranking on `conservative_legacy`; do not run live `retrieval-ranking-migrate-default` until the operator gives the exact approval phrase and fresh-epoch telemetry is green. -2. Broaden live shadow fixture coverage beyond the current 50 approved-fact tasks by seeding/approving representative procedure and episode memories through guarded review corridors. -3. Continue telemetry/fresh-epoch reconciliation; current post-v0.1.152 telemetry-only reconciliation is green, but fresh-epoch still blocks on `epoch_empty_retrieval_outcome_metadata_gap_classified`. +2. Continue metadata-rich dogfooding to lift fresh-epoch observation/trace linkage coverage above threshold and replace classified legacy missing-outcome rows. +3. Keep live mixed fact/procedure/episode corpus work in read-only shadow comparison unless additional representative memories are promoted through guarded review corridors with backup/hash/actor/reason/approval evidence. 4. Keep collapse proof evidence-driven: `satisfied` requires supersession-chain/relation evidence, and collapse/delete apply remains disabled. 5. Keep fresh reviewed candidate promotion limited to the explicit guarded corridor with backup/hash/actor/reason/approval evidence; do not use broad apply. 6. Preserve broad G4/background apply as blocked until ranking, rollback replay, telemetry reconciliation/fresh epoch, and reviewed queue approvals all pass on real runtime evidence. diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md index e91cb04..e696bbe 100644 --- a/.dev/status/current-handoff.md +++ b/.dev/status/current-handoff.md @@ -1,7 +1,7 @@ # agent-memory current handoff Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-05-13 15:48 KST +Last updated: 2026-05-13 16:32 KST ## v0.1.152 released runtime checkpoint @@ -16,23 +16,23 @@ Current verified state: - Hermes hook doctor is green for `personal-oss` after `--accept-hooks` smoke on the v0.1.152 runtime. - Fresh G4 report directory retained: `/Users/reddit/.agent-memory/reports/g4-v0138-20260512-132253/`. - Fresh linkage diagnosis retained from G4 diagnostics: `g4-linkage-gap-diagnose-v0138-fresh.json` passed with decision `fresh_trace_linkage_gap_not_detected`. -- Current v0.1.152 source/runtime runway now includes a 50-task expanded retrieval fixture gate (`live-compatible-50-gate.json`), 75 checked-in retrieval eval tasks across the fixture directory, persisted/replayed per-candidate collapse proof artifacts with relation-equivalence/supersession-chain evidence, one fresh live G5 reviewed-candidate promotion (`candidate:29db0390b2f81bdb` -> `fact:4`) with backup/hash evidence, idempotent live G4 queue apply evidence, the explicit default-ranking opt-in-to-default migration plan at `.dev/roadmap/memory-consolidation/default-ranking-opt-in-to-default-migration.md`, and the released default-ranking migration mechanics. -- Default-ranking migration mechanics are now released in v0.1.152: named `conservative_legacy`/`graph_reinforced_v1`/`shadow_compare` policy diagnostics, shadow compare on `retrieval-ranking-experiment`, and approval-gated config-only `retrieval-ranking-migrate-default` with protected table hash proof plus rollback metadata. Live Hermes remains on `conservative_legacy`; live shadow reports under `/Users/reddit/.agent-memory/reports/default-ranking-v0152-shadow/` include a 50-task representative live-Hermes-DB fact corpus with 50/50 pass, zero baseline regressions, protected default order, and no durable mutation. The checked-in expanded 50-task source fixture still fails against the tiny live DB because project-M1 references are absent; the gap artifact is `checked-in-expanded-50-live-gap.stderr.txt`. +- Current v0.1.152 source/runtime runway now includes a 50-task expanded retrieval fixture gate (`live-compatible-50-gate.json`), 75 checked-in retrieval eval tasks across the fixture directory, persisted/replayed per-candidate collapse proof artifacts with relation-equivalence/supersession-chain evidence, one fresh live G5 reviewed-candidate promotion (`candidate:29db0390b2f81bdb` -> `fact:4`) with backup/hash evidence, one guarded live reviewed procedure/episode promotion pair (`candidate:3435fe1db562aaf2` -> `procedure:1`, `candidate:4a35c03e7130fdec` -> `episode:1`) with backup/hash evidence, idempotent live G4 queue apply evidence, the explicit default-ranking opt-in-to-default migration plan at `.dev/roadmap/memory-consolidation/default-ranking-opt-in-to-default-migration.md`, and the released default-ranking migration mechanics. +- Default-ranking migration mechanics are now released in v0.1.152: named `conservative_legacy`/`graph_reinforced_v1`/`shadow_compare` policy diagnostics, shadow compare on `retrieval-ranking-experiment`, and approval-gated config-only `retrieval-ranking-migrate-default` with protected table hash proof plus rollback metadata. Live Hermes remains on `conservative_legacy`; live shadow reports under `/Users/reddit/.agent-memory/reports/default-ranking-v0152-shadow/` include a 50-task representative live-Hermes-DB fact corpus and a 50-task mixed fact/procedure/episode corpus, both with 50/50 pass, zero baseline regressions, protected default order, and no durable ranking mutation. The checked-in expanded 50-task source fixture still fails against the tiny live DB because project-M1 references are absent; the gap artifact is `checked-in-expanded-50-live-gap.stderr.txt`. - Broad G4/background apply remains blocked; default retrieval ranking changes, collapse/delete apply, live telemetry reset, and ordinary conversation auto-approval remain blocked. The new fact `fact:4` also records this guardrail in the live memory DB. Progress estimate: -- Overall north-star: 76-78%. -- Substrate/evidence plumbing: about 86%. -- Safe automatic mutation/promotion: about 64-68%. -- Remaining work: about 22-24% overall. +- Overall north-star: 78-80%. +- Substrate/evidence plumbing: about 87%. +- Safe automatic mutation/promotion: about 66-70%. +- Remaining work: about 20-22% overall. Current interpretation: - The trace/retrieval/candidate/proof substrate is healthy enough for the next safety runway. -- Completed in the current runway: expanded retrieval gate to 50 tasks, proved the checked-in fixture directory at 75/75 pass, moved collapse proof to `satisfied` with supersession-chain evidence while keeping collapse/delete disabled, ran one fresh non-idempotent narrow live reviewed-candidate promotion with backup/hash verification, released/runtime-smoked v0.1.151, documented the explicit default-ranking opt-in-to-default migration plan, implemented and released the named-policy/shadow-compare/config-only migrate/rollback command path in v0.1.152, and smoke-tested live shadow comparison plus a 50-task representative live fact corpus without changing the live default. +- Completed in the current runway: expanded retrieval gate to 50 tasks, proved the checked-in fixture directory at 75/75 pass, moved collapse proof to `satisfied` with supersession-chain evidence while keeping collapse/delete disabled, ran one fresh non-idempotent narrow live reviewed-candidate fact promotion plus one guarded reviewed procedure/episode promotion pair with backup/hash verification, released/runtime-smoked v0.1.151, documented the explicit default-ranking opt-in-to-default migration plan, implemented and released the named-policy/shadow-compare/config-only migrate/rollback command path in v0.1.152, and smoke-tested live shadow comparison plus both 50-task representative live fact and mixed corpora without changing the live default. - Broad G4/background apply remains blocked; existing docs/RED-test-only broad-G4 baseline must not be advertised as ready. -- Retrieval ranking changes remain opt-in experiments only; the expanded 50-task source experiment and the representative 50-task live-Hermes-DB fact corpus both passed as read-only comparisons with no durable mutation. v0.1.152 adds released migration mechanics, but live default enablement still requires broader live fixture coverage, fresh-epoch telemetry green, the exact approval phrase, and explicit operator approval. +- Retrieval ranking changes remain opt-in experiments only; the expanded 50-task source experiment, the representative 50-task live-Hermes-DB fact corpus, and the representative 50-task mixed fact/procedure/episode corpus all passed as read-only comparisons with no durable ranking mutation. v0.1.152 adds released migration mechanics, but live default enablement still requires fresh-epoch telemetry green, the exact approval phrase, and explicit operator approval. Current safe mutation boundaries: diff --git a/.dev/status/next-agent-memory-action.md b/.dev/status/next-agent-memory-action.md index 9bf0d76..276f9db 100644 --- a/.dev/status/next-agent-memory-action.md +++ b/.dev/status/next-agent-memory-action.md @@ -1,7 +1,7 @@ # agent-memory next action Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-05-13 15:48 KST +Last updated: 2026-05-13 16:32 KST ## Use this first when the user asks @@ -16,7 +16,7 @@ Then verify the repo/runtime state briefly and answer from the recommendation be ## One-sentence current state -`agent-memory` is released and live-runtime-smoked through `v0.1.152`; the `personal-oss` Hermes hook is healthy on the v0.1.152 runtime. The current verified runway now has a 50-task expanded retrieval fixture gate, 75 checked-in retrieval eval tasks across the fixture directory, persisted/replayed per-candidate collapse proof artifacts with supersession-chain evidence, one fresh non-idempotent narrow live reviewed-candidate promotion, copy/live-safe explicit-approval corridor evidence, an idempotent live G4 queue apply, named ranking policy/shadow-compare diagnostics, approval-gated config-only default-ranking migrate/rollback mechanics, and a live Hermes DB 50-task representative shadow corpus. Broad G4/background apply, collapse/delete apply, live telemetry reset, default ranking migration, and ordinary conversation auto-approval remain blocked. Live default ranking remains `conservative_legacy`. +`agent-memory` is released and live-runtime-smoked through `v0.1.152`; the `personal-oss` Hermes hook is healthy on the v0.1.152 runtime. The current verified runway now has a 50-task expanded retrieval fixture gate, 75 checked-in retrieval eval tasks across the fixture directory, persisted/replayed per-candidate collapse proof artifacts with supersession-chain evidence, one fresh non-idempotent narrow live reviewed-candidate promotion, copy/live-safe explicit-approval corridor evidence, an idempotent live G4 queue apply, named ranking policy/shadow-compare diagnostics, approval-gated config-only default-ranking migrate/rollback mechanics, a live Hermes DB 50-task representative fact shadow corpus, and a new live Hermes DB 50-task mixed fact/procedure/episode shadow corpus. Broad G4/background apply, collapse/delete apply, live telemetry reset, default ranking migration, and ordinary conversation auto-approval remain blocked. Live default ranking remains `conservative_legacy`. ## Current progress estimate toward the north-star @@ -24,10 +24,10 @@ The north-star is a human-memory-like, mostly automatic, graph-based memory cons Approximate progress: -- Overall north-star: 76-78%. -- Substrate/evidence plumbing: about 86%. -- Safe automatic mutation/promotion: about 64-68%. -- Remaining work: about 22-24% overall. +- Overall north-star: 78-80%. +- Substrate/evidence plumbing: about 87%. +- Safe automatic mutation/promotion: about 66-70%. +- Remaining work: about 20-22% overall. Reasoning: @@ -50,17 +50,17 @@ Reasoning: - Historical scheduled dry-run retained: `/Users/reddit/.agent-memory/reports/g4-v0138-20260512-132253/scheduled-dry-run.json`. - Source G5a-G5i checkpoint: `dogfood trace-cluster-preview`, `dogfood trace-candidate-persist/list/update/apply`, read-only `review_score`/`review_recommendation`, `dogfood reinforcement-refinement-preview`, `dogfood decay-collapse-preview`, `dogfood supersession-preview`, lifecycle candidate registry/apply, decay deprecate apply, ranking gate/experiment, rollback confidence, `rollback-replay-validate`, `retrieval-ranking-experiment`, `decay-collapse-decision`, `telemetry-reconciliation`, telemetry reconciliation/reset safety reporting, and G4 reviewed queue preview/persist/update/apply are merged and released through v0.1.150. - Current local follow-up evidence: expanded fixture file `tests/fixtures/retrieval_eval/expanded/live-compatible-50-gate.json` has 50 live-compatible tasks; checked-in fixture directory evaluates at 75/75 pass; opt-in ranking experiment report `/Users/reddit/.agent-memory/reports/g5i-ranking-experiment-expanded-50-20260513T1355/ranking-experiment-expanded-50.json` is read-only with `expanded_fixture_gate_met=true`, `eval_gate_pass=true`, and `default_ranking_mutated=false`; fresh live reviewed candidate `candidate:29db0390b2f81bdb` promoted to `fact:4` only through the guarded explicit-approval corridor. -- Current source/runtime ranking evidence: `retrieval-ranking-experiment` has named policy/shadow-compare diagnostics; `retrieval-ranking-migrate-default` provides an approval-gated config-only migration with protected table hashes, audit output, and rollback metadata. v0.1.152 published and installed this path. Live default remains `conservative_legacy`. Live shadow reports under `/Users/reddit/.agent-memory/reports/default-ranking-v0152-shadow/` include `live-fact4-shadow.json` and `live-hermes-approved-fact-50-corpus-v1-shadow.json`; the latter replayed 50 representative tasks against the tiny live Hermes DB with 50/50 pass, zero baseline regressions, protected default order, and no durable mutation. The checked-in 50-task fixture still is not directly runnable against the tiny live Hermes DB because project-M1 references are absent there; the gap artifact is `checked-in-expanded-50-live-gap.stderr.txt`. +- Current source/runtime ranking evidence: `retrieval-ranking-experiment` has named policy/shadow-compare diagnostics; `retrieval-ranking-migrate-default` provides an approval-gated config-only migration with protected table hashes, audit output, and rollback metadata. v0.1.152 published and installed this path. Live default remains `conservative_legacy`. Live shadow reports under `/Users/reddit/.agent-memory/reports/default-ranking-v0152-shadow/` include `live-fact4-shadow.json`, `live-hermes-approved-fact-50-corpus-v1-shadow.json`, and `live-hermes-mixed-approved-50-corpus-v1-shadow.json`; the mixed corpus replayed 50 live tasks across approved facts/procedure/episode with 50/50 pass, zero baseline regressions, protected default order, and no durable mutation. The checked-in 50-task fixture still is not directly runnable against the tiny live Hermes DB because project-M1 references are absent there; the gap artifact is `checked-in-expanded-50-live-gap.stderr.txt`. ## Current blocker The v0.1.152 runtime is healthy, but broad brain-like automation is still intentionally blocked: -- Fresh epoch report `/Users/reddit/.agent-memory/reports/default-ranking-v0152-shadow/fresh-epoch-since-v0152.json`: quality gate still fails with `epoch_empty_retrieval_outcome_metadata_gap_classified`; continue dogfooding before trusting epoch-wide automation. +- Fresh epoch report `/Users/reddit/.agent-memory/reports/default-ranking-v0152-shadow/fresh-epoch-since-v0152-with-metadata-gap-diagnostic.json`: quality gate still fails with `low_epoch_observation_trace_coverage` and `epoch_empty_retrieval_outcome_metadata_gap_classified`. The new metadata-gap diagnostic shows `dominant_blocker=classified_legacy_missing_outcome`, `classified_missing_outcome_count=6`, and `unresolved_adapter_payload_gap_count=0`; continue metadata-rich dogfooding before telemetry reset or default ranking migration. - G4 review queue copy/live-safe smoke `/tmp/agent-memory-apply-corridor-v0150/`: live preview/list/reconciliation were read-only; copy telemetry reset and copy G4 queue apply preserved durable memory (`mutated=false`); live G4 queue apply was idempotent with `applied_count=0`, `already_applied_count=1`, `mutated=false`, and `default_retrieval_unchanged=true`. - Historical telemetry reconciliation via the telemetry reset copy smoke `/tmp/agent-memory-telemetry-reset-decision/copy-apply.json`: deleting 1773 historical telemetry rows on a DB copy passed with protected durable memory tables unchanged. Live DB was not reset because the fresh epoch gate still fails; live reset remains manual-only behind `telemetry-reset-v1` and `apply-telemetry-reset-v1`. - Collapse proof is evidence-driven and can persist/replay per-candidate proof artifacts. The current local proof path can reach `satisfied` when supersession-chain/relation evidence exists, but collapse/delete apply remains disabled even after proof satisfaction. -- Retrieval fixture coverage now includes a 50-task live-compatible expanded source gate, 75 checked-in eval tasks across the directory, and a live-Hermes-DB representative 50-task fact corpus. The opt-in ranking experiments passed as read-only comparisons, but default retrieval ranking is still unchanged and blocked until a separate explicit default-rollout decision is made after fresh-epoch telemetry is green. +- Retrieval fixture coverage now includes a 50-task live-compatible expanded source gate, 75 checked-in eval tasks across the directory, a live-Hermes-DB representative 50-task fact corpus, and a live-Hermes-DB representative 50-task mixed fact/procedure/episode corpus. The opt-in ranking experiments passed as read-only comparisons, but default retrieval ranking is still unchanged and blocked until a separate explicit default-rollout decision is made after fresh-epoch telemetry is green. - G4 broad apply contract remains blocked by policy even when a report is individually green. The guardrail now requires all of these to be green on real runtime evidence before reconsideration: retrieval ranking gate, rollback replay validation, live telemetry reconciliation, and human-reviewed queue approval; ordinary conversation auto-approval remains false. ## Recommended next work @@ -68,8 +68,8 @@ The v0.1.152 runtime is healthy, but broad brain-like automation is still intent Proceed in this sequence: 1. Keep live default ranking on `conservative_legacy`; do not run `retrieval-ranking-migrate-default` against the live profile until an operator gives the exact approval phrase and fresh-epoch telemetry is green. -2. Improve live fixture coverage beyond fact-only replay: seed or approve representative procedure/episode memories in a guarded corridor, then extend the live shadow corpus beyond the current 50 fact tasks. -3. Continue telemetry/fresh-epoch reconciliation; current post-v0.1.152 telemetry reconciliation is green as telemetry-only, but fresh-epoch still blocks on `epoch_empty_retrieval_outcome_metadata_gap_classified`. +2. Continue metadata-rich dogfooding to lift fresh-epoch `observation_trace_coverage_ratio` above threshold and eliminate classified legacy missing-outcome rows; the latest blocker is not an unresolved adapter payload gap. +3. Keep live mixed retrieval corpus coverage in the shadow-only lane; extend it only through guarded reviewed-candidate promotions with backup/audit evidence. 4. Keep fresh reviewed candidate promotion limited to the guarded explicit-approval corridor. 5. Keep broad G4/background apply blocked until ranking gate, rollback replay, telemetry reconciliation/fresh epoch, and reviewed queue approvals all pass on real runtime evidence. @@ -87,7 +87,7 @@ Do not silently delete, reset, or rewrite telemetry. Historical reconciliation m If asked "다음으로 뭐해야 해?", answer: -> 지금은 v0.1.152까지 릴리즈/설치/스모크가 끝났고 `personal-oss` Hermes hook도 doctor-green입니다. 전체 목표 대비 대략 76-78% 정도 왔습니다. live Hermes default는 여전히 `conservative_legacy`이고, `graph_reinforced_v1`은 shadow 후보로만 비교했습니다. 새 live-Hermes-DB 50-task representative fact corpus는 50/50 pass, zero baseline regression, no mutation으로 통과했습니다. 하지만 checked-in expanded 50-task fixture는 live DB에 project-M1 reference facts/procedures/episodes가 없어서 직접 replay는 아직 불가하고, post-v0.1.152 fresh-epoch도 `epoch_empty_retrieval_outcome_metadata_gap_classified`로 계속 block입니다. 다음은 live fixture coverage를 fact-only에서 procedure/episode까지 넓히고 fresh-epoch telemetry를 더 dogfood하는 순서입니다. broad G4/background apply, collapse/delete apply, ordinary conversation auto-approval, default ranking migration은 아직 금지입니다. +> 지금은 v0.1.152까지 릴리즈/설치/스모크가 끝났고 `personal-oss` Hermes hook도 doctor-green입니다. 전체 목표 대비 대략 78-80% 정도 왔습니다. live Hermes default는 여전히 `conservative_legacy`이고, `graph_reinforced_v1`은 shadow 후보로만 비교했습니다. 새 live-Hermes-DB mixed 50-task corpus는 approved facts/procedure/episode를 포함해 50/50 pass, zero baseline regression, protected default order, no mutation으로 통과했습니다. 다만 post-v0.1.152 fresh-epoch는 아직 `low_epoch_observation_trace_coverage`와 `epoch_empty_retrieval_outcome_metadata_gap_classified`로 block입니다. 새 diagnostic 기준 unresolved adapter payload gap은 0이고, 남은 핵심은 classified legacy missing-outcome row를 metadata-rich dogfooding으로 밀어내는 것입니다. broad G4/background apply, collapse/delete apply, ordinary conversation auto-approval, default ranking migration, live telemetry reset은 아직 금지입니다. ## Quick verification commands diff --git a/src/agent_memory/api/cli.py b/src/agent_memory/api/cli.py index a8f85c0..0befa32 100644 --- a/src/agent_memory/api/cli.py +++ b/src/agent_memory/api/cli.py @@ -4614,6 +4614,18 @@ def _reviewed_promotion_payload_from_args(args: argparse.Namespace) -> dict[str, "success_rate": args.success_rate, "evidence_ids": [], } + if args.promotion_type == "episode": + if not args.title or not args.summary: + raise ValueError("dogfood trace-candidate-update episode promotion requires --title and --summary") + return { + "promotion_type": "episode", + "title": args.title, + "summary": args.summary, + "source_ids": [], + "tags": list(args.tag or []), + "scope": args.scope, + "importance_score": args.importance_score, + } raise ValueError("unsupported trace candidate promotion type") @@ -4677,7 +4689,7 @@ def _dogfood_trace_candidate_update_payload(args: argparse.Namespace) -> dict[st "status_after": args.status, "status": args.status, "proposal_type": proposal_type, - "promotion_ready": args.status == "approved" and proposal_type in {"fact_promotion", "preference_promotion", "procedure_promotion"}, + "promotion_ready": args.status == "approved" and proposal_type in {"fact_promotion", "preference_promotion", "procedure_promotion", "episode_promotion"}, "reason_sha256": reason_sha256, "privacy": {"reviewed_payload_included": False, "raw_reason_included": False, "raw_content_included": False}, } @@ -4723,7 +4735,7 @@ def _dogfood_trace_candidate_apply_payload(args: argparse.Namespace) -> dict[str if row["status"] != "approved": skipped.append({"candidate_id": candidate_id, "reason": f"status_{row['status']}"}) continue - if row["proposal_type"] not in {"fact_promotion", "preference_promotion", "procedure_promotion"}: + if row["proposal_type"] not in {"fact_promotion", "preference_promotion", "procedure_promotion", "episode_promotion"}: skipped.append({"candidate_id": candidate_id, "reason": f"proposal_type_{row['proposal_type']}"}) continue with sqlite3.connect(db_path) as connection: @@ -4763,6 +4775,18 @@ def _dogfood_trace_candidate_apply_payload(args: argparse.Namespace) -> dict[str ) approve_procedure(db_path=db_path, procedure_id=procedure.id) promoted_ref = f"procedure:{procedure.id}" + elif promotion_type == "episode": + episode = create_episode( + db_path=db_path, + title=str(reviewed["title"]), + summary=str(reviewed["summary"]), + source_ids=[int(value) for value in reviewed.get("source_ids", [])], + tags=[str(value) for value in reviewed.get("tags", [])], + importance_score=float(reviewed.get("importance_score") or 0.0), + scope=str(reviewed.get("scope") or "global"), + status="approved", + ) + promoted_ref = f"episode:{episode.id}" else: skipped.append({"candidate_id": candidate_id, "reason": f"reviewed_payload_not_promotable_{promotion_type or 'missing'}"}) continue @@ -6888,6 +6912,21 @@ def _dogfood_fresh_epoch_payload(args: argparse.Namespace) -> dict[str, Any]: warnings.append("high_epoch_empty_retrieval_ratio") unknown_empty_outcome_count = empty_by_retrieval_outcome.get("unknown", 0) + empty_by_retrieval_outcome.get("", 0) unresolved_unknown_empty_outcome_count = empty_unknown_outcome_drilldown.get("adapter_payload_gap", 0) + classified_missing_outcome_count = max(0, unknown_empty_outcome_count - unresolved_unknown_empty_outcome_count) + if unresolved_unknown_empty_outcome_count: + dominant_blocker = "adapter_payload_gap" + classification_confidence = "partial" if classified_missing_outcome_count else "low" + metadata_gap_next_action = ( + "Fix adapter payload metadata for unresolved empty observations before treating classified legacy gaps as reset-safe." + ) + elif unknown_empty_outcome_count: + dominant_blocker = "classified_legacy_missing_outcome" + classification_confidence = "classified" + metadata_gap_next_action = "Collect more fresh metadata-rich dogfood before telemetry reset; no adapter payload gap detected." + else: + dominant_blocker = "none" + classification_confidence = "complete" + metadata_gap_next_action = "No unknown empty-retrieval outcome metadata gap detected." if unresolved_unknown_empty_outcome_count: warnings.append("epoch_empty_retrieval_outcome_unknown") elif unknown_empty_outcome_count: @@ -6949,6 +6988,14 @@ def _dogfood_fresh_epoch_payload(args: argparse.Namespace) -> dict[str, Any]: "classification_rule": "metadata-only aggregate inference from hook_event_name and response_mode", "next_action": "Prefer more v0.1.129+ dogfood or a targeted metadata backfill preview before telemetry reset.", }, + "metadata_gap_diagnostic": { + "unknown_empty_outcome_count": unknown_empty_outcome_count, + "unresolved_adapter_payload_gap_count": unresolved_unknown_empty_outcome_count, + "classified_missing_outcome_count": classified_missing_outcome_count, + "dominant_blocker": dominant_blocker, + "classification_confidence": classification_confidence, + "next_action": metadata_gap_next_action, + }, "by_hook_event_name": {key: empty_by_hook_event_name[key] for key in sorted(empty_by_hook_event_name)}, "by_surface": {key: empty_by_surface[key] for key in sorted(empty_by_surface)}, "by_scope": {key: empty_by_scope[key] for key in sorted(empty_by_scope)}, @@ -10911,7 +10958,7 @@ def _build_parser() -> argparse.ArgumentParser: dogfood_trace_candidate_update_parser.add_argument("--actor", required=True) dogfood_trace_candidate_update_parser.add_argument("--reason", required=True) dogfood_trace_candidate_update_parser.add_argument("--approval-phrase", required=True) - dogfood_trace_candidate_update_parser.add_argument("--promotion-type", choices=["fact", "preference", "procedure"]) + dogfood_trace_candidate_update_parser.add_argument("--promotion-type", choices=["fact", "preference", "procedure", "episode"]) dogfood_trace_candidate_update_parser.add_argument("--subject") dogfood_trace_candidate_update_parser.add_argument("--predicate") dogfood_trace_candidate_update_parser.add_argument("--object") @@ -10920,6 +10967,10 @@ def _build_parser() -> argparse.ArgumentParser: dogfood_trace_candidate_update_parser.add_argument("--precondition", action="append") dogfood_trace_candidate_update_parser.add_argument("--step", action="append") dogfood_trace_candidate_update_parser.add_argument("--success-rate", type=float, default=0.0) + dogfood_trace_candidate_update_parser.add_argument("--title") + dogfood_trace_candidate_update_parser.add_argument("--summary") + dogfood_trace_candidate_update_parser.add_argument("--tag", action="append") + dogfood_trace_candidate_update_parser.add_argument("--importance-score", type=float, default=0.0) dogfood_trace_candidate_update_parser.add_argument("--scope", default="global") dogfood_trace_candidate_update_parser.add_argument("--confidence", type=float, default=0.7) dogfood_trace_candidate_apply_parser = dogfood_subparsers.add_parser( diff --git a/tests/test_cli.py b/tests/test_cli.py index 5bf6f6d..c255d12 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3147,6 +3147,14 @@ def test_python_module_cli_dogfood_fresh_epoch_classifies_unknown_empty_retrieva "classification_rule": "metadata-only aggregate inference from hook_event_name and response_mode", "next_action": "Prefer more v0.1.129+ dogfood or a targeted metadata backfill preview before telemetry reset.", } + assert diagnostics["metadata_gap_diagnostic"] == { + "unknown_empty_outcome_count": 2, + "unresolved_adapter_payload_gap_count": 1, + "classified_missing_outcome_count": 1, + "dominant_blocker": "adapter_payload_gap", + "classification_confidence": "partial", + "next_action": "Fix adapter payload metadata for unresolved empty observations before treating classified legacy gaps as reset-safe.", + } assert payload["quality_gate"] == { "pass": False, "decision": "continue_fresh_epoch_dogfooding", @@ -10819,9 +10827,37 @@ def test_dogfood_trace_candidate_apply_promotes_only_approved_reviewed_fact_cand "name, trigger_context, status", ("Run reviewed candidate promotion", "when a trace candidate is explicitly approved", "approved"), ), + ( + "episode", + [ + "--promotion-type", + "episode", + "--title", + "Reviewed live mixed corpus checkpoint", + "--summary", + "A reviewed episode records the live mixed retrieval shadow corpus checkpoint without raw transcript storage.", + "--tag", + "retrieval-eval", + "--tag", + "shadow-corpus", + "--scope", + "project:g5-candidates", + "--importance-score", + "0.7", + ], + "promote_reviewed_episode", + "episode:", + "episodes", + "title, summary, status", + ( + "Reviewed live mixed corpus checkpoint", + "A reviewed episode records the live mixed retrieval shadow corpus checkpoint without raw transcript storage.", + "approved", + ), + ), ], ) -def test_dogfood_trace_candidate_apply_supports_reviewed_preference_and_procedure_promotions( +def test_dogfood_trace_candidate_apply_supports_reviewed_preference_procedure_and_episode_promotions( tmp_path: Path, promotion_type: str, update_args: list[str], diff --git a/tests/test_roadmap_contract.py b/tests/test_roadmap_contract.py index 56c79e3..1bda550 100644 --- a/tests/test_roadmap_contract.py +++ b/tests/test_roadmap_contract.py @@ -71,10 +71,11 @@ def test_v0152_status_docs_record_current_brainlike_runway_and_blocked_broad_app assert "/Users/reddit/.agent-memory/runtime/v0.1.152/.venv/bin/agent-memory" in doc assert "fresh_trace_linkage_gap_not_detected" in doc assert "g4-v0138-20260512-132253" in doc - assert "Overall north-star: 76-78%" in doc + assert "Overall north-star: 78-80%" in doc assert "broad g4/background apply" in doc.lower() assert "50-task expanded retrieval fixture gate" in doc or "50-task expanded retrieval fixture" in doc assert "75 checked-in" in doc or "75/75" in doc + assert "mixed fact/procedure/episode" in doc or "approved facts/procedure/episode" in doc assert "collapse proof" in doc.lower() assert "dogfood trace-cluster-preview" in next_action