From 97734b2b10e7aa750f19366ae5d8f733c7ee9c10 Mon Sep 17 00:00:00 2001 From: cafitac Date: Wed, 13 May 2026 21:25:17 +0900 Subject: [PATCH] feat: add epoch-start scheduled dogfood measurement --- .../current-progress-and-next-steps.md | 2 +- .dev/status/current-handoff.md | 11 +- .dev/status/next-agent-memory-action.md | 17 +-- src/agent_memory/api/cli.py | 48 +++++--- tests/test_cli.py | 109 ++++++++++++++++++ 5 files changed, 163 insertions(+), 24 deletions(-) diff --git a/.dev/roadmap/memory-consolidation/current-progress-and-next-steps.md b/.dev/roadmap/memory-consolidation/current-progress-and-next-steps.md index 991cebb..477b9e8 100644 --- a/.dev/roadmap/memory-consolidation/current-progress-and-next-steps.md +++ b/.dev/roadmap/memory-consolidation/current-progress-and-next-steps.md @@ -1,7 +1,7 @@ # Memory Consolidation Current Progress and Next Steps Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-05-13 18:19 KST +Last updated: 2026-05-13 21:23 KST diff --git a/.dev/status/current-handoff.md b/.dev/status/current-handoff.md index deb4e93..fc04eab 100644 --- a/.dev/status/current-handoff.md +++ b/.dev/status/current-handoff.md @@ -1,12 +1,21 @@ # agent-memory current handoff Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-05-13 18:19 KST +Last updated: 2026-05-13 21:23 KST +## v0.1.154 continuation / v0.1.155 source checkpoint + +- Continuation report directory: `/Users/reddit/.agent-memory/reports/v0.1.154-continuation-20260513T120215/`. +- Fresh post-v0.1.154 runtime window using the released v0.1.154 CLI still showed that the old `scheduled-dry-run --since-hours` path can be blocked by historical rows in the lookback window. +- Source now adds `--epoch-start` to `dogfood trace-quality` and propagates it through `dogfood scheduled-dry-run`, so scheduled bundles can measure the same fresh epoch boundary as `dogfood fresh-epoch` instead of mixing in legacy rows. +- Repo-run evidence with `--epoch-start 2026-05-13T09:18:00Z` is green/read-only/no-mutation: `trace-quality-epoch-start-repo.json` has coverage `0.96`, no trace-quality warnings, empty retrieval ratio `0.32`, and historical excluded counts for retrieval observations/memory activations/experience traces; `scheduled-dry-run-epoch-start-repo.json` has decision `scheduled_dry_run_quality_gate_passed_plan_g4_only`. +- This does not enable broad G4/background apply. It only removes a measurement ambiguity so fresh-epoch scheduled evidence can be compared safely. Default ranking migration, collapse/delete apply, ordinary conversation auto-approval, and telemetry reset remain blocked without explicit approval corridors. +- Local full suite after the source slice: `uv run --python 3.11 pytest tests/ -q` -> `307 passed, 1 xfailed`. + ## v0.1.154 active runtime checkpoint - Release: `v0.1.154` (`https://github.com/cafitac/agent-memory/releases/tag/v0.1.154`). diff --git a/.dev/status/next-agent-memory-action.md b/.dev/status/next-agent-memory-action.md index c0078a9..32a53c8 100644 --- a/.dev/status/next-agent-memory-action.md +++ b/.dev/status/next-agent-memory-action.md @@ -1,7 +1,7 @@ # agent-memory next action Status: AI-authored draft. Not yet human-approved. -Last updated: 2026-05-13 18:19 KST +Last updated: 2026-05-13 21:23 KST ## Use this first when the user asks @@ -16,7 +16,7 @@ Then verify the repo/runtime state briefly and answer from the recommendation be ## One-sentence current state -`agent-memory` is released and live-runtime-smoked through `v0.1.153`; the `personal-oss` Hermes hook is healthy on the v0.1.153 runtime. The current verified runway now has a 50-task expanded retrieval fixture gate, 75 checked-in retrieval eval tasks across the fixture directory, persisted/replayed per-candidate collapse proof artifacts with supersession-chain evidence, one fresh non-idempotent narrow live reviewed-candidate promotion, copy/live-safe explicit-approval corridor evidence, an idempotent live G4 queue apply, named ranking policy/shadow-compare diagnostics, approval-gated config-only default-ranking migrate/rollback mechanics, a live Hermes DB 50-task representative fact shadow corpus, and a new live Hermes DB 50-task mixed fact/procedure/episode shadow corpus. Broad G4/background apply, collapse/delete apply, live telemetry reset, default ranking migration, and ordinary conversation auto-approval remain blocked. Live default ranking remains `conservative_legacy`. +`agent-memory` is released and live-runtime-smoked through `v0.1.154`; the `personal-oss` Hermes hook is healthy on the v0.1.154 runtime. The current verified runway now has a 50-task expanded retrieval fixture gate, 75 checked-in retrieval eval tasks across the fixture directory, persisted/replayed per-candidate collapse proof artifacts with supersession-chain evidence, one fresh non-idempotent narrow live reviewed-candidate promotion, copy/live-safe explicit-approval corridor evidence, an idempotent live G4 queue apply, named ranking policy/shadow-compare diagnostics, approval-gated config-only default-ranking migrate/rollback mechanics, a live Hermes DB 50-task representative fact shadow corpus, and a new live Hermes DB 50-task mixed fact/procedure/episode shadow corpus. Broad G4/background apply, collapse/delete apply, live telemetry reset, default ranking migration, and ordinary conversation auto-approval remain blocked. Live default ranking remains `conservative_legacy`. ## Current progress estimate toward the north-star @@ -38,10 +38,10 @@ Reasoning: - Release: `v0.1.154` - GitHub Release: `https://github.com/cafitac/agent-memory/releases/tag/v0.1.154` -- npm: `@cafitac/agent-memory@0.1.153` -- PyPI: `cafitac-agent-memory==0.1.153` +- npm: `@cafitac/agent-memory@0.1.154` +- PyPI: `cafitac-agent-memory==0.1.154` - Runtime: `/Users/reddit/.agent-memory/runtime/v0.1.154/.venv/bin/agent-memory` -- Runtime smoke: PyPI install smoke passed after simple-index propagation, npm installed-bin smoke passed, GitHub release exists, and `hermes --profile personal-oss hooks doctor` is green after `--accept-hooks` approval for the v0.1.153 hook command. v0.1.153 runtime QA artifacts: `/Users/reddit/.agent-memory/reports/v0.1.153-runtime-qa-20260513T080729/`. +- Runtime smoke: PyPI install smoke passed after simple-index propagation, npm installed-bin smoke passed, GitHub release exists, and `hermes --profile personal-oss hooks doctor` is green after `--accept-hooks` approval for the v0.1.153 hook command. v0.1.154 runtime QA artifacts: `/Users/reddit/.agent-memory/reports/v0.1.154-runtime-qa-20260513T091806/`. - Current source follow-up reports: `/tmp/agent-memory-g4-corridor-smoke/`, `/tmp/agent-memory-telemetry-reset-decision/`, `/tmp/agent-memory-fresh-epoch-v0149/`, and `/tmp/agent-memory-apply-corridor-v0150/`. - Fresh report directory retained from G4 diagnostics: `/Users/reddit/.agent-memory/reports/g4-v0138-20260512-132253/`. - Fresh linkage diagnosis retained: `/Users/reddit/.agent-memory/reports/g4-v0138-20260512-132253/g4-linkage-gap-diagnose-v0138-fresh.json` with decision `fresh_trace_linkage_gap_not_detected`. @@ -54,13 +54,14 @@ Reasoning: ## Current blocker -The v0.1.153 runtime is healthy, but broad brain-like automation is still intentionally blocked: +The v0.1.154 runtime is healthy, and current source has an epoch-start scheduled-dry-run measurement fix queued for release, but broad brain-like automation is still intentionally blocked: - Fresh epoch report `/Users/reddit/.agent-memory/reports/default-ranking-v0152-shadow/fresh-epoch-since-v0152-with-metadata-gap-diagnostic.json`: quality gate still fails with `low_epoch_observation_trace_coverage` and `epoch_empty_retrieval_outcome_metadata_gap_classified`. The new metadata-gap diagnostic shows `dominant_blocker=classified_legacy_missing_outcome`, `classified_missing_outcome_count=6`, and `unresolved_adapter_payload_gap_count=0`; continue metadata-rich dogfooding before telemetry reset or default ranking migration. - G4 review queue copy/live-safe smoke `/tmp/agent-memory-apply-corridor-v0150/`: live preview/list/reconciliation were read-only; copy telemetry reset and copy G4 queue apply preserved durable memory (`mutated=false`); live G4 queue apply was idempotent with `applied_count=0`, `already_applied_count=1`, `mutated=false`, and `default_retrieval_unchanged=true`. - Historical telemetry reconciliation via the telemetry reset copy smoke `/tmp/agent-memory-telemetry-reset-decision/copy-apply.json`: deleting 1773 historical telemetry rows on a DB copy passed with protected durable memory tables unchanged. Live DB was not reset because the fresh epoch gate still fails; live reset remains manual-only behind `telemetry-reset-v1` and `apply-telemetry-reset-v1`. - Collapse proof is evidence-driven and can persist/replay per-candidate proof artifacts. The current local proof path can reach `satisfied` when supersession-chain/relation evidence exists, but collapse/delete apply remains disabled even after proof satisfaction. - Retrieval fixture coverage now includes a 50-task live-compatible expanded source gate, 75 checked-in eval tasks across the directory, a live-Hermes-DB representative 50-task fact corpus, and a live-Hermes-DB representative 50-task mixed fact/procedure/episode corpus. The opt-in ranking experiments passed as read-only comparisons, but default retrieval ranking is still unchanged and blocked until a separate explicit default-rollout decision is made after fresh-epoch telemetry is green. +- New source follow-up evidence: `/Users/reddit/.agent-memory/reports/v0.1.154-continuation-20260513T120215/trace-quality-epoch-start-repo.json` and `/Users/reddit/.agent-memory/reports/v0.1.154-continuation-20260513T120215/scheduled-dry-run-epoch-start-repo.json` show that adding `--epoch-start` to trace-quality/scheduled-dry-run lets the post-v0.1.154 fresh window pass without legacy lookback pollution. This is a measurement fix, not an apply permission. - G4 broad apply contract remains blocked by policy even when a report is individually green. The guardrail now requires all of these to be green on real runtime evidence before reconsideration: retrieval ranking gate, rollback replay validation, live telemetry reconciliation, and human-reviewed queue approval; ordinary conversation auto-approval remains false. ## Recommended next work @@ -68,7 +69,7 @@ The v0.1.153 runtime is healthy, but broad brain-like automation is still intent Proceed in this sequence: 1. Keep live default ranking on `conservative_legacy`; do not run `retrieval-ranking-migrate-default` against the live profile until an operator gives the exact approval phrase and fresh-epoch telemetry is green. -2. Continue metadata-rich dogfooding to lift fresh-epoch `observation_trace_coverage_ratio` above threshold and eliminate classified legacy missing-outcome rows; the latest blocker is not an unresolved adapter payload gap. +2. Release the current `--epoch-start` scheduled-dry-run measurement slice, then keep metadata-rich dogfooding and compare fresh-epoch windows using the explicit epoch boundary; do not let legacy lookback rows drive go/no-go decisions. 3. Keep live mixed retrieval corpus coverage in the shadow-only lane; extend it only through guarded reviewed-candidate promotions with backup/audit evidence. 4. Keep fresh reviewed candidate promotion limited to the guarded explicit-approval corridor. 5. Keep broad G4/background apply blocked until ranking gate, rollback replay, telemetry reconciliation/fresh epoch, and reviewed queue approvals all pass on real runtime evidence. @@ -94,7 +95,7 @@ If asked "다음으로 뭐해야 해?", answer: ```bash cd /Users/reddit/Project/agent-memory git status --short --branch -/Users/reddit/.agent-memory/runtime/v0.1.153/.venv/bin/python - <<'PY' +/Users/reddit/.agent-memory/runtime/v0.1.154/.venv/bin/python - <<'PY' import agent_memory print(agent_memory.__version__) PY diff --git a/src/agent_memory/api/cli.py b/src/agent_memory/api/cli.py index 67a6b12..da35964 100644 --- a/src/agent_memory/api/cli.py +++ b/src/agent_memory/api/cli.py @@ -5444,6 +5444,7 @@ def _dogfood_scheduled_dry_run_payload(args: argparse.Namespace) -> dict[str, An argparse.Namespace( db_path=args.db_path, since_hours=args.since_hours, + epoch_start=getattr(args, "epoch_start", None), min_trace_coverage=args.min_trace_coverage, min_evidence_count=args.min_evidence_count, ) @@ -5485,6 +5486,7 @@ def _dogfood_scheduled_dry_run_payload(args: argparse.Namespace) -> dict[str, An }, "thresholds": { "since_hours": args.since_hours, + "epoch_start": getattr(args, "epoch_start", None), "min_trace_coverage": args.min_trace_coverage, "min_evidence_count": args.min_evidence_count, "candidate_min": args.candidate_min, @@ -6571,6 +6573,7 @@ def _parse_epoch_start(value: str) -> str: def _dogfood_trace_quality_payload(args: argparse.Namespace) -> dict[str, Any]: db_path = args.db_path.expanduser().resolve(strict=False) + epoch_start = _parse_epoch_start(args.epoch_start) if getattr(args, "epoch_start", None) else None since_hours = args.since_hours min_trace_coverage = args.min_trace_coverage min_evidence_count = args.min_evidence_count @@ -6584,47 +6587,63 @@ def _dogfood_trace_quality_payload(args: argparse.Namespace) -> dict[str, Any]: "warnings": ["database_missing"], } - since_modifier = f"-{since_hours} hours" + if epoch_start: + time_filter_sql = "created_at >= ?" + time_filter_params = (epoch_start,) + time_window = {"epoch_start": epoch_start} + else: + since_modifier = f"-{since_hours} hours" + time_filter_sql = "created_at >= datetime('now', ?)" + time_filter_params = (since_modifier,) + time_window = {"since_hours": since_hours, "sqlite_since_modifier": since_modifier} + with _open_readonly_sqlite(db_path) as connection: observation_rows = ( connection.execute( - """ + f""" SELECT id, retrieved_memory_refs_json FROM retrieval_observations - WHERE created_at >= datetime('now', ?) + WHERE {time_filter_sql} ORDER BY id ASC """, - (since_modifier,), + time_filter_params, ).fetchall() if _table_exists(connection, "retrieval_observations") else [] ) trace_rows = ( connection.execute( - """ + f""" SELECT event_kind, retention_policy, related_memory_refs_json, related_observation_ids_json FROM experience_traces - WHERE created_at >= datetime('now', ?) + WHERE {time_filter_sql} ORDER BY id ASC """, - (since_modifier,), + time_filter_params, ).fetchall() if _table_exists(connection, "experience_traces") else [] ) activation_rows = ( connection.execute( - """ + f""" SELECT activation_kind, memory_ref, observation_id FROM memory_activations - WHERE created_at >= datetime('now', ?) + WHERE {time_filter_sql} ORDER BY id ASC """, - (since_modifier,), + time_filter_params, ).fetchall() if _table_exists(connection, "memory_activations") else [] ) + if epoch_start: + time_window["historical_rows_excluded"] = { + table: int(connection.execute(f"SELECT COUNT(*) FROM {table} WHERE created_at < ?", (epoch_start,)).fetchone()[0]) + if _table_exists(connection, table) + else 0 + for table in ("experience_traces", "memory_activations", "retrieval_observations") + } ordinary_invariant = _ordinary_trace_metadata_only_invariant(connection) metadata_invariant = _metadata_json_validity(connection) @@ -6702,10 +6721,7 @@ def _dogfood_trace_quality_payload(args: argparse.Namespace) -> dict[str, Any]: "mutated": False, "status": "healthy" if not warnings else "warning", "database": {"path": str(db_path), "exists": True}, - "time_window": { - "since_hours": since_hours, - "sqlite_since_modifier": since_modifier, - }, + "time_window": time_window, "thresholds": { "min_trace_coverage": min_trace_coverage, "min_evidence_count": min_evidence_count, @@ -10761,6 +10777,7 @@ def _build_parser() -> argparse.ArgumentParser: ) dogfood_trace_quality_parser.add_argument("db_path", type=Path) dogfood_trace_quality_parser.add_argument("--since-hours", type=int, default=24) + dogfood_trace_quality_parser.add_argument("--epoch-start", help="Optional ISO-8601 cutoff for fresh-epoch trace quality measurement.") dogfood_trace_quality_parser.add_argument("--min-trace-coverage", type=float, default=0.25) dogfood_trace_quality_parser.add_argument("--min-evidence-count", type=int, default=2) dogfood_trace_cluster_preview_parser = dogfood_subparsers.add_parser( @@ -11098,6 +11115,7 @@ def _build_parser() -> argparse.ArgumentParser: dogfood_scheduled_parser.add_argument("--output", type=Path) dogfood_scheduled_parser.add_argument("--hermes-config", type=Path) dogfood_scheduled_parser.add_argument("--since-hours", type=int, default=24) + dogfood_scheduled_parser.add_argument("--epoch-start", help="Optional ISO-8601 cutoff for fresh-epoch trace quality inside the scheduled bundle.") dogfood_scheduled_parser.add_argument("--min-trace-coverage", type=float, default=0.25) dogfood_scheduled_parser.add_argument("--min-evidence-count", type=int, default=2) dogfood_scheduled_parser.add_argument("--limit", type=int, default=200) @@ -11983,6 +12001,8 @@ def main() -> None: if args.dogfood_action == "trace-quality": if args.since_hours < 1: raise ValueError("dogfood trace-quality since-hours must be >= 1") + if getattr(args, "epoch_start", None): + _parse_epoch_start(args.epoch_start) if not 0 <= args.min_trace_coverage <= 1: raise ValueError("dogfood trace-quality min-trace-coverage must be between 0 and 1") if args.min_evidence_count < 1: diff --git a/tests/test_cli.py b/tests/test_cli.py index 8823219..87d4135 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2875,6 +2875,111 @@ def test_python_module_cli_dogfood_trace_quality_reports_read_only_aggregate_sig +def test_python_module_cli_dogfood_trace_quality_epoch_start_filters_legacy_rows( + tmp_path: Path, +) -> None: + db_path = tmp_path / "trace-quality-epoch.db" + initialize_database(db_path) + old_time = "2026-05-09 00:00:00" + epoch_start = "2026-05-10T00:00:00Z" + fresh_time = "2026-05-10 00:05:00" + with sqlite3.connect(db_path) as connection: + connection.execute( + """ + INSERT INTO retrieval_observations ( + id, created_at, surface, query_sha256, query_preview, preferred_scope, limit_value, + statuses_json, retrieved_memory_refs_json, top_memory_ref, response_mode, metadata_json + ) VALUES (1, ?, 'hermes-pre-llm-hook', ?, '', 'project:legacy', 1, '["approved"]', '[]', NULL, NULL, ?) + """, + (old_time, "a" * 64, json.dumps({"hook_event_name": "pre_llm_call"})), + ) + connection.execute( + """ + INSERT INTO retrieval_observations ( + id, created_at, surface, query_sha256, query_preview, preferred_scope, limit_value, + statuses_json, retrieved_memory_refs_json, top_memory_ref, response_mode, metadata_json + ) VALUES (2, ?, 'hermes-pre-llm-hook', ?, '', 'project:fresh', 1, '["approved"]', '["fact:1"]', 'fact:1', 'direct', ?) + """, + (fresh_time, "b" * 64, json.dumps({"hook_event_name": "pre_llm_call", "retrieval_outcome": "retrieved_memory"})), + ) + connection.execute( + """ + INSERT INTO memory_activations ( + id, created_at, surface, activation_kind, memory_ref, observation_id, trace_id, strength, scope, metadata_json + ) VALUES (1, ?, 'hermes-pre-llm-hook', 'empty_retrieval', NULL, 1, NULL, 0.0, 'project:legacy', '{}') + """, + (old_time,), + ) + connection.execute( + """ + INSERT INTO memory_activations ( + id, created_at, surface, activation_kind, memory_ref, observation_id, trace_id, strength, scope, metadata_json + ) VALUES (2, ?, 'hermes-pre-llm-hook', 'retrieved', 'fact:1', 2, 1, 1.0, 'project:fresh', '{}') + """, + (fresh_time,), + ) + connection.execute( + """ + INSERT INTO experience_traces ( + id, created_at, surface, event_kind, content_sha256, summary, scope, + related_memory_refs_json, related_observation_ids_json, retention_policy, metadata_json + ) VALUES (1, ?, 'hermes-pre-llm-hook', 'turn', ?, NULL, 'project:fresh', '["fact:1"]', '[2]', 'ephemeral', ?) + """, + (fresh_time, "c" * 64, json.dumps({"trace_recording": "default_metadata_only", "candidate_policy": "evidence_only", "auto_approved": False})), + ) + before_counts = { + table: connection.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] + for table in ("retrieval_observations", "memory_activations", "experience_traces") + } + + env = {**os.environ, "PYTHONPATH": "src"} + result = subprocess.run( + [ + sys.executable, + "-m", + "agent_memory.api.cli", + "dogfood", + "trace-quality", + str(db_path), + "--epoch-start", + epoch_start, + "--min-trace-coverage", + "0.95", + "--min-evidence-count", + "1", + ], + cwd=Path(__file__).resolve().parents[1], + env=env, + capture_output=True, + text=True, + ) + + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert payload["kind"] == "dogfood_trace_quality" + assert payload["read_only"] is True + assert payload["mutated"] is False + assert payload["time_window"]["epoch_start"] == "2026-05-10 00:00:00" + assert payload["time_window"]["historical_rows_excluded"] == { + "experience_traces": 0, + "memory_activations": 1, + "retrieval_observations": 1, + } + assert payload["coverage"]["observation_count"] == 1 + assert payload["coverage"]["trace_count"] == 1 + assert payload["coverage"]["observation_trace_coverage_ratio"] == 1.0 + assert payload["retrieval_quality"]["empty_retrieval_count"] == 0 + assert payload["warnings"] == [] + assert payload["privacy"]["aggregate_only"] is True + + with sqlite3.connect(db_path) as connection: + after_counts = { + table: connection.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] + for table in ("retrieval_observations", "memory_activations", "experience_traces") + } + assert after_counts == before_counts + + def test_python_module_cli_dogfood_fresh_epoch_filters_historical_telemetry_without_mutation( tmp_path: Path, ) -> None: @@ -4344,6 +4449,8 @@ def test_python_module_cli_dogfood_scheduled_dry_run_bundles_read_only_reports_w str(output_path), "--since-hours", "24", + "--epoch-start", + "2000-01-01T00:00:00Z", "--min-trace-coverage", "0.25", "--min-evidence-count", @@ -4371,6 +4478,8 @@ def test_python_module_cli_dogfood_scheduled_dry_run_bundles_read_only_reports_w assert payload["reports"]["storage_health"]["status"] == "healthy" assert "storage_health_not_clean" not in payload["quality_gate"]["blocked_reasons"] assert payload["reports"]["trace_quality"]["kind"] == "dogfood_trace_quality" + assert payload["reports"]["trace_quality"]["time_window"]["epoch_start"] == "2000-01-01 00:00:00" + assert payload["thresholds"]["epoch_start"] == "2000-01-01T00:00:00Z" assert payload["reports"]["remember_intent"]["kind"] == "remember_intent_dogfood_report" assert payload["reports"]["background_dry_run"]["kind"] == "memory_consolidation_background_dry_run" assert payload["quality_gate"]["decision"] in {