diff --git a/README.ko.md b/README.ko.md index 30a8626..1f96a05 100644 --- a/README.ko.md +++ b/README.ko.md @@ -75,7 +75,7 @@ Runtime은 Forge `agent_manifest.json`을 선택적으로 읽어 기존 Lab-comp 이 기능은 reliable edge agent runtime 방향의 첫 Runtime-side contract입니다. `agent_id`, `task_id`, `agent_type`, priority, latency budget, queue wait, fallback usage, telemetry context를 기록하지만 기존 `result.json`의 top-level compare/report 필드는 변경하지 않습니다. -Runtime result JSON에는 `runtime_health_snapshot`, `runtime_error_classification`, `runtime_events`도 additive evidence로 기록됩니다. 이제 health snapshot은 backend availability, latency budget/deadline observation, tegrastats evidence availability를 함께 남기고, runtime events는 sequential `event_index`를 가진 lifecycle trace로 기록됩니다. `--timeout-ms`는 latency timeout 관측 기준을 남기는 옵션이며, production request cancellation을 의미하지 않습니다. +Runtime result JSON에는 `runtime_health_snapshot`, `runtime_error_classification`, `runtime_events`도 additive evidence로 기록됩니다. 이제 health snapshot은 backend availability, latency budget/deadline observation, tegrastats evidence availability를 함께 남기고, runtime events는 sequential `event_index`를 가진 lifecycle trace로 기록됩니다. `--timeout-ms`는 latency timeout 관측 기준을 남기는 옵션이며, production request cancellation을 의미하지 않습니다. 실행이 `skipped`로 끝나면 Runtime은 `runtime_execution_skipped`, `retryable: true`, `retry_hint: check_backend_availability`를 남겨 Lab/Orchestrator가 failure handling evidence로 해석할 수 있게 합니다. 예시: diff --git a/README.md b/README.md index 11e5b26..a709fa7 100644 --- a/README.md +++ b/README.md @@ -488,7 +488,7 @@ This is the first bridge toward the reliable edge agent runtime direction. It re Runtime result JSON also includes additive operation evidence blocks: - `runtime_health_snapshot`: execution health, backend/device context, backend availability, run count, latency/FPS summary, latency-budget/deadline observation, tegrastats evidence availability, and explicit timeout observation status. `--timeout-ms` records an observation threshold; it does not claim production request cancellation. -- `runtime_error_classification`: structured success/error category, severity, retry hint, observed mean latency, and timeout budget for downstream report context. +- `runtime_error_classification`: structured success/error category, severity, retryability, retry hint, observed mean latency, and timeout budget for downstream report context. Skipped execution is recorded as `runtime_execution_skipped` with `retry_hint: check_backend_availability` so Lab/Orchestrator can explain runtime failure handling without treating Runtime as a worker daemon. - `runtime_events`: compact indexed lifecycle event log for configuration, benchmark completion, error classification, optional agent context, and tegrastats parsing. These fields are evidence for Orchestrator/Lab analysis. Runtime still does not schedule tasks or own deployment decisions. diff --git a/docs/agent_runtime_result_contract.md b/docs/agent_runtime_result_contract.md index 6f0a161..9ea286a 100644 --- a/docs/agent_runtime_result_contract.md +++ b/docs/agent_runtime_result_contract.md @@ -222,6 +222,7 @@ When provided, Runtime appends: - `runtime_health_snapshot` includes backend availability, latency-budget/deadline observation, timeout observation, and tegrastats evidence availability when those values are known. - `runtime_events` uses additive `inferedge-runtime-event-v1` entries with sequential `event_index` values so Lab/Orchestrator reports can show a compact lifecycle trace. - Runtime does not claim production request cancellation. `--timeout-ms` is an observation threshold: if a successful benchmark mean latency exceeds the configured threshold, Runtime records `timeout_observed: true`, `runtime_error_classification.category: runtime_timeout_observed`, and `retryable: true` for downstream reliability reporting. +- If execution is skipped because Runtime cannot complete the configured benchmark, Runtime records `runtime_error_classification.category: runtime_execution_skipped`, `severity: warning`, `retryable: true`, and `retry_hint: check_backend_availability`. This is failure-handling evidence for Lab/Orchestrator reporting, not a production worker retry loop. - Without `--timeout-ms`, results record `timeout_policy: not_configured`, `timeout_budget_ms: null`, and `timeout_observed: false`. ## Current Boundary diff --git a/scripts/smoke_default.sh b/scripts/smoke_default.sh index b621717..ed04b52 100755 --- a/scripts/smoke_default.sh +++ b/scripts/smoke_default.sh @@ -59,6 +59,20 @@ assert data["success"] is False, data["success"] assert data["run_config"]["power_mode"] == "unknown", data["run_config"] assert data["run_config"]["jetson_clocks"] == "unknown", data["run_config"] assert data["jetson_evidence"]["tegrastats_summary"]["status"] == "not_provided" +health = data["runtime_health_snapshot"] +assert health["status"] == "degraded", health +assert health["success"] is False +assert health["timeout_policy"] == "not_configured" +assert health["timeout_observed"] is False +error = data["runtime_error_classification"] +assert error["status"] == "classified", error +assert error["category"] == "runtime_execution_skipped", error +assert error["severity"] == "warning", error +assert error["retryable"] is True, error +assert error["retry_hint"] == "check_backend_availability", error +events = {event["type"]: event for event in data["runtime_events"]} +assert events["runtime_error_classified"]["category"] == "runtime_execution_skipped" +assert events["runtime_error_classified"]["retryable"] is True PY INFEREDGE_RUNTIME_RESULT_JSON="${OUTPUT_PATH}" python3 tests/test_lab_result_schema.py diff --git a/src/result_writer.cpp b/src/result_writer.cpp index 58fa252..28c6c75 100644 --- a/src/result_writer.cpp +++ b/src/result_writer.cpp @@ -388,6 +388,10 @@ std::string runtime_retry_hint(const RuntimeConfig& config, const BenchmarkResul return "check_runtime_error"; } +bool runtime_retryable(const RuntimeConfig& config, const BenchmarkResult& benchmark_result) { + return timeout_observed(config, benchmark_result) || benchmark_result.status == "skipped"; +} + bool latency_budget_exceeded(const RuntimeConfig& config, const BenchmarkResult& benchmark_result) { return benchmark_result.success && config.agent_latency_budget_ms > 0 && @@ -476,7 +480,7 @@ void write_runtime_error_classification_json( output << ",\n" << indent << " \"timeout_observed\": " << (observed_timeout ? "true" : "false") << ",\n" - << indent << " \"retryable\": " << (observed_timeout ? "true" : "false") << ",\n" + << indent << " \"retryable\": " << (runtime_retryable(config, benchmark_result) ? "true" : "false") << ",\n" << indent << " \"retry_hint\": " << json_string(runtime_retry_hint(config, benchmark_result)) << "\n" << indent << "}"; } @@ -551,7 +555,7 @@ void write_runtime_events_json( << ",\n" << item_indent << " \"observed_mean_ms\": " << benchmark_result.mean_ms << ",\n" << item_indent << " \"timeout_observed\": " << (observed_timeout ? "true" : "false") << ",\n" - << item_indent << " \"retryable\": " << (observed_timeout ? "true" : "false") << ",\n" + << item_indent << " \"retryable\": " << (runtime_retryable(config, benchmark_result) ? "true" : "false") << ",\n" << item_indent << " \"retry_hint\": " << json_string(runtime_retry_hint(config, benchmark_result)) << "\n" << item_indent << "},\n"; diff --git a/tests/test_agent_runtime_result_contract.py b/tests/test_agent_runtime_result_contract.py index 521aaa9..641bd41 100644 --- a/tests/test_agent_runtime_result_contract.py +++ b/tests/test_agent_runtime_result_contract.py @@ -103,6 +103,14 @@ def test_runtime_output_records_optional_agent_block_when_manifest_is_provided(s if result["success"]: self.assertEqual(error["status"], "none") self.assertEqual(error["category"], "none") + self.assertFalse(error["retryable"]) + elif result["status"] == "skipped": + self.assertEqual(health["status"], "degraded") + self.assertEqual(error["status"], "classified") + self.assertEqual(error["category"], "runtime_execution_skipped") + self.assertEqual(error["severity"], "warning") + self.assertTrue(error["retryable"]) + self.assertEqual(error["retry_hint"], "check_backend_availability") else: self.assertEqual(error["status"], "classified") self.assertNotEqual(error["category"], "none") @@ -125,6 +133,7 @@ def test_runtime_output_records_optional_agent_block_when_manifest_is_provided(s self.assertEqual(error_event["timeout_budget_ms"], 1) self.assertIn("retry_hint", error_event) self.assertFalse(error_event["timeout_observed"]) + self.assertEqual(error_event["retryable"], error["retryable"]) extra = result["extra"] self.assertTrue(extra["agent_manifest_recorded"])