diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/backend/lined/docs/README.md b/backend/lined/docs/README.md index 53489e1..8c9efbc 100644 --- a/backend/lined/docs/README.md +++ b/backend/lined/docs/README.md @@ -17,6 +17,7 @@ changes. | HPA Resource Scenarios | Local kind variants for backend resource requests/limits, fixed replicas, and CPU-based HPA behavior. | `hpa-resource-scenarios.md` | Use before comparing deployment/runtime trade-offs under k6 workload traffic. | Resource pressure, fixed replica comparison, HPA prerequisites, scenario cleanup. | | Runtime Scenario Summaries | Scenario-runner seam for producing sanitized runtime-summary artifacts from one scenario and workload. | `runtime-scenario-summaries.md` | Use before generating collector-ready runtime evidence for local scenario comparisons. | Scenario runner CLI, k6 summary export, Kubernetes state summaries, provenance manifest. | | Runtime Fitness Extension | Runtime-aware fitness metric contract and optional collector input shape. | `runtime-fitness-extension.md` | Use before adding runtime-aware scoring or attaching runtime summaries to metrics documents. | Structural/runtime score separation, runtime metric schema, normalization, compatibility. | +| Runtime-Aware Scoring | Versioned scalar runtime score that compares current runtime summaries against baseline evidence. | `runtime-aware-scoring.md` | Use before running or interpreting runtime-aware fitness scoring. | Local baseline input, score fields, SLO classification, missing metrics, metrics-store fallback. | | Prometheus Telemetry Pipeline | Local Prometheus deployment and scrape workflow for kind backend metrics. | `prometheus-telemetry-pipeline.md` | Use before collecting persistent-enough Prometheus samples from local scenario runs. | Prometheus pod discovery, Actuator scrape verification, PromQL checks, telemetry cleanup. 
| | SLO Constraint Thresholds | Initial runtime SLO and constraint thresholds for classifying local experiment variants. | `slo-constraint-thresholds.md` | Use before interpreting runtime-summary evidence or adding runtime-aware scoring. | Latency, error-rate, availability, restart, readiness, and resource-pressure constraints. | | LLM Support Service | Plan for a separate advisory service that proposes candidate fitness rules and trade-off explanations. | `llm-support-service.md` | Use before designing or implementing LLM-assisted rule synthesis for the experiment. | Service boundary, serverless/manual triggers, input/output contracts, review workflow. | diff --git a/backend/lined/docs/experiment-tasks.md b/backend/lined/docs/experiment-tasks.md index 22918f2..0b30e85 100644 --- a/backend/lined/docs/experiment-tasks.md +++ b/backend/lined/docs/experiment-tasks.md @@ -18,7 +18,7 @@ scientific experiment work. | `experiment/scenario-fixture-discipline` | Task | Runtime evidence | Yes | Scenario fixture discipline | Define explicit workload/context profiles and repeatable input setup for Lined experiment scenario runs. | Deployment/runtime comparisons use stable fixtures instead of manual setup. | | `experiment/slo-constraint-thresholds` | Task | Runtime evidence | Yes | SLO and constraint thresholds | Define initial latency, error-rate, availability, restart, readiness, and resource-efficiency thresholds for classifying valid experiment variants. | Runtime evidence can be evaluated against explicit constraints instead of ad hoc interpretation. | | `experiment/fitness-runtime-extension` | Task | Runtime scoring | Yes | Runtime fitness extension | Extend experiment documentation and/or collector design to include telemetry metrics. | Fixed CI fitness can be compared with runtime-aware adaptive fitness. 
| -| `experiment/runtime-aware-scoring` | Task | Runtime scoring | No | Runtime-aware scoring | Add a versioned runtime fitness score that uses summarized runtime metrics while preserving the existing structural `fitnessScore`. | Runtime-aware scalar fitness can be computed without changing historical CI fitness semantics. | +| `experiment/runtime-aware-scoring` | Task | Runtime scoring | Yes | Runtime-aware scoring | Add a versioned runtime fitness score that uses summarized runtime metrics while preserving the existing structural `fitnessScore`. | Runtime-aware scalar fitness can be computed without changing historical CI fitness semantics. | | `experiment/adaptive-weighted-fitness` | Task | Runtime scoring | No | Adaptive weighted fitness | Implement context-sensitive weighting over structural and runtime signals for workload, SLO, or resource-pressure contexts. | Fixed structural fitness can be compared with an adaptive scalar fitness baseline. | | `experiment/pareto-optimization-baseline` | Task | Runtime scoring | No | Pareto optimization baseline | Add a small NSGA-II or equivalent Pareto-based optimizer over the current deployment scenario set and runtime objectives. | Deployment variants can be compared as multi-objective trade-offs rather than a single weighted score. | | `experiment/decision-usefulness-reporting` | Task | Runtime scoring | No | Decision-usefulness reporting | Extend experiment reporting to compare whether Pareto/NSGA-II gives more actionable trade-off alternatives than fixed-weight scalar scoring. | Results explain decision usefulness in addition to numeric objective values. | diff --git a/backend/lined/docs/runtime-aware-scoring.md b/backend/lined/docs/runtime-aware-scoring.md new file mode 100644 index 0000000..fae96de --- /dev/null +++ b/backend/lined/docs/runtime-aware-scoring.md @@ -0,0 +1,173 @@ +# Runtime-Aware Scoring + +This guide describes the runtime-aware scoring contract for +`experiment/runtime-aware-scoring`. 
+ +Runtime-aware scoring is additive. It keeps the existing top-level +`fitnessScore` as the structural CI score and adds a separate versioned score +from summarized runtime evidence. + +## Scope + +This task provides: + +- a versioned scalar runtime score named `runtimeFitnessScore`; +- local scoring from explicit current and baseline runtime summary files; +- optional persisted baseline lookup through the collector metrics-store seam; +- SLO constraint classification from `slo-thresholds-v1.json`; +- optional local output when Cosmos DB or another metrics database is not + configured. + +This task does not add adaptive weighting, Pareto optimization, new backend +API behavior, production SLOs, dashboarding, or live telemetry scraping inside +the collector. + +## Collector Inputs + +The collector accepts current runtime evidence through the existing input: + +```text +RUNTIME_METRICS_JSON=/absolute/path/to/runtime-summary.json +``` + +For local/offline scoring, pass an explicit baseline summary: + +```text +RUNTIME_BASELINE_METRICS_JSON=/absolute/path/to/baseline-runtime-summary.json +``` + +When a metrics store is configured and no explicit baseline file is provided, +the collector can look for the latest persisted `main` runtime summary matching +the configured baseline scenario and current workload: + +```text +RUNTIME_BASELINE_SCENARIO=fixed-medium +``` + +The default threshold artifact is: + +```text +SLO_THRESHOLDS_JSON=../backend/lined/load-tests/runtime-scenarios/slo-thresholds-v1.json +``` + +When no database is configured, write the final document locally: + +```text +METRICS_OUTPUT_JSON=/absolute/path/to/metrics-document.json +``` + +For a runtime-only local smoke check without structural CI reports or +SonarCloud access, use: + +```text +RUNTIME_ONLY=true +``` + +The default collector path still reads Checkstyle, SpotBugs, JaCoCo, and +SonarCloud evidence. 
`RUNTIME_ONLY=true` is only for local runtime scoring +experiments where the structural CI score is not being recomputed. + +## Output Contract + +The stored or local metrics document preserves the structural score: + +```json +{ + "fitnessScore": 0.1234, + "runtimeFitnessScore": 0.2185, + "runtimeFitnessScoreVersion": "runtime-aware-v1", + "runtimeFitness": { + "current": { + "scenario": "replicas-2", + "workload": "baseline", + "source": "local-kind" + }, + "baseline": { + "scenario": "fixed-medium", + "workload": "baseline", + "source": "local-kind" + }, + "eligibleForStableComparison": false + } +} +``` + +`fitnessScore` remains the CI-only structural score. Runtime evidence is +attached under `metrics.runtime_metrics`; runtime score metadata is attached +under `runtimeFitness`. + +When `RUNTIME_ONLY=true`, the output document may contain +`fitnessScore: null` because no structural CI evidence was collected. That +does not redefine the field; it records that the runtime-only smoke path did +not compute the structural score. + +## Runtime-Aware v1 Formula + +The score compares current runtime summary metrics against a baseline runtime +summary. Each metric's relative delta is computed with the formulas below and clamped to `[-1, 1]` before weighting, because the raw ratio is unbounded. + +Lower-is-better metrics use: + +```text +(baseline - current) / baseline +``` + +Higher-is-better metrics use: + +```text +(current - baseline) / baseline +``` + +If baseline and current are both zero, the normalized delta is `0`. If baseline +is zero and current is non-zero, beneficial movement is `1` and harmful +movement is `-1`. Missing metrics are omitted from the score and the active +weights are re-normalized. 
+ +| Metric | Direction | Weight | +|--------|-----------|--------| +| `latency_p95_ms` | lower is better | `0.20` | +| `latency_p99_ms` | lower is better | `0.15` | +| `error_rate` | lower is better | `0.20` | +| `throughput_rps` | higher is better | `0.15` | +| `availability` | higher is better | `0.15` | +| `restart_count` | lower is better | `0.10` | +| `cpu_utilization` | lower is better | `0.025` | +| `memory_utilization` | lower is better | `0.025` | + +`hpa_current_replicas` and `hpa_desired_replicas` remain contextual evidence +and are not scored directly in v1. + +## SLO Classification + +The collector classifies current runtime evidence against +`slo-thresholds-v1.json` and records per-constraint results: + +- `valid` when evidence exists and satisfies the constraint; +- `warning` when evidence exists and crosses a warning threshold; +- `invalid` when evidence exists and violates a hard constraint; +- `unknown` when required evidence is missing. + +`runtimeFitness.eligibleForStableComparison` is `false` when any hard +constraint is `invalid` or `unknown`, or when no baseline runtime summary is available (in that case `runtimeFitnessScore` is `null`). The numeric runtime score may still be +emitted when comparable current and baseline metrics exist, but eligibility +keeps incomplete or unstable runs out of article-ready comparisons. + +Readiness remains external evidence. It is classified as `unknown` unless a +future runtime summary contract adds a summarized readiness source. + +## Local Example + +```bash +cd fitness-metrics-collector +npm run build +RUNTIME_ONLY=true \ +RUNTIME_METRICS_JSON=/absolute/path/current/runtime-summary.json \ +RUNTIME_BASELINE_METRICS_JSON=/absolute/path/baseline/runtime-summary.json \ +METRICS_OUTPUT_JSON=/absolute/path/output/metrics-document.json \ +npm run metrics +``` + +If `COSMOS_DB_CONNECTION_STRING` is absent, the collector writes the local +output document when `METRICS_OUTPUT_JSON` is set and skips database +persistence. 
Omit `RUNTIME_ONLY=true` when structural reports and `SONAR_TOKEN` +are available and the run should also compute the structural `fitnessScore`. diff --git a/fitness-metrics-collector/scripts/collectMetrics.test.ts b/fitness-metrics-collector/scripts/collectMetrics.test.ts index 8c015c8..74cf2f2 100644 --- a/fitness-metrics-collector/scripts/collectMetrics.test.ts +++ b/fitness-metrics-collector/scripts/collectMetrics.test.ts @@ -1,172 +1,409 @@ -import assert from "node:assert/strict"; import fs from "node:fs"; import os from "node:os"; import path from "node:path"; -import {describe, it} from "node:test"; +import {describe, it, type TestContext} from "node:test"; -import {parseRuntimeMetrics, readRuntimeMetrics} from "./collectMetrics"; +import { + collectRuntimeOnlyMetrics, + parseRuntimeMetrics, + readRuntimeMetrics, + writeMetricsOutput, +} from "./collectMetrics"; +import { + classifyRuntimeMetrics, + computeRuntimeFitness, + parseSloThresholds, +} from "./runtimeScoring"; -const validRuntimeMetricsJson = JSON.stringify({ - schema_version: 1, - scenario: "fixed-medium", - workload: "baseline", - source: "local-kind", +const RUNTIME_SCHEMA_VERSION = 1; +const UNSUPPORTED_RUNTIME_SCHEMA_VERSION = 2; +const RUNTIME_SCORE_VERSION = "runtime-aware-v1"; +const THRESHOLD_VERSION = "slo-thresholds-v1"; + +const SCENARIO_FIXED_MEDIUM = "fixed-medium"; +const SCENARIO_FIXED_SMALL = "fixed-small"; +const SCENARIO_REPLICAS_2 = "replicas-2"; +const WORKLOAD_BASELINE = "baseline"; +const WORKLOAD_SMOKE = "smoke"; +const SOURCE_LOCAL_KIND = "local-kind"; + +const LOCAL_BRANCH = "experiment-runtime-aware-scoring"; +const LOCAL_COMMIT = "abc123"; +const LOCAL_METRICS_DOCUMENT_ID = `${LOCAL_BRANCH}-${LOCAL_COMMIT}`; +const ISO_TIMESTAMP = "2026-06-04T00:00:00.000Z"; + +const SUMMARY_LATENCY_P95_MS = 250.5; +const SUMMARY_LATENCY_P99_MS = 550.25; +const SUMMARY_ERROR_RATE = 0.002; +const SUMMARY_THROUGHPUT_RPS = 42.1; +const SUMMARY_AVAILABILITY = 1; +const 
SUMMARY_RESTART_COUNT = 0; +const SUMMARY_CPU_UTILIZATION = 0.62; +const SUMMARY_MEMORY_UTILIZATION = 0.71; +const SUMMARY_HPA_DESIRED_REPLICAS = 2; +const SUMMARY_HPA_CURRENT_REPLICAS = 2; +const MISSING_PROCESS_CPU_USAGE = "process_cpu_usage"; + +const BASELINE_LATENCY_P95_MS = 300; +const BASELINE_LATENCY_P99_MS = 600; +const BASELINE_ERROR_RATE = 0.004; +const BASELINE_THROUGHPUT_RPS = 40; +const BASELINE_AVAILABILITY = 0.99; +const BASELINE_RESTART_COUNT = 1; +const BASELINE_CPU_UTILIZATION = 0.7; +const BASELINE_MEMORY_UTILIZATION = 0.8; + +const CURRENT_LATENCY_P95_MS = 240; +const CURRENT_LATENCY_P99_MS = 480; +const CURRENT_ERROR_RATE = 0.002; +const CURRENT_THROUGHPUT_RPS = 44; +const CURRENT_AVAILABILITY = 1; +const CURRENT_RESTART_COUNT = 0; +const CURRENT_CPU_UTILIZATION = 0.63; +const CURRENT_MEMORY_UTILIZATION = 0.72; + +const EXPECTED_RUNTIME_SCORE = 0.2915; +const EXPECTED_LATENCY_P95_WEIGHT = 0.2; +const EXPECTED_LATENCY_P95_DELTA = 0.2; +const EXPECTED_ERROR_ONLY_SCORE = 0.5; +const EXPECTED_ZERO_BASELINE_SCORE = -0.1429; +const EXPECTED_WRITTEN_STRUCTURAL_SCORE = 0.42; +const EXPECTED_WRITTEN_RUNTIME_SCORE = 0.25; +const STRUCTURAL_METRIC_ZERO = 0; +const STRUCTURAL_SPOTBUGS_CLASS_COUNT = 1; +const EXPECTED_CONSTRAINT_CLASSIFICATIONS = ["invalid", "valid", "warning", "unknown"]; +const INVALID_OUTPUT_PATH = "/path/that/does/not/exist/metrics.json"; + +const ERROR_ONLY_CURRENT_RATE = 0.002; +const ERROR_ONLY_BASELINE_RATE = 0.004; +const ZERO_BASELINE_CURRENT_THROUGHPUT = 10; +const ZERO_BASELINE_CURRENT_ERROR_RATE = 0.1; +const CLASSIFICATION_INVALID_LATENCY_P95_MS = 1200; +const CLASSIFICATION_VALID_ERROR_RATE = 0; +const CLASSIFICATION_WARNING_CPU_UTILIZATION = 0.9; + +const THRESHOLD_LATENCY_P95_LIMIT_MS = 1000; +const THRESHOLD_ERROR_RATE_LIMIT = 0.01; +const THRESHOLD_CPU_WARNING_LIMIT = 0.85; + +type RuntimePayload = { + schema_version: number; + scenario: string; + workload: string; + source: string; + summary: Record; + missing?: 
string[]; +}; + +// Full collector-ready summary: every field documents a runtime metric contract +// consumed by parseRuntimeMetrics, including HPA replica counts as context only. +const FULL_RUNTIME_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, summary: { - latency_p95_ms: 250.5, - latency_p99_ms: 550.25, - error_rate: 0.002, - throughput_rps: 42.1, - availability: 1, - restart_count: 0, - cpu_utilization: 0.62, - memory_utilization: 0.71, - hpa_desired_replicas: 2, - hpa_current_replicas: 2, + latency_p95_ms: SUMMARY_LATENCY_P95_MS, + latency_p99_ms: SUMMARY_LATENCY_P99_MS, + error_rate: SUMMARY_ERROR_RATE, + throughput_rps: SUMMARY_THROUGHPUT_RPS, + availability: SUMMARY_AVAILABILITY, + restart_count: SUMMARY_RESTART_COUNT, + cpu_utilization: SUMMARY_CPU_UTILIZATION, + memory_utilization: SUMMARY_MEMORY_UTILIZATION, + hpa_desired_replicas: SUMMARY_HPA_DESIRED_REPLICAS, + hpa_current_replicas: SUMMARY_HPA_CURRENT_REPLICAS, }, - missing: ["process_cpu_usage"], -}); + missing: [MISSING_PROCESS_CPU_USAGE], +}; + +// Baseline fixture represents the stable scenario used as the denominator for +// normalized runtime-aware v1 scoring. +const RUNTIME_BASELINE_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: { + latency_p95_ms: BASELINE_LATENCY_P95_MS, + latency_p99_ms: BASELINE_LATENCY_P99_MS, + error_rate: BASELINE_ERROR_RATE, + throughput_rps: BASELINE_THROUGHPUT_RPS, + availability: BASELINE_AVAILABILITY, + restart_count: BASELINE_RESTART_COUNT, + cpu_utilization: BASELINE_CPU_UTILIZATION, + memory_utilization: BASELINE_MEMORY_UTILIZATION, + }, +}; + +// Current fixture represents an improved deployment scenario compared with the +// baseline fixture while keeping the workload and source fixed. 
+const RUNTIME_CURRENT_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_REPLICAS_2, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: { + latency_p95_ms: CURRENT_LATENCY_P95_MS, + latency_p99_ms: CURRENT_LATENCY_P99_MS, + error_rate: CURRENT_ERROR_RATE, + throughput_rps: CURRENT_THROUGHPUT_RPS, + availability: CURRENT_AVAILABILITY, + restart_count: CURRENT_RESTART_COUNT, + cpu_utilization: CURRENT_CPU_UTILIZATION, + memory_utilization: CURRENT_MEMORY_UTILIZATION, + }, +}; + +// Threshold fixture mirrors slo-thresholds-v1 rule categories: hard invalid +// constraints, warning-only pressure, and external readiness evidence. +const SLO_THRESHOLDS_PAYLOAD = { + threshold_version: THRESHOLD_VERSION, + thresholds: [ + { + id: "latency-p95-local", + metric: "latency_p95_ms", + operator: "<=", + value: THRESHOLD_LATENCY_P95_LIMIT_MS, + severity: "invalid", + }, + { + id: "error-rate-local", + metric: "error_rate", + operator: "<=", + value: THRESHOLD_ERROR_RATE_LIMIT, + severity: "invalid", + }, + { + id: "cpu-pressure-local", + metric: "cpu_utilization", + operator: ">", + value: THRESHOLD_CPU_WARNING_LIMIT, + severity: "warning", + }, + { + id: "readiness-local", + evidence_source: "readiness_probe_or_actuator_health", + operator: "==", + value: true, + severity: "invalid", + }, + ], +}; + +const OPTIONAL_FIELD_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_SMALL, + workload: WORKLOAD_SMOKE, + source: SOURCE_LOCAL_KIND, + summary: { + error_rate: 0, + }, +}; + +const UNSUPPORTED_SCHEMA_PAYLOAD: RuntimePayload = { + schema_version: UNSUPPORTED_RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: {}, +}; + +const BLANK_SCENARIO_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: " ", + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: {}, 
+}; + +const NON_NUMERIC_LATENCY_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: { + latency_p95_ms: "250", + }, +}; + +const NEGATIVE_RESTART_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: { + restart_count: -1, + }, +}; + +const OUT_OF_RANGE_ERROR_RATE_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: { + error_rate: 1.1, + }, +}; + +const INVALID_MISSING_FIELD_PAYLOAD = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: {}, + missing: ["availability", 42], +}; + +const ERROR_ONLY_CURRENT_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_REPLICAS_2, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: { + error_rate: ERROR_ONLY_CURRENT_RATE, + }, +}; + +const ERROR_ONLY_BASELINE_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: { + error_rate: ERROR_ONLY_BASELINE_RATE, + latency_p95_ms: BASELINE_LATENCY_P95_MS, + }, +}; + +const ZERO_BASELINE_CURRENT_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_REPLICAS_2, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: { + throughput_rps: ZERO_BASELINE_CURRENT_THROUGHPUT, + error_rate: ZERO_BASELINE_CURRENT_ERROR_RATE, + }, +}; + +const ZERO_BASELINE_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: 
SOURCE_LOCAL_KIND, + summary: { + throughput_rps: 0, + error_rate: 0, + }, +}; + +const CLASSIFICATION_RUNTIME_PAYLOAD: RuntimePayload = { + schema_version: RUNTIME_SCHEMA_VERSION, + scenario: SCENARIO_FIXED_MEDIUM, + workload: WORKLOAD_BASELINE, + source: SOURCE_LOCAL_KIND, + summary: { + latency_p95_ms: CLASSIFICATION_INVALID_LATENCY_P95_MS, + error_rate: CLASSIFICATION_VALID_ERROR_RATE, + cpu_utilization: CLASSIFICATION_WARNING_CPU_UTILIZATION, + }, +}; + +const runtimeJson = (payload: unknown): string => JSON.stringify(payload); + +const validRuntimeMetricsJson = runtimeJson(FULL_RUNTIME_PAYLOAD); +const runtimeBaselineMetrics = parseRuntimeMetrics(runtimeJson(RUNTIME_BASELINE_PAYLOAD)); +const runtimeCurrentMetrics = parseRuntimeMetrics(runtimeJson(RUNTIME_CURRENT_PAYLOAD)); +const sloThresholdsJson = runtimeJson(SLO_THRESHOLDS_PAYLOAD); describe("parseRuntimeMetrics", () => { - it("parses a valid summarized runtime metrics document", () => { + it("parses a valid summarized runtime metrics document", (t: TestContext) => { + t.plan(1); + const result = parseRuntimeMetrics(validRuntimeMetricsJson); - assert.deepEqual(result, { - schema_version: 1, - scenario: "fixed-medium", - workload: "baseline", - source: "local-kind", - summary: { - latency_p95_ms: 250.5, - latency_p99_ms: 550.25, - error_rate: 0.002, - throughput_rps: 42.1, - availability: 1, - restart_count: 0, - cpu_utilization: 0.62, - memory_utilization: 0.71, - hpa_desired_replicas: 2, - hpa_current_replicas: 2, - }, - missing: ["process_cpu_usage"], - }); + t.assert.deepStrictEqual(result, FULL_RUNTIME_PAYLOAD); }); - it("accepts a summary with omitted optional metric fields", () => { - const result = parseRuntimeMetrics(JSON.stringify({ - schema_version: 1, - scenario: "fixed-small", - workload: "smoke", - source: "local-kind", - summary: { - error_rate: 0, - }, - })); - - assert.deepEqual(result, { - schema_version: 1, - scenario: "fixed-small", - workload: "smoke", - source: "local-kind", - 
summary: { - error_rate: 0, - }, + it("accepts a summary with omitted optional metric fields", (t: TestContext) => { + t.plan(1); + + const result = parseRuntimeMetrics(runtimeJson(OPTIONAL_FIELD_PAYLOAD)); + + t.assert.deepStrictEqual(result, { + ...OPTIONAL_FIELD_PAYLOAD, missing: undefined, }); }); - it("rejects unsupported schema versions", () => { - assert.throws( - () => parseRuntimeMetrics(JSON.stringify({ - schema_version: 2, - scenario: "fixed-medium", - workload: "baseline", - source: "local-kind", - summary: {}, - })), + it("rejects unsupported schema versions", (t: TestContext) => { + t.plan(1); + + t.assert.throws( + () => parseRuntimeMetrics(runtimeJson(UNSUPPORTED_SCHEMA_PAYLOAD)), /schema_version must be 1/ ); }); - it("rejects missing or blank required string fields", () => { - assert.throws( - () => parseRuntimeMetrics(JSON.stringify({ - schema_version: 1, - scenario: " ", - workload: "baseline", - source: "local-kind", - summary: {}, - })), + it("rejects missing or blank required string fields", (t: TestContext) => { + t.plan(1); + + t.assert.throws( + () => parseRuntimeMetrics(runtimeJson(BLANK_SCENARIO_PAYLOAD)), /scenario must be a non-empty string/ ); }); - it("rejects non-numeric summary fields", () => { - assert.throws( - () => parseRuntimeMetrics(JSON.stringify({ - schema_version: 1, - scenario: "fixed-medium", - workload: "baseline", - source: "local-kind", - summary: { - latency_p95_ms: "250", - }, - })), + it("rejects non-numeric summary fields", (t: TestContext) => { + t.plan(1); + + t.assert.throws( + () => parseRuntimeMetrics(runtimeJson(NON_NUMERIC_LATENCY_PAYLOAD)), /summary\.latency_p95_ms must be a finite number/ ); }); - it("rejects negative count and duration fields", () => { - assert.throws( - () => parseRuntimeMetrics(JSON.stringify({ - schema_version: 1, - scenario: "fixed-medium", - workload: "baseline", - source: "local-kind", - summary: { - restart_count: -1, - }, - })), + it("rejects negative count and duration fields", 
(t: TestContext) => { + t.plan(1); + + t.assert.throws( + () => parseRuntimeMetrics(runtimeJson(NEGATIVE_RESTART_PAYLOAD)), /summary\.restart_count must be >= 0/ ); }); - it("rejects ratio fields greater than one", () => { - assert.throws( - () => parseRuntimeMetrics(JSON.stringify({ - schema_version: 1, - scenario: "fixed-medium", - workload: "baseline", - source: "local-kind", - summary: { - error_rate: 1.1, - }, - })), + it("rejects ratio fields greater than one", (t: TestContext) => { + t.plan(1); + + t.assert.throws( + () => parseRuntimeMetrics(runtimeJson(OUT_OF_RANGE_ERROR_RATE_PAYLOAD)), /summary\.error_rate must be <= 1/ ); }); - it("rejects non-string missing field names", () => { - assert.throws( - () => parseRuntimeMetrics(JSON.stringify({ - schema_version: 1, - scenario: "fixed-medium", - workload: "baseline", - source: "local-kind", - summary: {}, - missing: ["availability", 42], - })), + it("rejects non-string missing field names", (t: TestContext) => { + t.plan(1); + + t.assert.throws( + () => parseRuntimeMetrics(runtimeJson(INVALID_MISSING_FIELD_PAYLOAD)), /missing\[1\] must be a non-empty string/ ); }); }); describe("readRuntimeMetrics", () => { - it("returns undefined when no path is provided", () => { - assert.equal(readRuntimeMetrics(), undefined); - assert.equal(readRuntimeMetrics(" "), undefined); + it("returns undefined when no path is provided", (t: TestContext) => { + t.plan(2); + + t.assert.strictEqual(readRuntimeMetrics(), undefined); + t.assert.strictEqual(readRuntimeMetrics(" "), undefined); }); - it("reads and parses a runtime metrics JSON file", () => { + it("reads and parses a runtime metrics JSON file", (t: TestContext) => { + t.plan(3); + const directory = fs.mkdtempSync(path.join(os.tmpdir(), "lined-runtime-")); const file = path.join(directory, "runtime-summary.json"); @@ -175,11 +412,190 @@ describe("readRuntimeMetrics", () => { const result = readRuntimeMetrics(file); - assert.equal(result?.schema_version, 1); - 
assert.equal(result?.scenario, "fixed-medium"); - assert.equal(result?.summary.latency_p95_ms, 250.5); + t.assert.strictEqual(result?.schema_version, RUNTIME_SCHEMA_VERSION); + t.assert.strictEqual(result?.scenario, SCENARIO_FIXED_MEDIUM); + t.assert.strictEqual(result?.summary.latency_p95_ms, SUMMARY_LATENCY_P95_MS); + } finally { + fs.rmSync(directory, {recursive: true, force: true}); + } + }); +}); + +describe("computeRuntimeFitness", () => { + it("does not emit runtime metadata when no current runtime input exists", (t: TestContext) => { + t.plan(1); + + const result = computeRuntimeFitness(); + + t.assert.deepStrictEqual(result, { + runtimeFitnessScore: null, + runtimeFitnessScoreVersion: RUNTIME_SCORE_VERSION, + }); + }); + + it("computes a runtime-aware score from current and baseline summaries", (t: TestContext) => { + t.plan(6); + + const result = computeRuntimeFitness(runtimeCurrentMetrics, runtimeBaselineMetrics); + + t.assert.strictEqual(result.runtimeFitnessScoreVersion, RUNTIME_SCORE_VERSION); + t.assert.strictEqual(result.runtimeFitnessScore, EXPECTED_RUNTIME_SCORE); + t.assert.strictEqual(result.runtimeFitness?.current.scenario, SCENARIO_REPLICAS_2); + t.assert.strictEqual(result.runtimeFitness?.baseline?.scenario, SCENARIO_FIXED_MEDIUM); + t.assert.strictEqual( + result.runtimeFitness?.activeMetricWeights.latency_p95_ms, + EXPECTED_LATENCY_P95_WEIGHT + ); + t.assert.strictEqual( + result.runtimeFitness?.normalizedDeltas.latency_p95_ms?.normalizedDelta, + EXPECTED_LATENCY_P95_DELTA + ); + }); + + it("marks runtime evidence ineligible when the comparison baseline is missing", ( + t: TestContext + ) => { + t.plan(3); + + const result = computeRuntimeFitness(runtimeCurrentMetrics, undefined, { + thresholdVersion: THRESHOLD_VERSION, + constraints: [], + hasInvalidHardConstraint: false, + hasUnknownHardConstraint: false, + eligibleForStableComparison: true, + }); + + t.assert.strictEqual(result.runtimeFitnessScore, null); + 
t.assert.strictEqual(result.runtimeFitness?.eligibleForStableComparison, false); + t.assert.strictEqual( + result.runtimeFitness?.reason, + "runtime baseline metrics are not available" + ); + }); + + it("omits missing runtime metrics and re-normalizes active weights", (t: TestContext) => { + t.plan(3); + + const current = parseRuntimeMetrics(runtimeJson(ERROR_ONLY_CURRENT_PAYLOAD)); + const baseline = parseRuntimeMetrics(runtimeJson(ERROR_ONLY_BASELINE_PAYLOAD)); + + const result = computeRuntimeFitness(current, baseline); + + t.assert.strictEqual(result.runtimeFitnessScore, EXPECTED_ERROR_ONLY_SCORE); + t.assert.deepStrictEqual(result.runtimeFitness?.activeMetricWeights, { + error_rate: 1, + }); + t.assert.ok( + result.runtimeFitness?.missingMetrics.includes("current.summary.latency_p95_ms") + ); + }); + + it("handles zero baselines with documented normalization rules", (t: TestContext) => { + t.plan(3); + + const current = parseRuntimeMetrics(runtimeJson(ZERO_BASELINE_CURRENT_PAYLOAD)); + const baseline = parseRuntimeMetrics(runtimeJson(ZERO_BASELINE_PAYLOAD)); + + const result = computeRuntimeFitness(current, baseline); + + t.assert.strictEqual(result.runtimeFitnessScore, EXPECTED_ZERO_BASELINE_SCORE); + t.assert.strictEqual( + result.runtimeFitness?.normalizedDeltas.throughput_rps?.normalizedDelta, + 1 + ); + t.assert.strictEqual( + result.runtimeFitness?.normalizedDeltas.error_rate?.normalizedDelta, + -1 + ); + }); +}); + +describe("classifyRuntimeMetrics", () => { + it("classifies valid, warning, invalid, and unknown runtime evidence", (t: TestContext) => { + t.plan(5); + + const thresholds = parseSloThresholds(sloThresholdsJson); + const runtime = parseRuntimeMetrics(runtimeJson(CLASSIFICATION_RUNTIME_PAYLOAD)); + + const result = classifyRuntimeMetrics(runtime, thresholds); + + t.assert.strictEqual(result.thresholdVersion, THRESHOLD_VERSION); + t.assert.strictEqual(result.eligibleForStableComparison, false); + 
t.assert.strictEqual(result.hasInvalidHardConstraint, true); + t.assert.strictEqual(result.hasUnknownHardConstraint, true); + t.assert.deepStrictEqual( + result.constraints.map((constraint) => constraint.classification), + EXPECTED_CONSTRAINT_CLASSIFICATIONS + ); + }); +}); + +describe("writeMetricsOutput", () => { + it("writes a local final metrics document without a database", (t: TestContext) => { + t.plan(3); + + const directory = fs.mkdtempSync(path.join(os.tmpdir(), "lined-metrics-output-")); + const file = path.join(directory, "metrics.json"); + + try { + writeMetricsOutput(file, { + id: LOCAL_METRICS_DOCUMENT_ID, + timestamp: ISO_TIMESTAMP, + branch: LOCAL_BRANCH, + commitHash: LOCAL_COMMIT, + metrics: { + checkstyle_violations: STRUCTURAL_METRIC_ZERO, + spotbugs_total: STRUCTURAL_METRIC_ZERO, + spotbugs_total_classes: STRUCTURAL_SPOTBUGS_CLASS_COUNT, + }, + fitnessScore: EXPECTED_WRITTEN_STRUCTURAL_SCORE, + runtimeFitnessScore: EXPECTED_WRITTEN_RUNTIME_SCORE, + runtimeFitnessScoreVersion: RUNTIME_SCORE_VERSION, + }); + + const written = JSON.parse(fs.readFileSync(file, "utf-8")); + t.assert.strictEqual(written.fitnessScore, EXPECTED_WRITTEN_STRUCTURAL_SCORE); + t.assert.strictEqual(written.runtimeFitnessScore, EXPECTED_WRITTEN_RUNTIME_SCORE); + t.assert.strictEqual(written.runtimeFitnessScoreVersion, RUNTIME_SCORE_VERSION); } finally { fs.rmSync(directory, {recursive: true, force: true}); } }); + + it("throws when the output path cannot be written", (t: TestContext) => { + t.plan(1); + + t.assert.throws( + () => writeMetricsOutput(INVALID_OUTPUT_PATH, { + id: LOCAL_METRICS_DOCUMENT_ID, + timestamp: ISO_TIMESTAMP, + branch: LOCAL_BRANCH, + commitHash: LOCAL_COMMIT, + metrics: {}, + fitnessScore: null, + runtimeFitnessScore: null, + runtimeFitnessScoreVersion: RUNTIME_SCORE_VERSION, + }), + /ENOENT/ + ); + }); +}); + +describe("collectRuntimeOnlyMetrics", () => { + it("requires a runtime metrics JSON path", (t: TestContext) => { + t.plan(1); + + 
t.assert.throws( + () => collectRuntimeOnlyMetrics({ + checkstylePath: "", + spotbugsXmlPath: "", + spotbugsHtmlPath: "", + jacocoPath: "", + runtimeBaselineScenario: SCENARIO_FIXED_MEDIUM, + runtimeOnly: true, + sloThresholdsJsonPath: "", + }), + /RUNTIME_ONLY=true requires RUNTIME_METRICS_JSON/ + ); + }); }); diff --git a/fitness-metrics-collector/scripts/collectMetrics.ts b/fitness-metrics-collector/scripts/collectMetrics.ts index 8c6bdc6..4c5aa2e 100644 --- a/fitness-metrics-collector/scripts/collectMetrics.ts +++ b/fitness-metrics-collector/scripts/collectMetrics.ts @@ -1,5 +1,15 @@ import fs from "node:fs"; import {CosmosClient} from "@azure/cosmos"; +import { + classifyRuntimeMetrics, + computeRuntimeFitness, + parseSloThresholds, + RUNTIME_FITNESS_SCORE_VERSION, + type RuntimeFitnessMetadata, + type RuntimeFitnessResult, + type RuntimeMetrics, + type RuntimeMetricSummary, +} from "./runtimeScoring"; /* ======================= TYPES @@ -7,9 +17,9 @@ import {CosmosClient} from "@azure/cosmos"; type FitnessScore = number | null; type Metrics = { - checkstyle_violations: number; - spotbugs_total: number; - spotbugs_total_classes: number; + checkstyle_violations?: number; + spotbugs_total?: number; + spotbugs_total_classes?: number; jacoco_line_coverage?: number; sonar_cloud_main_branch_metrics?: SonarMetricsMap; sonar_cloud_current_branch_metrics?: SonarMetricsMap; @@ -18,26 +28,23 @@ type Metrics = { runtime_metrics?: RuntimeMetrics; }; -type RuntimeMetricSummary = { - latency_p95_ms?: number; - latency_p99_ms?: number; - error_rate?: number; - throughput_rps?: number; - availability?: number; - restart_count?: number; - cpu_utilization?: number; - memory_utilization?: number; - hpa_desired_replicas?: number; - hpa_current_replicas?: number; +type MetricsDocument = { + id: string; + timestamp: string; + branch: string; + commitHash?: string; + pullRequestId?: string; + metrics: Metrics; + fitnessScore: FitnessScore; + runtimeFitnessScore: FitnessScore; + 
runtimeFitnessScoreVersion: typeof RUNTIME_FITNESS_SCORE_VERSION; + runtimeFitness?: RuntimeFitnessMetadata; }; -type RuntimeMetrics = { - schema_version: 1; - scenario: string; - workload: string; - source: string; - summary: RuntimeMetricSummary; - missing?: string[]; +type MetricsStore = { + findStructuralBaseline(isMainBranch: boolean): Promise; + findRuntimeBaseline(scenario: string, workload: string): Promise; + save(document: MetricsDocument): Promise; }; type SonarScope = @@ -81,6 +88,11 @@ type Config = { spotbugsHtmlPath: string; jacocoPath: string; runtimeMetricsJsonPath?: string; + runtimeBaselineMetricsJsonPath?: string; + runtimeBaselineScenario: string; + runtimeOnly: boolean; + sloThresholdsJsonPath: string; + metricsOutputJsonPath?: string; commitHash?: string; cosmosDbConnectionString?: string; pullRequestId?: string; @@ -95,6 +107,7 @@ const DEFAULT_PATHS = { SPOTBUGS_XML: "../backend/lined/build/reports/spotbugs/spotbugsMain.xml", SPOTBUGS_HTML: "../backend/lined/build/reports/spotbugs/spotbugsMain.html", JACOCO: "../backend/lined/build/reports/jacoco/test/jacocoTestReport.xml", + SLO_THRESHOLDS: "../backend/lined/load-tests/runtime-scenarios/slo-thresholds-v1.json", } as const; const REGEX_PATTERNS = { @@ -123,6 +136,10 @@ const readFile = (path: string): string => { return fs.readFileSync(path, "utf-8"); }; +const isEnabled = (value: string | undefined): boolean => { + return value === "true" || value === "1" || value === "yes"; +}; + const extractNumber = (content: string, pattern: RegExp, errorMsg: string): number => { const match = pattern.exec(content); if (!match?.[1]) { @@ -151,6 +168,11 @@ const getConfig = (): Config => { spotbugsHtmlPath: process.env.SPOTBUGS_HTML ?? DEFAULT_PATHS.SPOTBUGS_HTML, jacocoPath: process.env.JACOCO_XML ?? 
DEFAULT_PATHS.JACOCO, runtimeMetricsJsonPath: process.env.RUNTIME_METRICS_JSON, + runtimeBaselineMetricsJsonPath: process.env.RUNTIME_BASELINE_METRICS_JSON, + runtimeBaselineScenario: process.env.RUNTIME_BASELINE_SCENARIO ?? "fixed-medium", + runtimeOnly: isEnabled(process.env.RUNTIME_ONLY), + sloThresholdsJsonPath: process.env.SLO_THRESHOLDS_JSON ?? DEFAULT_PATHS.SLO_THRESHOLDS, + metricsOutputJsonPath: process.env.METRICS_OUTPUT_JSON, branchName: process.env.BRANCH_NAME, pullRequestId: process.env.PR_NUMBER, commitHash: process.env.GITHUB_SHA, @@ -177,8 +199,8 @@ const normalize = (main: number, current: number, higherIsBetter: boolean): numb }; const computeFitnessFunction = async ( + store: MetricsStore, config: Config, - container: ReturnType["container"]>, current: Metrics ): Promise => { const isMainBranch = config.branchName === "main"; @@ -196,24 +218,18 @@ const computeFitnessFunction = async ( const mainDuplication = toNumber(mainSonar["duplicated_lines_density"]) ?? 0; const currentDuplication = toNumber(currentSonar["duplicated_lines_density"]) ?? 0; - // SpotBugs + Checkstyle — try Cosmos DB, fall back to 0 - const query = isMainBranch - ? "SELECT * FROM c WHERE c.branch = 'main' ORDER BY c.timestamp DESC OFFSET 1 LIMIT 1" - : "SELECT * FROM c WHERE c.branch = 'main' ORDER BY c.timestamp DESC OFFSET 0 LIMIT 1"; - - const {resources} = await container.items.query(query).fetchAll(); - const snapshot = resources[0] as { metrics: Metrics } | undefined; + const snapshot = await store.findStructuralBaseline(isMainBranch); if (!snapshot) { console.log("[fitness] No main baseline in DB — using 0 for SpotBugs/Checkstyle baseline"); } - const mainSpotbugs = snapshot?.metrics.spotbugs_total ?? 0; - const mainCheckstyle = snapshot?.metrics.checkstyle_violations ?? 0; - const mainCoverage = snapshot?.metrics.jacoco_line_coverage ?? 0; + const mainSpotbugs = snapshot?.spotbugs_total ?? 0; + const mainCheckstyle = snapshot?.checkstyle_violations ?? 
0; + const mainCoverage = snapshot?.jacoco_line_coverage ?? 0; - const currentSpotbugs = current.spotbugs_total; - const currentCheckstyle = current.checkstyle_violations; + const currentSpotbugs = current.spotbugs_total ?? 0; + const currentCheckstyle = current.checkstyle_violations ?? 0; const currentCoverage = current.jacoco_line_coverage ?? 0; const F = @@ -227,41 +243,147 @@ const computeFitnessFunction = async ( return Number(F.toFixed(4)); }; +const hasStructuralMetrics = (metrics: Metrics): boolean => { + return metrics.checkstyle_violations !== undefined && + metrics.spotbugs_total !== undefined && + metrics.spotbugs_total_classes !== undefined; +}; + +const requireStructuralMetrics = (metrics: Metrics): void => { + if (!hasStructuralMetrics(metrics)) { + throw new Error( + "Structural fitness scoring requires checkstyle_violations, " + + "spotbugs_total, and spotbugs_total_classes" + ); + } +}; + +const readSloThresholds = (path: string) => parseSloThresholds(readFile(path)); + +const resolveRuntimeBaseline = async ( + config: Config, + store: MetricsStore | undefined, + currentRuntimeMetrics?: RuntimeMetrics +): Promise => { + const explicitBaseline = readRuntimeMetrics(config.runtimeBaselineMetricsJsonPath); + if (explicitBaseline) { + return explicitBaseline; + } + + if (!store || !currentRuntimeMetrics) { + return undefined; + } + + return store.findRuntimeBaseline( + config.runtimeBaselineScenario, + currentRuntimeMetrics.workload + ); +}; + /* ======================= SAVE DATA IN COSMOS DB ======================= */ const sanitizeBranchName = (name: string): string => name.replaceAll(/[/\\#?]/g, '-'); -const saveMetrics = async ( - config: Config, - container: ReturnType["container"]>, - data: Metrics, - fitnessScore: FitnessScore): Promise => { +class CosmosMetricsStore implements MetricsStore { + private readonly container: ReturnType["container"]>; + + constructor(connectionString: string) { + const client = new CosmosClient(connectionString); + 
this.container = client.database("metrics").container("pipeline-runs"); + } + + async findStructuralBaseline(isMainBranch: boolean): Promise { + const query = isMainBranch + ? "SELECT * FROM c WHERE c.branch = 'main' ORDER BY c.timestamp DESC OFFSET 1 LIMIT 1" + : "SELECT * FROM c WHERE c.branch = 'main' ORDER BY c.timestamp DESC OFFSET 0 LIMIT 1"; + const {resources} = await this.container.items.query(query).fetchAll(); + const snapshot = resources[0] as { metrics: Metrics } | undefined; + return snapshot?.metrics; + } + + async findRuntimeBaseline( + scenario: string, + workload: string + ): Promise { + const query = + "SELECT * FROM c WHERE c.branch = 'main' " + + "AND c.metrics.runtime_metrics.scenario = @scenario " + + "AND c.metrics.runtime_metrics.workload = @workload " + + "ORDER BY c.timestamp DESC OFFSET 0 LIMIT 1"; + const {resources} = await this.container.items.query({ + query, + parameters: [ + {name: "@scenario", value: scenario}, + {name: "@workload", value: workload}, + ], + }).fetchAll(); + const snapshot = resources[0] as { metrics: Metrics } | undefined; + return snapshot?.metrics.runtime_metrics; + } + + async save(document: MetricsDocument): Promise { + const {resource} = await this.container.item(document.id, document.branch).read(); + + if (resource) { + console.log(`[metrics] already saved for commit ${document.commitHash}, skipping`); + return; + } + + await this.container.items.create(document); + } +} + +const createMetricsStore = (config: Config): MetricsStore | undefined => { if (!config.cosmosDbConnectionString) { - console.log(`We cannot save metrics to the database. Missing COSMOS_DB_CONNECTION_STRING environment variable.`); - return; + return undefined; } - const partitionKey = sanitizeBranchName(config.branchName ?? 
"unknown"); - const id = `${partitionKey}-${config.commitHash}`; + return new CosmosMetricsStore(config.cosmosDbConnectionString); +}; - const {resource} = await container.item(id, partitionKey).read(); +const LOCAL_BASELINE_STORE: MetricsStore = { + async findStructuralBaseline(): Promise { + return undefined; + }, + async findRuntimeBaseline(): Promise { + return undefined; + }, + async save(): Promise { + return undefined; + }, +}; - if (resource) { - console.log(`[metrics] already saved for commit ${config.commitHash}, skipping`); - return; - } +const buildMetricsDocument = ( + config: Config, + data: Metrics, + fitnessScore: FitnessScore, + runtimeFitnessResult: RuntimeFitnessResult +): MetricsDocument => { + const branch = sanitizeBranchName(config.branchName ?? "unknown"); + const id = `${branch}-${config.commitHash ?? "unknown"}`; - await container.items.create({ + return { id, timestamp: new Date().toISOString(), - branch: partitionKey, - commitHash: process.env.GITHUB_SHA, + branch, + commitHash: config.commitHash, pullRequestId: config.pullRequestId, metrics: data, - fitnessScore - }); + fitnessScore, + runtimeFitnessScore: runtimeFitnessResult.runtimeFitnessScore, + runtimeFitnessScoreVersion: runtimeFitnessResult.runtimeFitnessScoreVersion, + runtimeFitness: runtimeFitnessResult.runtimeFitness, + }; +}; + +export const writeMetricsOutput = (path: string | undefined, document: MetricsDocument): void => { + if (!path || path.trim() === "") { + return; + } + + fs.writeFileSync(path, `${JSON.stringify(document, null, 2)}\n`, "utf-8"); } /* ======================= @@ -465,7 +587,8 @@ const validateMetrics = (metrics: Metrics): Result => { return { metrics, checkstyle_valid: true, - spotbugs_valid: metrics.spotbugs_total_classes > 0, + spotbugs_valid: metrics.spotbugs_total_classes === undefined || + metrics.spotbugs_total_classes > 0, }; }; @@ -693,14 +816,28 @@ const collectMetrics = async (config: Config): Promise => { return metrics; }; +export const 
collectRuntimeOnlyMetrics = (config: Config): Metrics => { + const runtimeMetrics = readRuntimeMetrics(config.runtimeMetricsJsonPath); + if (!runtimeMetrics) { + throw new Error("RUNTIME_ONLY=true requires RUNTIME_METRICS_JSON"); + } + + return { + runtime_metrics: runtimeMetrics, + }; +}; + /* ======================= MAIN ======================= */ const main = async (): Promise => { try { const config: Config = getConfig(); - const metrics: Metrics = await collectMetrics(config); + const metrics: Metrics = config.runtimeOnly + ? collectRuntimeOnlyMetrics(config) + : await collectMetrics(config); const result: Result = validateMetrics(metrics); + const store = createMetricsStore(config); console.log(JSON.stringify(result, null, 2)); @@ -712,18 +849,39 @@ const main = async (): Promise => { process.exit(EXIT_CODES.SPOTBUGS_INVALID); } - if (!config.cosmosDbConnectionString) { - console.log(`We cannot save metrics to the database. Missing COSMOS_DB_CONNECTION_STRING environment variable.`); - return; - } + const runtimeBaseline = await resolveRuntimeBaseline( + config, + store, + metrics.runtime_metrics + ); + const sloClassification = metrics.runtime_metrics + ? classifyRuntimeMetrics(metrics.runtime_metrics, readSloThresholds(config.sloThresholdsJsonPath)) + : undefined; + const runtimeFitnessResult = computeRuntimeFitness( + metrics.runtime_metrics, + runtimeBaseline, + sloClassification + ); - const client = new CosmosClient(config.cosmosDbConnectionString); - const container = client.database("metrics").container("pipeline-runs"); + if (!config.runtimeOnly) { + requireStructuralMetrics(metrics); + } - const fitnessScore = await computeFitnessFunction(config, container, metrics); + const fitnessScore = config.runtimeOnly + ? null + : await computeFitnessFunction(store ?? 
LOCAL_BASELINE_STORE, config, metrics); + const document = buildMetricsDocument(config, metrics, fitnessScore, runtimeFitnessResult); console.log(`[fitness] F = ${fitnessScore}`); - await saveMetrics(config, container, metrics, fitnessScore); + console.log(`[fitness] runtime F = ${runtimeFitnessResult.runtimeFitnessScore}`); + writeMetricsOutput(config.metricsOutputJsonPath, document); + + if (!store) { + console.log(`We cannot save metrics to the database. Missing COSMOS_DB_CONNECTION_STRING environment variable.`); + return; + } + + await store.save(document); } catch (error) { console.error("Fatal error:", error instanceof Error ? error.message : error); process.exit(1); diff --git a/fitness-metrics-collector/scripts/runtimeScoring.ts b/fitness-metrics-collector/scripts/runtimeScoring.ts new file mode 100644 index 0000000..243ab20 --- /dev/null +++ b/fitness-metrics-collector/scripts/runtimeScoring.ts @@ -0,0 +1,462 @@ +export const RUNTIME_FITNESS_SCORE_VERSION = "runtime-aware-v1"; + +export type RuntimeMetricSummary = { + latency_p95_ms?: number; + latency_p99_ms?: number; + error_rate?: number; + throughput_rps?: number; + availability?: number; + restart_count?: number; + cpu_utilization?: number; + memory_utilization?: number; + hpa_desired_replicas?: number; + hpa_current_replicas?: number; +}; + +export type RuntimeMetrics = { + schema_version: 1; + scenario: string; + workload: string; + source: string; + summary: RuntimeMetricSummary; + missing?: string[]; +}; + +type RuntimeFitnessScore = number | null; + +type RuntimeFitnessMetric = keyof Pick< + RuntimeMetricSummary, + | "latency_p95_ms" + | "latency_p99_ms" + | "error_rate" + | "throughput_rps" + | "availability" + | "restart_count" + | "cpu_utilization" + | "memory_utilization" +>; + +type RuntimeScoreMetricResult = { + baseline: number; + current: number; + normalizedDelta: number; + weight: number; +}; + +type RuntimeSloClassification = "valid" | "warning" | "invalid" | "unknown"; + +type 
RuntimeSloConstraintResult = { + id: string; + metric?: string; + evidenceSource?: string; + classification: RuntimeSloClassification; + severity: "invalid" | "warning"; + missing: boolean; +}; + +export type RuntimeSloResult = { + thresholdVersion: string; + constraints: RuntimeSloConstraintResult[]; + hasInvalidHardConstraint: boolean; + hasUnknownHardConstraint: boolean; + eligibleForStableComparison: boolean; +}; + +export type RuntimeFitnessMetadata = { + current: { + scenario: string; + workload: string; + source: string; + }; + baseline?: { + scenario: string; + workload: string; + source: string; + }; + activeMetricWeights: Partial>; + missingMetrics: string[]; + normalizedDeltas: Partial>; + sloClassification?: RuntimeSloResult; + eligibleForStableComparison: boolean; + reason?: string; +}; + +export type RuntimeFitnessResult = { + runtimeFitnessScore: RuntimeFitnessScore; + runtimeFitnessScoreVersion: typeof RUNTIME_FITNESS_SCORE_VERSION; + runtimeFitness?: RuntimeFitnessMetadata; +}; + +type ThresholdRule = { + id: string; + metric?: string; + evidence_source?: string; + operator: "<=" | ">=" | "==" | ">"; + value: number | boolean; + severity: "invalid" | "warning"; +}; + +export type SloThresholdDocument = { + threshold_version: string; + thresholds: ThresholdRule[]; +}; + +type RuntimeScoreDefinition = { + field: RuntimeFitnessMetric; + weight: number; + higherIsBetter: boolean; +}; + +const SUPPORTED_THRESHOLD_OPERATORS = ["<=", ">=", "==", ">"] as const; +const SUPPORTED_THRESHOLD_SEVERITIES = ["invalid", "warning"] as const; + +const RUNTIME_SCORE_DEFINITIONS: readonly RuntimeScoreDefinition[] = [ + {field: "latency_p95_ms", weight: 0.2, higherIsBetter: false}, + {field: "latency_p99_ms", weight: 0.15, higherIsBetter: false}, + {field: "error_rate", weight: 0.2, higherIsBetter: false}, + {field: "throughput_rps", weight: 0.15, higherIsBetter: true}, + {field: "availability", weight: 0.15, higherIsBetter: true}, + {field: "restart_count", weight: 
0.1, higherIsBetter: false}, + {field: "cpu_utilization", weight: 0.025, higherIsBetter: false}, + {field: "memory_utilization", weight: 0.025, higherIsBetter: false}, +]; + +const isRecord = (value: unknown): value is Record => { + return typeof value === "object" && value !== null && !Array.isArray(value); +}; + +const requireString = (value: unknown, field: string): string => { + if (typeof value !== "string" || value.trim() === "") { + throw new Error(`SLO thresholds: ${field} must be a non-empty string`); + } + + return value; +}; + +const optionalString = (value: unknown, field: string): string | undefined => { + return value === undefined ? undefined : requireString(value, field); +}; + +const requireNumberOrBoolean = (value: unknown, field: string): number | boolean => { + if (typeof value === "number" && Number.isFinite(value)) { + return value; + } + + if (typeof value === "boolean") { + return value; + } + + throw new Error(`SLO thresholds: ${field} must be a finite number or boolean`); +}; + +const requireThresholdOperator = (value: unknown, field: string): ThresholdRule["operator"] => { + const operator = requireString(value, field); + if (!SUPPORTED_THRESHOLD_OPERATORS.includes(operator as ThresholdRule["operator"])) { + throw new Error(`SLO thresholds: ${field} is unsupported`); + } + + return operator as ThresholdRule["operator"]; +}; + +const requireThresholdSeverity = (value: unknown, field: string): ThresholdRule["severity"] => { + const severity = requireString(value, field); + if (!SUPPORTED_THRESHOLD_SEVERITIES.includes(severity as ThresholdRule["severity"])) { + throw new Error(`SLO thresholds: ${field} is unsupported`); + } + + return severity as ThresholdRule["severity"]; +}; + +const parseThresholdRule = (value: unknown, index: number): ThresholdRule => { + if (!isRecord(value)) { + throw new Error(`SLO thresholds: thresholds[${index}] must be an object`); + } + + return { + id: requireString(value.id, `thresholds[${index}].id`), + metric: 
optionalString(value.metric, `thresholds[${index}].metric`), + evidence_source: optionalString( + value.evidence_source, + `thresholds[${index}].evidence_source` + ), + operator: requireThresholdOperator(value.operator, `thresholds[${index}].operator`), + value: requireNumberOrBoolean(value.value, `thresholds[${index}].value`), + severity: requireThresholdSeverity(value.severity, `thresholds[${index}].severity`), + }; +}; + +export const parseSloThresholds = (content: string): SloThresholdDocument => { + const parsed: unknown = JSON.parse(content); + if (!isRecord(parsed)) { + throw new Error("SLO thresholds JSON must contain an object"); + } + + if (!Array.isArray(parsed.thresholds)) { + throw new Error("SLO thresholds: thresholds must be an array"); + } + + return { + threshold_version: requireString(parsed.threshold_version, "threshold_version"), + thresholds: parsed.thresholds.map(parseThresholdRule), + }; +}; + +const compareThreshold = ( + current: number | boolean, + operator: ThresholdRule["operator"], + expected: number | boolean +): boolean => { + if (typeof current === "boolean" || typeof expected === "boolean") { + return operator === "==" && current === expected; + } + + if (operator === "<=") return current <= expected; + if (operator === ">=") return current >= expected; + if (operator === ">") return current > expected; + return current === expected; +}; + +const classifyThresholdMatch = ( + threshold: ThresholdRule, + missing: boolean, + matched: boolean +): RuntimeSloClassification => { + if (missing) { + return "unknown"; + } + + if (threshold.severity === "warning") { + return matched ? "warning" : "valid"; + } + + return matched ? 
"valid" : "invalid"; +}; + +const classifyThreshold = ( + runtimeMetrics: RuntimeMetrics, + threshold: ThresholdRule +): RuntimeSloConstraintResult => { + const field = threshold.metric as keyof RuntimeMetricSummary | undefined; + // Evidence-source rules, such as readiness, intentionally stay unknown until + // the runtime summary schema exposes a matching summarized metric field. + const current = field === undefined ? undefined : runtimeMetrics.summary[field]; + const missing = current === undefined; + const matched = !missing && compareThreshold( + current, + threshold.operator, + threshold.value + ); + + return { + id: threshold.id, + metric: threshold.metric, + evidenceSource: threshold.evidence_source, + classification: classifyThresholdMatch(threshold, missing, matched), + severity: threshold.severity, + missing, + }; +}; + +const hasHardConstraint = ( + constraints: RuntimeSloConstraintResult[], + classification: RuntimeSloClassification +): boolean => constraints.some((constraint) => + constraint.severity === "invalid" && constraint.classification === classification +); + +export const classifyRuntimeMetrics = ( + runtimeMetrics: RuntimeMetrics, + thresholds: SloThresholdDocument +): RuntimeSloResult => { + const constraints = thresholds.thresholds.map((threshold) => + classifyThreshold(runtimeMetrics, threshold) + ); + const hasInvalidHardConstraint = hasHardConstraint(constraints, "invalid"); + const hasUnknownHardConstraint = hasHardConstraint(constraints, "unknown"); + + return { + thresholdVersion: thresholds.threshold_version, + constraints, + hasInvalidHardConstraint, + hasUnknownHardConstraint, + eligibleForStableComparison: !hasInvalidHardConstraint && !hasUnknownHardConstraint, + }; +}; + +const identityOf = (runtimeMetrics: RuntimeMetrics): RuntimeFitnessMetadata["current"] => ({ + scenario: runtimeMetrics.scenario, + workload: runtimeMetrics.workload, + source: runtimeMetrics.source, +}); + +const collectMissingRuntimeScoreMetrics = ( + 
current: RuntimeMetrics, + baseline?: RuntimeMetrics +): string[] => { + const missing = new Set(current.missing ?? []); + + for (const definition of RUNTIME_SCORE_DEFINITIONS) { + if (current.summary[definition.field] === undefined) { + missing.add(`current.summary.${definition.field}`); + } + if (baseline && baseline.summary[definition.field] === undefined) { + missing.add(`baseline.summary.${definition.field}`); + } + } + + return [...missing].sort(); +}; + +const buildRuntimeFitnessMetadata = ( + current: RuntimeMetrics, + baseline: RuntimeMetrics | undefined, + sloClassification: RuntimeSloResult | undefined +): RuntimeFitnessMetadata => { + return { + current: identityOf(current), + baseline: baseline ? identityOf(baseline) : undefined, + activeMetricWeights: {}, + missingMetrics: collectMissingRuntimeScoreMetrics(current, baseline), + normalizedDeltas: {}, + sloClassification, + eligibleForStableComparison: Boolean( + baseline && sloClassification?.eligibleForStableComparison + ), + }; +}; + +const comparableRuntimeScoreDefinitions = ( + current: RuntimeMetrics, + baseline: RuntimeMetrics +): readonly RuntimeScoreDefinition[] => RUNTIME_SCORE_DEFINITIONS.filter((definition) => + current.summary[definition.field] !== undefined && + baseline.summary[definition.field] !== undefined +); + +const clamp = (value: number, min: number, max: number): number => + Math.max(min, Math.min(max, value)); + +const normalizeRuntimeDelta = ( + baseline: number, + current: number, + higherIsBetter: boolean +): number => { + if (baseline === 0 && current === 0) return 0; + if (baseline === 0) { + if (higherIsBetter) return current > 0 ? 1 : -1; + return current > 0 ? -1 : 1; + } + const delta = higherIsBetter + ? 
(current - baseline) / baseline + : (baseline - current) / baseline; + return clamp(delta, -1, 1); +}; + +const addRuntimeScoreMetric = ( + current: RuntimeMetrics, + baseline: RuntimeMetrics, + definition: RuntimeScoreDefinition, + totalWeight: number, + activeMetricWeights: Partial>, + normalizedDeltas: Partial> +): number => { + const currentValue = current.summary[definition.field] as number; + const baselineValue = baseline.summary[definition.field] as number; + const activeWeight = definition.weight / totalWeight; + const normalizedDelta = normalizeRuntimeDelta( + baselineValue, + currentValue, + definition.higherIsBetter + ); + const roundedWeight = Number(activeWeight.toFixed(6)); + + activeMetricWeights[definition.field] = roundedWeight; + normalizedDeltas[definition.field] = { + baseline: baselineValue, + current: currentValue, + normalizedDelta, + weight: roundedWeight, + }; + + return activeWeight * normalizedDelta; +}; + +const computeRuntimeScoreDetails = ( + current: RuntimeMetrics, + baseline: RuntimeMetrics, + activeDefinitions: readonly RuntimeScoreDefinition[] +): Pick & { + score: number; +} => { + const activeMetricWeights: Partial> = {}; + const normalizedDeltas: Partial> = {}; + const totalWeight = activeDefinitions.reduce((sum, definition) => sum + definition.weight, 0); + let score = 0; + + for (const definition of activeDefinitions) { + score += addRuntimeScoreMetric( + current, + baseline, + definition, + totalWeight, + activeMetricWeights, + normalizedDeltas + ); + } + + return { + score, + activeMetricWeights, + normalizedDeltas, + }; +}; + +export const computeRuntimeFitness = ( + current?: RuntimeMetrics, + baseline?: RuntimeMetrics, + sloClassification?: RuntimeSloResult +): RuntimeFitnessResult => { + if (!current) { + return { + runtimeFitnessScore: null, + runtimeFitnessScoreVersion: RUNTIME_FITNESS_SCORE_VERSION, + }; + } + + const metadataBase = buildRuntimeFitnessMetadata(current, baseline, sloClassification); + + if 
(!baseline) { + return { + runtimeFitnessScore: null, + runtimeFitnessScoreVersion: RUNTIME_FITNESS_SCORE_VERSION, + runtimeFitness: { + ...metadataBase, + reason: "runtime baseline metrics are not available", + }, + }; + } + + const activeDefinitions = comparableRuntimeScoreDefinitions(current, baseline); + if (activeDefinitions.length === 0) { + return { + runtimeFitnessScore: null, + runtimeFitnessScoreVersion: RUNTIME_FITNESS_SCORE_VERSION, + runtimeFitness: { + ...metadataBase, + reason: "no comparable runtime metrics are available", + }, + }; + } + + const scoreDetails = computeRuntimeScoreDetails(current, baseline, activeDefinitions); + + return { + runtimeFitnessScore: Number(scoreDetails.score.toFixed(4)), + runtimeFitnessScoreVersion: RUNTIME_FITNESS_SCORE_VERSION, + runtimeFitness: { + ...metadataBase, + activeMetricWeights: scoreDetails.activeMetricWeights, + normalizedDeltas: scoreDetails.normalizedDeltas, + }, + }; +};