agentflow/.github/workflows/ci.yml at main · brownjuly2003-code/agentflow · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# Continuous-integration pipeline: triggered by pushes to main and by pull
# requests that target main.
name: CI

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# Workflow-wide token scope; a job that needs more declares its own block.
permissions:
  contents: read

# At most one active run per ref: a newer run on the same ref cancels the
# one still in progress.
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Static-analysis gate (Ruff lint, Ruff format check, mypy). Several other
  # jobs declare `needs: lint`, so they start only after this job succeeds.
  lint:
    runs-on: ubuntu-latest
    # Explicit bound: without it a hung step holds the runner, and every job
    # waiting on `needs: lint`, until GitHub's 360-minute default expires.
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
      - uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: pip install -e ".[dev]"
      - name: Ruff check
        run: ruff check src/ tests/ scripts/
      - name: Ruff format check
        run: ruff format --check src/ tests/ scripts/
      - name: Type check
        run: mypy src/ --ignore-missing-imports

  # Schema-evolution gate: runs scripts/check_schema_evolution.py once lint
  # has passed.
  schema-check:
    runs-on: ubuntu-latest
    needs: lint
    # Explicit bound so a stalled install or script cannot hold the runner for
    # GitHub's 360-minute default.
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          # Two commits of history instead of the default single commit.
          # NOTE(review): presumably the script compares against the previous
          # revision — confirm in scripts/check_schema_evolution.py.
          fetch-depth: 2
      - uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: pip install -e ".[dev]"
      - name: Check schema evolution
        run: python scripts/check_schema_evolution.py

  # Unit and property tests with a global coverage floor, followed by
  # per-module coverage gates and a Codecov upload of coverage.xml.
  test-unit:
    runs-on: ubuntu-latest
    needs: lint
    # This job runs the full suite plus ten per-module gate reruns; bound it
    # explicitly so a hung test cannot hold the runner for GitHub's
    # 360-minute default.
    timeout-minutes: 30
    permissions:
      contents: read
      # Needed by the Codecov upload step, which authenticates via OIDC
      # (`use_oidc: true`).
      id-token: write
    steps:
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
      - uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: |
          pip install -e ".[dev,cloud]"
          pip install -e "./sdk"
          pip install -e "./integrations[mcp]"
      - name: Prepare pytest temp directory
        run: mkdir -p .tmp
      - name: Run unit and property tests with coverage
        run: |
          # Full src/sdk baseline floor; changed-code coverage stays at 80% via Codecov patch status.
          # --cov-branch turns the floor into a combined line+branch metric — local baseline
          # 2026-05-25 is 62% (7716 lines / 2010 branches measured on HEAD `22b1be9`), so the
          # 60% gate stays passing with a 2pp cushion. Raise the gate once that cushion grows.
          python -m pytest tests/unit/ tests/property/ -v --tb=short --cov=src --cov=sdk --cov-branch --cov-report=xml --cov-report=term-missing --cov-fail-under=60
      - name: Run quality validators coverage gate
        run: |
          python -m pytest tests/unit/test_validators.py -v --tb=short --cov=src.quality.validators --cov-report=term-missing --cov-fail-under=90
      - name: Run freshness monitor coverage gate
        run: |
          python -m pytest tests/unit/test_freshness_monitor.py -v --tb=short --cov=src.quality.monitors.freshness_monitor --cov-report=term-missing --cov-fail-under=90
      - name: Run event producer coverage gate
        run: |
          python -m pytest tests/unit/test_event_producer.py -v --tb=short --cov=src.ingestion.producers.event_producer --cov-report=term-missing --cov-fail-under=90
      - name: Run SQL guard coverage gate
        run: |
          # Security-critical NL->SQL allowlist/denylist guard; local module
          # coverage is 100%, so the 90% gate keeps a 10pp regression cushion.
          python -m pytest tests/unit/test_sql_guard.py -v --tb=short --cov=src.serving.semantic_layer.sql_guard --cov-report=term-missing --cov-fail-under=90
      - name: Run PII masking coverage gate
        run: |
          # Security-critical PII masker (email/phone/address/name + query-result
          # masking); local module coverage is 99%, so the 90% gate keeps a ~9pp
          # regression cushion on a mutmut target.
          python -m pytest tests/unit/test_masking.py -v --tb=short --cov=src.serving.masking --cov-report=term-missing --cov-fail-under=90
      - name: Run rate limiter coverage gate
        run: |
          # Security-critical sliding-window rate limiter (Redis + in-memory
          # fail-open fallback); local module coverage is 98% (only the optional
          # redis auto-construct line is env-gated), so the 90% gate keeps a
          # cushion on a mutmut target.
          python -m pytest tests/unit/test_rate_limiter.py -v --tb=short --cov=src.serving.api.rate_limiter --cov-report=term-missing --cov-fail-under=90
      - name: Run auth manager coverage gate
        run: |
          # Security-critical auth manager (key match/verify, tenant isolation,
          # rate-limit/failed-auth windows, rotation grace) and a mutmut target;
          # the gate runs its dedicated unit files. Module coverage is 94% so the
          # 90% gate keeps a cushion; the remaining gap is the platform-divergent
          # SIGHUP handler and bcrypt rotation paths the integration/e2e auth
          # suites cover.
          #
          # NOTE: unlike the other per-module gates this uses `coverage run` +
          # `coverage report --include`, NOT `pytest --cov=<module>`. The auth
          # manager pulls in duckdb (usage table), and pytest-cov's source
          # instrumentation of a duckdb-importing module trips duckdb's lazy
          # `_duckdb._sqltypes` import at COLLECTION time, both locally and on CI
          # runners. `coverage run` imports duckdb normally and avoids the break.
          python -m coverage run -m pytest tests/unit/test_auth.py tests/unit/test_auth_manager_pure_logic.py tests/unit/test_auth_manager_memory_bounds.py tests/unit/test_auth_hashed_key_guidance.py tests/unit/test_auth_argon2_lookup.py -p no:schemathesis
          python -m coverage report --include="*/serving/api/auth/manager.py" --show-missing --fail-under=90
      - name: Run key rotation coverage gate
        run: |
          # Security-critical key-rotation lifecycle (create/rotate/revoke,
          # grace-period scheduling, rotation status) and a mutmut target. Like
          # the auth manager gate it pulls in duckdb, so it uses coverage run +
          # coverage report --include (not pytest --cov) to avoid the
          # duckdb _duckdb._sqltypes collection break. Module coverage is 93%.
          python -m coverage run -m pytest tests/unit/test_key_rotation.py -p no:schemathesis
          python -m coverage report --include="*/serving/api/auth/key_rotation.py" --show-missing --fail-under=90
      - name: Run outbox coverage gate
        run: |
          # Security/reliability-critical at-least-once outbox dispatch loop
          # (delivery, retry/backoff, poison-to-failed, mark-sent transactions)
          # and a mutmut target. Imports duckdb, so it uses coverage run +
          # coverage report --include like the auth gates. Module coverage is
          # 92% across the two dedicated unit files.
          python -m coverage run -m pytest tests/unit/test_outbox_processor.py tests/unit/test_outbox_connection_guard.py -p no:schemathesis
          python -m coverage report --include="*/processing/outbox.py" --show-missing --fail-under=90
      - name: Run query package coverage gate
        run: |
          # The NL->SQL orchestration surface (engine, entity/metric/NL query
          # mixins, SQL builder) and a mutmut target set; the old single-file
          # query_engine.py is a re-export shim, so the gate spans the whole
          # query package. The engine imports duckdb, so it uses coverage run +
          # coverage report --include like the auth/outbox gates. Package
          # coverage is 97% across the five dedicated unit files; the gap is
          # the OTel span-recording branches the integration suites cover.
          python -m coverage run -m pytest tests/unit/test_query_engine.py tests/unit/test_query_engine_injection.py tests/unit/test_query_engine_mixin_contracts.py tests/unit/test_paginated_nl_query.py tests/unit/test_query_package_logic.py -p no:schemathesis
          python -m coverage report --include="*/serving/semantic_layer/query/*" --show-missing --fail-under=90
      # Uploads the coverage.xml written by the first pytest step (the only
      # step that requests --cov-report=xml). Upload errors do not fail CI.
      - name: Upload coverage
        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f # v7.0.0
        with:
          files: coverage.xml
          use_oidc: true
          fail_ci_if_error: false

  # Integration suite against live Kafka (single combined broker/controller
  # node) and ClickHouse service containers published on the runner's
  # localhost.
  test-integration:
    runs-on: ubuntu-latest
    needs: lint
    # Service containers can stall on startup; bound the job instead of
    # relying on GitHub's 360-minute default.
    timeout-minutes: 30
    services:
      kafka:
        image: confluentinc/cp-kafka:7.7.0
        ports:
          - 9092:9092
        env:
          KAFKA_NODE_ID: 1
          KAFKA_PROCESS_ROLES: broker,controller
          KAFKA_CONTROLLER_QUORUM_VOTERS: 1@localhost:29093
          KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:29093
          KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092
          KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
          KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,CONTROLLER:PLAINTEXT
          KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
          CLUSTER_ID: "CITestCluster01"
        # Readiness is proven by a real broker API round-trip inside the
        # container. A host-side TCP probe alone can succeed as soon as Docker
        # publishes the port, before the broker is serving requests. GitHub
        # waits for this check to report healthy before running any step.
        options: >-
          --health-cmd "kafka-broker-api-versions --bootstrap-server localhost:9092"
          --health-interval 10s
          --health-timeout 10s
          --health-retries 12
          --health-start-period 15s
      clickhouse:
        # Live coverage for the ClickHouse serving backend's sqlglot
        # transpile path (H-C2); test_clickhouse_backend_live.py skips
        # itself when CLICKHOUSE_LIVE_HOST is absent.
        image: clickhouse/clickhouse-server:25.3
        ports:
          - 8123:8123
        env:
          CLICKHOUSE_USER: agentflow
          CLICKHOUSE_PASSWORD: agentflow
          CLICKHOUSE_DB: agentflow
    steps:
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
      - uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: |
          pip install -e ".[dev,cloud]"
          pip install -e "./sdk"
      - name: Prepare pytest temp directory
        run: mkdir -p .tmp
      # Cheap host-side check that the published port is reachable; broker
      # readiness itself is established by the service health check.
      - name: Wait for Kafka
        run: |
          timeout 30 bash -c 'until nc -z localhost 9092; do sleep 1; done'
      # HTTP-level probe: succeeds only once ClickHouse answers /ping.
      - name: Wait for ClickHouse
        run: |
          timeout 60 bash -c 'until curl -sf http://localhost:8123/ping; do sleep 1; done'
      - name: Run integration tests
        env:
          # Credentials mirror the clickhouse service container's env above;
          # setting CLICKHOUSE_LIVE_HOST enables the live ClickHouse test.
          CLICKHOUSE_LIVE_HOST: localhost
          CLICKHOUSE_LIVE_PORT: "8123"
          CLICKHOUSE_LIVE_USER: agentflow
          CLICKHOUSE_LIVE_PASSWORD: agentflow
          CLICKHOUSE_LIVE_DATABASE: agentflow
        # `python -m pytest` matches every other pytest invocation in this
        # workflow, so all suites resolve imports the same way.
        run: python -m pytest tests/integration/ -v --tb=short

  # Runs the Helm values live-validation test in its own job, with the Helm
  # and kind tooling set up on the runner first. Starts after lint passes and
  # is capped at 8 minutes.
  helm-schema-live:
    runs-on: ubuntu-latest
    needs: lint
    timeout-minutes: 8
    steps:
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
      - uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: pip install -e ".[dev]"
      - uses: azure/setup-helm@9bc31f4ebc9c6b171d7bfbaa5d006ae7abdb4310 # v5.0.1
      - uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0
        with:
          # NOTE(review): per the action's `install_only` input this should
          # install the kind binary without creating a cluster; presumably the
          # test manages any cluster it needs — confirm against the test file.
          install_only: true
      - name: Prepare pytest temp directory
        run: mkdir -p .tmp
      # `-m integration` restricts the run to tests carrying the `integration`
      # marker within this single file.
      - name: Run Helm schema live validation
        run: python -m pytest tests/integration/test_helm_values_live_validation.py -v -m integration --tb=short

  # Performance regression gate: runs only after both test-unit and
  # test-integration succeed, and is capped at 20 minutes. It runs the
  # benchmark, converts the Markdown report to JSON, and hands that JSON plus
  # the committed baseline to scripts/check_performance.py.
  perf-check:
    runs-on: ubuntu-latest
    needs:
      - test-unit
      - test-integration
    timeout-minutes: 20
    steps:
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
      - uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: pip install -e ".[dev,load,cloud]"
      # NOTE(review): presumably this writes docs/benchmark.md, which the next
      # step reads — confirm in scripts/run_benchmark.py.
      - name: Run benchmark
        run: python scripts/run_benchmark.py
      # Inline script: collects the lines of docs/benchmark.md that start with
      # `|`, skips the first two of them (table header and separator), and
      # parses every remaining row that has exactly 8 columns (endpoint,
      # requests, failures, failure rate, RPS, p50, p95, p99); rows with any
      # other column count are ignored. The row named `ALL` becomes the
      # `aggregate` entry and is required; all other rows go under
      # `endpoints`. `generated_at` is taken from a "Generated: `...`" marker
      # when present, otherwise null. The result is written to
      # /tmp/current.json. The job fails if fewer than three table lines are
      # found or the `ALL` row is missing.
      - name: Convert benchmark report to JSON
        run: |
          python - <<'PY'
          import json
          import re
          from pathlib import Path

          report_path = Path("docs/benchmark.md")
          report = report_path.read_text(encoding="utf-8")
          lines = [line.strip() for line in report.splitlines() if line.startswith("|")]
          if len(lines) < 3:
              raise SystemExit("Benchmark results table not found in docs/benchmark.md")

          generated_at_match = re.search(r"Generated: `([^`]+)`", report)
          endpoints = {}
          for line in lines[2:]:
              columns = [column.strip() for column in line.strip("|").split("|")]
              if len(columns) != 8:
                  continue
              endpoint, requests, failures, failure_rate, rps, p50, p95, p99 = columns
              endpoints[endpoint] = {
                  "request_count": int(requests),
                  "failure_count": int(failures),
                  "failure_rate_percent": float(failure_rate.removesuffix("%")),
                  "requests_per_second": float(rps),
                  "p50_latency_ms": float(p50.removesuffix(" ms")),
                  "p95_latency_ms": float(p95.removesuffix(" ms")),
                  "p99_latency_ms": float(p99.removesuffix(" ms")),
              }

          aggregate = endpoints.pop("ALL", None)
          if aggregate is None:
              raise SystemExit("Missing ALL aggregate row in benchmark report.")

          current_report = {
              "generated_at": generated_at_match.group(1) if generated_at_match else None,
              "source": str(report_path),
              "aggregate": aggregate,
              "endpoints": endpoints,
          }
          Path("/tmp/current.json").write_text(
              json.dumps(current_report, indent=2) + "\n",
              encoding="utf-8",
          )
          PY
      # Arguments are the committed baseline followed by the JSON produced by
      # the previous step; the pass/fail thresholds live in the script itself.
      - name: Compare to baseline
        run: python scripts/check_performance.py docs/benchmark-baseline.json /tmp/current.json

  # Terraform hygiene: recursive formatting check, then a backend-less init
  # and validate of the module rooted at infrastructure/terraform. This job
  # has no `needs`, so it runs in parallel with lint.
  terraform-validate:
    runs-on: ubuntu-latest
    # `terraform init` reaches out to the network; bound the job instead of
    # relying on GitHub's 360-minute default.
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
      - uses: hashicorp/setup-terraform@dfe3c3f87815947d99a8997f908cb6525fc44e9e # v4.0.1
        with:
          terraform_version: "1.8.0"
      - name: Terraform fmt check
        run: terraform fmt -check -recursive infrastructure/terraform/
      # `working-directory` replaces an in-script `cd`, so the step cannot run
      # Terraform from the wrong directory if the `cd` were ever to fail.
      - name: Terraform init
        working-directory: infrastructure/terraform
        # -backend=false skips backend initialization for this validation run.
        run: terraform init -backend=false
      - name: Terraform validate
        working-directory: infrastructure/terraform
        run: terraform validate