diff --git a/.github/actions/k3s-setup/action.yml b/.github/actions/k3s-setup/action.yml
new file mode 100644
index 00000000..d21c4a43
--- /dev/null
+++ b/.github/actions/k3s-setup/action.yml
@@ -0,0 +1,57 @@
+name: 'K3s Setup'
+description: 'Install k3s and create a kubeconfig reachable from Docker containers'
+
+inputs:
+ namespace:
+ description: 'Kubernetes namespace to create'
+ required: false
+ default: 'integr8scode'
+ kubeconfig-path:
+ description: 'Path to write the Docker-accessible kubeconfig'
+ required: false
+ default: 'backend/kubeconfig.yaml'
+
+outputs:
+ kubeconfig:
+ description: 'Path to the kubeconfig file for Docker containers'
+ value: ${{ inputs.kubeconfig-path }}
+
+runs:
+ using: 'composite'
+ steps:
+ - name: Install k3s
+ shell: bash
+ run: |
+ # --bind-address 0.0.0.0: Listen on all interfaces so Docker containers can reach it
+ # --tls-san host.docker.internal: add it to the API server cert SANs so Docker containers can connect
+ curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --bind-address 0.0.0.0 --tls-san host.docker.internal" sh -
+ mkdir -p /home/runner/.kube
+ sudo k3s kubectl config view --raw > /home/runner/.kube/config
+ sudo chmod 600 /home/runner/.kube/config
+
+ - name: Wait for k3s to be ready
+ shell: bash
+ run: |
+ export KUBECONFIG=/home/runner/.kube/config
+ timeout 90 bash -c 'until kubectl cluster-info; do sleep 5; done'
+
+ - name: Create namespace
+ shell: bash
+ env:
+ NAMESPACE: ${{ inputs.namespace }}
+ run: |
+ export KUBECONFIG=/home/runner/.kube/config
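+ # Pipe the dry-run manifest through apply so this step stays idempotent when the namespace already exists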
+ kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
+
+ - name: Create kubeconfig for Docker containers
+ shell: bash
+ env:
+ KUBECONFIG_PATH: ${{ inputs.kubeconfig-path }}
+ run: |
+ # Replace 127.0.0.1/0.0.0.0 with host.docker.internal for container access
+ # (k3s may use 0.0.0.0 when started with --bind-address 0.0.0.0)
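+ # Note: on Linux, containers only resolve host.docker.internal if it is mapped
+ # (e.g. extra_hosts: "host.docker.internal:host-gateway"); this action assumes
+ # the compose stack provides that mapping.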
+ sed -E 's#https://(127\.0\.0\.1|0\.0\.0\.0):6443#https://host.docker.internal:6443#g' \
+ /home/runner/.kube/config > "$KUBECONFIG_PATH"
+ chmod 644 "$KUBECONFIG_PATH"
+ echo "Kubeconfig written to $KUBECONFIG_PATH"
+ echo "Server URL: $(grep server "$KUBECONFIG_PATH" | head -1)"
diff --git a/.github/workflows/backend-ci.yml b/.github/workflows/backend-ci.yml
deleted file mode 100644
index ece0f874..00000000
--- a/.github/workflows/backend-ci.yml
+++ /dev/null
@@ -1,204 +0,0 @@
-name: Backend CI
-
-on:
- push:
- branches: [main, dev]
- paths:
- - 'backend/**'
- - '.github/workflows/backend-ci.yml'
- - 'docker-compose.ci.yaml'
- pull_request:
- branches: [main, dev]
- paths:
- - 'backend/**'
- - '.github/workflows/backend-ci.yml'
- - 'docker-compose.ci.yaml'
- workflow_dispatch:
-
-# Pin image versions for cache key consistency
-env:
- MONGO_IMAGE: mongo:8.0
- REDIS_IMAGE: redis:7-alpine
- KAFKA_IMAGE: apache/kafka:3.9.0
- SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.5.0
-
-jobs:
- unit:
- name: Unit Tests
- runs-on: ubuntu-latest
-
- steps:
- - uses: actions/checkout@v6
-
- - name: Set up uv
- uses: astral-sh/setup-uv@v7
- with:
- enable-cache: true
- cache-dependency-glob: "backend/uv.lock"
-
- - name: Install Python dependencies
- run: |
- cd backend
- uv python install 3.12
- uv sync --frozen
-
- - name: Run unit tests
- timeout-minutes: 5
- run: |
- cd backend
- uv run pytest tests/unit -v -rs \
- --durations=0 \
- --cov=app \
- --cov-report=xml --cov-report=term
-
- - name: Upload coverage to Codecov
- uses: codecov/codecov-action@v5
- if: always()
- with:
- token: ${{ secrets.CODECOV_TOKEN }}
- files: backend/coverage.xml
- flags: backend-unit
- name: backend-unit-coverage
- fail_ci_if_error: false
- verbose: true
-
- integration:
- name: Integration Tests
- runs-on: ubuntu-latest
-
- steps:
- - uses: actions/checkout@v6
-
- - name: Cache and load Docker images
- uses: ./.github/actions/docker-cache
- with:
- images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
-
- - name: Set up uv
- uses: astral-sh/setup-uv@v7
- with:
- enable-cache: true
- cache-dependency-glob: "backend/uv.lock"
-
- - name: Install Python dependencies
- run: |
- cd backend
- uv python install 3.12
- uv sync --frozen
-
- - name: Start infrastructure services
- run: |
- docker compose -f docker-compose.ci.yaml up -d --wait --wait-timeout 120
- docker compose -f docker-compose.ci.yaml ps
-
- - name: Run integration tests
- timeout-minutes: 10
- run: |
- cd backend
- uv run pytest tests/integration -v -rs \
- --durations=0 \
- --cov=app \
- --cov-report=xml --cov-report=term
-
- - name: Upload coverage to Codecov
- uses: codecov/codecov-action@v5
- if: always()
- with:
- token: ${{ secrets.CODECOV_TOKEN }}
- files: backend/coverage.xml
- flags: backend-integration
- name: backend-integration-coverage
- fail_ci_if_error: false
- verbose: true
-
- - name: Collect logs
- if: failure()
- run: |
- mkdir -p logs
- docker compose -f docker-compose.ci.yaml logs > logs/docker-compose.log 2>&1
- docker compose -f docker-compose.ci.yaml logs kafka > logs/kafka.log 2>&1
- docker compose -f docker-compose.ci.yaml logs schema-registry > logs/schema-registry.log 2>&1
-
- - name: Upload logs
- if: failure()
- uses: actions/upload-artifact@v6
- with:
- name: backend-logs
- path: logs/
-
- e2e:
- name: E2E Tests
- runs-on: ubuntu-latest
-
- steps:
- - uses: actions/checkout@v6
-
- - name: Cache and load Docker images
- uses: ./.github/actions/docker-cache
- with:
- images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
-
- - name: Set up uv
- uses: astral-sh/setup-uv@v7
- with:
- enable-cache: true
- cache-dependency-glob: "backend/uv.lock"
-
- - name: Install Python dependencies
- run: |
- cd backend
- uv python install 3.12
- uv sync --frozen
-
- - name: Start infrastructure services
- run: |
- docker compose -f docker-compose.ci.yaml up -d --wait --wait-timeout 120
- docker compose -f docker-compose.ci.yaml ps
-
- - name: Setup Kubernetes (k3s)
- run: |
- curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik" sh -
- mkdir -p /home/runner/.kube
- sudo k3s kubectl config view --raw > /home/runner/.kube/config
- sudo chmod 600 /home/runner/.kube/config
- export KUBECONFIG=/home/runner/.kube/config
- timeout 90 bash -c 'until sudo k3s kubectl cluster-info; do sleep 5; done'
- kubectl create namespace integr8scode --dry-run=client -o yaml | kubectl apply -f -
-
- - name: Run E2E tests
- timeout-minutes: 10
- env:
- KUBECONFIG: /home/runner/.kube/config
- K8S_NAMESPACE: integr8scode
- run: |
- cd backend
- uv run pytest tests/e2e -v -rs \
- --durations=0 \
- --cov=app \
- --cov-report=xml --cov-report=term
-
- - name: Upload coverage to Codecov
- uses: codecov/codecov-action@v5
- if: always()
- with:
- token: ${{ secrets.CODECOV_TOKEN }}
- files: backend/coverage.xml
- flags: backend-e2e
- name: backend-e2e-coverage
- fail_ci_if_error: false
- verbose: true
-
- - name: Collect logs
- if: failure()
- run: |
- mkdir -p logs
- docker compose -f docker-compose.ci.yaml logs > logs/docker-compose.log 2>&1
- kubectl get events --sort-by='.metadata.creationTimestamp' -A > logs/k8s-events.log 2>&1 || true
- kubectl describe pods -A > logs/k8s-describe-pods.log 2>&1 || true
-
- - name: Upload logs
- if: failure()
- uses: actions/upload-artifact@v6
- with:
- name: k8s-logs
- path: logs/
diff --git a/.github/workflows/frontend-ci.yml b/.github/workflows/frontend-ci.yml
index c36fff8a..fe29a033 100644
--- a/.github/workflows/frontend-ci.yml
+++ b/.github/workflows/frontend-ci.yml
@@ -6,18 +6,16 @@ on:
paths:
- 'frontend/**'
- '.github/workflows/frontend-ci.yml'
- - 'docker-compose.ci.yaml'
pull_request:
branches: [main, dev]
paths:
- 'frontend/**'
- '.github/workflows/frontend-ci.yml'
- - 'docker-compose.ci.yaml'
workflow_dispatch:
jobs:
- unit:
- name: Unit Tests
+ quality:
+ name: Lint & Type Check
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
@@ -33,167 +31,10 @@ jobs:
working-directory: frontend
run: npm ci
- - name: Run unit tests with coverage
+ - name: Run ESLint
working-directory: frontend
- run: npm run test:coverage
+ run: npm run lint
- - name: Upload coverage to Codecov
- uses: codecov/codecov-action@v5
- with:
- token: ${{ secrets.CODECOV_TOKEN }}
- files: frontend/coverage/lcov.info
- flags: frontend
- name: frontend-coverage
- fail_ci_if_error: false
- verbose: true
-
- e2e:
- name: E2E Tests
- needs: unit
- runs-on: ubuntu-latest
-
- # Local registry for buildx to reference base image (docker-container driver is isolated)
- services:
- registry:
- image: registry:2
- ports:
- - 5000:5000
-
- env:
- MONGO_IMAGE: mongo:8.0
- REDIS_IMAGE: redis:7-alpine
- KAFKA_IMAGE: apache/kafka:3.9.0
- SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.5.0
-
- steps:
- - uses: actions/checkout@v6
-
- - name: Cache and load Docker images
- uses: ./.github/actions/docker-cache
- with:
- images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
-
- - name: Setup Node.js
- uses: actions/setup-node@v6
- with:
- node-version: '22'
- cache: 'npm'
- cache-dependency-path: frontend/package-lock.json
-
- - name: Install dependencies
- working-directory: frontend
- run: npm ci
-
- - name: Install Playwright browsers
+ - name: Run svelte-check
working-directory: frontend
- run: npx playwright install chromium
-
- - name: Setup Docker Buildx
- uses: docker/setup-buildx-action@v3
- with:
- driver-opts: network=host
-
- - name: Setup Kubernetes (k3s)
- run: |
- curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="--disable=traefik --tls-san host.docker.internal" sh -
- mkdir -p /home/runner/.kube
- sudo k3s kubectl config view --raw > /home/runner/.kube/config
- sudo chmod 600 /home/runner/.kube/config
- export KUBECONFIG=/home/runner/.kube/config
- timeout 90 bash -c 'until sudo k3s kubectl cluster-info; do sleep 5; done'
-
- - name: Create kubeconfig for Docker containers
- run: |
- # Copy k3s kubeconfig with host.docker.internal for container networking
- sed 's|https://127.0.0.1:6443|https://host.docker.internal:6443|g' \
- /home/runner/.kube/config > backend/kubeconfig.yaml
- chmod 644 backend/kubeconfig.yaml
-
- # Build images with GitHub Actions cache for faster subsequent builds
- # Base image pushed to local registry so buildx can reference it
- - name: Build and push base image
- uses: docker/build-push-action@v6
- with:
- context: ./backend
- file: ./backend/Dockerfile.base
- push: true
- tags: localhost:5000/integr8scode-base:latest
- cache-from: type=gha,scope=backend-base
- cache-to: type=gha,mode=max,scope=backend-base
-
- # Pull base to Docker daemon (needed for docker-compose)
- - name: Load base image to Docker daemon
- run: |
- docker pull localhost:5000/integr8scode-base:latest
- docker tag localhost:5000/integr8scode-base:latest integr8scode-base:latest
-
- - name: Build backend image
- uses: docker/build-push-action@v6
- with:
- context: ./backend
- file: ./backend/Dockerfile
- load: true
- tags: integr8scode-backend:latest
- build-contexts: |
- base=docker-image://localhost:5000/integr8scode-base:latest
- cache-from: type=gha,scope=backend
- cache-to: type=gha,mode=max,scope=backend
-
- - name: Build cert-generator image
- uses: docker/build-push-action@v6
- with:
- context: ./cert-generator
- file: ./cert-generator/Dockerfile
- load: true
- tags: integr8scode-cert-generator:latest
- cache-from: type=gha,scope=cert-generator
- cache-to: type=gha,mode=max,scope=cert-generator
-
- - name: Build frontend image
- uses: docker/build-push-action@v6
- with:
- context: ./frontend
- file: ./frontend/Dockerfile
- load: true
- tags: integr8scode-frontend:latest
- cache-from: type=gha,scope=frontend
- cache-to: type=gha,mode=max,scope=frontend
-
- - name: Start full stack
- run: |
- docker compose -f docker-compose.ci.yaml --profile full up -d --wait --wait-timeout 300
- docker compose -f docker-compose.ci.yaml ps
-
- - name: Seed test users
- run: |
- docker compose -f docker-compose.ci.yaml exec -T backend uv run python scripts/seed_users.py
-
- - name: Run E2E tests
- working-directory: frontend
- env:
- CI: true
- run: npx playwright test --reporter=html
-
- - name: Upload Playwright report
- uses: actions/upload-artifact@v6
- if: always()
- with:
- name: playwright-report
- path: frontend/playwright-report/
-
- - name: Collect logs
- if: failure()
- run: |
- mkdir -p logs
- docker compose -f docker-compose.ci.yaml logs > logs/docker-compose.log 2>&1
- docker compose -f docker-compose.ci.yaml logs backend > logs/backend.log 2>&1
- docker compose -f docker-compose.ci.yaml logs frontend > logs/frontend.log 2>&1
- docker compose -f docker-compose.ci.yaml logs kafka > logs/kafka.log 2>&1
- kubectl get events --sort-by='.metadata.creationTimestamp' -A > logs/k8s-events.log 2>&1 || true
-
- - name: Upload logs
- if: failure()
- uses: actions/upload-artifact@v6
- with:
- name: frontend-e2e-logs
- path: logs/
+ run: npm run check
diff --git a/.github/workflows/stack-tests.yml b/.github/workflows/stack-tests.yml
new file mode 100644
index 00000000..8711ea9f
--- /dev/null
+++ b/.github/workflows/stack-tests.yml
@@ -0,0 +1,418 @@
+name: Stack Tests
+
+on:
+ push:
+ branches: [main, dev]
+ paths:
+ - 'backend/**'
+ - 'frontend/**'
+ - 'docker-compose.yaml'
+ - 'deploy.sh'
+ - '.github/workflows/stack-tests.yml'
+ - '.github/actions/**'
+ pull_request:
+ branches: [main, dev]
+ paths:
+ - 'backend/**'
+ - 'frontend/**'
+ - 'docker-compose.yaml'
+ - 'deploy.sh'
+ - '.github/workflows/stack-tests.yml'
+ - '.github/actions/**'
+ workflow_dispatch:
+
+env:
+ MONGO_IMAGE: mongo:8.0
+ REDIS_IMAGE: redis:7-alpine
+ KAFKA_IMAGE: confluentinc/cp-kafka:7.8.2
+ ZOOKEEPER_IMAGE: confluentinc/cp-zookeeper:7.8.2
+ SCHEMA_REGISTRY_IMAGE: confluentinc/cp-schema-registry:7.8.2
+
+jobs:
+ # Fast unit tests (no infrastructure needed)
+ backend-unit:
+ name: Backend Unit Tests
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up uv
+ uses: astral-sh/setup-uv@v7
+ with:
+ enable-cache: true
+ cache-dependency-glob: "backend/uv.lock"
+
+ - name: Install Python dependencies
+ run: |
+ cd backend
+ uv python install 3.12
+ uv sync --frozen
+
+ - name: Run unit tests
+ timeout-minutes: 5
+ run: |
+ cd backend
+ uv run pytest tests/unit -v -rs \
+ --durations=0 \
+ --cov=app \
+ --cov-report=xml --cov-report=term
+
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v5
+ if: always()
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
+ files: backend/coverage.xml
+ flags: backend-unit
+ name: backend-unit-coverage
+ fail_ci_if_error: false
+ verbose: true
+
+ frontend-unit:
+ name: Frontend Unit Tests
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Setup Node.js
+ uses: actions/setup-node@v6
+ with:
+ node-version: '22'
+ cache: 'npm'
+ cache-dependency-path: frontend/package-lock.json
+
+ - name: Install dependencies
+ working-directory: frontend
+ run: npm ci
+
+ - name: Run unit tests with coverage
+ working-directory: frontend
+ run: npm run test:coverage
+
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v5
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
+ files: frontend/coverage/lcov.info
+ flags: frontend-unit
+ name: frontend-unit-coverage
+ fail_ci_if_error: false
+ verbose: true
+
+ # Build all images once, cache for test jobs
+ build-images:
+ name: Build Images
+ needs: [backend-unit, frontend-unit]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Setup Docker Buildx
+ uses: docker/setup-buildx-action@v3
+
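+ # The base image is cached as a zstd tarball keyed on Dockerfile.base, pyproject.toml
+ # and uv.lock, so it is only rebuilt when the base dependencies change.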
+ - name: Cache base image
+ uses: actions/cache@v4
+ id: base-cache
+ with:
+ path: /tmp/base-image.tar.zst
+ key: base-image-${{ runner.os }}-${{ hashFiles('backend/Dockerfile.base', 'backend/pyproject.toml', 'backend/uv.lock') }}
+
+ - name: Load base image from cache
+ if: steps.base-cache.outputs.cache-hit == 'true'
+ run: zstd -d -c /tmp/base-image.tar.zst | docker load
+
+ - name: Build base image
+ if: steps.base-cache.outputs.cache-hit != 'true'
+ uses: docker/build-push-action@v6
+ with:
+ context: ./backend
+ file: ./backend/Dockerfile.base
+ load: true
+ tags: integr8scode-base:latest
+ cache-from: type=gha,scope=backend-base
+ cache-to: type=gha,mode=max,scope=backend-base
+
+ - name: Save base image to cache
+ if: steps.base-cache.outputs.cache-hit != 'true'
+ run: docker save integr8scode-base:latest | zstd -T0 -3 > /tmp/base-image.tar.zst
+
+ - name: Build all images
+ run: |
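+ # --build-context base=docker-image://... points each Dockerfile's named `base` context
+ # at the image built/loaded above, so the worker images reuse it instead of rebuilding the base layer.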
+ docker build -t integr8scode-backend:latest --build-context base=docker-image://integr8scode-base:latest -f ./backend/Dockerfile ./backend
+ docker build -t integr8scode-coordinator:latest -f backend/workers/Dockerfile.coordinator --build-context base=docker-image://integr8scode-base:latest ./backend
+ docker build -t integr8scode-k8s-worker:latest -f backend/workers/Dockerfile.k8s_worker --build-context base=docker-image://integr8scode-base:latest ./backend
+ docker build -t integr8scode-pod-monitor:latest -f backend/workers/Dockerfile.pod_monitor --build-context base=docker-image://integr8scode-base:latest ./backend
+ docker build -t integr8scode-result-processor:latest -f backend/workers/Dockerfile.result_processor --build-context base=docker-image://integr8scode-base:latest ./backend
+ docker build -t integr8scode-saga-orchestrator:latest -f backend/workers/Dockerfile.saga_orchestrator --build-context base=docker-image://integr8scode-base:latest ./backend
+
+ - name: Build cert-generator image
+ uses: docker/build-push-action@v6
+ with:
+ context: ./cert-generator
+ file: ./cert-generator/Dockerfile
+ load: true
+ tags: integr8scode-cert-generator:latest
+ cache-from: type=gha,scope=cert-generator
+ cache-to: type=gha,mode=max,scope=cert-generator
+
+ - name: Build frontend image
+ uses: docker/build-push-action@v6
+ with:
+ context: ./frontend
+ file: ./frontend/Dockerfile
+ load: true
+ tags: integr8scode-frontend:latest
+ cache-from: type=gha,scope=frontend
+ cache-to: type=gha,mode=max,scope=frontend
+
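+ # Downstream test jobs run on separate runners with their own Docker daemons, so the
+ # built images are handed off as a single compressed artifact rather than rebuilt per job.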
+ - name: Save all images
+ run: |
+ docker save \
+ integr8scode-backend:latest \
+ integr8scode-coordinator:latest \
+ integr8scode-k8s-worker:latest \
+ integr8scode-pod-monitor:latest \
+ integr8scode-result-processor:latest \
+ integr8scode-saga-orchestrator:latest \
+ integr8scode-cert-generator:latest \
+ integr8scode-frontend:latest \
+ | zstd -T0 -3 > /tmp/all-images.tar.zst
+
+ - name: Upload images artifact
+ uses: actions/upload-artifact@v6
+ with:
+ name: docker-images
+ path: /tmp/all-images.tar.zst
+ retention-days: 1
+
+ # Three parallel test jobs
+ backend-integration:
+ name: Backend Integration Tests
+ needs: [build-images]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Cache and load Docker images
+ uses: ./.github/actions/docker-cache
+ with:
+ images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
+
+ - name: Download built images
+ uses: actions/download-artifact@v7
+ with:
+ name: docker-images
+ path: /tmp
+
+ - name: Load built images
+ run: zstd -d -c /tmp/all-images.tar.zst | docker load
+
+ - name: Setup k3s
+ uses: ./.github/actions/k3s-setup
+
+ - name: Use test environment config
+ run: cp backend/.env.test backend/.env
+
+ - name: Start stack
+ run: ./deploy.sh dev --wait
+
+ - name: Run integration tests
+ timeout-minutes: 10
+ run: |
+ docker compose exec -T -e TEST_RUN_ID=integration backend \
+ uv run pytest tests/integration -v -rs \
+ --durations=0 \
+ --cov=app \
+ --cov-report=xml:coverage-integration.xml \
+ --cov-report=term
+
+ - name: Copy coverage
+ if: always()
+ run: docker compose cp backend:/app/coverage-integration.xml backend/coverage-integration.xml || true
+
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v5
+ if: always()
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
+ files: backend/coverage-integration.xml
+ flags: backend-integration
+ name: backend-integration-coverage
+ fail_ci_if_error: false
+
+ - name: Collect logs on failure
+ if: failure()
+ run: |
+ mkdir -p logs
+ docker compose logs > logs/docker-compose.log 2>&1
+ docker compose logs backend > logs/backend.log 2>&1
+ docker compose logs kafka > logs/kafka.log 2>&1
+
+ - name: Upload logs
+ if: failure()
+ uses: actions/upload-artifact@v6
+ with:
+ name: backend-integration-logs
+ path: logs/
+
+ backend-e2e:
+ name: Backend E2E Tests
+ needs: [build-images]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Cache and load Docker images
+ uses: ./.github/actions/docker-cache
+ with:
+ images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
+
+ - name: Download built images
+ uses: actions/download-artifact@v7
+ with:
+ name: docker-images
+ path: /tmp
+
+ - name: Load built images
+ run: zstd -d -c /tmp/all-images.tar.zst | docker load
+
+ - name: Setup k3s
+ uses: ./.github/actions/k3s-setup
+
+ - name: Use test environment config
+ run: cp backend/.env.test backend/.env
+
+ - name: Start stack
+ run: ./deploy.sh dev --wait
+
+ - name: Seed test users
+ run: docker compose exec -T backend uv run python scripts/seed_users.py
+
+ - name: Run E2E tests
+ timeout-minutes: 10
+ run: |
+ docker compose exec -T -e TEST_RUN_ID=e2e backend \
+ uv run pytest tests/e2e -v -rs \
+ --durations=0 \
+ --cov=app \
+ --cov-report=xml:coverage-e2e.xml \
+ --cov-report=term
+
+ - name: Copy coverage
+ if: always()
+ run: docker compose cp backend:/app/coverage-e2e.xml backend/coverage-e2e.xml || true
+
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v5
+ if: always()
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
+ files: backend/coverage-e2e.xml
+ flags: backend-e2e
+ name: backend-e2e-coverage
+ fail_ci_if_error: false
+
+ - name: Collect logs on failure
+ if: failure()
+ run: |
+ mkdir -p logs
+ docker compose logs > logs/docker-compose.log 2>&1
+ docker compose logs backend > logs/backend.log 2>&1
+ docker compose logs kafka > logs/kafka.log 2>&1
+ docker compose logs coordinator > logs/coordinator.log 2>&1 || true
+ docker compose logs k8s-worker > logs/k8s-worker.log 2>&1 || true
+ kubectl get events --sort-by='.metadata.creationTimestamp' -A > logs/k8s-events.log 2>&1 || true
+
+ - name: Upload logs
+ if: failure()
+ uses: actions/upload-artifact@v6
+ with:
+ name: backend-e2e-logs
+ path: logs/
+
+ frontend-e2e:
+ name: Frontend E2E Tests
+ needs: [build-images]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Setup Node.js
+ uses: actions/setup-node@v6
+ with:
+ node-version: '22'
+ cache: 'npm'
+ cache-dependency-path: frontend/package-lock.json
+
+ - name: Install frontend dependencies
+ working-directory: frontend
+ run: npm ci
+
+ - name: Cache Playwright browsers
+ uses: actions/cache@v4
+ id: playwright-cache
+ with:
+ path: ~/.cache/ms-playwright
+ key: playwright-${{ runner.os }}-${{ hashFiles('frontend/package-lock.json') }}
+
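+ # System packages are not covered by the cache, so install-deps runs every time;
+ # the browser binaries themselves are restored from ~/.cache/ms-playwright on a cache hit.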
+ - name: Install Playwright system dependencies
+ working-directory: frontend
+ run: npx playwright install-deps chromium
+
+ - name: Install Playwright browsers
+ if: steps.playwright-cache.outputs.cache-hit != 'true'
+ working-directory: frontend
+ run: npx playwright install chromium
+
+ - name: Cache and load Docker images
+ uses: ./.github/actions/docker-cache
+ with:
+ images: ${{ env.MONGO_IMAGE }} ${{ env.REDIS_IMAGE }} ${{ env.KAFKA_IMAGE }} ${{ env.ZOOKEEPER_IMAGE }} ${{ env.SCHEMA_REGISTRY_IMAGE }}
+
+ - name: Download built images
+ uses: actions/download-artifact@v7
+ with:
+ name: docker-images
+ path: /tmp
+
+ - name: Load built images
+ run: zstd -d -c /tmp/all-images.tar.zst | docker load
+
+ - name: Setup k3s
+ uses: ./.github/actions/k3s-setup
+
+ - name: Use test environment config
+ run: cp backend/.env.test backend/.env
+
+ - name: Start stack
+ run: ./deploy.sh dev --wait
+
+ - name: Seed test users
+ run: docker compose exec -T backend uv run python scripts/seed_users.py
+
+ - name: Run Playwright tests
+ timeout-minutes: 10
+ working-directory: frontend
+ run: CI=true npx playwright test
+
+ - name: Upload Playwright report
+ uses: actions/upload-artifact@v6
+ if: always()
+ with:
+ name: playwright-report
+ path: frontend/playwright-report/
+
+ - name: Collect logs on failure
+ if: failure()
+ run: |
+ mkdir -p logs
+ docker compose logs > logs/docker-compose.log 2>&1
+ docker compose logs backend > logs/backend.log 2>&1
+ docker compose logs frontend > logs/frontend.log 2>&1
+
+ - name: Upload logs
+ if: failure()
+ uses: actions/upload-artifact@v6
+ with:
+ name: frontend-e2e-logs
+ path: logs/
diff --git a/README.md b/README.md
index a7327140..cb6d99ea 100644
--- a/README.md
+++ b/README.md
@@ -15,11 +15,14 @@
-
-
+
+
+
+
+
-
+
diff --git a/backend/.env b/backend/.env
index aa213436..01b22742 100644
--- a/backend/.env
+++ b/backend/.env
@@ -13,8 +13,8 @@ K8S_POD_MEMORY_LIMIT=128Mi
K8S_POD_CPU_REQUEST=200m
K8S_POD_MEMORY_REQUEST=128Mi
K8S_POD_EXECUTION_TIMEOUT=5
+K8S_NAMESPACE=integr8scode
RATE_LIMITS=100/minute
-RATE_LIMIT_ENABLED=false
# Event-Driven Design Configuration
KAFKA_BOOTSTRAP_SERVERS=kafka:29092
@@ -81,3 +81,6 @@ SERVER_HOST=127.0.0.1
# Security
BCRYPT_ROUNDS=12
+
+# Redis Configuration
+REDIS_MAX_CONNECTIONS=200
diff --git a/backend/.env.test b/backend/.env.test
index 7d175192..68b4d5d8 100644
--- a/backend/.env.test
+++ b/backend/.env.test
@@ -1,51 +1,84 @@
-# Test environment configuration
PROJECT_NAME=integr8scode
-DATABASE_NAME=integr8scode_test
-API_V1_STR=/api/v1
-SECRET_KEY=test-secret-key-for-testing-only-32chars!!
-TESTING=true
-
-# MongoDB - use localhost for tests
-MONGODB_URL=mongodb://root:rootpassword@localhost:27017/?authSource=admin
-MONGO_ROOT_USER=root
-MONGO_ROOT_PASSWORD=rootpassword
-
-# Redis - use localhost for tests
-REDIS_HOST=localhost
-REDIS_PORT=6379
-REDIS_DB=0
-REDIS_PASSWORD=
-REDIS_SSL=false
-REDIS_MAX_CONNECTIONS=50
-REDIS_DECODE_RESPONSES=true
-
-# Kafka - use localhost for tests
-KAFKA_BOOTSTRAP_SERVERS=localhost:9092
-KAFKA_TOPIC_PREFIX=test.
-SCHEMA_SUBJECT_PREFIX=test.
-SCHEMA_REGISTRY_URL=http://localhost:8081
-
-# Reduce consumer pool and timeouts for faster test startup/teardown
-# https://github.com/aio-libs/aiokafka/issues/773
-SSE_CONSUMER_POOL_SIZE=1
-KAFKA_SESSION_TIMEOUT_MS=6000
-KAFKA_HEARTBEAT_INTERVAL_MS=2000
-KAFKA_REQUEST_TIMEOUT_MS=5000
+DATABASE_NAME=integr8scode_db
+SECRET_KEY=${SECRET_KEY:-uS5xBF-OKXHV-1vqU4ASLwyPcKpSdUTLqGHPYs3y-Yc}
+ALGORITHM=HS256
+ACCESS_TOKEN_EXPIRE_MINUTES=1440
+MONGO_ROOT_USER="${MONGO_ROOT_USER:-root}"
+MONGO_ROOT_PASSWORD="${MONGO_ROOT_PASSWORD:-rootpassword}"
+MONGODB_URL="mongodb://${MONGO_ROOT_USER}:${MONGO_ROOT_PASSWORD}@mongo:27017/integr8scode?authSource=admin"
+KUBERNETES_CONFIG_PATH=/app/kubeconfig.yaml
+KUBERNETES_CA_CERTIFICATE_PATH=/app/certs/k8s-ca.pem
+K8S_POD_CPU_LIMIT=1000m
+K8S_POD_MEMORY_LIMIT=128Mi
+K8S_POD_CPU_REQUEST=200m
+K8S_POD_MEMORY_REQUEST=128Mi
+K8S_POD_EXECUTION_TIMEOUT=5
+K8S_NAMESPACE=integr8scode
+RATE_LIMITS=99999/second
+RATE_LIMIT_ENABLED=false
-# Security
-SECURE_COOKIES=true
-BCRYPT_ROUNDS=4
+# Event-Driven Design Configuration
+KAFKA_BOOTSTRAP_SERVERS=kafka:29092
+SCHEMA_REGISTRY_URL=http://schema-registry:8081
+ENABLE_EVENT_STREAMING=true
+EVENT_RETENTION_DAYS=30
+KAFKA_CONSUMER_GROUP_ID=integr8scode-backend
+KAFKA_AUTO_OFFSET_RESET=earliest
+KAFKA_ENABLE_AUTO_COMMIT=true
+KAFKA_SESSION_TIMEOUT_MS=10000
+KAFKA_HEARTBEAT_INTERVAL_MS=3000
+KAFKA_REQUEST_TIMEOUT_MS=15000
+KAFKA_MAX_POLL_RECORDS=500
+
+# WebSocket Configuration
+WEBSOCKET_PING_INTERVAL=30
+WEBSOCKET_PING_TIMEOUT=10
+
+# Logging Configuration
+LOG_LEVEL=WARNING
+WEBSOCKET_MAX_CONNECTIONS_PER_USER=5
+WEBSOCKET_STALE_CONNECTION_TIMEOUT=300
+
+# Distributed Tracing
+ENABLE_TRACING=true
+JAEGER_AGENT_HOST=jaeger
+JAEGER_AGENT_PORT=6831
+TRACING_SERVICE_NAME=integr8scode-backend
+TRACING_SERVICE_VERSION=1.0.0
+TRACING_SAMPLING_RATE=1.0
+
+# Dead Letter Queue Configuration
+DLQ_RETRY_MAX_ATTEMPTS=5
+DLQ_RETRY_BASE_DELAY_SECONDS=60.0
+DLQ_RETRY_MAX_DELAY_SECONDS=3600.0
+DLQ_RETENTION_DAYS=7
+DLQ_WARNING_THRESHOLD=100
+DLQ_CRITICAL_THRESHOLD=1000
-# Features
-RATE_LIMIT_ENABLED=true
-ENABLE_TRACING=false
+# App URL for notification links
+APP_URL=https://localhost
-# OpenTelemetry - disabled for tests
-# Empty endpoint prevents OTLP exporter creation in setup_metrics()
-# OTEL_SDK_DISABLED=true (set via pytest-env) provides additional safety
-OTEL_EXPORTER_OTLP_ENDPOINT=
+# Service Configuration
+SERVICE_NAME=integr8scode-backend
+SERVICE_VERSION=1.0.0
-# Development
-DEVELOPMENT_MODE=false
-LOG_LEVEL=INFO
-ENVIRONMENT=test
+# OpenTelemetry Configuration
+OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
+OTEL_SERVICE_NAME=integr8scode-backend
+OTEL_SERVICE_VERSION=1.0.0
+OTEL_RESOURCE_ATTRIBUTES=environment=test,team=backend
+
+# Web server concurrency settings (Gunicorn + Uvicorn workers)
+# Tune these for your machine. Defaults are safe for dev.
+WEB_CONCURRENCY=1
+WEB_THREADS=4
+WEB_TIMEOUT=60
+WEB_BACKLOG=2048
+
+# Local development server bind address
+# When running uvicorn locally (outside Docker), bind to IPv4 loopback to avoid
+# IPv6-only localhost resolution on some Linux distros.
+SERVER_HOST=127.0.0.1
+
+# Security
+BCRYPT_ROUNDS=4
diff --git a/backend/app/core/container.py b/backend/app/core/container.py
index 97e0c48f..b67f133a 100644
--- a/backend/app/core/container.py
+++ b/backend/app/core/container.py
@@ -8,7 +8,6 @@
CoordinatorProvider,
CoreServicesProvider,
DatabaseProvider,
- DLQProcessorProvider,
EventProvider,
EventReplayProvider,
K8sWorkerProvider,
@@ -119,6 +118,7 @@ def create_pod_monitor_container(settings: Settings) -> AsyncContainer:
SettingsProvider(),
LoggingProvider(),
DatabaseProvider(),
+ RedisProvider(),
CoreServicesProvider(),
MetricsProvider(),
RepositoryProvider(),
@@ -154,6 +154,7 @@ def create_event_replay_container(settings: Settings) -> AsyncContainer:
SettingsProvider(),
LoggingProvider(),
DatabaseProvider(),
+ RedisProvider(),
CoreServicesProvider(),
MetricsProvider(),
RepositoryProvider(),
@@ -170,10 +171,11 @@ def create_dlq_processor_container(settings: Settings) -> AsyncContainer:
SettingsProvider(),
LoggingProvider(),
DatabaseProvider(),
+ RedisProvider(),
CoreServicesProvider(),
MetricsProvider(),
RepositoryProvider(),
+ MessagingProvider(),
EventProvider(),
- DLQProcessorProvider(),
context={Settings: settings},
)
diff --git a/backend/app/core/dishka_lifespan.py b/backend/app/core/dishka_lifespan.py
index d419bf54..3a91ee1d 100644
--- a/backend/app/core/dishka_lifespan.py
+++ b/backend/app/core/dishka_lifespan.py
@@ -1,3 +1,4 @@
+import asyncio
import logging
from contextlib import AsyncExitStack, asynccontextmanager
from typing import AsyncGenerator
@@ -8,7 +9,8 @@
from fastapi import FastAPI
from app.core.database_context import Database
-from app.core.startup import initialize_metrics_context, initialize_rate_limits
+from app.core.metrics import RateLimitMetrics
+from app.core.startup import initialize_rate_limits
from app.core.tracing import init_tracing
from app.db.docs import ALL_DOCUMENTS
from app.events.event_store_consumer import EventStoreConsumer
@@ -71,35 +73,38 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
extra={"testing": settings.TESTING, "enable_tracing": settings.ENABLE_TRACING},
)
- # Initialize schema registry once at startup
- schema_registry = await container.get(SchemaRegistryManager)
- await initialize_event_schemas(schema_registry)
-
- # Initialize Beanie ODM with database from DI container
- database = await container.get(Database)
- await init_beanie(database=database, document_models=ALL_DOCUMENTS)
- logger.info(f"Beanie ODM initialized with {len(ALL_DOCUMENTS)} document models")
-
- # Initialize metrics context with instances from DI container
- # This must happen early so services can access metrics via contextvars
- await initialize_metrics_context(container, logger)
- logger.info("Metrics context initialized with contextvars")
-
- # Initialize default rate limits in Redis
- redis_client = await container.get(redis.Redis)
- await initialize_rate_limits(redis_client, settings, logger)
- logger.info("Rate limits initialized in Redis")
-
- # Rate limit middleware added during app creation; service resolved lazily at runtime
+ # Phase 1: Resolve all DI dependencies in parallel
+ (
+ schema_registry,
+ database,
+ redis_client,
+ rate_limit_metrics,
+ sse_bridge,
+ event_store_consumer,
+ ) = await asyncio.gather(
+ container.get(SchemaRegistryManager),
+ container.get(Database),
+ container.get(redis.Redis),
+ container.get(RateLimitMetrics),
+ container.get(SSEKafkaRedisBridge),
+ container.get(EventStoreConsumer),
+ )
- # Acquire long-lived services and manage lifecycle via AsyncExitStack
- sse_bridge = await container.get(SSEKafkaRedisBridge)
- event_store_consumer = await container.get(EventStoreConsumer)
+ # Phase 2: Initialize infrastructure in parallel (independent subsystems)
+ await asyncio.gather(
+ initialize_event_schemas(schema_registry),
+ init_beanie(database=database, document_models=ALL_DOCUMENTS),
+ initialize_rate_limits(redis_client, settings, logger, rate_limit_metrics),
+ )
+ logger.info("Infrastructure initialized (schemas, beanie, rate limits)")
+ # Phase 3: Start Kafka consumers in parallel
async with AsyncExitStack() as stack:
- await stack.enter_async_context(sse_bridge)
- logger.info("SSE Kafka→Redis bridge started with consumer pool")
- await stack.enter_async_context(event_store_consumer)
- logger.info("EventStoreConsumer started - events will be persisted to MongoDB")
- logger.info("All services initialized by DI and managed by AsyncExitStack")
+ stack.push_async_callback(sse_bridge.aclose)
+ stack.push_async_callback(event_store_consumer.aclose)
+ await asyncio.gather(
+ sse_bridge.__aenter__(),
+ event_store_consumer.__aenter__(),
+ )
+ logger.info("SSE bridge and EventStoreConsumer started")
yield
diff --git a/backend/app/core/metrics/context.py b/backend/app/core/metrics/context.py
deleted file mode 100644
index dd87c3b2..00000000
--- a/backend/app/core/metrics/context.py
+++ /dev/null
@@ -1,285 +0,0 @@
-import contextvars
-import logging
-from typing import Any, Generic, Optional, Type, TypeVar
-
-from app.core.metrics import (
- ConnectionMetrics,
- CoordinatorMetrics,
- DatabaseMetrics,
- DLQMetrics,
- EventMetrics,
- ExecutionMetrics,
- HealthMetrics,
- KubernetesMetrics,
- NotificationMetrics,
- RateLimitMetrics,
- ReplayMetrics,
- SecurityMetrics,
-)
-
-# Type variable for generic metrics
-T = TypeVar("T")
-
-
-class MetricsContextVar(Generic[T]):
- """
- A wrapper around contextvars.ContextVar for type-safe metrics access.
-
- This class ensures that each metric type has its own context variable
- and provides a clean interface for getting and setting metrics.
- """
-
- def __init__(self, name: str, metric_class: Type[T], logger: logging.Logger) -> None:
- """
- Initialize a metrics context variable.
-
- Args:
- name: Name for the context variable (for debugging)
- metric_class: The class of the metric this context holds
- logger: Logger instance for logging
- """
- self._context_var: contextvars.ContextVar[Optional[T]] = contextvars.ContextVar(f"metrics_{name}", default=None)
- self._metric_class = metric_class
- self._name = name
- self.logger = logger
-
- def get(self) -> T:
- """
- Get the metric from context.
-
- Returns:
- The metric instance for the current context
-
- Raises:
- RuntimeError: If metrics not initialized via DI
- """
- metric = self._context_var.get()
- if metric is None:
- raise RuntimeError(
- f"{self._name} metrics not initialized. "
- "Ensure MetricsContext.initialize_all() is called during app startup."
- )
- return metric
-
- def set(self, metric: T) -> contextvars.Token[Optional[T]]:
- """
- Set the metric in the current context.
-
- Args:
- metric: The metric instance to set
-
- Returns:
- A token that can be used to reset the context
- """
- return self._context_var.set(metric)
-
- def reset(self) -> None:
- """Reset the metric to None in the current context."""
- self._context_var.set(None)
-
- def is_set(self) -> bool:
- """Check if a metric is set in the current context."""
- return self._context_var.get() is not None
-
-
-# Module-level logger for lazy initialization
-_module_logger: Optional[logging.Logger] = None
-
-
-def _get_module_logger() -> logging.Logger:
- """Get or create module logger for lazy initialization."""
- global _module_logger
- if _module_logger is None:
- _module_logger = logging.getLogger(__name__)
- return _module_logger
-
-
-# Create module-level context variables for each metric type
-# These are singletons that live for the lifetime of the application
-_connection_ctx = MetricsContextVar("connection", ConnectionMetrics, _get_module_logger())
-_coordinator_ctx = MetricsContextVar("coordinator", CoordinatorMetrics, _get_module_logger())
-_database_ctx = MetricsContextVar("database", DatabaseMetrics, _get_module_logger())
-_dlq_ctx = MetricsContextVar("dlq", DLQMetrics, _get_module_logger())
-_event_ctx = MetricsContextVar("event", EventMetrics, _get_module_logger())
-_execution_ctx = MetricsContextVar("execution", ExecutionMetrics, _get_module_logger())
-_health_ctx = MetricsContextVar("health", HealthMetrics, _get_module_logger())
-_kubernetes_ctx = MetricsContextVar("kubernetes", KubernetesMetrics, _get_module_logger())
-_notification_ctx = MetricsContextVar("notification", NotificationMetrics, _get_module_logger())
-_rate_limit_ctx = MetricsContextVar("rate_limit", RateLimitMetrics, _get_module_logger())
-_replay_ctx = MetricsContextVar("replay", ReplayMetrics, _get_module_logger())
-_security_ctx = MetricsContextVar("security", SecurityMetrics, _get_module_logger())
-
-
-class MetricsContext:
- """
- Central manager for all metrics contexts.
-
- This class provides a unified interface for managing all metric types
- in the application. It handles initialization at startup and provides
- access methods for each metric type.
- """
-
- @classmethod
- def initialize_all(cls, logger: logging.Logger, **metrics: Any) -> None:
- """
- Initialize all metrics contexts at application startup.
-
- This should be called once during application initialization,
- typically in the startup sequence after dependency injection
- has created the metric instances.
-
- Args:
- **metrics: Keyword arguments mapping metric names to instances
- e.g., event=EventMetrics(), connection=ConnectionMetrics()
- """
- for name, metric_instance in metrics.items():
- if name == "connection":
- _connection_ctx.set(metric_instance)
- elif name == "coordinator":
- _coordinator_ctx.set(metric_instance)
- elif name == "database":
- _database_ctx.set(metric_instance)
- elif name == "dlq":
- _dlq_ctx.set(metric_instance)
- elif name == "event":
- _event_ctx.set(metric_instance)
- elif name == "execution":
- _execution_ctx.set(metric_instance)
- elif name == "health":
- _health_ctx.set(metric_instance)
- elif name == "kubernetes":
- _kubernetes_ctx.set(metric_instance)
- elif name == "notification":
- _notification_ctx.set(metric_instance)
- elif name == "rate_limit":
- _rate_limit_ctx.set(metric_instance)
- elif name == "replay":
- _replay_ctx.set(metric_instance)
- elif name == "security":
- _security_ctx.set(metric_instance)
- else:
- logger.warning(f"Unknown metric type: {name}")
- continue
- logger.info(f"Initialized {name} metrics in context")
-
- @classmethod
- def reset_all(cls, logger: logging.Logger) -> None:
- """
- Reset all metrics contexts.
-
- This is primarily useful for testing to ensure a clean state
- between test cases.
- """
- _connection_ctx.reset()
- _coordinator_ctx.reset()
- _database_ctx.reset()
- _dlq_ctx.reset()
- _event_ctx.reset()
- _execution_ctx.reset()
- _health_ctx.reset()
- _kubernetes_ctx.reset()
- _notification_ctx.reset()
- _rate_limit_ctx.reset()
- _replay_ctx.reset()
- _security_ctx.reset()
- logger.debug("Reset all metrics contexts")
-
- @classmethod
- def get_connection_metrics(cls) -> ConnectionMetrics:
- return _connection_ctx.get()
-
- @classmethod
- def get_coordinator_metrics(cls) -> CoordinatorMetrics:
- return _coordinator_ctx.get()
-
- @classmethod
- def get_database_metrics(cls) -> DatabaseMetrics:
- return _database_ctx.get()
-
- @classmethod
- def get_dlq_metrics(cls) -> DLQMetrics:
- return _dlq_ctx.get()
-
- @classmethod
- def get_event_metrics(cls) -> EventMetrics:
- return _event_ctx.get()
-
- @classmethod
- def get_execution_metrics(cls) -> ExecutionMetrics:
- return _execution_ctx.get()
-
- @classmethod
- def get_health_metrics(cls) -> HealthMetrics:
- return _health_ctx.get()
-
- @classmethod
- def get_kubernetes_metrics(cls) -> KubernetesMetrics:
- return _kubernetes_ctx.get()
-
- @classmethod
- def get_notification_metrics(cls) -> NotificationMetrics:
- return _notification_ctx.get()
-
- @classmethod
- def get_rate_limit_metrics(cls) -> RateLimitMetrics:
- return _rate_limit_ctx.get()
-
- @classmethod
- def get_replay_metrics(cls) -> ReplayMetrics:
- return _replay_ctx.get()
-
- @classmethod
- def get_security_metrics(cls) -> SecurityMetrics:
- return _security_ctx.get()
-
-
-# Convenience functions for direct access with proper type annotations
-# Import types with forward references to avoid circular imports
-
-
-def get_connection_metrics() -> ConnectionMetrics:
- return MetricsContext.get_connection_metrics()
-
-
-def get_coordinator_metrics() -> CoordinatorMetrics:
- return MetricsContext.get_coordinator_metrics()
-
-
-def get_database_metrics() -> DatabaseMetrics:
- return MetricsContext.get_database_metrics()
-
-
-def get_dlq_metrics() -> DLQMetrics:
- return MetricsContext.get_dlq_metrics()
-
-
-def get_event_metrics() -> EventMetrics:
- return MetricsContext.get_event_metrics()
-
-
-def get_execution_metrics() -> ExecutionMetrics:
- return MetricsContext.get_execution_metrics()
-
-
-def get_health_metrics() -> HealthMetrics:
- return MetricsContext.get_health_metrics()
-
-
-def get_kubernetes_metrics() -> KubernetesMetrics:
- return MetricsContext.get_kubernetes_metrics()
-
-
-def get_notification_metrics() -> NotificationMetrics:
- return MetricsContext.get_notification_metrics()
-
-
-def get_rate_limit_metrics() -> RateLimitMetrics:
- return MetricsContext.get_rate_limit_metrics()
-
-
-def get_replay_metrics() -> ReplayMetrics:
- return MetricsContext.get_replay_metrics()
-
-
-def get_security_metrics() -> SecurityMetrics:
- return MetricsContext.get_security_metrics()
diff --git a/backend/app/core/metrics/events.py b/backend/app/core/metrics/events.py
index f74e94b6..bd417078 100644
--- a/backend/app/core/metrics/events.py
+++ b/backend/app/core/metrics/events.py
@@ -5,17 +5,16 @@ class EventMetrics(BaseMetrics):
"""Metrics for event processing and Kafka.
This class tracks metrics related to event processing, event buffers,
- and Kafka message production/consumption. It's now accessed through
- the contextvars-based MetricsContext system rather than a singleton.
+ and Kafka message production/consumption. Metrics are provided via
+ dependency injection (DI) through the MetricsProvider.
- Usage:
- from app.core.metrics.context import get_event_metrics
+ Usage (via DI):
+ class MyService:
+ def __init__(self, event_metrics: EventMetrics):
+ self.metrics = event_metrics
- metrics = get_event_metrics()
- metrics.record_event_published("execution.requested")
-
- The metrics instance is managed by the MetricsContext and is available
- throughout the application without needing to pass it through layers.
+ def my_method(self):
+ self.metrics.record_event_published("execution.requested")
"""
def _create_instruments(self) -> None:
diff --git a/backend/app/core/middlewares/rate_limit.py b/backend/app/core/middlewares/rate_limit.py
index a08a708e..56b2da62 100644
--- a/backend/app/core/middlewares/rate_limit.py
+++ b/backend/app/core/middlewares/rate_limit.py
@@ -46,8 +46,6 @@ def __init__(
self.app = app
self.rate_limit_service = rate_limit_service
self.settings = settings
- # Default to enabled unless settings says otherwise
- self.enabled = bool(settings.RATE_LIMIT_ENABLED) if settings else True
async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
if scope["type"] != "http":
@@ -56,7 +54,12 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
path = scope["path"]
- if not self.enabled or path in self.EXCLUDED_PATHS:
+ if path in self.EXCLUDED_PATHS:
+ await self.app(scope, receive, send)
+ return
+
+ # Check if rate limiting is globally disabled via settings
+ if self.settings is not None and not self.settings.RATE_LIMIT_ENABLED:
await self.app(scope, receive, send)
return
diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py
index 3af1e28e..6ce30a01 100644
--- a/backend/app/core/providers.py
+++ b/backend/app/core/providers.py
@@ -9,19 +9,19 @@
from app.core.k8s_clients import K8sClients, close_k8s_clients, create_k8s_clients
from app.core.logging import setup_logger
from app.core.metrics import (
+ ConnectionMetrics,
CoordinatorMetrics,
DatabaseMetrics,
DLQMetrics,
+ EventMetrics,
ExecutionMetrics,
HealthMetrics,
KubernetesMetrics,
NotificationMetrics,
+ RateLimitMetrics,
ReplayMetrics,
SecurityMetrics,
)
-from app.core.metrics.connections import ConnectionMetrics
-from app.core.metrics.events import EventMetrics
-from app.core.metrics.rate_limit import RateLimitMetrics
from app.core.security import SecurityService
from app.core.tracing import TracerManager
from app.db.repositories import (
@@ -41,7 +41,6 @@
from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository
from app.db.repositories.user_settings_repository import UserSettingsRepository
from app.dlq.manager import DLQManager, create_dlq_manager
-from app.domain.enums.kafka import KafkaTopic
from app.domain.saga.models import SagaConfig
from app.events.core import UnifiedProducer
from app.events.event_store import EventStore, create_event_store
@@ -120,7 +119,7 @@ async def get_redis_client(self, settings: Settings, logger: logging.Logger) ->
@provide
def get_rate_limit_service(
- self, redis_client: redis.Redis, settings: Settings, rate_limit_metrics: RateLimitMetrics
+ self, redis_client: redis.Redis, settings: Settings, rate_limit_metrics: RateLimitMetrics
) -> RateLimitService:
return RateLimitService(redis_client, settings, rate_limit_metrics)
@@ -158,16 +157,21 @@ class MessagingProvider(Provider):
@provide
async def get_kafka_producer(
- self, settings: Settings, schema_registry: SchemaRegistryManager, logger: logging.Logger
+ self, settings: Settings, schema_registry: SchemaRegistryManager, logger: logging.Logger,
+ event_metrics: EventMetrics
) -> AsyncIterator[UnifiedProducer]:
- async with UnifiedProducer(schema_registry, logger, settings) as producer:
+ async with UnifiedProducer(schema_registry, logger, settings, event_metrics) as producer:
yield producer
@provide
async def get_dlq_manager(
- self, settings: Settings, schema_registry: SchemaRegistryManager, logger: logging.Logger
+ self,
+ settings: Settings,
+ schema_registry: SchemaRegistryManager,
+ logger: logging.Logger,
+ dlq_metrics: DLQMetrics,
) -> AsyncIterator[DLQManager]:
- async with create_dlq_manager(settings, schema_registry, logger) as manager:
+ async with create_dlq_manager(settings, schema_registry, logger, dlq_metrics) as manager:
yield manager
@provide
@@ -176,9 +180,11 @@ def get_idempotency_repository(self, redis_client: redis.Redis) -> RedisIdempote
@provide
async def get_idempotency_manager(
- self, repo: RedisIdempotencyRepository, logger: logging.Logger
+ self, repo: RedisIdempotencyRepository, logger: logging.Logger, database_metrics: DatabaseMetrics
) -> AsyncIterator[IdempotencyManager]:
- manager = create_idempotency_manager(repository=repo, config=IdempotencyConfig(), logger=logger)
+ manager = create_idempotency_manager(
+ repository=repo, config=IdempotencyConfig(), logger=logger, database_metrics=database_metrics
+ )
await manager.initialize()
try:
yield manager
@@ -194,33 +200,40 @@ def get_schema_registry(self, settings: Settings, logger: logging.Logger) -> Sch
return SchemaRegistryManager(settings, logger)
@provide
- async def get_event_store(self, schema_registry: SchemaRegistryManager, logger: logging.Logger) -> EventStore:
- store = create_event_store(schema_registry=schema_registry, logger=logger, ttl_days=90)
- return store
+ async def get_event_store(
+ self, schema_registry: SchemaRegistryManager, logger: logging.Logger, event_metrics: EventMetrics
+ ) -> EventStore:
+ return create_event_store(
+ schema_registry=schema_registry, logger=logger, event_metrics=event_metrics, ttl_days=90
+ )
@provide
async def get_event_store_consumer(
- self,
- event_store: EventStore,
- schema_registry: SchemaRegistryManager,
- settings: Settings,
- kafka_producer: UnifiedProducer,
- logger: logging.Logger,
+ self,
+ event_store: EventStore,
+ schema_registry: SchemaRegistryManager,
+ settings: Settings,
+ kafka_producer: UnifiedProducer,
+ logger: logging.Logger,
+ event_metrics: EventMetrics,
) -> AsyncIterator[EventStoreConsumer]:
topics = get_all_topics()
async with create_event_store_consumer(
- event_store=event_store,
- topics=list(topics),
- schema_registry_manager=schema_registry,
- settings=settings,
- producer=kafka_producer,
- logger=logger,
+ event_store=event_store,
+ topics=list(topics),
+ schema_registry_manager=schema_registry,
+ settings=settings,
+ producer=kafka_producer,
+ logger=logger,
+ event_metrics=event_metrics,
) as consumer:
yield consumer
@provide
- async def get_event_bus_manager(self, settings: Settings, logger: logging.Logger) -> AsyncIterator[EventBusManager]:
- manager = EventBusManager(settings, logger)
+ async def get_event_bus_manager(
+ self, settings: Settings, logger: logging.Logger, connection_metrics: ConnectionMetrics
+ ) -> AsyncIterator[EventBusManager]:
+ manager = EventBusManager(settings, logger, connection_metrics)
try:
yield manager
finally:
@@ -232,7 +245,7 @@ class KubernetesProvider(Provider):
@provide
async def get_k8s_clients(self, settings: Settings, logger: logging.Logger) -> AsyncIterator[K8sClients]:
- clients = create_k8s_clients(logger)
+ clients = create_k8s_clients(logger, kubeconfig_path=settings.KUBERNETES_CONFIG_PATH)
try:
yield clients
finally:
@@ -240,7 +253,7 @@ async def get_k8s_clients(self, settings: Settings, logger: logging.Logger) -> A
class MetricsProvider(Provider):
- """Provides all metrics instances."""
+ """Provides all metrics instances via DI (no contextvars needed)."""
scope = Scope.APP
@@ -367,35 +380,38 @@ async def get_sse_redis_bus(self, redis_client: redis.Redis, logger: logging.Log
@provide
async def get_sse_kafka_redis_bridge(
- self,
- schema_registry: SchemaRegistryManager,
- settings: Settings,
- event_metrics: EventMetrics,
- sse_redis_bus: SSERedisBus,
- logger: logging.Logger,
+ self,
+ schema_registry: SchemaRegistryManager,
+ settings: Settings,
+ event_metrics: EventMetrics,
+ sse_redis_bus: SSERedisBus,
+ logger: logging.Logger,
) -> AsyncIterator[SSEKafkaRedisBridge]:
async with create_sse_kafka_redis_bridge(
- schema_registry=schema_registry,
- settings=settings,
- event_metrics=event_metrics,
- sse_bus=sse_redis_bus,
- logger=logger,
+ schema_registry=schema_registry,
+ settings=settings,
+ event_metrics=event_metrics,
+ sse_bus=sse_redis_bus,
+ logger=logger,
) as bridge:
yield bridge
@provide(scope=Scope.REQUEST)
- def get_sse_shutdown_manager(self, logger: logging.Logger) -> SSEShutdownManager:
- return create_sse_shutdown_manager(logger=logger)
+ def get_sse_shutdown_manager(
+ self, logger: logging.Logger, connection_metrics: ConnectionMetrics
+ ) -> SSEShutdownManager:
+ return create_sse_shutdown_manager(logger=logger, connection_metrics=connection_metrics)
@provide(scope=Scope.REQUEST)
def get_sse_service(
- self,
- sse_repository: SSERepository,
- router: SSEKafkaRedisBridge,
- sse_redis_bus: SSERedisBus,
- shutdown_manager: SSEShutdownManager,
- settings: Settings,
- logger: logging.Logger,
+ self,
+ sse_repository: SSERepository,
+ router: SSEKafkaRedisBridge,
+ sse_redis_bus: SSERedisBus,
+ shutdown_manager: SSEShutdownManager,
+ settings: Settings,
+ logger: logging.Logger,
+ connection_metrics: ConnectionMetrics,
) -> SSEService:
shutdown_manager.set_router(router)
return SSEService(
@@ -405,6 +421,7 @@ def get_sse_service(
shutdown_manager=shutdown_manager,
settings=settings,
logger=logger,
+ connection_metrics=connection_metrics,
)
@@ -413,7 +430,7 @@ class AuthProvider(Provider):
@provide
def get_auth_service(
- self, user_repository: UserRepository, security_service: SecurityService, logger: logging.Logger
+ self, user_repository: UserRepository, security_service: SecurityService, logger: logging.Logger
) -> AuthService:
return AuthService(user_repository, security_service, logger)
@@ -429,17 +446,19 @@ def get_event_service(self, event_repository: EventRepository) -> EventService:
@provide
def get_kafka_event_service(
- self,
- event_repository: EventRepository,
- kafka_producer: UnifiedProducer,
- settings: Settings,
- logger: logging.Logger,
+ self,
+ event_repository: EventRepository,
+ kafka_producer: UnifiedProducer,
+ settings: Settings,
+ logger: logging.Logger,
+ event_metrics: EventMetrics,
) -> KafkaEventService:
return KafkaEventService(
event_repository=event_repository,
kafka_producer=kafka_producer,
settings=settings,
logger=logger,
+ event_metrics=event_metrics,
)
@@ -448,11 +467,11 @@ class UserServicesProvider(Provider):
@provide
async def get_user_settings_service(
- self,
- repository: UserSettingsRepository,
- kafka_event_service: KafkaEventService,
- event_bus_manager: EventBusManager,
- logger: logging.Logger,
+ self,
+ repository: UserSettingsRepository,
+ kafka_event_service: KafkaEventService,
+ event_bus_manager: EventBusManager,
+ logger: logging.Logger,
) -> UserSettingsService:
service = UserSettingsService(repository, kafka_event_service, logger)
await service.initialize(event_bus_manager)
@@ -464,31 +483,33 @@ class AdminServicesProvider(Provider):
@provide(scope=Scope.REQUEST)
def get_admin_events_service(
- self,
- admin_events_repository: AdminEventsRepository,
- replay_service: ReplayService,
- logger: logging.Logger,
+ self,
+ admin_events_repository: AdminEventsRepository,
+ replay_service: ReplayService,
+ logger: logging.Logger,
) -> AdminEventsService:
return AdminEventsService(admin_events_repository, replay_service, logger)
@provide
def get_admin_settings_service(
- self,
- admin_settings_repository: AdminSettingsRepository,
- logger: logging.Logger,
+ self,
+ admin_settings_repository: AdminSettingsRepository,
+ logger: logging.Logger,
) -> AdminSettingsService:
return AdminSettingsService(admin_settings_repository, logger)
@provide
def get_notification_service(
- self,
- notification_repository: NotificationRepository,
- kafka_event_service: KafkaEventService,
- event_bus_manager: EventBusManager,
- schema_registry: SchemaRegistryManager,
- sse_redis_bus: SSERedisBus,
- settings: Settings,
- logger: logging.Logger,
+ self,
+ notification_repository: NotificationRepository,
+ kafka_event_service: KafkaEventService,
+ event_bus_manager: EventBusManager,
+ schema_registry: SchemaRegistryManager,
+ sse_redis_bus: SSERedisBus,
+ settings: Settings,
+ logger: logging.Logger,
+ notification_metrics: NotificationMetrics,
+ event_metrics: EventMetrics,
) -> NotificationService:
service = NotificationService(
notification_repository=notification_repository,
@@ -498,15 +519,17 @@ def get_notification_service(
sse_bus=sse_redis_bus,
settings=settings,
logger=logger,
+ notification_metrics=notification_metrics,
+ event_metrics=event_metrics,
)
service.initialize()
return service
@provide
def get_grafana_alert_processor(
- self,
- notification_service: NotificationService,
- logger: logging.Logger,
+ self,
+ notification_service: NotificationService,
+ logger: logging.Logger,
) -> GrafanaAlertProcessor:
return GrafanaAlertProcessor(notification_service, logger)
@@ -526,48 +549,54 @@ def _create_default_saga_config() -> SagaConfig:
# Standalone factory functions for lifecycle-managed services (eliminates duplication)
async def _provide_saga_orchestrator(
- saga_repository: SagaRepository,
- kafka_producer: UnifiedProducer,
- schema_registry: SchemaRegistryManager,
- settings: Settings,
- event_store: EventStore,
- idempotency_manager: IdempotencyManager,
- resource_allocation_repository: ResourceAllocationRepository,
- logger: logging.Logger,
+ saga_repository: SagaRepository,
+ kafka_producer: UnifiedProducer,
+ schema_registry: SchemaRegistryManager,
+ settings: Settings,
+ event_store: EventStore,
+ idempotency_manager: IdempotencyManager,
+ resource_allocation_repository: ResourceAllocationRepository,
+ logger: logging.Logger,
+ event_metrics: EventMetrics,
) -> AsyncIterator[SagaOrchestrator]:
"""Shared factory for SagaOrchestrator with lifecycle management."""
async with create_saga_orchestrator(
- saga_repository=saga_repository,
- producer=kafka_producer,
- schema_registry_manager=schema_registry,
- settings=settings,
- event_store=event_store,
- idempotency_manager=idempotency_manager,
- resource_allocation_repository=resource_allocation_repository,
- config=_create_default_saga_config(),
- logger=logger,
+ saga_repository=saga_repository,
+ producer=kafka_producer,
+ schema_registry_manager=schema_registry,
+ settings=settings,
+ event_store=event_store,
+ idempotency_manager=idempotency_manager,
+ resource_allocation_repository=resource_allocation_repository,
+ config=_create_default_saga_config(),
+ logger=logger,
+ event_metrics=event_metrics,
) as orchestrator:
yield orchestrator
async def _provide_execution_coordinator(
- kafka_producer: UnifiedProducer,
- schema_registry: SchemaRegistryManager,
- settings: Settings,
- event_store: EventStore,
- execution_repository: ExecutionRepository,
- idempotency_manager: IdempotencyManager,
- logger: logging.Logger,
+ kafka_producer: UnifiedProducer,
+ schema_registry: SchemaRegistryManager,
+ settings: Settings,
+ event_store: EventStore,
+ execution_repository: ExecutionRepository,
+ idempotency_manager: IdempotencyManager,
+ logger: logging.Logger,
+ coordinator_metrics: CoordinatorMetrics,
+ event_metrics: EventMetrics,
) -> AsyncIterator[ExecutionCoordinator]:
"""Shared factory for ExecutionCoordinator with lifecycle management."""
async with ExecutionCoordinator(
- producer=kafka_producer,
- schema_registry_manager=schema_registry,
- settings=settings,
- event_store=event_store,
- execution_repository=execution_repository,
- idempotency_manager=idempotency_manager,
- logger=logger,
+ producer=kafka_producer,
+ schema_registry_manager=schema_registry,
+ settings=settings,
+ event_store=event_store,
+ execution_repository=execution_repository,
+ idempotency_manager=idempotency_manager,
+ logger=logger,
+ coordinator_metrics=coordinator_metrics,
+ event_metrics=event_metrics,
) as coordinator:
yield coordinator
@@ -582,11 +611,11 @@ def __init__(self) -> None:
@provide
def get_saga_service(
- self,
- saga_repository: SagaRepository,
- execution_repository: ExecutionRepository,
- saga_orchestrator: SagaOrchestrator,
- logger: logging.Logger,
+ self,
+ saga_repository: SagaRepository,
+ execution_repository: ExecutionRepository,
+ saga_orchestrator: SagaOrchestrator,
+ logger: logging.Logger,
) -> SagaService:
return SagaService(
saga_repo=saga_repository,
@@ -597,12 +626,13 @@ def get_saga_service(
@provide
def get_execution_service(
- self,
- execution_repository: ExecutionRepository,
- kafka_producer: UnifiedProducer,
- event_store: EventStore,
- settings: Settings,
- logger: logging.Logger,
+ self,
+ execution_repository: ExecutionRepository,
+ kafka_producer: UnifiedProducer,
+ event_store: EventStore,
+ settings: Settings,
+ logger: logging.Logger,
+ execution_metrics: ExecutionMetrics,
) -> ExecutionService:
return ExecutionService(
execution_repo=execution_repository,
@@ -610,22 +640,23 @@ def get_execution_service(
event_store=event_store,
settings=settings,
logger=logger,
+ execution_metrics=execution_metrics,
)
@provide
def get_saved_script_service(
- self, saved_script_repository: SavedScriptRepository, logger: logging.Logger
+ self, saved_script_repository: SavedScriptRepository, logger: logging.Logger
) -> SavedScriptService:
return SavedScriptService(saved_script_repository, logger)
@provide
async def get_replay_service(
- self,
- replay_repository: ReplayRepository,
- kafka_producer: UnifiedProducer,
- event_store: EventStore,
- settings: Settings,
- logger: logging.Logger,
+ self,
+ replay_repository: ReplayRepository,
+ kafka_producer: UnifiedProducer,
+ event_store: EventStore,
+ settings: Settings,
+ logger: logging.Logger,
) -> ReplayService:
event_replay_service = EventReplayService(
repository=replay_repository,
@@ -638,13 +669,13 @@ async def get_replay_service(
@provide
def get_admin_user_service(
- self,
- admin_user_repository: AdminUserRepository,
- event_service: EventService,
- execution_service: ExecutionService,
- rate_limit_service: RateLimitService,
- security_service: SecurityService,
- logger: logging.Logger,
+ self,
+ admin_user_repository: AdminUserRepository,
+ event_service: EventService,
+ execution_service: ExecutionService,
+ rate_limit_service: RateLimitService,
+ security_service: SecurityService,
+ logger: logging.Logger,
) -> AdminUserService:
return AdminUserService(
user_repository=admin_user_repository,
@@ -669,23 +700,25 @@ class K8sWorkerProvider(Provider):
@provide
async def get_kubernetes_worker(
- self,
- kafka_producer: UnifiedProducer,
- schema_registry: SchemaRegistryManager,
- settings: Settings,
- event_store: EventStore,
- idempotency_manager: IdempotencyManager,
- logger: logging.Logger,
+ self,
+ kafka_producer: UnifiedProducer,
+ schema_registry: SchemaRegistryManager,
+ settings: Settings,
+ event_store: EventStore,
+ idempotency_manager: IdempotencyManager,
+ logger: logging.Logger,
+ event_metrics: EventMetrics,
) -> AsyncIterator[KubernetesWorker]:
config = K8sWorkerConfig()
async with KubernetesWorker(
- config=config,
- producer=kafka_producer,
- schema_registry_manager=schema_registry,
- settings=settings,
- event_store=event_store,
- idempotency_manager=idempotency_manager,
- logger=logger,
+ config=config,
+ producer=kafka_producer,
+ schema_registry_manager=schema_registry,
+ settings=settings,
+ event_store=event_store,
+ idempotency_manager=idempotency_manager,
+ logger=logger,
+ event_metrics=event_metrics,
) as worker:
yield worker
@@ -695,27 +728,29 @@ class PodMonitorProvider(Provider):
@provide
def get_event_mapper(
- self,
- logger: logging.Logger,
- k8s_clients: K8sClients,
+ self,
+ logger: logging.Logger,
+ k8s_clients: K8sClients,
) -> PodEventMapper:
return PodEventMapper(logger=logger, k8s_api=k8s_clients.v1)
@provide
async def get_pod_monitor(
- self,
- kafka_event_service: KafkaEventService,
- k8s_clients: K8sClients,
- logger: logging.Logger,
- event_mapper: PodEventMapper,
+ self,
+ kafka_event_service: KafkaEventService,
+ k8s_clients: K8sClients,
+ logger: logging.Logger,
+ event_mapper: PodEventMapper,
+ kubernetes_metrics: KubernetesMetrics,
) -> AsyncIterator[PodMonitor]:
config = PodMonitorConfig()
async with PodMonitor(
- config=config,
- kafka_event_service=kafka_event_service,
- logger=logger,
- k8s_clients=k8s_clients,
- event_mapper=event_mapper,
+ config=config,
+ kafka_event_service=kafka_event_service,
+ logger=logger,
+ k8s_clients=k8s_clients,
+ event_mapper=event_mapper,
+ kubernetes_metrics=kubernetes_metrics,
) as monitor:
yield monitor
@@ -733,12 +768,12 @@ class EventReplayProvider(Provider):
@provide
def get_event_replay_service(
- self,
- replay_repository: ReplayRepository,
- kafka_producer: UnifiedProducer,
- event_store: EventStore,
- settings: Settings,
- logger: logging.Logger,
+ self,
+ replay_repository: ReplayRepository,
+ kafka_producer: UnifiedProducer,
+ event_store: EventStore,
+ settings: Settings,
+ logger: logging.Logger,
) -> EventReplayService:
return EventReplayService(
repository=replay_repository,
@@ -747,23 +782,3 @@ def get_event_replay_service(
settings=settings,
logger=logger,
)
-
-
-class DLQProcessorProvider(Provider):
- scope = Scope.APP
-
- @provide
- async def get_dlq_manager(
- self,
- settings: Settings,
- schema_registry: SchemaRegistryManager,
- logger: logging.Logger,
- ) -> AsyncIterator[DLQManager]:
- async with create_dlq_manager(
- settings=settings,
- schema_registry=schema_registry,
- logger=logger,
- dlq_topic=KafkaTopic.DEAD_LETTER_QUEUE,
- retry_topic_suffix="-retry",
- ) as manager:
- yield manager
diff --git a/backend/app/core/startup.py b/backend/app/core/startup.py
index afabada3..549c3cb8 100644
--- a/backend/app/core/startup.py
+++ b/backend/app/core/startup.py
@@ -1,69 +1,25 @@
import logging
import redis.asyncio as redis
-from dishka import AsyncContainer
-from app.core.metrics import (
- ConnectionMetrics,
- CoordinatorMetrics,
- DatabaseMetrics,
- DLQMetrics,
- EventMetrics,
- ExecutionMetrics,
- HealthMetrics,
- KubernetesMetrics,
- NotificationMetrics,
- RateLimitMetrics,
- ReplayMetrics,
- SecurityMetrics,
-)
-from app.core.metrics.context import MetricsContext, get_rate_limit_metrics
+from app.core.metrics import RateLimitMetrics
from app.domain.rate_limit import RateLimitConfig
from app.services.rate_limit_service import RateLimitService
from app.settings import Settings
-async def initialize_metrics_context(container: AsyncContainer, logger: logging.Logger) -> None:
- try:
- # Get all metrics from the container
- # These are created as APP-scoped singletons by providers
- metrics_mapping = {}
-
- # Only add metrics that are actually provided by the container
- # Some metrics might not be needed for certain deployments
- metrics_mapping["event"] = await container.get(EventMetrics)
- metrics_mapping["connection"] = await container.get(ConnectionMetrics)
- metrics_mapping["rate_limit"] = await container.get(RateLimitMetrics)
- metrics_mapping["execution"] = await container.get(ExecutionMetrics)
- metrics_mapping["database"] = await container.get(DatabaseMetrics)
- metrics_mapping["health"] = await container.get(HealthMetrics)
- metrics_mapping["kubernetes"] = await container.get(KubernetesMetrics)
- metrics_mapping["coordinator"] = await container.get(CoordinatorMetrics)
- metrics_mapping["dlq"] = await container.get(DLQMetrics)
- metrics_mapping["notification"] = await container.get(NotificationMetrics)
- metrics_mapping["replay"] = await container.get(ReplayMetrics)
- metrics_mapping["security"] = await container.get(SecurityMetrics)
-
- # Initialize the context with available metrics
- MetricsContext.initialize_all(logger=logger, **metrics_mapping)
-
- logger.info(f"Initialized metrics context with {len(metrics_mapping)} metric types")
-
- except Exception as e:
- logger.error(f"Failed to initialize metrics context: {e}")
- # Don't fail startup if metrics init fails
- # The context will lazy-initialize metrics as needed
-
-
-async def initialize_rate_limits(redis_client: redis.Redis, settings: Settings, logger: logging.Logger) -> None:
+async def initialize_rate_limits(
+ redis_client: redis.Redis,
+ settings: Settings,
+ logger: logging.Logger,
+ rate_limit_metrics: RateLimitMetrics,
+) -> None:
"""
Initialize default rate limits in Redis on application startup.
This ensures default limits are always available.
"""
try:
- # Create metrics instance
- metrics = get_rate_limit_metrics()
- service = RateLimitService(redis_client, settings, metrics)
+ service = RateLimitService(redis_client, settings, rate_limit_metrics)
# Check if config already exists
config_key = f"{settings.RATE_LIMIT_REDIS_PREFIX}config"
diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py
index 27aacfdf..1d450a03 100644
--- a/backend/app/dlq/manager.py
+++ b/backend/app/dlq/manager.py
@@ -8,7 +8,7 @@
from opentelemetry.trace import SpanKind
from app.core.lifecycle import LifecycleEnabled
-from app.core.metrics.context import get_dlq_metrics
+from app.core.metrics import DLQMetrics
from app.core.tracing import EventAttributes
from app.core.tracing.utils import extract_trace_context, get_tracer, inject_trace_context
from app.db.docs import DLQMessageDocument
@@ -40,13 +40,14 @@ def __init__(
producer: AIOKafkaProducer,
schema_registry: SchemaRegistryManager,
logger: logging.Logger,
+ dlq_metrics: DLQMetrics,
dlq_topic: KafkaTopic = KafkaTopic.DEAD_LETTER_QUEUE,
retry_topic_suffix: str = "-retry",
default_retry_policy: RetryPolicy | None = None,
):
super().__init__()
self.settings = settings
- self.metrics = get_dlq_metrics()
+ self.metrics = dlq_metrics
self.schema_registry = schema_registry
self.logger = logger
self.dlq_topic = dlq_topic
@@ -77,9 +78,8 @@ def _kafka_msg_to_message(self, msg: Any) -> DLQMessage:
async def _on_start(self) -> None:
"""Start DLQ manager."""
- # Start producer and consumer
- await self.producer.start()
- await self.consumer.start()
+ # Start producer and consumer in parallel for faster startup
+ await asyncio.gather(self.producer.start(), self.consumer.start())
# Start processing tasks
self._process_task = asyncio.create_task(self._process_messages())
@@ -444,6 +444,7 @@ def create_dlq_manager(
settings: Settings,
schema_registry: SchemaRegistryManager,
logger: logging.Logger,
+ dlq_metrics: DLQMetrics,
dlq_topic: KafkaTopic = KafkaTopic.DEAD_LETTER_QUEUE,
retry_topic_suffix: str = "-retry",
default_retry_policy: RetryPolicy | None = None,
@@ -478,6 +479,7 @@ def create_dlq_manager(
producer=producer,
schema_registry=schema_registry,
logger=logger,
+ dlq_metrics=dlq_metrics,
dlq_topic=dlq_topic,
retry_topic_suffix=retry_topic_suffix,
default_retry_policy=default_retry_policy,
diff --git a/backend/app/events/core/consumer.py b/backend/app/events/core/consumer.py
index 01556751..d0532f37 100644
--- a/backend/app/events/core/consumer.py
+++ b/backend/app/events/core/consumer.py
@@ -8,7 +8,7 @@
from aiokafka.errors import KafkaError
from opentelemetry.trace import SpanKind
-from app.core.metrics.context import get_event_metrics
+from app.core.metrics import EventMetrics
from app.core.tracing import EventAttributes
from app.core.tracing.utils import extract_trace_context, get_tracer
from app.domain.enums.kafka import KafkaTopic
@@ -28,6 +28,7 @@ def __init__(
schema_registry: SchemaRegistryManager,
settings: Settings,
logger: logging.Logger,
+ event_metrics: EventMetrics,
):
self._config = config
self.logger = logger
@@ -37,7 +38,7 @@ def __init__(
self._state = ConsumerState.STOPPED
self._running = False
self._metrics = ConsumerMetrics()
- self._event_metrics = get_event_metrics() # Singleton for Kafka metrics
+ self._event_metrics = event_metrics
self._error_callback: "Callable[[Exception, DomainEvent], Awaitable[None]] | None" = None
self._consume_task: asyncio.Task[None] | None = None
self._topic_prefix = settings.KAFKA_TOPIC_PREFIX
diff --git a/backend/app/events/core/producer.py b/backend/app/events/core/producer.py
index c5848aec..a41188c7 100644
--- a/backend/app/events/core/producer.py
+++ b/backend/app/events/core/producer.py
@@ -9,7 +9,7 @@
from aiokafka.errors import KafkaError
from app.core.lifecycle import LifecycleEnabled
-from app.core.metrics.context import get_event_metrics
+from app.core.metrics import EventMetrics
from app.dlq.models import DLQMessage, DLQMessageStatus
from app.domain.enums.kafka import KafkaTopic
from app.domain.events.typed import DomainEvent
@@ -28,6 +28,7 @@ def __init__(
schema_registry_manager: SchemaRegistryManager,
logger: logging.Logger,
settings: Settings,
+ event_metrics: EventMetrics,
):
super().__init__()
self._settings = settings
@@ -36,7 +37,7 @@ def __init__(
self._producer: AIOKafkaProducer | None = None
self._state = ProducerState.STOPPED
self._metrics = ProducerMetrics()
- self._event_metrics = get_event_metrics()
+ self._event_metrics = event_metrics
self._topic_prefix = settings.KAFKA_TOPIC_PREFIX
@property
diff --git a/backend/app/events/event_store.py b/backend/app/events/event_store.py
index 0c475cc3..026ae84a 100644
--- a/backend/app/events/event_store.py
+++ b/backend/app/events/event_store.py
@@ -7,7 +7,7 @@
from beanie.odm.enums import SortDirection
from pymongo.errors import BulkWriteError, DuplicateKeyError
-from app.core.metrics.context import get_event_metrics
+from app.core.metrics import EventMetrics
from app.core.tracing import EventAttributes
from app.core.tracing.utils import add_span_attributes
from app.db.docs import EventDocument
@@ -21,10 +21,11 @@ def __init__(
self,
schema_registry: SchemaRegistryManager,
logger: logging.Logger,
+ event_metrics: EventMetrics,
ttl_days: int = 90,
batch_size: int = 100,
):
- self.metrics = get_event_metrics()
+ self.metrics = event_metrics
self.schema_registry = schema_registry
self.logger = logger
self.ttl_days = ttl_days
@@ -317,12 +318,14 @@ async def health_check(self) -> dict[str, Any]:
def create_event_store(
schema_registry: SchemaRegistryManager,
logger: logging.Logger,
+ event_metrics: EventMetrics,
ttl_days: int = 90,
batch_size: int = 100,
) -> EventStore:
return EventStore(
schema_registry=schema_registry,
logger=logger,
+ event_metrics=event_metrics,
ttl_days=ttl_days,
batch_size=batch_size,
)
diff --git a/backend/app/events/event_store_consumer.py b/backend/app/events/event_store_consumer.py
index 4f2ba47d..41135a95 100644
--- a/backend/app/events/event_store_consumer.py
+++ b/backend/app/events/event_store_consumer.py
@@ -4,6 +4,7 @@
from opentelemetry.trace import SpanKind
from app.core.lifecycle import LifecycleEnabled
+from app.core.metrics import EventMetrics
from app.core.tracing.utils import trace_span
from app.domain.enums.events import EventType
from app.domain.enums.kafka import GroupId, KafkaTopic
@@ -24,6 +25,7 @@ def __init__(
schema_registry_manager: SchemaRegistryManager,
settings: Settings,
logger: logging.Logger,
+ event_metrics: EventMetrics,
producer: UnifiedProducer | None = None,
group_id: GroupId = GroupId.EVENT_STORE_CONSUMER,
batch_size: int = 100,
@@ -37,6 +39,7 @@ def __init__(
self.batch_size = batch_size
self.batch_timeout = batch_timeout_seconds
self.logger = logger
+ self.event_metrics = event_metrics
self.consumer: UnifiedConsumer | None = None
self.schema_registry_manager = schema_registry_manager
self.dispatcher = EventDispatcher(logger)
@@ -66,6 +69,7 @@ async def _on_start(self) -> None:
schema_registry=self.schema_registry_manager,
settings=self.settings,
logger=self.logger,
+ event_metrics=self.event_metrics,
)
# Register handler for all event types - store everything
@@ -166,6 +170,7 @@ def create_event_store_consumer(
schema_registry_manager: SchemaRegistryManager,
settings: Settings,
logger: logging.Logger,
+ event_metrics: EventMetrics,
producer: UnifiedProducer | None = None,
group_id: GroupId = GroupId.EVENT_STORE_CONSUMER,
batch_size: int = 100,
@@ -180,5 +185,6 @@ def create_event_store_consumer(
schema_registry_manager=schema_registry_manager,
settings=settings,
logger=logger,
+ event_metrics=event_metrics,
producer=producer,
)
diff --git a/backend/app/main.py b/backend/app/main.py
index 52af39bb..bf776a41 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -31,21 +31,6 @@
from app.core.dishka_lifespan import lifespan
from app.core.exceptions import configure_exception_handlers
from app.core.logging import setup_logger
-from app.core.metrics import (
- ConnectionMetrics,
- CoordinatorMetrics,
- DatabaseMetrics,
- DLQMetrics,
- EventMetrics,
- ExecutionMetrics,
- HealthMetrics,
- KubernetesMetrics,
- NotificationMetrics,
- RateLimitMetrics,
- ReplayMetrics,
- SecurityMetrics,
-)
-from app.core.metrics.context import MetricsContext
from app.core.middlewares import (
CacheControlMiddleware,
CSRFMiddleware,
@@ -68,22 +53,8 @@ def create_app(settings: Settings | None = None) -> FastAPI:
settings = settings or Settings()
logger = setup_logger(settings.LOG_LEVEL)
- # Initialize metrics context for all services
- MetricsContext.initialize_all(
- logger,
- connection=ConnectionMetrics(settings),
- coordinator=CoordinatorMetrics(settings),
- database=DatabaseMetrics(settings),
- dlq=DLQMetrics(settings),
- event=EventMetrics(settings),
- execution=ExecutionMetrics(settings),
- health=HealthMetrics(settings),
- kubernetes=KubernetesMetrics(settings),
- notification=NotificationMetrics(settings),
- rate_limit=RateLimitMetrics(settings),
- replay=ReplayMetrics(settings),
- security=SecurityMetrics(settings),
- )
+ # Note: Metrics are now provided via DI (MetricsProvider) and injected into services.
+ # No manual MetricsContext initialization is needed.
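+    # Metric classes (e.g. EventMetrics, ExecutionMetrics) are APP-scoped singletons resolved from the Dishka container.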
# Disable OpenAPI/Docs in production for security; health endpoints provide readiness
app = FastAPI(
@@ -99,9 +70,7 @@ def create_app(settings: Settings | None = None) -> FastAPI:
setup_metrics(app, settings, logger)
app.add_middleware(MetricsMiddleware)
- if settings.RATE_LIMIT_ENABLED:
- app.add_middleware(RateLimitMiddleware)
-
+ app.add_middleware(RateLimitMiddleware, settings=settings)
app.add_middleware(CSRFMiddleware, container=container)
app.add_middleware(CorrelationMiddleware)
app.add_middleware(RequestSizeLimitMiddleware)
diff --git a/backend/app/services/coordinator/coordinator.py b/backend/app/services/coordinator/coordinator.py
index b2610e59..5f93ceb6 100644
--- a/backend/app/services/coordinator/coordinator.py
+++ b/backend/app/services/coordinator/coordinator.py
@@ -6,7 +6,7 @@
from uuid import uuid4
from app.core.lifecycle import LifecycleEnabled
-from app.core.metrics.context import get_coordinator_metrics
+from app.core.metrics import CoordinatorMetrics, EventMetrics
from app.db.repositories.execution_repository import ExecutionRepository
from app.domain.enums.events import EventType
from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId
@@ -56,13 +56,16 @@ def __init__(
execution_repository: ExecutionRepository,
idempotency_manager: IdempotencyManager,
logger: logging.Logger,
+ coordinator_metrics: CoordinatorMetrics,
+ event_metrics: EventMetrics,
consumer_group: str = GroupId.EXECUTION_COORDINATOR,
max_concurrent_scheduling: int = 10,
scheduling_interval_seconds: float = 0.5,
):
super().__init__()
self.logger = logger
- self.metrics = get_coordinator_metrics()
+ self.metrics = coordinator_metrics
+ self._event_metrics = event_metrics
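+        # event_metrics is forwarded to the UnifiedConsumer created in _on_start()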
self._settings = settings
# Kafka configuration
@@ -71,11 +74,19 @@ def __init__(
# Components
self.queue_manager = QueueManager(
- logger=self.logger, max_queue_size=10000, max_executions_per_user=100, stale_timeout_seconds=3600
+ logger=self.logger,
+ coordinator_metrics=coordinator_metrics,
+ max_queue_size=10000,
+ max_executions_per_user=100,
+ stale_timeout_seconds=3600,
)
self.resource_manager = ResourceManager(
- logger=self.logger, total_cpu_cores=32.0, total_memory_mb=65536, total_gpu_count=0
+ logger=self.logger,
+ coordinator_metrics=coordinator_metrics,
+ total_cpu_cores=32.0,
+ total_memory_mb=65536,
+ total_gpu_count=0,
)
# Kafka components
@@ -127,6 +138,7 @@ async def _on_start(self) -> None:
schema_registry=self._schema_registry_manager,
settings=self._settings,
logger=self.logger,
+ event_metrics=self._event_metrics,
)
# Register handlers with EventDispatcher BEFORE wrapping with idempotency
diff --git a/backend/app/services/coordinator/queue_manager.py b/backend/app/services/coordinator/queue_manager.py
index 64ba66c8..b8ac98eb 100644
--- a/backend/app/services/coordinator/queue_manager.py
+++ b/backend/app/services/coordinator/queue_manager.py
@@ -7,7 +7,7 @@
from enum import IntEnum
from typing import Any, Dict, List, Tuple
-from app.core.metrics.context import get_coordinator_metrics
+from app.core.metrics import CoordinatorMetrics
from app.domain.events.typed import ExecutionRequestedEvent
@@ -43,12 +43,13 @@ class QueueManager:
def __init__(
self,
logger: logging.Logger,
+ coordinator_metrics: CoordinatorMetrics,
max_queue_size: int = 10000,
max_executions_per_user: int = 100,
stale_timeout_seconds: int = 3600,
) -> None:
self.logger = logger
- self.metrics = get_coordinator_metrics()
+ self.metrics = coordinator_metrics
self.max_queue_size = max_queue_size
self.max_executions_per_user = max_executions_per_user
self.stale_timeout_seconds = stale_timeout_seconds
diff --git a/backend/app/services/coordinator/resource_manager.py b/backend/app/services/coordinator/resource_manager.py
index 8910852f..de2cbba6 100644
--- a/backend/app/services/coordinator/resource_manager.py
+++ b/backend/app/services/coordinator/resource_manager.py
@@ -3,7 +3,7 @@
from dataclasses import dataclass
from typing import Dict, List
-from app.core.metrics.context import get_coordinator_metrics
+from app.core.metrics import CoordinatorMetrics
@dataclass
@@ -86,13 +86,14 @@ class ResourceManager:
def __init__(
self,
logger: logging.Logger,
+ coordinator_metrics: CoordinatorMetrics,
total_cpu_cores: float = 32.0,
total_memory_mb: int = 65536, # 64GB
total_gpu_count: int = 0,
overcommit_factor: float = 1.2, # Allow 20% overcommit
):
self.logger = logger
- self.metrics = get_coordinator_metrics()
+ self.metrics = coordinator_metrics
self.pool = ResourcePool(
total_cpu_cores=total_cpu_cores * overcommit_factor,
total_memory_mb=int(total_memory_mb * overcommit_factor),
diff --git a/backend/app/services/event_bus.py b/backend/app/services/event_bus.py
index 455085d4..bd0080ee 100644
--- a/backend/app/services/event_bus.py
+++ b/backend/app/services/event_bus.py
@@ -13,7 +13,7 @@
from pydantic import BaseModel, ConfigDict
from app.core.lifecycle import LifecycleEnabled
-from app.core.metrics.context import get_connection_metrics
+from app.core.metrics import ConnectionMetrics
from app.domain.enums.kafka import KafkaTopic
from app.settings import Settings
@@ -53,11 +53,11 @@ class EventBus(LifecycleEnabled):
- *.completed - matches all completed events
"""
- def __init__(self, settings: Settings, logger: logging.Logger) -> None:
+ def __init__(self, settings: Settings, logger: logging.Logger, connection_metrics: ConnectionMetrics) -> None:
super().__init__()
self.logger = logger
self.settings = settings
- self.metrics = get_connection_metrics()
+ self.metrics = connection_metrics
self.producer: Optional[AIOKafkaProducer] = None
self.consumer: Optional[AIOKafkaConsumer] = None
self._subscriptions: dict[str, Subscription] = {} # id -> Subscription
@@ -83,7 +83,6 @@ async def _initialize_kafka(self) -> None:
max_batch_size=16384,
enable_idempotence=True,
)
- await self.producer.start()
# Consumer setup
self.consumer = AIOKafkaConsumer(
@@ -98,7 +97,9 @@ async def _initialize_kafka(self) -> None:
max_poll_interval_ms=self.settings.KAFKA_MAX_POLL_INTERVAL_MS,
request_timeout_ms=self.settings.KAFKA_REQUEST_TIMEOUT_MS,
)
- await self.consumer.start()
+
+ # Start both in parallel for faster startup
+ await asyncio.gather(self.producer.start(), self.consumer.start())
async def _on_stop(self) -> None:
"""Stop the event bus and clean up resources."""
@@ -318,9 +319,10 @@ async def get_statistics(self) -> dict[str, Any]:
class EventBusManager:
"""Manages EventBus lifecycle as a singleton."""
- def __init__(self, settings: Settings, logger: logging.Logger) -> None:
+ def __init__(self, settings: Settings, logger: logging.Logger, connection_metrics: ConnectionMetrics) -> None:
self.settings = settings
self.logger = logger
+ self._connection_metrics = connection_metrics
self._event_bus: Optional[EventBus] = None
self._lock = asyncio.Lock()
@@ -328,7 +330,7 @@ async def get_event_bus(self) -> EventBus:
"""Get or create the event bus instance."""
async with self._lock:
if self._event_bus is None:
- self._event_bus = EventBus(self.settings, self.logger)
+ self._event_bus = EventBus(self.settings, self.logger, self._connection_metrics)
await self._event_bus.__aenter__()
return self._event_bus
diff --git a/backend/app/services/execution_service.py b/backend/app/services/execution_service.py
index e4455c36..fb394342 100644
--- a/backend/app/services/execution_service.py
+++ b/backend/app/services/execution_service.py
@@ -5,7 +5,7 @@
from typing import Any, Generator, TypeAlias
from app.core.correlation import CorrelationContext
-from app.core.metrics.context import get_execution_metrics
+from app.core.metrics import ExecutionMetrics
from app.db.repositories.execution_repository import ExecutionRepository
from app.domain.enums.events import EventType
from app.domain.enums.execution import ExecutionStatus
@@ -52,6 +52,7 @@ def __init__(
event_store: EventStore,
settings: Settings,
logger: logging.Logger,
+ execution_metrics: ExecutionMetrics,
) -> None:
"""
Initialize execution service.
@@ -62,13 +63,14 @@ def __init__(
event_store: Event store for event persistence.
settings: Application settings.
logger: Logger instance.
+ execution_metrics: Metrics for tracking execution operations.
"""
self.execution_repo = execution_repo
self.producer = producer
self.event_store = event_store
self.settings = settings
self.logger = logger
- self.metrics = get_execution_metrics()
+ self.metrics = execution_metrics
@contextmanager
def _track_active_execution(self) -> Generator[None, None, None]: # noqa: D401
diff --git a/backend/app/services/idempotency/idempotency_manager.py b/backend/app/services/idempotency/idempotency_manager.py
index 90757740..e30b6efe 100644
--- a/backend/app/services/idempotency/idempotency_manager.py
+++ b/backend/app/services/idempotency/idempotency_manager.py
@@ -8,7 +8,7 @@
from pydantic import BaseModel
from pymongo.errors import DuplicateKeyError
-from app.core.metrics.context import get_database_metrics
+from app.core.metrics import DatabaseMetrics
from app.domain.events.typed import BaseEvent
from app.domain.idempotency import IdempotencyRecord, IdempotencyStats, IdempotencyStatus
@@ -67,9 +67,15 @@ async def health_check(self) -> None: ...
class IdempotencyManager:
- def __init__(self, config: IdempotencyConfig, repository: IdempotencyRepoProtocol, logger: logging.Logger) -> None:
+ def __init__(
+ self,
+ config: IdempotencyConfig,
+ repository: IdempotencyRepoProtocol,
+ logger: logging.Logger,
+ database_metrics: DatabaseMetrics,
+ ) -> None:
self.config = config
- self.metrics = get_database_metrics()
+ self.metrics = database_metrics
self._repo: IdempotencyRepoProtocol = repository
self._stats_update_task: asyncio.Task[None] | None = None
self.logger = logger
@@ -320,5 +326,6 @@ def create_idempotency_manager(
repository: IdempotencyRepoProtocol,
config: IdempotencyConfig | None = None,
logger: logging.Logger,
+ database_metrics: DatabaseMetrics,
) -> IdempotencyManager:
- return IdempotencyManager(config or IdempotencyConfig(), repository, logger)
+ return IdempotencyManager(config or IdempotencyConfig(), repository, logger, database_metrics)
diff --git a/backend/app/services/k8s_worker/worker.py b/backend/app/services/k8s_worker/worker.py
index 5a1c0ccc..cd9af936 100644
--- a/backend/app/services/k8s_worker/worker.py
+++ b/backend/app/services/k8s_worker/worker.py
@@ -10,7 +10,7 @@
from kubernetes.client.rest import ApiException
from app.core.lifecycle import LifecycleEnabled
-from app.core.metrics import ExecutionMetrics, KubernetesMetrics
+from app.core.metrics import EventMetrics, ExecutionMetrics, KubernetesMetrics
from app.domain.enums.events import EventType
from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId
from app.domain.enums.storage import ExecutionErrorType
@@ -56,8 +56,10 @@ def __init__(
event_store: EventStore,
idempotency_manager: IdempotencyManager,
logger: logging.Logger,
+ event_metrics: EventMetrics,
):
super().__init__()
+ self._event_metrics = event_metrics
self.logger = logger
self.metrics = KubernetesMetrics(settings)
self.execution_metrics = ExecutionMetrics(settings)
@@ -126,6 +128,7 @@ async def _on_start(self) -> None:
schema_registry=self._schema_registry_manager,
settings=self._settings,
logger=self.logger,
+ event_metrics=self._event_metrics,
)
# Wrap consumer with idempotency - use content hash for pod commands
diff --git a/backend/app/services/kafka_event_service.py b/backend/app/services/kafka_event_service.py
index 1b25a34f..b0a3bcb7 100644
--- a/backend/app/services/kafka_event_service.py
+++ b/backend/app/services/kafka_event_service.py
@@ -7,7 +7,7 @@
from opentelemetry import trace
from app.core.correlation import CorrelationContext
-from app.core.metrics.context import get_event_metrics
+from app.core.metrics import EventMetrics
from app.core.tracing.utils import inject_trace_context
from app.db.repositories.event_repository import EventRepository
from app.domain.enums.events import EventType
@@ -26,11 +26,12 @@ def __init__(
kafka_producer: UnifiedProducer,
settings: Settings,
logger: logging.Logger,
+ event_metrics: EventMetrics,
):
self.event_repository = event_repository
self.kafka_producer = kafka_producer
self.logger = logger
- self.metrics = get_event_metrics()
+ self.metrics = event_metrics
self.settings = settings
async def publish_event(
diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py
index eb6f79ad..780f1279 100644
--- a/backend/app/services/notification_service.py
+++ b/backend/app/services/notification_service.py
@@ -7,7 +7,7 @@
import httpx
-from app.core.metrics.context import get_notification_metrics
+from app.core.metrics import EventMetrics, NotificationMetrics
from app.core.tracing.utils import add_span_attributes
from app.core.utils import StringEnum
from app.db.repositories.notification_repository import NotificationRepository
@@ -122,11 +122,14 @@ def __init__(
sse_bus: SSERedisBus,
settings: Settings,
logger: logging.Logger,
+ notification_metrics: NotificationMetrics,
+ event_metrics: EventMetrics,
) -> None:
self.repository = notification_repository
self.event_service = event_service
self.event_bus_manager = event_bus_manager
- self.metrics = get_notification_metrics()
+ self.metrics = notification_metrics
+ self._event_metrics = event_metrics
self.settings = settings
self.schema_registry_manager = schema_registry_manager
self.sse_bus = sse_bus
@@ -247,6 +250,7 @@ async def _subscribe_to_events(self) -> None:
schema_registry=self.schema_registry_manager,
settings=self.settings,
logger=self.logger,
+ event_metrics=self._event_metrics,
)
# Start consumer
diff --git a/backend/app/services/pod_monitor/monitor.py b/backend/app/services/pod_monitor/monitor.py
index f6325ab3..ecbb4556 100644
--- a/backend/app/services/pod_monitor/monitor.py
+++ b/backend/app/services/pod_monitor/monitor.py
@@ -12,7 +12,7 @@
from app.core.k8s_clients import K8sClients, close_k8s_clients, create_k8s_clients
from app.core.lifecycle import LifecycleEnabled
-from app.core.metrics.context import get_kubernetes_metrics
+from app.core.metrics import KubernetesMetrics
from app.core.utils import StringEnum
from app.domain.events.typed import DomainEvent
from app.services.kafka_event_service import KafkaEventService
@@ -104,6 +104,7 @@ def __init__(
logger: logging.Logger,
k8s_clients: K8sClients,
event_mapper: PodEventMapper,
+ kubernetes_metrics: KubernetesMetrics,
) -> None:
"""Initialize the pod monitor with all required dependencies.
@@ -134,7 +135,7 @@ def __init__(
self._reconcile_task: asyncio.Task[None] | None = None
# Metrics
- self._metrics = get_kubernetes_metrics()
+ self._metrics = kubernetes_metrics
@property
def state(self) -> MonitorState:
@@ -462,6 +463,7 @@ async def create_pod_monitor(
config: PodMonitorConfig,
kafka_event_service: KafkaEventService,
logger: logging.Logger,
+ kubernetes_metrics: KubernetesMetrics,
k8s_clients: K8sClients | None = None,
event_mapper: PodEventMapper | None = None,
) -> AsyncIterator[PodMonitor]:
@@ -491,6 +493,7 @@ async def create_pod_monitor(
logger=logger,
k8s_clients=k8s_clients,
event_mapper=event_mapper,
+ kubernetes_metrics=kubernetes_metrics,
)
try:
diff --git a/backend/app/services/rate_limit_service.py b/backend/app/services/rate_limit_service.py
index d28204de..1b5393a0 100644
--- a/backend/app/services/rate_limit_service.py
+++ b/backend/app/services/rate_limit_service.py
@@ -9,7 +9,7 @@
import redis.asyncio as redis
-from app.core.metrics.rate_limit import RateLimitMetrics
+from app.core.metrics import RateLimitMetrics
from app.core.tracing.utils import add_span_attributes
from app.domain.rate_limit import (
EndpointGroup,
@@ -200,18 +200,6 @@ async def check_rate_limit(
)
try:
- if not self.settings.RATE_LIMIT_ENABLED:
- # Track request when rate limiting is disabled
- self.metrics.requests_total.add(
- 1,
- {
- "authenticated": str(ctx.authenticated).lower(),
- "endpoint": ctx.normalized_endpoint,
- "algorithm": "disabled",
- },
- )
- return self._unlimited()
-
if config is None:
with self._timer(self.metrics.redis_duration, {"operation": "get_config"}):
config = await self._get_config()
diff --git a/backend/app/services/result_processor/processor.py b/backend/app/services/result_processor/processor.py
index 530dbd15..3f9864db 100644
--- a/backend/app/services/result_processor/processor.py
+++ b/backend/app/services/result_processor/processor.py
@@ -5,7 +5,7 @@
from pydantic import BaseModel, ConfigDict, Field
from app.core.lifecycle import LifecycleEnabled
-from app.core.metrics.context import get_execution_metrics
+from app.core.metrics import EventMetrics, ExecutionMetrics
from app.core.utils import StringEnum
from app.db.repositories.execution_repository import ExecutionRepository
from app.domain.enums.events import EventType
@@ -62,6 +62,8 @@ def __init__(
settings: Settings,
idempotency_manager: IdempotencyManager,
logger: logging.Logger,
+ execution_metrics: ExecutionMetrics,
+ event_metrics: EventMetrics,
) -> None:
"""Initialize the result processor."""
super().__init__()
@@ -70,7 +72,8 @@ def __init__(
self._producer = producer
self._schema_registry = schema_registry
self._settings = settings
- self._metrics = get_execution_metrics()
+ self._metrics = execution_metrics
+ self._event_metrics = event_metrics
self._idempotency_manager: IdempotencyManager = idempotency_manager
self._state = ProcessingState.IDLE
self._consumer: IdempotentConsumerWrapper | None = None
@@ -137,6 +140,7 @@ async def _create_consumer(self) -> IdempotentConsumerWrapper:
schema_registry=self._schema_registry,
settings=self._settings,
logger=self.logger,
+ event_metrics=self._event_metrics,
)
wrapper = IdempotentConsumerWrapper(
consumer=base_consumer,
diff --git a/backend/app/services/saga/saga_orchestrator.py b/backend/app/services/saga/saga_orchestrator.py
index 4fef4167..194d6ac3 100644
--- a/backend/app/services/saga/saga_orchestrator.py
+++ b/backend/app/services/saga/saga_orchestrator.py
@@ -6,6 +6,7 @@
from opentelemetry.trace import SpanKind
from app.core.lifecycle import LifecycleEnabled
+from app.core.metrics import EventMetrics
from app.core.tracing import EventAttributes
from app.core.tracing.utils import get_tracer
from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository
@@ -40,6 +41,7 @@ def __init__(
idempotency_manager: IdempotencyManager,
resource_allocation_repository: ResourceAllocationRepository,
logger: logging.Logger,
+ event_metrics: EventMetrics,
):
super().__init__()
self.config = config
@@ -55,6 +57,7 @@ def __init__(
self._alloc_repo: ResourceAllocationRepository = resource_allocation_repository
self._tasks: list[asyncio.Task[None]] = []
self.logger = logger
+ self._event_metrics = event_metrics
def register_saga(self, saga_class: type[BaseSaga]) -> None:
self._sagas[saga_class.get_name()] = saga_class
@@ -136,6 +139,7 @@ async def _start_consumer(self) -> None:
schema_registry=self._schema_registry_manager,
settings=self._settings,
logger=self.logger,
+ event_metrics=self._event_metrics,
)
self._consumer = IdempotentConsumerWrapper(
consumer=base_consumer,
@@ -542,6 +546,7 @@ def create_saga_orchestrator(
resource_allocation_repository: ResourceAllocationRepository,
config: SagaConfig,
logger: logging.Logger,
+ event_metrics: EventMetrics,
) -> SagaOrchestrator:
"""Factory function to create a saga orchestrator.
@@ -555,6 +560,7 @@ def create_saga_orchestrator(
resource_allocation_repository: Repository for resource allocations
config: Saga configuration
logger: Logger instance
+ event_metrics: Event metrics for tracking Kafka consumption
Returns:
A new saga orchestrator instance
@@ -569,4 +575,5 @@ def create_saga_orchestrator(
idempotency_manager=idempotency_manager,
resource_allocation_repository=resource_allocation_repository,
logger=logger,
+ event_metrics=event_metrics,
)
diff --git a/backend/app/services/sse/kafka_redis_bridge.py b/backend/app/services/sse/kafka_redis_bridge.py
index 950837ca..07e03c44 100644
--- a/backend/app/services/sse/kafka_redis_bridge.py
+++ b/backend/app/services/sse/kafka_redis_bridge.py
@@ -1,9 +1,10 @@
from __future__ import annotations
+import asyncio
import logging
from app.core.lifecycle import LifecycleEnabled
-from app.core.metrics.events import EventMetrics
+from app.core.metrics import EventMetrics
from app.domain.enums.events import EventType
from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId
from app.domain.events.typed import DomainEvent
@@ -44,31 +45,29 @@ async def _on_start(self) -> None:
"""Start the SSE Kafka→Redis bridge."""
self.logger.info(f"Starting SSE Kafka→Redis bridge with {self.num_consumers} consumers")
- for i in range(self.num_consumers):
- consumer = await self._create_consumer(i)
- self.consumers.append(consumer)
+ # Phase 1: Build all consumers and track them immediately (no I/O)
+ self.consumers = [self._build_consumer(i) for i in range(self.num_consumers)]
+
+ # Phase 2: Start all in parallel - already tracked in self.consumers for cleanup
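+        # Use WEBSOCKET_GATEWAY subscriptions - the SSE bridge serves the same purpose (real-time client delivery)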
+ topics = list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.WEBSOCKET_GATEWAY])
+ await asyncio.gather(*[c.start(topics) for c in self.consumers])
self.logger.info("SSE Kafka→Redis bridge started successfully")
async def _on_stop(self) -> None:
"""Stop the SSE Kafka→Redis bridge."""
self.logger.info("Stopping SSE Kafka→Redis bridge")
-
- for consumer in self.consumers:
- await consumer.stop()
-
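+        # return_exceptions=True: a failed stop() on one consumer must not prevent stopping the rest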
+ await asyncio.gather(*[c.stop() for c in self.consumers], return_exceptions=True)
self.consumers.clear()
self.logger.info("SSE Kafka→Redis bridge stopped")
- async def _create_consumer(self, consumer_index: int) -> UnifiedConsumer:
+ def _build_consumer(self, consumer_index: int) -> UnifiedConsumer:
+ """Build a consumer instance without starting it."""
suffix = self.settings.KAFKA_GROUP_SUFFIX
- group_id = f"sse-bridge-pool.{suffix}"
- client_id = f"sse-bridge-{consumer_index}.{suffix}"
-
config = ConsumerConfig(
bootstrap_servers=self.settings.KAFKA_BOOTSTRAP_SERVERS,
- group_id=group_id,
- client_id=client_id,
+ group_id=f"sse-bridge-pool.{suffix}",
+ client_id=f"sse-bridge-{consumer_index}.{suffix}",
enable_auto_commit=True,
auto_offset_reset="latest",
max_poll_interval_ms=self.settings.KAFKA_MAX_POLL_INTERVAL_MS,
@@ -80,21 +79,15 @@ async def _create_consumer(self, consumer_index: int) -> UnifiedConsumer:
dispatcher = EventDispatcher(logger=self.logger)
self._register_routing_handlers(dispatcher)
- consumer = UnifiedConsumer(
+ return UnifiedConsumer(
config=config,
event_dispatcher=dispatcher,
schema_registry=self.schema_registry,
settings=self.settings,
logger=self.logger,
+ event_metrics=self.event_metrics,
)
- # Use WEBSOCKET_GATEWAY subscriptions - SSE bridge serves same purpose (real-time client delivery)
- topics = list(CONSUMER_GROUP_SUBSCRIPTIONS[GroupId.WEBSOCKET_GATEWAY])
- await consumer.start(topics)
-
- self.logger.info(f"Bridge consumer {consumer_index} started")
- return consumer
-
def _register_routing_handlers(self, dispatcher: EventDispatcher) -> None:
"""Publish relevant events to Redis channels keyed by execution_id."""
relevant_events = [
diff --git a/backend/app/services/sse/sse_service.py b/backend/app/services/sse/sse_service.py
index 3feed4c4..e474fc41 100644
--- a/backend/app/services/sse/sse_service.py
+++ b/backend/app/services/sse/sse_service.py
@@ -4,7 +4,7 @@
from datetime import datetime, timezone
from typing import Any, Dict
-from app.core.metrics.context import get_connection_metrics
+from app.core.metrics import ConnectionMetrics
from app.db.repositories.sse_repository import SSERepository
from app.domain.enums.events import EventType
from app.domain.enums.sse import SSEControlEvent, SSENotificationEvent
@@ -39,6 +39,7 @@ def __init__(
shutdown_manager: SSEShutdownManager,
settings: Settings,
logger: logging.Logger,
+ connection_metrics: ConnectionMetrics,
) -> None:
self.repository = repository
self.router = router
@@ -46,7 +47,7 @@ def __init__(
self.shutdown_manager = shutdown_manager
self.settings = settings
self.logger = logger
- self.metrics = get_connection_metrics()
+ self.metrics = connection_metrics
self.heartbeat_interval = getattr(settings, "SSE_HEARTBEAT_INTERVAL", 30)
async def create_execution_stream(self, execution_id: str, user_id: str) -> AsyncGenerator[Dict[str, Any], None]:
@@ -114,8 +115,8 @@ async def create_execution_stream(self, execution_id: str, user_id: str) -> Asyn
finally:
if subscription is not None:
- await subscription.close()
- await self.shutdown_manager.unregister_connection(execution_id, connection_id)
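+            # asyncio.shield: let cleanup complete even if the SSE generator is cancelled (e.g. client disconnect)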
+ await asyncio.shield(subscription.close())
+ await asyncio.shield(self.shutdown_manager.unregister_connection(execution_id, connection_id))
self.logger.info("SSE connection closed", extra={"execution_id": execution_id})
async def _stream_events_redis(
@@ -254,11 +255,8 @@ async def create_notification_stream(self, user_id: str) -> AsyncGenerator[Dict[
)
)
finally:
- try:
- if subscription is not None:
- await subscription.close()
- except Exception:
- pass
+ if subscription is not None:
+ await asyncio.shield(subscription.close())
async def get_health_status(self) -> SSEHealthDomain:
router_stats = self.router.get_stats()
diff --git a/backend/app/services/sse/sse_shutdown_manager.py b/backend/app/services/sse/sse_shutdown_manager.py
index 86314b27..4551e812 100644
--- a/backend/app/services/sse/sse_shutdown_manager.py
+++ b/backend/app/services/sse/sse_shutdown_manager.py
@@ -5,7 +5,7 @@
from typing import Dict, Set
from app.core.lifecycle import LifecycleEnabled
-from app.core.metrics.context import get_connection_metrics
+from app.core.metrics import ConnectionMetrics
from app.domain.sse import ShutdownStatus
@@ -36,6 +36,7 @@ class SSEShutdownManager:
def __init__(
self,
logger: logging.Logger,
+ connection_metrics: ConnectionMetrics,
drain_timeout: float = 30.0,
notification_timeout: float = 5.0,
force_close_timeout: float = 10.0,
@@ -44,7 +45,7 @@ def __init__(
self.drain_timeout = drain_timeout
self.notification_timeout = notification_timeout
self.force_close_timeout = force_close_timeout
- self.metrics = get_connection_metrics()
+ self.metrics = connection_metrics
self._phase = ShutdownPhase.READY
self._shutdown_initiated = False
@@ -309,6 +310,7 @@ async def _wait_for_complete(self) -> None:
def create_sse_shutdown_manager(
logger: logging.Logger,
+ connection_metrics: ConnectionMetrics,
drain_timeout: float = 30.0,
notification_timeout: float = 5.0,
force_close_timeout: float = 10.0,
@@ -317,6 +319,7 @@ def create_sse_shutdown_manager(
Args:
logger: Logger instance
+ connection_metrics: Connection metrics for tracking SSE connections
drain_timeout: Time to wait for connections to close gracefully
notification_timeout: Time to wait for shutdown notifications to be sent
force_close_timeout: Time before force closing connections
@@ -326,6 +329,7 @@ def create_sse_shutdown_manager(
"""
return SSEShutdownManager(
logger=logger,
+ connection_metrics=connection_metrics,
drain_timeout=drain_timeout,
notification_timeout=notification_timeout,
force_close_timeout=force_close_timeout,
diff --git a/backend/app/settings.py b/backend/app/settings.py
index 44f8e2a3..fd510051 100644
--- a/backend/app/settings.py
+++ b/backend/app/settings.py
@@ -21,6 +21,7 @@ class Settings(BaseSettings):
KUBERNETES_CONFIG_PATH: str = "~/.kube/config"
KUBERNETES_CA_CERTIFICATE_PATH: str | None = None
RATE_LIMITS: str = "100/minute"
+ RATE_LIMIT_ENABLED: bool = True # Set to False to disable rate limiting entirely
SSL_KEYFILE: str = "/app/certs/server.key"
SSL_CERTFILE: str = "/app/certs/server.crt"
@@ -28,6 +29,9 @@ class Settings(BaseSettings):
SERVER_HOST: str = "localhost"
SERVER_PORT: int = 443
+ # Kubernetes namespace for execution pods
+ K8S_NAMESPACE: str = "integr8scode"
+
# Settings for Kubernetes resource limits and requests
K8S_POD_CPU_LIMIT: str = "1000m"
K8S_POD_MEMORY_LIMIT: str = "128Mi"
@@ -119,11 +123,10 @@ class Settings(BaseSettings):
REDIS_DB: int = 0
REDIS_PASSWORD: str | None = None
REDIS_SSL: bool = False
- REDIS_MAX_CONNECTIONS: int = 50
+ REDIS_MAX_CONNECTIONS: int = 200
REDIS_DECODE_RESPONSES: bool = True
# Rate Limiting Configuration
- RATE_LIMIT_ENABLED: bool = True
RATE_LIMIT_DEFAULT_REQUESTS: int = 100
RATE_LIMIT_DEFAULT_WINDOW: int = 60 # seconds
RATE_LIMIT_BURST_MULTIPLIER: float = 1.5
diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py
index c9eef28d..aeadecd4 100644
--- a/backend/tests/conftest.py
+++ b/backend/tests/conftest.py
@@ -17,10 +17,17 @@
from scripts.create_topics import create_topics
# ===== Worker-specific isolation for pytest-xdist =====
-# Redis has 16 DBs (0-15); each xdist worker gets one, limiting parallel workers to 16.
+# Supports both xdist workers AND multiple independent pytest processes.
+#
+# TEST_RUN_ID: Unique identifier for this pytest process (set by CI or auto-generated).
+# Allows running backend-integration, backend-e2e, frontend-e2e in parallel.
+# PYTEST_XDIST_WORKER: Worker ID within a single pytest-xdist run (gw0, gw1, etc.)
+#
+# Combined, these give full isolation: each test worker in each pytest process is unique.
+_RUN_ID = os.environ.get("TEST_RUN_ID") or uuid.uuid4().hex[:8]
_WORKER_ID = os.environ.get("PYTEST_XDIST_WORKER", "gw0")
_WORKER_NUM = int(_WORKER_ID.removeprefix("gw") or "0")
-assert _WORKER_NUM < 16, f"xdist worker {_WORKER_NUM} >= 16 exceeds Redis DB limit; use -n 16 or fewer"
+_ISOLATION_KEY = f"{_RUN_ID}_{_WORKER_ID}"
# ===== Pytest hooks =====
@@ -46,21 +53,26 @@ def test_settings() -> Settings:
What gets isolated per worker (to prevent interference):
- DATABASE_NAME: Each worker gets its own MongoDB database
- - REDIS_DB: Each worker gets its own Redis database (0-15)
+    - REDIS_DB: Each worker gets a Redis database (0-15), chosen deterministically from the worker number and RUN_ID
- KAFKA_GROUP_SUFFIX: Each worker gets unique consumer groups
What's SHARED (from env, no per-worker suffix):
- KAFKA_TOPIC_PREFIX: Topics created once by CI/scripts
- SCHEMA_SUBJECT_PREFIX: Schemas shared across workers
+
+ Isolation works across:
+ - xdist workers within a single pytest process (gw0, gw1, ...)
+ - Multiple independent pytest processes (via TEST_RUN_ID or auto-UUID)
"""
base = Settings(_env_file=".env.test")
- session_id = uuid.uuid4().hex[:8]
+ # Deterministic Redis DB: worker number + ASCII sum of RUN_ID (no hash randomization)
+ redis_db = (_WORKER_NUM + sum(ord(c) for c in _RUN_ID)) % 16
return base.model_copy(
update={
- # Per-worker isolation for xdist - must be dynamic, can't be in .env.test
- "DATABASE_NAME": f"integr8scode_test_{session_id}_{_WORKER_ID}",
- "REDIS_DB": _WORKER_NUM,
- "KAFKA_GROUP_SUFFIX": f"{session_id}.{_WORKER_ID}",
+ # Per-worker isolation - uses _ISOLATION_KEY which includes RUN_ID + WORKER_ID
+ "DATABASE_NAME": f"integr8scode_test_{_ISOLATION_KEY}",
+ "REDIS_DB": redis_db,
+ "KAFKA_GROUP_SUFFIX": _ISOLATION_KEY,
}
)
diff --git a/backend/tests/e2e/test_k8s_worker_create_pod.py b/backend/tests/e2e/test_k8s_worker_create_pod.py
index 30c23eb2..c43bb2e5 100644
--- a/backend/tests/e2e/test_k8s_worker_create_pod.py
+++ b/backend/tests/e2e/test_k8s_worker_create_pod.py
@@ -1,8 +1,8 @@
import logging
-import os
import uuid
import pytest
+from app.core.metrics import EventMetrics
from app.domain.events.typed import CreatePodCommandEvent, EventMetadata
from app.events.core import UnifiedProducer
from app.events.event_store import EventStore
@@ -21,18 +21,15 @@
@pytest.mark.asyncio
async def test_worker_creates_configmap_and_pod(
- scope: AsyncContainer, monkeypatch: pytest.MonkeyPatch, test_settings: Settings
+ scope: AsyncContainer, test_settings: Settings
) -> None:
- # Ensure non-default namespace for worker validation
- ns = os.environ.get("K8S_NAMESPACE", "integr8scode")
- if ns == "default":
- ns = "integr8scode"
- monkeypatch.setenv("K8S_NAMESPACE", ns)
+ ns = test_settings.K8S_NAMESPACE
schema: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
store: EventStore = await scope.get(EventStore)
producer: UnifiedProducer = await scope.get(UnifiedProducer)
idem: IdempotencyManager = await scope.get(IdempotencyManager)
+ event_metrics: EventMetrics = await scope.get(EventMetrics)
cfg = K8sWorkerConfig(namespace=ns, max_concurrent_pods=1)
worker = KubernetesWorker(
@@ -43,6 +40,7 @@ async def test_worker_creates_configmap_and_pod(
event_store=store,
idempotency_manager=idem,
logger=_test_logger,
+ event_metrics=event_metrics,
)
# Initialize k8s clients using worker's own method
diff --git a/backend/tests/e2e/test_resource_cleaner_k8s.py b/backend/tests/e2e/test_resource_cleaner_k8s.py
index 805aa785..e4a79fa8 100644
--- a/backend/tests/e2e/test_resource_cleaner_k8s.py
+++ b/backend/tests/e2e/test_resource_cleaner_k8s.py
@@ -1,9 +1,9 @@
import asyncio
import logging
-import os
import pytest
from app.services.result_processor.resource_cleaner import ResourceCleaner
+from app.settings import Settings
pytestmark = [pytest.mark.e2e, pytest.mark.k8s]
@@ -11,19 +11,19 @@
@pytest.mark.asyncio
-async def test_initialize_and_get_usage() -> None:
+async def test_initialize_and_get_usage(test_settings: Settings) -> None:
rc = ResourceCleaner(logger=_test_logger)
await rc.initialize()
- usage = await rc.get_resource_usage(namespace=os.environ.get("K8S_NAMESPACE", "default"))
+ usage = await rc.get_resource_usage(namespace=test_settings.K8S_NAMESPACE)
assert set(usage.keys()) >= {"pods", "configmaps", "network_policies"}
@pytest.mark.asyncio
-async def test_cleanup_orphaned_resources_dry_run() -> None:
+async def test_cleanup_orphaned_resources_dry_run(test_settings: Settings) -> None:
rc = ResourceCleaner(logger=_test_logger)
await rc.initialize()
cleaned = await rc.cleanup_orphaned_resources(
- namespace=os.environ.get("K8S_NAMESPACE", "default"),
+ namespace=test_settings.K8S_NAMESPACE,
max_age_hours=0,
dry_run=True,
)
@@ -31,12 +31,12 @@ async def test_cleanup_orphaned_resources_dry_run() -> None:
@pytest.mark.asyncio
-async def test_cleanup_nonexistent_pod() -> None:
+async def test_cleanup_nonexistent_pod(test_settings: Settings) -> None:
rc = ResourceCleaner(logger=_test_logger)
await rc.initialize()
# Attempt to delete a pod that doesn't exist - should complete without errors
- namespace = os.environ.get("K8S_NAMESPACE", "default")
+ namespace = test_settings.K8S_NAMESPACE
nonexistent_pod = "integr8s-test-nonexistent-pod"
# Should complete within timeout and not raise any exceptions
diff --git a/backend/tests/e2e/test_resource_cleaner_orphan.py b/backend/tests/e2e/test_resource_cleaner_orphan.py
index cf879ed1..334b7b29 100644
--- a/backend/tests/e2e/test_resource_cleaner_orphan.py
+++ b/backend/tests/e2e/test_resource_cleaner_orphan.py
@@ -3,6 +3,7 @@
import pytest
from app.services.result_processor.resource_cleaner import ResourceCleaner
+from app.settings import Settings
from kubernetes import client as k8s_client
from kubernetes import config as k8s_config
@@ -19,10 +20,10 @@ def _ensure_kubeconfig() -> None:
@pytest.mark.asyncio
-async def test_cleanup_orphaned_configmaps_dry_run() -> None:
+async def test_cleanup_orphaned_configmaps_dry_run(test_settings: Settings) -> None:
_ensure_kubeconfig()
v1 = k8s_client.CoreV1Api()
- ns = "default"
+ ns = test_settings.K8S_NAMESPACE
name = f"int-test-cm-{int(datetime.now().timestamp())}"
# Create a configmap labeled like the app uses
diff --git a/backend/tests/integration/app/test_main_app.py b/backend/tests/integration/app/test_main_app.py
index c178fe14..d92a5359 100644
--- a/backend/tests/integration/app/test_main_app.py
+++ b/backend/tests/integration/app/test_main_app.py
@@ -8,7 +8,7 @@
pytestmark = pytest.mark.integration
-def test_create_app_real_instance(app: FastAPI, test_settings: Settings) -> None:
+def test_create_app_real_instance(app: FastAPI) -> None:
assert isinstance(app, FastAPI)
# Verify API routes are configured
@@ -24,8 +24,7 @@ def test_create_app_real_instance(app: FastAPI, test_settings: Settings) -> None
assert "RequestSizeLimitMiddleware" in middleware_class_names, "Request size limit middleware not configured"
assert "CacheControlMiddleware" in middleware_class_names, "Cache control middleware not configured"
assert "MetricsMiddleware" in middleware_class_names, "Metrics middleware not configured"
- if test_settings.RATE_LIMIT_ENABLED:
- assert "RateLimitMiddleware" in middleware_class_names, "Rate limit middleware not configured"
+ assert "RateLimitMiddleware" in middleware_class_names, "Rate limit middleware not configured"
def test_create_app_function_constructs(test_settings: Settings) -> None:
diff --git a/backend/tests/integration/dlq/test_dlq_manager.py b/backend/tests/integration/dlq/test_dlq_manager.py
index b1f84426..6af47303 100644
--- a/backend/tests/integration/dlq/test_dlq_manager.py
+++ b/backend/tests/integration/dlq/test_dlq_manager.py
@@ -6,12 +6,14 @@
import pytest
from aiokafka import AIOKafkaConsumer, AIOKafkaProducer
+from app.core.metrics import DLQMetrics
from app.dlq.manager import create_dlq_manager
from app.domain.enums.events import EventType
from app.domain.enums.kafka import KafkaTopic
from app.domain.events.typed import DLQMessageReceivedEvent
from app.events.schema.schema_registry import SchemaRegistryManager
from app.settings import Settings
+from dishka import AsyncContainer
from tests.helpers import make_execution_requested_event
@@ -24,10 +26,11 @@
@pytest.mark.asyncio
-async def test_dlq_manager_persists_and_emits_event(test_settings: Settings) -> None:
+async def test_dlq_manager_persists_and_emits_event(scope: AsyncContainer, test_settings: Settings) -> None:
"""Test that DLQ manager persists messages and emits DLQMessageReceivedEvent."""
schema_registry = SchemaRegistryManager(test_settings, _test_logger)
- manager = create_dlq_manager(settings=test_settings, schema_registry=schema_registry, logger=_test_logger)
+ dlq_metrics: DLQMetrics = await scope.get(DLQMetrics)
+ manager = create_dlq_manager(settings=test_settings, schema_registry=schema_registry, logger=_test_logger, dlq_metrics=dlq_metrics)
prefix = test_settings.KAFKA_TOPIC_PREFIX
ev = make_execution_requested_event(execution_id=f"exec-dlq-persist-{uuid.uuid4().hex[:8]}")
diff --git a/backend/tests/integration/events/test_consume_roundtrip.py b/backend/tests/integration/events/test_consume_roundtrip.py
index 9812b14f..94193247 100644
--- a/backend/tests/integration/events/test_consume_roundtrip.py
+++ b/backend/tests/integration/events/test_consume_roundtrip.py
@@ -3,6 +3,7 @@
import uuid
import pytest
+from app.core.metrics import EventMetrics
from app.domain.enums.events import EventType
from app.domain.enums.kafka import KafkaTopic
from app.domain.events.typed import DomainEvent
@@ -27,6 +28,7 @@ async def test_produce_consume_roundtrip(scope: AsyncContainer) -> None:
# Ensure schemas are registered
registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
settings: Settings = await scope.get(Settings)
+ event_metrics: EventMetrics = await scope.get(EventMetrics)
await initialize_event_schemas(registry)
# Real producer from DI
@@ -54,6 +56,7 @@ async def _handle(_event: DomainEvent) -> None:
schema_registry=registry,
settings=settings,
logger=_test_logger,
+ event_metrics=event_metrics,
)
await consumer.start([KafkaTopic.EXECUTION_EVENTS])
diff --git a/backend/tests/integration/events/test_consumer_lifecycle.py b/backend/tests/integration/events/test_consumer_lifecycle.py
index 01833c19..5374e152 100644
--- a/backend/tests/integration/events/test_consumer_lifecycle.py
+++ b/backend/tests/integration/events/test_consumer_lifecycle.py
@@ -2,6 +2,7 @@
from uuid import uuid4
import pytest
+from app.core.metrics import EventMetrics
from app.domain.enums.kafka import KafkaTopic
from app.events.core import ConsumerConfig, EventDispatcher, UnifiedConsumer
from app.events.schema.schema_registry import SchemaRegistryManager
@@ -19,6 +20,7 @@
async def test_consumer_start_status_seek_and_stop(scope: AsyncContainer) -> None:
registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
settings: Settings = await scope.get(Settings)
+ event_metrics: EventMetrics = await scope.get(EventMetrics)
cfg = ConsumerConfig(
bootstrap_servers=settings.KAFKA_BOOTSTRAP_SERVERS,
group_id=f"test-consumer-{uuid4().hex[:6]}",
@@ -30,6 +32,7 @@ async def test_consumer_start_status_seek_and_stop(scope: AsyncContainer) -> Non
schema_registry=registry,
settings=settings,
logger=_test_logger,
+ event_metrics=event_metrics,
)
await c.start([KafkaTopic.EXECUTION_EVENTS])
try:
diff --git a/backend/tests/integration/events/test_event_dispatcher.py b/backend/tests/integration/events/test_event_dispatcher.py
index d5f118a3..3d166cec 100644
--- a/backend/tests/integration/events/test_event_dispatcher.py
+++ b/backend/tests/integration/events/test_event_dispatcher.py
@@ -3,6 +3,7 @@
import uuid
import pytest
+from app.core.metrics import EventMetrics
from app.domain.enums.events import EventType
from app.domain.enums.kafka import KafkaTopic
from app.domain.events.typed import DomainEvent
@@ -27,6 +28,7 @@ async def test_dispatcher_with_multiple_handlers(scope: AsyncContainer) -> None:
# Ensure schema registry is ready
registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
settings: Settings = await scope.get(Settings)
+ event_metrics: EventMetrics = await scope.get(EventMetrics)
await initialize_event_schemas(registry)
# Build dispatcher with two handlers for the same event
@@ -55,6 +57,7 @@ async def h2(_e: DomainEvent) -> None:
schema_registry=registry,
settings=settings,
logger=_test_logger,
+ event_metrics=event_metrics,
)
await consumer.start([KafkaTopic.EXECUTION_EVENTS])
diff --git a/backend/tests/integration/events/test_producer_roundtrip.py b/backend/tests/integration/events/test_producer_roundtrip.py
index 18493a51..cb91df15 100644
--- a/backend/tests/integration/events/test_producer_roundtrip.py
+++ b/backend/tests/integration/events/test_producer_roundtrip.py
@@ -2,6 +2,7 @@
from uuid import uuid4
import pytest
+from app.core.metrics import EventMetrics
from app.events.core import UnifiedProducer
from app.events.schema.schema_registry import SchemaRegistryManager
from app.infrastructure.kafka.mappings import get_topic_for_event
@@ -20,10 +21,12 @@ async def test_unified_producer_start_produce_send_to_dlq_stop(
scope: AsyncContainer, test_settings: Settings
) -> None:
schema: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
+ event_metrics: EventMetrics = await scope.get(EventMetrics)
prod = UnifiedProducer(
schema,
logger=_test_logger,
settings=test_settings,
+ event_metrics=event_metrics,
)
async with prod:
diff --git a/backend/tests/integration/idempotency/test_consumer_idempotent.py b/backend/tests/integration/idempotency/test_consumer_idempotent.py
index 5e95eadb..19d4b05f 100644
--- a/backend/tests/integration/idempotency/test_consumer_idempotent.py
+++ b/backend/tests/integration/idempotency/test_consumer_idempotent.py
@@ -3,6 +3,7 @@
import uuid
import pytest
+from app.core.metrics import EventMetrics
from app.domain.enums.events import EventType
from app.domain.enums.kafka import KafkaTopic
from app.domain.events.typed import DomainEvent
@@ -34,6 +35,7 @@ async def test_consumer_idempotent_wrapper_blocks_duplicates(scope: AsyncContain
idm: IdempotencyManager = await scope.get(IdempotencyManager)
registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
settings: Settings = await scope.get(Settings)
+ event_metrics: EventMetrics = await scope.get(EventMetrics)
# Future resolves when handler processes an event - no polling needed
handled_future: asyncio.Future[None] = asyncio.get_running_loop().create_future()
@@ -67,6 +69,7 @@ async def handle(_ev: DomainEvent) -> None:
schema_registry=registry,
settings=settings,
logger=_test_logger,
+ event_metrics=event_metrics,
)
wrapper = IdempotentConsumerWrapper(
consumer=base,
diff --git a/backend/tests/integration/idempotency/test_idempotency.py b/backend/tests/integration/idempotency/test_idempotency.py
index cc5017e4..032a7f46 100644
--- a/backend/tests/integration/idempotency/test_idempotency.py
+++ b/backend/tests/integration/idempotency/test_idempotency.py
@@ -8,11 +8,13 @@
import pytest
import redis.asyncio as redis
+from app.core.metrics import DatabaseMetrics
from app.domain.events.typed import DomainEvent
from app.domain.idempotency import IdempotencyRecord, IdempotencyStatus
from app.services.idempotency.idempotency_manager import IdempotencyConfig, IdempotencyManager
from app.services.idempotency.middleware import IdempotentEventHandler, idempotent_handler
from app.services.idempotency.redis_repository import RedisIdempotencyRepository
+from app.settings import Settings
from tests.helpers import make_execution_requested_event
@@ -26,7 +28,7 @@ class TestIdempotencyManager:
"""IdempotencyManager backed by real Redis repository (DI-provided client)."""
@pytest.fixture
- async def manager(self, redis_client: redis.Redis) -> AsyncGenerator[IdempotencyManager, None]:
+ async def manager(self, redis_client: redis.Redis, test_settings: Settings) -> AsyncGenerator[IdempotencyManager, None]:
prefix = f"idemp_ut:{uuid.uuid4().hex[:6]}"
cfg = IdempotencyConfig(
key_prefix=prefix,
@@ -37,7 +39,8 @@ async def manager(self, redis_client: redis.Redis) -> AsyncGenerator[Idempotency
enable_metrics=False,
)
repo = RedisIdempotencyRepository(redis_client, key_prefix=prefix)
- m = IdempotencyManager(cfg, repo, _test_logger)
+ database_metrics = DatabaseMetrics(test_settings)
+ m = IdempotencyManager(cfg, repo, _test_logger, database_metrics=database_metrics)
await m.initialize()
try:
yield m
@@ -254,11 +257,12 @@ class TestIdempotentEventHandlerIntegration:
"""Test IdempotentEventHandler with real components"""
@pytest.fixture
- async def manager(self, redis_client: redis.Redis) -> AsyncGenerator[IdempotencyManager, None]:
+ async def manager(self, redis_client: redis.Redis, test_settings: Settings) -> AsyncGenerator[IdempotencyManager, None]:
prefix = f"handler_test:{uuid.uuid4().hex[:6]}"
config = IdempotencyConfig(key_prefix=prefix, enable_metrics=False)
repo = RedisIdempotencyRepository(redis_client, key_prefix=prefix)
- m = IdempotencyManager(config, repo, _test_logger)
+ database_metrics = DatabaseMetrics(test_settings)
+ m = IdempotencyManager(config, repo, _test_logger, database_metrics=database_metrics)
await m.initialize()
try:
yield m
@@ -509,11 +513,12 @@ async def test_cleanup_expired_keys(self, manager: IdempotencyManager) -> None:
assert record is not None # Still exists until explicit cleanup
@pytest.mark.asyncio
- async def test_metrics_enabled(self, redis_client: redis.Redis) -> None:
+ async def test_metrics_enabled(self, redis_client: redis.Redis, test_settings: Settings) -> None:
"""Test manager with metrics enabled"""
config = IdempotencyConfig(key_prefix=f"metrics:{uuid.uuid4().hex[:6]}", enable_metrics=True)
repository = RedisIdempotencyRepository(redis_client, key_prefix=config.key_prefix)
- manager = IdempotencyManager(config, repository, _test_logger)
+ database_metrics = DatabaseMetrics(test_settings)
+ manager = IdempotencyManager(config, repository, _test_logger, database_metrics=database_metrics)
# Initialize with metrics
await manager.initialize()
diff --git a/backend/tests/integration/result_processor/test_result_processor.py b/backend/tests/integration/result_processor/test_result_processor.py
index 08a44a37..de2546d6 100644
--- a/backend/tests/integration/result_processor/test_result_processor.py
+++ b/backend/tests/integration/result_processor/test_result_processor.py
@@ -4,6 +4,7 @@
import pytest
from app.core.database_context import Database
+from app.core.metrics import EventMetrics, ExecutionMetrics
from app.db.repositories.execution_repository import ExecutionRepository
from app.domain.enums.events import EventType
from app.domain.enums.execution import ExecutionStatus
@@ -37,6 +38,8 @@ async def test_result_processor_persists_and_emits(scope: AsyncContainer) -> Non
# Ensure schemas
registry: SchemaRegistryManager = await scope.get(SchemaRegistryManager)
settings: Settings = await scope.get(Settings)
+ event_metrics: EventMetrics = await scope.get(EventMetrics)
+ execution_metrics: ExecutionMetrics = await scope.get(ExecutionMetrics)
await initialize_event_schemas(registry)
# Dependencies
@@ -63,6 +66,8 @@ async def test_result_processor_persists_and_emits(scope: AsyncContainer) -> Non
settings=settings,
idempotency_manager=idem,
logger=_test_logger,
+ execution_metrics=execution_metrics,
+ event_metrics=event_metrics,
)
# Setup a small consumer to capture ResultStoredEvent
@@ -87,6 +92,7 @@ async def _stored(event: ResultStoredEvent) -> None:
schema_registry=registry,
settings=settings,
logger=_test_logger,
+ event_metrics=event_metrics,
)
# Produce the event BEFORE starting consumers (auto_offset_reset="earliest" will read it)
diff --git a/backend/tests/integration/services/rate_limit/test_rate_limit_service.py b/backend/tests/integration/services/rate_limit/test_rate_limit_service.py
index 942b2a37..0476f048 100644
--- a/backend/tests/integration/services/rate_limit/test_rate_limit_service.py
+++ b/backend/tests/integration/services/rate_limit/test_rate_limit_service.py
@@ -19,21 +19,15 @@
@pytest.mark.asyncio
-async def test_normalize_and_disabled_and_bypass_and_no_rule(scope: AsyncContainer) -> None:
+async def test_normalize_and_bypass_and_no_rule(scope: AsyncContainer) -> None:
svc: RateLimitService = await scope.get(RateLimitService)
svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
- # ensure disabled for first path
- await svc.update_config(RateLimitConfig(default_rules=[]))
- svc.settings.RATE_LIMIT_ENABLED = False
+
# normalization masks uuids and ids
n = svc._normalize_endpoint("/api/12345678901234567890/abcdef-1234-5678-9abc-def012345678")
assert "*" in n
- # disabled path allowed
- res = await svc.check_rate_limit("u1", "/api/x")
- assert res.allowed is True
- # enabled, bypass
- svc.settings.RATE_LIMIT_ENABLED = True
+ # a user with bypass_rate_limit=True is always allowed
cfg = RateLimitConfig(default_rules=[], user_overrides={
"u1": UserRateLimit(user_id="u1", bypass_rate_limit=True)
})
@@ -51,7 +45,6 @@ async def test_normalize_and_disabled_and_bypass_and_no_rule(scope: AsyncContain
async def test_sliding_window_allowed_and_rejected(scope: AsyncContainer) -> None:
svc: RateLimitService = await scope.get(RateLimitService)
svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
- svc.settings.RATE_LIMIT_ENABLED = True # Enable rate limiting for this test
# matching rule with window 5, limit 3
rule = RateLimitRule(endpoint_pattern=r"^/api/v1/x", group=EndpointGroup.API, requests=3, window_seconds=5,
algorithm=RateLimitAlgorithm.SLIDING_WINDOW)
@@ -76,7 +69,6 @@ async def test_sliding_window_allowed_and_rejected(scope: AsyncContainer) -> Non
async def test_token_bucket_paths(scope: AsyncContainer) -> None:
svc: RateLimitService = await scope.get(RateLimitService)
svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
- svc.settings.RATE_LIMIT_ENABLED = True # Enable rate limiting for this test
rule = RateLimitRule(endpoint_pattern=r"^/api/v1/t", group=EndpointGroup.API, requests=2, window_seconds=10,
burst_multiplier=1.0, algorithm=RateLimitAlgorithm.TOKEN_BUCKET)
await svc.update_config(RateLimitConfig(default_rules=[rule]))
@@ -168,7 +160,6 @@ async def test_get_config_roundtrip(scope: AsyncContainer) -> None:
async def test_sliding_window_edge(scope: AsyncContainer) -> None:
svc: RateLimitService = await scope.get(RateLimitService)
svc.prefix = f"{svc.prefix}{uuid4().hex[:6]}:"
- svc.settings.RATE_LIMIT_ENABLED = True # Enable rate limiting for this test
# Configure a tight window and ensure behavior is consistent
cfg = RateLimitConfig(
default_rules=[
@@ -294,7 +285,6 @@ async def test_get_usage_stats_with_keys(scope: AsyncContainer) -> None:
@pytest.mark.asyncio
async def test_check_rate_limit_with_user_override(scope: AsyncContainer) -> None:
svc: RateLimitService = await scope.get(RateLimitService)
- svc.settings.RATE_LIMIT_ENABLED = True # Enable rate limiting for this test
rule = RateLimitRule(
endpoint_pattern=r"^/api",
group=EndpointGroup.API,
diff --git a/backend/tests/integration/services/sse/test_partitioned_event_router.py b/backend/tests/integration/services/sse/test_partitioned_event_router.py
index 15b0ec63..7e1c4ac6 100644
--- a/backend/tests/integration/services/sse/test_partitioned_event_router.py
+++ b/backend/tests/integration/services/sse/test_partitioned_event_router.py
@@ -4,7 +4,7 @@
import pytest
import redis.asyncio as redis
-from app.core.metrics.events import EventMetrics
+from app.core.metrics import EventMetrics
from app.events.core import EventDispatcher
from app.events.schema.schema_registry import SchemaRegistryManager
from app.schemas_pydantic.sse import RedisSSEMessage
diff --git a/backend/tests/unit/conftest.py b/backend/tests/unit/conftest.py
index ea7bab9f..65b28839 100644
--- a/backend/tests/unit/conftest.py
+++ b/backend/tests/unit/conftest.py
@@ -1,46 +1,82 @@
-import logging
-from collections.abc import Generator
from typing import NoReturn
import pytest
-from app.core.metrics.connections import ConnectionMetrics
-from app.core.metrics.context import MetricsContext
-from app.core.metrics.coordinator import CoordinatorMetrics
-from app.core.metrics.database import DatabaseMetrics
-from app.core.metrics.dlq import DLQMetrics
-from app.core.metrics.events import EventMetrics
-from app.core.metrics.execution import ExecutionMetrics
-from app.core.metrics.health import HealthMetrics
-from app.core.metrics.kubernetes import KubernetesMetrics
-from app.core.metrics.notifications import NotificationMetrics
-from app.core.metrics.rate_limit import RateLimitMetrics
-from app.core.metrics.replay import ReplayMetrics
-from app.core.metrics.security import SecurityMetrics
+from app.core.metrics import (
+ ConnectionMetrics,
+ CoordinatorMetrics,
+ DatabaseMetrics,
+ DLQMetrics,
+ EventMetrics,
+ ExecutionMetrics,
+ HealthMetrics,
+ KubernetesMetrics,
+ NotificationMetrics,
+ RateLimitMetrics,
+ ReplayMetrics,
+ SecurityMetrics,
+)
from app.settings import Settings
-_unit_test_logger = logging.getLogger("test.unit")
-
-
-@pytest.fixture(scope="session", autouse=True)
-def init_metrics_for_unit_tests(test_settings: Settings) -> Generator[None, None, None]:
- """Initialize all metrics context for unit tests."""
- MetricsContext.initialize_all(
- _unit_test_logger,
- connection=ConnectionMetrics(test_settings),
- coordinator=CoordinatorMetrics(test_settings),
- database=DatabaseMetrics(test_settings),
- dlq=DLQMetrics(test_settings),
- event=EventMetrics(test_settings),
- execution=ExecutionMetrics(test_settings),
- health=HealthMetrics(test_settings),
- kubernetes=KubernetesMetrics(test_settings),
- notification=NotificationMetrics(test_settings),
- rate_limit=RateLimitMetrics(test_settings),
- replay=ReplayMetrics(test_settings),
- security=SecurityMetrics(test_settings),
- )
- yield
- MetricsContext.reset_all(_unit_test_logger)
+
+# Metrics fixtures - constructed per test from Settings instead of the global MetricsContext
+@pytest.fixture
+def connection_metrics(test_settings: Settings) -> ConnectionMetrics:
+ return ConnectionMetrics(test_settings)
+
+
+@pytest.fixture
+def coordinator_metrics(test_settings: Settings) -> CoordinatorMetrics:
+ return CoordinatorMetrics(test_settings)
+
+
+@pytest.fixture
+def database_metrics(test_settings: Settings) -> DatabaseMetrics:
+ return DatabaseMetrics(test_settings)
+
+
+@pytest.fixture
+def dlq_metrics(test_settings: Settings) -> DLQMetrics:
+ return DLQMetrics(test_settings)
+
+
+@pytest.fixture
+def event_metrics(test_settings: Settings) -> EventMetrics:
+ return EventMetrics(test_settings)
+
+
+@pytest.fixture
+def execution_metrics(test_settings: Settings) -> ExecutionMetrics:
+ return ExecutionMetrics(test_settings)
+
+
+@pytest.fixture
+def health_metrics(test_settings: Settings) -> HealthMetrics:
+ return HealthMetrics(test_settings)
+
+
+@pytest.fixture
+def kubernetes_metrics(test_settings: Settings) -> KubernetesMetrics:
+ return KubernetesMetrics(test_settings)
+
+
+@pytest.fixture
+def notification_metrics(test_settings: Settings) -> NotificationMetrics:
+ return NotificationMetrics(test_settings)
+
+
+@pytest.fixture
+def rate_limit_metrics(test_settings: Settings) -> RateLimitMetrics:
+ return RateLimitMetrics(test_settings)
+
+
+@pytest.fixture
+def replay_metrics(test_settings: Settings) -> ReplayMetrics:
+ return ReplayMetrics(test_settings)
+
+
+@pytest.fixture
+def security_metrics(test_settings: Settings) -> SecurityMetrics:
+ return SecurityMetrics(test_settings)
@pytest.fixture
diff --git a/backend/tests/unit/core/metrics/test_base_metrics.py b/backend/tests/unit/core/metrics/test_base_metrics.py
index ba4cdfde..e64f35fb 100644
--- a/backend/tests/unit/core/metrics/test_base_metrics.py
+++ b/backend/tests/unit/core/metrics/test_base_metrics.py
@@ -1,5 +1,5 @@
import pytest
-from app.core.metrics.base import BaseMetrics
+from app.core.metrics import BaseMetrics
from app.settings import Settings
pytestmark = pytest.mark.unit
diff --git a/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py b/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
index fab6f368..202b7233 100644
--- a/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
+++ b/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
@@ -1,6 +1,5 @@
import pytest
-from app.core.metrics.connections import ConnectionMetrics
-from app.core.metrics.coordinator import CoordinatorMetrics
+from app.core.metrics import ConnectionMetrics, CoordinatorMetrics
from app.settings import Settings
pytestmark = pytest.mark.unit
diff --git a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
index 691d05aa..623e20b6 100644
--- a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
+++ b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
@@ -1,6 +1,5 @@
import pytest
-from app.core.metrics.database import DatabaseMetrics
-from app.core.metrics.dlq import DLQMetrics
+from app.core.metrics import DatabaseMetrics, DLQMetrics
from app.settings import Settings
pytestmark = pytest.mark.unit
diff --git a/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py b/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
index 2eda95a8..fdd09bdc 100644
--- a/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
+++ b/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
@@ -1,6 +1,5 @@
import pytest
-from app.core.metrics.events import EventMetrics
-from app.core.metrics.execution import ExecutionMetrics
+from app.core.metrics import EventMetrics, ExecutionMetrics
from app.domain.enums.execution import ExecutionStatus
from app.settings import Settings
diff --git a/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py b/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
index e22a3bff..54d06d27 100644
--- a/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
+++ b/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
@@ -1,5 +1,5 @@
import pytest
-from app.core.metrics.health import HealthMetrics
+from app.core.metrics import HealthMetrics
from app.settings import Settings
pytestmark = pytest.mark.unit
diff --git a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
index 061eed0e..3a12d8de 100644
--- a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
+++ b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
@@ -1,6 +1,5 @@
import pytest
-from app.core.metrics.kubernetes import KubernetesMetrics
-from app.core.metrics.notifications import NotificationMetrics
+from app.core.metrics import KubernetesMetrics, NotificationMetrics
from app.settings import Settings
pytestmark = pytest.mark.unit
diff --git a/backend/tests/unit/core/metrics/test_metrics_classes.py b/backend/tests/unit/core/metrics/test_metrics_classes.py
index 542a4a6a..382ed9c5 100644
--- a/backend/tests/unit/core/metrics/test_metrics_classes.py
+++ b/backend/tests/unit/core/metrics/test_metrics_classes.py
@@ -1,16 +1,18 @@
import pytest
-from app.core.metrics.connections import ConnectionMetrics
-from app.core.metrics.coordinator import CoordinatorMetrics
-from app.core.metrics.database import DatabaseMetrics
-from app.core.metrics.dlq import DLQMetrics
-from app.core.metrics.events import EventMetrics
-from app.core.metrics.execution import ExecutionMetrics
-from app.core.metrics.health import HealthMetrics
-from app.core.metrics.kubernetes import KubernetesMetrics
-from app.core.metrics.notifications import NotificationMetrics
-from app.core.metrics.rate_limit import RateLimitMetrics
-from app.core.metrics.replay import ReplayMetrics
-from app.core.metrics.security import SecurityMetrics
+from app.core.metrics import (
+ ConnectionMetrics,
+ CoordinatorMetrics,
+ DatabaseMetrics,
+ DLQMetrics,
+ EventMetrics,
+ ExecutionMetrics,
+ HealthMetrics,
+ KubernetesMetrics,
+ NotificationMetrics,
+ RateLimitMetrics,
+ ReplayMetrics,
+ SecurityMetrics,
+)
from app.domain.enums.execution import ExecutionStatus
from app.settings import Settings
diff --git a/backend/tests/unit/core/metrics/test_metrics_context.py b/backend/tests/unit/core/metrics/test_metrics_context.py
deleted file mode 100644
index 5f24a999..00000000
--- a/backend/tests/unit/core/metrics/test_metrics_context.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import logging
-
-import pytest
-from app.core.metrics.context import (
- get_connection_metrics,
- get_coordinator_metrics,
-)
-
-_test_logger = logging.getLogger("test.core.metrics.context")
-
-pytestmark = pytest.mark.unit
-
-
-def test_metrics_context_returns_initialized_metrics() -> None:
- """Test metrics context returns initialized metrics from session fixture."""
- # Metrics are initialized by the session-scoped fixture in conftest.py
- c1 = get_connection_metrics()
- c2 = get_connection_metrics()
- assert c1 is c2 # same instance per context
-
- d1 = get_coordinator_metrics()
- d2 = get_coordinator_metrics()
- assert d1 is d2
-
diff --git a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
index 09462600..c7966e94 100644
--- a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
+++ b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
@@ -1,6 +1,5 @@
import pytest
-from app.core.metrics.replay import ReplayMetrics
-from app.core.metrics.security import SecurityMetrics
+from app.core.metrics import ReplayMetrics, SecurityMetrics
from app.settings import Settings
pytestmark = pytest.mark.unit
diff --git a/backend/tests/unit/services/coordinator/test_queue_manager.py b/backend/tests/unit/services/coordinator/test_queue_manager.py
index b3b87dee..b4b39b2d 100644
--- a/backend/tests/unit/services/coordinator/test_queue_manager.py
+++ b/backend/tests/unit/services/coordinator/test_queue_manager.py
@@ -1,6 +1,7 @@
import logging
import pytest
+from app.core.metrics import CoordinatorMetrics
from app.domain.events.typed import ExecutionRequestedEvent
from app.services.coordinator.queue_manager import QueueManager, QueuePriority
@@ -16,8 +17,8 @@ def ev(execution_id: str, priority: int = QueuePriority.NORMAL.value) -> Executi
@pytest.mark.asyncio
-async def test_requeue_execution_increments_priority() -> None:
- qm = QueueManager(max_queue_size=10, logger=_test_logger)
+async def test_requeue_execution_increments_priority(coordinator_metrics: CoordinatorMetrics) -> None:
+ qm = QueueManager(max_queue_size=10, logger=_test_logger, coordinator_metrics=coordinator_metrics)
await qm.start()
# Use NORMAL priority which can be incremented to LOW
e = ev("x", priority=QueuePriority.NORMAL.value)
@@ -29,8 +30,8 @@ async def test_requeue_execution_increments_priority() -> None:
@pytest.mark.asyncio
-async def test_queue_stats_empty_and_after_add() -> None:
- qm = QueueManager(max_queue_size=5, logger=_test_logger)
+async def test_queue_stats_empty_and_after_add(coordinator_metrics: CoordinatorMetrics) -> None:
+ qm = QueueManager(max_queue_size=5, logger=_test_logger, coordinator_metrics=coordinator_metrics)
await qm.start()
stats0 = await qm.get_queue_stats()
assert stats0["total_size"] == 0
diff --git a/backend/tests/unit/services/coordinator/test_resource_manager.py b/backend/tests/unit/services/coordinator/test_resource_manager.py
index 1cea9f82..3624dae6 100644
--- a/backend/tests/unit/services/coordinator/test_resource_manager.py
+++ b/backend/tests/unit/services/coordinator/test_resource_manager.py
@@ -1,14 +1,15 @@
import logging
import pytest
+from app.core.metrics import CoordinatorMetrics
from app.services.coordinator.resource_manager import ResourceManager
_test_logger = logging.getLogger("test.services.coordinator.resource_manager")
@pytest.mark.asyncio
-async def test_request_allocation_defaults_and_limits() -> None:
- rm = ResourceManager(total_cpu_cores=8.0, total_memory_mb=16384, total_gpu_count=0, logger=_test_logger)
+async def test_request_allocation_defaults_and_limits(coordinator_metrics: CoordinatorMetrics) -> None:
+ rm = ResourceManager(total_cpu_cores=8.0, total_memory_mb=16384, total_gpu_count=0, logger=_test_logger, coordinator_metrics=coordinator_metrics)
# Default for python
alloc = await rm.request_allocation("e1", "python")
@@ -25,8 +26,8 @@ async def test_request_allocation_defaults_and_limits() -> None:
@pytest.mark.asyncio
-async def test_release_and_can_allocate() -> None:
- rm = ResourceManager(total_cpu_cores=4.0, total_memory_mb=8192, total_gpu_count=0, logger=_test_logger)
+async def test_release_and_can_allocate(coordinator_metrics: CoordinatorMetrics) -> None:
+ rm = ResourceManager(total_cpu_cores=4.0, total_memory_mb=8192, total_gpu_count=0, logger=_test_logger, coordinator_metrics=coordinator_metrics)
a = await rm.request_allocation("e1", "python", requested_cpu=1.0, requested_memory_mb=512)
assert a is not None
@@ -45,8 +46,8 @@ async def test_release_and_can_allocate() -> None:
@pytest.mark.asyncio
-async def test_resource_stats() -> None:
- rm = ResourceManager(total_cpu_cores=2.0, total_memory_mb=4096, total_gpu_count=0, logger=_test_logger)
+async def test_resource_stats(coordinator_metrics: CoordinatorMetrics) -> None:
+ rm = ResourceManager(total_cpu_cores=2.0, total_memory_mb=4096, total_gpu_count=0, logger=_test_logger, coordinator_metrics=coordinator_metrics)
# Make sure the allocation succeeds
alloc = await rm.request_allocation("e1", "python", requested_cpu=0.5, requested_memory_mb=256)
assert alloc is not None, "Allocation should have succeeded"
diff --git a/backend/tests/unit/services/idempotency/test_idempotency_manager.py b/backend/tests/unit/services/idempotency/test_idempotency_manager.py
index 102aa56c..ef4676fb 100644
--- a/backend/tests/unit/services/idempotency/test_idempotency_manager.py
+++ b/backend/tests/unit/services/idempotency/test_idempotency_manager.py
@@ -2,6 +2,7 @@
from unittest.mock import MagicMock
import pytest
+from app.core.metrics import DatabaseMetrics
from app.domain.events.typed import BaseEvent
from app.services.idempotency.idempotency_manager import (
IdempotencyConfig,
@@ -85,9 +86,9 @@ def test_custom_config(self) -> None:
assert config.collection_name == "custom_keys"
-def test_manager_generate_key_variants() -> None:
+def test_manager_generate_key_variants(database_metrics: DatabaseMetrics) -> None:
repo = MagicMock()
- mgr = IdempotencyManager(IdempotencyConfig(), repo, _test_logger)
+ mgr = IdempotencyManager(IdempotencyConfig(), repo, _test_logger, database_metrics=database_metrics)
ev = MagicMock(spec=BaseEvent)
ev.event_type = "t"
ev.event_id = "e"
diff --git a/backend/tests/unit/services/pod_monitor/test_monitor.py b/backend/tests/unit/services/pod_monitor/test_monitor.py
index ec60121a..dc93a150 100644
--- a/backend/tests/unit/services/pod_monitor/test_monitor.py
+++ b/backend/tests/unit/services/pod_monitor/test_monitor.py
@@ -7,6 +7,7 @@
import pytest
from app.core import k8s_clients as k8s_clients_module
from app.core.k8s_clients import K8sClients
+from app.core.metrics import EventMetrics, KubernetesMetrics
from app.db.repositories.event_repository import EventRepository
from app.domain.events.typed import DomainEvent, EventMetadata, ExecutionCompletedEvent, ExecutionStartedEvent
from app.domain.execution.models import ResourceUsageDomain
@@ -72,7 +73,7 @@ async def aclose(self) -> None:
pass
-def create_test_kafka_event_service() -> tuple[KafkaEventService, FakeUnifiedProducer]:
+def create_test_kafka_event_service(event_metrics: EventMetrics) -> tuple[KafkaEventService, FakeUnifiedProducer]:
"""Create real KafkaEventService with fake dependencies for testing."""
fake_producer = FakeUnifiedProducer()
fake_repo = FakeEventRepository()
@@ -83,6 +84,7 @@ def create_test_kafka_event_service() -> tuple[KafkaEventService, FakeUnifiedPro
kafka_producer=fake_producer,
settings=settings,
logger=_test_logger,
+ event_metrics=event_metrics,
)
return service, fake_producer
@@ -120,6 +122,8 @@ def make_k8s_clients_di(
def make_pod_monitor(
+ event_metrics: EventMetrics,
+ kubernetes_metrics: KubernetesMetrics,
config: PodMonitorConfig | None = None,
kafka_service: KafkaEventService | None = None,
k8s_clients: K8sClients | None = None,
@@ -129,13 +133,14 @@ def make_pod_monitor(
cfg = config or PodMonitorConfig()
clients = k8s_clients or make_k8s_clients_di()
mapper = event_mapper or PodEventMapper(logger=_test_logger, k8s_api=FakeApi("{}"))
- service = kafka_service or create_test_kafka_event_service()[0]
+ service = kafka_service or create_test_kafka_event_service(event_metrics)[0]
return PodMonitor(
config=cfg,
kafka_event_service=service,
logger=_test_logger,
k8s_clients=clients,
event_mapper=mapper,
+ kubernetes_metrics=kubernetes_metrics,
)
@@ -143,12 +148,12 @@ def make_pod_monitor(
@pytest.mark.asyncio
-async def test_start_and_stop_lifecycle() -> None:
+async def test_start_and_stop_lifecycle(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.enable_state_reconciliation = False
spy = SpyMapper()
- pm = make_pod_monitor(config=cfg, event_mapper=spy) # type: ignore[arg-type]
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, event_mapper=spy) # type: ignore[arg-type]
# Replace _watch_pods to avoid real watch loop
async def _quick_watch() -> None:
@@ -166,14 +171,14 @@ async def _quick_watch() -> None:
@pytest.mark.asyncio
-async def test_watch_pod_events_flow_and_publish() -> None:
+async def test_watch_pod_events_flow_and_publish(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.enable_state_reconciliation = False
pod = make_pod(name="p", phase="Succeeded", labels={"execution-id": "e1"}, term_exit=0, resource_version="rv1")
k8s_clients = make_k8s_clients_di(events=[{"type": "MODIFIED", "object": pod}], resource_version="rv2")
- pm = make_pod_monitor(config=cfg, k8s_clients=k8s_clients)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, k8s_clients=k8s_clients)
pm._state = MonitorState.RUNNING
await pm._watch_pod_events()
@@ -181,9 +186,9 @@ async def test_watch_pod_events_flow_and_publish() -> None:
@pytest.mark.asyncio
-async def test_process_raw_event_invalid_and_handle_watch_error() -> None:
+async def test_process_raw_event_invalid_and_handle_watch_error(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
await pm._process_raw_event({})
@@ -195,13 +200,13 @@ async def test_process_raw_event_invalid_and_handle_watch_error() -> None:
@pytest.mark.asyncio
-async def test_get_status() -> None:
+async def test_get_status(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.namespace = "test-ns"
cfg.label_selector = "app=test"
cfg.enable_state_reconciliation = True
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._tracked_pods = {"pod1", "pod2"}
pm._reconnect_attempts = 3
pm._last_resource_version = "v123"
@@ -217,12 +222,12 @@ async def test_get_status() -> None:
@pytest.mark.asyncio
-async def test_reconciliation_loop_and_state() -> None:
+async def test_reconciliation_loop_and_state(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.enable_state_reconciliation = True
cfg.reconcile_interval_seconds = 0 # sleep(0) yields control immediately
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._state = MonitorState.RUNNING
reconcile_called: list[bool] = []
@@ -251,7 +256,7 @@ async def wrapped_reconcile() -> ReconciliationResult:
@pytest.mark.asyncio
-async def test_reconcile_state_success() -> None:
+async def test_reconcile_state_success(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.namespace = "test"
cfg.label_selector = "app=test"
@@ -260,7 +265,7 @@ async def test_reconcile_state_success() -> None:
pod2 = make_pod(name="pod2", phase="Running", resource_version="v1")
k8s_clients = make_k8s_clients_di(pods=[pod1, pod2])
- pm = make_pod_monitor(config=cfg, k8s_clients=k8s_clients)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, k8s_clients=k8s_clients)
pm._tracked_pods = {"pod2", "pod3"}
processed: list[str] = []
@@ -280,7 +285,7 @@ async def mock_process(event: PodEvent) -> None:
@pytest.mark.asyncio
-async def test_reconcile_state_exception() -> None:
+async def test_reconcile_state_exception(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
class FailV1(FakeV1Api):
@@ -296,7 +301,7 @@ def list_namespaced_pod(self, namespace: str, label_selector: str) -> Any:
watch=make_watch([]),
)
- pm = make_pod_monitor(config=cfg, k8s_clients=k8s_clients)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, k8s_clients=k8s_clients)
result = await pm._reconcile_state()
assert result.success is False
@@ -305,7 +310,7 @@ def list_namespaced_pod(self, namespace: str, label_selector: str) -> Any:
@pytest.mark.asyncio
-async def test_process_pod_event_full_flow() -> None:
+async def test_process_pod_event_full_flow(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.ignored_pod_phases = ["Unknown"]
@@ -321,7 +326,7 @@ class Event:
def clear_cache(self) -> None:
pass
- pm = make_pod_monitor(config=cfg, event_mapper=MockMapper()) # type: ignore[arg-type]
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, event_mapper=MockMapper()) # type: ignore[arg-type]
published: list[Any] = []
@@ -363,7 +368,7 @@ async def mock_publish(event: Any, pod: Any) -> None: # noqa: ARG001
@pytest.mark.asyncio
-async def test_process_pod_event_exception_handling() -> None:
+async def test_process_pod_event_exception_handling(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
class FailMapper:
@@ -373,7 +378,7 @@ def map_pod_event(self, pod: Any, event_type: WatchEventType) -> list[Any]:
def clear_cache(self) -> None:
pass
- pm = make_pod_monitor(config=cfg, event_mapper=FailMapper()) # type: ignore[arg-type]
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, event_mapper=FailMapper()) # type: ignore[arg-type]
event = PodEvent(
event_type=WatchEventType.ADDED,
@@ -386,10 +391,10 @@ def clear_cache(self) -> None:
@pytest.mark.asyncio
-async def test_publish_event_full_flow() -> None:
+async def test_publish_event_full_flow(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
- service, fake_producer = create_test_kafka_event_service()
- pm = make_pod_monitor(config=cfg, kafka_service=service)
+ service, fake_producer = create_test_kafka_event_service(event_metrics)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, kafka_service=service)
event = ExecutionCompletedEvent(
execution_id="exec1",
@@ -407,7 +412,7 @@ async def test_publish_event_full_flow() -> None:
@pytest.mark.asyncio
-async def test_publish_event_exception_handling() -> None:
+async def test_publish_event_exception_handling(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
class FailingProducer(FakeUnifiedProducer):
@@ -424,9 +429,10 @@ async def produce(
kafka_producer=failing_producer,
settings=Settings(),
logger=_test_logger,
+ event_metrics=event_metrics,
)
- pm = make_pod_monitor(config=cfg, kafka_service=failing_service)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, kafka_service=failing_service)
event = ExecutionStartedEvent(
execution_id="exec1",
@@ -443,11 +449,11 @@ async def produce(
@pytest.mark.asyncio
-async def test_handle_watch_error_max_attempts() -> None:
+async def test_handle_watch_error_max_attempts(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.max_reconnect_attempts = 2
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._state = MonitorState.RUNNING
pm._reconnect_attempts = 2
@@ -457,9 +463,9 @@ async def test_handle_watch_error_max_attempts() -> None:
@pytest.mark.asyncio
-async def test_watch_pods_main_loop() -> None:
+async def test_watch_pods_main_loop(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._state = MonitorState.RUNNING
watch_count: list[int] = []
@@ -480,9 +486,9 @@ async def mock_handle_error() -> None:
@pytest.mark.asyncio
-async def test_watch_pods_api_exception() -> None:
+async def test_watch_pods_api_exception(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._state = MonitorState.RUNNING
async def mock_watch() -> None:
@@ -504,9 +510,9 @@ async def mock_handle() -> None:
@pytest.mark.asyncio
-async def test_watch_pods_generic_exception() -> None:
+async def test_watch_pods_generic_exception(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._state = MonitorState.RUNNING
async def mock_watch() -> None:
@@ -526,7 +532,7 @@ async def mock_handle() -> None:
@pytest.mark.asyncio
-async def test_create_pod_monitor_context_manager(monkeypatch: pytest.MonkeyPatch) -> None:
+async def test_create_pod_monitor_context_manager(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics, monkeypatch: pytest.MonkeyPatch) -> None:
"""Test create_pod_monitor factory with auto-created dependencies."""
# Mock create_k8s_clients to avoid real K8s connection
mock_v1 = FakeV1Api()
@@ -552,10 +558,10 @@ def mock_create_clients(
cfg = PodMonitorConfig()
cfg.enable_state_reconciliation = False
- service, _ = create_test_kafka_event_service()
+ service, _ = create_test_kafka_event_service(event_metrics)
# Use the actual create_pod_monitor which will use our mocked create_k8s_clients
- async with create_pod_monitor(cfg, service, _test_logger) as monitor:
+ async with create_pod_monitor(cfg, service, _test_logger, kubernetes_metrics=kubernetes_metrics) as monitor:
assert monitor.state == MonitorState.RUNNING
final_state: MonitorState = monitor.state
@@ -563,12 +569,12 @@ def mock_create_clients(
@pytest.mark.asyncio
-async def test_create_pod_monitor_with_injected_k8s_clients() -> None:
+async def test_create_pod_monitor_with_injected_k8s_clients(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
"""Test create_pod_monitor with injected K8sClients (DI path)."""
cfg = PodMonitorConfig()
cfg.enable_state_reconciliation = False
- service, _ = create_test_kafka_event_service()
+ service, _ = create_test_kafka_event_service(event_metrics)
mock_v1 = FakeV1Api()
mock_watch = make_watch([])
@@ -581,7 +587,7 @@ async def test_create_pod_monitor_with_injected_k8s_clients() -> None:
)
async with create_pod_monitor(
- cfg, service, _test_logger, k8s_clients=mock_k8s_clients
+ cfg, service, _test_logger, k8s_clients=mock_k8s_clients, kubernetes_metrics=kubernetes_metrics
) as monitor:
assert monitor.state == MonitorState.RUNNING
assert monitor._clients is mock_k8s_clients
@@ -592,10 +598,10 @@ async def test_create_pod_monitor_with_injected_k8s_clients() -> None:
@pytest.mark.asyncio
-async def test_start_already_running() -> None:
+async def test_start_already_running(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
"""Test idempotent start via __aenter__."""
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
# Simulate already started state
pm._lifecycle_started = True
@@ -606,10 +612,10 @@ async def test_start_already_running() -> None:
@pytest.mark.asyncio
-async def test_stop_already_stopped() -> None:
+async def test_stop_already_stopped(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
"""Test idempotent stop via aclose()."""
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._state = MonitorState.STOPPED
# Not started, so aclose should be a no-op
@@ -617,10 +623,10 @@ async def test_stop_already_stopped() -> None:
@pytest.mark.asyncio
-async def test_stop_with_tasks() -> None:
+async def test_stop_with_tasks(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
"""Test cleanup of tasks on aclose()."""
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._state = MonitorState.RUNNING
pm._lifecycle_started = True
@@ -637,9 +643,9 @@ async def dummy_task() -> None:
assert len(pm._tracked_pods) == 0
-def test_update_resource_version() -> None:
+def test_update_resource_version(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
class Stream:
_stop_event = types.SimpleNamespace(resource_version="v123")
@@ -654,9 +660,9 @@ class BadStream:
@pytest.mark.asyncio
-async def test_process_raw_event_with_metadata() -> None:
+async def test_process_raw_event_with_metadata(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
processed: list[PodEvent] = []
@@ -682,9 +688,9 @@ async def mock_process(event: PodEvent) -> None:
@pytest.mark.asyncio
-async def test_watch_pods_api_exception_other_status() -> None:
+async def test_watch_pods_api_exception_other_status(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._state = MonitorState.RUNNING
async def mock_watch() -> None:
@@ -704,7 +710,7 @@ async def mock_handle() -> None:
@pytest.mark.asyncio
-async def test_watch_pod_events_with_field_selector() -> None:
+async def test_watch_pod_events_with_field_selector(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.field_selector = "status.phase=Running"
cfg.enable_state_reconciliation = False
@@ -729,7 +735,7 @@ def stream(self, func: Any, **kwargs: Any) -> FakeWatchStream:
watch=TrackingWatch([], "rv1"),
)
- pm = make_pod_monitor(config=cfg, k8s_clients=k8s_clients)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg, k8s_clients=k8s_clients)
pm._state = MonitorState.RUNNING
await pm._watch_pod_events()
@@ -738,12 +744,12 @@ def stream(self, func: Any, **kwargs: Any) -> FakeWatchStream:
@pytest.mark.asyncio
-async def test_reconciliation_loop_exception() -> None:
+async def test_reconciliation_loop_exception(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.enable_state_reconciliation = True
cfg.reconcile_interval_seconds = 0 # sleep(0) yields control immediately
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
pm._state = MonitorState.RUNNING
hit = asyncio.Event()
@@ -763,11 +769,11 @@ async def raising() -> ReconciliationResult:
@pytest.mark.asyncio
-async def test_start_with_reconciliation() -> None:
+async def test_start_with_reconciliation(event_metrics: EventMetrics, kubernetes_metrics: KubernetesMetrics) -> None:
cfg = PodMonitorConfig()
cfg.enable_state_reconciliation = True
- pm = make_pod_monitor(config=cfg)
+ pm = make_pod_monitor(event_metrics, kubernetes_metrics, config=cfg)
async def mock_watch() -> None:
return None
diff --git a/backend/tests/unit/services/result_processor/test_processor.py b/backend/tests/unit/services/result_processor/test_processor.py
index f78cc3bc..c13fe0ab 100644
--- a/backend/tests/unit/services/result_processor/test_processor.py
+++ b/backend/tests/unit/services/result_processor/test_processor.py
@@ -2,6 +2,7 @@
from unittest.mock import MagicMock
import pytest
+from app.core.metrics import EventMetrics, ExecutionMetrics
from app.domain.enums.events import EventType
from app.domain.enums.kafka import CONSUMER_GROUP_SUBSCRIPTIONS, GroupId, KafkaTopic
from app.services.result_processor.processor import ResultProcessor, ResultProcessorConfig
@@ -28,7 +29,9 @@ def test_custom_values(self) -> None:
assert config.processing_timeout == 600
-def test_create_dispatcher_registers_handlers() -> None:
+def test_create_dispatcher_registers_handlers(
+ execution_metrics: ExecutionMetrics, event_metrics: EventMetrics
+) -> None:
rp = ResultProcessor(
execution_repo=MagicMock(),
producer=MagicMock(),
@@ -36,6 +39,8 @@ def test_create_dispatcher_registers_handlers() -> None:
settings=MagicMock(),
idempotency_manager=MagicMock(),
logger=_test_logger,
+ execution_metrics=execution_metrics,
+ event_metrics=event_metrics,
)
dispatcher = rp._create_dispatcher()
assert dispatcher is not None
diff --git a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py
index b8e24fb1..b414884a 100644
--- a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py
+++ b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py
@@ -2,6 +2,7 @@
from unittest.mock import MagicMock
import pytest
+from app.core.metrics import EventMetrics
from app.db.repositories.resource_allocation_repository import ResourceAllocationRepository
from app.db.repositories.saga_repository import SagaRepository
from app.domain.enums.events import EventType
@@ -99,7 +100,7 @@ def get_steps(self) -> list[SagaStep[ExecutionRequestedEvent]]:
return [_StepOK()]
-def _orch() -> SagaOrchestrator:
+def _orch(event_metrics: EventMetrics) -> SagaOrchestrator:
return SagaOrchestrator(
config=SagaConfig(name="t", enable_compensation=True, store_events=True, publish_commands=False),
saga_repository=_FakeRepo(),
@@ -110,12 +111,13 @@ def _orch() -> SagaOrchestrator:
idempotency_manager=_FakeIdem(),
resource_allocation_repository=_FakeAlloc(),
logger=_test_logger,
+ event_metrics=event_metrics,
)
@pytest.mark.asyncio
-async def test_min_success_flow() -> None:
- orch = _orch()
+async def test_min_success_flow(event_metrics: EventMetrics) -> None:
+ orch = _orch(event_metrics)
orch.register_saga(_Saga)
# Set orchestrator running state via lifecycle property
orch._lifecycle_started = True
@@ -125,7 +127,7 @@ async def test_min_success_flow() -> None:
@pytest.mark.asyncio
-async def test_should_trigger_and_existing_short_circuit() -> None:
+async def test_should_trigger_and_existing_short_circuit(event_metrics: EventMetrics) -> None:
fake_repo = _FakeRepo()
orch = SagaOrchestrator(
config=SagaConfig(name="t", enable_compensation=True, store_events=True, publish_commands=False),
@@ -137,6 +139,7 @@ async def test_should_trigger_and_existing_short_circuit() -> None:
idempotency_manager=_FakeIdem(),
resource_allocation_repository=_FakeAlloc(),
logger=_test_logger,
+ event_metrics=event_metrics,
)
orch.register_saga(_Saga)
assert orch._should_trigger_saga(_Saga, make_execution_requested_event(execution_id="e")) is True
diff --git a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py
index 6e78449b..6fa5d1ef 100644
--- a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py
+++ b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py
@@ -2,7 +2,7 @@
from unittest.mock import MagicMock
import pytest
-from app.core.metrics.events import EventMetrics
+from app.core.metrics import EventMetrics
from app.domain.enums.events import EventType
from app.domain.events.typed import DomainEvent, EventMetadata, ExecutionStartedEvent
from app.events.core import EventDispatcher
diff --git a/backend/tests/unit/services/sse/test_shutdown_manager.py b/backend/tests/unit/services/sse/test_shutdown_manager.py
index 69c9d9f5..05f6e023 100644
--- a/backend/tests/unit/services/sse/test_shutdown_manager.py
+++ b/backend/tests/unit/services/sse/test_shutdown_manager.py
@@ -3,6 +3,7 @@
import pytest
from app.core.lifecycle import LifecycleEnabled
+from app.core.metrics import ConnectionMetrics
from app.services.sse.sse_shutdown_manager import SSEShutdownManager
_test_logger = logging.getLogger("test.services.sse.shutdown_manager")
@@ -21,8 +22,8 @@ async def _on_stop(self) -> None:
@pytest.mark.asyncio
-async def test_shutdown_graceful_notify_and_drain() -> None:
- mgr = SSEShutdownManager(drain_timeout=1.0, notification_timeout=0.01, force_close_timeout=0.1, logger=_test_logger)
+async def test_shutdown_graceful_notify_and_drain(connection_metrics: ConnectionMetrics) -> None:
+ mgr = SSEShutdownManager(drain_timeout=1.0, notification_timeout=0.01, force_close_timeout=0.1, logger=_test_logger, connection_metrics=connection_metrics)
# Register two connections and arrange that they unregister when notified
ev1 = await mgr.register_connection("e1", "c1")
@@ -45,9 +46,9 @@ async def on_shutdown(event: asyncio.Event, cid: str) -> None:
@pytest.mark.asyncio
-async def test_shutdown_force_close_calls_router_stop_and_rejects_new() -> None:
+async def test_shutdown_force_close_calls_router_stop_and_rejects_new(connection_metrics: ConnectionMetrics) -> None:
mgr = SSEShutdownManager(
- drain_timeout=0.01, notification_timeout=0.01, force_close_timeout=0.01, logger=_test_logger
+ drain_timeout=0.01, notification_timeout=0.01, force_close_timeout=0.01, logger=_test_logger, connection_metrics=connection_metrics
)
router = _FakeRouter()
mgr.set_router(router)
@@ -69,8 +70,8 @@ async def test_shutdown_force_close_calls_router_stop_and_rejects_new() -> None:
@pytest.mark.asyncio
-async def test_get_shutdown_status_transitions() -> None:
- m = SSEShutdownManager(drain_timeout=0.01, notification_timeout=0.0, force_close_timeout=0.0, logger=_test_logger)
+async def test_get_shutdown_status_transitions(connection_metrics: ConnectionMetrics) -> None:
+ m = SSEShutdownManager(drain_timeout=0.01, notification_timeout=0.0, force_close_timeout=0.0, logger=_test_logger, connection_metrics=connection_metrics)
st0 = m.get_shutdown_status()
assert st0.phase == "ready"
await m.initiate_shutdown()
diff --git a/backend/tests/unit/services/sse/test_sse_service.py b/backend/tests/unit/services/sse/test_sse_service.py
index 5aa59e21..48ff1751 100644
--- a/backend/tests/unit/services/sse/test_sse_service.py
+++ b/backend/tests/unit/services/sse/test_sse_service.py
@@ -6,6 +6,7 @@
from unittest.mock import MagicMock
import pytest
+from app.core.metrics import ConnectionMetrics
from app.db.repositories.sse_repository import SSERepository
from app.domain.enums.events import EventType
from app.domain.enums.execution import ExecutionStatus
@@ -129,12 +130,12 @@ def _decode(evt: dict[str, Any]) -> dict[str, Any]:
@pytest.mark.asyncio
-async def test_execution_stream_closes_on_failed_event() -> None:
+async def test_execution_stream_closes_on_failed_event(connection_metrics: ConnectionMetrics) -> None:
repo = _FakeRepo()
bus = _FakeBus()
sm = _FakeShutdown()
svc = SSEService(repository=repo, router=_FakeRouter(), sse_bus=bus, shutdown_manager=sm,
- settings=_make_fake_settings(), logger=_test_logger)
+ settings=_make_fake_settings(), logger=_test_logger, connection_metrics=connection_metrics)
agen = svc.create_execution_stream("exec-1", user_id="u1")
first = await agen.__anext__()
@@ -158,7 +159,7 @@ async def test_execution_stream_closes_on_failed_event() -> None:
@pytest.mark.asyncio
-async def test_execution_stream_result_stored_includes_result_payload() -> None:
+async def test_execution_stream_result_stored_includes_result_payload(connection_metrics: ConnectionMetrics) -> None:
repo = _FakeRepo()
# DomainExecution with RU to_dict
repo.exec_for_result = DomainExecution(
@@ -178,7 +179,7 @@ async def test_execution_stream_result_stored_includes_result_payload() -> None:
bus = _FakeBus()
sm = _FakeShutdown()
svc = SSEService(repository=repo, router=_FakeRouter(), sse_bus=bus, shutdown_manager=sm,
- settings=_make_fake_settings(), logger=_test_logger)
+ settings=_make_fake_settings(), logger=_test_logger, connection_metrics=connection_metrics)
agen = svc.create_execution_stream("exec-2", user_id="u1")
await agen.__anext__() # connected
@@ -196,14 +197,14 @@ async def test_execution_stream_result_stored_includes_result_payload() -> None:
@pytest.mark.asyncio
-async def test_notification_stream_connected_and_heartbeat_and_message() -> None:
+async def test_notification_stream_connected_and_heartbeat_and_message(connection_metrics: ConnectionMetrics) -> None:
repo = _FakeRepo()
bus = _FakeBus()
sm = _FakeShutdown()
settings = _make_fake_settings()
settings.SSE_HEARTBEAT_INTERVAL = 0 # emit immediately
svc = SSEService(repository=repo, router=_FakeRouter(), sse_bus=bus, shutdown_manager=sm, settings=settings,
- logger=_test_logger)
+ logger=_test_logger, connection_metrics=connection_metrics)
agen = svc.create_notification_stream("u1")
connected = await agen.__anext__()
@@ -241,9 +242,9 @@ async def test_notification_stream_connected_and_heartbeat_and_message() -> None
@pytest.mark.asyncio
-async def test_health_status_shape() -> None:
+async def test_health_status_shape(connection_metrics: ConnectionMetrics) -> None:
svc = SSEService(repository=_FakeRepo(), router=_FakeRouter(), sse_bus=_FakeBus(), shutdown_manager=_FakeShutdown(),
- settings=_make_fake_settings(), logger=_test_logger)
+ settings=_make_fake_settings(), logger=_test_logger, connection_metrics=connection_metrics)
h = await svc.get_health_status()
assert isinstance(h, SSEHealthDomain)
assert h.active_consumers == 3 and h.active_executions == 2
diff --git a/backend/tests/unit/services/sse/test_sse_shutdown_manager.py b/backend/tests/unit/services/sse/test_sse_shutdown_manager.py
index 43d3e61c..fc7ffb3b 100644
--- a/backend/tests/unit/services/sse/test_sse_shutdown_manager.py
+++ b/backend/tests/unit/services/sse/test_sse_shutdown_manager.py
@@ -3,6 +3,7 @@
import pytest
from app.core.lifecycle import LifecycleEnabled
+from app.core.metrics import ConnectionMetrics
from app.services.sse.sse_shutdown_manager import SSEShutdownManager
pytestmark = pytest.mark.unit
@@ -23,8 +24,8 @@ async def _on_stop(self) -> None:
@pytest.mark.asyncio
-async def test_register_unregister_and_shutdown_flow() -> None:
- mgr = SSEShutdownManager(drain_timeout=0.5, notification_timeout=0.1, force_close_timeout=0.1, logger=_test_logger)
+async def test_register_unregister_and_shutdown_flow(connection_metrics: ConnectionMetrics) -> None:
+ mgr = SSEShutdownManager(drain_timeout=0.5, notification_timeout=0.1, force_close_timeout=0.1, logger=_test_logger, connection_metrics=connection_metrics)
mgr.set_router(_FakeRouter())
# Register two connections
@@ -50,9 +51,9 @@ async def test_register_unregister_and_shutdown_flow() -> None:
@pytest.mark.asyncio
-async def test_reject_new_connection_during_shutdown() -> None:
+async def test_reject_new_connection_during_shutdown(connection_metrics: ConnectionMetrics) -> None:
mgr = SSEShutdownManager(drain_timeout=0.5, notification_timeout=0.01, force_close_timeout=0.01,
- logger=_test_logger)
+ logger=_test_logger, connection_metrics=connection_metrics)
# Pre-register one active connection - shutdown will block waiting for it
e = await mgr.register_connection("e", "c0")
assert e is not None
diff --git a/backend/workers/run_result_processor.py b/backend/workers/run_result_processor.py
index 11cb7a72..5431b011 100644
--- a/backend/workers/run_result_processor.py
+++ b/backend/workers/run_result_processor.py
@@ -5,6 +5,7 @@
from app.core.container import create_result_processor_container
from app.core.logging import setup_logger
+from app.core.metrics import EventMetrics, ExecutionMetrics
from app.core.tracing import init_tracing
from app.db.docs import ALL_DOCUMENTS
from app.db.repositories.execution_repository import ExecutionRepository
@@ -30,6 +31,8 @@ async def run_result_processor(settings: Settings) -> None:
schema_registry = await container.get(SchemaRegistryManager)
idempotency_manager = await container.get(IdempotencyManager)
execution_repo = await container.get(ExecutionRepository)
+ execution_metrics = await container.get(ExecutionMetrics)
+ event_metrics = await container.get(EventMetrics)
logger = await container.get(logging.Logger)
logger.info(f"Beanie ODM initialized with {len(ALL_DOCUMENTS)} document models")
@@ -41,6 +44,8 @@ async def run_result_processor(settings: Settings) -> None:
settings=settings,
idempotency_manager=idempotency_manager,
logger=logger,
+ execution_metrics=execution_metrics,
+ event_metrics=event_metrics,
)
# Shutdown event - signal handlers just set this
diff --git a/cert-generator/Dockerfile b/cert-generator/Dockerfile
index 6dc068ab..5cc0fdb5 100644
--- a/cert-generator/Dockerfile
+++ b/cert-generator/Dockerfile
@@ -7,7 +7,7 @@ ARG KUBECTL_VERSION=v1.33.6
ARG MKCERT_VERSION=v1.4.4
# Install required packages and tools for all architectures
-RUN apk add --no-cache wget ca-certificates openssl curl dos2unix netcat-openbsd && \
+RUN apk add --no-cache wget ca-certificates openssl curl dos2unix netcat-openbsd iproute2 && \
update-ca-certificates && \
# Detect architecture and install appropriate binaries
ARCH=$(uname -m); \
diff --git a/cert-generator/setup-k8s.sh b/cert-generator/setup-k8s.sh
index c665a17e..43609841 100644
--- a/cert-generator/setup-k8s.sh
+++ b/cert-generator/setup-k8s.sh
@@ -3,6 +3,50 @@ set -e
echo "Setting up Kubernetes resources..."
+# Auto-configure kubectl for k3s if needed
+# k3s stores its kubeconfig at /etc/rancher/k3s/k3s.yaml
+# When running on a Docker bridge network, we need to use a routable host IP instead of 127.0.0.1
+configure_kubectl() {
+ # If kubectl already works, nothing to do
+ if kubectl version --request-timeout=2s >/dev/null 2>&1; then
+ return 0
+ fi
+ # Try k3s kubeconfig with routable IP (for bridge network containers)
+ if [ -r /etc/rancher/k3s/k3s.yaml ]; then
+ # Get the k3s node-ip from config (routable from containers)
+ K3S_HOST_IP=""
+ if [ -r /etc/rancher/k3s/config.yaml ]; then
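+ # Extract the value of the "node-ip:" key, stripping optional surrounding quotes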
+ K3S_HOST_IP=$(grep -E '^node-ip:' /etc/rancher/k3s/config.yaml 2>/dev/null | sed -E 's/^node-ip:[[:space:]]*"?([^"[:space:]]+)"?.*/\1/' | head -1)
+ fi
+ # If no node-ip found, try to detect host from container (for CI/Docker environments)
+ if [ -z "$K3S_HOST_IP" ] || [ "$K3S_HOST_IP" = "127.0.0.1" ]; then
+ # Prefer host.docker.internal (works with TLS cert SANs, requires extra_hosts in compose)
+ if getent hosts host.docker.internal >/dev/null 2>&1; then
+ K3S_HOST_IP="host.docker.internal"
+ fi
+ fi
+ if [ -z "$K3S_HOST_IP" ] || [ "$K3S_HOST_IP" = "127.0.0.1" ]; then
+ # Fallback: Docker gateway (may need insecure TLS if IP not in cert SANs)
+ K3S_HOST_IP=$(ip route 2>/dev/null | grep default | awk '{print $3}' | head -1)
+ fi
+ if [ -n "$K3S_HOST_IP" ] && [ "$K3S_HOST_IP" != "127.0.0.1" ]; then
+ # Create modified kubeconfig with routable IP/hostname
+ # Handle both 127.0.0.1 and 0.0.0.0 (k3s may use either depending on config)
+ mkdir -p /tmp/kube
+ sed -E "s#https://(127\.0\.0\.1|0\.0\.0\.0):#https://${K3S_HOST_IP}:#g" /etc/rancher/k3s/k3s.yaml > /tmp/kube/config
+ export KUBECONFIG=/tmp/kube/config
+ echo "Using k3s kubeconfig with routable IP: $K3S_HOST_IP"
+ else
+ export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
+ echo "Using k3s kubeconfig: $KUBECONFIG"
+ fi
+ return 0
+ fi
+ return 1
+}
+
+configure_kubectl || true
+
# In CI mode, skip k8s setup if connection fails
if [ -n "$CI" ]; then
echo "Running in CI mode"
@@ -34,11 +78,22 @@ EOF
fi
fi
-# Check k8s connection
-if ! kubectl version --request-timeout=5s >/dev/null 2>&1; then
- echo "ERROR: Cannot connect to Kubernetes cluster!"
- exit 1
-fi
+# Check k8s connection with retries (k3s may still be initializing)
+echo "Checking Kubernetes connection..."
+MAX_RETRIES=12
+RETRY_DELAY=5
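+# Worst case: MAX_RETRIES * RETRY_DELAY = 60 seconds waiting for the API server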
+for i in $(seq 1 $MAX_RETRIES); do
+ if kubectl version --request-timeout=10s >/dev/null 2>&1; then
+ echo "Connected to Kubernetes (attempt $i)"
+ break
+ fi
+ if [ $i -eq $MAX_RETRIES ]; then
+ echo "ERROR: Cannot connect to Kubernetes cluster after $MAX_RETRIES attempts!"
+ exit 1
+ fi
+ echo "Kubernetes not ready, retrying in ${RETRY_DELAY}s... (attempt $i/$MAX_RETRIES)"
+ sleep $RETRY_DELAY
+done
echo "Connected to Kubernetes"
@@ -167,8 +222,29 @@ TOKEN_LEN=$(printf %s "$TOKEN" | wc -c | awk '{print $1}')
TOKEN_HEAD=$(printf %s "$TOKEN" | cut -c1-10)
echo "ServiceAccount token acquired (len=${TOKEN_LEN}, head=${TOKEN_HEAD}...)"
-# For containers: use host.docker.internal (mapped to host-gateway) but keep TLS host verification via tls-server-name
-CONTAINER_SERVER="https://host.docker.internal:${K8S_PORT}"
+# Determine the host IP that containers can reach
+# Priority: 1) k3s node-ip config, 2) server URL from kubeconfig, 3) fallback to host.docker.internal
+get_container_host_ip() {
+ # Try k3s config node-ip (most reliable for k3s setups)
+ if [ -f /etc/rancher/k3s/config.yaml ]; then
+ K3S_NODE_IP=$(grep -E '^node-ip:' /etc/rancher/k3s/config.yaml 2>/dev/null | sed -E 's/^node-ip:[[:space:]]*"?([^"[:space:]]+)"?.*/\1/' | head -1)
+ if [ -n "$K3S_NODE_IP" ] && [ "$K3S_NODE_IP" != "127.0.0.1" ]; then
+ echo "$K3S_NODE_IP"
+ return
+ fi
+ fi
+ # Try extracting from kubeconfig server URL (if not localhost)
+ if [ -n "$SERVER_HOST" ] && [ "$SERVER_HOST" != "127.0.0.1" ] && [ "$SERVER_HOST" != "localhost" ]; then
+ echo "$SERVER_HOST"
+ return
+ fi
+ # Fallback to host.docker.internal (works on Docker Desktop, may need extra_hosts on Linux)
+ echo "host.docker.internal"
+}
+
+CONTAINER_HOST_IP=$(get_container_host_ip)
+CONTAINER_SERVER="https://${CONTAINER_HOST_IP}:${K8S_PORT}"
+echo "Detected container-accessible host IP: ${CONTAINER_HOST_IP}"
echo "Writing kubeconfig for containers:"
echo " cluster: ${CLUSTER_NAME}"
diff --git a/deploy.sh b/deploy.sh
index 6d24f356..66f6cf8f 100755
--- a/deploy.sh
+++ b/deploy.sh
@@ -55,11 +55,17 @@ show_help() {
echo "Usage: ./deploy.sh Basic Information
@@ -25,32 +45,32 @@
Event ID
- {event.event.event_id}
+ {eventData.event_id}
Event Type
Timestamp
- {formatTimestamp(event.event.timestamp)}
+ {formatTimestamp(eventData.timestamp)}
Correlation ID
- {event.event.correlation_id}
+ {eventData.correlation_id}
@@ -58,19 +78,19 @@
Aggregate ID
- {event.event.aggregate_id || '-'}
+ {eventData.aggregate_id || '-'}
Metadata
- {JSON.stringify(event.event.metadata, null, 2)}
+ {JSON.stringify(eventData.metadata, null, 2)}
Payload
- {JSON.stringify(event.event.payload, null, 2)}
+ {JSON.stringify(eventData.payload, null, 2)}
Related Events
Recent Execution Events
Context Data
- {JSON.stringify(saga.context_data, null, 2)}
-