From 488c1a70a357693250ebe768a7454ab6d2c5bbda Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Sun, 7 Jun 2026 17:57:31 +0530 Subject: [PATCH] fix(deploy): pass full CH DSN; wire TRACEBILITY_CLICKHOUSE_URL to ingest-api MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two crash-loops on the live deploy after PR #8 unblocked the rollout: ingest-api: RuntimeError: TRACEBILITY_CLICKHOUSE_URL is required api: clickhouse_connect.driver.exceptions.DatabaseError: Code: 516. Authentication failed: ... default user Root cause (single pattern, two symptoms): PR #3 (multi-tenancy) added new ClickHouse-touching code paths in api and ingest-api but invented a credential-passing convention the helm chart didn't ship — splitting URL / USER / PASSWORD / DATABASE into four env vars. The chart's existing tracebility-clickhouse secret has ONE key (``url``) holding the full DSN with embedded credentials, matching the postgres / redis pattern. So in production: - ingest-api template never set TRACEBILITY_CLICKHOUSE_URL at all → config.load() raised on the missing required env var. - api template DID set TRACEBILITY_CLICKHOUSE_URL, but the new code passed username='default' / password='' as kwargs to clickhouse_connect.get_async_client(dsn=URL, username=...). Those kwargs override the DSN's embedded credentials, so it tried to auth as 'default' (which doesn't exist in the cluster). Fix: - AuditWriter.from_url(url) now accepts only the DSN; no override kwargs. The DSN's embedded credentials carry through. - reconciler_loop() in both reconcilers: same simplification. - Settings on api + ingest-api: drop clickhouse_user / clickhouse_password / clickhouse_database. Only clickhouse_url. - ingest-api deployment template: add the missing TRACEBILITY_CLICKHOUSE_URL env-from-secret line. - api deployment template: also pass TRACEBILITY_REDIS_URL (PR #3 added optional Redis support to the api for api-key invalidation + reconciler hooks). Verified locally: - ``uv run pytest services/`` → 68 passed. - ``helm template`` confirms api + ingest-api templates render with PG_DSN + REDIS_URL + CLICKHOUSE_URL all sourced from secrets. - ``ruff check`` and ``ruff format --check`` clean. Signed-off-by: Gaurav Dubey Signed-off-by: gaurav0107 --- .../tracebility/templates/api-deployment.yaml | 1 + .../templates/ingest-api-deployment.yaml | 1 + .../_shared/tenant/tracebility_tenant/audit.py | 17 ++++++++--------- .../tracebility_tenant/reconciler_audit.py | 12 +++--------- .../tracebility_tenant/reconciler_quota.py | 18 ++++++++---------- services/api/tracebility_api/app.py | 9 +-------- services/api/tracebility_api/config.py | 6 ------ services/ingest-api/tracebility_ingest/app.py | 7 +------ .../ingest-api/tracebility_ingest/config.py | 11 ++++------- 9 files changed, 27 insertions(+), 55 deletions(-) diff --git a/deploy/helm/tracebility/templates/api-deployment.yaml b/deploy/helm/tracebility/templates/api-deployment.yaml index b339d94..17c7961 100644 --- a/deploy/helm/tracebility/templates/api-deployment.yaml +++ b/deploy/helm/tracebility/templates/api-deployment.yaml @@ -49,6 +49,7 @@ spec: value: {{ .Values.logging.level | quote }} {{- include "tracebility.envFromSecret" (dict "cfg" .Values.postgres "name" "TRACEBILITY_PG_DSN") | nindent 12 }} {{- include "tracebility.envFromSecret" (dict "cfg" .Values.clickhouse "name" "TRACEBILITY_CLICKHOUSE_URL") | nindent 12 }} + {{- include "tracebility.envFromSecret" (dict "cfg" .Values.redis "name" "TRACEBILITY_REDIS_URL") | nindent 12 }} {{- include "tracebility.envFromSecret" (dict "cfg" .Values.session "name" "TRACEBILITY_SESSION_SECRET") | nindent 12 }} {{- if .Values.oauth.existingSecret }} - name: OAUTH_GOOGLE_CLIENT_ID diff --git a/deploy/helm/tracebility/templates/ingest-api-deployment.yaml b/deploy/helm/tracebility/templates/ingest-api-deployment.yaml index 5f8f3e1..4de8f92 100644 --- a/deploy/helm/tracebility/templates/ingest-api-deployment.yaml +++ b/deploy/helm/tracebility/templates/ingest-api-deployment.yaml @@ -69,6 +69,7 @@ spec: value: /var/lib/tracebility/ingest-buffer {{- include "tracebility.envFromSecret" (dict "cfg" .Values.postgres "name" "TRACEBILITY_PG_DSN") | nindent 12 }} {{- include "tracebility.envFromSecret" (dict "cfg" .Values.redis "name" "TRACEBILITY_REDIS_URL") | nindent 12 }} + {{- include "tracebility.envFromSecret" (dict "cfg" .Values.clickhouse "name" "TRACEBILITY_CLICKHOUSE_URL") | nindent 12 }} {{- if .Values.ingestApi.diskBuffer.enabled }} volumeMounts: - name: ingest-buffer diff --git a/services/_shared/tenant/tracebility_tenant/audit.py b/services/_shared/tenant/tracebility_tenant/audit.py index 981235e..d675037 100644 --- a/services/_shared/tenant/tracebility_tenant/audit.py +++ b/services/_shared/tenant/tracebility_tenant/audit.py @@ -86,17 +86,16 @@ async def from_url( cls, url: str, *, - username: str | None = None, - password: str | None = None, - database: str = "default", table: str = "audit_log", ) -> AuditWriter: - client = await clickhouse_connect.get_async_client( - dsn=url, - username=username, - password=password, - database=database, - ) + # Pass the full DSN through. Credentials and database are encoded + # in the URL (http://user:pass@host:port/db). Earlier versions + # accepted username/password/database kwargs, but that turned out + # to override the DSN's embedded credentials when callers passed + # default values — yielding cryptic AUTHENTICATION_FAILED errors + # in deployed environments where the helm chart only ships the + # DSN as a single secret. + client = await clickhouse_connect.get_async_client(dsn=url) return cls(client, table=table) async def write(self, event: AuditEvent) -> None: diff --git a/services/_shared/tenant/tracebility_tenant/reconciler_audit.py b/services/_shared/tenant/tracebility_tenant/reconciler_audit.py index c2352d1..b908470 100644 --- a/services/_shared/tenant/tracebility_tenant/reconciler_audit.py +++ b/services/_shared/tenant/tracebility_tenant/reconciler_audit.py @@ -192,17 +192,11 @@ async def reconciler_loop( *, pg: asyncpg.Pool, clickhouse_url: str, - clickhouse_user: str = "default", - clickhouse_password: str = "", - clickhouse_database: str = "default", interval_s: float = 24 * 60 * 60.0, # daily ) -> None: - ch = await clickhouse_connect.get_async_client( - dsn=clickhouse_url, - username=clickhouse_user, - password=clickhouse_password, - database=clickhouse_database, - ) + """``clickhouse_url`` is a full DSN with embedded credentials. See + reconciler_quota.reconciler_loop docstring for why we don't split.""" + ch = await clickhouse_connect.get_async_client(dsn=clickhouse_url) try: while True: try: diff --git a/services/_shared/tenant/tracebility_tenant/reconciler_quota.py b/services/_shared/tenant/tracebility_tenant/reconciler_quota.py index 9c705c9..6f7e766 100644 --- a/services/_shared/tenant/tracebility_tenant/reconciler_quota.py +++ b/services/_shared/tenant/tracebility_tenant/reconciler_quota.py @@ -165,18 +165,16 @@ async def reconciler_loop( pg: asyncpg.Pool, redis: redis_async.Redis, clickhouse_url: str, - clickhouse_user: str = "default", - clickhouse_password: str = "", - clickhouse_database: str = "default", interval_s: float = 60.0, ) -> None: - """Run forever. Operator service spawns this as a background task.""" - ch = await clickhouse_connect.get_async_client( - dsn=clickhouse_url, - username=clickhouse_user, - password=clickhouse_password, - database=clickhouse_database, - ) + """Run forever. Operator service spawns this as a background task. + + ``clickhouse_url`` is a full DSN with embedded credentials and database + (http(s)://user:pass@host:port/db). The helm chart ships it as a single + secret value; splitting credentials into separate kwargs caused + AUTHENTICATION_FAILED in production. + """ + ch = await clickhouse_connect.get_async_client(dsn=clickhouse_url) try: while True: try: diff --git a/services/api/tracebility_api/app.py b/services/api/tracebility_api/app.py index da1bdfb..9ddbc9e 100644 --- a/services/api/tracebility_api/app.py +++ b/services/api/tracebility_api/app.py @@ -92,14 +92,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: # ``audit_log`` is read-only after this lands; new writes go to # ClickHouse via this writer (spec §5.8). app.state.audit_writer = ( - await AuditWriter.from_url( - settings.clickhouse_url, - username=settings.clickhouse_user, - password=settings.clickhouse_password, - database=settings.clickhouse_database, - ) - if settings.clickhouse_url - else None + await AuditWriter.from_url(settings.clickhouse_url) if settings.clickhouse_url else None ) # Redis is needed for the api-key invalidation publish path and the # quota / audit reconciler hooks. Optional; if unset, those features diff --git a/services/api/tracebility_api/config.py b/services/api/tracebility_api/config.py index 1a46809..f6ada77 100644 --- a/services/api/tracebility_api/config.py +++ b/services/api/tracebility_api/config.py @@ -16,9 +16,6 @@ class Settings: postgres_dsn: str session_secret: str clickhouse_url: str | None = None - clickhouse_user: str = "default" - clickhouse_password: str = "" - clickhouse_database: str = "default" redis_url: str | None = None session_cookie_name: str = "tracebility_session" session_max_age_seconds: int = 60 * 60 * 24 * 7 @@ -58,9 +55,6 @@ def load() -> Settings: postgres_dsn=postgres_dsn, session_secret=session_secret, clickhouse_url=os.environ.get("TRACEBILITY_CLICKHOUSE_URL"), - clickhouse_user=os.environ.get("TRACEBILITY_CLICKHOUSE_USER", "default"), - clickhouse_password=os.environ.get("TRACEBILITY_CLICKHOUSE_PASSWORD", ""), - clickhouse_database=os.environ.get("TRACEBILITY_CLICKHOUSE_DATABASE", "default"), redis_url=os.environ.get("TRACEBILITY_REDIS_URL"), session_cookie_name=os.environ.get("TRACEBILITY_SESSION_COOKIE", "tracebility_session"), session_max_age_seconds=int( diff --git a/services/ingest-api/tracebility_ingest/app.py b/services/ingest-api/tracebility_ingest/app.py index 1cef8c6..487ade4 100644 --- a/services/ingest-api/tracebility_ingest/app.py +++ b/services/ingest-api/tracebility_ingest/app.py @@ -85,12 +85,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: await app.state.resolver.start_invalidator() app.state.rate_limiter = RateLimiter(app.state.redis) app.state.quota_meter = QuotaMeter(app.state.redis) - app.state.audit_writer = await AuditWriter.from_url( - settings.clickhouse_url, - username=settings.clickhouse_user, - password=settings.clickhouse_password, - database=settings.clickhouse_database, - ) + app.state.audit_writer = await AuditWriter.from_url(settings.clickhouse_url) app.state.redactor = redactor_from_env(settings.redact_pii) drain_task = asyncio.create_task(_drain_loop(app.state.enqueue)) log.info( diff --git a/services/ingest-api/tracebility_ingest/config.py b/services/ingest-api/tracebility_ingest/config.py index 7896428..07deb36 100644 --- a/services/ingest-api/tracebility_ingest/config.py +++ b/services/ingest-api/tracebility_ingest/config.py @@ -12,11 +12,11 @@ class Settings: redis_url: str postgres_dsn: str - # ClickHouse used by AuditWriter (egress + quota.block events) + # Full DSN — http(s)://user:pass@host:port/db. Credentials and + # database name are embedded; the helm chart ships this as a single + # secret value (see deploy/helm/.../templates/ingest-api-deployment.yaml + # and the tracebility-clickhouse secret created by the bootstrap runbook). clickhouse_url: str - clickhouse_user: str - clickhouse_password: str - clickhouse_database: str = "default" # spans larger than this are spilled to object storage (per ER-06) inline_blob_max_bytes: int = 1_000_000 # disk buffer for ingest enqueue when redis is unavailable (ER-01) @@ -46,9 +46,6 @@ def load() -> Settings: redis_url=redis_url, postgres_dsn=postgres_dsn, clickhouse_url=clickhouse_url, - clickhouse_user=os.environ.get("TRACEBILITY_CLICKHOUSE_USER", "default"), - clickhouse_password=os.environ.get("TRACEBILITY_CLICKHOUSE_PASSWORD", ""), - clickhouse_database=os.environ.get("TRACEBILITY_CLICKHOUSE_DATABASE", "default"), inline_blob_max_bytes=int(os.environ.get("TRACEBILITY_INLINE_BLOB_MAX_BYTES", "1000000")), disk_buffer_path=os.environ.get( "TRACEBILITY_DISK_BUFFER_PATH", "/var/lib/tracebility/ingest-buffer"