From 2b2e1f29d0321f3dc106f13c3782f9c3198085c3 Mon Sep 17 00:00:00 2001 From: "Benito J. Gonzalez" Date: Wed, 27 May 2026 12:40:13 -0700 Subject: [PATCH] Issue #938: Redact DATABASE_URL password before logging at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `database_setup.py` was logging the full `DATABASE_URL` (including the PG password) on every MDR API task startup. The shared `dev`/`demo` CloudWatch log group has broader read permissions than the SSM parameter that backs the password, so anyone with `logs:FilterLogEvents` could recover the credential by tailing the stream during startup — and every restart leaves another copy in retention history. Add a small `_redact_url` helper that uses `urlparse`/`urlunparse` to swap the password segment for `***` while preserving scheme, username, host, port, and database. The helper is best-effort: any parsing surprise (malformed URL, port that isn't an integer because an env var was unset) returns a `` sentinel rather than raising, so a logging path can never take down MDR startup. Adds the first tests under `test/components/lif/mdr_utils/`: - `__init__.py` (package marker matching siblings) - `conftest.py` seeds dummy POSTGRESQL_* env vars so importing `database_setup` doesn't fail at engine-construction time (these pure-Python helper tests never touch a real DB) - `test_database_setup.py` — five cases for `_redact_url` covering typical password redaction, special characters, missing port, no-password URLs (e.g. IAM auth), and unparseable input. Operational follow-up (out of scope here): rotate the dev + demo DB passwords since the previous values are in CloudWatch history. Other LIF services that share this logging pattern (graphql_*, translator_*, advisor_api, etc.) are also flagged for an audit pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- components/lif/mdr_utils/database_setup.py | 34 +++++++++++++- test/components/lif/mdr_utils/__init__.py | 0 test/components/lif/mdr_utils/conftest.py | 15 ++++++ .../lif/mdr_utils/test_database_setup.py | 47 +++++++++++++++++++ 4 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 test/components/lif/mdr_utils/__init__.py create mode 100644 test/components/lif/mdr_utils/conftest.py create mode 100644 test/components/lif/mdr_utils/test_database_setup.py diff --git a/components/lif/mdr_utils/database_setup.py b/components/lif/mdr_utils/database_setup.py index d670584..f1780b7 100644 --- a/components/lif/mdr_utils/database_setup.py +++ b/components/lif/mdr_utils/database_setup.py @@ -4,6 +4,7 @@ import os import re from typing import AsyncGenerator +from urllib.parse import urlparse, urlunparse from fastapi import HTTPException, Request, status from lif.mdr_utils.logger_config import get_logger @@ -15,8 +16,39 @@ logger = get_logger(__name__) +def _redact_url(url: str) -> str: + """Mask the password in a SQLAlchemy connection URL for safe logging. + + Returns the URL with the password replaced by ``***`` while preserving + scheme, username, host, port, and database. URLs without a password + are returned unchanged. Issue #938: the previous startup log emitted + the full URL including the credential, exposing the dev/demo DB + password to anyone with read access on the shared CloudWatch log + group. + + Best-effort: this is only called for logging, so we never want it to + raise and take down MDR startup. Any parsing surprise (urlparse on + a malformed URL, ``parts.port`` raising on a non-integer port — which + happens when an env var was unset and the URL contains the literal + string ``None``) returns a sentinel so the log line still emits + something operator-readable. + """ + try: + parts = urlparse(url) + if not parts.password: + return url + user = parts.username or "" + host = parts.hostname or "" + netloc = f"{user}:***@{host}" + if parts.port: + netloc += f":{parts.port}" + return urlunparse(parts._replace(netloc=netloc)) + except ValueError: + return "" + + DATABASE_URL = f"postgresql+asyncpg://{os.getenv('POSTGRESQL_USER')}:{os.getenv('POSTGRESQL_PASSWORD')}@{os.getenv('POSTGRESQL_HOST')}:{os.getenv('POSTGRESQL_PORT')}/{os.getenv('POSTGRESQL_DB')}" -logger.info(f"DATABASE_URL : {DATABASE_URL}") +logger.info("DATABASE_URL : %s", _redact_url(DATABASE_URL)) # Create an async engine engine = create_async_engine(DATABASE_URL, echo=True) diff --git a/test/components/lif/mdr_utils/__init__.py b/test/components/lif/mdr_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/components/lif/mdr_utils/conftest.py b/test/components/lif/mdr_utils/conftest.py new file mode 100644 index 0000000..96e4a51 --- /dev/null +++ b/test/components/lif/mdr_utils/conftest.py @@ -0,0 +1,15 @@ +"""Seed dummy env vars so importing `lif.mdr_utils.database_setup` doesn't +fail at engine-construction time when running these unit tests. + +The tests in this directory exercise pure-Python helpers (e.g. +`_redact_url`); they never actually connect to a database. We just need +the module-level `DATABASE_URL` + `create_async_engine` calls to succeed. +""" + +import os + +os.environ.setdefault("POSTGRESQL_USER", "test") +os.environ.setdefault("POSTGRESQL_PASSWORD", "test") +os.environ.setdefault("POSTGRESQL_HOST", "localhost") +os.environ.setdefault("POSTGRESQL_PORT", "5432") +os.environ.setdefault("POSTGRESQL_DB", "test") diff --git a/test/components/lif/mdr_utils/test_database_setup.py b/test/components/lif/mdr_utils/test_database_setup.py new file mode 100644 index 0000000..3f76513 --- /dev/null +++ b/test/components/lif/mdr_utils/test_database_setup.py @@ -0,0 +1,47 @@ +"""Unit tests for `database_setup._redact_url` — issue #938. + +The full `database_setup` module imports SQLAlchemy/asyncpg/etc at import +time and tries to construct an engine from env vars, so we import the +helper directly rather than the whole module via the package surface.""" + +from lif.mdr_utils.database_setup import _redact_url + + +class TestRedactUrl: + def test_masks_password_in_typical_postgres_url(self): + url = "postgresql+asyncpg://postgres:s3cret!@dbhost.example:5432/mydb" + assert _redact_url(url) == "postgresql+asyncpg://postgres:***@dbhost.example:5432/mydb" + + def test_masks_password_with_special_characters(self): + # Real dev password observed in CloudWatch had `:`, `}`, `&`, `<`, `$` + # in it. urlparse handles percent-encoding; here we use a + # representative literal to confirm the redaction doesn't choke. + url = "postgresql+asyncpg://postgres:p:%24sw0rd@host:5432/db" + redacted = _redact_url(url) + assert "p:%24sw0rd" not in redacted + assert "postgres:***@host:5432/db" in redacted + + def test_url_without_port_still_redacts(self): + url = "postgresql+asyncpg://postgres:s3cret@host/db" + # No explicit port — netloc is just user:pass@host. Redacted form + # must drop the password but keep everything else. + redacted = _redact_url(url) + assert "s3cret" not in redacted + assert "postgres:***@host" in redacted + assert "/db" in redacted + + def test_url_without_password_is_unchanged(self): + # IAM-auth style (no password), or local trust auth — we return + # the original string rather than mangling it. + url = "postgresql+asyncpg://postgres@host:5432/db" + assert _redact_url(url) == url + + def test_unparseable_input_does_not_raise(self): + # The function is only ever called for logging; if it raises, + # MDR startup fails. Return a sentinel string instead so the + # log line still emits something operator-readable. + # urlparse is famously tolerant — `urlparse("")` returns an + # empty ParseResult rather than raising — so this test really + # documents the "no exception" guarantee. + assert _redact_url("") == "" + assert _redact_url("not-a-url") == "not-a-url"