From 74ca57c7cd52484f82320f98c1db594470775f6d Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Tue, 10 Mar 2026 22:30:28 -0700 Subject: [PATCH] test: add regression test for schema stale data + CI coverage enforcement Add test_schema_clears_stale_data_on_rerun: inserts data, re-runs schema, verifies tables are empty. This would have caught the UniqueViolation bug from #36 where CREATE TABLE IF NOT EXISTS preserved stale data across pipeline runs. Update CI to run all tests (unit + integration) with coverage reporting in the postgres job, enforcing a minimum threshold of 80%. --- .github/workflows/ci.yml | 10 +++++++--- tests/integration/test_schema.py | 29 ++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4dc01e4..3a0c43e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,12 +49,16 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" - - run: pip install -e ".[dev]" + - run: pip install -e ".[dev]" pytest-cov - name: Enable pg_trgm extension run: | PGPASSWORD=discogs psql -h localhost -p 5433 -U discogs -d postgres \ -c "CREATE EXTENSION IF NOT EXISTS pg_trgm;" - - name: Run integration and E2E tests + - name: Run all tests with coverage env: DATABASE_URL_TEST: postgresql://discogs:discogs@localhost:5433/postgres - run: pytest -m 'postgres or e2e' -v + run: >- + pytest -m 'not mysql' -v + --cov=scripts --cov=lib + --cov-report=term-missing + --cov-fail-under=80 diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py index 5c19078..b9005a4 100644 --- a/tests/integration/test_schema.py +++ b/tests/integration/test_schema.py @@ -156,12 +156,39 @@ def test_no_unique_constraints_on_child_tables(self) -> None: assert unique_constraints == [], f"Unexpected UNIQUE constraints: {unique_constraints}" def test_schema_is_idempotent(self) -> None: - """Running the schema twice doesn't error (IF NOT EXISTS).""" + """Running the schema twice doesn't error.""" conn = psycopg.connect(self.db_url, autocommit=True) with conn.cursor() as cur: cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) conn.close() + def test_schema_clears_stale_data_on_rerun(self) -> None: + """Re-running the schema drops old data so import doesn't hit UniqueViolation.""" + conn = psycopg.connect(self.db_url, autocommit=True) + + # Insert data as if a previous pipeline run completed + with conn.cursor() as cur: + cur.execute("INSERT INTO release (id, title) VALUES (1, 'DOGA')") + cur.execute( + "INSERT INTO release_artist (release_id, artist_name, extra) " + "VALUES (1, 'Juana Molina', 0)" + ) + cur.execute("SELECT count(*) FROM release") + assert cur.fetchone()[0] == 1 + + # Re-run schema (simulates a fresh pipeline run) + with conn.cursor() as cur: + cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) + + # Tables should be empty — no stale data to conflict with new imports + with conn.cursor() as cur: + cur.execute("SELECT count(*) FROM release") + assert cur.fetchone()[0] == 0 + cur.execute("SELECT count(*) FROM release_artist") + assert cur.fetchone()[0] == 0 + + conn.close() + class TestCreateBaseIndexes: """Verify create_indexes.sql creates base trigram indexes."""