From a68a11cc2706a08bfa946406159a52faec4c4760 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Tue, 10 Mar 2026 13:21:32 -0700 Subject: [PATCH] fix: drop and recreate tables in schema to prevent stale data conflicts The schema used CREATE TABLE IF NOT EXISTS, which preserved data from previous pipeline runs. On re-runs, import_csv would hit UniqueViolation on release_pkey because old rows conflicted with new COPY data. The in-memory dedup only catches duplicates within the CSV, not against pre-existing table rows. Change to DROP TABLE IF EXISTS ... CASCADE followed by CREATE TABLE, ensuring a clean state for every fresh pipeline run. The --resume mode skips schema creation entirely, so in-progress work is unaffected. --- schema/create_database.sql | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/schema/create_database.sql b/schema/create_database.sql index e20c91d..07201f0 100644 --- a/schema/create_database.sql +++ b/schema/create_database.sql @@ -17,11 +17,19 @@ CREATE EXTENSION IF NOT EXISTS pg_trgm; CREATE EXTENSION IF NOT EXISTS unaccent; -- ============================================ --- Core tables +-- Core tables (drop + recreate for clean ETL) -- ============================================ +-- Drop in FK order: children first, then parent +DROP TABLE IF EXISTS cache_metadata CASCADE; +DROP TABLE IF EXISTS release_track_artist CASCADE; +DROP TABLE IF EXISTS release_track CASCADE; +DROP TABLE IF EXISTS release_label CASCADE; +DROP TABLE IF EXISTS release_artist CASCADE; +DROP TABLE IF EXISTS release CASCADE; + -- Releases -CREATE TABLE IF NOT EXISTS release ( +CREATE TABLE release ( id integer PRIMARY KEY, title text NOT NULL, release_year smallint, @@ -31,7 +39,7 @@ CREATE TABLE IF NOT EXISTS release ( ); -- Artists on releases -CREATE TABLE IF NOT EXISTS release_artist ( +CREATE TABLE release_artist ( release_id integer NOT NULL REFERENCES release(id) ON DELETE CASCADE, artist_id integer, -- Discogs artist ID (nullable for API-fetched releases) artist_name text NOT NULL, @@ -39,13 +47,13 @@ CREATE TABLE IF NOT EXISTS release_artist ( ); -- Labels on releases -CREATE TABLE IF NOT EXISTS release_label ( +CREATE TABLE release_label ( release_id integer NOT NULL REFERENCES release(id) ON DELETE CASCADE, label_name text NOT NULL ); -- Tracks on releases -CREATE TABLE IF NOT EXISTS release_track ( +CREATE TABLE release_track ( release_id integer NOT NULL REFERENCES release(id) ON DELETE CASCADE, sequence integer NOT NULL, position text, -- "A1", "B2", etc. @@ -54,7 +62,7 @@ CREATE TABLE IF NOT EXISTS release_track ( ); -- Artists on specific tracks (for compilations) -CREATE TABLE IF NOT EXISTS release_track_artist ( +CREATE TABLE release_track_artist ( release_id integer NOT NULL REFERENCES release(id) ON DELETE CASCADE, track_sequence integer NOT NULL, artist_name text NOT NULL @@ -64,7 +72,7 @@ CREATE TABLE IF NOT EXISTS release_track_artist ( -- Cache metadata (for tracking data freshness) -- ============================================ -CREATE TABLE IF NOT EXISTS cache_metadata ( +CREATE TABLE cache_metadata ( release_id integer PRIMARY KEY REFERENCES release(id) ON DELETE CASCADE, cached_at timestamptz NOT NULL DEFAULT now(), source text NOT NULL, -- 'bulk_import' or 'api_fetch'