From 527e07d707c038468f68e2945302eb89c6154a10 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Wed, 11 Mar 2026 10:54:39 -0700 Subject: [PATCH 1/2] feat: enrich schema with release dates, labels, artist details and update ETL pipeline --- migrations/02_enrich_data.sql | 72 ++++++++++++++++++++++ schema/create_database.sql | 54 ++++++++++++++++- scripts/dedup_releases.py | 6 +- scripts/import_csv.py | 86 +++++++++++++++++++++++++-- tests/fixtures/csv/release_artist.csv | 34 +++++------ tests/unit/test_import_csv.py | 13 ++-- 6 files changed, 233 insertions(+), 32 deletions(-) create mode 100644 migrations/02_enrich_data.sql diff --git a/migrations/02_enrich_data.sql b/migrations/02_enrich_data.sql new file mode 100644 index 0000000..fce1558 --- /dev/null +++ b/migrations/02_enrich_data.sql @@ -0,0 +1,72 @@ +-- Enrich Discogs cache: restore dropped columns, add artist detail tables +-- Run once against the existing database (post-migration 01). +-- Idempotent: safe to re-run. +-- +-- Usage: +-- psql -U postgres -d discogs -f 02_enrich_data.sql + +BEGIN; + +-- ============================================ +-- 1. Restore columns dropped by migration 01 +-- ============================================ + +-- Full release date string (e.g. "2024-03-15") +ALTER TABLE release ADD COLUMN IF NOT EXISTS released text; + +-- Discogs artist ID on release_artist (nullable for API-fetched releases) +ALTER TABLE release_artist ADD COLUMN IF NOT EXISTS artist_id integer; + +-- Role for extra artists (e.g. "Producer", "Mixed By") +ALTER TABLE release_artist ADD COLUMN IF NOT EXISTS role text; + +-- Restore country column on release (used by dedup ranking) +ALTER TABLE release ADD COLUMN IF NOT EXISTS country text; + +-- ============================================ +-- 2. Enrich release_label table +-- ============================================ + +ALTER TABLE release_label ADD COLUMN IF NOT EXISTS label_id integer; +ALTER TABLE release_label ADD COLUMN IF NOT EXISTS catno text; + +-- ============================================ +-- 3. Artist detail tables (new) +-- ============================================ + +CREATE TABLE IF NOT EXISTS artist ( + id integer PRIMARY KEY, + name text NOT NULL, + profile text, + image_url text, + fetched_at timestamptz NOT NULL DEFAULT now() +); + +CREATE TABLE IF NOT EXISTS artist_alias ( + artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE, + alias_id integer, + alias_name text NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_artist_alias_artist_id ON artist_alias(artist_id); + +CREATE TABLE IF NOT EXISTS artist_name_variation ( + artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE, + name text NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_artist_name_variation_artist_id ON artist_name_variation(artist_id); + +CREATE TABLE IF NOT EXISTS artist_member ( + artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE, + member_id integer NOT NULL, + member_name text NOT NULL, + active boolean DEFAULT true +); +CREATE INDEX IF NOT EXISTS idx_artist_member_artist_id ON artist_member(artist_id); + +CREATE TABLE IF NOT EXISTS artist_url ( + artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE, + url text NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_artist_url_artist_id ON artist_url(artist_id); + +COMMIT; diff --git a/schema/create_database.sql b/schema/create_database.sql index 07201f0..c31b2e4 100644 --- a/schema/create_database.sql +++ b/schema/create_database.sql @@ -22,6 +22,11 @@ CREATE EXTENSION IF NOT EXISTS unaccent; -- Drop in FK order: children first, then parent DROP TABLE IF EXISTS cache_metadata CASCADE; +DROP TABLE IF EXISTS artist_url CASCADE; +DROP TABLE IF EXISTS artist_member CASCADE; +DROP TABLE IF EXISTS artist_name_variation CASCADE; +DROP TABLE IF EXISTS artist_alias CASCADE; +DROP TABLE IF EXISTS artist CASCADE; DROP TABLE IF EXISTS release_track_artist CASCADE; DROP TABLE IF EXISTS release_track CASCADE; DROP TABLE IF EXISTS release_label CASCADE; @@ -35,6 +40,7 @@ CREATE TABLE release ( release_year smallint, country text, artwork_url text, + released text, -- full date string, e.g. "2024-03-15" master_id integer -- used by dedup, dropped after dedup copy-swap ); @@ -43,13 +49,16 @@ CREATE TABLE release_artist ( release_id integer NOT NULL REFERENCES release(id) ON DELETE CASCADE, artist_id integer, -- Discogs artist ID (nullable for API-fetched releases) artist_name text NOT NULL, - extra integer DEFAULT 0 -- 0 = main artist, 1 = extra credit + extra integer DEFAULT 0, -- 0 = main artist, 1 = extra credit + role text -- role for extra artists: "Producer", "Mixed By", etc. ); -- Labels on releases CREATE TABLE release_label ( release_id integer NOT NULL REFERENCES release(id) ON DELETE CASCADE, - label_name text NOT NULL + label_id integer, + label_name text NOT NULL, + catno text ); -- Tracks on releases @@ -68,6 +77,41 @@ CREATE TABLE release_track_artist ( artist_name text NOT NULL ); +-- ============================================ +-- Artist detail tables +-- ============================================ + +CREATE TABLE artist ( + id integer PRIMARY KEY, + name text NOT NULL, + profile text, + image_url text, + fetched_at timestamptz NOT NULL DEFAULT now() +); + +CREATE TABLE artist_alias ( + artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE, + alias_id integer, + alias_name text NOT NULL +); + +CREATE TABLE artist_name_variation ( + artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE, + name text NOT NULL +); + +CREATE TABLE artist_member ( + artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE, + member_id integer NOT NULL, + member_name text NOT NULL, + active boolean DEFAULT true +); + +CREATE TABLE artist_url ( + artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE, + url text NOT NULL +); + -- ============================================ -- Cache metadata (for tracking data freshness) -- ============================================ @@ -89,6 +133,12 @@ CREATE INDEX IF NOT EXISTS idx_release_label_release_id ON release_label(release CREATE INDEX IF NOT EXISTS idx_release_track_release_id ON release_track(release_id); CREATE INDEX IF NOT EXISTS idx_release_track_artist_release_id ON release_track_artist(release_id); +-- Artist detail indexes +CREATE INDEX IF NOT EXISTS idx_artist_alias_artist_id ON artist_alias(artist_id); +CREATE INDEX IF NOT EXISTS idx_artist_name_variation_artist_id ON artist_name_variation(artist_id); +CREATE INDEX IF NOT EXISTS idx_artist_member_artist_id ON artist_member(artist_id); +CREATE INDEX IF NOT EXISTS idx_artist_url_artist_id ON artist_url(artist_id); + -- Partial index on master_id for dedup performance. -- Transient: dropped automatically by dedup copy-swap (which excludes master_id). CREATE INDEX IF NOT EXISTS idx_release_master_id ON release(master_id) WHERE master_id IS NOT NULL; diff --git a/scripts/dedup_releases.py b/scripts/dedup_releases.py index 189a0dc..2c862b2 100644 --- a/scripts/dedup_releases.py +++ b/scripts/dedup_releases.py @@ -576,14 +576,14 @@ def main(): # Step 2: Copy each table (keeping only non-duplicate rows) # Only base tables + cache_metadata (tracks are imported after dedup) tables = [ - ("release", "new_release", "id, title, release_year, country, artwork_url", "id"), + ("release", "new_release", "id, title, release_year, country, artwork_url, released", "id"), ( "release_artist", "new_release_artist", - "release_id, artist_id, artist_name, extra", + "release_id, artist_id, artist_name, extra, role", "release_id", ), - ("release_label", "new_release_label", "release_id, label_name", "release_id"), + ("release_label", "new_release_label", "release_id, label_id, label_name, catno", "release_id"), ( "cache_metadata", "new_cache_metadata", diff --git a/scripts/import_csv.py b/scripts/import_csv.py index 738decc..2e2d725 100644 --- a/scripts/import_csv.py +++ b/scripts/import_csv.py @@ -79,16 +79,16 @@ class TableConfig(TypedDict, total=False): "csv_file": "release.csv", "table": "release", "csv_columns": ["id", "title", "country", "released", "master_id"], - "db_columns": ["id", "title", "country", "release_year", "master_id"], + "db_columns": ["id", "title", "country", "released", "master_id"], "required": ["id", "title"], - "transforms": {"released": extract_year}, + "transforms": {}, "unique_key": ["id"], }, { "csv_file": "release_artist.csv", "table": "release_artist", - "csv_columns": ["release_id", "artist_id", "artist_name", "extra"], - "db_columns": ["release_id", "artist_id", "artist_name", "extra"], + "csv_columns": ["release_id", "artist_id", "artist_name", "extra", "role"], + "db_columns": ["release_id", "artist_id", "artist_name", "extra", "role"], "required": ["release_id", "artist_name"], "transforms": {}, "unique_key": ["release_id", "artist_name"], @@ -96,8 +96,8 @@ class TableConfig(TypedDict, total=False): { "csv_file": "release_label.csv", "table": "release_label", - "csv_columns": ["release_id", "label"], - "db_columns": ["release_id", "label_name"], + "csv_columns": ["release_id", "label", "catno"], + "db_columns": ["release_id", "label_name", "catno"], "required": ["release_id", "label"], "transforms": {}, "unique_key": ["release_id", "label"], @@ -124,6 +124,25 @@ class TableConfig(TypedDict, total=False): }, ] +ARTIST_TABLES: list[TableConfig] = [ + { + "csv_file": "artist_alias.csv", + "table": "artist_alias", + "csv_columns": ["artist_id", "alias_name"], + "db_columns": ["artist_id", "alias_name"], + "required": ["artist_id", "alias_name"], + "transforms": {}, + }, + { + "csv_file": "artist_member.csv", + "table": "artist_member", + "csv_columns": ["group_artist_id", "member_artist_id", "member_name"], + "db_columns": ["artist_id", "member_id", "member_name"], + "required": ["group_artist_id", "member_artist_id", "member_name"], + "transforms": {}, + }, +] + TABLES: list[TableConfig] = BASE_TABLES + TRACK_TABLES @@ -278,6 +297,26 @@ def populate_cache_metadata(conn) -> int: return count +def populate_release_year(conn) -> int: + """Populate release_year from the released text field. + + Extracts the 4-digit year prefix from the 'released' column and stores it + in the 'release_year' smallint column for efficient filtering. + """ + logger.info("Populating release_year from released text...") + with conn.cursor() as cur: + cur.execute(""" + UPDATE release SET release_year = CAST(LEFT(released, 4) AS smallint) + WHERE released IS NOT NULL + AND released ~ '^[0-9]{4}' + AND release_year IS NULL + """) + count = cur.rowcount + conn.commit() + logger.info(f" Populated release_year for {count:,} releases") + return count + + def create_track_count_table(conn, csv_dir: Path) -> int: """Pre-compute track counts from CSV and store in release_track_count table. @@ -481,6 +520,33 @@ def _import_child(table_config: TableConfig) -> int: return total +def import_artist_details(conn, csv_dir: Path) -> int: + """Import artist detail tables from CSV. + + Creates stub artist rows from release_artist data, then imports + artist_alias and artist_member CSVs. + + Returns total rows imported. + """ + # Create stub artist rows from release_artist (id + name only) + logger.info("Creating stub artist rows from release_artist...") + with conn.cursor() as cur: + cur.execute(""" + INSERT INTO artist (id, name) + SELECT DISTINCT artist_id, artist_name + FROM release_artist + WHERE artist_id IS NOT NULL + ON CONFLICT (id) DO NOTHING + """) + count = cur.rowcount + conn.commit() + logger.info(f" Created {count:,} stub artist rows") + + total = count + total += _import_tables(conn, csv_dir, ARTIST_TABLES) + return total + + def main(): import argparse @@ -542,21 +608,29 @@ def main(): logger.info("Populating artwork URLs...") import_artwork(conn, csv_dir) logger.info("Artwork URLs complete") + populate_release_year(conn) logger.info("Populating cache_metadata via COPY...") populate_cache_metadata(conn) logger.info("cache_metadata complete") logger.info("Creating track count table...") create_track_count_table(conn, csv_dir) logger.info("Track count table complete") + logger.info("Importing artist details...") + import_artist_details(conn, csv_dir) + logger.info("Artist details complete") conn.close() else: total = _import_tables(conn, csv_dir, TABLES) logger.info("Populating artwork URLs...") import_artwork(conn, csv_dir) logger.info("Artwork URLs complete") + populate_release_year(conn) logger.info("Populating cache_metadata via COPY...") populate_cache_metadata(conn) logger.info("cache_metadata complete") + logger.info("Importing artist details...") + import_artist_details(conn, csv_dir) + logger.info("Artist details complete") conn.close() logger.info(f"Total: {total:,} rows imported") diff --git a/tests/fixtures/csv/release_artist.csv b/tests/fixtures/csv/release_artist.csv index 240d425..12868d4 100644 --- a/tests/fixtures/csv/release_artist.csv +++ b/tests/fixtures/csv/release_artist.csv @@ -1,17 +1,17 @@ -release_id,artist_id,artist_name,extra,anv,position,join_field -1001,1,Radiohead,0,,1, -1002,1,Radiohead,0,,1, -1003,1,Radiohead,0,,1, -3001,1,Radiohead,0,,1, -4001,1,Radiohead,0,,1, -2001,2,Joy Division,0,,1, -2002,2,Joy Division,0,,1, -5001,3,DJ Unknown,0,,1, -5002,4,Mystery Band,0,,1, -6001,5,Björk,0,,1, -8001,7,Various,0,,1, -9001,8,"Beatles, The",0,,1, -9002,9,Simon & Garfunkel,0,,1, -10001,10,Random Artist X,0,,1, -10002,11,Obscure Band Y,0,,1, -1001,12,Some Producer,1,,2, +release_id,artist_id,artist_name,extra,anv,position,join_field,role +1001,1,Radiohead,0,,1,, +1002,1,Radiohead,0,,1,, +1003,1,Radiohead,0,,1,, +3001,1,Radiohead,0,,1,, +4001,1,Radiohead,0,,1,, +2001,2,Joy Division,0,,1,, +2002,2,Joy Division,0,,1,, +5001,3,DJ Unknown,0,,1,, +5002,4,Mystery Band,0,,1,, +6001,5,Björk,0,,1,, +8001,7,Various,0,,1,, +9001,8,"Beatles, The",0,,1,, +9002,9,Simon & Garfunkel,0,,1,, +10001,10,Random Artist X,0,,1,, +10002,11,Obscure Band Y,0,,1,, +1001,12,Some Producer,1,,2,,Producer diff --git a/tests/unit/test_import_csv.py b/tests/unit/test_import_csv.py index 635613e..f081a96 100644 --- a/tests/unit/test_import_csv.py +++ b/tests/unit/test_import_csv.py @@ -117,11 +117,12 @@ def test_release_artist_table_includes_artist_id(self) -> None: assert "artist_id" in ra_config["csv_columns"] assert "artist_id" in ra_config["db_columns"] - def test_release_table_transforms_released_to_year(self) -> None: - """The released field should be transformed via extract_year.""" + def test_release_table_imports_released_as_raw_text(self) -> None: + """The released field should be imported as raw text (no transform).""" release_config = next(t for t in TABLES if t["table"] == "release") - assert "released" in release_config["transforms"] - assert release_config["transforms"]["released"] is extract_year + assert "released" in release_config["csv_columns"] + assert "released" in release_config["db_columns"] + assert "released" not in release_config["transforms"] @pytest.mark.parametrize( "table_name", @@ -426,7 +427,9 @@ def test_default_mode_calls_import_tables(self, tmp_path) -> None: patch.object(_ic.psycopg, "connect", return_value=mock_conn), patch.object(_ic, "_import_tables", return_value=100) as mock_import, patch.object(_ic, "import_artwork", return_value=10), + patch.object(_ic, "populate_release_year", return_value=50), patch.object(_ic, "populate_cache_metadata", return_value=50), + patch.object(_ic, "import_artist_details", return_value=20), ): _ic.main() @@ -453,8 +456,10 @@ def test_base_only_mode_calls_parallel(self, tmp_path) -> None: patch.object(_ic.psycopg, "connect", return_value=mock_conn), patch.object(_ic, "_import_tables_parallel", return_value=100) as mock_parallel, patch.object(_ic, "import_artwork", return_value=10), + patch.object(_ic, "populate_release_year", return_value=50), patch.object(_ic, "populate_cache_metadata", return_value=50), patch.object(_ic, "create_track_count_table", return_value=20), + patch.object(_ic, "import_artist_details", return_value=20), ): _ic.main() From 4c83aa8321cab3137c2f4311a27d9615923c6bf2 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Wed, 11 Mar 2026 11:35:30 -0700 Subject: [PATCH 2/2] fix: format dedup_releases.py and call populate_release_year in integration test --- scripts/dedup_releases.py | 7 ++++++- tests/integration/test_import.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/dedup_releases.py b/scripts/dedup_releases.py index 2c862b2..67d82af 100644 --- a/scripts/dedup_releases.py +++ b/scripts/dedup_releases.py @@ -583,7 +583,12 @@ def main(): "release_id, artist_id, artist_name, extra, role", "release_id", ), - ("release_label", "new_release_label", "release_id, label_id, label_name, catno", "release_id"), + ( + "release_label", + "new_release_label", + "release_id, label_id, label_name, catno", + "release_id", + ), ( "cache_metadata", "new_cache_metadata", diff --git a/tests/integration/test_import.py b/tests/integration/test_import.py index e784029..4dea514 100644 --- a/tests/integration/test_import.py +++ b/tests/integration/test_import.py @@ -23,6 +23,7 @@ import_artwork = _ic.import_artwork create_track_count_table = _ic.create_track_count_table populate_cache_metadata = _ic.populate_cache_metadata +populate_release_year = _ic.populate_release_year _import_tables = _ic._import_tables TABLES = _ic.TABLES BASE_TABLES = _ic.BASE_TABLES @@ -56,6 +57,7 @@ def _set_up_database(self, db_url): table_config["transforms"], ) import_artwork(conn, CSV_DIR) + populate_release_year(conn) with conn.cursor() as cur: cur.execute("""