Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions migrations/02_enrich_data.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
-- Enrich Discogs cache: restore dropped columns, add artist detail tables
-- Run once against the existing database (post-migration 01).
-- Idempotent: safe to re-run.
--
-- Usage:
-- psql -U postgres -d discogs -f 02_enrich_data.sql

BEGIN;

-- ============================================
-- 1. Restore columns dropped by migration 01
-- ============================================

-- Full release date string (e.g. "2024-03-15")
ALTER TABLE release ADD COLUMN IF NOT EXISTS released text;

-- Discogs artist ID on release_artist (nullable for API-fetched releases)
ALTER TABLE release_artist ADD COLUMN IF NOT EXISTS artist_id integer;

-- Role for extra artists (e.g. "Producer", "Mixed By")
ALTER TABLE release_artist ADD COLUMN IF NOT EXISTS role text;

-- Restore country column on release (used by dedup ranking)
ALTER TABLE release ADD COLUMN IF NOT EXISTS country text;

-- ============================================
-- 2. Enrich release_label table
-- ============================================

ALTER TABLE release_label ADD COLUMN IF NOT EXISTS label_id integer;
ALTER TABLE release_label ADD COLUMN IF NOT EXISTS catno text;

-- ============================================
-- 3. Artist detail tables (new)
-- ============================================

CREATE TABLE IF NOT EXISTS artist (
id integer PRIMARY KEY,
name text NOT NULL,
profile text,
image_url text,
fetched_at timestamptz NOT NULL DEFAULT now()
);

CREATE TABLE IF NOT EXISTS artist_alias (
artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE,
alias_id integer,
alias_name text NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_artist_alias_artist_id ON artist_alias(artist_id);

CREATE TABLE IF NOT EXISTS artist_name_variation (
artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE,
name text NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_artist_name_variation_artist_id ON artist_name_variation(artist_id);

CREATE TABLE IF NOT EXISTS artist_member (
artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE,
member_id integer NOT NULL,
member_name text NOT NULL,
active boolean DEFAULT true
);
CREATE INDEX IF NOT EXISTS idx_artist_member_artist_id ON artist_member(artist_id);

CREATE TABLE IF NOT EXISTS artist_url (
artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE,
url text NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_artist_url_artist_id ON artist_url(artist_id);

COMMIT;
54 changes: 52 additions & 2 deletions schema/create_database.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ CREATE EXTENSION IF NOT EXISTS unaccent;

-- Drop in FK order: children first, then parent
DROP TABLE IF EXISTS cache_metadata CASCADE;
DROP TABLE IF EXISTS artist_url CASCADE;
DROP TABLE IF EXISTS artist_member CASCADE;
DROP TABLE IF EXISTS artist_name_variation CASCADE;
DROP TABLE IF EXISTS artist_alias CASCADE;
DROP TABLE IF EXISTS artist CASCADE;
DROP TABLE IF EXISTS release_track_artist CASCADE;
DROP TABLE IF EXISTS release_track CASCADE;
DROP TABLE IF EXISTS release_label CASCADE;
Expand All @@ -35,6 +40,7 @@ CREATE TABLE release (
release_year smallint,
country text,
artwork_url text,
released text, -- full date string, e.g. "2024-03-15"
master_id integer -- used by dedup, dropped after dedup copy-swap
);

Expand All @@ -43,13 +49,16 @@ CREATE TABLE release_artist (
release_id integer NOT NULL REFERENCES release(id) ON DELETE CASCADE,
artist_id integer, -- Discogs artist ID (nullable for API-fetched releases)
artist_name text NOT NULL,
extra integer DEFAULT 0 -- 0 = main artist, 1 = extra credit
extra integer DEFAULT 0, -- 0 = main artist, 1 = extra credit
role text -- role for extra artists: "Producer", "Mixed By", etc.
);

-- Labels on releases
CREATE TABLE release_label (
release_id integer NOT NULL REFERENCES release(id) ON DELETE CASCADE,
label_name text NOT NULL
label_id integer,
label_name text NOT NULL,
catno text
);

-- Tracks on releases
Expand All @@ -68,6 +77,41 @@ CREATE TABLE release_track_artist (
artist_name text NOT NULL
);

-- ============================================
-- Artist detail tables
-- ============================================

CREATE TABLE artist (
id integer PRIMARY KEY,
name text NOT NULL,
profile text,
image_url text,
fetched_at timestamptz NOT NULL DEFAULT now()
);

CREATE TABLE artist_alias (
artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE,
alias_id integer,
alias_name text NOT NULL
);

CREATE TABLE artist_name_variation (
artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE,
name text NOT NULL
);

CREATE TABLE artist_member (
artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE,
member_id integer NOT NULL,
member_name text NOT NULL,
active boolean DEFAULT true
);

CREATE TABLE artist_url (
artist_id integer NOT NULL REFERENCES artist(id) ON DELETE CASCADE,
url text NOT NULL
);

-- ============================================
-- Cache metadata (for tracking data freshness)
-- ============================================
Expand All @@ -89,6 +133,12 @@ CREATE INDEX IF NOT EXISTS idx_release_label_release_id ON release_label(release
CREATE INDEX IF NOT EXISTS idx_release_track_release_id ON release_track(release_id);
CREATE INDEX IF NOT EXISTS idx_release_track_artist_release_id ON release_track_artist(release_id);

-- Artist detail indexes
CREATE INDEX IF NOT EXISTS idx_artist_alias_artist_id ON artist_alias(artist_id);
CREATE INDEX IF NOT EXISTS idx_artist_name_variation_artist_id ON artist_name_variation(artist_id);
CREATE INDEX IF NOT EXISTS idx_artist_member_artist_id ON artist_member(artist_id);
CREATE INDEX IF NOT EXISTS idx_artist_url_artist_id ON artist_url(artist_id);

-- Partial index on master_id for dedup performance.
-- Transient: dropped automatically by dedup copy-swap (which excludes master_id).
CREATE INDEX IF NOT EXISTS idx_release_master_id ON release(master_id) WHERE master_id IS NOT NULL;
Expand Down
11 changes: 8 additions & 3 deletions scripts/dedup_releases.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,14 +576,19 @@ def main():
# Step 2: Copy each table (keeping only non-duplicate rows)
# Only base tables + cache_metadata (tracks are imported after dedup)
tables = [
("release", "new_release", "id, title, release_year, country, artwork_url", "id"),
("release", "new_release", "id, title, release_year, country, artwork_url, released", "id"),
(
"release_artist",
"new_release_artist",
"release_id, artist_id, artist_name, extra",
"release_id, artist_id, artist_name, extra, role",
"release_id",
),
(
"release_label",
"new_release_label",
"release_id, label_id, label_name, catno",
"release_id",
),
("release_label", "new_release_label", "release_id, label_name", "release_id"),
(
"cache_metadata",
"new_cache_metadata",
Expand Down
86 changes: 80 additions & 6 deletions scripts/import_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,25 @@ class TableConfig(TypedDict, total=False):
"csv_file": "release.csv",
"table": "release",
"csv_columns": ["id", "title", "country", "released", "master_id"],
"db_columns": ["id", "title", "country", "release_year", "master_id"],
"db_columns": ["id", "title", "country", "released", "master_id"],
"required": ["id", "title"],
"transforms": {"released": extract_year},
"transforms": {},
"unique_key": ["id"],
},
{
"csv_file": "release_artist.csv",
"table": "release_artist",
"csv_columns": ["release_id", "artist_id", "artist_name", "extra"],
"db_columns": ["release_id", "artist_id", "artist_name", "extra"],
"csv_columns": ["release_id", "artist_id", "artist_name", "extra", "role"],
"db_columns": ["release_id", "artist_id", "artist_name", "extra", "role"],
"required": ["release_id", "artist_name"],
"transforms": {},
"unique_key": ["release_id", "artist_name"],
},
{
"csv_file": "release_label.csv",
"table": "release_label",
"csv_columns": ["release_id", "label"],
"db_columns": ["release_id", "label_name"],
"csv_columns": ["release_id", "label", "catno"],
"db_columns": ["release_id", "label_name", "catno"],
"required": ["release_id", "label"],
"transforms": {},
"unique_key": ["release_id", "label"],
Expand All @@ -124,6 +124,25 @@ class TableConfig(TypedDict, total=False):
},
]

ARTIST_TABLES: list[TableConfig] = [
{
"csv_file": "artist_alias.csv",
"table": "artist_alias",
"csv_columns": ["artist_id", "alias_name"],
"db_columns": ["artist_id", "alias_name"],
"required": ["artist_id", "alias_name"],
"transforms": {},
},
{
"csv_file": "artist_member.csv",
"table": "artist_member",
"csv_columns": ["group_artist_id", "member_artist_id", "member_name"],
"db_columns": ["artist_id", "member_id", "member_name"],
"required": ["group_artist_id", "member_artist_id", "member_name"],
"transforms": {},
},
]

TABLES: list[TableConfig] = BASE_TABLES + TRACK_TABLES


Expand Down Expand Up @@ -278,6 +297,26 @@ def populate_cache_metadata(conn) -> int:
return count


def populate_release_year(conn) -> int:
"""Populate release_year from the released text field.

Extracts the 4-digit year prefix from the 'released' column and stores it
in the 'release_year' smallint column for efficient filtering.
"""
logger.info("Populating release_year from released text...")
with conn.cursor() as cur:
cur.execute("""
UPDATE release SET release_year = CAST(LEFT(released, 4) AS smallint)
WHERE released IS NOT NULL
AND released ~ '^[0-9]{4}'
AND release_year IS NULL
""")
count = cur.rowcount
conn.commit()
logger.info(f" Populated release_year for {count:,} releases")
return count


def create_track_count_table(conn, csv_dir: Path) -> int:
"""Pre-compute track counts from CSV and store in release_track_count table.

Expand Down Expand Up @@ -481,6 +520,33 @@ def _import_child(table_config: TableConfig) -> int:
return total


def import_artist_details(conn, csv_dir: Path) -> int:
"""Import artist detail tables from CSV.

Creates stub artist rows from release_artist data, then imports
artist_alias and artist_member CSVs.

Returns total rows imported.
"""
# Create stub artist rows from release_artist (id + name only)
logger.info("Creating stub artist rows from release_artist...")
with conn.cursor() as cur:
cur.execute("""
INSERT INTO artist (id, name)
SELECT DISTINCT artist_id, artist_name
FROM release_artist
WHERE artist_id IS NOT NULL
ON CONFLICT (id) DO NOTHING
""")
count = cur.rowcount
conn.commit()
logger.info(f" Created {count:,} stub artist rows")

total = count
total += _import_tables(conn, csv_dir, ARTIST_TABLES)
return total


def main():
import argparse

Expand Down Expand Up @@ -542,21 +608,29 @@ def main():
logger.info("Populating artwork URLs...")
import_artwork(conn, csv_dir)
logger.info("Artwork URLs complete")
populate_release_year(conn)
logger.info("Populating cache_metadata via COPY...")
populate_cache_metadata(conn)
logger.info("cache_metadata complete")
logger.info("Creating track count table...")
create_track_count_table(conn, csv_dir)
logger.info("Track count table complete")
logger.info("Importing artist details...")
import_artist_details(conn, csv_dir)
logger.info("Artist details complete")
conn.close()
else:
total = _import_tables(conn, csv_dir, TABLES)
logger.info("Populating artwork URLs...")
import_artwork(conn, csv_dir)
logger.info("Artwork URLs complete")
populate_release_year(conn)
logger.info("Populating cache_metadata via COPY...")
populate_cache_metadata(conn)
logger.info("cache_metadata complete")
logger.info("Importing artist details...")
import_artist_details(conn, csv_dir)
logger.info("Artist details complete")
conn.close()

logger.info(f"Total: {total:,} rows imported")
Expand Down
34 changes: 17 additions & 17 deletions tests/fixtures/csv/release_artist.csv
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
release_id,artist_id,artist_name,extra,anv,position,join_field
1001,1,Radiohead,0,,1,
1002,1,Radiohead,0,,1,
1003,1,Radiohead,0,,1,
3001,1,Radiohead,0,,1,
4001,1,Radiohead,0,,1,
2001,2,Joy Division,0,,1,
2002,2,Joy Division,0,,1,
5001,3,DJ Unknown,0,,1,
5002,4,Mystery Band,0,,1,
6001,5,Björk,0,,1,
8001,7,Various,0,,1,
9001,8,"Beatles, The",0,,1,
9002,9,Simon & Garfunkel,0,,1,
10001,10,Random Artist X,0,,1,
10002,11,Obscure Band Y,0,,1,
1001,12,Some Producer,1,,2,
release_id,artist_id,artist_name,extra,anv,position,join_field,role
1001,1,Radiohead,0,,1,,
1002,1,Radiohead,0,,1,,
1003,1,Radiohead,0,,1,,
3001,1,Radiohead,0,,1,,
4001,1,Radiohead,0,,1,,
2001,2,Joy Division,0,,1,,
2002,2,Joy Division,0,,1,,
5001,3,DJ Unknown,0,,1,,
5002,4,Mystery Band,0,,1,,
6001,5,Björk,0,,1,,
8001,7,Various,0,,1,,
9001,8,"Beatles, The",0,,1,,
9002,9,Simon & Garfunkel,0,,1,,
10001,10,Random Artist X,0,,1,,
10002,11,Obscure Band Y,0,,1,,
1001,12,Some Producer,1,,2,,Producer
2 changes: 2 additions & 0 deletions tests/integration/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import_artwork = _ic.import_artwork
create_track_count_table = _ic.create_track_count_table
populate_cache_metadata = _ic.populate_cache_metadata
populate_release_year = _ic.populate_release_year
_import_tables = _ic._import_tables
TABLES = _ic.TABLES
BASE_TABLES = _ic.BASE_TABLES
Expand Down Expand Up @@ -56,6 +57,7 @@ def _set_up_database(self, db_url):
table_config["transforms"],
)
import_artwork(conn, CSV_DIR)
populate_release_year(conn)

with conn.cursor() as cur:
cur.execute("""
Expand Down
Loading
Loading