WXYC · jakebromberg · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -17,7 +17,7 @@ ETL pipeline for building and maintaining a PostgreSQL cache of Discogs release
 4. **Create schema** (`schema/create_database.sql`) and **functions** (`schema/create_functions.sql`), then **SET UNLOGGED** on all tables to skip WAL writes during bulk import
 5. **Import** filtered CSVs into PostgreSQL (`scripts/import_csv.py`)
 6. **Create indexes** including accent-insensitive trigram GIN indexes (`schema/create_indexes.sql`)
-7. **Deduplicate** by master_id (`scripts/dedup_releases.py`) -- prefers label match (with sublabel resolution via `--label-hierarchy`), then US releases, then most tracks, then lowest ID
+7. **Deduplicate** by (master_id, format) (`scripts/dedup_releases.py`) -- partitions by master_id and normalized format so different formats (CD, Vinyl, etc.) of the same album survive dedup independently. Within each partition, prefers label match (with sublabel resolution via `--label-hierarchy`), then US releases, then most tracks, then lowest ID
 8. **Prune or Copy-to** -- one of:
     - `--prune`: delete non-matching releases in place (~89% data reduction, 3 GB -> 340 MB)
     - `--copy-to`/`--target-db-url`: copy matched releases to a separate database, preserving the full import
@@ -38,6 +38,10 @@ The `release` table includes a `master_id` column used during import and dedup.
 
 The `country` column, by contrast, is permanent -- it is included in the dedup copy-swap SELECT list and persists in the final schema for consumers.
 
+### format Column Lifecycle
+
+The `format` column stores the normalized format category (Vinyl, CD, Cassette, 7", Digital), or NULL when the raw format string is empty or unrecognized. Unlike `master_id`, `format` persists after dedup and is available to consumers. During import, raw Discogs format strings are normalized via `lib/format_normalization.py` (e.g., "2xLP" → "Vinyl", "CD-R" → "CD"). During dedup, releases are partitioned by `(master_id, format)`, so a CD and a Vinyl pressing of the same album both survive. During verify/prune, format-aware matching keeps only releases whose format matches one of the library's formats for that album (for exact artist+title matches). A NULL format on either side is treated as "match anything" for backward compatibility.
+
 ### Database Schema (Shared Contract)
 
 The SQL files in `schema/` define the contract between this ETL pipeline and all consumers:
@@ -69,6 +73,7 @@ docker compose up db -d     # just the database (for tests)
 - `scripts/verify_cache.py` -- Multi-index fuzzy matching for KEEP/PRUNE classification; `--copy-to` streams matches to a target DB. Fuzzy matching is parallelized via ThreadPoolExecutor (rapidfuzz releases the GIL). Large prune sets (>10K IDs) use copy-and-swap instead of CASCADE DELETE.
 - `scripts/csv_to_tsv.py` -- CSV to TSV conversion utility
 - `scripts/fix_csv_newlines.py` -- Fix multiline CSV fields
+- `lib/format_normalization.py` -- Normalize raw Discogs/library format strings to broad categories (Vinyl, CD, Cassette, 7", Digital)
 - `lib/artist_splitting.py` -- Split combined multi-artist library entries into individual components for matching
 - `lib/matching.py` -- Compilation detection utility
 - `lib/pipeline_state.py` -- Pipeline state tracking for resumable runs
@@ -185,4 +190,4 @@ When writing inline test data or new fixture rows, use these defaults matching t
 
 **`library_artists.txt`**: `Juana Molina`, `Stereolab`, `Cat Power`, `Jessica Pratt`, `Chuquimamani-Condori`, `Duke Ellington`
 
-**SQLite `library` rows** (artist, title): `("Juana Molina", "DOGA")`, `("Stereolab", "Aluminum Tunes")`, `("Cat Power", "Moon Pix")`, `("Jessica Pratt", "On Your Own Love Again")`, `("Chuquimamani-Condori", "Edits")`, `("Duke Ellington", "Duke Ellington & John Coltrane")`
+**SQLite `library` rows** (artist, title, format): `("Juana Molina", "DOGA", "LP")`, `("Stereolab", "Aluminum Tunes", "CD")`, `("Cat Power", "Moon Pix", "LP")`, `("Jessica Pratt", "On Your Own Love Again", "LP")`, `("Chuquimamani-Condori", "Edits", "CD")`, `("Duke Ellington", "Duke Ellington & John Coltrane", "LP")`
diff --git a/lib/format_normalization.py b/lib/format_normalization.py
@@ -0,0 +1,114 @@
+"""Format normalization for Discogs and WXYC library format strings.
+
+Maps raw format strings to broad categories where track listings are typically
+identical within the category. Used by dedup (partition by format) and
+verify_cache (format-aware KEEP/PRUNE decisions).
+
+Categories:
+    "Vinyl"    — LP, Vinyl, 2xLP, 3xLP, 12", 10"
+    "CD"       — CD, 2xCD, 3xCD, CD-R, CDr
+    "Cassette" — Cassette
+    "7\""      — 7" singles (distinct track listings from LPs)
+    "Digital"  — File, FLAC, MP3, WAV
+    None       — unknown, empty, unrecognized
+"""
+
+from __future__ import annotations
+
+import re
+
+# Quantity prefix pattern: "2x", "3x", etc.
+_QUANTITY_RE = re.compile(r"^\d+x", re.IGNORECASE)
+
+# Mapping from lowercase format string to category.
+_FORMAT_MAP: dict[str, str] = {
+    "vinyl": "Vinyl",
+    "lp": "Vinyl",
+    '12"': "Vinyl",
+    '10"': "Vinyl",
+    "cd": "CD",
+    "cd-r": "CD",
+    "cdr": "CD",
+    "cassette": "Cassette",
+    '7"': '7"',
+    "file": "Digital",
+    "flac": "Digital",
+    "mp3": "Digital",
+    "wav": "Digital",
+}
+
+
+def normalize_format(raw: str | None) -> str | None:
+    """Normalize a Discogs format string to a broad category.
+
+    Splits multi-format on comma (takes first), strips quantity prefix ("2x"),
+    and maps to a category. Returns None for unknown/empty/unrecognized formats.
+
+    Args:
+        raw: Raw Discogs format string, e.g. "2xLP", "CD, DVD", "Vinyl".
+
+    Returns:
+        Normalized category string or None.
+    """
+    if not raw:
+        return None
+
+    # Take the first format from multi-format strings
+    fmt = raw.split(",")[0].strip()
+    if not fmt:
+        return None
+
+    # Strip quantity prefix (e.g. "2x" from "2xLP")
+    fmt = _QUANTITY_RE.sub("", fmt)
+
+    return _FORMAT_MAP.get(fmt.lower())
+
+
+def normalize_library_format(raw: str | None) -> str | None:
+    """Normalize a WXYC library format string to the same category space.
+
+    WXYC library uses simpler format names (LP, CD, Cassette, 7", Vinyl).
+
+    Args:
+        raw: Raw library format string.
+
+    Returns:
+        Normalized category string or None.
+    """
+    if not raw:
+        return None
+
+    fmt = raw.strip()
+    if not fmt:
+        return None
+
+    return _FORMAT_MAP.get(fmt.lower())
+
+
+def format_matches(release_format: str | None, library_formats: set[str | None]) -> bool:
+    """Check if a release's format is compatible with the library's format set.
+
+    Returns True if the release format is in the library's format set, or if
+    either side has no format data (graceful degradation).
+
+    Args:
+        release_format: Normalized release format category (or None).
+        library_formats: Set of normalized library format categories for a
+            specific (artist, title) pair. May contain None.
+
+    Returns:
+        True if the formats are compatible.
+    """
+    # No library format data — match anything (backward-compatible)
+    if not library_formats:
+        return True
+
+    # NULL release format — match anything (graceful degradation for direct-PG mode)
+    if release_format is None:
+        return True
+
+    # NULL in library formats means "match anything"
+    if None in library_formats:
+        return True
+
+    return release_format in library_formats
diff --git a/schema/create_database.sql b/schema/create_database.sql
@@ -41,6 +41,7 @@ CREATE TABLE release (
     country         text,
     artwork_url     text,
     released        text,              -- full date string, e.g. "2024-03-15"
+    format          text,              -- normalized format category: 'Vinyl', 'CD', etc.
     master_id       integer          -- used by dedup, dropped after dedup copy-swap
 );
 

diff --git a/scripts/dedup_releases.py b/scripts/dedup_releases.py
@@ -285,7 +285,7 @@ def ensure_dedup_ids(conn) -> int:
             SELECT id AS release_id FROM (
                 SELECT r.id, r.master_id,
                        ROW_NUMBER() OVER (
-                           PARTITION BY r.master_id
+                           PARTITION BY r.master_id, r.format
                            ORDER BY {order_by}
                        ) as rn
                 FROM release r
@@ -576,7 +576,12 @@ def main():
     # Step 2: Copy each table (keeping only non-duplicate rows)
     # Only base tables + cache_metadata (tracks are imported after dedup)
     tables = [
-        ("release", "new_release", "id, title, release_year, country, artwork_url, released", "id"),
+        (
+            "release",
+            "new_release",
+            "id, title, release_year, country, artwork_url, released, format",
+            "id",
+        ),
         (
             "release_artist",
             "new_release_artist",

diff --git a/scripts/import_csv.py b/scripts/import_csv.py
@@ -18,6 +18,9 @@
 
 import psycopg
 
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from lib.format_normalization import normalize_format  # noqa: E402
+
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
@@ -78,10 +81,10 @@ class TableConfig(TypedDict, total=False):
     {
         "csv_file": "release.csv",
         "table": "release",
-        "csv_columns": ["id", "title", "country", "released", "master_id"],
-        "db_columns": ["id", "title", "country", "released", "master_id"],
+        "csv_columns": ["id", "title", "country", "released", "format", "master_id"],
+        "db_columns": ["id", "title", "country", "released", "format", "master_id"],
         "required": ["id", "title"],
-        "transforms": {},
+        "transforms": {"format": normalize_format},
         "unique_key": ["id"],
     },
     {