From 56d0717b978e9165fab7933ede05ec2f945b6225 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Mon, 9 Mar 2026 21:26:10 -0700 Subject: [PATCH] perf: set tables UNLOGGED before converter COPY in direct-PG mode The UNLOGGED optimization was only applied in _run_database_build_post_import, which runs after the converter has already finished writing millions of rows via COPY to LOGGED tables. Move set_tables_unlogged into _run_xml_pipeline before the converter call so the heaviest I/O phase (bulk COPY from the Rust converter) benefits from skipping WAL writes. --- scripts/run_pipeline.py | 11 +++++---- tests/unit/test_run_pipeline.py | 42 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index b03032a..f50a195 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -534,6 +534,10 @@ def _run_with_dirs(tmp_dir: Path, csv_out: Path) -> None: cur.execute("TRUNCATE release CASCADE") conn.close() + # Set tables UNLOGGED before the converter streams data via COPY. + # This skips WAL writes during the bulk import phase. + set_tables_unlogged(db_url) + # Converter streams releases into PG; supplementary CSVs still # go to csv_out (artist_alias.csv, label_hierarchy.csv). convert_and_filter( @@ -664,11 +668,10 @@ def _run_database_build_post_import( """Post-import database build for --direct-pg mode. Skips create_schema (already done), import_csv, and import_tracks - (converter loaded all data directly). Runs create_indexes through vacuum. + (converter loaded all data directly). Tables are already UNLOGGED + (set in _run_xml_pipeline before the converter). Runs create_indexes + through SET LOGGED. """ - # -- set_tables_unlogged (skip WAL writes during bulk operations) - set_tables_unlogged(db_url) - # -- create_indexes (base trigram indexes, run in parallel) conn = psycopg.connect(db_url, autocommit=True) with conn.cursor() as cur: diff --git a/tests/unit/test_run_pipeline.py b/tests/unit/test_run_pipeline.py index 1fe55bb..5e618a5 100644 --- a/tests/unit/test_run_pipeline.py +++ b/tests/unit/test_run_pipeline.py @@ -407,6 +407,48 @@ def test_descriptions_contain_logged(self) -> None: assert "LOGGED" in desc +class TestDirectPgUnloggedBeforeConverter: + """In --direct-pg mode, set_tables_unlogged is called before the converter.""" + + def test_unlogged_before_convert_and_filter(self, tmp_path) -> None: + """set_tables_unlogged must be called before convert_and_filter in direct-PG mode.""" + xml_file = tmp_path / "releases.xml.gz" + xml_file.touch() + + args = run_pipeline.parse_args(["--xml", str(xml_file), "--direct-pg"]) + + call_order = [] + + def track_set_unlogged(db_url): + call_order.append("set_tables_unlogged") + + def track_convert(xml, output_dir, converter, library_artists=None, database_url=None): + call_order.append("convert_and_filter") + + with ( + patch.object(run_pipeline, "parse_args", return_value=args), + patch.object(run_pipeline, "wait_for_postgres"), + patch.object(run_pipeline, "run_sql_file"), + patch.object(run_pipeline.psycopg, "connect") as mock_conn, + patch.object(run_pipeline, "set_tables_unlogged", side_effect=track_set_unlogged), + patch.object(run_pipeline, "convert_and_filter", side_effect=track_convert), + patch.object(run_pipeline, "_run_database_build_post_import"), + ): + mock_cursor = mock_conn.return_value.__enter__.return_value.cursor.return_value + mock_cursor.__enter__ = lambda self: self + mock_cursor.__exit__ = lambda self, *a: None + run_pipeline.main() + + assert "set_tables_unlogged" in call_order, "set_tables_unlogged should be called" + assert "convert_and_filter" in call_order, "convert_and_filter should be called" + unlogged_idx = call_order.index("set_tables_unlogged") + convert_idx = call_order.index("convert_and_filter") + assert unlogged_idx < convert_idx, ( + f"set_tables_unlogged (index {unlogged_idx}) must come before " + f"convert_and_filter (index {convert_idx})" + ) + + class TestXmlModeEnrichment: """In --xml mode, library_artists.txt is generated from library.db when not provided."""