From be4e2a7e2ae1c4ebfaa1dc0cfbd9d8b8ce277fee Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Mon, 19 Jan 2026 15:27:07 -0800 Subject: [PATCH 1/4] fix: preserve HMS table properties during commits This change fixes HMS-specific table properties being lost during commits by merging parameters instead of replacing them. Fixes: #2926 --- pyiceberg/catalog/hive.py | 13 ++++++++++- tests/integration/test_reads.py | 38 +++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index e096470451..2c75178300 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -551,11 +551,22 @@ def commit_table( if hive_table and current_table: # Table exists, update it. - hive_table.parameters = _construct_parameters( + new_parameters = _construct_parameters( metadata_location=updated_staged_table.metadata_location, previous_metadata_location=current_table.metadata_location, metadata_properties=updated_staged_table.properties, ) + + # Detect properties that were removed from Iceberg metadata + old_iceberg_keys = set(current_table.properties.keys()) + new_iceberg_keys = set(updated_staged_table.properties.keys()) + removed_keys = old_iceberg_keys - new_iceberg_keys + + # Start with current HMS parameters, remove deleted Iceberg properties, then update with new ones + updated_parameters = {k: v for k, v in hive_table.parameters.items() if k not in removed_keys} + updated_parameters.update(new_parameters) + hive_table.parameters = updated_parameters + # Update hive's schema and properties hive_table.sd = _construct_hive_storage_descriptor( updated_staged_table.schema(), diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 0d52365d04..99a57575f1 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -135,6 +135,44 @@ def test_hive_properties(catalog: Catalog) -> None: assert hive_table.parameters.get("abc") is None +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive")]) +def test_hive_preserves_hms_specific_properties(catalog: Catalog) -> None: + """Test that HMS-specific table properties are preserved during table commits. + + This verifies that HMS-specific properties that are not managed by Iceberg + are preserved during commits, rather than being lost. + + Regression test for: https://github.com/apache/iceberg-python/issues/2926 + """ + table = create_table(catalog) + hive_client: _HiveClient = _HiveClient(catalog.properties["uri"]) + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + # Add HMS-specific properties that aren't managed by Iceberg + hive_table.parameters["table_category"] = "production" + hive_table.parameters["data_owner"] = "data_team" + open_client.alter_table(TABLE_NAME[0], TABLE_NAME[1], hive_table) + + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("table_category") == "production" + assert hive_table.parameters.get("data_owner") == "data_team" + + table.transaction().set_properties({"iceberg_property": "new_value"}).commit_transaction() + + # Verify that HMS-specific properties are STILL present after commit + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + # HMS-specific properties should be preserved + assert hive_table.parameters.get("table_category") == "production", ( + "HMS property 'table_category' was lost during commit!" + ) + assert hive_table.parameters.get("data_owner") == "data_team", "HMS property 'data_owner' was lost during commit!" + # Iceberg properties should also be present + assert hive_table.parameters.get("iceberg_property") == "new_value" + + @pytest.mark.integration @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_table_properties_dict(catalog: Catalog) -> None: From ae8a0240147c9a478670c37b373d14181d656b32 Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Mon, 19 Jan 2026 16:49:03 -0800 Subject: [PATCH 2/4] Add more tests, comment --- pyiceberg/catalog/hive.py | 4 +- tests/integration/test_reads.py | 74 +++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 2c75178300..e039c96506 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -562,7 +562,9 @@ def commit_table( new_iceberg_keys = set(updated_staged_table.properties.keys()) removed_keys = old_iceberg_keys - new_iceberg_keys - # Start with current HMS parameters, remove deleted Iceberg properties, then update with new ones + # Merge HMS parameters: preserve HMS-only properties (e.g., table_category) while + # ensuring Iceberg controls its metadata. Start with HMS params, remove deleted + # Iceberg properties, then update with new Iceberg values (which take precedence). updated_parameters = {k: v for k, v in hive_table.parameters.items() if k not in removed_keys} updated_parameters.update(new_parameters) hive_table.parameters = updated_parameters diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 99a57575f1..325c61602f 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -173,6 +173,80 @@ def test_hive_preserves_hms_specific_properties(catalog: Catalog) -> None: assert hive_table.parameters.get("iceberg_property") == "new_value" +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive")]) +def test_hive_iceberg_property_takes_precedence(catalog: Catalog) -> None: + """Test that Iceberg properties take precedence over conflicting HMS properties. + + If an external tool sets an HMS property with the same name as an Iceberg-managed + property, Iceberg's value should win during commits. + """ + table = create_table(catalog) + hive_client: _HiveClient = _HiveClient(catalog.properties["uri"]) + + # Set an Iceberg property + table.transaction().set_properties({"my_prop": "iceberg_value"}).commit_transaction() + + # External tool modifies the same property in HMS + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + hive_table.parameters["my_prop"] = "hms_value" # Conflicting value + open_client.alter_table(TABLE_NAME[0], TABLE_NAME[1], hive_table) + + # Verify HMS has the external value + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("my_prop") == "hms_value" + + # Perform another Iceberg commit + table.transaction().set_properties({"another_prop": "test"}).commit_transaction() + + # Iceberg's value should take precedence + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("my_prop") == "iceberg_value", ( + "Iceberg property value should take precedence over conflicting HMS value!" + ) + assert hive_table.parameters.get("another_prop") == "test" + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive")]) +def test_hive_critical_properties_always_from_iceberg(catalog: Catalog) -> None: + """Test that critical properties (EXTERNAL, table_type, metadata_location) always come from Iceberg. + + These properties should never be carried over from old HMS state. + """ + table = create_table(catalog) + hive_client: _HiveClient = _HiveClient(catalog.properties["uri"]) + + # Get original metadata_location + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + original_metadata_location = hive_table.parameters.get("metadata_location") + assert original_metadata_location is not None + assert hive_table.parameters.get("EXTERNAL") == "TRUE" + assert hive_table.parameters.get("table_type") == "ICEBERG" + + # Try to tamper with critical properties via HMS + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + hive_table.parameters["EXTERNAL"] = "FALSE" # Try to change + open_client.alter_table(TABLE_NAME[0], TABLE_NAME[1], hive_table) + + # Perform Iceberg commit + table.transaction().set_properties({"test_prop": "value"}).commit_transaction() + + # Critical properties should be restored by Iceberg + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("EXTERNAL") == "TRUE", "EXTERNAL should always be TRUE from Iceberg!" + assert hive_table.parameters.get("table_type") == "ICEBERG", "table_type should always be ICEBERG!" + # metadata_location should be updated (new metadata file) + new_metadata_location = hive_table.parameters.get("metadata_location") + assert new_metadata_location != original_metadata_location, "metadata_location should be updated!" + + @pytest.mark.integration @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_table_properties_dict(catalog: Catalog) -> None: From 291311f29b68cd4ef79be33450de9c0416ac4c8a Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Mon, 19 Jan 2026 21:14:12 -0800 Subject: [PATCH 3/4] Address comments --- pyiceberg/catalog/hive.py | 20 +++++++-------- tests/integration/test_reads.py | 44 +++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index e039c96506..3046e25626 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -558,22 +558,22 @@ def commit_table( ) # Detect properties that were removed from Iceberg metadata - old_iceberg_keys = set(current_table.properties.keys()) - new_iceberg_keys = set(updated_staged_table.properties.keys()) - removed_keys = old_iceberg_keys - new_iceberg_keys + removed_keys = current_table.properties.keys() - updated_staged_table.properties.keys() - # Merge HMS parameters: preserve HMS-only properties (e.g., table_category) while - # ensuring Iceberg controls its metadata. Start with HMS params, remove deleted - # Iceberg properties, then update with new Iceberg values (which take precedence). - updated_parameters = {k: v for k, v in hive_table.parameters.items() if k not in removed_keys} - updated_parameters.update(new_parameters) - hive_table.parameters = updated_parameters + # Sync HMS parameters: Iceberg metadata is the source of truth, HMS parameters are + # a projection of Iceberg state plus any HMS-only properties. + # Start with existing HMS params, remove deleted Iceberg properties, then apply Iceberg values. + merged_params = dict(hive_table.parameters or {}) + for key in removed_keys: + merged_params.pop(key, None) + merged_params.update(new_parameters) + hive_table.parameters = merged_params # Update hive's schema and properties hive_table.sd = _construct_hive_storage_descriptor( updated_staged_table.schema(), updated_staged_table.location(), - property_as_bool(updated_staged_table.properties, HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT), + property_as_bool(self.properties, HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT), ) open_client.alter_table_with_environment_context( dbname=database_name, diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 325c61602f..ddc9ebd4dd 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -175,8 +175,48 @@ def test_hive_preserves_hms_specific_properties(catalog: Catalog) -> None: @pytest.mark.integration @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive")]) -def test_hive_iceberg_property_takes_precedence(catalog: Catalog) -> None: - """Test that Iceberg properties take precedence over conflicting HMS properties. +def test_iceberg_property_deletion_not_restored_from_old_hms_state(catalog: Catalog) -> None: + """Test that deleted Iceberg properties are truly removed and not restored from old HMS state. + + When a property is removed through Iceberg, it should be deleted from HMS and not + come back from the old HMS state during merge operations. + """ + table = create_table(catalog) + hive_client: _HiveClient = _HiveClient(catalog.properties["uri"]) + + # Set multiple Iceberg properties + table.transaction().set_properties({"prop_to_keep": "keep_value", "prop_to_delete": "delete_me"}).commit_transaction() + + # Verify both properties exist + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("prop_to_keep") == "keep_value" + assert hive_table.parameters.get("prop_to_delete") == "delete_me" + + # Delete one property through Iceberg + table.transaction().remove_properties("prop_to_delete").commit_transaction() + + # Verify property is deleted from HMS + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("prop_to_keep") == "keep_value" + assert hive_table.parameters.get("prop_to_delete") is None, "Deleted property should not exist in HMS!" + + # Perform another Iceberg commit + table.transaction().set_properties({"new_prop": "new_value"}).commit_transaction() + + # Ensure deleted property doesn't come back from old state + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("prop_to_keep") == "keep_value" + assert hive_table.parameters.get("new_prop") == "new_value" + assert hive_table.parameters.get("prop_to_delete") is None, "Deleted property should NOT be restored from old HMS state!" + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive")]) +def test_iceberg_metadata_is_source_of_truth(catalog: Catalog) -> None: + """Test that Iceberg metadata is the source of truth for all Iceberg-managed properties. If an external tool sets an HMS property with the same name as an Iceberg-managed property, Iceberg's value should win during commits. From 6b7489a64fa7191bd36749213d1021b4da33f186 Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Tue, 20 Jan 2026 12:57:13 -0800 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Fokko Driesprong --- tests/integration/test_reads.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index ddc9ebd4dd..6e3b4c4458 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -174,8 +174,7 @@ def test_hive_preserves_hms_specific_properties(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive")]) -def test_iceberg_property_deletion_not_restored_from_old_hms_state(catalog: Catalog) -> None: +def test_iceberg_property_deletion_not_restored_from_old_hms_state(session_catalog_hive: Catalog) -> None: """Test that deleted Iceberg properties are truly removed and not restored from old HMS state. When a property is removed through Iceberg, it should be deleted from HMS and not