From 594294ae77231ce3693608f28780e2db1af64bba Mon Sep 17 00:00:00 2001 From: JuliaEdom Date: Mon, 29 Jun 2026 19:02:11 +0300 Subject: [PATCH] fix(dv2): ClickHouse customer MDM views admit all source conventions (audit #12) The five ClickHouse bv_customer_mdm__{branch} views filtered hub rows by a hard-coded record_source = '1c__{branch}', silently dropping every OLTP- and X5-promoted customer (record_source pg_ops__{branch}/x5__{branch}) from the MDM result. The PostgreSQL port fixed this in #99 (split_part); the ClickHouse half was left behind as a CH/PG split-brain (audit_28_06_26 #M1), and the postgres_oltp README already documented the splitByString admission the views did not implement. Swap the hub admission filter to splitByString('__', record_source)[2] = '{branch}' (the ClickHouse equivalent of the PG split_part fix), so a customer promoted under any source convention is integrated. View DDL only - customer_hk = md5(business_key) is identical across loaders, so no data migration. Add tests/unit/test_dv2_business_vault_ddl.py: the CH business-vault views had no parse or coverage at all. Pins that every view parses under sqlglot's ClickHouse dialect and that all five MDM views admit by branch, never by '1c__{branch}' again. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/unit/test_dv2_business_vault_ddl.py | 82 +++++++++++++++++++ .../business_vault/bv_customer_mdm__ala.sql | 5 +- .../business_vault/bv_customer_mdm__dxb.sql | 5 +- .../business_vault/bv_customer_mdm__ekb.sql | 5 +- .../business_vault/bv_customer_mdm__msk.sql | 7 +- .../business_vault/bv_customer_mdm__spb.sql | 5 +- 6 files changed, 104 insertions(+), 5 deletions(-) create mode 100644 tests/unit/test_dv2_business_vault_ddl.py diff --git a/tests/unit/test_dv2_business_vault_ddl.py b/tests/unit/test_dv2_business_vault_ddl.py new file mode 100644 index 0000000..48a886d --- /dev/null +++ b/tests/unit/test_dv2_business_vault_ddl.py @@ -0,0 +1,82 @@ +"""Unit tests for the DV2 ClickHouse business-vault views (no Docker). 
+ +The ClickHouse ``business_vault/*.sql`` views had no parse or coverage before +this file. It pins that every view parses under sqlglot's ClickHouse dialect and +that the customer MDM views admit hub rows by branch via +``splitByString('__', record_source)[2]`` — NOT by a hard-coded +``record_source = '1c__'`` filter that silently dropped OLTP/X5-promoted +customers (``record_source`` ``pg_ops__`` / ``x5__``). + +This is the ClickHouse half of audit_28_06_26 #12. The PostgreSQL port was fixed +in ``test_dv2_postgres_ddl.py::test_customer_mdm_views_admit_all_source_conventions``; +pinning the ClickHouse views to the same source-agnostic admission keeps the two +engines from drifting apart (the split-brain the audit flagged). A live apply + +``bv_customer_mdm`` query is a separate Mac smoke (see business_vault/README.md). +""" + +from __future__ import annotations + +import glob +import re +from pathlib import Path + +import sqlglot + +import warehouse.agentflow.dv2.dialects as dialects_mod + +DV2_DIR = Path(dialects_mod.__file__).resolve().parent +BV_DIR = DV2_DIR / "business_vault" + +MDM_BRANCHES = ("msk", "spb", "ekb", "dxb", "ala") + + +def _strip_comments(sql: str) -> str: + sql = re.sub(r"/\*.*?\*/", "", sql, flags=re.DOTALL) + sql = re.sub(r"--[^\n]*", "", sql) + return sql + + +def _bv_sql_files() -> list[Path]: + return [Path(p) for p in sorted(glob.glob(str(BV_DIR / "*.sql")))] + + +def test_all_business_vault_ddl_parses(): + files = _bv_sql_files() + # 5 customer MDM views + bv_order_canonical + bv_order_canonical_mat. 
+ assert len(files) >= 7, f"expected >=7 business-vault SQL files, found {len(files)}" + for path in files: + parsed = sqlglot.parse(path.read_text(encoding="utf-8"), dialect="clickhouse") + assert parsed, f"{path.name} produced no statements" + + +def test_every_mdm_branch_view_exists(): + for branch in MDM_BRANCHES: + path = BV_DIR / f"bv_customer_mdm__{branch}.sql" + assert path.exists(), f"missing ClickHouse MDM view for branch {branch}" + + +def test_customer_mdm_views_admit_all_source_conventions(): + """audit_28_06_26 #12 (ClickHouse half): the customer MDM views must select + hub rows by branch via ``splitByString('__', record_source)[2]``, NOT by a + hard-coded ``record_source = '1c__'`` filter that silently drops + OLTP/X5-promoted customers (record_source ``pg_ops__`` / ``x5__``). Proven + live on PostgreSQL in #99: the buggy filter returns 1 of 2 seeded customers, + the source-agnostic filter returns both. This keeps the ClickHouse views in + lock-step with the PostgreSQL port so the engines cannot diverge again.""" + for branch in MDM_BRANCHES: + path = BV_DIR / f"bv_customer_mdm__{branch}.sql" + # Strip block/line comments: the headers deliberately quote the old, + # buggy ``record_source = '1c__'`` filter to explain the fix. + body = _strip_comments(path.read_text(encoding="utf-8")) + assert f"CREATE OR REPLACE VIEW rv.bv_customer_mdm__{branch}" in body, ( + f"bv_customer_mdm__{branch}.sql must define rv.bv_customer_mdm__{branch}" + ) + assert f"splitByString('__', record_source)[2] = '{branch}'" in body, ( + f"bv_customer_mdm__{branch} must admit hubs by branch via " + f"splitByString('__', record_source)[2], not by source convention" + ) + # The regressed pattern must never reappear in executable SQL. 
+ assert "record_source = '1c__" not in body, ( + f"hard-coded record_source = '1c__' filter in " + f"bv_customer_mdm__{branch} reintroduces audit #12" + ) diff --git a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__ala.sql b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__ala.sql index 35a2e3e..9a0928d 100644 --- a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__ala.sql +++ b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__ala.sql @@ -2,6 +2,9 @@ Purpose: Canonical customer record for the ALA branch. Layer: Business Vault. Branch: ala (KZ jurisdiction; Bitrix loyalty not wired in ALA). +Hub admission: splitByString('__', record_source)[2] = 'ala' (source-agnostic: + 1c__/pg_ops__/x5__ all integrated, not only 1C; audit_28_06_26 #12; + mirrors the PostgreSQL port's split_part(record_source,'__',2)). */ CREATE OR REPLACE VIEW rv.bv_customer_mdm__ala AS WITH @@ -21,7 +24,7 @@ WITH ala_hub AS ( SELECT customer_hk, customer_bk FROM rv.hub_customer - WHERE record_source = '1c__ala' + WHERE splitByString('__', record_source)[2] = 'ala' ) SELECT h.customer_hk AS customer_hk, diff --git a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__dxb.sql b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__dxb.sql index 953cfe9..00cab07 100644 --- a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__dxb.sql +++ b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__dxb.sql @@ -7,6 +7,9 @@ Conflict policy: - Loyalty — not yet wired in for DXB (Bitrix24 sat is MSK-only); the view keeps the loyalty columns for schema parity with bv_customer_mdm__msk so downstream marts can UNION ALL the two branches without renaming. +Hub admission: splitByString('__', record_source)[2] = 'dxb' (source-agnostic: + 1c__/pg_ops__/x5__ all integrated, not only 1C; audit_28_06_26 #12; + mirrors the PostgreSQL port's split_part(record_source,'__',2)). 
*/ CREATE OR REPLACE VIEW rv.bv_customer_mdm__dxb AS WITH @@ -26,7 +29,7 @@ WITH dxb_hub AS ( SELECT customer_hk, customer_bk FROM rv.hub_customer - WHERE record_source = '1c__dxb' + WHERE splitByString('__', record_source)[2] = 'dxb' ) SELECT h.customer_hk AS customer_hk, diff --git a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__ekb.sql b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__ekb.sql index d0260c4..4bebca3 100644 --- a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__ekb.sql +++ b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__ekb.sql @@ -2,6 +2,9 @@ Purpose: Canonical customer record for the EKB branch. Layer: Business Vault. Branch: ekb (RU jurisdiction; same conflict policy as MSK). +Hub admission: splitByString('__', record_source)[2] = 'ekb' (source-agnostic: + 1c__/pg_ops__/x5__ all integrated, not only 1C; audit_28_06_26 #12; + mirrors the PostgreSQL port's split_part(record_source,'__',2)). */ CREATE OR REPLACE VIEW rv.bv_customer_mdm__ekb AS WITH @@ -32,7 +35,7 @@ WITH ekb_hub AS ( SELECT customer_hk, customer_bk FROM rv.hub_customer - WHERE record_source = '1c__ekb' + WHERE splitByString('__', record_source)[2] = 'ekb' ) SELECT h.customer_hk AS customer_hk, diff --git a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__msk.sql b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__msk.sql index a56271f..ea93742 100644 --- a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__msk.sql +++ b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__msk.sql @@ -8,6 +8,11 @@ Conflict policy: - Loyalty (segment/points/last_visit) — Bitrix wins (live CRM state). - If a customer exists only in Bitrix, PII columns are NULL but the row is still returned so loyalty-only customers stay visible. +Hub admission: splitByString('__', record_source)[2] = 'msk', so a customer + promoted under ANY source convention (1c__msk, pg_ops__msk, x5__msk, + ...) is integrated, not only 1C. 
The old record_source = '1c__msk' + filter silently dropped OLTP/X5-promoted customers (audit_28_06_26 #12); + this mirrors the PostgreSQL port's split_part(record_source,'__',2). */ CREATE OR REPLACE VIEW rv.bv_customer_mdm__msk AS WITH @@ -38,7 +43,7 @@ WITH msk_hub AS ( SELECT customer_hk, customer_bk, record_source AS hub_record_source FROM rv.hub_customer - WHERE record_source = '1c__msk' + WHERE splitByString('__', record_source)[2] = 'msk' ) SELECT h.customer_hk AS customer_hk, diff --git a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__spb.sql b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__spb.sql index 7fea007..5861284 100644 --- a/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__spb.sql +++ b/warehouse/agentflow/dv2/business_vault/bv_customer_mdm__spb.sql @@ -2,6 +2,9 @@ Purpose: Canonical customer record for the SPB branch. Layer: Business Vault. Branch: spb (RU jurisdiction; same conflict policy as MSK). +Hub admission: splitByString('__', record_source)[2] = 'spb' (source-agnostic: + 1c__/pg_ops__/x5__ all integrated, not only 1C; audit_28_06_26 #12; + mirrors the PostgreSQL port's split_part(record_source,'__',2)). */ CREATE OR REPLACE VIEW rv.bv_customer_mdm__spb AS WITH @@ -32,7 +35,7 @@ WITH spb_hub AS ( SELECT customer_hk, customer_bk FROM rv.hub_customer - WHERE record_source = '1c__spb' + WHERE splitByString('__', record_source)[2] = 'spb' ) SELECT h.customer_hk AS customer_hk,