From 8a9858774f4ce4cf32fef66d30dd4300f712efa1 Mon Sep 17 00:00:00 2001 From: Peters Date: Tue, 5 May 2026 12:35:15 -0400 Subject: [PATCH 1/5] fix(user_profile_address_view): correct historical_order and add carry-forward for NULL fields - Replace (element_position + 1) with ROW_NUMBER() OVER (PARTITION BY Connect_ID ORDER BY element_position DESC) in Queries 3, 4, and 6 so that historical_order = 1 is the most recent history entry, consistent with the reverse_chron_order convention in the demo - Wrap all six UNION ALL queries in an outer SELECT with a named WINDOW that applies LAST_VALUE(... IGNORE NULLS) to fill true NULLs in address_line_1, address_line_2, city, state, and zip_code using the most recent non-null value per (Connect_ID, address_nickname) - Empty strings are intentionally preserved as-is (Step 5 behavior); NULLIF/TRIM normalization continues to happen downstream in the standardized_addresses CTE in address_processing.py --- sql/delivery_impact_of_change_to_up_hx.md | 43 +++++++++++++++ sql/user_profile_address_view.sql | 66 +++++++++++++++++++++-- 2 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 sql/delivery_impact_of_change_to_up_hx.md diff --git a/sql/delivery_impact_of_change_to_up_hx.md b/sql/delivery_impact_of_change_to_up_hx.md new file mode 100644 index 0000000..8526f2d --- /dev/null +++ b/sql/delivery_impact_of_change_to_up_hx.md @@ -0,0 +1,43 @@ +# Internal Note: Delivery Impact of `user_profile_address_view.sql` Changes + +## What changed and why + +Two behavioral changes were made to how user profile history addresses are computed before delivery: + +1. **`historical_order` ordering corrected:** history entries were previously numbered oldest-first (`element_position + 1`). They are now numbered newest-first (`ROW_NUMBER() ... ORDER BY element_position DESC`), so `historical_order = 1` is always the most recent history snapshot. Current addresses remain `historical_order = 0`. Module4 addresses are unaffected. + +2. 
**Carry-forward of NULL address fields:** history entries that were missing a field (true `NULL`) now receive the value from the nearest newer non-null entry for that participant and address type. Empty strings are preserved as-is and are not filled. Module4 addresses are unaffected. + +--- + +## Impact on `address_hash` and delivery state + +The `address_hash` is computed from address field values (`city`, `state`, `zip_code`, `address_line_1`, `address_line_2`). It does **not** include `historical_order`. Therefore: + +| Row type | Hash changes? | Reason | +|---|---|---| +| All module4 rows | ❌ No | Untouched by this change | +| User profile current rows (`historical_order = 0`) | ⚠️ Only if a NULL field is filled by carry-forward from a history entry | Carry-forward window includes `historical_order = 0` | +| User profile history rows with no NULL fields | ❌ No | Carry-forward is a no-op; values unchanged | +| User profile history rows with at least one NULL field that can be filled | ✅ Yes | Field value changes → hash changes | + +--- + +## Three populations in the upcoming delivery + +**1. Truly new addresses:** participants or address types not previously seen. Handled normally. + +**2. Re-delivered history rows (enriched):** previously delivered with one or more NULL fields; now delivered again with those fields filled via carry-forward. These will appear as new records to `identify_new_addresses()` because their hash has changed. The old NULL version remains in `address_delivery_metadata` and `address_deliveries` — it is not overwritten. + +**3. History rows whose `historical_order` shifted:** if a participant's history array has grown since the last delivery, older entries will have higher `historical_order` values than before. Because `historical_order` is not in the hash, the hash is unchanged and these rows will **not** be re-delivered unless their field values also changed. 
+ +--- + +## Merging NORC's returned data on our end + +When NORC returns geocoded results, rows should be matched back to our records using: `Connect_ID` + `address_nickname` + `ts_user_profile_updated` + + +- For **current addresses** (`historical_order = 0`), `ts_user_profile_updated` is `NULL` for all rows — match on `Connect_ID + address_nickname + historical_order = 0` instead. +- **Do not use `historical_order` alone** as a join key against prior deliveries, as the numbering convention has changed. +- Where a `Connect_ID + address_nickname + ts_user_profile_updated` combination appears in both an old delivery and the new one, the new delivery's geocoded result supersedes the old one. The old row (with NULLs) in `address_deliveries` can be flagged or ignored in downstream analysis. diff --git a/sql/user_profile_address_view.sql b/sql/user_profile_address_view.sql index 6011eef..d4b5e2c 100644 --- a/sql/user_profile_address_view.sql +++ b/sql/user_profile_address_view.sql @@ -1,3 +1,54 @@ +-- ========================================================================= +-- User Profile Address View +-- Author: Jake Peters +-- Updated: 2026-05-05 +-- +-- Purpose: Produces one row per participant per address type per historical +-- entry by unioning current and historical address data from the +-- participants table across three address types: +-- - user_profile_physical_address (cid: 207908218) +-- - user_profile_mailing_address (cid: 521824358) +-- - user_profile_alternative_address (cid: 284580415) +-- +-- historical_order convention: +-- 0 = current (active) address +-- 1 = most recent history entry +-- 2 = next most recent, etc. +-- +-- Carry-forward: LAST_VALUE(... IGNORE NULLS) fills true NULLs in +-- address_line_1, address_line_2, city, state, and zip_code +-- using the most recent non-null value per (Connect_ID, +-- address_nickname). Empty strings are preserved as-is (they +-- block carry-forward). 
NULLIF/TRIM normalization happens +-- downstream in the standardized_addresses CTE in +-- address_processing.py, after carry-forward has run. +-- +-- Placeholders: @raw_participants is replaced at runtime by +-- address_processing.py with the fully qualified BigQuery +-- table path defined in constants.py. +-- ========================================================================= + +SELECT + Connect_ID, + ts_user_profile_updated, + ts_address_delivered, + address_src_question_cid, + address_nickname, + address_source, + historical_order, + LAST_VALUE(address_line_1 IGNORE NULLS) OVER w AS address_line_1, + LAST_VALUE(address_line_2 IGNORE NULLS) OVER w AS address_line_2, + street_num, + street_name, + apartment_num, + LAST_VALUE(city IGNORE NULLS) OVER w AS city, + LAST_VALUE(state IGNORE NULLS) OVER w AS state, + LAST_VALUE(zip_code IGNORE NULLS) OVER w AS zip_code, + country, + cross_street1, + cross_street2 +FROM ( + -- ========================================================================= -- Query 1: User Profile - Current Physical Addresses -- This query extracts physical address information from the participants table @@ -88,12 +139,12 @@ SELECT '207908218' AS address_src_question_cid, 'user_profile_physical_address' AS address_nickname, 'user_profile' AS address_source, - (element_position + 1) AS historical_order, -- Converting 0-based to 1-based + ROW_NUMBER() OVER (PARTITION BY Connect_ID ORDER BY element_position DESC) AS historical_order, -- 1 = most recent history entry element.d_207908218 AS address_line_1, element.d_224392018 AS address_line_2, CAST(NULL AS STRING) AS street_num, CAST(NULL AS STRING) AS street_name, - NULL AS apartment_num, + CAST(NULL AS STRING) AS apartment_num, element.d_451993790 AS city, element.d_187799450 AS state, element.d_449168732 AS zip_code, @@ -129,7 +180,7 @@ SELECT '521824358' AS address_src_question_cid, 'user_profile_mailing_address' AS address_nickname, 'user_profile' AS address_source, - (element_position + 
1) AS historical_order, + ROW_NUMBER() OVER (PARTITION BY Connect_ID ORDER BY element_position DESC) AS historical_order, element.d_521824358 AS address_line_1, element.d_442166669 AS address_line_2, CAST(NULL AS STRING) AS street_num, @@ -208,7 +259,7 @@ SELECT '284580415' AS address_src_question_cid, 'user_profile_alternative_address' AS address_nickname, 'user_profile' AS address_source, - (element_position + 1) AS historical_order, + ROW_NUMBER() OVER (PARTITION BY Connect_ID ORDER BY element_position DESC) AS historical_order, element.D_284580415 AS address_line_1, element.D_728926441 AS address_line_2, CAST(NULL AS STRING) AS street_num, @@ -234,3 +285,10 @@ WHERE (element.D_970839481 IS NOT NULL AND element.D_970839481 != '') OR (element.D_379899229 IS NOT NULL AND element.D_379899229 != '') ) + +) +WINDOW w AS ( + PARTITION BY Connect_ID, address_nickname + ORDER BY historical_order DESC + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW +) \ No newline at end of file From b0ee924221892b6a12f9859cc7f2c9ceedfade81 Mon Sep 17 00:00:00 2001 From: Peters Date: Tue, 5 May 2026 16:10:28 -0400 Subject: [PATCH 2/5] add note document about impact of up hx changes --- delivery_impact_of_change_to_up_hx.md | 43 +++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 delivery_impact_of_change_to_up_hx.md diff --git a/delivery_impact_of_change_to_up_hx.md b/delivery_impact_of_change_to_up_hx.md new file mode 100644 index 0000000..8526f2d --- /dev/null +++ b/delivery_impact_of_change_to_up_hx.md @@ -0,0 +1,43 @@ +# Internal Note: Delivery Impact of `user_profile_address_view.sql` Changes + +## What changed and why + +Two behavioral changes were made to how user profile history addresses are computed before delivery: + +1. **`historical_order` ordering corrected:** history entries were previously numbered oldest-first (`element_position + 1`). They are now numbered newest-first (`ROW_NUMBER() ... 
ORDER BY element_position DESC`), so `historical_order = 1` is always the most recent history snapshot. Current addresses remain `historical_order = 0`. Module4 addresses are unaffected. + +2. **Carry-forward of NULL address fields:** history entries that were missing a field (true `NULL`) now receive the value from the nearest older non-null entry for that participant and address type (the fill runs oldest → newest and ends at the current address, so values are carried forward in time and are never back-filled from newer entries). Empty strings are preserved as-is and are not filled. Module4 addresses are unaffected. + +--- + +## Impact on `address_hash` and delivery state + +The `address_hash` is computed from address field values (`city`, `state`, `zip_code`, `address_line_1`, `address_line_2`). It does **not** include `historical_order`. Therefore: + +| Row type | Hash changes? | Reason | +|---|---|---| +| All module4 rows | ❌ No | Untouched by this change | +| User profile current rows (`historical_order = 0`) | ⚠️ Only if a NULL field is filled by carry-forward from a history entry | Carry-forward window includes `historical_order = 0` | +| User profile history rows with no NULL fields | ❌ No | Carry-forward is a no-op; values unchanged | +| User profile history rows with at least one NULL field that can be filled | ✅ Yes | Field value changes → hash changes | + +--- + +## Three populations in the upcoming delivery + +**1. Truly new addresses:** participants or address types not previously seen. Handled normally. + +**2. Re-delivered history rows (enriched):** previously delivered with one or more NULL fields; now delivered again with those fields filled via carry-forward. These will appear as new records to `identify_new_addresses()` because their hash has changed. The old NULL version remains in `address_delivery_metadata` and `address_deliveries` — it is not overwritten. + +**3. History rows whose `historical_order` shifted:** if a participant's history array has grown since the last delivery, older entries will have higher `historical_order` values than before. 
Because `historical_order` is not in the hash, the hash is unchanged and these rows will **not** be re-delivered unless their field values also changed. + +--- + +## Merging NORC's returned data on our end + +When NORC returns geocoded results, rows should be matched back to our records using: `Connect_ID` + `address_nickname` + `ts_user_profile_updated` + + +- For **current addresses** (`historical_order = 0`), `ts_user_profile_updated` is `NULL` for all rows — match on `Connect_ID + address_nickname + historical_order = 0` instead. +- **Do not use `historical_order` alone** as a join key against prior deliveries, as the numbering convention has changed. +- Where a `Connect_ID + address_nickname + ts_user_profile_updated` combination appears in both an old delivery and the new one, the new delivery's geocoded result supersedes the old one. The old row (with NULLs) in `address_deliveries` can be flagged or ignored in downstream analysis. From 473a769df4b06d0e186bd5c27f8c8499e75e34f6 Mon Sep 17 00:00:00 2001 From: Peters Date: Wed, 20 May 2026 12:47:43 -0400 Subject: [PATCH 3/5] add dry_run.py --- core/dry_run.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 core/dry_run.py diff --git a/core/dry_run.py b/core/dry_run.py new file mode 100644 index 0000000..162b039 --- /dev/null +++ b/core/dry_run.py @@ -0,0 +1,104 @@ +import os +from google.cloud import bigquery +import sys +sys.path.insert(0, 'core') +import constants + +def dry_run(): + client = bigquery.Client(project=constants.PROJECT_ID) + + # Read and render SQL exactly as create_address_view does + with open(os.path.join(constants.SQL_DIR, constants.ADDRESS_QUERY_SQL), 'r') as f: + module4_query = f.read() + + with open(os.path.join(constants.SQL_DIR, constants.USER_PROFILE_QUERY_SQL), 'r') as f: + user_profile_query = f.read() + + module4_query = module4_query.replace('@flat_module4', constants.MODULE_4_TABLE) + module4_query = 
module4_query.replace('@flat_participants', constants.FLAT_PARTICIPANTS_TABLE) + module4_query = module4_query.replace('@raw_participants', constants.RAW_PARTICIPANTS_TABLE) + + user_profile_query = user_profile_query.replace('@flat_module4', constants.MODULE_4_TABLE) + user_profile_query = user_profile_query.replace('@flat_participants', constants.FLAT_PARTICIPANTS_TABLE) + user_profile_query = user_profile_query.replace('@raw_participants', constants.RAW_PARTICIPANTS_TABLE) + + module4_query = module4_query.strip().rstrip(';') + user_profile_query = user_profile_query.strip().rstrip(';') + + combined_query = f""" + {module4_query} + UNION ALL + {user_profile_query} + """ + + view_query = f""" + CREATE OR REPLACE VIEW {constants.ADDRESSES_VIEW} AS + WITH standardized_addresses AS ( + SELECT + CAST(Connect_ID AS STRING) AS Connect_ID, + CAST(ts_user_profile_updated AS TIMESTAMP) AS ts_user_profile_updated, + CAST(ts_address_delivered AS TIMESTAMP) AS ts_address_delivered, + CAST(address_src_question_cid AS STRING) AS address_src_question_cid, + CAST(address_nickname AS STRING) AS address_nickname, + CAST(address_source AS STRING) AS address_source, + CAST(historical_order AS INT64) AS historical_order, + NULLIF(TRIM(CAST(address_line_1 AS STRING)), '') AS address_line_1, + NULLIF(TRIM(CAST(address_line_2 AS STRING)), '') AS address_line_2, + NULLIF(TRIM(CAST(street_num AS STRING)), '') AS street_num, + NULLIF(TRIM(CAST(street_name AS STRING)), '') AS street_name, + NULLIF(TRIM(CAST(apartment_num AS STRING)), '') AS apartment_num, + NULLIF(TRIM(CAST(city AS STRING)), '') AS city, + NULLIF(TRIM(CAST(state AS STRING)), '') AS state, + NULLIF(TRIM(CAST(zip_code AS STRING)), '') AS zip_code, + NULLIF(TRIM(CAST(country AS STRING)), '') AS country, + NULLIF(TRIM(CAST(cross_street_1 AS STRING)), '') AS cross_street_1, + NULLIF(TRIM(CAST(cross_street_2 AS STRING)), '') AS cross_street_2 + FROM ( + {combined_query} + ) subquery + ) + SELECT *, + TO_HEX(MD5(CONCAT( + 
IFNULL(Connect_ID, ''), + IFNULL(address_src_question_cid, ''), + IFNULL(address_nickname, ''), + IFNULL(address_source, ''), + IFNULL(address_line_1, ''), + IFNULL(address_line_2, ''), + IFNULL(street_num, ''), + IFNULL(street_name, ''), + IFNULL(apartment_num, ''), + IFNULL(city, ''), + IFNULL(state, ''), + IFNULL(zip_code, ''), + IFNULL(country, ''), + IFNULL(cross_street_1, ''), + IFNULL(cross_street_2, '') + ))) AS address_hash + FROM standardized_addresses + WHERE ( + address_line_1 IS NOT NULL OR address_line_2 IS NOT NULL OR + street_num IS NOT NULL OR street_name IS NOT NULL OR + apartment_num IS NOT NULL OR city IS NOT NULL OR + state IS NOT NULL OR zip_code IS NOT NULL OR + country IS NOT NULL OR cross_street_1 IS NOT NULL OR + cross_street_2 IS NOT NULL + ) + """ + + # Save rendered SQL for inspection + os.makedirs('debug', exist_ok=True) + with open('debug/dry_run_view_query.sql', 'w') as f: + f.write(view_query) + print("Rendered SQL saved to debug/dry_run_view_query.sql") + + # Validate against BigQuery without executing + job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False) + try: + job = client.query(view_query, job_config=job_config) + print(f"✅ SQL is valid. 
Estimated bytes processed: {job.total_bytes_processed:,}") + except Exception as e: + print(f"❌ SQL validation failed: {e}") + +if __name__ == "__main__": + dry_run() \ No newline at end of file From 8562768c80bcb18f4cd5eb88ba7cb448f92cf85b Mon Sep 17 00:00:00 2001 From: Jake Peters Date: Wed, 20 May 2026 13:06:53 -0400 Subject: [PATCH 4/5] Delete sql/delivery_impact_of_change_to_up_hx.md extra copy of file --- sql/delivery_impact_of_change_to_up_hx.md | 43 ----------------------- 1 file changed, 43 deletions(-) delete mode 100644 sql/delivery_impact_of_change_to_up_hx.md diff --git a/sql/delivery_impact_of_change_to_up_hx.md b/sql/delivery_impact_of_change_to_up_hx.md deleted file mode 100644 index 8526f2d..0000000 --- a/sql/delivery_impact_of_change_to_up_hx.md +++ /dev/null @@ -1,43 +0,0 @@ -# Internal Note: Delivery Impact of `user_profile_address_view.sql` Changes - -## What changed and why - -Two behavioral changes were made to how user profile history addresses are computed before delivery: - -1. **`historical_order` ordering corrected:** history entries were previously numbered oldest-first (`element_position + 1`). They are now numbered newest-first (`ROW_NUMBER() ... ORDER BY element_position DESC`), so `historical_order = 1` is always the most recent history snapshot. Current addresses remain `historical_order = 0`. Module4 addresses are unaffected. - -2. **Carry-forward of NULL address fields:** history entries that were missing a field (true `NULL`) now receive the value from the nearest newer non-null entry for that participant and address type. Empty strings are preserved as-is and are not filled. Module4 addresses are unaffected. - ---- - -## Impact on `address_hash` and delivery state - -The `address_hash` is computed from address field values (`city`, `state`, `zip_code`, `address_line_1`, `address_line_2`). It does **not** include `historical_order`. Therefore: - -| Row type | Hash changes? 
| Reason | -|---|---|---| -| All module4 rows | ❌ No | Untouched by this change | -| User profile current rows (`historical_order = 0`) | ⚠️ Only if a NULL field is filled by carry-forward from a history entry | Carry-forward window includes `historical_order = 0` | -| User profile history rows with no NULL fields | ❌ No | Carry-forward is a no-op; values unchanged | -| User profile history rows with at least one NULL field that can be filled | ✅ Yes | Field value changes → hash changes | - ---- - -## Three populations in the upcoming delivery - -**1. Truly new addresses:** participants or address types not previously seen. Handled normally. - -**2. Re-delivered history rows (enriched):** previously delivered with one or more NULL fields; now delivered again with those fields filled via carry-forward. These will appear as new records to `identify_new_addresses()` because their hash has changed. The old NULL version remains in `address_delivery_metadata` and `address_deliveries` — it is not overwritten. - -**3. History rows whose `historical_order` shifted:** if a participant's history array has grown since the last delivery, older entries will have higher `historical_order` values than before. Because `historical_order` is not in the hash, the hash is unchanged and these rows will **not** be re-delivered unless their field values also changed. - ---- - -## Merging NORC's returned data on our end - -When NORC returns geocoded results, rows should be matched back to our records using: `Connect_ID` + `address_nickname` + `ts_user_profile_updated` - - -- For **current addresses** (`historical_order = 0`), `ts_user_profile_updated` is `NULL` for all rows — match on `Connect_ID + address_nickname + historical_order = 0` instead. -- **Do not use `historical_order` alone** as a join key against prior deliveries, as the numbering convention has changed. 
-- Where a `Connect_ID + address_nickname + ts_user_profile_updated` combination appears in both an old delivery and the new one, the new delivery's geocoded result supersedes the old one. The old row (with NULLs) in `address_deliveries` can be flagged or ignored in downstream analysis. From cf8fa87580536649a3aa8821c88c7716528a9a1d Mon Sep 17 00:00:00 2001 From: Peters Date: Fri, 22 May 2026 12:47:58 -0400 Subject: [PATCH 5/5] rm filter of nulls and empties from uphx addresses to allow carryforward logic to work --- core/dry_run.py | 2 ++ core/main.py | 2 ++ delivery_impact_of_change_to_up_hx.md | 29 +++++++++++++++++++++++++++ requirements.txt | 4 +++- sql/user_profile_address_view.sql | 28 ++++++-------------------- 5 files changed, 42 insertions(+), 23 deletions(-) diff --git a/core/dry_run.py b/core/dry_run.py index 162b039..3be4b52 100644 --- a/core/dry_run.py +++ b/core/dry_run.py @@ -5,6 +5,8 @@ import constants def dry_run(): + """Dry run the SQL code generation without running the pipeline end-to-end. + Useful for ensuring that the BigQuery code works before modifying metadata.""" client = bigquery.Client(project=constants.PROJECT_ID) # Read and render SQL exactly as create_address_view does diff --git a/core/main.py b/core/main.py index 14cd03b..3e1f9bf 100644 --- a/core/main.py +++ b/core/main.py @@ -1,3 +1,5 @@ +'''Check that the SQL works without generating artifacts or writing to database.''' + import datetime from google.cloud import bigquery import constants diff --git a/delivery_impact_of_change_to_up_hx.md b/delivery_impact_of_change_to_up_hx.md index 8526f2d..af6cdfa 100644 --- a/delivery_impact_of_change_to_up_hx.md +++ b/delivery_impact_of_change_to_up_hx.md @@ -41,3 +41,32 @@ When NORC returns geocoded results, rows should be matched back to our records u - For **current addresses** (`historical_order = 0`), `ts_user_profile_updated` is `NULL` for all rows — match on `Connect_ID + address_nickname + historical_order = 0` instead. 
- **Do not use `historical_order` alone** as a join key against prior deliveries, as the numbering convention has changed. - Where a `Connect_ID + address_nickname + ts_user_profile_updated` combination appears in both an old delivery and the new one, the new delivery's geocoded result supersedes the old one. The old row (with NULLs) in `address_deliveries` can be flagged or ignored in downstream analysis. + +## Upcoming Delivery Impact (as of 2026-05-05) + +The next delivery will contain **41,350 total rows** across **~39,926 unique participants**: + +| Category | Rows | % of delivery | Participants | +|---|---|---|---| +| Truly new (never delivered) | 38,827 | 93.9% | ~38,493 | +| Enriched re-deliveries | 2,523 | 6.1% | ~1,433 | +| **Total** | **41,350** | **100%** | **~39,926** | + +Re-deliveries broken down by address type: + +| Address type | Rows re-delivered | % of re-deliveries | Participants | +|---|---|---|---| +| `user_profile_mailing_address` | 2,455 | 97.3% | 1,397 | +| `user_profile_physical_address` | 53 | 2.1% | 25 | +| `user_profile_alternative_address` | 15 | 0.6% | 11 | +| **Total** | **2,523** | **100%** | **~1,433** | + +The 2,523 re-delivered rows represent **user profile history entries that were previously +delivered with one or more NULL address fields**, which carry-forward has now filled using +the nearest older non-null value for that participant and address type. These rows should be +matched back to prior deliveries on `Connect_ID + address_nickname + ts_user_profile_updated` +and treated as **updates to existing records**, not new records. + +The average enriched participant has **1.76 history entries** being resent +(`2,523 rows / 1,433 participants`), consistent with carry-forward filling gaps across +multiple history snapshots per person. 
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4c8c458..80f5d57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ google-cloud-bigquery-storage==2.30.0 pandas==2.2.3 python-dateutil==2.9.0 python-tabulate==0.9.0 -openpxl==3.1.5 \ No newline at end of file +openpxl==3.1.5 +tabulate +xlsxwriter \ No newline at end of file diff --git a/sql/user_profile_address_view.sql b/sql/user_profile_address_view.sql index d4b5e2c..bcfaf9d 100644 --- a/sql/user_profile_address_view.sql +++ b/sql/user_profile_address_view.sql @@ -154,17 +154,14 @@ SELECT FROM @raw_participants, UNNEST(d_569151507) AS element WITH OFFSET AS element_position +-- Query 3 history filter (relaxed — let carry-forward do its job): WHERE Connect_ID IS NOT NULL AND d_821247024 = 197316935 -- Verification status = verified AND d_831041022 = 104430631 -- Data destruction requested = no - AND ( - (element.d_207908218 IS NOT NULL AND element.d_207908218 != '') OR - (element.d_224392018 IS NOT NULL AND element.d_224392018 != '') OR - (element.d_451993790 IS NOT NULL AND element.d_451993790 != '') OR - (element.d_187799450 IS NOT NULL AND element.d_187799450 != '') OR - (element.d_449168732 IS NOT NULL AND element.d_449168732 != '') - ) + -- No address field filter here — rows with all NULLs will be + -- enriched by carry-forward and then filtered downstream by + -- the NULLIF/TRIM + IS NOT NULL check in address_processing.py UNION ALL @@ -199,13 +196,7 @@ WHERE Connect_ID IS NOT NULL AND d_821247024 = 197316935 -- Verification status = verified AND d_831041022 = 104430631 -- Data destruction requested = no - AND ( - (element.d_521824358 IS NOT NULL AND element.d_521824358 != '') OR - (element.d_442166669 IS NOT NULL AND element.d_442166669 != '') OR - (element.d_703385619 IS NOT NULL AND element.d_703385619 != '') OR - (element.d_634434746 IS NOT NULL AND element.d_634434746 != '') OR - (element.d_892050548 IS NOT NULL AND element.d_892050548 != '') - ) + -- 
include even records with all nulls and empty strings (historical table requires them for carry forward logic) UNION ALL @@ -278,14 +269,7 @@ WHERE Connect_ID IS NOT NULL AND d_821247024 = 197316935 -- Verification status = verified AND d_831041022 = 104430631 -- Data destruction requested = no - AND ( - (element.D_284580415 IS NOT NULL AND element.D_284580415 != '') OR - (element.D_728926441 IS NOT NULL AND element.D_728926441 != '') OR - (element.D_907038282 IS NOT NULL AND element.D_907038282 != '') OR - (element.D_970839481 IS NOT NULL AND element.D_970839481 != '') OR - (element.D_379899229 IS NOT NULL AND element.D_379899229 != '') - ) - + -- include even records with all nulls and empty strings (historical table requires them for carry forward logic) ) WINDOW w AS ( PARTITION BY Connect_ID, address_nickname