From 6367b85362468beb61cbcf9684a1eadd85b11979 Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Mon, 19 May 2025 10:51:30 +0100 Subject: [PATCH 01/11] Remove prefix 'r' and start/end of text notifiers (^ $) from regex as this causes test to not perform as expected in AWS Glue versus local environment. --- scripts/jobs/housing/housing_dwellings_list_gx_suite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/jobs/housing/housing_dwellings_list_gx_suite.py b/scripts/jobs/housing/housing_dwellings_list_gx_suite.py index 24523c63d..12ee5b89e 100644 --- a/scripts/jobs/housing/housing_dwellings_list_gx_suite.py +++ b/scripts/jobs/housing/housing_dwellings_list_gx_suite.py @@ -31,7 +31,7 @@ class ExpectLLPGColumnValueLengthsBetween(gxe.ExpectColumnValueLengthsToBeBetwee class ExpectBlockRefNoColumnValuesToMatchRegex(gxe.ExpectColumnValuesToMatchRegex): column: str = "block_reference_number" - regex: str = "\d+$" + regex: str = "\d+" description: str = "Expect Block Reference Number to match regex ^\d+$ (numerical)" From 9d6d50e9673fde438afd1df915a880423429ffc0 Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:25:51 +0100 Subject: [PATCH 02/11] Change import_datetime to import_date to account for changes to the new source data tables --- scripts/jobs/data_and_insight/person_matching_module.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index 93bbce78e..1461fa011 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -385,7 +385,7 @@ def prepare_clean_council_tax_data(spark: SparkSession, council_tax_account: Dat A DataFrame after preparing and cleaning data from multiple council tax tables. 
""" council_tax_occupation = council_tax_occupation.filter( - (col("live_ind") == 1) & (col("vacation_date") > col("import_datetime"))) + (col("live_ind") == 1) & (col("vacation_date") > col("import_date"))) council_tax_property_occupancy = council_tax_occupation \ .join(council_tax_property, "property_ref") \ @@ -538,19 +538,19 @@ def prepare_clean_housing_benefit_data(hb_member_df: DataFrame, .withColumnRenamed("addr2", "address_line_2") \ .withColumnRenamed("addr3", "address_line_3") \ .withColumnRenamed("addr4", "address_line_4") \ - .filter((col("from_date") < col("import_datetime")) & (col("to_date") > col("import_datetime"))) \ + .filter((col("from_date") < col("import_date")) & (col("to_date") > col("import_date"))) \ .select(col("claim_id"), col("claim_house_id"), col("address_line_1"), col("address_line_2"), col("address_line_3"), col("address_line_4"), col("post_code"), col("uprn")) housing_benefit_rent_assessment = hb_rent_assessment_df \ .withColumn("source_filter", when((col("dhp_ind") == 1) & (col("type_ind") > 1), "DHP").otherwise("HB")) \ - .filter((col("from_date") < col("import_datetime")) & (col("to_date") > col("import_datetime")) + .filter((col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & ((col("type_ind") == 1) | (col("dhp_ind") == 1)) & (col("model_amt") > 0)) \ .select(col("claim_id"), col("source_filter")) housing_benefit_ctax_assessment = hb_ctax_assessment_df \ .withColumn("source_filter", lit("CTS")) \ - .filter((col("from_date") < col("import_datetime")) & (col("to_date") > col("import_datetime")) + .filter((col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & (col("model_amt") > 0) & ((col("type_ind") == 1) | (col("dhp_ind") == 1))) \ .select(col("claim_id"), col("source_filter")) From 459b81eb8153770068aa410323f4189cd08a3ce4 Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:31:17 +0100 Subject: [PATCH 03/11] 
Revert "Remove prefix 'r' and start/end of text notifiers (^ $) from regex as this causes test to not performing as expected in AWS Glue versus local environment." This reverts commit 6367b85362468beb61cbcf9684a1eadd85b11979. --- scripts/jobs/housing/housing_dwellings_list_gx_suite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/jobs/housing/housing_dwellings_list_gx_suite.py b/scripts/jobs/housing/housing_dwellings_list_gx_suite.py index 12ee5b89e..24523c63d 100644 --- a/scripts/jobs/housing/housing_dwellings_list_gx_suite.py +++ b/scripts/jobs/housing/housing_dwellings_list_gx_suite.py @@ -31,7 +31,7 @@ class ExpectLLPGColumnValueLengthsBetween(gxe.ExpectColumnValueLengthsToBeBetwee class ExpectBlockRefNoColumnValuesToMatchRegex(gxe.ExpectColumnValuesToMatchRegex): column: str = "block_reference_number" - regex: str = "\d+" + regex: str = "\d+$" description: str = "Expect Block Reference Number to match regex ^\d+$ (numerical)" From d39b7699d62fcf7d861f8709d56dcc121855136b Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:33:58 +0100 Subject: [PATCH 04/11] Change import_datetime to import_date to account for changes to the new source data tables --- scripts/jobs/data_and_insight/person_matching_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index 1461fa011..296454f7c 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -276,7 +276,7 @@ def prepare_clean_housing_data(person_reshape: DataFrame, assets_reshape: DataFr A prepared and cleaned dataframe containing housing tenancy data. 
""" tenure_reshape = tenure_reshape.filter((tenure_reshape["endoftenuredate"].isNull()) | ( - tenure_reshape["endoftenuredate"].cast(DateType()) > current_date())) + tenure_reshape["endoftenuredate"].cast(DateType()) > current_date())) assets_reshape = assets_reshape.filter(assets_reshape['assettype'] == 'Dwelling') @@ -865,7 +865,7 @@ def prepare_clean_freedom_pass_admissions_data(freedom_df: DataFrame) -> DataFra .withColumnRenamed("district", "address_line_4") \ .withColumnRenamed("postcode", "post_code") \ .withColumnRenamed("email_address", "email") \ - .withColumn("date_of_birth", to_date(col("date_of_birth"), format="dd/MM/yyyy"))\ + .withColumn("date_of_birth", to_date(col("date_of_birth"), format="dd/MM/yyyy")) \ .withColumn("uprn", lit("")) \ .withColumn("source_filter", lit("freedom_passes_2024")) \ .select(col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), From d3f23be31c1dcd37aacabfd589219a4511c65fe5 Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:38:07 +0100 Subject: [PATCH 05/11] Linting --- .../person_matching_module.py | 1137 ++++++++--------- 1 file changed, 536 insertions(+), 601 deletions(-) diff --git a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index 296454f7c..86a89ebdc 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -21,23 +21,17 @@ from pyspark.sql.pandas.functions import pandas_udf from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType, DoubleType -extracted_name_schema = StructType([ - StructField("entity_type", StringType(), True), - StructField("title", StringType(), True), - StructField("first_name", StringType(), True), - StructField("middle_name", StringType(), True), - StructField("last_name", StringType(), True) -]) - -features_schema = 
StructType([ - StructField("first_name_similar", BooleanType(), True), - StructField("middle_name_similar", BooleanType(), True), - StructField("last_name_similar", BooleanType(), True), - StructField("name_similarity", DoubleType(), True), - StructField("address_line_1_similarity", DoubleType(), True), - StructField("address_line_2_similarity", DoubleType(), True), - StructField("full_address_similarity", DoubleType(), True), -]) +extracted_name_schema = StructType( + [StructField("entity_type", StringType(), True), StructField("title", StringType(), True), + StructField("first_name", StringType(), True), StructField("middle_name", StringType(), True), + StructField("last_name", StringType(), True)]) + +features_schema = StructType( + [StructField("first_name_similar", BooleanType(), True), StructField("middle_name_similar", BooleanType(), True), + StructField("last_name_similar", BooleanType(), True), StructField("name_similarity", DoubleType(), True), + StructField("address_line_1_similarity", DoubleType(), True), + StructField("address_line_2_similarity", DoubleType(), True), + StructField("full_address_similarity", DoubleType(), True), ]) @udf(returnType=extracted_name_schema) @@ -58,24 +52,22 @@ def extract_person_name(name: str) -> (str, str, str, str, str): """ common_titles = ["mr", "mr.", "mrs", "mrs.", "ms", "ms.", "miss", "master", "exor", "exors", "executors", "of", "rep", "per", "pers", "reps", "prep", "&prep", "pe", "personal", "repmr", "repmrs", "repsmr", - "repsmrs", "the", "reps.of", - "dr", "dr.", "prof", "profeessor", "rev", "lady", "dame", "sir", "lord"] + "repsmrs", "the", "reps.of", "dr", "dr.", "prof", "profeessor", "rev", "lady", "dame", "sir", + "lord"] common_titles_subset_with_space = ["mr ", "mr. ", "mrs ", "mrs. ", "ms ", "ms. 
", "miss ", "exor ", "exors "] common_business_types_small = ["ltd", "llp", "plc", "pvt", "&", "lbh", " inc,", "llc", "bv"] - common_business_types = ["limited", - "association", "housing", "trust", "home", "society", "estates", "properties", "property", - "group", "fund", "invest", "investment", "estate", "development", "board", - "letting", "agent", "accommodat", "occupier", "residential", "relocation", - "accomodation", "traveller", "living", "education", "residence", "resident", - "organisation", "management", "international", "national", "clinic", "solutions", - "service", "system", "security", "move", "store", "academy", "ventures", "rent", - "account", "building", "company", "congregation", "project", "residencial", "centre", - "sport", "assets", "developer", "asylum", "committee", "school", "apartment", - "chartered", "consultant", "enterprise", "corporate", "associates", "studios", - "consultancy", "borough", - "holdings", "agency", "propperties", "hotel", "lodge", "university", "proeprties", - "hackney", "empty", "void", "london", "council"] + common_business_types = ["limited", "association", "housing", "trust", "home", "society", "estates", "properties", + "property", "group", "fund", "invest", "investment", "estate", "development", "board", + "letting", "agent", "accommodat", "occupier", "residential", "relocation", "accomodation", + "traveller", "living", "education", "residence", "resident", "organisation", "management", + "international", "national", "clinic", "solutions", "service", "system", "security", + "move", "store", "academy", "ventures", "rent", "account", "building", "company", + "congregation", "project", "residencial", "centre", "sport", "assets", "developer", + "asylum", "committee", "school", "apartment", "chartered", "consultant", "enterprise", + "corporate", "associates", "studios", "consultancy", "borough", "holdings", "agency", + "propperties", "hotel", "lodge", "university", "proeprties", "hackney", "empty", "void", + "london", 
"council"] deceased_flags = ["decd", "dec'd", "d'cead", "desd", "d'ced", "de'd", "def'd", "dea's", "dece'd", "dec", "dec`d", "deceased"] @@ -87,8 +79,8 @@ def extract_person_name(name: str) -> (str, str, str, str, str): return "Unknown", None, None, None, None if any(business in name.casefold() for business in common_business_types) or ( - any(business in name.casefold() for business in common_business_types_small) and - not any(t in name.casefold() for t in common_titles_subset_with_space)): + any(business in name.casefold() for business in common_business_types_small) and not any( + t in name.casefold() for t in common_titles_subset_with_space)): return "Business", None, None, None, None person_title, first_name, middle_name, last_name = None, None, None, None @@ -120,9 +112,9 @@ def extract_person_name(name: str) -> (str, str, str, str, str): if not person_title: title_finder = [t for t in title_with_name if t.casefold() in common_titles] person_title = " ".join(title_finder) if len(title_finder) else None - remaining_name = [n for n in title_with_name if - n.casefold() != (person_title or "").casefold() and n.casefold() not in common_titles and - n.casefold() not in [".", "&"]] + remaining_name = [n for n in title_with_name if n.casefold() != ( + person_title or "").casefold() and n.casefold() not in common_titles and n.casefold() not in [".", + "&"]] if len(remaining_name) == 1: first_name = remaining_name[0] @@ -177,20 +169,12 @@ def categorise_title(title: Column) -> Column: category_sir = title.contains("sir") # Priority 12 category_rabbi = title.contains("rabbi") # Priority 13 - return when(category_master, lit("master")) \ - .when(category_ms, lit("ms")) \ - .when(category_mrs, lit("mrs")) \ - .when(category_miss, lit("miss")) \ - .when(category_mr, lit("mr")) \ - .when(category_dame, lit("dame")) \ - .when(category_lady, lit("lady")) \ - .when(category_lord, lit("lord")) \ - .when(category_prof, lit("prof")) \ - .when(category_dr, lit("dr")) \ - 
.when(category_rev, lit("rev")) \ - .when(category_sir, lit("sir")) \ - .when(category_rabbi, lit("rabbi")) \ - .otherwise("unknown") + return when(category_master, lit("master")).when(category_ms, lit("ms")).when(category_mrs, lit("mrs")).when( + category_miss, lit("miss")).when(category_mr, lit("mr")).when(category_dame, lit("dame")).when(category_lady, + lit("lady")).when( + category_lord, lit("lord")).when(category_prof, lit("prof")).when(category_dr, lit("dr")).when(category_rev, + lit("rev")).when( + category_sir, lit("sir")).when(category_rabbi, lit("rabbi")).otherwise("unknown") def standardize_name(name: Column) -> Column: @@ -210,8 +194,7 @@ def standardize_name(name: Column) -> Column: Returns: Column after applying the rules """ - return when(name.isNull(), lit("")) \ - .otherwise( + return when(name.isNull(), lit("")).otherwise( lower(trim(regexp_replace(regexp_replace(regexp_replace(name, "0", "O"), "1", "L"), "^[\\&*./\\\]+", "")))) @@ -280,40 +263,46 @@ def prepare_clean_housing_data(person_reshape: DataFrame, assets_reshape: DataFr assets_reshape = assets_reshape.filter(assets_reshape['assettype'] == 'Dwelling') - person_reshape = person_reshape.filter( - (person_reshape["type"].isin( - ["Secure", "Introductory", "Leasehold (RTB)", "Mense Profit Ac", "Mesne Profit Ac"])) - & (person_reshape["enddate"].isNull()) - & (person_reshape["person_type"].isin(["Tenant", "HouseholdMember"]))) - - housing = person_reshape \ - .join(assets_reshape, person_reshape["assetid"] == assets_reshape["asset_id"], how="left") \ - .join(tenure_reshape, person_reshape["person_id"] == tenure_reshape["person_id"], how="left") \ - .withColumn("source", lit("housing")) \ - .withColumn("extracted_name", extract_name_udf(col("member_fullname"))) \ - .withColumn("title", - when((col("extracted_name.title").isNull()) | - (lower(col("extracted_name.title")) == lower(col("preferredTitle"))), - col("preferredTitle")) - .otherwise(concat_ws(" ", col("preferredTitle"), 
col("extracted_name.title")))) \ - .withColumn("first_name", col("extracted_name.first_name")) \ - .withColumn("middle_name", col("extracted_name.middle_name")) \ - .withColumn("last_name", col("extracted_name.last_name")) \ - .withColumn("dob", to_date(substring(person_reshape["dateofbirth"], 1, 10), format="yyyy-MM-dd")) \ - .withColumn("date_of_birth", # null value represented by 1900-01-01, so converting 1900-01-01 to null - when(col("dob") == to_date(lit("1900-01-01"), "yyyy-MM-dd"), lit(None).cast(DateType())) - .otherwise(col("dob"))) \ - .withColumnRenamed("postcode", "post_code") \ - .withColumnRenamed("addressline1", "address_line_1") \ - .withColumnRenamed("addressline2", "address_line_2") \ - .withColumnRenamed("addressline3", "address_line_3") \ - .withColumnRenamed("addressline4", "address_line_4") \ - .withColumnRenamed("placeOfBirth", "place_of_birth") \ - .filter((length(col("first_name")) > 0) | (length(col("last_name")) > 0)) \ - .select(col("source"), person_reshape["person_id"], person_reshape["uprn"], col("title"), - col("first_name"), col("middle_name"), col("last_name"), col("date_of_birth"), - col("post_code"), col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), person_reshape["type"]) + person_reshape = person_reshape.filter((person_reshape["type"].isin( + ["Secure", "Introductory", "Leasehold (RTB)", "Mense Profit Ac", "Mesne Profit Ac"])) & ( + person_reshape["enddate"].isNull()) & ( + person_reshape["person_type"].isin(["Tenant", "HouseholdMember"]))) + + housing = person_reshape.join(assets_reshape, person_reshape["assetid"] == assets_reshape["asset_id"], + how="left").join(tenure_reshape, + person_reshape["person_id"] == tenure_reshape["person_id"], + how="left").withColumn("source", lit("housing")).withColumn( + "extracted_name", extract_name_udf(col("member_fullname"))).withColumn("title", when( + (col("extracted_name.title").isNull()) | (lower(col("extracted_name.title")) == 
lower(col("preferredTitle"))), + col("preferredTitle")).otherwise( + concat_ws(" ", col("preferredTitle"), col("extracted_name.title")))).withColumn("first_name", + col("extracted_name.first_name")).withColumn( + "middle_name", col("extracted_name.middle_name")).withColumn("last_name", + col("extracted_name.last_name")).withColumn("dob", + to_date( + substring( + person_reshape[ + "dateofbirth"], + 1, + 10), + format="yyyy-MM-dd")).withColumn( + "date_of_birth", # null value represented by 1900-01-01, so converting 1900-01-01 to null + when(col("dob") == to_date(lit("1900-01-01"), "yyyy-MM-dd"), lit(None).cast(DateType())).otherwise( + col("dob"))).withColumnRenamed("postcode", "post_code").withColumnRenamed("addressline1", + "address_line_1").withColumnRenamed( + "addressline2", "address_line_2").withColumnRenamed("addressline3", "address_line_3").withColumnRenamed( + "addressline4", "address_line_4").withColumnRenamed("placeOfBirth", "place_of_birth").filter( + (length(col("first_name")) > 0) | (length(col("last_name")) > 0)).select(col("source"), + person_reshape["person_id"], + person_reshape["uprn"], col("title"), + col("first_name"), col("middle_name"), + col("last_name"), col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + person_reshape["type"]) return housing @@ -344,35 +333,37 @@ def standardize_housing_data(housing_cleaned: DataFrame) -> DataFrame: Returns: A housing DataFrame with all the standard columns listed above. 
""" - housing = housing_cleaned \ - .withColumnRenamed("person_id", "source_id") \ - .withColumnRenamed("type", "source_filter") \ - .withColumn("title", categorise_title(lower(col("title")))) \ - .withColumn("first_name", standardize_name(col("first_name"))) \ - .withColumn("middle_name", standardize_name(col("middle_name"))) \ - .withColumn("last_name", standardize_name(col("last_name"))) \ - .withColumn("name", standardize_full_name(col("first_name"), col("middle_name"), col("last_name"))) \ - .withColumn("post_code", lower(col("post_code"))) \ - .withColumn("address_line_1", standardize_address_line(col("address_line_1"))) \ - .withColumn("address_line_2", standardize_address_line(col("address_line_2"))) \ - .withColumn("address_line_3", standardize_address_line(col("address_line_3"))) \ - .withColumn("address_line_4", standardize_address_line(col("address_line_4"))) \ - .withColumn("full_address", full_address(col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"))) \ - .select(col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), - col("source_filter")) \ - .dropDuplicates(["source_id", "uprn", "date_of_birth"]) + housing = housing_cleaned.withColumnRenamed("person_id", "source_id").withColumnRenamed("type", + "source_filter").withColumn( + "title", categorise_title(lower(col("title")))).withColumn("first_name", + standardize_name(col("first_name"))).withColumn( + "middle_name", standardize_name(col("middle_name"))).withColumn("last_name", + standardize_name(col("last_name"))).withColumn( + "name", standardize_full_name(col("first_name"), col("middle_name"), col("last_name"))).withColumn("post_code", + lower( + col("post_code"))).withColumn( + "address_line_1", 
standardize_address_line(col("address_line_1"))).withColumn("address_line_2", + standardize_address_line( + col("address_line_2"))).withColumn( + "address_line_3", standardize_address_line(col("address_line_3"))).withColumn("address_line_4", + standardize_address_line( + col("address_line_4"))).withColumn( + "full_address", full_address(col("address_line_1"), col("address_line_2"), col("address_line_3"), + col("address_line_4"))).select(col("source"), col("source_id"), col("uprn"), + col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), + col("post_code"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("full_address"), + col("source_filter")).dropDuplicates( + ["source_id", "uprn", "date_of_birth"]) return housing def prepare_clean_council_tax_data(spark: SparkSession, council_tax_account: DataFrame, - council_tax_liability_person: DataFrame, - council_tax_non_liability_person: DataFrame, - council_tax_occupation: DataFrame, - council_tax_property: DataFrame) -> DataFrame: + council_tax_liability_person: DataFrame, council_tax_non_liability_person: DataFrame, + council_tax_occupation: DataFrame, council_tax_property: DataFrame) -> DataFrame: """A function to prepare and clean council tax data. 
Args: spark: SparkSession, @@ -387,67 +378,55 @@ def prepare_clean_council_tax_data(spark: SparkSession, council_tax_account: Dat council_tax_occupation = council_tax_occupation.filter( (col("live_ind") == 1) & (col("vacation_date") > col("import_date"))) - council_tax_property_occupancy = council_tax_occupation \ - .join(council_tax_property, "property_ref") \ - .withColumnRenamed("postcode", "post_code") \ - .withColumnRenamed("addr1", "address_line_1") \ - .withColumnRenamed("addr2", "address_line_2") \ - .withColumnRenamed("addr3", "address_line_3") \ - .withColumnRenamed("addr4", "address_line_4") \ - .select(col("uprn"), col("account_ref"), col("occupation_date"), col("vacation_date"), col("post_code"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), col("address_line_4")) - - liable_types = broadcast( - spark.createDataFrame([ - (0, 'Non-liable'), - (1, 'Joint & Several'), - (2, 'Freeholder'), - (3, 'Leaseholder'), - (4, 'Tenant'), - (5, 'Licencee'), - (6, 'Resident'), - (7, 'Owner'), - (8, 'Assumed'), - (9, 'VOID'), - (10, 'Other'), - (11, 'Suspense'), - (12, 'CTax Payer'), - (-1, '(DATA ERROR)')]).toDF("liability_id", "liability_type")) - - council_tax_lead_person = (council_tax_account - .join(liable_types, col("lead_liab_pos") == col("liability_id")) - .withColumn("source", lit("council_tax")) - .withColumn("sub_source", lit("lead")) - .withColumn("position", lit(0)) - .withColumnRenamed("lead_liab_name", "name") - .withColumn("extracted_name", extract_name_udf(col("name"))) - .select(col("source"), col("account_ref"), col("party_ref"), col("liability_type"), - col("sub_source"), - col("position"), col("extracted_name.*"), col("name"))) - - council_tax_liable_person = council_tax_liability_person \ - .join(liable_types, col("liab_pos") == col("liability_id")) \ - .withColumn("source", lit("council_tax")) \ - .withColumn("sub_source", lit("liable")) \ - .withColumn("position", col("liab_pers_occ")) \ - 
.withColumnRenamed("liab_name", "name") \ - .withColumn("extracted_name", extract_name_udf(col("name"))) \ - .select(col("source"), col("account_ref"), col("party_ref"), col("liability_type"), col("sub_source"), - col("position"), col("extracted_name.*"), col("name")) - - council_tax_non_liable_person = council_tax_non_liability_person \ - .withColumn("source", lit("council_tax")) \ - .withColumn("sub_source", lit("non liable")) \ - .withColumn("liability_type", lit(None).cast(StringType())) \ - .withColumn("position", col("nonliab_occ")) \ - .withColumnRenamed("nonliab_name", "name") \ - .withColumn("extracted_name", extract_name_udf(col("name"))) \ - .select(col("source"), col("account_ref"), col("party_ref"), col("liability_type"), col("sub_source"), - col("position"), col("extracted_name.*"), col("name")) - - council_tax_person = council_tax_lead_person.union(council_tax_liable_person).union(council_tax_non_liable_person) \ - .join(council_tax_property_occupancy, "account_ref") \ - .withColumn("source_filter", lit("council_tax")) + council_tax_property_occupancy = council_tax_occupation.join(council_tax_property, + "property_ref").withColumnRenamed("postcode", + "post_code").withColumnRenamed( + "addr1", "address_line_1").withColumnRenamed("addr2", "address_line_2").withColumnRenamed("addr3", + "address_line_3").withColumnRenamed( + "addr4", "address_line_4").select(col("uprn"), col("account_ref"), col("occupation_date"), col("vacation_date"), + col("post_code"), col("address_line_1"), col("address_line_2"), + col("address_line_3"), col("address_line_4")) + + liable_types = broadcast(spark.createDataFrame( + [(0, 'Non-liable'), (1, 'Joint & Several'), (2, 'Freeholder'), (3, 'Leaseholder'), (4, 'Tenant'), + (5, 'Licencee'), (6, 'Resident'), (7, 'Owner'), (8, 'Assumed'), (9, 'VOID'), (10, 'Other'), + (11, 'Suspense'), (12, 'CTax Payer'), (-1, '(DATA ERROR)')]).toDF("liability_id", "liability_type")) + + council_tax_lead_person = ( + 
council_tax_account.join(liable_types, col("lead_liab_pos") == col("liability_id")).withColumn("source", + lit("council_tax")).withColumn( + "sub_source", lit("lead")).withColumn("position", lit(0)).withColumnRenamed("lead_liab_name", + "name").withColumn( + "extracted_name", extract_name_udf(col("name"))).select(col("source"), col("account_ref"), col("party_ref"), + col("liability_type"), col("sub_source"), + col("position"), col("extracted_name.*"), + col("name"))) + + council_tax_liable_person = council_tax_liability_person.join(liable_types, + col("liab_pos") == col("liability_id")).withColumn( + "source", lit("council_tax")).withColumn("sub_source", lit("liable")).withColumn("position", + col("liab_pers_occ")).withColumnRenamed( + "liab_name", "name").withColumn("extracted_name", extract_name_udf(col("name"))).select(col("source"), + col("account_ref"), + col("party_ref"), + col("liability_type"), + col("sub_source"), + col("position"), + col("extracted_name.*"), + col("name")) + + council_tax_non_liable_person = council_tax_non_liability_person.withColumn("source", + lit("council_tax")).withColumn( + "sub_source", lit("non liable")).withColumn("liability_type", lit(None).cast(StringType())).withColumn( + "position", col("nonliab_occ")).withColumnRenamed("nonliab_name", "name").withColumn("extracted_name", + extract_name_udf( + col("name"))).select( + col("source"), col("account_ref"), col("party_ref"), col("liability_type"), col("sub_source"), col("position"), + col("extracted_name.*"), col("name")) + + council_tax_person = council_tax_lead_person.union(council_tax_liable_person).union( + council_tax_non_liable_person).join(council_tax_property_occupancy, "account_ref").withColumn("source_filter", + lit("council_tax")) return council_tax_person @@ -482,36 +461,38 @@ def standardize_council_tax_data(council_tax_cleaned: DataFrame) -> DataFrame: Returns: A council tax DataFrame with all the standard columns listed above. 
""" - council_tax = council_tax_cleaned \ - .filter(col("entity_type") == "Person") \ - .drop(col("entity_type")) \ - .withColumn("source_id", concat_ws("-", col("account_ref"), col("party_ref"), col("position"))) \ - .withColumn("date_of_birth", lit(None).cast(DateType())) \ - .withColumn("title", categorise_title(lower(col("title")))) \ - .withColumn("first_name", standardize_name(col("first_name"))) \ - .withColumn("middle_name", standardize_name(col("middle_name"))) \ - .withColumn("last_name", standardize_name(col("last_name"))) \ - .withColumn("name", standardize_full_name(col("first_name"), col("middle_name"), col("last_name"))) \ - .withColumn("post_code", lower(col("post_code"))) \ - .withColumn("address_line_1", standardize_address_line(col("address_line_1"))) \ - .withColumn("address_line_2", standardize_address_line(col("address_line_2"))) \ - .withColumn("address_line_3", standardize_address_line(col("address_line_3"))) \ - .withColumn("address_line_4", standardize_address_line(col("address_line_4"))) \ - .withColumn("full_address", full_address(col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"))) \ - .select(col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), - col("source_filter")) \ - .dropDuplicates(["source_id", "uprn"]) + council_tax = council_tax_cleaned.filter(col("entity_type") == "Person").drop(col("entity_type")).withColumn( + "source_id", concat_ws("-", col("account_ref"), col("party_ref"), col("position"))).withColumn("date_of_birth", + lit(None).cast( + DateType())).withColumn( + "title", categorise_title(lower(col("title")))).withColumn("first_name", + standardize_name(col("first_name"))).withColumn( + "middle_name", 
standardize_name(col("middle_name"))).withColumn("last_name", + standardize_name(col("last_name"))).withColumn( + "name", standardize_full_name(col("first_name"), col("middle_name"), col("last_name"))).withColumn("post_code", + lower( + col("post_code"))).withColumn( + "address_line_1", standardize_address_line(col("address_line_1"))).withColumn("address_line_2", + standardize_address_line( + col("address_line_2"))).withColumn( + "address_line_3", standardize_address_line(col("address_line_3"))).withColumn("address_line_4", + standardize_address_line( + col("address_line_4"))).withColumn( + "full_address", full_address(col("address_line_1"), col("address_line_2"), col("address_line_3"), + col("address_line_4"))).select(col("source"), col("source_id"), col("uprn"), + col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), + col("post_code"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("full_address"), + col("source_filter")).dropDuplicates( + ["source_id", "uprn"]) return council_tax -def prepare_clean_housing_benefit_data(hb_member_df: DataFrame, - hb_household_df: DataFrame, - hb_rent_assessment_df: DataFrame, - hb_ctax_assessment_df: DataFrame) -> DataFrame: +def prepare_clean_housing_benefit_data(hb_member_df: DataFrame, hb_household_df: DataFrame, + hb_rent_assessment_df: DataFrame, hb_ctax_assessment_df: DataFrame) -> DataFrame: """A function to prepare and clean housing benefit data. Data comes from multiple sources. This function is specific to this particular data source. For a new data source please add a new function. Args: @@ -522,50 +503,56 @@ def prepare_clean_housing_benefit_data(hb_member_df: DataFrame, Returns: A DataFrame after preparing and cleaning housing benefit data from multiple tables. 
""" - housing_benefit_member = hb_member_df \ - .withColumn("claim_house_id", concat_ws("-", col("claim_id"), col("house_id"))) \ - .withColumn("claim_person_ref", concat_ws("-", col("claim_id"), col("house_id"), col("member_id"))) \ - .withColumn("gender", when(col("gender") == 2, "F").when(col("gender") == 1, "M").otherwise("O")) \ - .withColumn("extracted_name", extract_name_udf(col("name"))) \ - .withColumn("source", lit("housing_benefit")) \ - .withColumn("date_of_birth", to_date(col("birth_date"))) \ - .select(col("source"), col("claim_person_ref"), col("claim_house_id"), col("extracted_name.*"), - col("date_of_birth"), col("gender")) - - housing_benefit_household = hb_household_df \ - .withColumn("claim_house_id", concat_ws("-", col("claim_id"), col("house_id"))) \ - .withColumnRenamed("addr1", "address_line_1") \ - .withColumnRenamed("addr2", "address_line_2") \ - .withColumnRenamed("addr3", "address_line_3") \ - .withColumnRenamed("addr4", "address_line_4") \ - .filter((col("from_date") < col("import_date")) & (col("to_date") > col("import_date"))) \ - .select(col("claim_id"), col("claim_house_id"), col("address_line_1"), col("address_line_2"), - col("address_line_3"), col("address_line_4"), col("post_code"), col("uprn")) - - housing_benefit_rent_assessment = hb_rent_assessment_df \ - .withColumn("source_filter", when((col("dhp_ind") == 1) & (col("type_ind") > 1), "DHP").otherwise("HB")) \ - .filter((col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) - & ((col("type_ind") == 1) | (col("dhp_ind") == 1)) & (col("model_amt") > 0)) \ - .select(col("claim_id"), col("source_filter")) - - housing_benefit_ctax_assessment = hb_ctax_assessment_df \ - .withColumn("source_filter", lit("CTS")) \ - .filter((col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) - & (col("model_amt") > 0) & ((col("type_ind") == 1) | (col("dhp_ind") == 1))) \ - .select(col("claim_id"), col("source_filter")) + housing_benefit_member = 
hb_member_df.withColumn("claim_house_id", + concat_ws("-", col("claim_id"), col("house_id"))).withColumn( + "claim_person_ref", concat_ws("-", col("claim_id"), col("house_id"), col("member_id"))).withColumn("gender", + when( + col("gender") == 2, + "F").when( + col("gender") == 1, + "M").otherwise( + "O")).withColumn( + "extracted_name", extract_name_udf(col("name"))).withColumn("source", lit("housing_benefit")).withColumn( + "date_of_birth", to_date(col("birth_date"))).select(col("source"), col("claim_person_ref"), + col("claim_house_id"), col("extracted_name.*"), + col("date_of_birth"), col("gender")) + + housing_benefit_household = hb_household_df.withColumn("claim_house_id", concat_ws("-", col("claim_id"), + col("house_id"))).withColumnRenamed( + "addr1", "address_line_1").withColumnRenamed("addr2", "address_line_2").withColumnRenamed("addr3", + "address_line_3").withColumnRenamed( + "addr4", "address_line_4").filter( + (col("from_date") < col("import_date")) & (col("to_date") > col("import_date"))).select(col("claim_id"), + col("claim_house_id"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("post_code"), + col("uprn")) + + housing_benefit_rent_assessment = hb_rent_assessment_df.withColumn("source_filter", when( + (col("dhp_ind") == 1) & (col("type_ind") > 1), "DHP").otherwise("HB")).filter( + (col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & ( + (col("type_ind") == 1) | (col("dhp_ind") == 1)) & (col("model_amt") > 0)).select(col("claim_id"), + col("source_filter")) + + housing_benefit_ctax_assessment = hb_ctax_assessment_df.withColumn("source_filter", lit("CTS")).filter( + (col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & (col("model_amt") > 0) & ( + (col("type_ind") == 1) | (col("dhp_ind") == 1))).select(col("claim_id"), col("source_filter")) housing_benefit_rent_ctax = 
housing_benefit_rent_assessment.union(housing_benefit_ctax_assessment) - housing_benefit_household_claims = housing_benefit_household \ - .join(housing_benefit_rent_ctax, ["claim_id"]) + housing_benefit_household_claims = housing_benefit_household.join(housing_benefit_rent_ctax, ["claim_id"]) - housing_benefit_cleaned = housing_benefit_household_claims.join(housing_benefit_member, ["claim_house_id"]) \ - .withColumn("source", lit("housing_benefit")) \ - .withColumn("source_id", col("claim_id")) \ - .select(col("source"), col("claim_person_ref"), col("uprn"), col("title"), - col("first_name"), col("middle_name"), col("last_name"), col("date_of_birth"), col("gender"), - col("post_code"), col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter")) + housing_benefit_cleaned = housing_benefit_household_claims.join(housing_benefit_member, + ["claim_house_id"]).withColumn("source", + lit("housing_benefit")).withColumn( + "source_id", col("claim_id")).select(col("source"), col("claim_person_ref"), col("uprn"), col("title"), + col("first_name"), col("middle_name"), col("last_name"), + col("date_of_birth"), col("gender"), col("post_code"), + col("address_line_1"), col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("source_filter")) return housing_benefit_cleaned @@ -599,25 +586,26 @@ def standardize_housing_benefit_data(housing_benefit_cleaned: DataFrame) -> Data Returns: A housing benefit DataFrame with all the standard columns listed above. 
""" - housing_benefit = housing_benefit_cleaned \ - .withColumn("source_id", col("claim_person_ref")) \ - .withColumn("title", categorise_title(lower(col("title")))) \ - .withColumn("first_name", standardize_name(col("first_name"))) \ - .withColumn("middle_name", standardize_name(col("middle_name"))) \ - .withColumn("last_name", standardize_name(col("last_name"))) \ - .withColumn("name", standardize_full_name(col("first_name"), col("middle_name"), col("last_name"))) \ - .withColumn("post_code", lower(col("post_code"))) \ - .withColumn("address_line_1", standardize_address_line(col("address_line_1"))) \ - .withColumn("address_line_2", standardize_address_line(col("address_line_2"))) \ - .withColumn("address_line_3", standardize_address_line(col("address_line_3"))) \ - .withColumn("address_line_4", standardize_address_line(col("address_line_4"))) \ - .withColumn("full_address", full_address(col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"))) \ - .select(col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), - col("source_filter")) \ - .dropDuplicates(["source_id", "first_name", "last_name", "date_of_birth", "post_code"]) + housing_benefit = housing_benefit_cleaned.withColumn("source_id", col("claim_person_ref")).withColumn("title", + categorise_title( + lower( + col("title")))).withColumn( + "first_name", standardize_name(col("first_name"))).withColumn("middle_name", + standardize_name(col("middle_name"))).withColumn( + "last_name", standardize_name(col("last_name"))).withColumn("name", standardize_full_name(col("first_name"), + col("middle_name"), + col("last_name"))).withColumn( + "post_code", lower(col("post_code"))).withColumn("address_line_1", + 
standardize_address_line(col("address_line_1"))).withColumn( + "address_line_2", standardize_address_line(col("address_line_2"))).withColumn("address_line_3", + standardize_address_line( + col("address_line_3"))).withColumn( + "address_line_4", standardize_address_line(col("address_line_4"))).withColumn("full_address", full_address( + col("address_line_1"), col("address_line_2"), col("address_line_3"), col("address_line_4"))).select( + col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), + col("source_filter")).dropDuplicates(["source_id", "first_name", "last_name", "date_of_birth", "post_code"]) return housing_benefit @@ -631,19 +619,17 @@ def prepare_clean_parking_permit_data(parking_permit_df: DataFrame) -> DataFrame A DataFrame after preparing and cleaning parking permit data. 
""" - parking_permit_cleaned = parking_permit_df \ - .withColumn("source", lit("parking_permit")) \ - .withColumn("source_filter", lit("live parking permit")) \ - .withColumn("extracted_name", - extract_name_udf(concat_ws(" ", col("forename_of_applicant"), col("surname_of_applicant")))) \ - .withColumn("date_of_birth", to_date(col("date_of_birth_of_applicant"), format="yyyy-MM-dd")) \ - .withColumnRenamed("postcode", "post_code") \ - .withColumnRenamed("email_address_of_applicant", "email") \ - .filter((col("permit_type").isin(["Residents", "Estate Resident"])) & (col("live_permit_flag") == 1)) \ - .select(col("source"), col("permit_reference"), - col("extracted_name.*"), - col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), col("source_filter")) + parking_permit_cleaned = parking_permit_df.withColumn("source", lit("parking_permit")).withColumn("source_filter", + lit("live parking permit")).withColumn( + "extracted_name", + extract_name_udf(concat_ws(" ", col("forename_of_applicant"), col("surname_of_applicant")))).withColumn( + "date_of_birth", to_date(col("date_of_birth_of_applicant"), format="yyyy-MM-dd")).withColumnRenamed("postcode", + "post_code").withColumnRenamed( + "email_address_of_applicant", "email").filter( + (col("permit_type").isin(["Residents", "Estate Resident"])) & (col("live_permit_flag") == 1)).select( + col("source"), col("permit_reference"), col("extracted_name.*"), col("date_of_birth"), col("email"), + col("post_code"), col("uprn"), col("address_line_1"), col("address_line_2"), col("address_line_3"), + col("source_filter")) return parking_permit_cleaned @@ -677,26 +663,25 @@ def standardize_parking_permit_data(parking_permit_cleaned: DataFrame) -> DataFr Returns: A parking permit DataFrame with all the standard columns listed above. 
""" - parking_permit = parking_permit_cleaned \ - .filter(col("entity_type") == "Person") \ - .drop(col("entity_type")) \ - .withColumn("source_id", col("permit_reference")) \ - .withColumn("title", categorise_title(lower(col("title")))) \ - .withColumn("first_name", standardize_name(col("first_name"))) \ - .withColumn("middle_name", standardize_name(col("middle_name"))) \ - .withColumn("last_name", standardize_name(col("last_name"))) \ - .withColumn("name", standardize_full_name(col("first_name"), col("middle_name"), col("last_name"))) \ - .withColumn("post_code", lower(col("post_code"))) \ - .withColumn("address_line_1", standardize_address_line(col("address_line_1"))) \ - .withColumn("address_line_2", standardize_address_line(col("address_line_2"))) \ - .withColumn("address_line_3", standardize_address_line(col("address_line_3"))) \ - .withColumn("address_line_4", lit("").cast(StringType())) \ - .withColumn("full_address", full_address(col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"))) \ - .select(col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), - col("source_filter")) + parking_permit = parking_permit_cleaned.filter(col("entity_type") == "Person").drop(col("entity_type")).withColumn( + "source_id", col("permit_reference")).withColumn("title", categorise_title(lower(col("title")))).withColumn( + "first_name", standardize_name(col("first_name"))).withColumn("middle_name", + standardize_name(col("middle_name"))).withColumn( + "last_name", standardize_name(col("last_name"))).withColumn("name", standardize_full_name(col("first_name"), + col("middle_name"), + col("last_name"))).withColumn( + "post_code", lower(col("post_code"))).withColumn("address_line_1", + 
standardize_address_line(col("address_line_1"))).withColumn( + "address_line_2", standardize_address_line(col("address_line_2"))).withColumn("address_line_3", + standardize_address_line( + col("address_line_3"))).withColumn( + "address_line_4", lit("").cast(StringType())).withColumn("full_address", full_address(col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"))).select( + col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), col("source_filter")) return parking_permit @@ -714,68 +699,58 @@ def prepare_clean_schools_admissions_data(schools_admissions_df: DataFrame) -> D address_cols = ["address_line_1", "address_line_2", "address_line_3", "address_line_4"] - schools_admissions_cleaned = schools_admissions_df \ - .withColumn("source", lit("schools_admission")) \ - .withColumn("source_id", col("child_id")) \ - .withColumn("first_name", split(schools_admissions_df["contact_forename"], ' ').getItem(0)) \ - .withColumn("middle_name", split(schools_admissions_df["contact_forename"], ' ').getItem(1)) \ - .withColumn("last_name", col("contact_surname")) \ - .withColumn("name", regexp_replace(concat_ws(" ", col("first_name"), col("middle_name"), - col("last_name")), r"\s+", " ")) \ - .withColumn("date_of_birth", lit("")) \ - .withColumnRenamed("first_lLine", "address_line_1") \ - .withColumnRenamed("second_line", "address_line_2") \ - .withColumnRenamed("third_line", "address_line_3") \ - .withColumnRenamed("town", "address_line_4") \ - .withColumn("source_filter", lit("school admissions")) \ - .select(col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - 
col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter")) + schools_admissions_cleaned = schools_admissions_df.withColumn("source", lit("schools_admission")).withColumn( + "source_id", col("child_id")).withColumn("first_name", + split(schools_admissions_df["contact_forename"], ' ').getItem( + 0)).withColumn("middle_name", + split(schools_admissions_df["contact_forename"], + ' ').getItem(1)).withColumn("last_name", + col("contact_surname")).withColumn( + "name", regexp_replace(concat_ws(" ", col("first_name"), col("middle_name"), col("last_name")), r"\s+", + " ")).withColumn("date_of_birth", lit("")).withColumnRenamed("first_lLine", + "address_line_1").withColumnRenamed( + "second_line", "address_line_2").withColumnRenamed("third_line", "address_line_3").withColumnRenamed("town", + "address_line_4").withColumn( + "source_filter", lit("school admissions")).select(col("source"), col("source_id"), col("title"), + col("first_name"), col("middle_name"), col("last_name"), + col("name"), col("date_of_birth"), col("email"), + col("post_code"), col("uprn"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("source_filter")) # create a zip of address line arrays, sorted in the order of not null (False), column order - schools_admissions_cleaned = schools_admissions_cleaned.select( - col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter"), - array_sort( - arrays_zip( - array([col(c).isNull() for c in address_cols]), - array([lit(i) for i in range(4)]), - array([col(c) for c in address_cols]) - ) - ).alias('address_sorted')) + schools_admissions_cleaned = schools_admissions_cleaned.select(col("source"), col("source_id"), 
col("title"), + col("first_name"), col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), + col("post_code"), col("uprn"), col("address_line_1"), col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("source_filter"), array_sort( + arrays_zip(array([col(c).isNull() for c in address_cols]), array([lit(i) for i in range(4)]), + array([col(c) for c in address_cols]))).alias('address_sorted')) # disaggregate address_sorted arrays into columns - schools_admissions_cleaned = schools_admissions_cleaned.select( - col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("source_filter"), + schools_admissions_cleaned = schools_admissions_cleaned.select(col("source"), col("source_id"), col("title"), + col("first_name"), col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), + col("post_code"), col("uprn"), col("source_filter"), *[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) # rejig address lines - schools_admissions_cleaned = schools_admissions_cleaned \ - .withColumn("address_line_1", when(col("address_line_1").rlike(r"\d+$") - & col("address_line_2").rlike(r"^[A-Za-z]"), - concat_ws(" ", col("address_line_1"), col("address_line_2"))) - .otherwise(col("address_line_1"))) \ - .withColumn("address_line_2", when(col("address_line_1").contains(col("address_line_2")), - col("address_line_3")) - .otherwise(concat_ws(" ", col("address_line_2"), col("address_line_3")))) \ - .withColumn("address_line_2", when(col("address_line_2").rlike(r"\d+$"), - concat_ws(" ", col("address_line_2"), col("address_line_4"))) - .otherwise(col("address_line_2"))) \ - .withColumn("address_line_3", when(col("address_line_2").contains(col("address_line_3")), lit("london"))) \ - .withColumn("address_line_2", when(col("address_line_2").isNull(), lit("hackney")) - 
.otherwise(col("address_line_2"))) \ - .withColumn("address_line_3", when(col("address_line_3").isNull(), lit("london")) - .otherwise(col("address_line_3"))) \ - .withColumn("address_line_4", lit("")) \ - .select(col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter")) + schools_admissions_cleaned = schools_admissions_cleaned.withColumn("address_line_1", when( + col("address_line_1").rlike(r"\d+$") & col("address_line_2").rlike(r"^[A-Za-z]"), + concat_ws(" ", col("address_line_1"), col("address_line_2"))).otherwise(col("address_line_1"))).withColumn( + "address_line_2", when(col("address_line_1").contains(col("address_line_2")), col("address_line_3")).otherwise( + concat_ws(" ", col("address_line_2"), col("address_line_3")))).withColumn("address_line_2", when( + col("address_line_2").rlike(r"\d+$"), concat_ws(" ", col("address_line_2"), col("address_line_4"))).otherwise( + col("address_line_2"))).withColumn("address_line_3", when(col("address_line_2").contains(col("address_line_3")), + lit("london"))).withColumn("address_line_2", when( + col("address_line_2").isNull(), lit("hackney")).otherwise(col("address_line_2"))).withColumn("address_line_3", + when( + col("address_line_3").isNull(), + lit("london")).otherwise( + col("address_line_3"))).withColumn( + "address_line_4", lit("")).select(col("source"), col("source_id"), col("title"), col("first_name"), + col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), + col("email"), col("post_code"), col("uprn"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), col("address_line_4"), + col("source_filter")) return schools_admissions_cleaned @@ -815,26 +790,29 @@ def standardize_schools_admissions_data(schools_admissions_cleaned: DataFrame) - 
A schools admissions DataFrame with all the standard column listed above. """ - schools_admissions = schools_admissions_cleaned \ - .withColumn("source_id", col("source_id")) \ - .withColumn("title", categorise_title(lower(trim(col("title"))))) \ - .withColumn("first_name", standardize_name(trim(col("first_name")))) \ - .withColumn("middle_name", standardize_name(trim(col("middle_name")))) \ - .withColumn("last_name", standardize_name(trim(col("last_name")))) \ - .withColumn("name", standardize_name(trim(col("name")))) \ - .withColumn("post_code", lower(trim(col("post_code")))) \ - .withColumn("address_line_1", standardize_address_line(trim(col("address_line_1")))) \ - .withColumn("address_line_2", standardize_address_line(trim(col("address_line_2")))) \ - .withColumn("address_line_3", standardize_address_line(trim(col("address_line_3")))) \ - .withColumn("address_line_4", standardize_address_line(trim(col("address_line_4")))) \ - .withColumn("full_address1", full_address(trim(col("address_line_1")), trim(col("address_line_2")), - trim(col("address_line_3")), - trim(col("address_line_4")))) \ - .withColumn("full_address", regexp_replace(col("full_address1"), r"\s+", " ")) \ - .select(col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), - col("full_address"), col("source_filter")) + schools_admissions = schools_admissions_cleaned.withColumn("source_id", col("source_id")).withColumn("title", + categorise_title( + lower(trim( + col("title"))))).withColumn( + "first_name", standardize_name(trim(col("first_name")))).withColumn("middle_name", standardize_name( + trim(col("middle_name")))).withColumn("last_name", standardize_name(trim(col("last_name")))).withColumn("name", + standardize_name( + trim( + col("name")))).withColumn( + "post_code", 
lower(trim(col("post_code")))).withColumn("address_line_1", standardize_address_line( + trim(col("address_line_1")))).withColumn("address_line_2", + standardize_address_line(trim(col("address_line_2")))).withColumn( + "address_line_3", standardize_address_line(trim(col("address_line_3")))).withColumn("address_line_4", + standardize_address_line( + trim( + col("address_line_4")))).withColumn( + "full_address1", + full_address(trim(col("address_line_1")), trim(col("address_line_2")), trim(col("address_line_3")), + trim(col("address_line_4")))).withColumn("full_address", + regexp_replace(col("full_address1"), r"\s+", " ")).select( + col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), col("source_filter")) return schools_admissions @@ -852,70 +830,62 @@ def prepare_clean_freedom_pass_admissions_data(freedom_df: DataFrame) -> DataFra address_cols = ["address_line_1", "address_line_2", "address_line_3", "address_line_4"] - freedom_cleaned = freedom_df \ - .withColumn("source", lit("freedom_passes")) \ - .withColumn("source_id", col("applicantid")) \ - .withColumn("first_name", col("forename")) \ - .withColumn("middle_name", lit("")) \ - .withColumn("last_name", col("surname")) \ - .withColumn("name", regexp_replace(concat_ws(" ", col("first_name"), col("last_name")), r"\s+", " ")) \ - .withColumnRenamed("house_name_number", "address_line_1") \ - .withColumnRenamed("building_name", "address_line_2") \ - .withColumnRenamed("street", "address_line_3") \ - .withColumnRenamed("district", "address_line_4") \ - .withColumnRenamed("postcode", "post_code") \ - .withColumnRenamed("email_address", "email") \ - .withColumn("date_of_birth", to_date(col("date_of_birth"), format="dd/MM/yyyy")) \ - .withColumn("uprn", lit("")) \ - 
.withColumn("source_filter", lit("freedom_passes_2024")) \ - .select(col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter")) + freedom_cleaned = freedom_df.withColumn("source", lit("freedom_passes")).withColumn("source_id", + col("applicantid")).withColumn( + "first_name", col("forename")).withColumn("middle_name", lit("")).withColumn("last_name", + col("surname")).withColumn("name", + regexp_replace( + concat_ws( + " ", + col("first_name"), + col("last_name")), + r"\s+", + " ")).withColumnRenamed( + "house_name_number", "address_line_1").withColumnRenamed("building_name", "address_line_2").withColumnRenamed( + "street", "address_line_3").withColumnRenamed("district", "address_line_4").withColumnRenamed("postcode", + "post_code").withColumnRenamed( + "email_address", "email").withColumn("date_of_birth", + to_date(col("date_of_birth"), format="dd/MM/yyyy")).withColumn("uprn", + lit("")).withColumn( + "source_filter", lit("freedom_passes_2024")).select(col("source"), col("source_id"), col("title"), + col("first_name"), col("middle_name"), col("last_name"), + col("name"), col("date_of_birth"), col("email"), + col("post_code"), col("uprn"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("source_filter")) # create a zip of address line arrays, sorted in the order of not null (False), column order - freedom_cleaned = freedom_cleaned.select( - col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter"), - array_sort( - arrays_zip( - 
array([col(c).isNull() for c in address_cols]), - array([lit(i) for i in range(4)]), - array([col(c) for c in address_cols]) - ) - ).alias('address_sorted')) + freedom_cleaned = freedom_cleaned.select(col("source"), col("source_id"), col("title"), col("first_name"), + col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), + col("uprn"), col("address_line_1"), col("address_line_2"), col("address_line_3"), col("address_line_4"), + col("source_filter"), array_sort( + arrays_zip(array([col(c).isNull() for c in address_cols]), array([lit(i) for i in range(4)]), + array([col(c) for c in address_cols]))).alias('address_sorted')) # disaggregate address_sorted arrays into columns - freedom_cleaned = freedom_cleaned.select( - col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("source_filter"), - *[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) + freedom_cleaned = freedom_cleaned.select(col("source"), col("source_id"), col("title"), col("first_name"), + col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), + col("uprn"), col("source_filter"), *[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) # rejig address lines - freedom_cleaned = freedom_cleaned \ - .withColumn("address_line_1", when(col("address_line_1").rlike(r"\d+[a-z]$") - & col("address_line_2").rlike(r"^[A-Za-z]"), - concat_ws(" ", col("address_line_1"), col("address_line_2"))) - .otherwise(col("address_line_1"))) \ - .withColumn("address_line_2", when(col("address_line_1").contains(col("address_line_2")), - col("address_line_3")) - .otherwise(concat_ws(" ", col("address_line_2"), col("address_line_3")))) \ - .withColumn("address_line_2", when(col("address_line_2").rlike(r"\d+$"), - concat_ws(" ", col("address_line_2"), 
col("address_line_4"))) - .otherwise(col("address_line_2"))) \ - .withColumn("address_line_3", when(col("address_line_2").contains(col("address_line_3")), lit("london"))) \ - .withColumn("address_line_2", when(col("address_line_2").isNull(), lit("hackney")) - .otherwise(col("address_line_2"))) \ - .withColumn("address_line_3", when(col("address_line_3").isNull(), lit("london")) - .otherwise(col("address_line_3"))) \ - .withColumn("address_line_4", lit("")) \ - .select(col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter")) + freedom_cleaned = freedom_cleaned.withColumn("address_line_1", when( + col("address_line_1").rlike(r"\d+[a-z]$") & col("address_line_2").rlike(r"^[A-Za-z]"), + concat_ws(" ", col("address_line_1"), col("address_line_2"))).otherwise(col("address_line_1"))).withColumn( + "address_line_2", when(col("address_line_1").contains(col("address_line_2")), col("address_line_3")).otherwise( + concat_ws(" ", col("address_line_2"), col("address_line_3")))).withColumn("address_line_2", when( + col("address_line_2").rlike(r"\d+$"), concat_ws(" ", col("address_line_2"), col("address_line_4"))).otherwise( + col("address_line_2"))).withColumn("address_line_3", when(col("address_line_2").contains(col("address_line_3")), + lit("london"))).withColumn("address_line_2", when( + col("address_line_2").isNull(), lit("hackney")).otherwise(col("address_line_2"))).withColumn("address_line_3", + when( + col("address_line_3").isNull(), + lit("london")).otherwise( + col("address_line_3"))).withColumn( + "address_line_4", lit("")).select(col("source"), col("source_id"), col("title"), col("first_name"), + col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), + col("email"), col("post_code"), col("uprn"), 
col("address_line_1"), + col("address_line_2"), col("address_line_3"), col("address_line_4"), + col("source_filter")) return freedom_cleaned @@ -955,26 +925,37 @@ def standardize_freedom_pass_data(freedom_cleaned: DataFrame) -> DataFrame: freedom_passes (Dataframe): Freedom pass dataframe with all the standardised columns listed above. """ - freedom_passes = freedom_cleaned \ - .withColumn("source_id", col("source_id")) \ - .withColumn("title", categorise_title(lower(trim(col("title"))))) \ - .withColumn("first_name", standardize_name(trim(col("first_name")))) \ - .withColumn("middle_name", standardize_name(trim(col("middle_name")))) \ - .withColumn("last_name", standardize_name(trim(col("last_name")))) \ - .withColumn("name", standardize_name(trim(col("name")))) \ - .withColumn("post_code", lower(trim(col("post_code")))) \ - .withColumn("address_line_1", standardize_address_line(trim(col("address_line_1")))) \ - .withColumn("address_line_2", standardize_address_line(trim(col("address_line_2")))) \ - .withColumn("address_line_3", standardize_address_line(trim(col("address_line_3")))) \ - .withColumn("address_line_4", standardize_address_line(trim(col("address_line_4")))) \ - .withColumn("full_address1", full_address(trim(col("address_line_1")), trim(col("address_line_2")), - trim(col("address_line_3")), - trim(col("address_line_4")))) \ - .withColumn("full_address", regexp_replace(col("full_address1"), r"\s+", " ")) \ - .select(col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), - col("full_address"), col("source_filter")) + freedom_passes = freedom_cleaned.withColumn("source_id", col("source_id")).withColumn("title", categorise_title( + lower(trim(col("title"))))).withColumn("first_name", standardize_name(trim(col("first_name")))).withColumn( + 
"middle_name", standardize_name(trim(col("middle_name")))).withColumn("last_name", standardize_name( + trim(col("last_name")))).withColumn("name", standardize_name(trim(col("name")))).withColumn("post_code", lower( + trim(col("post_code")))).withColumn("address_line_1", + standardize_address_line(trim(col("address_line_1")))).withColumn( + "address_line_2", standardize_address_line(trim(col("address_line_2")))).withColumn("address_line_3", + standardize_address_line( + trim( + col("address_line_3")))).withColumn( + "address_line_4", standardize_address_line(trim(col("address_line_4")))).withColumn("full_address1", + full_address(trim( + col("address_line_1")), + trim( + col("address_line_2")), + trim( + col("address_line_3")), + trim( + col("address_line_4")))).withColumn( + "full_address", regexp_replace(col("full_address1"), r"\s+", " ")).select(col("source"), col("source_id"), + col("uprn"), col("title"), + col("first_name"), col("middle_name"), + col("last_name"), col("name"), + col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("full_address"), + col("source_filter")) return freedom_passes @@ -991,48 +972,36 @@ def prepare_clean_electoral_register_data(electoral_register_df: DataFrame) -> D address_cols = ["address_line_1", "address_line_2", "address_line_3", "address_line_4"] - electoral_register_cleaned = electoral_register_df \ - .withColumn("source", lit("electoral_register")) \ - .withColumn("source_id", col("elector_id")) \ - .withColumn("first_name", split(electoral_register_df["elector_forename"], ' ').getItem(0)) \ - .withColumn("middle_name", col("elector_middle_name")) \ - .withColumn("last_name", col("elector_surname")) \ - .withColumn("name", regexp_replace(concat_ws(" ", col("first_name"), col("middle_name"), - col("last_name")), r"\s+", " ")) \ - .withColumn("date_of_birth", to_date(col("elector_dob"), format="yyyy-MM-dd")) \ - 
.withColumnRenamed("property_address_1", "address_line_1") \ - .withColumnRenamed("property_address_2", "address_line_2") \ - .withColumnRenamed("property_address_3", "address_line_3") \ - .withColumnRenamed("property_address_4", "address_line_4") \ - .withColumnRenamed("property_post_code", "post_code") \ - .withColumnRenamed("property_urn", "uprn") \ - .withColumn("email", lit("")) \ - .withColumn("title", lit("")) \ - .withColumn("source_filter", lit("electoral register jun23")) \ - .select(col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter")) + electoral_register_cleaned = electoral_register_df.withColumn("source", lit("electoral_register")).withColumn( + "source_id", col("elector_id")).withColumn("first_name", + split(electoral_register_df["elector_forename"], ' ').getItem( + 0)).withColumn("middle_name", + col("elector_middle_name")).withColumn( + "last_name", col("elector_surname")).withColumn("name", regexp_replace( + concat_ws(" ", col("first_name"), col("middle_name"), col("last_name")), r"\s+", " ")).withColumn( + "date_of_birth", to_date(col("elector_dob"), format="yyyy-MM-dd")).withColumnRenamed("property_address_1", + "address_line_1").withColumnRenamed( + "property_address_2", "address_line_2").withColumnRenamed("property_address_3", + "address_line_3").withColumnRenamed( + "property_address_4", "address_line_4").withColumnRenamed("property_post_code", "post_code").withColumnRenamed( + "property_urn", "uprn").withColumn("email", lit("")).withColumn("title", lit("")).withColumn("source_filter", + lit("electoral register jun23")).select( + col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), col("last_name"), + col("name"), col("date_of_birth"), col("email"), 
col("post_code"), col("uprn"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), col("address_line_4"), col("source_filter")) # create a zip of address line arrays, sorted in the order of not null (False), column order - electoral_register_cleaned = electoral_register_cleaned.select( - col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter"), - array_sort( - arrays_zip( - array([col(c).isNull() for c in address_cols]), - array([lit(i) for i in range(4)]), - array([col(c) for c in address_cols]) - ) - ).alias('address_sorted')) + electoral_register_cleaned = electoral_register_cleaned.select(col("source"), col("source_id"), col("title"), + col("first_name"), col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), + col("post_code"), col("uprn"), col("address_line_1"), col("address_line_2"), col("address_line_3"), + col("address_line_4"), col("source_filter"), array_sort( + arrays_zip(array([col(c).isNull() for c in address_cols]), array([lit(i) for i in range(4)]), + array([col(c) for c in address_cols]))).alias('address_sorted')) # disaggregate address_sorted arrays into columns - electoral_register_cleaned = electoral_register_cleaned.select( - col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), - col("source_filter"), + electoral_register_cleaned = electoral_register_cleaned.select(col("source"), col("source_id"), col("title"), + col("first_name"), col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), + col("post_code"), col("uprn"), col("source_filter"), 
*[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) return electoral_register_cleaned @@ -1073,26 +1042,29 @@ def standardize_electoral_register_data(electoral_register_cleaned: DataFrame) - A electoral_register DataFrame with all the standard column listed above. """ - electoral_register = electoral_register_cleaned \ - .withColumn("source_id", col("source_id")) \ - .withColumn("title", categorise_title(lower(trim(col("title"))))) \ - .withColumn("first_name", standardize_name(trim(col("first_name")))) \ - .withColumn("middle_name", standardize_name(trim(col("middle_name")))) \ - .withColumn("last_name", standardize_name(trim(col("last_name")))) \ - .withColumn("name", standardize_name(trim(col("name")))) \ - .withColumn("post_code", lower(trim(col("post_code")))) \ - .withColumn("address_line_1", standardize_address_line(trim(col("address_line_1")))) \ - .withColumn("address_line_2", standardize_address_line(trim(col("address_line_2")))) \ - .withColumn("address_line_3", standardize_address_line(trim(col("address_line_3")))) \ - .withColumn("address_line_4", standardize_address_line(trim(col("address_line_4")))) \ - .withColumn("full_address1", full_address(trim(col("address_line_1")), trim(col("address_line_2")), - trim(col("address_line_3")), - trim(col("address_line_4")))) \ - .withColumn("full_address", regexp_replace(col("full_address1"), r"\s+", " ")) \ - .select(col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), - col("full_address"), col("source_filter")) + electoral_register = electoral_register_cleaned.withColumn("source_id", col("source_id")).withColumn("title", + categorise_title( + lower(trim( + col("title"))))).withColumn( + "first_name", standardize_name(trim(col("first_name")))).withColumn("middle_name", 
standardize_name( + trim(col("middle_name")))).withColumn("last_name", standardize_name(trim(col("last_name")))).withColumn("name", + standardize_name( + trim( + col("name")))).withColumn( + "post_code", lower(trim(col("post_code")))).withColumn("address_line_1", standardize_address_line( + trim(col("address_line_1")))).withColumn("address_line_2", + standardize_address_line(trim(col("address_line_2")))).withColumn( + "address_line_3", standardize_address_line(trim(col("address_line_3")))).withColumn("address_line_4", + standardize_address_line( + trim( + col("address_line_4")))).withColumn( + "full_address1", + full_address(trim(col("address_line_1")), trim(col("address_line_2")), trim(col("address_line_3")), + trim(col("address_line_4")))).withColumn("full_address", + regexp_replace(col("full_address1"), r"\s+", " ")).select( + col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), col("source_filter")) return electoral_register @@ -1110,15 +1082,11 @@ def remove_deceased(df: DataFrame) -> DataFrame: Returns: A DataFrame after removing all the deceased persons. 
""" - deceased_filter_cond = (lower(col("title")).contains("(deceased)") | - lower(col("title")).contains("executor") | - lower(col("title")).contains("exor") | - lower(col("title")).contains("rep") | - lower(col("title")).contains(" of") | - lower(col("title")).contains("of ") | - lower(col("title")).contains("the") | - lower(col("title")).contains("pe") | - lower(col("title")).contains("other")) + deceased_filter_cond = ( + lower(col("title")).contains("(deceased)") | lower(col("title")).contains("executor") | lower( + col("title")).contains("exor") | lower(col("title")).contains("rep") | lower(col("title")).contains( + " of") | lower(col("title")).contains("of ") | lower(col("title")).contains("the") | lower( + col("title")).contains("pe") | lower(col("title")).contains("other")) return df.filter(~deceased_filter_cond) @@ -1138,21 +1106,20 @@ def generate_possible_matches(df: DataFrame) -> DataFrame: """ partitions = 5 - df_a = df.select(*[col(c).alias(f"a_{c}") for c in df.columns]) \ - .withColumn("first_name_soundex", soundex(col("a_first_name"))) \ - .withColumn("last_name_soundex", soundex(col("a_last_name"))) \ - .repartition(partitions, col("first_name_soundex"), col("last_name_soundex")) + df_a = df.select(*[col(c).alias(f"a_{c}") for c in df.columns]).withColumn("first_name_soundex", + soundex(col("a_first_name"))).withColumn( + "last_name_soundex", soundex(col("a_last_name"))).repartition(partitions, col("first_name_soundex"), + col("last_name_soundex")) - df_b = df.select(*[col(c).alias(f"b_{c}") for c in df.columns]) \ - .withColumn("first_name_soundex", soundex(col("b_first_name"))) \ - .withColumn("last_name_soundex", soundex(col("b_last_name"))) \ - .repartition(partitions, col("first_name_soundex"), col("last_name_soundex")) + df_b = df.select(*[col(c).alias(f"b_{c}") for c in df.columns]).withColumn("first_name_soundex", + soundex(col("b_first_name"))).withColumn( + "last_name_soundex", soundex(col("b_last_name"))).repartition(partitions, 
col("first_name_soundex"), + col("last_name_soundex")) - return df_a.join(df_b, - (df_a["a_source_id"] != df_b["b_source_id"]) & - (df_a["first_name_soundex"] == df_b["first_name_soundex"]) & - (df_a["last_name_soundex"] == df_b["last_name_soundex"])) \ - .drop(*["first_name_soundex", "last_name_soundex"]) + return df_a.join(df_b, (df_a["a_source_id"] != df_b["b_source_id"]) & ( + df_a["first_name_soundex"] == df_b["first_name_soundex"]) & ( + df_a["last_name_soundex"] == df_b["last_name_soundex"])).drop( + *["first_name_soundex", "last_name_soundex"]) def automatically_label_data(df: DataFrame) -> DataFrame: @@ -1165,14 +1132,10 @@ def automatically_label_data(df: DataFrame) -> DataFrame: Returns: A DataFrame with column auto_labels. """ - return df.withColumn("auto_labels", - when((col("a_source_id") == col("b_source_id")) | ( - (col("a_first_name") == col("b_first_name")) & - (col("a_last_name") == col("b_last_name")) & - (col("a_date_of_birth") == col("b_date_of_birth")) & - (col("a_uprn") == col("b_uprn")) & - (col("a_post_code") == col("b_post_code"))), lit(True)) - .otherwise(lit(None).cast(BooleanType()))) + return df.withColumn("auto_labels", when((col("a_source_id") == col("b_source_id")) | ( + (col("a_first_name") == col("b_first_name")) & (col("a_last_name") == col("b_last_name")) & ( + col("a_date_of_birth") == col("b_date_of_birth")) & (col("a_uprn") == col("b_uprn")) & ( + col("a_post_code") == col("b_post_code"))), lit(True)).otherwise(lit(None).cast(BooleanType()))) @pandas_udf(features_schema) @@ -1226,11 +1189,9 @@ def generate_features(input_df: pd.DataFrame) -> pd.DataFrame: input_df["full_address_similarity"] = input_df.apply( lambda x: similarity_algo.sim(x["a_full_address"], x["b_full_address"]), axis=1) - return input_df.drop([ - "a_first_name", "b_first_name", "a_last_name", "b_last_name", - "a_name", "b_name", - "a_address_line_1", "b_address_line_1", "a_address_line_2", "b_address_line_2", - "a_full_address", "b_full_address"], axis=1) + 
return input_df.drop( + ["a_first_name", "b_first_name", "a_last_name", "b_last_name", "a_name", "b_name", "a_address_line_1", + "b_address_line_1", "a_address_line_2", "b_address_line_2", "a_full_address", "b_full_address"], axis=1) def feature_engineering(df: DataFrame) -> DataFrame: @@ -1262,38 +1223,27 @@ def feature_engineering(df: DataFrame) -> DataFrame: match = lit("match") non_match = lit("non-match") unknown = lit("unknown") - features_df = df \ - .withColumn("uprn_same", - when(col("a_uprn") == col("b_uprn"), match) - .when(col("a_uprn") != col("b_uprn"), non_match) - .otherwise(unknown)) \ - .withColumn("title_same", - when(col("a_title") == col("b_title"), match) - .when(col("a_title") != col("b_title"), non_match) - .otherwise(unknown)) \ - .withColumn("date_of_birth_same", - when(col("a_date_of_birth") == col("b_date_of_birth"), match) - .when(col("a_date_of_birth") != col("b_date_of_birth"), non_match) - .otherwise(unknown)) \ - .withColumn("similarity_features", - generate_features(struct( - col("a_first_name"), col("b_first_name"), - col("a_middle_name"), col("b_middle_name"), - col("a_last_name"), col("b_last_name"), - col("a_name"), col("b_name"), - col("a_address_line_1"), col("b_address_line_1"), - col("a_address_line_2"), col("b_address_line_2"), - col("a_full_address"), col("b_full_address")))) \ - .select(col("*"), col("similarity_features.*")).drop("similarity_features") + features_df = df.withColumn("uprn_same", + when(col("a_uprn") == col("b_uprn"), match).when(col("a_uprn") != col("b_uprn"), + non_match).otherwise( + unknown)).withColumn("title_same", + when(col("a_title") == col("b_title"), match).when( + col("a_title") != col("b_title"), non_match).otherwise( + unknown)).withColumn("date_of_birth_same", when( + col("a_date_of_birth") == col("b_date_of_birth"), match).when(col("a_date_of_birth") != col("b_date_of_birth"), + non_match).otherwise(unknown)).withColumn( + "similarity_features", generate_features( + 
struct(col("a_first_name"), col("b_first_name"), col("a_middle_name"), col("b_middle_name"), + col("a_last_name"), col("b_last_name"), col("a_name"), col("b_name"), col("a_address_line_1"), + col("b_address_line_1"), col("a_address_line_2"), col("b_address_line_2"), col("a_full_address"), + col("b_full_address")))).select(col("*"), col("similarity_features.*")).drop("similarity_features") return features_df def evaluation_for_various_metrics(predictions: DataFrame): - metrics = MulticlassClassificationEvaluator(predictionCol="prediction", - labelCol="label", - weightCol="label_confidence_score", - probabilityCol="probability") + metrics = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", + weightCol="label_confidence_score", probabilityCol="probability") accuracy = metrics.evaluate(predictions, {metrics.metricName: "accuracy"}) precision_non_match = metrics.evaluate(predictions, {metrics.metricName: "precisionByLabel", metrics.metricLabel: 0.0}) @@ -1341,12 +1291,9 @@ def train_model(df: DataFrame, model_path: str, test_model: bool, save_model: bo # .addGrid(classifier.regParam, [0.0001, 0.00005, 8e-05, 7e-05, 5e-05]) \ # .addGrid(classifier.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]) \ # .build() - param_grid = ParamGridBuilder() \ - .addGrid(classifier.regParam, [7e-05]) \ - .addGrid(classifier.elasticNetParam, [1.0]) \ - .build() - evaluator = BinaryClassificationEvaluator(labelCol="label", - rawPredictionCol="rawPrediction", + param_grid = ParamGridBuilder().addGrid(classifier.regParam, [7e-05]).addGrid(classifier.elasticNetParam, + [1.0]).build() + evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", weightCol="label_confidence_score") cv = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=42, @@ -1370,10 +1317,8 @@ def train_model(df: DataFrame, model_path: str, test_model: bool, save_model: bo # Fine-tuning the model to maximize 
performance f_measure = training_summary.fMeasureByThreshold max_f_measure = f_measure.groupBy().max("F-Measure").select("max(F-Measure)").head() - best_threshold = f_measure \ - .filter(f_measure["F-Measure"] == max_f_measure["max(F-Measure)"]) \ - .select("threshold") \ - .head()["threshold"] + best_threshold = \ + f_measure.filter(f_measure["F-Measure"] == max_f_measure["max(F-Measure)"]).select("threshold").head()["threshold"] print(f"Best threshold: {best_threshold}") cv_model.bestModel.stages[-1].setThreshold(best_threshold) @@ -1396,10 +1341,11 @@ def train_model(df: DataFrame, model_path: str, test_model: bool, save_model: bo test_prediction.show() print(f'Write predictions to csv...') test_prediction.printSchema() - test_prediction_for_export = test_prediction.withColumn('probability', vector_to_array(col('probability'))) \ - .withColumn('probability_str', concat_ws('probability')) \ - .drop('uprn_vec', 'title_vec', 'date_of_birth_vec', 'features', 'rawPrediction', 'uprn_indexed', - 'title_indexed', 'date_of_birth_indexed', 'probability') + test_prediction_for_export = test_prediction.withColumn('probability', + vector_to_array(col('probability'))).withColumn( + 'probability_str', concat_ws('probability')).drop('uprn_vec', 'title_vec', 'date_of_birth_vec', 'features', + 'rawPrediction', 'uprn_indexed', 'title_indexed', + 'date_of_birth_indexed', 'probability') test_prediction_for_export.write.csv(header=True, path=f"{model_path}/test_predictions") accuracy, precision_non_match, precision_match, recall_non_match, recall_match = evaluation_for_various_metrics( @@ -1426,11 +1372,11 @@ def predict(features_df: DataFrame, model_path: str) -> DataFrame: """ cv_model: CrossValidatorModel = CrossValidatorModel.load(model_path) predictions = cv_model.transform(features_df).withColumn("predicted_label", - when(col("prediction") == 1.0, "match") - .when(col("prediction") == 0.0, "non-match") - .otherwise("unknown")) \ - .drop(*["uprn_indexed", "title_indexed", 
"date_of_birth_indexed", "uprn_vec", "title_vec", - "date_of_birth_vec", "features", "rawPrediction", "probability"]) + when(col("prediction") == 1.0, "match").when( + col("prediction") == 0.0, "non-match").otherwise( + "unknown")).drop( + *["uprn_indexed", "title_indexed", "date_of_birth_indexed", "uprn_vec", "title_vec", "date_of_birth_vec", + "features", "rawPrediction", "probability"]) return predictions @@ -1448,33 +1394,22 @@ def link_all_matched_persons(standard_df: DataFrame, predicted_df: DataFrame) -> """ vertices = standard_df.withColumn("id", col("source_id")) - edges = predicted_df \ - .filter(col("prediction") == 1.0) \ - .withColumn("src", col("a_source_id")) \ - .withColumn("dst", col("b_source_id")) + edges = predicted_df.filter(col("prediction") == 1.0).withColumn("src", col("a_source_id")).withColumn("dst", + col("b_source_id")) person_graph = GraphFrame(vertices, edges).dropIsolatedVertices() connected = person_graph.connectedComponents() - unique_connections = connected \ - .select(col("source"), col("source_id"), col("component").alias("matching_id")) \ - .distinct() - return standard_df \ - .join(unique_connections, ["source", "source_id"]) \ - .orderBy(col("matching_id")) + unique_connections = connected.select(col("source"), col("source_id"), + col("component").alias("matching_id")).distinct() + return standard_df.join(unique_connections, ["source", "source_id"]).orderBy(col("matching_id")) # Extra analysis (for analyst only): if you need to do. 
- # To find how many connection are there - # person_graph.inDegrees.filter(col("inDegree") > 1).orderBy(col("inDegree").desc()).show(truncate=False) + # To find how many connection are there # person_graph.inDegrees.filter(col("inDegree") > 1).orderBy(col("inDegree").desc()).show(truncate=False) - # Graph query using motif to find where person 'a' is connected to person 'b', and person 'b' is also connected to - # person 'a' - # motif = person_graph.find("(a)-[]->(b); (b)-[]->(a)") - # motif.show(truncate=False) + # Graph query using motif to find where person 'a' is connected to person 'b', and person 'b' is also connected to # person 'a' # motif = person_graph.find("(a)-[]->(b); (b)-[]->(a)") # motif.show(truncate=False) - # To count number of triangles i.e. a connected to b, b connected to c and c is connected back to a - # triangle_count = person_graph.triangleCount() - # triangle_count.orderBy(col("count").desc()).show(n=10, truncate=False) + # To count number of triangles i.e. a connected to b, b connected to c and c is connected back to a # triangle_count = person_graph.triangleCount() # triangle_count.orderBy(col("count").desc()).show(n=10, truncate=False) def match_persons(model_path: str, standard_df: DataFrame) -> DataFrame: From 7fcf2abe1e33b1630fe02fe72dd4a4433ace009c Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:41:42 +0100 Subject: [PATCH 06/11] Linting --- .../person_matching_module.py | 160 ++++++++++-------- 1 file changed, 94 insertions(+), 66 deletions(-) diff --git a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index 86a89ebdc..519492b5b 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -16,22 +16,23 @@ from pyspark.ml.functions import vector_to_array from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, 
CrossValidatorModel from pyspark.sql import DataFrame, SparkSession, Column -from pyspark.sql.functions import to_date, col, lit, length, broadcast, udf, when, substring, lower, concat_ws, soundex, \ - regexp_replace, trim, split, struct, arrays_zip, array, array_sort, current_date +from pyspark.sql.functions import (to_date, col, lit, length, broadcast, udf, when, substring, lower, concat_ws, + soundex, \ + regexp_replace, trim, split, struct, arrays_zip, array, array_sort, current_date) from pyspark.sql.pandas.functions import pandas_udf from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType, DoubleType extracted_name_schema = StructType( [StructField("entity_type", StringType(), True), StructField("title", StringType(), True), - StructField("first_name", StringType(), True), StructField("middle_name", StringType(), True), - StructField("last_name", StringType(), True)]) + StructField("first_name", StringType(), True), StructField("middle_name", StringType(), True), + StructField("last_name", StringType(), True)]) features_schema = StructType( [StructField("first_name_similar", BooleanType(), True), StructField("middle_name_similar", BooleanType(), True), - StructField("last_name_similar", BooleanType(), True), StructField("name_similarity", DoubleType(), True), - StructField("address_line_1_similarity", DoubleType(), True), - StructField("address_line_2_similarity", DoubleType(), True), - StructField("full_address_similarity", DoubleType(), True), ]) + StructField("last_name_similar", BooleanType(), True), StructField("name_similarity", DoubleType(), True), + StructField("address_line_1_similarity", DoubleType(), True), + StructField("address_line_2_similarity", DoubleType(), True), + StructField("full_address_similarity", DoubleType(), True), ]) @udf(returnType=extracted_name_schema) @@ -113,8 +114,7 @@ def extract_person_name(name: str) -> (str, str, str, str, str): title_finder = [t for t in title_with_name if t.casefold() in 
common_titles] person_title = " ".join(title_finder) if len(title_finder) else None remaining_name = [n for n in title_with_name if n.casefold() != ( - person_title or "").casefold() and n.casefold() not in common_titles and n.casefold() not in [".", - "&"]] + person_title or "").casefold() and n.casefold() not in common_titles and n.casefold() not in [".", "&"]] if len(remaining_name) == 1: first_name = remaining_name[0] @@ -312,21 +312,25 @@ def standardize_housing_data(housing_cleaned: DataFrame) -> DataFrame: standard names that will be used by various other functions like feature engineering etc.) The DataFrame returned will have the following columns: * source: Source of the data like parking, tax etc. Should be of type string and cannot be blank. - * source_id: Unique ID for reach record. It's ok to have same person with different source_id. Should be of type string and cannot be blank. + * source_id: Unique ID for reach record. It's ok to have same person with different source_id. Should be of type + string and cannot be blank. * uprn: UPRN of the address. Should be of type string and can be blank. * title: Title of the person. Should be of type string and can be blank. * first_name: First name of the person. Should be of type string and can be blank. * middle_name: Middle name of the person. Should be of type string and can be blank. * last_name: Last name of the person. Should be of type string and can be blank. - * name: Concatenation of first, middle and last name after sorting alphabetically of the person. Should be of type string and can be blank. + * name: Concatenation of first, middle and last name after sorting alphabetically of the person. Should be of + type string and can be blank. * date_of_birth: Date of birth of the person. Should be of type Date and can be blank. * post_code: Postal code of the address. Should be of type string and can be blank. * address_line_1: First line of the address. Should be of type string and can be blank. 
* address_line_2: Second line of the address. Should be of type string and can be blank. * address_line_3: Third line of the address. Should be of type string and can be blank. * address_line_4: Fourth line of the address. Should be of type string and can be blank. - * full_address: Concatenation of address line 1, address line 2, address line 3, address line 4 in that order. Should be of type string and can be blank. - * source_filter: A field containing more information on the datasource such as tenancy type; this allows the user to filter the dataset to only + * full_address: Concatenation of address line 1, address line 2, address line 3, address line 4 in that order. + Should be of type string and can be blank. + * source_filter: A field containing more information on the datasource such as tenancy type; this allows the user + to filter the dataset to only include records for certain tenancy types. Args: housing_cleaned: housing DataFrame after preparing and cleaning it. @@ -389,8 +393,8 @@ def prepare_clean_council_tax_data(spark: SparkSession, council_tax_account: Dat liable_types = broadcast(spark.createDataFrame( [(0, 'Non-liable'), (1, 'Joint & Several'), (2, 'Freeholder'), (3, 'Leaseholder'), (4, 'Tenant'), - (5, 'Licencee'), (6, 'Resident'), (7, 'Owner'), (8, 'Assumed'), (9, 'VOID'), (10, 'Other'), - (11, 'Suspense'), (12, 'CTax Payer'), (-1, '(DATA ERROR)')]).toDF("liability_id", "liability_type")) + (5, 'Licencee'), (6, 'Resident'), (7, 'Owner'), (8, 'Assumed'), (9, 'VOID'), (10, 'Other'), (11, 'Suspense'), + (12, 'CTax Payer'), (-1, '(DATA ERROR)')]).toDF("liability_id", "liability_type")) council_tax_lead_person = ( council_tax_account.join(liable_types, col("lead_liab_pos") == col("liability_id")).withColumn("source", @@ -534,12 +538,12 @@ def prepare_clean_housing_benefit_data(hb_member_df: DataFrame, hb_household_df: housing_benefit_rent_assessment = hb_rent_assessment_df.withColumn("source_filter", when( (col("dhp_ind") == 1) & (col("type_ind") 
> 1), "DHP").otherwise("HB")).filter( (col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & ( - (col("type_ind") == 1) | (col("dhp_ind") == 1)) & (col("model_amt") > 0)).select(col("claim_id"), - col("source_filter")) + (col("type_ind") == 1) | (col("dhp_ind") == 1)) & (col("model_amt") > 0)).select(col("claim_id"), + col("source_filter")) housing_benefit_ctax_assessment = hb_ctax_assessment_df.withColumn("source_filter", lit("CTS")).filter( (col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & (col("model_amt") > 0) & ( - (col("type_ind") == 1) | (col("dhp_ind") == 1))).select(col("claim_id"), col("source_filter")) + (col("type_ind") == 1) | (col("dhp_ind") == 1))).select(col("claim_id"), col("source_filter")) housing_benefit_rent_ctax = housing_benefit_rent_assessment.union(housing_benefit_ctax_assessment) @@ -620,7 +624,9 @@ def prepare_clean_parking_permit_data(parking_permit_df: DataFrame) -> DataFrame """ parking_permit_cleaned = parking_permit_df.withColumn("source", lit("parking_permit")).withColumn("source_filter", - lit("live parking permit")).withColumn( + lit("live " + "parking " + "permit")).withColumn( "extracted_name", extract_name_udf(concat_ws(" ", col("forename_of_applicant"), col("surname_of_applicant")))).withColumn( "date_of_birth", to_date(col("date_of_birth_of_applicant"), format="yyyy-MM-dd")).withColumnRenamed("postcode", @@ -720,17 +726,22 @@ def prepare_clean_schools_admissions_data(schools_admissions_df: DataFrame) -> D # create a zip of address line arrays, sorted in the order of not null (False), column order schools_admissions_cleaned = schools_admissions_cleaned.select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), - col("post_code"), col("uprn"), col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter"), array_sort( 
+ col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), + col("email"), col("post_code"), col("uprn"), + col("address_line_1"), col("address_line_2"), + col("address_line_3"), col("address_line_4"), + col("source_filter"), array_sort( arrays_zip(array([col(c).isNull() for c in address_cols]), array([lit(i) for i in range(4)]), - array([col(c) for c in address_cols]))).alias('address_sorted')) + array([col(c) for c in address_cols]))).alias('address_sorted')) # disaggregate address_sorted arrays into columns schools_admissions_cleaned = schools_admissions_cleaned.select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), - col("post_code"), col("uprn"), col("source_filter"), - *[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) + col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), + col("email"), col("post_code"), col("uprn"), + col("source_filter"), *[ + col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) # rejig address lines schools_admissions_cleaned = schools_admissions_cleaned.withColumn("address_line_1", when( @@ -780,7 +791,8 @@ def standardize_schools_admissions_data(schools_admissions_cleaned: DataFrame) - * address_line_4: Fourth line of the address. Should be of type string and can be blank. * full_address: Concatenation of address line 1, address line 2, address line 3, address line 4 in that order. Should be of type string and can be blank. - * source_filter: Field to contain additional information on schools admissions (only contains holding string for now). + * source_filter: Field to contain additional information on schools admissions (only contains holding string for + now). Should be of type string and can be blank. 
Args: @@ -856,16 +868,18 @@ def prepare_clean_freedom_pass_admissions_data(freedom_df: DataFrame) -> DataFra # create a zip of address line arrays, sorted in the order of not null (False), column order freedom_cleaned = freedom_cleaned.select(col("source"), col("source_id"), col("title"), col("first_name"), - col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), - col("uprn"), col("address_line_1"), col("address_line_2"), col("address_line_3"), col("address_line_4"), - col("source_filter"), array_sort( + col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), + col("email"), col("post_code"), col("uprn"), col("address_line_1"), + col("address_line_2"), col("address_line_3"), col("address_line_4"), + col("source_filter"), array_sort( arrays_zip(array([col(c).isNull() for c in address_cols]), array([lit(i) for i in range(4)]), - array([col(c) for c in address_cols]))).alias('address_sorted')) + array([col(c) for c in address_cols]))).alias('address_sorted')) # disaggregate address_sorted arrays into columns freedom_cleaned = freedom_cleaned.select(col("source"), col("source_id"), col("title"), col("first_name"), - col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), - col("uprn"), col("source_filter"), *[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) + col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), + col("email"), col("post_code"), col("uprn"), col("source_filter"), + *[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) # rejig address lines freedom_cleaned = freedom_cleaned.withColumn("address_line_1", when( @@ -915,7 +929,8 @@ def standardize_freedom_pass_data(freedom_cleaned: DataFrame) -> DataFrame: * address_line_4: Fourth line of the address. Should be of type string and can be blank. 
* full_address: Concatenation of address line 1, address line 2, address line 3, address line 4 in that order. Should be of type string and can be blank. - * source_filter: Field to contain additional information on freedom pass dataset e.g year (only contains holding string for now). + * source_filter: Field to contain additional information on freedom pass dataset e.g year (only contains holding + string for now). Should be of type string and can be blank. Args: @@ -938,12 +953,12 @@ def standardize_freedom_pass_data(freedom_cleaned: DataFrame) -> DataFrame: "address_line_4", standardize_address_line(trim(col("address_line_4")))).withColumn("full_address1", full_address(trim( col("address_line_1")), - trim( - col("address_line_2")), - trim( - col("address_line_3")), - trim( - col("address_line_4")))).withColumn( + trim( + col("address_line_2")), + trim( + col("address_line_3")), + trim( + col("address_line_4")))).withColumn( "full_address", regexp_replace(col("full_address1"), r"\s+", " ")).select(col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), @@ -985,24 +1000,31 @@ def prepare_clean_electoral_register_data(electoral_register_df: DataFrame) -> D "address_line_3").withColumnRenamed( "property_address_4", "address_line_4").withColumnRenamed("property_post_code", "post_code").withColumnRenamed( "property_urn", "uprn").withColumn("email", lit("")).withColumn("title", lit("")).withColumn("source_filter", - lit("electoral register jun23")).select( + lit("electoral " + "register " + "jun23")).select( col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), col("address_line_1"), col("address_line_2"), col("address_line_3"), col("address_line_4"), col("source_filter")) # create a zip of address line arrays, sorted in the order of not null (False), column order electoral_register_cleaned = 
electoral_register_cleaned.select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), - col("post_code"), col("uprn"), col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter"), array_sort( + col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), + col("email"), col("post_code"), col("uprn"), + col("address_line_1"), col("address_line_2"), + col("address_line_3"), col("address_line_4"), + col("source_filter"), array_sort( arrays_zip(array([col(c).isNull() for c in address_cols]), array([lit(i) for i in range(4)]), - array([col(c) for c in address_cols]))).alias('address_sorted')) + array([col(c) for c in address_cols]))).alias('address_sorted')) # disaggregate address_sorted arrays into columns electoral_register_cleaned = electoral_register_cleaned.select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), col("email"), - col("post_code"), col("uprn"), col("source_filter"), - *[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) + col("first_name"), col("middle_name"), + col("last_name"), col("name"), col("date_of_birth"), + col("email"), col("post_code"), col("uprn"), + col("source_filter"), *[ + col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) return electoral_register_cleaned @@ -1032,7 +1054,8 @@ def standardize_electoral_register_data(electoral_register_cleaned: DataFrame) - * address_line_4: Fourth line of the address. Should be of type string and can be blank. * full_address: Concatenation of address line 1, address line 2, address line 3, address line 4 in that order. Should be of type string and can be blank. - * source_filter: Field to contain additional information on electoral register (only contains holding string for now). 
+ * source_filter: Field to contain additional information on electoral register (only contains holding string for + now). Should be of type string and can be blank. Args: @@ -1083,10 +1106,10 @@ def remove_deceased(df: DataFrame) -> DataFrame: A DataFrame after removing all the deceased persons. """ deceased_filter_cond = ( - lower(col("title")).contains("(deceased)") | lower(col("title")).contains("executor") | lower( - col("title")).contains("exor") | lower(col("title")).contains("rep") | lower(col("title")).contains( - " of") | lower(col("title")).contains("of ") | lower(col("title")).contains("the") | lower( - col("title")).contains("pe") | lower(col("title")).contains("other")) + lower(col("title")).contains("(deceased)") | lower(col("title")).contains("executor") | lower( + col("title")).contains("exor") | lower(col("title")).contains("rep") | lower(col("title")).contains( + " of") | lower(col("title")).contains("of ") | lower(col("title")).contains("the") | lower( + col("title")).contains("pe") | lower(col("title")).contains("other")) return df.filter(~deceased_filter_cond) @@ -1117,8 +1140,8 @@ def generate_possible_matches(df: DataFrame) -> DataFrame: col("last_name_soundex")) return df_a.join(df_b, (df_a["a_source_id"] != df_b["b_source_id"]) & ( - df_a["first_name_soundex"] == df_b["first_name_soundex"]) & ( - df_a["last_name_soundex"] == df_b["last_name_soundex"])).drop( + df_a["first_name_soundex"] == df_b["first_name_soundex"]) & ( + df_a["last_name_soundex"] == df_b["last_name_soundex"])).drop( *["first_name_soundex", "last_name_soundex"]) @@ -1134,8 +1157,8 @@ def automatically_label_data(df: DataFrame) -> DataFrame: """ return df.withColumn("auto_labels", when((col("a_source_id") == col("b_source_id")) | ( (col("a_first_name") == col("b_first_name")) & (col("a_last_name") == col("b_last_name")) & ( - col("a_date_of_birth") == col("b_date_of_birth")) & (col("a_uprn") == col("b_uprn")) & ( - col("a_post_code") == col("b_post_code"))), 
lit(True)).otherwise(lit(None).cast(BooleanType()))) + col("a_date_of_birth") == col("b_date_of_birth")) & (col("a_uprn") == col("b_uprn")) & ( + col("a_post_code") == col("b_post_code"))), lit(True)).otherwise(lit(None).cast(BooleanType()))) @pandas_udf(features_schema) @@ -1191,7 +1214,7 @@ def generate_features(input_df: pd.DataFrame) -> pd.DataFrame: return input_df.drop( ["a_first_name", "b_first_name", "a_last_name", "b_last_name", "a_name", "b_name", "a_address_line_1", - "b_address_line_1", "a_address_line_2", "b_address_line_2", "a_full_address", "b_full_address"], axis=1) + "b_address_line_1", "a_address_line_2", "b_address_line_2", "a_full_address", "b_full_address"], axis=1) def feature_engineering(df: DataFrame) -> DataFrame: @@ -1234,9 +1257,9 @@ def feature_engineering(df: DataFrame) -> DataFrame: non_match).otherwise(unknown)).withColumn( "similarity_features", generate_features( struct(col("a_first_name"), col("b_first_name"), col("a_middle_name"), col("b_middle_name"), - col("a_last_name"), col("b_last_name"), col("a_name"), col("b_name"), col("a_address_line_1"), - col("b_address_line_1"), col("a_address_line_2"), col("b_address_line_2"), col("a_full_address"), - col("b_full_address")))).select(col("*"), col("similarity_features.*")).drop("similarity_features") + col("a_last_name"), col("b_last_name"), col("a_name"), col("b_name"), col("a_address_line_1"), + col("b_address_line_1"), col("a_address_line_2"), col("b_address_line_2"), col("a_full_address"), + col("b_full_address")))).select(col("*"), col("similarity_features.*")).drop("similarity_features") return features_df @@ -1318,7 +1341,8 @@ def train_model(df: DataFrame, model_path: str, test_model: bool, save_model: bo f_measure = training_summary.fMeasureByThreshold max_f_measure = f_measure.groupBy().max("F-Measure").select("max(F-Measure)").head() best_threshold = \ - f_measure.filter(f_measure["F-Measure"] == max_f_measure["max(F-Measure)"]).select("threshold").head()["threshold"] + 
f_measure.filter(f_measure["F-Measure"] == max_f_measure["max(F-Measure)"]).select("threshold").head()[ + "threshold"] print(f"Best threshold: {best_threshold}") cv_model.bestModel.stages[-1].setThreshold(best_threshold) @@ -1405,11 +1429,15 @@ def link_all_matched_persons(standard_df: DataFrame, predicted_df: DataFrame) -> # Extra analysis (for analyst only): if you need to do. - # To find how many connection are there # person_graph.inDegrees.filter(col("inDegree") > 1).orderBy(col("inDegree").desc()).show(truncate=False) + # To find how many connection are there # person_graph.inDegrees.filter(col("inDegree") > 1).orderBy(col( + # "inDegree").desc()).show(truncate=False) - # Graph query using motif to find where person 'a' is connected to person 'b', and person 'b' is also connected to # person 'a' # motif = person_graph.find("(a)-[]->(b); (b)-[]->(a)") # motif.show(truncate=False) + # Graph query using motif to find where person 'a' is connected to person 'b', and person 'b' is also connected + # to # person 'a' # motif = person_graph.find("(a)-[]->(b); (b)-[]->(a)") # motif.show(truncate=False) - # To count number of triangles i.e. a connected to b, b connected to c and c is connected back to a # triangle_count = person_graph.triangleCount() # triangle_count.orderBy(col("count").desc()).show(n=10, truncate=False) + # To count number of triangles i.e. 
a connected to b, b connected to c and c is connected back to a # + # triangle_count = person_graph.triangleCount() # triangle_count.orderBy(col("count").desc()).show(n=10, + # truncate=False) def match_persons(model_path: str, standard_df: DataFrame) -> DataFrame: From 260b77de8cec3078c45db2abab42c72ba3b6a508 Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:53:52 +0100 Subject: [PATCH 07/11] Linting --- .../person_matching_module.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index 519492b5b..6b78937e9 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -17,8 +17,8 @@ from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel from pyspark.sql import DataFrame, SparkSession, Column from pyspark.sql.functions import (to_date, col, lit, length, broadcast, udf, when, substring, lower, concat_ws, - soundex, \ - regexp_replace, trim, split, struct, arrays_zip, array, array_sort, current_date) + soundex, regexp_replace, trim, split, struct, arrays_zip, array, array_sort, + current_date) from pyspark.sql.pandas.functions import pandas_udf from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType, DoubleType @@ -79,9 +79,9 @@ def extract_person_name(name: str) -> (str, str, str, str, str): if not name or any(junk in name.casefold() for junk in junk_data): return "Unknown", None, None, None, None - if any(business in name.casefold() for business in common_business_types) or ( - any(business in name.casefold() for business in common_business_types_small) and not any( - t in name.casefold() for t in common_titles_subset_with_space)): + if (any(business in name.casefold() for business in common_business_types) or ( + 
any(business in name.casefold() for business in common_business_types_small) and not any( + t in name.casefold() for t in common_titles_subset_with_space))): return "Business", None, None, None, None person_title, first_name, middle_name, last_name = None, None, None, None @@ -114,7 +114,7 @@ def extract_person_name(name: str) -> (str, str, str, str, str): title_finder = [t for t in title_with_name if t.casefold() in common_titles] person_title = " ".join(title_finder) if len(title_finder) else None remaining_name = [n for n in title_with_name if n.casefold() != ( - person_title or "").casefold() and n.casefold() not in common_titles and n.casefold() not in [".", "&"]] + person_title or "").casefold() and n.casefold() not in common_titles and n.casefold() not in [".", "&"]] if len(remaining_name) == 1: first_name = remaining_name[0] @@ -259,7 +259,7 @@ def prepare_clean_housing_data(person_reshape: DataFrame, assets_reshape: DataFr A prepared and cleaned dataframe containing housing tenancy data. 
""" tenure_reshape = tenure_reshape.filter((tenure_reshape["endoftenuredate"].isNull()) | ( - tenure_reshape["endoftenuredate"].cast(DateType()) > current_date())) + tenure_reshape["endoftenuredate"].cast(DateType()) > current_date())) assets_reshape = assets_reshape.filter(assets_reshape['assettype'] == 'Dwelling') @@ -538,12 +538,12 @@ def prepare_clean_housing_benefit_data(hb_member_df: DataFrame, hb_household_df: housing_benefit_rent_assessment = hb_rent_assessment_df.withColumn("source_filter", when( (col("dhp_ind") == 1) & (col("type_ind") > 1), "DHP").otherwise("HB")).filter( (col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & ( - (col("type_ind") == 1) | (col("dhp_ind") == 1)) & (col("model_amt") > 0)).select(col("claim_id"), - col("source_filter")) + (col("type_ind") == 1) | (col("dhp_ind") == 1)) & (col("model_amt") > 0)).select(col("claim_id"), + col("source_filter")) housing_benefit_ctax_assessment = hb_ctax_assessment_df.withColumn("source_filter", lit("CTS")).filter( (col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & (col("model_amt") > 0) & ( - (col("type_ind") == 1) | (col("dhp_ind") == 1))).select(col("claim_id"), col("source_filter")) + (col("type_ind") == 1) | (col("dhp_ind") == 1))).select(col("claim_id"), col("source_filter")) housing_benefit_rent_ctax = housing_benefit_rent_assessment.union(housing_benefit_ctax_assessment) @@ -1106,7 +1106,7 @@ def remove_deceased(df: DataFrame) -> DataFrame: A DataFrame after removing all the deceased persons. 
""" deceased_filter_cond = ( - lower(col("title")).contains("(deceased)") | lower(col("title")).contains("executor") | lower( + lower(col("title")).contains("(deceased)") | lower(col("title")).contains("executor") | lower( col("title")).contains("exor") | lower(col("title")).contains("rep") | lower(col("title")).contains( " of") | lower(col("title")).contains("of ") | lower(col("title")).contains("the") | lower( col("title")).contains("pe") | lower(col("title")).contains("other")) @@ -1140,8 +1140,8 @@ def generate_possible_matches(df: DataFrame) -> DataFrame: col("last_name_soundex")) return df_a.join(df_b, (df_a["a_source_id"] != df_b["b_source_id"]) & ( - df_a["first_name_soundex"] == df_b["first_name_soundex"]) & ( - df_a["last_name_soundex"] == df_b["last_name_soundex"])).drop( + df_a["first_name_soundex"] == df_b["first_name_soundex"]) & ( + df_a["last_name_soundex"] == df_b["last_name_soundex"])).drop( *["first_name_soundex", "last_name_soundex"]) @@ -1156,9 +1156,9 @@ def automatically_label_data(df: DataFrame) -> DataFrame: A DataFrame with column auto_labels. """ return df.withColumn("auto_labels", when((col("a_source_id") == col("b_source_id")) | ( - (col("a_first_name") == col("b_first_name")) & (col("a_last_name") == col("b_last_name")) & ( - col("a_date_of_birth") == col("b_date_of_birth")) & (col("a_uprn") == col("b_uprn")) & ( - col("a_post_code") == col("b_post_code"))), lit(True)).otherwise(lit(None).cast(BooleanType()))) + (col("a_first_name") == col("b_first_name")) & (col("a_last_name") == col("b_last_name")) & ( + col("a_date_of_birth") == col("b_date_of_birth")) & (col("a_uprn") == col("b_uprn")) & ( + col("a_post_code") == col("b_post_code"))), lit(True)).otherwise(lit(None).cast(BooleanType()))) @pandas_udf(features_schema) @@ -1429,13 +1429,13 @@ def link_all_matched_persons(standard_df: DataFrame, predicted_df: DataFrame) -> # Extra analysis (for analyst only): if you need to do. 
- # To find how many connection are there # person_graph.inDegrees.filter(col("inDegree") > 1).orderBy(col( + # To find how many connection are there # person_graph.inDegrees.filter(col("inDegree") > 1).orderBy(col( # # # "inDegree").desc()).show(truncate=False) # Graph query using motif to find where person 'a' is connected to person 'b', and person 'b' is also connected # to # person 'a' # motif = person_graph.find("(a)-[]->(b); (b)-[]->(a)") # motif.show(truncate=False) - # To count number of triangles i.e. a connected to b, b connected to c and c is connected back to a # + # To count number of triangles i.e. a connected to b, b connected to c and c is connected back to a # # # # # triangle_count = person_graph.triangleCount() # triangle_count.orderBy(col("count").desc()).show(n=10, # truncate=False) From 814b49e31af6c686e2afe02d456715d6f9c54cf8 Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:11:21 +0100 Subject: [PATCH 08/11] Reformat code with Black --- .../person_matching_module.py | 2372 ++++++++++++----- 1 file changed, 1713 insertions(+), 659 deletions(-) diff --git a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index 6b78937e9..a88bb214c 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -11,28 +11,66 @@ from graphframes import GraphFrame from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel -from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator +from pyspark.ml.evaluation import ( + BinaryClassificationEvaluator, + MulticlassClassificationEvaluator, +) from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler from pyspark.ml.functions import vector_to_array from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, 
CrossValidatorModel from pyspark.sql import DataFrame, SparkSession, Column -from pyspark.sql.functions import (to_date, col, lit, length, broadcast, udf, when, substring, lower, concat_ws, - soundex, regexp_replace, trim, split, struct, arrays_zip, array, array_sort, - current_date) +from pyspark.sql.functions import ( + to_date, + col, + lit, + length, + broadcast, + udf, + when, + substring, + lower, + concat_ws, + soundex, + regexp_replace, + trim, + split, + struct, + arrays_zip, + array, + array_sort, + current_date, +) from pyspark.sql.pandas.functions import pandas_udf -from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType, DoubleType +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + DateType, + BooleanType, + DoubleType, +) extracted_name_schema = StructType( - [StructField("entity_type", StringType(), True), StructField("title", StringType(), True), - StructField("first_name", StringType(), True), StructField("middle_name", StringType(), True), - StructField("last_name", StringType(), True)]) + [ + StructField("entity_type", StringType(), True), + StructField("title", StringType(), True), + StructField("first_name", StringType(), True), + StructField("middle_name", StringType(), True), + StructField("last_name", StringType(), True), + ] +) features_schema = StructType( - [StructField("first_name_similar", BooleanType(), True), StructField("middle_name_similar", BooleanType(), True), - StructField("last_name_similar", BooleanType(), True), StructField("name_similarity", DoubleType(), True), - StructField("address_line_1_similarity", DoubleType(), True), - StructField("address_line_2_similarity", DoubleType(), True), - StructField("full_address_similarity", DoubleType(), True), ]) + [ + StructField("first_name_similar", BooleanType(), True), + StructField("middle_name_similar", BooleanType(), True), + StructField("last_name_similar", BooleanType(), True), + StructField("name_similarity", 
DoubleType(), True), + StructField("address_line_1_similarity", DoubleType(), True), + StructField("address_line_2_similarity", DoubleType(), True), + StructField("full_address_similarity", DoubleType(), True), + ] +) @udf(returnType=extracted_name_schema) @@ -51,27 +89,159 @@ def extract_person_name(name: str) -> (str, str, str, str, str): name: combined name including first name, last name, title etc. Returns: A quadruple where each element represents title, first_name, middle_name, last_name """ - common_titles = ["mr", "mr.", "mrs", "mrs.", "ms", "ms.", "miss", "master", "exor", "exors", "executors", "of", - "rep", "per", "pers", "reps", "prep", "&prep", "pe", "personal", "repmr", "repmrs", "repsmr", - "repsmrs", "the", "reps.of", "dr", "dr.", "prof", "profeessor", "rev", "lady", "dame", "sir", - "lord"] - common_titles_subset_with_space = ["mr ", "mr. ", "mrs ", "mrs. ", "ms ", "ms. ", "miss ", "exor ", "exors "] - - common_business_types_small = ["ltd", "llp", "plc", "pvt", "&", "lbh", " inc,", "llc", "bv"] - common_business_types = ["limited", "association", "housing", "trust", "home", "society", "estates", "properties", - "property", "group", "fund", "invest", "investment", "estate", "development", "board", - "letting", "agent", "accommodat", "occupier", "residential", "relocation", "accomodation", - "traveller", "living", "education", "residence", "resident", "organisation", "management", - "international", "national", "clinic", "solutions", "service", "system", "security", - "move", "store", "academy", "ventures", "rent", "account", "building", "company", - "congregation", "project", "residencial", "centre", "sport", "assets", "developer", - "asylum", "committee", "school", "apartment", "chartered", "consultant", "enterprise", - "corporate", "associates", "studios", "consultancy", "borough", "holdings", "agency", - "propperties", "hotel", "lodge", "university", "proeprties", "hackney", "empty", "void", - "london", "council"] - - deceased_flags = ["decd", 
"dec'd", "d'cead", "desd", "d'ced", "de'd", "def'd", "dea's", "dece'd", "dec", "dec`d", - "deceased"] + common_titles = [ + "mr", + "mr.", + "mrs", + "mrs.", + "ms", + "ms.", + "miss", + "master", + "exor", + "exors", + "executors", + "of", + "rep", + "per", + "pers", + "reps", + "prep", + "&prep", + "pe", + "personal", + "repmr", + "repmrs", + "repsmr", + "repsmrs", + "the", + "reps.of", + "dr", + "dr.", + "prof", + "profeessor", + "rev", + "lady", + "dame", + "sir", + "lord", + ] + common_titles_subset_with_space = [ + "mr ", + "mr. ", + "mrs ", + "mrs. ", + "ms ", + "ms. ", + "miss ", + "exor ", + "exors ", + ] + + common_business_types_small = [ + "ltd", + "llp", + "plc", + "pvt", + "&", + "lbh", + " inc,", + "llc", + "bv", + ] + common_business_types = [ + "limited", + "association", + "housing", + "trust", + "home", + "society", + "estates", + "properties", + "property", + "group", + "fund", + "invest", + "investment", + "estate", + "development", + "board", + "letting", + "agent", + "accommodat", + "occupier", + "residential", + "relocation", + "accomodation", + "traveller", + "living", + "education", + "residence", + "resident", + "organisation", + "management", + "international", + "national", + "clinic", + "solutions", + "service", + "system", + "security", + "move", + "store", + "academy", + "ventures", + "rent", + "account", + "building", + "company", + "congregation", + "project", + "residencial", + "centre", + "sport", + "assets", + "developer", + "asylum", + "committee", + "school", + "apartment", + "chartered", + "consultant", + "enterprise", + "corporate", + "associates", + "studios", + "consultancy", + "borough", + "holdings", + "agency", + "propperties", + "hotel", + "lodge", + "university", + "proeprties", + "hackney", + "empty", + "void", + "london", + "council", + ] + + deceased_flags = [ + "decd", + "dec'd", + "d'cead", + "desd", + "d'ced", + "de'd", + "def'd", + "dea's", + "dece'd", + "dec", + "dec`d", + "deceased", + ] junk_data = ["test", 
"owner"] @@ -79,21 +249,28 @@ def extract_person_name(name: str) -> (str, str, str, str, str): if not name or any(junk in name.casefold() for junk in junk_data): return "Unknown", None, None, None, None - if (any(business in name.casefold() for business in common_business_types) or ( - any(business in name.casefold() for business in common_business_types_small) and not any( - t in name.casefold() for t in common_titles_subset_with_space))): + if any(business in name.casefold() for business in common_business_types) or ( + any(business in name.casefold() for business in common_business_types_small) + and not any(t in name.casefold() for t in common_titles_subset_with_space) + ): return "Business", None, None, None, None person_title, first_name, middle_name, last_name = None, None, None, None - deceased_title = "(Deceased)" if any(dec in name.casefold() for dec in deceased_flags) else None + deceased_title = ( + "(Deceased)" if any(dec in name.casefold() for dec in deceased_flags) else None + ) name_cleaned = re.sub(r"\([^()]*\)", " ", name) # removes parentheses name_list = [n.strip() for n in name_cleaned.split(",") if n.strip()] if len(name_list) == 1: - parts_of_name = [n for n in name_list[0].split() if n.casefold() not in deceased_flags] + parts_of_name = [ + n for n in name_list[0].split() if n.casefold() not in deceased_flags + ] title_finder = [t for t in parts_of_name if t.casefold() in common_titles] person_title = " ".join(title_finder) if len(title_finder) else None - name_without_title = [n for n in parts_of_name if n.casefold() not in common_titles] + name_without_title = [ + n for n in parts_of_name if n.casefold() not in common_titles + ] if len(name_without_title) == 1: last_name = name_without_title[0] elif len(name_without_title) == 2: @@ -105,16 +282,28 @@ def extract_person_name(name: str) -> (str, str, str, str, str): last_name = name_without_title[-1] middle_name = " ".join(name_without_title[1:-1]) elif len(name_list) == 2: - title_finder = 
[t for t in name_list[0].split() if t.casefold() in common_titles] + title_finder = [ + t for t in name_list[0].split() if t.casefold() in common_titles + ] person_title = " ".join(title_finder) if len(title_finder) else None last_name = " ".join( - [n for n in name_list[0].split() if n and n.casefold() not in deceased_flags + common_titles]) + [ + n + for n in name_list[0].split() + if n and n.casefold() not in deceased_flags + common_titles + ] + ) title_with_name = name_list[1].split() if not person_title: title_finder = [t for t in title_with_name if t.casefold() in common_titles] person_title = " ".join(title_finder) if len(title_finder) else None - remaining_name = [n for n in title_with_name if n.casefold() != ( - person_title or "").casefold() and n.casefold() not in common_titles and n.casefold() not in [".", "&"]] + remaining_name = [ + n + for n in title_with_name + if n.casefold() != (person_title or "").casefold() + and n.casefold() not in common_titles + and n.casefold() not in [".", "&"] + ] if len(remaining_name) == 1: first_name = remaining_name[0] @@ -122,9 +311,15 @@ def extract_person_name(name: str) -> (str, str, str, str, str): first_name, middle_name = remaining_name elif len(remaining_name) > 2: first_name = remaining_name[0] - middle_name = ' '.join(remaining_name[1:]) # middle name includes anything not in first name or last name - - title = " ".join(filter(None, (person_title, deceased_title))).strip() if person_title or deceased_title else None + middle_name = " ".join( + remaining_name[1:] + ) # middle name includes anything not in first name or last name + + title = ( + " ".join(filter(None, (person_title, deceased_title))).strip() + if person_title or deceased_title + else None + ) last_name = last_name if last_name else None return "Person", title, first_name, middle_name, last_name @@ -169,12 +364,22 @@ def categorise_title(title: Column) -> Column: category_sir = title.contains("sir") # Priority 12 category_rabbi = 
title.contains("rabbi") # Priority 13 - return when(category_master, lit("master")).when(category_ms, lit("ms")).when(category_mrs, lit("mrs")).when( - category_miss, lit("miss")).when(category_mr, lit("mr")).when(category_dame, lit("dame")).when(category_lady, - lit("lady")).when( - category_lord, lit("lord")).when(category_prof, lit("prof")).when(category_dr, lit("dr")).when(category_rev, - lit("rev")).when( - category_sir, lit("sir")).when(category_rabbi, lit("rabbi")).otherwise("unknown") + return ( + when(category_master, lit("master")) + .when(category_ms, lit("ms")) + .when(category_mrs, lit("mrs")) + .when(category_miss, lit("miss")) + .when(category_mr, lit("mr")) + .when(category_dame, lit("dame")) + .when(category_lady, lit("lady")) + .when(category_lord, lit("lord")) + .when(category_prof, lit("prof")) + .when(category_dr, lit("dr")) + .when(category_rev, lit("rev")) + .when(category_sir, lit("sir")) + .when(category_rabbi, lit("rabbi")) + .otherwise("unknown") + ) def standardize_name(name: Column) -> Column: @@ -195,10 +400,21 @@ def standardize_name(name: Column) -> Column: Column after applying the rules """ return when(name.isNull(), lit("")).otherwise( - lower(trim(regexp_replace(regexp_replace(regexp_replace(name, "0", "O"), "1", "L"), "^[\\&*./\\\]+", "")))) - - -def standardize_full_name(first_name: Column, middle_name: Column, last_name: Column) -> Column: + lower( + trim( + regexp_replace( + regexp_replace(regexp_replace(name, "0", "O"), "1", "L"), + "^[\\&*./\\\]+", + "", + ) + ) + ) + ) + + +def standardize_full_name( + first_name: Column, middle_name: Column, last_name: Column +) -> Column: """A Dataframe helper function to sort person's name. This will help to create a full name composed of first_name, middle_name and last_name with any surplus whitespace removed. 
@@ -210,7 +426,9 @@ def standardize_full_name(first_name: Column, middle_name: Column, last_name: Co Returns: A single name with composed of first_name, middle_name and last_name. """ - return regexp_replace(trim(concat_ws(" ", first_name, middle_name, last_name)), r"\s+", " ") + return regexp_replace( + trim(concat_ws(" ", first_name, middle_name, last_name)), r"\s+", " " + ) def standardize_address_line(address_line: Column) -> Column: @@ -224,8 +442,12 @@ def standardize_address_line(address_line: Column) -> Column: return when(address_line.isNull(), lit("")).otherwise(trim(lower(address_line))) -def full_address(address_line_1: Column, address_line_2: Column, address_line_3: Column, - address_line_4: Column) -> Column: +def full_address( + address_line_1: Column, + address_line_2: Column, + address_line_3: Column, + address_line_4: Column, +) -> Column: """A DataFrame helper function that joins all the parts of the address to form a single address. For example if the address is: @@ -245,11 +467,14 @@ def full_address(address_line_1: Column, address_line_2: Column, address_line_3: Returns: Full address after joining all the lines. """ - return trim(concat_ws(" ", address_line_1, address_line_2, address_line_3, address_line_4)) + return trim( + concat_ws(" ", address_line_1, address_line_2, address_line_3, address_line_4) + ) -def prepare_clean_housing_data(person_reshape: DataFrame, assets_reshape: DataFrame, - tenure_reshape: DataFrame) -> DataFrame: +def prepare_clean_housing_data( + person_reshape: DataFrame, assets_reshape: DataFrame, tenure_reshape: DataFrame +) -> DataFrame: """A function to prepare and clean housing data. Args: person_reshape: Dataframe containing person reshape data @@ -258,51 +483,92 @@ def prepare_clean_housing_data(person_reshape: DataFrame, assets_reshape: DataFr Returns: A prepared and cleaned dataframe containing housing tenancy data. 
""" - tenure_reshape = tenure_reshape.filter((tenure_reshape["endoftenuredate"].isNull()) | ( - tenure_reshape["endoftenuredate"].cast(DateType()) > current_date())) - - assets_reshape = assets_reshape.filter(assets_reshape['assettype'] == 'Dwelling') - - person_reshape = person_reshape.filter((person_reshape["type"].isin( - ["Secure", "Introductory", "Leasehold (RTB)", "Mense Profit Ac", "Mesne Profit Ac"])) & ( - person_reshape["enddate"].isNull()) & ( - person_reshape["person_type"].isin(["Tenant", "HouseholdMember"]))) - - housing = person_reshape.join(assets_reshape, person_reshape["assetid"] == assets_reshape["asset_id"], - how="left").join(tenure_reshape, - person_reshape["person_id"] == tenure_reshape["person_id"], - how="left").withColumn("source", lit("housing")).withColumn( - "extracted_name", extract_name_udf(col("member_fullname"))).withColumn("title", when( - (col("extracted_name.title").isNull()) | (lower(col("extracted_name.title")) == lower(col("preferredTitle"))), - col("preferredTitle")).otherwise( - concat_ws(" ", col("preferredTitle"), col("extracted_name.title")))).withColumn("first_name", - col("extracted_name.first_name")).withColumn( - "middle_name", col("extracted_name.middle_name")).withColumn("last_name", - col("extracted_name.last_name")).withColumn("dob", - to_date( - substring( - person_reshape[ - "dateofbirth"], - 1, - 10), - format="yyyy-MM-dd")).withColumn( - "date_of_birth", # null value represented by 1900-01-01, so converting 1900-01-01 to null - when(col("dob") == to_date(lit("1900-01-01"), "yyyy-MM-dd"), lit(None).cast(DateType())).otherwise( - col("dob"))).withColumnRenamed("postcode", "post_code").withColumnRenamed("addressline1", - "address_line_1").withColumnRenamed( - "addressline2", "address_line_2").withColumnRenamed("addressline3", "address_line_3").withColumnRenamed( - "addressline4", "address_line_4").withColumnRenamed("placeOfBirth", "place_of_birth").filter( - (length(col("first_name")) > 0) | 
(length(col("last_name")) > 0)).select(col("source"), - person_reshape["person_id"], - person_reshape["uprn"], col("title"), - col("first_name"), col("middle_name"), - col("last_name"), col("date_of_birth"), - col("post_code"), - col("address_line_1"), - col("address_line_2"), - col("address_line_3"), - col("address_line_4"), - person_reshape["type"]) + tenure_reshape = tenure_reshape.filter( + (tenure_reshape["endoftenuredate"].isNull()) + | (tenure_reshape["endoftenuredate"].cast(DateType()) > current_date()) + ) + + assets_reshape = assets_reshape.filter(assets_reshape["assettype"] == "Dwelling") + + person_reshape = person_reshape.filter( + ( + person_reshape["type"].isin( + [ + "Secure", + "Introductory", + "Leasehold (RTB)", + "Mense Profit Ac", + "Mesne Profit Ac", + ] + ) + ) + & (person_reshape["enddate"].isNull()) + & (person_reshape["person_type"].isin(["Tenant", "HouseholdMember"])) + ) + + housing = ( + person_reshape.join( + assets_reshape, + person_reshape["assetid"] == assets_reshape["asset_id"], + how="left", + ) + .join( + tenure_reshape, + person_reshape["person_id"] == tenure_reshape["person_id"], + how="left", + ) + .withColumn("source", lit("housing")) + .withColumn("extracted_name", extract_name_udf(col("member_fullname"))) + .withColumn( + "title", + when( + (col("extracted_name.title").isNull()) + | (lower(col("extracted_name.title")) == lower(col("preferredTitle"))), + col("preferredTitle"), + ).otherwise( + concat_ws(" ", col("preferredTitle"), col("extracted_name.title")) + ), + ) + .withColumn("first_name", col("extracted_name.first_name")) + .withColumn("middle_name", col("extracted_name.middle_name")) + .withColumn("last_name", col("extracted_name.last_name")) + .withColumn( + "dob", + to_date( + substring(person_reshape["dateofbirth"], 1, 10), format="yyyy-MM-dd" + ), + ) + .withColumn( + "date_of_birth", # null value represented by 1900-01-01, so converting 1900-01-01 to null + when( + col("dob") == to_date(lit("1900-01-01"), 
"yyyy-MM-dd"), + lit(None).cast(DateType()), + ).otherwise(col("dob")), + ) + .withColumnRenamed("postcode", "post_code") + .withColumnRenamed("addressline1", "address_line_1") + .withColumnRenamed("addressline2", "address_line_2") + .withColumnRenamed("addressline3", "address_line_3") + .withColumnRenamed("addressline4", "address_line_4") + .withColumnRenamed("placeOfBirth", "place_of_birth") + .filter((length(col("first_name")) > 0) | (length(col("last_name")) > 0)) + .select( + col("source"), + person_reshape["person_id"], + person_reshape["uprn"], + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + person_reshape["type"], + ) + ) return housing @@ -337,37 +603,65 @@ def standardize_housing_data(housing_cleaned: DataFrame) -> DataFrame: Returns: A housing DataFrame with all the standard columns listed above. """ - housing = housing_cleaned.withColumnRenamed("person_id", "source_id").withColumnRenamed("type", - "source_filter").withColumn( - "title", categorise_title(lower(col("title")))).withColumn("first_name", - standardize_name(col("first_name"))).withColumn( - "middle_name", standardize_name(col("middle_name"))).withColumn("last_name", - standardize_name(col("last_name"))).withColumn( - "name", standardize_full_name(col("first_name"), col("middle_name"), col("last_name"))).withColumn("post_code", - lower( - col("post_code"))).withColumn( - "address_line_1", standardize_address_line(col("address_line_1"))).withColumn("address_line_2", - standardize_address_line( - col("address_line_2"))).withColumn( - "address_line_3", standardize_address_line(col("address_line_3"))).withColumn("address_line_4", - standardize_address_line( - col("address_line_4"))).withColumn( - "full_address", full_address(col("address_line_1"), col("address_line_2"), col("address_line_3"), - 
col("address_line_4"))).select(col("source"), col("source_id"), col("uprn"), - col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), - col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("full_address"), - col("source_filter")).dropDuplicates( - ["source_id", "uprn", "date_of_birth"]) + housing = ( + housing_cleaned.withColumnRenamed("person_id", "source_id") + .withColumnRenamed("type", "source_filter") + .withColumn("title", categorise_title(lower(col("title")))) + .withColumn("first_name", standardize_name(col("first_name"))) + .withColumn("middle_name", standardize_name(col("middle_name"))) + .withColumn("last_name", standardize_name(col("last_name"))) + .withColumn( + "name", + standardize_full_name( + col("first_name"), col("middle_name"), col("last_name") + ), + ) + .withColumn("post_code", lower(col("post_code"))) + .withColumn("address_line_1", standardize_address_line(col("address_line_1"))) + .withColumn("address_line_2", standardize_address_line(col("address_line_2"))) + .withColumn("address_line_3", standardize_address_line(col("address_line_3"))) + .withColumn("address_line_4", standardize_address_line(col("address_line_4"))) + .withColumn( + "full_address", + full_address( + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + ), + ) + .select( + col("source"), + col("source_id"), + col("uprn"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("full_address"), + col("source_filter"), + ) + .dropDuplicates(["source_id", "uprn", "date_of_birth"]) + ) return housing -def prepare_clean_council_tax_data(spark: SparkSession, council_tax_account: DataFrame, - council_tax_liability_person: DataFrame, 
council_tax_non_liability_person: DataFrame, - council_tax_occupation: DataFrame, council_tax_property: DataFrame) -> DataFrame: +def prepare_clean_council_tax_data( + spark: SparkSession, + council_tax_account: DataFrame, + council_tax_liability_person: DataFrame, + council_tax_non_liability_person: DataFrame, + council_tax_occupation: DataFrame, + council_tax_property: DataFrame, +) -> DataFrame: """A function to prepare and clean council tax data. Args: spark: SparkSession, @@ -380,57 +674,117 @@ def prepare_clean_council_tax_data(spark: SparkSession, council_tax_account: Dat A DataFrame after preparing and cleaning data from multiple council tax tables. """ council_tax_occupation = council_tax_occupation.filter( - (col("live_ind") == 1) & (col("vacation_date") > col("import_date"))) - - council_tax_property_occupancy = council_tax_occupation.join(council_tax_property, - "property_ref").withColumnRenamed("postcode", - "post_code").withColumnRenamed( - "addr1", "address_line_1").withColumnRenamed("addr2", "address_line_2").withColumnRenamed("addr3", - "address_line_3").withColumnRenamed( - "addr4", "address_line_4").select(col("uprn"), col("account_ref"), col("occupation_date"), col("vacation_date"), - col("post_code"), col("address_line_1"), col("address_line_2"), - col("address_line_3"), col("address_line_4")) - - liable_types = broadcast(spark.createDataFrame( - [(0, 'Non-liable'), (1, 'Joint & Several'), (2, 'Freeholder'), (3, 'Leaseholder'), (4, 'Tenant'), - (5, 'Licencee'), (6, 'Resident'), (7, 'Owner'), (8, 'Assumed'), (9, 'VOID'), (10, 'Other'), (11, 'Suspense'), - (12, 'CTax Payer'), (-1, '(DATA ERROR)')]).toDF("liability_id", "liability_type")) + (col("live_ind") == 1) & (col("vacation_date") > col("import_date")) + ) + + council_tax_property_occupancy = ( + council_tax_occupation.join(council_tax_property, "property_ref") + .withColumnRenamed("postcode", "post_code") + .withColumnRenamed("addr1", "address_line_1") + .withColumnRenamed("addr2", 
"address_line_2") + .withColumnRenamed("addr3", "address_line_3") + .withColumnRenamed("addr4", "address_line_4") + .select( + col("uprn"), + col("account_ref"), + col("occupation_date"), + col("vacation_date"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + ) + ) + + liable_types = broadcast( + spark.createDataFrame( + [ + (0, "Non-liable"), + (1, "Joint & Several"), + (2, "Freeholder"), + (3, "Leaseholder"), + (4, "Tenant"), + (5, "Licencee"), + (6, "Resident"), + (7, "Owner"), + (8, "Assumed"), + (9, "VOID"), + (10, "Other"), + (11, "Suspense"), + (12, "CTax Payer"), + (-1, "(DATA ERROR)"), + ] + ).toDF("liability_id", "liability_type") + ) council_tax_lead_person = ( - council_tax_account.join(liable_types, col("lead_liab_pos") == col("liability_id")).withColumn("source", - lit("council_tax")).withColumn( - "sub_source", lit("lead")).withColumn("position", lit(0)).withColumnRenamed("lead_liab_name", - "name").withColumn( - "extracted_name", extract_name_udf(col("name"))).select(col("source"), col("account_ref"), col("party_ref"), - col("liability_type"), col("sub_source"), - col("position"), col("extracted_name.*"), - col("name"))) - - council_tax_liable_person = council_tax_liability_person.join(liable_types, - col("liab_pos") == col("liability_id")).withColumn( - "source", lit("council_tax")).withColumn("sub_source", lit("liable")).withColumn("position", - col("liab_pers_occ")).withColumnRenamed( - "liab_name", "name").withColumn("extracted_name", extract_name_udf(col("name"))).select(col("source"), - col("account_ref"), - col("party_ref"), - col("liability_type"), - col("sub_source"), - col("position"), - col("extracted_name.*"), - col("name")) - - council_tax_non_liable_person = council_tax_non_liability_person.withColumn("source", - lit("council_tax")).withColumn( - "sub_source", lit("non liable")).withColumn("liability_type", lit(None).cast(StringType())).withColumn( - "position", 
col("nonliab_occ")).withColumnRenamed("nonliab_name", "name").withColumn("extracted_name", - extract_name_udf( - col("name"))).select( - col("source"), col("account_ref"), col("party_ref"), col("liability_type"), col("sub_source"), col("position"), - col("extracted_name.*"), col("name")) - - council_tax_person = council_tax_lead_person.union(council_tax_liable_person).union( - council_tax_non_liable_person).join(council_tax_property_occupancy, "account_ref").withColumn("source_filter", - lit("council_tax")) + council_tax_account.join( + liable_types, col("lead_liab_pos") == col("liability_id") + ) + .withColumn("source", lit("council_tax")) + .withColumn("sub_source", lit("lead")) + .withColumn("position", lit(0)) + .withColumnRenamed("lead_liab_name", "name") + .withColumn("extracted_name", extract_name_udf(col("name"))) + .select( + col("source"), + col("account_ref"), + col("party_ref"), + col("liability_type"), + col("sub_source"), + col("position"), + col("extracted_name.*"), + col("name"), + ) + ) + + council_tax_liable_person = ( + council_tax_liability_person.join( + liable_types, col("liab_pos") == col("liability_id") + ) + .withColumn("source", lit("council_tax")) + .withColumn("sub_source", lit("liable")) + .withColumn("position", col("liab_pers_occ")) + .withColumnRenamed("liab_name", "name") + .withColumn("extracted_name", extract_name_udf(col("name"))) + .select( + col("source"), + col("account_ref"), + col("party_ref"), + col("liability_type"), + col("sub_source"), + col("position"), + col("extracted_name.*"), + col("name"), + ) + ) + + council_tax_non_liable_person = ( + council_tax_non_liability_person.withColumn("source", lit("council_tax")) + .withColumn("sub_source", lit("non liable")) + .withColumn("liability_type", lit(None).cast(StringType())) + .withColumn("position", col("nonliab_occ")) + .withColumnRenamed("nonliab_name", "name") + .withColumn("extracted_name", extract_name_udf(col("name"))) + .select( + col("source"), + 
col("account_ref"), + col("party_ref"), + col("liability_type"), + col("sub_source"), + col("position"), + col("extracted_name.*"), + col("name"), + ) + ) + + council_tax_person = ( + council_tax_lead_person.union(council_tax_liable_person) + .union(council_tax_non_liable_person) + .join(council_tax_property_occupancy, "account_ref") + .withColumn("source_filter", lit("council_tax")) + ) return council_tax_person @@ -465,38 +819,68 @@ def standardize_council_tax_data(council_tax_cleaned: DataFrame) -> DataFrame: Returns: A council tax DataFrame with all the standard columns listed above. """ - council_tax = council_tax_cleaned.filter(col("entity_type") == "Person").drop(col("entity_type")).withColumn( - "source_id", concat_ws("-", col("account_ref"), col("party_ref"), col("position"))).withColumn("date_of_birth", - lit(None).cast( - DateType())).withColumn( - "title", categorise_title(lower(col("title")))).withColumn("first_name", - standardize_name(col("first_name"))).withColumn( - "middle_name", standardize_name(col("middle_name"))).withColumn("last_name", - standardize_name(col("last_name"))).withColumn( - "name", standardize_full_name(col("first_name"), col("middle_name"), col("last_name"))).withColumn("post_code", - lower( - col("post_code"))).withColumn( - "address_line_1", standardize_address_line(col("address_line_1"))).withColumn("address_line_2", - standardize_address_line( - col("address_line_2"))).withColumn( - "address_line_3", standardize_address_line(col("address_line_3"))).withColumn("address_line_4", - standardize_address_line( - col("address_line_4"))).withColumn( - "full_address", full_address(col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"))).select(col("source"), col("source_id"), col("uprn"), - col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), - col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), - 
col("address_line_4"), col("full_address"), - col("source_filter")).dropDuplicates( - ["source_id", "uprn"]) + council_tax = ( + council_tax_cleaned.filter(col("entity_type") == "Person") + .drop(col("entity_type")) + .withColumn( + "source_id", + concat_ws("-", col("account_ref"), col("party_ref"), col("position")), + ) + .withColumn("date_of_birth", lit(None).cast(DateType())) + .withColumn("title", categorise_title(lower(col("title")))) + .withColumn("first_name", standardize_name(col("first_name"))) + .withColumn("middle_name", standardize_name(col("middle_name"))) + .withColumn("last_name", standardize_name(col("last_name"))) + .withColumn( + "name", + standardize_full_name( + col("first_name"), col("middle_name"), col("last_name") + ), + ) + .withColumn("post_code", lower(col("post_code"))) + .withColumn("address_line_1", standardize_address_line(col("address_line_1"))) + .withColumn("address_line_2", standardize_address_line(col("address_line_2"))) + .withColumn("address_line_3", standardize_address_line(col("address_line_3"))) + .withColumn("address_line_4", standardize_address_line(col("address_line_4"))) + .withColumn( + "full_address", + full_address( + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + ), + ) + .select( + col("source"), + col("source_id"), + col("uprn"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("full_address"), + col("source_filter"), + ) + .dropDuplicates(["source_id", "uprn"]) + ) return council_tax -def prepare_clean_housing_benefit_data(hb_member_df: DataFrame, hb_household_df: DataFrame, - hb_rent_assessment_df: DataFrame, hb_ctax_assessment_df: DataFrame) -> DataFrame: +def prepare_clean_housing_benefit_data( + hb_member_df: DataFrame, + hb_household_df: DataFrame, + 
hb_rent_assessment_df: DataFrame, + hb_ctax_assessment_df: DataFrame, +) -> DataFrame: """A function to prepare and clean housing benefit data. Data comes from multiple sources. This function is specific to this particular data source. For a new data source please add a new function. Args: @@ -507,56 +891,112 @@ def prepare_clean_housing_benefit_data(hb_member_df: DataFrame, hb_household_df: Returns: A DataFrame after preparing and cleaning housing benefit data from multiple tables. """ - housing_benefit_member = hb_member_df.withColumn("claim_house_id", - concat_ws("-", col("claim_id"), col("house_id"))).withColumn( - "claim_person_ref", concat_ws("-", col("claim_id"), col("house_id"), col("member_id"))).withColumn("gender", - when( - col("gender") == 2, - "F").when( - col("gender") == 1, - "M").otherwise( - "O")).withColumn( - "extracted_name", extract_name_udf(col("name"))).withColumn("source", lit("housing_benefit")).withColumn( - "date_of_birth", to_date(col("birth_date"))).select(col("source"), col("claim_person_ref"), - col("claim_house_id"), col("extracted_name.*"), - col("date_of_birth"), col("gender")) - - housing_benefit_household = hb_household_df.withColumn("claim_house_id", concat_ws("-", col("claim_id"), - col("house_id"))).withColumnRenamed( - "addr1", "address_line_1").withColumnRenamed("addr2", "address_line_2").withColumnRenamed("addr3", - "address_line_3").withColumnRenamed( - "addr4", "address_line_4").filter( - (col("from_date") < col("import_date")) & (col("to_date") > col("import_date"))).select(col("claim_id"), - col("claim_house_id"), - col("address_line_1"), - col("address_line_2"), - col("address_line_3"), - col("address_line_4"), - col("post_code"), - col("uprn")) - - housing_benefit_rent_assessment = hb_rent_assessment_df.withColumn("source_filter", when( - (col("dhp_ind") == 1) & (col("type_ind") > 1), "DHP").otherwise("HB")).filter( - (col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & ( - 
(col("type_ind") == 1) | (col("dhp_ind") == 1)) & (col("model_amt") > 0)).select(col("claim_id"), - col("source_filter")) - - housing_benefit_ctax_assessment = hb_ctax_assessment_df.withColumn("source_filter", lit("CTS")).filter( - (col("from_date") < col("import_date")) & (col("to_date") > col("import_date")) & (col("model_amt") > 0) & ( - (col("type_ind") == 1) | (col("dhp_ind") == 1))).select(col("claim_id"), col("source_filter")) - - housing_benefit_rent_ctax = housing_benefit_rent_assessment.union(housing_benefit_ctax_assessment) - - housing_benefit_household_claims = housing_benefit_household.join(housing_benefit_rent_ctax, ["claim_id"]) - - housing_benefit_cleaned = housing_benefit_household_claims.join(housing_benefit_member, - ["claim_house_id"]).withColumn("source", - lit("housing_benefit")).withColumn( - "source_id", col("claim_id")).select(col("source"), col("claim_person_ref"), col("uprn"), col("title"), - col("first_name"), col("middle_name"), col("last_name"), - col("date_of_birth"), col("gender"), col("post_code"), - col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter")) + housing_benefit_member = ( + hb_member_df.withColumn( + "claim_house_id", concat_ws("-", col("claim_id"), col("house_id")) + ) + .withColumn( + "claim_person_ref", + concat_ws("-", col("claim_id"), col("house_id"), col("member_id")), + ) + .withColumn( + "gender", + when(col("gender") == 2, "F").when(col("gender") == 1, "M").otherwise("O"), + ) + .withColumn("extracted_name", extract_name_udf(col("name"))) + .withColumn("source", lit("housing_benefit")) + .withColumn("date_of_birth", to_date(col("birth_date"))) + .select( + col("source"), + col("claim_person_ref"), + col("claim_house_id"), + col("extracted_name.*"), + col("date_of_birth"), + col("gender"), + ) + ) + + housing_benefit_household = ( + hb_household_df.withColumn( + "claim_house_id", concat_ws("-", col("claim_id"), col("house_id")) + ) + 
.withColumnRenamed("addr1", "address_line_1") + .withColumnRenamed("addr2", "address_line_2") + .withColumnRenamed("addr3", "address_line_3") + .withColumnRenamed("addr4", "address_line_4") + .filter( + (col("from_date") < col("import_date")) + & (col("to_date") > col("import_date")) + ) + .select( + col("claim_id"), + col("claim_house_id"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("post_code"), + col("uprn"), + ) + ) + + housing_benefit_rent_assessment = ( + hb_rent_assessment_df.withColumn( + "source_filter", + when((col("dhp_ind") == 1) & (col("type_ind") > 1), "DHP").otherwise("HB"), + ) + .filter( + (col("from_date") < col("import_date")) + & (col("to_date") > col("import_date")) + & ((col("type_ind") == 1) | (col("dhp_ind") == 1)) + & (col("model_amt") > 0) + ) + .select(col("claim_id"), col("source_filter")) + ) + + housing_benefit_ctax_assessment = ( + hb_ctax_assessment_df.withColumn("source_filter", lit("CTS")) + .filter( + (col("from_date") < col("import_date")) + & (col("to_date") > col("import_date")) + & (col("model_amt") > 0) + & ((col("type_ind") == 1) | (col("dhp_ind") == 1)) + ) + .select(col("claim_id"), col("source_filter")) + ) + + housing_benefit_rent_ctax = housing_benefit_rent_assessment.union( + housing_benefit_ctax_assessment + ) + + housing_benefit_household_claims = housing_benefit_household.join( + housing_benefit_rent_ctax, ["claim_id"] + ) + + housing_benefit_cleaned = ( + housing_benefit_household_claims.join( + housing_benefit_member, ["claim_house_id"] + ) + .withColumn("source", lit("housing_benefit")) + .withColumn("source_id", col("claim_id")) + .select( + col("source"), + col("claim_person_ref"), + col("uprn"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("date_of_birth"), + col("gender"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + 
col("source_filter"), + ) + ) return housing_benefit_cleaned @@ -590,26 +1030,54 @@ def standardize_housing_benefit_data(housing_benefit_cleaned: DataFrame) -> Data Returns: A housing benefit DataFrame with all the standard columns listed above. """ - housing_benefit = housing_benefit_cleaned.withColumn("source_id", col("claim_person_ref")).withColumn("title", - categorise_title( - lower( - col("title")))).withColumn( - "first_name", standardize_name(col("first_name"))).withColumn("middle_name", - standardize_name(col("middle_name"))).withColumn( - "last_name", standardize_name(col("last_name"))).withColumn("name", standardize_full_name(col("first_name"), - col("middle_name"), - col("last_name"))).withColumn( - "post_code", lower(col("post_code"))).withColumn("address_line_1", - standardize_address_line(col("address_line_1"))).withColumn( - "address_line_2", standardize_address_line(col("address_line_2"))).withColumn("address_line_3", - standardize_address_line( - col("address_line_3"))).withColumn( - "address_line_4", standardize_address_line(col("address_line_4"))).withColumn("full_address", full_address( - col("address_line_1"), col("address_line_2"), col("address_line_3"), col("address_line_4"))).select( - col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), - col("source_filter")).dropDuplicates(["source_id", "first_name", "last_name", "date_of_birth", "post_code"]) + housing_benefit = ( + housing_benefit_cleaned.withColumn("source_id", col("claim_person_ref")) + .withColumn("title", categorise_title(lower(col("title")))) + .withColumn("first_name", standardize_name(col("first_name"))) + .withColumn("middle_name", standardize_name(col("middle_name"))) + .withColumn("last_name", standardize_name(col("last_name"))) + .withColumn( + 
"name", + standardize_full_name( + col("first_name"), col("middle_name"), col("last_name") + ), + ) + .withColumn("post_code", lower(col("post_code"))) + .withColumn("address_line_1", standardize_address_line(col("address_line_1"))) + .withColumn("address_line_2", standardize_address_line(col("address_line_2"))) + .withColumn("address_line_3", standardize_address_line(col("address_line_3"))) + .withColumn("address_line_4", standardize_address_line(col("address_line_4"))) + .withColumn( + "full_address", + full_address( + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + ), + ) + .select( + col("source"), + col("source_id"), + col("uprn"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("full_address"), + col("source_filter"), + ) + .dropDuplicates( + ["source_id", "first_name", "last_name", "date_of_birth", "post_code"] + ) + ) return housing_benefit @@ -623,19 +1091,41 @@ def prepare_clean_parking_permit_data(parking_permit_df: DataFrame) -> DataFrame A DataFrame after preparing and cleaning parking permit data. 
""" - parking_permit_cleaned = parking_permit_df.withColumn("source", lit("parking_permit")).withColumn("source_filter", - lit("live " - "parking " - "permit")).withColumn( - "extracted_name", - extract_name_udf(concat_ws(" ", col("forename_of_applicant"), col("surname_of_applicant")))).withColumn( - "date_of_birth", to_date(col("date_of_birth_of_applicant"), format="yyyy-MM-dd")).withColumnRenamed("postcode", - "post_code").withColumnRenamed( - "email_address_of_applicant", "email").filter( - (col("permit_type").isin(["Residents", "Estate Resident"])) & (col("live_permit_flag") == 1)).select( - col("source"), col("permit_reference"), col("extracted_name.*"), col("date_of_birth"), col("email"), - col("post_code"), col("uprn"), col("address_line_1"), col("address_line_2"), col("address_line_3"), - col("source_filter")) + parking_permit_cleaned = ( + parking_permit_df.withColumn("source", lit("parking_permit")) + .withColumn("source_filter", lit("live " "parking " "permit")) + .withColumn( + "extracted_name", + extract_name_udf( + concat_ws( + " ", col("forename_of_applicant"), col("surname_of_applicant") + ) + ), + ) + .withColumn( + "date_of_birth", + to_date(col("date_of_birth_of_applicant"), format="yyyy-MM-dd"), + ) + .withColumnRenamed("postcode", "post_code") + .withColumnRenamed("email_address_of_applicant", "email") + .filter( + (col("permit_type").isin(["Residents", "Estate Resident"])) + & (col("live_permit_flag") == 1) + ) + .select( + col("source"), + col("permit_reference"), + col("extracted_name.*"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("source_filter"), + ) + ) return parking_permit_cleaned @@ -669,30 +1159,60 @@ def standardize_parking_permit_data(parking_permit_cleaned: DataFrame) -> DataFr Returns: A parking permit DataFrame with all the standard columns listed above. 
""" - parking_permit = parking_permit_cleaned.filter(col("entity_type") == "Person").drop(col("entity_type")).withColumn( - "source_id", col("permit_reference")).withColumn("title", categorise_title(lower(col("title")))).withColumn( - "first_name", standardize_name(col("first_name"))).withColumn("middle_name", - standardize_name(col("middle_name"))).withColumn( - "last_name", standardize_name(col("last_name"))).withColumn("name", standardize_full_name(col("first_name"), - col("middle_name"), - col("last_name"))).withColumn( - "post_code", lower(col("post_code"))).withColumn("address_line_1", - standardize_address_line(col("address_line_1"))).withColumn( - "address_line_2", standardize_address_line(col("address_line_2"))).withColumn("address_line_3", - standardize_address_line( - col("address_line_3"))).withColumn( - "address_line_4", lit("").cast(StringType())).withColumn("full_address", full_address(col("address_line_1"), - col("address_line_2"), - col("address_line_3"), - col("address_line_4"))).select( - col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), col("source_filter")) + parking_permit = ( + parking_permit_cleaned.filter(col("entity_type") == "Person") + .drop(col("entity_type")) + .withColumn("source_id", col("permit_reference")) + .withColumn("title", categorise_title(lower(col("title")))) + .withColumn("first_name", standardize_name(col("first_name"))) + .withColumn("middle_name", standardize_name(col("middle_name"))) + .withColumn("last_name", standardize_name(col("last_name"))) + .withColumn( + "name", + standardize_full_name( + col("first_name"), col("middle_name"), col("last_name") + ), + ) + .withColumn("post_code", lower(col("post_code"))) + .withColumn("address_line_1", 
standardize_address_line(col("address_line_1"))) + .withColumn("address_line_2", standardize_address_line(col("address_line_2"))) + .withColumn("address_line_3", standardize_address_line(col("address_line_3"))) + .withColumn("address_line_4", lit("").cast(StringType())) + .withColumn( + "full_address", + full_address( + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + ), + ) + .select( + col("source"), + col("source_id"), + col("uprn"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("full_address"), + col("source_filter"), + ) + ) return parking_permit -def prepare_clean_schools_admissions_data(schools_admissions_df: DataFrame) -> DataFrame: +def prepare_clean_schools_admissions_data( + schools_admissions_df: DataFrame, +) -> DataFrame: """A function to prepare and clean schools admissions data. Splits ou middle name from first name. Sorts address columns so that they are consistent with other datasets. @@ -703,70 +1223,170 @@ def prepare_clean_schools_admissions_data(schools_admissions_df: DataFrame) -> D A DataFrame after preparing data from multiple sources and cleaning it. 
""" - address_cols = ["address_line_1", "address_line_2", "address_line_3", "address_line_4"] - - schools_admissions_cleaned = schools_admissions_df.withColumn("source", lit("schools_admission")).withColumn( - "source_id", col("child_id")).withColumn("first_name", - split(schools_admissions_df["contact_forename"], ' ').getItem( - 0)).withColumn("middle_name", - split(schools_admissions_df["contact_forename"], - ' ').getItem(1)).withColumn("last_name", - col("contact_surname")).withColumn( - "name", regexp_replace(concat_ws(" ", col("first_name"), col("middle_name"), col("last_name")), r"\s+", - " ")).withColumn("date_of_birth", lit("")).withColumnRenamed("first_lLine", - "address_line_1").withColumnRenamed( - "second_line", "address_line_2").withColumnRenamed("third_line", "address_line_3").withColumnRenamed("town", - "address_line_4").withColumn( - "source_filter", lit("school admissions")).select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), col("last_name"), - col("name"), col("date_of_birth"), col("email"), - col("post_code"), col("uprn"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter")) + address_cols = [ + "address_line_1", + "address_line_2", + "address_line_3", + "address_line_4", + ] + + schools_admissions_cleaned = ( + schools_admissions_df.withColumn("source", lit("schools_admission")) + .withColumn("source_id", col("child_id")) + .withColumn( + "first_name", + split(schools_admissions_df["contact_forename"], " ").getItem(0), + ) + .withColumn( + "middle_name", + split(schools_admissions_df["contact_forename"], " ").getItem(1), + ) + .withColumn("last_name", col("contact_surname")) + .withColumn( + "name", + regexp_replace( + concat_ws(" ", col("first_name"), col("middle_name"), col("last_name")), + r"\s+", + " ", + ), + ) + .withColumn("date_of_birth", lit("")) + .withColumnRenamed("first_lLine", "address_line_1") + 
.withColumnRenamed("second_line", "address_line_2") + .withColumnRenamed("third_line", "address_line_3") + .withColumnRenamed("town", "address_line_4") + .withColumn("source_filter", lit("school admissions")) + .select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("source_filter"), + ) + ) # create a zip of address line arrays, sorted in the order of not null (False), column order - schools_admissions_cleaned = schools_admissions_cleaned.select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), - col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), - col("address_line_3"), col("address_line_4"), - col("source_filter"), array_sort( - arrays_zip(array([col(c).isNull() for c in address_cols]), array([lit(i) for i in range(4)]), - array([col(c) for c in address_cols]))).alias('address_sorted')) + schools_admissions_cleaned = schools_admissions_cleaned.select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("source_filter"), + array_sort( + arrays_zip( + array([col(c).isNull() for c in address_cols]), + array([lit(i) for i in range(4)]), + array([col(c) for c in address_cols]), + ) + ).alias("address_sorted"), + ) # disaggregate address_sorted arrays into columns - schools_admissions_cleaned = schools_admissions_cleaned.select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), - 
col("last_name"), col("name"), col("date_of_birth"), - col("email"), col("post_code"), col("uprn"), - col("source_filter"), *[ - col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) + schools_admissions_cleaned = schools_admissions_cleaned.select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("source_filter"), + *[col("address_sorted")[i]["2"].alias(address_cols[i]) for i in range(4)], + ) # rejig address lines - schools_admissions_cleaned = schools_admissions_cleaned.withColumn("address_line_1", when( - col("address_line_1").rlike(r"\d+$") & col("address_line_2").rlike(r"^[A-Za-z]"), - concat_ws(" ", col("address_line_1"), col("address_line_2"))).otherwise(col("address_line_1"))).withColumn( - "address_line_2", when(col("address_line_1").contains(col("address_line_2")), col("address_line_3")).otherwise( - concat_ws(" ", col("address_line_2"), col("address_line_3")))).withColumn("address_line_2", when( - col("address_line_2").rlike(r"\d+$"), concat_ws(" ", col("address_line_2"), col("address_line_4"))).otherwise( - col("address_line_2"))).withColumn("address_line_3", when(col("address_line_2").contains(col("address_line_3")), - lit("london"))).withColumn("address_line_2", when( - col("address_line_2").isNull(), lit("hackney")).otherwise(col("address_line_2"))).withColumn("address_line_3", - when( - col("address_line_3").isNull(), - lit("london")).otherwise( - col("address_line_3"))).withColumn( - "address_line_4", lit("")).select(col("source"), col("source_id"), col("title"), col("first_name"), - col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), - col("email"), col("post_code"), col("uprn"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), - col("source_filter")) + schools_admissions_cleaned = ( + 
schools_admissions_cleaned.withColumn( + "address_line_1", + when( + col("address_line_1").rlike(r"\d+$") + & col("address_line_2").rlike(r"^[A-Za-z]"), + concat_ws(" ", col("address_line_1"), col("address_line_2")), + ).otherwise(col("address_line_1")), + ) + .withColumn( + "address_line_2", + when( + col("address_line_1").contains(col("address_line_2")), + col("address_line_3"), + ).otherwise(concat_ws(" ", col("address_line_2"), col("address_line_3"))), + ) + .withColumn( + "address_line_2", + when( + col("address_line_2").rlike(r"\d+$"), + concat_ws(" ", col("address_line_2"), col("address_line_4")), + ).otherwise(col("address_line_2")), + ) + .withColumn( + "address_line_3", + when(col("address_line_2").contains(col("address_line_3")), lit("london")), + ) + .withColumn( + "address_line_2", + when(col("address_line_2").isNull(), lit("hackney")).otherwise( + col("address_line_2") + ), + ) + .withColumn( + "address_line_3", + when(col("address_line_3").isNull(), lit("london")).otherwise( + col("address_line_3") + ), + ) + .withColumn("address_line_4", lit("")) + .select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("source_filter"), + ) + ) return schools_admissions_cleaned -def standardize_schools_admissions_data(schools_admissions_cleaned: DataFrame) -> DataFrame: +def standardize_schools_admissions_data( + schools_admissions_cleaned: DataFrame, +) -> DataFrame: """Standardize schools admissions data. This function convert all the custom names (coming from their respective sources to standard names that will be used by various other functions like feature engineering etc.) 
The DataFrame returned will have the following columns: @@ -802,29 +1422,55 @@ def standardize_schools_admissions_data(schools_admissions_cleaned: DataFrame) - A schools admissions DataFrame with all the standard column listed above. """ - schools_admissions = schools_admissions_cleaned.withColumn("source_id", col("source_id")).withColumn("title", - categorise_title( - lower(trim( - col("title"))))).withColumn( - "first_name", standardize_name(trim(col("first_name")))).withColumn("middle_name", standardize_name( - trim(col("middle_name")))).withColumn("last_name", standardize_name(trim(col("last_name")))).withColumn("name", - standardize_name( - trim( - col("name")))).withColumn( - "post_code", lower(trim(col("post_code")))).withColumn("address_line_1", standardize_address_line( - trim(col("address_line_1")))).withColumn("address_line_2", - standardize_address_line(trim(col("address_line_2")))).withColumn( - "address_line_3", standardize_address_line(trim(col("address_line_3")))).withColumn("address_line_4", - standardize_address_line( - trim( - col("address_line_4")))).withColumn( - "full_address1", - full_address(trim(col("address_line_1")), trim(col("address_line_2")), trim(col("address_line_3")), - trim(col("address_line_4")))).withColumn("full_address", - regexp_replace(col("full_address1"), r"\s+", " ")).select( - col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), col("source_filter")) + schools_admissions = ( + schools_admissions_cleaned.withColumn("source_id", col("source_id")) + .withColumn("title", categorise_title(lower(trim(col("title"))))) + .withColumn("first_name", standardize_name(trim(col("first_name")))) + .withColumn("middle_name", standardize_name(trim(col("middle_name")))) + .withColumn("last_name", 
standardize_name(trim(col("last_name")))) + .withColumn("name", standardize_name(trim(col("name")))) + .withColumn("post_code", lower(trim(col("post_code")))) + .withColumn( + "address_line_1", standardize_address_line(trim(col("address_line_1"))) + ) + .withColumn( + "address_line_2", standardize_address_line(trim(col("address_line_2"))) + ) + .withColumn( + "address_line_3", standardize_address_line(trim(col("address_line_3"))) + ) + .withColumn( + "address_line_4", standardize_address_line(trim(col("address_line_4"))) + ) + .withColumn( + "full_address1", + full_address( + trim(col("address_line_1")), + trim(col("address_line_2")), + trim(col("address_line_3")), + trim(col("address_line_4")), + ), + ) + .withColumn("full_address", regexp_replace(col("full_address1"), r"\s+", " ")) + .select( + col("source"), + col("source_id"), + col("uprn"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("full_address"), + col("source_filter"), + ) + ) return schools_admissions @@ -840,66 +1486,158 @@ def prepare_clean_freedom_pass_admissions_data(freedom_df: DataFrame) -> DataFra freedom_cleaned (Dataframe): A DataFrame after preparing data from multiple sources and cleaning it. 
""" - address_cols = ["address_line_1", "address_line_2", "address_line_3", "address_line_4"] - - freedom_cleaned = freedom_df.withColumn("source", lit("freedom_passes")).withColumn("source_id", - col("applicantid")).withColumn( - "first_name", col("forename")).withColumn("middle_name", lit("")).withColumn("last_name", - col("surname")).withColumn("name", - regexp_replace( - concat_ws( - " ", - col("first_name"), - col("last_name")), - r"\s+", - " ")).withColumnRenamed( - "house_name_number", "address_line_1").withColumnRenamed("building_name", "address_line_2").withColumnRenamed( - "street", "address_line_3").withColumnRenamed("district", "address_line_4").withColumnRenamed("postcode", - "post_code").withColumnRenamed( - "email_address", "email").withColumn("date_of_birth", - to_date(col("date_of_birth"), format="dd/MM/yyyy")).withColumn("uprn", - lit("")).withColumn( - "source_filter", lit("freedom_passes_2024")).select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), col("last_name"), - col("name"), col("date_of_birth"), col("email"), - col("post_code"), col("uprn"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), - col("address_line_4"), col("source_filter")) + address_cols = [ + "address_line_1", + "address_line_2", + "address_line_3", + "address_line_4", + ] + + freedom_cleaned = ( + freedom_df.withColumn("source", lit("freedom_passes")) + .withColumn("source_id", col("applicantid")) + .withColumn("first_name", col("forename")) + .withColumn("middle_name", lit("")) + .withColumn("last_name", col("surname")) + .withColumn( + "name", + regexp_replace( + concat_ws(" ", col("first_name"), col("last_name")), r"\s+", " " + ), + ) + .withColumnRenamed("house_name_number", "address_line_1") + .withColumnRenamed("building_name", "address_line_2") + .withColumnRenamed("street", "address_line_3") + .withColumnRenamed("district", "address_line_4") + .withColumnRenamed("postcode", "post_code") + 
.withColumnRenamed("email_address", "email") + .withColumn("date_of_birth", to_date(col("date_of_birth"), format="dd/MM/yyyy")) + .withColumn("uprn", lit("")) + .withColumn("source_filter", lit("freedom_passes_2024")) + .select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("source_filter"), + ) + ) # create a zip of address line arrays, sorted in the order of not null (False), column order - freedom_cleaned = freedom_cleaned.select(col("source"), col("source_id"), col("title"), col("first_name"), - col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), - col("email"), col("post_code"), col("uprn"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), - col("source_filter"), array_sort( - arrays_zip(array([col(c).isNull() for c in address_cols]), array([lit(i) for i in range(4)]), - array([col(c) for c in address_cols]))).alias('address_sorted')) + freedom_cleaned = freedom_cleaned.select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("source_filter"), + array_sort( + arrays_zip( + array([col(c).isNull() for c in address_cols]), + array([lit(i) for i in range(4)]), + array([col(c) for c in address_cols]), + ) + ).alias("address_sorted"), + ) # disaggregate address_sorted arrays into columns - freedom_cleaned = freedom_cleaned.select(col("source"), col("source_id"), col("title"), col("first_name"), - col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), - 
col("email"), col("post_code"), col("uprn"), col("source_filter"), - *[col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) + freedom_cleaned = freedom_cleaned.select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("source_filter"), + *[col("address_sorted")[i]["2"].alias(address_cols[i]) for i in range(4)], + ) # rejig address lines - freedom_cleaned = freedom_cleaned.withColumn("address_line_1", when( - col("address_line_1").rlike(r"\d+[a-z]$") & col("address_line_2").rlike(r"^[A-Za-z]"), - concat_ws(" ", col("address_line_1"), col("address_line_2"))).otherwise(col("address_line_1"))).withColumn( - "address_line_2", when(col("address_line_1").contains(col("address_line_2")), col("address_line_3")).otherwise( - concat_ws(" ", col("address_line_2"), col("address_line_3")))).withColumn("address_line_2", when( - col("address_line_2").rlike(r"\d+$"), concat_ws(" ", col("address_line_2"), col("address_line_4"))).otherwise( - col("address_line_2"))).withColumn("address_line_3", when(col("address_line_2").contains(col("address_line_3")), - lit("london"))).withColumn("address_line_2", when( - col("address_line_2").isNull(), lit("hackney")).otherwise(col("address_line_2"))).withColumn("address_line_3", - when( - col("address_line_3").isNull(), - lit("london")).otherwise( - col("address_line_3"))).withColumn( - "address_line_4", lit("")).select(col("source"), col("source_id"), col("title"), col("first_name"), - col("middle_name"), col("last_name"), col("name"), col("date_of_birth"), - col("email"), col("post_code"), col("uprn"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), - col("source_filter")) + freedom_cleaned = ( + freedom_cleaned.withColumn( + "address_line_1", + when( + col("address_line_1").rlike(r"\d+[a-z]$") + & 
col("address_line_2").rlike(r"^[A-Za-z]"), + concat_ws(" ", col("address_line_1"), col("address_line_2")), + ).otherwise(col("address_line_1")), + ) + .withColumn( + "address_line_2", + when( + col("address_line_1").contains(col("address_line_2")), + col("address_line_3"), + ).otherwise(concat_ws(" ", col("address_line_2"), col("address_line_3"))), + ) + .withColumn( + "address_line_2", + when( + col("address_line_2").rlike(r"\d+$"), + concat_ws(" ", col("address_line_2"), col("address_line_4")), + ).otherwise(col("address_line_2")), + ) + .withColumn( + "address_line_3", + when(col("address_line_2").contains(col("address_line_3")), lit("london")), + ) + .withColumn( + "address_line_2", + when(col("address_line_2").isNull(), lit("hackney")).otherwise( + col("address_line_2") + ), + ) + .withColumn( + "address_line_3", + when(col("address_line_3").isNull(), lit("london")).otherwise( + col("address_line_3") + ), + ) + .withColumn("address_line_4", lit("")) + .select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("source_filter"), + ) + ) return freedom_cleaned @@ -940,42 +1678,62 @@ def standardize_freedom_pass_data(freedom_cleaned: DataFrame) -> DataFrame: freedom_passes (Dataframe): Freedom pass dataframe with all the standardised columns listed above. 
""" - freedom_passes = freedom_cleaned.withColumn("source_id", col("source_id")).withColumn("title", categorise_title( - lower(trim(col("title"))))).withColumn("first_name", standardize_name(trim(col("first_name")))).withColumn( - "middle_name", standardize_name(trim(col("middle_name")))).withColumn("last_name", standardize_name( - trim(col("last_name")))).withColumn("name", standardize_name(trim(col("name")))).withColumn("post_code", lower( - trim(col("post_code")))).withColumn("address_line_1", - standardize_address_line(trim(col("address_line_1")))).withColumn( - "address_line_2", standardize_address_line(trim(col("address_line_2")))).withColumn("address_line_3", - standardize_address_line( - trim( - col("address_line_3")))).withColumn( - "address_line_4", standardize_address_line(trim(col("address_line_4")))).withColumn("full_address1", - full_address(trim( - col("address_line_1")), - trim( - col("address_line_2")), - trim( - col("address_line_3")), - trim( - col("address_line_4")))).withColumn( - "full_address", regexp_replace(col("full_address1"), r"\s+", " ")).select(col("source"), col("source_id"), - col("uprn"), col("title"), - col("first_name"), col("middle_name"), - col("last_name"), col("name"), - col("date_of_birth"), - col("post_code"), - col("address_line_1"), - col("address_line_2"), - col("address_line_3"), - col("address_line_4"), - col("full_address"), - col("source_filter")) + freedom_passes = ( + freedom_cleaned.withColumn("source_id", col("source_id")) + .withColumn("title", categorise_title(lower(trim(col("title"))))) + .withColumn("first_name", standardize_name(trim(col("first_name")))) + .withColumn("middle_name", standardize_name(trim(col("middle_name")))) + .withColumn("last_name", standardize_name(trim(col("last_name")))) + .withColumn("name", standardize_name(trim(col("name")))) + .withColumn("post_code", lower(trim(col("post_code")))) + .withColumn( + "address_line_1", standardize_address_line(trim(col("address_line_1"))) + ) + 
.withColumn( + "address_line_2", standardize_address_line(trim(col("address_line_2"))) + ) + .withColumn( + "address_line_3", standardize_address_line(trim(col("address_line_3"))) + ) + .withColumn( + "address_line_4", standardize_address_line(trim(col("address_line_4"))) + ) + .withColumn( + "full_address1", + full_address( + trim(col("address_line_1")), + trim(col("address_line_2")), + trim(col("address_line_3")), + trim(col("address_line_4")), + ), + ) + .withColumn("full_address", regexp_replace(col("full_address1"), r"\s+", " ")) + .select( + col("source"), + col("source_id"), + col("uprn"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("full_address"), + col("source_filter"), + ) + ) return freedom_passes -def prepare_clean_electoral_register_data(electoral_register_df: DataFrame) -> DataFrame: +def prepare_clean_electoral_register_data( + electoral_register_df: DataFrame, +) -> DataFrame: """ This function cleans raw electoral register data from Xpress read for standardising. Args: @@ -985,51 +1743,110 @@ def prepare_clean_electoral_register_data(electoral_register_df: DataFrame) -> D electoral_register_cleaned (Dataframe): Cleaned dataframe containing electoral register data. 
""" - address_cols = ["address_line_1", "address_line_2", "address_line_3", "address_line_4"] - - electoral_register_cleaned = electoral_register_df.withColumn("source", lit("electoral_register")).withColumn( - "source_id", col("elector_id")).withColumn("first_name", - split(electoral_register_df["elector_forename"], ' ').getItem( - 0)).withColumn("middle_name", - col("elector_middle_name")).withColumn( - "last_name", col("elector_surname")).withColumn("name", regexp_replace( - concat_ws(" ", col("first_name"), col("middle_name"), col("last_name")), r"\s+", " ")).withColumn( - "date_of_birth", to_date(col("elector_dob"), format="yyyy-MM-dd")).withColumnRenamed("property_address_1", - "address_line_1").withColumnRenamed( - "property_address_2", "address_line_2").withColumnRenamed("property_address_3", - "address_line_3").withColumnRenamed( - "property_address_4", "address_line_4").withColumnRenamed("property_post_code", "post_code").withColumnRenamed( - "property_urn", "uprn").withColumn("email", lit("")).withColumn("title", lit("")).withColumn("source_filter", - lit("electoral " - "register " - "jun23")).select( - col("source"), col("source_id"), col("title"), col("first_name"), col("middle_name"), col("last_name"), - col("name"), col("date_of_birth"), col("email"), col("post_code"), col("uprn"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), col("source_filter")) + address_cols = [ + "address_line_1", + "address_line_2", + "address_line_3", + "address_line_4", + ] + + electoral_register_cleaned = ( + electoral_register_df.withColumn("source", lit("electoral_register")) + .withColumn("source_id", col("elector_id")) + .withColumn( + "first_name", + split(electoral_register_df["elector_forename"], " ").getItem(0), + ) + .withColumn("middle_name", col("elector_middle_name")) + .withColumn("last_name", col("elector_surname")) + .withColumn( + "name", + regexp_replace( + concat_ws(" ", col("first_name"), col("middle_name"), 
col("last_name")), + r"\s+", + " ", + ), + ) + .withColumn("date_of_birth", to_date(col("elector_dob"), format="yyyy-MM-dd")) + .withColumnRenamed("property_address_1", "address_line_1") + .withColumnRenamed("property_address_2", "address_line_2") + .withColumnRenamed("property_address_3", "address_line_3") + .withColumnRenamed("property_address_4", "address_line_4") + .withColumnRenamed("property_post_code", "post_code") + .withColumnRenamed("property_urn", "uprn") + .withColumn("email", lit("")) + .withColumn("title", lit("")) + .withColumn("source_filter", lit("electoral " "register " "jun23")) + .select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("source_filter"), + ) + ) # create a zip of address line arrays, sorted in the order of not null (False), column order - electoral_register_cleaned = electoral_register_cleaned.select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), - col("email"), col("post_code"), col("uprn"), - col("address_line_1"), col("address_line_2"), - col("address_line_3"), col("address_line_4"), - col("source_filter"), array_sort( - arrays_zip(array([col(c).isNull() for c in address_cols]), array([lit(i) for i in range(4)]), - array([col(c) for c in address_cols]))).alias('address_sorted')) + electoral_register_cleaned = electoral_register_cleaned.select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("source_filter"), + 
array_sort( + arrays_zip( + array([col(c).isNull() for c in address_cols]), + array([lit(i) for i in range(4)]), + array([col(c) for c in address_cols]), + ) + ).alias("address_sorted"), + ) # disaggregate address_sorted arrays into columns - electoral_register_cleaned = electoral_register_cleaned.select(col("source"), col("source_id"), col("title"), - col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), - col("email"), col("post_code"), col("uprn"), - col("source_filter"), *[ - col("address_sorted")[i]['2'].alias(address_cols[i]) for i in range(4)]) + electoral_register_cleaned = electoral_register_cleaned.select( + col("source"), + col("source_id"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("email"), + col("post_code"), + col("uprn"), + col("source_filter"), + *[col("address_sorted")[i]["2"].alias(address_cols[i]) for i in range(4)], + ) return electoral_register_cleaned -def standardize_electoral_register_data(electoral_register_cleaned: DataFrame) -> DataFrame: +def standardize_electoral_register_data( + electoral_register_cleaned: DataFrame, +) -> DataFrame: """Standardize electoral register data. This function convert all the custom names (coming from their respective sources to standard names that will be used by various other functions like feature engineering etc.) The DataFrame returned will have the following columns: @@ -1065,29 +1882,55 @@ def standardize_electoral_register_data(electoral_register_cleaned: DataFrame) - A electoral_register DataFrame with all the standard column listed above. 
""" - electoral_register = electoral_register_cleaned.withColumn("source_id", col("source_id")).withColumn("title", - categorise_title( - lower(trim( - col("title"))))).withColumn( - "first_name", standardize_name(trim(col("first_name")))).withColumn("middle_name", standardize_name( - trim(col("middle_name")))).withColumn("last_name", standardize_name(trim(col("last_name")))).withColumn("name", - standardize_name( - trim( - col("name")))).withColumn( - "post_code", lower(trim(col("post_code")))).withColumn("address_line_1", standardize_address_line( - trim(col("address_line_1")))).withColumn("address_line_2", - standardize_address_line(trim(col("address_line_2")))).withColumn( - "address_line_3", standardize_address_line(trim(col("address_line_3")))).withColumn("address_line_4", - standardize_address_line( - trim( - col("address_line_4")))).withColumn( - "full_address1", - full_address(trim(col("address_line_1")), trim(col("address_line_2")), trim(col("address_line_3")), - trim(col("address_line_4")))).withColumn("full_address", - regexp_replace(col("full_address1"), r"\s+", " ")).select( - col("source"), col("source_id"), col("uprn"), col("title"), col("first_name"), col("middle_name"), - col("last_name"), col("name"), col("date_of_birth"), col("post_code"), col("address_line_1"), - col("address_line_2"), col("address_line_3"), col("address_line_4"), col("full_address"), col("source_filter")) + electoral_register = ( + electoral_register_cleaned.withColumn("source_id", col("source_id")) + .withColumn("title", categorise_title(lower(trim(col("title"))))) + .withColumn("first_name", standardize_name(trim(col("first_name")))) + .withColumn("middle_name", standardize_name(trim(col("middle_name")))) + .withColumn("last_name", standardize_name(trim(col("last_name")))) + .withColumn("name", standardize_name(trim(col("name")))) + .withColumn("post_code", lower(trim(col("post_code")))) + .withColumn( + "address_line_1", standardize_address_line(trim(col("address_line_1"))) 
+ ) + .withColumn( + "address_line_2", standardize_address_line(trim(col("address_line_2"))) + ) + .withColumn( + "address_line_3", standardize_address_line(trim(col("address_line_3"))) + ) + .withColumn( + "address_line_4", standardize_address_line(trim(col("address_line_4"))) + ) + .withColumn( + "full_address1", + full_address( + trim(col("address_line_1")), + trim(col("address_line_2")), + trim(col("address_line_3")), + trim(col("address_line_4")), + ), + ) + .withColumn("full_address", regexp_replace(col("full_address1"), r"\s+", " ")) + .select( + col("source"), + col("source_id"), + col("uprn"), + col("title"), + col("first_name"), + col("middle_name"), + col("last_name"), + col("name"), + col("date_of_birth"), + col("post_code"), + col("address_line_1"), + col("address_line_2"), + col("address_line_3"), + col("address_line_4"), + col("full_address"), + col("source_filter"), + ) + ) return electoral_register @@ -1106,10 +1949,16 @@ def remove_deceased(df: DataFrame) -> DataFrame: A DataFrame after removing all the deceased persons. 
""" deceased_filter_cond = ( - lower(col("title")).contains("(deceased)") | lower(col("title")).contains("executor") | lower( - col("title")).contains("exor") | lower(col("title")).contains("rep") | lower(col("title")).contains( - " of") | lower(col("title")).contains("of ") | lower(col("title")).contains("the") | lower( - col("title")).contains("pe") | lower(col("title")).contains("other")) + lower(col("title")).contains("(deceased)") + | lower(col("title")).contains("executor") + | lower(col("title")).contains("exor") + | lower(col("title")).contains("rep") + | lower(col("title")).contains(" of") + | lower(col("title")).contains("of ") + | lower(col("title")).contains("the") + | lower(col("title")).contains("pe") + | lower(col("title")).contains("other") + ) return df.filter(~deceased_filter_cond) @@ -1129,20 +1978,26 @@ def generate_possible_matches(df: DataFrame) -> DataFrame: """ partitions = 5 - df_a = df.select(*[col(c).alias(f"a_{c}") for c in df.columns]).withColumn("first_name_soundex", - soundex(col("a_first_name"))).withColumn( - "last_name_soundex", soundex(col("a_last_name"))).repartition(partitions, col("first_name_soundex"), - col("last_name_soundex")) - - df_b = df.select(*[col(c).alias(f"b_{c}") for c in df.columns]).withColumn("first_name_soundex", - soundex(col("b_first_name"))).withColumn( - "last_name_soundex", soundex(col("b_last_name"))).repartition(partitions, col("first_name_soundex"), - col("last_name_soundex")) - - return df_a.join(df_b, (df_a["a_source_id"] != df_b["b_source_id"]) & ( - df_a["first_name_soundex"] == df_b["first_name_soundex"]) & ( - df_a["last_name_soundex"] == df_b["last_name_soundex"])).drop( - *["first_name_soundex", "last_name_soundex"]) + df_a = ( + df.select(*[col(c).alias(f"a_{c}") for c in df.columns]) + .withColumn("first_name_soundex", soundex(col("a_first_name"))) + .withColumn("last_name_soundex", soundex(col("a_last_name"))) + .repartition(partitions, col("first_name_soundex"), col("last_name_soundex")) + ) 
+ + df_b = ( + df.select(*[col(c).alias(f"b_{c}") for c in df.columns]) + .withColumn("first_name_soundex", soundex(col("b_first_name"))) + .withColumn("last_name_soundex", soundex(col("b_last_name"))) + .repartition(partitions, col("first_name_soundex"), col("last_name_soundex")) + ) + + return df_a.join( + df_b, + (df_a["a_source_id"] != df_b["b_source_id"]) + & (df_a["first_name_soundex"] == df_b["first_name_soundex"]) + & (df_a["last_name_soundex"] == df_b["last_name_soundex"]), + ).drop(*["first_name_soundex", "last_name_soundex"]) def automatically_label_data(df: DataFrame) -> DataFrame: @@ -1155,10 +2010,20 @@ def automatically_label_data(df: DataFrame) -> DataFrame: Returns: A DataFrame with column auto_labels. """ - return df.withColumn("auto_labels", when((col("a_source_id") == col("b_source_id")) | ( - (col("a_first_name") == col("b_first_name")) & (col("a_last_name") == col("b_last_name")) & ( - col("a_date_of_birth") == col("b_date_of_birth")) & (col("a_uprn") == col("b_uprn")) & ( - col("a_post_code") == col("b_post_code"))), lit(True)).otherwise(lit(None).cast(BooleanType()))) + return df.withColumn( + "auto_labels", + when( + (col("a_source_id") == col("b_source_id")) + | ( + (col("a_first_name") == col("b_first_name")) + & (col("a_last_name") == col("b_last_name")) + & (col("a_date_of_birth") == col("b_date_of_birth")) + & (col("a_uprn") == col("b_uprn")) + & (col("a_post_code") == col("b_post_code")) + ), + lit(True), + ).otherwise(lit(None).cast(BooleanType())), + ) @pandas_udf(features_schema) @@ -1199,22 +2064,52 @@ def generate_features(input_df: pd.DataFrame) -> pd.DataFrame: similarity_algo = Cosine() input_df["first_name_similar"] = input_df.apply( - lambda x: phonetic_algo.encode(x["a_first_name"]) == phonetic_algo.encode(x["b_first_name"]), axis=1) + lambda x: phonetic_algo.encode(x["a_first_name"]) + == phonetic_algo.encode(x["b_first_name"]), + axis=1, + ) input_df["middle_name_similar"] = input_df.apply( - lambda x: 
phonetic_algo.encode(x["a_middle_name"]) == phonetic_algo.encode(x["b_middle_name"]), axis=1) + lambda x: phonetic_algo.encode(x["a_middle_name"]) + == phonetic_algo.encode(x["b_middle_name"]), + axis=1, + ) input_df["last_name_similar"] = input_df.apply( - lambda x: phonetic_algo.encode(x["a_last_name"]) == phonetic_algo.encode(x["b_last_name"]), axis=1) - input_df["name_similarity"] = input_df.apply(lambda x: similarity_algo.sim(x["a_name"], x["b_name"]), axis=1) + lambda x: phonetic_algo.encode(x["a_last_name"]) + == phonetic_algo.encode(x["b_last_name"]), + axis=1, + ) + input_df["name_similarity"] = input_df.apply( + lambda x: similarity_algo.sim(x["a_name"], x["b_name"]), axis=1 + ) input_df["address_line_1_similarity"] = input_df.apply( - lambda x: similarity_algo.sim(x["a_address_line_1"], x["b_address_line_1"]), axis=1) + lambda x: similarity_algo.sim(x["a_address_line_1"], x["b_address_line_1"]), + axis=1, + ) input_df["address_line_2_similarity"] = input_df.apply( - lambda x: similarity_algo.sim(x["a_address_line_2"], x["b_address_line_2"]), axis=1) + lambda x: similarity_algo.sim(x["a_address_line_2"], x["b_address_line_2"]), + axis=1, + ) input_df["full_address_similarity"] = input_df.apply( - lambda x: similarity_algo.sim(x["a_full_address"], x["b_full_address"]), axis=1) + lambda x: similarity_algo.sim(x["a_full_address"], x["b_full_address"]), axis=1 + ) return input_df.drop( - ["a_first_name", "b_first_name", "a_last_name", "b_last_name", "a_name", "b_name", "a_address_line_1", - "b_address_line_1", "a_address_line_2", "b_address_line_2", "a_full_address", "b_full_address"], axis=1) + [ + "a_first_name", + "b_first_name", + "a_last_name", + "b_last_name", + "a_name", + "b_name", + "a_address_line_1", + "b_address_line_1", + "a_address_line_2", + "b_address_line_2", + "a_full_address", + "b_full_address", + ], + axis=1, + ) def feature_engineering(df: DataFrame) -> DataFrame: @@ -1246,37 +2141,85 @@ def feature_engineering(df: DataFrame) -> 
DataFrame: match = lit("match") non_match = lit("non-match") unknown = lit("unknown") - features_df = df.withColumn("uprn_same", - when(col("a_uprn") == col("b_uprn"), match).when(col("a_uprn") != col("b_uprn"), - non_match).otherwise( - unknown)).withColumn("title_same", - when(col("a_title") == col("b_title"), match).when( - col("a_title") != col("b_title"), non_match).otherwise( - unknown)).withColumn("date_of_birth_same", when( - col("a_date_of_birth") == col("b_date_of_birth"), match).when(col("a_date_of_birth") != col("b_date_of_birth"), - non_match).otherwise(unknown)).withColumn( - "similarity_features", generate_features( - struct(col("a_first_name"), col("b_first_name"), col("a_middle_name"), col("b_middle_name"), - col("a_last_name"), col("b_last_name"), col("a_name"), col("b_name"), col("a_address_line_1"), - col("b_address_line_1"), col("a_address_line_2"), col("b_address_line_2"), col("a_full_address"), - col("b_full_address")))).select(col("*"), col("similarity_features.*")).drop("similarity_features") + features_df = ( + df.withColumn( + "uprn_same", + when(col("a_uprn") == col("b_uprn"), match) + .when(col("a_uprn") != col("b_uprn"), non_match) + .otherwise(unknown), + ) + .withColumn( + "title_same", + when(col("a_title") == col("b_title"), match) + .when(col("a_title") != col("b_title"), non_match) + .otherwise(unknown), + ) + .withColumn( + "date_of_birth_same", + when(col("a_date_of_birth") == col("b_date_of_birth"), match) + .when(col("a_date_of_birth") != col("b_date_of_birth"), non_match) + .otherwise(unknown), + ) + .withColumn( + "similarity_features", + generate_features( + struct( + col("a_first_name"), + col("b_first_name"), + col("a_middle_name"), + col("b_middle_name"), + col("a_last_name"), + col("b_last_name"), + col("a_name"), + col("b_name"), + col("a_address_line_1"), + col("b_address_line_1"), + col("a_address_line_2"), + col("b_address_line_2"), + col("a_full_address"), + col("b_full_address"), + ) + ), + ) + .select(col("*"), 
col("similarity_features.*")) + .drop("similarity_features") + ) return features_df def evaluation_for_various_metrics(predictions: DataFrame): - metrics = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", - weightCol="label_confidence_score", probabilityCol="probability") + metrics = MulticlassClassificationEvaluator( + predictionCol="prediction", + labelCol="label", + weightCol="label_confidence_score", + probabilityCol="probability", + ) accuracy = metrics.evaluate(predictions, {metrics.metricName: "accuracy"}) - precision_non_match = metrics.evaluate(predictions, - {metrics.metricName: "precisionByLabel", metrics.metricLabel: 0.0}) - precision_match = metrics.evaluate(predictions, {metrics.metricName: "precisionByLabel", metrics.metricLabel: 1.0}) - recall_non_match = metrics.evaluate(predictions, {metrics.metricName: "recallByLabel", metrics.metricLabel: 0.0}) - recall_match = metrics.evaluate(predictions, {metrics.metricName: "recallByLabel", metrics.metricLabel: 1.0}) - return accuracy, precision_non_match, precision_match, recall_non_match, recall_match - - -def train_model(df: DataFrame, model_path: str, test_model: bool, save_model: bool) -> None: + precision_non_match = metrics.evaluate( + predictions, {metrics.metricName: "precisionByLabel", metrics.metricLabel: 0.0} + ) + precision_match = metrics.evaluate( + predictions, {metrics.metricName: "precisionByLabel", metrics.metricLabel: 1.0} + ) + recall_non_match = metrics.evaluate( + predictions, {metrics.metricName: "recallByLabel", metrics.metricLabel: 0.0} + ) + recall_match = metrics.evaluate( + predictions, {metrics.metricName: "recallByLabel", metrics.metricLabel: 1.0} + ) + return ( + accuracy, + precision_non_match, + precision_match, + recall_non_match, + recall_match, + ) + + +def train_model( + df: DataFrame, model_path: str, test_model: bool, save_model: bool +) -> None: """Trains the model Args: @@ -1296,40 +2239,83 @@ def train_model(df: DataFrame, model_path: 
str, test_model: bool, save_model: bo print(f"Training data size: {train.count()}") print(f"Test data size....: {test.count()}") - string_indexer = StringIndexer(inputCols=["uprn_same", "title_same", "date_of_birth_same"], - outputCols=["uprn_indexed", "title_indexed", "date_of_birth_indexed"], - stringOrderType="alphabetAsc") - one_hot_encoder = OneHotEncoder(inputCols=["uprn_indexed", "title_indexed", "date_of_birth_indexed"], - outputCols=["uprn_vec", "title_vec", "date_of_birth_vec"]) + string_indexer = StringIndexer( + inputCols=["uprn_same", "title_same", "date_of_birth_same"], + outputCols=["uprn_indexed", "title_indexed", "date_of_birth_indexed"], + stringOrderType="alphabetAsc", + ) + one_hot_encoder = OneHotEncoder( + inputCols=["uprn_indexed", "title_indexed", "date_of_birth_indexed"], + outputCols=["uprn_vec", "title_vec", "date_of_birth_vec"], + ) vector_assembler = VectorAssembler( - inputCols=["uprn_vec", "title_vec", "date_of_birth_vec", "first_name_similar", "middle_name_similar", - "last_name_similar", "name_similarity", "address_line_1_similarity", "address_line_2_similarity", - "full_address_similarity"], outputCol="features") - classifier = LogisticRegression(featuresCol="features", labelCol="label", weightCol="label_confidence_score", - standardization=False) - - pipeline = Pipeline(stages=[string_indexer, one_hot_encoder, vector_assembler, classifier]) + inputCols=[ + "uprn_vec", + "title_vec", + "date_of_birth_vec", + "first_name_similar", + "middle_name_similar", + "last_name_similar", + "name_similarity", + "address_line_1_similarity", + "address_line_2_similarity", + "full_address_similarity", + ], + outputCol="features", + ) + classifier = LogisticRegression( + featuresCol="features", + labelCol="label", + weightCol="label_confidence_score", + standardization=False, + ) + + pipeline = Pipeline( + stages=[string_indexer, one_hot_encoder, vector_assembler, classifier] + ) # Due to limited time I haven't searched on a larger space # 
param_grid = ParamGridBuilder() \ # .addGrid(classifier.regParam, [0.0001, 0.00005, 8e-05, 7e-05, 5e-05]) \ # .addGrid(classifier.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]) \ # .build() - param_grid = ParamGridBuilder().addGrid(classifier.regParam, [7e-05]).addGrid(classifier.elasticNetParam, - [1.0]).build() - evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", - weightCol="label_confidence_score") - - cv = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=42, - parallelism=5) + param_grid = ( + ParamGridBuilder() + .addGrid(classifier.regParam, [7e-05]) + .addGrid(classifier.elasticNetParam, [1.0]) + .build() + ) + evaluator = BinaryClassificationEvaluator( + labelCol="label", + rawPredictionCol="rawPrediction", + weightCol="label_confidence_score", + ) + + cv = CrossValidator( + estimator=pipeline, + estimatorParamMaps=param_grid, + evaluator=evaluator, + numFolds=5, + seed=42, + parallelism=5, + ) cv_model = cv.fit(train) train_prediction = cv_model.transform(train) - print(f"Training ROC AUC train score before fine-tuning..: {evaluator.evaluate(train_prediction):.5f}") - accuracy, precision_non_match, precision_match, recall_non_match, recall_match = evaluation_for_various_metrics( - train_prediction) + print( + f"Training ROC AUC train score before fine-tuning..: {evaluator.evaluate(train_prediction):.5f}" + ) + ( + accuracy, + precision_non_match, + precision_match, + recall_non_match, + recall_match, + ) = evaluation_for_various_metrics(train_prediction) print(f"Training Accuracy before fine-tuning............: {accuracy:.5f}") - print(f"Training Precision before fine-tuning (non-match): {precision_non_match:.5f}") + print( + f"Training Precision before fine-tuning (non-match): {precision_non_match:.5f}" + ) print(f"Training Precision before fine-tuning.....(match): {precision_match:.5f}") print(f"Training Recall before fine-tuning.(non-match): 
{recall_non_match:.5f}") print(f"Training Recall before fine-tuning.....(match): {recall_match:.5f}") @@ -1340,9 +2326,11 @@ def train_model(df: DataFrame, model_path: str, test_model: bool, save_model: bo # Fine-tuning the model to maximize performance f_measure = training_summary.fMeasureByThreshold max_f_measure = f_measure.groupBy().max("F-Measure").select("max(F-Measure)").head() - best_threshold = \ - f_measure.filter(f_measure["F-Measure"] == max_f_measure["max(F-Measure)"]).select("threshold").head()[ - "threshold"] + best_threshold = ( + f_measure.filter(f_measure["F-Measure"] == max_f_measure["max(F-Measure)"]) + .select("threshold") + .head()["threshold"] + ) print(f"Best threshold: {best_threshold}") cv_model.bestModel.stages[-1].setThreshold(best_threshold) @@ -1350,30 +2338,60 @@ def train_model(df: DataFrame, model_path: str, test_model: bool, save_model: bo cv_model.write().overwrite().save(model_path) train_prediction = cv_model.transform(train) - print(f"Training ROC AUC train score after fine-tuning...: {evaluator.evaluate(train_prediction):.5f}") - accuracy, precision_non_match, precision_match, recall_non_match, recall_match = evaluation_for_various_metrics( - train_prediction) + print( + f"Training ROC AUC train score after fine-tuning...: {evaluator.evaluate(train_prediction):.5f}" + ) + ( + accuracy, + precision_non_match, + precision_match, + recall_non_match, + recall_match, + ) = evaluation_for_various_metrics(train_prediction) print(f"Training Accuracy after fine-tuning.............: {accuracy:.5f}") - print(f"Training Precision after fine-tuning .(non-match): {precision_non_match:.5f}") + print( + f"Training Precision after fine-tuning .(non-match): {precision_non_match:.5f}" + ) print(f"Training Precision after fine-tuning......(match): {precision_match:.5f}") print(f"Training Recall after fine-tuning..(non-match): {recall_non_match:.5f}") print(f"Training Recall after fine-tuning......(match): {recall_match:.5f}") if test_model: - 
print("Only evaluate once in the end, so keep it commented for most of the time.") + print( + "Only evaluate once in the end, so keep it commented for most of the time." + ) test_prediction = cv_model.transform(test) test_prediction.show() - print(f'Write predictions to csv...') + print(f"Write predictions to csv...") test_prediction.printSchema() - test_prediction_for_export = test_prediction.withColumn('probability', - vector_to_array(col('probability'))).withColumn( - 'probability_str', concat_ws('probability')).drop('uprn_vec', 'title_vec', 'date_of_birth_vec', 'features', - 'rawPrediction', 'uprn_indexed', 'title_indexed', - 'date_of_birth_indexed', 'probability') - test_prediction_for_export.write.csv(header=True, path=f"{model_path}/test_predictions") - - accuracy, precision_non_match, precision_match, recall_non_match, recall_match = evaluation_for_various_metrics( - test_prediction) + test_prediction_for_export = ( + test_prediction.withColumn( + "probability", vector_to_array(col("probability")) + ) + .withColumn("probability_str", concat_ws("probability")) + .drop( + "uprn_vec", + "title_vec", + "date_of_birth_vec", + "features", + "rawPrediction", + "uprn_indexed", + "title_indexed", + "date_of_birth_indexed", + "probability", + ) + ) + test_prediction_for_export.write.csv( + header=True, path=f"{model_path}/test_predictions" + ) + + ( + accuracy, + precision_non_match, + precision_match, + recall_non_match, + recall_match, + ) = evaluation_for_various_metrics(test_prediction) print(f"Test ROC AUC..............: {evaluator.evaluate(test_prediction):.5f}") print(f"Test Accuracy.............: {accuracy:.5f}") print(f"Test Precision (non-match): {precision_non_match:.5f}") @@ -1395,17 +2413,35 @@ def predict(features_df: DataFrame, model_path: str) -> DataFrame: Returns DataFrame with prediction. 
""" cv_model: CrossValidatorModel = CrossValidatorModel.load(model_path) - predictions = cv_model.transform(features_df).withColumn("predicted_label", - when(col("prediction") == 1.0, "match").when( - col("prediction") == 0.0, "non-match").otherwise( - "unknown")).drop( - *["uprn_indexed", "title_indexed", "date_of_birth_indexed", "uprn_vec", "title_vec", "date_of_birth_vec", - "features", "rawPrediction", "probability"]) + predictions = ( + cv_model.transform(features_df) + .withColumn( + "predicted_label", + when(col("prediction") == 1.0, "match") + .when(col("prediction") == 0.0, "non-match") + .otherwise("unknown"), + ) + .drop( + *[ + "uprn_indexed", + "title_indexed", + "date_of_birth_indexed", + "uprn_vec", + "title_vec", + "date_of_birth_vec", + "features", + "rawPrediction", + "probability", + ] + ) + ) return predictions -def link_all_matched_persons(standard_df: DataFrame, predicted_df: DataFrame) -> DataFrame: +def link_all_matched_persons( + standard_df: DataFrame, predicted_df: DataFrame +) -> DataFrame: """Finds all the matching person in the standard DataFrame. All the records having same matching_id are considered as same person. 
@@ -1418,30 +2454,32 @@ def link_all_matched_persons(standard_df: DataFrame, predicted_df: DataFrame) -> """ vertices = standard_df.withColumn("id", col("source_id")) - edges = predicted_df.filter(col("prediction") == 1.0).withColumn("src", col("a_source_id")).withColumn("dst", - col("b_source_id")) + edges = ( + predicted_df.filter(col("prediction") == 1.0) + .withColumn("src", col("a_source_id")) + .withColumn("dst", col("b_source_id")) + ) person_graph = GraphFrame(vertices, edges).dropIsolatedVertices() connected = person_graph.connectedComponents() - unique_connections = connected.select(col("source"), col("source_id"), - col("component").alias("matching_id")).distinct() - return standard_df.join(unique_connections, ["source", "source_id"]).orderBy(col("matching_id")) + unique_connections = connected.select( + col("source"), col("source_id"), col("component").alias("matching_id") + ).distinct() + return standard_df.join(unique_connections, ["source", "source_id"]).orderBy( + col("matching_id") + ) # Extra analysis (for analyst only): if you need to do. - # To find how many connection are there # person_graph.inDegrees.filter(col("inDegree") > 1).orderBy(col( # # - # "inDegree").desc()).show(truncate=False) + # To find how many connection are there # person_graph.inDegrees.filter(col("inDegree") > 1).orderBy(col( # # # "inDegree").desc()).show(truncate=False) - # Graph query using motif to find where person 'a' is connected to person 'b', and person 'b' is also connected - # to # person 'a' # motif = person_graph.find("(a)-[]->(b); (b)-[]->(a)") # motif.show(truncate=False) + # Graph query using motif to find where person 'a' is connected to person 'b', and person 'b' is also connected # to # person 'a' # motif = person_graph.find("(a)-[]->(b); (b)-[]->(a)") # motif.show(truncate=False) - # To count number of triangles i.e. 
a connected to b, b connected to c and c is connected back to a # # # # - # triangle_count = person_graph.triangleCount() # triangle_count.orderBy(col("count").desc()).show(n=10, - # truncate=False) + # To count number of triangles i.e. a connected to b, b connected to c and c is connected back to a # # # # # triangle_count = person_graph.triangleCount() # triangle_count.orderBy(col("count").desc()).show(n=10, # truncate=False) def match_persons(model_path: str, standard_df: DataFrame) -> DataFrame: - """ A convenient method that facilitate the user of the module to perform person match. This method accepts a + """A convenient method that facilitate the user of the module to perform person match. This method accepts a standard DataFrame that represents the dataset containing the record records referring to the same person. Standard DataFrame should have the following columns though data it can be missing. @@ -1473,13 +2511,29 @@ def match_persons(model_path: str, standard_df: DataFrame) -> DataFrame: Raises: AssertionError is mandatory columns are missing. 
""" - mandatory_columns = ["source", "source_id", "uprn", "title", "first_name", "middle_name", "last_name", "name", - "date_of_birth", "post_code", "address_line_1", "address_line_2", "address_line_3", - "address_line_4", "full_address"] + mandatory_columns = [ + "source", + "source_id", + "uprn", + "title", + "first_name", + "middle_name", + "last_name", + "name", + "date_of_birth", + "post_code", + "address_line_1", + "address_line_2", + "address_line_3", + "address_line_4", + "full_address", + ] try: assert set(mandatory_columns).issubset(standard_df.columns) except AssertionError as e: - raise AssertionError(f"Standard DataFrame doesn't contain all the mandatory columns and error is {e}") + raise AssertionError( + f"Standard DataFrame doesn't contain all the mandatory columns and error is {e}" + ) possible_matches = generate_possible_matches(standard_df) features_df = feature_engineering(possible_matches) From b12df56439872c31f371849db521a2ad1518137c Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:15:43 +0100 Subject: [PATCH 09/11] Reformat code with Black --- scripts/jobs/data_and_insight/person_matching_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index a88bb214c..4ad27c501 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -404,7 +404,7 @@ def standardize_name(name: Column) -> Column: trim( regexp_replace( regexp_replace(regexp_replace(name, "0", "O"), "1", "L"), - "^[\\&*./\\\]+", + "^[\&*./\]+", "", ) ) @@ -2362,7 +2362,7 @@ def train_model( ) test_prediction = cv_model.transform(test) test_prediction.show() - print(f"Write predictions to csv...") + print("Write predictions to csv...") test_prediction.printSchema() test_prediction_for_export = ( 
test_prediction.withColumn( From d2ae673fed7eb23d71d8ea15251e7ff229111170 Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:17:10 +0100 Subject: [PATCH 10/11] Reformat code with Black --- scripts/jobs/data_and_insight/person_matching_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index 4ad27c501..5f8bf14c5 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -404,7 +404,7 @@ def standardize_name(name: Column) -> Column: trim( regexp_replace( regexp_replace(regexp_replace(name, "0", "O"), "1", "L"), - "^[\&*./\]+", + "^[&*./]+", "", ) ) From ca6f653ccca1eca580766758f63dbc771a6ad04a Mon Sep 17 00:00:00 2001 From: AGibson <4319494+annajgibson@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:32:13 +0100 Subject: [PATCH 11/11] Add in backslashes --- scripts/jobs/data_and_insight/person_matching_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/jobs/data_and_insight/person_matching_module.py b/scripts/jobs/data_and_insight/person_matching_module.py index 5f8bf14c5..5dcb5e6ae 100644 --- a/scripts/jobs/data_and_insight/person_matching_module.py +++ b/scripts/jobs/data_and_insight/person_matching_module.py @@ -404,7 +404,7 @@ def standardize_name(name: Column) -> Column: trim( regexp_replace( regexp_replace(regexp_replace(name, "0", "O"), "1", "L"), - "^[&*./]+", + r"^[\&*./\]+", "", ) )