Housing GX data quality: add matenancyagreement and maproperty inputs and suites.

* scripts/helpers/housing_gx_dq_inputs.py: truncate contacts_reshape
  createdat/lastmodified to 10 characters, register the two sow2b tables in
  the SQL config and table_list, and map the new expectation names to data
  quality dimensions.
* scripts/jobs/housing/housing_maproperty_gx_suite.py and
  scripts/jobs/housing/housing_matenancyagreement_gx_suite.py: new Great
  Expectations suite definitions.

Note: the content of housing_matenancyagreement_gx_suite.py was edited after
this patch was generated (date-only bounds, cot_parsed row condition, extra
comments), so the blob id on its index line no longer matches the content.

diff --git a/scripts/helpers/housing_gx_dq_inputs.py b/scripts/helpers/housing_gx_dq_inputs.py
index 21a0f0ff8..a850c8e17 100644
--- a/scripts/helpers/housing_gx_dq_inputs.py
+++ b/scripts/helpers/housing_gx_dq_inputs.py
@@ -6,7 +6,7 @@
         'sql': """SELECT * FROM "housing-refined-zone"."tenure_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."tenure_reshape") and description in ('Secure', 'Introductory', 'Mesne Profit Ac', 'Non-Secure') and (endoftenuredate is null or substr(endoftenuredate, 1, 11) = '1900-01-01')""",
         'id_field': 'tenancy_id'},
     'contacts_reshape': {
-        'sql': """SELECT id, targetid, createdat, contacttype, subtype, value, lastmodified, targettype, isactive, person_id, import_date FROM "housing-refined-zone"."contacts_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."contacts_reshape") and isactive=True""",
+        'sql': """SELECT id, targetid, substr(createdat, 1, 10) as createdat, contacttype, subtype, value, substr(lastmodified, 1, 10) as lastmodified, targettype, isactive, person_id, import_date FROM "housing-refined-zone"."contacts_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."contacts_reshape") and isactive=True""",
         'id_field': 'id'},
     'housing_homeowner_record_sheet': {
         'sql': """SELECT * FROM "housing-raw-zone"."housing_homeowner_record_sheet" where import_date=(select max(import_date) from "housing-raw-zone"."housing_homeowner_record_sheet")""",
@@ -16,11 +16,17 @@
         'id_field': 'property_dwelling_reference_number'},
     'assets_reshape': {
         'sql': """SELECT * FROM "housing-refined-zone"."assets_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."assets_reshape") and assettype = 'Dwelling'""",
-        'id_field': 'asset_id'}
+        'id_field': 'asset_id'},
+    'matenancyagreement': {
+        'sql': """SELECT *, substr(cast(eot as varchar), 1, 10) as eot_parsed, substr(cast(cot as varchar), 1, 10) as cot_parsed FROM "housing-raw-zone"."sow2b_dbo_matenancyagreement" where import_date=(select max(import_date) FROM "housing-raw-zone"."sow2b_dbo_matenancyagreement")""",
+        'id_field': 'tag_ref'},
+    'maproperty': {
+        'sql': """SELECT * FROM "housing-raw-zone"."sow2b_dbo_maproperty" where import_date=(select max(import_date) FROM "housing-raw-zone"."sow2b_dbo_maproperty")""",
+        'id_field': 'prop_ref'}
 }
 
 table_list = ['person_reshape', 'tenure_reshape', 'contacts_reshape', 'housing_homeowner_record_sheet',
-              'housing_dwellings_list', 'assets_reshape']
+              'housing_dwellings_list', 'assets_reshape', 'matenancyagreement', 'maproperty']
 
 partition_keys = ['import_year', 'import_month', 'import_day', 'import_date']
 
@@ -39,10 +45,12 @@
     'expect_contact_value_column_values_to_be_unique': 'UNIQUENESS',
     'expect_contact_value_column_values_to_not_be_null': 'COMPLETENESS',
     'expect_date_of_birth_column_values_to_not_be_null': 'COMPLETENESS',
-    'expect_date_of_birth_to_be_between': 'VALIDITY',
+    'expect_date_of_birth_to_be_between': 'TIMELINESS',
     'expect_description_values_to_be_in_set': 'CONSISTENCY',
     'expect_estate_ref_no_column_values_to_match_regex': 'VALIDITY',
     'expect_first_name_column_value_length': 'VALIDITY',
+    'expect_is_organisation_column_values_to_not_be_null': 'COMPLETENESS',
+    'expect_is_organisation_values_to_be_in_set': 'CONSISTENCY',
     'expect_llpg_and_prop_ref_column_values_to_be_unique_within_record': 'UNIQUENESS',
     'expect_llpg_column_value_lengths_between': 'VALIDITY',
     'expect_llpg_column_values_to_be_unique': 'UNIQUENESS',
@@ -70,6 +78,7 @@
     'expect_sub_type_column_values_to_not_be_null': 'COMPLETENESS',
     'expect_surname_column_value_length': 'VALIDITY',
     'expect_firstname_column_value_length': 'VALIDITY',
+    'expect_tag_ref_column_not_to_be_null': 'COMPLETENESS',
     'expect_target_id_and_value_column_values_to_be_unique_within_record': 'UNIQUENESS',
     'expect_target_id_column_values_to_not_be_null': 'COMPLETENESS',
     'expect_target_type_column_values_to_be_in_set': 'CONSISTENCY',
@@ -78,6 +87,7 @@
     'expect_tenancy_id_column_not_to_be_null': 'COMPLETENESS',
     'expect_tenure_code_column_not_to_be_null': 'COMPLETENESS',
     'expect_tenure_type_column_values_to_be_in_set': 'CONSISTENCY',
+    'expect_tenure_code_values_to_be_in_set': 'CONSISTENCY',
     'expect_uprn_column_value_lengths_between': 'VALIDITY',
     'expect_uprn_column_values_to_match_regex': 'VALIDITY',
     'expect_uprn_column_values_to_not_be_null': 'COMPLETENESS',
diff --git a/scripts/jobs/housing/housing_maproperty_gx_suite.py b/scripts/jobs/housing/housing_maproperty_gx_suite.py
new file mode 100644
index 000000000..2e16fbf3b
--- /dev/null
+++ b/scripts/jobs/housing/housing_maproperty_gx_suite.py
@@ -0,0 +1,37 @@
+# flake8: noqa: F821
+import sys
+
+from awsglue.utils import getResolvedOptions
+import great_expectations as gx
+import great_expectations.expectations as gxe
+
+arg_key = ['s3_target_location']
+args = getResolvedOptions(sys.argv, arg_key)
+locals().update(args)
+
+
+class ExpectPropRefColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
+    column: str = 'prop_ref'
+    description: str = "Expect Prop Ref field to be unique for a property type"
+
+
+class ExpectArrPatchNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
+    column: str = "arr_patch"
+    description: str = "Expect Arrears Patch column to be complete with no missing values"
+
+
+class ExpectPropRefNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
+    column: str = "prop_ref"
+    description: str = "Expect Prop Ref column to be complete with no missing values"
+
+
+# add to GX context
+context = gx.get_context(mode="file", project_root_dir=s3_target_location)
+
+suite = gx.ExpectationSuite(name='maproperty_suite')
+
+suite.add_expectation(ExpectPropRefColumnValuesToBeUnique())
+suite.add_expectation(ExpectArrPatchNotToBeNull())
+suite.add_expectation(ExpectPropRefNotToBeNull())
+
+suite = context.suites.add(suite)
diff --git a/scripts/jobs/housing/housing_matenancyagreement_gx_suite.py b/scripts/jobs/housing/housing_matenancyagreement_gx_suite.py
new file mode 100644
index 000000000..5aae95569
--- /dev/null
+++ b/scripts/jobs/housing/housing_matenancyagreement_gx_suite.py
@@ -0,0 +1,93 @@
+# flake8: noqa: F821
+from datetime import datetime
+import sys
+
+from awsglue.utils import getResolvedOptions
+import great_expectations as gx
+import great_expectations.expectations as gxe
+
+# Read the Glue job argument and expose it as the module-level name
+# s3_target_location (hence the F821 suppression above).
+arg_key = ['s3_target_location']
+args = getResolvedOptions(sys.argv, arg_key)
+locals().update(args)
+
+
+class ExpectTagRefColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
+    column: str = 'tag_ref'
+    description: str = "Expect Tag Ref field to be unique for a tenancy"
+
+
+class ExpectTagRefNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
+    column: str = "tag_ref"
+    description: str = "Expect Tag Ref column to be complete with no missing values"
+
+
+class ExpectPropRefNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
+    column: str = "prop_ref"
+    description: str = "Expect Prop Ref column to be complete with no missing values"
+
+
+class ExpectCoTNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
+    column: str = "cot"
+    description: str = "Expect Tenancy start date column (cot) to be complete with no missing values"
+
+
+class ExpectTenureNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
+    column: str = "tenure"
+    description: str = "Expect tenure to be complete with no missing values"
+
+
+class ExpectSaffRentAccNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
+    column: str = "u_saff_rentacc"
+    description: str = "Expect Saff rent account (payment ref) to be complete with no missing values"
+
+
+class ExpectRentGroupRefNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
+    column: str = "rentgrp_ref"
+    description: str = "Expect Rent Group ref column to be complete with no missing values"
+
+
+class ExpectEoTToBeBetween(gxe.ExpectColumnValuesToBeBetween):
+    # eot_parsed is the first 10 characters of eot cast to varchar (see the
+    # matenancyagreement query in scripts/helpers/housing_gx_dq_inputs.py), so
+    # the bounds are date-only ISO strings. Assuming the comparison is done on
+    # strings, a bound ending in 'T00:00:00' sorts after the bare date and
+    # would reject a value equal to the lower bound.
+    # NOTE(review): max_value is fixed when this script runs - confirm the
+    # suite is rebuilt often enough for "today" to stay current.
+    column: str = 'eot_parsed'
+    min_value: str = datetime(1920, 1, 1).date().isoformat()
+    max_value: str = datetime.today().date().isoformat()
+    description: str = "Expect eot_parsed be between 1920-01-01 and today's date"
+    condition_parser: str = 'great_expectations'
+    row_condition: str = 'col("eot_parsed").notNull()'
+
+
+class ExpectCoTToBeBetween(gxe.ExpectColumnValuesToBeBetween):
+    # Same date-only bounds as ExpectEoTToBeBetween. The row condition filters
+    # on cot_parsed, the column this expectation actually validates.
+    column: str = 'cot_parsed'
+    min_value: str = datetime(1920, 1, 1).date().isoformat()
+    max_value: str = datetime.today().date().isoformat()
+    description: str = "Expect cot_parsed be between 1920-01-01 and today's date"
+    condition_parser: str = 'great_expectations'
+    row_condition: str = 'col("cot_parsed").notNull()'
+
+
+# add to GX context
+context = gx.get_context(mode="file", project_root_dir=s3_target_location)
+
+suite = gx.ExpectationSuite(name='matenancyagreement_suite')
+
+suite.add_expectation(ExpectTagRefColumnValuesToBeUnique())
+suite.add_expectation(ExpectTagRefNotToBeNull())
+suite.add_expectation(ExpectPropRefNotToBeNull())
+suite.add_expectation(ExpectCoTNotToBeNull())
+suite.add_expectation(ExpectTenureNotToBeNull())
+suite.add_expectation(ExpectSaffRentAccNotToBeNull())
+suite.add_expectation(ExpectRentGroupRefNotToBeNull())
+suite.add_expectation(ExpectEoTToBeBetween())
+suite.add_expectation(ExpectCoTToBeBetween())
+
+suite = context.suites.add(suite)