From f01bb5ec3a249d5746f35f38a8be5ba421fbf1cf Mon Sep 17 00:00:00 2001
From: prashanthmanji99 <147180243+prashanthmanji99@users.noreply.github.com>
Date: Tue, 22 Jul 2025 17:02:38 -0400
Subject: [PATCH] Added Pipelines

---
 definitions.py                                |  16 +-
 orchestration_pipelines/medicaid/__init__.py  |   7 +
 .../medicaid/claims_pipeline.py               | 289 +++++++
 .../medicaid/provider_pipeline.py             | 330 ++++++++
 .../medicaid/recipient_pipeline.py            | 324 ++++++++
 .../medicaid/snowflaketables.py               |  92 +++
 reusable_components/dq/control_table.py       | 176 +++++
 reusable_components/dq/dq_schema_validator.py | 412 ++++++++++
 reusable_components/dq/dq_transactions.py     | 323 ++++++++
 reusable_components/dq/row_count_validator.py | 391 ++++++++++
 .../unzip.py => error_handling/__init__.py}   |   0
 reusable_components/error_handling/alert.py   | 164 ++++
 .../error_handling/standardized_alerts.py     | 426 +++++++++++
 .../etl/adls_csv_to_snowflake_iceberg.py      | 360 +++++++++
 .../etl/adls_parquet_to_iceberg.py            |  75 ++
 .../etl/adls_to_snowflake_csv.py              | 266 +++++++
 .../etl/adls_to_snowflake_iceberg.py          | 179 +++++
 .../etl/adls_to_snowflake_parquet.py          | 230 ++++++
 reusable_components/etl/copy_adls_to_adls.py  |  61 ++
 .../etl/copy_adls_to_adls_access_keys.py      |  72 ++
 reusable_components/etl/copy_sftp_to_adls.py  | 234 ++++++
 .../etl/copy_stage_parquet_to_iceberg.py      |  47 ++
 .../etl/csv_to_parquet_access_keys.py         |  77 ++
 .../etl/csv_to_parquet_adls.py                |  82 ++
 reusable_components/etl/dq_audit.py           | 125 ++++
 reusable_components/etl/validate_and_copy.py  | 511 +++++++++++++
 .../file_processing/archive_files.py          | 276 +++++++
 .../file_processing/monitor_files.py          | 708 ++++++++++++++++++
 .../file_processing/unzip_processor.py        | 493 ++++++++++++
 setup.py                                      |   2 +-
 utils/adls.py                                 |  18 +
 31 files changed, 6756 insertions(+), 10 deletions(-)
 create mode 100644 orchestration_pipelines/medicaid/__init__.py
 create mode 100644 orchestration_pipelines/medicaid/recipient_pipeline.py
 create mode 100644 orchestration_pipelines/medicaid/snowflaketables.py
 create mode 100644 reusable_components/dq/control_table.py
 create mode 100644 reusable_components/dq/dq_schema_validator.py
 create mode 100644 reusable_components/dq/dq_transactions.py
 create mode 100644 reusable_components/dq/row_count_validator.py
 rename reusable_components/{file_processing/unzip.py => error_handling/__init__.py} (100%)
 create mode 100644 reusable_components/error_handling/alert.py
 create mode 100644 reusable_components/error_handling/standardized_alerts.py
 create mode 100644 reusable_components/etl/adls_csv_to_snowflake_iceberg.py
 create mode 100644 reusable_components/etl/adls_parquet_to_iceberg.py
 create mode 100644 reusable_components/etl/adls_to_snowflake_csv.py
 create mode 100644 reusable_components/etl/adls_to_snowflake_iceberg.py
 create mode 100644 reusable_components/etl/adls_to_snowflake_parquet.py
 create mode 100644 reusable_components/etl/copy_adls_to_adls.py
 create mode 100644 reusable_components/etl/copy_adls_to_adls_access_keys.py
 create mode 100644 reusable_components/etl/copy_sftp_to_adls.py
 create mode 100644 reusable_components/etl/copy_stage_parquet_to_iceberg.py
 create mode 100644 reusable_components/etl/csv_to_parquet_access_keys.py
 create mode 100644 reusable_components/etl/csv_to_parquet_adls.py
 create mode 100644 reusable_components/etl/dq_audit.py
 create mode 100644 reusable_components/etl/validate_and_copy.py
 create mode 100644 reusable_components/file_processing/archive_files.py
 create mode 100644 reusable_components/file_processing/monitor_files.py
 create mode 100644 
reusable_components/file_processing/unzip_processor.py diff --git a/definitions.py b/definitions.py index 9ad0597..eb50887 100644 --- a/definitions.py +++ b/definitions.py @@ -1,10 +1,3 @@ -# from dagster import Definitions -# from amida_demo.assets import processed_data - -# defs = Definitions( -# assets=[processed_data], -# ) - from dagster import Definitions, load_assets_from_modules from utils.snowflake import snowpark_session from utils.adls_sftp import adls_sftp_resource @@ -13,16 +6,21 @@ from orchestration_pipelines.medicaid.recipient_pipeline import ( recipient_sensor ) +from orchestration_pipelines.medicaid import provider_pipeline +from orchestration_pipelines.medicaid.provider_pipeline import ( + provider_sensor +) PIPELINE_MODULES = [ - recipient_pipeline + recipient_pipeline, + provider_pipeline ] all_assets = load_assets_from_modules(PIPELINE_MODULES) defs = Definitions( assets=all_assets, - sensors=[recipient_sensor], + sensors=[recipient_sensor,provider_sensor], resources={ "snowflake_snowpark": snowpark_session, "adls_sftp": adls_sftp_resource, diff --git a/orchestration_pipelines/medicaid/__init__.py b/orchestration_pipelines/medicaid/__init__.py new file mode 100644 index 0000000..eb0fd95 --- /dev/null +++ b/orchestration_pipelines/medicaid/__init__.py @@ -0,0 +1,7 @@ + +# orchestration_pipelines/medicaid/__init__.py +from . import claims_pipeline +from . import snowflaketables +from . import recipient_pipeline + +__all__ = ["claims_pipeline","snowflaketables","recipient_pipeline"] \ No newline at end of file diff --git a/orchestration_pipelines/medicaid/claims_pipeline.py b/orchestration_pipelines/medicaid/claims_pipeline.py index e69de29..2166159 100644 --- a/orchestration_pipelines/medicaid/claims_pipeline.py +++ b/orchestration_pipelines/medicaid/claims_pipeline.py @@ -0,0 +1,289 @@ + +from typing import List, Dict +from dagster import asset, AssetExecutionContext + +# Below asset performs the copy from one ADLS container to another +from reusable_components.etl.copy_adls_to_adls import copy_adls_to_adls +from reusable_components.error_handling.alert import with_alerts + +@asset( + name="copy_claims_files_adls_to_adls", + description="Copy all files matching a given prefix from one ADLS folder to another.", + required_resource_keys={"adls2"}, + group_name="medicaid_claims_pipeline", +) +@with_alerts() +def copy_claims_files_adls_to_adls(context: AssetExecutionContext) -> List[str]: + + # Source, destination and Prefix values + SOURCE_CONTAINER = "dagstersourcedata" + SOURCE_FOLDER = "claims_files" + DEST_CONTAINER = "dagsterdestinationdata" + DEST_FOLDER = "csvfiles" + PREFIX_FILTER = "CLAIMS_" + + # Inputs to the reusable component + file_transfer = copy_adls_to_adls( + context=context, + adls2_resource=context.resources.adls2, + source_container=SOURCE_CONTAINER, + source_folder=SOURCE_FOLDER, + dest_container=DEST_CONTAINER, + dest_folder=DEST_FOLDER, + prefix_filter=PREFIX_FILTER, + ) + + context.log.info(f"✅ Copied {len(file_transfer)} files from {SOURCE_CONTAINER}/{SOURCE_FOLDER} to {DEST_CONTAINER}/{DEST_FOLDER}: {file_transfer}") + return file_transfer + + +# Below is the asset that converts CSV files to parquet format and loads it in the destination location +from reusable_components.etl.csv_to_parquet_adls import convert_csv_to_parquet_adls + +@asset( + name="adls_csv_to_parquet", + description="Convert CSVs under a folder in one ADLS container to Parquet in another folder", + required_resource_keys={"adls2"}, + deps=[copy_claims_files_adls_to_adls], + 
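The reusable helper invoked above is added by this patch in reusable_components/etl/copy_adls_to_adls.py, which is not reproduced in this excerpt. A minimal sketch of a prefix-filtered ADLS-to-ADLS copy with the same call shape, assuming the adls2 resource resolves to an azure.storage.filedatalake DataLakeServiceClient, could look like the following; it is illustrative only and the shipped component may differ.

import os
from typing import List

from azure.storage.filedatalake import DataLakeServiceClient


def copy_adls_to_adls_sketch(
    context,
    service_client: DataLakeServiceClient,  # assumption: the adls2 resource unwraps to this client
    source_container: str,
    source_folder: str,
    dest_container: str,
    dest_folder: str,
    prefix_filter: str = "",
) -> List[str]:
    """Copy every file under source_folder whose basename starts with prefix_filter."""
    src_fs = service_client.get_file_system_client(source_container)
    dest_fs = service_client.get_file_system_client(dest_container)
    copied: List[str] = []

    for path in src_fs.get_paths(path=source_folder):
        if path.is_directory:
            continue
        filename = os.path.basename(path.name)
        if prefix_filter and not filename.startswith(prefix_filter):
            continue
        # Stream the bytes across containers and record the copied file name.
        data = src_fs.get_file_client(path.name).download_file().readall()
        dest_fs.get_file_client(f"{dest_folder}/{filename}").upload_data(data, overwrite=True)
        copied.append(filename)
        context.log.info(f"Copied {filename} to {dest_container}/{dest_folder}")

    return copied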
group_name="medicaid_claims_pipeline", +) +@with_alerts() +def adls_csv_to_parquet(context: AssetExecutionContext) -> list[str]: + + context.log.info("▶️ Asset adls_csv_to_parquet starting") + + # Give the source, destination and file prefix format + SOURCE_CONTAINER = "dagsterdestinationdata" + SOURCE_FOLDER = "csvfiles" + DEST_CONTAINER = "dagsterparquetdata" + DEST_FOLDER = "parquet" + PREFIX_FILTER ="CLAIMS_" + + context.log.info( + f"📁 Converting files from {SOURCE_CONTAINER}/{SOURCE_FOLDER} to " + f"{DEST_CONTAINER}/{DEST_FOLDER} with filter '{PREFIX_FILTER}'" + ) + + # Call the reusable component by giving the values + file_conversion = convert_csv_to_parquet_adls( + context=context, + adls2=context.resources.adls2, + source_container=SOURCE_CONTAINER, + dest_container=DEST_CONTAINER, + source_folder=SOURCE_FOLDER, + dest_folder=DEST_FOLDER, + prefix_filter=PREFIX_FILTER, + ) + + context.log.info(f"✅ Asset complete: {len(file_conversion)} files written to '{DEST_CONTAINER}/{DEST_FOLDER}': {file_conversion}") + return file_conversion + +# ADLS to Snowflake CSV +from reusable_components.etl.adls_to_snowflake_csv import copy_adls_csv_to_snowflake + +@asset( + name="load_adls_csv_to_snowflake", + description="Copy all CSVs from dagsterdestinationdata/csvfiles into ADW_DEV.SANDBOX", + required_resource_keys={"adls2", "snowpark"}, + deps=[adls_csv_to_parquet], + group_name="medicaid_claims_pipeline", +) +@with_alerts() +def load_adls_csv_to_snowflake(context: AssetExecutionContext) -> List[str]: + """ + Load CSV files from ADLS to Snowflake tables. + + Returns: + List of fully qualified table names that were loaded + """ + # Input the Source and Destination + SOURCE_CONTAINER = "dagsterdestinationdata" + SOURCE_FOLDER = "csvfiles" + TARGET_DB = "ADW_DEV" + TARGET_SCHEMA = "SANDBOX" + FILE_FORMAT_NAME = "PM_CSV_FORMAT" + PREFIX_FILTER = "CLAIMS_" + STAGE_NAME="PM_SA_CSV_STAGE" + + + context.log.info( + f"📁 Loading CSV files from {SOURCE_CONTAINER}/{SOURCE_FOLDER} to " + f"{TARGET_DB}.{TARGET_SCHEMA} using format '{FILE_FORMAT_NAME}'" + ) + + try: + # Call the reusable component + loaded_tables = copy_adls_csv_to_snowflake( + context=context, # Pass context for better logging + session=context.resources.snowpark, + adls2=context.resources.adls2, + source_container=SOURCE_CONTAINER, + source_folder=SOURCE_FOLDER, + target_db=TARGET_DB, + target_schema=TARGET_SCHEMA, + file_format_name=FILE_FORMAT_NAME, + prefix_filter=PREFIX_FILTER, + stage_name=STAGE_NAME, + # Pradeep: I will change it according to our Usecase + truncate_before_load=True, + ) + + context.log.info( + f"✅ Asset complete: {len(loaded_tables)} tables loaded to " + f"{TARGET_DB}.{TARGET_SCHEMA}: {loaded_tables}" + ) + return loaded_tables + + except Exception as e: + context.log.error(f"❌ Failed to load CSV files to Snowflake: {str(e)}") + raise + +# ADLS to Snowflake COPY of PARQUET Files +from reusable_components.etl.adls_to_snowflake_parquet import copy_adls_parquet_to_snowflake + +@asset( + name="load_adls_parquet_to_snowflake", + description="Copy all Parquet files from dagsterparquetdata/parquet/data into ADW_DEV.SANDBOX", + required_resource_keys={"adls2", "snowpark"}, + deps=[load_adls_csv_to_snowflake], # Proper dependency chain + group_name="medicaid_claims_pipeline", +) +@with_alerts() +def load_adls_parquet_to_snowflake(context): + SOURCE_CONTAINER = "dagsterparquetdata" + SOURCE_FOLDER = "parquet/snowflake" + TARGET_DB = "ADW_DEV" + TARGET_SCHEMA = "SANDBOX" + FILE_FORMAT_NAME = "PM_PARQUET_FORMAT" + PREFIX_FILTER = 
"CLAIMS_" + STAGE_NAME = "PM_SA_PARQUET_STAGE" + + load_copy_info = copy_adls_parquet_to_snowflake( + context=context, + session=context.resources.snowpark, + adls2=context.resources.adls2, + source_container=SOURCE_CONTAINER, + source_folder=SOURCE_FOLDER, + target_db=TARGET_DB, + target_schema=TARGET_SCHEMA, + file_format_name=FILE_FORMAT_NAME, + prefix_filter=PREFIX_FILTER, + stage_name=STAGE_NAME, + truncate_before_load=True, # True = replace data, False = append data + ) + + context.log.info(f"✅ Loaded {len(load_copy_info)} Parquet tables: {load_copy_info}") + return load_copy_info + +# Loading ADLS Parquet file to Snowflake ICEBERG Table +from reusable_components.etl.copy_stage_parquet_to_iceberg import copy_stage_parquet_to_iceberg + +@asset( + name="load_all_claims_iceberg", + description="Copy every CLAIMS_*.parquet from ADLS to Snowflake Iceberg table.", + required_resource_keys={"snowpark"}, + deps=[load_adls_parquet_to_snowflake], + group_name="medicaid_claims_pipeline", +) +@with_alerts() +def load_all_claims_iceberg(context: AssetExecutionContext) -> List[str]: + + SOURCE_CONTAINER = "dagsterdestinationdata" + SOURCE_FOLDER = "parquet/iceberg" + PREFIX_FILTER = "CLAIMS_" + STAGE_NAME = "PM_SA_PARQUET_STAGE" + TARGET_DB = "ADW_DEV" + TARGET_SCHEMA = "SANDBOX" + + # full stage path is / + stage_path = f"{TARGET_DB}.{TARGET_SCHEMA}.{STAGE_NAME}/{SOURCE_FOLDER}" + + context.log.info( + f"▶️ Listing @{stage_path} for files starting with {PREFIX_FILTER}" + ) + + tables = copy_stage_parquet_to_iceberg( + context=context, + session=context.resources.snowpark, + stage_name=stage_path, + target_db=TARGET_DB, + target_schema=TARGET_SCHEMA, + prefix_filter=PREFIX_FILTER, + ) + + context.log.info(f"✅ Completed COPY INTO for: {tables}") + return tables + +# Checking the columns of a file from the metadata table which is TABLE_METADATA.csv +from reusable_components.etl.validate_and_copy import validate_csv_files_and_load_to_snowflake + +@asset( + name="validate_and_load_claims", + description="Validate CLAIMS_*.csv files against TABLE_METADATA.csv schema and load valid files into Snowflake", + required_resource_keys={"adls2", "snowpark"}, + deps=[load_all_claims_iceberg], + group_name="medicaid_claims_pipeline", +) +@with_alerts() +def validate_and_load_claims(context: AssetExecutionContext) -> Dict[str, List[str]]: + """ + Validate CSV files against metadata schema and load valid files to Snowflake. 
+ + Returns: + Dictionary containing lists of passed and failed files + """ + context.log.info("▶️ Asset validate_and_load_claims starting") + + # Metadata file configuration + METADATA_CONTAINER = "dagsterdestinationdata" + METADATA_PATH = "validate/TABLE_METADATA.csv" + + # Source files configuration + SOURCE_CONTAINER = "dagsterdestinationdata" + SOURCE_FOLDER = "validate" + PREFIX_FILTER = "CLAIMS_" + + # Snowflake destination configuration + TARGET_DB = "ADW_DEV" + TARGET_SCHEMA = "SANDBOX" + FILE_FORMAT_NAME = "PM_CSV_FORMAT" + STAGE_NAME = "PM_SA_CSV_STAGE" + + context.log.info( + f"📁 Validating files from {SOURCE_CONTAINER}/{SOURCE_FOLDER} " + f"against metadata in {METADATA_CONTAINER}/{METADATA_PATH} " + f"with filter '{PREFIX_FILTER}'" + ) + + try: + # Call the reusable component + result = validate_csv_files_and_load_to_snowflake( + context=context, + session=context.resources.snowpark, + adls2=context.resources.adls2, + metadata_container=METADATA_CONTAINER, + metadata_path=METADATA_PATH, + source_container=SOURCE_CONTAINER, + source_folder=SOURCE_FOLDER, + prefix_filter=PREFIX_FILTER, + target_db=TARGET_DB, + target_schema=TARGET_SCHEMA, + file_format_name=FILE_FORMAT_NAME, + stage_name=STAGE_NAME, + ) + + context.log.info( + f"✅ Asset complete: {len(result['passed_files'])} files passed validation and loaded, " + f"{len(result['failed_files'])} files failed validation: {result['failed_files']}" + ) + + if result['failed_files']: + context.log.warning( + f"⚠️ {len(result['failed_files'])} files failed validation: {result['failed_files']}" + ) + + return result + + except Exception as e: + context.log.error(f"❌ Failed to validate and load claims files: {str(e)}") + raise \ No newline at end of file diff --git a/orchestration_pipelines/medicaid/provider_pipeline.py b/orchestration_pipelines/medicaid/provider_pipeline.py index e69de29..2b45727 100644 --- a/orchestration_pipelines/medicaid/provider_pipeline.py +++ b/orchestration_pipelines/medicaid/provider_pipeline.py @@ -0,0 +1,330 @@ +from dagster import asset, AssetExecutionContext, AssetIn, MaterializeResult +from reusable_components.etl.dq_audit import create_dq_audit_entry +from reusable_components.file_processing.archive_files import archive_files_with_result +from reusable_components.file_processing.monitor_files import create_standardized_file_monitor +from reusable_components.error_handling.standardized_alerts import with_pipeline_alerts +from reusable_components.dq.dq_transactions import load_dq_transactions_with_result +from reusable_components.dq.row_count_validator import run_dq_row_count_validation_with_result +from reusable_components.file_processing.unzip_processor import unzip_files_with_result +from reusable_components.etl.copy_sftp_to_adls import copy_files_sftp_to_adls +from reusable_components.dq.control_table import load_dq_control_table_with_result +from reusable_components.etl.adls_csv_to_snowflake_iceberg import load_csv_to_iceberg_with_result +from reusable_components.dq.dq_schema_validator import validate_all_file_schemas_with_result + +PROVIDER_CONFIG = { + "pipeline_name": "MEDICAID_PROVIDER", + "subject_area": "PROVIDER", + "program_name": "MEDICAID", + "asset_name": "provider_files_monitor", + "sftp_source_path": "/prod/mmis/provider", + "file_criteria": { + "prefix": {"pattern": ["P_MMIS"], "count": 11}, + "suffix": {"pattern": None, "count": 0}, + "contains": {"pattern": None, "count": 0}, + "not_contains": {"pattern": None, "count": 0}, + "regex": {"pattern": None, "count": 0}, + 
"extension":{"pattern": None, "count": 0} + }, + "downstream_assets": [ + "start_dq_audit_run_provider", + "copy_mftserver_provider_files_to_srcfiles_stage", + "load_dq_transactions_provider", + "unzip_provider_files_to_load_provider", + "archive_provider_files_provider", + "load_dq_control_table_provider", + "dq_provider_row_count_validation", + "dq_schema_check_provider", + "load_csv_to_iceberg_provider" + ], + "stage_container": "srcfiles", + "stage_directory": "medicaid/provider/stage", + "load_directory": "medicaid/provider/load", + "archive_directory": "medicaid/provider/archive", + "control_file": "P_MMIS_PROVIDER_CONTROL_FILE.csv", + "snowflake_db": "ANALYTYXONE_DEV", + "snowflake_schema": "BRONZE", + "snowflake_stage": "PARQUET_STAGE", + "group_name": "provider_file_processing", + "alert_config": { + "program_name": "Medicaid Provider Data Processing", + "send_success_alerts": True + } +} + +# Sensor and file monitor asset +provider_asset, provider_sensor = create_standardized_file_monitor(PROVIDER_CONFIG) + +# DQ Audit Asset +@asset( + name="start_dq_audit_run_provider", + description="Create DQ_Audit entry for provider pipeline", + required_resource_keys={"snowflake_snowpark"}, + ins={"monitor_result": AssetIn("provider_files_monitor")}, + group_name=PROVIDER_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + alert_config=PROVIDER_CONFIG["alert_config"] +) +def start_dq_audit_run_provider(context: AssetExecutionContext, monitor_result) -> MaterializeResult: + return create_dq_audit_entry( + context=context, + session=context.resources.snowflake_snowpark, + monitor_result=monitor_result, + program_name=PROVIDER_CONFIG["program_name"], + subject_area=PROVIDER_CONFIG["subject_area"], + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + prefix=PROVIDER_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=PROVIDER_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=PROVIDER_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=PROVIDER_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=PROVIDER_CONFIG["file_criteria"]["regex"]["pattern"], + extension=PROVIDER_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# Copy SFTP to ADLS Asset +@asset( + name="copy_mftserver_provider_files_to_srcfiles_stage", + description="Copy provider files from SFTP to ADLS staging", + required_resource_keys={"adls_sftp", "adls_access_keys"}, + ins={"file_monitor_result": AssetIn("provider_files_monitor")}, + group_name=PROVIDER_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + alert_config=PROVIDER_CONFIG["alert_config"] +) +def copy_provider_files_to_stage_provider(context: AssetExecutionContext, file_monitor_result) -> MaterializeResult: + return copy_files_sftp_to_adls( + context=context, + sftp_client=context.resources.adls_sftp, + adls_client=context.resources.adls_access_keys, + file_monitor_result=file_monitor_result, + source_path=PROVIDER_CONFIG["sftp_source_path"], + destination_container=PROVIDER_CONFIG["stage_container"], + destination_path=PROVIDER_CONFIG["stage_directory"], + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + prefix=PROVIDER_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=PROVIDER_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=PROVIDER_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=PROVIDER_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=PROVIDER_CONFIG["file_criteria"]["regex"]["pattern"], + 
extension=PROVIDER_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# DQ Transactions Asset +@asset( + name="load_dq_transactions_provider", + description="Load DQ transactions for provider files", + required_resource_keys={"snowflake_snowpark", "adls_access_keys"}, + ins={ + "copy_result": AssetIn("copy_mftserver_provider_files_to_srcfiles_stage"), + "audit_batch_id": AssetIn("start_dq_audit_run_provider") + }, + group_name=PROVIDER_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + alert_config=PROVIDER_CONFIG["alert_config"] +) +def load_file_transaction_metadata(context: AssetExecutionContext, copy_result, audit_batch_id: int) -> MaterializeResult: + return load_dq_transactions_with_result( + context=context, + snowpark_session=context.resources.snowflake_snowpark, + adls_client=context.resources.adls_access_keys, + copy_result=copy_result, + audit_batch_id=audit_batch_id, + container_name=PROVIDER_CONFIG["stage_container"], + directory_path=PROVIDER_CONFIG["stage_directory"], + program_name=PROVIDER_CONFIG["program_name"], + subject_area=PROVIDER_CONFIG["subject_area"], + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + prefix=PROVIDER_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=PROVIDER_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=PROVIDER_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=PROVIDER_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=PROVIDER_CONFIG["file_criteria"]["regex"]["pattern"], + extension=PROVIDER_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# Copy files to Load directory Asset +@asset( + name="unzip_provider_files_to_load_provider", + description="Unzip provider files to load directory", + required_resource_keys={"adls_access_keys"}, + ins={"copy_result": AssetIn("copy_mftserver_provider_files_to_srcfiles_stage")}, + group_name=PROVIDER_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + alert_config=PROVIDER_CONFIG["alert_config"] +) +def unzip_provider_files_to_load(context: AssetExecutionContext, copy_result) -> MaterializeResult: + return unzip_files_with_result( + context=context, + adls_client=context.resources.adls_access_keys, + copy_result=copy_result, + container_name=PROVIDER_CONFIG["stage_container"], + stage_directory=PROVIDER_CONFIG["stage_directory"], + load_directory=PROVIDER_CONFIG["load_directory"], + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + prefix=PROVIDER_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=PROVIDER_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=PROVIDER_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=PROVIDER_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=PROVIDER_CONFIG["file_criteria"]["regex"]["pattern"], + extension=PROVIDER_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# Copy files to Archive Directory Asset +@asset( + name="archive_provider_files_provider", + description="Archive ZIP files from stage to archive directory", + required_resource_keys={"adls_access_keys"}, + ins={"copy_result": AssetIn("copy_mftserver_provider_files_to_srcfiles_stage")}, + group_name=PROVIDER_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + alert_config=PROVIDER_CONFIG["alert_config"] +) +def archive_provider_files(context: AssetExecutionContext, copy_result) -> MaterializeResult: + return archive_files_with_result( + context=context, + 
adls_client=context.resources.adls_access_keys, + copy_result=copy_result, + stage_container=PROVIDER_CONFIG["stage_container"], + stage_directory=PROVIDER_CONFIG["stage_directory"], + archive_directory=PROVIDER_CONFIG["archive_directory"], + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + prefix=PROVIDER_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=PROVIDER_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=PROVIDER_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=PROVIDER_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=PROVIDER_CONFIG["file_criteria"]["regex"]["pattern"], + extension=PROVIDER_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# DQ Control Asset +@asset( + name="load_dq_control_table_provider", + description="Load control table data for provider pipeline", + required_resource_keys={"adls_access_keys", "snowflake_snowpark"}, + ins={ + "audit_batch_id": AssetIn("start_dq_audit_run_provider"), + "unzip_result": AssetIn("unzip_provider_files_to_load_provider") + }, + group_name=PROVIDER_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + alert_config=PROVIDER_CONFIG["alert_config"] +) +def load_dq_control_table(context: AssetExecutionContext, audit_batch_id: int, unzip_result) -> MaterializeResult: + return load_dq_control_table_with_result( + context=context, + snowpark_session=context.resources.snowflake_snowpark, + unzip_result=unzip_result, + audit_batch_id=audit_batch_id, + adls_container=PROVIDER_CONFIG["stage_container"], + folder_path=PROVIDER_CONFIG["load_directory"], + control_file=PROVIDER_CONFIG["control_file"], + program_name=PROVIDER_CONFIG["program_name"], + subject_area=PROVIDER_CONFIG["subject_area"], + pipeline_name=PROVIDER_CONFIG["pipeline_name"] + ) + +# Row Count Asset +@asset( + name="dq_provider_row_count_validation", + description="Validate row counts for provider files", + required_resource_keys={"adls_access_keys", "snowflake_snowpark"}, + ins={ + "audit_batch_id": AssetIn("start_dq_audit_run_provider"), + "control_table_result": AssetIn("load_dq_control_table_provider") + }, + group_name=PROVIDER_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + alert_config=PROVIDER_CONFIG["alert_config"] +) +def dq_provider_row_count_validation(context: AssetExecutionContext, audit_batch_id: int, control_table_result) -> MaterializeResult: + return run_dq_row_count_validation_with_result( + context=context, + session=context.resources.snowflake_snowpark, + adls2=context.resources.adls_access_keys, + control_table_result=control_table_result, + audit_batch_id=audit_batch_id, + container=PROVIDER_CONFIG["stage_container"], + folder_path=PROVIDER_CONFIG["load_directory"], + program_name=PROVIDER_CONFIG["program_name"], + subject_area=PROVIDER_CONFIG["subject_area"], + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + prefix=PROVIDER_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=PROVIDER_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=PROVIDER_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=["CONTROL"], + regex=PROVIDER_CONFIG["file_criteria"]["regex"]["pattern"], + extension=[".csv"] + ) + +# Schema check Asset +@asset( + name="dq_schema_check_provider", + description="Validate file schemas for provider pipeline", + required_resource_keys={"adls_access_keys", "snowflake_snowpark"}, + ins={"dq_result": AssetIn("dq_provider_row_count_validation")}, + group_name=PROVIDER_CONFIG["group_name"] 
+) +@with_pipeline_alerts( + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + alert_config=PROVIDER_CONFIG["alert_config"] +) +def dq_schema_check(context: AssetExecutionContext, dq_result) -> MaterializeResult: + return validate_all_file_schemas_with_result( + adls_client=context.resources.adls_access_keys, + container=PROVIDER_CONFIG["stage_container"], + folder_path=PROVIDER_CONFIG["load_directory"], + session=context.resources.snowflake_snowpark, + context=context, + dq_result=dq_result, + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + prefix=PROVIDER_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=PROVIDER_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=PROVIDER_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=["CONTROL"], + regex=PROVIDER_CONFIG["file_criteria"]["regex"]["pattern"], + extension=[".csv"] + ) + +# CSV to Iceberg Asset +@asset( + name="load_csv_to_iceberg_provider", + description="Load CSV files to Iceberg tables for provider pipeline", + required_resource_keys={"adls_access_keys", "snowflake_snowpark"}, + ins={ + "audit_batch_id": AssetIn("start_dq_audit_run_provider"), + "schema_check_result": AssetIn("dq_schema_check_provider") + }, + group_name=PROVIDER_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=PROVIDER_CONFIG["pipeline_name"], + alert_config=PROVIDER_CONFIG["alert_config"] +) +def load_csv_to_iceberg(context: AssetExecutionContext, audit_batch_id: int, schema_check_result) -> MaterializeResult: + return load_csv_to_iceberg_with_result( + context=context, + adls_client=context.resources.adls_access_keys, + snowpark_session=context.resources.snowflake_snowpark, + audit_batch_id=audit_batch_id, + schema_check_result=schema_check_result, + config=PROVIDER_CONFIG, + prefix=PROVIDER_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=PROVIDER_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=PROVIDER_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=["CONTROL"], + regex=PROVIDER_CONFIG["file_criteria"]["regex"]["pattern"], + extension=[".csv"] + ) \ No newline at end of file diff --git a/orchestration_pipelines/medicaid/recipient_pipeline.py b/orchestration_pipelines/medicaid/recipient_pipeline.py new file mode 100644 index 0000000..ee3ccfb --- /dev/null +++ b/orchestration_pipelines/medicaid/recipient_pipeline.py @@ -0,0 +1,324 @@ +from dagster import asset, AssetExecutionContext, AssetIn, MaterializeResult +from reusable_components.etl.dq_audit import create_dq_audit_entry +from reusable_components.file_processing.archive_files import archive_files_with_result +from reusable_components.file_processing.monitor_files import create_standardized_file_monitor +from reusable_components.error_handling.standardized_alerts import with_pipeline_alerts +from reusable_components.dq.dq_transactions import load_dq_transactions_with_result +from reusable_components.dq.row_count_validator import run_dq_row_count_validation_with_result +from reusable_components.file_processing.unzip_processor import unzip_files_with_result +from reusable_components.etl.copy_sftp_to_adls import copy_files_sftp_to_adls +from reusable_components.dq.control_table import load_dq_control_table_with_result +from reusable_components.etl.adls_csv_to_snowflake_iceberg import load_csv_to_iceberg_with_result +from reusable_components.dq.dq_schema_validator import validate_all_file_schemas_with_result + +RECIPIENT_CONFIG = { + "pipeline_name": "MEDICAID_RECIPIENT", + "subject_area": "RECIPIENT", + "program_name": "MEDICAID", + 
"asset_name": "recipient_files_monitor", + "sftp_source_path": "/prod/mmis/recipient", + "file_criteria": { + "prefix": {"pattern": ["R_MMIS"], "count": 9}, + "suffix": {"pattern": None, "count": 0}, + "contains": {"pattern": None, "count": 0}, + "not_contains": {"pattern": None, "count": 0}, + "regex": {"pattern": None, "count": 0}, + "extension":{"pattern": None, "count": 0} + }, + "downstream_assets": [ + "start_dq_audit_run", + "copy_mftserver_recipient_files_to_srcfiles_stage", + "load_dq_transactions", + "unzip_recipient_files_to_load", + "archive_recipient_files", + "load_dq_control_table", + "dq_recipient_row_count_validation", + "dq_schema_check", + "load_csv_to_iceberg" + ], + "stage_container": "srcfiles", + "stage_directory": "medicaid/recipient/stage", + "load_directory": "medicaid/recipient/load", + "archive_directory": "medicaid/recipient/archive", + "control_file": "R_MMIS_RECIPIENT_CONTROL_FILE.csv", + "snowflake_db": "ANALYTYXONE_DEV", + "snowflake_schema": "BRONZE", + "snowflake_stage": "PARQUET_STAGE", + "group_name": "recipient_file_processing", + "alert_config": { + "program_name": "Medicaid Recipient Data Processing", + "send_success_alerts": True + } +} + +# Sensor and file monitor asset +recipient_asset, recipient_sensor = create_standardized_file_monitor(RECIPIENT_CONFIG) + +# DQ Audit Asset +@asset( + name="start_dq_audit_run", + description="Create DQ_Audit entry for recipient pipeline", + required_resource_keys={"snowflake_snowpark"}, + ins={"monitor_result": AssetIn("recipient_files_monitor")}, + group_name=RECIPIENT_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + alert_config=RECIPIENT_CONFIG["alert_config"] +) +def start_dq_audit_run(context: AssetExecutionContext, monitor_result) -> MaterializeResult: + return create_dq_audit_entry( + context=context, + session=context.resources.snowflake_snowpark, + monitor_result=monitor_result, + program_name=RECIPIENT_CONFIG["program_name"], + subject_area=RECIPIENT_CONFIG["subject_area"], + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + prefix=RECIPIENT_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=RECIPIENT_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=RECIPIENT_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=RECIPIENT_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=RECIPIENT_CONFIG["file_criteria"]["regex"]["pattern"], + extension=RECIPIENT_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# Copy SFTP to ADLS Asset +@asset( + name="copy_mftserver_recipient_files_to_srcfiles_stage", + description="Copy recipient files from SFTP to ADLS staging", + required_resource_keys={"adls_sftp", "adls_access_keys"}, + ins={"file_monitor_result": AssetIn("recipient_files_monitor")}, + group_name=RECIPIENT_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + alert_config=RECIPIENT_CONFIG["alert_config"] +) +def copy_recipient_files_to_stage(context: AssetExecutionContext, file_monitor_result) -> MaterializeResult: + return copy_files_sftp_to_adls( + context=context, + sftp_client=context.resources.adls_sftp, + adls_client=context.resources.adls_access_keys, + file_monitor_result=file_monitor_result, + source_path=RECIPIENT_CONFIG["sftp_source_path"], + destination_container=RECIPIENT_CONFIG["stage_container"], + destination_path=RECIPIENT_CONFIG["stage_directory"], + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + 
prefix=RECIPIENT_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=RECIPIENT_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=RECIPIENT_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=RECIPIENT_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=RECIPIENT_CONFIG["file_criteria"]["regex"]["pattern"], + extension=RECIPIENT_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# DQ Transactions Asset +@asset( + name="load_dq_transactions", + description="Load DQ transactions for recipient files", + required_resource_keys={"snowflake_snowpark", "adls_access_keys"}, + ins={ + "copy_result": AssetIn("copy_mftserver_recipient_files_to_srcfiles_stage"), + "audit_batch_id": AssetIn("start_dq_audit_run") + }, + group_name=RECIPIENT_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + alert_config=RECIPIENT_CONFIG["alert_config"] +) +def load_file_transaction_metadata(context: AssetExecutionContext, copy_result, audit_batch_id: int) -> MaterializeResult: + return load_dq_transactions_with_result( + context=context, + snowpark_session=context.resources.snowflake_snowpark, + adls_client=context.resources.adls_access_keys, + copy_result=copy_result, + audit_batch_id=audit_batch_id, + container_name=RECIPIENT_CONFIG["stage_container"], + directory_path=RECIPIENT_CONFIG["stage_directory"], + program_name=RECIPIENT_CONFIG["program_name"], + subject_area=RECIPIENT_CONFIG["subject_area"], + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + prefix=RECIPIENT_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=RECIPIENT_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=RECIPIENT_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=RECIPIENT_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=RECIPIENT_CONFIG["file_criteria"]["regex"]["pattern"], + extension=RECIPIENT_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# Copy files to Load directory Asset +@asset( + name="unzip_recipient_files_to_load", + description="Unzip recipient files to load directory", + required_resource_keys={"adls_access_keys"}, + ins={"copy_result": AssetIn("copy_mftserver_recipient_files_to_srcfiles_stage")}, + group_name=RECIPIENT_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + alert_config=RECIPIENT_CONFIG["alert_config"] +) +def unzip_recipient_files_to_load(context: AssetExecutionContext, copy_result) -> MaterializeResult: + return unzip_files_with_result( + context=context, + adls_client=context.resources.adls_access_keys, + copy_result=copy_result, + container_name=RECIPIENT_CONFIG["stage_container"], + stage_directory=RECIPIENT_CONFIG["stage_directory"], + load_directory=RECIPIENT_CONFIG["load_directory"], + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + prefix=RECIPIENT_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=RECIPIENT_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=RECIPIENT_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=RECIPIENT_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=RECIPIENT_CONFIG["file_criteria"]["regex"]["pattern"], + extension=RECIPIENT_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# Copy files to Archive Directory Asset +@asset( + name="archive_recipient_files", + description="Archive ZIP files from stage to archive directory", + required_resource_keys={"adls_access_keys"}, + ins={"copy_result": AssetIn("copy_mftserver_recipient_files_to_srcfiles_stage")}, + 
group_name=RECIPIENT_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + alert_config=RECIPIENT_CONFIG["alert_config"] +) +def archive_recipient_files(context: AssetExecutionContext, copy_result) -> MaterializeResult: + return archive_files_with_result( + context=context, + adls_client=context.resources.adls_access_keys, + copy_result=copy_result, + stage_container=RECIPIENT_CONFIG["stage_container"], + stage_directory=RECIPIENT_CONFIG["stage_directory"], + archive_directory=RECIPIENT_CONFIG["archive_directory"], + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + prefix=RECIPIENT_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=RECIPIENT_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=RECIPIENT_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=RECIPIENT_CONFIG["file_criteria"]["not_contains"]["pattern"], + regex=RECIPIENT_CONFIG["file_criteria"]["regex"]["pattern"], + extension=RECIPIENT_CONFIG["file_criteria"]["extension"]["pattern"] + ) + +# DQ Control Asset +@asset( + name="load_dq_control_table", + description="Load control table data for recipient pipeline", + required_resource_keys={"adls_access_keys", "snowflake_snowpark"}, + ins={ + "audit_batch_id": AssetIn("start_dq_audit_run"), + "unzip_result": AssetIn("unzip_recipient_files_to_load") + }, + group_name=RECIPIENT_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + alert_config=RECIPIENT_CONFIG["alert_config"] +) +def load_dq_control_table(context: AssetExecutionContext, audit_batch_id: int, unzip_result) -> MaterializeResult: + return load_dq_control_table_with_result( + context=context, + snowpark_session=context.resources.snowflake_snowpark, + unzip_result=unzip_result, + audit_batch_id=audit_batch_id, + adls_container=RECIPIENT_CONFIG["stage_container"], + folder_path=RECIPIENT_CONFIG["load_directory"], + control_file=RECIPIENT_CONFIG["control_file"], + program_name=RECIPIENT_CONFIG["program_name"], + subject_area=RECIPIENT_CONFIG["subject_area"], + pipeline_name=RECIPIENT_CONFIG["pipeline_name"] + ) + +# Row Count Asset +@asset( + name="dq_recipient_row_count_validation", + description="Validate row counts for recipient files", + required_resource_keys={"adls_access_keys", "snowflake_snowpark"}, + ins={ + "audit_batch_id": AssetIn("start_dq_audit_run"), + "control_table_result": AssetIn("load_dq_control_table") + }, + group_name=RECIPIENT_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + alert_config=RECIPIENT_CONFIG["alert_config"] +) +def dq_recipient_row_count_validation(context: AssetExecutionContext, audit_batch_id: int, control_table_result) -> MaterializeResult: + return run_dq_row_count_validation_with_result( + context=context, + session=context.resources.snowflake_snowpark, + adls2=context.resources.adls_access_keys, + control_table_result=control_table_result, + audit_batch_id=audit_batch_id, + container=RECIPIENT_CONFIG["stage_container"], + folder_path=RECIPIENT_CONFIG["load_directory"], + program_name=RECIPIENT_CONFIG["program_name"], + subject_area=RECIPIENT_CONFIG["subject_area"], + pipeline_name=RECIPIENT_CONFIG["pipeline_name"] + ) + +# Schema check Asset +@asset( + name="dq_schema_check", + description="Validate file schemas for recipient pipeline", + required_resource_keys={"adls_access_keys", "snowflake_snowpark"}, + ins={"dq_result": AssetIn("dq_recipient_row_count_validation")}, + group_name=RECIPIENT_CONFIG["group_name"] +) 
+@with_pipeline_alerts( + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + alert_config=RECIPIENT_CONFIG["alert_config"] +) +def dq_schema_check(context: AssetExecutionContext, dq_result) -> MaterializeResult: + return validate_all_file_schemas_with_result( + adls_client=context.resources.adls_access_keys, + container=RECIPIENT_CONFIG["stage_container"], + folder_path=RECIPIENT_CONFIG["load_directory"], + session=context.resources.snowflake_snowpark, + context=context, + dq_result=dq_result, + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + prefix=RECIPIENT_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=RECIPIENT_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=RECIPIENT_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=["CONTROL"], + regex=RECIPIENT_CONFIG["file_criteria"]["regex"]["pattern"], + extension=[".csv"] + ) + +# CSV to Iceberg Asset +@asset( + name="load_csv_to_iceberg", + description="Load CSV files to Iceberg tables for recipient pipeline", + required_resource_keys={"adls_access_keys", "snowflake_snowpark"}, + ins={ + "audit_batch_id": AssetIn("start_dq_audit_run"), + "schema_check_result": AssetIn("dq_schema_check") + }, + group_name=RECIPIENT_CONFIG["group_name"] +) +@with_pipeline_alerts( + pipeline_name=RECIPIENT_CONFIG["pipeline_name"], + alert_config=RECIPIENT_CONFIG["alert_config"] +) +def load_csv_to_iceberg(context: AssetExecutionContext, audit_batch_id: int, schema_check_result) -> MaterializeResult: + return load_csv_to_iceberg_with_result( + context=context, + adls_client=context.resources.adls_access_keys, + snowpark_session=context.resources.snowflake_snowpark, + audit_batch_id=audit_batch_id, + schema_check_result=schema_check_result, + config=RECIPIENT_CONFIG, + prefix=RECIPIENT_CONFIG["file_criteria"]["prefix"]["pattern"], + suffix=RECIPIENT_CONFIG["file_criteria"]["suffix"]["pattern"], + contains=RECIPIENT_CONFIG["file_criteria"]["contains"]["pattern"], + not_contains=["CONTROL"], + regex=RECIPIENT_CONFIG["file_criteria"]["regex"]["pattern"], + extension=[".csv"] + ) \ No newline at end of file diff --git a/orchestration_pipelines/medicaid/snowflaketables.py b/orchestration_pipelines/medicaid/snowflaketables.py new file mode 100644 index 0000000..b185d59 --- /dev/null +++ b/orchestration_pipelines/medicaid/snowflaketables.py @@ -0,0 +1,92 @@ +from dagster import asset, Config, get_dagster_logger +import os +from pathlib import Path +from reusable_components.error_handling.alert import with_alerts + +class SQLTransferConfig(Config): + source_table: str + target_table: str + sql_file_path: str + + +@asset( + name="github_sql_transfer", + required_resource_keys={"snowpark"}, + group_name="medicaid_snowflake" +) +@with_alerts() +def github_sql_transfer(context, config: SQLTransferConfig): + """ + Simple asset that reads SQL from local file and executes in Snowflake using Snowpark + Uses repository-relative paths that work in both local and cloud + """ + logger = get_dagster_logger() + session = context.resources.snowpark + + # STEP 1: READ THE SQL FILE + # Get the directory where Python file is located + current_file_dir = Path(__file__).parent + + # Navigate to repository root + repo_root = current_file_dir.parent.parent + + sql_file_path = repo_root / config.sql_file_path + + logger.info(f"Repository root: {repo_root}") + logger.info(f"Reading SQL from: {sql_file_path}") + logger.info(f"File exists: {sql_file_path.exists()}") + + # Check if file exists + if not sql_file_path.exists(): + # Try alternative path structure in case 
the repo structure is different + alternative_path = repo_root / "medicare" / "claims" / "sql" / "bronze" / "transform_table.sql" + logger.info(f"Trying alternative path: {alternative_path}") + + if alternative_path.exists(): + sql_file_path = alternative_path + else: + raise FileNotFoundError(f"SQL file not found at: {sql_file_path} or {alternative_path}") + + # Read the entire SQL file + with open(sql_file_path, 'r') as file: + sql_code = file.read() + + logger.info("Successfully read SQL from repository") + logger.info(f"SQL content (first 200 chars): {sql_code[:200]}...") + + + # STEP 2: REPLACE TABLE NAMES IN THE SQL + logger.info(f"Replacing placeholders - Source: {config.source_table}, Target: {config.target_table}") + + sql_code = sql_code.replace("{{source_table}}", config.source_table) + sql_code = sql_code.replace("{{target_table}}", config.target_table) + + logger.info(f"Processed SQL (first 200 chars): {sql_code[:200]}...") + + # STEP 3: RUN THE SQL IN SNOWFLAKE + logger.info("Executing SQL in Snowflake using Snowpark Session") + try: + logger.info("Executing SQL using session.sql()...") + result = session.sql(sql_code).collect() + logger.info(f"SQL execution completed successfully") + logger.info(f"Query executed, result count: {len(result) if result else 0}") + context.log.info(f"Successfully executed SQL. Processed {len(result) if result else 0} rows.") + return f"Successfully executed SQL. Processed {len(result) if result else 0} rows." + except Exception as e: + logger.error(f"Error with session.sql(): {str(e)}") + logger.info("Trying to execute SQL statements individually...") + sql_statements = [stmt.strip() for stmt in sql_code.split(';') if stmt.strip()] + results = [] + for i, statement in enumerate(sql_statements): + logger.info(f"Executing statement {i+1}/{len(sql_statements)}: {statement[:100]}...") + try: + result = session.sql(statement).collect() + results.append(result) + logger.info(f"Statement {i+1} completed successfully") + except Exception as stmt_error: + logger.error(f"Error in statement {i+1}: {str(stmt_error)}") + raise stmt_error + total_rows = sum(len(result) if result else 0 for result in results) + logger.info(f"All SQL statements completed. Total rows processed: {total_rows}") + context.log.info(f"Successfully executed {len(sql_statements)} SQL statements. Total rows: {total_rows}.") + return f"Successfully executed {len(sql_statements)} SQL statements. Total rows: {total_rows}." diff --git a/reusable_components/dq/control_table.py b/reusable_components/dq/control_table.py new file mode 100644 index 0000000..47428cb --- /dev/null +++ b/reusable_components/dq/control_table.py @@ -0,0 +1,176 @@ +from dagster import AssetExecutionContext, MaterializeResult, MetadataValue +from snowflake.snowpark import Session +from typing import Union + + +def load_dq_control_table_with_result( + context: AssetExecutionContext, + snowpark_session: Session, + unzip_result: Union[dict, str], + audit_batch_id: int, + adls_container: str, + folder_path: str, + control_file: str, + program_name: str, + subject_area: str, + pipeline_name: str +) -> MaterializeResult: + """ + Complete control table logic: validation + control data loading + MaterializeResult. + Returns MaterializeResult ready for the asset to return. 
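The github_sql_transfer asset above substitutes {{source_table}} and {{target_table}} placeholders before executing the SQL. A small, self-contained example of that substitution, using an illustrative template and table names rather than the repository's actual transform_table.sql:

# Hypothetical template and names; real values come from SQLTransferConfig and the repo's SQL file.
sql_template = """
CREATE OR REPLACE TABLE {{target_table}} AS
SELECT *
FROM {{source_table}};
"""

config = {
    "source_table": "ADW_DEV.SANDBOX.CLAIMS_RAW",
    "target_table": "ADW_DEV.SANDBOX.CLAIMS_BRONZE",
}

sql_code = (
    sql_template
    .replace("{{source_table}}", config["source_table"])
    .replace("{{target_table}}", config["target_table"])
)
print(sql_code)  # the asset would instead run session.sql(sql_code).collect()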
+ """ + + # Extract unzip status + unzip_status = unzip_result.get("status", "unknown") if isinstance(unzip_result, dict) else "completed" + + if unzip_status in ["skipped", "failed"]: + context.log.info(f"❌ Skipping {pipeline_name} control table load - unzip failed") + + return MaterializeResult( + value={"status": "skipped", "reason": f"Unzip operation failed: {unzip_status}", "pipeline_name": pipeline_name}, + metadata={ + "status": MetadataValue.text("⏭️ SKIPPED"), + "reason": MetadataValue.text("Previous step failed"), + "pipeline_name": MetadataValue.text(pipeline_name), + "batch_id": MetadataValue.int(audit_batch_id) + } + ) + + try: + context.log.info(f"📋 {pipeline_name} loading control table data for batch {audit_batch_id}") + context.log.info(f" Control file: {control_file}") + context.log.info(f" Source path: {folder_path}") + + # Use internal component to do the actual work + _load_dq_control_data_internal( + context=context, + snowpark_session=snowpark_session, + adls_container=adls_container, + folder_path=folder_path, + control_file=control_file, + program_name=program_name, + subject_area=subject_area, + batch_id=audit_batch_id, + ) + + context.log.info(f"✅ {pipeline_name} control table data loaded successfully") + + # Return successful MaterializeResult + return MaterializeResult( + value={ + "status": "completed", + "batch_id": audit_batch_id, + "control_file_loaded": control_file, + "pipeline_name": pipeline_name, + "folder_path": folder_path + }, + metadata={ + "status": MetadataValue.text("✅ SUCCESS"), + "batch_id": MetadataValue.int(audit_batch_id), + "pipeline_name": MetadataValue.text(pipeline_name), + "control_file": MetadataValue.text(control_file), + "program_name": MetadataValue.text(program_name), + "subject_area": MetadataValue.text(subject_area) + } + ) + + except Exception as e: + context.log.error(f"❌ {pipeline_name} control table load failed: {str(e)}") + + # Return failed MaterializeResult + return MaterializeResult( + value={"status": "failed", "error": str(e), "pipeline_name": pipeline_name}, + metadata={ + "status": MetadataValue.text("❌ FAILED"), + "error": MetadataValue.text(str(e)[:200] + "..." if len(str(e)) > 200 else str(e)), + "pipeline_name": MetadataValue.text(pipeline_name), + "batch_id": MetadataValue.int(audit_batch_id), + "control_file": MetadataValue.text(control_file) + } + ) + raise + + +def _load_dq_control_data_internal( + context: AssetExecutionContext, + snowpark_session: Session, + adls_container: str, + folder_path: str, + control_file: str, + program_name: str, + subject_area: str, + batch_id: int, +) -> None: + """ + Internal function that does the actual control table loading work. + 1. COPY INTO the two data columns (TABLE_NAME, ROW_COUNT) from the ADLS CSV. + 2. UPDATE the remaining metadata columns for any rows where META_BATCH_ID is NULL. 
+ """ + # Fully qualified target table + full_table = "ANALYTYXONE_DEV.DATALOOM.DQ_CONTROL_TABLE" + + # Build the ADLS path string that your external stage is configured to point at + adls_path = f"{folder_path}/{control_file}" + stage_path = f"@ANALYTYXONE_DEV.DATALOOM.CSV_STAGE/{adls_path}" + + context.log.info(f"📋 Control Table Load Details:") + context.log.info(f" Target table: {full_table}") + context.log.info(f" Stage path: {stage_path}") + context.log.info(f" Program: {program_name}") + context.log.info(f" Subject area: {subject_area}") + context.log.info(f" Batch ID: {batch_id}") + + # 1) Load just the two columns from the CSV + copy_sql = f""" + COPY INTO {full_table} + FROM '{stage_path}' + FILE_FORMAT = ANALYTYXONE_DEV.DATALOOM.dataloom_csv + MATCH_BY_COLUMN_NAME = 'case_sensitive' + FORCE = TRUE + """ + + context.log.info(f"📋 Copying control file into {full_table}...") + snowpark_session.sql(copy_sql).collect() + + # 2) Update the metadata columns for new rows + update_sql = f""" + UPDATE {full_table} + SET + "META_PROGRAM_NAME" = '{program_name}', + "META_SUBJECT_AREA" = '{subject_area}', + "META_BATCH_ID" = {batch_id}, + "META_DATE_INSERT" = CURRENT_TIMESTAMP() + WHERE "META_BATCH_ID" IS NULL + """ + + context.log.info(f"📋 Updating metadata columns in {full_table}...") + snowpark_session.sql(update_sql).collect() + + context.log.info(f"✅ Control table load completed successfully") + + +# Legacy function for backward compatibility +def load_dq_control_data( + context: AssetExecutionContext, + snowpark_session: Session, + adls_container: str, + folder_path: str, + control_file: str, + program_name: str, + subject_area: str, + batch_id: int, +) -> None: + """ + Legacy function - calls the internal implementation. + Kept for backward compatibility with existing code. + """ + return _load_dq_control_data_internal( + context=context, + snowpark_session=snowpark_session, + adls_container=adls_container, + folder_path=folder_path, + control_file=control_file, + program_name=program_name, + subject_area=subject_area, + batch_id=batch_id, + ) \ No newline at end of file diff --git a/reusable_components/dq/dq_schema_validator.py b/reusable_components/dq/dq_schema_validator.py new file mode 100644 index 0000000..a2b5077 --- /dev/null +++ b/reusable_components/dq/dq_schema_validator.py @@ -0,0 +1,412 @@ +import os +from io import StringIO +import pandas as pd +from typing import Optional, List, Union +from azure.core.exceptions import ResourceNotFoundError +from azure.storage.filedatalake import DataLakeServiceClient +from dagster import AssetExecutionContext, MaterializeResult, MetadataValue +from snowflake.snowpark import Session + +# hard-code your DQ_OBJECTS table here +DQ_TABLE = "ANALYTYXONE_DEV.DATALOOM.DQ_OBJECTS" + + +def validate_all_file_schemas_with_result( + adls_client: DataLakeServiceClient, + container: str, + folder_path: str, + session: Session, + context: AssetExecutionContext, + dq_result: Union[dict, str], + pipeline_name: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> MaterializeResult: + """ + Complete schema validation logic: validation + schema checking + MaterializeResult. + Returns MaterializeResult ready for the asset to return. 
+ """ + + dq_status = dq_result.get("status", "unknown") + + if dq_status != "completed": + context.log.info(f"❌ Skipping {pipeline_name} schema validation - DQ validation failed: {dq_status}") + + return MaterializeResult( + value={ + "status": "skipped", + "reason": f"DQ validation failed: {dq_status}", + "validation_passed": False, + "pipeline_name": pipeline_name + }, + metadata={ + "status": MetadataValue.text("⏭️ SKIPPED"), + "reason": MetadataValue.text("Previous step failed"), + "pipeline_name": MetadataValue.text(pipeline_name) + } + ) + + context.log.info(f"🔍 {pipeline_name} starting schema validation") + context.log.info(f" Container: {container}") + context.log.info(f" Folder: {folder_path}") + context.log.info(f" Criteria: prefix={prefix}, not_contains={not_contains}, extension={extension}") + + try: + # Use internal component to do the actual validation work + validation_result = _validate_all_file_schemas_internal( + adls_client=adls_client, + container=container, + folder_path=folder_path, + session=session, + context=context, + prefix=prefix, + suffix=suffix, + contains=contains, + not_contains=not_contains, + regex=regex, + extension=extension + ) + + files_validated = validation_result.get("files_validated", 0) + files_excluded = validation_result.get("files_excluded", 0) + validation_details = validation_result.get("validation_details", []) + + context.log.info(f"✅ {pipeline_name} schema validation completed successfully") + context.log.info(f" Files validated: {files_validated}") + context.log.info(f" Files excluded: {files_excluded}") + + # Return successful MaterializeResult + return MaterializeResult( + value={ + "status": "completed", + "validation_passed": True, + "files_validated": files_validated, + "files_excluded": files_excluded, + "pipeline_name": pipeline_name, + "validation_details": validation_details, + "exclusion_criteria": not_contains + }, + metadata={ + "status": MetadataValue.text("✅ SUCCESS"), + "pipeline_name": MetadataValue.text(pipeline_name), + "validation_result": MetadataValue.text("Schema validation passed"), + "files_validated": MetadataValue.int(files_validated), + "files_excluded": MetadataValue.int(files_excluded), + "excluded_pattern": MetadataValue.text(str(not_contains) if not_contains else "None"), + "file_criteria": MetadataValue.text(f"prefix={prefix}, extension={extension}") + } + ) + + except Exception as e: + context.log.error(f"❌ {pipeline_name} schema validation failed: {str(e)}") + + # Return failed MaterializeResult + return MaterializeResult( + value={ + "status": "failed", + "error": str(e), + "validation_passed": False, + "pipeline_name": pipeline_name + }, + metadata={ + "status": MetadataValue.text("❌ FAILED"), + "error": MetadataValue.text(str(e)[:500] + "..." 
if len(str(e)) > 500 else str(e)), + "pipeline_name": MetadataValue.text(pipeline_name), + "validation_result": MetadataValue.text("Schema validation failed") + } + ) + # Re-raise the exception to ensure Dagster marks this as failed + raise + + +def _validate_all_file_schemas_internal( + adls_client: DataLakeServiceClient, + container: str, + folder_path: str, + session: Session, + context: AssetExecutionContext, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> dict: + """ + Internal function that does the actual schema validation work. + 1) Lists every CSV under container/folder_path in ADLS with optional filtering. + 2) For each, derives object_name (filename sans .csv), pulls + expected FIELD_NAMEs from DQ_OBJECTS. + 3) Downloads the CSV header, compares exact list. + 4) Raises Exception on any mismatch. + """ + import re + + fs = adls_client.get_file_system_client(container) + + # Helper function to check if file matches all criteria + def file_matches_criteria(filename: str) -> bool: + """Check if file matches all specified criteria""" + + # Check prefix criteria + if prefix: + if isinstance(prefix, list): + if not any(filename.startswith(p) for p in prefix): + return False + else: + if not filename.startswith(prefix): + return False + + # Check suffix criteria + if suffix: + if isinstance(suffix, list): + if not any(filename.endswith(s) for s in suffix): + return False + else: + if not filename.endswith(suffix): + return False + + # Check contains criteria + if contains: + if isinstance(contains, list): + if not any(c in filename for c in contains): + return False + else: + if contains not in filename: + return False + + # Check not_contains criteria + if not_contains: + if isinstance(not_contains, list): + if any(nc in filename for nc in not_contains): + return False + else: + if not_contains in filename: + return False + + # Check regex criteria + if regex: + if not re.search(regex, filename): + return False + + # Check extension criteria + if extension: + if isinstance(extension, list): + if not any(filename.endswith(ext) for ext in extension): + return False + else: + if not filename.endswith(extension): + return False + + return True + + # List all files and apply filtering + files_to_validate = [] + files_excluded = 0 + + for path in fs.get_paths(path=folder_path): + if path.is_directory: + continue + + filename = os.path.basename(path.name) + + # Must be CSV file + if not filename.lower().endswith(".csv"): + continue + + # Apply filtering criteria + if not file_matches_criteria(filename): + context.log.info(f"🔍 Skipping {filename} (doesn't match criteria)") + files_excluded += 1 + continue + + files_to_validate.append(path) + + context.log.info(f"🔍 Found {len(files_to_validate)} CSV files to validate") + context.log.info(f"🔍 Excluded {files_excluded} files based on criteria") + + validation_errors = [] + validated_count = 0 + validation_details = [] + + # Validate each file + for path in files_to_validate: + filename = os.path.basename(path.name) + object_name = os.path.splitext(filename)[0] + + try: + context.log.info(f"🔍 Validating schema for: {filename}") + + # Pull expected fields from Snowflake + query = f""" + SELECT FIELD_NAME + FROM {DQ_TABLE} + WHERE OBJECT_NAME = '{object_name}' + ORDER BY ID_OBJECT + """ + rows = 
session.sql(query).collect() + expected = [r["FIELD_NAME"] for r in rows] + + if not expected: + error_msg = f"No schema definition found in DQ_OBJECTS for object '{object_name}'" + context.log.error(f"❌ {error_msg}") + validation_errors.append(error_msg) + validation_details.append({ + "filename": filename, + "object_name": object_name, + "status": "FAILED", + "error": "No schema definition found", + "expected_columns": 0, + "actual_columns": None + }) + continue + + # Download only the header of the CSV + file_client = fs.get_file_client(path.name) + try: + raw = file_client.download_file().readall().decode("utf-8") + except ResourceNotFoundError: + error_msg = f"File not found in ADLS: {path.name}" + context.log.error(f"❌ {error_msg}") + validation_errors.append(error_msg) + validation_details.append({ + "filename": filename, + "object_name": object_name, + "status": "FAILED", + "error": "File not found", + "expected_columns": len(expected), + "actual_columns": None + }) + continue + + # Read CSV header + try: + actual = pd.read_csv(StringIO(raw), nrows=0).columns.tolist() + except Exception as e: + error_msg = f"Failed to read CSV header for {filename}: {str(e)}" + context.log.error(f"❌ {error_msg}") + validation_errors.append(error_msg) + validation_details.append({ + "filename": filename, + "object_name": object_name, + "status": "FAILED", + "error": f"Failed to read CSV header: {str(e)}", + "expected_columns": len(expected), + "actual_columns": None + }) + continue + + # Compare EXACTLY + if actual != expected: + missing = [c for c in expected if c not in actual] + extra = [c for c in actual if c not in expected] + error_msg = ( + f"Schema mismatch for '{object_name}':\n" + f" Expected: {expected}\n" + f" Actual: {actual}\n" + f" Missing columns: {missing}\n" + f" Extra columns: {extra}" + ) + context.log.error(f"❌ {error_msg}") + validation_errors.append(error_msg) + validation_details.append({ + "filename": filename, + "object_name": object_name, + "status": "FAILED", + "error": "Schema mismatch", + "expected_columns": len(expected), + "actual_columns": len(actual), + "missing_columns": missing, + "extra_columns": extra + }) + else: + context.log.info(f"✅ Schema OK for '{object_name}' ({len(actual)} columns)") + validated_count += 1 + validation_details.append({ + "filename": filename, + "object_name": object_name, + "status": "PASSED", + "expected_columns": len(expected), + "actual_columns": len(actual) + }) + + except Exception as e: + error_msg = f"Unexpected error validating {filename}: {str(e)}" + context.log.error(f"❌ {error_msg}") + validation_errors.append(error_msg) + validation_details.append({ + "filename": filename, + "object_name": object_name, + "status": "FAILED", + "error": f"Unexpected error: {str(e)}", + "expected_columns": None, + "actual_columns": None + }) + + # Summary logging + context.log.info(f"📊 Schema Validation Summary:") + context.log.info(f" Files validated successfully: {validated_count}") + context.log.info(f" Files with errors: {len(validation_errors)}") + context.log.info(f" Files excluded by criteria: {files_excluded}") + + # If there were any validation errors, raise an exception to fail the asset + if validation_errors: + context.log.error(f"❌ Schema validation failed for {len(validation_errors)} files") + + # Create a comprehensive error message + error_summary = f"Schema validation failed for {len(validation_errors)} files:\n\n" + for i, error in enumerate(validation_errors, 1): + error_summary += f"{i}. 
{error}\n\n" + + # Truncate if too long for better readability + if len(error_summary) > 2000: + error_summary = error_summary[:2000] + "...\n(truncated for readability)" + + # This will cause the asset to fail + raise Exception(error_summary) + + context.log.info(f"✅ All {validated_count} files passed schema validation!") + + return { + "files_validated": validated_count, + "files_excluded": files_excluded, + "validation_details": validation_details, + "total_files_checked": len(files_to_validate) + } + + +# Legacy function for backward compatibility +def validate_all_file_schemas( + adls_client: DataLakeServiceClient, + container: str, + folder_path: str, + session: Session, + context: AssetExecutionContext, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> None: + """ + Legacy function - calls the internal implementation. + Kept for backward compatibility with existing code. + """ + _validate_all_file_schemas_internal( + adls_client=adls_client, + container=container, + folder_path=folder_path, + session=session, + context=context, + prefix=prefix, + suffix=suffix, + contains=contains, + not_contains=not_contains, + regex=regex, + extension=extension + ) \ No newline at end of file diff --git a/reusable_components/dq/dq_transactions.py b/reusable_components/dq/dq_transactions.py new file mode 100644 index 0000000..25be6b6 --- /dev/null +++ b/reusable_components/dq/dq_transactions.py @@ -0,0 +1,323 @@ +import os +import io +import zipfile +import re +from typing import List, Optional, Union +from dagster import AssetExecutionContext, MaterializeResult, MetadataValue +from snowflake.snowpark import Session +from dagster_azure.adls2 import ADLS2Resource +from datetime import date + + +def load_dq_transactions_with_result( + context: AssetExecutionContext, + snowpark_session: Session, + adls_client: ADLS2Resource, + copy_result: dict, + audit_batch_id: int, + container_name: str, + directory_path: str, + program_name: str, + subject_area: str, + pipeline_name: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> MaterializeResult: + """ + Complete DQ transactions logic: validation + transaction loading + MaterializeResult. + Returns MaterializeResult ready for the asset to return. 
+ """ + + # Check if copy was successful + copy_status = copy_result.get("status", "unknown") + + if copy_status != "completed": + context.log.info(f"❌ Skipping {pipeline_name} DQ transactions - copy status: {copy_status}") + + return MaterializeResult( + value=[], + metadata={ + "status": MetadataValue.text("⏭️ SKIPPED"), + "reason": MetadataValue.text(f"Copy operation status: {copy_status}"), + "pipeline_name": MetadataValue.text(pipeline_name), + "batch_id": MetadataValue.int(audit_batch_id) + } + ) + + context.log.info(f"📊 {pipeline_name} loading DQ transactions for batch {audit_batch_id}") + + try: + # Load DQ transactions using internal component + transaction_ids = _load_dq_transactions_internal( + context=context, + snowpark_session=snowpark_session, + adls_client=adls_client, + container_name=container_name, + directory_path=directory_path, + batch_id=audit_batch_id, + program_name=program_name, + subject_area=subject_area, + prefix=prefix, + suffix=suffix, + contains=contains, + not_contains=not_contains, + regex=regex, + extension=extension + ) + + context.log.info(f"✅ {pipeline_name} loaded {len(transaction_ids)} file transactions") + + # Return successful MaterializeResult + return MaterializeResult( + value=transaction_ids, + metadata={ + "status": MetadataValue.text("✅ SUCCESS"), + "transactions_loaded": MetadataValue.int(len(transaction_ids)), + "batch_id": MetadataValue.int(audit_batch_id), + "pipeline_name": MetadataValue.text(pipeline_name), + "file_criteria_used": MetadataValue.text(f"prefix={prefix}, extension={extension}") + } + ) + + except Exception as e: + context.log.error(f"❌ {pipeline_name} DQ transactions failed: {str(e)}") + + # Return failed MaterializeResult + return MaterializeResult( + value=[], + metadata={ + "status": MetadataValue.text("❌ ERROR"), + "error": MetadataValue.text(str(e)[:200] + "..." if len(str(e)) > 200 else str(e)), + "pipeline_name": MetadataValue.text(pipeline_name), + "batch_id": MetadataValue.int(audit_batch_id), + "transactions_loaded": MetadataValue.int(0) + } + ) + raise + + +def _load_dq_transactions_internal( + context: AssetExecutionContext, + snowpark_session: Session, + adls_client: ADLS2Resource, + container_name: str, + directory_path: str, + batch_id: int, + program_name: str, + subject_area: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> List[str]: + """ + Internal function that does the actual DQ transactions work. + - Lists files under container_name/directory_path in ADLS, filtering by criteria. + - Inserts a parent transaction record for each file, retrieves its auto-generated ID_TRANSACTION. + - If the file is a ZIP, inspects its CSV contents and inserts a child record for each CSV, + setting CHILD_ID_TRANSACTION to the parent's ID_TRANSACTION. + + Returns a list of generated parent and child ID_TRANSACTION values as strings. 
+ """ + # Hard-coded parameters + transaction_direction = "INBOUND" + frequency_code = "DAILY" + updated_by = "ETL" + date_str = date.today().isoformat() + + fs_client = adls_client.get_file_system_client(container_name) + base_url = fs_client.url + transaction_counter = 0 + inserted_ids: List[str] = [] + + context.log.info(f"📁 Scanning ADLS directory: {container_name}/{directory_path}") + + # Map extensions to file type labels + ext_map = { + ".csv": "CSV", + ".parquet": "PARQUET", + ".txt": "TEXT", + ".xlsx": "EXCEL", + ".xls": "EXCEL", + } + + # Helper function to check if file matches criteria + def file_matches_criteria(filename: str) -> bool: + """Check if file matches all specified criteria""" + + # Check prefix criteria + if prefix: + if isinstance(prefix, list): + if not any(filename.startswith(p) for p in prefix): + return False + else: + if not filename.startswith(prefix): + return False + + # Check suffix criteria + if suffix: + if isinstance(suffix, list): + if not any(filename.endswith(s) for s in suffix): + return False + else: + if not filename.endswith(suffix): + return False + + # Check contains criteria + if contains: + if isinstance(contains, list): + if not any(c in filename for c in contains): + return False + else: + if contains not in filename: + return False + + # Check not_contains criteria + if not_contains: + if isinstance(not_contains, list): + if any(nc in filename for nc in not_contains): + return False + else: + if not_contains in filename: + return False + + # Check regex criteria + if regex: + if not re.search(regex, filename): + return False + + # Check extension criteria + if extension: + if isinstance(extension, list): + if not any(filename.lower().endswith(ext.lower()) for ext in extension): + return False + else: + if not filename.lower().endswith(extension.lower()): + return False + + return True + + for path_props in fs_client.get_paths(path=directory_path): + if path_props.is_directory: + continue + + file_name = os.path.basename(path_props.name) + + # Apply file criteria filtering + if not file_matches_criteria(file_name): + context.log.info(f"⏭️ Skipping file: {file_name} (doesn't match criteria)") + continue + + # Generate new transaction ID + transaction_counter += 1 + transaction_id = f"{program_name}_{subject_area}_{date_str}_{batch_id}_{transaction_counter}" + + file_fqn = f"{base_url}/{path_props.name}" + is_zip = file_name.lower().endswith(".zip") + if is_zip: + file_type = "ZIP" + else: + ext = os.path.splitext(file_name)[1].lower() + file_type = ext_map.get(ext, ext.lstrip(".").upper()) + + size_bytes = getattr(path_props, "content_length", None) + file_size = f"{size_bytes}B" if size_bytes is not None else "UNKNOWN" + date_inbound = path_props.creation_time.isoformat() + + context.log.info(f"→ File: {file_name} | Created: {date_inbound}") + + # Insert parent record + insert_parent_sql = f""" + INSERT INTO ANALYTYXONE_DEV.DATALOOM.DQ_TRANSACTIONS ( + ID_TRANSACTION, + CHILD_ID_TRANSACTION, + FILE_NAME, + FILE_PATH_FQN, + CODE_FILE_TYPE, + CODE_TRANSACTION_TYPE, + DATE_INBOUND, + SUBJECT_AREA, + ID_BATCH, + CODE_FREQUENCY, + FILE_SIZE, + UPDATED_BY + ) VALUES ( + '{transaction_id}', + NULL, + '{file_name}', + '{file_fqn}', + '{file_type}', + '{transaction_direction}', + '{date_inbound}', + '{subject_area}', + {batch_id}, + '{frequency_code}', + '{file_size}', + '{updated_by}' + ) + """ + snowpark_session.sql(insert_parent_sql).collect() + context.log.info(f"✔ Inserted parent ID_TRANSACTION={transaction_id}") + 
inserted_ids.append(transaction_id) + + # If ZIP, preview and insert child records + if is_zip: + file_client = fs_client.get_file_client(path_props.name) + stream = io.BytesIO() + file_client.download_file().readinto(stream) + stream.seek(0) + + with zipfile.ZipFile(stream, 'r') as zf: + child_counter = 0 + for zip_info in zf.infolist(): + child_counter += 1 + child_name = os.path.basename(zip_info.filename) + + ext = os.path.splitext(child_name)[1].lower() + child_file_type = ext_map.get(ext, ext.lstrip(".").upper()) + child_size = f"{zip_info.file_size}B" + + insert_child_sql = f""" + INSERT INTO ANALYTYXONE_DEV.DATALOOM.DQ_TRANSACTIONS ( + ID_TRANSACTION, + CHILD_ID_TRANSACTION, + FILE_NAME, + FILE_PATH_FQN, + CODE_FILE_TYPE, + CODE_TRANSACTION_TYPE, + DATE_INBOUND, + SUBJECT_AREA, + ID_BATCH, + CODE_FREQUENCY, + FILE_SIZE, + UPDATED_BY + ) VALUES ( + '{transaction_id}', + '{child_counter}', + '{child_name}', + '{file_fqn}/{child_name}', + '{child_file_type}', + '{transaction_direction}', + '{date_inbound}', + '{subject_area}', + {batch_id}, + '{frequency_code}', + '{child_size}', + '{updated_by}' + ) + """ + snowpark_session.sql(insert_child_sql).collect() + context.log.info( + f"✔ Inserted child ID_TRANSACTION={transaction_id}, " + f"CHILD_ID_TRANSACTION={child_counter} for {child_name}" + ) + inserted_ids.append(f"{transaction_id}.{child_counter}") + + context.log.info(f"📊 Total records created: {len(inserted_ids)}") + return inserted_ids \ No newline at end of file diff --git a/reusable_components/dq/row_count_validator.py b/reusable_components/dq/row_count_validator.py new file mode 100644 index 0000000..48e9c61 --- /dev/null +++ b/reusable_components/dq/row_count_validator.py @@ -0,0 +1,391 @@ +from io import BytesIO +import pandas as pd +import re +from dagster import AssetExecutionContext, MaterializeResult, MetadataValue +from dagster_azure.adls2 import ADLS2Resource +from snowflake.snowpark import Session +from azure.core.exceptions import ResourceNotFoundError +from snowflake.snowpark.functions import col +from typing import Union, Optional, List + + +def run_dq_row_count_validation_with_result( + context: AssetExecutionContext, + session: Session, + adls2: ADLS2Resource, + control_table_result: Union[dict, str], + audit_batch_id: int, + container: str, + folder_path: str, + program_name: str, + subject_area: str, + pipeline_name: str, + # NEW: Optional file filtering parameters + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> MaterializeResult: + """ + Complete row count validation logic with file filtering support. + + NEW: Now supports filtering which files to validate based on criteria: + - prefix: File must start with these patterns + - suffix: File must end with these patterns + - contains: File must contain these patterns + - not_contains: File must NOT contain these patterns + - regex: File must match this regex pattern + - extension: File must have these extensions + + Returns MaterializeResult ready for the asset to return. 
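Example (illustrative sketch): calling the wrapper with optional filtering. Literal values are assumptions; the adls2 argument is the underlying DataLakeServiceClient used for get_file_system_client.

    result = run_dq_row_count_validation_with_result(
        context=context,
        session=context.resources.snowflake_snowpark,
        adls2=context.resources.adls2.adls2_client,
        control_table_result=upstream_control_result,  # dict with "status": "completed"
        audit_batch_id=101,
        container="my-stage-container",
        folder_path="landing/claims",
        program_name="MEDICAID",
        subject_area="CLAIMS",
        pipeline_name="MY_PIPELINE",
        not_contains="CONTROL",   # illustrative: skip table names containing CONTROL
        extension=".csv",
    )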
+ """ + + control_status = control_table_result.get("status", "unknown") + + if control_status != "completed": + context.log.info(f"❌ Skipping {pipeline_name} DQ validation - control table load failed: {control_status}") + + return MaterializeResult( + value={"status": "skipped", "reason": f"Control table load failed: {control_status}", "pipeline_name": pipeline_name}, + metadata={ + "status": MetadataValue.text("⏭️ SKIPPED"), + "reason": MetadataValue.text("Previous step failed"), + "pipeline_name": MetadataValue.text(pipeline_name), + "batch_id": MetadataValue.int(audit_batch_id) + } + ) + + try: + context.log.info(f"🔍 {pipeline_name} starting DQ validation for batch {audit_batch_id}") + + # Log filtering criteria if provided + if any([prefix, suffix, contains, not_contains, regex, extension]): + context.log.info(f"🔍 {pipeline_name} File Filtering Criteria:") + if prefix: + context.log.info(f" - prefix: {prefix}") + if suffix: + context.log.info(f" - suffix: {suffix}") + if contains: + context.log.info(f" - contains: {contains}") + if not_contains: + context.log.info(f" - not_contains: {not_contains}") + if regex: + context.log.info(f" - regex: {regex}") + if extension: + context.log.info(f" - extension: {extension}") + + # Use internal component to do the actual validation work + validation_result = _run_dq_row_count_validation_internal( + context=context, + session=session, + adls2=adls2, + container=container, + folder_path=folder_path, + program_name=program_name, + subject_area=subject_area, + batch_id=audit_batch_id, + # Pass filtering criteria + prefix=prefix, + suffix=suffix, + contains=contains, + not_contains=not_contains, + regex=regex, + extension=extension + ) + + files_validated = validation_result.get("files_validated", 0) + files_filtered_out = validation_result.get("files_filtered_out", 0) + validation_details = validation_result.get("validation_details", []) + + context.log.info(f"✅ {pipeline_name} validation completed:") + context.log.info(f" Files validated: {files_validated}") + if files_filtered_out > 0: + context.log.info(f" Files filtered out: {files_filtered_out}") + + # Return successful MaterializeResult + return MaterializeResult( + value={ + "status": "completed", + "batch_id": audit_batch_id, + "validation_passed": True, + "pipeline_name": pipeline_name, + "files_validated": files_validated, + "files_filtered_out": files_filtered_out, + "validation_details": validation_details, + "filtering_applied": any([prefix, suffix, contains, not_contains, regex, extension]) + }, + metadata={ + "status": MetadataValue.text("✅ SUCCESS"), + "batch_id": MetadataValue.int(audit_batch_id), + "pipeline_name": MetadataValue.text(pipeline_name), + "validation_result": MetadataValue.text("All files passed validation"), + "files_validated": MetadataValue.int(files_validated), + "files_filtered_out": MetadataValue.int(files_filtered_out), + "program_name": MetadataValue.text(program_name), + "subject_area": MetadataValue.text(subject_area), + "filtering_criteria": MetadataValue.text( + f"prefix={prefix}, not_contains={not_contains}, extension={extension}" + if any([prefix, suffix, contains, not_contains, regex, extension]) + else "No filtering applied" + ) + } + ) + + except Exception as e: + context.log.error(f"❌ {pipeline_name} DQ validation failed: {str(e)}") + + # Return failed MaterializeResult + return MaterializeResult( + value={"status": "failed", "error": str(e), "validation_passed": False, "pipeline_name": pipeline_name}, + metadata={ + "status": MetadataValue.text("❌ 
FAILED"), + "error": MetadataValue.text(str(e)[:300] + "..." if len(str(e)) > 300 else str(e)), + "pipeline_name": MetadataValue.text(pipeline_name), + "batch_id": MetadataValue.int(audit_batch_id), + "program_name": MetadataValue.text(program_name), + "subject_area": MetadataValue.text(subject_area) + } + ) + + +def _run_dq_row_count_validation_internal( + context: AssetExecutionContext, + session: Session, + adls2: ADLS2Resource, + container: str, + folder_path: str, + program_name: str, + subject_area: str, + batch_id: int, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> dict: + """ + 1) Load TABLE_NAME/ROW_COUNT from Snowflake (fully qualified). + 2) For each entry, pull .csv from ADLS and compare its row count. + 3) Raise if any files are missing or counts don’t match. + """ + + def file_matches_criteria(filename: str) -> bool: + """Check if file matches all specified criteria""" + + # Check prefix criteria + if prefix: + if isinstance(prefix, list): + if not any(filename.startswith(p) for p in prefix): + return False + else: + if not filename.startswith(prefix): + return False + + # Check suffix criteria + if suffix: + if isinstance(suffix, list): + if not any(filename.endswith(s) for s in suffix): + return False + else: + if not filename.endswith(suffix): + return False + + # Check contains criteria + if contains: + if isinstance(contains, list): + if not any(c in filename for c in contains): + return False + else: + if contains not in filename: + return False + + # Check not_contains criteria + if not_contains: + if isinstance(not_contains, list): + if any(nc in filename for nc in not_contains): + return False + else: + if not_contains in filename: + return False + + # Check regex criteria + if regex: + if not re.search(regex, filename): + return False + + # Check extension criteria + if extension: + if isinstance(extension, list): + if not any(filename.lower().endswith(ext.lower()) for ext in extension): + return False + else: + if not filename.lower().endswith(extension.lower()): + return False + + return True + + # Load just the two needed columns in one go + control_df = ( + session + .table(f"ANALYTYXONE_DEV.DATALOOM.DQ_CONTROL_TABLE") + .filter( + (col("META_BATCH_ID") == batch_id) & + (col("META_PROGRAM_NAME") == program_name) & + (col("META_SUBJECT_AREA") == subject_area) + ) + .select("TABLE_NAME", "ROW_COUNT") + .to_pandas() + ) + + if control_df.empty: + context.log.warning(f"⚠️ No control records found for batch {batch_id}, program {program_name}, subject {subject_area}") + return { + "files_validated": 0, + "files_filtered_out": 0, + "validation_details": [], + "control_records_found": 0 + } + + context.log.info(f"📊 Found {len(control_df)} control records") + + # Apply filtering to determine which files to validate + files_to_validate = [] + files_filtered_out = 0 + + for table_name, expected in control_df.itertuples(index=False, name=None): + filename = f"{table_name}.csv" + + if file_matches_criteria(filename): + files_to_validate.append((table_name, int(expected))) + context.log.info(f" ✅ Will validate: {filename}") + else: + files_filtered_out += 1 + context.log.info(f" 🚫 Filtered out: {filename} (doesn't match criteria)") + + context.log.info(f"📋 Filtering Results:") + context.log.info(f" Total control records: 
{len(control_df)}") + context.log.info(f" Files to validate: {len(files_to_validate)}") + context.log.info(f" Files filtered out: {files_filtered_out}") + + if not files_to_validate: + context.log.warning(f"⚠️ No files match the filtering criteria") + return { + "files_validated": 0, + "files_filtered_out": files_filtered_out, + "validation_details": [], + "control_records_found": len(control_df) + } + + fs = adls2.get_file_system_client(container) + errors = [] + validation_details = [] + files_validated = 0 + + for table_name, expected in files_to_validate: + expected = int(expected) + path = f"{folder_path}/{table_name}.csv" + + try: + context.log.info(f"🔍 Validating: {table_name}.csv") + + blob = fs.get_file_client(path) + data = blob.download_file().readall() + actual = len(pd.read_csv(BytesIO(data), low_memory=False)) + + context.log.info(f" 📊 {table_name}.csv → expected {expected}, got {actual}") + + if actual != expected: + error_msg = f"{table_name}.csv: {actual} rows (expected {expected})" + errors.append(error_msg) + validation_details.append({ + "table_name": table_name, + "expected_rows": expected, + "actual_rows": actual, + "status": "FAILED", + "error": f"Row count mismatch: expected {expected}, got {actual}" + }) + context.log.error(f" ❌ {error_msg}") + else: + files_validated += 1 + validation_details.append({ + "table_name": table_name, + "expected_rows": expected, + "actual_rows": actual, + "status": "PASSED" + }) + context.log.info(f" ✅ {table_name}.csv: Row count matches ({actual} rows)") + + except ResourceNotFoundError: + error_msg = f"{table_name}.csv: file not found" + errors.append(error_msg) + validation_details.append({ + "table_name": table_name, + "expected_rows": expected, + "actual_rows": None, + "status": "FAILED", + "error": "File not found" + }) + context.log.error(f" ❌ {error_msg}") + + except Exception as e: + error_msg = f"{table_name}.csv: validation error - {str(e)}" + errors.append(error_msg) + validation_details.append({ + "table_name": table_name, + "expected_rows": expected, + "actual_rows": None, + "status": "FAILED", + "error": str(e) + }) + context.log.error(f" ❌ {error_msg}") + + # Summary logging + context.log.info(f"📊 Validation Summary:") + context.log.info(f" Files checked: {len(files_to_validate)}") + context.log.info(f" Files passed: {files_validated}") + context.log.info(f" Files failed: {len(errors)}") + context.log.info(f" Files filtered out: {files_filtered_out}") + + if errors: + context.log.error(f"❌ Validation failures:") + for error in errors: + context.log.error(f" • {error}") + + raise RuntimeError("DQ row-count validation failed:\n" + "\n".join(errors)) + + return { + "files_validated": files_validated, + "files_filtered_out": files_filtered_out, + "validation_details": validation_details, + "control_records_found": len(control_df) + } + + +# Legacy function for backward compatibility +def run_dq_row_count_validation( + context: AssetExecutionContext, + session: Session, + adls2: ADLS2Resource, + container: str, + folder_path: str, + program_name: str, + subject_area: str, + batch_id: int, +) -> None: + """ + Legacy function - calls the internal implementation without filtering. + Kept for backward compatibility with existing code. 
+ """ + _run_dq_row_count_validation_internal( + context=context, + session=session, + adls2=adls2, + container=container, + folder_path=folder_path, + program_name=program_name, + subject_area=subject_area, + batch_id=batch_id, + ) \ No newline at end of file diff --git a/reusable_components/file_processing/unzip.py b/reusable_components/error_handling/__init__.py similarity index 100% rename from reusable_components/file_processing/unzip.py rename to reusable_components/error_handling/__init__.py diff --git a/reusable_components/error_handling/alert.py b/reusable_components/error_handling/alert.py new file mode 100644 index 0000000..e3e4870 --- /dev/null +++ b/reusable_components/error_handling/alert.py @@ -0,0 +1,164 @@ +import requests +from datetime import datetime +from dagster import get_dagster_logger +from functools import wraps + +def send_alert( + logic_app_url: str, + trigger_type: str, + message: str, + pipeline_name: str, + run_id: str, + error_details: str = None +): + """ + Sends alert notifications to Azure Logic App + + This function creates a structured payload and sends it to your Logic App, + which then distributes the alert to email, Teams, Slack, etc. + + Args: + logic_app_url: The HTTP trigger URL of your Azure Logic App + trigger_type: "info" (success), "error" (failure), or "log" (warning) + message: The main message to send in the alert + pipeline_name: Name of the Dagster asset that triggered this alert + run_id: Unique identifier for this Dagster run + error_details: Additional error information (for error/log types) + + Returns: + bool: True if alert sent successfully, False if failed + """ + logger = get_dagster_logger() + + severity_mapping = { + "error": "High", + "info": "Informational", + "log": "Medium" + } + + alert_payload = { + "triggerType": trigger_type, + "environment": "Development", + "applicationName": "Dagster", + "programName": "Medicaid Data Processing", + "pipelineName": pipeline_name, + "displayMessage": message, + "subjectMessage": message, + "runID": run_id, + "severity": severity_mapping.get(trigger_type, "Informational"), + "AlertTimestamp": datetime.utcnow().isoformat() + "Z", + "recipients": { + "email": [ + "greeshmanjali@amida.com" + ] + } + } + + # detailed error information for error and log alerts + if error_details and trigger_type in ["error", "log"]: + alert_payload["errorMessage"] = { + "message": error_details, + "code": trigger_type.upper(), + "stackTrace": error_details + } + + # Send alert to Logic App via HTTP POST + try: + # Make HTTP request to Logic App trigger endpoint + response = requests.post( + url=logic_app_url, + json=alert_payload, + timeout=30 + ) + response.raise_for_status() + logger.info(f"✅ {trigger_type.upper()} alert sent successfully to Logic App") + return True + + except requests.exceptions.RequestException as e: + logger.error(f"❌ Failed to send {trigger_type} alert to Logic App: {str(e)}") + return False + + except Exception as e: + logger.error(f"❌ Unexpected error sending {trigger_type} alert: {str(e)}") + return False + +def with_alerts( + logic_app_url: str = "https://prod-47.eastus2.logic.azure.com:443/workflows/09f114d3198a487db73ec504e0277148/triggers/Receive_Medicaid_Alert_Request/paths/invoke?api-version=2016-10-01&sp=%2Ftriggers%2FReceive_Medicaid_Alert_Request%2Frun&sv=1.0&sig=QdaGnWj1_gZp0nzZZCWzkgn-1nj_r9LGG-al5f1NPrE", + send_success_alerts: bool = True +): + """ + Decorator that automatically adds alert functionality to any Dagster asset + + This decorator wraps your asset function 
and: + 1. Runs your original asset function + 2. Sends SUCCESS alert if it completes without errors + 3. Sends ERROR alert if it fails, then re-throws the error to Dagster + + Usage: + @asset(name="my_asset") + @with_alerts() # Just add this line! + def my_asset(context): + # Your asset code here + return result + + Args: + logic_app_url: Azure Logic App endpoint URL (has default) + send_success_alerts: Whether to send alerts on successful completion + + Returns: + Decorated function with automatic alert capabilities + """ + + def decorator(asset_function): + """ + Inner decorator function that wraps the asset + """ + + @wraps(asset_function) + def wrapper(context, *args, **kwargs): + """ + Wrapper function that runs instead of the original asset + This adds the alert logic around your asset execution + """ + + logger = get_dagster_logger() + asset_name = asset_function.__name__ + + try: + logger.info(f"Starting execution of asset: {asset_name}") + result = asset_function(context, *args, **kwargs) + logger.info(f"Asset {asset_name} completed successfully") + + # If asset completed successfully and success alerts are enabled + if send_success_alerts: + send_alert( + logic_app_url=logic_app_url, + trigger_type="info", + message=f"Asset '{asset_name}' completed successfully.", + pipeline_name=asset_name, + run_id=context.run_id + ) + + # Return the original result from your asset + return result + + except Exception as error: + # If your asset fails for any reason, we catch the exception here + # Log the error in Dagster logs + error_message = str(error) + logger.error(f"Asset '{asset_name}' failed with error: {error_message}") + + # Send notification to external systems (email, Teams, Slack) via Logic App + send_alert( + logic_app_url=logic_app_url, + trigger_type="error", + message=f"Asset '{asset_name}' failed: {error_message}", + pipeline_name=asset_name, + run_id=context.run_id, + error_details=error_message + ) + raise error + + return wrapper + + return decorator diff --git a/reusable_components/error_handling/standardized_alerts.py b/reusable_components/error_handling/standardized_alerts.py new file mode 100644 index 0000000..a99c1fd --- /dev/null +++ b/reusable_components/error_handling/standardized_alerts.py @@ -0,0 +1,426 @@ +import requests +from datetime import datetime +from dagster import get_dagster_logger, AssetExecutionContext +from functools import wraps +from typing import Dict, Any, Optional + + +# Default alert configurations for different pipelines +DEFAULT_ALERT_CONFIGS = { + "MEDICAID_RECIPIENT": { + "logic_app_url": "https://prod-47.eastus2.logic.azure.com:443/workflows/09f114d3198a487db73ec504e0277148/triggers/Receive_Medicaid_Alert_Request/paths/invoke?api-version=2016-10-01&sp=%2Ftriggers%2FReceive_Medicaid_Alert_Request%2Frun&sv=1.0&sig=QdaGnWj1_gZp0nzZZCWzkgn-1nj_r9LGG-al5f1NPrE", + "environment": "Development", + "application_name": "Dagster - Medicaid Processing", + "program_name": "Medicaid Data Processing", + "recipients": { + "email": ["greeshmanjali@amida.com"] + }, + "send_success_alerts": True + }, + "MEDICAID_PROVIDER": { + "logic_app_url": "https://prod-47.eastus2.logic.azure.com:443/workflows/09f114d3198a487db73ec504e0277148/triggers/Receive_Medicaid_Alert_Request/paths/invoke?api-version=2016-10-01&sp=%2Ftriggers%2FReceive_Medicaid_Alert_Request%2Frun&sv=1.0&sig=QdaGnWj1_gZp0nzZZCWzkgn-1nj_r9LGG-al5f1NPrE", + "environment": "Development", + "application_name": "Dagster - Medicaid Processing", + "program_name": "Medicaid Provider Processing", + 
"recipients": { + "email": ["greeshmanjali@amida.com"] + }, + "send_success_alerts": True + }, + "MEDICAID_CLAIMS": { + "logic_app_url": "https://prod-47.eastus2.logic.azure.com:443/workflows/09f114d3198a487db73ec504e0277148/triggers/Receive_Medicaid_Alert_Request/paths/invoke?api-version=2016-10-01&sp=%2Ftriggers%2FReceive_Medicaid_Alert_Request%2Frun&sv=1.0&sig=QdaGnWj1_gZp0nzZZCWzkgn-1nj_r9LGG-al5f1NPrE", + "environment": "Development", + "application_name": "Dagster - Medicaid Processing", + "program_name": "Medicaid Claims Processing", + "recipients": { + "email": ["greeshmanjali@amida.com"] + }, + "send_success_alerts": True + }, + "DEFAULT": { + "logic_app_url": "https://prod-47.eastus2.logic.azure.com:443/workflows/09f114d3198a487db73ec504e0277148/triggers/Receive_Medicaid_Alert_Request/paths/invoke?api-version=2016-10-01&sp=%2Ftriggers%2FReceive_Medicaid_Alert_Request%2Frun&sv=1.0&sig=QdaGnWj1_gZp0nzZZCWzkgn-1nj_r9LGG-al5f1NPrE", + "environment": "Development", + "application_name": "Dagster", + "program_name": "Data Processing Pipeline", + "recipients": { + "email": ["greeshmanjali@amida.com"] + }, + "send_success_alerts": True + } +} + + +def get_pipeline_alert_config(pipeline_name: str, custom_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """ + Get alert configuration for a specific pipeline + + Args: + pipeline_name: Name of the pipeline (e.g., "MEDICAID_RECIPIENT") + custom_config: Optional custom configuration overrides + + Returns: + Dict containing alert configuration + """ + # Get base config for pipeline or default + base_config = DEFAULT_ALERT_CONFIGS.get(pipeline_name, DEFAULT_ALERT_CONFIGS["DEFAULT"]).copy() + + # Apply custom overrides if provided + if custom_config: + base_config.update(custom_config) + + return base_config + + +def send_pipeline_alert( + context: AssetExecutionContext, + pipeline_name: str, + trigger_type: str, + message: str, + error_details: Optional[str] = None, + alert_config: Optional[Dict[str, Any]] = None, + additional_metadata: Optional[Dict[str, Any]] = None +) -> bool: + """ + Send standardized alert for any pipeline + + Args: + context: Dagster asset execution context + pipeline_name: Name of the pipeline (e.g., "MEDICAID_RECIPIENT") + trigger_type: "info" (success), "error" (failure), or "log" (warning) + message: Main alert message + error_details: Additional error information (for error/log types) + alert_config: Optional custom alert configuration + additional_metadata: Additional metadata to include in alert + + Returns: + bool: True if alert sent successfully, False if failed + """ + logger = get_dagster_logger() + + # Get alert configuration for this pipeline + config = get_pipeline_alert_config(pipeline_name, alert_config) + + # Map trigger types to severity levels + severity_mapping = { + "error": "High", + "info": "Informational", + "log": "Medium" + } + + # Get asset name from context + asset_name = getattr(context, 'asset_key', {}) + if hasattr(asset_name, 'path'): + asset_name = asset_name.path[-1] if asset_name.path else "unknown_asset" + else: + asset_name = "unknown_asset" + + # Build alert payload + alert_payload = { + "triggerType": trigger_type, + "environment": config.get("environment", "Development"), + "applicationName": config.get("application_name", "Dagster"), + "programName": config.get("program_name", pipeline_name), + "pipelineName": pipeline_name, + "assetName": asset_name, + "displayMessage": message, + "subjectMessage": f"[{pipeline_name}] {message}", + "runID": context.run_id, + "severity": 
severity_mapping.get(trigger_type, "Informational"), + "AlertTimestamp": datetime.utcnow().isoformat() + "Z", + "recipients": config.get("recipients", {"email": ["greeshmanjali@amida.com"]}), + "pipeline_metadata": { + "pipeline_name": pipeline_name, + "asset_name": asset_name, + "run_id": context.run_id, + "trigger_type": trigger_type + } + } + + # Add error details for error/log alerts + if error_details and trigger_type in ["error", "log"]: + alert_payload["errorMessage"] = { + "message": error_details, + "code": trigger_type.upper(), + "stackTrace": error_details, + "asset_name": asset_name, + "pipeline_name": pipeline_name + } + + # Add any additional metadata + if additional_metadata: + alert_payload["additional_metadata"] = additional_metadata + + # Send alert to Logic App + try: + logic_app_url = config.get("logic_app_url") + if not logic_app_url: + logger.error(f"❌ No Logic App URL configured for pipeline: {pipeline_name}") + return False + + response = requests.post( + url=logic_app_url, + json=alert_payload, + timeout=30 + ) + response.raise_for_status() + + logger.info(f"✅ {trigger_type.upper()} alert sent for {pipeline_name} - {asset_name}") + return True + + except requests.exceptions.RequestException as e: + logger.error(f"❌ Failed to send {trigger_type} alert for {pipeline_name}: {str(e)}") + return False + + except Exception as e: + logger.error(f"❌ Unexpected error sending {trigger_type} alert for {pipeline_name}: {str(e)}") + return False + + +def with_pipeline_alerts( + pipeline_name: str, + alert_config: Optional[Dict[str, Any]] = None, + send_success_alerts: Optional[bool] = None +): + """ + Decorator that automatically adds standardized alerting to any Dagster asset + + This decorator: + 1. Runs your asset function + 2. Sends SUCCESS alert if completed without errors (if enabled) + 3. Sends ERROR alert if failed, then re-throws error to Dagster + 4. 
Uses pipeline-specific configuration + + Args: + pipeline_name: Name of the pipeline (e.g., "MEDICAID_RECIPIENT") + alert_config: Optional custom alert configuration + send_success_alerts: Override success alert setting + + Usage: + @asset(name="my_asset") + @with_pipeline_alerts(pipeline_name="MEDICAID_RECIPIENT") + def my_asset(context): + # Your asset code here + return result + """ + + def decorator(asset_function): + @wraps(asset_function) + def wrapper(context, *args, **kwargs): + logger = get_dagster_logger() + asset_name = asset_function.__name__ + + # Get pipeline configuration + config = get_pipeline_alert_config(pipeline_name, alert_config) + should_send_success = send_success_alerts if send_success_alerts is not None else config.get("send_success_alerts", True) + + try: + logger.info(f"🚀 [{pipeline_name}] Starting asset: {asset_name}") + + # Execute the original asset function + result = asset_function(context, *args, **kwargs) + + logger.info(f"✅ [{pipeline_name}] Asset completed: {asset_name}") + + # Send success alert if enabled + if should_send_success: + send_pipeline_alert( + context=context, + pipeline_name=pipeline_name, + trigger_type="info", + message=f"Asset '{asset_name}' completed successfully", + alert_config=alert_config, + additional_metadata={ + "execution_status": "success", + "asset_name": asset_name + } + ) + + return result + + except Exception as error: + # Log error + error_message = str(error) + logger.error(f"❌ [{pipeline_name}] Asset failed: {asset_name} - {error_message}") + + # Send error alert + send_pipeline_alert( + context=context, + pipeline_name=pipeline_name, + trigger_type="error", + message=f"Asset '{asset_name}' failed: {error_message}", + error_details=error_message, + alert_config=alert_config, + additional_metadata={ + "execution_status": "failed", + "asset_name": asset_name, + "error_type": type(error).__name__ + } + ) + + # Re-raise the error so Dagster marks the asset as failed + raise error + + return wrapper + return decorator + + +# Legacy compatibility functions (for existing code) +def send_alert( + logic_app_url: str, + trigger_type: str, + message: str, + pipeline_name: str, + run_id: str, + error_details: str = None +) -> bool: + """ + Legacy compatibility function - use send_pipeline_alert instead + + This function is kept for backward compatibility with existing code. + New code should use send_pipeline_alert() instead. + """ + logger = get_dagster_logger() + logger.warning("⚠️ Using legacy send_alert function. Consider upgrading to send_pipeline_alert.") + + # Create a minimal context-like object for compatibility + class LegacyContext: + def __init__(self, run_id, pipeline_name): + self.run_id = run_id + self.asset_key = type('obj', (object,), {'path': [pipeline_name]})() + + context = LegacyContext(run_id, pipeline_name) + + # Use new alert system with legacy parameters + custom_config = {"logic_app_url": logic_app_url} + + return send_pipeline_alert( + context=context, + pipeline_name=pipeline_name, + trigger_type=trigger_type, + message=message, + error_details=error_details, + alert_config=custom_config + ) + + +def with_alerts( + logic_app_url: str = None, + send_success_alerts: bool = True +): + """ + Legacy compatibility decorator - use with_pipeline_alerts instead + + This decorator is kept for backward compatibility with existing code. + New code should use with_pipeline_alerts() instead. 
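Migration sketch (asset and pipeline names are illustrative):

    # Before (legacy):
    #   @asset(name="my_asset")
    #   @with_alerts(logic_app_url="https://<logic-app-endpoint>")
    #   def my_asset(context): ...
    #
    # After (preferred):
    #   @asset(name="my_asset")
    #   @with_pipeline_alerts(pipeline_name="MEDICAID_RECIPIENT")
    #   def my_asset(context): ...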
+ """ + def decorator(asset_function): + @wraps(asset_function) + def wrapper(context, *args, **kwargs): + logger = get_dagster_logger() + logger.warning("⚠️ Using legacy with_alerts decorator. Consider upgrading to with_pipeline_alerts.") + + asset_name = asset_function.__name__ + + try: + logger.info(f"Starting execution of asset: {asset_name}") + result = asset_function(context, *args, **kwargs) + logger.info(f"Asset {asset_name} completed successfully") + + if send_success_alerts and logic_app_url: + send_alert( + logic_app_url=logic_app_url, + trigger_type="info", + message=f"Asset '{asset_name}' completed successfully.", + pipeline_name=asset_name, + run_id=context.run_id + ) + + return result + + except Exception as error: + error_message = str(error) + logger.error(f"Asset '{asset_name}' failed with error: {error_message}") + + if logic_app_url: + send_alert( + logic_app_url=logic_app_url, + trigger_type="error", + message=f"Asset '{asset_name}' failed: {error_message}", + pipeline_name=asset_name, + run_id=context.run_id, + error_details=error_message + ) + + raise error + + return wrapper + return decorator + + +# Utility functions for testing and configuration +def test_pipeline_alerts(pipeline_name: str, alert_config: Optional[Dict[str, Any]] = None) -> bool: + """ + Test alert functionality for a specific pipeline + + Args: + pipeline_name: Name of the pipeline to test + alert_config: Optional custom configuration + + Returns: + bool: True if test alert sent successfully + """ + logger = get_dagster_logger() + + # Create a test context + class TestContext: + def __init__(self): + self.run_id = f"test_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + self.asset_key = type('obj', (object,), {'path': ['test_asset']})() + + context = TestContext() + + logger.info(f"🧪 Testing alerts for pipeline: {pipeline_name}") + + # Test info alert + success = send_pipeline_alert( + context=context, + pipeline_name=pipeline_name, + trigger_type="info", + message=f"Test alert for {pipeline_name} pipeline - all systems operational", + alert_config=alert_config, + additional_metadata={"test": True, "timestamp": datetime.now().isoformat()} + ) + + if success: + logger.info(f"✅ Test alert sent successfully for {pipeline_name}") + else: + logger.error(f"❌ Test alert failed for {pipeline_name}") + + return success + + +def get_available_pipelines() -> list: + """ + Get list of available pipeline configurations + + Returns: + List of configured pipeline names + """ + return [name for name in DEFAULT_ALERT_CONFIGS.keys() if name != "DEFAULT"] + + +def update_pipeline_alert_config(pipeline_name: str, new_config: Dict[str, Any]) -> None: + """ + Update alert configuration for a specific pipeline + + Args: + pipeline_name: Name of the pipeline + new_config: New configuration values to merge + """ + if pipeline_name not in DEFAULT_ALERT_CONFIGS: + DEFAULT_ALERT_CONFIGS[pipeline_name] = DEFAULT_ALERT_CONFIGS["DEFAULT"].copy() + + DEFAULT_ALERT_CONFIGS[pipeline_name].update(new_config) + + logger = get_dagster_logger() + logger.info(f"📝 Updated alert configuration for pipeline: {pipeline_name}") \ No newline at end of file diff --git a/reusable_components/etl/adls_csv_to_snowflake_iceberg.py b/reusable_components/etl/adls_csv_to_snowflake_iceberg.py new file mode 100644 index 0000000..4a1285a --- /dev/null +++ b/reusable_components/etl/adls_csv_to_snowflake_iceberg.py @@ -0,0 +1,360 @@ +from typing import List, Optional, Union, Dict, Any +import re +from dagster import AssetExecutionContext, 
MaterializeResult, MetadataValue +from azure.storage.filedatalake import DataLakeServiceClient + +def load_csv_to_iceberg_with_result( + context: AssetExecutionContext, + adls_client: DataLakeServiceClient, + snowpark_session, + audit_batch_id: int, + schema_check_result: Dict[str, Any], + config: Dict[str, Any], + # NEW: Optional override parameters + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> MaterializeResult: + """ + Standardized CSV to Iceberg loading component with schema validation check. + + Args: + context: Dagster execution context + adls_client: Azure Data Lake client + snowpark_session: Snowflake Snowpark session + audit_batch_id: Batch ID for auditing + schema_check_result: Result from schema validation + config: Pipeline configuration dictionary + prefix: Override prefix criteria (optional) + suffix: Override suffix criteria (optional) + contains: Override contains criteria (optional) + not_contains: Override not_contains criteria (optional) + regex: Override regex criteria (optional) + extension: Override extension criteria (optional) + + Returns: + MaterializeResult with loading status and metadata + """ + + # Extract configuration values + pipeline_name = config["pipeline_name"] + stage_container = config["stage_container"] + load_directory = config["load_directory"] + snowflake_db = config["snowflake_db"] + snowflake_schema = config["snowflake_schema"] + snowflake_stage = config["snowflake_stage"] + program_name = config["program_name"] + subject_area = config["subject_area"] + file_criteria = config["file_criteria"] + + # Use override parameters if provided, otherwise use config values + final_prefix = prefix if prefix is not None else file_criteria["prefix"]["pattern"] + final_suffix = suffix if suffix is not None else file_criteria["suffix"]["pattern"] + final_contains = contains if contains is not None else file_criteria["contains"]["pattern"] + final_not_contains = not_contains if not_contains is not None else file_criteria["not_contains"]["pattern"] + final_regex = regex if regex is not None else file_criteria["regex"]["pattern"] + final_extension = extension if extension is not None else [".csv"] + + # Check if schema validation was successful + schema_status = schema_check_result.get("status", "unknown") + validation_passed = schema_check_result.get("validation_passed", False) + + if schema_status != "completed" or not validation_passed: + context.log.info(f"❌ Skipping {pipeline_name} CSV to Iceberg load - schema validation failed: {schema_status}") + + return MaterializeResult( + value={ + "status": "skipped", + "reason": f"Schema validation failed: {schema_status}", + "pipeline_name": pipeline_name + }, + metadata={ + "status": MetadataValue.text("⏭️ SKIPPED"), + "reason": MetadataValue.text("Schema validation failed"), + "pipeline_name": MetadataValue.text(pipeline_name), + "batch_id": MetadataValue.int(audit_batch_id) + } + ) + + context.log.info(f"🔄 {pipeline_name} loading CSV to Iceberg (after successful schema validation):") + context.log.info(f" Container: {stage_container}") + context.log.info(f" Folder: {load_directory}") + context.log.info(f" Database: {snowflake_db}") + context.log.info(f" Schema: {snowflake_schema}") + context.log.info(f" Stage: {snowflake_stage}") + + try: + # Log the criteria being 
used for CSV loading + context.log.info(f"🔍 {pipeline_name} CSV Loading Criteria (with overrides):") + context.log.info(f" - prefix: {final_prefix}") + context.log.info(f" - suffix: {final_suffix}") + context.log.info(f" - contains: {final_contains}") + context.log.info(f" - not_contains: {final_not_contains}") + context.log.info(f" - regex: {final_regex}") + context.log.info(f" - extension: {final_extension}") + + # Use the CSV loading function with final criteria + result = copy_all_csv_to_iceberg( + context=context, + adls_client=adls_client, + container=stage_container, + folder=load_directory, + prefix=final_prefix, + suffix=final_suffix, + contains=final_contains, + not_contains=final_not_contains, + regex=final_regex, + extension=final_extension, + db=snowflake_db, + schema=snowflake_schema, + stage_name=snowflake_stage, + program_name=program_name, + subject_area=subject_area, + batch_id=audit_batch_id, + ) + + # Ensure result is a list and count tables loaded + tables_loaded = result if isinstance(result, list) else [] + table_count = len(tables_loaded) + + context.log.info(f"✅ {pipeline_name} CSV to Iceberg load completed:") + context.log.info(f" Tables loaded: {table_count}") + if tables_loaded: + context.log.info(f" Loaded tables: {', '.join(tables_loaded)}") + + return MaterializeResult( + value={ + "status": "completed", + "tables_loaded": tables_loaded, + "table_count": table_count, + "pipeline_name": pipeline_name, + "batch_id": audit_batch_id, + "validated_before_load": True, + "file_criteria_used": { + "prefix": final_prefix, + "suffix": final_suffix, + "contains": final_contains, + "not_contains": final_not_contains, + "regex": final_regex, + "extension": final_extension + } + }, + metadata={ + "status": MetadataValue.text("✅ SUCCESS"), + "tables_loaded": MetadataValue.int(table_count), + "pipeline_name": MetadataValue.text(pipeline_name), + "batch_id": MetadataValue.int(audit_batch_id), + "validated_before_load": MetadataValue.bool(True), + "loaded_tables": MetadataValue.text(", ".join(tables_loaded) if tables_loaded else "None"), + "criteria_overrides": MetadataValue.text(f"prefix={final_prefix}, extension={final_extension}, not_contains={final_not_contains}") + } + ) + + except Exception as e: + context.log.error(f"❌ {pipeline_name} CSV to Iceberg load failed: {str(e)}") + + return MaterializeResult( + value={ + "status": "failed", + "error": str(e), + "pipeline_name": pipeline_name, + "table_count": 0 + }, + metadata={ + "status": MetadataValue.text("❌ FAILED"), + "error": MetadataValue.text(str(e)[:200] + "..." if len(str(e)) > 200 else str(e)), + "pipeline_name": MetadataValue.text(pipeline_name), + "batch_id": MetadataValue.int(audit_batch_id), + "tables_loaded": MetadataValue.int(0) + } + ) + + +def copy_all_csv_to_iceberg( + context: AssetExecutionContext, + adls_client: DataLakeServiceClient, + container: str, + folder: str, + db: str, + schema: str, + stage_name: str, + program_name: str, + subject_area: str, + batch_id: int, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> List[str]: + """ + Core CSV to Iceberg loading logic. 
+ + 1) List all CSV files in ADLS under container/folder that match the criteria + 2) For each file, strip off ".csv" to get table_name + 3) Issue COPY INTO db.schema.table_name FROM @stage_name/ + using MATCH_BY_COLUMN_NAME='CASE_SENSITIVE' + + Returns the list of tables that were loaded. + """ + log = context.log + + def file_matches_criteria(filename: str) -> bool: + """Check if file matches all specified criteria""" + + # Check prefix criteria + if prefix: + if isinstance(prefix, list): + if not any(filename.startswith(p) for p in prefix): + return False + else: + if not filename.startswith(prefix): + return False + + # Check suffix criteria + if suffix: + if isinstance(suffix, list): + if not any(filename.endswith(s) for s in suffix): + return False + else: + if not filename.endswith(suffix): + return False + + # Check contains criteria + if contains: + if isinstance(contains, list): + if not any(c in filename for c in contains): + return False + else: + if contains not in filename: + return False + + # Check not_contains criteria + if not_contains: + if isinstance(not_contains, list): + if any(nc in filename for nc in not_contains): + return False + else: + if not_contains in filename: + return False + + # Check regex criteria + if regex: + if not re.search(regex, filename): + return False + + # Check extension criteria + if extension: + if isinstance(extension, list): + if not any(filename.lower().endswith(ext.lower()) for ext in extension): + return False + else: + if not filename.lower().endswith(extension.lower()): + return False + + return True + + # Use the client directly + file_system = adls_client.get_file_system_client(container) + log.info(f"🔍 Listing files in {container}/{folder} with criteria") + + # Collect all matching CSV files + matching_files = [] + all_files_count = 0 + csv_files_count = 0 + + for path in file_system.get_paths(path=folder): + if path.is_directory: + continue + + filename = path.name.split('/')[-1] + all_files_count += 1 + + # Must be CSV file first + if not filename.lower().endswith('.csv'): + continue + + csv_files_count += 1 + log.info(f" 📄 Found CSV file: {filename}") + + # Apply filtering criteria + if file_matches_criteria(filename): + matching_files.append(path.name) + log.info(f" ✅ Matches criteria: {filename}") + else: + log.info(f" ❌ Doesn't match criteria: {filename}") + + log.info(f"📋 File Discovery Summary:") + log.info(f" Total files found: {all_files_count}") + log.info(f" CSV files found: {csv_files_count}") + log.info(f" CSV files matching criteria: {len(matching_files)}") + + if not matching_files: + log.warning(f"⚠️ No CSV files found matching criteria in {container}/{folder}") + log.info(f" Criteria used:") + log.info(f" - prefix: {prefix}") + log.info(f" - suffix: {suffix}") + log.info(f" - contains: {contains}") + log.info(f" - not_contains: {not_contains}") + log.info(f" - regex: {regex}") + log.info(f" - extension: {extension}") + return [] + + loaded_tables: List[str] = [] + + for full_path in matching_files: + filename = full_path.rsplit("/", 1)[-1] + + # Remove .csv extension to get table name + if filename.lower().endswith('.csv'): + table_name = filename[:-4] # Remove .csv + else: + table_name = filename + + full_table = f"{db}.{schema}.{table_name}" + stage_path = f"@{db}.{schema}.{stage_name}/{full_path}" + + log.info(f"➡️ Copying {filename} → Iceberg table {full_table}") + + try: + # Execute COPY INTO command + copy_sql = f""" + COPY INTO {full_table} + FROM {stage_path} + FILE_FORMAT = 
ANALYTYXONE_DEV.BRONZE.bronze_csv + MATCH_BY_COLUMN_NAME = 'CASE_INSENSITIVE' + FORCE = TRUE + """ + log.info(f" SQL: {copy_sql}") + context.resources.snowflake_snowpark.sql(copy_sql).collect() + + # Back-fill metadata on new rows + log.info(f"📝 Back-filling metadata on new rows in {full_table}") + metadata_sql = f""" + UPDATE {full_table} + SET + "META_PROGRAM_NAME" = '{program_name}', + "META_SUBJECT_AREA" = '{subject_area}', + "META_BATCH_ID" = {batch_id}, + "META_ROW_ID" = DEFAULT, + "META_DATE_INSERT" = CURRENT_TIMESTAMP() + WHERE "META_BATCH_ID" IS NULL + """ + context.resources.snowflake_snowpark.sql(metadata_sql).collect() + + loaded_tables.append(table_name) + log.info(f"✅ Successfully loaded table: {table_name}") + + except Exception as e: + log.error(f"❌ Failed to load {filename} to {full_table}: {str(e)}") + # Log the full error details for debugging + import traceback + log.error(f" Full error: {traceback.format_exc()}") + raise + + log.info(f"✅ Completed COPY for {len(loaded_tables)} tables: {loaded_tables}") + return loaded_tables \ No newline at end of file diff --git a/reusable_components/etl/adls_parquet_to_iceberg.py b/reusable_components/etl/adls_parquet_to_iceberg.py new file mode 100644 index 0000000..aea16c5 --- /dev/null +++ b/reusable_components/etl/adls_parquet_to_iceberg.py @@ -0,0 +1,75 @@ +from typing import List, Optional +from dagster import AssetExecutionContext +from azure.storage.filedatalake import DataLakeServiceClient + +def copy_all_parquets_to_iceberg( + context: AssetExecutionContext, + adls_client: DataLakeServiceClient, + container: str, + folder: str, + db: str, + schema: str, + stage_name: str, + program_name: str, + subject_area: str, + batch_id: int, + prefix: Optional[str] = None, + suffix: Optional[str] = None, + contains: Optional[str] = None, + not_contains: Optional[str] = None, + regex: Optional[str] = None, + extension: Optional[str] = None +) -> List[str]: + """ + 1) List all .parquet files in ADLS under container/folder whose + basename starts with prefix. + 2) For each file, strip off ".parquet" to get table_name. + 3) Issue COPY INTO db.schema.table_name FROM @stage_name/ + using MATCH_BY_COLUMN_NAME='CASE_INSENSITIVE'. + Returns the list of tables that were loaded. 
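+
+    Example (an illustrative call only; the container, stage, schema, and
+    batch values below are assumptions, not values this function requires):
+
+        loaded = copy_all_parquets_to_iceberg(
+            context=context,
+            adls_client=adls_client,
+            container="dagsterparquetdata",
+            folder="parquet/iceberg",
+            db="ANALYTYXONE_DEV",
+            schema="BRONZE",
+            stage_name="PM_SA_ICEBERG_STAGE",
+            program_name="MEDICAID",
+            subject_area="CLAIMS",
+            batch_id=42,
+            prefix="CLAIMS_",
+        )
+        # e.g. CLAIMS_FOO.parquet is copied into ANALYTYXONE_DEV.BRONZE.CLAIMS_FOO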
+ """ + log = context.log + # Use the client directly + file_system = adls_client.get_file_system_client(container) + log.info(f"🔍 Listing files in {container}/{folder} with prefix '{prefix}'") + + paths = [] + for path in file_system.get_paths(path=folder): + filename = path.name.split('/')[-1] + # Apply filtering + if prefix and not filename.startswith(prefix): + continue + if not filename.lower().endswith(".parquet"): + continue + paths.append(path.name) + + loaded_tables: List[str] = [] + for full_path in paths: + filename = full_path.rsplit("/", 1)[-1] + table_name = filename[:-8] + full_table = f"{db}.{schema}.{table_name}" + stage_path = f"@{db}.{schema}.{stage_name}/{full_path}" + log.info(f"➡️ Copying {filename} → Iceberg table {full_table}") + context.resources.snowflake_snowpark.sql(f""" + COPY INTO {full_table} + FROM {stage_path} + FILE_FORMAT = (TYPE = 'PARQUET') + MATCH_BY_COLUMN_NAME = 'CASE_INSENSITIVE' + FORCE = TRUE + """).collect() + log.info(f"📝 Back-filling metadata on new rows in {full_table}") + context.resources.snowflake_snowpark.sql(f""" + UPDATE {full_table} + SET + "META_PROGRAM_NAME" = '{program_name}', + "META_SUBJECT_AREA" = '{subject_area}', + "META_BATCH_ID" = {batch_id}, + "META_DATE_INSERT" = CURRENT_TIMESTAMP() + WHERE "META_BATCH_ID" IS NULL + """).collect() + + loaded_tables.append(table_name) + + log.info(f"✅ Completed COPY for tables: {loaded_tables}") + return loaded_tables + diff --git a/reusable_components/etl/adls_to_snowflake_csv.py b/reusable_components/etl/adls_to_snowflake_csv.py new file mode 100644 index 0000000..37c1bf1 --- /dev/null +++ b/reusable_components/etl/adls_to_snowflake_csv.py @@ -0,0 +1,266 @@ +import os +from typing import List +from dagster_azure.adls2 import ADLS2Resource +from snowflake.snowpark import Session +from dagster import AssetExecutionContext + + +def copy_adls_csv_to_snowflake( + context: AssetExecutionContext, + session: Session, + adls2: ADLS2Resource, + source_container: str, + source_folder: str, + target_db: str, + target_schema: str, + file_format_name: str, + prefix_filter: str, + stage_name: str, + truncate_before_load: bool, +) -> List[str]: + + context.log.info(f"🔍 Scanning for CSV files in {source_container}/{source_folder}") + + try: + # Get file system client + fs_client = adls2.adls2_client.get_file_system_client(source_container) + paths = fs_client.get_paths(path=source_folder) + + # Filter files based on prefix if provided + csv_files = [] + for path in paths: + if path.is_directory: + continue + + filename = os.path.basename(path.name) + if not filename.lower().endswith('.csv'): + continue + + if prefix_filter and not filename.startswith(prefix_filter): + continue + + csv_files.append(path) + + context.log.info(f"📊 Found {len(csv_files)} CSV files to process") + + if not csv_files: + context.log.warning("⚠️ No CSV files found matching criteria") + return [] + + # Set database and schema context + session.use_database(target_db) + session.use_schema(target_schema) + + loaded_tables: List[str] = [] + + for path in csv_files: + filename = os.path.basename(path.name) + table_name = os.path.splitext(filename)[0] + full_table = f"{target_db}.{target_schema}.{table_name}" + + context.log.info(f"📥 Processing file: {filename} -> {table_name}") + + try: + # Construct the stage path + stage_path = f"@{target_db}.{target_schema}.{stage_name}/{source_folder}/{filename}" + context.log.info(f"📂 Using stage path: {stage_path}") + + # Check if table exists + table_exists_sql = f""" + SELECT COUNT(*) as 
table_count + FROM INFORMATION_SCHEMA.TABLES + WHERE TABLE_SCHEMA = '{target_schema}' + AND TABLE_NAME = '{table_name}' + """ + + table_exists = session.sql(table_exists_sql).collect()[0]['TABLE_COUNT'] > 0 + + if not table_exists or not truncate_before_load: + context.log.info(f"🏗️ Creating table {table_name} using INFER_SCHEMA") + + # Use INFER_SCHEMA to create table with proper structure + create_table_sql = f""" + CREATE OR REPLACE TABLE {full_table} + USING TEMPLATE ( + SELECT ARRAY_AGG(OBJECT_CONSTRUCT(*)) + FROM TABLE( + INFER_SCHEMA( + LOCATION => '{stage_path}', + FILE_FORMAT => '{target_schema}.{file_format_name}' + ) + ) + ) + """ + + session.sql(create_table_sql).collect() + context.log.info(f"✅ Table {table_name} created with inferred schema") + + + elif truncate_before_load: + # Truncate existing table + context.log.info(f"🧹 Truncating existing table {table_name}") + session.sql(f"TRUNCATE TABLE {full_table}").collect() + + # Load data using COPY command + copy_sql = f""" + COPY INTO {full_table} + FROM {stage_path} + FILE_FORMAT = (FORMAT_NAME = '{target_schema}.{file_format_name}') + ON_ERROR = 'CONTINUE' + FORCE = TRUE + """ + + context.log.info(f"🔄 Executing COPY command for {table_name}") + copy_result = session.sql(copy_sql).collect() + + # Log copy results for debugging + if copy_result: + for result_row in copy_result: + context.log.info(f"📊 Copy result: {result_row}") + + # Get row count to verify load + count_result = session.sql(f"SELECT COUNT(*) as row_count FROM {full_table}").collect() + row_count = count_result[0]['ROW_COUNT'] + + if row_count > 0: + context.log.info(f"✅ Successfully loaded {row_count} rows into {table_name}") + loaded_tables.append(full_table) + else: + context.log.warning(f"⚠️ No rows loaded into {table_name}") + + # Additional debugging for zero rows + context.log.info(f"🔍 Debugging: Checking stage file existence") + try: + list_result = session.sql(f"LIST {stage_path}").collect() + context.log.info(f"📋 Stage file check: {list_result}") + except Exception as list_error: + context.log.warning(f"⚠️ Could not list stage file: {list_error}") + + except Exception as file_error: + context.log.error(f"❌ Failed to load {filename}: {str(file_error)}") + # Continue processing other files instead of failing the entire job + continue + + context.log.info(f"🎉 Successfully processed {len(loaded_tables)} out of {len(csv_files)} files") + return loaded_tables + + except Exception as e: + context.log.error(f"❌ Error in copy_adls_csv_to_snowflake: {str(e)}") + raise + +# import os +# from typing import List +# from dagster_azure.adls2 import ADLS2Resource +# from snowflake.snowpark import Session +# from dagster import get_dagster_logger +# logger = get_dagster_logger() + + +# def copy_adls_csv_to_snowflake( +# session: Session, +# adls2: ADLS2Resource, +# source_container: str, +# source_folder: str, +# target_db: str, +# target_schema: str, +# file_format_name: str, +# ) -> List[str]: + +# fs_client = adls2.adls2_client.get_file_system_client(source_container) +# paths = fs_client.get_paths(path=source_folder) + +# loaded_tables: List[str] = [] + +# for path in paths: +# if path.is_directory: +# continue + +# filename = os.path.basename(path.name) +# table_name = os.path.splitext(filename)[0] +# full_table = f"{target_db}.{target_schema}.{table_name}" +# format_fqn = f"{target_schema}.{file_format_name}" + +# session.use_database(target_db) +# session.use_schema(target_schema) + +# # # Create table if it doesn't exist +# # session.sql(f""" +# # CREATE TABLE 
IF NOT EXISTS {full_table} ( +# # data VARIANT +# # ) +# # """).collect() + +# # Clear existing data before loading +# session.sql(f"TRUNCATE TABLE {full_table}").collect() + +# copy_sql = f""" +# COPY INTO {full_table} +# FROM @{target_db}.{target_schema}.PM_SA_CSV_STAGE/{path.name} +# FILE_FORMAT = (FORMAT_NAME = '{format_fqn}') +# ON_ERROR = 'CONTINUE' +# """ + +# result = session.sql(copy_sql).collect() +# count_result = session.sql(f"SELECT COUNT(*) as row_count FROM {full_table}").collect() +# print(f"✅ Loaded {count_result[0]['ROW_COUNT']} rows into {table_name}") +# logger.info(f"✅ Loaded {count_result[0]['ROW_COUNT']} rows into {table_name}") + +# loaded_tables.append(full_table) + +# return loaded_tables + + +# # BEST: For Production: +# def copy_adls_csv_to_snowflake_production( +# session: Session, +# adls2: ADLS2Resource, +# source_container: str, +# source_folder: str, +# target_db: str, +# target_schema: str, +# file_format_name: str = "PM_CSV_FORMAT", +# ) -> List[str]: + +# fs_client = adls2.adls2_client.get_file_system_client(source_container) +# paths = fs_client.get_paths(path=source_folder) +# loaded_tables: List[str] = [] + +# for path in paths: +# if path.is_directory: +# continue + +# filename = os.path.basename(path.name) +# table_name = os.path.splitext(filename)[0] +# staging_table = f"{target_db}.{target_schema}.{table_name}_STAGING" +# final_table = f"{target_db}.{target_schema}.{table_name}" +# format_fqn = f"{target_schema}.{file_format_name}" + +# session.use_database(target_db) +# session.use_schema(target_schema) + +# # 1. Create staging table (always clean) +# session.sql(f"CREATE OR REPLACE TABLE {staging_table} LIKE {final_table}").collect() + +# # 2. Load into staging +# copy_sql = f""" +# COPY INTO {staging_table} +# FROM @{target_db}.{target_schema}.PM_SA_CSV_STAGE/{path.name} +# FILE_FORMAT = (FORMAT_NAME = '{format_fqn}') +# ON_ERROR = 'CONTINUE' +# FORCE = TRUE +# """ + +# result = session.sql(copy_sql).collect() + +# # 3. Swap tables atomically +# session.sql(f"ALTER TABLE {final_table} SWAP WITH {staging_table}").collect() + +# # 4. Drop staging table +# session.sql(f"DROP TABLE {staging_table}").collect() + +# count_result = session.sql(f"SELECT COUNT(*) as row_count FROM {final_table}").collect() +# print(f"✅ Loaded {count_result[0]['ROW_COUNT']} rows into {table_name}") + +# loaded_tables.append(final_table) + +# return loaded_tables \ No newline at end of file diff --git a/reusable_components/etl/adls_to_snowflake_iceberg.py b/reusable_components/etl/adls_to_snowflake_iceberg.py new file mode 100644 index 0000000..7a52d42 --- /dev/null +++ b/reusable_components/etl/adls_to_snowflake_iceberg.py @@ -0,0 +1,179 @@ +# reusable_components/etl/adls_to_snowflake_iceberg.py +import os +from typing import List +from dagster_azure.adls2 import ADLS2Resource +from snowflake.snowpark import Session + + +def create_adls_iceberg_tables( + session: Session, + adls2: ADLS2Resource, + source_container: str, + source_folder: str, + target_db: str, + target_schema: str, + stage_name: str = "PM_SA_ICEBERG_STAGE", + file_format_name: str = "PM_ICEBERG_PARQUET_FORMAT", +) -> List[str]: + """ + Create or replace Snowflake-managed Iceberg tables from Parquet files in ADLS. 
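+
+    Example (a hypothetical call; the container and folder names are
+    illustrative, and the stage/file format defaults from the signature
+    are assumed):
+
+        created = create_adls_iceberg_tables(
+            session=session,
+            adls2=adls2,
+            source_container="dagsterparquetdata",
+            source_folder="parquet/iceberg",
+            target_db="ANALYTYXONE_DEV",
+            target_schema="BRONZE",
+        )
+        # CLAIMS_FOO.parquet -> ANALYTYXONE_DEV.BRONZE.CLAIMS_FOO_ICEBERG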
+ + Args: + session: Snowpark session + adls2: ADLS2 resource + source_container: ADLS container name + source_folder: Folder path within container (e.g., "parquet/iceberg") + target_db: Target Snowflake database + target_schema: Target Snowflake schema + stage_name: Snowflake stage name + file_format_name: Snowflake file format for Parquet + + Returns: + List of created Iceberg table names + """ + + fs_client = adls2.adls2_client.get_file_system_client(source_container) + paths = fs_client.get_paths(path=source_folder) + + created_tables: List[str] = [] + + for path in paths: + if path.is_directory: + continue + + filename = os.path.basename(path.name) + # Remove .parquet extension for table name (handle double suffix) + base_name = os.path.splitext(filename)[0] + if base_name.endswith('_ICEBERG'): + table_name = base_name # Keep existing _ICEBERG suffix + else: + table_name = f"{base_name}_ICEBERG" # Add _ICEBERG suffix + full_table = f"{target_db}.{target_schema}.{table_name}" + format_fqn = f"{target_schema}.{file_format_name}" + + session.use_database(target_db) + session.use_schema(target_schema) + + print(f"📋 Creating Snowflake-managed Iceberg table for: {path.name}") + print(f" Target table: {full_table}") + + try: + # First, drop any existing table (regular or Iceberg) + try: + session.sql(f"DROP TABLE IF EXISTS {full_table}").collect() + print(f"🗑️ Dropped existing table {table_name}") + except: + pass # Table might not exist + + # Try Method 1: Simple Iceberg table with your external volume (no catalog) + try: + create_iceberg_sql = f""" + CREATE ICEBERG TABLE {full_table} + EXTERNAL_VOLUME = 'ICEBERG_EXTERNAL_VOLUME' + AS SELECT * FROM @{target_db}.{target_schema}.{stage_name}/{path.name} + (FILE_FORMAT => '{format_fqn}') + """ + + print(f"🔍 Attempting: CREATE ICEBERG TABLE (with ICEBERG_EXTERNAL_VOLUME)") + session.sql(create_iceberg_sql).collect() + print(f"✅ Created TRUE Iceberg table successfully!") + + except Exception as simple_error: + print(f"⚠️ Method 1 failed: {str(simple_error)[:150]}...") + + # Try Method 2: Use the existing external volume pattern + try: + create_iceberg_sql = f""" + CREATE ICEBERG TABLE {full_table} + EXTERNAL_VOLUME = 'EXTVOL_AMIDAADWDEVTRAINEUS2_MED02SANDBOXDATA' + AS SELECT * FROM @{target_db}.{target_schema}.{stage_name}/{path.name} + (FILE_FORMAT => '{format_fqn}') + """ + + print(f"🔍 Attempting: CREATE ICEBERG TABLE (with working external volume)") + session.sql(create_iceberg_sql).collect() + print(f"✅ Created Iceberg table with working external volume!") + + except Exception as working_vol_error: + print(f"⚠️ Method 2 failed: {str(working_vol_error)[:150]}...") + + # Try Method 3: Create empty Iceberg table first, then insert + try: + # First infer schema from Parquet + session.sql(f""" + CREATE OR REPLACE TABLE {full_table}_TEMP + USING TEMPLATE ( + SELECT ARRAY_AGG(OBJECT_CONSTRUCT(*)) + FROM TABLE( + INFER_SCHEMA( + LOCATION => '@{target_db}.{target_schema}.{stage_name}/{path.name}', + FILE_FORMAT => '{format_fqn}' + ) + ) + ) + """).collect() + + # Get column definitions + describe_result = session.sql(f"DESCRIBE TABLE {full_table}_TEMP").collect() + columns = [] + for row in describe_result: + col_name = row['name'] + col_type = row['type'] + columns.append(f"{col_name} {col_type}") + column_def = ",\n ".join(columns) + + # Create empty Iceberg table + create_iceberg_sql = f""" + CREATE ICEBERG TABLE {full_table} ( + {column_def} + ) + EXTERNAL_VOLUME = 'ICEBERG_EXTERNAL_VOLUME' + """ + + print(f"🔍 Attempting: CREATE empty ICEBERG 
TABLE then INSERT") + session.sql(create_iceberg_sql).collect() + + # Insert data + insert_sql = f""" + INSERT INTO {full_table} + SELECT * FROM @{target_db}.{target_schema}.{stage_name}/{path.name} + (FILE_FORMAT => '{format_fqn}') + """ + session.sql(insert_sql).collect() + + # Clean up temp table + session.sql(f"DROP TABLE {full_table}_TEMP").collect() + + print(f"✅ Created empty Iceberg table and inserted data!") + + except Exception as empty_error: + print(f"⚠️ Method 3 failed: {str(empty_error)[:150]}...") + + # Final fallback: Enhanced regular table + print(f"🔄 Creating enhanced table as final fallback...") + create_table_sql = f""" + CREATE TABLE {full_table} + AS SELECT + *, + METADATA$FILENAME as _source_file, + CURRENT_TIMESTAMP() as _loaded_at + FROM @{target_db}.{target_schema}.{stage_name}/{path.name} + (FILE_FORMAT => '{format_fqn}') + """ + + session.sql(create_table_sql).collect() + print(f"✅ Created enhanced regular table") + + # Verify the table was created and has data + count_result = session.sql(f"SELECT COUNT(*) as row_count FROM {full_table}").collect() + row_count = count_result[0]['ROW_COUNT'] + + print(f"✅ Table {table_name} ready with {row_count} rows") + + created_tables.append(full_table) + + except Exception as e: + print(f"❌ Error creating table from {filename}: {str(e)}") + continue + + return created_tables \ No newline at end of file diff --git a/reusable_components/etl/adls_to_snowflake_parquet.py b/reusable_components/etl/adls_to_snowflake_parquet.py new file mode 100644 index 0000000..ed261d7 --- /dev/null +++ b/reusable_components/etl/adls_to_snowflake_parquet.py @@ -0,0 +1,230 @@ +# # reusable_components/etl/adls_to_snowflake_parquet.py +import os +from typing import List +from dagster_azure.adls2 import ADLS2Resource +from snowflake.snowpark import Session +from dagster import AssetExecutionContext + +def copy_adls_parquet_to_snowflake( + context: AssetExecutionContext, + session: Session, + adls2: ADLS2Resource, + source_container: str, + source_folder: str, + target_db: str, + target_schema: str, + file_format_name: str, + prefix_filter: str, + stage_name: str, + truncate_before_load: bool, +) -> List[str]: + """ + Copy Parquet files from ADLS to Snowflake tables. + Creates new tables or optionally truncates existing ones before loading data. + + Args: + truncate_before_load: If True, truncates existing tables before loading. + If False, appends data to existing tables. 
+ """ + + context.log.info(f"🔍 Scanning for Parquet files in {source_container}/{source_folder}") + + try: + # Get file system client + fs_client = adls2.adls2_client.get_file_system_client(source_container) + paths = fs_client.get_paths(path=source_folder) + + # Filter files based on prefix and extension + parquet_files = [] + for path in paths: + if path.is_directory: + continue + + filename = os.path.basename(path.name) + + if path.name.count('/') > 2: + continue + + if not filename.lower().endswith('.parquet'): + continue + + if prefix_filter and not filename.startswith(prefix_filter): + continue + + parquet_files.append(path) + + context.log.info(f"📊 Found {len(parquet_files)} Parquet files to process") + + if not parquet_files: + context.log.warning("⚠️ No Parquet files found matching criteria") + return [] + + # Set database and schema context + session.use_database(target_db) + session.use_schema(target_schema) + + loaded_tables: List[str] = [] + + for path in parquet_files: + filename = os.path.basename(path.name) + table_name = os.path.splitext(filename)[0] + full_table = f"{target_db}.{target_schema}.{table_name}" + + context.log.info(f"📥 Processing file: {filename} -> {table_name}") + + try: + # Check if table exists + table_exists_sql = f""" + SELECT COUNT(*) as table_count + FROM INFORMATION_SCHEMA.TABLES + WHERE TABLE_SCHEMA = '{target_schema}' + AND TABLE_NAME = '{table_name}' + """ + + table_exists = session.sql(table_exists_sql).collect()[0]['TABLE_COUNT'] > 0 + + # Construct the stage path + stage_path = f"@{target_db}.{target_schema}.{stage_name}/{path.name}" + context.log.info(f"📂 Using stage path: {stage_path}") + + if table_exists: + # Table exists - handle based on truncate_before_load parameter + if truncate_before_load: + context.log.info(f"🧹 Table {table_name} exists, truncating and loading fresh data") + session.sql(f"TRUNCATE TABLE {full_table}").collect() + else: + context.log.info(f"📝 Table {table_name} exists, appending new data") + else: + # Table doesn't exist - create it + context.log.info(f"🏗️ Creating table {table_name} using INFER_SCHEMA") + + # Create table with inferred schema from Parquet file + create_table_sql = f""" + CREATE TABLE {full_table} + USING TEMPLATE ( + SELECT ARRAY_AGG(OBJECT_CONSTRUCT(*)) + FROM TABLE( + INFER_SCHEMA( + LOCATION => '{stage_path}', + FILE_FORMAT => '{target_schema}.{file_format_name}' + ) + ) + ) + """ + + session.sql(create_table_sql).collect() + context.log.info(f"✅ Table {table_name} created with inferred Parquet schema") + + # Load data using COPY command (this runs for both existing and new tables) + copy_sql = f""" + COPY INTO {full_table} + FROM {stage_path} + FILE_FORMAT = (FORMAT_NAME = '{target_schema}.{file_format_name}') + MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE + ON_ERROR = 'CONTINUE' + FORCE = TRUE + """ + + context.log.info(f"🔄 Executing COPY command for {table_name}") + copy_result = session.sql(copy_sql).collect() + + # Log copy results for debugging + if copy_result: + for result_row in copy_result: + context.log.info(f"📊 Copy result: {result_row}") + + # Get row count to verify load + count_result = session.sql(f"SELECT COUNT(*) as row_count FROM {full_table}").collect() + row_count = count_result[0]['ROW_COUNT'] + + if row_count > 0: + context.log.info(f"✅ Successfully loaded {row_count} rows into {table_name}") + loaded_tables.append(full_table) + else: + context.log.warning(f"⚠️ No rows loaded into {table_name}") + + # Additional debugging for zero rows + context.log.info(f"🔍 Debugging: Checking 
stage file existence") + try: + list_result = session.sql(f"LIST {stage_path}").collect() + context.log.info(f"📋 Stage file check: {list_result}") + except Exception as list_error: + context.log.warning(f"⚠️ Could not list stage file: {list_error}") + + except Exception as file_error: + context.log.error(f"❌ Failed to load {filename}: {str(file_error)}") + # Continue processing other files instead of failing the entire job + continue + + context.log.info(f"🎉 Successfully processed {len(loaded_tables)} out of {len(parquet_files)} Parquet files") + return loaded_tables + + except Exception as e: + context.log.error(f"❌ Error in copy_adls_parquet_to_snowflake: {str(e)}") + raise + +# from dagster import get_dagster_logger + +# def copy_adls_parquet_to_snowflake( +# session: Session, +# adls2: ADLS2Resource, +# source_container: str, +# source_folder: str, +# target_db: str, +# target_schema: str, +# file_format_name: str, +# stage_name: str = "PM_SA_PARQUET_STAGE", +# ) -> List[str]: + +# fs_client = adls2.adls2_client.get_file_system_client(source_container) +# paths = fs_client.get_paths(path=source_folder) + +# loaded_tables: List[str] = [] + +# for path in paths: +# if path.is_directory: +# continue + +# filename = os.path.basename(path.name) +# table_name = os.path.splitext(filename)[0] +# full_table = f"{target_db}.{target_schema}.{table_name}" +# format_fqn = f"{target_schema}.{file_format_name}" + +# session.use_database(target_db) +# session.use_schema(target_schema) + +# # For Parquet files, we need to create/recreate the table with proper schema +# # First, infer the schema from the Parquet file +# session.sql(f"DROP TABLE IF EXISTS {full_table}").collect() + +# # Create table with inferred schema from Parquet file +# session.sql(f""" +# CREATE TABLE {full_table} +# USING TEMPLATE ( +# SELECT ARRAY_AGG(OBJECT_CONSTRUCT(*)) +# FROM TABLE( +# INFER_SCHEMA( +# LOCATION => '@{target_db}.{target_schema}.{stage_name}/{path.name}', +# FILE_FORMAT => '{format_fqn}' +# ) +# ) +# ) +# """).collect() + +# logger.info(f"✅ Created table {table_name} with inferred schema") + +# copy_sql = f""" +# COPY INTO {full_table} +# FROM @{target_db}.{target_schema}.{stage_name}/{path.name} +# FILE_FORMAT = (FORMAT_NAME = '{format_fqn}') +# MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE +# ON_ERROR = 'CONTINUE' +# """ + +# result = session.sql(copy_sql).collect() +# count_result = session.sql(f"SELECT COUNT(*) as row_count FROM {full_table}").collect() +# logger.info(f"✅ Loaded {count_result[0]['ROW_COUNT']} rows into {table_name}") + +# loaded_tables.append(full_table) + +# return loaded_tables \ No newline at end of file diff --git a/reusable_components/etl/copy_adls_to_adls.py b/reusable_components/etl/copy_adls_to_adls.py new file mode 100644 index 0000000..80d8114 --- /dev/null +++ b/reusable_components/etl/copy_adls_to_adls.py @@ -0,0 +1,61 @@ +from typing import List +from azure.storage.blob import BlobClient +from dagster_azure.adls2 import ADLS2Resource +from dagster import AssetExecutionContext + +def copy_adls_to_adls( + context: AssetExecutionContext, + adls2_resource: ADLS2Resource, + source_container: str, + source_folder: str, + dest_container: str, + dest_folder: str, + prefix_filter: str, +) -> List[str]: + """ + 1) List all blobs under source_container/source_prefix + 2) Filter to only those whose basename starts with prefix_filter + 3) Server-side copy each blob to dest_container/dest_prefix + Returns the list of full paths that were successfully copied. 
+ """ + account = adls2_resource.storage_account + sas_token = adls2_resource.credential.token + + # 1) get the filesystem client + service_client = adls2_resource.adls2_client + fs = service_client.get_file_system_client(source_container) + + # 2) enumerate + filter + paths = fs.get_paths(path=source_folder, recursive=True) + to_copy = [ + p.name[len(source_folder)+1 :] + for p in paths + if not p.is_directory + and p.name.split("/")[-1].startswith(prefix_filter) + ] + + context.log.info(f" 🔍 Found {len(to_copy)} files matching '{prefix_filter}' under '{source_folder}': {to_copy}") + + # 3) COPY FILES + copied: List[str] = [] + for name in to_copy: + source_url = ( + f"https://{account}.blob.core.windows.net/" + f"{source_container}/{source_folder}/{name}?{sas_token}" + ) + dest_blob_name = f"{dest_folder}/{name}" + destination_blob = BlobClient( + account_url=f"https://{account}.blob.core.windows.net", + container_name=dest_container, + blob_name=dest_blob_name, + credential=sas_token, + ) + + try: + destination_blob.start_copy_from_url(source_url) + context.log.info(f"✅ Copied {name!r} → {dest_blob_name!r}") + copied.append(dest_blob_name) + except Exception as err: + context.log.error(f"❌ Failed to copy {name!r}: {err}") + + return copied \ No newline at end of file diff --git a/reusable_components/etl/copy_adls_to_adls_access_keys.py b/reusable_components/etl/copy_adls_to_adls_access_keys.py new file mode 100644 index 0000000..e4dcfc0 --- /dev/null +++ b/reusable_components/etl/copy_adls_to_adls_access_keys.py @@ -0,0 +1,72 @@ +import os +from typing import List +from azure.storage.filedatalake import DataLakeServiceClient +from dagster import AssetExecutionContext +from reusable_components.error_handling.alert import with_alerts + +@with_alerts() +def copy_adls_to_adls_access_key( + context: AssetExecutionContext, + client: DataLakeServiceClient, + source_container: str, + source_folder: str, + dest_container: str, + dest_folder: str, + prefix_filter: str, +) -> List[str]: + """ + Copies only files whose names start with `prefix_filter` under + `source_folder` in `source_container` into `dest_folder` in `dest_container`. + Returns the list of destination file paths that were written. 
+ """ + + source_path = source_folder.strip("/") + destination_path = dest_folder.strip("/") + + context.log.info( + f"🔄 Starting copy: {source_container}/{source_path} → {dest_container}/{destination_path} " + f"(filter: '{prefix_filter}')" + ) + + source_filesystem = client.get_file_system_client(source_container) + destination_filesystem = client.get_file_system_client(dest_container) + + # collect all source paths + all_paths = list(source_filesystem.get_paths(path=source_path)) + context.log.info(f"Found {len(all_paths)} items under {source_container}/{source_path}") + context.log.info(f"All Paths: {all_paths}") + + copied_paths: List[str] = [] + for path_item in all_paths: + # skip directories + if path_item.is_directory: + continue + + src_path = path_item.name + filename = os.path.basename(src_path) + + # apply prefix filter + if not filename.startswith(prefix_filter): + context.log.debug(f"Skipping '{src_path}' (does not match prefix)") + continue + + # build destination path + rel_path = src_path[len(source_path):].lstrip("/") + dst_path = f"{destination_path}/{rel_path}" + + context.log.info(f"➡️ Copying '{src_path}' → '{dst_path}'") + # download/upload + src_file = source_filesystem.get_file_client(src_path) + data = src_file.download_file().readall() + + dst_file = destination_filesystem.get_file_client(dst_path) + dst_file.create_file() + dst_file.append_data(data, offset=0, length=len(data)) + dst_file.flush_data(len(data)) + + copied_paths.append(dst_path) + + context.log.info( + f"✅ Completed copy of {len(copied_paths)} file(s) matching '{prefix_filter}'" + ) + return copied_paths \ No newline at end of file diff --git a/reusable_components/etl/copy_sftp_to_adls.py b/reusable_components/etl/copy_sftp_to_adls.py new file mode 100644 index 0000000..b2e12e5 --- /dev/null +++ b/reusable_components/etl/copy_sftp_to_adls.py @@ -0,0 +1,234 @@ +import re +from typing import List, Optional, Union +from paramiko import SFTPClient +from azure.storage.filedatalake import DataLakeServiceClient +from azure.core.exceptions import AzureError +from dagster import AssetExecutionContext, MaterializeResult, MetadataValue + + +def copy_files_sftp_to_adls( + context: AssetExecutionContext, + sftp_client: SFTPClient, + adls_client: DataLakeServiceClient, + file_monitor_result: dict, + source_path: str, + destination_container: str, + destination_path: str, + pipeline_name: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None, + file_formats: Optional[List[str]] = None, +) -> MaterializeResult: + """ + Complete SFTP to ADLS copy logic: validation + copy + MaterializeResult. + Returns MaterializeResult ready for the asset to return. 
+ """ + + # Extract monitoring results + files_complete = file_monitor_result.get("complete", False) + alert_was_sent = file_monitor_result.get("alert_sent", False) + expected_count = file_monitor_result.get("expected_total", 0) + found_count = file_monitor_result.get("found_total", 0) + + context.log.info(f"📁 {pipeline_name} File Copy Check:") + context.log.info(f" Complete: {files_complete}, Alert sent: {alert_was_sent}") + context.log.info(f" Files: {found_count}/{expected_count}") + + # Check if monitoring requirements are met + if not files_complete or not alert_was_sent or found_count != expected_count: + context.log.info("❌ Skipping copy - conditions not met") + + return MaterializeResult( + value={ + "status": "failed", + "reason": "File monitoring conditions not met", + "pipeline_name": pipeline_name, + "error": f"Copy conditions not met: complete={files_complete}, alert={alert_was_sent}, count={found_count}/{expected_count}" + }, + metadata={ + "status": MetadataValue.text("❌ FAILED"), + "reason": MetadataValue.text("File monitoring conditions not met"), + "pipeline_name": MetadataValue.text(pipeline_name) + } + ) + + # Log file criteria + context.log.info(f"🔍 {pipeline_name} Copy Criteria:") + context.log.info(f" - prefix: {prefix}") + context.log.info(f" - suffix: {suffix}") + context.log.info(f" - contains: {contains}") + context.log.info(f" - not_contains: {not_contains}") + context.log.info(f" - regex: {regex}") + context.log.info(f" - extension: {extension}") + + try: + # Perform the actual copy + copied = _copy_sftp_to_adls_internal( + context=context, + sftp_client=sftp_client, + adls_client=adls_client, + source_path=source_path, + destination_container=destination_container, + destination_path=destination_path, + prefix=prefix, + suffix=suffix, + contains=contains, + not_contains=not_contains, + regex=regex, + extension=extension, + file_formats=file_formats + ) + + context.log.info(f"✅ {pipeline_name} copied {len(copied)} files to staging") + + # Validate copy count + if len(copied) != expected_count: + context.log.warning(f"⚠️ Copy count mismatch: copied {len(copied)}, expected {expected_count}") + context.log.info("This might indicate files were already present or filtering criteria changed") + + # Return successful MaterializeResult + return MaterializeResult( + value={ + "status": "completed", + "copied_files": copied, + "copy_count": len(copied), + "expected_count": expected_count, + "pipeline_name": pipeline_name + }, + metadata={ + "status": MetadataValue.text("✅ SUCCESS"), + "files_copied": MetadataValue.int(len(copied)), + "expected_files": MetadataValue.int(expected_count), + "copy_efficiency": MetadataValue.text(f"{len(copied)}/{expected_count} files"), + "pipeline_name": MetadataValue.text(pipeline_name), + "destination": MetadataValue.text(f"{destination_container}/{destination_path}") + } + ) + + except Exception as e: + context.log.error(f"❌ {pipeline_name} copy operation failed: {str(e)}") + + # Return failed MaterializeResult + return MaterializeResult( + value={ + "status": "failed", + "error": str(e), + "pipeline_name": pipeline_name, + "copy_count": 0 + }, + metadata={ + "status": MetadataValue.text("❌ ERROR"), + "error": MetadataValue.text(str(e)[:200] + "..." 
if len(str(e)) > 200 else str(e)), + "pipeline_name": MetadataValue.text(pipeline_name), + "files_copied": MetadataValue.int(0) + } + ) + raise + + +def _copy_sftp_to_adls_internal( + context: AssetExecutionContext, + sftp_client: SFTPClient, + adls_client: DataLakeServiceClient, + source_path: str, + destination_container: str, + destination_path: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None, + file_formats: Optional[List[str]] = None, +) -> List[str]: + """ + Internal function that does the actual SFTP to ADLS copy work. + Separated from the main function to keep the MaterializeResult logic clean. + """ + dest_fs_client = adls_client.get_file_system_client(destination_container) + + # List files using the existing SFTP session + try: + files_list = sftp_client.listdir_attr(source_path) + except Exception as e: + context.log.error(f"Failed to list SFTP directory {source_path}: {e}") + raise + + copied_files: List[str] = [] + + for entry in files_list: + fname = entry.filename + full_sftp_path = f"{source_path}/{fname}" + + # Apply all filters inline + should_copy = True + + # Check prefix criteria + if prefix and should_copy: + if isinstance(prefix, list): + should_copy = any(fname.startswith(p) for p in prefix) + else: + should_copy = fname.startswith(prefix) + + # Check suffix criteria + if suffix and should_copy: + if isinstance(suffix, list): + should_copy = any(fname.endswith(s) for s in suffix) + else: + should_copy = fname.endswith(suffix) + + # Check contains criteria + if contains and should_copy: + if isinstance(contains, list): + should_copy = any(c in fname for c in contains) + else: + should_copy = contains in fname + + # Check not_contains criteria + if not_contains and should_copy: + if isinstance(not_contains, list): + should_copy = not any(nc in fname for nc in not_contains) + else: + should_copy = not_contains not in fname + + # Check regex criteria + if regex and should_copy: + should_copy = bool(re.search(regex, fname)) + + # Check extension criteria + if extension and should_copy: + if isinstance(extension, list): + should_copy = any(fname.endswith(ext) for ext in extension) + else: + should_copy = fname.endswith(extension) + + # Filter by file_formats (legacy parameter) + if file_formats and should_copy: + should_copy = any(fname.lower().endswith(ext.lower()) for ext in file_formats) + + if not should_copy: + continue + + # Prepare Destination ADLS path + adls_path = f"{destination_path}/{fname}" + dest_file_client = dest_fs_client.get_file_client(adls_path) + + if dest_file_client.exists(): + raise Exception(f"Destination file '{destination_container}/{adls_path}' already exists.") + + # Copy file + try: + with sftp_client.open(full_sftp_path, 'rb') as f: + data = f.read() + dest_file_client.upload_data(data, overwrite=True) + context.log.info(f"✅ Copied '{full_sftp_path}' -> '{destination_container}/{adls_path}'") + copied_files.append(f"{destination_container}/{adls_path}") + except Exception as e: + context.log.error(f"❌ Failed to copy {fname}: {e}") + raise + + return copied_files \ No newline at end of file diff --git a/reusable_components/etl/copy_stage_parquet_to_iceberg.py b/reusable_components/etl/copy_stage_parquet_to_iceberg.py new file mode 100644 index 0000000..5914775 --- /dev/null +++ 
b/reusable_components/etl/copy_stage_parquet_to_iceberg.py @@ -0,0 +1,47 @@ +from typing import List +from snowflake.snowpark import Session +from dagster import AssetExecutionContext + +def copy_stage_parquet_to_iceberg( + context: AssetExecutionContext, + session: Session, + stage_name: str, + target_db: str, + target_schema: str, + prefix_filter: str, +) -> List[str]: + """ + 1) LIST all files in @stage_name under prefix_filter + 2) COPY each into the matching Iceberg table .._ICEBERG + using the given file_format_name. + Returns the list of tables that were loaded. + """ + # 1) grab everything under prefix_filter + list_sql = f"LIST @{stage_name}/{prefix_filter};" + rows = session.sql(list_sql).collect() + + loaded_tables: List[str] = [] + for row in rows: + full_path = row.name # e.g. azure://…/dagsterparquetdata/parquet/iceberg/CLAIMS_FOO.parquet + context.log.info( + f"▶️ Loading @{full_path} into SNOWFLAKE ICEBERG TABLE" + ) + + # derive table from filename (strip path + .parquet, ) + filename = full_path.rsplit("/", 1)[-1] + + base = filename[:-8] # remove ".parquet" + iceberg_table = f"{target_db}.{target_schema}.{base}" + + # 2) COPY INTO that Iceberg table + session.sql(f""" + COPY INTO {iceberg_table} + FROM @{stage_name} + FILE_FORMAT = ( TYPE = 'PARQUET' ) + MATCH_BY_COLUMN_NAME = 'CASE_INSENSITIVE' + ON_ERROR = 'CONTINUE' + """).collect() + + loaded_tables.append(iceberg_table) + + return loaded_tables diff --git a/reusable_components/etl/csv_to_parquet_access_keys.py b/reusable_components/etl/csv_to_parquet_access_keys.py new file mode 100644 index 0000000..f8514b3 --- /dev/null +++ b/reusable_components/etl/csv_to_parquet_access_keys.py @@ -0,0 +1,77 @@ +import io +import os +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from typing import List, Optional +from dagster import AssetExecutionContext +from dagster_azure.adls2 import ADLS2Resource + +def convert_csv_to_parquet( + context: AssetExecutionContext, + client: ADLS2Resource, + source_container: str, + source_folder: str, + dest_container: str, + dest_folder: str, + prefix: Optional[str] = None, + suffix: Optional[str] = None, + contains: Optional[str] = None, + not_contains: Optional[str] = None, + regex: Optional[str] = None, + extension: Optional[str] = None +) -> List[str]: + """ + Lists all CSV files in `source_container/source_folder` whose basenames + start with `prefix`, converts each to Parquet, and writes it into + `dest_container/dest_folder` with the same basename but a .parquet extension. + Returns the list of full destination paths written. 
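+
+    Example (illustrative only; `client` is whatever Data Lake service client
+    the caller already holds, and the container/folder names are assumptions):
+
+        written = convert_csv_to_parquet(
+            context=context,
+            client=client,
+            source_container="stagingzone",
+            source_folder="incoming/csv",
+            dest_container="stagingzone",
+            dest_folder="incoming/parquet",
+            prefix="RECIPIENT_",
+        )
+        # RECIPIENT_202501.csv -> incoming/parquet/RECIPIENT_202501.parquet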
+ """ + log = context.log + log.info( + f"🔄 Starting CSV→Parquet conversion: " + f"src={source_container}/{source_folder}, " + f"dst={dest_container}/{dest_folder}, " + f"filter=prefix:{prefix}" + ) + + # Initialize file system clients + source_fs = client.get_file_system_client(source_container) + dest_fs = client.get_file_system_client(dest_container) + written_paths: List[str] = [] + + # Iterate over CSV files under source_folder + for path in source_fs.get_paths(path=source_folder, recursive=True): + if path.is_directory or not path.name.lower().endswith(".csv"): + continue + # Compute relative path and destination path + rel_path = path.name[len(source_folder):].lstrip("/") + base_name = os.path.splitext(os.path.basename(rel_path))[0] + # Apply file-prefix filter + if prefix and not base_name.startswith(prefix): + continue + parquet_name = f"{base_name}.parquet" + dest_path = f"{dest_folder.rstrip('/')}/{parquet_name}" if dest_folder else parquet_name + # 1) Download CSV into memory + source_client = source_fs.get_file_client(path.name) + stream = io.BytesIO() + source_client.download_file().readinto(stream) + stream.seek(0) + log.info(f"Processing file: {path.name} → {dest_path}") + # 2) Read entire CSV → pandas DataFrame + csv_df = pd.read_csv(stream) + # 3) Convert to Arrow Table + write to in-memory Parquet + table = pa.Table.from_pandas(csv_df) + pq_buffer = io.BytesIO() + pq.write_table(table, pq_buffer) + pq_buffer.seek(0) + # 4) Upload Parquet to destination container + data = pq_buffer.read() + dest_client = dest_fs.get_file_client(dest_path) + dest_client.create_file() + dest_client.append_data(data, offset=0) + dest_client.flush_data(len(data)) + log.info(f"Uploaded: {dest_path}") + written_paths.append(dest_path) + log.info(f"Conversion completed: {len(written_paths)} files written") + return written_paths diff --git a/reusable_components/etl/csv_to_parquet_adls.py b/reusable_components/etl/csv_to_parquet_adls.py new file mode 100644 index 0000000..971cc21 --- /dev/null +++ b/reusable_components/etl/csv_to_parquet_adls.py @@ -0,0 +1,82 @@ +import io +import os +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from typing import List +from dagster import AssetExecutionContext +from dagster_azure.adls2 import ADLS2Resource + +def convert_csv_to_parquet_adls( + context: AssetExecutionContext, + adls2: ADLS2Resource, + source_container: str, + dest_container: str, + source_folder: str, + dest_folder: str, + prefix_filter: str, +) -> List[str]: + """ + Lists all CSV files in `source_container/source_folder` whose basenames + start with `prefix_filter`, converts each to Parquet, and writes it into + `dest_container/dest_folder` with the same basename but a .parquet extension. + + Returns the list of full destination paths written. 
+ """ + log = context.log + log.info( + f"🔄 Starting CSV→Parquet conversion: " + f"src={source_container}/{source_folder}, " + f"dst={dest_container}/{dest_folder}, " + f"filter={prefix_filter}" + ) + + # Initialize file system clients + source_fs = adls2.adls2_client.get_file_system_client(source_container) + dest_fs = adls2.adls2_client.get_file_system_client(dest_container) + written_paths: List[str] = [] + + # Iterate over CSV files under source_folder + for path in source_fs.get_paths(path=source_folder, recursive=True): + if path.is_directory or not path.name.lower().endswith(".csv"): + continue + + # Compute relative path and destination path + rel_path = path.name[len(source_folder):].lstrip("/") + base_name = os.path.splitext(os.path.basename(rel_path))[0] + + # Apply file-prefix filter + if prefix_filter and not base_name.startswith(prefix_filter): + continue + + parquet_name = f"{base_name}.parquet" + dest_path = f"{dest_folder.rstrip('/')}/{parquet_name}" if dest_folder else parquet_name + + # 1) Download CSV into memory + source_client = source_fs.get_file_client(path.name) + stream = io.BytesIO() + source_client.download_file().readinto(stream) + stream.seek(0) + log.info(f"Processing file: {path.name} → {dest_path}") + + # 2) Read entire CSV → pandas DataFrame + csv_df = pd.read_csv(stream) + + # 3) Convert to Arrow Table + write to in-memory Parquet + table = pa.Table.from_pandas(csv_df) + pq_buffer = io.BytesIO() + pq.write_table(table, pq_buffer) + pq_buffer.seek(0) + + # 4) Upload Parquet to destination container + data = pq_buffer.read() + dest_client = dest_fs.get_file_client(dest_path) + dest_client.create_file() + dest_client.append_data(data, offset=0) + dest_client.flush_data(len(data)) + log.info(f"Uploaded: {dest_path}") + + written_paths.append(dest_path) + + log.info(f"Conversion completed: {len(written_paths)} files written") + return written_paths \ No newline at end of file diff --git a/reusable_components/etl/dq_audit.py b/reusable_components/etl/dq_audit.py new file mode 100644 index 0000000..e5c5f6c --- /dev/null +++ b/reusable_components/etl/dq_audit.py @@ -0,0 +1,125 @@ +from snowflake.snowpark import Session +from dagster import AssetExecutionContext, MaterializeResult, MetadataValue +from datetime import datetime, timezone +from typing import Optional, Union, List + +DQ_AUDIT_TABLE = "ANALYTYXONE_DEV.DATALOOM.DQ_AUDIT" +UPDATED_BY = "ETL" + +def create_dq_audit_entry( + context: AssetExecutionContext, + session: Session, + monitor_result: dict, + program_name: str, + subject_area: str, + pipeline_name: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> MaterializeResult: + """ + Complete DQ audit asset logic: validation + audit entry creation + MaterializeResult. + Returns MaterializeResult ready for the asset to return. 
+ """ + + # Extract monitoring results + files_complete = monitor_result.get("complete", False) + files_found = monitor_result.get("found_total", 0) + expected_count = monitor_result.get("expected_total", 0) + + context.log.info(f"🔍 {pipeline_name} DQ Audit Check:") + context.log.info(f" Files complete: {files_complete}") + context.log.info(f" Files found: {files_found}/{expected_count}") + + # Check if monitoring requirements are met + if not files_complete or files_found != expected_count: + context.log.info("❌ Skipping DQ Audit - file monitoring requirements not met") + + return MaterializeResult( + metadata={ + "status": MetadataValue.text("❌ FAILED"), + "reason": MetadataValue.text("File monitoring requirements not met"), + "files_found": MetadataValue.int(files_found), + "expected_files": MetadataValue.int(expected_count), + "pipeline_name": MetadataValue.text(pipeline_name) + } + ) + + # Check file criteria for conditional logic (you can add your business logic here) + context.log.info(f"🔍 {pipeline_name} File Criteria:") + context.log.info(f" - prefix: {prefix}") + context.log.info(f" - suffix: {suffix}") + context.log.info(f" - contains: {contains}") + context.log.info(f" - not_contains: {not_contains}") + context.log.info(f" - regex: {regex}") + context.log.info(f" - extension: {extension}") + + # Here you can add conditional logic based on file criteria + # Example: Different audit logic for different prefixes + if prefix: + if isinstance(prefix, list): + context.log.info(f" Processing files with prefixes: {prefix}") + else: + context.log.info(f" Processing files with prefix: {prefix}") + + try: + # Calculate next batch ID + context.log.info(f"📊 Creating DQ audit entry for {program_name}/{subject_area}") + + id_query = session.sql(f""" + SELECT MAX(TRY_TO_NUMBER(ID_BATCH)) AS max_id + FROM {DQ_AUDIT_TABLE} + WHERE PROGRAM_NAME = '{program_name}' + AND SUBJECT_AREA = '{subject_area}' + AND CODE_LOAD_STATUS IN ('COMPLETED','FAILED','IN_PROGRESS') + """) + + row = id_query.collect()[0] + max_id = int(row["MAX_ID"]) if row["MAX_ID"] is not None else 0 + id_batch = max_id + 1 if max_id >= 1 else 1 + + # Generate timestamps + current_time = datetime.now(timezone.utc) + code_load_status = "IN_PROGRESS" + + # Insert audit record + session.sql(f""" + INSERT INTO {DQ_AUDIT_TABLE} ( + ID_BATCH, PROGRAM_NAME, SUBJECT_AREA, CODE_LOAD_STATUS, + DATE_BATCH_LOAD, UPDATED_BY + ) VALUES ( + '{id_batch}', '{program_name}', '{subject_area}', '{code_load_status}', + '{current_time.isoformat()}', '{UPDATED_BY}' + ) + """).collect() + + context.log.info(f"✅ {pipeline_name} DQ audit entry created (ID: {id_batch})") + + # Return successful MaterializeResult + return MaterializeResult( + value=id_batch, + metadata={ + "status": MetadataValue.text("✅ SUCCESS"), + "batch_id": MetadataValue.int(id_batch), + "pipeline_name": MetadataValue.text(pipeline_name), + "files_processed": MetadataValue.int(files_found), + "file_criteria_used": MetadataValue.text(f"prefix={prefix}, suffix={suffix}, contains={contains}") + } + ) + + except Exception as e: + context.log.error(f"❌ {pipeline_name} DQ audit entry failed: {str(e)}") + + # Return failed MaterializeResult + return MaterializeResult( + metadata={ + "status": MetadataValue.text("❌ ERROR"), + "error": MetadataValue.text(str(e)[:200] + "..." 
if len(str(e)) > 200 else str(e)), + "pipeline_name": MetadataValue.text(pipeline_name), + "files_found": MetadataValue.int(files_found) + } + ) + raise \ No newline at end of file diff --git a/reusable_components/etl/validate_and_copy.py b/reusable_components/etl/validate_and_copy.py new file mode 100644 index 0000000..0abb607 --- /dev/null +++ b/reusable_components/etl/validate_and_copy.py @@ -0,0 +1,511 @@ +# import os +# import io +# import csv +# from typing import Dict, List + +# from dagster import OpExecutionContext +# from dagster_azure.adls2 import ADLS2Resource +# from snowflake.snowpark import Session +# from dagster import get_dagster_logger + +# logger = get_dagster_logger() + +# def _read_table_metadata( +# adls2: ADLS2Resource, +# metadata_container: str, +# metadata_path: str, +# ) -> Dict[str, List[str]]: +# fs = adls2.adls2_client.get_file_system_client(metadata_container) +# file_client = fs.get_file_client(metadata_path) +# download = file_client.download_file().readall() +# text = download.decode("utf-8").splitlines() +# reader = csv.DictReader(text) +# meta: Dict[str, List[str]] = {} +# for row in reader: +# cols = [c.strip() for c in row["COLUMNS"].split(",") if c.strip()] +# meta[row["FILE_NAME"].strip()] = cols +# return meta + +# def _read_header( +# adls2: ADLS2Resource, +# source_container: str, +# source_folder: str, +# filename: str, +# ) -> List[str]: +# fs = adls2.adls2_client.get_file_system_client(source_container) +# file_client = fs.get_file_client(f"{source_folder}/{filename}") +# # read first 4KB (should include header) +# raw = file_client.download_file(offset=0, length=4096).readall() +# header = raw.decode("utf-8").splitlines()[0] +# return [c.strip() for c in header.split(",")] + +# def validate_and_copy( +# context: OpExecutionContext, +# session: Session, +# adls2: ADLS2Resource, +# *, +# metadata_container: str, +# metadata_path: str, +# source_container: str, +# source_folder: str, +# prefix: str, +# target_db: str, +# target_schema: str, +# file_format_name: str, +# ) -> Dict[str, List[str]]: +# passed: List[str] = [] +# failed: List[str] = [] + +# # 1) read metadata CSV +# metadata = _read_table_metadata(adls2, metadata_container, metadata_path) +# context.log.info(f"Loaded metadata for {len(metadata)} files") + +# # 2) list and filter source files +# fs = adls2.adls2_client.get_file_system_client(source_container) +# paths = fs.get_paths(path=source_folder) +# candidates = [ +# os.path.basename(p.name) +# for p in paths +# if not p.is_directory and os.path.basename(p.name).startswith(prefix) +# ] +# context.log.info(f"Found {len(candidates)} files starting with {prefix}") + +# # 3) prep snowflake +# session.use_database(target_db) +# session.use_schema(target_schema) +# stage_fqn = f"{target_db}.{target_schema}.PM_SA_CSV_STAGE" +# fmt_fqn = f"{target_schema}.{file_format_name}" + +# # 4) validate & load +# for fname in candidates: +# expected = metadata.get(fname) +# if not expected: +# context.log.error(f"No metadata entry for {fname}; skipping") +# failed.append(fname) +# continue + +# actual = _read_header(adls2, source_container, source_folder, fname) +# if actual != expected: +# context.log.error( +# f"Column mismatch for {fname}\n" +# f" expected: {expected}\n" +# f" actual: {actual}" +# ) +# failed.append(fname) +# continue + +# # ensure table exists +# table = os.path.splitext(fname)[0] +# full_table = f"{target_db}.{target_schema}.{table}" +# session.sql(f""" +# CREATE TABLE IF NOT EXISTS {full_table} ( +# data VARIANT +# ) +# 
""").collect() + +# # truncate and copy +# session.sql(f"TRUNCATE TABLE {full_table}").collect() +# copy_sql = ( +# f"COPY INTO {full_table} " +# f"FROM @{stage_fqn}/{source_folder}/{fname} " +# f"FILE_FORMAT=(FORMAT_NAME='{fmt_fqn}') " +# "ON_ERROR='CONTINUE'" +# ) +# session.sql(copy_sql).collect() +# context.log.info(f"✅ Loaded {fname} into {full_table}") +# passed.append(fname) + +# return {"passed_files": passed, "failed_files": failed} + +import os +import csv +from typing import Dict, List +from dagster import AssetExecutionContext, get_dagster_logger +from dagster_azure.adls2 import ADLS2Resource +from snowflake.snowpark import Session + +logger = get_dagster_logger() + +def load_metadata_from_csv( + adls2: ADLS2Resource, + metadata_container: str, + metadata_path: str, +) -> Dict[str, List[str]]: + """ + Load TABLE_METADATA.csv and return mapping of file names to expected columns. + + Args: + adls2: ADLS2 resource + metadata_container: Container containing metadata file + metadata_path: Path to TABLE_METADATA.csv file + + Returns: + Dictionary mapping file names to list of expected column names + """ + try: + # Read the metadata file from ADLS + fs = adls2.adls2_client.get_file_system_client(metadata_container) + file_client = fs.get_file_client(metadata_path) + download = file_client.download_file().readall() + text = download.decode("utf-8").strip() + + # Debug logging + logger.info(f"📋 Raw metadata file content (first 300 chars): {text[:300]}") + + lines = text.splitlines() + if not lines: + raise ValueError("Metadata file is empty") + + # Parse CSV content + reader = csv.DictReader(lines) + metadata_mapping: Dict[str, List[str]] = {} + + # Debug: Show available columns + logger.info(f"📋 Available columns in metadata file: {reader.fieldnames}") + + if not reader.fieldnames: + raise ValueError("No columns found in metadata file") + + # Find the correct column names (handle variations) + file_name_column = None + columns_column = None + + for field in reader.fieldnames: + field_upper = field.strip().upper() + if field_upper in ['FILE_NAME', 'FILENAME', 'FILE']: + file_name_column = field + elif field_upper in ['COLUMNS', 'COLUMN', 'COLS']: + columns_column = field + + if not file_name_column: + raise ValueError(f"File name column not found. Available columns: {reader.fieldnames}") + if not columns_column: + raise ValueError(f"Columns column not found. 
Available columns: {reader.fieldnames}") + + logger.info(f"📋 Using columns: file_name='{file_name_column}', columns='{columns_column}'") + + # Read metadata rows + for row_num, row in enumerate(reader, start=2): + try: + file_name = row[file_name_column].strip() + columns_str = row[columns_column].strip() + + if not file_name: + logger.warning(f"⚠️ Empty file name in row {row_num}, skipping") + continue + + if not columns_str: + logger.warning(f"⚠️ Empty columns for file {file_name} in row {row_num}, skipping") + continue + + # Parse and clean column names + columns = [col.strip() for col in columns_str.split(",") if col.strip()] + metadata_mapping[file_name] = columns + + logger.info(f"📋 Loaded metadata for {file_name}: {len(columns)} columns") + + except Exception as e: + logger.error(f"❌ Error processing row {row_num}: {str(e)}") + continue + + logger.info(f"📋 Successfully loaded metadata for {len(metadata_mapping)} files") + return metadata_mapping + + except Exception as e: + logger.error(f"❌ Failed to load metadata from {metadata_container}/{metadata_path}: {str(e)}") + raise + +def extract_csv_column_headers( + adls2: ADLS2Resource, + source_container: str, + source_folder: str, + filename: str, +) -> List[str]: + """ + Extract column headers from a CSV file in ADLS. + + Args: + adls2: ADLS2 resource + source_container: Container containing the CSV file + source_folder: Folder containing the CSV file + filename: Name of the CSV file + + Returns: + List of column names from the header row + """ + try: + fs = adls2.adls2_client.get_file_system_client(source_container) + file_path = f"{source_folder}/{filename}" + file_client = fs.get_file_client(file_path) + + # Read enough data to get the header line + raw_data = file_client.download_file(offset=0, length=8192).readall() + text = raw_data.decode("utf-8") + + # Get first line (header) + lines = text.splitlines() + if not lines: + raise ValueError(f"File {filename} appears to be empty") + + header_line = lines[0].strip() + + # Parse header and clean column names + columns = [] + for col in header_line.split(","): + # Remove quotes and whitespace + cleaned_col = col.strip().strip('"').strip("'").strip() + if cleaned_col: # Only add non-empty columns + columns.append(cleaned_col) + + logger.info(f"📄 Extracted {len(columns)} columns from {filename}") + return columns + + except Exception as e: + logger.error(f"❌ Failed to extract headers from {source_container}/{source_folder}/{filename}: {str(e)}") + raise + +def create_snowflake_table_if_not_exists( + session: Session, + table_name: str, + target_db: str, + target_schema: str, + columns: List[str], +) -> bool: + """ + Create a Snowflake table with the specified column schema if it doesn't already exist. 
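+
+    Example (illustrative; the table and column names are assumptions):
+
+        created = create_snowflake_table_if_not_exists(
+            session=session,
+            table_name="CLAIMS_FOO",
+            target_db="ANALYTYXONE_DEV",
+            target_schema="BRONZE",
+            columns=["CLAIM_ID", "MEMBER_ID", "PAID_AMOUNT"],
+        )
+        # Returns True only when the table was newly created; each column is
+        # typed VARCHAR(16777216), matching the CREATE TABLE statement below.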
+ + Args: + session: Snowflake session + table_name: Name of the table to create + target_db: Target database + target_schema: Target schema + columns: List of column names for the table + + Returns: + True if table was created, False if it already existed + """ + try: + full_table_name = f"{target_db}.{target_schema}.{table_name}" + + # Check if table already exists + check_table_sql = f""" + SELECT COUNT(*) as table_count + FROM INFORMATION_SCHEMA.TABLES + WHERE TABLE_SCHEMA = '{target_schema.upper()}' + AND TABLE_NAME = '{table_name.upper()}' + AND TABLE_CATALOG = '{target_db.upper()}' + """ + + result = session.sql(check_table_sql).collect() + table_exists = result[0]['TABLE_COUNT'] > 0 + + if table_exists: + logger.info(f"📊 Table {full_table_name} already exists, skipping creation") + return False + + # Create table if it doesn't exist + columns_definitions = [] + for col in columns: + # Ensure column name is properly quoted + column_def = f'"{col.upper()}" VARCHAR(16777216)' + columns_definitions.append(column_def) + + columns_sql = ", ".join(columns_definitions) + + create_table_sql = f""" + CREATE TABLE {full_table_name} ( + {columns_sql} + ) + """ + + session.sql(create_table_sql).collect() + logger.info(f"📊 Created new table {full_table_name} with {len(columns)} columns") + return True + + except Exception as e: + logger.error(f"❌ Failed to create table {table_name}: {str(e)}") + raise + +def get_files_with_prefix( + adls2: ADLS2Resource, + container: str, + folder: str, + prefix: str, +) -> List[str]: + """ + Get list of files in ADLS folder that start with the given prefix. + + Args: + adls2: ADLS2 resource + container: Container name + folder: Folder path + prefix: File name prefix to filter by + + Returns: + List of file names matching the prefix + """ + try: + fs = adls2.adls2_client.get_file_system_client(container) + paths = fs.get_paths(path=folder) + + matching_files = [] + for path in paths: + if not path.is_directory: + filename = os.path.basename(path.name) + if filename.startswith(prefix): + matching_files.append(filename) + + logger.info(f"📁 Found {len(matching_files)} files with prefix '{prefix}' in {container}/{folder}") + return matching_files + + except Exception as e: + logger.error(f"❌ Failed to list files in {container}/{folder}: {str(e)}") + raise + +def validate_csv_files_and_load_to_snowflake( + context: AssetExecutionContext, + session: Session, + adls2: ADLS2Resource, + *, + metadata_container: str, + metadata_path: str, + source_container: str, + source_folder: str, + prefix_filter: str, + target_db: str, + target_schema: str, + file_format_name: str, + stage_name: str, +) -> Dict[str, List[str]]: + """ + Validate CSV files against metadata schema and load valid files to Snowflake. 
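+
+    Files whose header row does not match the expected columns from
+    TABLE_METADATA.csv are reported in failed_files; matching files are loaded
+    into a table named after the file (created if it does not exist).
+    Illustrative call (all names below are placeholders):
+
+        validate_csv_files_and_load_to_snowflake(
+            context=context,
+            session=session,
+            adls2=context.resources.adls2,
+            metadata_container="metadata",
+            metadata_path="TABLE_METADATA.csv",
+            source_container="landing",
+            source_folder="incoming",
+            prefix_filter="CLAIMS_",
+            target_db="RAW_DB",
+            target_schema="MEDICAID",
+            file_format_name="CSV_FORMAT",
+            stage_name="ADLS_STAGE",
+        )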
+ + Args: + context: Dagster execution context + session: Snowflake session + adls2: ADLS2 resource + metadata_container: Container containing TABLE_METADATA.csv + metadata_path: Path to TABLE_METADATA.csv + source_container: Container containing source CSV files + source_folder: Folder containing source CSV files + prefix_filter: Prefix filter for file names (e.g., "CLAIMS_") + target_db: Target Snowflake database + target_schema: Target Snowflake schema + file_format_name: Snowflake file format name + stage_name: Snowflake stage name + + Returns: + Dictionary containing lists of passed and failed files + """ + validated_files: List[str] = [] + failed_files: List[str] = [] + + try: + # Step 1: Load metadata from CSV + context.log.info(f"📋 Loading metadata from {metadata_container}/{metadata_path}") + file_metadata = load_metadata_from_csv(adls2, metadata_container, metadata_path) + context.log.info(f"📋 Loaded metadata for {len(file_metadata)} files") + + # Step 2: Get list of files to process + context.log.info(f"📁 Getting files from {source_container}/{source_folder} with prefix '{prefix_filter}'") + candidate_files = get_files_with_prefix(adls2, source_container, source_folder, prefix_filter) + context.log.info(f"📁 Found {len(candidate_files)} files to process: {candidate_files}") + + # Step 3: Setup Snowflake environment + context.log.info(f"❄️ Setting up Snowflake environment: {target_db}.{target_schema}") + session.use_database(target_db) + session.use_schema(target_schema) + + # Construct fully qualified names + stage_full_name = f"{target_db}.{target_schema}.{stage_name}" + format_full_name = f"{target_schema}.{file_format_name}" + + # Step 4: Process each file + for filename in candidate_files: + context.log.info(f"🔍 Processing file: {filename}") + + # Check if metadata exists for this file + if filename not in file_metadata: + context.log.error(f"❌ No metadata found for file: {filename}") + failed_files.append(filename) + continue + + expected_columns = file_metadata[filename] + context.log.info(f"📋 Expected columns for {filename}: {expected_columns}") + + # Extract actual columns from the file + try: + actual_columns = extract_csv_column_headers(adls2, source_container, source_folder, filename) + context.log.info(f"📄 Actual columns in {filename}: {actual_columns}") + except Exception as e: + context.log.error(f"❌ Failed to read columns from {filename}: {str(e)}") + failed_files.append(filename) + continue + + # Validate column match + if actual_columns != expected_columns: + context.log.error( + f"❌ Column validation failed for {filename}\n" + f" Expected ({len(expected_columns)}): {expected_columns}\n" + f" Actual ({len(actual_columns)}): {actual_columns}" + ) + failed_files.append(filename) + continue + + # Create Snowflake table and load data + try: + table_name = os.path.splitext(filename)[0] + + # Create table only if it doesn't exist + table_created = create_snowflake_table_if_not_exists(session, table_name, target_db, target_schema, expected_columns) + + # Load data using Snowflake COPY command + full_table_name = f"{target_db}.{target_schema}.{table_name}" + + # Option 1: Truncate table before loading (replace data) + session.sql(f"TRUNCATE TABLE {full_table_name}").collect() + context.log.info(f"🗑️ Truncated existing data in {full_table_name}") + + # Option 2: Append data to existing table (comment out truncate above) + copy_command = f""" + COPY INTO {full_table_name} + FROM @{stage_full_name}/{source_folder}/{filename} + FILE_FORMAT=(FORMAT_NAME='{format_full_name}') 
+ ON_ERROR='CONTINUE' + """ + + copy_result = session.sql(copy_command).collect() + + if table_created: + context.log.info(f"✅ Created table and loaded {filename} into {full_table_name}") + else: + context.log.info(f"✅ Loaded {filename} into existing table {full_table_name}") + + validated_files.append(filename) + + except Exception as e: + context.log.error(f"❌ Failed to load {filename} into Snowflake: {str(e)}") + failed_files.append(filename) + continue + + # Step 5: Final summary + context.log.info( + f"🎯 Processing complete: " + f"{len(validated_files)} files validated and loaded, " + f"{len(failed_files)} files failed" + ) + + if validated_files: + context.log.info(f"✅ Successfully processed: {validated_files}") + + if failed_files: + context.log.warning(f"⚠️ Failed to process: {failed_files}") + + return { + "passed_files": validated_files, + "failed_files": failed_files + } + + except Exception as e: + context.log.error(f"❌ Critical error in validation and loading process: {str(e)}") + raise \ No newline at end of file diff --git a/reusable_components/file_processing/archive_files.py b/reusable_components/file_processing/archive_files.py new file mode 100644 index 0000000..ecd7671 --- /dev/null +++ b/reusable_components/file_processing/archive_files.py @@ -0,0 +1,276 @@ +import os +from typing import List, Optional, Union +from dagster import AssetExecutionContext, MaterializeResult, MetadataValue +from azure.storage.filedatalake import DataLakeServiceClient + + +def archive_files_with_result( + context: AssetExecutionContext, + adls_client: DataLakeServiceClient, + copy_result: dict, + stage_container: str, + stage_directory: str, + archive_directory: str, + pipeline_name: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> MaterializeResult: + """ + Complete archive logic: validation + file archiving + MaterializeResult. + Returns MaterializeResult ready for the asset to return. 
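+
+    Illustrative call (container and directory names are placeholders):
+
+        archive_files_with_result(
+            context=context,
+            adls_client=adls_client,
+            copy_result={"status": "completed"},
+            stage_container="stage",
+            stage_directory="medicaid/stage",
+            archive_directory="medicaid/archive",
+            pipeline_name="MEDICAID_RECIPIENT",
+            prefix="RECIPIENT_",
+            extension=".csv",
+        )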
+ """ + + copy_status = copy_result.get("status", "unknown") + + if copy_status != "completed": + context.log.info(f"❌ Skipping {pipeline_name} archive - copy status: {copy_status}") + + return MaterializeResult( + value={"status": "skipped", "files_archived": 0, "pipeline_name": pipeline_name}, + metadata={ + "status": MetadataValue.text("⏭️ SKIPPED"), + "reason": MetadataValue.text(f"Copy status: {copy_status}"), + "pipeline_name": MetadataValue.text(pipeline_name) + } + ) + + try: + # Perform the actual archiving + result = _archive_files_internal( + context=context, + adls_client=adls_client, + stage_container=stage_container, + stage_directory=stage_directory, + archive_directory=archive_directory, + pipeline_name=pipeline_name, + prefix=prefix, + suffix=suffix, + contains=contains, + not_contains=not_contains, + regex=regex, + extension=extension + ) + + archived_count = result.get("files_archived", 0) + archived_files = result.get("archived_files", []) + + context.log.info(f"🎉 {pipeline_name} archived {archived_count} files") + + # Return successful MaterializeResult + return MaterializeResult( + value={ + "status": "completed", + "files_archived": archived_count, + "archived_files": archived_files, + "pipeline_name": pipeline_name, + "archive_criteria": { + "extension": extension, + "prefix": prefix, + "suffix": suffix, + "contains": contains, + "not_contains": not_contains + } + }, + metadata={ + "status": MetadataValue.text("✅ SUCCESS"), + "files_archived": MetadataValue.int(archived_count), + "pipeline_name": MetadataValue.text(pipeline_name), + "archive_directory": MetadataValue.text(archive_directory), + "archived_files": MetadataValue.text(", ".join(archived_files) if archived_files else "None"), + "file_criteria_used": MetadataValue.text(f"prefix={prefix}, extension={extension}") + } + ) + + except Exception as e: + context.log.error(f"❌ {pipeline_name} archive operation failed: {str(e)}") + + # Return failed MaterializeResult + return MaterializeResult( + value={ + "status": "failed", + "error": str(e), + "pipeline_name": pipeline_name, + "files_archived": 0 + }, + metadata={ + "status": MetadataValue.text("❌ ERROR"), + "error": MetadataValue.text(str(e)[:200] + "..." 
if len(str(e)) > 200 else str(e)), + "pipeline_name": MetadataValue.text(pipeline_name), + "files_archived": MetadataValue.int(0) + } + ) + raise + + +def _file_matches_archive_criteria(filename: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None) -> bool: + """Check if file matches all specified archive criteria""" + import re + + # Check extension criteria + if extension: + if isinstance(extension, list): + if not any(filename.lower().endswith(ext.lower()) for ext in extension): + return False + else: + if not filename.lower().endswith(extension.lower()): + return False + + # Check prefix criteria + if prefix: + if isinstance(prefix, list): + if not any(filename.startswith(p) for p in prefix): + return False + else: + if not filename.startswith(prefix): + return False + + # Check suffix criteria + if suffix: + if isinstance(suffix, list): + if not any(filename.endswith(s) for s in suffix): + return False + else: + if not filename.endswith(suffix): + return False + + # Check contains criteria + if contains: + if isinstance(contains, list): + if not any(c in filename for c in contains): + return False + else: + if contains not in filename: + return False + + # Check not_contains criteria + if not_contains: + if isinstance(not_contains, list): + if any(nc in filename for nc in not_contains): + return False + else: + if not_contains in filename: + return False + + # Check regex criteria + if regex: + if not re.search(regex, filename): + return False + + return True + + +def _archive_files_internal( + context: AssetExecutionContext, + adls_client: DataLakeServiceClient, + stage_container: str, + stage_directory: str, + archive_directory: str, + pipeline_name: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> dict: + """ + Internal function that does the actual file archiving work. + Separated from the main function to keep the MaterializeResult logic clean. 
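+
+    Returns a dict with the keys: files_archived, archived_files, failed_files,
+    and total_files_found.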
+ """ + + # Get ADLS client + filesystem_client = adls_client.get_file_system_client(stage_container) + + context.log.info(f"📦 {pipeline_name} Archive Operation:") + context.log.info(f" Source: {stage_container}/{stage_directory}") + context.log.info(f" Destination: {stage_container}/{archive_directory}") + context.log.info(f" Criteria: prefix={prefix}, extension={extension}") + + # Get files from stage using configuration with proper filtering + stage_files = [] + + try: + for path in filesystem_client.get_paths(path=stage_directory): + if not path.is_directory: + filename = os.path.basename(path.name) + + context.log.info(f"🔍 Checking file: {filename}") + + # Apply file matching criteria + if _file_matches_archive_criteria( + filename, + prefix=prefix, + suffix=suffix, + contains=contains, + not_contains=not_contains, + regex=regex, + extension=extension + ): + stage_files.append(path.name) + context.log.info(f" ✅ Will archive: {filename}") + else: + context.log.info(f" ❌ Skipping: {filename} (doesn't match criteria)") + + except Exception as e: + context.log.error(f"❌ Error listing files in {stage_directory}: {str(e)}") + raise + + context.log.info(f"📦 {pipeline_name} found {len(stage_files)} files to archive") + + archived_count = 0 + archived_files = [] + failed_files = [] + + # Copy each file to archive + for file_path in stage_files: + filename = os.path.basename(file_path) + archive_path = f"{archive_directory}/{filename}" + + try: + context.log.info(f"📦 Archiving: {filename}") + + # Read from stage + stage_client = filesystem_client.get_file_client(file_path) + file_data = stage_client.download_file().readall() + + # Write to archive + archive_client = filesystem_client.get_file_client(archive_path) + archive_client.create_file() + archive_client.append_data(file_data, offset=0, length=len(file_data)) + archive_client.flush_data(len(file_data)) + + archived_count += 1 + archived_files.append(filename) + context.log.info(f"✅ {pipeline_name} archived: {filename}") + + except Exception as e: + context.log.error(f"❌ Failed to archive {filename}: {str(e)}") + failed_files.append(filename) + # Continue with other files instead of failing completely + continue + + # Log summary + context.log.info(f"📦 Archive Summary:") + context.log.info(f" Files found: {len(stage_files)}") + context.log.info(f" Files archived: {archived_count}") + context.log.info(f" Files failed: {len(failed_files)}") + + if failed_files: + context.log.warning(f" Failed files: {failed_files}") + + return { + "files_archived": archived_count, + "archived_files": archived_files, + "failed_files": failed_files, + "total_files_found": len(stage_files) + } \ No newline at end of file diff --git a/reusable_components/file_processing/monitor_files.py b/reusable_components/file_processing/monitor_files.py new file mode 100644 index 0000000..a00c7eb --- /dev/null +++ b/reusable_components/file_processing/monitor_files.py @@ -0,0 +1,708 @@ +import os +import json +import re +from datetime import datetime +import paramiko +import io +from typing import Dict, List, Any, Optional, Tuple + +from dagster import ( + asset, AssetExecutionContext, sensor, DefaultSensorStatus, + AssetSelection, RunRequest, SkipReason, AssetKey, + MaterializeResult, MetadataValue +) +from reusable_components.error_handling.standardized_alerts import send_pipeline_alert, with_pipeline_alerts + + +def create_standardized_file_monitor(pipeline_config: Dict[str, Any]) -> Tuple[callable, callable]: + """ + Creates a standardized file monitor that works for any 
pipeline + + Args: + pipeline_config: Configuration dictionary containing: + - pipeline_name: Name of the pipeline (e.g., "MEDICAID_RECIPIENT") + - asset_name: Name for the monitoring asset (e.g., "recipient_files_monitor") + - sftp_source_path: SFTP directory to monitor + - file_criteria: File matching criteria dictionary + - downstream_assets: List of asset names to trigger when complete + - group_name: Dagster group name + - alert_config: Optional alert configuration overrides + + Returns: + Tuple of (monitor_asset_function, monitor_sensor_function) + """ + + # Extract configuration + pipeline_name = pipeline_config["pipeline_name"] + asset_name = pipeline_config["asset_name"] + directory_path = pipeline_config["sftp_source_path"] + file_criteria = pipeline_config["file_criteria"] + downstream_assets = pipeline_config.get("downstream_assets", []) + group_name = pipeline_config.get("group_name", f"{pipeline_name.lower()}_processing") + alert_config = pipeline_config.get("alert_config", {}) + + # Calculate total expected files + total_expected = _calculate_expected_file_count(file_criteria) + + # ========================================== + # PART 1: CREATE THE MONITORING ASSET + # ========================================== + @asset( + name=asset_name, + required_resource_keys={"adls_sftp", "adls_access_keys"}, + description=f"Monitor {directory_path} for {pipeline_name} files (expected: {total_expected})", + group_name=group_name + ) + @with_pipeline_alerts( + pipeline_name=pipeline_name, + alert_config=alert_config + ) + def monitor_asset(context: AssetExecutionContext) -> MaterializeResult: + """ + Standardized file monitoring asset that works for any pipeline + """ + + context.log.info(f"🔍 {pipeline_name} File Monitor Starting") + context.log.info(f" Directory: {directory_path}") + context.log.info(f" Expected files: {total_expected}") + + _log_file_criteria(context, file_criteria) + + # Set up memory file for this specific pipeline + memory_file = f"/tmp/{asset_name}_memory.json" + + # Connect to SFTP + sftp = context.resources.adls_sftp + + try: + # Get all files from SFTP directory + all_files = sftp.listdir(directory_path) + context.log.info(f" Total files in directory: {len(all_files)}") + + # Match files against criteria + matched_files = _match_files_against_criteria(all_files, file_criteria, context) + + # Calculate totals + total_found = _calculate_total_found(matched_files) + + # Load previous state + old_memory = _load_memory_file(memory_file) + old_file_count = old_memory.get('total_count', 0) + alert_already_sent = old_memory.get('alert_sent', False) + + # Check if file count changed (reset alert status if so) + if total_found != old_file_count: + context.log.info(f"🔄 File count changed from {old_file_count} to {total_found} - resetting alert status") + alert_already_sent = False + + # Check if we have complete set + have_complete_set = _check_complete_set(matched_files, file_criteria, context) + should_send_alert = False + monitoring_status = "WAITING" + + if have_complete_set and not alert_already_sent: + # SUCCESS! Complete set found + context.log.info("🚨 SUCCESS! Complete file set detected! 
🚨") + context.log.info(f" Found {total_found}/{total_expected} files") + monitoring_status = "COMPLETE" + + _log_matched_files(context, matched_files) + + # Send success alert + should_send_alert = _send_file_complete_alert( + context, total_found, directory_path, pipeline_name, alert_config + ) + + if should_send_alert and downstream_assets: + context.log.info(f"🚀 Ready to trigger pipeline: {downstream_assets}") + + elif have_complete_set and alert_already_sent: + # Files still complete, alert already sent + context.log.info(f"✅ Complete set maintained ({total_found} files, alert already sent)") + monitoring_status = "COMPLETE_NOTIFIED" + + else: + # Still waiting or issues + context.log.info(f"📁 Progress: {total_found}/{total_expected} files") + if total_found < total_expected: + context.log.info("⏳ Waiting for complete set...") + monitoring_status = "INCOMPLETE" + elif total_found > total_expected: + context.log.info("⚠️ Excess files detected") + monitoring_status = "EXCESS_FILES" + else: + context.log.info("❌ File criteria not fully met") + monitoring_status = "CRITERIA_NOT_MET" + + # Save current state + _save_memory_file(memory_file, matched_files, alert_already_sent or should_send_alert, total_found) + + # Prepare result + result_value = { + "pipeline_name": pipeline_name, + "directory": directory_path, + "criteria": file_criteria, + "expected_total": total_expected, + "found_total": total_found, + "matched_files": matched_files, + "complete": have_complete_set, + "alert_sent": should_send_alert or alert_already_sent, + "timestamp": datetime.now().isoformat(), + "downstream_assets": downstream_assets, + "status": monitoring_status, + "all_found_files": _get_all_found_files(matched_files) + } + + # Create comprehensive metadata + metadata = _create_monitoring_metadata( + monitoring_status, have_complete_set, should_send_alert or alert_already_sent, + total_found, total_expected, directory_path, downstream_assets, + file_criteria, matched_files, context + ) + + context.log.info(f"✅ {pipeline_name} monitoring completed: {monitoring_status}") + + return MaterializeResult(value=result_value, metadata=metadata) + + except Exception as e: + context.log.error(f"❌ {pipeline_name} file monitoring failed: {str(e)}") + + # Return error result + error_result = { + "pipeline_name": pipeline_name, + "directory": directory_path, + "status": "ERROR", + "error": str(e), + "expected_total": total_expected, + "found_total": 0, + "complete": False, + "alert_sent": False, + "timestamp": datetime.now().isoformat() + } + + return MaterializeResult( + value=error_result, + metadata={ + "monitoring_status": MetadataValue.text("❌ ERROR"), + "pipeline_name": MetadataValue.text(pipeline_name), + "error_message": MetadataValue.text(str(e)[:200] + "..." 
if len(str(e)) > 200 else str(e)), + "summary": MetadataValue.text(f"❌ {pipeline_name} monitoring failed") + } + ) + + finally: + try: + sftp.close() + except: + pass + + # ========================================== + # PART 2: CREATE THE SENSOR + # ========================================== + @sensor( + asset_selection=AssetSelection.keys(*[AssetKey(asset_name)] + [AssetKey(asset) for asset in downstream_assets]), + minimum_interval_seconds=30, + default_status=DefaultSensorStatus.RUNNING, + name=f"{asset_name}_sensor" + ) + def monitor_sensor(context): + """ + Standardized sensor that works for any pipeline + """ + + # Set up memory files + sensor_memory_file = f"/tmp/{asset_name}_sensor_simple.json" + pipeline_complete_file = f"/tmp/{asset_name}_pipeline_complete.json" + + # Check if pipeline already completed + if _pipeline_already_completed(pipeline_complete_file): + return SkipReason(f"{pipeline_name} pipeline already completed. Monitoring stopped.") + + try: + # Get current file count + current_file_count = _get_current_file_count_with_criteria(context, directory_path, file_criteria) + if current_file_count is None: + return SkipReason(f"{pipeline_name}: Could not connect to SFTP") + + # Check alert status + alert_already_sent = _check_alert_status(asset_name) + + # Load previous count + previous_count = _load_previous_count(sensor_memory_file) + + # Save current count + _save_current_count(sensor_memory_file, current_file_count) + + # Log status + _log_sensor_status(context, pipeline_name, previous_count, current_file_count, total_expected, alert_already_sent) + + # Decide what to do + has_complete_set = current_file_count == total_expected + needs_alert = has_complete_set and not alert_already_sent + + if current_file_count != previous_count: + # File count changed + return _handle_file_count_change( + context, needs_alert, current_file_count, total_expected, + pipeline_complete_file, asset_name, downstream_assets, pipeline_name + ) + + elif needs_alert: + # Count same but need alert + return _handle_missed_alert( + context, current_file_count, total_expected, + pipeline_complete_file, asset_name, downstream_assets, pipeline_name + ) + + else: + # Nothing to do + if has_complete_set and alert_already_sent: + return SkipReason(f"{pipeline_name}: Complete set present, alert sent ({current_file_count} files)") + else: + return SkipReason(f"{pipeline_name}: No change ({current_file_count} files)") + + except Exception as e: + context.log.error(f"{pipeline_name} sensor error: {str(e)}") + return RunRequest(asset_selection=[AssetKey(asset_name)]) + + return monitor_asset, monitor_sensor + + +# ========================================== +# HELPER FUNCTIONS +# ========================================== + +def _calculate_expected_file_count(file_criteria: Dict[str, Any]) -> int: + """Calculate total expected files from criteria""" + total_expected = 0 + for criteria_type, criteria_config in file_criteria.items(): + if criteria_type in ["extension", "not_contains"]: + continue # These are filters, not counters + count = criteria_config.get("count", 0) + if count > 0: + total_expected += count + return total_expected + + +def _calculate_total_found(matched_files: Dict[str, List[str]]) -> int: + """Calculate total files found (excluding filter-only criteria)""" + return sum(len(files) for criteria_type, files in matched_files.items() + if criteria_type not in ["extension", "not_contains"]) + + +def _get_all_found_files(matched_files: Dict[str, List[str]]) -> List[str]: + """Get unique list of 
all found files""" + all_files = [] + for criteria_type, files in matched_files.items(): + if files and criteria_type not in ["extension", "not_contains"]: + all_files.extend(files) + return list(set(all_files)) + + +def _create_monitoring_metadata(monitoring_status, have_complete_set, alert_sent, + total_found, total_expected, directory_path, downstream_assets, + file_criteria, matched_files, context): + """Create comprehensive metadata for monitoring result""" + + # Get file breakdown + criteria_breakdown = {} + all_found_files = _get_all_found_files(matched_files) + + for criteria_type, files in matched_files.items(): + if files: + criteria_breakdown[criteria_type] = len(files) + + return { + # Status Information + "monitoring_status": MetadataValue.text(f"🔍 {monitoring_status}"), + "files_complete": MetadataValue.bool(have_complete_set), + "alert_sent": MetadataValue.bool(alert_sent), + + # File Counts + "files_found": MetadataValue.int(total_found), + "files_expected": MetadataValue.int(total_expected), + "completion_ratio": MetadataValue.text(f"{total_found}/{total_expected} files"), + + # Directory Information + "source_directory": MetadataValue.text(directory_path), + "downstream_assets": MetadataValue.text(", ".join(downstream_assets) if downstream_assets else "None"), + + # File Criteria Summary + "monitoring_criteria": MetadataValue.text(_format_criteria_summary(file_criteria)), + + # Criteria Breakdown + **{f"criteria_{k}": MetadataValue.text(f"{v} files") + for k, v in criteria_breakdown.items()}, + + # File Lists (truncated for display) + "found_files": MetadataValue.text( + ", ".join(all_found_files[:10]) + + (f"... (+{len(all_found_files)-10} more)" if len(all_found_files) > 10 else "") + if all_found_files else "None" + ), + + # Timing + "last_check": MetadataValue.text(datetime.now().strftime("%Y-%m-%d %H:%M:%S")), + "run_id": MetadataValue.text(context.run_id), + + # Status Summary + "summary": MetadataValue.text( + f"✅ READY: All {total_found} files found" if have_complete_set and alert_sent + else f"✅ COMPLETE: Files ready" if have_complete_set + else f"⏳ WAITING: {total_found}/{total_expected} files" if total_found < total_expected + else f"⚠️ EXCESS: Too many files ({total_found})" + ) + } + + +def _send_file_complete_alert(context, total_found, directory_path, pipeline_name, alert_config): + """Send alert when files are complete""" + + success = send_pipeline_alert( + context=context, + pipeline_name=pipeline_name, + trigger_type="info", + message=f"🎉 {pipeline_name} COMPLETE! 
All {total_found} files ready in {directory_path}", + alert_config=alert_config + ) + + if success: + context.log.info("✅ File complete alert sent successfully!") + return True + else: + context.log.error("❌ Failed to send file complete alert") + return False + + +def _format_criteria_summary(file_criteria): + """Format file criteria into readable summary""" + criteria_parts = [] + for criteria_type, criteria_config in file_criteria.items(): + pattern = criteria_config.get("pattern", "") + count = criteria_config.get("count", 0) + if pattern and count > 0: + criteria_parts.append(f"{criteria_type}='{pattern}'({count})") + return ", ".join(criteria_parts) + + +def _log_file_criteria(context, file_criteria): + """Log file criteria in readable format""" + context.log.info(" File Criteria:") + for criteria_type, criteria_config in file_criteria.items(): + patterns = criteria_config.get("pattern", "") + count = criteria_config.get("count", 0) + + if isinstance(patterns, list): + pattern_str = f"{patterns}" + else: + pattern_str = f"'{patterns}'" + + if criteria_type == "not_contains": + context.log.info(f" • {criteria_type}: {pattern_str} (exclude files containing these)") + else: + context.log.info(f" • {criteria_type}: {pattern_str} (expect {count} files)") + + +def _match_files_against_criteria(all_files, file_criteria, context): + """Match files against criteria with proper filtering""" + matched_files = {} + + # Apply extension filter first + if "extension" in file_criteria: + extension_patterns = file_criteria["extension"].get("pattern", "") + if extension_patterns: + if isinstance(extension_patterns, str): + extension_patterns = [extension_patterns] + + filtered_files = [] + for f in all_files: + for ext in extension_patterns: + if f.endswith(ext): + filtered_files.append(f) + break + + all_files = filtered_files + context.log.info(f" 📁 Extension filter {extension_patterns}: {len(all_files)} files") + + # Apply not_contains filter + if "not_contains" in file_criteria: + not_contains_patterns = file_criteria["not_contains"].get("pattern", "") + if not_contains_patterns: + if isinstance(not_contains_patterns, str): + not_contains_patterns = [not_contains_patterns] + elif not_contains_patterns is None: + not_contains_patterns = [] + + filtered_files = [] + for f in all_files: + should_exclude = False + for pattern in not_contains_patterns: + if pattern in f: + should_exclude = True + context.log.info(f" 🚫 Excluding: {f} (contains '{pattern}')") + break + + if not should_exclude: + filtered_files.append(f) + + all_files = filtered_files + context.log.info(f" 📁 Not_contains filter: {len(all_files)} files remain") + + # Apply counting criteria + for criteria_type, criteria_config in file_criteria.items(): + if criteria_type in ["extension", "not_contains"]: + continue + + patterns = criteria_config.get("pattern", "") + expected_count = criteria_config.get("count", 0) + + if expected_count == 0: + continue + + if isinstance(patterns, str): + patterns = [patterns] if patterns else [] + elif patterns is None: + patterns = [] + + matches = [] + for file in all_files: + for pattern in patterns: + if _file_matches_criteria(file, criteria_type, pattern): + matches.append(file) + break + + matched_files[criteria_type] = matches + context.log.info(f" {criteria_type} {patterns}: {len(matches)}/{expected_count} files") + + # Add filter results for reference + if "extension" in file_criteria: + matched_files["extension"] = all_files + + return matched_files + + +def _file_matches_criteria(filename, 
criteria_type, pattern): + """Check if file matches specific criteria""" + if criteria_type == "prefix": + if isinstance(pattern, list): + return any(filename.startswith(p) for p in pattern) + return filename.startswith(pattern) + + elif criteria_type == "suffix": + if isinstance(pattern, list): + return any(filename.endswith(s) for s in pattern) + return filename.endswith(pattern) + + elif criteria_type == "regex": + return bool(re.search(pattern, filename)) + + elif criteria_type == "extension": + if isinstance(pattern, list): + return any(filename.endswith(e) for e in pattern) + return filename.endswith(pattern) + + elif criteria_type == "contains": + if isinstance(pattern, list): + return any(c in filename for c in pattern) + return pattern in filename + + elif criteria_type == "not_contains": + if isinstance(pattern, list): + return not any(nc in filename for nc in pattern) + return pattern not in filename + + return False + + +def _check_complete_set(matched_files, file_criteria, context): + """Check if we have complete file set""" + for criteria_type, criteria_config in file_criteria.items(): + expected_count = criteria_config.get("count", 0) + + if criteria_type in ["extension", "not_contains"]: + if criteria_type == "extension": + extension_files = matched_files.get("extension", []) + expected_extension_count = criteria_config.get("count", 0) + + if expected_extension_count > 0 and len(extension_files) < expected_extension_count: + context.log.info(f" ❌ {criteria_type}: {len(extension_files)}/{expected_extension_count}") + return False + continue + + if expected_count == 0: + continue + + actual_count = len(matched_files.get(criteria_type, [])) + + if actual_count != expected_count: + context.log.info(f" ❌ {criteria_type}: {actual_count}/{expected_count}") + return False + else: + context.log.info(f" ✅ {criteria_type}: {actual_count}/{expected_count}") + + return True + + +def _log_matched_files(context, matched_files): + """Log all matched files by criteria""" + for criteria_type, files in matched_files.items(): + if files and criteria_type not in ["extension", "not_contains"]: + context.log.info(f" {criteria_type.upper()} files:") + for file in files: + context.log.info(f" ✅ {file}") + + +# Memory and sensor helper functions (keeping original logic) +def _load_memory_file(memory_file): + """Load previous memory state""" + if os.path.exists(memory_file): + try: + with open(memory_file, 'r') as f: + return json.load(f) + except: + return {} + return {} + + +def _save_memory_file(memory_file, matched_files, alert_sent, total_count): + """Save current state to memory""" + new_memory = { + 'matched_files': matched_files, + 'alert_sent': alert_sent, + 'total_count': total_count, + 'last_check': datetime.now().isoformat() + } + with open(memory_file, 'w') as f: + json.dump(new_memory, f) + + +def _get_current_file_count_with_criteria(context, directory_path, file_criteria): + """Quick SFTP connection to get current file count""" + try: + host = os.getenv("SFTP_HOST") + user = os.getenv("SFTP_USERNAME") + pem_body = os.getenv("ADLSSFTP_PEM_CONTENT") + + if not all([host, user, pem_body]): + context.log.error("Missing SFTP connection info") + return None + + key_stream = io.StringIO(pem_body) + key = paramiko.RSAKey.from_private_key(key_stream) + key_stream.close() + + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + client.connect(hostname=host, port=22, username=user, pkey=key, + look_for_keys=False, allow_agent=False, timeout=10) + + sftp 
= client.open_sftp() + all_files = sftp.listdir(directory_path) + + # Apply same filtering logic + matched_files = _match_files_against_criteria(all_files, file_criteria, context) + total_count = _calculate_total_found(matched_files) + + sftp.close() + client.close() + + return total_count + + except Exception as e: + context.log.error(f"Error connecting to SFTP: {e}") + return None + + +def _pipeline_already_completed(pipeline_complete_file): + """Check if pipeline already completed""" + if os.path.exists(pipeline_complete_file): + try: + with open(pipeline_complete_file, 'r') as f: + complete_data = json.load(f) + return complete_data.get('completion_time') is not None + except: + pass + return False + + +def _check_alert_status(asset_name): + """Check if alert was already sent""" + asset_memory_file = f"/tmp/{asset_name}_memory.json" + if os.path.exists(asset_memory_file): + try: + with open(asset_memory_file, 'r') as f: + asset_memory = json.load(f) + return asset_memory.get('alert_sent', False) + except: + pass + return False + + +def _load_previous_count(sensor_memory_file): + """Load previous file count""" + if os.path.exists(sensor_memory_file): + try: + with open(sensor_memory_file, 'r') as f: + sensor_memory = json.load(f) + return sensor_memory.get('file_count', 0) + except: + pass + return 0 + + +def _save_current_count(sensor_memory_file, current_file_count): + """Save current file count""" + with open(sensor_memory_file, 'w') as f: + json.dump({'file_count': current_file_count, 'last_check': datetime.now().isoformat()}, f) + + +def _log_sensor_status(context, pipeline_name, previous_count, current_file_count, expected_count, alert_already_sent): + """Log sensor status""" + context.log.info(f"📊 {pipeline_name} Sensor Status:") + context.log.info(f" Previous: {previous_count}, Current: {current_file_count}, Expected: {expected_count}") + context.log.info(f" Alert sent: {alert_already_sent}, Changed: {current_file_count != previous_count}") + + +def _handle_file_count_change(context, needs_alert, current_file_count, expected_count, + pipeline_complete_file, asset_name, downstream_assets, pipeline_name): + """Handle file count changes""" + context.log.info(f"🔍 {pipeline_name} file count changed") + + if needs_alert: + context.log.info(f"🎉 {pipeline_name} COMPLETE SET! Triggering pipeline") + _mark_pipeline_complete(pipeline_complete_file, current_file_count, expected_count, pipeline_name) + return _trigger_all_assets(context, asset_name, downstream_assets, pipeline_name) + else: + context.log.info(f"{pipeline_name} progress update - running monitor") + return RunRequest(asset_selection=[AssetKey(asset_name)]) + + +def _handle_missed_alert(context, current_file_count, expected_count, + pipeline_complete_file, asset_name, downstream_assets, pipeline_name): + """Handle missed alert scenario""" + context.log.info(f"🚨 {pipeline_name} COMPLETE SET DETECTED! 
Triggering pipeline") + _mark_pipeline_complete(pipeline_complete_file, current_file_count, expected_count, pipeline_name) + return _trigger_all_assets(context, asset_name, downstream_assets, pipeline_name) + + +def _mark_pipeline_complete(pipeline_complete_file, current_file_count, expected_count, pipeline_name): + """Mark pipeline as complete""" + with open(pipeline_complete_file, 'w') as f: + json.dump({ + 'pipeline_name': pipeline_name, + 'completion_time': datetime.now().isoformat(), + 'files_processed': current_file_count, + 'expected_count': expected_count + }, f) + + +def _trigger_all_assets(context, asset_name, downstream_assets, pipeline_name): + """Trigger all pipeline assets""" + asset_selection = [AssetKey(asset_name)] + [AssetKey(asset) for asset in downstream_assets] + + context.log.info(f"🚀 {pipeline_name} Pipeline Assets:") + for key in asset_selection: + context.log.info(f" - {key.path[-1]}") + + context.log.info("🛑 Pipeline triggered - sensor will stop on next run") + return RunRequest(asset_selection=asset_selection) \ No newline at end of file diff --git a/reusable_components/file_processing/unzip_processor.py b/reusable_components/file_processing/unzip_processor.py new file mode 100644 index 0000000..dda726b --- /dev/null +++ b/reusable_components/file_processing/unzip_processor.py @@ -0,0 +1,493 @@ +import os +import io +import time +from typing import Dict, Optional, Union, List +import zipfile +import re +from dagster import AssetExecutionContext, MaterializeResult, MetadataValue +from datetime import datetime +from azure.storage.filedatalake import DataLakeServiceClient +from azure.core.exceptions import ServiceRequestError, HttpResponseError + +def unzip_files_with_result( + context: AssetExecutionContext, + adls_client: DataLakeServiceClient, + copy_result: dict, + container_name: str, + stage_directory: str, + load_directory: str, + pipeline_name: str, + prefix: Optional[Union[str, List[str]]] = None, + suffix: Optional[Union[str, List[str]]] = None, + contains: Optional[Union[str, List[str]]] = None, + not_contains: Optional[Union[str, List[str]]] = None, + regex: Optional[str] = None, + extension: Optional[Union[str, List[str]]] = None +) -> MaterializeResult: + """ + Complete unzip/copy logic: handles both ZIP files and CSV files. + - ZIP files: extracts and cleans filenames + - CSV files: copies and cleans filenames + Returns MaterializeResult ready for the asset to return. 
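+
+    Illustrative call (container and directory names are placeholders):
+
+        unzip_files_with_result(
+            context=context,
+            adls_client=adls_client,
+            copy_result={"status": "completed", "copy_count": 5},
+            container_name="stage",
+            stage_directory="medicaid/stage",
+            load_directory="medicaid/load",
+            pipeline_name="MEDICAID_RECIPIENT",
+            extension=[".zip", ".csv"],
+        )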
+ """ + + copy_status = copy_result.get("status", "unknown") + copy_count = copy_result.get("copy_count", 0) + + context.log.info(f"📂 {pipeline_name} File Processing Check:") + context.log.info(f" Copy status: {copy_status}, Files: {copy_count}") + + # Check if copy was successful + if copy_status != "completed": + context.log.info(f"❌ Skipping {pipeline_name} file processing - copy not completed") + + return MaterializeResult( + value={"status": "skipped", "reason": "Copy operation failed", "pipeline_name": pipeline_name}, + metadata={ + "status": MetadataValue.text("⏭️ SKIPPED"), + "reason": MetadataValue.text("Copy operation failed"), + "pipeline_name": MetadataValue.text(pipeline_name) + } + ) + + context.log.info(f"✅ {pipeline_name} starting file processing: {copy_count} files to process") + + # Use retry logic for the entire operation + max_retries = 3 + retry_count = 0 + + while retry_count < max_retries: + try: + result = _process_files_internal( + context=context, + adls_client=adls_client, + container_name=container_name, + stage_directory=stage_directory, + load_directory=load_directory, + prefix=prefix, + suffix=suffix, + contains=contains, + not_contains=not_contains, + regex=regex, + extension=extension + ) + + # Success - extract counts and create successful result + files_processed = result.get("files_processed", 0) + csv_files_copied = result.get("csv_files_copied", 0) + zip_files_extracted = result.get("zip_files_extracted", 0) + + context.log.info(f"✅ {pipeline_name} file processing complete:") + context.log.info(f" Total files processed: {files_processed}") + context.log.info(f" CSV files copied: {csv_files_copied}") + context.log.info(f" ZIP files extracted: {zip_files_extracted}") + + return MaterializeResult( + value={**result, "pipeline_name": pipeline_name}, + metadata={ + "status": MetadataValue.text("✅ SUCCESS"), + "files_processed": MetadataValue.int(files_processed), + "csv_files_copied": MetadataValue.int(csv_files_copied), + "zip_files_extracted": MetadataValue.int(zip_files_extracted), + "pipeline_name": MetadataValue.text(pipeline_name), + "retry_attempts": MetadataValue.int(retry_count) + } + ) + + except Exception as e: + retry_count += 1 + error_msg = str(e) + + # Check for retryable network errors + if ("EOF occurred in violation of protocol" in error_msg or + "ssl.c:" in error_msg or + "Connection" in error_msg or + "timeout" in error_msg.lower()): + + if retry_count < max_retries: + context.log.warning(f"⚠️ {pipeline_name} network error on attempt {retry_count}/{max_retries}, retrying...") + time.sleep(5 * retry_count) # Exponential backoff + continue + else: + context.log.error(f"❌ {pipeline_name} network error persisted after {max_retries} attempts") + else: + # Non-retryable error + context.log.error(f"❌ {pipeline_name} non-retryable error: {error_msg}") + break + + # If we reach here, either max retries exceeded or non-retryable error + context.log.error(f"❌ {pipeline_name} file processing failed: {str(e)}") + + return MaterializeResult( + value={"status": "failed", "error": str(e), "pipeline_name": pipeline_name}, + metadata={ + "status": MetadataValue.text("❌ FAILED"), + "error_message": MetadataValue.text(str(e)[:200] + "..." 
if len(str(e)) > 200 else str(e)), + "pipeline_name": MetadataValue.text(pipeline_name), + "retry_attempts": MetadataValue.int(retry_count) + } + ) + +def _remove_timestamp_from_filename(filename): + """Remove only timestamp/date numbers from filenames, preserving meaningful identifiers like TB, TBL""" + name, ext = os.path.splitext(filename) + + # Specific approach: find TB/TBL and preserve everything before the timestamp part + # Pattern: look for _TB or _TBL followed by timestamp patterns + + # First, try to find TB/TBL patterns and preserve up to that point + tb_patterns = [ + (r'(.+_TB)_\d{4}_\d{2}_\d{2}T\d{6}Z?.*', r'\1'), # Keep everything up to _TB + (r'(.+_TBL)_\d{4}_\d{2}_\d{2}T\d{6}Z?.*', r'\1'), # Keep everything up to _TBL + (r'(.+_TB)_\d{4}_\d{2}_\d{2}.*', r'\1'), # Keep everything up to _TB (date format) + (r'(.+_TBL)_\d{4}_\d{2}_\d{2}.*', r'\1'), # Keep everything up to _TBL (date format) + (r'(.+_TB)_\d{8}T\d{6}Z?.*', r'\1'), # Keep everything up to _TB (compact) + (r'(.+_TBL)_\d{8}T\d{6}Z?.*', r'\1'), # Keep everything up to _TBL (compact) + ] + + clean_name = name + + # Try TB/TBL specific patterns first + for pattern, replacement in tb_patterns: + if re.match(pattern, clean_name): + clean_name = re.sub(pattern, replacement, clean_name) + return f"{clean_name}{ext}" + + # If no TB/TBL patterns matched, use general timestamp removal patterns + general_patterns = [ + r'_\d{4}_\d{2}_\d{2}T\d{6}Z?.*', # _2025_07_18T094424Z... + r'_\d{4}_\d{2}_\d{2}_\d{4,6}.*', # _2025_07_08_143000... + r'_\d{4}-\d{2}-\d{2}T\d{6}Z?.*', # _2024-07-08T143000Z... + r'_\d{4}-\d{2}-\d{2}_\d{4,6}.*', # _2024-07-08_1430... + r'_\d{8}T\d{6}Z?.*', # _20240708T143000Z... + r'_\d{8}_\d{4,6}.*', # _20240708_143000... + r'_\d{14}.*', # _20240708143000... + r'_\d{12}.*', # _202407081430... + r'_\d{10}.*', # _1234567890... + r'T\d{6}Z?.*', # T094424Z... + r'_\d{4}_\d{2}_\d{2}.*', # _2025_07_18... + r'_\d{8}.*', # _20250718... 
+ ] + + for pattern in general_patterns: + clean_name = re.sub(pattern, '', clean_name) + + # Clean up any trailing underscores or dashes + clean_name = clean_name.rstrip('_-') + + # Handle edge case where name becomes empty + if not clean_name: + clean_name = 'file' + + return f"{clean_name}{ext}" + +def _file_matches_criteria(filename: str, + prefix: Union[str, List[str], tuple] = None, + suffix: Union[str, List[str], tuple] = None, + contains: Union[str, List[str], tuple] = None, + not_contains: Union[str, List[str], tuple] = None, + regex: str = None, + extension: Union[str, List[str], tuple] = None) -> bool: + """Check if a file matches all the specified criteria with proper list/tuple handling.""" + + # Check prefix - handle both string and list/tuple + if prefix is not None: + if isinstance(prefix, (list, tuple)): + if not any(filename.startswith(p) for p in prefix): + return False + else: + if not filename.startswith(prefix): + return False + + # Check suffix - handle both string and list/tuple + if suffix is not None: + if isinstance(suffix, (list, tuple)): + if not any(filename.endswith(s) for s in suffix): + return False + else: + if not filename.endswith(suffix): + return False + + # Check contains - handle both string and list/tuple + if contains is not None: + if isinstance(contains, (list, tuple)): + if not any(c in filename for c in contains): + return False + else: + if contains not in filename: + return False + + # Check not_contains - handle both string and list/tuple + if not_contains is not None: + if isinstance(not_contains, (list, tuple)): + if any(nc in filename for nc in not_contains): + return False + else: + if not_contains in filename: + return False + + # Check regex + if regex and not re.search(regex, filename): + return False + + # Check extension - handle both string and list/tuple + if extension is not None: + if isinstance(extension, (list, tuple)): + if not any(filename.lower().endswith(e.lower()) for e in extension): + return False + else: + if not filename.lower().endswith(extension.lower()): + return False + + return True + + +def _upload_file_with_retry(load_client, file_content: bytes, context: AssetExecutionContext, + filename: str, max_retries: int = 3, retry_delay: int = 5) -> bool: + """Upload file content with retry logic for network issues.""" + + for attempt in range(max_retries + 1): + try: + # Create file first + load_client.create_file() + + # Upload in chunks if file is large (>10MB) + content_length = len(file_content) + chunk_size = 10 * 1024 * 1024 # 10MB chunks + + if content_length > chunk_size: + context.log.info(f" 📤 Uploading large file {filename} ({content_length:,} bytes) in chunks...") + + # Upload in chunks + offset = 0 + while offset < content_length: + chunk_end = min(offset + chunk_size, content_length) + chunk = file_content[offset:chunk_end] + load_client.append_data(chunk, offset, len(chunk)) + offset = chunk_end + + # Flush all data + load_client.flush_data(content_length) + else: + # Small file - upload all at once + load_client.append_data(file_content, 0, content_length) + load_client.flush_data(content_length) + + return True + + except (ServiceRequestError, HttpResponseError, Exception) as e: + if attempt < max_retries: + context.log.warning(f" ⚠️ Upload attempt {attempt + 1} failed for {filename}: {str(e)}") + time.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 60) # Max 60 seconds + + # Try to clean up failed file before retry + try: + load_client.delete_file() + except: + pass # Ignore cleanup errors + 
else: + context.log.error(f" ❌ Failed to upload {filename} after {max_retries + 1} attempts: {str(e)}") + return False + + return False + + +def _process_files_internal( + context: AssetExecutionContext, + adls_client, + container_name: str, + stage_directory: str, + load_directory: str, + prefix: Union[str, List[str], tuple] = None, + suffix: Union[str, List[str], tuple] = None, + contains: Union[str, List[str], tuple] = None, + not_contains: Union[str, List[str], tuple] = None, + regex: str = None, + extension: Union[str, List[str], tuple] = None +) -> Dict: + """ + Internal function that processes both ZIP and CSV files: + - ZIP files: extracts contents and cleans filenames + - CSV files: copies directly with filename cleaning + """ + + context.log.info(f"🗂️ File processing operation (handles ZIP and CSV):") + context.log.info(f" Container: {container_name}") + context.log.info(f" Source: {stage_directory}") + context.log.info(f" Destination: {load_directory}") + context.log.info(f" Filters: prefix={prefix}, not_contains={not_contains}") + + fs_client = adls_client.get_file_system_client(container_name) + + # List all files in stage directory + context.log.info(f"📂 Scanning {stage_directory} for files...") + try: + all_paths = list(fs_client.get_paths(path=stage_directory)) + context.log.info(f"📂 Found {len(all_paths)} total items in directory") + except Exception as e: + context.log.error(f"❌ Failed to list directory {stage_directory}: {e}") + raise + + # Separate ZIP and CSV files that match criteria + zip_files = [] + csv_files = [] + + for path in all_paths: + if path.is_directory: + continue + + # Get just the filename from the full path + filename = os.path.basename(path.name) + + # Apply file matching criteria + if _file_matches_criteria( + filename, + prefix=prefix, + suffix=suffix, + contains=contains, + not_contains=not_contains, + regex=regex, + extension=extension + ): + if filename.lower().endswith('.zip'): + zip_files.append(path.name) + context.log.info(f"✅ ZIP Match: {filename}") + elif filename.lower().endswith('.csv'): + csv_files.append(path.name) + context.log.info(f"✅ CSV Match: {filename}") + else: + context.log.info(f"📄 Other file type: {filename}") + + context.log.info(f"📁 File Summary:") + context.log.info(f" ZIP files to extract: {len(zip_files)}") + context.log.info(f" CSV files to copy: {len(csv_files)}") + + # Process results + total_files_processed = 0 + csv_files_copied = 0 + zip_files_extracted = 0 + total_failed = 0 + processed_files = [] + failed_files = [] + + # Process CSV files (copy with filename cleaning) + if csv_files: + context.log.info(f"📋 Processing {len(csv_files)} CSV files...") + + for csv_path in csv_files: + filename = os.path.basename(csv_path) + context.log.info(f"📄 Processing CSV: {filename}") + + try: + # Download CSV file + file_client = fs_client.get_file_client(csv_path) + csv_data = file_client.download_file().readall() + + # Clean filename + clean_name = _remove_timestamp_from_filename(filename) + context.log.info(f" Original: {filename}") + context.log.info(f" Cleaned: {clean_name}") + + # Save to load directory + load_path = f"{load_directory}/{clean_name}" + load_client = fs_client.get_file_client(load_path) + + # Upload with retry logic + if _upload_file_with_retry(load_client, csv_data, context, clean_name): + csv_files_copied += 1 + total_files_processed += 1 + processed_files.append({ + "original_name": filename, + "cleaned_name": clean_name, + "size_bytes": len(csv_data), + "file_type": "CSV", + "source": "direct_copy" + 
}) + context.log.info(f" ✅ Copied: {clean_name}") + else: + total_failed += 1 + failed_files.append(filename) + context.log.error(f" ❌ Failed to copy: {filename}") + + except Exception as e: + context.log.error(f" ❌ Error processing CSV {filename}: {str(e)}") + total_failed += 1 + failed_files.append(filename) + + # Process ZIP files (extract with filename cleaning) + if zip_files: + context.log.info(f"📦 Processing {len(zip_files)} ZIP files...") + + for zip_path in zip_files: + filename = os.path.basename(zip_path) + context.log.info(f"🗂️ Processing ZIP: {filename}") + + try: + # Download ZIP file + file_client = fs_client.get_file_client(zip_path) + zip_data = file_client.download_file().readall() + + # Extract contents + with zipfile.ZipFile(io.BytesIO(zip_data), 'r') as zip_ref: + file_list = zip_ref.namelist() + context.log.info(f" 📦 Contains {len(file_list)} files") + + for file_in_zip in file_list: + if file_in_zip.endswith('/'): + continue # Skip directories + + try: + file_content = zip_ref.read(file_in_zip) + + # Clean filename + clean_name = _remove_timestamp_from_filename(file_in_zip) + + # Save to load directory + load_path = f"{load_directory}/{clean_name}" + load_client = fs_client.get_file_client(load_path) + + if _upload_file_with_retry(load_client, file_content, context, clean_name): + zip_files_extracted += 1 + total_files_processed += 1 + processed_files.append({ + "original_name": file_in_zip, + "cleaned_name": clean_name, + "size_bytes": len(file_content), + "file_type": "ZIP_EXTRACTED", + "source_zip": filename + }) + context.log.info(f" ✅ Extracted: {clean_name}") + else: + total_failed += 1 + failed_files.append(f"{filename}:{clean_name}") + + except Exception as e: + context.log.error(f" ❌ Error extracting {file_in_zip}: {str(e)}") + total_failed += 1 + failed_files.append(f"{filename}:{file_in_zip}") + + except Exception as e: + context.log.error(f" ❌ Error processing ZIP {filename}: {str(e)}") + total_failed += 1 + failed_files.append(filename) + + # Final summary + context.log.info(f"🎉 File processing completed!") + context.log.info(f" Total files processed: {total_files_processed}") + context.log.info(f" CSV files copied: {csv_files_copied}") + context.log.info(f" ZIP files extracted: {zip_files_extracted}") + if total_failed > 0: + context.log.warning(f" Files failed: {total_failed}") + + return { + "status": "completed", + "files_processed": total_files_processed, + "csv_files_copied": csv_files_copied, + "zip_files_extracted": zip_files_extracted, + "files_failed": total_failed, + "processed_files": processed_files, + "failed_files": failed_files, + "source_csv_files": [os.path.basename(f) for f in csv_files], + "source_zip_files": [os.path.basename(f) for f in zip_files] + } \ No newline at end of file diff --git a/setup.py b/setup.py index 849f1cb..c038863 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup setup( - name="dagster_demo", + name="daster", install_requires=[ "dagster", "dagster-snowflake", diff --git a/utils/adls.py b/utils/adls.py index e69de29..d83e41a 100644 --- a/utils/adls.py +++ b/utils/adls.py @@ -0,0 +1,18 @@ +import os +from dagster import resource, InitResourceContext +from dagster_azure.adls2 import ADLS2Resource, ADLS2SASToken + +@resource +def adls2_resource(context: InitResourceContext) -> ADLS2Resource: + account = os.getenv("ADLS2_ACCOUNT") + token = os.getenv("ADLS2_SAS_TOKEN") + + if not account or not token: + raise Exception( + "ADLS2_RESOURCE: Must set both ADLS2_ACCOUNT and 
ADLS2_SAS_TOKEN in environment or Dagster+ secrets." + ) + + return ADLS2Resource( + storage_account=account, + credential=ADLS2SASToken(token=token), + )
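
Usage note: the sketch below is illustrative only and shows how a pipeline module
might wire up the standardized monitor added in
reusable_components/file_processing/monitor_files.py. The SFTP path, file counts,
downstream asset name, and group name are placeholders, not values taken from
this patch.

    from reusable_components.file_processing.monitor_files import (
        create_standardized_file_monitor,
    )

    # All configuration values below are illustrative placeholders.
    recipient_monitor_config = {
        "pipeline_name": "MEDICAID_RECIPIENT",
        "asset_name": "recipient_files_monitor",
        "sftp_source_path": "/inbound/recipient",
        "file_criteria": {
            "extension": {"pattern": ".csv", "count": 3},
            "prefix": {"pattern": ["RECIPIENT_"], "count": 3},
            "not_contains": {"pattern": ["TEST"]},
        },
        "downstream_assets": ["copy_recipient_files_sftp_to_adls"],
        "group_name": "medicaid_recipient_pipeline",
    }

    # The factory returns (monitor_asset, monitor_sensor); both can then be
    # registered in the project's Definitions alongside the other pipelines.
    recipient_files_monitor, recipient_monitor_sensor = create_standardized_file_monitor(
        recipient_monitor_config
    )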