From 97809ba7af52fe1b14e370372f1f20afa9ec3eda Mon Sep 17 00:00:00 2001 From: TiaTuinstra Date: Thu, 22 Jan 2026 15:58:21 +0000 Subject: [PATCH 1/3] add keep_columns variable to column filtering and keep age column if present in data; add check_client_info_complete and flag individuals with missing client info fields - similar to check_address_complete function --- pipeline/orchestrator.py | 7 +-- pipeline/preprocess.py | 106 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 105 insertions(+), 8 deletions(-) diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py index 9517c6f..a61e79c 100755 --- a/pipeline/orchestrator.py +++ b/pipeline/orchestrator.py @@ -224,11 +224,12 @@ def run_step_2_preprocess( input_path = input_dir / input_file df_raw = preprocess.read_input(input_path) mapped_df, column_mapping = preprocess.map_columns(df_raw) - df_filtered = preprocess.filter_columns(mapped_df) + df_filtered = preprocess.filter_columns(mapped_df, keep_columns=["AGE"]) df = preprocess.ensure_required_columns(df_filtered) # Check that addresses are complete, return only complete rows df = preprocess.check_addresses_complete(df) + df = preprocess.check_client_info_complete(df) # Load configuration vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH @@ -326,8 +327,8 @@ def run_step_4_generate_notices( artifacts_dir = output_dir / "artifacts" # Assets now come from template directory (optional) - logo_path = template_dir / "assets" / "logo.png" - signature_path = template_dir / "assets" / "signature.png" + logo_path = template_dir / "assets" / "logo.jpg" + signature_path = template_dir / "assets" / "signature.jpg" # Note: Assets are NOT validated here. If a template references an asset # that doesn't exist, the template rendering will fail with a clear error. diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index cdad5c4..f507b43 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -219,17 +219,98 @@ def check_addresses_complete(df: pd.DataFrame) -> pd.DataFrame: "There are %d records with incomplete address information.", incomplete_count, ) + print( + f"⚠️ There are {incomplete_count} records with incomplete address information." + ) incomplete_records = df.loc[~df["address_complete"]] incomplete_path = Path("output/incomplete_addresses.csv") incomplete_records.to_csv(incomplete_path, index=False) LOG.info("Incomplete address records written to %s", incomplete_path) + print(f"Incomplete address records written to {incomplete_path}") # Return only rows with complete addresses return df.loc[df["address_complete"]].drop(columns=["address_complete"]) +def recover_client_id(x): + # Recover Client ID if 10 digits when converted to str int + try: + y = str(int(float(x))) + return y if len(y) == 10 else x + except Exception: + return x + + +def check_client_info_complete(df: pd.DataFrame) -> pd.DataFrame: + """ + Check if client fields are complete in the DataFrame. + + Adds a boolean 'client_info_complete' column based on presence of + first name, last name, DOB, school name, overdue disease, immunizations given, and client ID. + + Check that client ID is valid (10 digits), and recover if in float format. + """ + + df = df.copy() + + # Normalize text fields: convert to string, strip whitespace, convert "" to NA + client_info_cols = [ + "SCHOOL_NAME", + "CLIENT_ID", + "FIRST_NAME", + "LAST_NAME", + "DATE_OF_BIRTH", + "OVERDUE_DISEASE", + "IMMS_GIVEN", + ] + + for col in client_info_cols: + df[col] = df[col].astype(str).str.strip().replace({"": pd.NA, "nan": pd.NA}) + + # Check client ID formatting - expects 10-digit number, recover if in float format with '.0' + df["CLIENT_ID"] = df["CLIENT_ID"].str.replace(r"\.0$", "", regex=True) + df["client_id_valid"] = df["CLIENT_ID"].str.isdigit() & ( + df["CLIENT_ID"].str.len() == 10 + ) + if len(df[~df["client_id_valid"]]) > 0: + print(f"Invalid client IDs: {len(df[~df['client_id_valid']])}") + + # Check completeness + df["client_info_complete"] = ( + df["FIRST_NAME"].notna() + & df["LAST_NAME"].notna() + & df["CLIENT_ID"].notna() + & df["DATE_OF_BIRTH"].notna() + & df["OVERDUE_DISEASE"].notna() + & df["IMMS_GIVEN"].notna() + & df["client_id_valid"] + ) + + df = df.drop(columns=["client_id_valid"]) + + if not df["client_info_complete"].all(): + incomplete_count = (~df["client_info_complete"]).sum() + LOG.warning( + "There are %d records with incomplete/invalid client information.", + incomplete_count, + ) + print( + f"⚠️ There are {incomplete_count} total records with incomplete/invalid client information." + ) + + incomplete_records = df.loc[~df["client_info_complete"]] + + incomplete_path = Path("output/incomplete_clients.csv") + incomplete_records.to_csv(incomplete_path, index=False) + LOG.info("Incomplete client records written to %s", incomplete_path) + print(f"Incomplete client records written to {incomplete_path}") + + # Return only rows with complete addresses + return df.loc[df["client_info_complete"]].drop(columns=["client_info_complete"]) + + def convert_date_iso(date_str: str) -> str: """Convert a date from English display format to ISO format. @@ -267,7 +348,11 @@ def over_16_check(date_of_birth, date_notice_delivery): """ birth_datetime = datetime.strptime(date_of_birth, "%Y-%m-%d") - delivery_datetime = datetime.strptime(date_notice_delivery, "%Y-%m-%d") + + if isinstance(date_notice_delivery, datetime): + date_notice_delivery = str(date_notice_delivery.date()) + + delivery_datetime = datetime.strptime(str(date_notice_delivery), "%Y-%m-%d") age = delivery_datetime.year - birth_datetime.year @@ -455,11 +540,20 @@ def map_columns(df: pd.DataFrame, required_columns=REQUIRED_COLUMNS): if score >= THRESHOLD: # adjustable threshold # Map the original column name, not the normalized one actual_in_col = next(c for c in input_cols if normalize(c) == input_col) - col_map[actual_in_col] = best_match + # col_map[actual_in_col] = best_match # print colname and score for debugging print(f"Matching '{input_col}' to '{best_match}' with score {score}") + # Check if column already has an assigned mapping + if best_match not in col_map.values(): + print( + f"The value {best_match} does not exist in the dictionary - adding value." + ) + col_map[actual_in_col] = best_match + # else: + # print(f"The value {best_match} does not exist.") + return df.rename(columns=col_map), col_map @@ -476,13 +570,15 @@ def filter_columns( def filter_columns( - df: pd.DataFrame | None, required_columns: list[str] = REQUIRED_COLUMNS + df: pd.DataFrame | None, + required_columns: list[str] = REQUIRED_COLUMNS, + keep_columns: list[str] = [], ) -> pd.DataFrame | None: """Filter dataframe to only include required columns.""" if df is None or df.empty: return df - return df[[col for col in df.columns if col in required_columns]] + return df[[col for col in df.columns if col in required_columns + keep_columns]] def ensure_required_columns(df: pd.DataFrame) -> pd.DataFrame: @@ -814,7 +910,7 @@ def build_preprocess_result( if not pd.isna(row.AGE): # type: ignore[attr-defined] over_16 = bool(row.AGE >= 16) # type: ignore[attr-defined] elif dob_iso and date_notice_delivery: - over_16 = over_16_check(dob_iso, date_notice_delivery) + over_16 = over_16_check(dob_iso, datetime.now()) else: over_16 = False From 2c3e3e269b4b6f657505fb872e1420f1f52a68f0 Mon Sep 17 00:00:00 2001 From: TiaTuinstra Date: Thu, 22 Jan 2026 16:50:24 +0000 Subject: [PATCH 2/3] fixing test errors --- pipeline/orchestrator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py index a61e79c..0cb9607 100755 --- a/pipeline/orchestrator.py +++ b/pipeline/orchestrator.py @@ -327,8 +327,8 @@ def run_step_4_generate_notices( artifacts_dir = output_dir / "artifacts" # Assets now come from template directory (optional) - logo_path = template_dir / "assets" / "logo.jpg" - signature_path = template_dir / "assets" / "signature.jpg" + logo_path = template_dir / "assets" / "logo.png" + signature_path = template_dir / "assets" / "signature.png" # Note: Assets are NOT validated here. If a template references an asset # that doesn't exist, the template rendering will fail with a clear error. From 98bfc2412b0092efe4484a673b315efbee01a914 Mon Sep 17 00:00:00 2001 From: TiaTuinstra Date: Thu, 22 Jan 2026 12:12:41 -0500 Subject: [PATCH 3/3] Update preprocess.py Remove unused helper function (recover_client_id) - this now occurs directly in check_client_info_complete --- pipeline/preprocess.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index f507b43..20bb07b 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -234,15 +234,6 @@ def check_addresses_complete(df: pd.DataFrame) -> pd.DataFrame: return df.loc[df["address_complete"]].drop(columns=["address_complete"]) -def recover_client_id(x): - # Recover Client ID if 10 digits when converted to str int - try: - y = str(int(float(x))) - return y if len(y) == 10 else x - except Exception: - return x - - def check_client_info_complete(df: pd.DataFrame) -> pd.DataFrame: """ Check if client fields are complete in the DataFrame.