From 97809ba7af52fe1b14e370372f1f20afa9ec3eda Mon Sep 17 00:00:00 2001
From: TiaTuinstra <tiatuinstra@gmail.com>
Date: Thu, 22 Jan 2026 15:58:21 +0000
Subject: [PATCH 1/3] add keep_columns variable to column filtering and keep
 age column if present in data; add check_client_info_complete and flag
 individuals with missing client info fields - similar to
 check_addresses_complete function

---
 pipeline/orchestrator.py |   7 +--
 pipeline/preprocess.py   | 106 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 105 insertions(+), 8 deletions(-)

diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py
index 9517c6f..a61e79c 100755
--- a/pipeline/orchestrator.py
+++ b/pipeline/orchestrator.py
@@ -224,11 +224,12 @@ def run_step_2_preprocess(
     input_path = input_dir / input_file
     df_raw = preprocess.read_input(input_path)
     mapped_df, column_mapping = preprocess.map_columns(df_raw)
-    df_filtered = preprocess.filter_columns(mapped_df)
+    df_filtered = preprocess.filter_columns(mapped_df, keep_columns=["AGE"])
     df = preprocess.ensure_required_columns(df_filtered)
 
     # Check that addresses are complete, return only complete rows
     df = preprocess.check_addresses_complete(df)
+    df = preprocess.check_client_info_complete(df)
 
     # Load configuration
     vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH
@@ -326,8 +327,8 @@ def run_step_4_generate_notices(
     artifacts_dir = output_dir / "artifacts"
 
     # Assets now come from template directory (optional)
-    logo_path = template_dir / "assets" / "logo.png"
-    signature_path = template_dir / "assets" / "signature.png"
+    logo_path = template_dir / "assets" / "logo.jpg"
+    signature_path = template_dir / "assets" / "signature.jpg"
 
     # Note: Assets are NOT validated here. If a template references an asset
     # that doesn't exist, the template rendering will fail with a clear error.
diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py
index cdad5c4..f507b43 100644
--- a/pipeline/preprocess.py
+++ b/pipeline/preprocess.py
@@ -219,17 +219,98 @@ def check_addresses_complete(df: pd.DataFrame) -> pd.DataFrame:
             "There are %d records with incomplete address information.",
             incomplete_count,
         )
+        print(
+            f"⚠️ There are {incomplete_count} records with incomplete address information."
+        )
 
         incomplete_records = df.loc[~df["address_complete"]]
 
         incomplete_path = Path("output/incomplete_addresses.csv")
         incomplete_records.to_csv(incomplete_path, index=False)
         LOG.info("Incomplete address records written to %s", incomplete_path)
+        print(f"Incomplete address records written to {incomplete_path}")
 
     # Return only rows with complete addresses
     return df.loc[df["address_complete"]].drop(columns=["address_complete"])
 
 
+def recover_client_id(x):
+    # Recover Client ID if 10 digits when converted to str int
+    try:
+        y = str(int(float(x)))
+        return y if len(y) == 10 else x
+    except Exception:
+        return x
+
+
+def check_client_info_complete(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Check if client fields are complete in the DataFrame.
+
+    Adds a temporary boolean 'client_info_complete' column based on presence of first name,
+    last name, DOB, overdue disease, immunizations given, and client ID (school name is only normalized).
+
+    Check that client ID is valid (10 digits), and recover if in float format.
+    """
+
+    df = df.copy()
+
+    # Normalize text fields: convert to string, strip whitespace, convert "" and "nan" to NA
+    client_info_cols = [
+        "SCHOOL_NAME",
+        "CLIENT_ID",
+        "FIRST_NAME",
+        "LAST_NAME",
+        "DATE_OF_BIRTH",
+        "OVERDUE_DISEASE",
+        "IMMS_GIVEN",
+    ]
+
+    for col in client_info_cols:
+        df[col] = df[col].astype(str).str.strip().replace({"": pd.NA, "nan": pd.NA})
+
+    # Check client ID formatting - expects 10-digit number, recover if in float format with '.0'
+    df["CLIENT_ID"] = df["CLIENT_ID"].str.replace(r"\.0$", "", regex=True)
+    df["client_id_valid"] = df["CLIENT_ID"].str.isdigit() & (
+        df["CLIENT_ID"].str.len() == 10
+    )
+    if len(df[~df["client_id_valid"]]) > 0:
+        print(f"Invalid client IDs: {len(df[~df['client_id_valid']])}")
+
+    # Check completeness
+    df["client_info_complete"] = (
+        df["FIRST_NAME"].notna()
+        & df["LAST_NAME"].notna()
+        & df["CLIENT_ID"].notna()
+        & df["DATE_OF_BIRTH"].notna()
+        & df["OVERDUE_DISEASE"].notna()
+        & df["IMMS_GIVEN"].notna()
+        & df["client_id_valid"]
+    )
+
+    df = df.drop(columns=["client_id_valid"])
+
+    if not df["client_info_complete"].all():
+        incomplete_count = (~df["client_info_complete"]).sum()
+        LOG.warning(
+            "There are %d records with incomplete/invalid client information.",
+            incomplete_count,
+        )
+        print(
+            f"⚠️ There are {incomplete_count} total records with incomplete/invalid client information."
+        )
+
+        incomplete_records = df.loc[~df["client_info_complete"]]
+
+        incomplete_path = Path("output/incomplete_clients.csv")
+        incomplete_records.to_csv(incomplete_path, index=False)
+        LOG.info("Incomplete client records written to %s", incomplete_path)
+        print(f"Incomplete client records written to {incomplete_path}")
+
+    # Return only rows with complete, valid client information
+    return df.loc[df["client_info_complete"]].drop(columns=["client_info_complete"])
+
+
 def convert_date_iso(date_str: str) -> str:
     """Convert a date from English display format to ISO format.
 
@@ -267,7 +348,11 @@ def over_16_check(date_of_birth, date_notice_delivery):
     """
 
     birth_datetime = datetime.strptime(date_of_birth, "%Y-%m-%d")
-    delivery_datetime = datetime.strptime(date_notice_delivery, "%Y-%m-%d")
+
+    if isinstance(date_notice_delivery, datetime):
+        date_notice_delivery = str(date_notice_delivery.date())
+
+    delivery_datetime = datetime.strptime(str(date_notice_delivery), "%Y-%m-%d")
 
     age = delivery_datetime.year - birth_datetime.year
 
@@ -455,11 +540,20 @@ def map_columns(df: pd.DataFrame, required_columns=REQUIRED_COLUMNS):
         if score >= THRESHOLD:  # adjustable threshold
             # Map the original column name, not the normalized one
             actual_in_col = next(c for c in input_cols if normalize(c) == input_col)
-            col_map[actual_in_col] = best_match
+            # col_map[actual_in_col] = best_match
 
             # print colname and score for debugging
             print(f"Matching '{input_col}' to '{best_match}' with score {score}")
 
+            # Only map if no other input column has already been mapped to this target name
+            if best_match not in col_map.values():
+                print(
+                    f"The value {best_match} does not exist in the dictionary - adding value."
+                )
+                col_map[actual_in_col] = best_match
+            # else:
+            #     print(f"The value {best_match} does not exist.")
+
     return df.rename(columns=col_map), col_map
 
 
@@ -476,13 +570,15 @@ def filter_columns(
 
 
 def filter_columns(
-    df: pd.DataFrame | None, required_columns: list[str] = REQUIRED_COLUMNS
+    df: pd.DataFrame | None,
+    required_columns: list[str] = REQUIRED_COLUMNS,
+    keep_columns: list[str] = [],
 ) -> pd.DataFrame | None:
     """Filter dataframe to only include required columns."""
     if df is None or df.empty:
         return df
 
-    return df[[col for col in df.columns if col in required_columns]]
+    return df[[col for col in df.columns if col in required_columns + keep_columns]]
 
 
 def ensure_required_columns(df: pd.DataFrame) -> pd.DataFrame:
@@ -814,7 +910,7 @@ def build_preprocess_result(
         if not pd.isna(row.AGE):  # type: ignore[attr-defined]
             over_16 = bool(row.AGE >= 16)  # type: ignore[attr-defined]
         elif dob_iso and date_notice_delivery:
-            over_16 = over_16_check(dob_iso, date_notice_delivery)
+            over_16 = over_16_check(dob_iso, datetime.now())
         else:
             over_16 = False
 

From 2c3e3e269b4b6f657505fb872e1420f1f52a68f0 Mon Sep 17 00:00:00 2001
From: TiaTuinstra <tiatuinstra@gmail.com>
Date: Thu, 22 Jan 2026 16:50:24 +0000
Subject: [PATCH 2/3] Revert asset paths to logo.png/signature.png to fix test errors

---
 pipeline/orchestrator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py
index a61e79c..0cb9607 100755
--- a/pipeline/orchestrator.py
+++ b/pipeline/orchestrator.py
@@ -327,8 +327,8 @@ def run_step_4_generate_notices(
     artifacts_dir = output_dir / "artifacts"
 
     # Assets now come from template directory (optional)
-    logo_path = template_dir / "assets" / "logo.jpg"
-    signature_path = template_dir / "assets" / "signature.jpg"
+    logo_path = template_dir / "assets" / "logo.png"
+    signature_path = template_dir / "assets" / "signature.png"
 
     # Note: Assets are NOT validated here. If a template references an asset
     # that doesn't exist, the template rendering will fail with a clear error.

From 98bfc2412b0092efe4484a673b315efbee01a914 Mon Sep 17 00:00:00 2001
From: TiaTuinstra <tiatuinstra@gmail.com>
Date: Thu, 22 Jan 2026 12:12:41 -0500
Subject: [PATCH 3/3] Update preprocess.py

Remove unused helper function (recover_client_id) - this now occurs directly in check_client_info_complete
---
 pipeline/preprocess.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py
index f507b43..20bb07b 100644
--- a/pipeline/preprocess.py
+++ b/pipeline/preprocess.py
@@ -234,15 +234,6 @@ def check_addresses_complete(df: pd.DataFrame) -> pd.DataFrame:
     return df.loc[df["address_complete"]].drop(columns=["address_complete"])
 
 
-def recover_client_id(x):
-    # Recover Client ID if 10 digits when converted to str int
-    try:
-        y = str(int(float(x)))
-        return y if len(y) == 10 else x
-    except Exception:
-        return x
-
-
 def check_client_info_complete(df: pd.DataFrame) -> pd.DataFrame:
     """
     Check if client fields are complete in the DataFrame.