4747from hashlib import sha1
4848from pathlib import Path
4949from string import Formatter
50- from typing import Any , Dict , List , Optional
50+ from typing import Any , Dict , List , Optional , overload
5151import pandas as pd
5252import yaml
5353from babel .dates import format_date
9595
# Minimum acceptable fuzzy-match score (0-100 scale) — presumably compared
# against the rapidfuzz scores produced in map_columns when mapping input
# column names to required columns; TODO confirm against the full file.
THRESHOLD = 80
9797
98+
9899def convert_date_string (
99100 date_str : str | datetime | pd .Timestamp , locale : str = "en"
100101) -> str | None :
@@ -172,6 +173,7 @@ def format_iso_date_for_language(iso_date: str, language: str) -> str:
172173
173174 return format_date (date_obj , format = "long" , locale = locale )
174175
176+
175177def check_addresses_complete (df : pd .DataFrame ) -> pd .DataFrame :
176178 """
177179 Check if address fields are complete in the DataFrame.
@@ -192,17 +194,13 @@ def check_addresses_complete(df: pd.DataFrame) -> pd.DataFrame:
192194 ]
193195
194196 for col in address_cols :
195- df [col ] = (
196- df [col ]
197- .astype (str )
198- .str .strip ()
199- .replace ({"" : pd .NA , "nan" : pd .NA })
200- )
197+ df [col ] = df [col ].astype (str ).str .strip ().replace ({"" : pd .NA , "nan" : pd .NA })
201198
202199 # Build combined address line
203200 df ["ADDRESS" ] = (
204- df ["STREET_ADDRESS_LINE_1" ].fillna ("" ) + " " +
205- df ["STREET_ADDRESS_LINE_2" ].fillna ("" )
201+ df ["STREET_ADDRESS_LINE_1" ].fillna ("" )
202+ + " "
203+ + df ["STREET_ADDRESS_LINE_2" ].fillna ("" )
206204 ).str .strip ()
207205
208206 df ["ADDRESS" ] = df ["ADDRESS" ].replace ({"" : pd .NA })
@@ -282,6 +280,7 @@ def over_16_check(date_of_birth, date_notice_delivery):
282280
283281 return age >= 16
284282
283+
285284def configure_logging (output_dir : Path , run_id : str ) -> Path :
286285 """Configure file logging for the preprocessing step.
287286
@@ -387,6 +386,7 @@ def read_input(file_path: Path) -> pd.DataFrame:
387386 LOG .error ("Failed to read %s: %s" , file_path , exc )
388387 raise
389388
389+
390390def normalize (col : str ) -> str :
391391 """Normalize formatting prior to matching."""
392392
@@ -443,11 +443,10 @@ def map_columns(df: pd.DataFrame, required_columns=REQUIRED_COLUMNS):
443443
444444 # Check each input column against required columns
445445 for input_col in normalized_input_cols :
446-
447446 col_name , score , index = process .extractOne (
448447 query = input_col ,
449448 choices = [normalize (req ) for req in required_columns ],
450- scorer = fuzz .partial_ratio
449+ scorer = fuzz .partial_ratio ,
451450 )
452451
453452 # Remove column if it has a score of 0
@@ -460,18 +459,32 @@ def map_columns(df: pd.DataFrame, required_columns=REQUIRED_COLUMNS):
460459
461460 # print colname and score for debugging
462461 print (f"Matching '{ input_col } ' to '{ best_match } ' with score { score } " )
463-
462+
464463 return df .rename (columns = col_map ), col_map
465464
465+
@overload
def filter_columns(
    df: pd.DataFrame, required_columns: list[str] = REQUIRED_COLUMNS
) -> pd.DataFrame: ...


@overload
def filter_columns(
    df: None, required_columns: list[str] = REQUIRED_COLUMNS
) -> None: ...


def filter_columns(
    df: pd.DataFrame | None, required_columns: list[str] = REQUIRED_COLUMNS
) -> pd.DataFrame | None:
    """Restrict *df* to the columns named in *required_columns*.

    Kept columns retain their original dataframe order; required columns
    that are absent from *df* are simply not selected.  ``None`` and empty
    dataframes are returned unchanged.
    """
    # Guard clause: nothing to select from — hand the input straight back.
    if df is None or df.empty:
        return df

    wanted = set(required_columns)
    keep = [name for name in df.columns if name in wanted]
    return df[keep]
474486
487+
475488def ensure_required_columns (df : pd .DataFrame ) -> pd .DataFrame :
476489 """Normalize column names and validate that all required columns are present.
477490
@@ -767,7 +780,7 @@ def build_preprocess_result(
767780 sorted_df ["SEQUENCE" ] = [f"{ idx + 1 :05d} " for idx in range (len (sorted_df ))]
768781
769782 clients : List [ClientRecord ] = []
770- for row in sorted_df .itertuples (index = False ): # type: ignore[attr-defined]
783+ for row in sorted_df .itertuples (index = False ):
771784 client_id = str (row .CLIENT_ID ) # type: ignore[attr-defined]
772785 sequence = row .SEQUENCE # type: ignore[attr-defined]
773786 dob_iso = (
0 commit comments