From a5d9cd52b8ac8836246eac71e06d81dcfcd1e499 Mon Sep 17 00:00:00 2001 From: timburke-hackit Date: Thu, 30 Jan 2025 10:39:06 +0000 Subject: [PATCH 1/6] formatting partition values --- scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py index 59a7f1eb5..d9fc2d3de 100644 --- a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py +++ b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py @@ -117,9 +117,9 @@ def get_report_fromtime(report_id, timestamp_to_call, auth_headers, auth_payload def dump_dataframe(response, location, filename): df = pd.DataFrame.from_dict(response.json(), orient="columns") - df["import_year"] = datetime.today().year - df["import_month"] = datetime.today().month - df["import_day"] = datetime.today().day + df["import_year"] = datetime.today().strftime("%Y") + df["import_month"] = datetime.today().strftime("%m") + df["import_day"] = datetime.today().strftime("%d") df["import_date"] = datetime.today().strftime("%Y%m%d") print(f"Database: {target_database}") From 96560b4c4dbf7293360ca3bdf518a8c9c0038209 Mon Sep 17 00:00:00 2001 From: timburke-hackit Date: Thu, 30 Jan 2025 16:36:20 +0000 Subject: [PATCH 2/6] set column types to str --- .../data_and_insight/icaseworks_ingest_to_raw.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py index d9fc2d3de..932fa9739 100644 --- a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py +++ b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py @@ -1,5 +1,3 @@ -# flake8: noqa: F821 - import base64 import hashlib import hmac @@ -17,7 +15,6 @@ from dateutil.relativedelta import * from pyathena import connect - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -115,7 +112,10 @@ def get_report_fromtime(report_id, timestamp_to_call, auth_headers, auth_payload def dump_dataframe(response, location, filename): - df = pd.DataFrame.from_dict(response.json(), orient="columns") + df = pd.DataFrame.from_dict( + response.json(), + orient="columns", + ) df["import_year"] = datetime.today().strftime("%Y") df["import_month"] = datetime.today().strftime("%m") @@ -125,6 +125,9 @@ def dump_dataframe(response, location, filename): print(f"Database: {target_database}") print(f"Table: {target_table}") + dict_values = ["string" for _ in range(len(df.columns))] + dtype_dict = dict(zip(df.columns, dict_values)) + # write to s3 wr.s3.to_parquet( df=df, @@ -134,6 +137,7 @@ def dump_dataframe(response, location, filename): table=target_table, mode="overwrite_partitions", partition_cols=partition_keys, + dtype=dtype_dict, ) print(f"Dumped Dataframe {df.shape} to {s3_target_location}") logger.info(f"Dumped Dataframe {df.shape} to {s3_target_location}") From 0643e7882d0954752fb3a260a7caa922e308b6be Mon Sep 17 00:00:00 2001 From: timburke-hackit Date: Thu, 30 Jan 2025 16:38:54 +0000 Subject: [PATCH 3/6] remove unused import --- scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py index 932fa9739..dce081be2 100644 --- a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py +++ b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py @@ -12,7 +12,6 @@ import pandas as pd import requests from awsglue.utils import getResolvedOptions -from dateutil.relativedelta import * from pyathena import connect logging.basicConfig(level=logging.INFO) From 928df3b255d2cb381a6e1d41840b5a8ac9fb5e94 Mon Sep 17 00:00:00 2001 From: timburke-hackit Date: Thu, 30 Jan 2025 16:40:27 +0000 Subject: [PATCH 4/6] remove unused f strings and variables --- scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py index dce081be2..b7c56ba11 100644 --- a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py +++ b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py @@ -145,7 +145,7 @@ def dump_dataframe(response, location, filename): def get_latest_timestamp(table_dict): # TODO: reintroduce try except # try: - print(f"Getting max timestamp") + print("Getting max timestamp") # 2025-01-05T15:06:16 # TODO: needs refactoring to allow for different tables @@ -219,7 +219,6 @@ def authenticate_icaseworks(api_key, secret): auth_payload = [] auth_headers = {"Authorization": authorization} - print(f"") return auth_payload, auth_headers @@ -290,8 +289,6 @@ def main(): ] for data_dict in list_of_datadictionaries: - location = data_dict["location"] - if data_dict["full_ingestion"] == False: date_to_track_from = get_latest_timestamp(data_dict) print(f"Starting calls from {date_to_track_from}") From 96f191f2c25faa4da076ab734aa38605ae176a19 Mon Sep 17 00:00:00 2001 From: timburke-hackit Date: Thu, 30 Jan 2025 16:41:04 +0000 Subject: [PATCH 5/6] linting --- scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py index b7c56ba11..67ba5d77d 100644 --- a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py +++ b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py @@ -289,7 +289,7 @@ def main(): ] for data_dict in list_of_datadictionaries: - if data_dict["full_ingestion"] == False: + if data_dict["full_ingestion"] is False: date_to_track_from = get_latest_timestamp(data_dict) print(f"Starting calls from {date_to_track_from}") From 566c238596e141527cb75035a4754089428a3e2a Mon Sep 17 00:00:00 2001 From: timburke-hackit Date: Thu, 30 Jan 2025 16:42:42 +0000 Subject: [PATCH 6/6] remove comment --- scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py index 67ba5d77d..43f9a2e84 100644 --- a/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py +++ b/scripts/jobs/data_and_insight/icaseworks_ingest_to_raw.py @@ -262,9 +262,6 @@ def retrieve_credentials_from_secrets_manager(secrets_manager_client, secret_nam return response -### main function ## - - def main(): secrets_manager_client = boto3.client("secretsmanager") api_credentials_response = retrieve_credentials_from_secrets_manager(