48 commits
6fe9c36
test first class method
mberrien-fitzsimons Mar 6, 2025
8ce7eb9
updated variable inputs to init and individual methods
mberrien-fitzsimons Mar 7, 2025
40cac6f
added doc strings to all methods
mberrien-fitzsimons Mar 7, 2025
d550d0c
updated local path organization
mberrien-fitzsimons Mar 10, 2025
36d9651
updated way that dag is initialized
mberrien-fitzsimons Mar 10, 2025
7c4734f
added global to dag
mberrien-fitzsimons Mar 10, 2025
5d590b9
updated dag call structure
mberrien-fitzsimons Mar 10, 2025
efb510d
updated way params is called within Dag instantiation
mberrien-fitzsimons Mar 10, 2025
db969c8
small update
mberrien-fitzsimons Mar 10, 2025
b335d66
updated file_sources to include .keys
mberrien-fitzsimons Mar 10, 2025
2fa3399
updated dag to use eacustomdag
mberrien-fitzsimons Mar 10, 2025
58a0332
updated param type to array
mberrien-fitzsimons Mar 10, 2025
5ec70e4
updated sharefile to snowflake dag builder script to include global e…
mberrien-fitzsimons Mar 11, 2025
929d9ad
removed global function because it did not work for exposing dag id
mberrien-fitzsimons Mar 11, 2025
f7178ac
updated s3.py in attempt to fix metadata_column bug
mberrien-fitzsimons Mar 11, 2025
9ef191f
updated class so that it would construct required folder structure
mberrien-fitzsimons Mar 12, 2025
111dc3b
updated input variable name
mberrien-fitzsimons Mar 12, 2025
248f4d6
updated way that local path is structured
mberrien-fitzsimons Mar 12, 2025
89a340b
updated where date and timestamp are instantiated
mberrien-fitzsimons Mar 12, 2025
85f91eb
updated timestamp code to use airflow runtime instead of datetime now…
mberrien-fitzsimons Mar 12, 2025
9882e64
updated timestamp to go back to original form
mberrien-fitzsimons Mar 12, 2025
9644caf
updated way filepath is created
mberrien-fitzsimons Mar 12, 2025
74e8615
save final changes to sharefile to snowflake dag before complete refa…
mberrien-fitzsimons Mar 12, 2025
01e8438
updated sharefile to snowflake dag builder class to remove unnecessary…
mberrien-fitzsimons Mar 12, 2025
9a90112
updated sharefile sources
mberrien-fitzsimons Mar 13, 2025
bd8408e
re-ordered init arguments
mberrien-fitzsimons Mar 14, 2025
bd50225
update to readme documentation
mberrien-fitzsimons Mar 20, 2025
6eff93a
updated readme with yaml file showing file_sources
mberrien-fitzsimons Mar 20, 2025
df7cc14
added additional information to yaml section of readme
mberrien-fitzsimons Mar 20, 2025
32b37ff
updated s3 to snowflake task
mberrien-fitzsimons Mar 21, 2025
e791ca2
updated sharefile to snowflake dag builder
mberrien-fitzsimons Mar 21, 2025
0f4ef7c
added print statement for folder ID to see what is happening
mberrien-fitzsimons Mar 24, 2025
6f57bf6
updated transfer_to_s3
mberrien-fitzsimons Mar 24, 2025
344ada0
removed partial method
mberrien-fitzsimons Mar 24, 2025
950ca35
added control flow for choosing transfer from s3 to snowflake step
mberrien-fitzsimons Mar 25, 2025
3dab30f
updated sharefile to working state
mberrien-fitzsimons Mar 27, 2025
f76a18a
updated file to handle both single file and folder. testing in TX dev…
mberrien-fitzsimons Mar 27, 2025
47b0166
This was done in order to access changes to the sharefiletodiskoperato…
mberrien-fitzsimons Mar 27, 2025
5baa75b
fixed mistake in imports
mberrien-fitzsimons Mar 27, 2025
2e53992
Merge branch 'main' into feature/sharefile_to_snowflake_dag_builder
Jul 9, 2025
7ed9fa0
Clean up dag and add to init.
Aug 1, 2025
1c588db
Update SharefileToSnowflakeDag.
Aug 11, 2025
5d3eea4
Make local file path more configurable.
Aug 18, 2025
4de45e3
Refactor sharefile_to_snowflake_dag and create ea_csv helper functions.
Aug 26, 2025
623b7d7
Make move_to_processed optional and allow numbered txt columns.
Sep 10, 2025
ee79a5f
Update conditional move_to_processed task handling.
Sep 10, 2025
908307c
Update SharefileToSnowflakeDag and ea_csv docs.
Sep 16, 2025
f3fe13e
Add csv_encoding argument to translate_csv_to_jsonl.
Sep 22, 2025
167 changes: 167 additions & 0 deletions README.md
@@ -58,6 +58,67 @@ Raise an error if a name-collision occurs after formatting.
</details>


## ea_csv
Helpers for working with CSV files.

<details>
<summary>See more:</summary>

-----

### txt_to_csv
Convert a txt file to a csv.

Args:
- file_in (str): A path to a txt file.
- file_out (str): A path to a csv file. If None, the input file path is reused with a .csv extension.
- delimiter (str): A txt file delimiter.
- has_header (bool): If True, use the first row of the txt file as a column
header. If False, insert a column header using the column_names arg.
Default is True.
- column_names (list[str]): An ordered list of column names to use in the
output csv. If 'None' and has_header is False, insert an ordered,
integer column header (e.g. 1, 2, ..., n where n is the number of
columns).
- delete_txt (bool): If True, delete the input txt file.

Returns:
- file_out (str): A csv file path.

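The headerless branch described above can be sketched standalone with pandas; the sample data, file paths, and pipe delimiter below are illustrative, not part of the library:

```python
import os
import tempfile

import pandas as pd

# Create a throwaway headerless txt file to convert.
tmpdir = tempfile.mkdtemp()
txt_path = os.path.join(tmpdir, "students.txt")
with open(txt_path, "w") as f:
    f.write("1001|Ada|Lovelace\n1002|Alan|Turing\n")

# Mirror the has_header=False, column_names=None behavior: read as str so
# pandas does not re-type values, then label columns 1..n.
df = pd.read_csv(txt_path, delimiter="|", dtype=str, header=None)
df.columns = df.columns + 1
csv_path = os.path.splitext(txt_path)[0] + ".csv"
df.to_csv(csv_path, index=False)
```

With `has_header=True` the first row of the file would instead be kept as the header, and passing `column_names` replaces the integer labels with explicit names.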
### txt_files_to_csv
Convert all txt files in a directory to csv files. Also works with a
single txt file path and can be optionally configured to process files in
all subdirectories.

Args:
- path_in (str): A file or directory path containing zero or more txt files.
- path_out (str): A file or directory path to write csv file(s) to. If
None, the input path is used. Note that a file retains its original
name except with a .csv extension.
- delimiter (str): A txt file delimiter. Note that this function assumes
that all txt files in an input directory use the same delimiter.
- has_header (bool): If True, use the first row of the txt file(s) as a
column header. If False, insert a column header using the column_names
arg. Default is True.
- column_names (list[str]): An ordered list of column names to use in the
output csv(s). If 'None' and has_header is False, insert an ordered,
integer column header (e.g. 1, 2, ..., n where n is the number of
columns).
- delete_txt (bool): If True, delete all of the input txt files.
- include_subdirs (bool): If True, process all files in all subdirectories.
If False, only process files in the top level of the specified
directory. Default is False.

Returns:
- path_out (str): A file or directory path containing the output csv
file(s).

-----

</details>


## ftp
FTP- and SFTP-utility helpers

@@ -623,7 +684,113 @@ For example, `/ed-fi/apiClients/districts-2425-ds5/{tenant_code}/prod/Stadium` w

</details>

## SharefileToSnowflakeDag
`SharefileToSnowflakeDag` is an Airflow DAG that automates the process of
transferring files from ShareFile to Snowflake. The DAG retrieves txt and csv
files from a specified ShareFile location, transforms them into JSONL format,
uploads the files to an S3 bucket, and finally loads the data into a Snowflake
database.

<details>
<summary>Arguments:</summary>

-----

| Argument | Description |
|-------------------------|--------------------------------------------------------------------------|
| sharefile_conn_id | A Sharefile connection ID. |
| local_base_path | A base local path for downloading files. |
| s3_conn_id | An Airflow connection ID for AWS S3. |
| s3_bucket               | An S3 bucket in which to stage files.                                     |
| snowflake_conn_id | An Airflow connection ID for Snowflake. |
| snowflake_database | A Snowflake database name. |
| snowflake_schema | A Snowflake schema name. |
| **kwargs | Additional arguments to pass to the Airflow DAG. |

-----

</details>

<details>
<summary>Methods:</summary>

-----

**build_task_group()**

Builds a task group to load data from csv and txt files in a
Sharefile directory to a Snowflake table.

Note that the arguments specified here are relative to the class
arguments provided at instantiation. For example, the sharefile_path
argument is relative to the sharefile_conn_id specified at the class
level.

| Argument | Description |
|--------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| group_id | A name for the Airflow task group. |
| sharefile_source_path | A Sharefile path to extract data from. |
| sharefile_processed_path | A Sharefile path to move files to after they have been processed. If None, do not move processed files. Default is None. |
| local_rel_path | A local relative path to stage data in with respect to the class's local_base_path. This is also used to determine the staging S3 destination relative to the class's S3 bucket. |
| snowflake_table | A Snowflake table name to write data to. |
| txt_delimiter | A text delimiter used in the txt files to load. Default is ','. |
| txt_has_header           | If True, uses the first row of the txt file as a column header. If False, inserts a column header based on the txt_columns arg. Default is True. |
| txt_columns | An ordered list of column names in the txt files to load. If None and txt_has_header is False, then columns are labeled using integers (i.e. 1, 2, 3, ..., n, where n is the number of columns). Default is None. |
| custom_metadata | A mapping of metadata field names to values to include in the target Snowflake table. |
| full_refresh | If True, performs a full refresh load in Snowflake. Default is False. |
| csv_encoding | Optional encoding to use for csv files. Default is 'utf-8'. |
| **kwargs | Additional keyword arguments to pass to the task group. |

-----

</details>

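Putting the two tables above together, a DAG definition file might look roughly like the following sketch. Every value is a placeholder, and passing `dag_id` through `**kwargs` to the underlying Airflow DAG is an assumption rather than a documented parameter:

```python
from ea_airflow_util import SharefileToSnowflakeDag

# Hypothetical values throughout; substitute real connection IDs, paths, and names.
dag_builder = SharefileToSnowflakeDag(
    sharefile_conn_id="sharefile_default",
    local_base_path="/tmp/sharefile_staging",
    s3_conn_id="aws_default",
    s3_bucket="my-staging-bucket",
    snowflake_conn_id="snowflake_default",
    snowflake_database="RAW",
    snowflake_schema="SHAREFILE",
)

dag_builder.build_task_group(
    group_id="resource_1",
    sharefile_source_path="path/to/folder",
    sharefile_processed_path=None,  # None: processed files stay where they are
    local_rel_path="resource_1",
    snowflake_table="RESOURCE_1",
    txt_delimiter="|",
    txt_has_header=False,
    txt_columns=None,  # columns labeled 1..n
)
```
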
<details>
<summary>Example Yaml File:</summary>

```yaml
default_args: &default_args
owner:
run_as_user:
depends_on_past:
start_date:
email:
email_on_failure: False
retries: 0
trigger_rule:
retry_delay:
execution_timeout:
sla:

### Sharefile to Snowflake DAGs
sharefile_to_snowflake_dags__default_args: &sharefile_to_snowflake_dags__default_args
sharefile_conn_id:
# Null here means processed files will not be moved in Sharefile
sharefile_processed_dir:

local_base_path:

s3_conn_id:
s3_bucket:

snowflake_conn_id:
snowflake_database:
snowflake_schema:

airflow_default_args: *default_args
schedule_interval: null

sharefile_to_snowflake_dags:
resource_1:
<<: *sharefile_to_snowflake_dags__default_args
sharefile_base_path: path/to/folder
snowflake_table:
resource_2:
<<: *sharefile_to_snowflake_dags__default_args
sharefile_base_path: path/to/folder
snowflake_table:
```
</details>

# Providers
Finally, this package contains a handful of custom DBT operators to be used as an alternative to PythonOperators.
1 change: 1 addition & 0 deletions ea_airflow_util/__init__.py
@@ -8,6 +8,7 @@
from ea_airflow_util.dags.dbt_snapshot_dag import DbtSnapshotDag
from ea_airflow_util.dags.sftp_to_snowflake_dag import SFTPToSnowflakeDag
from ea_airflow_util.dags.sharefile_custom_users_dag import LoadSharefileCustomUsersDag
from ea_airflow_util.dags.sharefile_to_snowflake_dag import SharefileToSnowflakeDag

from ea_airflow_util.callables.airflow import xcom_pull_template
from ea_airflow_util.callables import slack as slack_callbacks
128 changes: 128 additions & 0 deletions ea_airflow_util/callables/ea_csv.py
@@ -0,0 +1,128 @@
import os
import pandas as pd


def txt_to_csv(
    file_in,
    file_out=None,
    delimiter=',',
    has_header=True,
    column_names=None,
    delete_txt=False
):
    """Convert a txt file to a csv.

    Args:
    - file_in (str): A path to a txt file.
    - file_out (str): A path to a csv file. If None, the input file path is
      reused with a .csv extension.
    - delimiter (str): A txt file delimiter.
    - has_header (bool): If True, use the first row of the txt file as a column
      header. If False, insert a column header using the column_names arg.
      Default is True.
    - column_names (list[str]): An ordered list of column names to use in the
      output csv. If None and has_header is False, insert an ordered, integer
      column header (e.g. 1, 2, ..., n where n is the number of columns).
    - delete_txt (bool): If True, delete the input txt file.

    Returns:
    - file_out (str): A csv file path.
    """

    if has_header:
        # Force str dtype, otherwise pandas will do things like cast ints to floats
        df = pd.read_csv(file_in, delimiter=delimiter, dtype=str)
    else:
        df = pd.read_csv(file_in, delimiter=delimiter, dtype=str, header=None)
        if column_names is not None:
            df.columns = column_names
        else:
            # 1-indexed column labels can simplify downstream processing
            df.columns = df.columns + 1

    if file_out is None:
        file_out = os.path.splitext(file_in)[0] + '.csv'

    df.to_csv(file_out, index=False)

    if delete_txt:
        os.remove(file_in)

    return file_out


def txt_files_to_csv(
    path_in,
    path_out=None,
    delimiter=',',
    has_header=True,
    column_names=None,
    delete_txt=False,
    include_subdirs=False
):
    """Convert all txt files in a directory to csv files. Also works with a
    single txt file path and can be optionally configured to process files in
    all subdirectories.

    Args:
    - path_in (str): A file or directory path containing zero or more txt files.
    - path_out (str): A file or directory path to write csv file(s) to. If
      None, the input path is used. Note that a file retains its original
      name except with a .csv extension.
    - delimiter (str): A txt file delimiter. Note that this function assumes
      that all txt files in an input directory use the same delimiter.
    - has_header (bool): If True, use the first row of the txt file(s) as a
      column header. If False, insert a column header using the column_names
      arg. Default is True.
    - column_names (list[str]): An ordered list of column names to use in the
      output csv(s). If None and has_header is False, insert an ordered,
      integer column header (e.g. 1, 2, ..., n where n is the number of
      columns).
    - delete_txt (bool): If True, delete all of the input txt files.
    - include_subdirs (bool): If True, process all files in all subdirectories.
      If False, only process files in the top level of the specified
      directory. Default is False.

    Returns:
    - path_out (str): A file or directory path containing the output csv
      file(s).
    """

    # os.walk yields nothing for a plain file path, so handle a single txt
    # file directly.
    if os.path.isfile(path_in):
        return txt_to_csv(
            file_in=path_in,
            file_out=path_out,
            delimiter=delimiter,
            has_header=has_header,
            column_names=column_names,
            delete_txt=delete_txt
        )

    for root, _, files in os.walk(path_in):

        for file in files:

            # Only process txt files
            if not file.endswith('.txt'):
                continue

            filepath_in = os.path.join(root, file)
            dir_out = root if path_out is None else path_out
            filename_out = os.path.splitext(file)[0] + '.csv'
            filepath_out = os.path.join(dir_out, filename_out)

            txt_to_csv(
                file_in=filepath_in,
                file_out=filepath_out,
                delimiter=delimiter,
                has_header=has_header,
                column_names=column_names,
                delete_txt=delete_txt
            )

        if not include_subdirs:
            break

    if path_out is None:
        path_out = path_in

    return path_out
3 changes: 2 additions & 1 deletion ea_airflow_util/callables/jsonl.py
@@ -60,6 +60,7 @@ def translate_csv_file_to_jsonl(
delete_csv : bool = False,
metadata_dict: Optional[dict] = None,
to_snake_case: bool = False,
csv_encoding: str = 'utf-8',
**kwargs
):
"""
@@ -95,7 +96,7 @@
output_path_new = output_path

try:
with open(full_local_path, 'r') as reader:
with open(full_local_path, 'r', encoding=csv_encoding) as reader:
json_records = csv.DictReader(reader)
serialize_json_records_to_disk(json_records, output_path_new, "w", metadata_dict, to_snake_case, **kwargs)
except UnicodeDecodeError:
2 changes: 1 addition & 1 deletion ea_airflow_util/dags/s3_to_snowflake_dag.py
@@ -36,7 +36,7 @@ def __init__(self,
is_manual_upload: bool = False,

pool: str,
full_replace: bool = False, #TODO once on latest version of airflow, use dagrun parameter to allow full_replace runs even if not set here at dag level
full_replace: bool = False,

do_delete_from_source: bool = True,
**kwargs