
Commit d26b7ef

Merge pull request #649 from NEONScience/enviroscan

Enviroscan

2 parents 6d277da + b9576ea, commit d26b7ef

37 files changed: 721 additions & 51 deletions
New workflow: DEV-enviroscan-site-list

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+name: DEV-enviroscan-site-list
+on:
+  push:
+    branches:
+      - 'master'
+    paths:
+      - 'pipe/enviroscan/site-list*.json'
+  workflow_dispatch: {} # Allows trigger of workflow from web interface
+jobs:
+  put_files:
+    runs-on: arc-neon-gke
+    #runs-on: ubuntu-latest
+    env:
+      PACHD_ADDRESS: grpcs://pachyderm-dev.transitions-nonprod.gcp.neoninternal.org:443
+      PACH_TOKEN: ${{ secrets.RepoOwnerPachydermDev }}
+      REPO: enviroscan_site_list # Pachyderm repo
+      BRANCH: master
+      IN_PATHS: 'pipe/enviroscan/site-list.json' # Comma-separated list (no spaces) of one or more paths or directories. Length must match OUT_PATHS. If a directory, all files in it will be placed in Pachyderm at the corresponding entry of OUT_PATHS.
+      OUT_PATHS: 'site-list.json' # Comma-separated list (no spaces) of corresponding path(s) at which to place the file(s) in Pachyderm. Must be the same length as IN_PATHS. If the corresponding entry in IN_PATHS is a file, specify a file path; if it is a directory, specify a directory path.
+    steps:
+      - uses: actions/checkout@v4
+      - run: ls -la
+
+      - name: Put file
+        uses: ./.github/actions/put-files
+        with:
+          pachd_address: ${{ env.PACHD_ADDRESS }}
+          pach_token: ${{ env.PACH_TOKEN }}
+          repo_name: ${{ env.REPO }}
+          branch_name: ${{ env.BRANCH }}
+          in_paths: ${{ env.IN_PATHS }}
+          out_paths: ${{ env.OUT_PATHS }}
+
+
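The put-files action itself is referenced but not shown in this commit. As a rough sketch of the positional IN_PATHS/OUT_PATHS contract the comments above describe — entry i of IN_PATHS lands at entry i of OUT_PATHS in the Pachyderm repo — illustrative code only, not the action's implementation:

# Illustrative only: how a put-files-style action might pair IN_PATHS with OUT_PATHS.
import os

in_paths = os.environ['IN_PATHS'].split(',')    # e.g. 'pipe/enviroscan/site-list.json'
out_paths = os.environ['OUT_PATHS'].split(',')  # e.g. 'site-list.json'
if len(in_paths) != len(out_paths):
    raise ValueError('IN_PATHS and OUT_PATHS must have the same length')
for src, dest in zip(in_paths, out_paths):
    print(f'would put {src} -> {dest} in the Pachyderm repo')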
New workflow: DEV-enviroscan-update-dag

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+name: DEV-enviroscan-update-dag
+on:
+  push:
+    branches:
+      - 'master'
+    paths:
+      - 'pipe/enviroscan/*.yaml'
+      - 'pipe/enviroscan/pipe_list_enviroscan.txt'
+  workflow_dispatch: {} # Allows trigger of workflow from web interface
+
+jobs:
+  # -------------------------------------------------------------
+  # Using GitHub's API is not supported for push events
+  # -------------------------------------------------------------
+  #
+  # ----------------------------------------------------------------------------------------------
+  # Using local .git history
+  # ----------------------------------------------------------------------------------------------
+  # Event `push`: Compare the preceding remote commit -> to the current commit of the main branch
+  # ----------------------------------------------------------------------------------------------
+
+  changed_files:
+    runs-on: ubuntu-latest # windows-latest || macos-latest
+    outputs:
+      # Use this changed_file_list if you plan to use get-changed-files-action
+      changed_file_list: ${{ steps.changed-files-action.outputs.changed_file_list }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.
+
+      # Using get-changed-files-action
+      - name: Get changed files action
+        id: changed-files-action
+        uses: ./.github/actions/get-changed-files
+
+  update_pipelines:
+    needs: changed_files
+    runs-on: arc-neon-gke
+    #runs-on: ubuntu-latest
+    env:
+      PACHD_ADDRESS: grpcs://pachyderm-dev.transitions-nonprod.gcp.neoninternal.org:443
+      PACH_TOKEN: ${{ secrets.RepoOwnerPachydermDev }}
+      PATHS: 'pipe/enviroscan' # Separate multiple with comma (e.g. 'pipe/pqs1,pipe/parWaterSurface'). Order matters.
+      TRANSACTION: True
+      UPDATE_SCOPE: changed # 'all' or 'changed'. If not specified, all will be updated. 'changed' will update/create any changed/non-existent pipelines.
+      CHANGED_FILES: ${{ needs.changed_files.outputs.changed_file_list }}
+    steps:
+      - uses: actions/checkout@v4
+      - run: ls -la
+
+      - name: Update pipelines
+        uses: ./.github/actions/update-pipelines
+        with:
+          pachd_address: ${{ env.PACHD_ADDRESS }}
+          pach_token: ${{ env.PACH_TOKEN }}
+          paths: ${{ env.PATHS }}
+          transaction: ${{ env.TRANSACTION }}
+          update_scope: ${{ env.UPDATE_SCOPE }}
+          changed_files: ${{ env.CHANGED_FILES }}
+
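With fetch-depth: 0 the full history is available to the runner, so the get-changed-files action can compare the preceding commit to the current one, as the workflow comments describe. A sketch of the underlying comparison, assuming plain git (the action's actual implementation is not shown in this commit):

# Sketch: list files changed between the preceding commit and HEAD,
# the push-event comparison described above.
import subprocess

result = subprocess.run(
    ['git', 'diff', '--name-only', 'HEAD^', 'HEAD'],
    capture_output=True, text=True, check=True,
)
changed_files = result.stdout.splitlines()
print(','.join(changed_files))  # e.g. handed downstream as CHANGED_FILES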

.github/workflows/build_push_calibration_group_and_convert.yml

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ on:
     paths:
       - 'modules_combined/calibration_group_and_convert/**'
       - 'modules/filter_joiner/**'
+      - 'modules/array_parser/**'
       - 'modules/common/**'
       - 'flow/flow.kfka.comb/**'
       - 'flow/flow.cal.conv/**'

modules/array_parser/array_parser.py

Lines changed: 9 additions & 7 deletions
@@ -36,15 +36,17 @@ def parse(config: Config) -> None:
 
 def link_calibration_file(path: Path, out_path, schema_data: SchemaData) -> None:
     stream_id = calibration_file_parser.get_stream_id(path)
-    field_name = schema_data.mapping.get(stream_id)
+    field_name = schema_data.calibration_mapping.get(stream_id)
     link_path = Path(out_path, field_name, path.name)
-    log.debug(f'calibration link: {link_path}')
-    link_path.parent.mkdir(parents=True, exist_ok=True)
-    link_path.symlink_to(path)
+    if not link_path.exists():
+        log.debug(f'calibration link: {link_path}')
+        link_path.parent.mkdir(parents=True, exist_ok=True)
+        link_path.symlink_to(path)
 
 
 def link_data_file(path: Path, out_path: Path) -> None:
     link_path = Path(out_path, path.name)
-    link_path.parent.mkdir(parents=True, exist_ok=True)
-    log.debug(f'data link: {link_path}')
-    link_path.symlink_to(path)
+    if not link_path.exists():
+        link_path.parent.mkdir(parents=True, exist_ok=True)
+        log.debug(f'data link: {link_path}')
+        link_path.symlink_to(path)
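The new exists() guard makes linking idempotent: Path.symlink_to raises FileExistsError when the link is already present, so re-running the parser over the same datum no longer fails. A tiny standalone illustration of the failure mode the guard avoids (paths here are invented):

# Standalone illustration of the guard added above.
from pathlib import Path

target = Path('/tmp/target.txt')
target.touch()
link = Path('/tmp/link.txt')
if not link.exists():       # without this, a second run raises FileExistsError
    link.symlink_to(target)

(Note that Path.exists() follows symlinks, so a dangling link would still trip symlink_to; that edge case is outside the scope of this change.)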

modules/array_parser/array_parser_main.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ def main() -> None:
     schema_path: Path = env.path('SCHEMA_PATH')
     out_path: Path = env.path('OUT_PATH')
     parse_calibration = env.bool('PARSE_CALIBRATION')
-    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
+    log_level: str = env.str('LOG_LEVEL', 'INFO')
     source_type_index: int = env.int('SOURCE_TYPE_INDEX')
     year_index: int = env.int('YEAR_INDEX')
     month_index: int = env.int('MONTH_INDEX')
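If env here is an environs.Env (consistent with the env.path/env.bool/env.int calls above), env.log_level parses the variable into a numeric logging level, whereas env.str keeps the level name as a plain string, which matches the str annotation. A small sketch under that assumption:

# Sketch assuming the 'environs' package, matching the env.* calls above.
from environs import Env

env = Env()
log_level: str = env.str('LOG_LEVEL', 'INFO')  # plain string, e.g. 'DEBUG' or 'INFO'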

modules/array_parser/data_file_parser.py

Lines changed: 29 additions & 9 deletions
@@ -63,21 +63,41 @@ def write_restructured_file(path: Path, out_path: Path, schema: Path) -> None:
     :param schema: The new schema for the reordered file.
     :return: None
     """
-    table = pq.read_table(path)
-    data_values = table.column(3)
-    data_type: pa.lib.ListType = data_values.type
+
+    # Read the schema
     schema_data: SchemaData = schema_parser.parse_schema_file(schema)
     field_names = schema_data.field_names
-    new_columns: List[list] = create_columns(field_names)
-    populate_columns(table, field_names, data_values, new_columns)
-    for i in range(0, len(new_columns)):
+
+    # Parse the array(s) into the new table
+    table = pq.read_table(path)
+    column_names = table.column_names
+    array_names = set(schema_data.data_mapping.values())
+    for array_name in array_names:
+        column_index = column_names.index(array_name)
+        data_values = table.column(column_index)
+        array_field_names = [key for key, value in schema_data.data_mapping.items() if value == array_name]  # field names pertaining to this array
+        parsed_columns: List[list] = create_columns(array_field_names)
+        data_type: pa.lib.ListType = data_values.type
+        populate_columns(table, array_field_names, data_values, parsed_columns)
+
         # convert to arrays with the appropriate type
-        column: pa.Array = pa.array(new_columns[i], data_type.value_type)
-        table: pa.Table = table.append_column(field_names[i], column)  # add column to table
-    table = table.remove_column(3)  # remove original data array from table
+        for i in range(0, len(parsed_columns)):
+            column: pa.Array = pa.array(parsed_columns[i], data_type.value_type)
+            table: pa.Table = table.append_column(array_field_names[i], column)  # add column to table
+
+    # remove original data arrays from table
+    for array_name in array_names:
+        column_names = table.column_names
+        column_index = column_names.index(array_name)
+        table = table.remove_column(column_index)
+
+    # Rearrange columns to match the parsed schema
+    table = table.select(field_names)
     metadata = get_metadata(schema_data)
     table = table.replace_schema_metadata(metadata)
     log.debug(f'modified_table:\n{table}')
+
+    # Output
     file_path = Path(out_path, path.name)
     file_path.parent.mkdir(parents=True, exist_ok=True)
     file_path.touch()
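The restructuring above explodes each raw array column into one scalar column per schema field, drops the array columns, and reorders to match the schema. A self-contained toy version of the same pyarrow technique (table, column, and field names here are invented for illustration, not taken from NEON data):

# Toy illustration of the array-parsing technique; not the NEON module itself.
import pyarrow as pa

table = pa.table({
    'readout_time': ['2024-01-01T00:00:00Z'],
    'depths': [[0.5, 1.5, 2.5]],  # raw array column, as named by '__raw_array_name'
})
data_mapping = {'depth_0': 'depths', 'depth_1': 'depths', 'depth_2': 'depths'}

for array_name in set(data_mapping.values()):
    data_values = table.column(table.column_names.index(array_name))
    field_names = [k for k, v in data_mapping.items() if v == array_name]
    rows = data_values.to_pylist()  # one python list per row
    for i, field_name in enumerate(field_names):
        # element i of every row becomes its own column, typed like the list's values
        column = pa.array([row[i] for row in rows], data_values.type.value_type)
        table = table.append_column(field_name, column)
    table = table.remove_column(table.column_names.index(array_name))

print(table.column_names)  # ['readout_time', 'depth_0', 'depth_1', 'depth_2']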

modules/array_parser/schema_parser.py

Lines changed: 16 additions & 8 deletions
@@ -8,31 +8,39 @@ class SchemaData(NamedTuple):
     schema: str
     source_type: str
     field_names: List[str]
-    mapping: dict
+    parse_field_names: List[str]
+    calibration_mapping: dict
+    data_mapping: dict
 
 
 def parse_schema_file(path: Path) -> SchemaData:
     """
-    Get the mapping between stream IDs and schema field names.
+    Get the mapping between stream IDs and schema field names for any applicable calibration data.
+    Also get the mapping between schema field names and array names.
 
     :param path: The file path.
-    :return: The source name and the mapping between stream IDs and schema field names.
+    :return: The source name and the mappings: stream IDs -> schema field names, and schema field names -> array names (i.e. which array each field is in).
     """
-    field_exclusions = ['source_id', 'site_id', 'readout_time']
+    field_exclusions = ['source_id', 'site_id', 'readout_time']  # Assumes all other fields are fields to be parsed.
     with open(str(path), 'r') as file:
         json_data = json.load(file)
     source_type = json_data['source']
     fields = json_data['fields']
-    mapping = {}
+    calibration_mapping = {}
+    data_mapping = {}
     field_names = []
+    parse_field_names = []
     for field in fields:
         name = field['name']
+        field_names.append(name)
         if name not in field_exclusions:
-            field_names.append(name)
+            parse_field_names.append(name)
             try:
                 stream_id = field['__neon_stream_id']
-                mapping[stream_id] = name
+                array_name = field['__raw_array_name']
+                calibration_mapping[stream_id] = name
+                data_mapping[name] = array_name
             except KeyError:
                 continue
     schema = json.dumps(json_data)
-    return SchemaData(schema=schema, source_type=source_type, field_names=field_names, mapping=mapping)
+    return SchemaData(schema=schema, source_type=source_type, field_names=field_names, parse_field_names=parse_field_names, calibration_mapping=calibration_mapping, data_mapping=data_mapping)
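To make the two mappings concrete, here is a hypothetical schema fragment and what parse_schema_file would derive from it (all field names and stream IDs invented):

# Hypothetical schema fields; '__neon_stream_id' ties a field to a calibration
# stream, '__raw_array_name' names the raw array column the field is parsed from.
fields = [
    {'name': 'readout_time'},  # excluded from parsing
    {'name': 'depth_0', '__neon_stream_id': '101', '__raw_array_name': 'depths'},
    {'name': 'depth_1', '__neon_stream_id': '102', '__raw_array_name': 'depths'},
]
# parse_schema_file would yield:
#   field_names         = ['readout_time', 'depth_0', 'depth_1']
#   parse_field_names   = ['depth_0', 'depth_1']
#   calibration_mapping = {'101': 'depth_0', '102': 'depth_1'}
#   data_mapping        = {'depth_0': 'depths', 'depth_1': 'depths'}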

modules/calval_loader/get_calibration_stream_name.py

Lines changed: 2 additions & 1 deletion
@@ -28,12 +28,13 @@ def get_calibration_stream_name(connection, asset_type: str, stream_number: int)
     and
     is_asset_definition.sensor_type_name = %(sensor_type_name)s
     '''
+    # print(f'Finding stream name for asset_type: {asset_type} and stream_number: {stream_number}')
     with closing(connection.cursor()) as cursor:
         cursor.execute(sql, dict(sensor_type_name=asset_type, stream_number=stream_number))
         row = cursor.fetchone()
         if row is None:
             logging.error(f'Stream name not found for stream ID {stream_number} and asset type {asset_type}.')
             return None
         stream_name = row[0]
-        # print(f'stream_name: {stream_name}')
+        # print(f'asset_type: {asset_type} stream_name: {stream_name}')
         return stream_name

modules/calval_loader/load_all_calval_files.py

Lines changed: 2 additions & 3 deletions
@@ -48,9 +48,8 @@ def load() -> None:
         stream_id = root.find('StreamCalVal').find('StreamID').text
         stream_name = get_calibration_stream_name(connector.get_connection(), avro_schema_name,
                                                   stream_id)
-        print('repo name , asset_id, stream_name, filename are :', avro_schema_name, " ", asset_id,
-              " ",
-              stream_name, " ", filename)
+        print('schema name , asset_id, stream_id, stream_name, filename are :', avro_schema_name, " ", asset_id,
+              " ", stream_id, " ", stream_name, " ", filename)
         try:
             output_path = Path(output_directory, avro_schema_name, asset_id, stream_name, filename)
             output_path.parent.mkdir(parents=True, exist_ok=True)

modules_combined/calibration_group_and_convert/Dockerfile

Lines changed: 5 additions & 1 deletion
@@ -1,4 +1,4 @@
-# Dockerfile for NEON IS Data Processing - combined filter-joiner, kafka combiner, and Calibration Conversion
+# Dockerfile for NEON IS Data Processing - combined filter-joiner, kafka combiner, array parser, and Calibration Conversion
 # Example command (must be run from project parent directory to include modules/ and flow/ paths in Docker context):
 # docker build -t neon-is-cal-grp-conv -f ./modules_combined/calibration_group_and_convert/Dockerfile .
 
@@ -12,13 +12,15 @@ MAINTAINER "Cove Sturtevant" csturtevant@battelleecology.org
 # Add in the python-based filter-joiner module
 ARG MODULE_DIR="modules"
 ARG APP_DIR="filter_joiner"
+ARG APP_DIR_2="array_parser"
 ARG COMMON_DIR="common"
 ARG CONTAINER_APP_DIR="/usr/src/app"
 ENV PYTHONPATH="${PYTHONPATH}:${CONTAINER_APP_DIR}"
 
 WORKDIR ${CONTAINER_APP_DIR}
 
 COPY ${MODULE_DIR}/${APP_DIR}/requirements.txt ${CONTAINER_APP_DIR}/${APP_DIR}/requirements.txt
+COPY ${MODULE_DIR}/${APP_DIR_2}/requirements.txt ${CONTAINER_APP_DIR}/${APP_DIR_2}/requirements.txt
 
 
 RUN apt update && \
@@ -27,6 +29,7 @@ RUN apt update && \
     apt install -y python3-pip && \
     python3 -mpip install --no-cache-dir --upgrade pip setuptools wheel && \
    python3 -mpip install --no-cache-dir -r ${CONTAINER_APP_DIR}/${APP_DIR}/requirements.txt && \
+    python3 -mpip install --no-cache-dir -r ${CONTAINER_APP_DIR}/${APP_DIR_2}/requirements.txt && \
     apt-get autoremove -y && \
     apt-get autoclean -y && \
     rm -rf /var/lib/apt/lists/* && \
@@ -35,6 +38,7 @@ RUN apt update && \
 
 # Copy in python code
 COPY ${MODULE_DIR}/${APP_DIR} ${CONTAINER_APP_DIR}/${APP_DIR}
+COPY ${MODULE_DIR}/${APP_DIR_2} ${CONTAINER_APP_DIR}/${APP_DIR_2}
 COPY ${MODULE_DIR}/${COMMON_DIR} ${CONTAINER_APP_DIR}/${COMMON_DIR}