From a36442be49f6d595291d4bd343bbfc741a771c97 Mon Sep 17 00:00:00 2001 From: aliziel <21992503+aliziel@users.noreply.github.com> Date: Fri, 3 Apr 2026 12:11:31 -0400 Subject: [PATCH 1/2] ci: reference head via environment --- .github/workflows/promotion-checker.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/promotion-checker.yml b/.github/workflows/promotion-checker.yml index 583db7a..611436b 100644 --- a/.github/workflows/promotion-checker.yml +++ b/.github/workflows/promotion-checker.yml @@ -38,7 +38,7 @@ jobs: - name: Check if pr.yml succeeded id: check_pr_workflow run: | - WORKFLOW_STATUS=$(gh run list --repo ${{ github.repository }} --branch ${{ github.event.pull_request.head.ref }} --workflow "pr.yml" --json status --jq '.[0].status // "not found"') + WORKFLOW_STATUS=$(gh run list --repo ${{ github.repository }} --branch "$HEAD_REF" --workflow "pr.yml" --json status --jq '.[0].status // "not found"') echo "Previous workflow (pr.yml) status: $WORKFLOW_STATUS" if [ "$WORKFLOW_STATUS" != "completed" ]; then echo "Previous workflow (pr.yml) did not complete successfully. Exiting..." 
@@ -46,6 +46,7 @@ jobs: fi env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HEAD_REF: ${{ github.event.pull_request.head.ref }} trigger-promotion-workflow: runs-on: ubuntu-latest From a35ac7ed127c45ba231d63acfef7505e7bee0303 Mon Sep 17 00:00:00 2001 From: aliziel <21992503+aliziel@users.noreply.github.com> Date: Mon, 6 Apr 2026 12:02:53 -0700 Subject: [PATCH 2/2] chore: lint checks --- README.md | 26 ++++++++++++++++---------- requirements.in | 2 +- scripts/generate_mdx.py | 2 +- scripts/promote_collection.py | 2 +- scripts/promote_dataset.py | 2 +- tests/test_collections.py | 2 +- 6 files changed, 21 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 482fefc..25d631f 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,12 @@ # veda-data + This repository houses data and config used to create STAC records to be published to the veda STAC catalog. -# Repository layout +## Repository layout + The repo follows the following folder structure: -``` +```plain | ingestion-data | collections | archive @@ -14,10 +16,10 @@ The repo follows the following folder structure: . collection-n.json | discovery-items | archive - . archived-discovery-items-1.json - . archived-discovery-items-2.json + . archived-discovery-items-1.json + . archived-discovery-items-2.json ... - . archived-discovery-items-n.json + . archived-discovery-items-n.json . discovery-items-1.json . discovery-items-2.json ... @@ -28,12 +30,15 @@ The repo follows the following folder structure: ## ingestion-data ### collections + These are STAC collection records for all the available datasets. They should conform to the [STAC specification for a collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md). #### archive + These are the collections that we no longer update. However, we might still maintain them in the catalog. 
### discovery-items + These are the items ingestion config files that are used by our data pipelines (airflow), specifically the `veda_discover` DAG in [veda-data-airflow](https://github.com/NASA-IMPACT/veda-data-airflow), which discovers all the specified files and triggers the `veda_ingest_raster` DAG which takes care of creating the stac items and publishing them. The format looks like this: @@ -63,10 +68,10 @@ The format looks like this: ``` ### transfer-config -These are the configs used to transfer assets from the dev bucket (`ghgc-data-store-develop` - where the data was delivered) to the production bucket (`ghgc-data-store` - where the data is moved after it is finalized). The files from the production bucket is used to publish to the catalog. The transfer is done via triggering `veda_transfer` DAG in [veda-data-airflow](https://github.com/NASA-IMPACT/veda-data-airflow). -##### Description of each field: +These are the configs used to transfer assets from the dev bucket (`ghgc-data-store-develop` - where the data was delivered) to the production bucket (`ghgc-data-store` - where the data is moved after it is finalized). The files from the production bucket are used to publish to the catalog. The transfer is done via triggering `veda_transfer` DAG in [veda-data-airflow](https://github.com/NASA-IMPACT/veda-data-airflow). +#### Description of each field | Field | Description | |--------------------|--------------------------------------------------| @@ -75,16 +80,17 @@ These are the configs used to transfer assets from the dev bucket (`ghgc-data-st | `prefix` | The s3 prefix under which to search for the files | | `filename_regex` | The regex pattern that the files to be discovered should match | | `id_regex` | Specifies in regex what part of the filename (usually the datetime) should be used to group assets into item. 
Example: if the filenames are `asset1_20151201.tif`, `asset2_20151201.tif`, `asset1_20161201.tif`, `asset2_20161201.tif`; the item should be based on the datetime part, hence it'd be `".*_(.*).tif$"`. The part should be specified using round brackets. The is also the part of the filenames that will be used to form the item id, together with the `id_template` field. | -| `id_template` | This is a python f-string formatted string that is used to define the `id` of the STAC item. It's used together with the value of `id_regex`. So, going off of the example above, if the `id_template` is `eccodarwin-{}`, then the two item `id`s would be `eccodarwin-20151201` and `eccodarwin-20161201` | +| `id_template` | This is a python f-string formatted string that is used to define the `id` of the STAC item. It's used together with the value of `id_regex`. So, going off of the example above, if the `id_template` is `eccodarwin-{}`, then the two item `id`s would be `eccodarwin-20151201` and `eccodarwin-20161201` | | `datetime_range` | This is used to extract the datetime range from the filename. Valid values are `day`, `month` and `year`. Example: if the filename has `20160104` in it, and `datetime_range` is `day` - the `start_datetime` and `end_datetime` are the start and end of the day. For `month`, they are the start and end of the month and so on. | | `` | An `id` for the asset | | `assets..title` | A title for the asset | | `assets..description` | A description for the asset | | `assets..regex` | The regex pattern that matches a filename to its respective asset | +#### config archive -#### archive These are the discovery-items config for collections that we no longer update. ## notebooks -Sometimes, there are exceptional datasets that might require a one-off ingestion that is not supported by the current state of our data pipelines. In such cases, we create notebooks/python scripts that can be used to ingest those data. This is where those notebooks/python scripts live. 
\ No newline at end of file + +Sometimes, there are exceptional datasets that might require a one-off ingestion that is not supported by the current state of our data pipelines. In such cases, we create notebooks/python scripts that can be used to ingest those data. This is where those notebooks/python scripts live. diff --git a/requirements.in b/requirements.in index 8a1b326..8f9b637 100644 --- a/requirements.in +++ b/requirements.in @@ -3,4 +3,4 @@ pip-tools pre-commit pystac[validation] pytest -ruff \ No newline at end of file +ruff diff --git a/scripts/generate_mdx.py b/scripts/generate_mdx.py index 92db411..e8726d5 100644 --- a/scripts/generate_mdx.py +++ b/scripts/generate_mdx.py @@ -127,4 +127,4 @@ def safe_open_w(path): ofile.write(new_content) collection_id = input_data["collection"] - print(collection_id) \ No newline at end of file + print(collection_id) diff --git a/scripts/promote_collection.py b/scripts/promote_collection.py index 49a79d2..abc26e6 100644 --- a/scripts/promote_collection.py +++ b/scripts/promote_collection.py @@ -77,4 +77,4 @@ def trigger_collection_dag(payload: Dict[str, Any], stage: str): except FileNotFoundError: print(f"Error: File '{sys.argv[1]}' not found.") except json.JSONDecodeError: - raise ValueError(f"Invalid JSON content in file {sys.argv[1]}") \ No newline at end of file + raise ValueError(f"Invalid JSON content in file {sys.argv[1]}") diff --git a/scripts/promote_dataset.py b/scripts/promote_dataset.py index 9558cd1..e2ea38f 100644 --- a/scripts/promote_dataset.py +++ b/scripts/promote_dataset.py @@ -131,4 +131,4 @@ def promote_to_production(payload): except FileNotFoundError: print(f"Error: File '{sys.argv[1]}' not found.") except json.JSONDecodeError: - raise ValueError(f"Invalid JSON content in file {sys.argv[1]}") \ No newline at end of file + raise ValueError(f"Invalid JSON content in file {sys.argv[1]}") diff --git a/tests/test_collections.py b/tests/test_collections.py index 8c41827..3ab80ed 100644 --- 
a/tests/test_collections.py +++ b/tests/test_collections.py @@ -10,4 +10,4 @@ @pytest.mark.parametrize("path", COLLECTIONS_PATH.rglob("*.json")) def test_validate(path: Path) -> None: collection = Collection.from_file(str(path)) - collection.validate() \ No newline at end of file + collection.validate()