From 01d0799b8acb2be19d41bb0cbd4b070db4c0e5b2 Mon Sep 17 00:00:00 2001 From: Harrison Cook Date: Thu, 13 Nov 2025 09:59:36 +0000 Subject: [PATCH 1/5] feat: Add entrypoint for anemoi-datasets to PyEarthTools --- docs/api/api.md | 1 + docs/api/pipeline/pipeline_entrypoints.md | 51 +++++++++++++++++++ packages/pipeline/pyproject.toml | 4 ++ .../pipeline/entrypoints/anemoi.py | 48 +++++++++++++++++ 4 files changed, 104 insertions(+) create mode 100644 docs/api/pipeline/pipeline_entrypoints.md create mode 100644 packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py diff --git a/docs/api/api.md b/docs/api/api.md index e530a9cd..539ecf31 100644 --- a/docs/api/api.md +++ b/docs/api/api.md @@ -24,6 +24,7 @@ data/data_index data/data_api pipeline/pipeline_index pipeline/pipeline_api +pipeline/pipeline_entrypoints training/training_index training/training_api tutorial/tutorial_index diff --git a/docs/api/pipeline/pipeline_entrypoints.md b/docs/api/pipeline/pipeline_entrypoints.md new file mode 100644 index 00000000..8f96218b --- /dev/null +++ b/docs/api/pipeline/pipeline_entrypoints.md @@ -0,0 +1,51 @@ +# Pipeline Entrypoints + +As `PyEarthTools` pipelines provide a generic way to load and prepare various earth system datasets, it is possible to use +a pipeline as a source for [anemoi-datasets](https://anemoi.readthedocs.io/projects/datasets/en/latest/). + +## Example + +Below is a minimal example of using a `PyEarthTools` pipeline to load data and prepare it for `anemoi`, please see the `anemoi` docs +for more information on the `datasets` config. + +### Create the Pipeline + +.. code-block:: python + + import pyearthtools.data + import pyearthtools.pipeline + + pipeline = pyearthtools.pipeline.Pipeline( + pyearthtools.data.download.arcoera5.ARCOERA5(['t2m', 'u10', 'v10']), + pyearthtools.pipeline.operations.xarray.values.FillNan() + ) + pipeline.save('/PATH/TO/PIPELINE.yaml') + +### Create the anemoi-datasets config + +.. 
code-block:: yaml + + name: pyearthtools_to_anemoi + description: PyEarthTools Pipeline converted to Anemoi + attribution: PyEarthTools + + dates: + start: '2025-11-10T00:00:00' + end: '2025-11-12T00:00:00' + frequency: 1h + + input: + pyearthtools: # Use the pyearthtools input object + pipeline: /PATH/TO/PIPELINE.yaml + +### Run anemoi-datasets + +.. code-block:: bash + + anemoi-datasets create /path/to/anemoi/dataset.yaml + +## Contract + +The expected contract and result from the `PyEarthTools` pipeline is to return an `xarray` object of a single time index. + +Both tools provide methods to modify the metadata of the data, and should be used accordingly to prepare for downstream uses. diff --git a/packages/pipeline/pyproject.toml b/packages/pipeline/pyproject.toml index 2f2ef2b6..1a3a2188 100644 --- a/packages/pipeline/pyproject.toml +++ b/packages/pipeline/pyproject.toml @@ -29,6 +29,10 @@ dependencies = [ dynamic = ["version", "readme"] +[project.entry-points] +# Add PyEarthTools as an anemoi datasets source +"anemoi.datasets.create.sources".pyearthtools = "pyearthtools.pipeline.entrypoints.anemoi:pyearthtoolsSource" + [project.optional-dependencies] distributed = [ "dask", diff --git a/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py b/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py new file mode 100644 index 00000000..f6323740 --- /dev/null +++ b/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py @@ -0,0 +1,48 @@ +from functools import cached_property +from pathlib import Path + +from pyearthtools.pipeline import load +from pyearthtools.pipeline import Pipeline + +import earthkit.data as ekd +from anemoi.datasets.create.source import Source +from anemoi.datasets.create.typing import DateList + + +class pyearthtoolsSource(Source): + emoji = "🌏" # For tracing + + def __init__(self, context, pipeline: str | Path): + """Initialise the source. 
+ + Parameters + ---------- + context : Any + The context for the data source. + pipeline : str | Path + The path to the pyearthtools pipeline file. + """ + super().__init__(context) + self._pyearthtools_pipeline = pipeline + + @cached_property + def pipeline(self) -> Pipeline: + return load(self._pyearthtools_pipeline) + + def execute(self, dates: DateList) -> ekd.FieldList: + """Execute the source. + + Parameters + ---------- + dates : DateList + The input dates. + + Returns + ------- + ekd.FieldList + The output data. + """ + fields = [] + for date in dates: + fields.extend(ekd.from_object(self.pipeline[date.isoformat()])) # type: ignore + return ekd.FieldList.from_fields(fields) From 34154e8c6fae6a8178df80c50f376921721fd0f7 Mon Sep 17 00:00:00 2001 From: Harrison Cook Date: Thu, 13 Nov 2025 10:02:03 +0000 Subject: [PATCH 2/5] Fix indent --- docs/api/pipeline/pipeline_entrypoints.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/api/pipeline/pipeline_entrypoints.md b/docs/api/pipeline/pipeline_entrypoints.md index 8f96218b..77ea22e7 100644 --- a/docs/api/pipeline/pipeline_entrypoints.md +++ b/docs/api/pipeline/pipeline_entrypoints.md @@ -30,13 +30,13 @@ for more information on the `datasets` config. 
attribution: PyEarthTools dates: - start: '2025-11-10T00:00:00' - end: '2025-11-12T00:00:00' - frequency: 1h + start: '2025-11-10T00:00:00' + end: '2025-11-12T00:00:00' + frequency: 1h input: - pyearthtools: # Use the pyearthtools input object - pipeline: /PATH/TO/PIPELINE.yaml + pyearthtools: # Use the pyearthtools input object + pipeline: /PATH/TO/PIPELINE.yaml ### Run anemoi-datasets From 175e3a21ead5d5cd5eec6035342f9654ab5938ed Mon Sep 17 00:00:00 2001 From: Harrison Cook Date: Fri, 14 Nov 2025 09:46:53 +0000 Subject: [PATCH 3/5] Allow usage of pipeline directly --- .../src/pyearthtools/pipeline/entrypoints/anemoi.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py b/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py index f6323740..7947ca9b 100644 --- a/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py +++ b/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py @@ -12,7 +12,7 @@ class pyearthtoolsSource(Source): emoji = "🌏" # For tracing - def __init__(self, context, pipeline: str | Path): + def __init__(self, context, pipeline: str | Path | Pipeline): """Initialise the source. Parameters @@ -27,7 +27,10 @@ def __init__(self, context, pipeline: str | Path): @cached_property def pipeline(self) -> Pipeline: - return load(self._pyearthtools_pipeline) + pipeline = self._pyearthtools_pipeline + if isinstance(pipeline, Pipeline): + return pipeline + return load(pipeline) def execute(self, dates: DateList) -> ekd.FieldList: """Execute the source. 
From bd798592fca4ac19625669198c49f2a387a68e40 Mon Sep 17 00:00:00 2001 From: Harrison Cook Date: Fri, 14 Nov 2025 09:47:13 +0000 Subject: [PATCH 4/5] Add ECMWF License --- NOTICE.md | 2 ++ .../src/pyearthtools/pipeline/entrypoints/anemoi.py | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/NOTICE.md b/NOTICE.md index e4e78eba..691375f1 100644 --- a/NOTICE.md +++ b/NOTICE.md @@ -15,3 +15,5 @@ The file packages/data/src/pyearthtools/data/indexes/extensions.py extends and i The package packages/bundled_models/fourcastnext extends and is significantly based on the code from https://github.com/nci/FourCastNeXt which is made available under the Apache 2.0 license. That repository in turn extends the code from https://github.com/NVlabs/FourCastNet/, released under the BSD 3-Clause license. The FourCastNet model is described in detail at https://arxiv.org/abs/2202.11214. The FourCastNeXt model is described in detail at https://arxiv.org/abs/2401.05584, and a version of the FourCastNeXt code is bundled, adapted for compatibility and maintained within the PyEarthTools repository so it can continue to be a useful reference implementation and learning aid. The package packages/bundled_models/lucie extends and is based on the code from https://github.com/ISCLPennState/LUCIE, which is made available under the MIT license. The LUCIE model is described in detail at https://doi.org/10.48550/arXiv.2405.16297. The version of the model bundled in PyEarthTools may undergo changes associated with package maintenance and compatibility so it can continue to be a useful reference implementation and learning aid. Within that repository, those authors bundle the file "torch_harmonics_local.py", which is based on https://github.com/NVIDIA/torch-harmonics . The bundled file has an Apache 2.0 copyright statement included in it but at the time of writing the NVIDIA repository carries the BSD 3-clause license. 
Both of these licenses allow bundling to occur and all relevant files preserve the copyright statement within the files. Copyright for the original works go to the LUCIE and torch-harmonics developers respectively. + +The file packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py was originally developed by ECMWF, released under the Apache 2.0 license. diff --git a/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py b/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py index 7947ca9b..1ab829ab 100644 --- a/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py +++ b/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py @@ -1,3 +1,12 @@ +# (C) Copyright 2025- European Centre for Medium-Range Weather Forecasts (ECMWF) + +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation nor +# does it submit to any jurisdiction. 
+ + from functools import cached_property from pathlib import Path From 413733bd53b6cefc9b64b090e2a995d1a778cde9 Mon Sep 17 00:00:00 2001 From: Harrison Cook Date: Mon, 17 Nov 2025 13:01:38 +0000 Subject: [PATCH 5/5] Apply suggestions from code review Co-authored-by: Tennessee Leeuwenburg <134973832+tennlee@users.noreply.github.com> --- docs/api/pipeline/pipeline_entrypoints.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/api/pipeline/pipeline_entrypoints.md b/docs/api/pipeline/pipeline_entrypoints.md index 77ea22e7..2335d9c1 100644 --- a/docs/api/pipeline/pipeline_entrypoints.md +++ b/docs/api/pipeline/pipeline_entrypoints.md @@ -8,7 +8,7 @@ a pipeline as a source for [anemoi-datasets](https://anemoi.readthedocs.io/proje Below is a minimal example of using a `PyEarthTools` pipeline to load data and prepare it for `anemoi`, please see the `anemoi` docs for more information on the `datasets` config. -### Create the Pipeline +### Create the Pipeline in PyEarthTools .. code-block:: python @@ -44,7 +44,7 @@ for more information on the `datasets` config. anemoi-datasets create /path/to/anemoi/dataset.yaml -## Contract +## Function Contract The expected contract and result from the `PyEarthTools` pipeline is to return an `xarray` object of a single time index.