diff --git a/NOTICE.md b/NOTICE.md index e4e78eba..691375f1 100644 --- a/NOTICE.md +++ b/NOTICE.md @@ -15,3 +15,5 @@ The file packages/data/src/pyearthtools/data/indexes/extensions.py extends and i The package packages/bundled_models/fourcastnext extends and is significantly based on the code from https://github.com/nci/FourCastNeXt which is made available under the Apache 2.0 license. That repository in turn extends the code from https://github.com/NVlabs/FourCastNet/, released under the BSD 3-Clause license. The FourCastNet model is described in detail at https://arxiv.org/abs/2202.11214. The FourCastNeXt model is described in detail at https://arxiv.org/abs/2401.05584, and a version of the FourCastNeXt code is bundled, adapted for compatibility and maintained within the PyEarthTools repository so it can continue to be a useful reference implementation and learning aid. The package packages/bundled_models/lucie extends and is based on the code from https://github.com/ISCLPennState/LUCIE, which is made available under the MIT license. The LUCIE model is described in detail at https://doi.org/10.48550/arXiv.2405.16297. The version of the model bundled in PyEarthTools may undergo changes associated with package maintenance and compatibility so it can continue to be a useful reference implementation and learning aid. Within that repository, those authors bundle the file "torch_harmonics_local.py", which is based on https://github.com/NVIDIA/torch-harmonics . The bundled file has an Apache 2.0 copyright statement included in it but at the time of writing the NVIDIA repository carries the BSD 3-clause license. Both of these licenses allow bundling to occur and all relevant files preserve the copyright statement within the files. Copyright for the original works go to the LUCIE and torch-harmonics developers respectively. 
+ +The file packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py was originally developed by ECMWF, released under the Apache 2.0 license. diff --git a/docs/api/api.md b/docs/api/api.md index e530a9cd..539ecf31 100644 --- a/docs/api/api.md +++ b/docs/api/api.md @@ -24,6 +24,7 @@ data/data_index data/data_api pipeline/pipeline_index pipeline/pipeline_api +pipeline/pipeline_entrypoints training/training_index training/training_api tutorial/tutorial_index diff --git a/docs/api/pipeline/pipeline_entrypoints.md b/docs/api/pipeline/pipeline_entrypoints.md new file mode 100644 index 00000000..2335d9c1 --- /dev/null +++ b/docs/api/pipeline/pipeline_entrypoints.md @@ -0,0 +1,51 @@ +# Pipeline Entrypoints + +As `PyEarthTools` pipelines provide a generic way to load and prepare various earth system datasets, it is possible to use +a pipeline as a source for [anemoi-datasets](https://anemoi.readthedocs.io/projects/datasets/en/latest/). + +## Example + +Below is a minimal example of using a `PyEarthTools` pipeline to load data and prepare it for `anemoi`. Please see the `anemoi` docs +for more information on the `datasets` config. + +### Create the Pipeline in PyEarthTools + +.. code-block:: python + + import pyearthtools.data + import pyearthtools.pipeline + + pipeline = pyearthtools.pipeline.Pipeline( + pyearthtools.data.download.arcoera5.ARCOERA5(['t2m', 'u10', 'v10']), + pyearthtools.pipeline.operations.xarray.values.FillNan() + ) + pipeline.save('/PATH/TO/PIPELINE.yaml') + +### Create the anemoi-datasets config + +.. code-block:: yaml + + name: pyearthtools_to_anemoi + description: PyEarthTools Pipeline converted to Anemoi + attribution: PyEarthTools + + dates: + start: '2025-11-10T00:00:00' + end: '2025-11-12T00:00:00' + frequency: 1h + + input: + pyearthtools: # Use the pyearthtools input object + pipeline: /PATH/TO/PIPELINE.yaml + +### Run anemoi-datasets + +.. 
code-block:: bash + + anemoi-datasets create /path/to/anemoi/dataset.yaml + +## Function Contract + +The contract expected of the `PyEarthTools` pipeline is that it returns an `xarray` object for a single time index. + +Both tools provide methods to modify the metadata of the data; these should be used as needed to prepare it for downstream use. diff --git a/packages/pipeline/pyproject.toml b/packages/pipeline/pyproject.toml index 2f2ef2b6..1a3a2188 100644 --- a/packages/pipeline/pyproject.toml +++ b/packages/pipeline/pyproject.toml @@ -29,6 +29,10 @@ dependencies = [ dynamic = ["version", "readme"] +[project.entry-points] +# Add PyEarthTools as an anemoi datasets source +"anemoi.datasets.create.sources".pyearthtools = "pyearthtools.pipeline.entrypoints.anemoi:pyearthtoolsSource" + [project.optional-dependencies] distributed = [ "dask", diff --git a/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py b/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py new file mode 100644 index 00000000..1ab829ab --- /dev/null +++ b/packages/pipeline/src/pyearthtools/pipeline/entrypoints/anemoi.py @@ -0,0 +1,60 @@ +# (C) Copyright 2025- European Centre for Medium-Range Weather Forecasts (ECMWF) + +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation nor +# does it submit to any jurisdiction. 
from functools import cached_property
from pathlib import Path

from pyearthtools.pipeline import load
from pyearthtools.pipeline import Pipeline

import earthkit.data as ekd
from anemoi.datasets.create.source import Source
from anemoi.datasets.create.typing import DateList


class pyearthtoolsSource(Source):
    """Source that serves fields produced by a PyEarthTools pipeline."""

    emoji = "🌏"  # Marker used when tracing

    def __init__(self, context, pipeline: str | Path | Pipeline):
        """Store the pipeline reference; nothing is loaded at this point.

        Parameters
        ----------
        context : Any
            The context for the data source, forwarded to the parent class.
        pipeline : str | Path | Pipeline
            Either an already constructed pipeline, or the location of a
            pyearthtools pipeline file to be loaded on first use.
        """
        super().__init__(context)
        self._pyearthtools_pipeline = pipeline

    @cached_property
    def pipeline(self) -> Pipeline:
        """Return the pipeline, loading it from its file if one was given.

        The result is cached on the instance, so any loading happens once.
        """
        source = self._pyearthtools_pipeline
        # An in-memory pipeline is used as-is; anything else is handed to ``load``.
        return source if isinstance(source, Pipeline) else load(source)

    def execute(self, dates: DateList) -> ekd.FieldList:
        """Collect the fields for every requested date into one field list.

        Parameters
        ----------
        dates : DateList
            The dates to retrieve. The pipeline is indexed once per date,
            using the date's ISO formatted string.

        Returns
        -------
        ekd.FieldList
            All fields from all dates, in the order the dates were given.
        """
        collected = [
            field
            for date in dates
            for field in ekd.from_object(self.pipeline[date.isoformat()])  # type: ignore
        ]
        return ekd.FieldList.from_fields(collected)