diff --git a/src/cchdo/hydro/metadata.py b/src/cchdo/hydro/metadata.py new file mode 100644 index 0000000..8c51952 --- /dev/null +++ b/src/cchdo/hydro/metadata.py @@ -0,0 +1,48 @@ +from collections import defaultdict +from itertools import groupby, islice +from logging import getLogger + +import xarray as xr + +log = getLogger(__name__) + + +def all_equal(iterable, key=None): + "Returns True if all the elements are equal to each other." + # see https://docs.python.org/3/library/itertools.html#itertools-recipes + return len(list(islice(groupby(iterable, key), 2))) <= 1 + + +SAME_KEYS = ( + "processing_level", + "comment", + "creator_name", +) +ALLOWED_DIFFER = ( + "date_modified", + "date_metadata_modified", +) + + +def validate(ds: xr.Dataset): + exceptions = [] + + # cannot use filter_by_attrs since we want to filter on the coordinates too + projects = defaultdict(list) + for name, da in ds.variables.items(): + if (project := da.attrs.get("project")) is not None: + projects[project].append(name) + + for project, vars in projects.items(): + log.debug(f"Checking project '{project}' which includes {vars}") + for key in SAME_KEYS: + values = {var: ds[var].attrs.get(key) for var in vars} + valid = all_equal(values.values()) + if not valid: + exception = ValueError( + f"Project '{project}' key '{key}' is not the same: {values}" + ) + log.debug(exception) + exceptions.append(exception) + if exceptions: + raise ExceptionGroup("Metadata has failed to validate", exceptions) diff --git a/src/cchdo/hydro/tests/test_metadata.py b/src/cchdo/hydro/tests/test_metadata.py new file mode 100644 index 0000000..1e60a81 --- /dev/null +++ b/src/cchdo/hydro/tests/test_metadata.py @@ -0,0 +1,24 @@ +from io import BytesIO + +import pytest + +from cchdo.hydro import read_csv +from cchdo.hydro.metadata import validate + + +def test_invalid_createor_name(): + test_data = BytesIO( + b"""EXPOCODE,STNNBR,CASTNO,SAMPNO,LATITUDE,LONGITUDE,DATE,TIME,CTDPRS [DBAR],SILCAT [UMOL/KG],SILCAT 
# src/cchdo/hydro/tests/test_metadata.py
from io import BytesIO

import pytest

from cchdo.hydro import read_csv
from cchdo.hydro.metadata import validate


def test_invalid_createor_name():
    """validate() rejects a project whose members disagree on ``creator_name``."""
    # One-row CSV with two nutrient parameters (silicate and phosphate).
    csv_payload = b"""EXPOCODE,STNNBR,CASTNO,SAMPNO,LATITUDE,LONGITUDE,DATE,TIME,CTDPRS [DBAR],SILCAT [UMOL/KG],SILCAT [UMOL/KG]_FLAG_W,PHSPHT [UMOL/KG],PHSPHT [UMOL/KG]_FLAG_W
TEST,1,1,1,0,0,20220101,0000,0,0,2,0,3"""
    ds = read_csv(BytesIO(csv_payload))

    # Put both parameters in the same project...
    for variable in (ds.silicate, ds.phosphate):
        variable.attrs["project"] = "nutrients"

    # ...but give only one of them a creator, so the project is inconsistent.
    ds.silicate.attrs["creator_name"] = "Susan Becker"

    with pytest.raises(ExceptionGroup) as excinfo:
        validate(ds)

    assert excinfo.group_contains(ValueError)