diff --git a/notebooks/Collection_systems_network.ipynb b/notebooks/Collection_systems_network.ipynb new file mode 100644 index 000000000..5f71dbc44 --- /dev/null +++ b/notebooks/Collection_systems_network.ipynb @@ -0,0 +1,991 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "id": "fdb0d0b9", + "metadata": {}, + "outputs": [], + "source": [ + "import modelskill as ms\n", + "import xarray as xr\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "ee32618f", + "metadata": {}, + "source": [ + "(The following data structure can be automatically generated using a `NetworkMapper` from `mikeio1d`)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "48894cf8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 231kB\n",
+       "Dimensions:     (time: 110, node: 259)\n",
+       "Coordinates:\n",
+       "  * time        (time) datetime64[ns] 880B 1994-08-07T16:35:00 ... 1994-08-07...\n",
+       "  * node        (node) int64 2kB 0 1 2 3 4 5 6 7 ... 252 253 254 255 256 257 258\n",
+       "Data variables:\n",
+       "    Discharge   (time, node) float32 114kB nan nan 5.72e-06 ... nan 0.01692 0.0\n",
+       "    WaterLevel  (time, node) float32 114kB 195.4 194.7 nan ... 188.5 nan nan
" + ], + "text/plain": [ + " Size: 231kB\n", + "Dimensions: (time: 110, node: 259)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 880B 1994-08-07T16:35:00 ... 1994-08-07...\n", + " * node (node) int64 2kB 0 1 2 3 4 5 6 7 ... 252 253 254 255 256 257 258\n", + "Data variables:\n", + " Discharge (time, node) float32 114kB nan nan 5.72e-06 ... nan 0.01692 0.0\n", + " WaterLevel (time, node) float32 114kB 195.4 194.7 nan ... 188.5 nan nan" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = xr.load_dataset(\"../tests/testdata/network.nc\")\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "b643e568", + "metadata": {}, + "source": [ + "# Step 1: Create Network Model Result\n", + "\n", + "Now we'll create a `NetworkModelResult` from the network dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "edec2e5a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + ": WaterLevel" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "network_model = ms.NetworkModelResult(ds, item=\"WaterLevel\")\n", + "network_model" + ] + }, + { + "cell_type": "markdown", + "id": "6ddff185", + "metadata": {}, + "source": [ + "# Step 2: Create Sample Node Observations\n", + "\n", + "Let's create some synthetic node observations for demonstration purposes." + ] + }, + { + "cell_type": "markdown", + "id": "5905e267", + "metadata": {}, + "source": [ + "# Step 3: Extract Model Data at Observation Nodes\n", + "\n", + "Now we'll extract the network model data at each observation location." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "75990b8e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
station_0station_1station_2
1994-08-07 16:35:00.0001.0283641.0523290.964475
1994-08-07 16:36:01.8701.0577430.9989531.375202
1994-08-07 16:37:07.5600.9721701.0783681.021078
1994-08-07 16:38:55.8281.0400111.1425381.196989
1994-08-07 16:39:55.8281.2434950.9930461.230318
\n", + "
" + ], + "text/plain": [ + " station_0 station_1 station_2\n", + "1994-08-07 16:35:00.000 1.028364 1.052329 0.964475\n", + "1994-08-07 16:36:01.870 1.057743 0.998953 1.375202\n", + "1994-08-07 16:37:07.560 0.972170 1.078368 1.021078\n", + "1994-08-07 16:38:55.828 1.040011 1.142538 1.196989\n", + "1994-08-07 16:39:55.828 1.243495 0.993046 1.230318" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def dummy_series(t):\n", + " return 1.0 + 0.5 * np.sin(np.arange(t) * 0.1) + 0.1 * np.random.randn(t)\n", + "\n", + "model_time = ds.time\n", + "n_sensors = 3\n", + "# Create DataFrame with observations for multiple nodes\n", + "obs_data = {}\n", + "for i in range(n_sensors): \n", + " obs_data[f'station_{i}'] = dummy_series(len(model_time))\n", + " \n", + "obs_df = pd.DataFrame(obs_data, index=model_time)\n", + "obs_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6eae35ae", + "metadata": {}, + "outputs": [], + "source": [ + "obs = []\n", + "nodes = [1, 3, 5]\n", + "for i, node in enumerate(nodes):\n", + " obs_i = ms.NodeObservation(\n", + " data=obs_df, \n", + " node=node,\n", + " item=i\n", + " )\n", + " obs.append(obs_i)" + ] + }, + { + "cell_type": "markdown", + "id": "db4b3bc2", + "metadata": {}, + "source": [ + "# Step 4: Match and Analyze Results\n", + "\n", + "Now let's match the network observations with the network model and compute skill metrics. The NetworkObservation will automatically be expanded to individual node observations during matching." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "de621fec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nbiasrmseurmsemaeccsir2
observation
station_0110193.885463193.8870110.774637193.885463-0.4266290.732694-278329.043406
station_1110194.975574194.9762650.519284194.975574-0.3940080.494938-280256.621781
station_2110193.084552193.0864670.859845193.084552-0.2160120.819516-291215.095425
\n", + "
" + ], + "text/plain": [ + " n bias rmse urmse mae cc \\\n", + "observation \n", + "station_0 110 193.885463 193.887011 0.774637 193.885463 -0.426629 \n", + "station_1 110 194.975574 194.976265 0.519284 194.975574 -0.394008 \n", + "station_2 110 193.084552 193.086467 0.859845 193.084552 -0.216012 \n", + "\n", + " si r2 \n", + "observation \n", + "station_0 0.732694 -278329.043406 \n", + "station_1 0.494938 -280256.621781 \n", + "station_2 0.819516 -291215.095425 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ms.match(obs, network_model).skill()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "modelskill", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/modelskill/__init__.py b/src/modelskill/__init__.py index 0b3ce52f2..a6fa90a5f 100644 --- a/src/modelskill/__init__.py +++ b/src/modelskill/__init__.py @@ -38,13 +38,10 @@ TrackModelResult, GridModelResult, DfsuModelResult, + NetworkModelResult, DummyModelResult, ) -from .obs import ( - observation, - PointObservation, - TrackObservation, -) +from .obs import observation, PointObservation, TrackObservation, NodeObservation from .matching import from_matched, match from .configuration import from_config from .settings import options, get_option, set_option, reset_option, load_style @@ -94,9 +91,11 @@ def load(filename: Union[str, Path]) -> Comparer | ComparerCollection: "GridModelResult", "DfsuModelResult", "DummyModelResult", + "NetworkModelResult", "observation", "PointObservation", "TrackObservation", + "NodeObservation", "TimeSeries", "match", "from_matched", diff --git a/src/modelskill/comparison/_comparison.py b/src/modelskill/comparison/_comparison.py index f8fedef75..f4ce8f5ed 100644 --- a/src/modelskill/comparison/_comparison.py +++ b/src/modelskill/comparison/_comparison.py @@ -20,11 +20,13 @@ import xarray as xr from copy import deepcopy +from ..model.network import NodeModelResult + from .. import metrics as mtr from .. import Quantity from ..types import GeometryType -from ..obs import PointObservation, TrackObservation +from ..obs import PointObservation, TrackObservation, NodeObservation from ..model import PointModelResult, TrackModelResult from ..timeseries._timeseries import _validate_data_var_name from ._comparer_plotter import ComparerPlotter @@ -49,6 +51,12 @@ Serializable = Union[str, int, float] +def _drop_scalar_coords(data: xr.Dataset) -> xr.Dataset: + """Drop scalar coordinate variables that shouldn't appear as columns in dataframes""" + coords_to_drop = ["x", "y", "z", "node"] + return data.drop_vars(coords_to_drop, errors="ignore") + + def _parse_dataset(data: xr.Dataset) -> xr.Dataset: if not isinstance(data, xr.Dataset): raise ValueError("matched_data must be an xarray.Dataset") @@ -60,12 +68,15 @@ def _parse_dataset(data: xr.Dataset) -> xr.Dataset: raise ValueError("Observation data must not contain missing values.") # coordinates - if "x" not in data.coords: - data.coords["x"] = np.nan - if "y" not in data.coords: - data.coords["y"] = np.nan - if "z" not in data.coords: - data.coords["z"] = np.nan + # Only add x, y, z coordinates if they don't exist and we don't have node coordinates + has_node_coords = "node" in data.coords + if not has_node_coords: + if "x" not in data.coords: + data.coords["x"] = np.nan + if "y" not in data.coords: + data.coords["y"] = np.nan + if "z" not in data.coords: + data.coords["z"] = np.nan # Validate data vars = [v for v in data.data_vars] @@ -97,7 +108,11 @@ def _parse_dataset(data: xr.Dataset) -> xr.Dataset: # Validate attrs if "gtype" not in data.attrs: - data.attrs["gtype"] = str(GeometryType.POINT) + # Determine gtype based on available coordinates + if "node" in data.coords: + data.attrs["gtype"] = str(GeometryType.NETWORK) + else: + data.attrs["gtype"] = str(GeometryType.POINT) # assert "gtype" in data.attrs, "data must have a gtype attribute" # assert data.attrs["gtype"] in [ # str(GeometryType.POINT), @@ -444,7 +459,11 @@ class Comparer: def __init__( self, matched_data: xr.Dataset, - raw_mod_data: dict[str, PointModelResult | TrackModelResult] | None = None, + raw_mod_data: dict[ + str, + PointModelResult | TrackModelResult | NodeModelResult, + ] + | None = None, ) -> None: self.data = _parse_dataset(matched_data) self.raw_mod_data = ( @@ -464,7 +483,12 @@ def __init__( @staticmethod def from_matched_data( data: xr.Dataset | pd.DataFrame, - raw_mod_data: Optional[Dict[str, PointModelResult | TrackModelResult]] = None, + raw_mod_data: Optional[ + Dict[ + str, + PointModelResult | TrackModelResult | NodeModelResult, + ] + ] = None, obs_item: str | int | None = None, mod_items: Optional[Iterable[str | int]] = None, aux_items: Optional[Iterable[str | int]] = None, @@ -582,7 +606,15 @@ def z(self) -> Any: """z-coordinate""" return self._coordinate_values("z") - def _coordinate_values(self, coord: str) -> Any: + @property + def node(self) -> Any: + """node-coordinate""" + return self._coordinate_values("node") + + def _coordinate_values(self, coord: str) -> None | Any: + """Get coordinate values if they exist, otherwise return None""" + if coord not in self.data.coords: + return None vals = self.data[coord].values return np.atleast_1d(vals)[0] if vals.ndim == 0 else vals @@ -707,10 +739,10 @@ def rename( return Comparer(matched_data=data, raw_mod_data=raw_mod_data) - def _to_observation(self) -> PointObservation | TrackObservation: + def _to_observation(self) -> PointObservation | TrackObservation | NodeObservation: """Convert to Observation""" if self.gtype == "point": - df = self.data.drop_vars(["x", "y", "z"])[self._obs_str].to_dataframe() + df = _drop_scalar_coords(self.data)[self._obs_str].to_dataframe() return PointObservation( data=df, name=self.name, @@ -721,7 +753,9 @@ def _to_observation(self) -> PointObservation | TrackObservation: # TODO: add attrs ) elif self.gtype == "track": - df = self.data.drop_vars(["z"])[[self._obs_str]].to_dataframe() + df = self.data.drop_vars(["z"], errors="ignore")[ + [self._obs_str] + ].to_dataframe() return TrackObservation( data=df, item=0, @@ -731,10 +765,21 @@ def _to_observation(self) -> PointObservation | TrackObservation: quantity=self.quantity, # TODO: add attrs ) + elif self.gtype == "network": + df = _drop_scalar_coords(self.data)[self._obs_str].to_dataframe() + return NodeObservation( + data=df, + name=self.name, + node=self.node, + quantity=self.quantity, + # TODO: add attrs + ) else: raise NotImplementedError(f"Unknown gtype: {self.gtype}") - def _to_model(self) -> list[PointModelResult | TrackModelResult]: + def _to_model( + self, + ) -> list[PointModelResult | TrackModelResult | NodeModelResult]: mods = list(self.raw_mod_data.values()) return mods @@ -896,9 +941,9 @@ def _to_long_dataframe( ) -> pd.DataFrame: """Return a copy of the data as a long-format pandas DataFrame (for groupby operations)""" - data = self.data.drop_vars("z", errors="ignore") + data = self.data.drop_vars(["z", "node"], errors="ignore") - # this step is necessary since we keep arbitrary derived data in the dataset, but not z + # this step is necessary since we keep arbitrary derived data in the dataset, but not z/node # i.e. using a hardcoded whitelist of variables to keep is less flexible id_vars = [v for v in data.variables if v not in self.mod_names] @@ -981,8 +1026,8 @@ def skill( df = cmp._to_long_dataframe() res = _groupby_df(df, by=by, metrics=metrics) - res["x"] = np.nan if self.gtype == "track" else cmp.x - res["y"] = np.nan if self.gtype == "track" else cmp.y + res["x"] = np.nan if self.gtype == "track" or cmp.x is None else cmp.x + res["y"] = np.nan if self.gtype == "track" or cmp.y is None else cmp.y res = self._add_as_col_if_not_in_index(df, skilldf=res) return SkillTable(res) @@ -1138,7 +1183,7 @@ def gridded_skill( @property def _residual(self) -> np.ndarray: - df = self.data.drop_vars(["x", "y", "z"]).to_dataframe() + df = _drop_scalar_coords(self.data).to_dataframe() obs = df[self._obs_str].values mod = df[self.mod_names].values return mod - np.vstack(obs) @@ -1202,12 +1247,17 @@ def to_dataframe(self) -> pd.DataFrame: if self.gtype == str(GeometryType.POINT): # we remove the scalar coordinate variables as they # will otherwise be columns in the dataframe - return self.data.drop_vars(["x", "y", "z"]).to_dataframe() + return _drop_scalar_coords(self.data).to_dataframe() elif self.gtype == str(GeometryType.TRACK): - df = self.data.drop_vars(["z"]).to_dataframe() - # make sure that x, y cols are first - cols = ["x", "y"] + [c for c in df.columns if c not in ["x", "y"]] + df = self.data.drop_vars(["z"], errors="ignore").to_dataframe() + # make sure that x, y cols are first if they exist + coord_cols = [c for c in ["x", "y"] if c in df.columns] + other_cols = [c for c in df.columns if c not in ["x", "y"]] + cols = coord_cols + other_cols return df[cols] + elif self.gtype == str(GeometryType.NETWORK): + # For network data, drop node coordinate like other geometries drop their coordinates + return _drop_scalar_coords(self.data).to_dataframe() else: raise NotImplementedError(f"Unknown gtype: {self.gtype}") @@ -1258,7 +1308,10 @@ def load(filename: Union[str, Path]) -> "Comparer": return Comparer(matched_data=data) if data.gtype == "point": - raw_mod_data: Dict[str, PointModelResult | TrackModelResult] = {} + raw_mod_data: Dict[ + str, + PointModelResult | TrackModelResult | NodeModelResult, + ] = {} for var in data.data_vars: var_name = str(var) diff --git a/src/modelskill/matching.py b/src/modelskill/matching.py index b182f0ad7..ecb70de64 100644 --- a/src/modelskill/matching.py +++ b/src/modelskill/matching.py @@ -27,8 +27,15 @@ from .model.dfsu import DfsuModelResult from .model.dummy import DummyModelResult from .model.grid import GridModelResult +from .model.network import NetworkModelResult, NodeModelResult from .model.track import TrackModelResult -from .obs import Observation, PointObservation, TrackObservation +from .model.point import align_data +from .obs import ( + Observation, + PointObservation, + TrackObservation, + NodeObservation, +) from .timeseries import TimeSeries from .types import Period @@ -40,8 +47,13 @@ GridModelResult, DfsuModelResult, TrackModelResult, + NetworkModelResult, DummyModelResult, ] +Fieldypes = Union[ + GridModelResult, + DfsuModelResult, +] MRInputType = Union[ str, Path, @@ -56,7 +68,7 @@ TimeSeries, MRTypes, ] -ObsTypes = Union[PointObservation, TrackObservation] +ObsTypes = Union[PointObservation, TrackObservation, NodeObservation] ObsInputType = Union[ str, Path, @@ -67,7 +79,6 @@ pd.Series, ObsTypes, ] - T = TypeVar("T", bound="TimeSeries") @@ -248,6 +259,7 @@ def match( -------- from_matched - Create a Comparer from observation and model results that are already matched """ + if isinstance(obs, get_args(ObsInputType)): return _match_single_obs( obs, @@ -267,7 +279,15 @@ def match( if len(obs) > 1 and isinstance(mod, Collection) and len(mod) > 1: if not all( - isinstance(m, (DfsuModelResult, GridModelResult, DummyModelResult)) + isinstance( + m, + ( + DfsuModelResult, + GridModelResult, + NetworkModelResult, + DummyModelResult, + ), + ) for m in mod ): raise ValueError( @@ -317,14 +337,19 @@ def _match_single_obs( if len(names) != len(set(names)): raise ValueError(f"Duplicate model names found: {names}") - raw_mod_data = { - m.name: ( - m.extract(obs, spatial_method=spatial_method) - if isinstance(m, (DfsuModelResult, GridModelResult, DummyModelResult)) - else m - ) - for m in models - } + raw_mod_data: dict[str, PointModelResult | TrackModelResult | NodeModelResult] = {} + for m in models: + is_field = isinstance(m, (GridModelResult, DfsuModelResult)) + is_dummy = isinstance(m, DummyModelResult) + is_network = isinstance(m, NetworkModelResult) + if is_field or is_dummy: + matching_obs = m.extract(obs, spatial_method=spatial_method) + elif is_network: + matching_obs = m.extract(obs) + else: + matching_obs = m + + raw_mod_data[m.name] = matching_obs matched_data = _match_space_time( observation=obs, @@ -341,7 +366,7 @@ def _match_single_obs( def _get_global_start_end(idxs: Iterable[pd.DatetimeIndex]) -> Period: - assert all([len(x) > 0 for x in idxs]) + assert all([len(x) > 0 for x in idxs]), "All datetime indices must be non-empty" starts = [x[0] for x in idxs] ends = [x[-1] for x in idxs] @@ -351,7 +376,9 @@ def _get_global_start_end(idxs: Iterable[pd.DatetimeIndex]) -> Period: def _match_space_time( observation: Observation, - raw_mod_data: Mapping[str, PointModelResult | TrackModelResult], + raw_mod_data: Mapping[ + str, PointModelResult | TrackModelResult | NetworkModelResult | NodeModelResult + ], max_model_gap: float | None, spatial_tolerance: float, obs_no_overlap: Literal["ignore", "error", "warn"], @@ -374,7 +401,10 @@ def _match_space_time( observation, spatial_tolerance=spatial_tolerance ) case PointModelResult() as pmr, PointObservation(): - aligned = pmr.align(observation, max_gap=max_model_gap) + aligned = align_data(pmr.data, observation, max_gap=max_model_gap) + case NodeModelResult() as nmr, NodeObservation(): + # mr is the extracted NodeModelResult + aligned = align_data(nmr.data, observation, max_gap=max_model_gap) case _: raise TypeError( f"Matching not implemented for model type {type(mr)} and observation type {type(observation)}" diff --git a/src/modelskill/model/__init__.py b/src/modelskill/model/__init__.py index b0ba468ae..9840e64fc 100644 --- a/src/modelskill/model/__init__.py +++ b/src/modelskill/model/__init__.py @@ -6,9 +6,11 @@ * Timeseries - [`PointModelResult`](`modelskill.PointModelResult`) - a point result from a dfs0/nc file or a DataFrame - [`TrackModelResult`](`modelskill.TrackModelResult`) - a track (moving point) result from a dfs0/nc file or a DataFrame + - [`NodeModelResult`](`modelskill.NodeModelResult`) - a network node result for specific node IDs * SpatialField (extractable) - [`GridModelResult`](`modelskill.GridModelResult`) - a spatial field from a dfs2/nc file or a Xarray Dataset - [`DfsuModelResult`](`modelskill.DfsuModelResult`) - a spatial field from a dfsu file + - [`NetworkModelResult`](`modelskill.NetworkModelResult`) - a network field from xarray Dataset with time and node coordinates A model result can be created by explicitly invoking one of the above classes or using the [`model_result()`](`modelskill.model_result`) function which will return the appropriate type based on the input data (if possible). """ @@ -20,6 +22,8 @@ from .track import TrackModelResult from .dfsu import DfsuModelResult from .grid import GridModelResult +from .network import NetworkModelResult +from .network import NodeModelResult from .dummy import DummyModelResult __all__ = [ @@ -27,6 +31,8 @@ "TrackModelResult", "DfsuModelResult", "GridModelResult", + "NetworkModelResult", + "NodeModelResult", "model_result", "DummyModelResult", ] diff --git a/src/modelskill/model/_base.py b/src/modelskill/model/_base.py index 250dcc30d..3bc7aa873 100644 --- a/src/modelskill/model/_base.py +++ b/src/modelskill/model/_base.py @@ -10,9 +10,10 @@ if TYPE_CHECKING: from .point import PointModelResult from .track import TrackModelResult + from .network import NodeModelResult from ..utils import _get_name -from ..obs import Observation, PointObservation, TrackObservation +from ..obs import Observation, PointObservation, TrackObservation, NodeObservation @dataclass @@ -86,3 +87,10 @@ def _extract_point( def _extract_track( self, observation: TrackObservation, spatial_method: Optional[str] = None ) -> TrackModelResult: ... + + +class Network1D(Protocol): + def extract( + self, + observation: NodeObservation, + ) -> NodeModelResult: ... diff --git a/src/modelskill/model/network.py b/src/modelskill/model/network.py new file mode 100644 index 000000000..02da06c7f --- /dev/null +++ b/src/modelskill/model/network.py @@ -0,0 +1,187 @@ +from __future__ import annotations +from typing import Optional, Sequence +import pandas as pd +import xarray as xr + +from modelskill.timeseries import TimeSeries, _parse_network_node_input + +from ._base import Network1D, SelectedItems +from ..obs import NodeObservation +from ..quantity import Quantity +from ..types import PointType + + +class NodeModelResult(TimeSeries): + """Model result for a single network node. + + Construct a NodeModelResult from timeseries data for a specific node ID. + This is a simple timeseries class designed for network node data. + + Parameters + ---------- + data : str, Path, mikeio.Dataset, mikeio.DataArray, pd.DataFrame, pd.Series, xr.Dataset or xr.DataArray + filename (.dfs0 or .nc) or object with the data + name : Optional[str], optional + The name of the model result, + by default None (will be set to file name or item name) + node : int, optional + node ID (integer), by default None + item : str | int | None, optional + If multiple items/arrays are present in the input an item + must be given (as either an index or a string), by default None + quantity : Quantity, optional + Model quantity, for MIKE files this is inferred from the EUM information + aux_items : Optional[list[int | str]], optional + Auxiliary items, by default None + + Examples + -------- + >>> import modelskill as ms + >>> mr = ms.NodeModelResult(data, node=123, name="Node_123") + >>> mr2 = ms.NodeModelResult(df, item="Water Level", node=456) + """ + + def __init__( + self, + data: PointType, + *, + name: Optional[str] = None, + node: Optional[int] = None, + item: str | int | None = None, + quantity: Optional[Quantity] = None, + aux_items: Optional[Sequence[int | str]] = None, + ) -> None: + if not self._is_input_validated(data): + data = _parse_network_node_input( + data, + name=name, + item=item, + quantity=quantity, + node=node, + aux_items=aux_items, + ) + + assert isinstance(data, xr.Dataset) + data_var = str(list(data.data_vars)[0]) + data[data_var].attrs["kind"] = "model" + super().__init__(data=data) + + @property + def node(self) -> int: + """Node ID of model result""" + node_val = self.data.coords.get("node") + if node_val is None: + raise ValueError("Node coordinate not found in data") + return int(node_val.item()) + + +class NetworkModelResult(Network1D): + """Model result for network data with time and node dimensions. + + Construct a NetworkModelResult from an xarray.Dataset with time and node coordinates + and arbitrary number of data variables. Users must provide exact node IDs (integers) + when creating observations - no spatial interpolation is performed. + + Parameters + ---------- + data : xr.Dataset + xarray.Dataset with time and node coordinates + name : Optional[str], optional + The name of the model result, + by default None (will be set to first data variable name) + item : str | int | None, optional + If multiple items/arrays are present in the input an item + must be given (as either an index or a string), by default None + quantity : Quantity, optional + Model quantity + aux_items : Optional[list[int | str]], optional + Auxiliary items, by default None + + Examples + -------- + >>> import modelskill as ms + >>> mr = ms.NetworkModelResult(network_data, name="Network_Model") + >>> obs = ms.NodeObservation(data, node=123) + >>> extracted = mr.extract(obs) + """ + + def __init__( + self, + data: xr.Dataset, + *, + name: Optional[str] = None, + item: str | int | None = None, + quantity: Optional[Quantity] = None, + aux_items: Optional[Sequence[int | str]] = None, + ) -> None: + assert isinstance( + data, xr.Dataset + ), "NetworkModelResult requires xarray.Dataset" + assert "time" in data.dims, "Dataset must have time dimension" + assert "node" in data.dims, "Dataset must have node dimension" + assert len(data.data_vars) > 0, "Dataset must have at least one data variable" + + sel_items = SelectedItems.parse( + list(data.data_vars), item=item, aux_items=aux_items + ) + name = name or sel_items.values + + self.data: xr.Dataset = data[sel_items.all] + self.name = name + self.sel_items = sel_items + + if quantity is None: + da = self.data[sel_items.values] + quantity = Quantity.from_cf_attrs(da.attrs) + self.quantity = quantity + + # Mark data variables as model data + self.data[sel_items.values].attrs["kind"] = "model" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}>: {self.name}" + + @property + def time(self) -> pd.DatetimeIndex: + """Return the time coordinate as a pandas.DatetimeIndex.""" + return pd.DatetimeIndex(self.data.time.to_index()) + + def extract( + self, + observation: NodeObservation, + ) -> NodeModelResult: + """Extract ModelResult at exact node locations + + Parameters + ---------- + observation : NodeObservation + observation with node ID (only NodeObservation supported) + + Returns + ------- + NodeModelResult + extracted model result + """ + if not isinstance(observation, NodeObservation): + raise TypeError( + f"NetworkModelResult only supports NodeObservation, got {type(observation).__name__}" + ) + + node_id = observation.node + if node_id not in self.data.node.values: + available_nodes = list(self.data.node.values) + raise ValueError( + f"Node {node_id} not found. Available: {available_nodes[:5]}..." + ) + + # Extract data at the specified node + ds = self.data.sel(node=node_id) + df = ds.to_dataframe().dropna().drop(columns="node") + + return NodeModelResult( + data=df, + node=node_id, + name=self.name, + item=self.sel_items.values, + quantity=self.quantity, + ) diff --git a/src/modelskill/model/point.py b/src/modelskill/model/point.py index 5a11b1a30..d07e7341c 100644 --- a/src/modelskill/model/point.py +++ b/src/modelskill/model/point.py @@ -11,6 +11,83 @@ from ..timeseries import TimeSeries, _parse_xyz_point_input +def align_data( + data: xr.Dataset, + observation: Observation, + *, + max_gap: float | None = None, + **kwargs: Any, +) -> xr.Dataset: + """Align model data to observation time. + + Interpolates model result to the time of the observation. + + Parameters + ---------- + data : xr.Dataset + The model dataset to align + observation : Observation + The observation to align to + max_gap : float | None, optional + Maximum gap in seconds for interpolation gaps removal, by default None + **kwargs : Any + Additional keyword arguments passed to xarray.interp + + Returns + ------- + xr.Dataset + Aligned dataset + """ + new_time = observation.time + + dati = data.dropna("time").interp(time=new_time, assume_sorted=True, **kwargs) + + if max_gap is not None: + model_time = pd.DatetimeIndex(data.time.to_index()) + dati = _remove_model_gaps(dati, mod_index=model_time, max_gap=max_gap) + return dati + + +def _remove_model_gaps( + data: xr.Dataset, + mod_index: pd.DatetimeIndex, + max_gap: float | None = None, +) -> xr.Dataset: + """Remove model gaps longer than max_gap from Dataset""" + max_gap_delta = pd.Timedelta(max_gap, "s") + obs_time = pd.DatetimeIndex(data.time.to_index()) + valid_times = _get_valid_times(obs_time, mod_index, max_gap_delta) + return data.sel(time=valid_times) + + +def _get_valid_times( + obs_time: pd.DatetimeIndex, mod_index: pd.DatetimeIndex, max_gap: pd.Timedelta +) -> pd.DatetimeIndex: + """Get valid times where interpolation gaps are within max_gap""" + # init dataframe of available timesteps and their index + df = pd.DataFrame(index=mod_index) + df["idx"] = range(len(df)) + + # for query times get available left and right index of source times + df = ( + df.reindex(df.index.union(obs_time)) + .interpolate(method="time", limit_area="inside") + .reindex(obs_time) + .dropna() + ) + df["idxa"] = np.floor(df.idx).astype(int) + df["idxb"] = np.ceil(df.idx).astype(int) + + # time of left and right source times and time delta + df["ta"] = mod_index[df.idxa] + df["tb"] = mod_index[df.idxb] + df["dt"] = df.tb - df.ta + + # valid query times where time delta is less than max_gap + valid_idx = df.dt <= max_gap + return df[valid_idx].index + + class PointModelResult(TimeSeries): """Model result for a single point location. @@ -89,62 +166,5 @@ def interp_time(self, observation: Observation, **kwargs: Any) -> PointModelResu PointModelResult Interpolated model result """ - ds = self.align(observation, **kwargs) - return PointModelResult(ds) - - def align( - self, - observation: Observation, - *, - max_gap: float | None = None, - **kwargs: Any, - ) -> xr.Dataset: - new_time = observation.time - - dati = self.data.dropna("time").interp( - time=new_time, assume_sorted=True, **kwargs - ) - - pmr = PointModelResult(dati) - if max_gap is not None: - pmr = pmr._remove_model_gaps(mod_index=self.time, max_gap=max_gap) - return pmr.data - - def _remove_model_gaps( - self, - mod_index: pd.DatetimeIndex, - max_gap: float | None = None, - ) -> PointModelResult: - """Remove model gaps longer than max_gap from TimeSeries""" - max_gap_delta = pd.Timedelta(max_gap, "s") - valid_times = self._get_valid_times(mod_index, max_gap_delta) - ds = self.data.sel(time=valid_times) + ds = align_data(self.data, observation, **kwargs) return PointModelResult(ds) - - def _get_valid_times( - self, mod_index: pd.DatetimeIndex, max_gap: pd.Timedelta - ) -> pd.DatetimeIndex: - """Used only by _remove_model_gaps""" - obs_index = self.time - # init dataframe of available timesteps and their index - df = pd.DataFrame(index=mod_index) - df["idx"] = range(len(df)) - - # for query times get available left and right index of source times - df = ( - df.reindex(df.index.union(obs_index)) - .interpolate(method="time", limit_area="inside") - .reindex(obs_index) - .dropna() - ) - df["idxa"] = np.floor(df.idx).astype(int) - df["idxb"] = np.ceil(df.idx).astype(int) - - # time of left and right source times and time delta - df["ta"] = mod_index[df.idxa] - df["tb"] = mod_index[df.idxb] - df["dt"] = df.tb - df.ta - - # valid query times where time delta is less than max_gap - valid_idx = df.dt <= max_gap - return df[valid_idx].index diff --git a/src/modelskill/obs.py b/src/modelskill/obs.py index 5828b2b03..2f4d51dbf 100644 --- a/src/modelskill/obs.py +++ b/src/modelskill/obs.py @@ -1,10 +1,11 @@ """ # Observations -ModelSkill supports two types of observations: +ModelSkill supports three types of observations: * [`PointObservation`](`modelskill.PointObservation`) - a point timeseries from a dfs0/nc file or a DataFrame * [`TrackObservation`](`modelskill.TrackObservation`) - a track (moving point) timeseries from a dfs0/nc file or a DataFrame +* [`NodeObservation`](`modelskill.NodeObservation`) - a network node timeseries for specific node IDs An observation can be created by explicitly invoking one of the above classes or using the [`observation()`](`modelskill.observation`) function which will return the appropriate type based on the input data (if possible). """ @@ -22,6 +23,7 @@ TimeSeries, _parse_xyz_point_input, _parse_track_input, + _parse_network_node_input, ) # NetCDF attributes can only be str, int, float https://unidata.github.io/netcdf4-python/#attributes-in-a-netcdf-file @@ -31,9 +33,9 @@ def observation( data: DataInputType, *, - gtype: Optional[Literal["point", "track"]] = None, + gtype: Optional[Literal["point", "track", "node", "network"]] = None, **kwargs, -) -> PointObservation | TrackObservation: +) -> PointObservation | TrackObservation | NodeObservation: """Create an appropriate observation object. A factory function for creating an appropriate observation object @@ -41,19 +43,21 @@ def observation( If 'x' or 'y' is given, a PointObservation is created. If 'x_item' or 'y_item' is given, a TrackObservation is created. + If 'node' is given, a NodeObservation is created. Parameters ---------- data : DataInputType The data to be used for creating the Observation object. - gtype : Optional[Literal["point", "track"]] + gtype : Optional[Literal["point", "track", "node", "network"]] The geometry type of the data. If not specified, it will be guessed from the data. + Note: "node" and "network" are equivalent and both create NodeObservation. **kwargs Additional keyword arguments to be passed to the Observation constructor. Returns ------- - PointObservation or TrackObservation + PointObservation or TrackObservation or NodeObservation An observation object of the appropriate type Examples @@ -61,11 +65,14 @@ def observation( >>> import modelskill as ms >>> o_pt = ms.observation(df, item=0, x=366844, y=6154291, name="Klagshamn") >>> o_tr = ms.observation("lon_after_lat.dfs0", item="wl", x_item=1, y_item=0) + >>> o_node = ms.observation(df, item="Water Level", node=123, name="Node_123") """ if gtype is None: geometry = _guess_gtype(**kwargs) else: - geometry = GeometryType(gtype) + # Map "node" to "network" for backward compatibility + gtype_mapped = "network" if gtype == "node" else gtype + geometry = GeometryType(gtype_mapped) return _obs_class_lookup[geometry]( data=data, @@ -80,9 +87,11 @@ def _guess_gtype(**kwargs) -> GeometryType: return GeometryType.POINT elif "x_item" in kwargs or "y_item" in kwargs: return GeometryType.TRACK + elif "node" in kwargs: + return GeometryType.NETWORK else: warnings.warn( - "Could not guess geometry type from data or args, assuming POINT geometry. Use PointObservation or TrackObservation to be explicit." + "Could not guess geometry type from data or args, assuming POINT geometry. Use PointObservation, TrackObservation, or NodeObservation to be explicit." ) return GeometryType.POINT @@ -340,6 +349,73 @@ def __init__( super().__init__(data=data, weight=weight, attrs=attrs) +class NodeObservation(Observation): + """Class for observations at network nodes + + Create a NodeObservation from a DataFrame or other data source. + The node ID is specified as an integer. + + Parameters + ---------- + data : str, Path, mikeio.Dataset, mikeio.DataArray, pd.DataFrame, pd.Series, xr.Dataset or xr.DataArray + data source with time series for the node + item : (int, str), optional + index or name of the wanted item/column, by default None + if data contains more than one item, item must be given + node : int, optional + node ID (integer), by default None + name : str, optional + user-defined name for easy identification in plots etc, by default derived from data + weight : float, optional + weighting factor for skill scores, by default 1.0 + quantity : Quantity, optional + The quantity of the observation, for validation with model results + aux_items : list, optional + list of names or indices of auxiliary items, by default None + attrs : dict, optional + additional attributes to be added to the data, by default None + + Examples + -------- + >>> import modelskill as ms + >>> o1 = ms.NodeObservation(data, node=123, name="Node_123") + >>> o2 = ms.NodeObservation(df, item="Water Level", node=456) + """ + + def __init__( + self, + data: PointType, + *, + item: Optional[int | str] = None, + node: Optional[int] = None, + name: Optional[str] = None, + weight: float = 1.0, + quantity: Optional[Quantity] = None, + aux_items: Optional[list[int | str]] = None, + attrs: Optional[dict] = None, + ) -> None: + if not self._is_input_validated(data): + data = _parse_network_node_input( + data, + name=name, + item=item, + quantity=quantity, + node=node, + aux_items=aux_items, + ) + + assert isinstance(data, xr.Dataset) + super().__init__(data=data, weight=weight, attrs=attrs) + + @property + def node(self) -> int: + """Node ID of observation""" + node_val = self.data.coords.get("node") + if node_val is None: + raise ValueError("Node coordinate not found in data") + return int(node_val.item()) + + def unit_display_name(name: str) -> str: """Display name @@ -364,4 +440,5 @@ def unit_display_name(name: str) -> str: _obs_class_lookup = { GeometryType.POINT: PointObservation, GeometryType.TRACK: TrackObservation, + GeometryType.NETWORK: NodeObservation, } diff --git a/src/modelskill/timeseries/__init__.py b/src/modelskill/timeseries/__init__.py index cb1ca3ffc..4885c9987 100644 --- a/src/modelskill/timeseries/__init__.py +++ b/src/modelskill/timeseries/__init__.py @@ -1,9 +1,10 @@ from ._timeseries import TimeSeries -from ._point import _parse_xyz_point_input +from ._point import _parse_xyz_point_input, _parse_network_node_input from ._track import _parse_track_input __all__ = [ "TimeSeries", "_parse_xyz_point_input", "_parse_track_input", + "_parse_network_node_input", ] diff --git a/src/modelskill/timeseries/_coords.py b/src/modelskill/timeseries/_coords.py index 899dd5e63..48076dd84 100644 --- a/src/modelskill/timeseries/_coords.py +++ b/src/modelskill/timeseries/_coords.py @@ -17,3 +17,17 @@ def __init__( @property def as_dict(self) -> dict: return {"x": self.x, "y": self.y, "z": self.z} + + +class NetworkCoords: + def __init__( + self, + node: Optional[int] = None, + boundary: Optional[str] = None, + ): + self.node = node if node is not None else np.nan + self.boundary = boundary if boundary is not None else np.nan + + @property + def as_dict(self) -> dict: + return {"node": self.node, "boundary": self.boundary} diff --git a/src/modelskill/timeseries/_point.py b/src/modelskill/timeseries/_point.py index 5e2a0c86a..5c566ed8b 100644 --- a/src/modelskill/timeseries/_point.py +++ b/src/modelskill/timeseries/_point.py @@ -5,7 +5,6 @@ from typing import Sequence, get_args, List, Optional, Tuple, Union, Any import pandas as pd import xarray as xr -import numpy as np import mikeio @@ -13,7 +12,7 @@ from ..quantity import Quantity from ..utils import _get_name from ._timeseries import _validate_data_var_name -from ._coords import XYZCoords +from ._coords import XYZCoords, NetworkCoords @dataclass @@ -144,7 +143,7 @@ def _convert_to_dataset( def _include_coords( ds: xr.Dataset, *, - coords: Optional[XYZCoords] = None, + coords: Optional[XYZCoords | NetworkCoords] = None, ) -> xr.Dataset: ds = ds.copy() if coords is not None: @@ -153,7 +152,9 @@ def _include_coords( coords_to_add = {} for k, v in coords.as_dict.items(): # Add if coordinate doesn't exist, or if user provided a non-null value - if k not in ds.coords or (v is not None and not np.isnan(v)): + # - pd.isna(v) returns True for NaN, False otherwise + # - for string values: pd.isna(v) returns False (strings are never considered "NA" unless specifically None) + if k not in ds.coords or (v is not None and not pd.isna(v)): coords_to_add[k] = v ds.coords.update(coords_to_add) @@ -237,7 +238,7 @@ def _parse_point_input( quantity: Optional[Quantity], aux_items: Optional[Sequence[int | str]], *, - coords: XYZCoords, + coords: XYZCoords | NetworkCoords, ) -> xr.Dataset: """Convert accepted input data to an xr.Dataset""" @@ -265,3 +266,16 @@ def _parse_xyz_point_input( coords = XYZCoords(x, y, z) ds = _parse_point_input(data, name, item, quantity, aux_items, coords=coords) return ds + + +def _parse_network_node_input( + data: PointType, + name: Optional[str], + item: str | int | None, + quantity: Optional[Quantity], + node: Optional[int], + aux_items: Optional[Sequence[int | str]], +) -> xr.Dataset: + coords = NetworkCoords(node=node) + ds = _parse_point_input(data, name, item, quantity, aux_items, coords=coords) + return ds diff --git a/src/modelskill/timeseries/_timeseries.py b/src/modelskill/timeseries/_timeseries.py index 52ff26ca4..bde626842 100644 --- a/src/modelskill/timeseries/_timeseries.py +++ b/src/modelskill/timeseries/_timeseries.py @@ -63,13 +63,17 @@ def _validate_dataset(ds: xr.Dataset) -> xr.Dataset: ), "time must be increasing (please check for duplicate times))" # Validate coordinates - if "x" not in ds.coords: - raise ValueError("data must have an x-coordinate") - if "y" not in ds.coords: - raise ValueError("data must have a y-coordinate") - if "z" not in ds.coords: - ds.coords["z"] = np.nan - # assert "z" in ds.coords, "data must have a z-coordinate" + # Check for either traditional x,y coordinates OR node-based coordinates + has_spatial_coords = "x" in ds.coords and "y" in ds.coords + has_node_coord = "node" in ds.coords + + if not has_spatial_coords and not has_node_coord: + raise ValueError("data must have either x,y coordinates or a node coordinate") + + if has_spatial_coords: + # Traditional spatial data - ensure z coordinate exists + if "z" not in ds.coords: + ds.coords["z"] = np.nan # Validate data vars = [v for v in ds.data_vars] @@ -108,9 +112,10 @@ def _validate_dataset(ds: xr.Dataset) -> xr.Dataset: if ds.attrs["gtype"] not in [ str(GeometryType.POINT), str(GeometryType.TRACK), + str(GeometryType.NETWORK), ]: raise ValueError( - f"data attribute 'gtype' must be one of {GeometryType.POINT} or {GeometryType.TRACK}" + f"data attribute 'gtype' must be one of {GeometryType.POINT}, {GeometryType.TRACK}, or {GeometryType.NETWORK}" ) if "long_name" not in da.attrs: da.attrs["long_name"] = Quantity.undefined().name @@ -222,7 +227,14 @@ def y(self) -> Any: def y(self, value: Any) -> None: self.data["y"] = value - def _coordinate_values(self, coord: str) -> float | np.ndarray: + @property + def node(self) -> Any: + """node-coordinate""" + return self._coordinate_values("node") + + def _coordinate_values(self, coord: str) -> None | float | np.ndarray: + if coord not in self.data.coords: + return None # Node-based data doesn't have y coordinate vals = self.data[coord].values return np.atleast_1d(vals)[0] if vals.ndim == 0 else vals @@ -248,7 +260,12 @@ def __repr__(self) -> str: res = [] res.append(f"<{self.__class__.__name__}>: {self.name}") if self.gtype == str(GeometryType.POINT): - res.append(f"Location: {self.x}, {self.y}") + # Show location based on available coordinates + if "node" in self.data.coords: + node_id = self.data.coords["node"].item() + res.append(f"Node: {node_id}") + elif self.x is not None and self.y is not None: + res.append(f"Location: {self.x}, {self.y}") res.append(f"Time: {self.time[0]} - {self.time[-1]}") res.append(f"Quantity: {self.quantity}") if len(self._aux_vars) > 0: @@ -298,11 +315,18 @@ def to_dataframe(self) -> pd.DataFrame: if self.gtype == str(GeometryType.POINT): # we remove the scalar coordinate variables as they # will otherwise be columns in the dataframe - return self.data.drop_vars(["x", "y", "z"]).to_dataframe() + # Only drop coordinates that exist + coords_to_drop = [] + for coord in ["x", "y", "z"]: + if coord in self.data.coords: + coords_to_drop.append(coord) + return self.data.drop_vars(coords_to_drop).to_dataframe() elif self.gtype == str(GeometryType.TRACK): - df = self.data.drop_vars(["z"]).to_dataframe() - # make sure that x, y cols are first - cols = ["x", "y"] + [c for c in df.columns if c not in ["x", "y"]] + coords_to_drop = ["z"] if "z" in self.data.coords else [] + df = self.data.drop_vars(coords_to_drop).to_dataframe() + # makes sure that x comes first, then y, then other columns alphabetically + priority = {"x": 0, "y": 1} + cols = sorted(df.columns, key=lambda col: (priority.get(col, 999), col)) return df[cols] else: raise NotImplementedError(f"Unknown gtype: {self.gtype}") diff --git a/src/modelskill/types.py b/src/modelskill/types.py index 0ba3dcd63..ed70d53c3 100644 --- a/src/modelskill/types.py +++ b/src/modelskill/types.py @@ -14,6 +14,7 @@ class GeometryType(Enum): TRACK = "track" UNSTRUCTURED = "unstructured" GRID = "grid" + NETWORK = "network" def __str__(self) -> str: return self.name.lower() @@ -37,6 +38,8 @@ def from_string(s: str) -> "GeometryType": >>> GeometryType.from_string("grid") + >>> GeometryType.from_string("network") + """ try: @@ -87,6 +90,7 @@ def from_string(s: str) -> "GeometryType": xr.DataArray, ] TrackType = Union[str, Path, pd.DataFrame, mikeio.Dfs0, mikeio.Dataset, xr.Dataset] +NetworkType = pd.DataFrame @dataclass(frozen=True) diff --git a/tests/test_network.py b/tests/test_network.py new file mode 100644 index 000000000..180e63086 --- /dev/null +++ b/tests/test_network.py @@ -0,0 +1,268 @@ +"""Test network models and observations""" + +import pytest +import pandas as pd +import xarray as xr +import numpy as np +import modelskill as ms +from modelskill.model.network import NetworkModelResult, NodeModelResult +from modelskill.obs import NodeObservation + + +@pytest.fixture +def sample_network_data(): + """Sample network data as xr.Dataset""" + time = pd.date_range("2010-01-01", periods=10, freq="h") + nodes = [123, 456, 789] + + # Create sample data + np.random.seed(42) # For reproducible tests + data = np.random.randn(len(time), len(nodes)) + + ds = xr.Dataset( + { + "WaterLevel": (["time", "node"], data), + }, + coords={ + "time": time, + "node": nodes, + }, + ) + ds["WaterLevel"].attrs["units"] = "m" + ds["WaterLevel"].attrs["long_name"] = "Water Level" + + return ds + + +@pytest.fixture +def sample_node_data(): + """Sample node observation data""" + time = pd.date_range("2010-01-01", periods=10, freq="h") + + # Create sample data with some variation + np.random.seed(42) + data = np.random.randn(len(time)) * 0.1 + 1.5 + + df = pd.DataFrame({"WaterLevel": data}, index=time) + + return df + + +class TestNetworkModelResult: + """Test NetworkModelResult class""" + + def test_init_with_xarray_dataset(self, sample_network_data): + """Test initialization with xr.Dataset""" + nmr = NetworkModelResult(sample_network_data) + + assert nmr.name == "WaterLevel" + assert len(nmr.time) == 10 + assert isinstance(nmr.time, pd.DatetimeIndex) + assert list(sample_network_data.node.values) == [123, 456, 789] + + def test_init_with_item_selection(self, sample_network_data): + """Test initialization with specific item""" + # Add another variable + sample_network_data["Discharge"] = sample_network_data["WaterLevel"] * 10 + + nmr = NetworkModelResult( + sample_network_data, item="WaterLevel", name="Network_WL" + ) + + assert nmr.name == "Network_WL" + assert "WaterLevel" in nmr.data.data_vars + assert "Discharge" not in nmr.data.data_vars + + def test_init_fails_without_time_dimension(self, sample_network_data): + """Test that initialization fails without time dimension""" + data_no_time = sample_network_data.drop_dims("time") + + with pytest.raises(AssertionError, match="Dataset must have time dimension"): + NetworkModelResult(data_no_time) + + def test_init_fails_without_node_dimension(self, sample_network_data): + """Test that initialization fails without node dimension""" + data_no_node = sample_network_data.drop_dims("node") + + with pytest.raises(AssertionError, match="Dataset must have node dimension"): + NetworkModelResult(data_no_node) + + def test_init_fails_with_non_dataset(self): + """Test that initialization fails with non-xarray.Dataset""" + with pytest.raises( + AssertionError, match="NetworkModelResult requires xarray.Dataset" + ): + NetworkModelResult(pd.DataFrame({"a": [1, 2, 3]})) + + def test_repr(self, sample_network_data): + """Test string representation""" + nmr = NetworkModelResult(sample_network_data, name="Test_Network") + repr_str = repr(nmr) + + assert "NetworkModelResult" in repr_str + assert "Test_Network" in repr_str + + def test_extract_valid_node(self, sample_network_data, sample_node_data): + """Test extraction of valid node""" + nmr = NetworkModelResult(sample_network_data) + obs = NodeObservation(sample_node_data, node=123, name="Node_123") + + extracted = nmr.extract(obs) + + assert isinstance(extracted, NodeModelResult) + assert extracted.node == 123 + assert extracted.name == "WaterLevel" + assert len(extracted.time) == 10 + + def test_extract_invalid_node(self, sample_network_data, sample_node_data): + """Test extraction of invalid node""" + nmr = NetworkModelResult(sample_network_data) + obs = NodeObservation(sample_node_data, node=999, name="Node_999") + + with pytest.raises(ValueError, match="Node 999 not found"): + nmr.extract(obs) + + def test_extract_wrong_observation_type(self, sample_network_data): + """Test extraction with wrong observation type""" + nmr = NetworkModelResult(sample_network_data) + + # Create a proper PointObservation with DataFrame + df = pd.DataFrame( + {"WL": [1, 2, 3]}, index=pd.date_range("2010-01-01", periods=3, freq="h") + ) + obs = ms.PointObservation(df, x=0.0, y=0.0) + + with pytest.raises( + TypeError, match="NetworkModelResult only supports NodeObservation" + ): + nmr.extract(obs) + + +class TestNodeObservation: + """Test NodeObservation class""" + + def test_init_with_dataframe(self, sample_node_data): + """Test initialization with pandas DataFrame""" + obs = NodeObservation( + sample_node_data, node=123, name="Node_123", item="WaterLevel" + ) + + assert obs.node == 123 + assert obs.name == "Node_123" + assert len(obs.time) == 10 + assert isinstance(obs.time, pd.DatetimeIndex) + + def test_init_with_series(self, sample_node_data): + """Test initialization with pandas Series""" + series = sample_node_data["WaterLevel"] + obs = NodeObservation(series, node=456, name="Node_456") + + assert obs.node == 456 + assert obs.name == "Node_456" + assert len(obs.time) == 10 + + def test_node_property(self, sample_node_data): + """Test node property""" + obs = NodeObservation(sample_node_data, node=789, name="Node_789") + + assert obs.node == 789 + assert isinstance(obs.node, int) + + def test_node_property_missing_coordinate(self, sample_node_data): + """Test node property when coordinate is missing""" + obs = NodeObservation(sample_node_data, node=123, name="Node_123") + + # Manually remove the node coordinate to test error handling + del obs.data.coords["node"] + + with pytest.raises(ValueError, match="Node coordinate not found"): + _ = obs.node + + def test_weight_property(self, sample_node_data): + """Test weight property""" + obs = NodeObservation(sample_node_data, node=123, weight=2.5) + + assert obs.weight == 2.5 + + def test_attrs_property(self, sample_node_data): + """Test attrs property""" + attrs = {"source": "test", "version": "1.0"} + obs = NodeObservation(sample_node_data, node=123, attrs=attrs) + + assert obs.attrs["source"] == "test" + assert obs.attrs["version"] == "1.0" + + +class TestNodeModelResult: + """Test NodeModelResult class""" + + def test_init_with_dataframe(self, sample_node_data): + """Test initialization with pandas DataFrame""" + nmr = NodeModelResult(sample_node_data, node=123, name="Node_123_Model") + + assert nmr.node == 123 + assert nmr.name == "Node_123_Model" + assert len(nmr.time) == 10 + + def test_init_with_series(self, sample_node_data): + """Test initialization with pandas Series""" + series = sample_node_data["WaterLevel"] + nmr = NodeModelResult(series, node=456, name="Node_456_Model") + + assert nmr.node == 456 + assert nmr.name == "Node_456_Model" + + def test_node_property(self, sample_node_data): + """Test node property""" + nmr = NodeModelResult(sample_node_data, node=789, name="Node_789_Model") + + assert nmr.node == 789 + assert isinstance(nmr.node, int) + + def test_node_property_missing_coordinate(self, sample_node_data): + """Test node property when coordinate is missing""" + nmr = NodeModelResult(sample_node_data, node=123, name="Node_123_Model") + + # Manually remove the node coordinate to test error handling + del nmr.data.coords["node"] + + with pytest.raises(ValueError, match="Node coordinate not found"): + _ = nmr.node + + +class TestNetworkIntegration: + """Test integration between network models and observations""" + + def test_network_to_node_extraction(self, sample_network_data, sample_node_data): + """Test complete workflow from network model to node extraction""" + # Create network model result + nmr = NetworkModelResult(sample_network_data, name="Network_Model") + + # Create node observation + obs = NodeObservation(sample_node_data, node=123, name="Node_123_Obs") + + # Extract node model result + extracted = nmr.extract(obs) + + # Verify extraction worked + assert isinstance(extracted, NodeModelResult) + assert extracted.node == 123 + assert extracted.name == "Network_Model" + + # Verify time alignment possibilities exist + assert len(extracted.time) == len(obs.time) + + def test_matching_workflow(self, sample_network_data, sample_node_data): + """Test matching workflow with network data""" + # Create network model result + nmr = NetworkModelResult(sample_network_data, name="Network_Model") + + # Create node observation + obs = NodeObservation(sample_node_data, node=123, name="Node_123_Obs") + + # Test that matching works + comparer = ms.match(obs, nmr) + + assert comparer is not None + assert "Network_Model" in comparer.mod_names + assert comparer.n_points > 0 diff --git a/tests/test_timeseries.py b/tests/test_timeseries.py index d362a2943..f2510e862 100644 --- a/tests/test_timeseries.py +++ b/tests/test_timeseries.py @@ -80,12 +80,12 @@ def test_timeseries_validation_fails_kind(ds_point): def test_timeseries_validation_fails_xy(ds_point): ds_without_x = ds_point.drop_vars("x") - with pytest.raises(ValueError, match="data must have an x-coordinate"): + with pytest.raises(ValueError, match="data must have either x,y"): TimeSeries(ds_without_x) # ds_point.coords["x"] = 0 ds_without_y = ds_point.drop_vars("y") - with pytest.raises(ValueError, match="data must have a y-coordinate"): + with pytest.raises(ValueError, match="data must have either x,y"): TimeSeries(ds_without_y) diff --git a/tests/testdata/network.nc b/tests/testdata/network.nc new file mode 100644 index 000000000..951c38b25 Binary files /dev/null and b/tests/testdata/network.nc differ