diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 80f9076e..28626b52 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -19,6 +19,7 @@ Upcoming Version *Other* +* Default internal integer labels to ``int32`` (configurable via ``linopy.options["label_dtype"]``, set to ``np.int64`` for the old behavior), cutting memory ~25% and speeding up model build 10-35%. Models exceeding the int32 maximum automatically widen their labels to a larger int dtype instead of raising; ``label_dtype`` acts as a floor. * ``add_variables(binary=True, ...)`` now accepts ``lower``/``upper`` bounds, as long as they are 0 or 1. Previously binary bounds could only be set via the ``.lower``/``.upper`` setters after creation. (https://github.com/PyPSA/linopy/issues/776) * ``add_piecewise_formulation`` gained an ``active_fill`` parameter that gates a partial ``active`` (defined over a subset of the indexed dimension, or masked) as always-active (``1``) or always-off (``0``); without it, a partial ``active`` — which was previously zeroed silently — now raises. Useful when one formulation mixes gated and ungated entities (e.g. committable and non-committable units sharing a ``status``). ``active_fill`` is transitional and will be removed once v1 semantics make ``active.reindex(coords).fillna(value)`` sufficient. (https://github.com/PyPSA/linopy/issues/796) diff --git a/linopy/common.py b/linopy/common.py index 9ee9777d..0e540bde 100644 --- a/linopy/common.py +++ b/linopy/common.py @@ -8,7 +8,6 @@ from __future__ import annotations import operator -import os from collections.abc import Callable, Generator, Hashable, Iterable, Sequence from functools import cached_property, reduce, wraps from pathlib import Path @@ -159,12 +158,10 @@ def infer_schema_polars(ds: Dataset) -> dict[str, DataTypeClass]: dict: A dictionary mapping column names to their corresponding Polars data types. 
def fitting_label_dtype(max_value: int) -> type[signedinteger[Any]]:
    """
    Choose the integer dtype used to store labels up to ``max_value``.

    The configured ``options["label_dtype"]`` is a lower bound on the width:
    it is returned whenever it is at least as wide as the dtype that
    ``best_int`` reports for ``max_value``. Only when ``best_int`` asks for a
    wider dtype is that wider dtype returned instead.

    Parameters
    ----------
    max_value : int
        Largest label value that has to be representable. Negative values
        skip the ``best_int`` lookup and yield the configured dtype.

    Returns
    -------
    type[signedinteger]
        The configured label dtype, or the wider dtype from ``best_int``.
    """
    configured = options["label_dtype"]
    if max_value < 0:
        # Nothing non-negative to fit, so the configured dtype is used as is.
        return configured

    required = best_int(max_value)
    # Equal widths resolve to the configured dtype; only a strictly wider
    # requirement replaces it.
    if np.dtype(required).itemsize > np.dtype(configured).itemsize:
        return required
    return configured


def astype_labels(da: DataArray, fill_value: int = -1) -> DataArray:
    """
    Replace missing entries in a labels array and cast it to an integer dtype.

    Missing values are filled with ``fill_value``; the target dtype is then
    picked by :func:`fitting_label_dtype` from the largest filled value. An
    empty array is treated as having a largest value of ``0``.

    Parameters
    ----------
    da : DataArray
        Labels array, possibly containing missing entries.
    fill_value : int, default -1
        Value written in place of missing entries.

    Returns
    -------
    DataArray
        The filled array cast to the selected integer dtype.
    """
    labels = da.fillna(fill_value)
    # ``max()`` is only evaluated when there is at least one element.
    largest = int(labels.max()) if labels.size else 0
    target_dtype = fitting_label_dtype(largest)
    return labels.astype(target_dtype)
LocIndexer, assign_multiindex_safe, + astype_labels, check_common_keys_values, check_has_nulls, check_has_nulls_polars, @@ -451,7 +452,7 @@ def __init__(self, data: Dataset | Any | None, model: Model) -> None: ) if np.issubdtype(data.vars, np.floating): - data = assign_multiindex_safe(data, vars=data.vars.fillna(-1).astype(int)) + data = assign_multiindex_safe(data, vars=astype_labels(data.vars)) if not np.issubdtype(data.coeffs, np.floating): data["coeffs"].values = data.coeffs.values.astype(float) @@ -1535,7 +1536,7 @@ def sanitize(self) -> Self: linopy.LinearExpression """ if not np.issubdtype(self.vars.dtype, np.integer): - return self.assign(vars=self.vars.fillna(-1).astype(int)) + return self.assign(vars=astype_labels(self.vars)) return self @@ -1939,12 +1940,12 @@ def _simplify_row(vars_row: np.ndarray, coeffs_row: np.ndarray) -> np.ndarray: # Combined has dimensions (.., CV_DIM, TERM_DIM) # Drop terms where all vars are -1 (i.e., empty terms across all coordinates) - vars = combined.isel({CV_DIM: 0}).astype(int) + vars = astype_labels(combined.isel({CV_DIM: 0})) non_empty_terms = (vars != -1).any(dim=[d for d in vars.dims if d != TERM_DIM]) combined = combined.isel({TERM_DIM: non_empty_terms}) # Extract vars and coeffs from the combined result - vars = combined.isel({CV_DIM: 0}).astype(int) + vars = astype_labels(combined.isel({CV_DIM: 0})) coeffs = combined.isel({CV_DIM: 1}) # Create new dataset with simplified data diff --git a/linopy/model.py b/linopy/model.py index de5c089f..9f2af5bd 100644 --- a/linopy/model.py +++ b/linopy/model.py @@ -31,6 +31,7 @@ from linopy.common import ( assign_multiindex_safe, best_int, + fitting_label_dtype, maybe_replace_signs, replace_by_map, to_path, @@ -824,7 +825,9 @@ def add_variables( start = self._xCounter end = start + data.labels.size - data.labels.values = np.arange(start, end).reshape(data.labels.shape) + data.labels.values = np.arange( + start, end, dtype=fitting_label_dtype(end) + ).reshape(data.labels.shape) 
self._xCounter += data.labels.size if mask is not None: @@ -969,7 +972,9 @@ def _allocate_constraint_labels( """Assign label ranges from the constraint counter and apply an optional mask.""" start = self._cCounter end = start + data.labels.size - data.labels.values = np.arange(start, end).reshape(data.labels.shape) + data.labels.values = np.arange( + start, end, dtype=fitting_label_dtype(end) + ).reshape(data.labels.shape) self._cCounter += data.labels.size if mask is not None: data.labels.values = np.where(mask.values, data.labels.values, -1) diff --git a/linopy/variables.py b/linopy/variables.py index 0eed704d..20dce355 100644 --- a/linopy/variables.py +++ b/linopy/variables.py @@ -37,6 +37,7 @@ LocIndexer, VariableLabelIndex, assign_multiindex_safe, + astype_labels, check_has_nulls, check_has_nulls_polars, filter_nulls_polars, @@ -1265,6 +1266,7 @@ def ffill(self, dim: str, limit: None = None) -> Variable: ------- linopy.Variable """ + label_dtype = self.labels.dtype data = ( self.data.where(self.labels != -1) # .ffill(dim, limit=limit) @@ -1272,7 +1274,7 @@ def ffill(self, dim: str, limit: None = None) -> Variable: .map(DataArray.ffill, dim=dim, limit=limit) .fillna(self._fill_value) ) - return self.assign_multiindex_safe(labels=data.labels.astype(int)) + return self.assign_multiindex_safe(labels=data.labels.astype(label_dtype)) def bfill(self, dim: str, limit: None = None) -> Variable: """ @@ -1292,6 +1294,7 @@ def bfill(self, dim: str, limit: None = None) -> Variable: ------- linopy.Variable """ + label_dtype = self.labels.dtype data = ( self.data.where(~self.isnull()) # .bfill(dim, limit=limit) @@ -1299,7 +1302,7 @@ def bfill(self, dim: str, limit: None = None) -> Variable: .map(DataArray.bfill, dim=dim, limit=limit) .fillna(self._fill_value) ) - return self.assign(labels=data.labels.astype(int)) + return self.assign(labels=data.labels.astype(label_dtype)) def sanitize(self) -> Variable: """ @@ -1310,7 +1313,7 @@ def sanitize(self) -> Variable: 
"""Tests for int32 default label dtype."""

import numpy as np
import pytest

from linopy import Model
from linopy.config import options


def test_default_label_dtype_is_int32() -> None:
    """The ``label_dtype`` option defaults to ``np.int32``."""
    assert options["label_dtype"] == np.int32


def test_variable_labels_are_int32() -> None:
    """Labels of a freshly added variable use int32."""
    m = Model()
    x = m.add_variables(lower=0, upper=10, coords=[range(5)], name="x")
    assert x.labels.dtype == np.int32


def test_constraint_labels_are_int32() -> None:
    """Labels of a freshly added constraint use int32."""
    m = Model()
    x = m.add_variables(lower=0, upper=10, coords=[range(5)], name="x")
    m.add_constraints(x >= 1, name="c")
    assert m.constraints["c"].labels.dtype == np.int32


def test_expression_vars_are_int32() -> None:
    """The ``vars`` array of an expression built from a variable is int32."""
    m = Model()
    x = m.add_variables(lower=0, upper=10, coords=[range(5)], name="x")
    expr = 2 * x + 1
    assert expr.vars.dtype == np.int32


def test_solve_with_int32_labels() -> None:
    """A small model with int32 labels solves with HiGHS."""
    # Skip only this test when the solver is missing. Calling
    # ``pytest.importorskip`` inside a decorator argument runs at import time
    # and would skip the whole module, including the solver-free dtype tests.
    pytest.importorskip("highspy", reason="highspy not installed")

    m = Model()
    x = m.add_variables(lower=0, upper=10, name="x")
    y = m.add_variables(lower=0, upper=10, name="y")
    m.add_constraints(x + y <= 15, name="c1")
    m.add_objective(x + 2 * y, sense="max")
    m.solve("highs")
    assert m.objective.value == pytest.approx(25.0)


def test_variable_labels_widen_past_int32() -> None:
    """Variable labels switch to int64 once they pass the int32 maximum."""
    m = Model()
    # Start the counter just below the int32 ceiling so 5 labels cross it.
    m._xCounter = np.iinfo(np.int32).max - 1
    x = m.add_variables(lower=0, upper=1, coords=[range(5)], name="x")
    assert x.labels.dtype == np.int64
    assert int(x.labels.max()) > np.iinfo(np.int32).max


def test_constraint_labels_widen_past_int32() -> None:
    """Constraint labels switch to int64 once they pass the int32 maximum."""
    m = Model()
    x = m.add_variables(lower=0, upper=1, coords=[range(5)], name="x")
    # Start the counter just below the int32 ceiling so 5 labels cross it.
    m._cCounter = np.iinfo(np.int32).max - 1
    m.add_constraints(x >= 0, name="c")
    assert m.constraints["c"].labels.dtype == np.int64
    assert int(m.constraints["c"].labels.max()) > np.iinfo(np.int32).max


def test_fitting_label_dtype_floors_and_widens() -> None:
    """``fitting_label_dtype`` keeps int32 up to its maximum, then widens."""
    from linopy.common import fitting_label_dtype

    # below the int32 ceiling: floored at the configured default
    assert fitting_label_dtype(100) == np.int32
    assert fitting_label_dtype(np.iinfo(np.int32).max) == np.int32
    # above it: widened, never truncated
    assert fitting_label_dtype(np.iinfo(np.int32).max + 1) == np.int64


def test_astype_labels_preserves_values_past_int32() -> None:
    """``astype_labels`` keeps values above the int32 maximum intact."""
    # The label cast-back paths (ffill / sanitize / save_join / ...) must not
    # truncate labels beyond the int32 ceiling back to the int32 default.
    from xarray import DataArray

    from linopy.common import astype_labels

    big = np.iinfo(np.int32).max + 10
    # simulate the float round-trip these paths see (NaN -> -1 fill)
    da = DataArray(np.array([big, big + 1, np.nan], dtype=float))
    out = astype_labels(da)
    assert out.dtype == np.int64
    np.testing.assert_array_equal(out.values, [big, big + 1, -1])


def test_label_dtype_option_int64() -> None:
    """With ``label_dtype`` set to int64, labels and expression vars are int64."""
    with options:
        options["label_dtype"] = np.int64
        m = Model()
        x = m.add_variables(lower=0, upper=10, coords=[range(5)], name="x")
        assert x.labels.dtype == np.int64
        expr = 2 * x + 1
        assert expr.vars.dtype == np.int64


def test_label_dtype_rejects_invalid() -> None:
    """Assigning a non-integer dtype to ``label_dtype`` raises ``ValueError``."""
    with pytest.raises(ValueError, match="label_dtype must be one of"):
        options["label_dtype"] = np.float64