From 847e4ebc2cd10db80a10367c73092ea2debd5849 Mon Sep 17 00:00:00 2001 From: C1-BA-B1-F3 Date: Fri, 26 Jun 2026 10:00:49 +0800 Subject: [PATCH 1/4] fix: cache scipy flush_only_netcdf_file class to fix pickle identity (GH#11323) The class was defined inside `_open_scipy_netcdf()`, so each call created a new class object. After opening two scipy-backed datasets from file-like objects, the first dataset's class reference became unreachable by qualname, causing pickle's class-identity check to fail with: PicklingError: Can't pickle : it's not the same object as xarray.backends.scipy_._PickleWorkaround.flush_only_netcdf_file Fix: create the class once in `_get_flush_only_class()`, set its `__qualname__` to a module-level name, and register it as a module attribute so pickle can always resolve it. Regression test included. --- xarray/backends/scipy_.py | 83 +++++++++++++++++++---------------- xarray/tests/test_backends.py | 15 +++++++ 2 files changed, 61 insertions(+), 37 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 9d5f33e8947..39bac7432f7 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -122,16 +122,51 @@ def __setitem__(self, key, value): raise -# This is a dirty workaround to allow pickling of the flush_only_netcdf_file class. -# https://stackoverflow.com/questions/72766345/attributeerror-cant-pickle-local-object-in-multiprocessing -# TODO: Remove this after upstreaming the fixes to scipy. -class _PickleWorkaround: - flush_only_netcdf_file: type[scipy.io.netcdf_file] - - @classmethod - def add_cls(cls, new_class: type[Any]) -> None: - setattr(cls, new_class.__name__, new_class) - new_class.__qualname__ = cls.__qualname__ + "." + new_class.__name__ +# Cached class created once so its identity is stable for pickle. +# The class must not be re-created on each call to _open_scipy_netcdf; +# otherwise pickle sees a different class object when looking up the +# qualname and raises PicklingError (GH#11323). 
+# +# We set __qualname__ to a module-level name so pickle can always +# resolve the class via ``xarray.backends.scipy_.flush_only_netcdf_file``. +_flush_only_class: type[Any] | None = None + + +def _get_flush_only_class() -> type[Any]: + global _flush_only_class + if _flush_only_class is None: + import scipy.io + + # TODO: Remove this after upstreaming these fixes. + class flush_only_netcdf_file(scipy.io.netcdf_file): + # scipy.io.netcdf_file.close() incorrectly closes file objects that + # were passed in as constructor arguments: + # https://github.com/scipy/scipy/issues/13905 + + # Instead of closing such files, only call flush(), which is + # equivalent as long as the netcdf_file object is not mmapped. + # This suffices to keep BytesIO objects open long enough to read + # their contents from to_netcdf(), but underlying files still get + # closed when the netcdf_file is garbage collected (via __del__), + # and will need to be fixed upstream in scipy. + def close(self): + if hasattr(self, "fp") and not self.fp.closed: + self.flush() + self.fp.seek(0) # allow file to be read again + + def __del__(self): + # Remove the __del__ method, which in scipy is aliased to close(). + # These files need to be closed explicitly by xarray. + pass + + flush_only_netcdf_file.__qualname__ = "flush_only_netcdf_file" + _flush_only_class = flush_only_netcdf_file + # Make the class accessible as a module attribute so pickle can + # resolve it by qualname ``xarray.backends.scipy_.flush_only_netcdf_file``. + import sys + + sys.modules[__name__].flush_only_netcdf_file = _flush_only_class + return _flush_only_class def _open_scipy_netcdf( @@ -143,33 +178,7 @@ def _open_scipy_netcdf( ) -> scipy.io.netcdf_file: import scipy.io - # TODO: Remove this after upstreaming these fixes. 
- class flush_only_netcdf_file(scipy.io.netcdf_file): - # scipy.io.netcdf_file.close() incorrectly closes file objects that - # were passed in as constructor arguments: - # https://github.com/scipy/scipy/issues/13905 - - # Instead of closing such files, only call flush(), which is - # equivalent as long as the netcdf_file object is not mmapped. - # This suffices to keep BytesIO objects open long enough to read - # their contents from to_netcdf(), but underlying files still get - # closed when the netcdf_file is garbage collected (via __del__), - # and will need to be fixed upstream in scipy. - def close(self): - if hasattr(self, "fp") and not self.fp.closed: - self.flush() - self.fp.seek(0) # allow file to be read again - - def __del__(self): - # Remove the __del__ method, which in scipy is aliased to close(). - # These files need to be closed explicitly by xarray. - pass - - _PickleWorkaround.add_cls(flush_only_netcdf_file) - - netcdf_file = ( - _PickleWorkaround.flush_only_netcdf_file if flush_only else scipy.io.netcdf_file - ) + netcdf_file = _get_flush_only_class() if flush_only else scipy.io.netcdf_file # if the string ends with .gz, then gunzip and open as netcdf file if isinstance(filename, str) and filename.endswith(".gz"): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4e08b71260b..33a9e3c9deb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4579,6 +4579,21 @@ def roundtrip( with self.open(saved, **open_kwargs) as ds: yield ds + def test_pickle_after_multiple_opens_from_bytes(self) -> None: + # Regression test for GH#11323: opening two scipy-backed datasets + # from BytesIO objects would overwrite the cached flush_only class, + # making the first dataset unpicklable. 
+ original = Dataset({"foo": ("x", [1, 2, 3])}) + netcdf_bytes = bytes(original.to_netcdf(engine=self.engine)) + ds1 = open_dataset(BytesIO(netcdf_bytes), engine=self.engine) + ds2 = open_dataset(BytesIO(netcdf_bytes), engine=self.engine) + try: + with pickle.loads(pickle.dumps(ds1)) as unpickled: + assert_identical(unpickled, original) + finally: + ds1.close() + ds2.close() + @pytest.mark.asyncio @pytest.mark.skip(reason="NetCDF backends don't support async loading") async def test_load_async(self) -> None: From 037a8d31b7fe9c89763a48929b99371338bee3f4 Mon Sep 17 00:00:00 2001 From: CI Bot Date: Fri, 26 Jun 2026 16:04:11 +0800 Subject: [PATCH 2/4] fix: add type ignore for dynamic module attribute assignment The line dynamically adds an attribute to the module for pickle resolution. Mypy cannot track this pattern, so we add `# type: ignore[attr-defined]`. Fixes mypy error: xarray/backends/scipy_.py:168: error: Module has no attribute "flush_only_netcdf_file" [attr-defined] --- xarray/backends/scipy_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 39bac7432f7..13dff16aee3 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -165,7 +165,7 @@ def __del__(self): # resolve it by qualname ``xarray.backends.scipy_.flush_only_netcdf_file``. import sys - sys.modules[__name__].flush_only_netcdf_file = _flush_only_class + sys.modules[__name__].flush_only_netcdf_file = _flush_only_class # type: ignore[attr-defined] return _flush_only_class From a2cb58d8c70d27438db7aa2d04a71e6b4b462918 Mon Sep 17 00:00:00 2001 From: Fix 11417 Date: Fri, 26 Jun 2026 20:06:53 +0800 Subject: [PATCH 3/4] fix: preserve scalar variables in reduce operations (GH#11417) When calling reduce operations like sum/mean on a Dataset with scalar (non-dimensional) data variables, the reduce_maybe_single logic would set axis=None for 0-d variables with no matching reduce dims.
This caused numpy to attempt reduction on the scalar value itself, which failed for non-numeric types like strings. The fix adds a check that reduce_dims is non-empty before setting reduce_maybe_single=None. When reduce_dims is empty (variable doesn't have the target dimension), reduce_maybe_single stays as [] (empty list), which triggers the invariant_0d check in duck_array_ops and returns the scalar value unchanged. Closes #11417 --- xarray/core/dataset.py | 4 +++- xarray/tests/test_dataset.py | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1ce84904623..bf4fae2922a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6966,7 +6966,9 @@ def reduce( # keep single-element dims as list, to support Hashables reduce_maybe_single = ( None - if len(reduce_dims) == var.ndim and var.ndim != 1 + if reduce_dims + and len(reduce_dims) == var.ndim + and var.ndim != 1 else reduce_dims ) variables[name] = var.reduce( diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 762c647f15c..c95816ca904 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -6361,6 +6361,31 @@ def test_reduce_scalars(self) -> None: actual = ds.var("a") assert_identical(expected, actual) + def test_reduce_string_scalar(self) -> None: + # regression test for GH#11417 + # scalar variables without the reduce dim should be preserved + ds = Dataset( + data_vars={ + "a": (["index"], [1, 2, 3]), + "d": ([], "hello"), + } + ) + expected = Dataset({"a": 6, "d": "hello"}) + actual = ds.sum("index") + assert_identical(expected, actual) + + expected = Dataset({"a": 2.0, "d": "hello"}) + actual = ds.mean("index") + assert_identical(expected, actual) + + expected = Dataset({"a": 1, "d": "hello"}) + actual = ds.min("index") + assert_identical(expected, actual) + + expected = Dataset({"a": 3, "d": "hello"}) + actual = ds.max("index") + 
assert_identical(expected, actual) + def test_reduce_only_one_axis(self) -> None: def mean_only_one_axis(x, axis): if not isinstance(axis, integer_types): From 11415677e87e0c26e38d2672ba5679779d0575ce Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Jun 2026 12:08:02 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bf4fae2922a..e40b9a7b104 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6966,9 +6966,7 @@ def reduce( # keep single-element dims as list, to support Hashables reduce_maybe_single = ( None - if reduce_dims - and len(reduce_dims) == var.ndim - and var.ndim != 1 + if reduce_dims and len(reduce_dims) == var.ndim and var.ndim != 1 else reduce_dims ) variables[name] = var.reduce(