diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py
index 9d5f33e8947..90a9d07add1 100644
--- a/xarray/backends/scipy_.py
+++ b/xarray/backends/scipy_.py
@@ -134,16 +134,26 @@ def add_cls(cls, new_class: type[Any]) -> None:
         new_class.__qualname__ = cls.__qualname__ + "." + new_class.__name__
 
 
-def _open_scipy_netcdf(
-    filename: str | os.PathLike[Any] | IO[bytes],
-    mode: Literal["r", "w", "a"],
-    mmap: bool | None,
-    version: Literal[1, 2],
-    flush_only: bool = False,
-) -> scipy.io.netcdf_file:
+# Cache for the flush_only_netcdf_file class to prevent identity mismatch on pickle.
+# GH#11323: Creating a new class on each call to _open_scipy_netcdf() breaks pickling
+# because subsequent calls overwrite _PickleWorkaround.flush_only_netcdf_file with a
+# new class object, making previously-created instances unpicklable.
+_flush_only_class: type[scipy.io.netcdf_file] | None = None
+
+
+def _get_flush_only_class() -> type[scipy.io.netcdf_file]:
+    """Return a cached subclass of scipy.io.netcdf_file that only flushes on close.
+
+    The subclass is built on the first call, registered on ``_PickleWorkaround``
+    and stored in ``_flush_only_class``; later calls return that same object.
+    """
+    global _flush_only_class
+    if _flush_only_class is not None:
+        return _flush_only_class
+
     import scipy.io
 
     # TODO: Remove this after upstreaming these fixes.
     class flush_only_netcdf_file(scipy.io.netcdf_file):
         # scipy.io.netcdf_file.close() incorrectly closes file objects that
         # were passed in as constructor arguments:
@@ -166,10 +176,20 @@ def __del__(self):
             pass
 
     _PickleWorkaround.add_cls(flush_only_netcdf_file)
+    _flush_only_class = flush_only_netcdf_file
+    return _flush_only_class
+
+
+def _open_scipy_netcdf(
+    filename: str | os.PathLike[Any] | IO[bytes],
+    mode: Literal["r", "w", "a"],
+    mmap: bool | None,
+    version: Literal[1, 2],
+    flush_only: bool = False,
+) -> scipy.io.netcdf_file:
+    import scipy.io
 
-    netcdf_file = (
-        _PickleWorkaround.flush_only_netcdf_file if flush_only else scipy.io.netcdf_file
-    )
+    netcdf_file = _get_flush_only_class() if flush_only else scipy.io.netcdf_file
 
     # if the string ends with .gz, then gunzip and open as netcdf file
     if isinstance(filename, str) and filename.endswith(".gz"):
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 4e08b71260b..7ae6a2a9a98 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -4682,6 +4682,32 @@ def test_nc4_scipy(self) -> None:
                 open_dataset(tmp_file, engine="scipy")
 
 
+@requires_scipy
+def test_scipy_pickle_after_multiple_opens() -> None:
+    """Regression test for GH#11323.
+
+    Opening multiple scipy-backed datasets from file-like objects should not
+    break pickling of previously-opened datasets. The bug was that each call
+    to _open_scipy_netcdf() created a new flush_only_netcdf_file class,
+    overwriting the previous one and breaking pickle's class-identity check.
+    """
+    ds = Dataset(
+        {"foo": (("x",), np.arange(4, dtype=np.float64))},
+        coords={"x": np.arange(4)},
+    )
+    buf = BytesIO()
+    ds.to_netcdf(buf, engine="scipy")
+
+    buf.seek(0)
+    with open_dataset(buf, engine="scipy") as ds1:
+        buf.seek(0)
+        with open_dataset(buf, engine="scipy") as ds2:
+            # Neither dump should raise PicklingError; ds1 was opened before
+            # the second open_dataset() call, which is the case that broke.
+            pickle.dumps(ds1)
+            pickle.dumps(ds2)
+
+
 @requires_netCDF4
 class TestNetCDF3ViaNetCDF4Data(NetCDF3Only, CFEncodedBase):
     engine: T_NetcdfEngine = "netcdf4"