From 7215b782e4b3cb514c2aa1ddb45f3f0b434758e3 Mon Sep 17 00:00:00 2001 From: gaoflow Date: Tue, 2 Jun 2026 00:17:24 +0200 Subject: [PATCH 1/8] Download dataset files atomically to avoid corrupt cache (#4097) `_download` wrote directly to the destination cache path, so a concurrent reader (e.g. a parallel pytest worker sharing `settings.datasetdir`) could find the file present via `path.is_file()` and read it while it was still being written, getting a corrupted/partial file. Download to a temporary file in the same directory and atomically rename it into place once complete, so the destination only ever appears fully written. On failure the temporary file is removed and the destination is left untouched (it may belong to another process that finished first). --- src/scanpy/readwrite.py | 25 +++++++++++++++++---- tests/test_datasets.py | 50 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/src/scanpy/readwrite.py b/src/scanpy/readwrite.py index 9a3d3748fe..db211f756e 100644 --- a/src/scanpy/readwrite.py +++ b/src/scanpy/readwrite.py @@ -1084,6 +1084,7 @@ def _get_filename_from_key(key, ext=None) -> Path: def _download(url: str, path: Path): from ssl import create_default_context + from tempfile import NamedTemporaryFile from urllib.request import Request, urlopen from certifi import contents @@ -1092,6 +1093,11 @@ def _download(url: str, path: Path): blocksize = 1024 * 8 blocknum = 0 + # Download to a temporary file in the same directory and atomically move it + # into place once it is complete. This way concurrent readers (e.g. parallel + # pytest workers sharing a dataset cache) never observe ``path`` in a + # partially-written state. See #4097. + tmp_path: Path | None = None try: req = Request(url, headers={"User-agent": "scanpy-user"}) @@ -1105,8 +1111,14 @@ def _download(url: str, path: Path): unit_divisor=1024, total=total if total is None else int(total), ) as t, - path.open("wb") as f, + NamedTemporaryFile( + dir=path.parent, + prefix=f"{path.name}.", + suffix=".part", + delete=False, + ) as f, ): + tmp_path = Path(f.name) block = resp.read(blocksize) while block: f.write(block) @@ -1114,10 +1126,15 @@ def _download(url: str, path: Path): t.update(len(block)) block = resp.read(blocksize) + tmp_path.replace(path) + tmp_path = None + except (KeyboardInterrupt, Exception): - # Make sure file doesn’t exist half-downloaded - if path.is_file(): - path.unlink() + # Make sure no half-downloaded temporary file is left behind. ``path`` + # itself is only ever created by the atomic rename above, so it is left + # untouched (it may belong to another process that finished first). + if tmp_path is not None: + tmp_path.unlink(missing_ok=True) raise diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e97d12a65a..157b860894 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -168,6 +168,56 @@ def test_download_failure() -> None: excinfo.value.close() +def test_download_atomic(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + """A download must be atomic (#4097). + + The destination path must not appear until the download has completed, so + that concurrent readers (e.g. parallel pytest workers sharing a cache) never + observe a partially-written file. + """ + import io + import urllib.request + + from scanpy.readwrite import _download + + content = b"0123456789" * 5_000 + dest = tmp_path / "cache" / "data.bin" + dest.parent.mkdir() + dest_present_during_download: list[bool] = [] + + class FakeResponse: + def __init__(self) -> None: + self._buf = io.BytesIO(content) + + def info(self) -> dict[str, str]: + return {"content-length": str(len(content))} + + def read(self, size: int) -> bytes: + chunk = self._buf.read(size) + if chunk: + dest_present_during_download.append(dest.exists()) + return chunk + + def __enter__(self) -> FakeResponse: + return self + + def __exit__(self, *exc: object) -> bool: + return False + + monkeypatch.setattr(urllib.request, "urlopen", lambda *a, **k: FakeResponse()) + + _download("http://example.invalid/data.bin", dest) + + assert dest.read_bytes() == content + # sanity check that the download was actually streamed in several chunks + assert len(dest_present_during_download) > 1 + assert not any(dest_present_during_download), ( + "destination appeared before the download finished" + ) + # the temporary file must have been renamed, leaving only the final file + assert list(dest.parent.iterdir()) == [dest] + + # These are tested via doctest DS_INCLUDED = frozenset({"krumsiek11", "toggleswitch", "pbmc68k_reduced"}) # These have parameters that affect shape and so on From 02b78890f1e5581cbc43d36fdcedfef4c4aa5121 Mon Sep 17 00:00:00 2001 From: gaoflow Date: Tue, 2 Jun 2026 00:17:48 +0200 Subject: [PATCH 2/8] docs: add release note for #4142 --- docs/release-notes/4142.fix.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/release-notes/4142.fix.md diff --git a/docs/release-notes/4142.fix.md b/docs/release-notes/4142.fix.md new file mode 100644 index 0000000000..50b4a7693d --- /dev/null +++ b/docs/release-notes/4142.fix.md @@ -0,0 +1 @@ +Download cached dataset files atomically so that parallel processes sharing a dataset directory (e.g. {mod}`pytest-xdist` workers) can no longer read a partially-written file {smaller}`gaoflow` From 085cafd777e9e86117e1bf048f7d71baccafc197 Mon Sep 17 00:00:00 2001 From: gaoflow Date: Wed, 10 Jun 2026 07:13:25 +0200 Subject: [PATCH 3/8] docs: avoid cross-ref role for pytest-xdist in release note --- docs/release-notes/4142.fix.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/4142.fix.md b/docs/release-notes/4142.fix.md index 50b4a7693d..9305dcfed4 100644 --- a/docs/release-notes/4142.fix.md +++ b/docs/release-notes/4142.fix.md @@ -1 +1 @@ -Download cached dataset files atomically so that parallel processes sharing a dataset directory (e.g. {mod}`pytest-xdist` workers) can no longer read a partially-written file {smaller}`gaoflow` +Download cached dataset files atomically so that parallel processes sharing a dataset directory (e.g. `pytest-xdist` workers) can no longer read a partially-written file {smaller}`gaoflow` From a367e0e2f36b2f4ba4bcd20049cf922aa8a761b7 Mon Sep 17 00:00:00 2001 From: gaoflow Date: Wed, 10 Jun 2026 07:14:39 +0200 Subject: [PATCH 4/8] test: return Self from FakeResponse.__enter__ (PYI034) --- tests/test_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 157b860894..a348b116d6 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: from collections.abc import Callable + from typing import Self from anndata import AnnData @@ -198,7 +199,7 @@ def read(self, size: int) -> bytes: dest_present_during_download.append(dest.exists()) return chunk - def __enter__(self) -> FakeResponse: + def __enter__(self) -> Self: return self def __exit__(self, *exc: object) -> bool: From 7a1c3ef19e1b43dcdfad816062a5be64dfbaf0ee Mon Sep 17 00:00:00 2001 From: gaoflow Date: Wed, 10 Jun 2026 07:33:57 +0200 Subject: [PATCH 5/8] test: cover the download error path that preserves an existing file --- tests/test_datasets.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a348b116d6..d3c6b323f1 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -219,6 +219,48 @@ def __exit__(self, *exc: object) -> bool: assert list(dest.parent.iterdir()) == [dest] +def test_download_failure_keeps_existing_file( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A failed download must not delete an already-present destination (#4097). + + The old code unconditionally removed ``path`` on error, which under the + parallel-cache race could wipe a file another process had finished + downloading. The atomic version only cleans up its own temporary file. + """ + import io + import urllib.request + + from scanpy.readwrite import _download + + dest = tmp_path / "cache" / "data.bin" + dest.parent.mkdir() + # A sibling process already finished this download. + dest.write_bytes(b"complete") + + class FailingResponse: + def info(self) -> dict[str, str]: + return {"content-length": "100"} + + def read(self, size: int) -> bytes: + raise OSError("connection reset") + + def __enter__(self) -> Self: + return self + + def __exit__(self, *exc: object) -> bool: + return False + + monkeypatch.setattr(urllib.request, "urlopen", lambda *a, **k: FailingResponse()) + + with pytest.raises(OSError, match="connection reset"): + _download("http://example.invalid/data.bin", dest) + + # The pre-existing file is untouched and no temporary file is left behind. + assert dest.read_bytes() == b"complete" + assert list(dest.parent.iterdir()) == [dest] + + # These are tested via doctest DS_INCLUDED = frozenset({"krumsiek11", "toggleswitch", "pbmc68k_reduced"}) # These have parameters that affect shape and so on From 9d0e4af2d5659345dafd80701fefd4d0b2d74558 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Jun 2026 05:34:07 +0000 Subject: [PATCH 6/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index d3c6b323f1..d852ca6e9e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -228,7 +228,6 @@ def test_download_failure_keeps_existing_file( parallel-cache race could wipe a file another process had finished downloading. The atomic version only cleans up its own temporary file. """ - import io import urllib.request from scanpy.readwrite import _download From cdc13367178e91982444926ee9de04bcfadb9542 Mon Sep 17 00:00:00 2001 From: gaoflow Date: Wed, 10 Jun 2026 08:55:10 +0200 Subject: [PATCH 7/8] test: assign exception message to variable (EM101) --- tests/test_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index d852ca6e9e..33dcb11bb2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -242,7 +242,8 @@ def info(self) -> dict[str, str]: return {"content-length": "100"} def read(self, size: int) -> bytes: - raise OSError("connection reset") + msg = "connection reset" + raise OSError(msg) def __enter__(self) -> Self: return self From 94ae93c5c696353307218cc6104ef59e772027e0 Mon Sep 17 00:00:00 2001 From: gaoflow Date: Wed, 10 Jun 2026 09:12:56 +0200 Subject: [PATCH 8/8] refactor: trim verbose comments in download fix and tests --- src/scanpy/readwrite.py | 9 ++------- tests/test_datasets.py | 22 +++------------------- 2 files changed, 5 insertions(+), 26 deletions(-) diff --git a/src/scanpy/readwrite.py b/src/scanpy/readwrite.py index db211f756e..fc10b07bee 100644 --- a/src/scanpy/readwrite.py +++ b/src/scanpy/readwrite.py @@ -1093,10 +1093,7 @@ def _download(url: str, path: Path): blocksize = 1024 * 8 blocknum = 0 - # Download to a temporary file in the same directory and atomically move it - # into place once it is complete. This way concurrent readers (e.g. parallel - # pytest workers sharing a dataset cache) never observe ``path`` in a - # partially-written state. See #4097. + # Write to a temp file and rename so readers never see a partial file (#4097). tmp_path: Path | None = None try: req = Request(url, headers={"User-agent": "scanpy-user"}) @@ -1130,9 +1127,7 @@ def _download(url: str, path: Path): tmp_path = None except (KeyboardInterrupt, Exception): - # Make sure no half-downloaded temporary file is left behind. ``path`` - # itself is only ever created by the atomic rename above, so it is left - # untouched (it may belong to another process that finished first). + # Only remove our own temp file; leave path, which may be another process's. if tmp_path is not None: tmp_path.unlink(missing_ok=True) raise diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 33dcb11bb2..7d72ddd236 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -170,12 +170,7 @@ def test_download_failure() -> None: def test_download_atomic(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """A download must be atomic (#4097). - - The destination path must not appear until the download has completed, so - that concurrent readers (e.g. parallel pytest workers sharing a cache) never - observe a partially-written file. - """ + """The destination must not appear until the download finished (#4097).""" import io import urllib.request @@ -210,31 +205,21 @@ def __exit__(self, *exc: object) -> bool: _download("http://example.invalid/data.bin", dest) assert dest.read_bytes() == content - # sanity check that the download was actually streamed in several chunks assert len(dest_present_during_download) > 1 - assert not any(dest_present_during_download), ( - "destination appeared before the download finished" - ) - # the temporary file must have been renamed, leaving only the final file + assert not any(dest_present_during_download) assert list(dest.parent.iterdir()) == [dest] def test_download_failure_keeps_existing_file( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """A failed download must not delete an already-present destination (#4097). - - The old code unconditionally removed ``path`` on error, which under the - parallel-cache race could wipe a file another process had finished - downloading. The atomic version only cleans up its own temporary file. - """ + """A failed download must not delete an already-present destination (#4097).""" import urllib.request from scanpy.readwrite import _download dest = tmp_path / "cache" / "data.bin" dest.parent.mkdir() - # A sibling process already finished this download. dest.write_bytes(b"complete") class FailingResponse: @@ -256,7 +241,6 @@ def __exit__(self, *exc: object) -> bool: with pytest.raises(OSError, match="connection reset"): _download("http://example.invalid/data.bin", dest) - # The pre-existing file is untouched and no temporary file is left behind. assert dest.read_bytes() == b"complete" assert list(dest.parent.iterdir()) == [dest]