From b330199968074e4da25711420d0c86aa9b886b64 Mon Sep 17 00:00:00 2001
From: Penmatsa Tanoj Pavan Surya Varma
Date: Mon, 30 Mar 2026 18:12:30 +0530
Subject: [PATCH 1/2] Add _get_file_stats utility for standardized file metadata retrieval

---
 malariagen_data/util.py | 44 +++++++++++++++++++++++++++++++++++++++
 tests/test_util.py      | 46 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/malariagen_data/util.py b/malariagen_data/util.py
index 2ab75ec56..d87a23991 100644
--- a/malariagen_data/util.py
+++ b/malariagen_data/util.py
@@ -537,6 +537,50 @@ def _init_zarr_store(fs, path):
 # some subtle bugs where instances where treated as normal tuples. So to avoid
 # confusion, create a dedicated class.
 
+
+def _get_file_stats(url: str, **kwargs) -> Dict[str, Any]:
+    """Get metadata for a file at a given URL or path.
+
+    Parameters
+    ----------
+    url : str
+        The URL or path to the file.
+    **kwargs : dict
+        Additional arguments passed to _init_filesystem.
+
+    Returns
+    -------
+    Dict[str, Any]
+        A dictionary containing file statistics: size, mtime, protocol, and path.
+        The size is an int. The mtime is the "mtime" entry of the filesystem
+        info, or None if that entry is absent. The protocol is the first
+        entry when the filesystem declares several. The path is the one
+        returned by _init_filesystem.
+
+    Raises
+    ------
+    ValueError
+        If the filesystem info does not include a size for the file.
+    """
+    fs, path = _init_filesystem(url, **kwargs)
+    info = fs.info(path)
+
+    size = info.get("size")
+    if size is None:
+        raise ValueError(f"Could not determine size for file: {url}")
+
+    # A filesystem may declare several protocol names; report the first.
+    protocol = fs.protocol
+    if isinstance(protocol, (list, tuple)):
+        protocol = protocol[0]
+
+    return {
+        "size": int(size),
+        "mtime": info.get("mtime"),
+        "protocol": protocol,
+        "path": path,
+    }
+
 
 class Region:
     """A region of a reference genome, i.e., a contig or contig interval."""
diff --git a/tests/test_util.py b/tests/test_util.py
index 974046ab1..2d60cb6d0 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -2,8 +2,9 @@
 
 import pytest
 
-from malariagen_data.util import CacheMiss, Region
+from unittest.mock import MagicMock, patch
 
+from malariagen_data.util import CacheMiss, Region, _get_file_stats
 
 # ---------------------------------------------------------------------------
 # Region
@@ -61,3 +62,46 @@ def test_cache_miss_is_exception():
         raise CacheMiss("lookup_key")
     assert "lookup_key" in str(exc_info.value)
     assert repr(exc_info.value) == "CacheMiss('lookup_key')"
+
+
+def test_get_file_stats_local(tmp_path):
+    # Set up a local test file
+    content = b"test content"
+    p = tmp_path / "test.txt"
+    p.write_bytes(content)
+
+    # Test retrieval
+    stats = _get_file_stats(str(p))
+
+    assert stats["size"] == len(content)
+    assert isinstance(stats["mtime"], (float, int))
+    assert stats["protocol"] in ["file", "local"]
+    assert stats["path"] == str(p)
+
+
+def test_get_file_stats_missing_size():
+    # Mock filesystem to return None for size
+    mock_fs = MagicMock()
+    mock_fs.info.return_value = {"size": None}
+    mock_fs.protocol = "gs"
+
+    with patch("malariagen_data.util._init_filesystem", return_value=(mock_fs, "dummy/path")):
+        with pytest.raises(ValueError, match="Could not determine size for file"):
+            _get_file_stats("gs://bucket/file")
+
+
+def test_get_file_stats_protocol_normalization():
+    # Mock filesystem declaring several protocols as a tuple (common in fsspec)
+    mock_fs = MagicMock()
+    mock_fs.info.return_value = {"size": 100, "mtime": 123.4}
+    mock_fs.protocol = ("s3", "s3a")
+
+    with patch("malariagen_data.util._init_filesystem", return_value=(mock_fs, "dummy/path")):
+        stats = _get_file_stats("s3://bucket/file")
+        assert stats["protocol"] == "s3"
+
+
+def test_get_file_stats_file_not_found():
+    # Verify standard FileNotFoundError propagation
+    with pytest.raises(FileNotFoundError):
+        _get_file_stats("non_existent_file_9999.txt")

From 2328d4f2191a7b31c7a862b77a79e72bfac0f025 Mon Sep 17 00:00:00 2001
From: Penmatsa Tanoj Pavan Surya Varma
Date: Mon, 30 Mar 2026 18:29:02 +0530
Subject: [PATCH 2/2] Fix missing typing import for _get_file_stats

---
 malariagen_data/util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/malariagen_data/util.py b/malariagen_data/util.py
index d87a23991..da8a066dd 100644
--- a/malariagen_data/util.py
+++ b/malariagen_data/util.py
@@ -4,6 +4,7 @@
 import re
 import sys
 import warnings
+from typing import Dict, Any
 from collections import Counter
 from enum import Enum
 from math import prod