Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions malariagen_data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
import sys
import warnings
from typing import Dict, Any
from collections import Counter
from enum import Enum
from math import prod
Expand Down Expand Up @@ -537,6 +538,38 @@ def _init_zarr_store(fs, path):
# some subtle bugs where instances were treated as normal tuples. So to avoid
# confusion, create a dedicated class.

def _get_file_stats(url: str, **kwargs) -> Dict[str, Any]:
    """Look up basic metadata for the file at a URL or path.

    Parameters
    ----------
    url : str
        The URL or path of the file to inspect.
    **kwargs : dict
        Extra keyword arguments forwarded to _init_filesystem.

    Returns
    -------
    Dict[str, Any]
        A dictionary with four keys: ``size`` (the reported size converted
        to int), ``mtime`` (whatever the filesystem info holds under
        "mtime", or None when that key is absent), ``protocol`` (a single
        protocol name) and ``path`` (the path returned by _init_filesystem).

    Raises
    ------
    ValueError
        If the filesystem info does not provide a size for the file.
    """
    fs, path = _init_filesystem(url, **kwargs)
    file_info = fs.info(path)

    raw_size = file_info.get("size")
    if raw_size is None:
        raise ValueError(f"Could not determine size for file: {url}")

    # The filesystem may expose its protocol either as one name or as a
    # list/tuple of names; in the latter case only the first is reported.
    scheme = fs.protocol
    if isinstance(scheme, (list, tuple)):
        scheme = scheme[0]

    stats: Dict[str, Any] = {}
    stats["size"] = int(raw_size)
    stats["mtime"] = file_info.get("mtime")
    stats["protocol"] = scheme
    stats["path"] = path
    return stats

class Region:
"""A region of a reference genome, i.e., a contig or contig interval."""
Expand Down
42 changes: 41 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

import pytest

from malariagen_data.util import CacheMiss, Region
from unittest.mock import MagicMock, patch

from malariagen_data.util import CacheMiss, Region, _get_file_stats

# ---------------------------------------------------------------------------
# Region
Expand Down Expand Up @@ -61,3 +62,42 @@ def test_cache_miss_is_exception():
raise CacheMiss("lookup_key")
assert "lookup_key" in str(exc_info.value)
assert repr(exc_info.value) == "CacheMiss('lookup_key')"

def test_get_file_stats_local(tmp_path):
    """Stats for a real file on the local filesystem match what was written."""
    # Write a small file with known content so the expected size is known.
    payload = b"test content"
    target = tmp_path / "test.txt"
    target.write_bytes(payload)

    result = _get_file_stats(str(target))

    assert result["size"] == len(payload)
    assert isinstance(result["mtime"], (float, int))
    # Either name is accepted for the local filesystem protocol.
    assert result["protocol"] in ("file", "local")
    assert result["path"] == str(target)

def test_get_file_stats_missing_size():
    """A filesystem that reports no size makes _get_file_stats raise ValueError."""
    # Stand-in filesystem whose info() returns None for the size.
    fake_fs = MagicMock()
    fake_fs.protocol = "gs"
    fake_fs.info.return_value = {"size": None}

    # Replace filesystem initialisation so no real storage is touched.
    init_patch = patch(
        "malariagen_data.util._init_filesystem",
        return_value=(fake_fs, "dummy/path"),
    )
    expect_error = pytest.raises(
        ValueError, match="Could not determine size for file"
    )
    with init_patch, expect_error:
        _get_file_stats("gs://bucket/file")

def test_get_file_stats_protocol_normalization():
    """A multi-name protocol is reduced to its first entry in the result."""
    # Stand-in filesystem exposing its protocol as a tuple of names.
    fake_fs = MagicMock()
    fake_fs.protocol = ("s3", "s3a")
    fake_fs.info.return_value = {"size": 100, "mtime": 123.4}

    with patch(
        "malariagen_data.util._init_filesystem",
        return_value=(fake_fs, "dummy/path"),
    ):
        result = _get_file_stats("s3://bucket/file")

    assert result["protocol"] == "s3"

def test_get_file_stats_file_not_found():
    """FileNotFoundError for a missing file propagates to the caller."""
    # Relative name that is expected not to exist in the working directory.
    missing = "non_existent_file_9999.txt"
    with pytest.raises(FileNotFoundError):
        _get_file_stats(missing)