diff --git a/malariagen_data/util.py b/malariagen_data/util.py
--- a/malariagen_data/util.py
+++ b/malariagen_data/util.py
@@ -664,17 +664,73 @@ def _valid_contigs(resource):
 
 
+def _validate_region_coords(resource, contig, start, end) -> None:
+    """Validate a contig and optional start/end positions against `resource`.
+
+    Raises:
+        ValueError: if `contig` is missing or unknown, if `start` or `end`
+            is not an integer >= 1, if `end` is greater than the contig
+            length, or if `start` is greater than `end`.
+    """
+    # Function-scope import so this helper does not rely on the module's
+    # import block.
+    import numbers
+
+    if contig is None:
+        raise ValueError("Region must include 'contig'.")
+
+    if contig not in _valid_contigs(resource):
+        raise ValueError(f"Unknown contig {contig!r}.")
+
+    for label, position in (("start", start), ("end", end)):
+        if position is None:
+            continue
+        # bool is a subclass of int, so reject it explicitly; numbers.Integral
+        # also accepts NumPy integer scalars.
+        if (
+            isinstance(position, bool)
+            or not isinstance(position, numbers.Integral)
+            or position < 1
+        ):
+            raise ValueError(f"Invalid {label} position: {position!r}.")
+
+    if end is not None:
+        # Compare against the contig length only once `end` is a valid integer.
+        contig_length = resource.genome_sequence(region=contig).shape[0]
+        if end > contig_length:
+            raise ValueError(f"Invalid end position: {end!r}.")
+
+    if start is not None and end is not None and start > end:
+        raise ValueError("Start position must not be greater than end position.")
+
+
 def _parse_single_region(resource, region: single_region_param_type) -> Region:
+    """Parse a single region specification into a `Region` object.
+
+    Accepted formats:
+        - `Region` instance: validated and returned as is.
+        - Mapping with "contig" and optional "start"/"end" keys: validated
+          and converted to a new `Region`.
+        - String:
+            - contig name (e.g. "chr1")
+            - genomic interval (e.g. "chr1:1000-2000")
+            - feature ID (if genome_features available)
+
+    Raises:
+        TypeError: if the input type is invalid.
+        ValueError: if the region cannot be parsed or is invalid.
+    """
     if isinstance(region, Region):
-        # The region is already a Region, nothing to do.
+        # Already a Region: validate its coordinates, then return it unchanged.
+        _validate_region_coords(resource, region.contig, region.start, region.end)
         return region
 
     if isinstance(region, Mapping):
         # The region is in dictionary form, convert to Region instance.
-        return Region(
-            contig=region.get("contig"),
-            start=region.get("start"),
-            end=region.get("end"),
-        )
+        contig = region.get("contig")
+        start = region.get("start")
+        end = region.get("end")
+        _validate_region_coords(resource, contig, start, end)
+        return Region(contig=contig, start=start, end=end)
 
     if not isinstance(region, str):
         raise TypeError(
diff --git a/tests/test_util.py b/tests/test_util.py
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,14 +1,161 @@
-"""Tests for Region.__repr__ and CacheMiss.__repr__ / default message."""
+"""Tests for _parse_single_region, Region.__repr__ and CacheMiss.__repr__."""
 
+from unittest.mock import MagicMock
+
 import pytest
 
-from malariagen_data.util import CacheMiss, Region
+from malariagen_data.util import CacheMiss, Region, _parse_single_region
 
 
-# ---------------------------------------------------------------------------
-# Region
-# ---------------------------------------------------------------------------
+# _parse_single_region tests
 
+
+@pytest.fixture
+def mock_resource():
+    """Return a mock resource exposing five contigs of 1,000,000 bases each."""
+    resource = MagicMock()
+    resource.contigs = ("2R", "2L", "3R", "3L", "X")
+    # Deleting the attribute makes access raise AttributeError instead of
+    # returning an auto-created child mock.
+    del resource.virtual_contigs
+    resource.genome_sequence.return_value.shape = (1_000_000,)
+    return resource
+
+
+@pytest.mark.parametrize("region", ["2L", {"contig": "2L"}, Region("2L")])
+def test_parse_region_contig(mock_resource, region):
+    r = _parse_single_region(mock_resource, region)
+    assert r.contig == "2L"
+    assert r.start is None
+    assert r.end is None
+
+
+# String tests
+
+
+def test_parse_region_interval(mock_resource):
+    r = _parse_single_region(mock_resource, "2L:100-200")
+    assert r.contig == "2L"
+    assert r.start == 100
+    assert r.end == 200
+
+
+@pytest.mark.parametrize(
+    "region",
+    [
+        "invalid_region",
+        "3L:abc-100",
+        "2L:100-25d",
+        "",
+        "2R:100-50",
+        "3R:0-10",
+        "2R-100-200",
+        "2L:150",
+    ],
+)
+def test_parse_region_invalid_string(mock_resource, region):
+    with pytest.raises(ValueError):
+        _parse_single_region(mock_resource, region)
+
+
+# Mapping tests
+
+
+@pytest.mark.parametrize(
+    "region",
+    [
+        {},
+        {"start": 100, "end": 200},
+        {"contig": "3L", "start": -2, "end": 10},
+        {"contig": "X", "start": 10, "end": -100},
+        {"contig": "2L", "start": 100, "end": 10},
+        {"contig": "Invalid_contig", "start": 10, "end": 20},
+        {"contig": "2L", "start": "abc", "end": 10},
+        {"contig": "2R", "start": 100, "end": "bcd"},
+        {"contig": "2L", "start": True},
+        {"contig": "2L", "end": 1_000_001},
+    ],
+)
+def test_parse_region_invalid_dictionary(mock_resource, region):
+    with pytest.raises(ValueError):
+        _parse_single_region(mock_resource, region)
+
+
+def test_parse_region_dictionary_only_start(mock_resource):
+    r = _parse_single_region(mock_resource, {"contig": "2L", "start": 200})
+    assert r.contig == "2L"
+    assert r.start == 200
+    assert r.end is None
+
+
+def test_parse_region_dictionary_only_end(mock_resource):
+    r = _parse_single_region(mock_resource, {"contig": "2L", "end": 200})
+    assert r.contig == "2L"
+    assert r.start is None
+    assert r.end == 200
+
+
+def test_parse_region_dictionary_end_at_contig_length(mock_resource):
+    r = _parse_single_region(mock_resource, {"contig": "2L", "end": 1_000_000})
+    assert r.end == 1_000_000
+
+
+# Region instance tests
+
+
+def test_parse_region_instance_interval(mock_resource):
+    r = _parse_single_region(mock_resource, Region("2L", start=100, end=200))
+    assert r.contig == "2L"
+    assert r.start == 100
+    assert r.end == 200
+
+
+def test_parse_region_instance_only_start(mock_resource):
+    r = _parse_single_region(mock_resource, Region("2L", start=200))
+    assert r.contig == "2L"
+    assert r.start == 200
+    assert r.end is None
+
+
+def test_parse_region_instance_only_end(mock_resource):
+    r = _parse_single_region(mock_resource, Region("2L", end=200))
+    assert r.contig == "2L"
+    assert r.start is None
+    assert r.end == 200
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"contig": ""},
+        {"contig": "", "start": 100, "end": 200},
+        {"contig": "3L", "start": -2, "end": 10},
+        {"contig": "X", "start": 10, "end": -100},
+        {"contig": "2L", "start": 100, "end": 10},
+        {"contig": "Invalid_contig", "start": 10, "end": 20},
+        {"contig": "2L", "start": "abc", "end": 10},
+        {"contig": "2R", "start": 100, "end": "bcd"},
+    ],
+)
+def test_parse_region_invalid_instance(mock_resource, kwargs):
+    region = Region(**kwargs)
+    with pytest.raises(ValueError):
+        _parse_single_region(mock_resource, region)
+
+
+# Invalid input type tests
+
+
+@pytest.mark.parametrize(
+    "region", [123456, ["2L:100-230"], ("2L:100-250",), True, False, 3.154]
+)
+def test_parse_region_invalid_type(mock_resource, region):
+    with pytest.raises(TypeError):
+        _parse_single_region(mock_resource, region)
+
+
+# Region tests
+
 
 def test_region_repr_contig_only():
     r = Region("2L")
@@ -32,11 +179,9 @@ def test_region_repr_start_only():
     assert repr(r) == "Region('X', 500, None)"
     assert str(r) == "X:500-"
 
 
-# ---------------------------------------------------------------------------
-# CacheMiss
-# ---------------------------------------------------------------------------
+# CacheMiss tests
 
 
 def test_cache_miss_no_key():
     cm = CacheMiss()