Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 59 additions & 5 deletions malariagen_data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,17 +664,71 @@ def _valid_contigs(resource):


def _parse_single_region(resource, region: single_region_param_type) -> Region:
"""Parse a single region specification into a Region object.

Accepted formats:
- Region instance: validated and returned as is
- Mapping with a "contig" key and optional "start"/"end" keys: validated and converted to a new Region object
- String:
- contig name (e.g. "chr1")
- genomic interval (e.g. "chr1:1000-2000")
- feature ID (if genome_features available)

Raises:
- TypeError if the input type is invalid
- ValueError if the region cannot be parsed or is invalid"""

if isinstance(region, Region):
# The region is already a Region, nothing to do.
contig=region.contig
start=region.start
end=region.end

if contig is None:
raise ValueError("Region mapping must include 'contig'.")

if contig not in _valid_contigs(resource):
raise ValueError(f"Unknown contig {contig!r}.")

if start is not None:
if not isinstance(start, int) or start < 1:
raise ValueError(f"Invalid start position: {start!r}.")

if end is not None:
if not isinstance(end, int) or end < 1 or end > resource.genome_sequence(region=contig).shape[0]:
raise ValueError(f"Invalid end position: {end!r}.")

if start is not None and end is not None:
if start > end:
raise ValueError(f"End position must be greater than start position.")

return region

if isinstance(region, Mapping):
# The region is in dictionary form, convert to Region instance.
return Region(
contig=region.get("contig"),
start=region.get("start"),
end=region.get("end"),
)
contig=region.get("contig")
start=region.get("start")
end=region.get("end")

if contig is None:
raise ValueError("Region mapping must include 'contig'.")

if contig not in _valid_contigs(resource):
raise ValueError(f"Unknown contig {contig!r}.")

if start is not None:
if not isinstance(start, int) or start < 1:
raise ValueError(f"Invalid start position: {start!r}.")

if end is not None:
if not isinstance(end, int) or end < 1 or end > resource.genome_sequence(region=contig).shape[0]:
raise ValueError(f"Invalid end position: {end!r}.")

if start is not None and end is not None:
if start > end:
raise ValueError(f"End position must be greater than start position.")

return Region(contig=contig, start=start, end=end)

if not isinstance(region, str):
raise TypeError(
Expand Down
135 changes: 120 additions & 15 deletions tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,123 @@
"""Tests for Region.__repr__ and CacheMiss.__repr__ / default message."""

import pytest

from malariagen_data.util import CacheMiss, Region


# ---------------------------------------------------------------------------
# Region
# ---------------------------------------------------------------------------

from malariagen_data.util import _parse_single_region, Region, CacheMiss
from unittest.mock import MagicMock

# _parse_single_region tests

@pytest.fixture
def mock_resource():
    """Build a MagicMock standing in for the resource passed to the parser.

    The mock exposes five contig names, has no ``virtual_contigs``
    attribute, and reports a one-element shape of 1,000,000 for any
    ``genome_sequence(...)`` call.
    """
    fake = MagicMock()
    fake.genome_sequence.return_value.shape = [1_000_000]
    fake.contigs = ("2R", "2L", "3R", "3L", "X")
    # Deleting an attribute on a mock makes later access raise AttributeError
    # instead of silently auto-creating a child mock.
    del fake.virtual_contigs
    return fake

def test_parse_region_contig(mock_resource):
    """A bare contig given as string, mapping or Region parses with no bounds."""
    contig_only_specs = ("2L", {"contig": "2L"}, Region("2L"))
    for spec in contig_only_specs:
        parsed = _parse_single_region(mock_resource, spec)
        assert parsed.contig == "2L"
        assert parsed.start is None
        assert parsed.end is None

# String test
def test_parse_region_interval(mock_resource):
    """A "contig:start-end" string parses into contig, start and end."""
    spec = "2L:100-200"
    parsed = _parse_single_region(mock_resource, spec)
    assert parsed.contig == "2L"
    assert parsed.start == 100
    assert parsed.end == 200

def test_parse_region_invalid_string(mock_resource):
    """Every string listed here is expected to raise ValueError."""
    bad_specs = (
        "invalid_region",
        "3L:abc-100",
        "2L:100-25d",
        "",
        "2R:100-50",
        "3R:0-10",
        "2R-100-200",
        "2L:150",
    )
    for spec in bad_specs:
        with pytest.raises(ValueError):
            _parse_single_region(mock_resource, spec)

# Mapping test
def test_parse_region_invalid_dictionary(mock_resource):
    """Every mapping listed here is expected to raise ValueError."""
    bad_specs = (
        {},
        {"start": 100, "end": 200},
        {"contig": "3L", "start": -2, "end": 10},
        {"contig": "X", "start": 10, "end": -100},
        {"contig": "2L", "start": 100, "end": 10},
        {"contig": "Invalid_contig", "start": 10, "end": 20},
        {"contig": "2L", "start": "abc", "end": 10},
        {"contig": "2R", "start": 100, "end": "bcd"},
    )
    for spec in bad_specs:
        with pytest.raises(ValueError):
            _parse_single_region(mock_resource, spec)

def test_parse_region_dictionary_only_start(mock_resource):
    """A mapping with contig and start only leaves the end unset."""
    spec = {"contig": "2L", "start": 200}
    parsed = _parse_single_region(mock_resource, spec)
    assert parsed.contig == "2L"
    assert parsed.start == 200
    assert parsed.end is None

def test_parse_region_dictionary_only_end(mock_resource):
    """A mapping with contig and end only leaves the start unset."""
    spec = {"contig": "2L", "end": 200}
    parsed = _parse_single_region(mock_resource, spec)
    assert parsed.contig == "2L"
    assert parsed.start is None
    assert parsed.end == 200

# Region instance test
def test_parse_region_instance_interval(mock_resource):
    """A Region with both bounds comes back with the same three fields."""
    given = Region("2L", start=100, end=200)
    parsed = _parse_single_region(mock_resource, given)
    assert parsed.contig == "2L"
    assert parsed.start == 100
    assert parsed.end == 200

def test_parse_region_instance_only_start(mock_resource):
    """A Region with only a start comes back with the end still unset."""
    given = Region("2L", start=200)
    parsed = _parse_single_region(mock_resource, given)
    assert parsed.contig == "2L"
    assert parsed.start == 200
    assert parsed.end is None

def test_parse_region_instance_only_end(mock_resource):
    """A Region with only an end comes back with the start still unset."""
    given = Region("2L", end=200)
    parsed = _parse_single_region(mock_resource, given)
    assert parsed.contig == "2L"
    assert parsed.start is None
    assert parsed.end == 200

def test_parse_region_invalid_instance(mock_resource):
    """Every Region instance listed here is expected to raise ValueError."""
    bad_regions = (
        Region(contig=""),
        Region(contig="", start=100, end=200),
        Region(contig="3L", start=-2, end=10),
        Region(contig="X", start=10, end=-100),
        Region(contig="2L", start=100, end=10),
        Region(contig="Invalid_contig", start=10, end=20),
        Region(contig="2L", start="abc", end=10),
        Region(contig="2R", start=100, end="bcd"),
    )
    for candidate in bad_regions:
        with pytest.raises(ValueError):
            _parse_single_region(mock_resource, candidate)

# Region type test
def test_parse_region_invalid_type(mock_resource):
    """Inputs that are not a Region, mapping or string should raise TypeError."""
    unsupported_values = (
        123456,
        ["2L:100-230"],
        ("2L:100-250",),
        True,
        False,
        3.154,
    )
    for bad_value in unsupported_values:
        with pytest.raises(TypeError):
            _parse_single_region(mock_resource, bad_value)

# Region tests

def test_region_repr_contig_only():
r = Region("2L")
Expand All @@ -32,11 +141,7 @@ def test_region_repr_start_only():
assert repr(r) == "Region('X', 500, None)"
assert str(r) == "X:500-"


# ---------------------------------------------------------------------------
# CacheMiss
# ---------------------------------------------------------------------------

# CacheMiss tests

def test_cache_miss_no_key():
cm = CacheMiss()
Expand Down