eastgenomics · Arun-bioinformatics · Feb 23, 2026 · Feb 26, 2026 · Feb 26, 2026 · coderabbitai
diff --git a/README.md b/README.md
@@ -22,4 +22,9 @@ This series of notebooks and scripts is for an introduction to basics of coding.
   - hap.py output plotting
   - TSO500 CNV counting
 
+### 2026 additions
+
+- Unit Testing: Principles, Python and TDD
+
+
 ### Written by East Genomics GLH
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/README.md b/learning-sessions/Unit_testing_python_2026/example_code/README.md
@@ -0,0 +1,11 @@
+## Codeschool presentation - Good with Unit testing: Principles, Python and TDD
+
+Here you will find the scripts that I used to present some unit testing examples. Feel free to use them to see how unit testing works, or as a template for your own unit tests.
+
+To run the tests, run `pytest` in a terminal from inside the `example_code` folder. To also measure coverage with pytest-cov, run the command in the code block below instead:
+
+**Prerequisites:** Install dependencies with `pip install pytest pytest-cov pytest-mock` before running the tests.
+
+```bash
+pytest --cov=bin --cov-report=html
+```
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/bin/__init__.py b/learning-sessions/Unit_testing_python_2026/example_code/bin/__init__.py
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/bin/clinvar_utils.py b/learning-sessions/Unit_testing_python_2026/example_code/bin/clinvar_utils.py
@@ -0,0 +1,45 @@
+"""Utility functions for fetching clinical significance from ClinVar."""
+import requests
+
+
+def get_clinvar_significance(variation_id):
+    """
+    Fetches clinical significance from ClinVar for a given Variation ID.
+
+    Parameters
+    ----------
+    variation_id : str
+        The ClinVar Variation ID (e.g., '12345').
+
+    Returns
+    -------
+    str
+        The clinical significance of the variant.
+
+    Raises
+    ------
+    requests.exceptions.RequestException
+        If the request fails or the server returns an error status code.
+    ValueError
+        If the response lacks the expected significance data or is
+        malformed (e.g. 'clinical_significance' is not a mapping).
+    """
+    # ClinVar API endpoint for variant data
+    url = ("https://api.ncbi.nlm.nih.gov/variation/v0/beta/"
+           f"clinical-significance/variation/{variation_id}")
+
+    # The timeout stops a stalled connection from blocking the caller
+    response = requests.get(url, timeout=10)
+
+    # Turn 4xx/5xx responses into an HTTPError instead of parsing them
+    response.raise_for_status()
+    data = response.json()
+
+    # A missing key raises KeyError; a payload of the wrong shape (None,
+    # list, ...) raises TypeError. Report both as the documented ValueError.
+    try:
+        significance = data['clinical_significance']['description']
+    except (KeyError, TypeError) as exc:
+        raise ValueError(
+            f"Could not find significance data for ID {variation_id}") from exc
+    return significance
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/bin/script.py b/learning-sessions/Unit_testing_python_2026/example_code/bin/script.py
@@ -0,0 +1,41 @@
+"""
+This module contains a function to calculate the GC content of a DNA sequence.
+"""
+
+
+def calculate_gc_content(sequence):
+    """
+    Returns the percentage of G and C bases in a DNA sequence.
+
+    Input:
+        sequence (str): The DNA sequence, in any letter case (e.g., "ATGC").
+
+    Output:
+        float: GC percentage rounded to two decimal places; 0.0 for an
+            empty sequence.
+
+    Raises:
+        TypeError: If the input is not a string.
+        ValueError: If the sequence contains characters other than A, T, G, C.
+    """
+    if not isinstance(sequence, str):
+        raise TypeError("Sequence must be a string.")
+
+    # Guard clause: an empty sequence would otherwise divide by zero
+    if sequence == "":
+        return 0.0
+
+    # Compare case-insensitively by working on an uppercase copy
+    bases = sequence.upper()
+
+    # Any character left after removing the four DNA bases is invalid
+    unexpected = set(bases) - {'A', 'T', 'G', 'C'}
+    if unexpected:
+        raise ValueError("Sequence contains invalid characters. "
+                         "Only A, T, G, C are allowed.")
+
+    # Tally G and C together, then express them as a share of all bases
+    gc_total = sum(bases.count(base) for base in 'GC')
+    fraction = gc_total / len(bases)
+
+    return round(fraction * 100, 2)
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/bin/script_bed.py b/learning-sessions/Unit_testing_python_2026/example_code/bin/script_bed.py
@@ -0,0 +1,53 @@
+"""Utility function for calculating the length of a bed file."""
+import os
+import pandas as pd
+
+
+def calculate_total_bed_length(filepath):
+    """
+    Sums the lengths (end - start) of every region in a BED file.
+
+    Only the first three tab-separated columns (chrom, start, end) are
+    read. Overlapping regions are not merged, so shared bases are counted
+    once per region.
+
+    Parameters
+    ----------
+    filepath : str
+        The path to the BED file to be processed.
+
+    Returns
+    -------
+    int
+        The summed length of all regions, or 0 when the file has no rows.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the provided filepath does not exist on the system.
+    TypeError
+        If the 'start' or 'end' columns contain non-numeric data.
+    ValueError
+        If any record has a start coordinate greater than the end coordinate.
+    """
+    if not os.path.exists(filepath):
+        raise FileNotFoundError(f"The file '{filepath}' does not exist.")
+
+    # Passing names= makes pandas read the first line as data, not a header
+    regions = pd.read_csv(filepath, sep='\t', usecols=[0, 1, 2],
+                          names=['chrom', 'start', 'end'])
+    if regions.empty:
+        return 0
+
+    # Columns that pandas could not parse as numbers fail this check
+    is_numeric = pd.api.types.is_numeric_dtype
+    if not (is_numeric(regions['start']) and is_numeric(regions['end'])):
+        raise TypeError("BED coordinates must be numeric.")
+
+    # Per-region length, kept in a local Series rather than a new column
+    lengths = regions['end'] - regions['start']
+
+    # A negative length means the record has start > end
+    if lengths.lt(0).any():
+        raise ValueError("Found BED record where start > end.")
+    return int(lengths.sum())
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/tests/__init__.py b/learning-sessions/Unit_testing_python_2026/example_code/tests/__init__.py
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/tests/test_clinvar_utils.py b/learning-sessions/Unit_testing_python_2026/example_code/tests/test_clinvar_utils.py
@@ -0,0 +1,46 @@
+"""Unit tests for get_clinvar_significance function in bin/clinvar_utils.py."""
+import pytest
+from bin.clinvar_utils import get_clinvar_significance
+
+
+@pytest.fixture(name="clinvar_mock")
+def mock_clinvar_factory(mocker):
+    """Replace requests.get in clinvar_utils with a canned ClinVar reply."""
+    # Default payload mirrors the keys get_clinvar_significance reads;
+    # individual tests can override it via return_value.json.return_value.
+    payload = {'clinical_significance': {'description': 'Likely Benign'}}
+
+    fake_response = mocker.Mock()
+    fake_response.json.return_value = payload
+
+    # With requests.get patched, no real HTTP request is made
+    return mocker.patch(
+        'bin.clinvar_utils.requests.get', return_value=fake_response)
+
+
+class TestClinVarUtils:
+    """Tests for the get_clinvar_significance function."""
+
+    def test_significance_logic(self, clinvar_mock):
+        """
+        Test that the function makes one request for the given ID and
+        extracts the clinical significance from the mocked API response.
+        """
+        result = get_clinvar_significance('67890')
+        assert result == 'Likely Benign'
+        # Check the outgoing call too, so the fixture is not merely a stub
+        clinvar_mock.assert_called_once()
+        assert clinvar_mock.call_args.args[0].endswith('/variation/67890')
+
+    def test_key_error(self, clinvar_mock):
+        """
+        Test that a ValueError is raised when the expected
+        keys are missing in the API response.
+        """
+        # Modify the mock to return a JSON without the expected keys
+        clinvar_mock.return_value.json.return_value = {}
+        with pytest.raises(
+            ValueError,
+            match="Could not find significance data for ID 67890"
+        ):
+            get_clinvar_significance('67890')
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script.py b/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script.py
@@ -0,0 +1,44 @@
+"""Unit tests for the calculate_gc_content function in bin/script.py."""
+import pytest
+from bin.script import calculate_gc_content
+
+
+class TestGCContent:
+    """Checks GC content results for well-formed DNA sequences."""
+
+    def test_basic_sequence(self):
+        """A sequence made up only of G and C bases is 100% GC."""
+        assert calculate_gc_content("GCGC") == 100.0
+
+    def test_mixed_bases(self):
+        """Three of the six bases are G or C, so the result is 50%."""
+        assert calculate_gc_content("ATGCGT") == 50.0
+
+    def test_case_insensitivity(self):
+        """Lowercase and mixed-case input give the uppercase result."""
+        for sequence in ("atgcgt", "AtGcGt"):
+            assert calculate_gc_content(sequence) == 50.0
+
+
+class TestGCContentEdgeCases:
+    """Covers empty input, wrong types, rounding and invalid bases."""
+
+    def test_empty_string(self):
+        """An empty sequence yields 0.0 rather than an error."""
+        assert calculate_gc_content("") == 0.0
+
+    def test_invalid_input_type(self):
+        """A non-string argument is rejected with a TypeError."""
+        expected_message = "Sequence must be a string"
+        with pytest.raises(TypeError, match=expected_message):
+            calculate_gc_content(12345)
+
+    def test_rounding(self):
+        """One G in three bases (33.333...%) is reported as 33.33."""
+        assert calculate_gc_content("GAT") == 33.33
+
+    def test_invalid_characters(self):
+        """A base outside A, T, G, C triggers a ValueError."""
+        expected_message = "Sequence contains invalid characters"
+        with pytest.raises(ValueError, match=expected_message):
+            calculate_gc_content("ATGCX")
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script_bed.py b/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script_bed.py
@@ -0,0 +1,53 @@
+"""Unit tests for calculate_total_bed_length function in bin/script_bed.py."""
+import pytest
+from bin.script_bed import calculate_total_bed_length
+
+
+@pytest.fixture(name="valid_bed")
+def valid_bed_factory(tmp_path):
+    """Write two regions (100 bp and 50 bp) to a BED file; return its path."""
+    bed_path = tmp_path.joinpath("valid.bed")
+    bed_path.write_text("chr1\t100\t200\nchr2\t0\t50")
+    return bed_path
+
+
+@pytest.fixture(name="malformed_bed")
+def malformed_bed_factory(tmp_path):
+    """Write a BED file whose start column holds text; return its path."""
+    bed_path = tmp_path.joinpath("text_coords.bed")
+    bed_path.write_text("chr1\tstring_data\t200")
+    return bed_path
+
+
+class TestBedDocumentationContract:
+    """Verifies that the function adheres to its docstring specifications."""
+
+    def test_returns_correct_int(self, valid_bed):
+        """Verifies the 'Returns' section of the docstring."""
+        result = calculate_total_bed_length(valid_bed)
+        assert isinstance(result, int)
+        assert result == 150
+
+    def test_raises_file_not_found(self, tmp_path):
+        """Verifies the 'Raises FileNotFoundError' section."""
+        # tmp_path is a fresh directory, so this file is certain not to exist
+        with pytest.raises(FileNotFoundError):
+            calculate_total_bed_length(tmp_path / "imaginary_file.bed")
+
+    def test_raises_type_error(self, malformed_bed):
+        """Verifies the 'Raises TypeError' section."""
+        with pytest.raises(TypeError, match="must be numeric"):
+            calculate_total_bed_length(malformed_bed)
+
+    def test_raises_value_error(self, tmp_path):
+        """Verifies the 'Raises ValueError' section."""
+        f = tmp_path / "invalid_coords.bed"
+        f.write_text("chr1\t200\t100")  # start > end
+        with pytest.raises(ValueError, match="start > end"):
+            calculate_total_bed_length(f)
+
+    def test_empty_file_returns_zero(self, tmp_path):
+        """Verifies that an empty BED file returns 0."""
+        f = tmp_path / "empty.bed"
+        f.write_text("")
+        assert calculate_total_bed_length(f) == 0