diff --git a/README.md b/README.md
index 48aacf9..8ad5075 100644
--- a/README.md
+++ b/README.md
@@ -22,4 +22,9 @@ This series of notebooks and scripts is for an introduction to basics of coding.
 - hap.py output plotting
 - TSO500 CNV counting
 
+### 2026 additions
+
+- Unit Testing: Principles, Python and TDD
+
+
 ### Written by East Genomics GLH
\ No newline at end of file
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/README.md b/learning-sessions/Unit_testing_python_2026/example_code/README.md
new file mode 100644
index 0000000..04e2c11
--- /dev/null
+++ b/learning-sessions/Unit_testing_python_2026/example_code/README.md
@@ -0,0 +1,11 @@
+## Codeschool presentation - Good with Unit testing: Principles, Python and TDD
+
+Here you will find the scripts that I used to present some unit testing examples. Feel free to use them to see how unit testing works, or as a template for your own unit tests.
+
+**Prerequisites:** Install dependencies with `pip install pytest pytest-cov pytest-mock requests pandas` before running the tests.
+
+To run pytest, simply run `pytest` in a terminal, inside the `example_code` folder. To run pytest-cov, run the following line instead from your terminal:
+
+```bash
+pytest --cov=bin --cov-report=html
+```
\ No newline at end of file
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/bin/__init__.py b/learning-sessions/Unit_testing_python_2026/example_code/bin/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/bin/clinvar_utils.py b/learning-sessions/Unit_testing_python_2026/example_code/bin/clinvar_utils.py
new file mode 100644
index 0000000..8764143
--- /dev/null
+++ b/learning-sessions/Unit_testing_python_2026/example_code/bin/clinvar_utils.py
@@ -0,0 +1,46 @@
+"""Utility functions for fetching clinical significance from ClinVar."""
+import requests
+
+
+def get_clinvar_significance(variation_id):
+    """
+    Fetches clinical significance from ClinVar for a given Variation ID.
+
+    Parameters
+    ----------
+    variation_id : str
+        The ClinVar Variation ID (e.g., '12345').
+
+    Returns
+    -------
+    str
+        The clinical significance of the variant.
+
+    Raises
+    ------
+    requests.exceptions.RequestException
+        If the API call fails.
+    ValueError
+        If the Variation ID is not found or data is malformed.
+    """
+    # ClinVar API endpoint for variant data
+    url = ("https://api.ncbi.nlm.nih.gov/variation/v0/beta/"
+           f"clinical-significance/variation/{variation_id}")
+
+    # Send GET request
+    response = requests.get(url, timeout=10)
+
+    # Raise exception for bad status codes
+    response.raise_for_status()
+
+    data = response.json()
+
+    # Parse the nested JSON structure to get the significance; a missing key
+    # (KeyError) or a non-dict level such as null (TypeError) means bad data
+    try:
+        significance = data['clinical_significance']['description']
+    except (KeyError, TypeError) as exc:
+        raise ValueError(
+            f"Could not find significance data for ID {variation_id}") from exc
+    else:
+        return significance
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/bin/script.py b/learning-sessions/Unit_testing_python_2026/example_code/bin/script.py
new file mode 100644
index 0000000..54b6782
--- /dev/null
+++ b/learning-sessions/Unit_testing_python_2026/example_code/bin/script.py
@@ -0,0 +1,41 @@
+"""
+This module contains a function to calculate the GC content of a DNA sequence.
+"""
+
+
+def calculate_gc_content(sequence):
+    """
+    Calculates the GC content percentage of a DNA sequence.
+
+    Input:
+        sequence (str): A string representing the DNA sequence (e.g., "ATGC").
+
+    Output:
+        float: The GC content percentage, rounded to two decimal places.
+
+    Raises:
+        TypeError: If the input is not a string.
+        ValueError: If the sequence contains characters other than A, T, G, C.
+    """
+
+    if not isinstance(sequence, str):
+        raise TypeError("Sequence must be a string.")
+
+    if not sequence:
+        return 0.0
+
+    # Normalize to uppercase to handle mixed cases
+    seq = sequence.upper()
+
+    # Validate that sequence contains only DNA bases
+    valid_bases = set('ATGC')
+    if not all(base in valid_bases for base in seq):
+        raise ValueError("Sequence contains invalid characters. "
+                         "Only A, T, G, C are allowed.")
+
+    # Count Gs and Cs
+    g_count = seq.count('G')
+    c_count = seq.count('C')
+
+    gc_percentage = ((g_count + c_count) / len(seq)) * 100
+    return round(gc_percentage, 2)
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/bin/script_bed.py b/learning-sessions/Unit_testing_python_2026/example_code/bin/script_bed.py
new file mode 100644
index 0000000..864a4f7
--- /dev/null
+++ b/learning-sessions/Unit_testing_python_2026/example_code/bin/script_bed.py
@@ -0,0 +1,53 @@
+"""Utility function for calculating the length of a bed file."""
+import os
+import pandas as pd
+
+
+def calculate_total_bed_length(filepath):
+    """
+    Calculates the total genomic length covered by a BED file.
+
+    This function reads the first three columns of a BED file (Chrom, Start,
+    End), validates that the coordinates are numeric and logical, and
+    returns the sum of the lengths of all regions.
+
+    Parameters
+    ----------
+    filepath : str or os.PathLike
+        The path to the BED file to be processed.
+
+    Returns
+    -------
+    int
+        The total number of base pairs across all regions defined in the file.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the provided filepath does not exist on the system.
+    TypeError
+        If the 'start' or 'end' columns contain non-numeric data.
+    ValueError
+        If any record has a start coordinate greater than the end coordinate.
+    """
+
+    if not os.path.exists(filepath):
+        raise FileNotFoundError(f"The file '{filepath}' does not exist.")
+
+    cols = ['chrom', 'start', 'end']
+    df = pd.read_csv(filepath, sep='\t', names=cols, usecols=[0, 1, 2])
+
+    if df.empty:
+        return 0
+
+    # Explicit validation instead of try/except
+    if not pd.api.types.is_numeric_dtype(df['start']) or \
+            not pd.api.types.is_numeric_dtype(df['end']):
+        raise TypeError("BED coordinates must be numeric.")
+
+    df['length'] = df['end'] - df['start']
+
+    if (df['length'] < 0).any():
+        raise ValueError("Found BED record where start > end.")
+
+    return int(df['length'].sum())
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/tests/__init__.py b/learning-sessions/Unit_testing_python_2026/example_code/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/tests/test_clinvar_utils.py b/learning-sessions/Unit_testing_python_2026/example_code/tests/test_clinvar_utils.py
new file mode 100644
index 0000000..6b77109
--- /dev/null
+++ b/learning-sessions/Unit_testing_python_2026/example_code/tests/test_clinvar_utils.py
@@ -0,0 +1,61 @@
+"""Unit tests for get_clinvar_significance function in bin/clinvar_utils.py."""
+import pytest
+from bin.clinvar_utils import get_clinvar_significance
+
+
+@pytest.fixture(name="clinvar_mock")
+def mock_clinvar_factory(mocker):
+    """A reusable fixture to mock the ClinVar API response."""
+    # Patch the requests.get method in the clinvar_utils module
+    mock = mocker.patch('bin.clinvar_utils.requests.get')
+    # Set a default return value
+    mock_resp = mocker.Mock()
+    mock_resp.json.return_value = {
+        'clinical_significance':
+            {'description': 'Likely Benign'}
+    }
+    mock.return_value = mock_resp
+    return mock
+
+
+class TestClinVarUtils:
+    """Tests for the get_clinvar_significance function."""
+
+    def test_significance_logic(
+        self, clinvar_mock  # pylint: disable=unused-argument
+    ):
+        """
+        Test that the function correctly extracts clinical
+        significance from mocked API response.
+        """
+        # This test uses the fixture directly
+        result = get_clinvar_significance('67890')
+        assert result == 'Likely Benign'
+
+    def test_key_error(self, clinvar_mock):
+        """
+        Test that a ValueError is raised when the expected
+        keys are missing in the API response.
+        """
+        # Modify the mock to return a JSON without the expected keys
+        clinvar_mock.return_value.json.return_value = {}
+        with pytest.raises(
+            ValueError,
+            match="Could not find significance data for ID 67890"
+        ):
+            get_clinvar_significance('67890')
+
+    def test_malformed_payload(self, clinvar_mock):
+        """
+        Test that a ValueError is raised when the significance
+        entry is present but is not a JSON object.
+        """
+        # A null entry cannot be subscripted, so the lookup raises TypeError
+        clinvar_mock.return_value.json.return_value = {
+            'clinical_significance': None
+        }
+        with pytest.raises(
+            ValueError,
+            match="Could not find significance data for ID 67890"
+        ):
+            get_clinvar_significance('67890')
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script.py b/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script.py
new file mode 100644
index 0000000..1ca0914
--- /dev/null
+++ b/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script.py
@@ -0,0 +1,44 @@
+"""Unit tests for the calculate_gc_content function in bin/script.py."""
+import pytest
+from bin.script import calculate_gc_content
+
+
+class TestGCContent:
+    """Class to test valid input scenarios for GC content calculation."""
+
+    def test_basic_sequence(self):
+        """Test standard DNA sequence."""
+        assert calculate_gc_content("GCGC") == 100.0
+
+    def test_mixed_bases(self):
+        """Test a mixture of all bases."""
+        assert calculate_gc_content("ATGCGT") == 50.0
+
+    def test_case_insensitivity(self):
+        """Test that it handles lowercase and mixed-case letters."""
+        assert calculate_gc_content("atgcgt") == 50.0
+        assert calculate_gc_content("AtGcGt") == 50.0
+
+
+class TestGCContentEdgeCases:
+    """Class to test empty, invalid, or unusual input scenarios."""
+
+    def test_empty_string(self):
+        """Test empty string input."""
+        assert calculate_gc_content("") == 0.0
+
+    def test_invalid_input_type(self):
+        """Test that a TypeError is raised for non-string inputs."""
+        with pytest.raises(TypeError, match="Sequence must be a string"):
+            calculate_gc_content(12345)
+
+    def test_rounding(self):
+        """Test that the result is rounded correctly."""
+        # GC content of "GAT" is 1/3 = 33.3333...%
+        assert calculate_gc_content("GAT") == 33.33
+
+    def test_invalid_characters(self):
+        """Test if ValueError is raised for sequences with invalid chars."""
+        with pytest.raises(ValueError,
+                           match="Sequence contains invalid characters"):
+            calculate_gc_content("ATGCX")
diff --git a/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script_bed.py b/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script_bed.py
new file mode 100644
index 0000000..15076d1
--- /dev/null
+++ b/learning-sessions/Unit_testing_python_2026/example_code/tests/test_script_bed.py
@@ -0,0 +1,53 @@
+"""Unit tests for calculate_total_bed_length function in bin/script_bed.py."""
+import pytest
+from bin.script_bed import calculate_total_bed_length
+
+
+@pytest.fixture(name="valid_bed")
+def valid_bed_factory(tmp_path):
+    """Creates a valid 2-line BED file."""
+    f = tmp_path / "valid.bed"
+    f.write_text("chr1\t100\t200\nchr2\t0\t50")
+    return f
+
+
+@pytest.fixture(name="malformed_bed")
+def malformed_bed_factory(tmp_path):
+    """Creates a BED file with text in the coordinate columns."""
+    f = tmp_path / "text_coords.bed"
+    f.write_text("chr1\tstring_data\t200")
+    return f
+
+
+class TestBedDocumentationContract:
+    """Verifies that the function adheres to its docstring specifications."""
+
+    def test_returns_correct_int(self, valid_bed):
+        """Verifies the 'Returns' section of the docstring."""
+        result = calculate_total_bed_length(valid_bed)
+        assert isinstance(result, int)
+        assert result == 150
+
+    def test_raises_file_not_found(self, tmp_path):
+        """Verifies the 'Raises FileNotFoundError' section."""
+        with pytest.raises(FileNotFoundError):
+            calculate_total_bed_length(tmp_path / "imaginary_file.bed")
+
+    def test_raises_type_error(self, malformed_bed):
+        """Verifies the 'Raises TypeError' section."""
+        with pytest.raises(TypeError, match="must be numeric"):
+            calculate_total_bed_length(malformed_bed)
+
+    def test_raises_value_error(self, tmp_path):
+        """Verifies the 'Raises ValueError' section."""
+        f = tmp_path / "invalid_coords.bed"
+        f.write_text("chr1\t200\t100")  # start > end
+        with pytest.raises(ValueError, match="start > end"):
+            calculate_total_bed_length(f)
+
+    def test_empty_file_returns_zero(self, tmp_path):
+        """Verifies that an empty BED file returns 0."""
+        f = tmp_path / "empty.bed"
+        f.write_text("")
+        result = calculate_total_bed_length(f)
+        assert result == 0