From 15d58a41e7859fa961b98469c6ea59f7f9424df3 Mon Sep 17 00:00:00 2001 From: RalfG Date: Mon, 20 Oct 2025 12:28:35 +0200 Subject: [PATCH 1/5] Add validation and exception raising for PSM and spectrum ID regex patterns - Implemented _validate_regular_expressions function to check regex patterns in configuration. - Added tests for regex validation, covering valid, invalid, and edge cases. - Updated parse_psms and parse_spectra to apply ID patterns and handle errors accordingly. --- ms2rescore/config_parser.py | 29 ++++++++++++- ms2rescore/parse_psms.py | 34 +++++++++++---- ms2rescore/parse_spectra.py | 61 ++++++++++++++++++++++----- tests/test_config_parser.py | 84 ++++++++++++++++++++++++++++++++++++- tests/test_parse_psms.py | 71 +++++++++++++++++++++++++++++++ tests/test_parse_spectra.py | 55 +++++++++++++++++++++--- 6 files changed, 306 insertions(+), 28 deletions(-) create mode 100644 tests/test_parse_psms.py diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py index f4f8ac66..258a7eb9 100644 --- a/ms2rescore/config_parser.py +++ b/ms2rescore/config_parser.py @@ -3,6 +3,7 @@ import importlib.resources import json import multiprocessing as mp +import re from argparse import Namespace from pathlib import Path from typing import Dict, List, Union @@ -10,7 +11,7 @@ try: import tomllib except ImportError: - import tomli as tomllib + import tomli as tomllib # type: ignore from cascade_config import CascadeConfig @@ -86,6 +87,31 @@ def _validate_processes(config: Dict) -> Dict: return config +def _validate_regular_expressions(config: Dict) -> Dict: + """Validate regular expressions in configuration.""" + for field in ["psm_id_pattern", "spectrum_id_pattern"]: + if config["ms2rescore"][field]: + + # Check if valid regex + try: + pattern = re.compile(config["ms2rescore"][field]) + except re.error as e: + raise MS2RescoreConfigurationError( + f"Invalid regular expression provided for '{field}': {e}" + ) from e + + # Check if regex has exactly one capturing group + if pattern.groups != 1: + raise MS2RescoreConfigurationError( + f"Regular expression for '{field}' should contain exactly one " + "capturing group. Please check and try again." + "See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " + "for more information." + ) + + return config + + def parse_configurations(configurations: List[Union[dict, str, Path, Namespace]]) -> Dict: """ Parse and validate MS²Rescore configuration files and CLI arguments. @@ -142,6 +168,7 @@ def parse_configurations(configurations: List[Union[dict, str, Path, Namespace]] # Validate and infer filenames and number of parallel processes config = _validate_filenames(config) config = _validate_processes(config) + config = _validate_regular_expressions(config) # Convert feature_generators and rescoring_engine names to lowercase config["ms2rescore"]["feature_generators"] = { diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 6a3c6065..e392f6a6 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -35,6 +35,31 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: " for more information." ) + # Apply psm_id_pattern if provided + if config["psm_id_pattern"]: + pattern = re.compile(config["psm_id_pattern"]) + logger.debug("Applying 'psm_id_pattern'...") + logger.debug( + f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'" + ) + new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] + + # Validate that n unique IDs remains the same + if len(set(new_ids)) != len(set(psm_list["spectrum_id"])): + example_old_id = psm_list["spectrum_id"][0] + example_new_id = new_ids[0] + raise MS2RescoreConfigurationError( + "'psm_id_pattern' resulted in a different number of unique PSM IDs. " + "This might indicate issues with the regex pattern. Please check and try again. " + f"Example old ID: '{example_old_id}' -> new ID: '{example_new_id}'. " + " See " + "https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " + "for more information." + ) + + # Assign new IDs + psm_list["spectrum_id"] = new_ids + # Filter by PSM rank psm_list.set_ranks(config["lower_score_is_better"]) rank_filter = psm_list["rank"] <= config["max_psm_rank_input"] @@ -81,15 +106,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: psm_list.add_fixed_modifications(config["fixed_modifications"]) psm_list.apply_fixed_modifications() - if config["psm_id_pattern"]: - pattern = re.compile(config["psm_id_pattern"]) - logger.debug("Applying 'psm_id_pattern'...") - logger.debug( - f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'" - ) - new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] - psm_list["spectrum_id"] = new_ids - return psm_list diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index f07d3675..51626b75 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -7,10 +7,10 @@ from typing import Optional, Set, Tuple import numpy as np -from ms2rescore_rs import get_precursor_info +from ms2rescore_rs import Precursor, get_precursor_info from psm_utils import PSMList -from ms2rescore.exceptions import MS2RescoreError +from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError from ms2rescore.utils import infer_spectrum_path LOGGER = logging.getLogger(__name__) @@ -149,6 +149,43 @@ def add_precursor_values( return available_data_types +def _apply_spectrum_id_pattern( + precursors: dict[str, Precursor], pattern: str +) -> dict[str, Precursor]: + """Apply spectrum ID pattern to precursor IDs.""" + # Map precursor IDs using regex pattern + compiled_pattern = re.compile(pattern) + id_mapping = { + match.group(1): spectrum_id + for spectrum_id in precursors.keys() + if (match := compiled_pattern.search(spectrum_id)) is not None + } + + # Validate that any IDs were matched + if not id_mapping: + raise MS2RescoreConfigurationError( + "'spectrum_id_pattern' did not match any spectrum-file IDs. Please check and try " + "again. See " + "https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " + "for more information." + ) + + # Validate that the same number of unique IDs were matched + elif len(set(id_mapping.keys())) != len(set(precursors.keys())): + example_old_id, example_new_id = next(iter(id_mapping.items())) + raise MS2RescoreConfigurationError( + "'spectrum_id_pattern' resulted in a different number of unique PSM IDs. This might " + "indicate issues with the regex pattern. Please check and try again. " + f"Example old ID: '{example_old_id}' -> new ID: '{example_new_id}'. " + "See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " + "for more information." + ) + + precursors = {new_id: precursors[orig_id] for new_id, orig_id in id_mapping.items()} + + return id_mapping + + def _get_precursor_values( psm_list: PSMList, spectrum_path: str, spectrum_id_pattern: Optional[str] = None ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: @@ -162,21 +199,23 @@ def _get_precursor_values( spectrum_file = infer_spectrum_path(spectrum_path, run_name) LOGGER.debug("Reading spectrum file: '%s'", spectrum_file) - precursors = get_precursor_info(str(spectrum_file)) + precursors: dict[str, Precursor] = get_precursor_info(str(spectrum_file)) + + # Ensure unique spectrum IDs + if len(precursors) != len(set(precursors.keys())): + raise SpectrumParsingError( + f"Spectrum file '{spectrum_file}' contains duplicate spectrum IDs. " + "Please ensure that all spectrum IDs are unique." + ) # Parse spectrum IDs with regex pattern if provided if spectrum_id_pattern: - compiled_pattern = re.compile(spectrum_id_pattern) - precursors = { - match.group(1): precursor - for spectrum_id, precursor in precursors.items() - if (match := compiled_pattern.search(spectrum_id)) is not None - } + precursors = _apply_spectrum_id_pattern(precursors, spectrum_id_pattern) # Ensure all PSMs have a precursor values for psm in psm_list_run: if psm.spectrum_id not in precursors: - raise SpectrumParsingError( + raise MS2RescoreConfigurationError( "Mismatch between PSM and spectrum file IDs. Could find precursor values " f"for PSM with ID {psm.spectrum_id} in run {run_name}.\n" "Please check that the `spectrum_id_pattern` and `psm_id_pattern` options " @@ -199,6 +238,6 @@ def _get_precursor_values( class SpectrumParsingError(MS2RescoreError): - """Error parsing retention time from spectrum file.""" + """Error while parsing spectrum file.""" pass diff --git a/tests/test_config_parser.py b/tests/test_config_parser.py index a2228ae5..01e260fd 100644 --- a/tests/test_config_parser.py +++ b/tests/test_config_parser.py @@ -1,6 +1,9 @@ from pathlib import Path -from ms2rescore.config_parser import _parse_output_path +import pytest + +from ms2rescore.config_parser import _parse_output_path, _validate_regular_expressions +from ms2rescore.exceptions import MS2RescoreConfigurationError def test__parse_output_path(): @@ -21,3 +24,82 @@ def test__parse_output_path(): for output_path, expected in test_cases: assert _parse_output_path(output_path, test_psm_file) == expected + + +def test__validate_regular_expressions_valid(): + """Test _validate_regular_expressions with valid regex patterns.""" + # Valid pattern with one capturing group + config = { + "ms2rescore": { + "psm_id_pattern": r"scan:(\d+):.*", + "spectrum_id_pattern": r"spectrum_(\d+)", + } + } + result = _validate_regular_expressions(config) + assert result == config + + +def test__validate_regular_expressions_none(): + """Test _validate_regular_expressions with None patterns.""" + config = { + "ms2rescore": { + "psm_id_pattern": None, + "spectrum_id_pattern": None, + } + } + result = _validate_regular_expressions(config) + assert result == config + + +def test__validate_regular_expressions_invalid_regex(): + """Test _validate_regular_expressions with invalid regex syntax.""" + config = { + "ms2rescore": { + "psm_id_pattern": r"scan:(\d+", # Missing closing parenthesis + "spectrum_id_pattern": None, + } + } + with pytest.raises(MS2RescoreConfigurationError, match="Invalid regular expression"): + _validate_regular_expressions(config) + + +def test__validate_regular_expressions_no_capturing_group(): + """Test _validate_regular_expressions with no capturing groups.""" + config = { + "ms2rescore": { + "psm_id_pattern": r"scan:\d+:.*", # No capturing group + "spectrum_id_pattern": None, + } + } + with pytest.raises( + MS2RescoreConfigurationError, match="should contain exactly one capturing group" + ): + _validate_regular_expressions(config) + + +def test__validate_regular_expressions_multiple_capturing_groups(): + """Test _validate_regular_expressions with multiple capturing groups.""" + config = { + "ms2rescore": { + "psm_id_pattern": r"scan:(\d+):(.*)", # Two capturing groups + "spectrum_id_pattern": None, + } + } + with pytest.raises( + MS2RescoreConfigurationError, match="should contain exactly one capturing group" + ): + _validate_regular_expressions(config) + + +def test__validate_regular_expressions_spectrum_id_pattern_invalid(): + """Test _validate_regular_expressions with invalid spectrum_id_pattern.""" + config = { + "ms2rescore": { + "psm_id_pattern": None, + "spectrum_id_pattern": r"spectrum_\d+", # No capturing group + } + } + with pytest.raises( + MS2RescoreConfigurationError, match="should contain exactly one capturing group" + ): + _validate_regular_expressions(config) diff --git a/tests/test_parse_psms.py b/tests/test_parse_psms.py new file mode 100644 index 00000000..d2e231c8 --- /dev/null +++ b/tests/test_parse_psms.py @@ -0,0 +1,71 @@ +import json +from pathlib import Path + +import pytest +from psm_utils import PSM, PSMList + +from ms2rescore.exceptions import MS2RescoreConfigurationError +from ms2rescore.parse_psms import parse_psms + + +@pytest.fixture(scope="module") +def default_config(): + cfg_path = Path(__file__).parents[1] / "ms2rescore" / "package_data" / "config_default.json" + cfg = json.loads(cfg_path.read_text())["ms2rescore"] + return cfg + + +@pytest.fixture +def psm_list_factory(): + def _factory(ids): + return PSMList( + psm_list=[ + PSM( + peptidoform="PEPTIDE/2", + run="run1", + spectrum_id=sid, + retention_time=None, + ion_mobility=None, + precursor_mz=None, + ) + for sid in ids + ] + ) + + return _factory + + +def test_psm_id_pattern_success(default_config, psm_list_factory): + psm_list = psm_list_factory(["scan:1:fileA", "scan:2:fileA"]) + # Ensure at least one decoy is present so parse_psms does not raise + psm_list[0].is_decoy = True + config = dict(default_config) + config.update( + { + "psm_id_pattern": r"scan:(\d+):.*", + "lower_score_is_better": True, + "psm_file": [], + "psm_reader_kwargs": {}, + "id_decoy_pattern": None, + } + ) + + result = parse_psms(config, psm_list) + assert list(result["spectrum_id"]) == ["1", "2"] + + +def test_psm_id_pattern_collapses_unique_ids(default_config, psm_list_factory): + psm_list = psm_list_factory(["scan:1:fileA", "scan:1:fileB"]) + config = dict(default_config) + config.update( + { + "psm_id_pattern": r"scan:(\d+):.*", + "lower_score_is_better": True, + "psm_file": [], + "psm_reader_kwargs": {}, + "id_decoy_pattern": None, + } + ) + + with pytest.raises(MS2RescoreConfigurationError): + parse_psms(config, psm_list) diff --git a/tests/test_parse_spectra.py b/tests/test_parse_spectra.py index 8230f89e..5e8fbc5b 100644 --- a/tests/test_parse_spectra.py +++ b/tests/test_parse_spectra.py @@ -4,6 +4,7 @@ import pytest from psm_utils import PSM, PSMList +from ms2rescore.exceptions import MS2RescoreConfigurationError from ms2rescore.parse_spectra import ( MSDataType, SpectrumParsingError, @@ -12,9 +13,25 @@ ) +def psm_list_factory(ids): + return PSMList( + psm_list=[ + PSM( + peptidoform="PEPTIDE/2", + run="run1", + spectrum_id=sid, + retention_time=None, + ion_mobility=None, + precursor_mz=None, + ) + for sid in ids + ] + ) + + @pytest.fixture def mock_psm_list(): - psm_list = PSMList( + return PSMList( psm_list=[ PSM( peptidoform="PEPTIDE/2", @@ -34,7 +51,6 @@ def mock_psm_list(): ), ] ) - return psm_list @pytest.fixture @@ -57,10 +73,39 @@ def mock_precursor_info_missing_im(): def mock_precursor_info_incomplete(): return { "spectrum1": MagicMock(mz=529.7935187324, rt=10.5, im=1.0), - # "spectrum2" is missing + # spectrum2 intentionally missing } +def test_spectrum_id_pattern_nonmatching(monkeypatch): + """If the provided spectrum_id_pattern doesn't match any spectrum-file IDs, raise.""" + psm_list = psm_list_factory(["scan:1:fileA"]) # PSM id doesn't matter for this test + + # Fake precursor info returns IDs that do not match the regex below + def fake_get_precursor_info(path): + class FakePrecursor: + def __init__(self, mz, rt, im): + self.mz = mz + self.rt = rt + self.im = im + + return {"specA": FakePrecursor(100.0, 1.0, 0.1), "specB": FakePrecursor(200.0, 2.0, 0.2)} + + monkeypatch.setattr("ms2rescore.parse_spectra.get_precursor_info", fake_get_precursor_info) + # Avoid filesystem validation in infer_spectrum_path by returning a dummy path + monkeypatch.setattr("ms2rescore.parse_spectra.infer_spectrum_path", lambda cfg, rn: "dummy") + + # Request a data type that is missing (retention_time) so the function will + # parse spectrum files and therefore apply the spectrum_id_pattern. + with pytest.raises(MS2RescoreConfigurationError): + add_precursor_values( + psm_list, + {MSDataType.retention_time}, + spectrum_path="/not/used", + spectrum_id_pattern=r"scan:(\d+):.*", + ) + + @patch("ms2rescore.parse_spectra.get_precursor_info") @patch("ms2rescore.parse_spectra.infer_spectrum_path") def test_add_precursor_values( @@ -148,7 +193,7 @@ def test_get_precursor_values_missing_spectrum_id( mock_infer_spectrum_path.return_value = "test_data/test_spectrum_file.mgf" mock_get_precursor_info.return_value = mock_precursor_info_incomplete - with pytest.raises(SpectrumParsingError): + with pytest.raises(MS2RescoreConfigurationError): _get_precursor_values(mock_psm_list, "test_data", None) @@ -215,8 +260,6 @@ def test_add_precursor_values_ms2_spectra_availability(): available_ms_data = add_precursor_values(psm_list, required_data_types) assert MSDataType.ms2_spectra not in available_ms_data - # With spectrum path - MS2 spectra available (but we can't test this without mocking) - def test_spectrum_parsing_error(): with pytest.raises(SpectrumParsingError): From 2a1f2a19a703e642cc3d3f9ca155f1d9671af820 Mon Sep 17 00:00:00 2001 From: RalfG Date: Mon, 20 Oct 2025 13:09:11 +0200 Subject: [PATCH 2/5] Implement suggested fixes --- ms2rescore/config_parser.py | 2 +- ms2rescore/parse_spectra.py | 19 ++++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py index 258a7eb9..88f541c6 100644 --- a/ms2rescore/config_parser.py +++ b/ms2rescore/config_parser.py @@ -104,7 +104,7 @@ def _validate_regular_expressions(config: Dict) -> Dict: if pattern.groups != 1: raise MS2RescoreConfigurationError( f"Regular expression for '{field}' should contain exactly one " - "capturing group. Please check and try again." + "capturing group. Please check and try again. " "See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " "for more information." ) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 51626b75..862732c7 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -171,19 +171,19 @@ def _apply_spectrum_id_pattern( ) # Validate that the same number of unique IDs were matched - elif len(set(id_mapping.keys())) != len(set(precursors.keys())): - example_old_id, example_new_id = next(iter(id_mapping.items())) + elif len(id_mapping) != len(precursors): + new_id, old_id = next(iter(id_mapping.items())) raise MS2RescoreConfigurationError( "'spectrum_id_pattern' resulted in a different number of unique PSM IDs. This might " "indicate issues with the regex pattern. Please check and try again. " - f"Example old ID: '{example_old_id}' -> new ID: '{example_new_id}'. " + f"Example old ID: '{old_id}' -> new ID: '{new_id}'. " "See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " "for more information." ) precursors = {new_id: precursors[orig_id] for new_id, orig_id in id_mapping.items()} - return id_mapping + return precursors def _get_precursor_values( @@ -201,13 +201,6 @@ def _get_precursor_values( LOGGER.debug("Reading spectrum file: '%s'", spectrum_file) precursors: dict[str, Precursor] = get_precursor_info(str(spectrum_file)) - # Ensure unique spectrum IDs - if len(precursors) != len(set(precursors.keys())): - raise SpectrumParsingError( - f"Spectrum file '{spectrum_file}' contains duplicate spectrum IDs. " - "Please ensure that all spectrum IDs are unique." - ) - # Parse spectrum IDs with regex pattern if provided if spectrum_id_pattern: precursors = _apply_spectrum_id_pattern(precursors, spectrum_id_pattern) @@ -216,8 +209,8 @@ def _get_precursor_values( for psm in psm_list_run: if psm.spectrum_id not in precursors: raise MS2RescoreConfigurationError( - "Mismatch between PSM and spectrum file IDs. Could find precursor values " - f"for PSM with ID {psm.spectrum_id} in run {run_name}.\n" + "Mismatch between PSM and spectrum file IDs. Could not find precursor " + f"values for PSM with ID {psm.spectrum_id} in run {run_name}.\n" "Please check that the `spectrum_id_pattern` and `psm_id_pattern` options " "are configured correctly. See " "https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra" From 46ca94fefcd5144c828dd6f4d9a58194984786e1 Mon Sep 17 00:00:00 2001 From: RalfG Date: Mon, 20 Oct 2025 13:26:35 +0200 Subject: [PATCH 3/5] Implement requested fixes --- ms2rescore/parse_psms.py | 35 +++++++++++++++++++---------------- ms2rescore/parse_spectra.py | 4 ++-- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index e392f6a6..dfad5002 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -35,6 +35,24 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: " for more information." ) + # Filter by PSM rank + psm_list.set_ranks(config["lower_score_is_better"]) + rank_filter = psm_list["rank"] <= config["max_psm_rank_input"] + psm_list = psm_list[rank_filter] + logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {config['max_psm_rank_input']}.") + + # Remove invalid AAs, find decoys, calculate q-values + psm_list = _remove_invalid_aa(psm_list) + _find_decoys(psm_list, config["id_decoy_pattern"]) + _calculate_qvalues(psm_list, config["lower_score_is_better"]) + + # Parse retention time and/or ion mobility from spectrum_id if patterns provided + if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]: + logger.debug("Parsing retention time and/or ion mobility from PSM identifier...") + _parse_values_from_spectrum_id( + psm_list, config["psm_id_rt_pattern"], config["psm_id_im_pattern"] + ) + # Apply psm_id_pattern if provided if config["psm_id_pattern"]: pattern = re.compile(config["psm_id_pattern"]) @@ -60,22 +78,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: # Assign new IDs psm_list["spectrum_id"] = new_ids - # Filter by PSM rank - psm_list.set_ranks(config["lower_score_is_better"]) - rank_filter = psm_list["rank"] <= config["max_psm_rank_input"] - psm_list = psm_list[rank_filter] - logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {config['max_psm_rank_input']}.") - - # Remove invalid AAs, find decoys, calculate q-values - psm_list = _remove_invalid_aa(psm_list) - _find_decoys(psm_list, config["id_decoy_pattern"]) - _calculate_qvalues(psm_list, config["lower_score_is_better"]) - if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]: - logger.debug("Parsing retention time and/or ion mobility from PSM identifier...") - _parse_values_from_spectrum_id( - psm_list, config["psm_id_rt_pattern"], config["psm_id_im_pattern"] - ) - # Store scoring values for comparison later for psm in psm_list: psm.provenance_data.update( @@ -87,6 +89,7 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: } ) + # Rename and add modifications logger.debug("Parsing modifications...") modifications_found = set( [ diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 862732c7..3d2f3ccf 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -174,8 +174,8 @@ def _apply_spectrum_id_pattern( elif len(id_mapping) != len(precursors): new_id, old_id = next(iter(id_mapping.items())) raise MS2RescoreConfigurationError( - "'spectrum_id_pattern' resulted in a different number of unique PSM IDs. This might " - "indicate issues with the regex pattern. Please check and try again. " + "'spectrum_id_pattern' resulted in a different number of unique spectrum IDs. This " + "indicates issues with the regex pattern. Please check and try again. " f"Example old ID: '{old_id}' -> new ID: '{new_id}'. " "See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra " "for more information." From be51218421f7add90d7dd7aef160ecfe3ec4a391 Mon Sep 17 00:00:00 2001 From: RalfG Date: Mon, 20 Oct 2025 13:38:43 +0200 Subject: [PATCH 4/5] Implement suggested changes --- ms2rescore/config_parser.py | 7 ++++++- ms2rescore/parse_psms.py | 2 +- ms2rescore/parse_spectra.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py index 88f541c6..73f656b0 100644 --- a/ms2rescore/config_parser.py +++ b/ms2rescore/config_parser.py @@ -89,7 +89,12 @@ def _validate_processes(config: Dict) -> Dict: def _validate_regular_expressions(config: Dict) -> Dict: """Validate regular expressions in configuration.""" - for field in ["psm_id_pattern", "spectrum_id_pattern"]: + for field in [ + "psm_id_pattern", + "spectrum_id_pattern", + "psm_id_rt_pattern", + "psm_id_im_pattern", + ]: if config["ms2rescore"][field]: # Check if valid regex diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index dfad5002..65b3dcd6 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -62,7 +62,7 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: ) new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] - # Validate that n unique IDs remains the same + # Validate that the number of unique IDs remains the same if len(set(new_ids)) != len(set(psm_list["spectrum_id"])): example_old_id = psm_list["spectrum_id"][0] example_new_id = new_ids[0] diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 3d2f3ccf..103619c6 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -205,7 +205,7 @@ def _get_precursor_values( if spectrum_id_pattern: precursors = _apply_spectrum_id_pattern(precursors, spectrum_id_pattern) - # Ensure all PSMs have a precursor values + # Ensure all PSMs have precursor values for psm in psm_list_run: if psm.spectrum_id not in precursors: raise MS2RescoreConfigurationError( From 943318c3b7220bc421790da0fb755fe036e54df9 Mon Sep 17 00:00:00 2001 From: RalfG Date: Mon, 20 Oct 2025 14:15:26 +0200 Subject: [PATCH 5/5] Update tests --- tests/test_config_parser.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_config_parser.py b/tests/test_config_parser.py index 01e260fd..2b1f904a 100644 --- a/tests/test_config_parser.py +++ b/tests/test_config_parser.py @@ -33,6 +33,8 @@ def test__validate_regular_expressions_valid(): "ms2rescore": { "psm_id_pattern": r"scan:(\d+):.*", "spectrum_id_pattern": r"spectrum_(\d+)", + "psm_id_rt_pattern": None, + "psm_id_im_pattern": None, } } result = _validate_regular_expressions(config) @@ -45,6 +47,8 @@ def test__validate_regular_expressions_none(): "ms2rescore": { "psm_id_pattern": None, "spectrum_id_pattern": None, + "psm_id_rt_pattern": None, + "psm_id_im_pattern": None, } } result = _validate_regular_expressions(config) @@ -57,6 +61,8 @@ def test__validate_regular_expressions_invalid_regex(): "ms2rescore": { "psm_id_pattern": r"scan:(\d+", # Missing closing parenthesis "spectrum_id_pattern": None, + "psm_id_rt_pattern": None, + "psm_id_im_pattern": None, } } with pytest.raises(MS2RescoreConfigurationError, match="Invalid regular expression"): @@ -69,6 +75,8 @@ def test__validate_regular_expressions_no_capturing_group(): "ms2rescore": { "psm_id_pattern": r"scan:\d+:.*", # No capturing group "spectrum_id_pattern": None, + "psm_id_rt_pattern": None, + "psm_id_im_pattern": None, } } with pytest.raises( @@ -83,6 +91,8 @@ def test__validate_regular_expressions_multiple_capturing_groups(): "ms2rescore": { "psm_id_pattern": r"scan:(\d+):(.*)", # Two capturing groups "spectrum_id_pattern": None, + "psm_id_rt_pattern": None, + "psm_id_im_pattern": None, } } with pytest.raises( @@ -97,6 +107,8 @@ def test__validate_regular_expressions_spectrum_id_pattern_invalid(): "ms2rescore": { "psm_id_pattern": None, "spectrum_id_pattern": r"spectrum_\d+", # No capturing group + "psm_id_rt_pattern": None, + "psm_id_im_pattern": None, } } with pytest.raises(