diff --git a/ms2pip/_utils/ion_mobility.py b/ms2pip/_utils/ion_mobility.py
index 7dceb9f..db10492 100644
--- a/ms2pip/_utils/ion_mobility.py
+++ b/ms2pip/_utils/ion_mobility.py
@@ -1,4 +1,4 @@
-"""Module for ion mobility prediction with IM²Deep."""
+"""Module for ion mobility prediction with IM2Deep."""
 
 import logging
 
diff --git a/ms2pip/_utils/xgb_models.py b/ms2pip/_utils/xgb_models.py
index a880433..8e9a38e 100644
--- a/ms2pip/_utils/xgb_models.py
+++ b/ms2pip/_utils/xgb_models.py
@@ -39,22 +39,28 @@ def get_predictions_xgb(features, num_ions, model_params, model_dir, processes=1):
         Number of CPUs to use in multiprocessing
 
     """
-    # Init models
-    xgboost_models = _initialize_xgb_models(
-        model_params["xgboost_model_files"],
-        model_dir,
-        processes,
-    )
+    xgb.set_config(verbosity=0)
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)  # See issue at dmlc/xgboost#11283
 
-    features = xgb.DMatrix(features)
+    if isinstance(features, np.ndarray):
+        features = xgb.DMatrix(features)
+    elif isinstance(features, xgb.DMatrix):
+        pass
+    else:
+        raise ValueError("Unsupported input type for features.")
 
-    logger.debug("Predicting intensities from XGBoost model files...")
     prediction_dict = {}
-    for ion_type, xgb_model in xgboost_models.items():
+    n_models = len(model_params["xgboost_model_files"])
+    for i, (ion_type, model_filename) in enumerate(model_params["xgboost_model_files"].items()):
+        model_file = os.path.join(model_dir, model_filename)
+        logger.debug(f"Initializing model from file: `{model_file}`")
+        xgb_model = xgb.Booster({"nthread": processes}, model_file=model_file)
+
         # Get predictions from XGBoost model
+        logger.debug(f"Predicting intensities from XGBoost model {i + 1}/{n_models}...")
         preds = xgb_model.predict(features)
         preds = preds.clip(min=np.log2(0.001))  # Clip negative intensities
-        xgb_model.__del__()
+        del xgb_model
 
         # Reshape into arrays for each peptide
         if ion_type.lower() in ["x", "y", "y2", "z"]:
@@ -113,18 +119,5 @@ def _check_model_integrity(filename, model_hash):
     if sha1_hash.hexdigest() == model_hash:
         return True
     else:
-        logger.warn("Model hash not recognized.")
+        logger.warning("Model hash not recognized.")
         return False
-
-
-def _initialize_xgb_models(xgboost_model_files, model_dir, nthread) -> dict:
-    """Initialize xgboost models and return them in a dict with ion types as keys."""
-    xgb.set_config(verbosity=0)
-    xgboost_models = {}
-    for ion_type in xgboost_model_files.keys():
-        model_file = os.path.join(model_dir, xgboost_model_files[ion_type])
-        logger.debug(f"Initializing model from file: `{model_file}`")
-        xgb_model = xgb.Booster({"nthread": nthread})
-        xgb_model.load_model(model_file)
-        xgboost_models[ion_type] = xgb_model
-    return xgboost_models
diff --git a/ms2pip/core.py b/ms2pip/core.py
index 3ffd82b..df1b9e4 100644
--- a/ms2pip/core.py
+++ b/ms2pip/core.py
@@ -14,7 +14,6 @@
 import numpy as np
 import pandas as pd
 from psm_utils import PSM, Peptidoform, PSMList
-from rich.progress import track
 
 import ms2pip.exceptions as exceptions
 from ms2pip._cython_modules import ms2pip_pyx
@@ -189,14 +188,11 @@ def predict_library(
         raise ValueError("Either `fasta_file` or `config` must be provided.")
 
     search_space = ProteomeSearchSpace.from_any(config)
-    search_space.build()
+    search_space.build(processes)
 
-    for batch in track(
-        _into_batches(search_space, batch_size=batch_size),
-        description="Predicting spectra...",
-        total=ceil(len(search_space) / batch_size),
-    ):
-        logging.disable(logging.CRITICAL)
+    n_batches = ceil(len(search_space) / batch_size)
+    for i, batch in enumerate(_into_batches(search_space, batch_size=batch_size)):
+        logger.info(f"Processing batch {i + 1}/{n_batches}...")
         yield predict_batch(
             search_space.filter_psms_by_mz(PSMList(psm_list=list(batch))),
             add_retention_time=add_retention_time,
@@ -205,7 +201,6 @@
             model_dir=model_dir,
             processes=processes,
         )
-        logging.disable(logging.NOTSET)
 
 
 def correlate(
@@ -553,7 +548,7 @@ def _get_pool(self):
         """Get multiprocessing pool."""
         logger.debug(f"Starting workers (processes={self.processes})...")
         if multiprocessing.current_process().daemon:
-            logger.warn(
+            logger.warning(
                 "MS²PIP is running in a daemon process. Disabling multiprocessing as daemonic "
                 "processes cannot have children."
             )
diff --git a/ms2pip/search_space.py b/ms2pip/search_space.py
index 2ac3a87..e14cd18 100644
--- a/ms2pip/search_space.py
+++ b/ms2pip/search_space.py
@@ -15,7 +15,7 @@
     {
         "fasta_file": "test.fasta",
         "min_length": 8,
-        "max_length": 3,
+        "max_length": 30,
         "cleavage_rule": "trypsin",
         "missed_cleavages": 2,
         "semi_specific": false,
@@ -204,7 +204,23 @@ def __init__(self, **data: Any):
 
         """
         super().__init__(**data)
-        self._peptidoform_spaces: List[_PeptidoformSearchSpace] = []
+        self._peptidoform_spaces: Optional[List[_PeptidoformSearchSpace]] = None
+
+    @field_validator("min_length")
+    @classmethod
+    def _validate_min_length(cls, v):
+        if v > 3:
+            return v
+        else:
+            raise ValueError("Minimum peptide length must be greater than 3.")
+
+    @field_validator("max_length")
+    @classmethod
+    def _validate_max_length(cls, v):
+        if v <= 100:
+            return v
+        else:
+            raise ValueError("Maximum peptide length must be less than or equal to 100.")
 
     @field_validator("modifications")
     @classmethod
@@ -229,7 +245,7 @@ def _validate_unspecific_cleavage(self):
         return self
 
     def __len__(self):
-        if not self._peptidoform_spaces:
+        if self._peptidoform_spaces is None:
             raise ValueError("Search space must be built before length can be determined.")
         return sum(len(pep_space) for pep_space in self._peptidoform_spaces)
 
@@ -255,14 +271,14 @@ def from_any(cls, _input: Union[dict, str, Path, ProteomeSearchSpace]) -> ProteomeSearchSpace:
         else:
             raise ValueError("Search space must be a dict, str, Path, or ProteomeSearchSpace.")
 
-    def build(self, processes: int = 1):
+    def build(self, processes: Optional[int] = None):
         """
         Build peptide search space from FASTA file.
 
         Parameters
         ----------
-        processes : int
-            Number of processes to use for parallelization.
+        processes : int, optional
+            Number of processes to use for parallelization. If None, uses all available CPUs.
 
         """
         processes = processes if processes else multiprocessing.cpu_count()
@@ -285,7 +301,7 @@ def __iter__(self) -> Generator[PSM, None, None]:
 
         """
-        # Build search space if not already built
-        if not self._peptidoform_spaces:
+        # Require that the search space has already been built
+        if self._peptidoform_spaces is None:
             raise ValueError("Search space must be built before PSMs can be generated.")
 
         spectrum_id = 0
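
The xgb_models.py hunk above replaces the up-front `_initialize_xgb_models()` dictionary with a load-predict-release loop, so only one booster is held in memory at a time. Below is a minimal, self-contained sketch of that lifecycle; the file names and toy training data are illustrative, not the actual MS²PIP model files.

```python
import numpy as np
import xgboost as xgb

# Train and save two tiny throwaway models so the loop below is runnable;
# in MS2PIP these would be the per-ion-type model files in `model_dir`.
X, y = np.random.rand(20, 5), np.random.rand(20)
for name in ("model_b.json", "model_y.json"):
    xgb.train({"verbosity": 0}, xgb.DMatrix(X, label=y), num_boost_round=2).save_model(name)

features = xgb.DMatrix(np.random.rand(10, 5))
for ion_type, model_file in {"b": "model_b.json", "y": "model_y.json"}.items():
    # Load one booster, predict, and release it before loading the next,
    # instead of keeping every booster alive for the whole run.
    booster = xgb.Booster({"nthread": 1}, model_file=model_file)
    preds = booster.predict(features)
    del booster
    print(ion_type, preds.shape)
```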
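In core.py, the batch counter uses `ceil()` rather than floor division plus one: with `len(search_space) // batch_size + 1`, a search space whose size is an exact multiple of the batch size would report one batch too many. A quick check with made-up sizes:

```python
from math import ceil

batch_size = 100_000
for n_psms in (250_000, 300_000):  # hypothetical search-space sizes
    print(n_psms, n_psms // batch_size + 1, ceil(n_psms / batch_size))
# 250000 -> 3 vs. 3; 300000 -> 4 vs. 3 (only ceil matches the real batch count)
```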
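The new validators in search_space.py bound the configurable peptide lengths at construction time. Here is a standalone sketch of the same pattern, assuming pydantic v2 semantics; `LengthBounds` is a hypothetical stand-in for ProteomeSearchSpace, not part of MS²PIP.

```python
from pydantic import BaseModel, ValidationError, field_validator

class LengthBounds(BaseModel):
    """Hypothetical stand-in for the length fields on ProteomeSearchSpace."""

    min_length: int = 8
    max_length: int = 30

    @field_validator("min_length")
    @classmethod
    def _validate_min_length(cls, v):
        if v > 3:
            return v
        raise ValueError("Minimum peptide length must be greater than 3.")

    @field_validator("max_length")
    @classmethod
    def _validate_max_length(cls, v):
        if v <= 100:
            return v
        raise ValueError("Maximum peptide length must be less than or equal to 100.")

try:
    LengthBounds(min_length=3)  # rejected: must be greater than 3
except ValidationError as err:
    print(err)
```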