11import logging
2- from typing import Tuple , Union
2+ from typing import Tuple , Union , Iterator
33import pandas as pd
44import numpy as np
5- from torch .utils .data import Dataset , DataLoader
5+ from torch .utils .data import Dataset
66from ideeplc .utilities import df_to_matrix , reform_seq
77
88LOGGER = logging .getLogger (__name__ )
99
1010
11- # Making the pytorch dataset
1211class MyDataset (Dataset ):
1312 def __init__ (self , sequences : np .ndarray , retention : np .ndarray ) -> None :
1413 self .sequences = sequences
@@ -25,15 +24,14 @@ def data_initialize(
2524 csv_path : str , ** kwargs
2625) -> Union [Tuple [MyDataset , np .ndarray ], Tuple [MyDataset , np .ndarray ]]:
2726 """
28- Initialize peptides matrices based on a CSV file containing raw peptide sequences.
27+ Initialize peptide matrices based on a CSV file containing raw peptide sequences.
2928
3029 :param csv_path: Path to the CSV file containing raw peptide sequences.
31- :return: DataLoader for prediction.
30+ :return: Dataset for prediction or fine-tuning and x_shape .
3231 """
33-
3432 LOGGER .info (f"Loading peptides from { csv_path } " )
33+
3534 try :
36- # Load peptides from CSV file
3735 df = pd .read_csv (csv_path )
3836 except FileNotFoundError :
3937 LOGGER .error (f"File { csv_path } not found." )
@@ -63,22 +61,108 @@ def data_initialize(
6361 LOGGER .info (
6462 f"Loaded and reformed { len (reformed_peptides )} peptides sequences from the file."
6563 )
64+
6665 try :
67- # Convert sequences to matrix format
6866 sequences , tr , errors = df_to_matrix (reformed_peptides , df )
6967 except Exception as e :
7068 LOGGER .error (f"Error converting sequences to matrix format: { e } " )
7169 raise
70+
7271 if errors :
7372 LOGGER .warning (f"Errors encountered during conversion: { errors } " )
7473
7574 prediction_dataset = MyDataset (sequences , tr )
7675
77- # Create DataLoader objects
78- dataloader_pred = DataLoader ( prediction_dataset )
79- # passing the training X shape
80- for batch in dataloader_pred :
81- x_shape = batch [ 0 ]. shape
82- break
76+ if len ( prediction_dataset ) == 0 :
77+ LOGGER . error ( "No valid peptide entries were found in the input file." )
78+ raise ValueError ( "No valid peptide entries were found in the input file." )
79+
80+ # Keep historical x_shape contract expected by model/tests: ( batch, channels, length)
81+ x_shape = ( 1 ,) + prediction_dataset [ 0 ][ 0 ]. shape
8382 LOGGER .info (f"Dataset initialized with data shape { x_shape } ." )
8483 return prediction_dataset , x_shape
84+
85+
def _require_columns(df: pd.DataFrame) -> None:
    """Raise ValueError if *df* is missing a column required for matrix conversion."""
    # Column name -> description used in the error/log messages.
    required = {
        "seq": "peptide sequences",
        "modifications": "peptide modifications",
        "tr": "retention times",
    }
    for column, description in required.items():
        if column not in df.columns:
            LOGGER.error(
                f"CSV file must contain a '{column}' column with {description}."
            )
            raise ValueError(f"Missing '{column}' column in the CSV file.")


def data_initialize_chunked(
    csv_path: str, chunk_size: int = 10000, **kwargs
) -> Iterator[Tuple[pd.DataFrame, MyDataset, Tuple[int, ...]]]:
    """
    Lazily initialize peptide matrices from a CSV file, one chunk at a time.

    :param csv_path: Path to the CSV file containing raw peptide sequences.
    :param chunk_size: Number of rows to load per chunk.
    :param kwargs: Accepted for interface compatibility; currently unused.
    :return: Iterator yielding (dataframe chunk, dataset chunk, x_shape),
        where x_shape is the tuple ``(1,) + sample_shape`` — the historical
        (batch, channels, length) contract expected by the model.
    :raises FileNotFoundError: If the CSV file does not exist.
    :raises pd.errors.EmptyDataError: If the CSV file is empty.
    :raises ValueError: If a required column is missing from a chunk.
    """
    LOGGER.info(f"Loading peptides from {csv_path} in chunks of {chunk_size}")

    try:
        chunk_iter = pd.read_csv(csv_path, chunksize=chunk_size)
    except FileNotFoundError:
        LOGGER.error(f"File {csv_path} not found.")
        raise
    except pd.errors.EmptyDataError:
        LOGGER.error(f"File {csv_path} is empty.")
        raise
    except Exception as e:
        LOGGER.error(f"Error reading {csv_path}: {e}")
        raise

    for chunk_idx, df in enumerate(chunk_iter, start=1):
        # Columns are constant across chunks of a single CSV, but the check is
        # cheap and keeps every yielded chunk self-validating.
        _require_columns(df)

        reformed_peptides = [
            reform_seq(seq, mod) for seq, mod in zip(df["seq"], df["modifications"])
        ]
        LOGGER.info(
            f"Chunk {chunk_idx}: loaded and reformed {len(reformed_peptides)} peptides sequences."
        )

        try:
            sequences, tr, errors = df_to_matrix(reformed_peptides, df)
        except Exception as e:
            LOGGER.error(
                f"Error converting sequences to matrix format in chunk {chunk_idx}: {e}"
            )
            raise

        if errors:
            LOGGER.warning(
                f"Errors encountered during conversion in chunk {chunk_idx}: {errors}"
            )

        prediction_dataset = MyDataset(sequences, tr)

        if len(prediction_dataset) == 0:
            # A chunk can lose every row to conversion errors; skip it rather
            # than yield an empty dataset.
            LOGGER.warning(f"Chunk {chunk_idx} contains no valid peptide entries.")
            continue

        # Keep historical x_shape contract expected by model/tests:
        # (batch, channels, length).
        x_shape = (1,) + prediction_dataset[0][0].shape
        LOGGER.info(f"Chunk {chunk_idx} initialized with data shape {x_shape}.")
        yield df, prediction_dataset, x_shape
151+
152+
def get_input_shape_from_first_chunk(
    csv_path: str, chunk_size: int = 10000
) -> Tuple[int, ...]:
    """
    Determine the model input shape from the first valid chunk of a CSV file.

    Iteration stops as soon as one non-empty chunk has been materialized, so
    the rest of the file is never loaded.

    :param csv_path: Path to the CSV file containing raw peptide sequences.
    :param chunk_size: Number of rows to load per chunk.
    :return: x_shape for model initialization.
    :raises ValueError: If no chunk yields any valid peptide entries.
    """
    # The dataframe and dataset of the chunk are discarded — only the shape
    # is needed here.
    for _, _dataset, x_shape in data_initialize_chunked(
        csv_path=csv_path, chunk_size=chunk_size
    ):
        LOGGER.info(f"Detected input shape from first valid chunk: {x_shape}")
        return x_shape

    LOGGER.error("No valid chunks found in the input file.")
    raise ValueError("No valid chunks found in the input file.")