Skip to content

Commit 65ed461

Browse files
adding documentation to new functions
1 parent 96b903c commit 65ed461

2 files changed

Lines changed: 258 additions & 12 deletions

File tree

enviroMS/LC_FTICR_workflow.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,39 @@
2323
################################### LC-FTICR OBJECT AND OBJECT'S METHODS ###################################
2424
@dataclass
2525
class LC_FTICR_WorkflowParameters:
26+
"""
27+
Data class to establish workflow parameters.
28+
29+
Parameters
30+
----------
31+
start_time : int
32+
Start time (minutes).
33+
end_time : int
34+
End time (minutes).
35+
time_block : int
36+
Time block (seconds).
37+
refmasslist_neg : str
38+
Path to reference m/z database.
39+
full_input_file_path : str
40+
The path of file to process.
41+
output_directory : str
42+
Path to save outputs.
43+
output_file_name : str
44+
Output filename.
45+
output_file_type : str
46+
Output extension.
47+
lc_fticr_toml_path : str
48+
The path to the toml file with the lc-fticr ms workflow parameters.
49+
corems_toml_path : str
50+
The path to the toml file with the CoreMS parameters.
51+
do_plot_van_krevelen_all_ids : bool
52+
Output van Krevelen plot for all IDs.
53+
do_plot_van_krevelen_individual : bool
54+
Output individual van Krevelen plot for all IDs.
55+
do_plot_properties : bool
56+
Output plot of properties.
57+
"""
58+
2659
# Time Block Parameters:
2760
start_time: int # minutes
2861
end_time: int # minutes
@@ -79,6 +112,13 @@ def create_temp_corems_toml(self):
79112

80113
### function that init parser and get data
81114
def init_parser_extract_data(self) -> pd.DataFrame:
115+
"""
116+
Initialize the parser and extract data from input file.
117+
This function reads the input file, extracts the Total Ion Chromatogram (TIC) data,
118+
and returns a DataFrame containing the scans, TIC values, and time.
119+
120+
"""
121+
82122
# Define datafile location
83123
file_in = self.full_input_file_path
84124

@@ -103,6 +143,25 @@ def init_parser_extract_data(self) -> pd.DataFrame:
103143
### process timeblocks
104144
# Process the time block mass spectrum
105145
def proc_time_block_inner(self, msreader, datafile, block):
146+
"""
147+
Process time blocks of mass spectra.
148+
149+
Parameters
150+
----------
151+
msreader : ImportMassSpectraThermoMSFileReader
152+
The mass spectrum reader object.
153+
datafile : str
154+
The path to the data file.
155+
block : int
156+
The time block number.
157+
158+
Returns
159+
-------
160+
msdf : pd.DataFrame
161+
DataFrame containing the processed mass spectrum data.
162+
statdict : dict
163+
Dictionary containing statistics for the processed mass spectrum.
164+
"""
106165
# scans = list(subset_df['scan'])
107166

108167
# load_and_set_toml_parameters_ms(MSParameters, self.corems_toml_path)
@@ -143,6 +202,21 @@ def proc_time_block_inner(self, msreader, datafile, block):
143202
return(msdf, statdict)
144203

145204
def process_with_time_block(self, tic_df):
205+
"""
206+
Process the mass spectra with time blocks.
207+
208+
Parameters
209+
----------
210+
tic_df : pd.DataFrame
211+
DataFrame containing the Total Ion Chromatogram (TIC) data with time and scan information.
212+
Returns
213+
-------
214+
all_msdfs : pd.DataFrame
215+
DataFrame containing all processed mass spectra data.
216+
all_statdics : list
217+
List of dictionaries containing statistics for each time block.
218+
219+
"""
146220
# Strip out the time where there's no useful data
147221
file_in = self.full_input_file_path
148222
tic_df = tic_df[(tic_df['time'] > self.start_time) & (tic_df['time'] < self.end_time)]
@@ -178,6 +252,17 @@ def process_with_time_block(self, tic_df):
178252

179253

180254
def create_summary(self, all_statdics):
255+
"""
256+
Create a summary DataFrame from the list of dictionaries containing statistics.
257+
Parameters
258+
----------
259+
all_statdics : list
260+
List of dictionaries containing statistics for each time block.
261+
Returns
262+
-------
263+
summary_df : pd.DataFrame
264+
DataFrame containing the summary statistics for all time blocks.
265+
"""
181266
# Flatten the list of dictionaries
182267
flat_list = [inner_dict for outer_dict in all_statdics for inner_dict in outer_dict.values()]
183268
# Create a DataFrame
@@ -191,6 +276,19 @@ def create_summary(self, all_statdics):
191276

192277
## for creating plots
193278
def filter_out_common_background(df):
279+
"""
280+
Filter out common background entries in the DataFrame based on 'Molecular Formula' and 'Peak Height'.
281+
282+
Parameters
283+
----------
284+
df : pd.DataFrame
285+
DataFrame containing 'Molecular Formula', 'Peak Height', and 'block' columns.
286+
Returns
287+
-------
288+
filtered_df : pd.DataFrame
289+
DataFrame with common background entries removed.
290+
"""
291+
194292
formula_block_counts = df.pivot_table(index='Molecular Formula', columns='block', aggfunc='size', fill_value=0)
195293

196294
# Filter to get 'Molecular Formula' entries that appear in all blocks
@@ -214,6 +312,19 @@ def peak_height_similar(df, tolerance=0.99): # 10% tolerance
214312

215313
### create plots
216314
def plot_van_krevelen_all_ids(all_msdfs_path, output_dir):
315+
"""
316+
Plot a van Krevelen diagram for all IDs in the provided DataFrame or CSV file.
317+
Parameters
318+
----------
319+
all_msdfs_path : str or pd.DataFrame
320+
Path to the CSV file containing all mass spectra data or a DataFrame.
321+
output_dir : str
322+
Directory where the plot will be saved.
323+
Returns
324+
-------
325+
None
326+
"""
327+
217328
if isinstance(all_msdfs_path,str):
218329
all_msdfs_df = pd.read_csv(all_msdfs_path)
219330
else:
@@ -240,6 +351,18 @@ def plot_van_krevelen_all_ids(all_msdfs_path, output_dir):
240351
plt.show()
241352

242353
def plot_van_krevelen_individual(all_msdfs_path, output_dir):
354+
"""
355+
Plot individual van Krevelen diagrams for each time block in the provided DataFrame or CSV file.
356+
Parameters
357+
----------
358+
all_msdfs_path : str or pd.DataFrame
359+
Path to the CSV file containing all mass spectra data or a DataFrame.
360+
output_dir : str
361+
Directory where the plots will be saved.
362+
Returns
363+
-------
364+
None
365+
"""
243366
if isinstance(all_msdfs_path,str):
244367
all_msdfs_df = pd.read_csv(all_msdfs_path)
245368
else:
@@ -278,6 +401,18 @@ def plot_van_krevelen_individual(all_msdfs_path, output_dir):
278401
fig.savefig(output_dir+'TimeBlockIDs.png',dpi=300,bbox_inches='tight')
279402

280403
def plot_properties(summary_df_path,output_dir):
404+
"""
405+
Plot trends and distributions of various properties from the summary DataFrame or CSV file.
406+
Parameters
407+
----------
408+
summary_df_path : str or pd.DataFrame
409+
Path to the CSV file containing summary statistics or a DataFrame.
410+
output_dir : str
411+
Directory where the plots will be saved.
412+
Returns
413+
-------
414+
None
415+
"""
281416
if isinstance(summary_df_path,str):
282417
summary_df = pd.read_csv(summary_df_path)
283418
else:
@@ -314,6 +449,19 @@ def plot_properties(summary_df_path,output_dir):
314449
################################### RUN LC-FTICR WORKFLOW ###################################
315450

316451
def run_LC_FTICR_workflow(lc_fticr_workflow_paramaters_toml_file):
452+
"""
453+
Run LC-FTICR metabolomics workflow.
454+
455+
Parameters
456+
----------
457+
lc_fticr_workflow_paramaters_toml_file : str
458+
Path to workflow parameters file.
459+
Returns
460+
-------
461+
None
462+
463+
"""
464+
317465
# read in LC_WorkflowParameters from toml file
318466
with open(lc_fticr_workflow_paramaters_toml_file, "r") as infile:
319467
lc_object = LC_FTICR_WorkflowParameters(**toml.load(infile))
@@ -345,6 +493,41 @@ def run_LC_FTICR_workflow_wdl(
345493
do_plot_van_krevelen_individual,
346494
do_plot_properties,
347495
):
496+
"""
497+
Run LC-FTICR metabolomics workflow with parameters from WDL inputs.
498+
Parameters
499+
----------
500+
start_time : int
501+
Start time (minutes).
502+
end_time : int
503+
End time (minutes).
504+
time_block : int
505+
Time block (seconds).
506+
refmasslist_neg : str
507+
Path to reference m/z database.
508+
full_input_file_path : str
509+
The path of file to process.
510+
output_directory : str
511+
Path to save outputs.
512+
output_file_name : str
513+
Output filename.
514+
output_file_type : str
515+
Output extension.
516+
lc_fticr_toml_path : str
517+
The path to the toml file with the lc-fticr ms workflow parameters.
518+
corems_toml_path : str
519+
The path to the toml file with the CoreMS parameters.
520+
do_plot_van_krevelen_all_ids : bool
521+
Output van Krevelen plot for all IDs.
522+
do_plot_van_krevelen_individual : bool
523+
Output individual van Krevelen plot for all IDs.
524+
do_plot_properties : bool
525+
Output plot of properties.
526+
527+
Returns
528+
-------
529+
None
530+
"""
348531
# read in LC_WorkflowParameters from wdl inputs
349532
lc_object = LC_FTICR_WorkflowParameters(start_time = start_time,
350533
end_time = end_time,

enviroMS/cli.py

Lines changed: 75 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -121,13 +121,13 @@ def create_database(corems_parameters_file, jobs):
121121
@click.argument("raw_file_final_scan", required=True, type=int)
122122
@click.argument("is_centroid", required=True, type=bool)
123123
@click.argument("calibration_ref_file_path", required=False, type=str)
124-
@click.option("--calibrate", "-c", default=True)
125-
@click.option("--plot_mz_error", "-e", default=True)
126-
@click.option("--plot_ms_assigned_unassigned", "-a", default=True)
127-
@click.option("--plot_c_dbe", "-cb", default=True)
128-
@click.option("--plot_van_krevelen", "-vk", default=True)
129-
@click.option("--plot_ms_classes", "-mc", default=True)
130-
@click.option("--plot_mz_error_classes", "-ec", default=True)
124+
@click.option("--calibrate", "-c", default=True, help="Calibrate the raw files")
125+
@click.option("--plot_mz_error", "-e", default=True, help="Plot m/z error")
126+
@click.option("--plot_ms_assigned_unassigned", "-a", default=True, help="Plot MS assigned and unassigned")
127+
@click.option("--plot_c_dbe", "-cb", default=True, help="Plot C vs DBE")
128+
@click.option("--plot_van_krevelen", "-vk", default=True, help="Plot Van Krevelen diagram")
129+
@click.option("--plot_ms_classes", "-mc", default=True, help="Plot MS classes")
130+
@click.option("--plot_mz_error_classes", "-ec", default=True, help="Plot m/z error classes")
131131
@click.option("--jobs", "-j", default=4, help="'cpu's'")
132132
def run_di_wdl(*args, **kwargs):
133133
"""Run the Direct Infusion Workflow using wdl"""
@@ -160,7 +160,39 @@ def run_di(di_workflow_paramaters_file, jobs, replicas, tasks, mpi):
160160
@cli.command(name="run_lc_fticr")
161161
@click.argument("lc_fticr_workflow_paramaters_file", required=True, type=str)
162162
def run_lc_fticr(lc_fticr_workflow_paramaters_file):
163-
"""Run the LC-FTICR workflow"""
163+
"""Run the LC FTICR MS workflow using the settings in the given toml file (keys listed below)
164+
165+
Parameters
166+
----------
167+
full_input_file_path : str
168+
The path of the file to process
169+
start_time : int
170+
The start time (minutes)
171+
end_time : int
172+
The end time (minutes)
173+
time_block : int
174+
The time block (seconds)
175+
refmasslist_neg : str
176+
The path to the reference m/z database
177+
output_directory : str
178+
The directory where the output files will be stored
179+
output_file_name : str
180+
The output file name
181+
output_file_type : str
182+
The output file extension
183+
lc_fticr_toml_path : str
184+
The path to the toml file with the lc-fticr ms workflow parameters
185+
corems_toml_path : str
186+
The path to the toml file with the CoreMS parameters
187+
do_plot_van_krevelen_all_ids : bool
188+
Whether to output a van Krevelen plot for all IDs
189+
do_plot_van_krevelen_individual : bool
190+
Whether to output individual van Krevelen plots
191+
do_plot_properties : bool
192+
Whether to output plots of properties
193+
194+
"""
195+
164196
run_LC_FTICR_workflow(lc_fticr_workflow_paramaters_file)
165197

166198

@@ -175,9 +207,9 @@ def run_lc_fticr(lc_fticr_workflow_paramaters_file):
175207
@click.argument("output_file_type", required=True, type=str)
176208
@click.argument("lc_fticr_toml_path", required=True, type=str)
177209
@click.argument("corems_toml_path", required=True, type=str)
178-
@click.option("--do_plot_van_krevelen_all_ids", "-a", default=True)
179-
@click.option("--do_plot_van_krevelen_individual", "-i", default=True)
180-
@click.option("--do_plot_properties", "-p", default=True)
210+
@click.option("--do_plot_van_krevelen_all_ids", "-a", default=True, help="Creates Van Krevelen plots for all ids.")
211+
@click.option("--do_plot_van_krevelen_individual", "-i", default=True, help="Creates Van Krevelen plots for all ids individually.")
212+
@click.option("--do_plot_properties", "-p", default=True, help="Creates plots of properties for run.")
181213
def run_lc_fticr_wdl(
182214
full_input_file_path,
183215
start_time,
@@ -193,7 +225,38 @@ def run_lc_fticr_wdl(
193225
do_plot_van_krevelen_individual,
194226
do_plot_properties,
195227
):
196-
"""Run the LC-FTICR Workflow using wdl"""
228+
"""
229+
Run the LC FTICR MS workflow using WDL.
230+
231+
Parameters
232+
----------
233+
full_input_file_path : str
234+
The path to the input file containing LC-FTICR data
235+
start_time : float
236+
The start time for the LC-FTICR analysis (minutes)
237+
end_time : float
238+
The end time for the LC-FTICR analysis (minutes)
239+
time_block : float
240+
The time block for processing the LC-FTICR data (seconds)
241+
refmasslist_neg : str
242+
The path to the reference mass list for negative ion mode
243+
output_directory : str
244+
The directory where the output files will be stored
245+
output_file_name : str
246+
The name of the output file to be generated
247+
output_file_type : str
248+
The type of the output file (e.g., csv, json)
249+
lc_fticr_toml_path : str
250+
The path to the LC-FTICR workflow parameters file in TOML format
251+
corems_toml_path : str
252+
The path to the CoreMS parameters file in TOML format
253+
do_plot_van_krevelen_all_ids : bool
254+
Whether to create Van Krevelen plots for all IDs
255+
do_plot_van_krevelen_individual : bool
256+
Whether to create individual Van Krevelen plots for each ID
257+
do_plot_properties : bool
258+
Whether to create plots of properties for the run
259+
"""
197260
click.echo("Running lc-fticr workflow")
198261
run_LC_FTICR_workflow_wdl(
199262
full_input_file_path = full_input_file_path,

0 commit comments

Comments
 (0)