Skip to content

Commit fb55d2f

Browse files
committed
create empty marker plots if sample has no reads above the detection threshold
1 parent 2425112 commit fb55d2f

2 files changed

Lines changed: 41 additions & 34 deletions

File tree

lusSTR/workflows/strs.smk

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@ custom = config["custom_ranges"]
2121
def get_sample_IDs(input, a_software, output, software, separate):
2222
convert_out = f"{output}.txt"
2323
format_out = f"{output}.csv"
24-
if (software == "efm" or software == "mpsproto") and separate is False:
25-
ID_list = os.path.basename(output)
26-
elif os.path.exists(convert_out):
24+
if os.path.exists(convert_out):
2725
ID_list = get_existing_IDs(convert_out, "\t")
2826
elif os.path.exists(format_out):
2927
ID_list = get_existing_IDs(format_out, ",")
@@ -93,6 +91,13 @@ def get_output():
9391
return outname
9492

9593

94+
def get_markerplot_name(output, custom):
95+
if custom:
96+
return f"{output}_custom_range"
97+
else:
98+
return output
99+
100+
96101
rule all:
97102
input:
98103
expand("{name}.csv", name=output_name),
@@ -136,9 +141,9 @@ rule filter:
136141
rules.convert.output
137142
output:
138143
expand(
139-
"{outdir}/{samplename}_{prof_t}_{data_t}.csv", outdir=output_name,
144+
"MarkerPlots/{output_name}_{samplename}_marker_plots.pdf", output_name=get_markerplot_name(config["output"], config["custom_ranges"]),
140145
samplename=get_sample_IDs(input_name, config["analysis_software"], output_name, software,
141-
separate), prof_t=prof, data_t=data
146+
separate)
142147
)
143148
params:
144149
output_type=config["output_type"],

lusSTR/wrappers/filter.py

Lines changed: 31 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -313,39 +313,40 @@ def format_ref_table(new_rows, sample_data, datatype):
313313
return sort_df
314314

315315

316-
def marker_plots(df, output_name, kit, wd="."):
316+
def marker_plots(df, output_name, kit, sample_list, wd="."):
317317
Path(f"{wd}/MarkerPlots").mkdir(parents=True, exist_ok=True)
318318
filt_df = df[df["allele_type"] == "Typed"]
319-
for sample_id in df["SampleID"].unique():
320-
if df[df["SampleID"] == sample_id].empty:
321-
print(f"{sample_id} does not have any reads passing filter. Skipping to next sample.")
322-
else:
323-
with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf:
324-
if not filt_df[filt_df["SampleID"] == sample_id].empty:
325-
make_plot(filt_df, sample_id, output_name, kit, filters=True, at=False)
326-
pdf.savefig()
327-
make_plot(df, sample_id, output_name, kit)
328-
pdf.savefig()
329-
make_plot(df, sample_id, output_name, kit, sameyaxis=True)
319+
for sample_id in sample_list:
320+
# if df[df["SampleID"] == sample_id].empty:
321+
# print(f"{sample_id} does not have any reads passing filter. Skipping to next sample.")
322+
# else:
323+
with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf:
324+
if not filt_df[filt_df["SampleID"] == sample_id].empty:
325+
make_plot(filt_df, sample_id, output_name, kit, filters=True, at=False)
330326
pdf.savefig()
327+
make_plot(df, sample_id, output_name, kit)
328+
pdf.savefig()
329+
make_plot(df, sample_id, output_name, kit, sameyaxis=True)
330+
pdf.savefig()
331331

332332

333333
def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, at=True):
334334
sample_df = df[df["SampleID"] == sample_id].copy()
335-
conditions = [
336-
sample_df["allele_type"].str.contains("Typed"),
337-
sample_df["allele_type"].str.contains("BelowAT"),
338-
sample_df["allele_type"].str.contains("stutter"),
339-
sample_df["allele_type"].str.contains("Deleted"),
340-
]
341-
values = ["Typed", "BelowAT", "Stutter", "Deleted"]
342-
sample_df.loc[:, "Type"] = np.select(conditions, values)
343-
max_reads = max(sample_df["Reads"])
344-
n = 100 if max_reads > 1000 else 10
345-
max_yvalue = (int(math.ceil(max_reads / n)) * n) + n
346-
increase_value = int(math.ceil((max_yvalue / 5) / n)) * n
335+
plot_loc = 0
347336
fig = plt.figure(figsize=(30, 30))
348-
n = 0
337+
if not sample_df.empty:
338+
conditions = [
339+
sample_df["allele_type"].str.contains("Typed"),
340+
sample_df["allele_type"].str.contains("BelowAT"),
341+
sample_df["allele_type"].str.contains("stutter"),
342+
sample_df["allele_type"].str.contains("Deleted"),
343+
]
344+
values = ["Typed", "BelowAT", "Stutter", "Deleted"]
345+
sample_df.loc[:, "Type"] = np.select(conditions, values)
346+
max_reads = max(sample_df["Reads"])
347+
n = 100 if max_reads > 1000 else 10
348+
max_yvalue = (int(math.ceil(max_reads / n)) * n) + n
349+
increase_value = int(math.ceil((max_yvalue / 5) / n)) * n
349350
if kit == "powerseq":
350351
str_list = (
351352
str_lists["powerseq_ystrs"] if "sexloci" in output_name else str_lists["powerseq_strs"]
@@ -355,10 +356,10 @@ def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, a
355356
str_lists["forenseq_ystrs"] if "sexloci" in output_name else str_lists["forenseq_strs"]
356357
)
357358
for marker in str_list:
358-
n += 1
359+
plot_loc += 1
359360
colors = {"Typed": "green", "Stutter": "blue", "BelowAT": "red", "Deleted": "purple"}
360361
marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele")
361-
ax = fig.add_subplot(6, 5, n)
362+
ax = fig.add_subplot(6, 5, plot_loc)
362363
if not marker_df.empty:
363364
if marker == "AMELOGENIN":
364365
for i, row in marker_df.iterrows():
@@ -448,6 +449,7 @@ def process_input(
448449
info=True,
449450
):
450451
full_df = pd.read_csv(f"{input_name}.txt", sep="\t")
452+
sample_list = full_df["SampleID"].unique()
451453
if custom:
452454
seq_col = "Custom_Range_Sequence"
453455
brack_col = "Custom_Bracketed_Notation"
@@ -460,7 +462,7 @@ def process_input(
460462
)
461463
if nofiltering:
462464
full_df["allele_type"] = "Typed"
463-
marker_plots(full_df, input_name, kit)
465+
marker_plots(full_df, input_name, kit, sample_list)
464466
if output_type == "efm" or output_type == "mpsproto":
465467
EFM_output(full_df, outpath, profile_type, data_type, brack_col, sex, kit, separate)
466468
else:
@@ -469,7 +471,7 @@ def process_input(
469471
dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])}
470472
final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col, kit)
471473
if final_df is not None:
472-
marker_plots(final_df, input_name, kit)
474+
marker_plots(final_df, input_name, kit, sample_list)
473475
if output_type == "efm" or output_type == "mpsproto":
474476
EFM_output(
475477
final_df, outpath, profile_type, data_type, brack_col, sex, kit, separate

0 commit comments

Comments
 (0)