Skip to content

Commit 2425112

Browse files
authored
Implementing lusSTR workflow for Amelogenin locus (#85)
1 parent c6b6537 commit 2425112

21 files changed

Lines changed: 545 additions & 170 deletions

lusSTR/cli/gui.py

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import pandas as pd
2323
from pathlib import Path
2424
import plotly.express as px
25+
import plotly.graph_objs as go
2526
import streamlit as st
2627
from streamlit_option_menu import option_menu
2728
import yaml
@@ -43,6 +44,14 @@ def get_filter_metadata_file():
4344
filter_marker_data = json.load(fh)
4445

4546

47+
def get_strlist_file():
48+
return importlib.resources.files("lusSTR") / "data/str_lists.json"
49+
50+
51+
with open(get_strlist_file(), "r") as fh:
52+
str_lists = json.load(fh)
53+
54+
4655
# ------------ Function to Generate config.yaml File ---------- #
4756

4857

@@ -197,14 +206,33 @@ def interactive_plots_allmarkers(sample_df, flagged_df):
197206
max_yvalue = (int(math.ceil(max_reads / n)) * n) + n
198207
increase_value = int(math.ceil((max_yvalue / 5) / n)) * n
199208
n = 0
200-
for marker in sample_df["Locus"].unique():
209+
all_loci = (
210+
str_lists["forenseq_strs"]
211+
if st.session_state.kit == "forenseq"
212+
else str_lists["powerseq_strs"]
213+
)
214+
missing_loci = [x for x in all_loci if x not in sample_df["Locus"].unique()]
215+
for marker in all_loci:
201216
col = cols[n]
202217
container = col.container(border=True)
203218
sample_locus = sample_df["SampleID"].unique() + "_" + marker
204-
marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele")
219+
sample_df = np.where(
220+
sample_df["Locus"] == "AMELOGENIN",
221+
np.where(sample_df["CE_Allele"] == "X", 0, 1),
222+
sample_df["CE_Allele"],
223+
)
224+
sample_df["CE_Allele"] = pd.to_numeric(sample_df["CE_Allele"])
225+
marker_df = sample_df[sample_df["Locus"] == marker].sort_values(
226+
by=["CE_Allele", "allele_type"], ascending=[False, True]
227+
)
205228
if sample_locus in flagged_df["key"].values:
206229
marker = f"⚠️{marker}⚠️"
207-
plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True)
230+
if marker in missing_loci:
231+
marker = f"⚠️{marker}⚠️"
232+
plot = go.Figure()
233+
plot.update_layout(title=marker)
234+
else:
235+
plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True)
208236
container.plotly_chart(plot, use_container_width=True)
209237
if n == 3:
210238
n = 0
@@ -240,9 +268,14 @@ def interactive_plots(df, locus, ymax, increase, all=False):
240268
)
241269
plot.add_hline(y=at, line_width=3, line_dash="dot", line_color="gray")
242270
plot.add_annotation(text=f"AT", x=min_x + 0.1, y=at, showarrow=False, yshift=10)
243-
plot.update_layout(
244-
xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1))
245-
)
271+
if locus == "AMELOGENIN":
272+
plot.update_layout(
273+
xaxis=dict(tickvals=np.arange(-1, 2, 1), tickmode="array", ticktext=["", "X", "Y", ""])
274+
)
275+
else:
276+
plot.update_layout(
277+
xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1))
278+
)
246279
if all:
247280
plot.update_layout(
248281
yaxis=dict(range=[0, ymax], tickmode="array", tickvals=np.arange(0, ymax, increase))
@@ -307,11 +340,16 @@ def interactive_setup(df1, file):
307340
)
308341
interactive_plots_allmarkers(sample_df, flags)
309342
else:
343+
plot_df = sample_df
344+
sample_df = np.where(
345+
sample_df["Locus"] == "AMELOGENIN",
346+
np.where(sample_df["CE_Allele"] == "X", 0, 1),
347+
sample_df["CE_Allele"],
348+
)
349+
plot_df["CE_Allele"] = pd.to_numeric(plot_df["CE_Allele"])
310350
locus_key = f"{sample}_{locus}"
311351
if locus_key not in st.session_state:
312-
st.session_state[locus_key] = sample_df[sample_df["Locus"] == locus].reset_index(
313-
drop=True
314-
)
352+
st.session_state[locus_key] = plot_df[plot_df["Locus"] == locus].reset_index(drop=True)
315353
Type = [
316354
"Deleted",
317355
"Typed",

lusSTR/data/filters.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,22 @@
11
{
2+
"AMELOGENIN": {
3+
"MinimumNumberReadsForDynamicThresholds": 650,
4+
"DetectionThresholdStaticCount": 10,
5+
"DetectionThresholdDynamicPercent": 0,
6+
"DetectionThresholdUse": "Static",
7+
"AnalyticalThresholdStaticCount": 20,
8+
"AnalyticalThresholdDynamicPercent": 0.017,
9+
"AnalyticalThresholdUse": "Both",
10+
"StochasticThresholdStaticCount": 20,
11+
"StochasticThresholdDynamicPercent": 0.017,
12+
"StochasticThresholdUse": "Both",
13+
"MinimumHeterozygousBalanceThresholdDynamicPercent": 0.50,
14+
"SameSizeThresholdDynamicPercent": 0,
15+
"StutterThresholdDynamicPercent": 0,
16+
"StutterForwardThresholdDynamicPercent": 0,
17+
"Intercept": 0,
18+
"Slope": 0
19+
},
220
"CSF1PO": {
321
"MinimumNumberReadsForDynamicThresholds": 650,
422
"DetectionThresholdStaticCount": 10,

lusSTR/data/str_lists.json

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
{
2+
3+
"powerseq_strs" : [
4+
"AMELOGENIN",
5+
"CSF1PO",
6+
"D10S1248",
7+
"D12S391",
8+
"D13S317",
9+
"D16S539",
10+
"D18S51",
11+
"D19S433",
12+
"D1S1656",
13+
"D21S11",
14+
"D22S1045",
15+
"D2S1338",
16+
"D2S441",
17+
"D3S1358",
18+
"D5S818",
19+
"D7S820",
20+
"D8S1179",
21+
"FGA",
22+
"PENTA D",
23+
"PENTA E",
24+
"TH01",
25+
"TPOX",
26+
"VWA"
27+
],
28+
"forenseq_strs" : [
29+
"AMELOGENIN",
30+
"CSF1PO",
31+
"D10S1248",
32+
"D12S391",
33+
"D13S317",
34+
"D16S539",
35+
"D17S1301",
36+
"D18S51",
37+
"D19S433",
38+
"D1S1656",
39+
"D20S482",
40+
"D21S11",
41+
"D22S1045",
42+
"D2S1338",
43+
"D2S441",
44+
"D3S1358",
45+
"D4S2408",
46+
"D5S818",
47+
"D6S1043",
48+
"D7S820",
49+
"D8S1179",
50+
"D9S1122",
51+
"FGA",
52+
"PENTA D",
53+
"PENTA E",
54+
"TH01",
55+
"TPOX",
56+
"VWA"
57+
],
58+
"powerseq_ystrs" : [
59+
"DYS19",
60+
"DYS385A-B",
61+
"DYS389II",
62+
"DYS390",
63+
"DYS391",
64+
"DYS392",
65+
"DYS393",
66+
"DYS437",
67+
"DYS438",
68+
"DYS439",
69+
"DYS448",
70+
"DYS456",
71+
"DYS458",
72+
"DYS481",
73+
"DYS533",
74+
"DYS549",
75+
"DYS570",
76+
"DYS576",
77+
"DYS635",
78+
"DYS643",
79+
"Y-GATA-H4"
80+
],
81+
"forenseq_ystrs" : [
82+
"DYS19",
83+
"DYS385A-B",
84+
"DYS389II",
85+
"DYS390",
86+
"DYS391",
87+
"DYS392",
88+
"DYS437",
89+
"DYS438",
90+
"DYS439",
91+
"DYS448",
92+
"DYS481",
93+
"DYS533",
94+
"DYS549",
95+
"DYS570",
96+
"DYS576",
97+
"DYS635",
98+
"DYS643",
99+
"Y-GATA-H4"
100+
]
101+
}

lusSTR/data/str_markers.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,23 @@
11
{
2+
"AMELOGENIN": {
3+
"BasesToSubtract": 0,
4+
"NumRepeats": 1,
5+
"Repeats": [
6+
"AAAGTG"
7+
],
8+
"NumBasesToSeparate": 0,
9+
"ReverseCompNeeded": "No",
10+
"LUS": "",
11+
"Sec": "",
12+
"Tert": "",
13+
"Foren_5": 26,
14+
"Foren_3": 37,
15+
"Power_5": 10,
16+
"Power_3": 37,
17+
"Custom_5": 0,
18+
"Custom_3": 0,
19+
"Alleles": ["X", "Y"]
20+
},
221
"CSF1PO": {
322
"BasesToSubtract": 0,
423
"NumRepeats": 1,

lusSTR/scripts/filter_settings.py

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,57 @@ def get_filter_metadata_file():
2828

2929
def filters(locus_allele_info, locus, locus_reads, datatype, brack_col):
3030
metadata = filter_marker_data[locus]
31-
if len(locus_allele_info) == 1:
32-
locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info)
31+
if locus == "AMELOGENIN":
32+
locus_allele_info = filter_amel(metadata, locus_allele_info, locus_reads)
3333
else:
34-
locus_allele_info, locus_reads = multiple_allele_thresholds(
35-
metadata, locus_reads, locus_allele_info
36-
)
37-
locus_allele_info = ce_filtering(
38-
locus_allele_info, locus_reads, metadata, datatype, brack_col
39-
)
40-
if datatype != "ce":
41-
locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype)
34+
locus_allele_info["CE_Allele"] = locus_allele_info["CE_Allele"].astype(float)
35+
if len(locus_allele_info) == 1:
36+
locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info)
37+
else:
38+
locus_allele_info, locus_reads = multiple_allele_thresholds(
39+
metadata, locus_reads, locus_allele_info
40+
)
41+
locus_allele_info = ce_filtering(
42+
locus_allele_info, locus_reads, metadata, datatype, brack_col
43+
)
44+
if datatype != "ce":
45+
locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype)
4246
return locus_allele_info
4347

4448

49+
def filter_amel(metadata, amel_df, locus_reads):
50+
for filter in ["Detection", "Analytical"]:
51+
use = metadata[f"{filter}ThresholdUse"]
52+
count = metadata[f"{filter}ThresholdStaticCount"]
53+
perc = metadata[f"{filter}ThresholdDynamicPercent"]
54+
thresh_perc = round(perc * locus_reads, 1)
55+
if (
56+
use.lower() == "dynamic"
57+
and locus_reads < metadata["MinimumNumberReadsForDynamicThresholds"]
58+
):
59+
use = "static"
60+
if use.lower() == "both":
61+
thresh = thresh_perc if thresh_perc >= count else count
62+
elif use.lower() == "static":
63+
thresh = count
64+
elif use.lower() == "dynamic":
65+
thresh = thresh_perc
66+
if filter == "Detection":
67+
amel_dt = amel_df[amel_df["Reads"] >= thresh].reset_index(drop=True)
68+
locus_reads = amel_df["Reads"].sum()
69+
else:
70+
for i in range(len(amel_dt)):
71+
al_reads = amel_dt.loc[i, "Reads"]
72+
if al_reads < thresh:
73+
amel_dt.loc[i, ["allele_type", "perc_noise"]] = [
74+
"BelowAT",
75+
round(al_reads / locus_reads, 3),
76+
]
77+
else:
78+
amel_dt.loc[i, "allele_type"] = "Typed"
79+
return amel_dt
80+
81+
4582
def single_allele_thresholds(metadata, locus_reads, single_all_df):
4683
if thresholds("Detection", metadata, locus_reads, single_all_df["Reads"][0])[1] is False:
4784
single_all_df = pd.DataFrame()

0 commit comments

Comments
 (0)