bioforensics
diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py
Lines changed: 47 additions & 9 deletions
diff --git a/lusSTR/data/filters.json b/lusSTR/data/filters.json
Lines changed: 18 additions & 0 deletions
diff --git a/lusSTR/data/str_lists.json b/lusSTR/data/str_lists.json
Lines changed: 101 additions & 0 deletions
diff --git a/lusSTR/data/str_markers.json b/lusSTR/data/str_markers.json
Lines changed: 19 additions & 0 deletions
diff --git a/lusSTR/scripts/filter_settings.py b/lusSTR/scripts/filter_settings.py
Lines changed: 47 additions & 10 deletions
@@ -22,6 +22,7 @@
 import pandas as pd
 from pathlib import Path
 import plotly.express as px
+import plotly.graph_objs as go
 import streamlit as st
 from streamlit_option_menu import option_menu
 import yaml
@@ -43,6 +44,14 @@ def get_filter_metadata_file():
     filter_marker_data = json.load(fh)
 
 
+def get_strlist_file():
+    """Return the location of the ``data/str_lists.json`` resource of the lusSTR package."""
+    package_root = importlib.resources.files("lusSTR")
+    return package_root / "data/str_lists.json"
+
+
+# Read the marker name lists (keyed by kit, e.g. "forenseq_strs") once, at import time
+with open(get_strlist_file(), "r") as fh:
+    str_lists = json.load(fh)
+
+
 # ------------ Function to Generate config.yaml File ---------- #
 
 
@@ -197,14 +206,33 @@ def interactive_plots_allmarkers(sample_df, flagged_df):
     max_yvalue = (int(math.ceil(max_reads / n)) * n) + n
     increase_value = int(math.ceil((max_yvalue / 5) / n)) * n
     n = 0
-    for marker in sample_df["Locus"].unique():
+    all_loci = (
+        str_lists["forenseq_strs"]
+        if st.session_state.kit == "forenseq"
+        else str_lists["powerseq_strs"]
+    )
+    missing_loci = [x for x in all_loci if x not in sample_df["Locus"].unique()]
+    # Encode the AMELOGENIN alleles numerically (X -> 0, anything else -> 1) so the whole
+    # CE_Allele column can be made numeric. The result is written back to the column
+    # (not bound to sample_df itself) and computed once, before the loop: re-running it on
+    # an already numeric column would turn every AMELOGENIN allele into 1.
+    sample_df["CE_Allele"] = np.where(
+        sample_df["Locus"] == "AMELOGENIN",
+        np.where(sample_df["CE_Allele"] == "X", 0, 1),
+        sample_df["CE_Allele"],
+    )
+    sample_df["CE_Allele"] = pd.to_numeric(sample_df["CE_Allele"])
+    for marker in all_loci:
         col = cols[n]
         container = col.container(border=True)
         sample_locus = sample_df["SampleID"].unique() + "_" + marker
-        marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele")
+        marker_df = sample_df[sample_df["Locus"] == marker].sort_values(
+            by=["CE_Allele", "allele_type"], ascending=[False, True]
+        )
         if sample_locus in flagged_df["key"].values:
             marker = f"⚠️{marker}⚠️"
-        plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True)
+        if marker in missing_loci:
+            marker = f"⚠️{marker}⚠️"
+            plot = go.Figure()
+            plot.update_layout(title=marker)
+        else:
+            plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True)
         container.plotly_chart(plot, use_container_width=True)
         if n == 3:
             n = 0
@@ -240,9 +268,14 @@ def interactive_plots(df, locus, ymax, increase, all=False):
     )
     plot.add_hline(y=at, line_width=3, line_dash="dot", line_color="gray")
     plot.add_annotation(text=f"AT", x=min_x + 0.1, y=at, showarrow=False, yshift=10)
-    plot.update_layout(
-        xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1))
-    )
+    if locus == "AMELOGENIN":
+        # Four labels need four tick positions: X and Y sit at 0 and 1, and the blank
+        # labels at -1 and 2 presumably just pad both ends of the axis
+        plot.update_layout(
+            xaxis=dict(tickvals=np.arange(-1, 3, 1), tickmode="array", ticktext=["", "X", "Y", ""])
+        )
+    else:
+        plot.update_layout(
+            xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1))
+        )
     if all:
         plot.update_layout(
             yaxis=dict(range=[0, ymax], tickmode="array", tickvals=np.arange(0, ymax, increase))
@@ -307,11 +340,16 @@ def interactive_setup(df1, file):
             )
         interactive_plots_allmarkers(sample_df, flags)
     else:
+        plot_df = sample_df
+        # Encode the AMELOGENIN alleles numerically (X -> 0, anything else -> 1) before the
+        # numeric conversion below; the result has to be stored in the CE_Allele column,
+        # not bound to sample_df, or pd.to_numeric would still see the "X"/"Y" labels
+        plot_df["CE_Allele"] = np.where(
+            plot_df["Locus"] == "AMELOGENIN",
+            np.where(plot_df["CE_Allele"] == "X", 0, 1),
+            plot_df["CE_Allele"],
+        )
+        plot_df["CE_Allele"] = pd.to_numeric(plot_df["CE_Allele"])
         locus_key = f"{sample}_{locus}"
         if locus_key not in st.session_state:
-            st.session_state[locus_key] = sample_df[sample_df["Locus"] == locus].reset_index(
-                drop=True
-            )
+            st.session_state[locus_key] = plot_df[plot_df["Locus"] == locus].reset_index(drop=True)
         Type = [
             "Deleted",
             "Typed",
 
@@ -1,4 +1,22 @@
 {
+    "AMELOGENIN": {
+        "MinimumNumberReadsForDynamicThresholds": 650,
+        "DetectionThresholdStaticCount": 10,
+        "DetectionThresholdDynamicPercent": 0,
+        "DetectionThresholdUse": "Static",
+        "AnalyticalThresholdStaticCount": 20,
+        "AnalyticalThresholdDynamicPercent": 0.017,
+        "AnalyticalThresholdUse": "Both",
+        "StochasticThresholdStaticCount": 20,
+        "StochasticThresholdDynamicPercent": 0.017,
+        "StochasticThresholdUse": "Both",
+        "MinimumHeterozygousBalanceThresholdDynamicPercent": 0.50,
+        "SameSizeThresholdDynamicPercent": 0,
+        "StutterThresholdDynamicPercent": 0,
+        "StutterForwardThresholdDynamicPercent": 0,
+        "Intercept": 0,
+        "Slope": 0
+    },
     "CSF1PO": {
         "MinimumNumberReadsForDynamicThresholds": 650,
         "DetectionThresholdStaticCount": 10,
 
@@ -0,0 +1,101 @@
+{
+
+    "powerseq_strs" : [
+        "AMELOGENIN",
+        "CSF1PO",
+        "D10S1248",
+        "D12S391",
+        "D13S317",
+        "D16S539",
+        "D18S51",
+        "D19S433",
+        "D1S1656",
+        "D21S11",
+        "D22S1045",
+        "D2S1338",
+        "D2S441",
+        "D3S1358",
+        "D5S818",
+        "D7S820",
+        "D8S1179",
+        "FGA",
+        "PENTA D",
+        "PENTA E",
+        "TH01",
+        "TPOX",
+        "VWA"
+    ],
+    "forenseq_strs" : [
+        "AMELOGENIN",
+        "CSF1PO",
+        "D10S1248",
+        "D12S391",
+        "D13S317",
+        "D16S539",
+        "D17S1301",
+        "D18S51",
+        "D19S433",
+        "D1S1656",
+        "D20S482",
+        "D21S11",
+        "D22S1045",
+        "D2S1338",
+        "D2S441",
+        "D3S1358",
+        "D4S2408",
+        "D5S818",
+        "D6S1043",
+        "D7S820",
+        "D8S1179",
+        "D9S1122",
+        "FGA",
+        "PENTA D",
+        "PENTA E",
+        "TH01",
+        "TPOX",
+        "VWA"
+    ],
+    "powerseq_ystrs" : [
+        "DYS19",
+        "DYS385A-B",
+        "DYS389II",
+        "DYS390",
+        "DYS391",
+        "DYS392",
+        "DYS393",
+        "DYS437",
+        "DYS438",
+        "DYS439",
+        "DYS448",
+        "DYS456",
+        "DYS458",
+        "DYS481",
+        "DYS533",
+        "DYS549",
+        "DYS570",
+        "DYS576",
+        "DYS635",
+        "DYS643",
+        "Y-GATA-H4"
+    ],
+    "forenseq_ystrs" : [
+        "DYS19",
+        "DYS385A-B",
+        "DYS389II",
+        "DYS390",
+        "DYS391",
+        "DYS392",
+        "DYS437",
+        "DYS438",
+        "DYS439",
+        "DYS448",
+        "DYS481",
+        "DYS533",
+        "DYS549",
+        "DYS570",
+        "DYS576",
+        "DYS635",
+        "DYS643",
+        "Y-GATA-H4"
+    ]
+}
@@ -1,4 +1,23 @@
 {
+    "AMELOGENIN": {
+        "BasesToSubtract": 0,
+        "NumRepeats": 1,
+        "Repeats": [
+            "AAAGTG"
+        ],
+        "NumBasesToSeparate": 0,
+        "ReverseCompNeeded": "No",
+        "LUS": "",
+        "Sec": "",
+        "Tert": "",
+        "Foren_5": 26,
+        "Foren_3": 37,
+        "Power_5": 10,
+        "Power_3": 37,
+        "Custom_5": 0,
+        "Custom_3": 0,
+        "Alleles": ["X", "Y"]
+    },
     "CSF1PO": {
         "BasesToSubtract": 0,
         "NumRepeats": 1,
 
@@ -28,20 +28,57 @@ def get_filter_metadata_file():
 
 def filters(locus_allele_info, locus, locus_reads, datatype, brack_col):
+    """Run the filter steps for one locus and return the updated allele table.
+
+    The settings are looked up in ``filter_marker_data`` by ``locus``. AMELOGENIN is
+    handled by ``filter_amel``. For every other locus ``CE_Allele`` is first cast to
+    float; a one-row table then goes through ``single_allele_thresholds``, while a longer
+    one goes through ``multiple_allele_thresholds`` and ``ce_filtering``, followed by
+    ``same_size_filter`` unless ``datatype`` is ``"ce"``.
+    """
     metadata = filter_marker_data[locus]
-    if len(locus_allele_info) == 1:
-        locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info)
+    if locus == "AMELOGENIN":
+        locus_allele_info = filter_amel(metadata, locus_allele_info, locus_reads)
     else:
-        locus_allele_info, locus_reads = multiple_allele_thresholds(
-            metadata, locus_reads, locus_allele_info
-        )
-        locus_allele_info = ce_filtering(
-            locus_allele_info, locus_reads, metadata, datatype, brack_col
-        )
-        if datatype != "ce":
-            locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype)
+        locus_allele_info["CE_Allele"] = locus_allele_info["CE_Allele"].astype(float)
+        if len(locus_allele_info) == 1:
+            locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info)
+        else:
+            locus_allele_info, locus_reads = multiple_allele_thresholds(
+                metadata, locus_reads, locus_allele_info
+            )
+            locus_allele_info = ce_filtering(
+                locus_allele_info, locus_reads, metadata, datatype, brack_col
+            )
+            if datatype != "ce":
+                locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype)
     return locus_allele_info
 
 
+def filter_amel(metadata, amel_df, locus_reads):
+    """Apply the detection and analytical thresholds to the AMELOGENIN alleles.
+
+    Args:
+        metadata: filter settings for the locus. The ``ThresholdUse``,
+            ``ThresholdStaticCount`` and ``ThresholdDynamicPercent`` entries of both
+            ``Detection`` and ``Analytical`` are read, as is
+            ``MinimumNumberReadsForDynamicThresholds``.
+        amel_df: DataFrame of the alleles at the locus; needs a ``Reads`` column.
+        locus_reads: total reads at the locus, the base of the dynamic (percentage)
+            detection threshold.
+
+    Returns:
+        A re-indexed DataFrame holding only the alleles that meet the detection
+        threshold. ``allele_type`` is ``"Typed"`` for alleles meeting the analytical
+        threshold and ``"BelowAT"`` otherwise; ``perc_noise`` is set for the latter.
+
+    Raises:
+        ValueError: if a ``ThresholdUse`` setting is not ``static``, ``dynamic`` or
+            ``both`` (compared case-insensitively).
+    """
+    for threshold_name in ["Detection", "Analytical"]:
+        setting = metadata[f"{threshold_name}ThresholdUse"]
+        use = setting.lower()
+        count = metadata[f"{threshold_name}ThresholdStaticCount"]
+        perc = metadata[f"{threshold_name}ThresholdDynamicPercent"]
+        thresh_perc = round(perc * locus_reads, 1)
+        # Below the minimum read depth a dynamic-only setting uses the static count instead
+        if use == "dynamic" and locus_reads < metadata["MinimumNumberReadsForDynamicThresholds"]:
+            use = "static"
+        if use == "both":
+            thresh = max(thresh_perc, count)
+        elif use == "static":
+            thresh = count
+        elif use == "dynamic":
+            thresh = thresh_perc
+        else:
+            raise ValueError(f"unsupported {threshold_name}ThresholdUse setting: {setting!r}")
+        if threshold_name == "Detection":
+            amel_dt = amel_df[amel_df["Reads"] >= thresh].reset_index(drop=True)
+            # The analytical threshold and the noise proportions are based only on the
+            # reads that passed detection, hence the sum over the filtered table
+            locus_reads = amel_dt["Reads"].sum()
+        else:
+            for i in range(len(amel_dt)):
+                al_reads = amel_dt.loc[i, "Reads"]
+                if al_reads < thresh:
+                    amel_dt.loc[i, ["allele_type", "perc_noise"]] = [
+                        "BelowAT",
+                        round(al_reads / locus_reads, 3),
+                    ]
+                else:
+                    amel_dt.loc[i, "allele_type"] = "Typed"
+    return amel_dt
+
+
 def single_allele_thresholds(metadata, locus_reads, single_all_df):
     if thresholds("Detection", metadata, locus_reads, single_all_df["Reads"][0])[1] is False:
         single_all_df = pd.DataFrame()