# Structure_MASST_App/structureMASST.py (Wang-Bioinformatics-Lab/Structure_MASST_App, main branch)
import os
import streamlit as st

# Write the page label
# Sets the browser-tab title and icon and switches the app to the wide layout.
# NOTE(review): this sits ahead of the remaining imports, presumably because
# Streamlit expects set_page_config() to run before other st.* commands —
# confirm against the Streamlit version in use before moving it.
st.set_page_config(
    page_title="StructureMASST",
    layout="wide",
    page_icon="🔎",
)
# Register logo.png as the app logo; the same file is passed as icon_image.
st.logo("logo.png", icon_image="logo.png")

from streamlit.components.v1 import html
import pandas as pd
import importlib.util
from rdkit import Chem
from PIL import Image
import base64
import io
from bin.workflow_stepwise import retrieve_raw_data_matches
from bin.run_masstRecords_queries import get_library_table, get_masst_and_redu_tables
from bin.match_smiles import detect_smiles_or_smarts, neutralize_atoms, tautomerize_smiles
from bin.pubchem_handling  import pubchem_autocomplete, name_to_cid, cid_to_canonical_smiles
from bin.plotting import raw_data_sankey, export_hits_map
from bin.linkouts import build_dashboard_eic_url, build_spectraresolver_link
from bin.smarts_api import query_smarts
from bin.streamlit_fragment_domainMASST import domainmasst_fragment, domainmasst_intersection_fragment
from bin.streamlit_fragment_LifeMASST import lifemasst_fragment
from bin.api_health import test_fasst_api_search_nonblocking
import matplotlib.pyplot as plt
import matplotlib
from collections import defaultdict
import plotly.graph_objects as go
import plotly.express as px
import plotly.colors as pc
from formula_validation.Formula import Formula
import requests
import re
import urllib.parse as _url
import hashlib
from pathlib import Path
from typing import Iterable, Mapping, Union, List, Dict
import subprocess
import uuid
import time
import numpy as np
from dotenv import load_dotenv
from streamlit_ketcher import st_ketcher
import gc


import tasks


# Tracking
# Configure the umami client (endpoint, website id, hostname). The website id
# is the same one used by the "Structure MASST" script tag embedded further
# down in this file.
import umami
umami.set_url_base("https://analytics-api.gnps2.org/")
umami.set_website_id('032bfca4-a353-4586-b637-8908d8b71c85')
umami.set_hostname('analytics-api.gnps2.org')

# Load .env file
# SMARTS_API_KEY falls back to an empty string when the variable is not set;
# it is passed to query_smarts() as api_key when a SMARTS pattern is rendered.
load_dotenv("keys.env")
SMARTS_API_KEY = os.getenv("SMARTS_API_KEY", "")

# Inject app-wide CSS through a <style> tag: widens the main container and
# scales button / input typography with the viewport. The markup is passed
# with unsafe_allow_html=True so that it is emitted as HTML.
st.markdown("""
<style>
/* Make the main content truly wide and add fluid side padding */
.block-container {
  max-width: 100%;
  padding-left: 2vw;
  padding-right: 2vw;
  padding-top: 1rem;
}

/* Responsive typography scale for buttons */
:root{
  /* font-size automatically clamps between 12px and 20px, grows with viewport */
  --btn-font-size: clamp(12px, 1.2vw, 20px);
  /* padding grows a bit with viewport */
  --btn-pad-y: clamp(6px, 0.7vw, 14px);
  --btn-pad-x: clamp(10px, 1.4vw, 24px);
  --btn-radius: 14px;
}

/* Style all Streamlit buttons (works for st.button and st.link_button) */
.stButton > button, .stDownloadButton > button {
  width: 100%;               /* fill the column — column defines relative width */
  font-size: var(--btn-font-size);
  padding: var(--btn-pad-y) var(--btn-pad-x);
  border-radius: var(--btn-radius);
  line-height: 1.2;
}

/* Optional: make selectboxes/inputs feel consistent */
.stTextInput input, .stSelectbox > div > div, .stNumberInput input {
  font-size: clamp(12px, 1.1vw, 18px);
}

/* Small-screen tweaks */
@media (max-width: 768px) {
  .block-container { padding-left: 3vw; padding-right: 3vw; }
  .stButton > button, .stDownloadButton > button { line-height: 1.25; }
}
</style>
""", unsafe_allow_html=True)

# Add a tracking token
# Three analytics <script> tags are embedded through zero-sized html()
# components (width=0, height=0), one per website id; the trailing comment on
# each line names the property it reports to.
html('<script async defer data-website-id="74bc9983-13c4-4da0-89ae-b78209c13aaf" src="https://analytics.gnps2.org/umami.js"></script>', width=0, height=0) # GNPS2 Global
html('<script defer src="https://analytics-api.gnps2.org/script.js" data-website-id="74665d88-3b9d-4812-b8fc-7f55ceb08f11"></script>',  width=0, height=0) # Streamlit Apps
html('<script defer src="https://analytics-api.gnps2.org/script.js" data-website-id="032bfca4-a353-4586-b637-8908d8b71c85"></script>',  width=0, height=0) # Structure MASST


# Inject CSS that enlarges the sidebar header and the logo image inside it,
# and sizes the small icon shown while the sidebar is collapsed.
# NOTE(review): the selectors rely on Streamlit's data-testid attributes,
# which may change between Streamlit releases — re-check after upgrades.
st.markdown("""
<style>
/* Make the sidebar header area taller so the logo has room */
section[data-testid="stSidebar"] [data-testid="stSidebarHeader"] {
  height: 140px;              /* adjust as needed */
  padding-top: 8px;
  padding-bottom: 8px;
}

/* Enlarge the logo image in the sidebar header */
section[data-testid="stSidebar"] [data-testid="stSidebarHeader"] img {
  height: 120px !important;   /* main control: set the target height */
  width: auto !important;     /* keep aspect ratio */
  display: block;
  margin: 0 auto;             /* center horizontally */
}

/* (Optional) control the tiny icon when the sidebar is collapsed */
[data-testid="stSidebarCollapsedControl"] img {
  height: 28px !important;
  width: auto !important;
}
</style>
""", unsafe_allow_html=True)


# Contributors box rendered in the sidebar as raw HTML.
# This will have to be added to every page, or imported from a common module
st.sidebar.markdown(
    """
    <span style="font-size:0.85em;">
    <strong>Contributors</strong><br>
    Yasin El Abiead (UCSD)<br>
    Wilhan Nunes (UCSD)<br>
    Mingxun Wang (UCR)<br>
    </span>
    """,
    unsafe_allow_html=True
)

# Probe for RDKit's drawing module once; the structure preview checks
# _RD_DRAW_AVAILABLE before it tries to render a molecule image.
try:
    from rdkit.Chem import Draw
except ImportError:
    _RD_DRAW_AVAILABLE = False
else:
    _RD_DRAW_AVAILABLE = True

# NOTE(review): the line below looks like the datasette command used to serve
# masst_records.sqlite — confirm and move to the README if so.
# datasette masst_records.sqlite --setting max_returned_rows 1000000 --setting sql_time_limit_ms 60000

# — load config —
# config.py is loaded by file path (relative to the current working directory)
# and executed as a module named "config". The search code later reads
# config.PATH_TO_SQLITE, config.MASSTRECORDS_ENDPOINT and
# config.MASSTRECORDS_TIMEOUT from it.
config_path = "config.py"
spec = importlib.util.spec_from_file_location("config", config_path)
config = importlib.util.module_from_spec(spec)
spec.loader.exec_module(config)


def get_session_hash() -> str:
    """Return this session's 12-character hex id, creating it on first use.

    The id is the first 12 hex digits of a SHA-256 digest over a random UUID,
    the current time and 16 random bytes. It is cached in
    ``st.session_state["_session_hash"]`` so later calls return the same value.
    """
    state = st.session_state
    if "_session_hash" in state:
        return state["_session_hash"]

    seed = f"{uuid.uuid4()}-{time.time()}-{os.urandom(16).hex()}"
    digest = hashlib.sha256(seed.encode()).hexdigest()
    state["_session_hash"] = digest[:12]
    return state["_session_hash"]

# create unique session id
sid = get_session_hash()
# The id is echoed on the page.
st.write("Session:", sid)
# Per-session output directory, relative to the current working directory;
# exist_ok=True keeps reruns of the script from failing on an existing folder.
output_folder = f"sessionoutput/{sid}"
os.makedirs(output_folder, exist_ok=True)

# NOTE(review): stored in session state, presumably so other pages/fragments
# can locate the folder — verify against the readers of this key.
st.session_state["_session_output_folder"] = output_folder


# — SMILES or CSV input —
# Five columns: three input widgets (name, SMILES/SMARTS, CSV) separated by
# two narrow columns that hold the "or" labels.
col_name, col_or1, col_smiles, col_or2, col_csv = st.columns([4, 1, 4, 1, 4])

# ---------- state init ----------
# Seed every session key used by the input widgets; setdefault leaves values
# from earlier reruns untouched.
for state_key, state_default in {
    "name_query": "",
    "last_fetched_query": None,
    "name_suggestions": [],
    "name_choice": None,
    "smiles_input": "",
    "name_warning": None,
    "structure_editor_open": False,
    "new_smiles": "",
}.items():
    st.session_state.setdefault(state_key, state_default)

def tautomerize_neutralize_smiles(smiles: str) -> str:
    """Return *smiles* after tautomerization followed by neutralization.

    Both steps are best effort. If tautomerization raises, the original
    string is passed on to neutralization; if neutralization raises, the
    result of the first step is returned. Failures are printed, not raised.
    """
    try:
        tautomer = tautomerize_smiles(smiles)
    except Exception as err:
        print(f"Tautomerization failed: {err}")
        tautomer = smiles

    try:
        return neutralize_atoms(tautomer)
    except Exception as err:
        print(f"Neutralization failed: {err}")
        return tautomer

def _resolve_name_to_smiles(selected_name: str):
    """Look up *selected_name* on PubChem and fill the SMILES input from it.

    The name is resolved to a CID and the CID to a canonical SMILES. Only a
    SMILES without a '.' separator is accepted. The outcome is written to
    ``st.session_state``: ``smiles_input`` receives the SMILES (or is cleared),
    and ``name_warning`` receives a message when the lookup is rejected.
    Any previous warning is cleared first, even when *selected_name* is empty.
    """
    state = st.session_state
    state["name_warning"] = None  # start from a clean slate on every call
    if not selected_name:
        return

    cid = name_to_cid(selected_name)
    smiles = None
    if cid:
        smiles = cid_to_canonical_smiles(cid)

    # Accept only an existing, single-component SMILES (no '.').
    is_single_molecule = bool(smiles) and "." not in smiles
    if not is_single_molecule:
        state["smiles_input"] = ""
        state["name_warning"] = (
            "PubChem entry does not represent a singular molecule "
            "(no SMILES available or multi-component SMILES containing '.')."
        )
        return

    state["smiles_input"] = smiles

# Left column: search PubChem by chemical name and offer the suggestions in a
# dropdown. Picking a suggestion resolves it to a SMILES, which is written
# into the "smiles_input" session key.
with col_name:
    # Plain text_input (same style as SMILES). Pressing Enter triggers a rerun.
    # Editing the name closes the structure editor and clears any SMILES that
    # came from a previous lookup or edit.
    name_query = st.text_input(
        "Type a chemical name to search PubChem",
        key="name_query",
        placeholder="e.g., diazepam, caffeine, etc.",
        on_change=lambda: st.session_state.update({'structure_editor_open': False, 'new_smiles': '', 'smiles_input': ''})
    )
    # If the query changed (e.g., after Enter), fetch suggestions once
    # ("last_fetched_query" remembers the query that was already looked up).
    if name_query and name_query != st.session_state["last_fetched_query"]:
        suggestions = pubchem_autocomplete(name_query) or []
        st.session_state["name_suggestions"] = suggestions
        st.session_state["last_fetched_query"] = name_query

        # Preselect top suggestion and immediately try to populate SMILES
        if suggestions:
            st.session_state["name_choice"] = suggestions[0]
            _resolve_name_to_smiles(suggestions[0])

    # Show dropdown only when we have suggestions
    suggestions = st.session_state["name_suggestions"]
    if suggestions:
        def _on_choice_change():
            """Resolve the newly selected suggestion and reset the editor state."""
            _resolve_name_to_smiles(st.session_state.get("name_choice"))
            st.session_state["structure_editor_open"] = False
            # mirror whatever resolve put into the text box; fall back to empty
            st.session_state["new_smiles"] = st.session_state.get("smiles_input", "")


        # The index falls back to the first entry when the stored choice is
        # not among the current suggestions.
        st.selectbox(
            "Suggestions",
            options=suggestions,
            key="name_choice",
            index=(
                suggestions.index(st.session_state["name_choice"])
                if st.session_state["name_choice"] in suggestions else 0
            ),
            on_change=_on_choice_change,
        )

    # Show warning (if any) right under the name controls
    if st.session_state["name_warning"]:
        st.warning(st.session_state["name_warning"])

# Narrow spacer column holding the first "or" label.
with col_or1:
    st.markdown("<div style='text-align:center; margin-top:2.5em;'>or</div>", unsafe_allow_html=True)

# Middle column: direct SMILES/SMARTS entry. The widget shares the
# "smiles_input" session key that the name lookup writes to.
with col_smiles:
    smiles_input = st.text_input(
        "SMILES/SMARTS",
        key="smiles_input",  # gets auto-populated only if single-component
        placeholder="Enter SMILES or SMARTS",
        help="Enter a valid SMILES or SMARTS string. For SMARTS string creation, you can use third-party tools like SMARTSPlus https://smarts.plus/create",
        #cleans session state on change and cleans name input and suggestions
        on_change=lambda: st.session_state.update({'structure_editor_open': False, 'new_smiles': '', 'name_query': '', 'name_suggestions': []}),
    )
    smiles_input = smiles_input.strip()
    # Use effective SMILES (from editor if available) for type detection
    # NOTE: smiles_type is reset to None and recomputed by the render logic
    # further down, so the value assigned here is overwritten before use.
    effective_smiles = st.session_state.get('new_smiles', '') or smiles_input
    smiles_type = detect_smiles_or_smarts(effective_smiles)


# Narrow spacer column holding the second "or" label.
with col_or2:
    st.markdown("<div style='text-align:center; margin-top:2.5em;'>or</div>", unsafe_allow_html=True)

# Right column: optional CSV upload for batch searches. The search step only
# reads uploaded_file when the SMILES box is empty.
with col_csv:
    #add spacer to align widgets
    st.markdown("<div style='height: 1.5em;'></div>", unsafe_allow_html=True)
    # if st.checkbox("Upload Batch file", key="batch_search"):
    with st.popover("Add batch file", icon=":material/file_upload:"):
        uploaded_file = st.file_uploader(
            """Drop CSV file for batch search (smiles and name columns). We recommend to stay below 100 molecules or reach out to us to run locally.""",
            type=["csv"]
        )

# — Display molecule if valid SMILES/SMARTS —
#@st.dialog("Visualize Structure")
def show_structure_dialog(mol):
    """Render a structure preview on the page.

    *mol* is either an ``io.BytesIO`` holding a ready-made image, which is
    shown as-is at a width of 500, or a molecule object that is drawn through
    ``mol_to_base64_img``. When that renderer is unavailable (falsy), an info
    message is shown instead.
    """
    # Pre-rendered image bytes: display directly.
    if isinstance(mol, io.BytesIO):
        st.image(mol, width=500)
        return

    # No renderer defined (the RDKit drawing import failed).
    if not mol_to_base64_img:
        st.info("RDKit not available, cannot render structure.")
        return

    st.markdown(mol_to_base64_img(mol), unsafe_allow_html=True)

try:
    from rdkit.Chem import Draw
except ImportError:
    # Callers test this name for truthiness before trying to render.
    mol_to_base64_img = None
else:
    def mol_to_base64_img(mol, size=(300, 300)):
        """Draw *mol* as a PNG and return it as an inline HTML ``<img>`` tag.

        The PNG is embedded as a base64 data URI. If drawing or encoding
        fails for any reason, a red error paragraph is returned instead of
        raising.
        """
        try:
            png_buffer = io.BytesIO()
            Draw.MolToImage(mol, size=size).save(png_buffer, format="PNG")
            encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
        except Exception:
            return "<p style='color:red;'>Failed to draw molecule image.</p>"
        return f"<img src='data:image/png;base64,{encoded}' style='margin-top:1em;'/>"


# --- render logic ---
# Shows a preview of the current query: SMILES are drawn locally (and can be
# edited in the Ketcher editor), SMARTS are rendered through query_smarts().
smiles_type = None  # Ensure smiles_type is always defined
default_search_index = 0 # default to exact match search


if smiles_input:
    smiles_input = smiles_input.strip()
    # Use effective SMILES (from editor if available) for type detection
    effective_smiles = st.session_state.get('new_smiles', '') or smiles_input
    smiles_type = detect_smiles_or_smarts(effective_smiles)

    # Single-row query table describing the current input.
    df_input = pd.DataFrame(
        {
            "smiles": [effective_smiles],
            "name": ["Input_query"],
            "type": [smiles_type]
        }
    )
    if smiles_type == "smiles":
        edit_button = st.button("Edit Structure", icon=":material/edit:")
        if edit_button:
            # Persist the open state so the editor survives the reruns that
            # its own widgets trigger.
            st.session_state['structure_editor_open'] = True

        if edit_button or st.session_state['structure_editor_open']:
            with st.expander("Structure Editor", expanded=True):
                st.info(f"You can edit the structure below and click Apply to update it.")

                # Add a close button for persistent editor
                if st.session_state['structure_editor_open']:
                    if st.button("Close Structure Editor", icon=":material/close:", help="Warning: You will :red-badge[lose your changes]"):
                        st.session_state['structure_editor_open'] = False
                        st.rerun()

                # Use existing SMILES if available, otherwise use input
                # NOTE(review): st_ketcher presumably returns the SMILES
                # currently applied in the editor — confirm against its docs.
                new_smiles = st_ketcher(effective_smiles)

        else:
            new_smiles = smiles_input

        # The search step reads 'new_smiles' from session state in preference
        # to the raw text input.
        st.session_state['new_smiles'] = new_smiles


        # Structure rendering with RDKit
        if _RD_DRAW_AVAILABLE and new_smiles:
            mol = Chem.MolFromSmiles(new_smiles)
            if mol:
                st.markdown(mol_to_base64_img(mol), unsafe_allow_html=True)
            else:
                st.warning("Could not parse SMILES for rendering.")

    elif smiles_type == "smarts":
        # A fresh job id is generated for every render request.
        job_id = str(uuid.uuid4())
        # For SMARTS, use the effective SMILES
        response = query_smarts(effective_smiles, api_key=SMARTS_API_KEY, job_id=job_id, file_format="png")
        # The image is read from response['result']['image'] as base64 text.
        if response and 'result' in response and 'image' in response['result']:
            image_data = base64.b64decode(response['result']['image'])
            bytes_io = io.BytesIO(image_data)
            #if st.button("View SMARTS", icon=":material/visibility:"):
            show_structure_dialog(bytes_io)
        else:
            # A missing preview does not block the search itself.
            st.warning("Failed to retrieve SMARTS image from the API. You can still go ahead.")
    else:
        st.warning("Not a valid SMILES/SMARTS")

# — mode selection UI —
default_search_index = 0  # the first listed option is preselected
col_a1, col_b2, _ = st.columns([2,1,2])

# A SMARTS query is only offered the substructure search; everything else
# gets the full set of search modes.
search_options = (
    ["Substructure match"]
    if smiles_type == 'smarts'
    else ["Exact structure match", "Substructure match", "Tanimoto similarity"]
)

with col_a1:
    searchtype_option = st.radio(
        "Find available MS/MS spectra",
        search_options,
        index=default_search_index,
        horizontal=True,
    )

# The threshold box (and therefore tanimoto_cutoff) exists only while the
# Tanimoto mode is selected.
if searchtype_option == "Tanimoto similarity":
    with col_b2:
        tanimoto_cutoff = st.text_input("Tanimoto threshold", value="0.8", key="tanimoto_threshold")

# Map UI option to backend value (unknown labels pass through unchanged)
searchtype_option = {
    "Exact structure match": "exact",
    "Substructure match": "substructure",
    "Tanimoto similarity": "tanimoto",
}.get(searchtype_option, searchtype_option)


# — run the search —
def _pick_compound_name(names: pd.Series) -> str:
    """Return the most frequent compound name in *names*, or "" if there is none.

    Ties between equally frequent names are broken by preferring a length
    close to 10 characters and then the fewest non-alphanumeric characters.
    """
    names = names.dropna().astype(str)
    if names.empty:
        return ""
    counts = names.value_counts()
    candidates = counts[counts == counts.iloc[0]].index.tolist()
    return min(
        candidates,
        key=lambda s: (abs(len(s) - 10), len(re.findall(r"[^A-Za-z0-9]", s))),
    )


if st.button("Get Available Spectra", icon=':material/search:'):
    # Tracking this action (best effort: a tracking failure must not block the search)
    try:
        umami.new_event(event_name="Get Available Spectra Clicked")
    except Exception as e:
        print(f"Error tracking event: {e}")

    with st.spinner("Finding spectra..."):
        # Reset upstream & downstream state
        for key in [
            "selected_queries",
            "df_library_conflicts",
            "grouped_results",
            "raw_results",
            "molecule_overview",
            "query_table"
        ]:
            st.session_state.pop(key, None)

        # initialize what's needed
        st.session_state.selected_queries = {}
        st.session_state.df_library_conflicts = None
        st.session_state.grouped_results = {}
        st.session_state.molecule_overview = {}
        st.session_state.query_table = {}

        # organize input structure queries
        smiles_list = []
        if smiles_input:
            # Use new_smiles from structure editor if available, otherwise use original input
            effective_smiles = st.session_state.get('new_smiles', '') or smiles_input

            smiles_type = detect_smiles_or_smarts(effective_smiles)
            if smiles_type == "smiles":
                effective_smiles = tautomerize_neutralize_smiles(effective_smiles)

            df_input = pd.DataFrame(
                {
                    "smiles": [effective_smiles],
                    "name": ["Input_query"],
                    "type": [smiles_type],
                    "searchtype": [searchtype_option],
                }
            )

            st.session_state.query_table = df_input

            smiles_list = [effective_smiles]
            name_list = ['Input_query']
            searchtype_list = [searchtype_option]
        elif uploaded_file is not None:
            df_input = pd.read_csv(uploaded_file)
            if "smiles" in df_input.columns and "name" in df_input.columns:

                # drop rows with na in either column
                df_input = df_input.dropna(subset=["smiles", "name"])
                if df_input.empty:
                    # Nothing to search; the row-wise apply below would fail
                    # on an empty frame.
                    st.warning("CSV contains no rows with both a 'smiles' and a 'name' value.")
                    st.stop()

                # add column with type detection
                df_input["type"] = df_input["smiles"].apply(detect_smiles_or_smarts)

                # harmonize smiles if type is smiles (tautomerize + neutralize)
                df_input["smiles"] = df_input.apply(
                    lambda row: tautomerize_neutralize_smiles(row["smiles"])
                    if row["type"] == "smiles" else row["smiles"],
                    axis=1)

                # for smiles set to exact match, for smarts set to substructure match.
                # The same default also fills blank cells of a user-supplied
                # "searchtype" column, so every row carries a search type.
                default_searchtype = df_input["type"].apply(
                    lambda x: "exact" if x == "smiles" else "substructure"
                )
                if "searchtype" not in df_input.columns:
                    df_input["searchtype"] = default_searchtype
                else:
                    df_input["searchtype"] = df_input["searchtype"].fillna(default_searchtype)

                print(df_input)
                st.session_state.query_table = df_input

                # Take the three columns row-aligned. Dropping NaNs per column
                # would shift the lists against each other in the zip() below.
                smiles_list = df_input["smiles"].tolist()
                name_list = df_input["name"].tolist()
                searchtype_list = df_input["searchtype"].tolist()
            else:
                st.warning("CSV must contain a 'smiles' and 'name' column.")
                st.stop()
        else:
            st.warning("Please enter a SMILES or upload a CSV.")
            st.stop()

        # The threshold widget is only rendered while "Tanimoto similarity" is
        # selected in the UI, but a batch file can request tanimoto searches
        # without it. Read the widget value through its session key and fall
        # back to the widget's default so such rows do not hit a NameError.
        tanimoto_cutoff = st.session_state.get("tanimoto_threshold", "0.8")

        # process each input structure query separately to retrieve spectra
        grouped_results = defaultdict(dict)
        molecule_overview = defaultdict(dict)
        for smi, name, searchtype in zip(smiles_list, name_list, searchtype_list):
            try:
                tanimoto_threshold = tanimoto_cutoff if searchtype == "tanimoto" else None
                df_library_structurematch = tasks.run_get_library_table(smi, searchtype, tanimoto_threshold, config.PATH_TO_SQLITE, config.MASSTRECORDS_ENDPOINT, config.MASSTRECORDS_TIMEOUT)

                if df_library_structurematch.empty:
                    continue

                # Setup for library conflicts. this is currently not doing anything
                df_library_conflicts = pd.DataFrame()
                df_library_conflicts["inchikey_first_block"] = pd.Series(dtype=str)
            except Exception as e:
                st.error(f"Error for {smi}: {e}")
                # Skip this query. Falling through would reuse the previous
                # query's table (or an undefined name) and file its hits
                # under the wrong query name.
                continue

            print(f"Processing {name} with {len(df_library_structurematch)} matches")
            overview = []
            # One entry per molecule, keyed by the first InChIKey block.
            for ik in df_library_structurematch["inchikey_first_block"].unique():
                sub_struct = df_library_structurematch[df_library_structurematch["inchikey_first_block"] == ik].copy()
                sub_conf   = df_library_conflicts[df_library_conflicts["inchikey_first_block"] == ik].copy()
                grouped_results[name][ik] = {"structure": sub_struct, "conflicts": sub_conf}
                st.session_state.selected_queries[ik] = list(sub_struct["query_spectrum_id"].unique())

                # pick most common Compound_Name (tie-break by len≈10 & fewest special chars)
                best_name = _pick_compound_name(sub_struct["Compound_Name"])

                # grab first SMILES
                smiles = sub_struct["Smiles"].dropna().astype(str)
                inchikey_first_block = sub_struct["inchikey_first_block"].dropna().astype(str)
                first_smi = smiles.iloc[0] if not smiles.empty else ""
                ikb = inchikey_first_block.iloc[0] if not inchikey_first_block.empty else ""

                overview.append({
                    "Compound_Name": best_name,
                    "inchikey_first_block": ikb,
                    "Smiles": first_smi
                })

            st.session_state.molecule_overview[name] = pd.DataFrame(overview)

        # if grouped_results is empty, inform user
        if not grouped_results:
            st.warning("No spectra available for this structure.")
            st.stop()

        # get results into session state
        st.session_state.grouped_results = grouped_results

        # Drop the last query's table before collecting garbage.
        # NOTE(review): this relies on the script running as module-level
        # code, where locals() is the module namespace; a missing name is
        # ignored via KeyError.
        for var in ["df_library_structurematch", "result"]:
            try:
                del locals()[var]
            except KeyError:
                pass
        gc.collect()

# render outer tabs for each structure query
if "grouped_results" in st.session_state and st.session_state["grouped_results"]:

    with st.expander("Available Library Entries", expanded=True):
        st.markdown("### Available Library Entries")
        name_tabs = st.tabs(list(st.session_state.grouped_results.keys()))

        for fig_id, (name, name_tab) in enumerate(zip(st.session_state.grouped_results.keys(), name_tabs)):
            with name_tab:

                tables = st.session_state.grouped_results[name]

                # number of molecules = number of 2d inchikey keys
                num_molecules = len(tables)

                # total number of matches across all "structure" tables
                total_matches = sum(
                    tbl["structure"].shape[0]
                    for tbl in tables.values()
                )

                # messages on retrieved spectra
                st.markdown(
                    f"##### Retrieved **{total_matches}** spectra for molecule ({name}).",
                )

                st.markdown(
                    "<div style='font-size:0.9em;'>"
                    "Below you see the public MS/MS spectra available for your search. The higher the spectral diversity, the less blind-spots you will have in public metabolomics raw data."
                    "</div>",
                    unsafe_allow_html=True
                )

                if total_matches == 0:
                    st.info("No spectra available for this structure.")
                    continue
                # Sankey per query structure
                ##########

                # concatenate all the 'structure' dfs under this name
                df_all = pd.concat(
                    [v["structure"] for v in st.session_state.grouped_results[name].values()],
                    ignore_index=True
                )

                # Check if all required columns exist before proceeding
                required_cols = ["msMassAnalyzer", "Ion_Mode", "Adduct", "collision_energy"]
                missing_cols = [col for col in required_cols if col not in df_all.columns]
                if not missing_cols:
                    # Fixing levels
                    stages = ["msMassAnalyzer", "Ion_Mode", "Adduct", "collision_energy"]

                    # make a copy to avoid SettingWithCopyWarning
                    df_sankey = df_all[stages].copy()

                    # Define default fill values with suffixes
                    fill_values = {
                        "msMassAnalyzer": "Unknown_1",
                        "Ion_Mode": "Unknown_2",
                        "Adduct": "Unknown_3",
                        "collision_energy": "Unknown_4",
                    }

                    # Convert collision_energy to string *before* filling to avoid dtype conflicts
                    if pd.api.types.is_numeric_dtype(df_sankey["collision_energy"]):
                        df_sankey["collision_energy"] = df_sankey["collision_energy"].astype(str)

                    # Fill NaNs with the custom Unknown labels
                    df_sankey = df_sankey.fillna(fill_values)

                    # Fill "nan" strings if any
                    for col, fill_val in fill_values.items():
                        df_sankey.loc[df_sankey[col] == "nan", col] = fill_val

                    # Create labels for Sankey diagram
                    labels = []
                    for col in stages:
                        labels += df_sankey[col].unique().tolist()
                    labels = list(dict.fromkeys(labels))

                    # get up to 5 colors (should not be more different instrument types than that)
                    ms_cats = df_sankey["msMassAnalyzer"].unique().tolist()
                    palette = px.colors.qualitative.Safe[:5]
                    color_map = {cat: palette[i] for i, cat in enumerate(ms_cats)}

                    # build links
                    source, target, value, link_colors = [], [], [], []
                    for i in range(len(stages) - 1):
                        for cat in ms_cats:
                            df_cat = df_sankey[df_sankey["msMassAnalyzer"] == cat]
                            grp = (
                                df_cat
                                .groupby([stages[i], stages[i + 1]])
                                .size()
                                .reset_index(name="count")
                            )
                            for _, row in grp.iterrows():
                                source.append(labels.index(row[stages[i]]))
                                target.append(labels.index(row[stages[i + 1]]))
                                value.append(row["count"])
                                link_colors.append(color_map[cat].replace("rgb", "rgba").replace(")", f", {0.3})"))


                    # all nodes light grey with black border
                    node_colors = ["#F2F2F2"] * len(labels)

                    fig = go.Figure(
                        go.Sankey(
                            arrangement="snap",
                            # ← trace‑level label styling
                            textfont=dict(family="Arial, sans-serif", size=12, color="black"),

                            node=dict(
                                label=labels,
                                color=node_colors,
                                pad=15,
                                thickness=20,
                                line=dict(color="black", width=0.5),
                            ),
                            link=dict(
                                source=source,
                                target=target,
                                value=value,
                                color=link_colors,
                            ),
                        )
                    )

                    # set the global font and margins (the larger top margin leaves room for the stage labels)
                    fig.update_layout(
                        font=dict(family="Arial, sans-serif", size=12),
                        margin=dict(l=60, r=60, t=120, b=20),
                    )

                    # add stage labels above the Sankey diagram
                    stage_labels = ["Mass Analyzer", "Ion Mode", "Adduct", "Collision Energy"]

                    # calculate x positions for each stage label
                    n = len(stage_labels) - 1
                    for i, label in enumerate(stage_labels):
                        x = i / n
                        # xanchor depends on position
                        if i == 0:
                            xanchor = "left"
                        elif i == n:
                            xanchor = "right"
                        else:
                            xanchor = "center"

                        # add annotation for each stage label
                        fig.add_annotation(
                            x=x,
                            y=1.02,
                            xref="paper",
                            yref="paper",
                            text=label,
                            showarrow=False,
                            font=dict(size=14, color="black"),
                            xanchor=xanchor
                        )
                    # render the figure; the toolbar's image-download button is configured to export SVG
                    config_sankey_download = {
                        "toImageButtonOptions": {"format": "svg", "filename": f"plot_{fig_id}"},
                        "displaylogo": False,
                    }
                    st.plotly_chart(fig, use_container_width=True, key=f"plot_{fig_id}", config=config_sankey_download)

                molecule_overview_df = st.session_state.molecule_overview[name]

                if _RD_DRAW_AVAILABLE:
                    @st.cache_data
                    def smiles_to_datauri(smi: str, size=(500, 500)) -> str:
                        """Render a SMILES string as a base64-encoded PNG data URI.

                        Args:
                            smi: SMILES string of the molecule to draw.
                            size: (width, height) of the rendered image in pixels.

                        Returns:
                            A ``data:image/png;base64,...`` URI, or an empty string
                            when ``smi`` is not a non-blank string or RDKit cannot
                            parse it.
                        """
                        # The caller maps this over a DataFrame column, where missing
                        # values may show up as NaN/None. RDKit's parser raises on
                        # non-string arguments instead of returning None, so bail out
                        # early rather than letting one bad row abort the whole table.
                        if not isinstance(smi, str) or not smi.strip():
                            return ""

                        mol = Chem.MolFromSmiles(smi)
                        # MolFromSmiles signals an unparsable SMILES by returning None.
                        if mol is None:
                            return ""

                        img = Draw.MolToImage(mol, size=size)
                        # Serialize the image to PNG in memory and embed it as a data URI.
                        buf = io.BytesIO()
                        img.save(buf, format="PNG")
                        b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
                        return f"data:image/png;base64,{b64}"

                    molecule_overview_df = molecule_overview_df.copy()
                    molecule_overview_df["structure"] = molecule_overview_df["Smiles"].apply(smiles_to_datauri)

                    # reorder columns to Compound_Name, Structure, others
                    cols = ["Compound_Name", "structure"] + [col for col in molecule_overview_df.columns if col not in ["Compound_Name", "structure"]]
                    molecule_overview_df = molecule_overview_df[cols]

                    # display the molecule table with the rendered structure images (rows are selectable)
                    table_mol = st.dataframe(
                        molecule_overview_df,
                        hide_index=True,
                        on_select="rerun",
                        selection_mode="multi-row",
                        width='stretch',
                        column_config={
                            "structure": st.column_config.ImageColumn(
                                "Structure",
                                width=300
                            )
                        },
                        key=f"{name}_molecule_table",
                        row_height=250
                    )
                else:
                    table_mol = st.dataframe(
                        molecule_overview_df,
                        hide_index=True,
                        on_select="rerun",
                        selection_mode="multi-row",
                        width='stretch',
                        key=f"{name}_molecule_table"
                    )

                # grab list of selected row indices
                selected = table_mol.selection.rows

                # show buttons with actions for selected molecules
                col1_mol_level, col2_mol_level, _, col_split_mols_into_queries = st.columns([2, 2, 4, 2])
                with col1_mol_level:
                    if st.button("Remove selected molecule(s)", key=f"{name}_mol_remove"):
                        if selected:
                            molecule_overview_df = molecule_overview_df.drop(molecule_overview_df.index[selected])
                            st.session_state.molecule_overview[name] = molecule_overview_df
                            unique_inchikeys = molecule_overview_df["inchikey_first_block"].unique().tolist()

                            st.session_state.grouped_results[name] = {
                                ik: data
                                for ik, data in st.session_state.grouped_results[name].items()
                                if ik in unique_inchikeys
                            }

                        else:
                            st.warning("No rows selected!")
                        st.rerun()

                with col2_mol_level:
                    if st.button("Keep only selected molecule(s)", key=f"{name}_mol_keep"):
                        if selected:
                            molecule_overview_df = molecule_overview_df.iloc[selected].reset_index(drop=True)
                            st.session_state.molecule_overview[name] = molecule_overview_df
                            unique_inchikeys = molecule_overview_df["inchikey_first_block"].unique().tolist()
                            # also remove from grouped_results from same name
                            st.session_state.grouped_results[name] = {
                                ik: data
                                for ik, data in st.session_state.grouped_results[name].items()
                                if ik in unique_inchikeys
                            }
                        else:
                            st.warning("No rows selected!")
                        st.rerun()
                #if more than 1 molecule, allow splitting
                if num_molecules > 1:
                    with col_split_mols_into_queries:
                        if st.button("Split each molecule into separate query", key=f"{name}_mol_split"):
                            # If nothing selected, split ALL rows
                            df_to_split = (
                                molecule_overview_df.iloc[selected].reset_index(drop=True)
                                if selected
                                else molecule_overview_df.reset_index(drop=True)
                            )

                            if df_to_split.empty:
                                st.warning("No rows to split!")
                            else:
                                src_results = st.session_state.grouped_results.get(name, {})
                                for idx, row in df_to_split.iterrows():
                                    new_name = f"{row.get('Compound_Name')}_{idx+1}"
                                    st.session_state.molecule_overview[new_name] = pd.DataFrame([row])

                                    ik = row.get("inchikey_first_block")
                                    if isinstance(src_results, dict) and ik in src_results:
                                        st.session_state.grouped_results[new_name] = {ik: src_results[ik]}
                                    else:
                                        st.session_state.grouped_results[new_name] = {
                                            "structure": pd.DataFrame(),
                                            "conflicts": pd.DataFrame(),
                                        }

                                # remove the original group
                                st.session_state.molecule_overview.pop(name, None)
                                st.session_state.grouped_results.pop(name, None)

                            st.rerun()


                # update the session state with the filtered dataframe
                st.session_state.molecule_overview[name] = molecule_overview_df

                # display each query structure's available spectra as a table
                with st.expander("Molecules by InChIKey", expanded=True):
                    # grab the keys
                    keys = list(st.session_state.grouped_results[name].keys())

                    max_tabs = 100
                    if len(keys) > max_tabs:
                        st.warning(f"⚠️ Too many InChIKeys ({len(keys)}). Showing only the first {max_tabs}.")
                        keys = keys[:max_tabs]

                    # create a tab for each 2D InChIKey (max 100)
                    ik_tabs = st.tabs(keys)
                    for ik, ik_tab in zip(keys, ik_tabs):
                        with ik_tab:

                            data = st.session_state.grouped_results[name][ik]
                            df0 = data["structure"].copy()


                            df0["spectrum_link"] = df0["query_spectrum_id"].apply(
                                lambda x: (
                                    f"http://metabolomics-usi.gnps2.org/dashinterface?usi1=mzspec%3AGNPS%3AGNPS-LIBRARY%3Aaccession%3A{x}&width=10.0&height=6.0&mz_min=None&mz_max=None&max_intensity=125&annotate_precision=4&annotation_rotation=90&cosine=standard&fragment_mz_tolerance=0.02&grid=True&annotate_peaks=%5B%5B%5D%2C%20%5B%5D%5D" if x.startswith("CCMSLIB")
                                    else f"http://metabolomics-usi.gnps2.org/dashinterface?usi1=mzspec%3AMASSBANK%3A%3Aaccession%3A{x}&width=10.0&height=6.0&mz_min=None&mz_max=None&max_intensity=125&annotate_precision=4&annotation_rotation=90&cosine=standard&fragment_mz_tolerance=0.02&grid=True&annotate_peaks=%5B%5B%5D%2C%20%5B%5D%5D"
                                )
                            )

                            # Make the spectrum column the first column
                            col_order = ['spectrum_link', 'Compound_Name', 'Precursor_MZ']
                            df0 = df0[col_order + [col for col in df0.columns if col not in col_order]]

                            # display it with clickable links
                            table_evt = st.dataframe(
                                df0,
                                column_config={
                                    "spectrum_link": st.column_config.LinkColumn(
                                        label="USI Viewer",
                                        display_text="Open Spectrum"
                                    )
                                },
                                hide_index=True,
                                on_select="rerun",
                                selection_mode="multi-row",
                                width='stretch',
                                key=f"{name}_{ik}_table"
                            )

                            # grab list of selected row indices
                            selected = table_evt.selection.rows

                            # show buttons with actions for selected spectra
                            col1, col2, col3, _ = st.columns([2, 2, 2, 4])
                            with col1:
                                if st.button("Remove selected spectra", key=f"{name}_{ik}_remove"):
                                    if selected:
                                        df_filtered = df0.drop(df0.index[selected])
                                        st.session_state.grouped_results[name][ik]["structure"] = df_filtered
                                    else:
                                        st.warning("No rows selected!")
                                    st.rerun()

                            with col2:
                                if st.button("Keep only selected spectra", key=f"{name}_{ik}_keep"):
                                    if selected:
                                        df_filtered = df0.iloc[selected].reset_index(drop=True)
                                        st.session_state.grouped_results[name][ik]["structure"] = df_filtered
                                    else:
                                        st.warning("No rows selected!")
                                    st.rerun()

                            # update the session state with the filtered dataframe
                            st.session_state.grouped_results[name][ik]["structure"] = df0

    # selection menu for raw data search
    @st.fragment
    def raw_data_search_panel():

        # Message that we can now search all of the above xx spectra across raw data to retrieve matching samples. Include exact number of spectra
        spectra_across_all_molecules = sum(
            len(ik_dict[ik]['structure'])
            for ik_dict in st.session_state.grouped_results.values()
            for ik in ik_dict
        )

        number_of_molecules = len(st.session_state.grouped_results)

        st.markdown(f"##### Next, search all {spectra_across_all_molecules} spectra of {number_of_molecules} molecule(s) across public metabolomics raw data to retrieve matching samples:")

        col_a, col_b = st.columns(2)
        with col_a:
            option = st.radio(
                "Mode",
                ["FASSTrecords", "FASST"],
                horizontal=True,
                key="mode"
            )
        with col_b:
            st.empty()

        last_iteration = "09/2025"  # need to get version into sql table
        if option == "FASSTrecords":
            min_peaks_allowed = 3
            min_cos_allowed = 0.7
        elif option == "FASST":
            min_peaks_allowed = 1
            min_cos_allowed = 0.3

        info_row = st.empty()  # single, stable placeholder

        def render_info_panels(last_iteration: str):
            with info_row.container():  # render both columns atomically
                col1, col2 = st.columns(2)

                with col1:
                    st.markdown(f"""
                    <div style="
                        border-left: 4px solid #2c7be5;
                        padding: 1em;
                        margin: 0.5em 0;
                        background-color: #f0f8ff;
                        border-radius: 4px;
                    ">
                    <h4 style="margin:0 0 0.5em;">
                        <strong>FASSTrecords</strong>
                    </h4>
                    <p style="margin:0; line-height:1.5; font-size:0.95em;">
                        This generally takes less than 1 minute even for hundreds of spectra as it relies on<br/>
                        precomputed annotations, and is therefore especially recommended for large numbers of<br/>