# streamlit_app.py - GlycoGauntlet submission portal (BojarLab/GlycoGauntlet, main branch).

import streamlit as st
import pandas as pd
import requests
import os
from io import StringIO
import sys
import base64
from datetime import datetime
sys.path.append('validation')
from check_format import validate_submission, parse_gwp

# Deployment configuration taken from the environment.
# GITHUB_TOKEN has no fallback and is None when unset; the repository
# coordinates fall back to BojarLab/GlycoGauntlet.
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
REPO_OWNER = os.getenv('REPO_OWNER', 'BojarLab')
REPO_NAME = os.getenv('REPO_NAME', 'GlycoGauntlet')

# Page chrome: browser-tab metadata, the main heading, and a one-line pitch.
st.set_page_config(
  page_title="GlycoGauntlet Submission",
  page_icon="🍬",
  layout="wide",
)
st.title("🍬 GlycoGauntlet Submission Portal")
st.markdown("Submit your glycan structure predictions without needing a GitHub account!")

# Collapsible reminder of the required CSV columns and the file-naming rule.
format_help = st.expander("ℹ️ Submission Format Requirements")
format_help.markdown("""
  Your CSV files must contain these columns:
  - **m/z**: mass-to-charge ratio (float)
  - **RT**: retention time in minutes (float)
  - **charge**: signed charge, e.g., -1 for negative mode (integer)
  - **top1_pred**: predicted glycan structure in IUPAC-condensed notation (string)

  File names must match the test files exactly, replacing `_solution.csv` with `_submission.csv`.
  """)

# Free-text submitter label (the widget label says it is used for the leaderboard).
username = st.text_input(
  "Your name or model name (for leaderboard)",
  placeholder="JohnDoe_ManualAnnotation",
)

# Which evaluation split(s) this upload targets.
public_option = "Public Test (immediate evaluation)"
private_option = "Private Test (final evaluation only)"
both_option = "Both"
test_type = st.radio(
  "Which test set are you submitting?",
  [public_option, private_option, both_option],
  horizontal=True,
)

# An uploader is rendered only for a selected split; an unselected split stays None.
public_files = None
if test_type in (public_option, both_option):
  st.subheader("📊 Public Test Predictions")
  public_files = st.file_uploader(
    "Upload your public test CSV or GlycoWorkbench files",
    type=['csv', 'gwp'],
    accept_multiple_files=True,
    key="public",
  )

private_files = None
if test_type in (private_option, both_option):
  st.subheader("🔒 Private Test Predictions")
  private_files = st.file_uploader(
    "Upload your private test CSV or GlycoWorkbench files",
    type=['csv', 'gwp'],
    accept_multiple_files=True,
    key="private",
  )

# Explicit confirmation that gates the submit button.
agree = st.checkbox("I confirm my files follow the required format")

# --- Submission handling ------------------------------------------------------
# On click: validate every uploaded file (public and private), convert
# GlycoWorkbench uploads to CSV, commit the files to a fresh branch through the
# GitHub REST API, and open a tracking issue that lists the raw file URLs.

# Columns every prediction table must provide.
_REQUIRED_COLUMNS = ['m/z', 'RT', 'charge', 'top1_pred']
# Seconds to wait on any single GitHub API call, so a stalled connection
# cannot leave the spinner hanging indefinitely.
_GITHUB_TIMEOUT = 30


def _branch_slug(name):
  """Reduce a free-text name to characters that are safe inside a git ref.

  Runs of characters outside ``A-Za-z0-9._-`` become a single ``-`` and
  repeated dots are collapsed, because git rejects spaces, ``..``, ``~``,
  ``^``, ``:`` and similar characters in branch names.
  """
  import re  # local import: only this helper needs it
  slug = re.sub(r'[^A-Za-z0-9._-]+', '-', name)
  slug = re.sub(r'\.{2,}', '.', slug)
  return slug.strip('-.') or 'user'


def _url_path(path):
  """Percent-encode a repository path for use in a URL, keeping '/' separators."""
  from urllib.parse import quote  # local import: only this helper needs it
  return quote(path)


def _upload_name(original_name):
  """Return the file name to commit for an uploaded file.

  GlycoWorkbench uploads are committed as CSV, so a trailing ``.gwp`` is
  swapped for ``_submission.csv``; any other name is returned unchanged.
  """
  if original_name.endswith('.gwp'):
    return original_name[:-len('.gwp')] + '_submission.csv'
  return original_name


def _format_problems(label, df):
  """Return human-readable format problems for one table (empty list if none).

  ``label`` is the name shown to the user; ``df`` is the parsed table.
  """
  missing_cols = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
  if missing_cols:
    # The remaining checks index these columns; stop here instead of raising KeyError.
    return [f"{label}: Missing columns {missing_cols}"]
  if len(df) == 0:
    # A table without rows would also trip every dtype check below; report it once.
    return [f"{label}: File is empty"]
  problems = []
  if not pd.api.types.is_numeric_dtype(df['m/z']):
    problems.append(f"{label}: m/z must be numeric")
  if not pd.api.types.is_numeric_dtype(df['RT']):
    problems.append(f"{label}: RT must be numeric")
  if not pd.api.types.is_integer_dtype(df['charge']):
    problems.append(f"{label}: charge must be integer")
  if df['top1_pred'].isna().all():
    problems.append(f"{label}: top1_pred column is empty")
  return problems


def _prepare_uploads(files):
  """Validate uploaded files and build the payloads to commit.

  Returns ``(uploads, problems)``: ``uploads`` is a list of
  ``(file name to commit, raw bytes)`` pairs and ``problems`` a list of
  validation messages. ``files`` may be None or empty.
  """
  uploads, problems = [], []
  for file in files or []:
    if file.name.endswith('.gwp'):
      df = parse_gwp(file)
      # NOTE(review): index=True is kept from the previous implementation;
      # confirm the evaluation side expects the extra index column.
      payload = df.to_csv(index=True).encode()
    else:
      try:
        df = pd.read_csv(file)
      except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        problems.append(f"{file.name}: Could not read CSV ({exc})")
        continue
      # CSV uploads are committed byte-for-byte; rewind after pandas consumed the stream.
      file.seek(0)
      payload = file.read()
    problems.extend(_format_problems(file.name, df))
    uploads.append((_upload_name(file.name), payload))
  return uploads, problems


def _url_section(urls):
  """Render URLs as a markdown bullet list, or ``None`` when there are none."""
  return ''.join(f"- {url}\n" for url in urls) or "None\n"


if st.button("Submit Predictions", disabled=not agree or not username or (not public_files and not private_files)):
  if not GITHUB_TOKEN:
    st.error("GitHub token not configured. Please contact the competition organizers.")
    st.stop()
  with st.spinner("Validating and submitting your predictions..."):
    # NOTE(review): the st.stop() calls below rely on Streamlit's stop signal not
    # being swallowed by `except Exception` - confirm for the Streamlit version in use.
    try:
      # Validate and convert everything up front so nothing is pushed when any file is bad.
      uploads = {}
      validation_errors = []
      for test_type_key, files in [('public', public_files), ('private', private_files)]:
        uploads[test_type_key], problems = _prepare_uploads(files)
        validation_errors.extend(problems)
      if validation_errors:
        st.error("Validation failed:")
        for error in validation_errors:
          st.write(f"❌ {error}")
        st.stop()

      api_root = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}'
      headers = {'Authorization': f'token {GITHUB_TOKEN}', 'Accept': 'application/vnd.github.v3+json'}
      branch_name = f"submission-{_branch_slug(username)}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

      # Branch off the current tip of main.
      main_response = requests.get(f'{api_root}/git/refs/heads/main', headers=headers, timeout=_GITHUB_TIMEOUT)
      if main_response.status_code != 200:
        st.error(f"Failed to get main branch: {main_response.text}")
        st.stop()
      main_sha = main_response.json()['object']['sha']
      ref_data = {'ref': f'refs/heads/{branch_name}', 'sha': main_sha}
      ref_response = requests.post(f'{api_root}/git/refs', json=ref_data, headers=headers, timeout=_GITHUB_TIMEOUT)
      if ref_response.status_code != 201:
        st.error(f"Failed to create branch: {ref_response.text}")
        st.stop()

      # Commit each file to the new branch and remember where it can be fetched from.
      file_urls = {'public': [], 'private': []}
      for test_type_key, payloads in uploads.items():
        for upload_name, payload in payloads:
          file_path = f"submissions/{username}/{test_type_key}/{upload_name}"
          url_path = _url_path(file_path)
          contents_url = f'{api_root}/contents/{url_path}'
          file_data = {
            'message': f'Add {upload_name}',
            'content': base64.b64encode(payload).decode('utf-8'),
            'branch': branch_name,
          }
          # Updating an existing path requires its blob sha.
          existing_response = requests.get(contents_url, params={'ref': branch_name}, headers=headers, timeout=_GITHUB_TIMEOUT)
          if existing_response.status_code == 200:
            file_data['sha'] = existing_response.json()['sha']
          file_response = requests.put(contents_url, json=file_data, headers=headers, timeout=_GITHUB_TIMEOUT)
          if file_response.status_code not in [201, 200]:
            st.error(f"Failed to upload {upload_name}: {file_response.text}")
            st.stop()
          raw_url = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{branch_name}/{url_path}"
          file_urls[test_type_key].append(raw_url)

      # Open the tracking issue that lists the uploaded files.
      issue_body = (
        f"### GitHub Username or Model Name\n{username}\n\n"
        f"### Public Test Predictions\n{_url_section(file_urls['public'])}"
        f"\n### Private Test Predictions\n{_url_section(file_urls['private'])}"
        "\n### Confirmation\n- [x] Files validated via Streamlit\n- [x] CSV files follow required format"
      )
      issue_data = {'title': f'[Submission] {username}', 'body': issue_body, 'labels': ['submission', 'streamlit']}
      response = requests.post(f'{api_root}/issues', json=issue_data, headers=headers, timeout=_GITHUB_TIMEOUT)
      if response.status_code != 201:
        st.error(f"Failed to create submission: {response.text}")
        st.stop()
      issue_number = response.json()['number']
      st.success(f"✅ Submission successful! Issue #{issue_number} created.")
      st.markdown(f"Track your submission at: https://github.com/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}")
      st.info("Your submission is being processed. Check the issue for results; validation and evaluation typically complete within a few minutes.")
    except Exception as e:
      # Top-level boundary: show any unexpected failure (network, parsing, API shape) to the user.
      st.error(f"Error during submission: {str(e)}")

# Footer: a divider followed by a link to the public leaderboard.
leaderboard_url = f"https://github.com/{REPO_OWNER}/{REPO_NAME}/blob/main/leaderboard/public.md"
st.markdown("---")
st.markdown(f"View the leaderboard: [Public Test Leaderboard]({leaderboard_url})")