Skip to content

Commit 47d652b

Browse files
authored
Conformal predictions (#19)
* checkpoint for conformal prediction
* k fold default one (works better with low N)
* summary statistics
* removed residue normalization
* torch load warning
* torch load warning
* normalization bug
* conformal predictions
* conformal predictions
* new demo data file
* expanded tooltips
1 parent 98df558 commit 47d652b

8 files changed

Lines changed: 3389 additions & 75 deletions

File tree

app/app.py

Lines changed: 94 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@
115115
ui.output_image("image", inline=True),
116116
VERSION,
117117
ui.HTML(f'<a href="{PAPER_URL}" target="_blank">Please cite our paper.</a>'),
118-
ui.output_text("current_time", inline=True), # for debugging
118+
# ui.output_text("current_time", inline=True), # for debugging
119119
###############
120120
## DATA PAGE ##
121121
###############
@@ -304,7 +304,7 @@
304304
ui.navset_tab(
305305
ui.nav_panel(
306306
"Model Diagnostics",
307-
ui.output_ui("pred_vs_true_ui"),
307+
ui.output_ui("mlde_results_ui"),
308308
ui.output_data_frame("mlde_model_table"),
309309
),
310310
ui.nav_panel("Search Results", ui.output_ui("mlde_download_ui")),
@@ -592,7 +592,7 @@ def zero_shot_ui():
592592

593593
else:
594594
return ui.TagList(
595-
"Upload a protein sequence or protein structure in the 'Data' tab to proceed with the Zero Shot Module."
595+
tooltips.zs_file_type,
596596
)
597597

598598
################
@@ -639,9 +639,8 @@ def mlde_ui():
639639
ui.input_numeric(
640640
"k_folds",
641641
"K-Fold cross validation",
642-
value=5,
642+
value=1,
643643
min=1,
644-
max=10,
645644
),
646645
),
647646
),
@@ -677,7 +676,7 @@ def mlde_ui():
677676
)
678677

679678
else:
680-
return ui.TagList("Upload data in the 'Data' tab to proceed.")
679+
return ui.TagList(tooltips.mlde_file_type)
681680

682681
### MLDE SEARCH UI ###
683682
@output
@@ -719,18 +718,82 @@ def mlde_search_ui(alt=None):
719718
ui.column(6, ui.input_task_button("mlde_search_btn", "Search")),
720719
)
721720

721+
@output
722+
@render.ui
723+
def mlde_results_ui(alt=None):
724+
if MODEL() is not None:
725+
return ui.TagList(
726+
ui.h5("Predictions vs. True values of the validation data"),
727+
ui.output_ui("pred_vs_true_ui"),
728+
ui.row(
729+
ui.h5("Model statistics"),
730+
ui.column(
731+
12,
732+
ui.output_data_frame("mlde_statistics"),
733+
style="padding:25px;",
734+
),
735+
),
736+
ui.h5("Validation data"),
737+
)
738+
739+
@render.data_frame
740+
def mlde_statistics(alt=None):
741+
model = MODEL()
742+
if model is not None:
743+
calibration = round(model.calibration, 2)
744+
val_r2 = round(model.val_r2, 2)
745+
val_pearson = round(model.val_pearson[0], 2)
746+
# Read the Pearson p-value; it is shown next to the coefficient (no significance stars are added)
747+
pearson_p = model.val_pearson[1]
748+
pearson_p_str = f"{pearson_p:.2e}" # Format p-value in scientific notation
749+
750+
val_ken_tau = round(
751+
model.val_ken_tau.statistic,
752+
2,
753+
)
754+
val_ken_tau_p = model.val_ken_tau.pvalue
755+
kendall_p_str = (
756+
f"{val_ken_tau_p:.2e}" # Format p-value in scientific notation
757+
)
758+
759+
# Create DataFrame
760+
df = pd.DataFrame(
761+
{
762+
"Metric": [
763+
"Calibration error (90% confidence)",
764+
"Validation R-squared value",
765+
"Validation Pearson correlation",
766+
"Kendall Tau correlation",
767+
],
768+
"Value": [
769+
f"+/- {calibration}",
770+
val_r2,
771+
f"{val_pearson} (p-value: {pearson_p_str})",
772+
f"{val_ken_tau} (p-value: {kendall_p_str})",
773+
],
774+
"Description": [
775+
"The calibration error is a measure of model and experimental noise. A value of 0 indicates a perfect model and no experimental noise.",
776+
"The R-squared value is a measure of how well the model fits the data. A value of 1 indicates a perfect fit.",
777+
"The Pearson correlation measures the strength and direction of the predictions and actual values. A value of 1 indicates a perfect correlation.",
778+
"The Kendall Tau correlation is a measure of how well the model ranks the data. A value of 1 indicates a perfect ranking.",
779+
],
780+
}
781+
)
782+
783+
return df
784+
722785
###################
723786
## DISCOVERY TAB ##
724787
###################
725788
@output
726789
@render.ui
727790
def discovery_ui_clu(alt=None):
728-
if Y_TYPE == "num":
791+
if Y_TYPE() != "class":
729792
return ui.TagList(
730793
"The Discovery workflow is only available for categorical Y-Values. Please use the MLDE workflow for numerical Y-values"
731794
)
732795

733-
elif MODE() != "start":
796+
elif MODE() == "dataset":
734797
return ui.TagList(
735798
ui.row(
736799
ui.h5("Protein Discovery and Annotation"),
@@ -792,19 +855,17 @@ def discovery_ui_clu(alt=None):
792855
)
793856

794857
else:
795-
return ui.TagList(
796-
"Upload a library in the 'Data' tab to proceed with the Discovery module."
797-
)
858+
return ui.TagList(tooltips.discovery_file_type)
798859

799860
@output
800861
@render.ui
801862
def discovery_ui_class(alt=None):
802-
if Y_TYPE == "num":
863+
if Y_TYPE() != "class":
803864
return ui.TagList(
804865
"The Discovery workflow is only available for categorical Y-Values. Please use the MLDE workflow for numerical Y-values"
805866
)
806867

807-
elif MODE() != "start":
868+
elif MODE() == "dataset":
808869
return ui.TagList(
809870
ui.row(
810871
ui.h5("Protein Discovery and Annotation"),
@@ -893,9 +954,7 @@ def discovery_ui_class(alt=None):
893954
)
894955

895956
else:
896-
return ui.TagList(
897-
"Upload a library in the 'Data' tab to proceed with the Discovery module."
898-
)
957+
return ui.TagList(tooltips.discovery_file_type)
899958

900959
### DISCOVERY SEARCH UI ###
901960
@output
@@ -1251,7 +1310,7 @@ async def _():
12511310
ui.update_select("discovery_model_type", choices=_MODEL_TYPES())
12521311

12531312
ui.update_select("y_type", choices=[_y_type])
1254-
print("Library loaded successfully")
1313+
12551314
PROTEIN.set(None)
12561315

12571316
REP_PATH.set(None) # used in train
@@ -1260,13 +1319,19 @@ async def _():
12601319

12611320
LIBRARY_PLOT.set(None)
12621321

1322+
with ui.Progress(min=1, max=15) as p:
1323+
p.set(
1324+
message="Data loaded",
1325+
)
1326+
await asyncio.sleep(0.5)
1327+
12631328
except Exception:
12641329
with ui.Progress(min=1, max=15) as p:
12651330
p.set(
12661331
message="Problem with input file",
12671332
detail="Please check if there are any problems with the input file.",
12681333
)
1269-
time.sleep(2.5)
1334+
await asyncio.sleep(2.5)
12701335

12711336
### READING PROTEIN FILE ###
12721337
@reactive.Effect
@@ -1443,7 +1508,7 @@ async def _():
14431508
prot = PROTEIN()
14441509

14451510
if input.demo_structure_check():
1446-
data_path = os.path.join(app_path, "../demo/demo_data/GB1.pdb")
1511+
data_path = os.path.join(app_path, "../demo/demo_data/1zb6.pdb")
14471512
file_name = data_path.split("/")[-1]
14481513
else:
14491514
f: list[FileInfo] = input.structure_file()
@@ -2312,6 +2377,8 @@ async def train_mlde_model():
23122377
finally:
23132378
# Reset the task running state in the session
23142379
IS_MLDE_TRAINING_RUNNING.set(False)
2380+
with ui.Progress(min=1, max=15) as p:
2381+
p.set(message="Done...", detail="")
23152382

23162383
@reactive.effect
23172384
@reactive.event(input.mlde_train_button)
@@ -2353,6 +2420,10 @@ def mlde_model_table(alt=None):
23532420
model = MODEL()
23542421
if model is not None:
23552422
table = VAL_DF()
2423+
if "y_sigma" in table.columns:
2424+
# if y_sigma column is empty, drop it
2425+
if table["y_sigma"].isnull().all():
2426+
table = table.drop(["y_sigma"], axis=1)
23562427
return table
23572428

23582429
#################
@@ -2422,6 +2493,10 @@ def mlde_search_table(alt=None):
24222493
table = table.drop(["sequence"], axis=1)
24232494
if "y_true" in table.columns:
24242495
table = table.drop(["y_true"], axis=1)
2496+
if "y_sigma" in table.columns:
2497+
# if y_sigma column is empty, drop it
2498+
if table["y_sigma"].isnull().all():
2499+
table = table.drop(["y_sigma"], axis=1)
24252500

24262501
return table
24272502

app/tooltips.py

Lines changed: 56 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,54 @@
11
data_tooltips = """
2-
ProteusAI, a user-friendly and open-source ML platform to streamline protein engineering and design tasks.
2+
ProteusAI, a user-friendly and open-source ML platform, streamlines protein engineering and design tasks.
33
ProteusAI offers modules to support researchers in various stages of the design-build-test-learn (DBTL) cycle,
44
including protein discovery, structure-based design, zero-shot predictions, and ML-guided directed evolution (MLDE).
55
Our benchmarking results demonstrate ProteusAI’s efficiency in improving proteins and enzymes within a few DBTL-cycle
66
iterations. ProteusAI democratizes access to ML-guided protein engineering and is freely available for academic and
7-
commercial use. Future work aims to expand and integrate novel methods in computational protein and enzyme design to
8-
further develop ProteusAI.
7+
commercial use.
8+
You can upload different data types to get started with ProteusAI. Click on the other module tabs to learn about their
9+
functionality and the expected data types.
10+
"""
11+
12+
zs_file_type = """
13+
Upload a FASTA file containing the protein sequence, or a PDB file containing the structure of the protein
14+
for which you want to generate zero-shot predictions.
915
"""
1016

1117
zs_tooltips = """
12-
The ProteusAI Zero Shot Module is designed to create a mutant library with no prior data.
18+
The ProteusAI Zero-Shot Module is designed to create a mutant library with no prior data.
1319
The module uses scores generated by large protein language models, such as ESM-1v, that have
1420
been trained to predict hidden residues in hundreds of millions of protein sequences.
1521
Often, you will find that several residues in your protein sequence have low predicted probabilities.
1622
It has been previously shown that swapping these residues for residues with higher probabilities
17-
has beneficial effects on the candidate protein.
23+
has beneficial effects on the candidate protein. In ProteusAI, we provide access to several language
24+
models which have been trained under slightly different conditions. The best models to produce Zero-Shot
25+
scores are ESM-1v and ESM-2 (650M). However, these models will take a long time to compute the results.
26+
Consider using ESM-2 (35M) to get familiar with the module first before moving to the larger models.
27+
"""
28+
29+
discovery_file_type = """
30+
Upload a CSV or EXCEL file in the 'Data' tab under 'Library' to proceed with the Discovery module.
31+
The file should contain a column for protein sequences, a column with the protein names, and a column for
32+
annotations, which can also be empty or partially populated with annotations.
1833
"""
1934

2035
discovery_tooltips = """
2136
The Discovery Module offers a structured approach to identifying proteins even with little to no
22-
experimental data to start with. The module relies on representations generated by large protein
23-
language models that transform protein sequences into meaningful vector representations. It has
24-
been shown that these vector representations often cluster based on function. The Discovery Module
25-
clusters sequences using these representations and offers algorithms to sample diverse candidates
26-
from different clusters. These clusters can either be generated through unsupervised machine learning,
27-
when no prior annotations are present, or from partially annotated datasets, where some protein
28-
functions are known. Clustering should be used if all or very few sequences have annotations.
29-
Classification should be used if some or all all sequences are annotated.
37+
experimental data to start with. The goal of the module is to identify proteins with similar
38+
functions and to propose novel sequences that are likely to have similar functions.
39+
The module relies on representations generated by large protein language models that transform protein
40+
sequences into meaningful vector representations. It has been shown that these vector representations often
41+
cluster based on function. Clustering should be used if all, very few, or no sequences have annotations.
42+
Classification should be used if some or all sequences are annotated. To find out if you have enough
43+
sequences for classification, we recommend using the model statistics on the validation set, which are
44+
automatically generated by the module after training.
45+
"""
46+
47+
mlde_file_type = """
48+
Upload a CSV or EXCEL file in the 'Data' tab under 'Library' to proceed with the MLDE module.
49+
The file should contain a column for protein sequences, a column with the protein names (e.g.,
50+
mutant descriptions M15V), and a column for experimental values (e.g., enzyme activity,
51+
fluorescence, etc.).
3052
"""
3153

3254
mlde_tooltips = """
@@ -37,27 +59,35 @@
3759
trained model to predict mutants that are likely to improve function. The Bayesian optimization
3860
algorithms used to search for novel mutants are based on models trained on protein representations
3961
that can either be generated from large language models, which is currently very slow, or from
40-
classical algorithms, such as BLOSUM62. For now, we recommend the use of BLOSUM62 representations
41-
combined with Random Forest models for the best trade-off of speed and quality. However, we encourage
62+
classical algorithms such as BLOSUM62. For now, we recommend the use of BLOSUM62 representations
63+
combined with Random Forest models for the best trade-off between speed and quality. However, we encourage
4264
experimentation with other models and representations.
4365
"""
4466

67+
design_file_type = """
68+
Upload a PDB file containing the structure of the protein
69+
to use the (structure-based) Design module.
70+
"""
71+
4572
design_tooltips = """
4673
The Protein Design module is a structure-based approach to predict novel sequences using 'Inverse Folding'
47-
algorithms to design sequences that are likely to preserve the fold of the protein while improving
74+
algorithms. The designed sequences are likely to preserve the fold of the protein while improving
4875
the thermal stability and solubility of proteins. To preserve important functions of the protein, we recommend
49-
the preservation of protein-protein, ligand-ligand interfaces, and evolutionarily conserved sites.
50-
The temperature factor influences the diversity of designs. We recommend the generation of at least 1000 sequences
51-
and rigorous filtering before ordering variants for validation. To give an example: Sort the sequences from
52-
the lowest to highest score, predict the structure of the lowest-scoring variants, and proceed with the designs
53-
that preserve the geometry of the active site (in the case of an enzyme). Experiment with a small sample size
54-
to explore temperature values that yield desired levels of diversification before generating large
76+
the preservation of protein-protein, ligand-ligand interfaces, and evolutionarily conserved sites, which can be
77+
entered manually. The temperature factor influences the diversity of designs. We recommend the generation of at
78+
least 1,000 sequences and rigorous filtering before ordering variants for validation. To give an example: Sort
79+
the sequences from the lowest to highest score, predict the structure of the lowest-scoring variants, and proceed
80+
with the designs that preserve the geometry of the active site (in the case of an enzyme). Experiment with a small
81+
sample size to explore temperature values that yield desired levels of diversification before generating large
5582
numbers of sequences.
5683
"""
5784

5885
representations_tooltips = """
59-
The Representations module offers methods to compute and visualize vector representations that are primarily
60-
used by the MLDE and Discovery modules. The representations use classical algorithms such as BLOSUM62 or
61-
large protein language models to infuse inductive biases into protein sequences and produce biologically meaningful
62-
representations.
86+
The Representations module offers methods to compute and visualize vector representations of proteins. These are primarily
87+
used by the MLDE and Discovery modules to make training more data-efficient. The representations are generated from
88+
classical algorithms such as BLOSUM62 or large protein language models that infuse helpful inductive biases into protein
89+
sequence representations. In some cases, the representations can be used to cluster proteins based on function or to
90+
predict protein properties. The module offers several visualization techniques to explore the representations and to
91+
understand the underlying structure of the protein data. Advanced analysis and predictions can be made by using the
92+
MLDE or Discovery modules in combination with the Representations module.
6393
"""

0 commit comments

Comments
 (0)