jonfunk21
diff --git a/app/app.py b/app/app.py
Lines changed: 94 additions & 19 deletions
diff --git a/app/tooltips.py b/app/tooltips.py
Lines changed: 56 additions & 26 deletions
@@ -115,7 +115,7 @@
     ui.output_image("image", inline=True),
     VERSION,
     ui.HTML(f'<a href="{PAPER_URL}" target="_blank">Please cite our paper.</a>'),
-    ui.output_text("current_time", inline=True),  # for debugging
+    # ui.output_text("current_time", inline=True),  # for debugging
     ###############
     ## DATA PAGE ##
     ###############
@@ -304,7 +304,7 @@
                 ui.navset_tab(
                     ui.nav_panel(
                         "Model Diagnostics",
-                        ui.output_ui("pred_vs_true_ui"),
+                        ui.output_ui("mlde_results_ui"),
                         ui.output_data_frame("mlde_model_table"),
                     ),
                     ui.nav_panel("Search Results", ui.output_ui("mlde_download_ui")),
@@ -592,7 +592,7 @@ def zero_shot_ui():
 
         else:
             return ui.TagList(
-                "Upload a protein sequence or protein structure in the 'Data' tab to proceed with the Zero Shot Module."
+                tooltips.zs_file_type,
             )
 
     ################
@@ -639,9 +639,8 @@ def mlde_ui():
                             ui.input_numeric(
                                 "k_folds",
                                 "K-Fold cross validation",
-                                value=5,
+                                value=1,
                                 min=1,
-                                max=10,
                             ),
                         ),
                     ),
@@ -677,7 +676,7 @@ def mlde_ui():
             )
 
         else:
-            return ui.TagList("Upload data in the 'Data' tab to proceed.")
+            return ui.TagList(tooltips.mlde_file_type)
 
     ### MLDE SEARCH UI ###
     @output
@@ -719,18 +718,82 @@ def mlde_search_ui(alt=None):
                 ui.column(6, ui.input_task_button("mlde_search_btn", "Search")),
             )
 
+    @output
+    @render.ui
+    def mlde_results_ui(alt=None):  # Container UI rendered into the "Model Diagnostics" panel; `alt` is unused in this body
+        if MODEL() is not None:  # no else branch: the function returns None (implicitly) while no model is set
+            return ui.TagList(
+                ui.h5("Predictions vs. True values of the validation data"),
+                ui.output_ui("pred_vs_true_ui"),  # placeholder for the output registered under the id "pred_vs_true_ui"
+                ui.row(
+                    ui.h5("Model statistics"),
+                    ui.column(
+                        12,
+                        ui.output_data_frame("mlde_statistics"),  # placeholder for the mlde_statistics data-frame output
+                        style="padding:25px;",
+                    ),
+                ),
+                ui.h5("Validation data"),  # heading only; the panel places the "mlde_model_table" output right after this UI
+            )
+
+    @render.data_frame
+    def mlde_statistics(alt=None):  # Summary table of validation metrics for the current MODEL(); `alt` is unused in this body
+        model = MODEL()
+        if model is not None:  # no else branch: the function returns None (implicitly) while no model is set
+            calibration = round(model.calibration, 2)
+            val_r2 = round(model.val_r2, 2)
+            val_pearson = round(model.val_pearson[0], 2)  # presumably [0] = coefficient and [1] = p-value -- TODO confirm against the model class
+            # The Pearson p-value is only formatted as text below; no significance stars are added
+            pearson_p = model.val_pearson[1]
+            pearson_p_str = f"{pearson_p:.2e}"  # Format p-value in scientific notation
+
+            val_ken_tau = round(
+                model.val_ken_tau.statistic,
+                2,
+            )
+            val_ken_tau_p = model.val_ken_tau.pvalue
+            kendall_p_str = (
+                f"{val_ken_tau_p:.2e}"  # Format p-value in scientific notation
+            )
+
+            # Create DataFrame: one row per metric, with "Metric", "Value" and "Description" columns
+            df = pd.DataFrame(
+                {
+                    "Metric": [
+                        "Calibration error (90% confidence)",
+                        "Validation R-squared value",
+                        "Validation Pearson correlation",
+                        "Kendall Tau correlation",
+                    ],
+                    "Value": [
+                        f"+/- {calibration}",
+                        val_r2,  # passed as the rounded value itself, not as an f-string like the other three entries
+                        f"{val_pearson} (p-value: {pearson_p_str})",
+                        f"{val_ken_tau} (p-value: {kendall_p_str})",
+                    ],
+                    "Description": [
+                        "The calibration error is a measure of model and experimental noise. A value of 0 indicates a perfect model and no experimental noise.",
+                        "The R-squared value is a measure of how well the model fits the data. A value of 1 indicates a perfect fit.",
+                        "The Pearson correlation measures the strength and direction of the predictions and actual values. A value of 1 indicates a perfect correlation.",
+                        "The Kendall Tau correlation is a measure of how well the model ranks the data. A value of 1 indicates a perfect ranking.",
+                    ],
+                }
+            )
+
+            return df
+
     ###################
     ## DISCOVERY TAB ##
     ###################
     @output
     @render.ui
     def discovery_ui_clu(alt=None):
-        if Y_TYPE == "num":
+        if Y_TYPE() != "class":
             return ui.TagList(
                 "The Discovery workflow is only available for categorical Y-Values. Please use the MLDE workflow for numerical Y-values"
             )
 
-        elif MODE() != "start":
+        elif MODE() == "dataset":
             return ui.TagList(
                 ui.row(
                     ui.h5("Protein Discovery and Annotation"),
@@ -792,19 +855,17 @@ def discovery_ui_clu(alt=None):
             )
 
         else:
-            return ui.TagList(
-                "Upload a library in the 'Data' tab to proceed with the Discovery module."
-            )
+            return ui.TagList(tooltips.discovery_file_type)
 
     @output
     @render.ui
     def discovery_ui_class(alt=None):
-        if Y_TYPE == "num":
+        if Y_TYPE() != "class":
             return ui.TagList(
                 "The Discovery workflow is only available for categorical Y-Values. Please use the MLDE workflow for numerical Y-values"
             )
 
-        elif MODE() != "start":
+        elif MODE() == "dataset":
             return ui.TagList(
                 ui.row(
                     ui.h5("Protein Discovery and Annotation"),
@@ -893,9 +954,7 @@ def discovery_ui_class(alt=None):
             )
 
         else:
-            return ui.TagList(
-                "Upload a library in the 'Data' tab to proceed with the Discovery module."
-            )
+            return ui.TagList(tooltips.discovery_file_type)
 
     ### DISCOVERY SEARCH UI ###
     @output
@@ -1251,7 +1310,7 @@ async def _():
                 ui.update_select("discovery_model_type", choices=_MODEL_TYPES())
 
                 ui.update_select("y_type", choices=[_y_type])
-                print("Library loaded successfully")
+
                 PROTEIN.set(None)
 
                 REP_PATH.set(None)  # used in train
@@ -1260,13 +1319,19 @@ async def _():
 
                 LIBRARY_PLOT.set(None)
 
+                with ui.Progress(min=1, max=15) as p:
+                    p.set(
+                        message="Data loaded",
+                    )
+                    await asyncio.sleep(0.5)
+
             except Exception:
                 with ui.Progress(min=1, max=15) as p:
                     p.set(
                         message="Problem with input file",
                         detail="Please check if there are any problems with the input file.",
                     )
-                    time.sleep(2.5)
+                    await asyncio.sleep(2.5)
 
     ### READING PROTEIN FILE ###
     @reactive.Effect
@@ -1443,7 +1508,7 @@ async def _():
                 prot = PROTEIN()
 
                 if input.demo_structure_check():
-                    data_path = os.path.join(app_path, "../demo/demo_data/GB1.pdb")
+                    data_path = os.path.join(app_path, "../demo/demo_data/1zb6.pdb")
                     file_name = data_path.split("/")[-1]
                 else:
                     f: list[FileInfo] = input.structure_file()
@@ -2312,6 +2377,8 @@ async def train_mlde_model():
             finally:
                 # Reset the task running state in the session
                 IS_MLDE_TRAINING_RUNNING.set(False)
+                with ui.Progress(min=1, max=15) as p:
+                    p.set(message="Done...", detail="")
 
     @reactive.effect
     @reactive.event(input.mlde_train_button)
@@ -2353,6 +2420,10 @@ def mlde_model_table(alt=None):
         model = MODEL()
         if model is not None:
             table = VAL_DF()
+            if "y_sigma" in table.columns:
+                # if y_sigma column is empty, drop it
+                if table["y_sigma"].isnull().all():
+                    table = table.drop(["y_sigma"], axis=1)
             return table
 
     #################
@@ -2422,6 +2493,10 @@ def mlde_search_table(alt=None):
         table = table.drop(["sequence"], axis=1)
         if "y_true" in table.columns:
             table = table.drop(["y_true"], axis=1)
+        if "y_sigma" in table.columns:
+            # if y_sigma column is empty, drop it
+            if table["y_sigma"].isnull().all():
+                table = table.drop(["y_sigma"], axis=1)
 
         return table
 
 
@@ -1,32 +1,54 @@
 data_tooltips = """
-ProteusAI, a user-friendly and open-source ML platform to streamline protein engineering and design tasks.
+ProteusAI, a user-friendly and open-source ML platform, streamlines protein engineering and design tasks.
 ProteusAI offers modules to support researchers in various stages of the design-build-test-learn (DBTL) cycle,
 including protein discovery, structure-based design, zero-shot predictions, and ML-guided directed evolution (MLDE).
 Our benchmarking results demonstrate ProteusAI’s efficiency in improving proteins and enzymes within a few DBTL-cycle
 iterations. ProteusAI democratizes access to ML-guided protein engineering and is freely available for academic and
-commercial use. Future work aims to expand and integrate novel methods in computational protein and enzyme design to
-further develop ProteusAI.
+commercial use.
+You can upload different data types to get started with ProteusAI. Click on the other module tabs to learn about their
+functionality and the expected data types.
+"""
+
+zs_file_type = """
+Upload a FASTA file containing the protein sequence, or a PDB file containing the structure of the protein
+for which you want to generate zero-shot predictions.
 """
 
 zs_tooltips = """
-The ProteusAI Zero Shot Module is designed to create a mutant library with no prior data.
+The ProteusAI Zero-Shot Module is designed to create a mutant library with no prior data.
 The module uses scores generated by large protein language models, such as ESM-1v, that have
 been trained to predict hidden residues in hundreds of millions of protein sequences.
 Often, you will find that several residues in your protein sequence have low predicted probabilities.
 It has been previously shown that swapping these residues for residues with higher probabilities
-has beneficial effects on the candidate protein.
+has beneficial effects on the candidate protein. In ProteusAI, we provide access to several language
+models which have been trained under slightly different conditions. The best models to produce Zero-Shot
+scores are ESM-1v and ESM-2 (650M). However, these models will take a long time to compute the results.
+Consider using ESM-2 (35M) to get familiar with the module first before moving to the larger models.
+"""
+
+discovery_file_type = """
+Upload a CSV or EXCEL file in the 'Data' tab under 'Library' to proceed with the Discovery module.
+The file should contain a column for protein sequences, a column with the protein names, and a column for
+annotations, which can also be empty or partially populated with annotations.
 """
 
 discovery_tooltips = """
 The Discovery Module offers a structured approach to identifying proteins even with little to no
-experimental data to start with. The module relies on representations generated by large protein
-language models that transform protein sequences into meaningful vector representations. It has
-been shown that these vector representations often cluster based on function. The Discovery Module
-clusters sequences using these representations and offers algorithms to sample diverse candidates
-from different clusters. These clusters can either be generated through unsupervised machine learning,
-when no prior annotations are present, or from partially annotated datasets, where some protein
-functions are known. Clustering should be used if all or very few sequences have annotations.
-Classification should be used if some or all all sequences are annotated.
+experimental data to start with. The goal of the module is to identify proteins with similar
+functions and to propose novel sequences that are likely to have similar functions.
+The module relies on representations generated by large protein language models that transform protein
+sequences into meaningful vector representations. It has been shown that these vector representations often
+cluster based on function. Clustering should be used if all, very few, or no sequences have annotations.
+Classification should be used if some or all sequences are annotated. To find out if you have enough
+sequences for classification, we recommend using the model statistics on the validation set, which are
+automatically generated by the module after training.
+"""
+
+mlde_file_type = """
+Upload a CSV or EXCEL file in the 'Data' tab under 'Library' to proceed with the MLDE module.
+The file should contain a column for protein sequences, a column with the protein names (e.g.,
+mutant descriptions M15V), and a column for experimental values (e.g., enzyme activity,
+fluorescence, etc.).
 """
 
 mlde_tooltips = """
@@ -37,27 +59,35 @@
 trained model to predict mutants that are likely to improve function. The Bayesian optimization
 algorithms used to search for novel mutants are based on models trained on protein representations
 that can either be generated from large language models, which is currently very slow, or from
-classical algorithms, such as BLOSUM62. For now, we recommend the use of BLOSUM62 representations
-combined with Random Forest models for the best trade-off of speed and quality. However, we encourage
+classical algorithms such as BLOSUM62. For now, we recommend the use of BLOSUM62 representations
+combined with Random Forest models for the best trade-off between speed and quality. However, we encourage
 experimentation with other models and representations.
 """
 
+design_file_type = """
+Upload a PDB file containing the structure of the protein
+to use the (structure-based) Design module.
+"""
+
 design_tooltips = """
 The Protein Design module is a structure-based approach to predict novel sequences using 'Inverse Folding'
-algorithms to design sequences that are likely to preserve the fold of the protein while improving
+algorithms. The designed sequences are likely to preserve the fold of the protein while improving
 the thermal stability and solubility of proteins. To preserve important functions of the protein, we recommend
-the preservation of protein-protein, ligand-ligand interfaces, and evolutionarily conserved sites.
-The temperature factor influences the diversity of designs. We recommend the generation of at least 1000 sequences
-and rigorous filtering before ordering variants for validation. To give an example: Sort the sequences from
-the lowest to highest score, predict the structure of the lowest-scoring variants, and proceed with the designs
-that preserve the geometry of the active site (in the case of an enzyme). Experiment with a small sample size
-to explore temperature values that yield desired levels of diversification before generating large
+the preservation of protein-protein, ligand-ligand interfaces, and evolutionarily conserved sites, which can be
+entered manually. The temperature factor influences the diversity of designs. We recommend the generation of at
+least 1,000 sequences and rigorous filtering before ordering variants for validation. To give an example: Sort
+the sequences from the lowest to highest score, predict the structure of the lowest-scoring variants, and proceed
+with the designs that preserve the geometry of the active site (in the case of an enzyme). Experiment with a small
+sample size to explore temperature values that yield desired levels of diversification before generating large
 numbers of sequences.
 """
 
 representations_tooltips = """
-The Representations module offers methods to compute and visualize vector representations that are primarily
-used by the MLDE and Discovery modules. The representations use classical algorithms such as BLOSUM62 or
-large protein language models to infuse inductive biases into protein sequences and produce biologically meaningful
-representations.
+The Representations module offers methods to compute and visualize vector representations of proteins. These are primarily
+used by the MLDE and Discovery modules to make training more data-efficient. The representations are generated from
+classical algorithms such as BLOSUM62 or large protein language models that infuse helpful inductive biases into protein
+sequence representations. In some cases, the representations can be used to cluster proteins based on function or to
+predict protein properties. The module offers several visualization techniques to explore the representations and to
+understand the underlying structure of the protein data. Advanced analysis and predictions can be made by using the
+MLDE or Discovery modules in combination with the Representations module.
 """