From ef60de3a3c0ab540d30ddc032170191e9be51411 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Fri, 22 May 2026 15:24:09 +0100
Subject: [PATCH 01/18] Refactor callbacks

---
 ml_peg/app/base_app.py                 |  39 ++++-
 ml_peg/app/build_app.py                |  45 +++--
 ml_peg/app/utils/build_components.py   | 117 +++++--------
 ml_peg/app/utils/register_callbacks.py | 217 ++++++++++++++++++-------
 4 files changed, 269 insertions(+), 149 deletions(-)

diff --git a/ml_peg/app/base_app.py b/ml_peg/app/base_app.py
index 1fc1ecfce..5f9f0691e 100644
--- a/ml_peg/app/base_app.py
+++ b/ml_peg/app/base_app.py
@@ -5,6 +5,7 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
 
+from dash.dcc import Store
 from dash.development.base_component import Component
 from dash.html import Div
 
@@ -31,7 +32,7 @@ class BaseApp(ABC):
         URL for online documentation. Default is None.
     framework_id
         Framework identifier used for benchmark attribution tags. Default is
-        ``"ml_peg"``.
+        `"ml_peg"`.
     """
 
     def __init__(
@@ -60,6 +61,7 @@ def __init__(
             URL to online documentation. Default is None.
         framework_id
             Framework identifier used for benchmark attribution tags.
+            Default is `"ml_peg"`.
         """
         self.name = name
         self.description = description
@@ -91,7 +93,7 @@ def build_layout(self) -> Div:
             framework_id=self.framework_id,
             table=self.table,
             column_widths=getattr(self.table, "column_widths", None),
-            thresholds=getattr(self.table, "thresholds", None),
+            thresholds=self.table.thresholds,
             extra_components=self.extra_components,
         )
 
@@ -99,3 +101,36 @@ def build_layout(self) -> Div:
     def register_callbacks(self):
         """Register callbacks with app."""
         pass
+
+    @property
+    def stores(self) -> list[Store]:
+        """
+        List session Stores for this test's table, for the full app layout.
+
+        Returns
+        -------
+        list[Store]
+            Computed, raw data, weight, and thresholds Stores, in that order.
+        """
+        return [
+            Store(
+                id=f"{self.table_id}-computed-store",
+                storage_type="session",
+                data=self.table.data,
+            ),
+            Store(
+                id=f"{self.table_id}-raw-data-store",
+                storage_type="session",
+                data=self.table.data,
+            ),
+            Store(
+                id=f"{self.table_id}-weight-store",
+                storage_type="session",
+                data=self.table.weights,
+            ),
+            Store(
+                id=f"{self.table_id}-thresholds-store",
+                storage_type="session",
+                data=self.table.thresholds,
+            ),
+        ]
diff --git a/ml_peg/app/build_app.py b/ml_peg/app/build_app.py
index dc109c6a1..9858f74e5 100644
--- a/ml_peg/app/build_app.py
+++ b/ml_peg/app/build_app.py
@@ -25,7 +25,10 @@
     build_tutorial_button,
     register_onboarding_callbacks,
 )
-from ml_peg.app.utils.register_callbacks import register_benchmark_to_category_callback
+from ml_peg.app.utils.register_callbacks import (
+    register_benchmark_to_category_callback,
+    register_filter_tables_callback,
+)
 from ml_peg.app.utils.utils import (
     build_level_of_theory_warnings,
     get_framework_config,
@@ -342,6 +345,7 @@ def build_sidebar(
 def get_all_tests(
     category: str = "*",
 ) -> tuple[
+    dict[str, dict[str, Dash]],
     dict[str, dict[str, list[Div]]],
     dict[str, dict[str, DataTable]],
     dict[str, dict[str, str]],
@@ -357,12 +361,13 @@ def get_all_tests(
     Returns
     -------
     tuple
-        Layouts, tables, and framework IDs for all categories.
+        Apps by test name, and layouts, tables, and framework IDs for all categories.
     """
     # Find Python files e.g. app_OC157.py in mlip_tesing.app module.
     # We will get the category from the parent's parent directory
     # E.g. ml_peg/app/surfaces/OC157/app_OC157.py -> surfaces
     tests = APP_ROOT.glob(f"{category}/*/app*.py")
+    apps = {}
     layouts = {}
     tables = {}
     frameworks = {}
@@ -377,15 +382,18 @@ def get_all_tests(
                 f"ml_peg.app.{category_name}.{test_name}.app_{test_name}"
             )
             test_app = test_module.get_app()
+            apps[test_name] = test_app
 
             # Get layouts and tables for each category/test
             if category_name not in layouts:
                 layouts[category_name] = {}
                 tables[category_name] = {}
                 frameworks[category_name] = {}
+
             layouts[category_name][test_app.name] = test_app.layout
             tables[category_name][test_app.name] = test_app.table
             frameworks[category_name][test_app.name] = test_app.framework_id
+
         except FileNotFoundError as err:
             warnings.warn(
                 f"Unable to load layout for {test_name} in {category_name} category. "
@@ -405,7 +413,7 @@ def get_all_tests(
             )
             continue
 
-    return layouts, tables, frameworks
+    return apps, layouts, tables, frameworks
 
 
 def build_category(
@@ -439,6 +447,7 @@ def build_category(
     category_views = {}
     category_tables = {}
     category_weights = {}
+    category_to_title = {}
     framework_ids: set[str] = set()
 
     # `category` corresponds to the category's directory name
@@ -458,6 +467,8 @@ def build_category(
             category_weight = 1
             benchmark_weights = {}
 
+        category_to_title[category] = category_title
+
         # Build category summary table
         summary_table = build_summary_table(
             dict(sorted(all_tables[category].items())),
@@ -475,7 +486,6 @@ def build_category(
         weight_components = build_weight_components(
             header="Weights",
             table=summary_table,
-            include_store=False,
             include_download_controls=False,
             column_widths=getattr(summary_table, "column_widths", None),
         )
@@ -500,15 +510,9 @@ def build_category(
             "tests": test_entries,
         }
 
-        # Register benchmark table -> category table callbacks
-        # Category summary table columns add "Score" to name for clarity
-        for test_name, benchmark_table in all_tables[category].items():
-            register_benchmark_to_category_callback(
-                benchmark_table_id=benchmark_table.id,
-                category_table_id=f"{category_title}-summary-table",
-                benchmark_column=test_name + " Score",
-                model_name_map=getattr(benchmark_table, "model_name_map", None),
-            )
+    # Register callback for all benchmark tables -> category table
+    # Category summary table columns add "Score" to name for clarity
+    register_benchmark_to_category_callback(all_tables, category_to_title)
 
     return category_views, category_tables, category_weights, framework_ids
 
@@ -836,6 +840,7 @@ def build_nav(
     framework_views: dict[str, dict[str, object]],
     summary_table: DataTable,
     weight_components: Div,
+    all_apps: dict[str, Dash],
 ) -> None:
     """
     Build page layouts and sidebar navigation.
@@ -852,6 +857,8 @@ def build_nav(
         Summary table with score from each category.
     weight_components
         Weight sliders, text boxes and reset button.
+    all_apps
+        Dictionary of all test apps.
     """
     category_paths = {
         category_name: _category_to_path(category_name)
@@ -977,6 +984,11 @@ def build_nav(
                 ),
             ]
         )
+
+    test_state_stores = []
+    for app in all_apps.values():
+        test_state_stores.extend(app.stores)
+
     global_state_stores = [
         Store(
             id="summary-table-weight-store",
@@ -985,6 +997,7 @@ def build_nav(
         ),
         Store(id="cmap-store", storage_type="local", data="viridis_r"),
         *category_state_stores,
+        *test_state_stores,
     ]
 
     full_layout = [
@@ -1261,11 +1274,13 @@ def build_full_app(full_app: Dash, category: str = "*") -> None:
         Category to build app for. Default is `*`, corresponding to all categories.
     """
     # Get layouts and tables for each test, grouped by categories
-    all_layouts, all_tables, all_frameworks = get_all_tests(category=category)
+    all_apps, all_layouts, all_tables, all_frameworks = get_all_tests(category=category)
 
     if not all_layouts:
         raise ValueError("No tests were built successfully")
 
+    register_filter_tables_callback(all_apps)
+
     # Combine tests into categories and create category summary
     cat_views, cat_tables, cat_weights, framework_ids = build_category(
         all_layouts, all_tables, all_frameworks
@@ -1278,7 +1293,6 @@ def build_full_app(full_app: Dash, category: str = "*") -> None:
     weight_components = build_weight_components(
         header="Weights",
         table=summary_table,
-        include_store=False,
         include_download_controls=False,
         column_widths=summary_table.column_widths,
     )
@@ -1289,5 +1303,6 @@ def build_full_app(full_app: Dash, category: str = "*") -> None:
         framework_views,
         summary_table,
         weight_components,
+        all_apps,
     )
     register_onboarding_callbacks()
diff --git a/ml_peg/app/utils/build_components.py b/ml_peg/app/utils/build_components.py
index 050e57306..49f604d1b 100644
--- a/ml_peg/app/utils/build_components.py
+++ b/ml_peg/app/utils/build_components.py
@@ -139,7 +139,6 @@ def build_weight_components(
     *,
     use_thresholds: bool = False,
     include_download_controls: bool = True,
-    include_store: bool = True,
     column_widths: dict[str, int] | None = None,
     thresholds: Thresholds | None = None,
 ) -> Div:
@@ -158,10 +157,6 @@ def build_weight_components(
         recompute Scores consistently.
     include_download_controls
         Whether to render download controls in the Score column slot.
-    include_store
-        Whether to include this table's weight ``dcc.Store`` in the returned
-        component. Set to ``False`` when that store is already created elsewhere,
-        for example in the main app layout.
     column_widths
         Optional mapping of table column IDs to pixel widths used to align the
         inputs with the rendered table.
@@ -292,14 +287,6 @@ def build_weight_components(
     )
 
     layout = [container]
-    if include_store:
-        layout.append(
-            Store(
-                id=f"{table.id}-weight-store",
-                storage_type="session",
-                data=weights,
-            )
-        )
 
     model_levels = getattr(table, "model_levels_of_theory", None)
     metric_levels = getattr(table, "metric_levels_of_theory", None)
@@ -787,10 +774,10 @@ def build_test_layout(
     description: str,
     framework_id: str,
     table: DataTable,
+    thresholds: Thresholds,
     extra_components: list[Component] | None = None,
     docs_url: str | None = None,
     column_widths: dict[str, int] | None = None,
-    thresholds: Thresholds | None = None,
 ) -> Div:
     """
     Build app layout for a test.
@@ -806,6 +793,9 @@ def build_test_layout(
     table
         Dash Table with metric results. Can include a `weights` attribute to be used by
         `build_weight_components`.
+    thresholds
+        Normalization metadata (metric -> (good, bad, unit)) supplied via the
+        analysis pipeline. Inline threshold controls are rendered automatically.
     extra_components
         List of Dash Components to include after the metrics table.
     docs_url
@@ -813,10 +803,6 @@ def build_test_layout(
     column_widths
         Optional column-width mapping inferred from analysis output. Used to align
         threshold controls beneath the table columns when available.
-    thresholds
-        Optional normalization metadata (metric -> (good, bad, unit)) supplied via the
-        analysis pipeline. When provided, inline threshold controls are rendered
-        automatically.
 
     Returns
     -------
@@ -875,33 +861,32 @@ def build_test_layout(
         )
     )
 
-    # Inline normalization thresholds when metadata is supplied
-    threshold_controls = None
-    if thresholds is not None:
-        reserved = {"MLIP", "Score", "id"}
-        metric_columns = [
-            col["id"] for col in table.columns if col.get("id") not in reserved
-        ]
-        layout_contents.append(
-            Store(
-                id=f"{table.id}-raw-data-store",
-                storage_type="session",
-                data=table.data,
-            )
-        )
-        layout_contents.append(
-            Store(
-                id=f"{table.id}-raw-tooltip-store",
-                storage_type="session",
-                data=table.tooltip_header,
-            )
+    reserved = {"MLIP", "Score", "id"}
+    metric_columns = [
+        col["id"] for col in table.columns if col.get("id") not in reserved
+    ]
+
+    layout_contents.append(
+        Store(
+            id=f"{table.id}-raw-data-store",
+            storage_type="session",
+            data=table.data,
         )
-        threshold_controls = build_threshold_inputs(
-            table_columns=metric_columns,
-            thresholds=thresholds,
-            table_id=table.id,
-            column_widths=column_widths,
+    )
+    layout_contents.append(
+        Store(
+            id=f"{table.id}-raw-tooltip-store",
+            storage_type="session",
+            data=table.tooltip_header,
         )
+    )
+
+    threshold_controls = build_threshold_inputs(
+        table_columns=metric_columns,
+        thresholds=thresholds,
+        table_id=table.id,
+        column_widths=column_widths,
+    )
 
     # Add metric-weight controls for every benchmark table
     metric_weights = build_weight_components(
@@ -916,24 +901,21 @@ def build_test_layout(
     # Build the controls element before the table wrapper so both can go into the
     # same fit-content div. The controls use width:100% of that wrapper, which
     # equals the table width, keeping the columns aligned.
-    if thresholds is not None:
-        controls_visual = Div(
-            [
-                Div(threshold_controls, style={"marginBottom": "0px"}),
-                Div(metric_weights, style={"marginTop": "0"}),
-            ],
-            style={
-                "backgroundColor": "#f8f9fa",
-                "border": "1px solid #dee2e6",
-                "borderRadius": "6px",
-                "padding": "0px 0px 0px 0px",  # top right bottom left
-                "marginTop": "-5px",
-                "boxSizing": "border-box",
-                "width": "100%",
-            },
-        )
-    else:
-        controls_visual = metric_weights
+    controls_visual = Div(
+        [
+            Div(threshold_controls, style={"marginBottom": "0px"}),
+            Div(metric_weights, style={"marginTop": "0"}),
+        ],
+        style={
+            "backgroundColor": "#f8f9fa",
+            "border": "1px solid #dee2e6",
+            "borderRadius": "6px",
+            "padding": "0px 0px 0px 0px",  # top right bottom left
+            "marginTop": "-5px",
+            "boxSizing": "border-box",
+            "width": "100%",
+        },
+    )
 
     table_section = [
         build_download_controls(table.id, row=True),
@@ -1199,12 +1181,6 @@ def build_threshold_inputs(
             )
         )
 
-    store = Store(
-        id=f"{table_id}-thresholds-store",
-        storage_type="session",
-        data=default_thresholds,
-    )
-
     # Register callbacks for these metrics, pass default_thresholds for reset
     register_normalization_callbacks(
         table_id,
@@ -1213,9 +1189,4 @@ def build_threshold_inputs(
         register_toggle=False,
     )
 
-    return Div(
-        [
-            Div(cells, id=f"{table_id}-threshold-grid", style=container_style),
-            store,
-        ]
-    )
+    return Div([Div(cells, id=f"{table_id}-threshold-grid", style=container_style)])
diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index bff15b7e9..76280fd1c 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -19,6 +19,7 @@
     dcc,
     no_update,
 )
+from dash.dash_table import DataTable
 from dash.exceptions import PreventUpdate
 import pandas as pd
 
@@ -277,6 +278,86 @@ def register_category_table_callbacks(
     model_configs
         Optional configuration metadata for each model.
     """
+
+    @callback(
+        Output(table_id, "data", allow_duplicate=True),
+        Output(table_id, "style_data_conditional", allow_duplicate=True),
+        Input(f"{table_id}-raw-data-store", "data"),
+        State(f"{table_id}-computed-store", "data"),
+        State(f"{table_id}-weight-store", "data"),
+        State(f"{table_id}-thresholds-store", "data"),
+        State(f"{table_id}-normalized-toggle", "value"),
+        State("selected-models-store", "data"),
+        State("cmap-store", "data"),
+        State(f"{table_id}-raw-tooltip-store", "data"),
+        State(table_id, "columns"),
+        prevent_initial_call=True,
+        optional=True,
+    )
+    def update_table_from_store(
+        stored_raw_data: list[dict] | None,
+        stored_computed_data: list[dict] | None,
+        weights: dict[str, float] | None,
+        thresholds: dict | None,
+        toggle_value: list[str] | None,
+        selected_models: list[str] | None,
+        cmap_name: str | None,
+        raw_tooltips: dict[str, str] | None,
+        current_columns: list[dict] | None,
+    ) -> tuple[list[dict], list[dict]]:
+        """
+        Refresh the visible table rows and styling when the raw data store changes.
+
+        Parameters
+        ----------
+        stored_raw_data
+            Stored raw table data.
+        stored_computed_data
+            Stored computed table data.
+        weights
+            Stored weights for the table. Currently unused.
+        thresholds
+            Stored thresholds for the table.
+        toggle_value
+            Value of toggle to show normalised values.
+        selected_models
+            List of model names currently selected in the model filter.
+        cmap_name
+            Colourmap name from the cmap store. Falls back to `"viridis_r"`.
+        raw_tooltips
+            Stored raw tooltip text for the table. Currently unused.
+        current_columns
+            Current table columns. Currently unused.
+
+        Returns
+        -------
+        tuple[list[dict], list[dict]]
+            Filtered rows for the visible table and matching conditional styles.
+        """
+        display_rows = get_scores(
+            stored_raw_data, stored_computed_data, thresholds, toggle_value
+        )
+        scored_rows = calc_metric_scores(stored_raw_data, thresholds=thresholds)
+        filtered_rows = filter_rows_by_models(display_rows, selected_models)
+        filtered_scores = filter_rows_by_models(scored_rows, selected_models)
+        style = (
+            get_table_style(
+                filtered_rows,
+                scored_data=filtered_scores,
+                cmap_name=cmap_name or "viridis_r",
+            )
+            if filtered_rows
+            else []
+        )
+        style, _ = apply_level_of_theory_warnings(
+            filtered_rows,
+            style,
+            model_levels=model_levels,
+            metric_levels=metric_levels,
+            model_configs=model_configs,
+        )
+        return filtered_rows, style
+
     # Benchmark tables
     if use_thresholds:
 
@@ -287,7 +368,7 @@ def register_category_table_callbacks(
             Output(table_id, "columns", allow_duplicate=True),
             Output(table_id, "tooltip_header", allow_duplicate=True),
             Output(f"{table_id}-computed-store", "data", allow_duplicate=True),
-            Output(f"{table_id}-raw-data-store", "data"),
+            Output(f"{table_id}-raw-data-store", "data", allow_duplicate=True),
             Input(f"{table_id}-weight-store", "data"),
             Input(f"{table_id}-thresholds-store", "data"),
             Input("app-location", "pathname"),
@@ -299,6 +380,7 @@ def register_category_table_callbacks(
             State(f"{table_id}-raw-tooltip-store", "data"),
             State(table_id, "columns"),
             prevent_initial_call="initial_duplicate",
+            optional=True,
         )
         def update_benchmark_table_scores(
             stored_weights: dict[str, float] | None,
@@ -593,83 +675,99 @@ def update_scores_store(
 
 
 def register_benchmark_to_category_callback(
-    benchmark_table_id: str,
-    category_table_id: str,
-    benchmark_column: str,
-    use_threshold_store: bool = False,
-    model_name_map: dict[str, str] | None = None,
+    all_tables: dict[str, dict[str, DataTable]], category_to_title: dict[str, str]
 ) -> None:
     """
     Propagate a benchmark table's Score into its category summary table column.
 
     Parameters
     ----------
-    benchmark_table_id
-        ID of the benchmark test table (e.g., "OC157-table").
-    category_table_id
-        ID of the category summary table (e.g., "Surfaces-summary-table").
-    benchmark_column
-        Column name in the category summary table corresponding to the benchmark.
-    use_threshold_store
-        Whether the benchmark table exposes a normalization store for metrics.
-    model_name_map
-        Optional mapping of displayed benchmark MLIP names -> original model names.
+    all_tables
+        Tables for all tests, grouped by category.
+    category_to_title
+        Dictionary mapping category directory names to their display titles/table IDs.
     """
-    _ = use_threshold_store  # cached rows handle normalization
-    # flag kept for compatibility with existing call sites
-    name_map = dict(model_name_map or {})
+    all_info = {}
+    for category, tables in all_tables.items():
+        all_info[category] = {}
+        for test_name, benchmark_table in tables.items():
+            all_info[category][test_name] = {
+                "benchmark_table_id": benchmark_table.id,
+                "benchmark_column": test_name + " Score",
+                "model_name_map": getattr(benchmark_table, "model_name_map", {}),
+            }
+
+    outputs = []
+    inputs = []
+    for category, category_info in sorted(all_info.items()):
+        category_table_id = f"{category_to_title[category]}-summary-table"
+        outputs.append(
+            Output(f"{category_table_id}-computed-store", "data", allow_duplicate=True)
+        )
 
-    @callback(
-        Output(f"{category_table_id}-computed-store", "data", allow_duplicate=True),
-        Input(f"{benchmark_table_id}-computed-store", "data"),
-        State(f"{category_table_id}-weight-store", "data"),
-        State(f"{category_table_id}-computed-store", "data"),
-        prevent_initial_call=True,
-    )
-    def update_category_from_benchmark(
-        benchmark_computed_store: list[dict] | None,
-        category_weights: dict[str, float] | None,
-        category_computed_store: list[dict] | None,
-    ) -> list[dict]:
+        inputs.extend(
+            [
+                State(f"{category_table_id}-weight-store", "data"),
+                State(f"{category_table_id}-computed-store", "data"),
+            ]
+        )
+        inputs.extend(
+            [
+                Input(f"{table_info['benchmark_table_id']}-computed-store", "data")
+                for _, table_info in sorted(category_info.items())
+            ]
+        )
+
+    @callback(outputs, inputs, prevent_initial_call=True)
+    def update_category_from_benchmark(*args) -> list[list[dict]]:
         """
-        Update cached category summary rows from a benchmark's cached scores.
+        Update cached category summary rows from all benchmarks' cached scores.
 
         Parameters
         ----------
-        benchmark_computed_store
-            Latest scored benchmark rows emitted by the benchmark table.
-        category_weights
-            Stored weights for the category summary metrics.
-        category_computed_store
-            Cached scored rows for the category summary.
+        *args
+            States and Inputs for all category summary tables and benchmark tables.
+            Ordered by category. For each category, the weights, computed store, and
+            benchmark computed stores are listed sequentially.
 
         Returns
         -------
-        list[dict]
-            Refreshed cached rows for the category summary table.
+        list[list[dict]]
+            Refreshed cached rows for each category summary table.
         """
-        if not category_computed_store:
-            raise PreventUpdate
-        if not benchmark_computed_store:
-            raise PreventUpdate
-        category_rows = deepcopy(category_computed_store)
+        # Rebuild inputs for each category
+        iterator = iter(args)
+
+        all_category_rows = []
+
+        for category, category_info in sorted(all_info.items()):
+            category_weights = next(iterator)
+            category_rows = deepcopy(next(iterator))
+
+            for test_name, table_info in sorted(category_info.items()):
+                benchmark_rows = deepcopy(next(iterator))
+                name_map = table_info["model_name_map"]
+
+                benchmark_scores = {}
+                for row in benchmark_rows:
+                    display_name = row.get("MLIP")
+                    original_name = name_map.get(display_name, display_name)
+                    score = row.get("Score")
+                    if display_name is None or original_name is None:
+                        continue
+                    benchmark_scores[original_name] = score
 
-        benchmark_scores: dict[str, float] = {}
-        for row in benchmark_computed_store:
-            display_name = row.get("MLIP")
-            original_name = name_map.get(display_name, display_name)
-            score = row.get("Score")
-            if display_name is None or original_name is None or score is None:
-                continue
-            benchmark_scores[original_name] = score
+                for row in category_rows:
+                    mlip = row.get("MLIP")
+                    if mlip in benchmark_scores:
+                        row[all_info[category][test_name]["benchmark_column"]] = (
+                            benchmark_scores[mlip]
+                        )
 
-        for row in category_rows:
-            mlip = row.get("MLIP")
-            if mlip in benchmark_scores:
-                row[benchmark_column] = benchmark_scores[mlip]
+            category_rows, _ = update_score_style(category_rows, category_weights)
+            all_category_rows.append(category_rows)
 
-        category_rows, _ = update_score_style(category_rows, category_weights)
-        return category_rows
+        return all_category_rows
 
 
 def register_weight_callbacks(
@@ -920,6 +1018,7 @@ def sync_threshold_input_styles(
             State(f"{table_id}", "columns"),
             State("cmap-store", "data"),
             prevent_initial_call=True,
+            optional=True,
         )
         def toggle_normalized_display(
             show_normalized: list[str] | None,

From 7c5fdd2c7add4103829fe5fe9243a95ad1042e0f Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Fri, 22 May 2026 15:25:57 +0100
Subject: [PATCH 02/18] Remove callback function call

---
 ml_peg/app/build_app.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/ml_peg/app/build_app.py b/ml_peg/app/build_app.py
index 9858f74e5..7cb37d7aa 100644
--- a/ml_peg/app/build_app.py
+++ b/ml_peg/app/build_app.py
@@ -25,10 +25,7 @@
     build_tutorial_button,
     register_onboarding_callbacks,
 )
-from ml_peg.app.utils.register_callbacks import (
-    register_benchmark_to_category_callback,
-    register_filter_tables_callback,
-)
+from ml_peg.app.utils.register_callbacks import register_benchmark_to_category_callback
 from ml_peg.app.utils.utils import (
     build_level_of_theory_warnings,
     get_framework_config,
@@ -1279,8 +1276,6 @@ def build_full_app(full_app: Dash, category: str = "*") -> None:
     if not all_layouts:
         raise ValueError("No tests were built successfully")
 
-    register_filter_tables_callback(all_apps)
-
     # Combine tests into categories and create category summary
     cat_views, cat_tables, cat_weights, framework_ids = build_category(
         all_layouts, all_tables, all_frameworks

From 802dc18ed56cdbd13d189334606a062ce5648837 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Fri, 22 May 2026 17:15:38 +0100
Subject: [PATCH 03/18] Fix None/NaN scores

---
 ml_peg/analysis/utils/utils.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/ml_peg/analysis/utils/utils.py b/ml_peg/analysis/utils/utils.py
index bddffba04..dc264490d 100644
--- a/ml_peg/analysis/utils/utils.py
+++ b/ml_peg/analysis/utils/utils.py
@@ -465,11 +465,14 @@ def calc_table_scores(
             # Strict mode: require all metrics to be present
             metrics_row["Score"] = None
         elif scores_list:
-            # Calculate weighted average of available metrics
-            try:
-                metrics_row["Score"] = np.average(scores_list, weights=weights_list)
-            except ZeroDivisionError:
-                metrics_row["Score"] = np.mean(scores_list)
+            if np.nan in scores_list:
+                metrics_row["Score"] = np.nan
+            else:
+                # Calculate weighted average of available metrics
+                try:
+                    metrics_row["Score"] = np.average(scores_list, weights=weights_list)
+                except ZeroDivisionError:
+                    metrics_row["Score"] = np.mean(scores_list)
         else:
             metrics_row["Score"] = None
 
@@ -726,9 +729,9 @@ def normalize_metric(
     try:
         # Handle NaNs robustly
         if np.isnan([value, good_threshold, bad_threshold]).any():
-            return None
+            return np.nan
     except TypeError:
-        return None
+        return np.nan
 
     if good_threshold == bad_threshold:
         return 1.0 if value == good_threshold else 0.0

From 5614289fe08d2f418fa8beda6aec198075d34c82 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Tue, 26 May 2026 19:30:44 +0100
Subject: [PATCH 04/18] Fix None scores

---
 ml_peg/analysis/utils/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ml_peg/analysis/utils/utils.py b/ml_peg/analysis/utils/utils.py
index dc264490d..5cfc184ac 100644
--- a/ml_peg/analysis/utils/utils.py
+++ b/ml_peg/analysis/utils/utils.py
@@ -453,8 +453,8 @@ def calc_table_scores(
                 # Weight of zero excludes the metric from scoring requirements
                 continue
 
-            if value is not None:
-                scores_list.append(scores_row[key])
+            if value is not None and (score := scores_row.get(key)) is not None:
+                scores_list.append(score)
                 weights_list.append(weight)
             else:
                 # Track if any (weighted) metric is missing

From b5afb7e0807dbad14b6e476e70e55a0e39e05992 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Tue, 26 May 2026 19:31:14 +0100
Subject: [PATCH 05/18] Fix very large numbers stored

---
 ml_peg/app/utils/load.py  |  3 +++
 ml_peg/app/utils/utils.py | 24 ++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/ml_peg/app/utils/load.py b/ml_peg/app/utils/load.py
index 225da0758..b62667f32 100644
--- a/ml_peg/app/utils/load.py
+++ b/ml_peg/app/utils/load.py
@@ -15,6 +15,7 @@
 from ml_peg.app.utils.utils import (
     build_level_of_theory_warnings,
     calculate_column_widths,
+    clean_table_data,
     clean_thresholds,
     clean_weights,
     is_numeric_column,
@@ -53,6 +54,8 @@ def rebuild_table(
         table_json = json.load(f)
 
     data = table_json["data"]
+    data = clean_table_data(data)
+
     columns = table_json["columns"]
     model_name_map = dict(table_json.get("model_name_map") or {})
     thresholds = clean_thresholds(table_json.get("thresholds"))
diff --git a/ml_peg/app/utils/utils.py b/ml_peg/app/utils/utils.py
index 79540716c..8137f75d6 100644
--- a/ml_peg/app/utils/utils.py
+++ b/ml_peg/app/utils/utils.py
@@ -11,6 +11,7 @@
 
 import dash.dash_table.Format as TableFormat
 from matplotlib import colormaps
+import numpy as np
 import yaml
 
 from ml_peg.models import MODELS_ROOT
@@ -314,6 +315,29 @@ def clean_weights(raw_weights: dict[str, float] | None) -> dict[str, float]:
     return weights
 
 
+def clean_table_data(rows: list[dict]) -> list[dict]:
+    """
+    Replace numeric values outside the int64 range with NaN.
+
+    Parameters
+    ----------
+    rows
+        List of table rows to clean.
+
+    Returns
+    -------
+    list[dict]
+        Rows, modified in place, with values outside the int64 range set to NaN.
+    """
+    for row in rows:
+        for key, value in row.items():
+            if isinstance(value, int | float) and (
+                value > np.iinfo(np.int64).max or value < np.iinfo(np.int64).min
+            ):
+                row[key] = np.nan
+    return rows
+
+
 def filter_rows_by_models(
     rows: list[dict] | None,
     selected_models: Sequence[str] | None,

From 3e44b0a0533ed2edb6c802ccab691f6f862226ca Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Tue, 26 May 2026 19:31:43 +0100
Subject: [PATCH 06/18] Make callbacks optional

---
 ml_peg/app/utils/register_callbacks.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index 76280fd1c..45b11d90c 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -207,6 +207,7 @@ def update_summary_computed_store(
         Input("app-location", "pathname"),
         Input("cmap-store", "data"),
         prevent_initial_call="initial_duplicate",
+        optional=True,
     )
     def sync_summary_table(
         selected_models: list[str] | None,
@@ -506,6 +507,7 @@ def update_benchmark_table_scores(
                 current_columns, thresholds, show_normalized
             )
             tooltips = format_tooltip_headers(raw_tooltips, thresholds, show_normalized)
+
             return (
                 filtered_rows,
                 style,
@@ -530,6 +532,7 @@ def update_benchmark_table_scores(
             State(table_id, "data"),
             State(f"{table_id}-computed-store", "data"),
             prevent_initial_call="initial_duplicate",
+            optional=True,
         )
         def update_table_scores(
             stored_weights: dict[str, float] | None,
@@ -589,6 +592,7 @@ def update_table_scores(
             Input("app-location", "pathname"),
             Input("cmap-store", "data"),
             prevent_initial_call="initial_duplicate",
+            optional=True,
         )
         def sync_table_from_computed_store(
             computed_store: list[dict] | None,
@@ -796,6 +800,7 @@ def register_weight_callbacks(
         Input(f"{table_id}-reset-button", "n_clicks"),
         State(f"{table_id}-weight-store", "data"),
         prevent_initial_call=True,
+        optional=True,
     )
     def store_input_value(
         input_weight: float | None,
@@ -839,6 +844,7 @@ def store_input_value(
         Input(f"{table_id}-weight-store", "data"),
         Input("app-location", "pathname"),
         prevent_initial_call="initial_duplicate",
+        optional=True,
     )
     def sync_inputs(stored_weights: dict[str, float], _pathname: str) -> float:
         """
@@ -893,6 +899,7 @@ def register_normalization_callbacks(
             Input(f"{table_id}-reset-thresholds-button", "n_clicks"),
             State(f"{table_id}-thresholds-store", "data"),
             prevent_initial_call=True,
+            optional=True,
         )
         def store_threshold_values(
             good_val, bad_val, n_clicks, stored_thresholds, metric=metric
@@ -976,6 +983,7 @@ def store_threshold_values(
             *threshold_style_outputs,
             Input("cmap-store", "data"),
             prevent_initial_call=False,
+            optional=True,
         )
         def sync_threshold_input_styles(
             cmap_name: str | None,
@@ -1061,6 +1069,7 @@ def toggle_normalized_display(
             Output(f"{table_id}-{metric}-bad-threshold", "value"),
             Input(f"{table_id}-thresholds-store", "data"),
             prevent_initial_call=True,
+            optional=True,
         )
         def sync_threshold_inputs(thresholds, metric=metric):
             """Sync threshold input values with stored thresholds."""
@@ -1112,6 +1121,7 @@ def register_image_download_callbacks() -> None:
         State({"type": "image-download-format", "index": MATCH}, "value"),
         State({"type": "image-download-target", "index": MATCH}, "data"),
         prevent_initial_call=True,
+        optional=True,
     )
     def _download_image(n_clicks, fmt, uris):
         """
@@ -1169,6 +1179,7 @@ def register_download_callbacks(table_id: str) -> None:
         State(table_id, "data"),
         State(table_id, "columns"),
         prevent_initial_call=True,
+        optional=True,
     )
     def download_table(
         n_clicks: int,
@@ -1243,4 +1254,5 @@ def download_table(
         Output(f"{table_id}-download", "data", allow_duplicate=True),
         Input(f"{table_id}-download-request", "data"),
         prevent_initial_call=True,
+        optional=True,
     )

From d5133baf83211da56eabebddbcf6382edfdbd51a Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Tue, 26 May 2026 20:42:18 +0100
Subject: [PATCH 07/18] Short-circuit summary update

---
 ml_peg/app/utils/register_callbacks.py | 43 +++++++++++++++++---------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index 45b11d90c..813597a59 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -703,7 +703,10 @@ def register_benchmark_to_category_callback(
 
     outputs = []
     inputs = []
-    for category, category_info in sorted(all_info.items()):
+
+    category_order = sorted(all_info)
+    for category in category_order:
+        category_info = all_info[category]
         category_table_id = f"{category_to_title[category]}-summary-table"
         outputs.append(
             Output(f"{category_table_id}-computed-store", "data", allow_duplicate=True)
@@ -739,20 +742,23 @@ def update_category_from_benchmark(*args) -> list[list[dict]]:
         list[list[dict]]
             Refreshed cached rows for each category summary table.
         """
-        # Rebuild inputs for each category
+        trigger_id = ctx.triggered_id
         iterator = iter(args)
+        all_category_rows = [no_update for _ in category_order]
 
-        all_category_rows = []
-
-        for category, category_info in sorted(all_info.items()):
+        for category_index, category in enumerate(category_order):
+            category_info = all_info[category]
             category_weights = next(iterator)
-            category_rows = deepcopy(next(iterator))
+            category_rows_source = next(iterator)
 
-            for test_name, table_info in sorted(category_info.items()):
-                benchmark_rows = deepcopy(next(iterator))
-                name_map = table_info["model_name_map"]
+            for _test_name, table_info in sorted(category_info.items()):
+                benchmark_rows = next(iterator)
+                if f"{table_info['benchmark_table_id']}-computed-store" != trigger_id:
+                    continue
 
+                name_map = table_info["model_name_map"]
                 benchmark_scores = {}
+
                 for row in benchmark_rows:
                     display_name = row.get("MLIP")
                     original_name = name_map.get(display_name, display_name)
@@ -761,15 +767,24 @@ def update_category_from_benchmark(*args) -> list[list[dict]]:
                         continue
                     benchmark_scores[original_name] = score
 
+                category_rows = deepcopy(category_rows_source)
+                benchmark_column = table_info["benchmark_column"]
+                rows_updated = False
+
                 for row in category_rows:
                     mlip = row.get("MLIP")
                     if mlip in benchmark_scores:
-                        row[all_info[category][test_name]["benchmark_column"]] = (
-                            benchmark_scores[mlip]
-                        )
+                        new_score = benchmark_scores[mlip]
+                        if row.get(benchmark_column) != new_score:
+                            row[benchmark_column] = new_score
+                            rows_updated = True
+
+                if not rows_updated:
+                    break
 
-            category_rows, _ = update_score_style(category_rows, category_weights)
-            all_category_rows.append(category_rows)
+                category_rows, _ = update_score_style(category_rows, category_weights)
+                all_category_rows[category_index] = category_rows
+                break
 
         return all_category_rows
 

From d28fa70daa7a04581a50f65502ef617136f2dbf5 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Tue, 26 May 2026 21:03:34 +0100
Subject: [PATCH 08/18] Revert "Short-circuit summary update"

This reverts commit d5133baf83211da56eabebddbcf6382edfdbd51a.
---
 ml_peg/app/utils/register_callbacks.py | 43 +++++++++-----------------
 1 file changed, 14 insertions(+), 29 deletions(-)

diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index 813597a59..45b11d90c 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -703,10 +703,7 @@ def register_benchmark_to_category_callback(
 
     outputs = []
     inputs = []
-
-    category_order = sorted(all_info)
-    for category in category_order:
-        category_info = all_info[category]
+    for category, category_info in sorted(all_info.items()):
         category_table_id = f"{category_to_title[category]}-summary-table"
         outputs.append(
             Output(f"{category_table_id}-computed-store", "data", allow_duplicate=True)
@@ -742,23 +739,20 @@ def update_category_from_benchmark(*args) -> list[list[dict]]:
         list[list[dict]]
             Refreshed cached rows for each category summary table.
         """
-        trigger_id = ctx.triggered_id
+        # Rebuild inputs for each category
         iterator = iter(args)
-        all_category_rows = [no_update for _ in category_order]
 
-        for category_index, category in enumerate(category_order):
-            category_info = all_info[category]
-            category_weights = next(iterator)
-            category_rows_source = next(iterator)
+        all_category_rows = []
 
-            for _test_name, table_info in sorted(category_info.items()):
-                benchmark_rows = next(iterator)
-                if f"{table_info['benchmark_table_id']}-computed-store" != trigger_id:
-                    continue
+        for category, category_info in sorted(all_info.items()):
+            category_weights = next(iterator)
+            category_rows = deepcopy(next(iterator))
 
+            for test_name, table_info in sorted(category_info.items()):
+                benchmark_rows = deepcopy(next(iterator))
                 name_map = table_info["model_name_map"]
-                benchmark_scores = {}
 
+                benchmark_scores = {}
                 for row in benchmark_rows:
                     display_name = row.get("MLIP")
                     original_name = name_map.get(display_name, display_name)
@@ -767,24 +761,15 @@ def update_category_from_benchmark(*args) -> list[list[dict]]:
                         continue
                     benchmark_scores[original_name] = score
 
-                category_rows = deepcopy(category_rows_source)
-                benchmark_column = table_info["benchmark_column"]
-                rows_updated = False
-
                 for row in category_rows:
                     mlip = row.get("MLIP")
                     if mlip in benchmark_scores:
-                        new_score = benchmark_scores[mlip]
-                        if row.get(benchmark_column) != new_score:
-                            row[benchmark_column] = new_score
-                            rows_updated = True
-
-                if not rows_updated:
-                    break
+                        row[all_info[category][test_name]["benchmark_column"]] = (
+                            benchmark_scores[mlip]
+                        )
 
-                category_rows, _ = update_score_style(category_rows, category_weights)
-                all_category_rows[category_index] = category_rows
-                break
+            category_rows, _ = update_score_style(category_rows, category_weights)
+            all_category_rows.append(category_rows)
 
         return all_category_rows
 

From bee47c4d224163999f0628220a21a06c4cf89f94 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Tue, 26 May 2026 21:33:04 +0100
Subject: [PATCH 09/18] Improve efficiency of updates

---
 ml_peg/app/utils/register_callbacks.py | 31 +++++++++++++-------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index 45b11d90c..eefce2715 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -744,32 +744,31 @@ def update_category_from_benchmark(*args) -> list[list[dict]]:
 
         all_category_rows = []
 
-        for category, category_info in sorted(all_info.items()):
+        for _category, category_info in sorted(all_info.items()):
             category_weights = next(iterator)
-            category_rows = deepcopy(next(iterator))
+            current_rows = next(iterator)
+            new_rows = {row["MLIP"]: {"MLIP": row["MLIP"]} for row in current_rows}
 
-            for test_name, table_info in sorted(category_info.items()):
-                benchmark_rows = deepcopy(next(iterator))
+            for _test_name, table_info in sorted(category_info.items()):
+                benchmark_rows = next(iterator)
                 name_map = table_info["model_name_map"]
 
-                benchmark_scores = {}
+                benchmark_column = table_info["benchmark_column"]
                 for row in benchmark_rows:
                     display_name = row.get("MLIP")
                     original_name = name_map.get(display_name, display_name)
-                    score = row.get("Score")
-                    if display_name is None or original_name is None:
+                    if original_name is None:
                         continue
-                    benchmark_scores[original_name] = score
 
-                for row in category_rows:
-                    mlip = row.get("MLIP")
-                    if mlip in benchmark_scores:
-                        row[all_info[category][test_name]["benchmark_column"]] = (
-                            benchmark_scores[mlip]
-                        )
+                    if original_name in new_rows:
+                        new_rows[original_name][benchmark_column] = row.get("Score")
 
-            category_rows, _ = update_score_style(category_rows, category_weights)
-            all_category_rows.append(category_rows)
+            new_rows = list(new_rows.values())
+            new_rows, _ = update_score_style(new_rows, category_weights)
+            if new_rows == current_rows:
+                all_category_rows.append(no_update)
+            else:
+                all_category_rows.append(new_rows)
 
         return all_category_rows
 

From 30eab2f200210715b7cff71a6e1742213c0b99a2 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Wed, 27 May 2026 01:05:18 +0100
Subject: [PATCH 10/18] Simplify score calculations

---
 ml_peg/analysis/utils/utils.py | 105 ++++++++++++++++++++-------------
 1 file changed, 64 insertions(+), 41 deletions(-)

diff --git a/ml_peg/analysis/utils/utils.py b/ml_peg/analysis/utils/utils.py
index 5cfc184ac..87202116e 100644
--- a/ml_peg/analysis/utils/utils.py
+++ b/ml_peg/analysis/utils/utils.py
@@ -386,17 +386,31 @@ def calc_metric_scores(
     normalizer = normalizer if normalizer is not None else normalize_metric
     cleaned_thresholds = clean_thresholds(thresholds) if thresholds else None
 
-    metrics_scores = [row.copy() for row in metrics_data]
-    for row in metrics_scores:
-        for key, value in row.items():
-            # Value may be ``None`` if missing for a benchmark
-            if key not in {"MLIP", "Score", "id"} and value is not None:
-                if cleaned_thresholds is None or key not in cleaned_thresholds:
-                    row[key] = value
-                    continue
-
-                entry = cleaned_thresholds[key]
-                row[key] = normalizer(value, entry["good"], entry["bad"])
+    if cleaned_thresholds is None:
+        return metrics_data
+
+    metric_columns = [
+        key for key in metrics_data[0] if key not in {"MLIP", "Score", "id"}
+    ]
+    threshold_lookup = {
+        key: (entry["good"], entry["bad"]) for key, entry in cleaned_thresholds.items()
+    }
+
+    metrics_scores = []
+    for row in metrics_data:
+        new_row = row.copy()
+
+        for key in metric_columns:
+            if (value := row.get(key)) is None:
+                continue
+
+            if (thresholds_entry := threshold_lookup.get(key)) is None:
+                continue
+
+            good, bad = thresholds_entry
+            new_row[key] = normalizer(value, good, bad)
+
+        metrics_scores.append(new_row)
 
     return metrics_scores
 
@@ -407,7 +421,8 @@ def calc_table_scores(
     thresholds: Thresholds | None = None,
     normalizer: Callable[[float, float, float], float] | None = None,
     require_all_metrics: bool = True,
-) -> list[MetricRow]:
+    return_scores: bool = False,
+) -> list[MetricRow] | tuple[list[MetricRow], list[MetricRow]]:
     """
     Calculate (normalised) score for each model and add to table data.
 
@@ -429,53 +444,62 @@ def calc_table_scores(
         If True, score is set to None unless all metrics are present (not None).
         If False, score is calculated from available metrics only.
         Default is True.
+    return_scores
+        If True, also return the normalised metric rows used to calculate scores.
+        Default is False.
 
     Returns
     -------
-    list[MetricRow]
-        Rows of data with combined score for each model added.
+    list[MetricRow] | tuple[list[MetricRow], list[MetricRow]]
+        Rows of data with combined score for each model added. If `return_scores` is
+        `True`, the normalised metric rows are also returned.
     """
     weights = weights if weights else {}
 
     metrics_scores = calc_metric_scores(metrics_data, thresholds, normalizer)
 
+    metric_columns = [
+        key for key in metrics_data[0] if key not in {"MLIP", "Score", "id"}
+    ]
+    metric_weights = {key: weights.get(key, 1.0) for key in metric_columns}
+
     for metrics_row, scores_row in zip(metrics_data, metrics_scores, strict=True):
-        scores_list = []
-        weights_list = []
+        weighted_sum = 0.0
+        weight_sum = 0.0
+
         all_metrics_present = True
+        contains_nan = False
 
-        for key, value in metrics_row.items():
-            if key in {"MLIP", "Score", "id"}:
+        for key in metric_columns:
+            if (weight := metric_weights[key]) == 0:
                 continue
 
-            weight = weights.get(key, 1.0)
-            if weight == 0:
-                # Weight of zero excludes the metric from scoring requirements
-                continue
+            value = metrics_row.get(key)
+            score = scores_row.get(key)
 
-            if value is not None and (score := scores_row.get(key)) is not None:
-                scores_list.append(score)
-                weights_list.append(weight)
-            else:
-                # Track if any (weighted) metric is missing
+            if value is None or score is None:
                 all_metrics_present = False
+                continue
 
-        # Calculate score only if conditions are met
-        if require_all_metrics and not all_metrics_present:
-            # Strict mode: require all metrics to be present
+            if isinstance(score, float) and np.isnan(score):
+                contains_nan = True
+                break
+
+            weighted_sum += score * weight
+            weight_sum += weight
+
+        if contains_nan:
+            metrics_row["Score"] = np.nan
+        elif require_all_metrics and not all_metrics_present:
             metrics_row["Score"] = None
-        elif scores_list:
-            if np.nan in scores_list:
-                metrics_row["Score"] = np.nan
-            else:
-                # Calculate weighted average of available metrics
-                try:
-                    metrics_row["Score"] = np.average(scores_list, weights=weights_list)
-                except ZeroDivisionError:
-                    metrics_row["Score"] = np.mean(scores_list)
+        elif weight_sum > 0:
+            metrics_row["Score"] = weighted_sum / weight_sum
         else:
             metrics_row["Score"] = None
 
+    if return_scores:
+        return metrics_data, metrics_scores
+
     return metrics_data
 
 
@@ -690,8 +714,7 @@ def update_score_style(
         Updated table rows and style data.
     """
     weights = clean_weights(weights)
-    data = calc_table_scores(data, weights, thresholds)
-    scored_data = calc_metric_scores(data, thresholds)
+    data, scored_data = calc_table_scores(data, weights, thresholds, return_scores=True)
     style = get_table_style(data, scored_data=scored_data)
     return data, style
 

From 18d697210cd285529a111e0412be0f215b60fbd6 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Wed, 27 May 2026 01:54:41 +0100
Subject: [PATCH 11/18] Fix setting input boxes on page changes

---
 ml_peg/app/utils/register_callbacks.py | 27 +++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index eefce2715..900054c09 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -814,7 +814,7 @@ def store_input_value(
         input_weight
             Weight value from input box.
         n_clicks
-            Number of clicks. Variable unused, but Input is required to reset weights.
+            Number of clicks.
         stored_weights
             Stored weights dictionary.
 
@@ -829,7 +829,7 @@ def store_input_value(
             if input_weight is None:
                 raise PreventUpdate
             stored_weights[column] = input_weight
-        elif trigger_id == f"{table_id}-reset-button":
+        elif trigger_id == f"{table_id}-reset-button" and n_clicks > 0:
             stored_weights.update(
                 (key, default_weights.get(key, 1.0)) for key in stored_weights
             )
@@ -909,6 +909,8 @@ def store_threshold_values(
 
             # Reset to defaults is specified via reset button
             if trigger_id == f"{table_id}-reset-thresholds-button":
+                if not n_clicks:
+                    raise PreventUpdate
                 if cleaned_defaults:
                     return deepcopy(cleaned_defaults)
                 return cleaned_store
@@ -1067,11 +1069,26 @@ def toggle_normalized_display(
             Output(f"{table_id}-{metric}-good-threshold", "value"),
             Output(f"{table_id}-{metric}-bad-threshold", "value"),
             Input(f"{table_id}-thresholds-store", "data"),
-            prevent_initial_call=True,
+            Input("app-location", "pathname"),
+            # NOTE(review): prevent_initial_call=True disabled; confirm intended.
             optional=True,
         )
-        def sync_threshold_inputs(thresholds, metric=metric):
-            """Sync threshold input values with stored thresholds."""
+        def sync_threshold_inputs(
+            thresholds: Thresholds | None, _pathname: str, metric: str = metric
+        ) -> tuple[float | None, float | None]:
+            """
+            Sync threshold input values with stored thresholds.
+
+            Parameters
+            ----------
+            thresholds
+                Stored threshold values.
+            _pathname
+                Current pathname. Variable unused, but required as input to trigger on
+                path change.
+            metric
+                Metric name corresponding to the threshold inputs.
+            """
             cleaned_thresholds = clean_thresholds(thresholds)
             if cleaned_thresholds and metric in cleaned_thresholds:
                 entry = cleaned_thresholds[metric]

From 9632cc054cf6bc7282b9c94c2892d331ceda734d Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Wed, 27 May 2026 15:10:18 +0100
Subject: [PATCH 12/18] Allow missing metrics data

---
 ml_peg/analysis/utils/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ml_peg/analysis/utils/utils.py b/ml_peg/analysis/utils/utils.py
index 87202116e..f1c5a2209 100644
--- a/ml_peg/analysis/utils/utils.py
+++ b/ml_peg/analysis/utils/utils.py
@@ -386,7 +386,7 @@ def calc_metric_scores(
     normalizer = normalizer if normalizer is not None else normalize_metric
     cleaned_thresholds = clean_thresholds(thresholds) if thresholds else None
 
-    if cleaned_thresholds is None:
+    if cleaned_thresholds is None or not metrics_data:
         return metrics_data
 
     metric_columns = [
@@ -458,6 +458,9 @@ def calc_table_scores(
 
     metrics_scores = calc_metric_scores(metrics_data, thresholds, normalizer)
 
+    if not metrics_data:
+        return metrics_data if not return_scores else (metrics_data, metrics_scores)
+
     metric_columns = [
         key for key in metrics_data[0] if key not in {"MLIP", "Score", "id"}
     ]

From f8b69491c80f9236ecedc2214194ce1192ddb3c0 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Wed, 27 May 2026 15:34:05 +0100
Subject: [PATCH 13/18] Remove duplicate stores

---
 ml_peg/app/utils/build_components.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/ml_peg/app/utils/build_components.py b/ml_peg/app/utils/build_components.py
index 49f604d1b..d5dba5742 100644
--- a/ml_peg/app/utils/build_components.py
+++ b/ml_peg/app/utils/build_components.py
@@ -851,28 +851,11 @@ def build_test_layout(
         ]
     )
 
-    # dcc.Store renders no HTML, so its position here doesn't affect layout.
-    # Placed before the table so the table and controls can share one wrapper below.
-    layout_contents.append(
-        Store(
-            id=f"{table.id}-computed-store",
-            storage_type="session",
-            data=table.data,
-        )
-    )
-
     reserved = {"MLIP", "Score", "id"}
     metric_columns = [
         col["id"] for col in table.columns if col.get("id") not in reserved
     ]
 
-    layout_contents.append(
-        Store(
-            id=f"{table.id}-raw-data-store",
-            storage_type="session",
-            data=table.data,
-        )
-    )
     layout_contents.append(
         Store(
             id=f"{table.id}-raw-tooltip-store",

From d4022b44160cbc379cc7cb44f345576cb02cb353 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Wed, 27 May 2026 16:38:12 +0100
Subject: [PATCH 14/18] Remove table persistence

---
 ml_peg/app/build_app.py  | 3 ---
 ml_peg/app/utils/load.py | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/ml_peg/app/build_app.py b/ml_peg/app/build_app.py
index 7cb37d7aa..b26f42b5c 100644
--- a/ml_peg/app/build_app.py
+++ b/ml_peg/app/build_app.py
@@ -815,9 +815,6 @@ def build_summary_table(
         tooltip_data=tooltip_rows,
         tooltip_delay=100,
         tooltip_duration=None,
-        persistence=True,
-        persistence_type="session",
-        persisted_props=["data"],
         tooltip_header=tooltip_header,
         editable=False,
         fill_width=False,
diff --git a/ml_peg/app/utils/load.py b/ml_peg/app/utils/load.py
index b62667f32..c6a775ba4 100644
--- a/ml_peg/app/utils/load.py
+++ b/ml_peg/app/utils/load.py
@@ -205,9 +205,6 @@ def rebuild_table(
             }
         ],
         sort_action="native",
-        persistence=True,
-        persistence_type="session",
-        persisted_props=["data"],
         fill_width=False,
     )
 

From 28cb6277324e7a34afbfa1880e8de6bba933d40c Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Wed, 27 May 2026 16:49:17 +0100
Subject: [PATCH 15/18] Add patching to table updates

---
 ml_peg/app/utils/register_callbacks.py | 46 ++++++++++++++++----------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index 900054c09..6aa29de5c 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -12,6 +12,7 @@
     ClientsideFunction,
     Input,
     Output,
+    Patch,
     State,
     callback,
     clientside_callback,
@@ -551,23 +552,15 @@ def update_table_scores(
 
             trigger_id = ctx.triggered_id
 
-            if trigger_id in ("app-location", "cmap-store"):
-                filtered_rows = filter_rows_by_models(source_data, selected_models)
-                style = (
-                    get_table_style(filtered_rows, cmap_name=cmap_name or "viridis_r")
-                    if filtered_rows
-                    else []
-                )
-                style, tooltip_data = apply_level_of_theory_warnings(
-                    filtered_rows,
-                    style,
-                    model_levels=model_levels,
-                    metric_levels=metric_levels,
-                    model_configs=model_configs,
-                )
-                return filtered_rows, style, tooltip_data, source_data
+            # Recompute scores only when weights changed
+            if trigger_id == f"{table_id}-weight-store":
+                scored_rows, _ = update_score_style(source_data, stored_weights)
+                updated_store = scored_rows
+
+            else:
+                scored_rows = source_data
+                updated_store = no_update
 
-            scored_rows, _ = update_score_style(source_data, stored_weights)
             filtered_rows = filter_rows_by_models(scored_rows, selected_models)
             style = (
                 get_table_style(filtered_rows, cmap_name=cmap_name or "viridis_r")
@@ -581,7 +574,26 @@ def update_table_scores(
                 metric_levels=metric_levels,
                 model_configs=model_configs,
             )
-            return filtered_rows, style, tooltip_data, scored_rows
+
+            if not table_data or len(filtered_rows) != len(table_data):
+                return filtered_rows, style, tooltip_data, scored_rows
+
+            patch = Patch()
+            rows_changed = False
+
+            for row_index, (old_row, new_row) in enumerate(
+                zip(table_data, filtered_rows, strict=True)
+            ):
+                for key, new_value in new_row.items():
+                    if old_row.get(key) != new_value:
+                        patch[row_index][key] = new_value
+                        rows_changed = True
+
+            # No cell values changed: keep table data, still refresh style/tooltips
+            if not rows_changed:
+                return no_update, style, tooltip_data, updated_store
+
+            return patch, style, tooltip_data, updated_store
 
         @callback(
             Output(table_id, "data", allow_duplicate=True),

From 3805bcda9c5890e4e2732e8ba5bbc7ea11364495 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Wed, 27 May 2026 17:06:07 +0100
Subject: [PATCH 16/18] Remove app location triggers

---
 ml_peg/app/utils/register_callbacks.py | 32 +++-----------------------
 1 file changed, 3 insertions(+), 29 deletions(-)

diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index 6aa29de5c..232c74170 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -205,7 +205,6 @@ def update_summary_computed_store(
         Output("summary-table", "tooltip_data", allow_duplicate=True),
         Input("selected-models-store", "data"),
         Input("summary-table-computed-store", "data"),
-        Input("app-location", "pathname"),
         Input("cmap-store", "data"),
         prevent_initial_call="initial_duplicate",
         optional=True,
@@ -213,7 +212,6 @@ def update_summary_computed_store(
     def sync_summary_table(
         selected_models: list[str] | None,
         computed_store: list[dict] | None,
-        _pathname: str,
         cmap_name: str | None,
     ) -> tuple[list[dict], list[dict], list[dict]]:
         """
@@ -225,9 +223,6 @@ def sync_summary_table(
             Models currently selected in the global model filter.
         computed_store
             Cached full summary rows for the overall summary table.
-        _pathname
-            Current pathname. Included so the visible table refreshes when the
-            summary page is opened.
         cmap_name
             Matplotlib colormap name from the cmap store.
 
@@ -373,7 +368,6 @@ def update_table_from_store(
             Output(f"{table_id}-raw-data-store", "data", allow_duplicate=True),
             Input(f"{table_id}-weight-store", "data"),
             Input(f"{table_id}-thresholds-store", "data"),
-            Input("app-location", "pathname"),
             Input(f"{table_id}-normalized-toggle", "value"),
             Input("selected-models-store", "data"),
             Input("cmap-store", "data"),
@@ -387,7 +381,6 @@ def update_table_from_store(
         def update_benchmark_table_scores(
             stored_weights: dict[str, float] | None,
             stored_threshold: dict | None,
-            _pathname: str,
             toggle_value: list[str] | None,
             selected_models: list[str] | None,
             cmap_name: str | None,
@@ -413,8 +406,6 @@ def update_benchmark_table_scores(
                 Stored weights dictionary for table metrics.
             stored_threshold
                 Stored thresholds dictionary for table metric thresholds.
-            _pathname
-                Current URL path. Unused, required to trigger on path change.
             toggle_value
                 Value of toggle to show normalised values.
             selected_models
@@ -434,8 +425,7 @@ def update_benchmark_table_scores(
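-            # Page changes and toggle flips reuse the cached scored rows rather than
-            # recalculating scores, we only re-score when weights/thresholds change.
+            # Toggle flips and cmap changes reuse the cached scored rows rather than
+            # recalculating scores; we only re-score when weights/thresholds change.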
             if (
-                trigger_id
-                in ("app-location", f"{table_id}-normalized-toggle", "cmap-store")
+                trigger_id in (f"{table_id}-normalized-toggle", "cmap-store")
                 and stored_computed_data
             ):
                 display_rows = get_scores(
@@ -528,7 +518,6 @@ def update_benchmark_table_scores(
             Output(f"{table_id}-computed-store", "data", allow_duplicate=True),
             Input(f"{table_id}-weight-store", "data"),
             Input("selected-models-store", "data"),
-            Input("app-location", "pathname"),
             Input("cmap-store", "data"),
             State(table_id, "data"),
             State(f"{table_id}-computed-store", "data"),
@@ -538,7 +527,6 @@ def update_benchmark_table_scores(
         def update_table_scores(
             stored_weights: dict[str, float] | None,
             selected_models: list[str] | None,
-            _pathname: str,
             cmap_name: str | None,
             table_data: list[dict] | None,
             computed_store: list[dict] | None,
@@ -601,7 +589,6 @@ def update_table_scores(
             Output(table_id, "tooltip_data", allow_duplicate=True),
             Input(f"{table_id}-computed-store", "data"),
             Input("selected-models-store", "data"),
-            Input("app-location", "pathname"),
             Input("cmap-store", "data"),
             prevent_initial_call="initial_duplicate",
             optional=True,
@@ -609,7 +596,6 @@ def update_table_scores(
         def sync_table_from_computed_store(
             computed_store: list[dict] | None,
             selected_models: list[str] | None,
-            _pathname: str,
             cmap_name: str | None,
         ) -> tuple[list[dict], list[dict], list[dict]]:
             """
@@ -621,9 +607,6 @@ def sync_table_from_computed_store(
                 Cached unfiltered rows for the category summary.
             selected_models
                 Currently selected model names.
-            _pathname
-                Current pathname. Unused, required so the callback hydrates when the
-                category page is mounted.
 
             Returns
             -------
@@ -853,11 +836,10 @@ def store_input_value(
     @callback(
         Output(f"{input_id}-input", "value"),
         Input(f"{table_id}-weight-store", "data"),
-        Input("app-location", "pathname"),
         prevent_initial_call="initial_duplicate",
         optional=True,
     )
-    def sync_inputs(stored_weights: dict[str, float], _pathname: str) -> float:
+    def sync_inputs(stored_weights: dict[str, float]) -> float:
         """
         Sync weight values between the text input and Store.
 
@@ -865,9 +847,6 @@ def sync_inputs(stored_weights: dict[str, float], _pathname: str) -> float:
         ----------
         stored_weights
             Stored weight values for each column.
-        _pathname
-            Current pathname. Variable unused, but required as input to trigger on
-            path change.
 
         Returns
         -------
@@ -1081,12 +1060,10 @@ def toggle_normalized_display(
             Output(f"{table_id}-{metric}-good-threshold", "value"),
             Output(f"{table_id}-{metric}-bad-threshold", "value"),
             Input(f"{table_id}-thresholds-store", "data"),
-            Input("app-location", "pathname"),
-            # prevent_initial_call=True,
             optional=True,
         )
         def sync_threshold_inputs(
-            thresholds: Thresholds | None, _pathname: str, metric: str = metric
+            thresholds: Thresholds | None, metric: str = metric
         ) -> tuple[float | None, float | None]:
             """
             Sync threshold input values with stored thresholds.
@@ -1095,9 +1072,6 @@ def sync_threshold_inputs(
             ----------
             thresholds
                 Stored threshold values.
-            _pathname
-                Current pathname. Variable unused, but required as input to trigger on
-                path change.
             metric
                 Metric name corresponding to the threshold inputs.
             """

From fa9c75d3626a1ca2a09efb9afd21048427d375d2 Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Wed, 27 May 2026 17:45:07 +0100
Subject: [PATCH 17/18] Add patch for category updates

---
 ml_peg/app/utils/register_callbacks.py | 63 ++++++++++++++++++++------
 1 file changed, 50 insertions(+), 13 deletions(-)

diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index 232c74170..1efe6672f 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -736,36 +736,73 @@ def update_category_from_benchmark(*args) -> list[list[dict]]:
         """
         # Rebuild inputs for each category
         iterator = iter(args)
-
-        all_category_rows = []
+        patched_outputs = []
 
         for _category, category_info in sorted(all_info.items()):
             category_weights = next(iterator)
             current_rows = next(iterator)
-            new_rows = {row["MLIP"]: {"MLIP": row["MLIP"]} for row in current_rows}
+
+            updated_rows = []
+            for row in current_rows:
+                updated_row = row.copy()
+                updated_rows.append(updated_row)
+
+            updated_by_mlip = {row["MLIP"]: row for row in updated_rows}
+
+            benchmark_changed = False
 
             for _test_name, table_info in sorted(category_info.items()):
                 benchmark_rows = next(iterator)
-                name_map = table_info["model_name_map"]
 
+                name_map = table_info["model_name_map"]
                 benchmark_column = table_info["benchmark_column"]
+
                 for row in benchmark_rows:
                     display_name = row.get("MLIP")
                     original_name = name_map.get(display_name, display_name)
-                    if original_name is None:
+                    if original_name not in updated_by_mlip:
                         continue
 
-                    if original_name in new_rows:
-                        new_rows[original_name][benchmark_column] = row.get("Score")
+                    new_score = row.get("Score")
+                    target_row = updated_by_mlip[original_name]
+
+                    if target_row.get(benchmark_column) != new_score:
+                        target_row[benchmark_column] = new_score
+                        benchmark_changed = True
+
+            if not benchmark_changed:
+                patched_outputs.append(no_update)
+                continue
+
+            # Recompute overall category scores using existing utility
+            rescored_rows, _ = update_score_style(updated_rows, category_weights)
+
+            patch = Patch()
+            score_changed = False
+
+            for idx, (old_row, new_row) in enumerate(
+                zip(current_rows, rescored_rows, strict=True)
+            ):
+                # Patch benchmark columns
+                for key, value in new_row.items():
+                    if key in {"MLIP", "Score"}:
+                        continue
+
+                    if old_row.get(key) != value:
+                        patch[idx][key] = value
+                        score_changed = True
+
+                # Patch overall score
+                if old_row.get("Score") != new_row.get("Score"):
+                    patch[idx]["Score"] = new_row.get("Score")
+                    score_changed = True
 
-            new_rows = list(new_rows.values())
-            new_rows, _ = update_score_style(new_rows, category_weights)
-            if new_rows == current_rows:
-                all_category_rows.append(no_update)
+            if score_changed:
+                patched_outputs.append(patch)
             else:
-                all_category_rows.append(new_rows)
+                patched_outputs.append(no_update)
 
-        return all_category_rows
+        return patched_outputs
 
 
 def register_weight_callbacks(

From 0e2d629579c287d0dbbd57cb709a987036f5ea1f Mon Sep 17 00:00:00 2001
From: ElliottKasoar <45317199+ElliottKasoar@users.noreply.github.com>
Date: Thu, 28 May 2026 15:36:49 +0100
Subject: [PATCH 18/18] Add patch for summary table updates

---
 ml_peg/app/utils/register_callbacks.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/ml_peg/app/utils/register_callbacks.py b/ml_peg/app/utils/register_callbacks.py
index 1efe6672f..8d2c0d4db 100644
--- a/ml_peg/app/utils/register_callbacks.py
+++ b/ml_peg/app/utils/register_callbacks.py
@@ -658,19 +658,26 @@ def update_scores_store(
         """
         # Only category summary tables should write to the global store
         if not table_id.endswith("-summary-table"):
-            return scores_data
+            raise PreventUpdate
 
         if not computed_rows:
-            return scores_data
+            raise PreventUpdate
+
+        # Category table IDs are of form "[category]-summary-table"
+        category_key = table_id.removesuffix("-summary-table") + " Score"
 
-        if not scores_data:
-            scores_data = {}
-        # Update scores store. Category table IDs are of form "[category]-summary-table"
-        # Table headings are of the form "[category] Score"
-        scores_data[table_id.removesuffix("-summary-table") + " Score"] = {
+        new_scores = {
             row["MLIP"]: row["Score"] for row in computed_rows if row.get("MLIP")
         }
-        return scores_data
+        current_scores = (scores_data or {}).get(category_key)
+
+        if current_scores == new_scores:
+            return no_update
+
+        patch = Patch()
+        patch[category_key] = new_scores
+
+        return patch
 
 
 def register_benchmark_to_category_callback(