From 2af810ba895f1f03b63c1748620103a57858cdf4 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Thu, 14 May 2026 07:29:24 +0000 Subject: [PATCH 1/2] perf: optimize DataFrame iteration and memoize FAQs YAML parsing Replaces slow `iterrows()` calls with `itertuples()` and `to_dict('records')` across several calculation scripts. Adds `@functools.cache` to `build_faqs()` to prevent redundant YAML parsing. Co-authored-by: alinelena <3306823+alinelena@users.noreply.github.com> --- .jules/bolt.md | 12 ++++++++++++ ml_peg/app/utils/build_components.py | 2 ++ .../calcs/bulk_crystal/elasticity/calc_elasticity.py | 2 +- ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py | 6 +++--- .../conformers/solvMPCONF196/calc_solvMPCONF196.py | 6 +++--- ml_peg/calcs/utils/gscdb138.py | 8 ++++---- 6 files changed, 25 insertions(+), 11 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index b140c4a7c..ce597a7d5 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -5,3 +5,15 @@ ## 2024-05-19 - Caching YAML Load for Framework Registry **Learning:** `yaml.safe_load` on `frameworks.yml` within `load_framework_registry()` was taking ~2-3 ms per call and it was repeatedly called for every framework entry via `get_framework_config()`. This was a micro-bottleneck, especially when dealing with lists or multiple frameworks. **Action:** Applied the `@lru_cache` and `deepcopy` pattern successfully again to `load_framework_registry()` and `get_framework_config()` to avoid caching a mutable dictionary directly and avoid repeated YAML I/O parsing. + +## 2024-05-19 - Pandas Iteration Bottlenecks +**Learning:** `iterrows()` is consistently used across the codebase (e.g., `calc_solvMPCONF196.py`, `gscdb138.py`) for iterating through DataFrames and is a major, known performance bottleneck (often 10-20x slower than alternatives). 
+**Action:** Replace `iterrows()` with `itertuples(index=False, name=None)` when simple tuple indexing is sufficient, standard `itertuples()` for dot-notation access, or `to_dict('records')` when dictionary access patterns like `.get()` are required by downstream logic. + +## 2024-05-19 - Caching UI Layout Generation +**Learning:** The `build_faqs()` component function was reading `faqs.yml` from disk synchronously on every render without caching, similar to previous issues discovered with `frameworks.yml`. +**Action:** Apply `@functools.cache` to UI component generation functions that depend on static configuration files to eliminate repetitive disk I/O and parsing overhead. + +## 2024-05-19 - Hanging Tests in Restricted Network +**Learning:** Running pytest on heavy ML integration tests (like those in `calc_solvMPCONF196.py` and `calc_high_pressure_relaxation.py`) in environments with restricted network access causes the test suite to hang or time out as the system attempts to download gigabytes of model weights (e.g., Torch, MACE) silently in the background. +**Action:** When tests hang in this manner due to missing heavy dependencies that cannot be easily installed, use static analysis (`ruff check`) and `python -m py_compile` as the primary verification strategy to confirm that the refactored code is syntactically valid and lint-clean without triggering remote downloads; note that these checks do not exercise runtime logic. diff --git a/ml_peg/app/utils/build_components.py b/ml_peg/app/utils/build_components.py index 66de62710..baf269938 100644 --- a/ml_peg/app/utils/build_components.py +++ b/ml_peg/app/utils/build_components.py @@ -2,6 +2,7 @@ from __future__ import annotations +from functools import cache from importlib import metadata from pathlib import Path import time @@ -468,6 +469,7 @@ def build_plot_download_controls(graph_id: str) -> Div: ) +@cache def build_faqs() -> Div: """ Build FAQ section with collapsible dropdowns from YAML file. 
diff --git a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py index 90628dad1..6445ed649 100644 --- a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py +++ b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py @@ -235,7 +235,7 @@ def run_elasticity_benchmark( # Save relaxed structures to extxyz for visualisation atoms_list = [] - for _, row in results.iterrows(): + for row in results.to_dict('records'): struct = row.get("final_structure") if struct is not None: atoms = AseAtomsAdaptor.get_atoms(struct).copy() diff --git a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py index 145bd924a..3574bf30f 100644 --- a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py +++ b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py @@ -85,9 +85,9 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - for row in df.iterrows(): - label = row[1][0] - ref_energies[label] = float(row[1][2]) * KCAL_TO_EV + for row in df.itertuples(index=False, name=None): + label = row[0] + ref_energies[label] = float(row[2]) * KCAL_TO_EV return ref_energies diff --git a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py index c6cc078dd..6c105458c 100644 --- a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py +++ b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py @@ -83,9 +83,9 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - for row in df.iterrows(): - label = row[1][0] - e_ref = float(row[1][1]) * units.Hartree + for row in df.itertuples(index=False, name=None): + label = row[0] + e_ref = float(row[1]) * units.Hartree ref_energies[label] = e_ref return ref_energies diff --git a/ml_peg/calcs/utils/gscdb138.py b/ml_peg/calcs/utils/gscdb138.py index 4d0a019ca..095e14666 100644 --- a/ml_peg/calcs/utils/gscdb138.py +++ 
b/ml_peg/calcs/utils/gscdb138.py @@ -105,11 +105,11 @@ def run_gscdb138( df_refs["Reference"] *= units.Hartree # Calculate relative energy for each entry. - for _, row in tqdm(df_refs.iterrows(), dataset, total=df_refs.shape[0]): + for row in tqdm(df_refs.itertuples(), dataset, total=df_refs.shape[0]): atoms_list = [] - identifier = row["Reaction"] - reactions = row["Stoichiometry"].split(",") # Parse stoichiometry string. - e_rel_ref = row["Reference"] + identifier = row.Reaction + reactions = row.Stoichiometry.split(",") # Parse stoichiometry string. + e_rel_ref = row.Reference num_species = len(reactions) // 2 # Each species has coefficient and name. e_rel_model = 0 From 50b4a271eee33f35c0194f1f2ce49b03b2e600db Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Thu, 14 May 2026 07:40:00 +0000 Subject: [PATCH 2/2] perf: optimize DataFrame iteration and memoize FAQs YAML parsing Replaces slow `iterrows()` calls with `itertuples()` and `to_dict('records')` across several calculation scripts. Adds `@functools.cache` to `build_faqs()` to prevent redundant YAML parsing. Applies ruff formatting. Co-authored-by: alinelena <3306823+alinelena@users.noreply.github.com> --- ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py index 6445ed649..d5256c94b 100644 --- a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py +++ b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py @@ -235,7 +235,7 @@ def run_elasticity_benchmark( # Save relaxed structures to extxyz for visualisation atoms_list = [] - for row in results.to_dict('records'): + for row in results.to_dict("records"): struct = row.get("final_structure") if struct is not None: atoms = AseAtomsAdaptor.get_atoms(struct).copy()