From 732673d328b7c611232f10e372fe5eb5aa7e4a78 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 19 May 2026 06:30:01 +0000 Subject: [PATCH 1/2] Optimize pandas iterrows loop in calculations Co-authored-by: alinelena <3306823+alinelena@users.noreply.github.com> --- .jules/bolt.md | 3 +++ ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py | 3 ++- ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py | 7 ++++--- .../calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py | 7 ++++--- ml_peg/calcs/utils/gscdb138.py | 9 +++++---- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index b140c4a7c..8744e34c5 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -5,3 +5,6 @@ ## 2024-05-19 - Caching YAML Load for Framework Registry **Learning:** `yaml.safe_load` on `frameworks.yml` within `load_framework_registry()` was taking ~2-3 ms per call and it was repeatedly called for every framework entry via `get_framework_config()`. This was a micro-bottleneck, especially when dealing with lists or multiple frameworks. **Action:** Applied the `@lru_cache` and `deepcopy` pattern successfully again to `load_framework_registry()` and `get_framework_config()` to avoid caching a mutable dictionary directly and avoid repeated YAML I/O parsing. +## 2025-03-01 - Optimizing DataFrame Iteration in Calculation Loops +**Learning:** Pandas `iterrows()` is a known performance bottleneck. Using `iterrows()` forces Pandas to return a Series for each row, invoking expensive Series construction, type checking, and boxing. Our codebase has loops that traverse hundreds or thousands of structures/materials to perform benchmarking calculations; in these loops, replacing `iterrows()` with `itertuples(index=False, name=None)` (returning standard tuples) or `to_dict('records')` removes this heavy overhead and cuts iteration time significantly. 
+**Action:** When refactoring nested loops parsing DataFrames for benchmarking, always replace `iterrows()` with `itertuples()` when columns are accessed by position or attribute, or with `to_dict('records')` when dictionary access is required. Be careful to replace `.get()` or `[1][x]` Series references with tuple indexing (`[x]`) or `namedtuple` attribute (`.x`) access; attribute access only works for column names that are valid Python identifiers, because pandas renames any other columns to positional names. diff --git a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py index 90628dad1..3e3305904 100644 --- a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py +++ b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py @@ -235,7 +235,8 @@ def run_elasticity_benchmark( # Save relaxed structures to extxyz for visualisation atoms_list = [] - for _, row in results.iterrows(): + # Performance optimization: Replace `iterrows` with `to_dict('records')` to avoid Series overhead + for row in results.to_dict("records"): struct = row.get("final_structure") if struct is not None: atoms = AseAtomsAdaptor.get_atoms(struct).copy() diff --git a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py index 145bd924a..f72070fd0 100644 --- a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py +++ b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py @@ -85,9 +85,10 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - for row in df.iterrows(): - label = row[1][0] - ref_energies[label] = float(row[1][2]) * KCAL_TO_EV + # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead + for row in df.itertuples(index=False, name=None): + label = row[0] + ref_energies[label] = float(row[2]) * KCAL_TO_EV return ref_energies diff --git a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py index c6cc078dd..da9673241 100644 --- a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py +++ 
b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py @@ -83,9 +83,10 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - for row in df.iterrows(): - label = row[1][0] - e_ref = float(row[1][1]) * units.Hartree + # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead + for row in df.itertuples(index=False, name=None): + label = row[0] + e_ref = float(row[1]) * units.Hartree ref_energies[label] = e_ref return ref_energies diff --git a/ml_peg/calcs/utils/gscdb138.py b/ml_peg/calcs/utils/gscdb138.py index 4d0a019ca..f76c0c2ec 100644 --- a/ml_peg/calcs/utils/gscdb138.py +++ b/ml_peg/calcs/utils/gscdb138.py @@ -105,11 +105,12 @@ def run_gscdb138( df_refs["Reference"] *= units.Hartree # Calculate relative energy for each entry. - for _, row in tqdm(df_refs.iterrows(), dataset, total=df_refs.shape[0]): + # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead + for row in tqdm(df_refs.itertuples(), dataset, total=df_refs.shape[0]): atoms_list = [] - identifier = row["Reaction"] - reactions = row["Stoichiometry"].split(",") # Parse stoichiometry string. - e_rel_ref = row["Reference"] + identifier = row.Reaction + reactions = row.Stoichiometry.split(",") # Parse stoichiometry string. + e_rel_ref = row.Reference num_species = len(reactions) // 2 # Each species has coefficient and name. 
e_rel_model = 0 From cfe83b76e70181ecf86c57979cbed4d4f444d6fd Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 19 May 2026 06:38:14 +0000 Subject: [PATCH 2/2] Optimize pandas iterrows loop in calculations Co-authored-by: alinelena <3306823+alinelena@users.noreply.github.com> --- ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py | 2 +- ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py | 2 +- ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py | 2 +- ml_peg/calcs/utils/gscdb138.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py index 3e3305904..8ad9f4df9 100644 --- a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py +++ b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py @@ -235,7 +235,7 @@ def run_elasticity_benchmark( # Save relaxed structures to extxyz for visualisation atoms_list = [] - # Performance optimization: Replace `iterrows` with `to_dict('records')` to avoid Series overhead + # Perf opt: Replace `iterrows` with `to_dict('records')` for row in results.to_dict("records"): struct = row.get("final_structure") if struct is not None: diff --git a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py index f72070fd0..9db7829ce 100644 --- a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py +++ b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py @@ -85,7 +85,7 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead + # Perf opt: Replace `iterrows` with `itertuples` to avoid Series overhead for row in df.itertuples(index=False, name=None): label = row[0] ref_energies[label] = float(row[2]) * KCAL_TO_EV diff --git 
a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py index da9673241..197904d46 100644 --- a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py +++ b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py @@ -83,7 +83,7 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead + # Perf opt: Replace `iterrows` with `itertuples` to avoid Series overhead for row in df.itertuples(index=False, name=None): label = row[0] e_ref = float(row[1]) * units.Hartree diff --git a/ml_peg/calcs/utils/gscdb138.py b/ml_peg/calcs/utils/gscdb138.py index f76c0c2ec..7cef5bd84 100644 --- a/ml_peg/calcs/utils/gscdb138.py +++ b/ml_peg/calcs/utils/gscdb138.py @@ -105,7 +105,7 @@ def run_gscdb138( df_refs["Reference"] *= units.Hartree # Calculate relative energy for each entry. - # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead + # Perf opt: Replace `iterrows` with `itertuples` to avoid Series overhead for row in tqdm(df_refs.itertuples(), dataset, total=df_refs.shape[0]): atoms_list = [] identifier = row.Reaction