From 732673d328b7c611232f10e372fe5eb5aa7e4a78 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 06:30:01 +0000
Subject: [PATCH 1/2] Optimize pandas iterrows loop in calculations

Co-authored-by: alinelena <3306823+alinelena@users.noreply.github.com>
---
 .jules/bolt.md                                           | 3 +++
 ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py  | 3 ++-
 ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py      | 7 ++++---
 .../calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py | 7 ++++---
 ml_peg/calcs/utils/gscdb138.py                           | 9 +++++----
 5 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/.jules/bolt.md b/.jules/bolt.md
index b140c4a7c..8744e34c5 100644
--- a/.jules/bolt.md
+++ b/.jules/bolt.md
@@ -5,3 +5,6 @@
 ## 2024-05-19 - Caching YAML Load for Framework Registry
 **Learning:** `yaml.safe_load` on `frameworks.yml` within `load_framework_registry()` was taking ~2-3 ms per call and it was repeatedly called for every framework entry via `get_framework_config()`. This was a micro-bottleneck, especially when dealing with lists or multiple frameworks.
 **Action:** Applied the `@lru_cache` and `deepcopy` pattern successfully again to `load_framework_registry()` and `get_framework_config()` to avoid caching a mutable dictionary directly and avoid repeated YAML I/O parsing.
+## 2025-03-01 - Optimizing DataFrame Iteration in Calculation Loops
+**Learning:** pandas `iterrows()` is a known performance bottleneck: it builds a new Series for every row, which incurs expensive Series construction, type checking, and boxing. In loops in our codebase that traverse hundreds or thousands of structures/materials to perform benchmarking calculations, replacing `iterrows()` with `itertuples(index=False, name=None)` (which yields plain tuples) or `to_dict('records')` (which yields plain dicts) removes this heavy overhead and cuts iteration time significantly.
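+**Action:** When refactoring nested loops that iterate over DataFrames for benchmarking, always use `itertuples()` when columns are accessed by position or attribute, or `to_dict('records')` when dictionary-style access is required, instead of `iterrows()`. Be careful to replace Series-style references such as `.get()` or `[1][x]` with tuple indexing (`[x]`) or `namedtuple` attribute access (`.x`), and remember that `itertuples()` places the index at position 0 unless `index=False` is passed, which shifts every positional index by one.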
diff --git a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py
index 90628dad1..3e3305904 100644
--- a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py
+++ b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py
@@ -235,7 +235,8 @@ def run_elasticity_benchmark(
 
     # Save relaxed structures to extxyz for visualisation
     atoms_list = []
-    for _, row in results.iterrows():
+    # Performance optimization: Replace `iterrows` with `to_dict('records')` to avoid Series overhead
+    for row in results.to_dict("records"):
         struct = row.get("final_structure")
         if struct is not None:
             atoms = AseAtomsAdaptor.get_atoms(struct).copy()
diff --git a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py
index 145bd924a..f72070fd0 100644
--- a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py
+++ b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py
@@ -85,9 +85,10 @@ def get_ref_energies(data_path: Path) -> dict[str, float]:
     )
     ref_energies = {}
 
-    for row in df.iterrows():
-        label = row[1][0]
-        ref_energies[label] = float(row[1][2]) * KCAL_TO_EV
+    # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead
+    for row in df.itertuples(index=False, name=None):
+        label = row[0]
+        ref_energies[label] = float(row[2]) * KCAL_TO_EV
 
     return ref_energies
 
diff --git a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py
index c6cc078dd..da9673241 100644
--- a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py
+++ b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py
@@ -83,9 +83,10 @@ def get_ref_energies(data_path: Path) -> dict[str, float]:
     )
     ref_energies = {}
 
-    for row in df.iterrows():
-        label = row[1][0]
-        e_ref = float(row[1][1]) * units.Hartree
+    # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead
+    for row in df.itertuples(index=False, name=None):
+        label = row[0]
+        e_ref = float(row[1]) * units.Hartree
         ref_energies[label] = e_ref
 
     return ref_energies
diff --git a/ml_peg/calcs/utils/gscdb138.py b/ml_peg/calcs/utils/gscdb138.py
index 4d0a019ca..f76c0c2ec 100644
--- a/ml_peg/calcs/utils/gscdb138.py
+++ b/ml_peg/calcs/utils/gscdb138.py
@@ -105,11 +105,12 @@ def run_gscdb138(
         df_refs["Reference"] *= units.Hartree
 
         # Calculate relative energy for each entry.
-        for _, row in tqdm(df_refs.iterrows(), dataset, total=df_refs.shape[0]):
+        # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead
+        for row in tqdm(df_refs.itertuples(), dataset, total=df_refs.shape[0]):
             atoms_list = []
-            identifier = row["Reaction"]
-            reactions = row["Stoichiometry"].split(",")  # Parse stoichiometry string.
-            e_rel_ref = row["Reference"]
+            identifier = row.Reaction
+            reactions = row.Stoichiometry.split(",")  # Parse stoichiometry string.
+            e_rel_ref = row.Reference
             num_species = len(reactions) // 2  # Each species has coefficient and name.
 
             e_rel_model = 0

From cfe83b76e70181ecf86c57979cbed4d4f444d6fd Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 06:38:14 +0000
Subject: [PATCH 2/2] Optimize pandas iterrows loop in calculations

Co-authored-by: alinelena <3306823+alinelena@users.noreply.github.com>
---
 ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py     | 2 +-
 ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py         | 2 +-
 ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py | 2 +-
 ml_peg/calcs/utils/gscdb138.py                              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py
index 3e3305904..8ad9f4df9 100644
--- a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py
+++ b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py
@@ -235,7 +235,7 @@ def run_elasticity_benchmark(
 
     # Save relaxed structures to extxyz for visualisation
     atoms_list = []
-    # Performance optimization: Replace `iterrows` with `to_dict('records')` to avoid Series overhead
+    # Use `to_dict('records')`, not `iterrows`, to avoid per-row Series overhead
     for row in results.to_dict("records"):
         struct = row.get("final_structure")
         if struct is not None:
diff --git a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py
index f72070fd0..9db7829ce 100644
--- a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py
+++ b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py
@@ -85,7 +85,7 @@ def get_ref_energies(data_path: Path) -> dict[str, float]:
     )
     ref_energies = {}
 
-    # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead
+    # Perf opt: Replace `iterrows` with `itertuples` to avoid Series overhead
     for row in df.itertuples(index=False, name=None):
         label = row[0]
         ref_energies[label] = float(row[2]) * KCAL_TO_EV
diff --git a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py
index da9673241..197904d46 100644
--- a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py
+++ b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py
@@ -83,7 +83,7 @@ def get_ref_energies(data_path: Path) -> dict[str, float]:
     )
     ref_energies = {}
 
-    # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead
+    # Perf opt: Replace `iterrows` with `itertuples` to avoid Series overhead
     for row in df.itertuples(index=False, name=None):
         label = row[0]
         e_ref = float(row[1]) * units.Hartree
diff --git a/ml_peg/calcs/utils/gscdb138.py b/ml_peg/calcs/utils/gscdb138.py
index f76c0c2ec..7cef5bd84 100644
--- a/ml_peg/calcs/utils/gscdb138.py
+++ b/ml_peg/calcs/utils/gscdb138.py
@@ -105,7 +105,7 @@ def run_gscdb138(
         df_refs["Reference"] *= units.Hartree
 
         # Calculate relative energy for each entry.
-        # Performance optimization: Replace `iterrows` with `itertuples` to avoid Series overhead
+        # Perf opt: Replace `iterrows` with `itertuples` to avoid Series overhead
         for row in tqdm(df_refs.itertuples(), dataset, total=df_refs.shape[0]):
             atoms_list = []
             identifier = row.Reaction