From 9b8517a0c57bd84c9be3487115a9098117009bd5 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 12 May 2026 07:06:50 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Replace=20DataFrame=20iterr?= =?UTF-8?q?ows()=20with=20itertuples()=20/=20to=5Fdict('records')?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaced all instances of `pd.DataFrame.iterrows()` with `itertuples(index=False)` or `to_dict('records')` across the calculation modules to reduce iteration overhead and significantly speed up loops. Impacts: - `ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py` - `ml_peg/calcs/utils/gscdb138.py` - `ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py` - `ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py` Co-authored-by: alinelena <3306823+alinelena@users.noreply.github.com> --- .jules/bolt.md | 4 ++++ .../calcs/bulk_crystal/elasticity/calc_elasticity.py | 3 ++- ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py | 7 ++++--- .../conformers/solvMPCONF196/calc_solvMPCONF196.py | 7 ++++--- ml_peg/calcs/utils/gscdb138.py | 11 +++++++---- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index b140c4a7c..b6b072f7c 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -5,3 +5,7 @@ ## 2024-05-19 - Caching YAML Load for Framework Registry **Learning:** `yaml.safe_load` on `frameworks.yml` within `load_framework_registry()` was taking ~2-3 ms per call and it was repeatedly called for every framework entry via `get_framework_config()`. This was a micro-bottleneck, especially when dealing with lists or multiple frameworks. **Action:** Applied the `@lru_cache` and `deepcopy` pattern successfully again to `load_framework_registry()` and `get_framework_config()` to avoid caching a mutable dictionary directly and avoid repeated YAML I/O parsing. + +## 2025-02-18 - DataFrame Iteration Overhead +**Learning:** Using `iterrows()` to iterate over Pandas DataFrames creates significant overhead because it yields elements as Series objects, invoking expensive type checks and boxing. It's a noticeable performance bottleneck in large loops. +**Action:** Always replace `iterrows()` with `itertuples(index=False)` for read-only iteration or `to_dict('records')` when dictionary access patterns are required. diff --git a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py index 90628dad1..748bff5e9 100644 --- a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py +++ b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py @@ -235,7 +235,8 @@ def run_elasticity_benchmark( # Save relaxed structures to extxyz for visualisation atoms_list = [] - for _, row in results.iterrows(): + # ⚡ Bolt: Replace iterrows() with to_dict('records') for faster iteration + for row in results.to_dict("records"): struct = row.get("final_structure") if struct is not None: atoms = AseAtomsAdaptor.get_atoms(struct).copy() diff --git a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py index 145bd924a..823e5cb6e 100644 --- a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py +++ b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py @@ -85,9 +85,10 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - for row in df.iterrows(): - label = row[1][0] - ref_energies[label] = float(row[1][2]) * KCAL_TO_EV + # ⚡ Bolt: Replace iterrows() with itertuples() for faster iteration + for row in df.itertuples(index=False): + label = row[0] + ref_energies[label] = float(row[2]) * KCAL_TO_EV return ref_energies diff --git a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py index c6cc078dd..dd3ac766a 100644 --- a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py +++ b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py @@ -83,9 +83,10 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - for row in df.iterrows(): - label = row[1][0] - e_ref = float(row[1][1]) * units.Hartree + # ⚡ Bolt: Replace iterrows() with itertuples() for faster iteration + for row in df.itertuples(index=False): + label = row[0] + e_ref = float(row[1]) * units.Hartree ref_energies[label] = e_ref return ref_energies diff --git a/ml_peg/calcs/utils/gscdb138.py b/ml_peg/calcs/utils/gscdb138.py index 4d0a019ca..b93055564 100644 --- a/ml_peg/calcs/utils/gscdb138.py +++ b/ml_peg/calcs/utils/gscdb138.py @@ -105,11 +105,14 @@ def run_gscdb138( df_refs["Reference"] *= units.Hartree # Calculate relative energy for each entry. - for _, row in tqdm(df_refs.iterrows(), dataset, total=df_refs.shape[0]): + # ⚡ Bolt: Replace iterrows() with itertuples() for faster iteration + for row in tqdm( + df_refs.itertuples(index=False), dataset, total=df_refs.shape[0] + ): atoms_list = [] - identifier = row["Reaction"] - reactions = row["Stoichiometry"].split(",") # Parse stoichiometry string. - e_rel_ref = row["Reference"] + identifier = row.Reaction + reactions = row.Stoichiometry.split(",") # Parse stoichiometry string. + e_rel_ref = row.Reference num_species = len(reactions) // 2 # Each species has coefficient and name. e_rel_model = 0