diff --git a/.jules/bolt.md b/.jules/bolt.md index b140c4a7c..cbb5375c4 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -1,7 +1,3 @@ -## 2024-05-19 - Caching YAML Load for Model Parsing -**Learning:** `yaml.safe_load` on large configuration files like `models.yml` is significantly slower than parsing JSON or doing other basic IO. It can become a bottleneck when called repeatedly throughout an application's lifecycle (e.g., getting subsets of models, instantiating apps). -**Action:** Always memoize or `@lru_cache` functions that load static, read-only configuration files (like `models.yml`) to prevent repeated disk I/O and parsing overhead. - -## 2024-05-19 - Caching YAML Load for Framework Registry -**Learning:** `yaml.safe_load` on `frameworks.yml` within `load_framework_registry()` was taking ~2-3 ms per call and it was repeatedly called for every framework entry via `get_framework_config()`. This was a micro-bottleneck, especially when dealing with lists or multiple frameworks. -**Action:** Applied the `@lru_cache` and `deepcopy` pattern successfully again to `load_framework_registry()` and `get_framework_config()` to avoid caching a mutable dictionary directly and avoid repeated YAML I/O parsing. +## 2024-05-30 - Iterating pandas DataFrames efficiently +**Learning:** `pandas.DataFrame.iterrows()` is a major performance bottleneck for looping over datasets because it returns a Series for each row, creating significant overhead in Python. +**Action:** Always replace `iterrows()` with `itertuples(index=False, name=None)` for very fast, index-based tuple access, or `to_dict('records')` for dictionary key access, to eliminate DataFrame construction overhead during loops. diff --git a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py index 2886f237d..d40770788 100644 --- a/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py +++ b/ml_peg/calcs/bulk_crystal/elasticity/calc_elasticity.py @@ -238,12 +238,12 @@ def run_elasticity_benchmark( # Save relaxed structures to extxyz for visualisation atoms_list = [] - for _, row in results.iterrows(): + for row in results.to_dict("records"): struct = row.get("final_structure") if struct is not None: atoms = AseAtomsAdaptor.get_atoms(struct).copy() atoms.calc = None - atoms.info = {"mp_id": row[benchmark.index_name]} + atoms.info = {"mp_id": row.get(benchmark.index_name)} atoms_list.append(atoms) if atoms_list: ase_write( diff --git a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py index a033fabf2..14bf43bc4 100644 --- a/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py +++ b/ml_peg/calcs/conformers/MPCONF196/calc_MPCONF196.py @@ -86,9 +86,9 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - for row in df.iterrows(): - label = row[1][0] - ref_energies[label] = float(row[1][2]) * KCAL_TO_EV + for row in df.itertuples(index=False, name=None): + label = row[0] + ref_energies[label] = float(row[2]) * KCAL_TO_EV return ref_energies diff --git a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py index be51974af..ca5c464fd 100644 --- a/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py +++ b/ml_peg/calcs/conformers/solvMPCONF196/calc_solvMPCONF196.py @@ -84,9 +84,9 @@ def get_ref_energies(data_path: Path) -> dict[str, float]: ) ref_energies = {} - for row in df.iterrows(): - label = row[1][0] - e_ref = float(row[1][1]) * units.Hartree + for row in df.itertuples(index=False, name=None): + label = row[0] + e_ref = float(row[1]) * units.Hartree ref_energies[label] = e_ref return ref_energies diff --git a/ml_peg/calcs/utils/gscdb138.py b/ml_peg/calcs/utils/gscdb138.py index 0fc26c1e0..1745fa497 100644 --- a/ml_peg/calcs/utils/gscdb138.py +++ b/ml_peg/calcs/utils/gscdb138.py @@ -106,7 +106,7 @@ def run_gscdb138( df_refs["Reference"] *= units.Hartree # Calculate relative energy for each entry. - for _, row in tqdm(df_refs.iterrows(), dataset, total=df_refs.shape[0]): + for row in tqdm(df_refs.to_dict("records"), dataset, total=df_refs.shape[0]): atoms_list = [] identifier = row["Reaction"] reactions = row["Stoichiometry"].split(",") # Parse stoichiometry string.