import re
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple, Union

import pandas as pd


@dataclass
class Potential:
    """
    Unified representation of one parsed potential entry.

    Attributes
    ----------
    year : str
        Four-digit publication year taken from the potential name.
    year_suffix : str
        Text directly following the year in an OpenKIM name (empty for LAMMPS).
    authors : str
        Normalized (lower-case, letters only) surname of the first author.
    elements : set of str
        Chemical symbols listed for the potential.
    repo_type : str
        ``"LAMMPS"`` or ``"OpenKIM"``.
    ipr : int or None
        Revision number from the ``iprN`` tag; ``None`` for OpenKIM entries.
    original : str
        The unmodified potential name.
    df_index : optional
        Index label of the row the entry was read from.
    """

    year: str
    year_suffix: str
    authors: str
    elements: Set[str]
    repo_type: str
    ipr: Optional[int]
    original: str
    df_index: Optional[int] = None

    @property
    def sort_key(self) -> Tuple[int, int]:
        """Key for sorting - prefer LAMMPS, then higher ipr."""
        # Negated ipr so that an ascending sort puts the highest revision first.
        return (0 if self.repo_type == "LAMMPS" else 1, -(self.ipr or 0))

    @property
    def family_id(self) -> str:
        """Return author_year[suffix] identifier."""
        return f"{self.authors}_{self.year}{self.year_suffix}"


class PotentialDeduplicator:
    """
    Deduplicate interatomic potentials from DataFrame.

    Rules:
    1. Potentials from same author+year+suffix are duplicates
    2. Within LAMMPS: prefer higher ipr
    3. Across repos: prefer LAMMPS over OpenKIM
    4. Only keep potentials containing ALL target_elements

    NOTE(review): grouping uses author and year only; the element set of a
    potential is not part of the key, so two different parametrizations by the
    same first author in the same year collapse into one entry. Confirm that
    this is intended.
    """

    # <year>--<authors>--<elements>--LAMMPS--ipr<revision>
    _LAMMPS_NAME = re.compile(
        r"(\d{4})--([^-]+(?:-[^-]+)*)--([^-]+(?:-[^-]+)*)--LAMMPS--ipr(\d+)"
    )
    # Separator between the descriptive prefix of an OpenKIM name and its KIM ID.
    _KIM_ID_MARKER = re.compile(r"__(?:MO_|SM_)")
    _LEADING_YEAR = re.compile(r"\d{4}")
    _FAMILY_COLUMNS = ["family", "count", "repos", "has_duplicates"]

    def __init__(
        self,
        target_elements: Union[str, List[str], Set[str]] = "Ni",
        verbose: bool = False,
    ):
        """
        Parameters
        ----------
        target_elements : str, list of str, or set of str
            Element(s) to filter for. If multiple elements provided,
            potentials must contain ALL of them.
            Examples: 'Ni', ['Ni', 'Al'], {'Ni', 'Al', 'Cu'}
        verbose : bool
            Print deduplication details

        Raises
        ------
        ValueError
            If ``target_elements`` is not a str, list, tuple or set.
        """
        self.target_elements = self._normalize_elements(target_elements)
        self.verbose = verbose
        self.last_duplicates_map: Dict[str, List[str]] = {}
        self.last_stats: Dict[str, int] = {}

    @staticmethod
    def _normalize_elements(target_elements: Any) -> Set[str]:
        """Return a new set of symbols built from a str, list, tuple or set."""
        if isinstance(target_elements, str):
            return {target_elements}
        if isinstance(target_elements, (list, tuple, set, frozenset)):
            # Always copy so later mutation by the caller cannot change the filter.
            return set(target_elements)
        raise ValueError(
            f"target_elements must be str, list, or set, got {type(target_elements)}"
        )

    @staticmethod
    def _as_element_set(species: Any) -> Set[str]:
        """Convert one 'Species' cell to a set; unusable cells give an empty set."""
        if isinstance(species, (str, bytes)):
            # A bare string is not a collection of symbols; iterating it would
            # split it into single characters.
            return set()
        try:
            return set(species)
        except TypeError:
            # Non-iterable (None, NaN, ...) or unhashable content.
            return set()

    @property
    def target_elements_str(self) -> str:
        """Human-readable string of target elements."""
        if len(self.target_elements) == 1:
            return next(iter(self.target_elements))
        return "{" + ", ".join(sorted(self.target_elements)) + "}"

    @staticmethod
    def normalize_author(author_str: str) -> str:
        """
        Extract and normalize primary author surname.

        The text before the first ``-`` or ``_`` is used. If it contains more
        than one upper-case letter it is treated as CamelCase and only the
        first word is kept. The result is lower-cased and stripped of every
        character outside ``a-z``.
        """
        main = re.split(r"[-_]", author_str)[0]

        # Handle camelCase in OpenKIM
        if len(re.findall(r"[A-Z]", main)) > 1:
            words = [w for w in re.split(r"(?=[A-Z])", main) if w]
            if words:
                main = words[0]

        return re.sub(r"[^a-z]", "", main.lower())

    @staticmethod
    def parse_potential_metadata(name: str) -> Optional[Dict]:
        """
        Parse potential name for metadata (year, author, repo, ipr).

        Returns
        -------
        dict or None
            Keys ``year``, ``year_suffix``, ``authors``, ``repo_type`` and
            ``ipr``; ``None`` if the name matches neither the LAMMPS nor the
            OpenKIM naming scheme (or is not a string).
        """
        if not isinstance(name, str):
            return None

        # Try LAMMPS format
        lammps = PotentialDeduplicator._LAMMPS_NAME.match(name)
        if lammps:
            year, authors, _, ipr = lammps.groups()
            return {
                "year": year,
                "year_suffix": "",
                "authors": PotentialDeduplicator.normalize_author(authors),
                "repo_type": "LAMMPS",
                "ipr": int(ipr),
            }

        # Try OpenKIM format
        marker = PotentialDeduplicator._KIM_ID_MARKER.search(name)
        if marker is None:
            return None

        # Only look in front of the KIM ID: its digit block would otherwise be
        # mistaken for a year when the descriptive prefix carries none.
        tokens = name[: marker.start()].split("_")
        for pos in range(1, len(tokens)):
            token = tokens[pos]
            if PotentialDeduplicator._LEADING_YEAR.match(token):
                return {
                    "year": token[:4],
                    "year_suffix": token[4:],
                    # The token directly before the year names the authors.
                    "authors": PotentialDeduplicator.normalize_author(
                        tokens[pos - 1]
                    ),
                    "repo_type": "OpenKIM",
                    "ipr": None,
                }

        return None

    def contains_target_elements(self, elements: Set[str]) -> bool:
        """Check if elements set contains ALL target elements."""
        return self.target_elements.issubset(elements)

    def deduplicate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Deduplicate potentials from DataFrame.

        Rows are selected by position, so a non-unique index is handled
        correctly. The result lists the kept entry of every (year, author)
        group in sorted group order, followed by the rows whose name could not
        be parsed (these are never dropped).

        Parameters
        ----------
        df : DataFrame
            Must have 'Name' and 'Species' columns

        Returns
        -------
        deduplicated_df : DataFrame
            Deduplicated potentials containing all target elements

        Raises
        ------
        ValueError
            If a required column is missing.
        """
        if "Name" not in df.columns or "Species" not in df.columns:
            raise ValueError("DataFrame must have 'Name' and 'Species' columns")

        groups: Dict[Tuple[str, str], List[Tuple[int, Potential]]] = defaultdict(
            list
        )
        unparsed_positions: List[int] = []
        filtered_out = 0
        valid = 0

        rows = zip(df.index, df["Name"], df["Species"])
        for position, (label, name, species) in enumerate(rows):
            elements = self._as_element_set(species)

            # Check if ALL target elements are present
            if not self.contains_target_elements(elements):
                filtered_out += 1
                continue

            metadata = self.parse_potential_metadata(name)
            if metadata is None:
                unparsed_positions.append(position)
                continue

            pot = Potential(
                year=metadata["year"],
                year_suffix=metadata["year_suffix"],
                authors=metadata["authors"],
                elements=elements,
                repo_type=metadata["repo_type"],
                ipr=metadata["ipr"],
                original=name,
                df_index=label,
            )
            groups[(pot.year + pot.year_suffix, pot.authors)].append((position, pot))
            valid += 1

        self.last_stats = {
            "total": len(df),
            "filtered_out": filtered_out,
            "unparsed": len(unparsed_positions),
            "valid": valid,
        }

        if self.verbose:
            print(f"Total potentials: {self.last_stats['total']}")
            print(f"Target elements: {self.target_elements_str}")
            print(
                f"Filtered out (missing target elements): {self.last_stats['filtered_out']}"
            )
            print(f"Unparsed: {self.last_stats['unparsed']}")
            print(f"Valid for deduplication: {self.last_stats['valid']}")

        # Keep only the best from each group
        kept_positions: List[int] = []
        self.last_duplicates_map = {}

        for (year_full, author), members in sorted(groups.items()):
            # Stable sort: ties keep their DataFrame order.
            members.sort(key=lambda item: item[1].sort_key)
            best_position, best = members[0]
            kept_positions.append(best_position)

            if len(members) == 1:
                continue

            rest = [pot for _, pot in members[1:]]
            self.last_duplicates_map[best.original] = [p.original for p in rest]

            if self.verbose:
                print(f"\nGroup: {year_full} - {author}")
                print(f"  Kept: {best.original}")
                for dup in rest:
                    print(f"  Removed: {dup.original}")

        # Add back unparsed items
        kept_positions.extend(unparsed_positions)

        self.last_stats["kept"] = len(kept_positions)
        self.last_stats["removed_duplicates"] = sum(
            len(v) for v in self.last_duplicates_map.values()
        )

        if self.verbose:
            print(f"\nFinal count: {self.last_stats['kept']}")
            print(f"Duplicates removed: {self.last_stats['removed_duplicates']}")

        return df.iloc[kept_positions].copy()

    def get_duplicates(self) -> Dict[str, List[str]]:
        """Return a copy of the kept-name -> removed-names map of the last run."""
        return {kept: list(removed) for kept, removed in self.last_duplicates_map.items()}

    def get_stats(self) -> Dict[str, int]:
        """Return statistics from last deduplication."""
        return self.last_stats.copy()

    def get_family_id(self, potential_name: str) -> Optional[str]:
        """
        Get the family label (author_year[suffix]) for a potential.

        Returns normalized label like 'foiles_1986' or 'adams_1989Universal6',
        or ``None`` if the name cannot be parsed.
        """
        metadata = self.parse_potential_metadata(potential_name)
        if metadata is None:
            return None
        return f"{metadata['authors']}_{metadata['year']}{metadata['year_suffix']}"

    def analyze_families(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Analyze potential families in the DataFrame.

        Only potentials that contain all target elements and whose name can be
        parsed are counted.

        Returns
        -------
        DataFrame
            One row per family with columns 'family', 'count', 'repos'
            (comma-separated, sorted) and 'has_duplicates'. The columns are
            present even when no family was found.
        """
        families: Dict[str, Dict[str, Any]] = defaultdict(
            lambda: {"count": 0, "repos": set()}
        )

        for name, species in zip(df["Name"], df["Species"]):
            if not self.contains_target_elements(self._as_element_set(species)):
                continue

            metadata = self.parse_potential_metadata(name)
            if metadata is None:
                continue

            family_id = (
                f"{metadata['authors']}_{metadata['year']}{metadata['year_suffix']}"
            )
            families[family_id]["count"] += 1
            families[family_id]["repos"].add(metadata["repo_type"])

        summary = [
            {
                "family": family_id,
                "count": info["count"],
                "repos": ", ".join(sorted(info["repos"])),
                "has_duplicates": info["count"] > 1,
            }
            for family_id, info in sorted(families.items())
        ]
        return pd.DataFrame(summary, columns=self._FAMILY_COLUMNS)

    def filter_by_elements(
        self,
        df: pd.DataFrame,
        target_elements: Optional[Union[str, List[str], Set[str]]] = None,
    ) -> pd.DataFrame:
        """
        Filter DataFrame to only potentials containing specified elements.

        The instance's own ``target_elements`` are not modified.

        Parameters
        ----------
        df : DataFrame
            Input DataFrame with 'Species' column
        target_elements : str, list, set, or None
            Elements to filter for. If None, uses self.target_elements

        Returns
        -------
        filtered_df : DataFrame
            Filtered to potentials with all target elements, in input order

        Raises
        ------
        ValueError
            If ``target_elements`` is given but is not a str, list, tuple or set.
        """
        if target_elements is None:
            wanted = self.target_elements
        else:
            wanted = self._normalize_elements(target_elements)

        # Positional selection keeps the result correct for non-unique indexes.
        positions = [
            position
            for position, species in enumerate(df["Species"])
            if wanted.issubset(self._as_element_set(species))
        ]
        return df.iloc[positions].copy()