From 0e7cef7dd5b857bbf36e29395c081832ede3dcc5 Mon Sep 17 00:00:00 2001 From: Patrick Golden Date: Sat, 6 Sep 2025 22:52:16 -0400 Subject: [PATCH 1/4] Optimize split_dataframe_by_prefix In the previous implementation of the function, the entire MappingSetDataFrame was being iterated over many more times than necessary. This changes it to only go through once. All logging and output remains the same. --- src/sssom/parsers.py | 83 +++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py index 2ed4f4c2..fe21a8a4 100644 --- a/src/sssom/parsers.py +++ b/src/sssom/parsers.py @@ -7,9 +7,22 @@ import logging as _logging import os.path import typing -from collections import ChainMap, Counter +from collections import ChainMap, Counter, defaultdict from pathlib import Path -from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + Iterable, + List, + NamedTuple, + Optional, + TextIO, + Tuple, + Union, + cast, +) from xml.dom import Node, minidom from xml.dom.minidom import Document @@ -18,7 +31,7 @@ import pandas as pd import requests import yaml -from curies import Converter +from curies import Converter, ReferenceTuple from linkml_runtime.loaders.json_loader import JSONLoader from linkml_runtime.loaders.rdflib_loader import RDFLibLoader from pandas.errors import EmptyDataError @@ -977,9 +990,12 @@ def split_dataframe( ) -def _get_split_key(subject_prefix: str, relation_luid: str, object_prefix: str) -> str: - split = f"{subject_prefix.lower()}_{relation_luid.lower()}_{object_prefix.lower()}" - return split +class SSSOMSplitGroup(NamedTuple): + """The key of a group of mappings in a split MappingSetDataFrame.""" + + subject_prefix: str + object_prefix: str + relation_tup: ReferenceTuple def split_dataframe_by_prefix( @@ -996,34 +1012,51 @@ def split_dataframe_by_prefix( :param 
relations: a list of relations of interest :return: a dict of SSSOM data frame names to MappingSetDataFrame """ - df = msdf.df meta = msdf.metadata split_to_msdf: Dict[str, MappingSetDataFrame] = {} - for subject_prefix, object_prefix, relation in itt.product( - subject_prefixes, object_prefixes, relations - ): - relation_prefix, relation_id = relation.split(":") - split = _get_split_key(subject_prefix, relation_id, object_prefix) - if subject_prefix not in msdf.converter.bimap: - logging.warning(f"{split} - missing subject prefix - {subject_prefix}") + mappings_by_group: DefaultDict[SSSOMSplitGroup, List[object]] = defaultdict(list) + parse_curie = msdf.converter.parse_curie + + expected_split_groups = [ + SSSOMSplitGroup( + subject_prefix, + object_prefix, + parse_curie(relation, strict=True), + ) + for subject_prefix, relation, object_prefix in itt.product( + subject_prefixes, relations, object_prefixes + ) + ] + + for mapping in msdf.df.itertuples(index=False): + group = SSSOMSplitGroup( + parse_curie(getattr(mapping, SUBJECT_ID), strict=True).prefix, + parse_curie(getattr(mapping, OBJECT_ID), strict=True).prefix, + parse_curie(getattr(mapping, PREDICATE_ID), strict=True), + ) + mappings_by_group[group].append(mapping) + + for group in expected_split_groups: + split = f"{group.subject_prefix.lower()}_{group.relation_tup.identifier.lower()}_{group.object_prefix.lower()}" + mappings = mappings_by_group.get(group, None) + + if group.subject_prefix not in msdf.converter.bimap: + logging.warning(f"{split} - missing subject prefix - {group.subject_prefix}") continue - if object_prefix not in msdf.converter.bimap: - logging.warning(f"{split} - missing object prefix - {object_prefix}") + elif group.object_prefix not in msdf.converter.bimap: + logging.warning(f"{split} - missing object prefix - {group.object_prefix}") continue - df_subset = df[ - (df[SUBJECT_ID].str.startswith(subject_prefix + ":")) - & (df[PREDICATE_ID] == relation) - & 
(df[OBJECT_ID].str.startswith(object_prefix + ":")) - ] - if 0 == len(df_subset): - logging.debug(f"No matches ({len(df_subset)} matches found)") + elif mappings is None: + logging.debug(f"{split} - No matches matches found") continue + subconverter = msdf.converter.get_subconverter( - [subject_prefix, object_prefix, relation_prefix] + [group.subject_prefix, group.object_prefix, group.relation_tup.prefix] ) split_to_msdf[split] = from_sssom_dataframe( - df_subset, prefix_map=dict(subconverter.bimap), meta=meta + pd.DataFrame(mappings), prefix_map=dict(subconverter.bimap), meta=meta ) + return split_to_msdf From 6660a26a7974a2960f84161da4c46b0a04873652 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Sep 2025 16:37:58 +0200 Subject: [PATCH 2/4] Update parsers.py --- src/sssom/parsers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py index fe21a8a4..f581a23f 100644 --- a/src/sssom/parsers.py +++ b/src/sssom/parsers.py @@ -990,14 +990,6 @@ def split_dataframe( ) -class SSSOMSplitGroup(NamedTuple): - """The key of a group of mappings in a split MappingSetDataFrame.""" - - subject_prefix: str - object_prefix: str - relation_tup: ReferenceTuple - - def split_dataframe_by_prefix( msdf: MappingSetDataFrame, subject_prefixes: Iterable[str], @@ -1060,6 +1052,14 @@ def split_dataframe_by_prefix( return split_to_msdf +class SSSOMSplitGroup(NamedTuple): + """The key of a group of mappings in a split MappingSetDataFrame.""" + + subject_prefix: str + object_prefix: str + relation_tup: ReferenceTuple + + def _ensure_valid_mapping_from_dict(mdict: Dict[str, Any]): """ Return a valid mapping object if it can be constructed, else None. 
From 77e02f84131021ecb83fd5d0a4b7e7ac9f1fec9a Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Sep 2025 16:40:52 +0200 Subject: [PATCH 3/4] Update parsers.py --- src/sssom/parsers.py | 84 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 2 deletions(-) diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py index 6ff94589..19abf7d6 100644 --- a/src/sssom/parsers.py +++ b/src/sssom/parsers.py @@ -9,7 +9,7 @@ import logging as _logging import os.path import typing -from collections import ChainMap, Counter +from collections import ChainMap, Counter, defaultdict from pathlib import Path from typing import ( Any, @@ -17,6 +17,7 @@ Dict, Iterable, List, + NamedTuple, Optional, TextIO, Tuple, @@ -994,7 +995,7 @@ def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = No return mapping_set -SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes"] +SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes", "patrick"] def split_dataframe( @@ -1051,6 +1052,13 @@ def split_dataframe_by_prefix( object_prefixes=object_prefixes, relations=relations, ) + if method == "patrick": + return _patrick_split( + msdf, + subject_prefixes=subject_prefixes, + object_prefixes=object_prefixes, + relations=relations, + ) predicates: List[ReferenceTuple] = [] for relation in relations: @@ -1076,6 +1084,78 @@ def split_dataframe_by_prefix( return rv +class SSSOMSplitGroup(NamedTuple): + """The key of a group of mappings in a split MappingSetDataFrame.""" + + subject_prefix: str + object_prefix: str + relation_tup: ReferenceTuple + + +def _patrick_split( + msdf: MappingSetDataFrame, + subject_prefixes: Iterable[str], + object_prefixes: Iterable[str], + relations: Iterable[str], +) -> Dict[str, MappingSetDataFrame]: + """Split a mapping set dataframe by prefix. 
+ + :param msdf: An SSSOM MappingSetDataFrame + :param subject_prefixes: a list of prefixes pertaining to the subject + :param object_prefixes: a list of prefixes pertaining to the object + :param relations: a list of relations of interest + :return: a dict of SSSOM data frame names to MappingSetDataFrame + """ + meta = msdf.metadata + split_to_msdf: Dict[str, MappingSetDataFrame] = {} + mappings_by_group: defaultdict[SSSOMSplitGroup, List[object]] = defaultdict(list) + parse_curie = msdf.converter.parse_curie + + expected_split_groups = [ + SSSOMSplitGroup( + subject_prefix, + object_prefix, + parse_curie(relation, strict=True), + ) + for subject_prefix, relation, object_prefix in itt.product( + subject_prefixes, relations, object_prefixes + ) + ] + + for mapping in msdf.df.itertuples(index=False): + group = SSSOMSplitGroup( + parse_curie(getattr(mapping, SUBJECT_ID), strict=True).prefix, + parse_curie(getattr(mapping, OBJECT_ID), strict=True).prefix, + parse_curie(getattr(mapping, PREDICATE_ID), strict=True), + ) + mappings_by_group[group].append(mapping) + + for group in expected_split_groups: + split = _get_split_key( + group.subject_prefix, group.relation_tup.identifier, group.object_prefix + ) + mappings = mappings_by_group.get(group, None) + + if group.subject_prefix not in msdf.converter.bimap: + logging.warning(f"{split} - missing subject prefix - {group.subject_prefix}") + continue + elif group.object_prefix not in msdf.converter.bimap: + logging.warning(f"{split} - missing object prefix - {group.object_prefix}") + continue + elif mappings is None: + logging.debug(f"{split} - No matches matches found") + continue + + subconverter = msdf.converter.get_subconverter( + [group.subject_prefix, group.object_prefix, group.relation_tup.prefix] + ) + split_to_msdf[split] = from_sssom_dataframe( + pd.DataFrame(mappings), prefix_map=dict(subconverter.bimap), meta=meta + ) + + return split_to_msdf + + def _split_dataframe_by_prefix_old( msdf: MappingSetDataFrame, 
subject_prefixes: Iterable[str], From 5d85412bc63f61e91ac29579f07be273e368407f Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Sep 2025 16:45:28 +0200 Subject: [PATCH 4/4] Add FIXMEs where fail on unregistered prefixes --- src/sssom/parsers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py index 19abf7d6..2662ef0a 100644 --- a/src/sssom/parsers.py +++ b/src/sssom/parsers.py @@ -1111,10 +1111,16 @@ def _patrick_split( mappings_by_group: defaultdict[SSSOMSplitGroup, List[object]] = defaultdict(list) parse_curie = msdf.converter.parse_curie + # FIXME this doesn't work on mappings whose subject, predicate, or object + # has a prefix that is not registered correctly with the converter. + # Current functionality requires that such mappings be skipped. + + expected_split_groups = [ SSSOMSplitGroup( subject_prefix, object_prefix, + # FIXME parse_curie(relation, strict=True), ) for subject_prefix, relation, object_prefix in itt.product( @@ -1123,6 +1129,7 @@ def _patrick_split( ] for mapping in msdf.df.itertuples(index=False): + # FIXME group = SSSOMSplitGroup( parse_curie(getattr(mapping, SUBJECT_ID), strict=True).prefix, parse_curie(getattr(mapping, OBJECT_ID), strict=True).prefix,