diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 6ff94589..2662ef0a 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -9,7 +9,7 @@
 import logging as _logging
 import os.path
 import typing
-from collections import ChainMap, Counter
+from collections import ChainMap, Counter, defaultdict
 from pathlib import Path
 from typing import (
     Any,
@@ -17,6 +17,7 @@
     Dict,
     Iterable,
     List,
+    NamedTuple,
     Optional,
     TextIO,
     Tuple,
@@ -994,7 +995,7 @@ def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = No
     return mapping_set
 
 
-SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes"]
+SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes", "patrick"]
 
 
 def split_dataframe(
@@ -1051,6 +1052,13 @@ def split_dataframe_by_prefix(
             object_prefixes=object_prefixes,
             relations=relations,
         )
+    if method == "patrick":
+        return _patrick_split(
+            msdf,
+            subject_prefixes=subject_prefixes,
+            object_prefixes=object_prefixes,
+            relations=relations,
+        )
 
     predicates: List[ReferenceTuple] = []
     for relation in relations:
@@ -1076,6 +1084,102 @@ def split_dataframe_by_prefix(
     return rv
 
 
+class SSSOMSplitGroup(NamedTuple):
+    """The key of a group of mappings in a split MappingSetDataFrame.
+
+    Mappings share a group when their subject prefix, object prefix, and
+    parsed predicate are all equal.
+    """
+
+    # CURIE prefix of the subject of the mappings in the group
+    subject_prefix: str
+    # CURIE prefix of the object of the mappings in the group
+    object_prefix: str
+    # parsed predicate (prefix and identifier) of the mappings in the group
+    relation_tup: ReferenceTuple
+
+
+def _patrick_split(
+    msdf: MappingSetDataFrame,
+    subject_prefixes: Iterable[str],
+    object_prefixes: Iterable[str],
+    relations: Iterable[str],
+) -> Dict[str, MappingSetDataFrame]:
+    """Split a mapping set dataframe by prefix.
+
+    Each mapping is bucketed once by its subject prefix, object prefix, and
+    predicate; a result is then built for every requested combination that has
+    at least one mapping.
+
+    :param msdf: An SSSOM MappingSetDataFrame
+    :param subject_prefixes: a list of prefixes pertaining to the subject
+    :param object_prefixes: a list of prefixes pertaining to the object
+    :param relations: a list of relations of interest
+    :return: a dict of SSSOM data frame names to MappingSetDataFrame;
+        combinations with an unregistered subject or object prefix, or
+        without any mapping, are left out
+    """
+    meta = msdf.metadata
+    converter = msdf.converter
+    parse_curie = converter.parse_curie
+    # Fetch the prefix map once rather than on every iteration of the loop below.
+    registered_prefixes = converter.bimap
+    split_to_msdf: Dict[str, MappingSetDataFrame] = {}
+    mappings_by_group: defaultdict[SSSOMSplitGroup, List[object]] = defaultdict(list)
+
+    # FIXME this does not work on mappings whose subject, predicate, or object
+    #  has a prefix that is not registered correctly with the converter.
+    #  The current functionality requires that they are skipped.
+
+    expected_split_groups = [
+        SSSOMSplitGroup(
+            subject_prefix,
+            object_prefix,
+            # FIXME strict parsing of a requested relation (see the note above)
+            parse_curie(relation, strict=True),
+        )
+        for subject_prefix, relation, object_prefix in itt.product(
+            subject_prefixes, relations, object_prefixes
+        )
+    ]
+
+    for mapping in msdf.df.itertuples(index=False):
+        # FIXME strict parsing of every mapping (see the note above)
+        group = SSSOMSplitGroup(
+            parse_curie(getattr(mapping, SUBJECT_ID), strict=True).prefix,
+            parse_curie(getattr(mapping, OBJECT_ID), strict=True).prefix,
+            parse_curie(getattr(mapping, PREDICATE_ID), strict=True),
+        )
+        mappings_by_group[group].append(mapping)
+
+    for group in expected_split_groups:
+        split = _get_split_key(
+            group.subject_prefix, group.relation_tup.identifier, group.object_prefix
+        )
+
+        if group.subject_prefix not in registered_prefixes:
+            logging.warning(f"{split} - missing subject prefix - {group.subject_prefix}")
+            continue
+        if group.object_prefix not in registered_prefixes:
+            logging.warning(f"{split} - missing object prefix - {group.object_prefix}")
+            continue
+
+        # .get() keeps the defaultdict from creating an empty bucket here
+        mappings = mappings_by_group.get(group)
+        if mappings is None:
+            logging.debug(f"{split} - No matches found")
+            continue
+
+        subconverter = converter.get_subconverter(
+            [group.subject_prefix, group.object_prefix, group.relation_tup.prefix]
+        )
+        split_to_msdf[split] = from_sssom_dataframe(
+            pd.DataFrame(mappings), prefix_map=dict(subconverter.bimap), meta=meta
+        )
+
+    return split_to_msdf
+
+
 def _split_dataframe_by_prefix_old(
     msdf: MappingSetDataFrame,
     subject_prefixes: Iterable[str],