Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 89 additions & 2 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@
import logging as _logging
import os.path
import typing
from collections import ChainMap, Counter
from collections import ChainMap, Counter, defaultdict
from pathlib import Path
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
NamedTuple,
Optional,
TextIO,
Tuple,
Expand Down Expand Up @@ -994,7 +995,7 @@ def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = No
return mapping_set


# Names of the strategies that can be passed as ``method`` when splitting a
# mapping set dataframe; "patrick" dispatches to ``_patrick_split``.
SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes"]
SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes", "patrick"]


def split_dataframe(
Expand Down Expand Up @@ -1051,6 +1052,13 @@ def split_dataframe_by_prefix(
object_prefixes=object_prefixes,
relations=relations,
)
if method == "patrick":
return _patrick_split(
msdf,
subject_prefixes=subject_prefixes,
object_prefixes=object_prefixes,
relations=relations,
)

predicates: List[ReferenceTuple] = []
for relation in relations:
Expand All @@ -1076,6 +1084,85 @@ def split_dataframe_by_prefix(
return rv


class SSSOMSplitGroup(NamedTuple):
    """Identify one bucket of mappings in a split MappingSetDataFrame.

    A bucket is characterized by the prefix of the subject, the prefix of the
    object, and the parsed predicate that its mappings have in common.
    """

    # prefix shared by the subjects of the mappings in the bucket
    subject_prefix: str
    # prefix shared by the objects of the mappings in the bucket
    object_prefix: str
    # parsed (prefix, identifier) reference of the bucket's predicate
    relation_tup: ReferenceTuple


def _patrick_split(
    msdf: MappingSetDataFrame,
    subject_prefixes: Iterable[str],
    object_prefixes: Iterable[str],
    relations: Iterable[str],
) -> Dict[str, MappingSetDataFrame]:
    """Split a mapping set dataframe by subject prefix, relation, and object prefix.

    Every row of ``msdf.df`` is visited once and bucketed by the prefix of its
    subject, the prefix of its object, and its parsed predicate. One output
    mapping set is then built for each requested combination of subject prefix,
    relation, and object prefix that has at least one row.

    Rows whose subject, predicate, or object can not be parsed by the converter
    are skipped instead of raising, and so are requested relations that can not
    be parsed (a warning is logged for the latter).

    :param msdf: An SSSOM MappingSetDataFrame
    :param subject_prefixes: a list of prefixes pertaining to the subject
    :param object_prefixes: a list of prefixes pertaining to the object
    :param relations: a list of relations of interest
    :return: a dict of SSSOM data frame names to MappingSetDataFrame
    """
    meta = msdf.metadata
    converter = msdf.converter
    parse_curie = converter.parse_curie

    # Resolve the requested relations up front. Non-strict parsing returns None
    # for a CURIE the converter can not handle, so a single bad relation does
    # not abort the whole split.
    predicates: List[ReferenceTuple] = []
    for relation in relations:
        predicate = parse_curie(relation, strict=False)
        if predicate is None:
            logging.warning("%s - relation could not be parsed, skipping", relation)
            continue
        predicates.append(predicate)

    expected_split_groups = [
        SSSOMSplitGroup(subject_prefix, object_prefix, predicate)
        for subject_prefix, predicate, object_prefix in itt.product(
            subject_prefixes, predicates, object_prefixes
        )
    ]
    if not expected_split_groups:
        return {}
    # Set for O(1) membership tests while scanning the rows.
    requested_groups = set(expected_split_groups)

    # Single pass over the dataframe: only rows that belong to a requested
    # group are retained, so unrelated mappings are not held in memory.
    mappings_by_group: Dict[SSSOMSplitGroup, List[Any]] = defaultdict(list)
    for mapping in msdf.df.itertuples(index=False):
        subject = parse_curie(getattr(mapping, SUBJECT_ID), strict=False)
        obj = parse_curie(getattr(mapping, OBJECT_ID), strict=False)
        predicate = parse_curie(getattr(mapping, PREDICATE_ID), strict=False)
        if subject is None or obj is None or predicate is None:
            # A prefix that is not registered with the converter: skip the row.
            continue
        group = SSSOMSplitGroup(subject.prefix, obj.prefix, predicate)
        if group in requested_groups:
            mappings_by_group[group].append(mapping)

    split_to_msdf: Dict[str, MappingSetDataFrame] = {}
    for group in expected_split_groups:
        split = _get_split_key(
            group.subject_prefix, group.relation_tup.identifier, group.object_prefix
        )

        if group.subject_prefix not in converter.bimap:
            logging.warning("%s - missing subject prefix - %s", split, group.subject_prefix)
            continue
        if group.object_prefix not in converter.bimap:
            logging.warning("%s - missing object prefix - %s", split, group.object_prefix)
            continue

        # .get() avoids creating an empty bucket in the defaultdict.
        mappings = mappings_by_group.get(group)
        if mappings is None:
            logging.debug("%s - No matches found", split)
            continue

        # Restrict the prefix map of the output to the prefixes of this group.
        subconverter = converter.get_subconverter(
            [group.subject_prefix, group.object_prefix, group.relation_tup.prefix]
        )
        split_to_msdf[split] = from_sssom_dataframe(
            pd.DataFrame(mappings), prefix_map=dict(subconverter.bimap), meta=meta
        )

    return split_to_msdf


def _split_dataframe_by_prefix_old(
msdf: MappingSetDataFrame,
subject_prefixes: Iterable[str],
Expand Down
Loading