diff --git a/src/sssom/sexpr.py b/src/sssom/sexpr.py
new file mode 100644
index 00000000..8b884011
--- /dev/null
+++ b/src/sssom/sexpr.py
@@ -0,0 +1,74 @@
+"""Generate canonical s-expressions and mapping hashes."""
+
+import hashlib
+
+import curies
+import zbase32
+
+from sssom import Mapping
+from sssom.constants import _get_sssom_schema_object
+
+__all__ = [
+    "get_mapping_hash",
+    "to_sexpr",
+]
+
+
+def get_mapping_hash(mapping: Mapping, converter: curies.Converter) -> str:
+    """Hash the mapping by converting to canonical s-expression, sha256 hashing, then zbase32 encoding."""
+    digest = hashlib.sha256(to_sexpr(mapping, converter).encode("utf-8")).digest()
+    return zbase32.encode(digest)
+
+
+SKIP_SLOTS = {"record_id", "mapping_cardinality"}
+
+
+def _should_expand(slot: str) -> bool:
+    """Check if the slot is one of the schema's entity reference slots."""
+    return slot in _get_sssom_schema_object().entity_reference_slots
+
+
+def _atom(text: str) -> str:
+    """Encode text as a length-prefixed atom, counting the length in UTF-8 bytes."""
+    # get_mapping_hash() hashes the UTF-8 encoding, so the prefix has to count bytes;
+    # counting characters would give a wrong length for non-ASCII text
+    return f"{len(text.encode('utf-8'))}:{text}"
+
+
+def to_sexpr(x: Mapping, converter: curies.Converter) -> str:
+    """Serialize a mapping as a canonical s-expression.
+
+    :param x: The mapping to serialize
+    :param converter: The converter applied to the values of entity reference slots
+    :returns: The s-expression, with slots in the order of the schema's mapping slots
+    :raises NotImplementedError: If a slot has a floating point value
+
+    Slots in ``SKIP_SLOTS`` and slots with a falsy value are left out.
+    """
+    # todo get canonical order
+    schema_object = _get_sssom_schema_object()
+    parts = ["(7:mapping("]
+    for slot in schema_object.mapping_slots:
+        if slot in SKIP_SLOTS:
+            continue
+        value = getattr(x, slot, None)
+        if not value:
+            continue
+        if isinstance(value, str):
+            if _should_expand(slot):
+                value = converter.expand_or_standardize(value, strict=True)
+            parts.append(f"({_atom(slot)}{_atom(value)})")
+        elif isinstance(value, float):
+            raise NotImplementedError(f"floating point values are not supported yet: {slot}")
+        elif isinstance(value, list):
+            # whether to expand depends only on the slot, so decide once per list
+            expand = _should_expand(slot)
+            parts.append(f"({_atom(slot)}(")
+            for element in value:
+                if expand:
+                    element = converter.expand_or_standardize(element, strict=True)
+                parts.append(_atom(element))
+            parts.append("))")
+        # NOTE(review): values of any other type are silently left out of the s-expression
+    parts.append("))")
+    return "".join(parts)
diff --git a/tests/data/sexpr_test.sssom.tsv b/tests/data/sexpr_test.sssom.tsv
new file mode 100644
index 00000000..8fb0d4ef
--- /dev/null
+++ b/tests/data/sexpr_test.sssom.tsv
@@ -0,0 +1,7 @@
+#curie_map:
+# FBbt: "http://purl.obolibrary.org/obo/FBbt_"
+# UBERON: "http://purl.obolibrary.org/obo/UBERON_"
+# 
sssom.record: "https://example.org/sssom.record/"
+# orcid: "https://orcid.org/"
+record_id subject_id predicate_id object_id mapping_justification creator_id reviewer_id author_label mapping_tool_version similarity_score comment license author_id mapping_tool object_type predicate_modifier reviewer_label issue_tracker_item subject_source object_match_field mapping_provider subject_label object_category subject_source_version subject_preprocessing subject_category object_label mapping_source predicate_label curation_rule_text similarity_measure see_also publication_date mapping_date other object_source mapping_cardinality subject_type confidence subject_match_field curation_rule object_source_version object_preprocessing match_string creator_label
+sssom.record:hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty FBbt:00001234 skos:exactMatch UBERON:0005678 semapv:ManualMappingCuration orcid:0000-0000-1234-5678|orcid:0000-0000-5678-1234 ventral abdominal es5 exact match someone
diff --git a/tests/test_sexpr.py b/tests/test_sexpr.py
new file mode 100644
index 00000000..8445fac9
--- /dev/null
+++ b/tests/test_sexpr.py
@@ -0,0 +1,109 @@
+"""Test s-expressions."""
+
+import re
+import unittest
+from pathlib import Path
+
+import pandas as pd
+from curies import Converter
+
+import sssom.io
+from sssom import Mapping
+from sssom.constants import SSSOMSchemaView
+from sssom.sexpr import get_mapping_hash, to_sexpr
+
+HERE = Path(__file__).parent.resolve()
+PATH = HERE.joinpath("data", "sexpr_test.sssom.tsv")
+
+
+class TestSExpressions(unittest.TestCase):
+    """Test creation of canonical S-expressions."""
+
+    def test_explicit_example(self) -> None:
+        """Test a hard-coded example, explicit in the code."""
+        converter = Converter.from_prefix_map(
+            {
+                "FBbt": "http://purl.obolibrary.org/obo/FBbt_",
+                "UBERON": "http://purl.obolibrary.org/obo/UBERON_",
+                "orcid": "https://orcid.org/",
+                "semapv": "https://w3id.org/semapv/vocab/",
+                "skos": "http://www.w3.org/2004/02/skos/core#",
+            }
+        )
+        sexpr = """
+            (7:mapping(
+                (10:subject_id44:http://purl.obolibrary.org/obo/FBbt_00001234)
+                (12:predicate_id46:http://www.w3.org/2004/02/skos/core#exactMatch)
+                (9:object_id45:http://purl.obolibrary.org/obo/UBERON_0005678)
+                (21:mapping_justification51:https://w3id.org/semapv/vocab/ManualMappingCuration)
+                (10:creator_id(
+                    37:https://orcid.org/0000-0000-1234-5678
+                    37:https://orcid.org/0000-0000-5678-1234
+                ))
+            ))
+        """
+        mapping = Mapping(
+            subject_id="http://purl.obolibrary.org/obo/FBbt_00001234",
+            predicate_id="http://www.w3.org/2004/02/skos/core#exactMatch",
+            object_id="http://purl.obolibrary.org/obo/UBERON_0005678",
+            mapping_justification="https://w3id.org/semapv/vocab/ManualMappingCuration",
+            creator_id=[
+                "https://orcid.org/0000-0000-1234-5678",
+                "https://orcid.org/0000-0000-5678-1234",
+            ],
+        )
+        self.assertEqual(re.sub(r"\s", "", sexpr), to_sexpr(mapping, converter))
+        self.assertEqual(
+            "hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty",
+            get_mapping_hash(mapping, converter),
+        )
+
+    def test_test_completion(self) -> None:
+        """Test that the example file is complete over the whole SSSOM schema."""
+        view = SSSOMSchemaView()
+
+        # read every column as text so numeric-looking cells can be checked for pipes
+        df = pd.read_csv(PATH, sep="\t", comment="#", dtype=str)
+        missing = set(view.mapping_slots).difference(df.columns)
+        if missing:
+            msg = "\n".join(sorted(missing))
+            self.fail(msg=f"comprehensive testing file is missing slots:\n{msg}")
+
+        for slot in view.mapping_slots:
+            with self.subTest(slot=slot):
+                # empty cells are parsed as NaN, which is not a string, so drop them
+                values = df[slot].dropna().unique()
+                self.assertGreater(
+                    len(values), 0, msg=f"there is no row that has a value for: {slot}"
+                )
+
+                if slot in view.multivalued_slots:
+                    self.assertTrue(
+                        any("|" in value for value in values),
+                        msg=f"missing a multi-valued example for slot: {slot}",
+                    )
+                    self.assertTrue(
+                        any("|" not in value for value in values),
+                        msg=f"missing a single valued example for slot: {slot}",
+                    )
+                else:
+                    self.assertFalse(
+                        any("|" in value for value in values),
+                        msg=f"should not have a pipe delimiter in single valued slot: {slot}",
+                    )
+
+    def test_all(self) -> None:
+        """Test that each row's record ID matches the hash computed from its mapping."""
+        msdf = sssom.parse_tsv(PATH)
+        mappings = list(msdf.to_mappings())
+
+        # After new SSSOM schema release, this will be part of the mapping data model
+        record_ids = pd.read_csv(PATH, sep="\t", comment="#", dtype=str)["record_id"]
+        # zip() would silently ignore trailing rows if the two readers disagreed
+        self.assertEqual(len(record_ids), len(mappings))
+        for record_id, mapping in zip(record_ids, mappings):
+            self.assertEqual(
+                record_id.removeprefix("sssom.record:"),
+                get_mapping_hash(mapping, msdf.converter),
+                msg=to_sexpr(mapping, msdf.converter),
+            )