Skip to content

Commit e113ac8

Browse files
committed
Generalized hashing functions to support complex types
1 parent b101fdb commit e113ac8

2 files changed

Lines changed: 85 additions & 45 deletions

File tree

semhash/semhash.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
from collections import defaultdict
43
from collections.abc import Sequence
54
from math import ceil
65
from typing import Any, Generic, Literal
@@ -18,6 +17,7 @@
1817
DatasetLike,
1918
Encoder,
2019
Record,
20+
coerce_value,
2121
compute_candidate_limit,
2222
featurize,
2323
group_records_by_key,
@@ -373,12 +373,12 @@ def self_deduplicate(
373373

374374
return result
375375

376-
def _validate_if_strings(self, records: Sequence[dict[str, Any] | str]) -> list[dict[str, str]]:
376+
def _validate_if_strings(self, records: Sequence[dict[str, Any] | str]) -> list[dict[str, Any]]:
377377
"""
378378
Validate if the records are strings.
379379
380380
If the records are strings, they are converted to dictionaries with a single column.
381-
If the records are dicts, values are coerced to strings and None is rejected.
381+
If the records are dicts, primitives are stringified and complex types (images, etc.) are kept raw.
382382
383383
:param records: The records to validate.
384384
:return: The records as a list of dictionaries.
@@ -396,25 +396,22 @@ def _validate_if_strings(self, records: Sequence[dict[str, Any] | str]) -> list[
396396
raise ValueError("Records were not originally strings, but you passed strings.")
397397
if not all(isinstance(r, str) for r in records):
398398
raise ValueError("Records must be all strings.")
399-
# Type narrowing: we've validated all are strings
400-
return [{"text": str(r)} for r in records]
399+
return [{"text": r} for r in records]
401400

402-
# Dict path - coerce values to strings (matching prepare_records behavior)
401+
# Dict path
403402
if not all(isinstance(r, dict) for r in records):
404403
raise ValueError("Records must be all dictionaries.")
405404

406-
# Type narrowing: we've validated all are dicts
407405
dict_records: Sequence[dict[str, Any]] = records # type: ignore[assignment]
408-
coerced: list[dict[str, str]] = []
406+
result: list[dict[str, Any]] = []
409407
for r in dict_records:
410-
out: dict[str, str] = {}
408+
out = {}
411409
for c in self.columns:
412-
val = r.get(c)
413-
if val is None:
410+
if (val := r.get(c)) is None:
414411
raise ValueError(f"Column '{c}' has None value in record {r}")
415-
out[c] = val if isinstance(val, str) else str(val)
416-
coerced.append(out)
417-
return coerced
412+
out[c] = coerce_value(val)
413+
result.append(out)
414+
return result
418415

419416
def find_representative(
420417
self,

semhash/utils.py

Lines changed: 74 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import hashlib
12
from collections import defaultdict
23
from collections.abc import Sequence
34
from typing import Any, Protocol, TypeAlias, TypeVar
@@ -11,19 +12,19 @@
1112

1213

1314
class Encoder(Protocol):
14-
"""An encoder protocol for SemHash."""
15+
"""An encoder protocol for SemHash. Supports text, images, or any encodable data."""
1516

1617
def encode(
1718
self,
18-
sentences: list[str] | str | Sequence[str],
19+
inputs: Sequence[Any] | Any,
1920
**kwargs: Any,
2021
) -> np.ndarray:
2122
"""
22-
Encode a list of sentences into embeddings.
23+
Encode a list of inputs into embeddings.
2324
24-
:param sentences: A list of sentences to encode.
25+
:param inputs: A list of inputs to encode (strings, images, etc.).
2526
:param **kwargs: Additional keyword arguments.
26-
:return: The embeddings of the sentences.
27+
:return: The embeddings of the inputs.
2728
"""
2829
... # pragma: no cover
2930

@@ -48,26 +49,67 @@ def __getitem__(self, key: str) -> Sequence[Any]:
4849
... # pragma: no cover
4950

5051

51-
def to_frozendict(record: dict[str, str], columns: Sequence[str] | set[str]) -> frozendict[str, str]:
52+
def make_hashable(value: Any) -> Any:
5253
"""
53-
Convert a record to a frozendict.
54+
Convert a value to a hashable representation for use as dict keys.
55+
56+
Strings and other hashable types are returned as-is.
57+
Non-hashable types (like PIL images, numpy arrays) are hashed to a string.
58+
59+
:param value: The value to make hashable.
60+
:return: A hashable representation of the value.
61+
"""
62+
# Fast path: most values are strings or already hashable
63+
if isinstance(value, (str, int, float, bool, type(None))):
64+
return value
65+
# Handle objects with tobytes() (PIL Image, numpy array, etc.)
66+
if hasattr(value, "tobytes"):
67+
return hashlib.md5(value.tobytes()).hexdigest()
68+
# Fallback: try to hash, otherwise stringify
69+
try:
70+
hash(value)
71+
return value
72+
except TypeError:
73+
return str(value)
74+
75+
76+
def coerce_value(value: Any) -> Any:
77+
"""
78+
Coerce a value for encoding: stringify primitives, keep complex types raw.
79+
80+
This ensures primitives (int, float, bool) work with text encoders,
81+
while complex types (PIL images, tensors, etc.) are passed through for multimodal encoders.
82+
83+
:param value: The value to coerce.
84+
:return: The coerced value.
85+
"""
86+
if isinstance(value, (str, bytes)):
87+
return value
88+
if isinstance(value, (int, float, bool)):
89+
return str(value)
90+
return value # Complex types (images, tensors, etc.)
91+
92+
93+
def to_frozendict(record: dict[str, Any], columns: Sequence[str] | set[str]) -> frozendict[str, Any]:
94+
"""
95+
Convert a record to a frozendict with hashable values.
5496
5597
:param record: The record to convert.
5698
:param columns: The columns to include.
57-
:return: A frozendict with only the specified columns.
99+
:return: A frozendict with only the specified columns (values made hashable).
58100
:raises ValueError: If a column is missing from the record.
59101
"""
60102
try:
61-
return frozendict({k: record[k] for k in columns})
103+
return frozendict({k: make_hashable(record[k]) for k in columns})
62104
except KeyError as e:
63105
missing = e.args[0]
64106
raise ValueError(f"Missing column '{missing}' in record {record}") from e
65107

66108

67109
def group_records_by_key(
68-
records: Sequence[dict[str, str]],
110+
records: Sequence[dict[str, Any]],
69111
columns: Sequence[str],
70-
) -> tuple[list[dict[str, str]], list[list[dict[str, str]]]]:
112+
) -> tuple[list[dict[str, Any]], list[list[dict[str, Any]]]]:
71113
"""
72114
Group records by exact match on columns, preserving first-occurrence order.
73115
@@ -77,8 +119,8 @@ def group_records_by_key(
77119
- deduplicated_records: first record from each unique group
78120
- items: list of groups, each group is a list of exact duplicates
79121
"""
80-
buckets: dict[frozendict[str, str], list[dict[str, str]]] = {}
81-
order: list[frozendict[str, str]] = []
122+
buckets: dict[frozendict[str, Any], list[dict[str, Any]]] = {}
123+
order: list[frozendict[str, Any]] = []
82124

83125
for r in records:
84126
key = to_frozendict(r, columns)
@@ -123,7 +165,7 @@ def compute_candidate_limit(
123165

124166

125167
def featurize(
126-
records: Sequence[dict[str, str]],
168+
records: Sequence[dict[str, Any]],
127169
columns: Sequence[str],
128170
model: Encoder,
129171
) -> np.ndarray:
@@ -150,12 +192,12 @@ def featurize(
150192

151193

152194
def remove_exact_duplicates(
153-
records: Sequence[dict[str, str]],
195+
records: Sequence[dict[str, Any]],
154196
columns: Sequence[str],
155-
reference_records: list[list[dict[str, str]]] | None = None,
156-
) -> tuple[list[dict[str, str]], list[tuple[dict[str, str], list[dict[str, str]]]]]:
197+
reference_records: list[list[dict[str, Any]]] | None = None,
198+
) -> tuple[list[dict[str, Any]], list[tuple[dict[str, Any], list[dict[str, Any]]]]]:
157199
"""
158-
Remove exact duplicates based on the unpacked string representation of each record.
200+
Remove exact duplicates based on the hashable representation of each record.
159201
160202
If reference_records is None, the function will only check for duplicates within the records list.
161203
@@ -164,12 +206,12 @@ def remove_exact_duplicates(
164206
:param reference_records: A list of records to compare against. These are already unpacked
165207
:return: A list of deduplicated records and a list of duplicates.
166208
"""
167-
deduplicated = []
168-
duplicates = []
209+
deduplicated: list[dict[str, Any]] = []
210+
duplicates: list[tuple[dict[str, Any], list[dict[str, Any]]]] = []
169211

170212
column_set = set(columns)
171213
# Build a seen set from reference_records if provided
172-
seen: defaultdict[frozendict[str, str], list[dict[str, str]]] = defaultdict(list)
214+
seen: defaultdict[frozendict[str, Any], list[dict[str, Any]]] = defaultdict(list)
173215
if reference_records is not None:
174216
for record_set in reference_records:
175217
key = to_frozendict(record_set[0], column_set)
@@ -191,7 +233,7 @@ def remove_exact_duplicates(
191233

192234
def prepare_records(
193235
records: Sequence[Record], columns: Sequence[str] | None
194-
) -> tuple[list[dict[str, str]], Sequence[str], bool]:
236+
) -> tuple[list[dict[str, Any]], Sequence[str], bool]:
195237
"""
196238
Validate and prepare records for processing.
197239
@@ -214,23 +256,23 @@ def prepare_records(
214256
if not all(isinstance(r, str) for r in records):
215257
raise ValueError("All records must be strings when the first record is a string.")
216258
columns = ["text"]
217-
dict_records: list[dict[str, str]] = [{"text": str(record)} for record in records]
259+
dict_records: list[dict[str, Any]] = [{"text": record} for record in records]
218260
was_string = True
219261
else:
220262
# Validate all records are dicts
221263
if not all(isinstance(r, dict) for r in records):
222264
raise ValueError("All records must be dicts when the first record is a dict.")
223265
assert columns is not None
224-
# Coerce dict values to strings (matching dataset behavior)
266+
# Coerce values: stringify primitives, keep complex types raw (for images, etc.)
225267
dict_records_typed: list[dict[str, Any]] = list(records) # type: ignore[arg-type]
226268
dict_records = []
227269
for r in dict_records_typed:
228-
coerced = {}
270+
coerced: dict[str, Any] = {}
229271
for c in columns:
230272
val = r.get(c)
231273
if val is None:
232274
raise ValueError(f"Column '{c}' has None value in record {r}")
233-
coerced[c] = val if isinstance(val, str) else str(val)
275+
coerced[c] = coerce_value(val)
234276
dict_records.append(coerced)
235277
was_string = False
236278

@@ -263,7 +305,7 @@ def _validate_dataset(dataset: DatasetLike, columns: Sequence[str]) -> tuple[dic
263305
def prepare_dataset_records(
264306
dataset: DatasetLike,
265307
columns: Sequence[str],
266-
) -> tuple[list[dict[str, str]], list[list[dict[str, str]]], bool]:
308+
) -> tuple[list[dict[str, Any]], list[list[dict[str, Any]]], bool]:
267309
"""
268310
Extract, validate, and exact-deduplicate dataset rows using columnar access.
269311
@@ -286,17 +328,18 @@ def prepare_dataset_records(
286328
# values in the dataset are actual strings (not integers/floats coerced to str).
287329
was_string = len(columns) == 1 and columns[0] == "text"
288330

289-
def coerce(raw: Any, *, col: str, idx: int) -> str:
331+
def validate_and_coerce(raw: Any, *, col: str, idx: int) -> Any:
332+
"""Validate value is not None, then coerce for encoding."""
290333
if raw is None:
291334
raise ValueError(f"Column '{col}' has None at index {idx}")
292-
return raw if isinstance(raw, str) else str(raw)
335+
return coerce_value(raw)
293336

294337
# Build all records while tracking was_string
295-
records: list[dict[str, str]] = []
338+
records: list[dict[str, Any]] = []
296339
for i in range(n):
297340
if was_string and not isinstance(cols["text"][i], str):
298341
was_string = False
299-
records.append({c: coerce(cols[c][i], col=c, idx=i) for c in columns})
342+
records.append({c: validate_and_coerce(cols[c][i], col=c, idx=i) for c in columns})
300343

301344
# Group by exact match, preserving first-occurrence order
302345
deduplicated_records, items = group_records_by_key(records, columns)

0 commit comments

Comments (0)