Skip to content

Commit c64a72e

Browse files
committed
Removed from_dataset integration
1 parent e808922 commit c64a72e

9 files changed

Lines changed: 52 additions & 1317 deletions

File tree

pyproject.toml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,6 @@ dev = [
4343
"ruff",
4444
]
4545

46-
# Integrations
47-
datasets = ["datasets"]
48-
all = [
49-
"datasets",
50-
]
5146

5247
[project.urls]
5348
"Homepage" = "https://github.com/MinishLab"

semhash/records.py

Lines changed: 1 addition & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from frozendict import frozendict
66

77
from semhash.datamodels import DeduplicationResult, DuplicateRecord
8-
from semhash.utils import DatasetLike, Record, coerce_value, to_frozendict
8+
from semhash.utils import Record, coerce_value, to_frozendict
99

1010

1111
def group_records_by_key(
@@ -126,69 +126,6 @@ def prepare_records(
126126
return dict_records, columns, was_string
127127

128128

129-
def _validate_dataset(dataset: DatasetLike, columns: Sequence[str]) -> tuple[dict[str, Sequence[Any]], int]:
130-
"""Validate dataset structure and extract columns."""
131-
try:
132-
column_names = dataset.column_names
133-
except AttributeError as e:
134-
raise TypeError("dataset must satisfy DatasetLike (column_names, __len__, __getitem__)") from e
135-
136-
missing = set(columns) - set(column_names)
137-
if missing:
138-
raise ValueError(f"Columns {missing} not found in dataset")
139-
140-
n = len(dataset)
141-
if n == 0:
142-
raise ValueError("dataset must not be empty")
143-
144-
cols = {c: dataset[c] for c in columns}
145-
for c in columns:
146-
if len(cols[c]) != n:
147-
raise ValueError(f"Column '{c}' length ({len(cols[c])}) does not match dataset length ({n})")
148-
149-
return cols, n
150-
151-
152-
def prepare_dataset_records(
153-
dataset: DatasetLike,
154-
columns: Sequence[str],
155-
) -> tuple[list[dict[str, Any]], list[list[dict[str, Any]]], bool]:
156-
"""
157-
Extract, validate, and exact-deduplicate dataset rows using columnar access.
158-
159-
:param dataset: A dataset-like object with columnar access.
160-
:param columns: Columns to use for deduplication.
161-
:return: Tuple of (deduplicated_records, items, was_string) where:
162-
- deduplicated_records: representative record per exact-duplicate bucket
163-
- items: buckets of exact duplicates (each bucket is list[record])
164-
- was_string: True iff columns == ["text"] and ALL raw values were strings
165-
"""
166-
cols, n = _validate_dataset(dataset, columns)
167-
168-
# was_string controls whether deduplicate() returns strings or dicts.
169-
# We only return strings if: (1) single column named "text", AND (2) all raw
170-
# values in the dataset are actual strings (not integers/floats coerced to str).
171-
was_string = len(columns) == 1 and columns[0] == "text"
172-
173-
def validate_and_coerce(raw: Any, *, col: str, idx: int) -> Any:
174-
"""Validate value is not None, then coerce for encoding."""
175-
if raw is None:
176-
raise ValueError(f"Column '{col}' has None at index {idx}")
177-
return coerce_value(raw)
178-
179-
# Build all records while tracking was_string
180-
records: list[dict[str, Any]] = []
181-
for i in range(n):
182-
if was_string and not isinstance(cols["text"][i], str):
183-
was_string = False
184-
records.append({c: validate_and_coerce(cols[c][i], col=c, idx=i) for c in columns})
185-
186-
# Group by exact match, preserving first-occurrence order
187-
deduplicated_records, items = group_records_by_key(records, columns)
188-
189-
return deduplicated_records, items, was_string
190-
191-
192129
def dict_to_string(record: dict[str, str], columns: Sequence[str]) -> str:
193130
r"""
194131
Turn a record into a single string.

semhash/semhash.py

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,10 @@
1616
add_scores_to_records,
1717
group_records_by_key,
1818
map_deduplication_result_to_strings,
19-
prepare_dataset_records,
2019
prepare_records,
2120
remove_exact_duplicates,
2221
)
2322
from semhash.utils import (
24-
DatasetLike,
2523
Encoder,
2624
Record,
2725
coerce_value,
@@ -84,41 +82,6 @@ def from_records(
8482
index = Index.from_vectors_and_items(vectors=embeddings, items=items, backend_type=ann_backend, **kwargs)
8583
return cls(index=index, model=model, columns=columns, was_string=was_string)
8684

87-
@classmethod
88-
def from_dataset(
89-
cls,
90-
dataset: DatasetLike,
91-
columns: Sequence[str],
92-
model: Encoder | None = None,
93-
ann_backend: Backend | str = Backend.USEARCH,
94-
**kwargs: Any,
95-
) -> SemHash:
96-
"""
97-
Initialize SemHash from a dataset (e.g., HuggingFace Dataset).
98-
99-
Removes exact duplicates, featurizes the records, and fits a vicinity index.
100-
Supports any dataset-like object that follows the DatasetLike protocol.
101-
102-
:param dataset: A dataset-like object with columnar access.
103-
:param columns: Columns to use for deduplication (same as from_records).
104-
:param model: (Optional) An Encoder model. If None, the default model is used (minishlab/potion-base-8M).
105-
:param ann_backend: (Optional) The ANN backend to use. Defaults to Backend.USEARCH.
106-
:param **kwargs: Any additional keyword arguments to pass to the Vicinity index.
107-
:return: A SemHash instance with a fitted vicinity index.
108-
"""
109-
# Load default model if needed
110-
if model is None: # pragma: no cover
111-
model = StaticModel.from_pretrained("minishlab/potion-base-8M")
112-
113-
# Extract, validate, and deduplicate dataset records
114-
deduplicated_records, items, was_string = prepare_dataset_records(dataset, columns)
115-
116-
# Create embeddings for deduplicated records only
117-
vectors = featurize(records=deduplicated_records, columns=columns, model=model)
118-
119-
index = Index.from_vectors_and_items(vectors=vectors, items=items, backend_type=ann_backend, **kwargs)
120-
return cls(index=index, model=model, columns=columns, was_string=was_string)
121-
12285
@classmethod
12386
def from_embeddings(
12487
cls,

semhash/utils.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,26 +28,6 @@ def encode(
2828
... # pragma: no cover
2929

3030

31-
class DatasetLike(Protocol):
32-
"""
33-
Protocol for dataset-like objects compatible with SemHash.from_dataset().
34-
35-
Any object that provides columnar access (dataset[column_name] -> sequence)
36-
satisfies this protocol. HuggingFace datasets.Dataset is the primary example,
37-
but custom dataset implementations are supported.
38-
"""
39-
40-
column_names: Sequence[str]
41-
42-
def __len__(self) -> int:
43-
"""Return the number of rows in the dataset."""
44-
... # pragma: no cover
45-
46-
def __getitem__(self, key: str) -> Sequence[Any]:
47-
"""Return all values for the given column name."""
48-
... # pragma: no cover
49-
50-
5131
def make_hashable(value: Any) -> Any:
5232
"""
5333
Convert a value to a hashable representation for use as dict keys.

semhash/version.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
__version_triple__ = (0, 3, 3)
2-
__version__ = ".".join(map(str, __version_triple__))
1+
__version_triple__ = (0, 3, 3) # pragma: no cover
2+
__version__ = ".".join(map(str, __version_triple__)) # pragma: no cover

tests/test_from_dataset.py

Lines changed: 0 additions & 110 deletions
This file was deleted.

tests/test_semhash.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -279,37 +279,6 @@ def test_from_embeddings(model: Encoder, train_texts: list[str]) -> None:
279279
assert semhash.index.vectors.tolist() == [[0.0], [1.0], [3.0]]
280280

281281

282-
def test_from_dataset_with_custom_dataset_like(model: Encoder) -> None:
283-
"""Test that from_dataset works with custom DatasetLike implementations (no HF dependency)."""
284-
285-
class MiniDataset:
286-
"""Minimal DatasetLike implementation for testing."""
287-
288-
column_names = ["text"]
289-
290-
def __init__(self, data: dict[str, list[str]]) -> None:
291-
self._data = data
292-
293-
def __len__(self) -> int:
294-
return len(self._data["text"])
295-
296-
def __getitem__(self, key: str) -> list[str]:
297-
return self._data[key]
298-
299-
# Create custom dataset with duplicates
300-
ds = MiniDataset({"text": ["apple", "banana", "apple"]})
301-
302-
semhash = SemHash.from_dataset(ds, columns=["text"], model=model)
303-
304-
# Should have deduplicated to 2 unique items
305-
assert len(semhash.index.items) == 2
306-
assert len(semhash.index.vectors) == 2
307-
308-
# Should work with deduplication
309-
result = semhash.self_deduplicate(threshold=0.95)
310-
assert len(result.selected) == 2
311-
312-
313282
def test_from_records_edge_cases(model: Encoder) -> None:
314283
"""Test from_records edge cases: coercion, order preservation, None rejection."""
315284
# Coerces non-string dict values to strings

tests/test_utils.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,54 @@
33
from frozendict import frozendict
44

55
from semhash.records import prepare_records, remove_exact_duplicates
6-
from semhash.utils import Encoder, compute_candidate_limit, featurize, to_frozendict
6+
from semhash.utils import Encoder, coerce_value, compute_candidate_limit, featurize, make_hashable, to_frozendict
7+
8+
9+
def test_make_hashable() -> None:
10+
"""Test make_hashable with various types."""
11+
# Fast path: primitives
12+
assert make_hashable("hello") == "hello"
13+
assert make_hashable(42) == 42
14+
assert make_hashable(3.14) == 3.14
15+
assert make_hashable(True) is True
16+
assert make_hashable(None) is None
17+
18+
# Objects with tobytes() (simulate PIL Image or numpy array)
19+
class MockImage:
20+
def tobytes(self) -> bytes:
21+
return b"fake_image_data"
22+
23+
img = MockImage()
24+
result = make_hashable(img)
25+
assert isinstance(result, str)
26+
assert len(result) == 32 # MD5 hex digest
27+
28+
# Hashable objects (like tuples)
29+
assert make_hashable((1, 2, 3)) == (1, 2, 3)
30+
31+
# Non-hashable fallback to string
32+
unhashable = {"key": "value"}
33+
result = make_hashable(unhashable)
34+
assert result == "{'key': 'value'}"
35+
36+
37+
def test_coerce_value() -> None:
38+
"""Test coerce_value for encoding preparation."""
39+
# Strings and bytes pass through
40+
assert coerce_value("hello") == "hello"
41+
assert coerce_value(b"bytes") == b"bytes"
42+
43+
# Primitives converted to strings
44+
assert coerce_value(42) == "42"
45+
assert coerce_value(3.14) == "3.14"
46+
assert coerce_value(True) == "True"
47+
48+
# Complex types pass through unchanged
49+
class MockImage:
50+
pass
51+
52+
img = MockImage()
53+
assert coerce_value(img) is img
754

855

956
def test_to_frozendict() -> None:

0 commit comments

Comments (0)