from __future__ import annotations

import json
import logging
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Any

from vicinity.backends import BasicVectorStore, get_backend_class
from vicinity.datatypes import Backend

if TYPE_CHECKING:
    from huggingface_hub import CommitInfo

    from vicinity.vicinity import Vicinity

_HUB_IMPORT_ERROR = ImportError(
    "`datasets` and `huggingface_hub` are required to push to the Hugging Face Hub. Please install them with `pip install 'vicinity[huggingface]'`"
)
_MODEL_NAME_OR_PATH_MESSAGE = (
    "Embeddings in Vicinity instance were created from model name or path: {model_name_or_path}"
)

logger = logging.getLogger(__name__)


class HuggingFaceMixin:
    def push_to_hub(
        self,
        model_name_or_path: str,
        repo_id: str,
        token: str | None = None,
        private: bool = False,
        **kwargs: Any,
    ) -> "CommitInfo":
        """
        Push the Vicinity instance to the Hugging Face Hub.

        :param model_name_or_path: The name of the model or the path to the local directory
            that was used to create the embeddings in the Vicinity instance.
        :param repo_id: The repository ID on the Hugging Face Hub.
        :param token: Optional authentication token for private repositories.
        :param private: Whether to create a private repository.
        :param kwargs: Additional keyword arguments passed to Dataset.push_to_hub().
        :return: The commit info of the final upload (the dataset card push).
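
        A minimal usage sketch (the repo id and model name below are
        placeholders for your own values)::

            vicinity.push_to_hub(
                model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
                repo_id="user/my-vicinity-index",
            )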
        """
        try:
            from datasets import Dataset
            from huggingface_hub import DatasetCard, upload_file, upload_folder
        except ImportError:
            raise _HUB_IMPORT_ERROR

        # Create and push dataset with items and vectors
        if isinstance(self.items[0], dict):
            dataset_dict = {k: [item[k] for item in self.items] for k in self.items[0].keys()}
        else:
            dataset_dict = {"items": self.items}
        if self.vector_store is not None:
            dataset_dict["vectors"] = self.vector_store.vectors
        dataset = Dataset.from_dict(dataset_dict)
        dataset.push_to_hub(repo_id, token=token, private=private, **kwargs)

        # Save backend and config files to temp directory and upload
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Save and upload backend
            self.backend.save(temp_path)
            upload_folder(
                repo_id=repo_id,
                folder_path=temp_path,
                token=token,
                repo_type="dataset",
                path_in_repo="backend",
            )

            # Save and upload config
            config = {
                "metadata": self.metadata,
                "backend_type": self.backend.backend_type.value,
                "model_name_or_path": model_name_or_path,
            }
            config_path = temp_path / "config.json"
            config_path.write_text(json.dumps(config))
            upload_file(
                repo_id=repo_id,
                path_or_fileobj=config_path,
                token=token,
                repo_type="dataset",
                path_in_repo="config.json",
            )
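
            # At this point the dataset repo holds the pushed data files plus
            # `backend/` (the serialized index) and `config.json` (metadata,
            # backend type, and source model name), which load_from_hub reads back.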

        # Load the dataset card template that ships alongside this module
        template_path = Path(__file__).parent / "dataset_card_template.md"
        template = template_path.read_text()
        content = template.format(repo_id=repo_id, num_items=len(self.items), config=json.dumps(config, indent=4))
        return DatasetCard(content=content).push_to_hub(repo_id=repo_id, token=token, repo_type="dataset")

    @classmethod
    def load_from_hub(cls, repo_id: str, token: str | None = None, **kwargs: Any) -> "Vicinity":
        """
        Load a Vicinity instance from the Hugging Face Hub.

        :param repo_id: The repository ID on the Hugging Face Hub.
        :param token: Optional authentication token for private repositories.
        :param kwargs: Additional keyword arguments passed to load_dataset.
        :return: A Vicinity instance loaded from the Hub.
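
        Usage sketch (placeholder repo id)::

            vicinity = Vicinity.load_from_hub("user/my-vicinity-index")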
        """
        try:
            from datasets import load_dataset
            from huggingface_hub import snapshot_download
        except ImportError:
            raise _HUB_IMPORT_ERROR

        # Load dataset and extract items and vectors
        dataset = load_dataset(repo_id, token=token, split="train", **kwargs)
        if "items" in dataset.column_names:
            items = dataset["items"]
        else:
            # Create items from all columns except 'vectors', materializing each
            # column once instead of re-reading the full column for every row
            columns = [col for col in dataset.column_names if col != "vectors"]
            column_data = {col: dataset[col] for col in columns}
            items = [{col: column_data[col][i] for col in columns} for i in range(len(dataset))]
        has_vectors = "vectors" in dataset.column_names
        vector_store = BasicVectorStore(vectors=dataset["vectors"]) if has_vectors else None

        # Download and load config and backend
        repo_path = Path(snapshot_download(repo_id=repo_id, token=token, repo_type="dataset"))
        with open(repo_path / "config.json") as f:
            config = json.load(f)
        model_name_or_path = config.pop("model_name_or_path")

        logger.info(_MODEL_NAME_OR_PATH_MESSAGE.format(model_name_or_path=model_name_or_path))
        backend_type = Backend(config["backend_type"])
        backend = get_backend_class(backend_type).load(repo_path / "backend")

        return cls(items=items, backend=backend, metadata=config["metadata"], vector_store=vector_store)