|
5 | 5 | from frozendict import frozendict |
6 | 6 |
|
7 | 7 | from semhash.datamodels import DeduplicationResult, DuplicateRecord |
8 | | -from semhash.utils import DatasetLike, Record, coerce_value, to_frozendict |
| 8 | +from semhash.utils import Record, coerce_value, to_frozendict |
9 | 9 |
|
10 | 10 |
|
11 | 11 | def group_records_by_key( |
@@ -126,69 +126,6 @@ def prepare_records( |
126 | 126 | return dict_records, columns, was_string |
127 | 127 |
|
128 | 128 |
|
129 | | -def _validate_dataset(dataset: DatasetLike, columns: Sequence[str]) -> tuple[dict[str, Sequence[Any]], int]: |
130 | | - """Validate dataset structure and extract columns.""" |
131 | | - try: |
132 | | - column_names = dataset.column_names |
133 | | - except AttributeError as e: |
134 | | - raise TypeError("dataset must satisfy DatasetLike (column_names, __len__, __getitem__)") from e |
135 | | - |
136 | | - missing = set(columns) - set(column_names) |
137 | | - if missing: |
138 | | - raise ValueError(f"Columns {missing} not found in dataset") |
139 | | - |
140 | | - n = len(dataset) |
141 | | - if n == 0: |
142 | | - raise ValueError("dataset must not be empty") |
143 | | - |
144 | | - cols = {c: dataset[c] for c in columns} |
145 | | - for c in columns: |
146 | | - if len(cols[c]) != n: |
147 | | - raise ValueError(f"Column '{c}' length ({len(cols[c])}) does not match dataset length ({n})") |
148 | | - |
149 | | - return cols, n |
150 | | - |
151 | | - |
152 | | -def prepare_dataset_records( |
153 | | - dataset: DatasetLike, |
154 | | - columns: Sequence[str], |
155 | | -) -> tuple[list[dict[str, Any]], list[list[dict[str, Any]]], bool]: |
156 | | - """ |
157 | | - Extract, validate, and exact-deduplicate dataset rows using columnar access. |
158 | | -
|
159 | | - :param dataset: A dataset-like object with columnar access. |
160 | | - :param columns: Columns to use for deduplication. |
161 | | - :return: Tuple of (deduplicated_records, items, was_string) where: |
162 | | - - deduplicated_records: representative record per exact-duplicate bucket |
163 | | - - items: buckets of exact duplicates (each bucket is list[record]) |
164 | | - - was_string: True iff columns == ["text"] and ALL raw values were strings |
165 | | - """ |
166 | | - cols, n = _validate_dataset(dataset, columns) |
167 | | - |
168 | | - # was_string controls whether deduplicate() returns strings or dicts. |
169 | | - # We only return strings if: (1) single column named "text", AND (2) all raw |
170 | | - # values in the dataset are actual strings (not integers/floats coerced to str). |
171 | | - was_string = len(columns) == 1 and columns[0] == "text" |
172 | | - |
173 | | - def validate_and_coerce(raw: Any, *, col: str, idx: int) -> Any: |
174 | | - """Validate value is not None, then coerce for encoding.""" |
175 | | - if raw is None: |
176 | | - raise ValueError(f"Column '{col}' has None at index {idx}") |
177 | | - return coerce_value(raw) |
178 | | - |
179 | | - # Build all records while tracking was_string |
180 | | - records: list[dict[str, Any]] = [] |
181 | | - for i in range(n): |
182 | | - if was_string and not isinstance(cols["text"][i], str): |
183 | | - was_string = False |
184 | | - records.append({c: validate_and_coerce(cols[c][i], col=c, idx=i) for c in columns}) |
185 | | - |
186 | | - # Group by exact match, preserving first-occurrence order |
187 | | - deduplicated_records, items = group_records_by_key(records, columns) |
188 | | - |
189 | | - return deduplicated_records, items, was_string |
190 | | - |
191 | | - |
192 | 129 | def dict_to_string(record: dict[str, str], columns: Sequence[str]) -> str: |
193 | 130 | r""" |
194 | 131 | Turn a record into a single string. |
|
0 commit comments