Skip to content

Commit 3321deb

Browse files
authored
[DE-7784] Migrate sdk image deduplication to async mode (#459)
* Add support for async dedup * Update sphinx package versions * Update sdk version to 0.18.1 * Remove support for sync dedup in sdk * Update CHANGELOG * Address greptile
1 parent ebcf260 commit 3321deb

9 files changed

Lines changed: 590 additions & 235 deletions

File tree

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,14 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.18.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.1) - 2026-05-05
9+
10+
### Changed
11+
- `Dataset.deduplicate()` and `Dataset.deduplicate_by_ids()` now run asynchronously and return a `DeduplicationJob` instead of returning a `DeduplicationResult` directly. Call `job.result()` to wait for completion and retrieve the result.
12+
13+
### Removed
14+
- Synchronous deduplication support for `Dataset.deduplicate()` and `Dataset.deduplicate_by_ids()`; both methods are now asynchronous only.
15+
816
## [0.18.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.0) - 2026-04-29
917

1018
### Removed

nucleus/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"AsyncJob",
55
"EmbeddingsExportJob",
66
"BoxAnnotation",
7+
"DeduplicationJob",
78
"DeduplicationResult",
89
"DeduplicationStats",
910
"BoxPrediction",
@@ -131,7 +132,11 @@
131132
from .data_transfer_object.job_status import JobInfoRequestPayload
132133
from .dataset import Dataset
133134
from .dataset_item import DatasetItem
134-
from .deduplication import DeduplicationResult, DeduplicationStats
135+
from .deduplication import (
136+
DeduplicationJob,
137+
DeduplicationResult,
138+
DeduplicationStats,
139+
)
135140
from .deprecation_warning import deprecated
136141
from .errors import (
137142
DatasetItemRetrievalError,

nucleus/dataset.py

Lines changed: 15 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@
8484
check_items_have_dimensions,
8585
)
8686
from .dataset_item_uploader import DatasetItemUploader
87-
from .deduplication import DeduplicationResult, DeduplicationStats
87+
from .deduplication import DeduplicationJob
8888
from .deprecation_warning import deprecated
8989
from .errors import NotFoundError, NucleusAPIError
9090
from .job import CustomerJobTypes, jobs_status_overview
@@ -1016,7 +1016,7 @@ def deduplicate(
10161016
self,
10171017
threshold: int,
10181018
reference_ids: Optional[List[str]] = None,
1019-
) -> DeduplicationResult:
1019+
) -> DeduplicationJob:
10201020
"""Deduplicate images or frames using user-defined reference IDs.
10211021
10221022
This method can deduplicate an entire dataset (when reference_ids is omitted)
@@ -1029,6 +1029,7 @@ def deduplicate(
10291029
not the scenes themselves. Frame reference IDs or dataset item IDs
10301030
should be provided for scene datasets.
10311031
- For very large datasets, this operation may take significant time.
1032+
This operation runs asynchronously to avoid HTTP timeouts.
10321033
10331034
Parameters:
10341035
threshold: Hamming distance threshold (0-64). Lower = stricter.
@@ -1038,7 +1039,9 @@ def deduplicate(
10381039
Cannot be an empty list - use None for entire dataset.
10391040
10401041
Returns:
1041-
DeduplicationResult with unique_reference_ids, unique_item_ids, and stats.
1042+
:class:`DeduplicationJob`: A background job. Call
1043+
``job.result()`` to block and unpack the
1044+
:class:`DeduplicationResult`.
10421045
10431046
Raises:
10441047
ValueError: If reference_ids is an empty list (use None for entire dataset).
@@ -1058,23 +1061,15 @@ def deduplicate(
10581061
payload[REFERENCE_IDS_KEY] = reference_ids
10591062

10601063
response = self._client.make_request(
1061-
payload, f"dataset/{self.id}/deduplicate"
1062-
)
1063-
return DeduplicationResult(
1064-
unique_item_ids=response["unique_item_ids"],
1065-
unique_reference_ids=response["unique_reference_ids"],
1066-
stats=DeduplicationStats(
1067-
threshold=threshold,
1068-
original_count=response["stats"]["original_count"],
1069-
deduplicated_count=response["stats"]["deduplicated_count"],
1070-
),
1064+
payload, f"dataset/{self.id}/deduplicate_async"
10711065
)
1066+
return DeduplicationJob.from_json(response, self._client)
10721067

10731068
def deduplicate_by_ids(
10741069
self,
10751070
threshold: int,
10761071
dataset_item_ids: List[str],
1077-
) -> DeduplicationResult:
1072+
) -> DeduplicationJob:
10781073
"""Deduplicate images or frames using internal Nucleus dataset item IDs.
10791074
10801075
This method identifies items by internal Nucleus IDs (e.g., "di_abc123...")
@@ -1090,7 +1085,9 @@ def deduplicate_by_ids(
10901085
user-defined reference IDs. Must be non-empty.
10911086
10921087
Returns:
1093-
DeduplicationResult with unique_item_ids, unique_reference_ids, and stats.
1088+
:class:`DeduplicationJob`: A background job. Call
1089+
``job.result()`` to block and unpack the
1090+
:class:`DeduplicationResult`.
10941091
10951092
Raises:
10961093
ValueError: If dataset_item_ids is empty.
@@ -1109,18 +1106,11 @@ def deduplicate_by_ids(
11091106
DATASET_ITEM_IDS_KEY: dataset_item_ids,
11101107
THRESHOLD_KEY: threshold,
11111108
}
1109+
11121110
response = self._client.make_request(
1113-
payload, f"dataset/{self.id}/deduplicate"
1114-
)
1115-
return DeduplicationResult(
1116-
unique_item_ids=response["unique_item_ids"],
1117-
unique_reference_ids=response["unique_reference_ids"],
1118-
stats=DeduplicationStats(
1119-
threshold=threshold,
1120-
original_count=response["stats"]["original_count"],
1121-
deduplicated_count=response["stats"]["deduplicated_count"],
1122-
),
1111+
payload, f"dataset/{self.id}/deduplicate_async"
11231112
)
1113+
return DeduplicationJob.from_json(response, self._client)
11241114

11251115
def build_slice(
11261116
self,

nucleus/dataset_item.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ class DatasetItem: # pylint: disable=R0902
100100
camera intrinsics the metadata of your camera image items. Nucleus
101101
requires these intrinsics to create visualizations such as cuboid
102102
projections. Refer to our `guide to uploading 3D data
103-
<https://nucleus.scale.com/docs/uploading-3d-data>`_ for more
103+
<https://nucleus.scale.com/docs/uploading-3d-data>`__ for more
104104
info.
105105
106106
Coordinate metadata may be provided to enable the Map Chart in the Nucleus Dataset charts page.

nucleus/deduplication.py

Lines changed: 121 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,134 @@
11
from dataclasses import dataclass
2-
from typing import List
2+
from typing import Any, Dict, List, Sequence, cast
3+
4+
from nucleus.async_job import AsyncJob, JobError
5+
6+
REQUIRED_RESULT_FIELDS = ("unique_item_ids", "unique_reference_ids", "stats")
7+
REQUIRED_STATS_FIELDS = ("threshold", "original_count", "deduplicated_count")
8+
9+
10+
def _require_fields(
11+
payload: Dict[str, Any], required_fields: Sequence[str], context: str
12+
) -> None:
13+
missing_fields = [
14+
field for field in required_fields if field not in payload
15+
]
16+
if missing_fields:
17+
missing_fields_message = ", ".join(missing_fields)
18+
raise RuntimeError(
19+
f"Deduplication job result missing {context} field(s): {missing_fields_message}"
20+
)
321

422

523
@dataclass
624
class DeduplicationStats:
25+
"""Summary statistics for a deduplication run.
26+
27+
Attributes:
28+
threshold: The Hamming distance threshold the run was executed at.
29+
Lower values are stricter; ``0`` means exact matches only.
30+
original_count: How many items were considered before deduplication.
31+
deduplicated_count: How many unique items remained afterwards.
32+
"""
33+
734
threshold: int
835
original_count: int
936
deduplicated_count: int
1037

1138

1239
@dataclass
1340
class DeduplicationResult:
14-
unique_item_ids: List[str] # Internal dataset item IDs
15-
unique_reference_ids: List[str] # User-defined reference IDs
41+
"""Output of a deduplication run.
42+
43+
Attributes:
44+
unique_item_ids: Nucleus-internal dataset item IDs (e.g.
45+
``"di_abc123..."``) that survived deduplication. One entry per
46+
kept item.
47+
unique_reference_ids: The user-defined reference IDs you supplied at
48+
upload time, in the same order as ``unique_item_ids``.
49+
stats: Summary statistics for the run. See :class:`DeduplicationStats`.
50+
"""
51+
52+
unique_item_ids: List[str]
53+
unique_reference_ids: List[str]
1654
stats: DeduplicationStats
55+
56+
57+
class DeduplicationJob(AsyncJob):
58+
"""Handle to a long-running deduplication job.
59+
60+
Returned from :meth:`Dataset.deduplicate` and
61+
:meth:`Dataset.deduplicate_by_ids`. Deduplication always runs in the
62+
background; collect the completed output with :meth:`result`.
63+
64+
Inherits all the standard :class:`AsyncJob` controls
65+
(:meth:`status`, :meth:`errors`, :meth:`sleep_until_complete`).
66+
67+
::
68+
69+
import nucleus
70+
71+
client = nucleus.NucleusClient(YOUR_API_KEY)
72+
dataset = client.get_dataset("ds_xxx")
73+
74+
job = dataset.deduplicate(threshold=10)
75+
result = job.result() # blocks until done
76+
print(result.stats.deduplicated_count)
77+
print(result.unique_reference_ids)
78+
79+
# You can also deduplicate a known set of internal dataset item IDs.
80+
job = dataset.deduplicate_by_ids(
81+
threshold=10,
82+
dataset_item_ids=["di_xxx", "di_yyy"],
83+
)
84+
result = job.result()
85+
86+
# Or split the wait and fetch yourself.
87+
job.sleep_until_complete()
88+
result = job.result(wait_for_completion=False)
89+
"""
90+
91+
def result(
92+
self, wait_for_completion: bool = True
93+
) -> "DeduplicationResult":
94+
"""Return the deduplication result, optionally waiting for the job.
95+
96+
Parameters:
97+
wait_for_completion: When ``True`` (default), block until the job
98+
reaches a terminal state. When ``False``, the caller is
99+
expected to have already waited (e.g. via
100+
:meth:`sleep_until_complete`).
101+
102+
Returns:
103+
A :class:`DeduplicationResult` containing the kept item IDs,
104+
reference IDs, and run statistics.
105+
106+
Raises:
107+
JobError: If the job did not finish successfully (e.g. it was
108+
cancelled or hit a server error).
109+
RuntimeError: If the completed job response is missing expected
110+
result fields.
111+
"""
112+
if wait_for_completion:
113+
self.sleep_until_complete(verbose_std_out=False)
114+
115+
status = self.status()
116+
if status["status"] != "Completed":
117+
raise JobError(status, self)
118+
119+
# AsyncJob.status() is typed as Dict[str, str] in the base class, but
120+
# the `message` slot is a JSON dict in practice. Cast locally so
121+
# static checkers don't flag the dict accesses below.
122+
msg = cast(Dict[str, Any], status["message"] or {})
123+
_require_fields(msg, REQUIRED_RESULT_FIELDS, "result")
124+
stats = cast(Dict[str, Any], msg.get("stats") or {})
125+
_require_fields(stats, REQUIRED_STATS_FIELDS, "stats")
126+
return DeduplicationResult(
127+
unique_item_ids=msg["unique_item_ids"],
128+
unique_reference_ids=msg["unique_reference_ids"],
129+
stats=DeduplicationStats(
130+
threshold=stats["threshold"],
131+
original_count=stats["original_count"],
132+
deduplicated_count=stats["deduplicated_count"],
133+
),
134+
)

nucleus/scene.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class Frame:
4343
pointcloud and any number of images (e.g. from different angles).
4444
4545
Refer to our `guide to uploading 3D data
46-
<https://docs.nucleus.scale.com/docs/uploading-3d-data>`_ for more info!
46+
<https://nucleus.scale.com/docs/uploading-3d-data>`__ for more info!
4747
"""
4848

4949
def __init__(self, **kwargs: DatasetItem) -> None:
@@ -419,7 +419,7 @@ class LidarScene(Scene):
419419
`{ "context_attachments": [ { "attachment": 'https://example.com/1' }, { "attachment": 'https://example.com/2' }, ... ] }`.
420420
421421
Refer to our `guide to uploading 3D data
422-
<https://docs.nucleus.scale.com/docs/uploading-3d-data>`_ for more info!
422+
<https://nucleus.scale.com/docs/uploading-3d-data>`__ for more info!
423423
"""
424424

425425
def __repr__(self) -> str:

0 commit comments

Comments
 (0)