Skip to content

Commit 63871e7

Browse files
committed
fix: improve auto-scaling params for small datasets
Lower min_topic_size floor to 3 and scale more aggressively. Apply UMAP neighbor and component scaling for datasets under 100 items to prevent sparse high-dimensional output from breaking HDBSCAN clustering.
1 parent a4a8b32 commit 63871e7

1 file changed

Lines changed: 18 additions & 6 deletions

File tree

src/handler.py

Lines changed: 18 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -52,21 +52,33 @@ def handler(event: dict) -> dict:
5252
# Auto-scale params for small datasets so HDBSCAN can find clusters
5353
n_items = len(request.items)
5454
if n_items < min_topic_size * 4:
55-
scaled_min = max(5, n_items // 5)
55+
scaled_min = max(3, n_items // 10)
5656
if scaled_min < min_topic_size:
5757
logger.info(
5858
f"Small dataset ({n_items} items): scaling min_topic_size "
5959
f"{min_topic_size}{scaled_min}"
6060
)
6161
params["min_topic_size"] = scaled_min
6262
min_topic_size = scaled_min
63-
# Also scale UMAP neighbors to avoid exceeding dataset size
64-
max_neighbors = max(5, n_items - 1)
65-
if params["umap_n_neighbors"] > max_neighbors:
63+
64+
if n_items < 100:
65+
# Scale UMAP neighbors — too many neighbors washes out local structure
66+
scaled_neighbors = max(5, n_items // 4)
67+
if scaled_neighbors < params["umap_n_neighbors"]:
68+
logger.info(
69+
f"Small dataset ({n_items} items): scaling umap_n_neighbors "
70+
f"{params['umap_n_neighbors']}{scaled_neighbors}"
71+
)
72+
params["umap_n_neighbors"] = scaled_neighbors
73+
74+
# Reduce UMAP dimensions — high-dim output is too sparse for HDBSCAN
75+
max_components = 5
76+
if params["umap_n_components"] > max_components:
6677
logger.info(
67-
f"Scaling umap_n_neighbors {params['umap_n_neighbors']}{max_neighbors}"
78+
f"Small dataset ({n_items} items): scaling umap_n_components "
79+
f"{params['umap_n_components']}{max_components}"
6880
)
69-
params["umap_n_neighbors"] = max_neighbors
81+
params["umap_n_components"] = max_components
7082

7183
# Validate minimum item count
7284
if len(request.items) < min_topic_size:

0 commit comments

Comments
 (0)