diff --git a/README.md b/README.md index 181a3001..a24c0f88 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,9 @@ See [Quick Start Guide](docs/operating/quick-start.md) for details. ![CLI with Inline Image Evidence](docs/media/screenshots/cli-search-with-images.png) *Command-line search returns concepts with source images rendered inline via chafa* +![Embedding Landscape with DBSCAN Clusters](docs/media/screenshots/web-embedding-landscape-clusters.png) +*t-SNE embedding landscape with auto-detected clusters, named by topic via TF-IDF* + ## What You Can Do **Ingest documents** — PDFs, markdown, images, text. The system extracts concepts, relationships, and evidence automatically. diff --git a/api/app/routes/projection.py b/api/app/routes/projection.py index 2a344e7d..78542a84 100644 --- a/api/app/routes/projection.py +++ b/api/app/routes/projection.py @@ -39,6 +39,7 @@ class ProjectionConceptResponse(BaseModel): diversity_related_count: Optional[int] = None ontology: Optional[str] = None # Source ontology (for cross-ontology mode) item_type: Optional[str] = None # Item type (for combined mode) + cluster_id: Optional[int] = None # DBSCAN cluster assignment (None = noise) class ProjectionParametersResponse(BaseModel): @@ -60,6 +61,10 @@ class ProjectionStatisticsResponse(BaseModel): embedding_dims: int grounding_range: Optional[List[float]] = None diversity_range: Optional[List[float]] = None + cluster_count: Optional[int] = None + cluster_sizes: Optional[Dict[str, int]] = None + cluster_names: Optional[Dict[str, str]] = None + cluster_noise_count: Optional[int] = None class ProjectionDatasetResponse(BaseModel): diff --git a/api/app/services/embedding_projection_service.py b/api/app/services/embedding_projection_service.py index 896d5cce..6c72f46b 100644 --- a/api/app/services/embedding_projection_service.py +++ b/api/app/services/embedding_projection_service.py @@ -14,6 +14,8 @@ import json import logging import hashlib +import math +from collections import 
Counter from datetime import datetime from typing import Dict, List, Optional, Any, Literal import numpy as np @@ -28,6 +30,14 @@ TSNE_AVAILABLE = False logger.warning("sklearn.manifold.TSNE not available") +try: + from sklearn.cluster import DBSCAN + from sklearn.neighbors import NearestNeighbors + DBSCAN_AVAILABLE = True +except ImportError: + DBSCAN_AVAILABLE = False + logger.warning("sklearn.cluster.DBSCAN not available") + try: from umap import UMAP UMAP_AVAILABLE = True @@ -685,6 +695,163 @@ def compute_projection( return projection.astype(np.float32) + def _compute_clusters(self, projection: np.ndarray, min_samples: int = 5) -> Dict[str, Any]: + """Run DBSCAN on projected coordinates to identify spatial clusters. + + Auto-tunes eps using the 40th percentile of k-NN distances. This + produces clusters where no single cluster dominates, giving a + "political map" coloring of the embedding space. + + Args: + projection: (N, D) array of projected coordinates + min_samples: DBSCAN min_samples parameter + + Returns: + Dict with cluster_labels, cluster_count, cluster_sizes, + eps_used, noise_count + """ + if not DBSCAN_AVAILABLE or len(projection) < min_samples: + return { + "cluster_labels": np.full(len(projection), -1, dtype=int), + "cluster_count": 0, + "cluster_sizes": {}, + "eps_used": 0.0, + "noise_count": len(projection), + } + + # Compute k-NN distances for eps estimation + k = min_samples + nn = NearestNeighbors(n_neighbors=k) + nn.fit(projection) + distances, _ = nn.kneighbors(projection) + k_distances = np.sort(distances[:, -1]) + + # Use 40th percentile — empirically produces balanced clusters where + # no single cluster dominates (largest ~10% of points). + # Higher percentiles merge too aggressively; lower ones fragment. 
+ eps = float(np.percentile(k_distances, 40)) + + # Floor at 1% of data range (minimum 1e-6) to avoid degenerate eps=0 + data_range = float(np.max(projection.max(axis=0) - projection.min(axis=0))) + eps = max(eps, data_range * 0.01, 1e-6) + + # Run DBSCAN + db = DBSCAN(eps=eps, min_samples=min_samples) + labels = db.fit_predict(projection) + + # Compute stats + unique = set(labels) + unique.discard(-1) + cluster_sizes = {} + for label in unique: + cluster_sizes[str(int(label))] = int(np.sum(labels == label)) + noise_count = int(np.sum(labels == -1)) + + logger.info( + f"DBSCAN clustering: {len(unique)} clusters, " + f"{noise_count} noise points, eps={eps:.3f}" + ) + + return { + "cluster_labels": labels, + "cluster_count": len(unique), + "cluster_sizes": cluster_sizes, + "eps_used": eps, + "noise_count": noise_count, + } + + # Common English stop words for cluster naming + _STOP_WORDS = frozenset({ + "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", + "of", "with", "by", "from", "is", "are", "was", "were", "be", "been", + "being", "have", "has", "had", "do", "does", "did", "will", "would", + "could", "should", "may", "might", "shall", "can", "need", "must", + "not", "no", "nor", "so", "if", "then", "than", "that", "this", + "these", "those", "it", "its", "as", "up", "out", "about", "into", + "over", "after", "before", "between", "under", "above", "below", + "all", "each", "every", "both", "few", "more", "most", "other", + "some", "such", "only", "own", "same", "too", "very", "just", + "because", "through", "during", "while", "where", "when", "how", + "what", "which", "who", "whom", "why", "any", "many", "much", + "also", "back", "even", "still", "well", "way", "use", "her", + "his", "he", "she", "they", "we", "you", "your", "their", "our", + "us", "me", "my", "based", "using", "used", "via", "per", "vs", + }) + + def _name_clusters( + self, + labels: np.ndarray, + items: List[Dict[str, Any]], + ) -> Dict[str, str]: + """Derive descriptive names 
for clusters from concept labels. + + Uses TF-IDF-style scoring: terms frequent within a cluster but rare + across other clusters get the highest score. Top 2 terms form the name. + + Args: + labels: DBSCAN cluster assignment per item (-1 = noise) + items: List of item dicts with "label" keys + + Returns: + Dict mapping str(cluster_id) -> descriptive name string + (str keys to match the Pydantic Dict[str, str] response model) + """ + unique = set(labels) + unique.discard(-1) + if not unique: + return {} + + # Tokenize: collect word counts per cluster + cluster_words: Dict[int, Counter] = {} + for i, item in enumerate(items): + cid = int(labels[i]) + if cid == -1: + continue + if cid not in cluster_words: + cluster_words[cid] = Counter() + words = item.get("label", "").lower().split() + for w in words: + # Strip surrounding punctuation, skip short/stop words + w = w.strip("()-/,:;\"'") + if len(w) <= 2 or w in self._STOP_WORDS: + continue + cluster_words[cid][w] += 1 + + # Document frequency: how many clusters contain each term + num_clusters = len(unique) + doc_freq: Counter = Counter() + for wc in cluster_words.values(): + for w in wc: + doc_freq[w] += 1 + + # Score terms per cluster: tf * idf + # Use str keys to match Pydantic Dict[str, str] models + cluster_names: Dict[str, str] = {} + for cid in sorted(int(c) for c in unique): + wc = cluster_words.get(cid, Counter()) + key = str(cid) + if not wc: + cluster_names[key] = f"Cluster {cid}" + continue + + total = sum(wc.values()) + scored = [] + for w, count in wc.items(): + tf = count / total + if num_clusters <= 1: + # Single cluster: rank by frequency only + scored.append((w, tf, count)) + else: + idf = math.log(num_clusters / doc_freq[w]) if doc_freq[w] < num_clusters else 0.1 + scored.append((w, tf * idf, count)) + + # Sort by score desc, break ties by raw count + scored.sort(key=lambda x: (-x[1], -x[2])) + # Take top 2 terms, title-case + top = [s[0].title() for s in scored[:2]] + cluster_names[key] = " ".join(top) if top else f"Cluster {cid}" + + return cluster_names + def 
generate_projection_dataset( self, ontology: str, @@ -814,6 +981,14 @@ def generate_projection_dataset( center=center ) + # Run DBSCAN clustering on projected coordinates + cluster_result = self._compute_clusters(projection) + cluster_labels = cluster_result["cluster_labels"] + + # Derive descriptive names for each cluster from concept labels + cluster_names = self._name_clusters(cluster_labels, items) + cluster_result["cluster_names"] = cluster_names + # Batch compute fresh grounding if requested (only for concepts) fresh_groundings = {} if include_grounding and refresh_grounding and embedding_source in ("concepts", "combined"): @@ -846,7 +1021,8 @@ def generate_projection_dataset( "label": item["label"], "x": coords[0], "y": coords[1], - "z": coords[2] if n_components == 3 else 0.0 + "z": coords[2] if n_components == 3 else 0.0, + "cluster_id": int(cluster_labels[i]) if cluster_labels[i] != -1 else None } # Add item type for combined mode @@ -892,6 +1068,12 @@ def generate_projection_dataset( if diversity_values: stats["diversity_range"] = [min(diversity_values), max(diversity_values)] + # Cluster statistics + stats["cluster_count"] = cluster_result["cluster_count"] + stats["cluster_sizes"] = cluster_result["cluster_sizes"] + stats["cluster_names"] = cluster_result["cluster_names"] + stats["cluster_noise_count"] = cluster_result["noise_count"] + # Generate changelist ID for cache invalidation changelist_id = self._generate_changelist_id(f"{ontology}:{embedding_source}", len(items)) diff --git a/docs/features/web-workstation.md b/docs/features/web-workstation.md index 50b75200..218be9c2 100644 --- a/docs/features/web-workstation.md +++ b/docs/features/web-workstation.md @@ -75,15 +75,17 @@ Project concepts onto a semantic spectrum between two poles. ![Embedding Landscape](../media/screenshots/web-embedding-landscape.png) -3D visualization of all concept embeddings using t-SNE or UMAP. 
+3D visualization of all concept embeddings using t-SNE or UMAP with automatic DBSCAN cluster detection. **What you can do:** - See the overall shape of your semantic space -- Identify natural clusters before diving into details -- Click two concepts to preview a polarity axis +- View auto-detected clusters with TF-IDF-derived names +- Toggle cluster visibility to focus on specific regions +- Switch color palettes (Bold, Warm→Cool, Earth) and sort by name, count, or color +- Right-click any concept for details and to examine in force graph - Plan analysis based on what you see -**Best for:** Discovering semantic dimensions, validating embeddings, global overview before detailed exploration. +**Best for:** Discovering semantic dimensions, identifying topic clusters, validating embeddings, global overview before detailed exploration. --- diff --git a/docs/media/screenshots/web-embedding-landscape-clusters.png b/docs/media/screenshots/web-embedding-landscape-clusters.png new file mode 100644 index 00000000..036498db Binary files /dev/null and b/docs/media/screenshots/web-embedding-landscape-clusters.png differ diff --git a/docs/media/screenshots/web-embedding-landscape.png b/docs/media/screenshots/web-embedding-landscape.png index 503bd701..8ddab5cf 100644 Binary files a/docs/media/screenshots/web-embedding-landscape.png and b/docs/media/screenshots/web-embedding-landscape.png differ diff --git a/tests/test_clustering.py b/tests/test_clustering.py new file mode 100644 index 00000000..aec778c8 --- /dev/null +++ b/tests/test_clustering.py @@ -0,0 +1,146 @@ +"""Unit tests for DBSCAN clustering and cluster naming in EmbeddingProjectionService. + +Tests _compute_clusters() and _name_clusters() which are pure-function-like methods +that operate on numpy arrays without needing a database connection. 
+""" + +import numpy as np +import pytest + +from api.app.services.embedding_projection_service import EmbeddingProjectionService + + +@pytest.fixture +def service(): + """Create service with no db client (only clustering methods need no db).""" + return EmbeddingProjectionService(age_client=None) # type: ignore[arg-type] + + +class TestComputeClusters: + """Tests for _compute_clusters().""" + + def test_too_few_points_returns_all_noise(self, service): + """With fewer points than min_samples, everything is noise.""" + projection = np.array([[0, 0, 0], [1, 1, 1]], dtype=np.float32) + result = service._compute_clusters(projection, min_samples=5) + + assert result["cluster_count"] == 0 + assert result["noise_count"] == 2 + assert len(result["cluster_labels"]) == 2 + assert all(label == -1 for label in result["cluster_labels"]) + + def test_tight_cluster_detected(self, service): + """Points packed tightly should form at least one cluster.""" + rng = np.random.RandomState(42) + # 50 points in a tight ball + 50 in another + cluster_a = rng.normal(0, 0.1, (50, 3)) + cluster_b = rng.normal(10, 0.1, (50, 3)) + projection = np.vstack([cluster_a, cluster_b]).astype(np.float32) + + result = service._compute_clusters(projection, min_samples=5) + + assert result["cluster_count"] >= 2 + assert result["eps_used"] > 0 + # Cluster sizes should be str keys + for key in result["cluster_sizes"]: + assert isinstance(key, str) + + def test_all_same_point_single_cluster(self, service): + """Identical points should form one cluster.""" + projection = np.zeros((20, 3), dtype=np.float32) + result = service._compute_clusters(projection, min_samples=5) + + assert result["cluster_count"] == 1 + assert result["noise_count"] == 0 + + def test_noise_count_consistency(self, service): + """noise_count + sum(cluster_sizes) should equal total points.""" + rng = np.random.RandomState(123) + projection = rng.randn(100, 3).astype(np.float32) + result = service._compute_clusters(projection, min_samples=5) 
+ + total_clustered = sum(result["cluster_sizes"].values()) + assert total_clustered + result["noise_count"] == 100 + + def test_returns_expected_keys(self, service): + """Result dict should have all expected keys.""" + projection = np.zeros((20, 3), dtype=np.float32) + result = service._compute_clusters(projection, min_samples=5) + + expected_keys = {"cluster_labels", "cluster_count", "cluster_sizes", "eps_used", "noise_count"} + assert set(result.keys()) == expected_keys + + +class TestNameClusters: + """Tests for _name_clusters().""" + + def test_empty_labels_returns_empty(self, service): + """All noise (-1) should return empty dict.""" + labels = np.array([-1, -1, -1]) + items = [{"label": "foo"}, {"label": "bar"}, {"label": "baz"}] + result = service._name_clusters(labels, items) + assert result == {} + + def test_single_cluster_names_by_frequency(self, service): + """Single cluster uses frequency ranking (no IDF).""" + labels = np.array([0, 0, 0, 0]) + items = [ + {"label": "machine learning algorithms"}, + {"label": "machine learning models"}, + {"label": "deep learning algorithms"}, + {"label": "machine learning training"}, + ] + result = service._name_clusters(labels, items) + assert "0" in result + name = result["0"].lower() + # "machine" appears 3 times, "learning" appears 4 times — both should appear + assert "learning" in name or "machine" in name + + def test_two_clusters_idf_distinguishes(self, service): + """IDF scoring should pick terms unique to each cluster.""" + labels = np.array([0, 0, 0, 1, 1, 1]) + items = [ + {"label": "quantum physics experiment"}, + {"label": "quantum mechanics theory"}, + {"label": "quantum entanglement research"}, + {"label": "economic market analysis"}, + {"label": "economic trade policy"}, + {"label": "economic growth forecast"}, + ] + result = service._name_clusters(labels, items) + assert "0" in result + assert "1" in result + # Cluster 0 should mention quantum-related terms + assert "quantum" in result["0"].lower() 
+ # Cluster 1 should mention economic-related terms + assert "economic" in result["1"].lower() + + def test_stop_words_filtered(self, service): + """Stop words should not appear in cluster names.""" + labels = np.array([0, 0, 0]) + items = [ + {"label": "the big analysis of data"}, + {"label": "the big study of data"}, + {"label": "the big review of data"}, + ] + result = service._name_clusters(labels, items) + name_words = result["0"].lower().split() + # "the", "of" are stop words, should not appear + assert "the" not in name_words + assert "of" not in name_words + + def test_str_keys_in_result(self, service): + """Result dict should use str keys to match Pydantic model.""" + labels = np.array([0, 0, 0, 1, 1, 1]) + items = [{"label": f"concept {i}"} for i in range(6)] + result = service._name_clusters(labels, items) + for key in result: + assert isinstance(key, str) + + def test_missing_labels_fallback(self, service): + """Items with empty labels should still get a name.""" + labels = np.array([0, 0, 0, 0, 0]) + items = [{"label": ""} for _ in range(5)] + result = service._name_clusters(labels, items) + assert "0" in result + assert "Cluster 0" in result["0"] diff --git a/web/src/components/embeddings/ClusterLegend.tsx b/web/src/components/embeddings/ClusterLegend.tsx new file mode 100644 index 00000000..5ccb7726 --- /dev/null +++ b/web/src/components/embeddings/ClusterLegend.tsx @@ -0,0 +1,229 @@ +/** + * ClusterLegend — interactive legend for DBSCAN cluster visualization. + * + * Shows palette switcher, sortable cluster list with toggle-to-highlight, + * and noise count. Extracted from EmbeddingLandscapeWorkspace for clarity. + */ + +import type { ClusterPalette } from './types'; + +/** 20-color palettes designed for dark backgrounds. 
*/ +export const CLUSTER_PALETTES: Record = { + bold: { + label: 'Bold', + colors: [ + '#e6194b', '#3cb44b', '#4363d8', '#f58231', '#911eb4', + '#42d4f4', '#f032e6', '#bfef45', '#fabed4', '#469990', + '#dcbeff', '#9a6324', '#fffac8', '#800000', '#aaffc3', + '#808000', '#ffd8b1', '#000075', '#a9a9a9', '#ffe119', + ], + }, + 'warm-cool': { + label: 'Warm → Cool', + colors: [ + '#ff1744', '#ff5722', '#ff9100', '#ffab00', '#ffd600', + '#c6ff00', '#76ff03', '#00e676', '#1de9b6', '#00e5ff', + '#00b0ff', '#2979ff', '#3d5afe', '#651fff', '#d500f9', + '#ff4081', '#ff6e40', '#ffab40', '#69f0ae', '#40c4ff', + ], + }, + earth: { + label: 'Earth', + colors: [ + '#bf360c', '#e65100', '#f57f17', '#827717', '#33691e', + '#1b5e20', '#004d40', '#006064', '#01579b', '#0d47a1', + '#1a237e', '#311b92', '#4a148c', '#880e4f', '#b71c1c', + '#3e2723', '#455a64', '#546e7a', '#78909c', '#8d6e63', + ], + }, +}; +export const CLUSTER_PALETTE_ORDER: ClusterPalette[] = ['bold', 'warm-cool', 'earth']; +export const NOISE_COLOR = '#555555'; + +/** Get the palette color for a cluster id. */ +export function clusterColor(palette: ClusterPalette, clusterId: number): string { + const colors = CLUSTER_PALETTES[palette].colors; + return colors[clusterId % colors.length]; +} + +export type ClusterSortKey = 'color' | 'count' | 'name'; + +interface Props { + clusterCount: number; + clusterSizes: Record; + clusterNames: Record; + noiseCount: number; + highlightedClusters: Set | null; + onHighlightChange: (clusters: Set | null) => void; + palette: ClusterPalette; + onPaletteChange: (palette: ClusterPalette) => void; + sort: { key: ClusterSortKey; desc: boolean }; + onSortChange: (sort: { key: ClusterSortKey; desc: boolean }) => void; +} + +export function ClusterLegend({ + clusterCount, + clusterSizes, + clusterNames, + noiseCount, + highlightedClusters, + onHighlightChange, + palette, + onPaletteChange, + sort, + onSortChange, +}: Props) { + if (clusterCount <= 0) { + return ( +
+ No clusters — regenerate projection +
+ ); + } + + const sortColumns = [ + { key: 'color' as const, label: '●', width: 'w-2.5', title: 'Sort by palette order' }, + { key: 'name' as const, label: 'Name', width: 'flex-1', title: 'Sort by name' }, + { key: 'count' as const, label: '#', width: 'w-6', title: 'Sort by count' }, + ] as const; + + const entries = Object.entries(clusterSizes) + .map(([clusterId, size]) => ({ + id: parseInt(clusterId), + name: clusterNames[clusterId] || `Cluster ${clusterId}`, + size, + })) + .sort((a, b) => { + let cmp: number; + switch (sort.key) { + case 'color': + cmp = (a.id % CLUSTER_PALETTES[palette].colors.length) + - (b.id % CLUSTER_PALETTES[palette].colors.length); + break; + case 'count': + cmp = a.size - b.size; + break; + case 'name': + default: + cmp = a.name.localeCompare(b.name); + break; + } + return sort.desc ? -cmp : cmp; + }); + + return ( +
+ {/* Palette switcher */} +
+ {CLUSTER_PALETTE_ORDER.map((p) => ( + + ))} +
+ + {/* Clear filter button */} + {highlightedClusters !== null && ( + + )} + + {/* Sort header */} +
+ {sortColumns.map(col => ( + + ))} +
+ + {/* Cluster list */} +
+ {entries.map(({ id, name, size }) => { + const isActive = highlightedClusters === null || highlightedClusters.has(id); + return ( + + ); + })} +
+ + {/* Noise row */} + {noiseCount > 0 && ( +
+
+ + noise + + + {noiseCount} + +
+ )} +
+ ); +} diff --git a/web/src/components/embeddings/EmbeddingLandscapeWorkspace.tsx b/web/src/components/embeddings/EmbeddingLandscapeWorkspace.tsx index 1af0181e..d0fa77b2 100644 --- a/web/src/components/embeddings/EmbeddingLandscapeWorkspace.tsx +++ b/web/src/components/embeddings/EmbeddingLandscapeWorkspace.tsx @@ -9,9 +9,10 @@ import { useState, useEffect, useMemo, useCallback } from 'react'; import { useNavigate } from 'react-router-dom'; import { apiClient } from '../../api/client'; import { Loader2, RefreshCw, Layers, Eye, EyeOff, SlidersHorizontal, FolderOpen } from 'lucide-react'; -import type { ProjectionData, EmbeddingPoint, ColorScheme, ProjectionItemType, DistanceMetric, GroundingScale, GroundingColorRamp } from './types'; +import type { ProjectionData, EmbeddingPoint, ColorScheme, ProjectionItemType, DistanceMetric, GroundingScale, GroundingColorRamp, ClusterPalette } from './types'; +import type { ClusterSortKey } from './ClusterLegend'; import { EmbeddingScatter3D } from './EmbeddingScatter3D'; -import { NodeInfoBox } from '../../explorers/common/NodeInfoBox'; +import { ClusterLegend, NOISE_COLOR, clusterColor } from './ClusterLegend'; import { IconRailPanel } from '../shared/IconRailPanel'; import { SavedQueriesPanel } from '../shared/SavedQueriesPanel'; import { useQueryReplay } from '../../hooks/useQueryReplay'; @@ -28,6 +29,8 @@ const ONTOLOGY_COLORS = [ '#aaff44', // bright lime ]; +// Cluster palette constants imported from ClusterLegend + // Color scheme descriptions const COLOR_SCHEME_INFO: Record = { ontology: { @@ -42,6 +45,10 @@ const COLOR_SCHEME_INFO: Record>(new Map()); const [selectedConcept, setSelectedConcept] = useState(null); - const [selectedScreenPos, setSelectedScreenPos] = useState<{ x: number; y: number } | null>(null); // Context menu state for right-click actions const [contextMenu, setContextMenu] = useState<{ @@ -262,6 +268,11 @@ export function EmbeddingLandscapeWorkspace() { // Color scheme for visualization const 
[colorScheme, setColorScheme] = useState('ontology'); + // Cluster visualization controls + const [highlightedClusters, setHighlightedClusters] = useState | null>(null); + const [clusterPalette, setClusterPalette] = useState('bold'); + const [clusterSort, setClusterSort] = useState<{ key: ClusterSortKey; desc: boolean }>({ key: 'name', desc: false }); + // Distance metric for projection (cosine best for embeddings) const [metric, setMetric] = useState('cosine'); @@ -333,7 +344,7 @@ export function EmbeddingLandscapeWorkspace() { setError(null); // Regenerate global projection (all ontologies together with global centering) - await apiClient.regenerateProjection('__all__', { + const result = await apiClient.regenerateProjection('__all__', { force: true, perplexity, metric, @@ -341,6 +352,17 @@ export function EmbeddingLandscapeWorkspace() { embedding_source: 'concepts', }); + // If queued (large dataset), poll until the job completes + if (result.status === 'queued' && result.job_id) { + const finalJob = await apiClient.pollJobUntilComplete(result.job_id, { + intervalMs: 1000, + }); + if (finalJob.status === 'failed') { + setError('Projection job failed'); + return; + } + } + // Reload the projection await loadGlobalProjection(); } catch (err: unknown) { @@ -374,6 +396,7 @@ export function EmbeddingLandscapeWorkspace() { grounding: number | null; ontologyColor: string; itemType: ProjectionItemType; + clusterId: number | null; }> = []; globalProjection.concepts.forEach(concept => { @@ -392,6 +415,7 @@ export function EmbeddingLandscapeWorkspace() { grounding: concept.grounding_strength, ontologyColor: ontologyColors.get(ontology) || '#888888', itemType: (concept.item_type as ProjectionItemType) || 'concept', + clusterId: concept.cluster_id ?? 
null, }); }); @@ -418,6 +442,18 @@ export function EmbeddingLandscapeWorkspace() { case 'position': color = positionToColor(p.x, p.y, p.z, bounds); break; + case 'cluster': { + const dimmed = highlightedClusters !== null && + (p.clusterId == null || !highlightedClusters.has(p.clusterId)); + if (dimmed) { + color = '#1a1a1a'; // nearly invisible + } else { + color = p.clusterId != null + ? clusterColor(clusterPalette, p.clusterId) + : NOISE_COLOR; + } + break; + } case 'ontology': default: color = p.ontologyColor; @@ -434,9 +470,10 @@ export function EmbeddingLandscapeWorkspace() { grounding: p.grounding, color, itemType: p.itemType, + clusterId: p.clusterId, }; }); - }, [globalProjection, ontologyVisibility, ontologyColors, colorScheme, groundingScale, groundingRamp]); + }, [globalProjection, ontologyVisibility, ontologyColors, colorScheme, groundingScale, groundingRamp, highlightedClusters, clusterPalette]); // Get list of ontologies for UI const ontologyList = useMemo(() => { @@ -459,8 +496,9 @@ export function EmbeddingLandscapeWorkspace() { }; }, [ontologyList, points, globalProjection]); - // Context menu handlers + // Context menu handlers — right-click opens info + actions const handleContextMenu = useCallback((point: EmbeddingPoint, screenPos: { x: number; y: number }) => { + setSelectedConcept(point); setContextMenu({ x: screenPos.x, y: screenPos.y, @@ -470,6 +508,7 @@ export function EmbeddingLandscapeWorkspace() { const closeContextMenu = useCallback(() => { setContextMenu(null); + setSelectedConcept(null); }, []); // Navigate to explorer with concept (similarity search) @@ -603,7 +642,10 @@ export function EmbeddingLandscapeWorkspace() { {(Object.keys(COLOR_SCHEME_INFO) as ColorScheme[]).map(scheme => (
)} + {colorScheme === 'cluster' && globalProjection && ( + + )}
{stats.totalConcepts} points
)} - {/* Selected concept info - using shared NodeInfoBox */} - {selectedConcept && selectedScreenPos && ( -
- { - setSelectedConcept(null); - setSelectedScreenPos(null); - }} - /> -
- )} - - {/* Context menu for right-click actions */} + {/* Context menu for right-click — info + actions */} {contextMenu && (
{contextMenu.point.ontology}
+
+ {contextMenu.point.grounding != null && ( + grounding: {contextMenu.point.grounding.toFixed(2)} + )} + {contextMenu.point.clusterId != null && ( + + + {globalProjection?.statistics.cluster_names?.[String(contextMenu.point.clusterId)] || `Cluster ${contextMenu.point.clusterId}`} + + )} +
{/* Concept Mode (Similarity Search) */} diff --git a/web/src/components/embeddings/EmbeddingScatter3D.tsx b/web/src/components/embeddings/EmbeddingScatter3D.tsx index 605945f5..74e12abd 100644 --- a/web/src/components/embeddings/EmbeddingScatter3D.tsx +++ b/web/src/components/embeddings/EmbeddingScatter3D.tsx @@ -112,7 +112,7 @@ const fragmentShader = ` interface Props { points: EmbeddingPoint[]; - onSelectPoint: (point: EmbeddingPoint | null, screenPos?: { x: number; y: number }) => void; + onSelectPoint: (point: EmbeddingPoint | null) => void; onContextMenu?: (point: EmbeddingPoint, screenPos: { x: number; y: number }) => void; selectedPoint: EmbeddingPoint | null; } @@ -346,31 +346,9 @@ export function EmbeddingScatter3D({ points, onSelectPoint, onContextMenu, selec } }, []); - // Handle click for selection - const handleClick = useCallback((event: React.MouseEvent) => { - if (!containerRef.current || !pointsRef.current || !cameraRef.current) return; - - const rect = containerRef.current.getBoundingClientRect(); - mouseRef.current.x = ((event.clientX - rect.left) / rect.width) * 2 - 1; - mouseRef.current.y = -((event.clientY - rect.top) / rect.height) * 2 + 1; - - raycasterRef.current.setFromCamera(mouseRef.current, cameraRef.current); - const intersects = raycasterRef.current.intersectObject(pointsRef.current); - - if (intersects.length > 0) { - const idx = intersects[0].index; - if (idx !== undefined) { - const point = pointDataRef.current[idx]; - // Pass container-relative coordinates for info box positioning - const screenPos = { - x: event.clientX - rect.left, - y: event.clientY - rect.top - }; - onSelectPoint(point || null, screenPos); - } - } else { - onSelectPoint(null); - } + // Left click deselects (orbit controls handle pan/rotate natively) + const handleClick = useCallback(() => { + onSelectPoint(null); }, [onSelectPoint]); // Handle right-click for context menu diff --git a/web/src/components/embeddings/types.ts 
b/web/src/components/embeddings/types.ts index b933a1f5..a7792163 100644 --- a/web/src/components/embeddings/types.ts +++ b/web/src/components/embeddings/types.ts @@ -2,7 +2,7 @@ * Type definitions for Embedding Landscape visualization (ADR-078) */ -export type ColorScheme = 'ontology' | 'grounding' | 'position'; +export type ColorScheme = 'ontology' | 'grounding' | 'position' | 'cluster'; // Color scale for grounding visualization export type GroundingScale = 'linear' | 'sqrt' | 'log'; @@ -15,6 +15,9 @@ export type GroundingColorRamp = | 'brown-white-teal' // Colorblind-safe, earthy | 'purple-white-orange'; // High contrast +// Color palettes for cluster visualization +export type ClusterPalette = 'bold' | 'warm-cool' | 'earth'; + // Embedding sources available for projection export type EmbeddingSource = 'concepts' | 'sources' | 'vocabulary' | 'combined'; @@ -32,6 +35,7 @@ export interface ProjectionConcept { diversity_related_count: number | null; ontology?: string; // Source ontology (for cross-ontology mode) item_type?: ProjectionItemType; // For distinguishing in combined view + cluster_id?: number | null; // DBSCAN cluster assignment (null = noise) } // Distance metric for projection algorithm @@ -58,6 +62,10 @@ export interface ProjectionData { embedding_dims: number; grounding_range: [number, number] | null; diversity_range: [number, number] | null; + cluster_count?: number; + cluster_sizes?: Record; + cluster_names?: Record; + cluster_noise_count?: number; }; } @@ -71,6 +79,7 @@ export interface EmbeddingPoint { grounding: number | null; color: string; itemType: ProjectionItemType; + clusterId?: number | null; } export interface OntologySelection {