diff --git a/.env.example b/.env.example index 42159f9..e111103 100644 --- a/.env.example +++ b/.env.example @@ -19,3 +19,35 @@ PORT=8080 # Default: info # Valid values: debug, info, warn, error LOG_LEVEL=info + + +# OPENAI API key for AI Enrichment features +OPENAI_API_KEY=sk-your-openai-api-key-here + +# River Job Queue Configuration +# River handles async embedding generation with retries and rate limiting + +# Enable River job queue (optional) +# Default: true (when OPENAI_API_KEY is set) +# Set to false to use fire-and-forget goroutines (legacy behavior) +# RIVER_ENABLED=true + +# Number of concurrent embedding workers (optional) +# Default: 10 +# Controls how many embedding jobs can run simultaneously +# RIVER_WORKERS=10 + +# Maximum retry attempts for failed jobs (optional) +# Default: 5 +# Jobs that fail will be retried with exponential backoff +# RIVER_MAX_RETRIES=5 + +# Embedding rate limit - OpenAI requests per second (optional) +# Default: 50 +# Adjust based on your OpenAI tier limits +# EMBEDDING_RATE_LIMIT=50 + +# Optional - Taxonomy Service (Python microservice for clustering) +TAXONOMY_SERVICE_URL=http://localhost:8001 # Python microservice URL (default) +TAXONOMY_SCHEDULER_ENABLED=true # Enable periodic scheduler (default: true) +TAXONOMY_POLL_INTERVAL=1m # Scheduler poll frequency (default: 1m) diff --git a/.gitignore b/.gitignore index b0b1db9..c406e82 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .vscode .cursor bin/ +api .env *.out *.html diff --git a/AGENTS.md b/AGENTS.md index 6a4df01..db13d48 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,9 +2,10 @@ ## Project Structure & Module Organization - `cmd/api/` holds the API server entrypoint (`main.go`). -- `internal/` contains core application layers: `api/handlers`, `api/middleware`, `service`, `repository`, `models`, and `config`. +- `internal/` contains core application layers: `api/handlers`, `api/middleware`, `service`, `repository`, `models`, `worker`, and `config`. 
- `pkg/` provides shared utilities (currently `pkg/database`). - `sql/` stores SQL schema files (e.g., `sql/001_initial_schema.sql`). +- `services/` contains microservices (e.g., `services/taxonomy-generator/` Python service). - `tests/` contains integration tests. ## Build, Test, and Development Commands @@ -36,3 +37,20 @@ ## Security & Configuration Tips - Configure `API_KEY` and `DATABASE_URL` via `.env` or environment variables. - Do not commit `.env` or secrets; use `.env.example` as the base. + +## Taxonomy Service Architecture +The taxonomy feature uses a Python microservice for ML clustering: + +- **Go API** triggers jobs via HTTP to the taxonomy-generator service +- **Python service** writes results directly to Postgres (topics table, feedback_records.topic_id) +- **TaxonomyScheduler** (`internal/worker/`) polls for scheduled jobs and tracks completion +- Config: `TAXONOMY_SERVICE_URL`, `TAXONOMY_SCHEDULER_ENABLED`, `TAXONOMY_POLL_INTERVAL` + +To run the taxonomy service: +```bash +cd services/taxonomy-generator +pip install -r requirements.txt +uvicorn src.main:app --port 8001 +``` + +Key endpoints: `POST /v1/taxonomy/{tenant_id}/generate`, `GET /v1/taxonomy/{tenant_id}/status` diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..18d30cd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,47 @@ +# syntax=docker/dockerfile:1 + +# Build stage +FROM golang:1.24-alpine AS builder + +# Install build dependencies +RUN apk add --no-cache git ca-certificates + +WORKDIR /app + +# Copy go mod files first for better caching +COPY go.mod go.sum ./ + +# Allow Go to download the required toolchain version +ENV GOTOOLCHAIN=auto +RUN go mod download + +# Copy source code +COPY . . 
+ +# Build the binary (GOTOOLCHAIN=auto ensures correct version is used) +RUN CGO_ENABLED=0 GOOS=linux GOTOOLCHAIN=auto go build -ldflags="-w -s" -o /app/bin/api ./cmd/api + +# Runtime stage +FROM alpine:3.19 AS runtime + +# Install runtime dependencies +RUN apk add --no-cache ca-certificates wget + +WORKDIR /app + +# Copy binary from builder +COPY --from=builder /app/bin/api /app/api + +# Create non-root user +RUN adduser -D -u 1000 appuser +USER appuser + +# Expose port +EXPOSE 8080 + +# Health check +HEALTHCHECK --interval=10s --timeout=5s --start-period=5s --retries=3 \ + CMD wget -q --spider http://localhost:8080/health || exit 1 + +# Run the application +CMD ["/app/api"] diff --git a/Makefile b/Makefile index 1a5c76c..c92685c 100644 --- a/Makefile +++ b/Makefile @@ -1,28 +1,42 @@ -.PHONY: help tests tests-coverage build run init-db clean docker-up docker-down docker-clean deps install-tools fmt fmt-check lint dev-setup test-all test-unit schemathesis install-hooks +.PHONY: help tests tests-coverage build run init-db clean docker-up docker-down docker-clean deps install-tools fmt fmt-check lint dev-setup test-all test-unit schemathesis install-hooks backfill-embeddings prod-up prod-down prod-logs taxonomy-dev # Default target - show help help: @echo "Available targets:" - @echo " make help - Show this help message" - @echo " make dev-setup - Set up development environment (docker, deps, tools, schema, hooks)" - @echo " make build - Build the API server" - @echo " make run - Run the API server" + @echo "" + @echo "Development:" + @echo " make dev-setup - Set up dev environment (postgres, deps, tools, schema, hooks)" + @echo " make run - Run Go API server locally" + @echo " make taxonomy-dev - Run Python taxonomy service locally" + @echo " make docker-up - Start dev infrastructure (postgres, pgadmin)" + @echo " make docker-down - Stop dev infrastructure" + @echo " make docker-clean - Stop and remove volumes" + @echo "" + @echo "Production:" + @echo " make 
prod-up - Start full stack (postgres, api, taxonomy-generator)" + @echo " make prod-down - Stop full stack" + @echo " make prod-logs - View logs from production stack" + @echo "" + @echo "Build & Test:" + @echo " make build - Build API binaries" @echo " make test-unit - Run unit tests (fast, no database)" @echo " make tests - Run integration tests" @echo " make test-all - Run all tests (unit + integration)" @echo " make tests-coverage - Run tests with coverage report" - @echo " make init-db - Initialize database schema" + @echo " make schemathesis - Run Schemathesis API tests" + @echo "" + @echo "Code Quality:" @echo " make fmt - Format code with gofumpt" @echo " make fmt-check - Check if code is formatted" @echo " make lint - Run linter" + @echo "" + @echo "Utilities:" + @echo " make init-db - Initialize database schema" + @echo " make backfill-embeddings - Backfill embeddings for existing records" @echo " make deps - Install Go dependencies" - @echo " make install-tools - Install development tools (gofumpt, golangci-lint)" + @echo " make install-tools - Install dev tools (gofumpt, golangci-lint)" @echo " make install-hooks - Install git hooks" - @echo " make docker-up - Start Docker containers" - @echo " make docker-down - Stop Docker containers" - @echo " make docker-clean - Stop Docker containers and remove volumes" @echo " make clean - Clean build artifacts" - @echo " make schemathesis - Run Schemathesis API tests (requires API server running)" # Run all tests (integration tests in tests/ directory) tests: @@ -49,7 +63,19 @@ tests-coverage: build: @echo "Building API server..." go build -o bin/api cmd/api/main.go - @echo "Binary created: bin/api" + go build -o bin/backfill cmd/backfill/main.go + @echo "Binaries created: bin/api, bin/backfill" + +# Backfill embeddings for existing records +# This enqueues River jobs for all records missing embeddings +backfill-embeddings: + @echo "Backfilling embeddings for existing records..." 
+ @if [ -f .env ]; then \ + export $$(grep -v '^#' .env | xargs) && \ + go run cmd/backfill/main.go; \ + else \ + go run cmd/backfill/main.go; \ + fi # Run the API server run: @@ -81,25 +107,34 @@ init-db: echo "Error: DATABASE_URL not found in .env file"; \ exit 1; \ fi && \ - psql "$$DATABASE_URL" -f sql/001_initial_schema.sql; \ + for f in sql/*.sql; do \ + echo "Applying $$f..."; \ + psql "$$DATABASE_URL" -f "$$f"; \ + done; \ else \ if [ -z "$$DATABASE_URL" ]; then \ echo "Error: DATABASE_URL environment variable is not set"; \ echo "Please set it or create a .env file with DATABASE_URL"; \ exit 1; \ fi && \ - psql "$$DATABASE_URL" -f sql/001_initial_schema.sql; \ + for f in sql/*.sql; do \ + echo "Applying $$f..."; \ + psql "$$DATABASE_URL" -f "$$f"; \ + done; \ fi @echo "Database schema initialized successfully" -# Start Docker containers +# Start dev infrastructure (postgres, pgadmin) docker-up: - @echo "Starting Docker containers..." + @echo "Starting dev infrastructure (postgres, pgadmin)..." docker compose up -d - @echo "Waiting for services to be ready..." + @echo "Waiting for postgres to be ready..." @sleep 3 @docker compose ps + @echo "" + @echo "Postgres: localhost:5432" + @echo "pgAdmin: localhost:5050 (admin@formbricks.com / admin)" # Stop Docker containers docker-down: @@ -176,9 +211,51 @@ install-hooks: # Run everything needed for development dev-setup: docker-up deps install-tools init-db install-hooks + @echo "" @echo "Development environment ready!" 
- @echo "Set API_KEY environment variable for authentication" - @echo "Run 'make run' to start the API server" + @echo "" + @echo "Next steps:" + @echo " Terminal 1: make run # Go API on :8080" + @echo " Terminal 2: make taxonomy-dev # Python service on :8001 (optional)" + +# Run Python taxonomy service locally (uses venv + pip, requires Python 3.11+) +# Prefers python3.11 if available, falls back to python3 +TAXONOMY_PYTHON := $(shell command -v python3.11 2>/dev/null || command -v python3 2>/dev/null) + +taxonomy-dev: + @echo "Starting taxonomy-generator service..." + @$(TAXONOMY_PYTHON) -c "import sys; exit(0 if sys.version_info >= (3, 11) else 1)" 2>/dev/null || \ + { echo "Error: Python 3.11+ required. Install with: brew install python@3.11"; exit 1; } + @cd services/taxonomy-generator && \ + if [ ! -f .env ]; then \ + echo "Creating .env from .env.example..."; \ + cp .env.example .env; \ + echo "⚠️ Edit services/taxonomy-generator/.env to set OPENAI_API_KEY"; \ + fi && \ + if [ ! -d .venv ]; then \ + echo "Creating virtual environment..."; \ + $(TAXONOMY_PYTHON) -m venv .venv; \ + fi && \ + . .venv/bin/activate && \ + pip install -q -r requirements.txt && \ + uvicorn src.main:app --reload --port 8001 + +# Production: start full stack +prod-up: + @echo "Starting production stack..." + docker compose -f docker-compose.prod.yml up -d --build + @echo "Waiting for services..." + @sleep 5 + @docker compose -f docker-compose.prod.yml ps + +# Production: stop full stack +prod-down: + @echo "Stopping production stack..." 
+ docker compose -f docker-compose.prod.yml down + +# Production: view logs +prod-logs: + docker compose -f docker-compose.prod.yml logs -f # Run Schemathesis API tests (all phases for thorough local testing) # Phases: examples (schema examples), coverage (boundary values), stateful (API sequences), fuzzing (random) diff --git a/README.md b/README.md index bdc81aa..fbc042f 100644 --- a/README.md +++ b/README.md @@ -25,11 +25,12 @@ An open-source Experience Management (XM) database service. Hub is a headless AP ### Current Features - ✅ **RESTful API** for feedback record CRUD operations -- ✅ **PostgreSQL** for data persistence with optimized schema +- ✅ **PostgreSQL** with pgvector for data persistence and vector search +- ✅ **AI-Powered Taxonomy** - automatic topic clustering with UMAP, HDBSCAN, and GPT-4o +- ✅ **Embedding Generation** - async embedding generation via River job queue - ✅ **API Key Authentication** via environment variable - ✅ **Clean Architecture** with repository, service, and handler layers -- ✅ **Docker Compose** for local development -- ✅ **Database Schema** initialization +- ✅ **Docker Compose** for local development and production - ✅ **Swagger/OpenAPI** documentation - ✅ **Health Check** endpoints @@ -47,7 +48,8 @@ An open-source Experience Management (XM) database service. Hub is a headless AP ``` . ├── cmd/ -│ └── api/ # API server entrypoint +│ ├── api/ # API server entrypoint +│ └── backfill/ # CLI tool for backfilling embeddings ├── internal/ │ ├── api/ │ │ ├── handlers/ # HTTP request handlers @@ -55,12 +57,16 @@ An open-source Experience Management (XM) database service. 
Hub is a headless AP │ ├── service/ # Business logic layer │ ├── repository/ # Data access layer │ ├── models/ # Domain models and DTOs +│ ├── worker/ # Background workers (taxonomy scheduler) +│ ├── jobs/ # River job queue workers │ └── config/ # Configuration management ├── pkg/ │ └── database/ # Database utilities and connection pooling +├── services/ +│ └── taxonomy-generator/ # Python microservice for ML clustering ├── sql/ # SQL schema files -├── tests/ # Integration tests -└── docs/ # API documentation (Swagger) +├── tests/ # Integration tests +└── docs/ # API documentation (Swagger) ``` ## Getting Started @@ -199,15 +205,25 @@ Authorization: Bearer ### Available Make Commands ```bash -make help # Show all available commands -make dev-setup # Set up development environment (docker, deps, tools, schema) -make build # Build all binaries -make run # Run the API server -make tests # Run all tests -make init-db # Initialize database schema -make docker-up # Start Docker containers -make docker-down # Stop Docker containers -make clean # Clean build artifacts +make help # Show all available commands + +# Development +make dev-setup # Set up dev environment (postgres, deps, tools, schema, hooks) +make run # Run Go API server locally (port 8080) +make taxonomy-dev # Run Python taxonomy service locally (port 8001) +make docker-up # Start dev infrastructure (postgres, pgadmin) +make docker-down # Stop dev infrastructure + +# Production +make prod-up # Start full containerized stack +make prod-down # Stop full stack +make prod-logs # View production logs + +# Build & Test +make build # Build all binaries +make test-unit # Fast unit tests (no database) +make tests # Integration tests (requires database) +make test-all # Run all tests ``` ### Running Tests @@ -218,6 +234,20 @@ make tests # Integration tests (requires database) make test-all # Run all tests ``` +### Running with Taxonomy Service + +For full AI-powered topic clustering, run both services: + +```bash +# 
Terminal 1: Go API +make run + +# Terminal 2: Python taxonomy service +make taxonomy-dev +``` + +The taxonomy service requires `OPENAI_API_KEY` in `services/taxonomy-generator/.env`. + ### Git Hooks The repository includes pre-commit hooks for code quality. To install them: diff --git a/backfill b/backfill new file mode 100755 index 0000000..35b4dbf Binary files /dev/null and b/backfill differ diff --git a/cmd/api/main.go b/cmd/api/main.go index 2e2b348..30aa087 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -13,9 +13,17 @@ import ( "github.com/formbricks/hub/internal/api/handlers" "github.com/formbricks/hub/internal/api/middleware" "github.com/formbricks/hub/internal/config" + "github.com/formbricks/hub/internal/embeddings" + "github.com/formbricks/hub/internal/jobs" "github.com/formbricks/hub/internal/repository" "github.com/formbricks/hub/internal/service" + "github.com/formbricks/hub/internal/worker" "github.com/formbricks/hub/pkg/database" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + "github.com/riverqueue/river" + "github.com/riverqueue/river/riverdriver/riverpgxv5" + "golang.org/x/time/rate" ) func main() { @@ -39,10 +47,65 @@ func main() { } defer db.Close() - // Initialize repository, service, and handler layers + // Initialize embedding client if OpenAI API key is configured + var embeddingClient embeddings.Client + if cfg.OpenAIAPIKey != "" { + embeddingClient = embeddings.NewOpenAIClient(cfg.OpenAIAPIKey) + slog.Info("AI enrichment enabled", "embedding_model", "text-embedding-3-small") + } else { + slog.Info("AI enrichment disabled (OPENAI_API_KEY not set)") + } + + // Initialize repositories + topicsRepo := repository.NewTopicsRepository(db) feedbackRecordsRepo := repository.NewFeedbackRecordsRepository(db) - feedbackRecordsService := service.NewFeedbackRecordsService(feedbackRecordsRepo) + knowledgeRecordsRepo := repository.NewKnowledgeRecordsRepository(db) + + // Initialize River job queue if enabled and embedding client 
is configured + var riverClient *river.Client[pgx.Tx] + var jobInserter jobs.JobInserter + if cfg.RiverEnabled && embeddingClient != nil { + var err error + riverClient, err = initRiver(ctx, db, cfg, embeddingClient, feedbackRecordsRepo, topicsRepo, knowledgeRecordsRepo) + if err != nil { + slog.Error("Failed to initialize River job queue", "error", err) + os.Exit(1) + } + jobInserter = jobs.NewRiverJobInserter(riverClient) + slog.Info("River job queue enabled", + "workers", cfg.RiverWorkers, + "max_retries", cfg.RiverMaxRetries, + "rate_limit", cfg.EmbeddingRateLimit, + ) + } else if cfg.OpenAIAPIKey != "" && !cfg.RiverEnabled { + slog.Info("River job queue disabled (RIVER_ENABLED=false), using legacy goroutines") + } + + // Initialize services with optional job inserter + var topicsService *service.TopicsService + if embeddingClient != nil { + topicsService = service.NewTopicsServiceWithEmbeddings(topicsRepo, embeddingClient, jobInserter) + } else { + topicsService = service.NewTopicsService(topicsRepo) + } + topicsHandler := handlers.NewTopicsHandler(topicsService) + + var feedbackRecordsService *service.FeedbackRecordsService + if embeddingClient != nil { + feedbackRecordsService = service.NewFeedbackRecordsServiceWithEmbeddings(feedbackRecordsRepo, embeddingClient, jobInserter) + } else { + feedbackRecordsService = service.NewFeedbackRecordsService(feedbackRecordsRepo) + } feedbackRecordsHandler := handlers.NewFeedbackRecordsHandler(feedbackRecordsService) + + var knowledgeRecordsService *service.KnowledgeRecordsService + if embeddingClient != nil { + knowledgeRecordsService = service.NewKnowledgeRecordsServiceWithEmbeddings(knowledgeRecordsRepo, embeddingClient, jobInserter) + } else { + knowledgeRecordsService = service.NewKnowledgeRecordsService(knowledgeRecordsRepo) + } + knowledgeRecordsHandler := handlers.NewKnowledgeRecordsHandler(knowledgeRecordsService) + healthHandler := handlers.NewHealthHandler() // Set up public endpoints (no authentication 
required) @@ -62,10 +125,39 @@ func main() { protectedMux.HandleFunc("DELETE /v1/feedback-records/{id}", feedbackRecordsHandler.Delete) protectedMux.HandleFunc("DELETE /v1/feedback-records", feedbackRecordsHandler.BulkDelete) + protectedMux.HandleFunc("POST /v1/knowledge-records", knowledgeRecordsHandler.Create) + protectedMux.HandleFunc("GET /v1/knowledge-records", knowledgeRecordsHandler.List) + protectedMux.HandleFunc("GET /v1/knowledge-records/{id}", knowledgeRecordsHandler.Get) + protectedMux.HandleFunc("PATCH /v1/knowledge-records/{id}", knowledgeRecordsHandler.Update) + protectedMux.HandleFunc("DELETE /v1/knowledge-records/{id}", knowledgeRecordsHandler.Delete) + protectedMux.HandleFunc("DELETE /v1/knowledge-records", knowledgeRecordsHandler.BulkDelete) + + protectedMux.HandleFunc("POST /v1/topics", topicsHandler.Create) + protectedMux.HandleFunc("GET /v1/topics", topicsHandler.List) + protectedMux.HandleFunc("GET /v1/topics/{id}/children", topicsHandler.GetChildren) + protectedMux.HandleFunc("GET /v1/topics/{id}", topicsHandler.Get) + protectedMux.HandleFunc("PATCH /v1/topics/{id}", topicsHandler.Update) + protectedMux.HandleFunc("DELETE /v1/topics/{id}", topicsHandler.Delete) + + // Taxonomy generation endpoints (calls Python microservice) + taxonomyClient := service.NewTaxonomyClient(cfg.TaxonomyServiceURL) + clusteringJobsRepo := repository.NewClusteringJobsRepository(db) + taxonomyHandler := handlers.NewTaxonomyHandlerWithSchedule(taxonomyClient, clusteringJobsRepo) + protectedMux.HandleFunc("POST /v1/taxonomy/{tenant_id}/generate", taxonomyHandler.Generate) + protectedMux.HandleFunc("POST /v1/taxonomy/{tenant_id}/generate/sync", taxonomyHandler.GenerateSync) + protectedMux.HandleFunc("GET /v1/taxonomy/{tenant_id}/status", taxonomyHandler.Status) + protectedMux.HandleFunc("GET /v1/taxonomy/health", taxonomyHandler.Health) + // Schedule management + protectedMux.HandleFunc("POST /v1/taxonomy/{tenant_id}/schedule", taxonomyHandler.CreateSchedule) + 
protectedMux.HandleFunc("GET /v1/taxonomy/{tenant_id}/schedule", taxonomyHandler.GetSchedule) + protectedMux.HandleFunc("DELETE /v1/taxonomy/{tenant_id}/schedule", taxonomyHandler.DeleteSchedule) + protectedMux.HandleFunc("GET /v1/taxonomy/schedules", taxonomyHandler.ListSchedules) + // Apply middleware to protected endpoints + // Order matters: CORS must wrap Auth so OPTIONS preflight requests bypass authentication var protectedHandler http.Handler = protectedMux protectedHandler = middleware.Auth(cfg.APIKey)(protectedHandler) - // protectedHandler = middleware.CORS(protectedHandler) // CORS disabled + protectedHandler = middleware.CORS(protectedHandler) // CORS wraps Auth // Combine both handlers mainMux := http.NewServeMux() @@ -76,11 +168,13 @@ func main() { handler := middleware.Logging(mainMux) // Create HTTP server + // WriteTimeout is set high to support long-running sync endpoints like /taxonomy/.../generate/sync + // For production, consider using async endpoints with polling instead server := &http.Server{ Addr: ":" + cfg.Port, Handler: handler, ReadTimeout: 15 * time.Second, - WriteTimeout: 15 * time.Second, + WriteTimeout: 10 * time.Minute, IdleTimeout: 60 * time.Second, } @@ -93,6 +187,20 @@ func main() { } }() + // Start taxonomy scheduler if enabled + workerCtx, workerCancel := context.WithCancel(context.Background()) + defer workerCancel() + + if cfg.TaxonomySchedulerEnabled { + taxonomyScheduler := worker.NewTaxonomyScheduler( + clusteringJobsRepo, + taxonomyClient, + cfg.TaxonomyPollInterval, + 5, // batch size + ) + go taxonomyScheduler.Start(workerCtx) + } + // Wait for interrupt signal to gracefully shutdown the server quit := make(chan os.Signal, 1) signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM) @@ -100,12 +208,21 @@ func main() { slog.Info("Shutting down server...") - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + shutdownCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() 
- if err := server.Shutdown(ctx); err != nil { + // 1. Stop accepting new HTTP requests + if err := server.Shutdown(shutdownCtx); err != nil { slog.Error("Server forced to shutdown", "error", err) - os.Exit(1) + } + + // 2. Stop River (waits for in-flight jobs to complete) + if riverClient != nil { + slog.Info("Stopping River job queue...") + if err := riverClient.Stop(shutdownCtx); err != nil { + slog.Error("River forced to shutdown", "error", err) + } + slog.Info("River job queue stopped") } slog.Info("Server exited") @@ -134,3 +251,54 @@ func setupLogging(level string) { handler := slog.NewTextHandler(os.Stdout, opts) slog.SetDefault(slog.New(handler)) } + +// initRiver initializes the River job queue client and workers +func initRiver( + ctx context.Context, + db *pgxpool.Pool, + cfg *config.Config, + embeddingClient embeddings.Client, + feedbackRepo *repository.FeedbackRecordsRepository, + topicsRepo *repository.TopicsRepository, + knowledgeRepo *repository.KnowledgeRecordsRepository, +) (*river.Client[pgx.Tx], error) { + // Create rate limiter for OpenAI API calls + rateLimiter := rate.NewLimiter(rate.Limit(cfg.EmbeddingRateLimit), 1) + + // Create embedding worker with dependencies + embeddingWorker := jobs.NewEmbeddingWorker(jobs.EmbeddingWorkerDeps{ + EmbeddingClient: embeddingClient, + FeedbackUpdater: jobs.NewFeedbackRecordsUpdater(feedbackRepo), + TopicUpdater: jobs.NewTopicsUpdater(topicsRepo), + KnowledgeUpdater: jobs.NewKnowledgeRecordsUpdater(knowledgeRepo), + RateLimiter: rateLimiter, + // Topic assignment for real-time feedback classification + TopicMatcher: topicsRepo, + FeedbackAssigner: feedbackRepo, + }) + + // Register workers + workers := river.NewWorkers() + river.AddWorker(workers, embeddingWorker) + + // Create River client + riverClient, err := river.NewClient(riverpgxv5.New(db), &river.Config{ + Queues: map[string]river.QueueConfig{ + river.QueueDefault: {MaxWorkers: cfg.RiverWorkers}, + }, + Workers: workers, + ErrorHandler: 
&jobs.ErrorHandler{}, + JobTimeout: 60 * time.Second, // Timeout for individual jobs + MaxAttempts: cfg.RiverMaxRetries, + }) + if err != nil { + return nil, err + } + + // Start River (begins processing jobs) + if err := riverClient.Start(ctx); err != nil { + return nil, err + } + + return riverClient, nil +} diff --git a/cmd/backfill/main.go b/cmd/backfill/main.go new file mode 100644 index 0000000..1c93c0b --- /dev/null +++ b/cmd/backfill/main.go @@ -0,0 +1,154 @@ +// Package main provides a CLI tool to backfill embeddings for existing records. +// This enqueues River jobs for all records that are missing embeddings. +// +// Usage: +// +// go run cmd/backfill/main.go +// +// Or after building: +// +// ./bin/backfill +// +// Environment variables: +// - DATABASE_URL: PostgreSQL connection string (required) +// - OPENAI_API_KEY: OpenAI API key (required for embedding generation) +// - RIVER_WORKERS: Number of concurrent workers (default: 10) +// - EMBEDDING_RATE_LIMIT: OpenAI requests per second (default: 50) +package main + +import ( + "context" + "fmt" + "log/slog" + "os" + "time" + + "github.com/formbricks/hub/internal/config" + "github.com/formbricks/hub/internal/embeddings" + "github.com/formbricks/hub/internal/jobs" + "github.com/formbricks/hub/internal/repository" + "github.com/formbricks/hub/pkg/database" + "github.com/riverqueue/river" + "github.com/riverqueue/river/riverdriver/riverpgxv5" + "golang.org/x/time/rate" +) + +func main() { + ctx := context.Background() + + // Configure logging + slog.SetDefault(slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ + Level: slog.LevelInfo, + }))) + + slog.Info("Starting embedding backfill...") + + // Load configuration + cfg, err := config.Load() + if err != nil { + slog.Error("Failed to load configuration", "error", err) + os.Exit(1) + } + + if cfg.OpenAIAPIKey == "" { + slog.Error("OPENAI_API_KEY is required for embedding generation") + os.Exit(1) + } + + // Connect to database + db, err := 
database.NewPostgresPool(ctx, cfg.DatabaseURL) + if err != nil { + slog.Error("Failed to connect to database", "error", err) + os.Exit(1) + } + defer db.Close() + + // Initialize embedding client + embeddingClient := embeddings.NewOpenAIClient(cfg.OpenAIAPIKey) + + // Initialize repositories for the worker + feedbackRepo := repository.NewFeedbackRecordsRepository(db) + topicsRepo := repository.NewTopicsRepository(db) + knowledgeRepo := repository.NewKnowledgeRecordsRepository(db) + + // Create rate limiter + rateLimiter := rate.NewLimiter(rate.Limit(cfg.EmbeddingRateLimit), 1) + + // Create embedding worker + embeddingWorker := jobs.NewEmbeddingWorker(jobs.EmbeddingWorkerDeps{ + EmbeddingClient: embeddingClient, + FeedbackUpdater: jobs.NewFeedbackRecordsUpdater(feedbackRepo), + TopicUpdater: jobs.NewTopicsUpdater(topicsRepo), + KnowledgeUpdater: jobs.NewKnowledgeRecordsUpdater(knowledgeRepo), + RateLimiter: rateLimiter, + }) + + // Register workers + workers := river.NewWorkers() + river.AddWorker(workers, embeddingWorker) + + // Create River client + riverClient, err := river.NewClient(riverpgxv5.New(db), &river.Config{ + Queues: map[string]river.QueueConfig{ + river.QueueDefault: {MaxWorkers: cfg.RiverWorkers}, + }, + Workers: workers, + ErrorHandler: &jobs.ErrorHandler{}, + JobTimeout: 60 * time.Second, + MaxAttempts: cfg.RiverMaxRetries, + }) + if err != nil { + slog.Error("Failed to create River client", "error", err) + os.Exit(1) + } + + // Start River + if err := riverClient.Start(ctx); err != nil { + slog.Error("Failed to start River", "error", err) + os.Exit(1) + } + + // Create job inserter + inserter := jobs.NewRiverJobInserter(riverClient) + + // Run backfill + slog.Info("Enqueueing embedding jobs for records without embeddings...") + stats, err := jobs.Backfill(ctx, db, inserter) + if err != nil { + slog.Error("Backfill failed", "error", err) + } + + // Print results + fmt.Println() + fmt.Println("Backfill Summary") + fmt.Println("================") + 
fmt.Printf("Feedback records enqueued: %d\n", stats.FeedbackRecordsEnqueued) + fmt.Printf("Topics enqueued: %d\n", stats.TopicsEnqueued) + fmt.Printf("Knowledge records enqueued: %d\n", stats.KnowledgeRecordsEnqueued) + fmt.Printf("Errors: %d\n", stats.Errors) + fmt.Println() + + total := stats.FeedbackRecordsEnqueued + stats.TopicsEnqueued + stats.KnowledgeRecordsEnqueued + if total == 0 { + slog.Info("No records need backfilling") + } else { + slog.Info("Jobs enqueued successfully", + "total", total, + "feedback", stats.FeedbackRecordsEnqueued, + "topics", stats.TopicsEnqueued, + "knowledge", stats.KnowledgeRecordsEnqueued, + ) + fmt.Println("Jobs have been enqueued. They will be processed by the running API server.") + fmt.Println("You can also run this command with the API server stopped and wait for completion.") + } + + // Stop River gracefully + shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := riverClient.Stop(shutdownCtx); err != nil { + slog.Error("Failed to stop River gracefully", "error", err) + } + + slog.Info("Backfill complete") +} diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..2669585 --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,75 @@ +# Production stack - all services containerized +# Usage: docker compose -f docker-compose.prod.yml up -d + +services: + # PostgreSQL database with pgvector extension + postgres: + image: pgvector/pgvector:pg18 + container_name: formbricks_postgres + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: test_db + ports: + - '5432:5432' + volumes: + - postgres_data:/var/lib/postgresql + healthcheck: + test: ['CMD-SHELL', 'pg_isready -U postgres -d test_db'] + interval: 10s + timeout: 5s + retries: 5 + command: > + postgres + -c shared_preload_libraries=vector + + # Hub API - Go API server + api: + build: + context: . 
+ dockerfile: Dockerfile + container_name: formbricks_hub_api + environment: + DATABASE_URL: postgresql://postgres:postgres@postgres:5432/test_db + API_KEY: ${API_KEY} + OPENAI_API_KEY: ${OPENAI_API_KEY} + TAXONOMY_SERVICE_URL: http://taxonomy-generator:8001 + TAXONOMY_SCHEDULER_ENABLED: 'true' + LOG_LEVEL: info + ports: + - '8080:8080' + depends_on: + postgres: + condition: service_healthy + taxonomy-generator: + condition: service_healthy + healthcheck: + test: ['CMD', 'wget', '-q', '--spider', 'http://localhost:8080/health'] + interval: 10s + timeout: 5s + retries: 3 + + # Taxonomy Generator - Python microservice for UMAP/HDBSCAN clustering + taxonomy-generator: + build: + context: ./services/taxonomy-generator + dockerfile: Dockerfile + container_name: formbricks_taxonomy_generator + environment: + DATABASE_URL: postgresql://postgres:postgres@postgres:5432/test_db + OPENAI_API_KEY: ${OPENAI_API_KEY} + LOG_LEVEL: INFO + ports: + - '8001:8001' + depends_on: + postgres: + condition: service_healthy + healthcheck: + test: ['CMD', 'python', '-c', "import httpx; httpx.get('http://localhost:8001/health')"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + +volumes: + postgres_data: diff --git a/docker-compose.yml b/docker-compose.yml index c5e83e5..c3ba578 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,8 @@ +# Development infrastructure only - databases and tools +# Run services locally with: make run (Go API), poetry run uvicorn (Python) + services: - # PostgreSQL database with pgvector extension for Hub + # PostgreSQL database with pgvector extension postgres: image: pgvector/pgvector:pg18 container_name: formbricks_postgres @@ -20,5 +23,22 @@ services: postgres -c shared_preload_libraries=vector + # pgAdmin for database visualization (optional, access at localhost:5050) + pgadmin: + image: dpage/pgadmin4:latest + container_name: formbricks_pgadmin + environment: + PGADMIN_DEFAULT_EMAIL: admin@formbricks.com + 
PGADMIN_DEFAULT_PASSWORD: admin + PGADMIN_CONFIG_SERVER_MODE: 'False' + PGADMIN_CONFIG_MASTER_PASSWORD_REQUIRED: 'False' + ports: + - '5050:80' + volumes: + - pgadmin_data:/var/lib/pgadmin + depends_on: + - postgres + volumes: postgres_data: + pgadmin_data: diff --git a/docs/enrichment.md b/docs/enrichment.md new file mode 100644 index 0000000..9ef91f7 --- /dev/null +++ b/docs/enrichment.md @@ -0,0 +1,203 @@ +# AI Enrichment Architecture + +This document describes the AI enrichment system for Formbricks Hub, including Knowledge Records and Topics (taxonomy) that enable intelligent feedback categorization and insights. + +## Overview + +The Formbricks Hub stores feedback records from various sources. To provide meaningful AI-powered insights, we need two additional data types: + +1. **Knowledge Records** - Contextual information about the product, company, and domain that helps AI understand feedback better +2. **Topics** - A hierarchical taxonomy for categorizing feedback into themes and subthemes + +Together, these enable the AI to: +- Understand the context of feedback (what product features exist, company terminology, etc.) +- Categorize feedback into meaningful, user-defined categories +- Generate insights based on both the feedback content and organizational knowledge + +## Knowledge Records + +### Purpose + +Knowledge Records store contextual information that enriches AI understanding when processing feedback. 
This could include: + +- Product descriptions and feature explanations +- Company terminology and domain-specific language +- Business context and priorities +- Historical context about features or decisions + +### Data Model + +``` +KnowledgeRecord { + id: UUIDv7 + content: string (max 10,000 chars) + tenant_id: string (optional, for multi-tenancy) + created_at: datetime + updated_at: datetime +} +``` + +### Design Decisions + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| Content max length | 10,000 characters | Optimal for vector embeddings; longer content should be chunked | +| Fields | Minimal (id, content, timestamps) | MVP simplicity; can extend later with title, source_type, metadata | +| Bulk delete | By tenant_id | Supports data isolation and cleanup for multi-tenant deployments | + +### Future Considerations + +- **Vector embeddings**: Knowledge records will be embedded and stored in pgvector for semantic search +- **Chunking**: Long documents may need automatic chunking for optimal embedding performance +- **Linking**: May link knowledge records to specific topics or feedback categories + +## Topics (Taxonomy) + +### Purpose + +Topics provide a hierarchical classification system for organizing feedback. Users define their own taxonomy based on their product and business needs. 
+ +**Example hierarchy:** +``` +Performance (level 1) +├── Dashboard slow (level 2) +├── API response time (level 2) +└── Mobile app lag (level 2) + +Feature Requests (level 1) +├── Dashboard improvements (level 2) +├── New integrations (level 2) +└── Mobile features (level 2) +``` + +### Data Model + +``` +Topic { + id: UUIDv7 + title: string (max 255 chars) + level: integer (auto-calculated) + parent_id: uuid (nullable) + tenant_id: string (optional) + created_at: datetime + updated_at: datetime +} +``` + +### Design Decisions + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| Naming | "Topics" | Clear, concise, fits theme/subtheme concept better than "taxonomy-items" or "categories" | +| Level field | Auto-calculated by server | Prevents inconsistencies; server computes as `parent.level + 1` or `1` if no parent | +| Level support | Unlimited depth (designed for 3-4+ levels) | Future-proofed for complex taxonomies | +| Parent immutability | Cannot change parent_id after creation | Simplifies hierarchy management, prevents circular references | +| Title uniqueness | Unique within same parent + tenant | Prevents confusing duplicate siblings | +| Delete behavior | Cascade delete | Deleting a parent removes all descendants | + +### Why These Constraints? + +**Level auto-calculation**: Allowing manual level input creates risk of inconsistent hierarchies (e.g., a child with level 1 under a parent with level 2). Server-side calculation ensures data integrity. + +**Parent immutability**: Moving topics between parents would require: +- Recalculating levels for entire subtrees +- Handling circular reference detection +- Complex UI for tree restructuring + +For MVP, immutability is simpler and safer. Users can delete and recreate topics if restructuring is needed. 
+ +**Cascade delete**: Alternative approaches considered: +- **Prevent deletion** if children exist (safest but restrictive) +- **Orphan children** by setting parent_id to null (confusing UX) +- **Cascade delete** (chosen) - clear behavior, documented in API + +## API Design + +### Endpoints + +#### Knowledge Records +- `GET /v1/knowledge-records` - List with filters +- `POST /v1/knowledge-records` - Create +- `GET /v1/knowledge-records/{id}` - Get by ID +- `PATCH /v1/knowledge-records/{id}` - Update content +- `DELETE /v1/knowledge-records/{id}` - Delete single +- `DELETE /v1/knowledge-records?tenant_id=...` - Bulk delete by tenant + +#### Topics +- `GET /v1/topics` - List with filters (level, parent_id, title, tenant_id) +- `POST /v1/topics` - Create (level auto-calculated) +- `GET /v1/topics/{id}` - Get by ID +- `PATCH /v1/topics/{id}` - Update title only +- `DELETE /v1/topics/{id}` - Delete (cascades to descendants) + +### Authentication + +All endpoints are protected by the same `ApiKeyAuth` (Bearer token) used for feedback records. No additional configuration needed - inherited from global security scheme. + +### Multi-tenancy + +Both Knowledge Records and Topics support optional `tenant_id` for multi-tenant deployments. 
This allows: +- Different tenants to have separate knowledge bases +- Different tenants to have separate taxonomies +- Data isolation and independent bulk operations + +### Error Handling + +Following RFC 7807 Problem Details (`application/problem+json`): + +| Status | When | +|--------|------| +| 400 Bad Request | Validation failures, missing required fields | +| 404 Not Found | Record/topic not found, parent topic not found | +| 409 Conflict | Duplicate topic title within same parent and tenant | + +## Future Roadmap + +### Phase 1: MVP (Current) +- [x] Knowledge Records CRUD API +- [x] Topics CRUD API with hierarchy +- [x] Database schema and migrations +- [x] Service and repository implementation +- [x] Integration tests + +### Phase 2: Vector Search +- [ ] pgvector integration for knowledge records +- [ ] Embedding generation pipeline +- [ ] Semantic search endpoints +- [ ] Knowledge record chunking for long content + +### Phase 3: AI Categorization +- [ ] Link feedback records to topics +- [ ] AI-powered automatic categorization +- [ ] Confidence scores for categorizations +- [ ] Manual override and feedback loop + +### Phase 4: Insights +- [ ] Topic-based aggregations +- [ ] Trend detection within topics +- [ ] Knowledge-enhanced insight generation +- [ ] Dashboard visualizations + +## Technical Notes + +### Validation Patterns + +Following existing feedback-records patterns: +- NULL bytes pattern: `^[^\x00]*$` on all text fields +- String length constraints: `minLength`, `maxLength` +- UUID format validation for IDs +- Pagination: limit (1-1000, default 100), offset (0-2147483647) + +### Naming Conventions + +| Concept | Naming | +|---------|--------| +| Resource URLs | kebab-case plural (`/v1/knowledge-records`, `/v1/topics`) | +| JSON fields | snake_case (`tenant_id`, `parent_id`, `created_at`) | +| Schema names | PascalCase (`KnowledgeRecordData`, `TopicData`) | +| Operation IDs | kebab-case verb-noun (`list-topics`, `create-knowledge-record`) | + +--- + 
+*Last updated: January 2026* +*Status: Phase 1 MVP complete - API fully implemented* diff --git a/go.mod b/go.mod index 80ee02c..92be752 100644 --- a/go.mod +++ b/go.mod @@ -8,7 +8,13 @@ require ( github.com/google/uuid v1.6.0 github.com/jackc/pgx/v5 v5.8.0 github.com/joho/godotenv v1.5.1 + github.com/pgvector/pgvector-go v0.3.0 + github.com/riverqueue/river v0.30.2 + github.com/riverqueue/river/riverdriver/riverpgxv5 v0.30.2 + github.com/riverqueue/river/rivertype v0.30.2 + github.com/sashabaranov/go-openai v1.36.1 github.com/stretchr/testify v1.11.1 + golang.org/x/time v0.14.0 ) require ( @@ -19,10 +25,16 @@ require ( github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect - github.com/kr/text v0.2.0 // indirect github.com/leodido/go-urn v1.4.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/riverqueue/river/riverdriver v0.30.2 // indirect + github.com/riverqueue/river/rivershared v0.30.2 // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect + github.com/tidwall/gjson v1.18.0 // indirect + github.com/tidwall/match v1.2.0 // indirect + github.com/tidwall/pretty v1.2.1 // indirect + github.com/tidwall/sjson v1.2.5 // indirect + go.uber.org/goleak v1.3.0 // indirect golang.org/x/crypto v0.47.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/sys v0.40.0 // indirect diff --git a/go.sum b/go.sum index 7a85dff..3d48fb5 100644 --- a/go.sum +++ b/go.sum @@ -1,9 +1,14 @@ -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +entgo.io/ent v0.14.3 h1:wokAV/kIlH9TeklJWGGS7AYJdVckr0DloWjIcO9iIIQ= +entgo.io/ent v0.14.3/go.mod h1:aDPE/OziPEu8+OWbzy4UlvWmD2/kbRuWfK2A40hcxJM= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/gabriel-vasile/mimetype v1.4.12 h1:e9hWvmLYvtp846tLHam2o++qitpguFiYCKbn0w9jyqw= github.com/gabriel-vasile/mimetype v1.4.12/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s= +github.com/go-pg/pg/v10 v10.11.0 h1:CMKJqLgTrfpE/aOVeLdybezR2om071Vh38OLZjsyMI0= +github.com/go-pg/pg/v10 v10.11.0/go.mod h1:4BpHRoxE61y4Onpof3x1a2SQvi9c+q1dJnrNdMjsroA= +github.com/go-pg/zerochecker v0.2.0 h1:pp7f72c3DobMWOb2ErtZsnrPaSvHd2W4o9//8HtF4mU= +github.com/go-pg/zerochecker v0.2.0/go.mod h1:NJZ4wKL0NmTtz0GKCoJ8kym6Xn/EQzXRl2OnAe7MmDo= github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= github.com/go-playground/form/v4 v4.3.0 h1:OVttojbQv2WNCs4P+VnjPtrt/+30Ipw4890W3OaFlvk= @@ -16,6 +21,8 @@ github.com/go-playground/validator/v10 v10.30.1 h1:f3zDSN/zOma+w6+1Wswgd9fLkdwy0 github.com/go-playground/validator/v10 v10.30.1/go.mod h1:oSuBIQzuJxL//3MelwSLD5hc2Tu889bF0Idm9Dg26cM= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/jackc/pgerrcode v0.0.0-20240316143900-6e2875d9b438 h1:Dj0L5fhJ9F82ZJyVOmBx6msDp/kfd1t9GRfny/mfJA0= +github.com/jackc/pgerrcode v0.0.0-20240316143900-6e2875d9b438/go.mod h1:a/s9Lp5W7n/DD0VrVoyJ00FbP2ytTPDVOivvn2bMlds= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= @@ -24,6 +31,12 @@ github.com/jackc/pgx/v5 v5.8.0 h1:TYPDoleBBme0xGSAX3/+NujXXtpZn9HBONkQC7IEZSo= github.com/jackc/pgx/v5 v5.8.0/go.mod h1:QVeDInX2m9VyzvNeiCJVjCkNFqzsNb43204HshNSZKw= github.com/jackc/puddle/v2 v2.2.2 
h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= +github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= +github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ= +github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= +github.com/jmoiron/sqlx v1.3.5 h1:vFFPA71p1o5gAeqtEAwLU4dnX2napprKtHr7PYIcN3g= +github.com/jmoiron/sqlx v1.3.5/go.mod h1:nRVWtLre0KfCLJvgxzCsLVMogSvQ1zNJtpYr2Ccp0mQ= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= @@ -32,15 +45,64 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= +github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= +github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/pgvector/pgvector-go v0.3.0 h1:Ij+Yt78R//uYqs3Zk35evZFvr+G0blW0OUN+Q2D1RWc= +github.com/pgvector/pgvector-go v0.3.0/go.mod h1:duFy+PXWfW7QQd5ibqutBO4GxLsUZ9RVXhFZGIBsWSA= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/riverqueue/river v0.30.2 h1:RtJ3/CBat00Jjtllvy2P7A/QxSH3PRR0ri/B8PxWm1w= +github.com/riverqueue/river v0.30.2/go.mod h1:iPpsnw82MCcwAVhLo42g7eNdb5apT8VZ37Bel2x/Gws= +github.com/riverqueue/river/riverdriver v0.30.2 
h1:JUmzh0iGPVpK4H7hugpgmQm2crOI9X4iKsd/9wz3IJk= +github.com/riverqueue/river/riverdriver v0.30.2/go.mod h1:w8DiNtR5uUfpIoNZVq1K7Xv0ER+1GrBK8nIxRFugiqI= +github.com/riverqueue/river/riverdriver/riverpgxv5 v0.30.2 h1:nrz1NOLm9BXzTK96ANYmkiOXgjfD3+nLUbP7CrdSzY0= +github.com/riverqueue/river/riverdriver/riverpgxv5 v0.30.2/go.mod h1:KmZHJvXC1eOXSHxJa3V0JKBI+sSYhhAxkAl7AKRQPXk= +github.com/riverqueue/river/rivershared v0.30.2 h1:LFGWnhFZIXNgooXVRY/+Of6bc9Z6ndZ8kf0A6hUO+8c= +github.com/riverqueue/river/rivershared v0.30.2/go.mod h1:K/DCaSKzbmVcOLC2PmaPycHdc56MMTZjU3LWiNh3yqQ= +github.com/riverqueue/river/rivertype v0.30.2 h1:9VVcrsXEPDFnl6qyOS0PxEoUSo9P5yD1E1HwyTpbXS8= +github.com/riverqueue/river/rivertype v0.30.2/go.mod h1:rWpgI59doOWS6zlVocROcwc00fZ1RbzRwsRTU8CDguw= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/sashabaranov/go-openai v1.36.1 h1:EVfRXwIlW2rUzpx6vR+aeIKCK/xylSrVYAx1TMTSX3g= +github.com/sashabaranov/go-openai v1.36.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod 
h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/match v1.2.0 h1:0pt8FlkOwjN2fPt4bIl4BoNxb98gGHN2ObFEDkrfZnM= +github.com/tidwall/match v1.2.0/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc h1:9lRDQMhESg+zvGYmW5DyG0UqvY96Bu5QYsTLvCHdrgo= +github.com/tmthrgd/go-hex v0.0.0-20190904060850-447a3041c3bc/go.mod h1:bciPuU6GHm1iF1pBvUfxfsH0Wmnc2VbpgvbI9ZWuIRs= +github.com/uptrace/bun v1.1.12 h1:sOjDVHxNTuM6dNGaba0wUuz7KvDE1BmNu9Gqs2gJSXQ= +github.com/uptrace/bun v1.1.12/go.mod h1:NPG6JGULBeQ9IU6yHp7YGELRa5Agmd7ATZdz4tGZ6z0= +github.com/uptrace/bun/dialect/pgdialect v1.1.12 h1:m/CM1UfOkoBTglGO5CUTKnIKKOApOYxkcP2qn0F9tJk= +github.com/uptrace/bun/dialect/pgdialect v1.1.12/go.mod h1:Ij6WIxQILxLlL2frUBxUBOZJtLElD2QQNDcu/PWDHTc= +github.com/uptrace/bun/driver/pgdriver v1.1.12 h1:3rRWB1GK0psTJrHwxzNfEij2MLibggiLdTqjTtfHc1w= +github.com/uptrace/bun/driver/pgdriver v1.1.12/go.mod h1:ssYUP+qwSEgeDDS1xm2XBip9el1y9Mi5mTAvLoiADLM= +github.com/vmihailenco/bufpool v0.1.11 h1:gOq2WmBrq0i2yW5QJ16ykccQ4wH9UyEsgLm6czKAd94= +github.com/vmihailenco/bufpool v0.1.11/go.mod h1:AFf/MOy3l2CFTKbxwt0mp2MwnqjNEs5H/UxrkA5jxTQ= +github.com/vmihailenco/msgpack/v5 v5.3.5 h1:5gO0H1iULLWGhs2H5tbAHIZTV8/cYafcFOr9znI5mJU= +github.com/vmihailenco/msgpack/v5 v5.3.5/go.mod h1:7xyJ9e+0+9SaZT0Wt1RGleJXzli6Q/V5KbhBonMG9jc= +github.com/vmihailenco/tagparser v0.1.2 h1:gnjoVuB/kljJ5wICEEOpx98oXMWPLj22G67Vbd1qPqc= 
+github.com/vmihailenco/tagparser v0.1.2/go.mod h1:OeAg3pn3UbLjkWt+rN9oFYB6u/cQgqMEUPoW2WPyhdI= +github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g= +github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8= golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= @@ -49,9 +111,17 @@ golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= +golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gorm.io/driver/postgres v1.5.4 h1:Iyrp9Meh3GmbSuyIAGyjkN+n9K+GHX9b9MqsTL4EJCo= 
+gorm.io/driver/postgres v1.5.4/go.mod h1:Bgo89+h0CRcdA33Y6frlaHHVuTdOf87pmyzwW9C/BH0= +gorm.io/gorm v1.25.5 h1:zR9lOiiYf09VNh5Q1gphfyia1JpiClIWG9hQaxB/mls= +gorm.io/gorm v1.25.5/go.mod h1:hbnx/Oo0ChWMn1BIhpy1oYozzpM15i4YPuHDmfYtwg8= +mellium.im/sasl v0.3.1 h1:wE0LW6g7U83vhvxjC1IY8DnXM+EU095yeo8XClvCdfo= +mellium.im/sasl v0.3.1/go.mod h1:xm59PUYpZHhgQ9ZqoJ5QaCqzWMi8IeS49dhp6plPCzw= diff --git a/internal/api/handlers/knowledge_records_handler.go b/internal/api/handlers/knowledge_records_handler.go new file mode 100644 index 0000000..bd2d4a8 --- /dev/null +++ b/internal/api/handlers/knowledge_records_handler.go @@ -0,0 +1,197 @@ +package handlers + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + + "github.com/formbricks/hub/internal/api/response" + "github.com/formbricks/hub/internal/api/validation" + apperrors "github.com/formbricks/hub/internal/errors" + "github.com/formbricks/hub/internal/models" + "github.com/google/uuid" +) + +// KnowledgeRecordsService defines the interface for knowledge records business logic. 
+type KnowledgeRecordsService interface { + CreateKnowledgeRecord(ctx context.Context, req *models.CreateKnowledgeRecordRequest) (*models.KnowledgeRecord, error) + GetKnowledgeRecord(ctx context.Context, id uuid.UUID) (*models.KnowledgeRecord, error) + ListKnowledgeRecords(ctx context.Context, filters *models.ListKnowledgeRecordsFilters) (*models.ListKnowledgeRecordsResponse, error) + UpdateKnowledgeRecord(ctx context.Context, id uuid.UUID, req *models.UpdateKnowledgeRecordRequest) (*models.KnowledgeRecord, error) + DeleteKnowledgeRecord(ctx context.Context, id uuid.UUID) error + BulkDeleteKnowledgeRecords(ctx context.Context, tenantID string) (int64, error) +} + +// KnowledgeRecordsHandler handles HTTP requests for knowledge records +type KnowledgeRecordsHandler struct { + service KnowledgeRecordsService +} + +// NewKnowledgeRecordsHandler creates a new knowledge records handler +func NewKnowledgeRecordsHandler(service KnowledgeRecordsService) *KnowledgeRecordsHandler { + return &KnowledgeRecordsHandler{service: service} +} + +// Create handles POST /v1/knowledge-records +func (h *KnowledgeRecordsHandler) Create(w http.ResponseWriter, r *http.Request) { + var req models.CreateKnowledgeRecordRequest + decoder := json.NewDecoder(r.Body) + decoder.DisallowUnknownFields() + if err := decoder.Decode(&req); err != nil { + response.RespondBadRequest(w, "Invalid request body") + return + } + + // Validate request + if err := validation.ValidateStruct(&req); err != nil { + validation.RespondValidationError(w, err) + return + } + + record, err := h.service.CreateKnowledgeRecord(r.Context(), &req) + if err != nil { + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + response.RespondJSON(w, http.StatusCreated, record) +} + +// Get handles GET /v1/knowledge-records/{id} +func (h *KnowledgeRecordsHandler) Get(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + if idStr == "" { + response.RespondBadRequest(w, "Knowledge 
Record ID is required") + return + } + + id, err := uuid.Parse(idStr) + if err != nil { + response.RespondBadRequest(w, "Invalid UUID format") + return + } + + record, err := h.service.GetKnowledgeRecord(r.Context(), id) + if err != nil { + if errors.Is(err, apperrors.ErrNotFound) { + response.RespondNotFound(w, "Knowledge record not found") + return + } + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + response.RespondJSON(w, http.StatusOK, record) +} + +// List handles GET /v1/knowledge-records +func (h *KnowledgeRecordsHandler) List(w http.ResponseWriter, r *http.Request) { + filters := &models.ListKnowledgeRecordsFilters{} + + // Decode and validate query parameters + if err := validation.ValidateAndDecodeQueryParams(r, filters); err != nil { + validation.RespondValidationError(w, err) + return + } + + result, err := h.service.ListKnowledgeRecords(r.Context(), filters) + if err != nil { + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + response.RespondJSON(w, http.StatusOK, result) +} + +// Update handles PATCH /v1/knowledge-records/{id} +func (h *KnowledgeRecordsHandler) Update(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + if idStr == "" { + response.RespondBadRequest(w, "Knowledge Record ID is required") + return + } + + id, err := uuid.Parse(idStr) + if err != nil { + response.RespondBadRequest(w, "Invalid UUID format") + return + } + + var req models.UpdateKnowledgeRecordRequest + decoder := json.NewDecoder(r.Body) + decoder.DisallowUnknownFields() + if err := decoder.Decode(&req); err != nil { + response.RespondBadRequest(w, "Invalid request body") + return + } + + // Validate request + if err := validation.ValidateStruct(&req); err != nil { + validation.RespondValidationError(w, err) + return + } + + record, err := h.service.UpdateKnowledgeRecord(r.Context(), id, &req) + if err != nil { + if errors.Is(err, apperrors.ErrNotFound) { + 
response.RespondNotFound(w, "Knowledge record not found") + return + } + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + response.RespondJSON(w, http.StatusOK, record) +} + +// Delete handles DELETE /v1/knowledge-records/{id} +func (h *KnowledgeRecordsHandler) Delete(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + if idStr == "" { + response.RespondBadRequest(w, "Knowledge Record ID is required") + return + } + + id, err := uuid.Parse(idStr) + if err != nil { + response.RespondBadRequest(w, "Invalid UUID format") + return + } + + if err := h.service.DeleteKnowledgeRecord(r.Context(), id); err != nil { + if errors.Is(err, apperrors.ErrNotFound) { + response.RespondNotFound(w, "Knowledge record not found") + return + } + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + w.WriteHeader(http.StatusNoContent) +} + +// BulkDelete handles DELETE /v1/knowledge-records?tenant_id=... +func (h *KnowledgeRecordsHandler) BulkDelete(w http.ResponseWriter, r *http.Request) { + filters := &models.BulkDeleteKnowledgeRecordsFilters{} + + // Decode and validate query parameters + if err := validation.ValidateAndDecodeQueryParams(r, filters); err != nil { + validation.RespondValidationError(w, err) + return + } + + deletedCount, err := h.service.BulkDeleteKnowledgeRecords(r.Context(), filters.TenantID) + if err != nil { + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + resp := models.BulkDeleteKnowledgeRecordsResponse{ + DeletedCount: deletedCount, + Message: fmt.Sprintf("Successfully deleted %d knowledge records", deletedCount), + } + + response.RespondJSON(w, http.StatusOK, resp) +} diff --git a/internal/api/handlers/taxonomy_handler.go b/internal/api/handlers/taxonomy_handler.go new file mode 100644 index 0000000..5bbbbba --- /dev/null +++ b/internal/api/handlers/taxonomy_handler.go @@ -0,0 +1,323 @@ +package handlers + +import ( + "context" 
+ "encoding/json" + "errors" + "net/http" + + "github.com/formbricks/hub/internal/api/response" + "github.com/formbricks/hub/internal/api/validation" + apperrors "github.com/formbricks/hub/internal/errors" + "github.com/formbricks/hub/internal/models" + "github.com/formbricks/hub/internal/service" + "github.com/google/uuid" +) + +// TaxonomyService defines the interface for taxonomy operations. +type TaxonomyService interface { + TriggerClustering(ctx context.Context, tenantID string, config *service.ClusterConfig) (*service.ClusteringJobResponse, error) + GetClusteringStatus(ctx context.Context, tenantID string, jobID *uuid.UUID) (*service.ClusteringJobResponse, error) + GenerateTaxonomySync(ctx context.Context, tenantID string, config *service.ClusterConfig) (*service.ClusteringJobResponse, error) + HealthCheck(ctx context.Context) error +} + +// ScheduleRepository defines the interface for schedule data access. +type ScheduleRepository interface { + CreateOrUpdate(ctx context.Context, req *models.CreateClusteringJobRequest) (*models.ClusteringJob, error) + GetByTenantID(ctx context.Context, tenantID string) (*models.ClusteringJob, error) + Delete(ctx context.Context, tenantID string) error + List(ctx context.Context, filters *models.ListClusteringJobsFilters) ([]models.ClusteringJob, error) + Count(ctx context.Context, filters *models.ListClusteringJobsFilters) (int64, error) +} + +// TaxonomyHandler handles HTTP requests for taxonomy operations. +type TaxonomyHandler struct { + client TaxonomyService + scheduleRepo ScheduleRepository +} + +// NewTaxonomyHandler creates a new taxonomy handler. +func NewTaxonomyHandler(client TaxonomyService) *TaxonomyHandler { + return &TaxonomyHandler{client: client} +} + +// NewTaxonomyHandlerWithSchedule creates a taxonomy handler with schedule support. 
+func NewTaxonomyHandlerWithSchedule(client TaxonomyService, scheduleRepo ScheduleRepository) *TaxonomyHandler { + return &TaxonomyHandler{ + client: client, + scheduleRepo: scheduleRepo, + } +} + +// GenerateTaxonomyRequest is the request body for taxonomy generation. +type GenerateTaxonomyRequest struct { + // Optional clustering configuration overrides + UMAPNComponents *int `json:"umap_n_components,omitempty"` + UMAPNNeighbors *int `json:"umap_n_neighbors,omitempty"` + UMAPMinDist *float64 `json:"umap_min_dist,omitempty"` + HDBSCANMinClusterSize *int `json:"hdbscan_min_cluster_size,omitempty"` + HDBSCANMinSamples *int `json:"hdbscan_min_samples,omitempty"` + MaxEmbeddings *int `json:"max_embeddings,omitempty"` + GenerateLevel2 *bool `json:"generate_level2,omitempty"` + Level2MinClusterSize *int `json:"level2_min_cluster_size,omitempty"` +} + +// Generate handles POST /v1/taxonomy/{tenant_id}/generate +// Triggers async taxonomy generation for a tenant. +func (h *TaxonomyHandler) Generate(w http.ResponseWriter, r *http.Request) { + tenantID := r.PathValue("tenant_id") + if tenantID == "" { + response.RespondBadRequest(w, "tenant_id is required") + return + } + + var req GenerateTaxonomyRequest + if r.Body != nil && r.ContentLength > 0 { + decoder := json.NewDecoder(r.Body) + if err := decoder.Decode(&req); err != nil { + response.RespondBadRequest(w, "Invalid request body") + return + } + } + + // Convert to service config + config := &service.ClusterConfig{ + UMAPNComponents: req.UMAPNComponents, + UMAPNNeighbors: req.UMAPNNeighbors, + UMAPMinDist: req.UMAPMinDist, + HDBSCANMinClusterSize: req.HDBSCANMinClusterSize, + HDBSCANMinSamples: req.HDBSCANMinSamples, + MaxEmbeddings: req.MaxEmbeddings, + GenerateLevel2: req.GenerateLevel2, + Level2MinClusterSize: req.Level2MinClusterSize, + } + + result, err := h.client.TriggerClustering(r.Context(), tenantID, config) + if err != nil { + response.RespondInternalServerError(w, "Failed to trigger taxonomy generation: 
"+err.Error()) + return + } + + response.RespondJSON(w, http.StatusAccepted, result) +} + +// GenerateSync handles POST /v1/taxonomy/{tenant_id}/generate/sync +// Synchronously generates taxonomy (blocking call). +func (h *TaxonomyHandler) GenerateSync(w http.ResponseWriter, r *http.Request) { + tenantID := r.PathValue("tenant_id") + if tenantID == "" { + response.RespondBadRequest(w, "tenant_id is required") + return + } + + var req GenerateTaxonomyRequest + if r.Body != nil && r.ContentLength > 0 { + decoder := json.NewDecoder(r.Body) + if err := decoder.Decode(&req); err != nil { + response.RespondBadRequest(w, "Invalid request body") + return + } + } + + // Convert to service config + config := &service.ClusterConfig{ + UMAPNComponents: req.UMAPNComponents, + UMAPNNeighbors: req.UMAPNNeighbors, + UMAPMinDist: req.UMAPMinDist, + HDBSCANMinClusterSize: req.HDBSCANMinClusterSize, + HDBSCANMinSamples: req.HDBSCANMinSamples, + MaxEmbeddings: req.MaxEmbeddings, + GenerateLevel2: req.GenerateLevel2, + Level2MinClusterSize: req.Level2MinClusterSize, + } + + result, err := h.client.GenerateTaxonomySync(r.Context(), tenantID, config) + if err != nil { + response.RespondInternalServerError(w, "Taxonomy generation failed: "+err.Error()) + return + } + + response.RespondJSON(w, http.StatusOK, result) +} + +// Status handles GET /v1/taxonomy/{tenant_id}/status +// Gets the status of the most recent clustering job for a tenant. 
+func (h *TaxonomyHandler) Status(w http.ResponseWriter, r *http.Request) { + tenantID := r.PathValue("tenant_id") + if tenantID == "" { + response.RespondBadRequest(w, "tenant_id is required") + return + } + + // Optional job_id query parameter + var jobID *uuid.UUID + if jobIDStr := r.URL.Query().Get("job_id"); jobIDStr != "" { + id, err := uuid.Parse(jobIDStr) + if err != nil { + response.RespondBadRequest(w, "Invalid job_id format") + return + } + jobID = &id + } + + result, err := h.client.GetClusteringStatus(r.Context(), tenantID, jobID) + if err != nil { + response.RespondNotFound(w, "Job not found: "+err.Error()) + return + } + + response.RespondJSON(w, http.StatusOK, result) +} + +// Health handles GET /v1/taxonomy/health +// Checks if the taxonomy service is available. +func (h *TaxonomyHandler) Health(w http.ResponseWriter, r *http.Request) { + if err := h.client.HealthCheck(r.Context()); err != nil { + response.RespondJSON(w, http.StatusServiceUnavailable, map[string]string{ + "status": "unavailable", + "error": err.Error(), + }) + return + } + + response.RespondJSON(w, http.StatusOK, map[string]string{ + "status": "healthy", + }) +} + +// ScheduleRequest is the request body for creating/updating a schedule. +type ScheduleRequest struct { + Interval string `json:"interval" validate:"required,oneof=daily weekly monthly"` +} + +// CreateSchedule handles POST /v1/taxonomy/{tenant_id}/schedule +// Creates or updates a periodic clustering schedule for a tenant. 
+func (h *TaxonomyHandler) CreateSchedule(w http.ResponseWriter, r *http.Request) { + if h.scheduleRepo == nil { + response.RespondInternalServerError(w, "Scheduling not configured") + return + } + + tenantID := r.PathValue("tenant_id") + if tenantID == "" { + response.RespondBadRequest(w, "tenant_id is required") + return + } + + var req ScheduleRequest + decoder := json.NewDecoder(r.Body) + decoder.DisallowUnknownFields() + if err := decoder.Decode(&req); err != nil { + response.RespondBadRequest(w, "Invalid request body") + return + } + + if err := validation.ValidateStruct(&req); err != nil { + validation.RespondValidationError(w, err) + return + } + + interval := models.ScheduleInterval(req.Interval) + job, err := h.scheduleRepo.CreateOrUpdate(r.Context(), &models.CreateClusteringJobRequest{ + TenantID: tenantID, + ScheduleInterval: &interval, + }) + if err != nil { + response.RespondInternalServerError(w, "Failed to create schedule: "+err.Error()) + return + } + + response.RespondJSON(w, http.StatusCreated, job) +} + +// GetSchedule handles GET /v1/taxonomy/{tenant_id}/schedule +// Gets the current schedule for a tenant. +func (h *TaxonomyHandler) GetSchedule(w http.ResponseWriter, r *http.Request) { + if h.scheduleRepo == nil { + response.RespondInternalServerError(w, "Scheduling not configured") + return + } + + tenantID := r.PathValue("tenant_id") + if tenantID == "" { + response.RespondBadRequest(w, "tenant_id is required") + return + } + + job, err := h.scheduleRepo.GetByTenantID(r.Context(), tenantID) + if err != nil { + if errors.Is(err, apperrors.ErrNotFound) { + response.RespondNotFound(w, "No schedule found for tenant") + return + } + response.RespondInternalServerError(w, "Failed to get schedule: "+err.Error()) + return + } + + response.RespondJSON(w, http.StatusOK, job) +} + +// DeleteSchedule handles DELETE /v1/taxonomy/{tenant_id}/schedule +// Deletes the schedule for a tenant. 
+func (h *TaxonomyHandler) DeleteSchedule(w http.ResponseWriter, r *http.Request) { + if h.scheduleRepo == nil { + response.RespondInternalServerError(w, "Scheduling not configured") + return + } + + tenantID := r.PathValue("tenant_id") + if tenantID == "" { + response.RespondBadRequest(w, "tenant_id is required") + return + } + + if err := h.scheduleRepo.Delete(r.Context(), tenantID); err != nil { + if errors.Is(err, apperrors.ErrNotFound) { + response.RespondNotFound(w, "No schedule found for tenant") + return + } + response.RespondInternalServerError(w, "Failed to delete schedule: "+err.Error()) + return + } + + w.WriteHeader(http.StatusNoContent) +} + +// ListSchedules handles GET /v1/taxonomy/schedules +// Lists all clustering schedules. +func (h *TaxonomyHandler) ListSchedules(w http.ResponseWriter, r *http.Request) { + if h.scheduleRepo == nil { + response.RespondInternalServerError(w, "Scheduling not configured") + return + } + + filters := &models.ListClusteringJobsFilters{} + if err := validation.ValidateAndDecodeQueryParams(r, filters); err != nil { + validation.RespondValidationError(w, err) + return + } + + if filters.Limit <= 0 { + filters.Limit = 100 + } + + jobs, err := h.scheduleRepo.List(r.Context(), filters) + if err != nil { + response.RespondInternalServerError(w, "Failed to list schedules: "+err.Error()) + return + } + + total, err := h.scheduleRepo.Count(r.Context(), filters) + if err != nil { + response.RespondInternalServerError(w, "Failed to count schedules: "+err.Error()) + return + } + + response.RespondJSON(w, http.StatusOK, models.ListClusteringJobsResponse{ + Data: jobs, + Total: total, + Limit: filters.Limit, + Offset: filters.Offset, + }) +} diff --git a/internal/api/handlers/topics_handler.go b/internal/api/handlers/topics_handler.go new file mode 100644 index 0000000..1f5e29c --- /dev/null +++ b/internal/api/handlers/topics_handler.go @@ -0,0 +1,233 @@ +package handlers + +import ( + "context" + "encoding/json" + "errors" + 
"net/http" + "strconv" + + "github.com/formbricks/hub/internal/api/response" + "github.com/formbricks/hub/internal/api/validation" + apperrors "github.com/formbricks/hub/internal/errors" + "github.com/formbricks/hub/internal/models" + "github.com/google/uuid" +) + +// TopicsService defines the interface for topics business logic. +type TopicsService interface { + CreateTopic(ctx context.Context, req *models.CreateTopicRequest) (*models.Topic, error) + GetTopic(ctx context.Context, id uuid.UUID) (*models.Topic, error) + ListTopics(ctx context.Context, filters *models.ListTopicsFilters) (*models.ListTopicsResponse, error) + UpdateTopic(ctx context.Context, id uuid.UUID, req *models.UpdateTopicRequest) (*models.Topic, error) + DeleteTopic(ctx context.Context, id uuid.UUID) error + GetChildTopics(ctx context.Context, parentID uuid.UUID, tenantID *string, limit int) ([]models.Topic, error) +} + +// TopicsHandler handles HTTP requests for topics +type TopicsHandler struct { + service TopicsService +} + +// NewTopicsHandler creates a new topics handler +func NewTopicsHandler(service TopicsService) *TopicsHandler { + return &TopicsHandler{service: service} +} + +// Create handles POST /v1/topics +func (h *TopicsHandler) Create(w http.ResponseWriter, r *http.Request) { + var req models.CreateTopicRequest + decoder := json.NewDecoder(r.Body) + decoder.DisallowUnknownFields() + if err := decoder.Decode(&req); err != nil { + response.RespondBadRequest(w, "Invalid request body") + return + } + + // Validate request + if err := validation.ValidateStruct(&req); err != nil { + validation.RespondValidationError(w, err) + return + } + + topic, err := h.service.CreateTopic(r.Context(), &req) + if err != nil { + if errors.Is(err, apperrors.ErrValidation) { + response.RespondBadRequest(w, err.Error()) + return + } + if errors.Is(err, apperrors.ErrConflict) { + response.RespondConflict(w, err.Error()) + return + } + response.RespondInternalServerError(w, "An unexpected error occurred") 
+ return + } + + response.RespondJSON(w, http.StatusCreated, topic) +} + +// Get handles GET /v1/topics/{id} +func (h *TopicsHandler) Get(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + if idStr == "" { + response.RespondBadRequest(w, "Topic ID is required") + return + } + + id, err := uuid.Parse(idStr) + if err != nil { + response.RespondBadRequest(w, "Invalid UUID format") + return + } + + topic, err := h.service.GetTopic(r.Context(), id) + if err != nil { + if errors.Is(err, apperrors.ErrNotFound) { + response.RespondNotFound(w, "Topic not found") + return + } + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + response.RespondJSON(w, http.StatusOK, topic) +} + +// List handles GET /v1/topics +func (h *TopicsHandler) List(w http.ResponseWriter, r *http.Request) { + filters := &models.ListTopicsFilters{} + + // Decode and validate query parameters + if err := validation.ValidateAndDecodeQueryParams(r, filters); err != nil { + validation.RespondValidationError(w, err) + return + } + + result, err := h.service.ListTopics(r.Context(), filters) + if err != nil { + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + response.RespondJSON(w, http.StatusOK, result) +} + +// Update handles PATCH /v1/topics/{id} +func (h *TopicsHandler) Update(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + if idStr == "" { + response.RespondBadRequest(w, "Topic ID is required") + return + } + + id, err := uuid.Parse(idStr) + if err != nil { + response.RespondBadRequest(w, "Invalid UUID format") + return + } + + var req models.UpdateTopicRequest + decoder := json.NewDecoder(r.Body) + decoder.DisallowUnknownFields() + if err := decoder.Decode(&req); err != nil { + response.RespondBadRequest(w, "Invalid request body") + return + } + + // Validate request + if err := validation.ValidateStruct(&req); err != nil { + validation.RespondValidationError(w, err) + return + } + 
+ topic, err := h.service.UpdateTopic(r.Context(), id, &req) + if err != nil { + if errors.Is(err, apperrors.ErrNotFound) { + response.RespondNotFound(w, "Topic not found") + return + } + if errors.Is(err, apperrors.ErrConflict) { + response.RespondConflict(w, err.Error()) + return + } + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + response.RespondJSON(w, http.StatusOK, topic) +} + +// Delete handles DELETE /v1/topics/{id} +func (h *TopicsHandler) Delete(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + if idStr == "" { + response.RespondBadRequest(w, "Topic ID is required") + return + } + + id, err := uuid.Parse(idStr) + if err != nil { + response.RespondBadRequest(w, "Invalid UUID format") + return + } + + if err := h.service.DeleteTopic(r.Context(), id); err != nil { + if errors.Is(err, apperrors.ErrNotFound) { + response.RespondNotFound(w, "Topic not found") + return + } + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + w.WriteHeader(http.StatusNoContent) +} + +// GetChildren handles GET /v1/topics/{id}/children +// Returns Level 2 topics that are children of the given Level 1 topic +func (h *TopicsHandler) GetChildren(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + if idStr == "" { + response.RespondBadRequest(w, "Topic ID is required") + return + } + + id, err := uuid.Parse(idStr) + if err != nil { + response.RespondBadRequest(w, "Invalid UUID format") + return + } + + // Parse optional tenant_id query param + var tenantID *string + if tid := r.URL.Query().Get("tenant_id"); tid != "" { + tenantID = &tid + } + + // Parse optional limit query param (default 100) + limit := 100 + if limitStr := r.URL.Query().Get("limit"); limitStr != "" { + if parsed, err := strconv.Atoi(limitStr); err == nil && parsed > 0 { + limit = parsed + } + } + + topics, err := h.service.GetChildTopics(r.Context(), id, tenantID, limit) + if err != nil { + if 
errors.Is(err, apperrors.ErrNotFound) { + response.RespondNotFound(w, "Topic not found") + return + } + if errors.Is(err, apperrors.ErrValidation) { + response.RespondBadRequest(w, err.Error()) + return + } + response.RespondInternalServerError(w, "An unexpected error occurred") + return + } + + response.RespondJSON(w, http.StatusOK, map[string]interface{}{ + "data": topics, + }) +} diff --git a/internal/api/response/response.go b/internal/api/response/response.go index 5e9a38f..d9dbaae 100644 --- a/internal/api/response/response.go +++ b/internal/api/response/response.go @@ -54,6 +54,11 @@ func RespondNotFound(w http.ResponseWriter, detail string) { RespondError(w, http.StatusNotFound, "Not Found", detail) } +// RespondConflict writes a 409 Conflict error response +func RespondConflict(w http.ResponseWriter, detail string) { + RespondError(w, http.StatusConflict, "Conflict", detail) +} + // RespondInternalServerError writes a 500 Internal Server Error response func RespondInternalServerError(w http.ResponseWriter, detail string) { RespondError(w, http.StatusInternalServerError, "Internal Server Error", detail) diff --git a/internal/api/validation/validation.go b/internal/api/validation/validation.go index 4b07be4..33a8753 100644 --- a/internal/api/validation/validation.go +++ b/internal/api/validation/validation.go @@ -12,6 +12,7 @@ import ( "github.com/formbricks/hub/internal/api/response" "github.com/go-playground/form/v4" "github.com/go-playground/validator/v10" + "github.com/google/uuid" ) var ( @@ -48,6 +49,18 @@ func init() { } return &t, nil }, (*time.Time)(nil)) + + // Handle *uuid.UUID (pointer type used for topic_id filters) + decoder.RegisterCustomTypeFunc(func(vals []string) (interface{}, error) { + if len(vals) == 0 || vals[0] == "" { + return (*uuid.UUID)(nil), nil + } + id, err := uuid.Parse(vals[0]) + if err != nil { + return nil, fmt.Errorf("invalid UUID format: %w", err) + } + return &id, nil + }, (*uuid.UUID)(nil)) } // ValidateStruct validates 
a struct using go-playground/validator diff --git a/internal/config/config.go b/internal/config/config.go index 736e0a6..512fc73 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -4,16 +4,29 @@ import ( "errors" "os" "strconv" + "time" "github.com/joho/godotenv" ) // Config holds all application configuration type Config struct { - DatabaseURL string - Port string - APIKey string - LogLevel string + DatabaseURL string + Port string + APIKey string + LogLevel string + OpenAIAPIKey string // Optional: for AI enrichment features + + // River job queue settings + RiverEnabled bool // RIVER_ENABLED - enable River job queue (default: true if OpenAI key set) + RiverWorkers int // RIVER_WORKERS - concurrent embedding workers (default: 10) + RiverMaxRetries int // RIVER_MAX_RETRIES - max retry attempts (default: 5) + EmbeddingRateLimit float64 // EMBEDDING_RATE_LIMIT - OpenAI requests per second (default: 50) + + // Taxonomy service settings + TaxonomyServiceURL string // URL of the taxonomy-generator Python microservice + TaxonomySchedulerEnabled bool // Enable periodic taxonomy scheduler + TaxonomyPollInterval time.Duration // How often to check for due jobs (default: 1m) } // getEnv retrieves an environment variable or returns a default value @@ -37,6 +50,45 @@ func getEnvAsInt(key string, defaultValue int) int { return value } +// getEnvAsFloat retrieves an environment variable as a float64 or returns a default value +func getEnvAsFloat(key string, defaultValue float64) float64 { + valueStr := os.Getenv(key) + if valueStr == "" { + return defaultValue + } + value, err := strconv.ParseFloat(valueStr, 64) + if err != nil { + return defaultValue + } + return value +} + +// getEnvAsBool retrieves an environment variable as a bool or returns a default value +func getEnvAsBool(key string, defaultValue bool) bool { + valueStr := os.Getenv(key) + if valueStr == "" { + return defaultValue + } + value, err := strconv.ParseBool(valueStr) + if err != nil { + 
return defaultValue + } + return value +} + +// getEnvAsDuration retrieves an environment variable as a duration or returns a default value +func getEnvAsDuration(key string, defaultValue time.Duration) time.Duration { + valueStr := os.Getenv(key) + if valueStr == "" { + return defaultValue + } + value, err := time.ParseDuration(valueStr) + if err != nil { + return defaultValue + } + return value +} + // Load reads configuration from environment variables and returns a Config struct. // It automatically loads .env file if it exists. // Returns default values for any missing environment variables. @@ -50,11 +102,29 @@ func Load() (*Config, error) { return nil, errors.New("API_KEY environment variable is required but not set") } + openAIKey := os.Getenv("OPENAI_API_KEY") + + // River is enabled by default when OpenAI key is set, unless explicitly disabled + riverEnabledDefault := openAIKey != "" + riverEnabled := getEnvAsBool("RIVER_ENABLED", riverEnabledDefault) + cfg := &Config{ - DatabaseURL: getEnv("DATABASE_URL", "postgres://postgres:postgres@localhost:5432/test_db?sslmode=disable"), - Port: getEnv("PORT", "8080"), - APIKey: apiKey, - LogLevel: getEnv("LOG_LEVEL", "info"), + DatabaseURL: getEnv("DATABASE_URL", "postgres://postgres:postgres@localhost:5432/test_db?sslmode=disable"), + Port: getEnv("PORT", "8080"), + APIKey: apiKey, + LogLevel: getEnv("LOG_LEVEL", "info"), + OpenAIAPIKey: openAIKey, + + // River job queue configuration + RiverEnabled: riverEnabled, + RiverWorkers: getEnvAsInt("RIVER_WORKERS", 10), + RiverMaxRetries: getEnvAsInt("RIVER_MAX_RETRIES", 5), + EmbeddingRateLimit: getEnvAsFloat("EMBEDDING_RATE_LIMIT", 50), + + // Taxonomy service settings + TaxonomyServiceURL: getEnv("TAXONOMY_SERVICE_URL", "http://localhost:8001"), + TaxonomySchedulerEnabled: getEnv("TAXONOMY_SCHEDULER_ENABLED", "false") == "true", + TaxonomyPollInterval: getEnvAsDuration("TAXONOMY_POLL_INTERVAL", 1*time.Minute), } return cfg, nil diff --git 
a/internal/embeddings/client.go b/internal/embeddings/client.go new file mode 100644 index 0000000..86d3482 --- /dev/null +++ b/internal/embeddings/client.go @@ -0,0 +1,14 @@ +package embeddings + +import "context" + +// Client defines the interface for generating text embeddings. +type Client interface { + // GetEmbedding generates an embedding vector for the given text. + // Returns a slice of float32 values representing the embedding. + GetEmbedding(ctx context.Context, text string) ([]float32, error) + + // GetEmbeddings generates embedding vectors for multiple texts in a batch. + // More efficient than calling GetEmbedding multiple times. + GetEmbeddings(ctx context.Context, texts []string) ([][]float32, error) +} diff --git a/internal/embeddings/mock.go b/internal/embeddings/mock.go new file mode 100644 index 0000000..136867d --- /dev/null +++ b/internal/embeddings/mock.go @@ -0,0 +1,92 @@ +package embeddings + +import ( + "context" + "crypto/sha256" + "fmt" + "math" +) + +// MockClient implements the Client interface for testing purposes. +// It generates deterministic embeddings based on the input text hash. +type MockClient struct { + dimensions int +} + +// NewMockClient creates a new mock embedding client. +// Default dimensions is 1536 to match OpenAI's text-embedding-3-small. +func NewMockClient() *MockClient { + return &MockClient{dimensions: 1536} +} + +// NewMockClientWithDimensions creates a mock client with custom dimensions. +func NewMockClientWithDimensions(dimensions int) *MockClient { + return &MockClient{dimensions: dimensions} +} + +// GetEmbedding generates a deterministic embedding based on the text hash. +func (c *MockClient) GetEmbedding(ctx context.Context, text string) ([]float32, error) { + if text == "" { + return nil, fmt.Errorf("text cannot be empty") + } + return c.generateDeterministicEmbedding(text), nil +} + +// GetEmbeddings generates embeddings for multiple texts. +// Returns an error if any text is empty. 
+func (c *MockClient) GetEmbeddings(ctx context.Context, texts []string) ([][]float32, error) { + if len(texts) == 0 { + return nil, fmt.Errorf("texts cannot be empty") + } + + for i, text := range texts { + if text == "" { + return nil, fmt.Errorf("text at index %d cannot be empty", i) + } + } + + embeddings := make([][]float32, len(texts)) + for i, text := range texts { + embeddings[i] = c.generateDeterministicEmbedding(text) + } + return embeddings, nil +} + +// generateDeterministicEmbedding creates a normalized embedding vector from text hash. +func (c *MockClient) generateDeterministicEmbedding(text string) []float32 { + hash := sha256.Sum256([]byte(text)) + embedding := make([]float32, c.dimensions) + + // Generate embedding values from hash bytes + for i := 0; i < c.dimensions; i++ { + // Use hash bytes cyclically to generate float values + byteIdx := i % len(hash) + // Convert to float in range [-1, 1] + embedding[i] = (float32(hash[byteIdx]) / 127.5) - 1.0 + } + + // Normalize the embedding + return normalize(embedding) +} + +// normalize normalizes a vector to unit length. +func normalize(v []float32) []float32 { + var sum float64 + for _, val := range v { + sum += float64(val * val) + } + magnitude := float32(math.Sqrt(sum)) + + if magnitude == 0 { + return v + } + + normalized := make([]float32, len(v)) + for i, val := range v { + normalized[i] = val / magnitude + } + return normalized +} + +// Ensure MockClient implements Client interface +var _ Client = (*MockClient)(nil) diff --git a/internal/embeddings/openai.go b/internal/embeddings/openai.go new file mode 100644 index 0000000..da40ad6 --- /dev/null +++ b/internal/embeddings/openai.go @@ -0,0 +1,93 @@ +package embeddings + +import ( + "context" + "fmt" + + "github.com/sashabaranov/go-openai" +) + +// OpenAIClient implements the Client interface using OpenAI's embedding API. 
+type OpenAIClient struct { + client *openai.Client + model openai.EmbeddingModel +} + +// Ensure OpenAIClient implements Client interface +var _ Client = (*OpenAIClient)(nil) + +// NewOpenAIClient creates a new OpenAI embedding client. +// Uses text-embedding-3-small by default (1536 dimensions). +// Panics if apiKey is empty. +func NewOpenAIClient(apiKey string) *OpenAIClient { + if apiKey == "" { + panic("embeddings: OpenAI API key cannot be empty") + } + return &OpenAIClient{ + client: openai.NewClient(apiKey), + model: openai.SmallEmbedding3, // text-embedding-3-small, 1536 dims + } +} + +// NewOpenAIClientWithModel creates a new OpenAI embedding client with a custom model. +func NewOpenAIClientWithModel(apiKey string, model openai.EmbeddingModel) *OpenAIClient { + return &OpenAIClient{ + client: openai.NewClient(apiKey), + model: model, + } +} + +// GetEmbedding generates an embedding vector for the given text. +func (c *OpenAIClient) GetEmbedding(ctx context.Context, text string) ([]float32, error) { + if text == "" { + return nil, fmt.Errorf("text cannot be empty") + } + + resp, err := c.client.CreateEmbeddings(ctx, openai.EmbeddingRequest{ + Input: []string{text}, + Model: c.model, + }) + if err != nil { + return nil, fmt.Errorf("failed to create embedding: %w", err) + } + + if len(resp.Data) == 0 { + return nil, fmt.Errorf("no embedding returned from API") + } + + return resp.Data[0].Embedding, nil +} + +// GetEmbeddings generates embedding vectors for multiple texts in a batch. +// Returns an error if any text in the input is empty. 
+func (c *OpenAIClient) GetEmbeddings(ctx context.Context, texts []string) ([][]float32, error) { + if len(texts) == 0 { + return nil, fmt.Errorf("texts cannot be empty") + } + + // Validate all texts are non-empty + for i, t := range texts { + if t == "" { + return nil, fmt.Errorf("text at index %d cannot be empty", i) + } + } + + resp, err := c.client.CreateEmbeddings(ctx, openai.EmbeddingRequest{ + Input: texts, + Model: c.model, + }) + if err != nil { + return nil, fmt.Errorf("failed to create embeddings: %w", err) + } + + if len(resp.Data) != len(texts) { + return nil, fmt.Errorf("unexpected number of embeddings returned: got %d, expected %d", len(resp.Data), len(texts)) + } + + embeddings := make([][]float32, len(resp.Data)) + for i, data := range resp.Data { + embeddings[i] = data.Embedding + } + + return embeddings, nil +} diff --git a/internal/errors/errors.go b/internal/errors/errors.go index 82285e3..f80ff23 100644 --- a/internal/errors/errors.go +++ b/internal/errors/errors.go @@ -71,3 +71,38 @@ func NewValidationError(field, message string) *ValidationError { Message: message, } } + +// ErrConflict represents a conflict error +// This should be used when a resource already exists or conflicts with another +var ErrConflict = &ConflictError{} + +// ConflictError is a sentinel error for resource conflicts (e.g., duplicate entries) +type ConflictError struct { + Resource string + Message string +} + +// Error implements the error interface +func (e *ConflictError) Error() string { + if e.Message != "" { + return e.Message + } + if e.Resource != "" { + return fmt.Sprintf("%s already exists", e.Resource) + } + return "resource conflict" +} + +// Is implements the error interface for error comparison +func (e *ConflictError) Is(target error) bool { + _, ok := target.(*ConflictError) + return ok +} + +// NewConflictError creates a new ConflictError with a custom message +func NewConflictError(resource, message string) *ConflictError { + return &ConflictError{ 
+ Resource: resource, + Message: message, + } +} diff --git a/internal/jobs/adapters.go b/internal/jobs/adapters.go new file mode 100644 index 0000000..f7ccd02 --- /dev/null +++ b/internal/jobs/adapters.go @@ -0,0 +1,70 @@ +package jobs + +import ( + "context" + + "github.com/formbricks/hub/internal/models" + "github.com/google/uuid" +) + +// FeedbackRecordsUpdater wraps the feedback records repository to implement EmbeddingUpdater. +type FeedbackRecordsUpdater struct { + repo FeedbackEnrichmentRepository +} + +// FeedbackEnrichmentRepository defines the interface needed from the feedback records repository. +type FeedbackEnrichmentRepository interface { + UpdateEnrichment(ctx context.Context, id uuid.UUID, req *models.UpdateFeedbackEnrichmentRequest) error +} + +// NewFeedbackRecordsUpdater creates a new feedback records updater. +func NewFeedbackRecordsUpdater(repo FeedbackEnrichmentRepository) *FeedbackRecordsUpdater { + return &FeedbackRecordsUpdater{repo: repo} +} + +// UpdateEmbedding implements EmbeddingUpdater for feedback records. +func (u *FeedbackRecordsUpdater) UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error { + return u.repo.UpdateEnrichment(ctx, id, &models.UpdateFeedbackEnrichmentRequest{ + Embedding: embedding, + }) +} + +// TopicsUpdater wraps the topics repository to implement EmbeddingUpdater. +type TopicsUpdater struct { + repo TopicsEmbeddingRepository +} + +// TopicsEmbeddingRepository defines the interface needed from the topics repository. +type TopicsEmbeddingRepository interface { + UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error +} + +// NewTopicsUpdater creates a new topics updater. +func NewTopicsUpdater(repo TopicsEmbeddingRepository) *TopicsUpdater { + return &TopicsUpdater{repo: repo} +} + +// UpdateEmbedding implements EmbeddingUpdater for topics. 
+func (u *TopicsUpdater) UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error { + return u.repo.UpdateEmbedding(ctx, id, embedding) +} + +// KnowledgeRecordsUpdater wraps the knowledge records repository to implement EmbeddingUpdater. +type KnowledgeRecordsUpdater struct { + repo KnowledgeEmbeddingRepository +} + +// KnowledgeEmbeddingRepository defines the interface needed from the knowledge records repository. +type KnowledgeEmbeddingRepository interface { + UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error +} + +// NewKnowledgeRecordsUpdater creates a new knowledge records updater. +func NewKnowledgeRecordsUpdater(repo KnowledgeEmbeddingRepository) *KnowledgeRecordsUpdater { + return &KnowledgeRecordsUpdater{repo: repo} +} + +// UpdateEmbedding implements EmbeddingUpdater for knowledge records. +func (u *KnowledgeRecordsUpdater) UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error { + return u.repo.UpdateEmbedding(ctx, id, embedding) +} diff --git a/internal/jobs/args.go b/internal/jobs/args.go new file mode 100644 index 0000000..9485ac2 --- /dev/null +++ b/internal/jobs/args.go @@ -0,0 +1,32 @@ +// Package jobs provides River job workers for async processing tasks. +package jobs + +import "github.com/google/uuid" + +// EmbeddingJobArgs contains the arguments for an embedding generation job. 
+type EmbeddingJobArgs struct { + // RecordID is the UUID of the record to generate embeddings for + RecordID uuid.UUID `json:"record_id"` + + // RecordType identifies which table the record belongs to + // Valid values: "feedback_record", "topic", "knowledge_record" + RecordType string `json:"record_type"` + + // Text is the content to generate embeddings for + // For topics, this is the hierarchical path (e.g., "Performance > API") + Text string `json:"text"` + + // TenantID is used for tenant-isolated topic assignment after embedding generation + // Only used for feedback_record type + TenantID *string `json:"tenant_id,omitempty"` +} + +// Kind returns the job type identifier for River +func (EmbeddingJobArgs) Kind() string { return "embedding" } + +// Record type constants +const ( + RecordTypeFeedback = "feedback_record" + RecordTypeTopic = "topic" + RecordTypeKnowledge = "knowledge_record" +) diff --git a/internal/jobs/backfill.go b/internal/jobs/backfill.go new file mode 100644 index 0000000..0e9025d --- /dev/null +++ b/internal/jobs/backfill.go @@ -0,0 +1,192 @@ +package jobs + +import ( + "context" + "fmt" + "log/slog" + + "github.com/google/uuid" + "github.com/jackc/pgx/v5/pgxpool" +) + +// BackfillStats holds statistics from a backfill operation. +type BackfillStats struct { + FeedbackRecordsEnqueued int + TopicsEnqueued int + KnowledgeRecordsEnqueued int + Errors int +} + +// Backfill enqueues embedding jobs for all records that are missing embeddings. 
+func Backfill(ctx context.Context, db *pgxpool.Pool, inserter JobInserter) (*BackfillStats, error) { + stats := &BackfillStats{} + + // Backfill feedback records + feedbackCount, err := backfillFeedbackRecords(ctx, db, inserter) + if err != nil { + slog.Error("failed to backfill feedback records", "error", err) + stats.Errors++ + } + stats.FeedbackRecordsEnqueued = feedbackCount + + // Backfill topics + topicsCount, err := backfillTopics(ctx, db, inserter) + if err != nil { + slog.Error("failed to backfill topics", "error", err) + stats.Errors++ + } + stats.TopicsEnqueued = topicsCount + + // Backfill knowledge records + knowledgeCount, err := backfillKnowledgeRecords(ctx, db, inserter) + if err != nil { + slog.Error("failed to backfill knowledge records", "error", err) + stats.Errors++ + } + stats.KnowledgeRecordsEnqueued = knowledgeCount + + return stats, nil +} + +// backfillFeedbackRecords enqueues embedding jobs for feedback records without embeddings. +func backfillFeedbackRecords(ctx context.Context, db *pgxpool.Pool, inserter JobInserter) (int, error) { + query := ` + SELECT id, value_text + FROM feedback_records + WHERE field_type = 'text' + AND embedding IS NULL + AND value_text IS NOT NULL + AND value_text != '' + ` + + rows, err := db.Query(ctx, query) + if err != nil { + return 0, fmt.Errorf("failed to query feedback records: %w", err) + } + defer rows.Close() + + count := 0 + for rows.Next() { + var id uuid.UUID + var text string + if err := rows.Scan(&id, &text); err != nil { + slog.Error("failed to scan feedback record", "error", err) + continue + } + + if err := inserter.InsertEmbeddingJob(ctx, EmbeddingJobArgs{ + RecordID: id, + RecordType: RecordTypeFeedback, + Text: text, + }); err != nil { + slog.Error("failed to enqueue feedback record embedding job", "id", id, "error", err) + continue + } + count++ + } + + if err := rows.Err(); err != nil { + return count, fmt.Errorf("error iterating feedback records: %w", err) + } + + return count, nil +} + 
+// backfillTopics enqueues embedding jobs for topics without embeddings. +// Note: This uses the topic title directly. For hierarchy paths, you may need to +// fetch parent topics and build the path. +func backfillTopics(ctx context.Context, db *pgxpool.Pool, inserter JobInserter) (int, error) { + // Query topics with their hierarchy paths using a recursive CTE + query := ` + WITH RECURSIVE topic_paths AS ( + -- Base case: Level 1 topics (no parent) + SELECT id, title, parent_id, title::text as hierarchy_path + FROM topics + WHERE parent_id IS NULL AND embedding IS NULL + + UNION ALL + + -- Recursive case: children with their parent's path + SELECT t.id, t.title, t.parent_id, (tp.hierarchy_path || ' > ' || t.title)::text + FROM topics t + INNER JOIN topic_paths tp ON t.parent_id = tp.id + WHERE t.embedding IS NULL + ) + SELECT id, hierarchy_path FROM topic_paths + ` + + rows, err := db.Query(ctx, query) + if err != nil { + return 0, fmt.Errorf("failed to query topics: %w", err) + } + defer rows.Close() + + count := 0 + for rows.Next() { + var id uuid.UUID + var hierarchyPath string + if err := rows.Scan(&id, &hierarchyPath); err != nil { + slog.Error("failed to scan topic", "error", err) + continue + } + + if err := inserter.InsertEmbeddingJob(ctx, EmbeddingJobArgs{ + RecordID: id, + RecordType: RecordTypeTopic, + Text: hierarchyPath, + }); err != nil { + slog.Error("failed to enqueue topic embedding job", "id", id, "error", err) + continue + } + count++ + } + + if err := rows.Err(); err != nil { + return count, fmt.Errorf("error iterating topics: %w", err) + } + + return count, nil +} + +// backfillKnowledgeRecords enqueues embedding jobs for knowledge records without embeddings. 
+func backfillKnowledgeRecords(ctx context.Context, db *pgxpool.Pool, inserter JobInserter) (int, error) { + query := ` + SELECT id, content + FROM knowledge_records + WHERE embedding IS NULL + AND content IS NOT NULL + AND content != '' + ` + + rows, err := db.Query(ctx, query) + if err != nil { + return 0, fmt.Errorf("failed to query knowledge records: %w", err) + } + defer rows.Close() + + count := 0 + for rows.Next() { + var id uuid.UUID + var content string + if err := rows.Scan(&id, &content); err != nil { + slog.Error("failed to scan knowledge record", "error", err) + continue + } + + if err := inserter.InsertEmbeddingJob(ctx, EmbeddingJobArgs{ + RecordID: id, + RecordType: RecordTypeKnowledge, + Text: content, + }); err != nil { + slog.Error("failed to enqueue knowledge record embedding job", "id", id, "error", err) + continue + } + count++ + } + + if err := rows.Err(); err != nil { + return count, fmt.Errorf("error iterating knowledge records: %w", err) + } + + return count, nil +} diff --git a/internal/jobs/embedding_worker.go b/internal/jobs/embedding_worker.go new file mode 100644 index 0000000..627c4cc --- /dev/null +++ b/internal/jobs/embedding_worker.go @@ -0,0 +1,188 @@ +package jobs + +import ( + "context" + "errors" + "log/slog" + + "github.com/formbricks/hub/internal/embeddings" + apperrors "github.com/formbricks/hub/internal/errors" + "github.com/formbricks/hub/internal/models" + "github.com/google/uuid" + "github.com/riverqueue/river" + "golang.org/x/time/rate" +) + +// EmbeddingUpdater is an interface for updating embeddings on records. +// This allows the worker to update any record type without knowing the concrete implementation. +type EmbeddingUpdater interface { + UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error +} + +// TopicMatcher is an interface for finding similar topics for embedding-based assignment. 
+type TopicMatcher interface { + FindMostSpecificTopic(ctx context.Context, embedding []float32, tenantID *string, minSimilarity float64) (*models.TopicMatch, error) +} + +// FeedbackAssigner is an interface for assigning topics to feedback records. +type FeedbackAssigner interface { + AssignTopic(ctx context.Context, id uuid.UUID, topicID uuid.UUID, confidence float64) error +} + +// DefaultMinSimilarity is the default threshold for topic assignment. +// Feedback must be at least this similar to a topic centroid to be assigned. +const DefaultMinSimilarity = 0.35 + +// EmbeddingWorkerDeps holds the dependencies for the embedding worker. +type EmbeddingWorkerDeps struct { + EmbeddingClient embeddings.Client + FeedbackUpdater EmbeddingUpdater + TopicUpdater EmbeddingUpdater + KnowledgeUpdater EmbeddingUpdater + RateLimiter *rate.Limiter + // Optional: for real-time topic assignment after embedding generation + TopicMatcher TopicMatcher + FeedbackAssigner FeedbackAssigner +} + +// EmbeddingWorker processes embedding generation jobs. +type EmbeddingWorker struct { + river.WorkerDefaults[EmbeddingJobArgs] + deps EmbeddingWorkerDeps +} + +// NewEmbeddingWorker creates a new embedding worker with the given dependencies. +func NewEmbeddingWorker(deps EmbeddingWorkerDeps) *EmbeddingWorker { + return &EmbeddingWorker{deps: deps} +} + +// Work processes an embedding job. 
+func (w *EmbeddingWorker) Work(ctx context.Context, job *river.Job[EmbeddingJobArgs]) error { + args := job.Args + + slog.Debug("processing embedding job", + "job_id", job.ID, + "record_type", args.RecordType, + "record_id", args.RecordID, + "text_length", len(args.Text), + ) + + // Wait for rate limit token if configured + if w.deps.RateLimiter != nil { + if err := w.deps.RateLimiter.Wait(ctx); err != nil { + return err + } + } + + // Generate embedding + embedding, err := w.deps.EmbeddingClient.GetEmbedding(ctx, args.Text) + if err != nil { + slog.Error("failed to generate embedding", + "job_id", job.ID, + "record_type", args.RecordType, + "record_id", args.RecordID, + "error", err, + ) + return err // River will retry based on configuration + } + + // Get the appropriate updater based on record type + updater := w.getUpdater(args.RecordType) + if updater == nil { + slog.Error("unknown record type", + "job_id", job.ID, + "record_type", args.RecordType, + ) + // Return nil to mark job as complete - unknown type won't be fixed by retry + return nil + } + + // Update the record with the embedding + err = updater.UpdateEmbedding(ctx, args.RecordID, embedding) + if err != nil { + // Check if record was deleted + var notFoundErr *apperrors.NotFoundError + if errors.As(err, &notFoundErr) { + slog.Info("record deleted before embedding job completed", + "job_id", job.ID, + "record_type", args.RecordType, + "record_id", args.RecordID, + ) + // Return nil to mark job as complete - record no longer exists + return nil + } + + slog.Error("failed to update embedding", + "job_id", job.ID, + "record_type", args.RecordType, + "record_id", args.RecordID, + "error", err, + ) + return err // Retry on other errors + } + + slog.Info("embedding generated successfully", + "job_id", job.ID, + "record_type", args.RecordType, + "record_id", args.RecordID, + ) + + // For feedback records, attempt real-time topic assignment + if args.RecordType == RecordTypeFeedback && w.deps.TopicMatcher != 
nil && w.deps.FeedbackAssigner != nil { + w.assignTopicToFeedback(ctx, args.RecordID, embedding, args.TenantID) + } + + return nil +} + +// getUpdater returns the appropriate updater for the given record type. +func (w *EmbeddingWorker) getUpdater(recordType string) EmbeddingUpdater { + switch recordType { + case RecordTypeFeedback: + return w.deps.FeedbackUpdater + case RecordTypeTopic: + return w.deps.TopicUpdater + case RecordTypeKnowledge: + return w.deps.KnowledgeUpdater + default: + return nil + } +} + +// assignTopicToFeedback attempts to assign a topic to a feedback record based on embedding similarity. +// This is called after embedding generation succeeds for feedback records. +// Failures are logged but don't fail the job - embedding was already saved successfully. +func (w *EmbeddingWorker) assignTopicToFeedback(ctx context.Context, recordID uuid.UUID, embedding []float32, tenantID *string) { + match, err := w.deps.TopicMatcher.FindMostSpecificTopic(ctx, embedding, tenantID, DefaultMinSimilarity) + if err != nil { + slog.Warn("topic matching failed", + "record_id", recordID, + "error", err, + ) + return // Don't fail the job - embedding was successful + } + + if match == nil { + slog.Debug("no matching topic found", + "record_id", recordID, + "min_similarity", DefaultMinSimilarity, + ) + return // No topics exist yet or none above threshold + } + + if err := w.deps.FeedbackAssigner.AssignTopic(ctx, recordID, match.TopicID, match.Similarity); err != nil { + slog.Warn("topic assignment failed", + "record_id", recordID, + "topic_id", match.TopicID, + "error", err, + ) + return // Don't fail - embedding was successful, assignment can be retried + } + + slog.Info("topic assigned to feedback", + "record_id", recordID, + "topic_id", match.TopicID, + "topic_title", match.Title, + "confidence", match.Similarity, + ) +} diff --git a/internal/jobs/error_handler.go b/internal/jobs/error_handler.go new file mode 100644 index 0000000..1cd2f55 --- /dev/null +++ 
b/internal/jobs/error_handler.go @@ -0,0 +1,40 @@ +package jobs + +import ( + "context" + "log/slog" + + "github.com/riverqueue/river" + "github.com/riverqueue/river/rivertype" +) + +// ErrorHandler handles job errors and panics for logging and alerting. +type ErrorHandler struct{} + +// HandleError is called when a job returns an error. +func (h *ErrorHandler) HandleError(ctx context.Context, job *rivertype.JobRow, err error) *river.ErrorHandlerResult { + slog.Error("job failed", + "job_kind", job.Kind, + "job_id", job.ID, + "attempt", job.Attempt, + "max_attempts", job.MaxAttempts, + "error", err, + ) + + // Return nil to use default retry behavior + return nil +} + +// HandlePanic is called when a job panics. +func (h *ErrorHandler) HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *river.ErrorHandlerResult { + slog.Error("job panicked", + "job_kind", job.Kind, + "job_id", job.ID, + "attempt", job.Attempt, + "panic_value", panicVal, + "stack_trace", trace, + ) + + // Return nil to use default behavior (mark as errored, will retry) + return nil +} diff --git a/internal/jobs/inserter.go b/internal/jobs/inserter.go new file mode 100644 index 0000000..3bb3553 --- /dev/null +++ b/internal/jobs/inserter.go @@ -0,0 +1,13 @@ +package jobs + +import ( + "context" +) + +// JobInserter is an interface for inserting jobs into the queue. +// This allows services to enqueue jobs without knowing about River directly. +type JobInserter interface { + // InsertEmbeddingJob enqueues an embedding generation job. + // Returns an error if the job could not be inserted. 
+ InsertEmbeddingJob(ctx context.Context, args EmbeddingJobArgs) error +} diff --git a/internal/jobs/river_inserter.go b/internal/jobs/river_inserter.go new file mode 100644 index 0000000..9b333f3 --- /dev/null +++ b/internal/jobs/river_inserter.go @@ -0,0 +1,39 @@ +package jobs + +import ( + "context" + + "github.com/jackc/pgx/v5" + "github.com/riverqueue/river" + "github.com/riverqueue/river/rivertype" +) + +// RiverJobInserter implements JobInserter using the River client. +type RiverJobInserter struct { + client *river.Client[pgx.Tx] +} + +// NewRiverJobInserter creates a new River-based job inserter. +func NewRiverJobInserter(client *river.Client[pgx.Tx]) *RiverJobInserter { + return &RiverJobInserter{client: client} +} + +// InsertEmbeddingJob enqueues an embedding generation job with uniqueness constraints. +func (r *RiverJobInserter) InsertEmbeddingJob(ctx context.Context, args EmbeddingJobArgs) error { + _, err := r.client.Insert(ctx, args, &river.InsertOpts{ + UniqueOpts: river.UniqueOpts{ + // Only one pending job per record (by args) + ByArgs: true, + // Consider jobs in these states for deduplication + // Note: JobStatePending is required by River when using ByState + ByState: []rivertype.JobState{ + rivertype.JobStatePending, + rivertype.JobStateAvailable, + rivertype.JobStateRunning, + rivertype.JobStateRetryable, + rivertype.JobStateScheduled, + }, + }, + }) + return err +} diff --git a/internal/models/clustering_jobs.go b/internal/models/clustering_jobs.go new file mode 100644 index 0000000..0309d7f --- /dev/null +++ b/internal/models/clustering_jobs.go @@ -0,0 +1,74 @@ +package models + +import ( + "time" + + "github.com/google/uuid" +) + +// ScheduleInterval represents the interval for periodic clustering. 
+type ScheduleInterval string + +const ( + ScheduleDaily ScheduleInterval = "daily" + ScheduleWeekly ScheduleInterval = "weekly" + ScheduleMonthly ScheduleInterval = "monthly" +) + +// ClusteringJobStatus represents the status of a clustering job schedule. +type ClusteringJobStatus string + +const ( + JobStatusPending ClusteringJobStatus = "pending" + JobStatusRunning ClusteringJobStatus = "running" + JobStatusComplete ClusteringJobStatus = "completed" + JobStatusFailed ClusteringJobStatus = "failed" + JobStatusDisabled ClusteringJobStatus = "disabled" +) + +// ClusteringJob represents a scheduled clustering job for a tenant. +type ClusteringJob struct { + ID uuid.UUID `json:"id"` + TenantID string `json:"tenant_id"` + Status ClusteringJobStatus `json:"status"` + ScheduleInterval *ScheduleInterval `json:"schedule_interval,omitempty"` + NextRunAt *time.Time `json:"next_run_at,omitempty"` + LastRunAt *time.Time `json:"last_run_at,omitempty"` + LastJobID *uuid.UUID `json:"last_job_id,omitempty"` + LastError *string `json:"last_error,omitempty"` + TopicsGenerated int `json:"topics_generated"` + RecordsProcessed int `json:"records_processed"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +// CreateClusteringJobRequest represents the request to create/update a clustering schedule. +type CreateClusteringJobRequest struct { + TenantID string `json:"tenant_id" validate:"required,no_null_bytes,min=1,max=255"` + ScheduleInterval *ScheduleInterval `json:"schedule_interval,omitempty" validate:"omitempty,oneof=daily weekly monthly"` +} + +// UpdateClusteringJobRequest represents the request to update a clustering job after execution. 
+type UpdateClusteringJobRequest struct { + Status ClusteringJobStatus `json:"status"` + LastJobID *uuid.UUID `json:"last_job_id,omitempty"` + LastError *string `json:"last_error,omitempty"` + TopicsGenerated *int `json:"topics_generated,omitempty"` + RecordsProcessed *int `json:"records_processed,omitempty"` +} + +// ListClusteringJobsFilters represents filters for listing clustering jobs. +type ListClusteringJobsFilters struct { + TenantID *string `form:"tenant_id" validate:"omitempty,no_null_bytes"` + Status *ClusteringJobStatus `form:"status" validate:"omitempty"` + Limit int `form:"limit" validate:"omitempty,min=1,max=1000"` + Offset int `form:"offset" validate:"omitempty,min=0"` +} + +// ListClusteringJobsResponse represents the response for listing clustering jobs. +type ListClusteringJobsResponse struct { + Data []ClusteringJob `json:"data"` + Total int64 `json:"total"` + Limit int `json:"limit"` + Offset int `json:"offset"` +} diff --git a/internal/models/feedback_records.go b/internal/models/feedback_records.go index b328c3c..82069d4 100644 --- a/internal/models/feedback_records.go +++ b/internal/models/feedback_records.go @@ -28,6 +28,10 @@ type FeedbackRecord struct { UserIdentifier *string `json:"user_identifier,omitempty"` TenantID *string `json:"tenant_id,omitempty"` ResponseID *string `json:"response_id,omitempty"` + + // Similarity is populated at query time when filtering by topic_id + // It represents the cosine similarity between this feedback's embedding and the topic's embedding + Similarity *float64 `json:"similarity,omitempty"` } // CreateFeedbackRecordRequest represents the request to create a feedback record @@ -75,6 +79,20 @@ type ListFeedbackRecordsFilters struct { Until *time.Time `form:"until" validate:"omitempty"` Limit int `form:"limit" validate:"omitempty,min=1,max=1000"` Offset int `form:"offset" validate:"omitempty,min=0"` + + // TopicID filters feedback records by topic assignment + // By default, uses direct topic_id lookup 
(fast, pre-computed) + // Set UseSimilarity=true to use vector similarity search instead + TopicID *uuid.UUID `form:"topic_id" validate:"omitempty"` + + // UseSimilarity when true, uses vector similarity search instead of direct topic_id lookup + // This is slower but can find matches for unclassified feedback + UseSimilarity bool `form:"use_similarity"` + + // MinSimilarity overrides the default threshold when using similarity search + // Value between 0 and 1 (e.g., 0.5 = 50% similarity minimum) + // Only used when UseSimilarity=true. If not set, uses automatic thresholds based on topic level + MinSimilarity *float64 `form:"min_similarity" validate:"omitempty,min=0,max=1"` } // ListFeedbackRecordsResponse represents the response for listing feedback records @@ -96,3 +114,9 @@ type BulkDeleteResponse struct { DeletedCount int64 `json:"deleted_count"` Message string `json:"message"` } + +// UpdateFeedbackEnrichmentRequest represents internal request to update AI-enriched fields +// Used by the service layer, not exposed via API +type UpdateFeedbackEnrichmentRequest struct { + Embedding []float32 +} diff --git a/internal/models/knowledge_records.go b/internal/models/knowledge_records.go new file mode 100644 index 0000000..cbdc7db --- /dev/null +++ b/internal/models/knowledge_records.go @@ -0,0 +1,54 @@ +package models + +import ( + "time" + + "github.com/google/uuid" +) + +// KnowledgeRecord represents a single knowledge record +type KnowledgeRecord struct { + ID uuid.UUID `json:"id"` + Content string `json:"content"` + TenantID *string `json:"tenant_id,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +// CreateKnowledgeRecordRequest represents the request to create a knowledge record +type CreateKnowledgeRecordRequest struct { + Content string `json:"content" validate:"required,no_null_bytes,min=1,max=10000"` + TenantID *string `json:"tenant_id,omitempty" validate:"omitempty,no_null_bytes,max=255"` +} + +// 
UpdateKnowledgeRecordRequest represents the request to update a knowledge record +// Only content can be updated +type UpdateKnowledgeRecordRequest struct { + Content *string `json:"content,omitempty" validate:"omitempty,no_null_bytes,min=1,max=10000"` +} + +// ListKnowledgeRecordsFilters represents filters for listing knowledge records +type ListKnowledgeRecordsFilters struct { + TenantID *string `form:"tenant_id" validate:"omitempty,no_null_bytes"` + Limit int `form:"limit" validate:"omitempty,min=1,max=1000"` + Offset int `form:"offset" validate:"omitempty,min=0"` +} + +// ListKnowledgeRecordsResponse represents the response for listing knowledge records +type ListKnowledgeRecordsResponse struct { + Data []KnowledgeRecord `json:"data"` + Total int64 `json:"total"` + Limit int `json:"limit"` + Offset int `json:"offset"` +} + +// BulkDeleteKnowledgeRecordsFilters represents query parameters for bulk delete operation +type BulkDeleteKnowledgeRecordsFilters struct { + TenantID string `form:"tenant_id" validate:"required,no_null_bytes,min=1"` +} + +// BulkDeleteKnowledgeRecordsResponse represents the response for bulk delete operation +type BulkDeleteKnowledgeRecordsResponse struct { + DeletedCount int64 `json:"deleted_count"` + Message string `json:"message"` +} diff --git a/internal/models/topics.go b/internal/models/topics.go new file mode 100644 index 0000000..76a67a5 --- /dev/null +++ b/internal/models/topics.go @@ -0,0 +1,90 @@ +package models + +import ( + "time" + + "github.com/google/uuid" +) + +// Topic represents a single topic +// Level 1 topics are broad categories, Level 2 topics are specific subtopics +// Level 2 topics have an explicit parent_id linking to their Level 1 parent +type Topic struct { + ID uuid.UUID `json:"id"` + Title string `json:"title"` + Level int `json:"level"` + ParentID *uuid.UUID `json:"parent_id,omitempty"` + TenantID *string `json:"tenant_id,omitempty"` + FeedbackCount *int64 `json:"feedback_count,omitempty"` + CreatedAt 
time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +// CreateTopicRequest represents the request to create a topic +type CreateTopicRequest struct { + Title string `json:"title" validate:"required,no_null_bytes,min=1,max=255"` + Level int `json:"level" validate:"required,min=1,max=2"` + ParentID *uuid.UUID `json:"parent_id,omitempty"` // Required for Level 2 topics + TenantID *string `json:"tenant_id,omitempty" validate:"omitempty,no_null_bytes,max=255"` +} + +// UpdateTopicRequest represents the request to update a topic +// Only title can be updated - parent_id is immutable +type UpdateTopicRequest struct { + Title *string `json:"title,omitempty" validate:"omitempty,no_null_bytes,min=1,max=255"` +} + +// ListTopicsFilters represents filters for listing topics +type ListTopicsFilters struct { + Level *int `form:"level" validate:"omitempty,min=1"` + ParentID *uuid.UUID `form:"parent_id" validate:"omitempty"` + Title *string `form:"title" validate:"omitempty,no_null_bytes"` + TenantID *string `form:"tenant_id" validate:"omitempty,no_null_bytes"` + Limit int `form:"limit" validate:"omitempty,min=1,max=1000"` + Offset int `form:"offset" validate:"omitempty,min=0"` +} + +// ListTopicsResponse represents the response for listing topics +type ListTopicsResponse struct { + Data []Topic `json:"data"` + Total int64 `json:"total"` + Limit int `json:"limit"` + Offset int `json:"offset"` +} + +// TopicMatch represents a topic matched by vector similarity search +type TopicMatch struct { + TopicID uuid.UUID `json:"topic_id"` + Title string `json:"title"` + Level int `json:"level"` + Similarity float64 `json:"similarity"` +} + +// SimilarTopic represents a Level 2 topic similar to a Level 1 topic +type SimilarTopic struct { + ID uuid.UUID `json:"id"` + Title string `json:"title"` + Similarity float64 `json:"similarity"` +} + +// LevelThresholds maps topic hierarchy levels to minimum similarity thresholds. 
+// Higher levels (more specific topics) require higher similarity for matches. +// These values can be tuned based on production feedback. +var LevelThresholds = map[int]float64{ + 1: 0.30, // Level 1: broadest topics (e.g., "Performance") + 2: 0.40, // Level 2: (e.g., "Performance > API") + 3: 0.50, // Level 3: (e.g., "Performance > API > Latency") + 4: 0.60, // Level 4: more specific + 5: 0.70, // Level 5: most specific +} + +// DefaultThreshold is used for levels not in the map (fallback) +const DefaultThreshold = 0.50 + +// SimilarityThresholdForLevel returns the minimum similarity threshold for a given topic level. +func SimilarityThresholdForLevel(level int) float64 { + if threshold, ok := LevelThresholds[level]; ok { + return threshold + } + return DefaultThreshold +} diff --git a/internal/repository/clustering_jobs_repository.go b/internal/repository/clustering_jobs_repository.go new file mode 100644 index 0000000..158c4e2 --- /dev/null +++ b/internal/repository/clustering_jobs_repository.go @@ -0,0 +1,332 @@ +package repository + +import ( + "context" + "fmt" + "strings" + "time" + + apperrors "github.com/formbricks/hub/internal/errors" + "github.com/formbricks/hub/internal/models" + "github.com/google/uuid" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" +) + +// ClusteringJobsRepository handles data access for clustering jobs. +type ClusteringJobsRepository struct { + db *pgxpool.Pool +} + +// NewClusteringJobsRepository creates a new clustering jobs repository. +func NewClusteringJobsRepository(db *pgxpool.Pool) *ClusteringJobsRepository { + return &ClusteringJobsRepository{db: db} +} + +// CreateOrUpdate creates a new clustering job schedule or updates an existing one for the tenant. 
+func (r *ClusteringJobsRepository) CreateOrUpdate(ctx context.Context, req *models.CreateClusteringJobRequest) (*models.ClusteringJob, error) { + // Calculate next run time based on interval + var nextRunAt *time.Time + if req.ScheduleInterval != nil { + next := calculateNextRun(*req.ScheduleInterval) + nextRunAt = &next + } + + query := ` + INSERT INTO clustering_jobs (tenant_id, status, schedule_interval, next_run_at) + VALUES ($1, 'pending', $2, $3) + ON CONFLICT (tenant_id) DO UPDATE SET + schedule_interval = EXCLUDED.schedule_interval, + next_run_at = EXCLUDED.next_run_at, + status = CASE + WHEN EXCLUDED.schedule_interval IS NULL THEN 'disabled' + ELSE 'pending' + END, + updated_at = NOW() + RETURNING id, tenant_id, status, schedule_interval, next_run_at, last_run_at, + last_job_id, last_error, topics_generated, records_processed, created_at, updated_at + ` + + var job models.ClusteringJob + var scheduleInterval *string + err := r.db.QueryRow(ctx, query, req.TenantID, req.ScheduleInterval, nextRunAt).Scan( + &job.ID, &job.TenantID, &job.Status, &scheduleInterval, &job.NextRunAt, &job.LastRunAt, + &job.LastJobID, &job.LastError, &job.TopicsGenerated, &job.RecordsProcessed, &job.CreatedAt, &job.UpdatedAt, + ) + if err != nil { + return nil, fmt.Errorf("failed to create/update clustering job: %w", err) + } + + if scheduleInterval != nil { + si := models.ScheduleInterval(*scheduleInterval) + job.ScheduleInterval = &si + } + + return &job, nil +} + +// GetByTenantID retrieves the clustering job for a tenant. 
+func (r *ClusteringJobsRepository) GetByTenantID(ctx context.Context, tenantID string) (*models.ClusteringJob, error) { + query := ` + SELECT id, tenant_id, status, schedule_interval, next_run_at, last_run_at, + last_job_id, last_error, topics_generated, records_processed, created_at, updated_at + FROM clustering_jobs + WHERE tenant_id = $1 + ` + + var job models.ClusteringJob + var scheduleInterval *string + err := r.db.QueryRow(ctx, query, tenantID).Scan( + &job.ID, &job.TenantID, &job.Status, &scheduleInterval, &job.NextRunAt, &job.LastRunAt, + &job.LastJobID, &job.LastError, &job.TopicsGenerated, &job.RecordsProcessed, &job.CreatedAt, &job.UpdatedAt, + ) + if err != nil { + if err == pgx.ErrNoRows { + return nil, apperrors.NewNotFoundError("clustering_job", "no schedule found for tenant") + } + return nil, fmt.Errorf("failed to get clustering job: %w", err) + } + + if scheduleInterval != nil { + si := models.ScheduleInterval(*scheduleInterval) + job.ScheduleInterval = &si + } + + return &job, nil +} + +// GetDueJobs retrieves all jobs that are due to run (next_run_at <= now). 
+func (r *ClusteringJobsRepository) GetDueJobs(ctx context.Context, limit int) ([]models.ClusteringJob, error) { + if limit <= 0 { + limit = 10 + } + + query := ` + SELECT id, tenant_id, status, schedule_interval, next_run_at, last_run_at, + last_job_id, last_error, topics_generated, records_processed, created_at, updated_at + FROM clustering_jobs + WHERE status != 'disabled' + AND status != 'running' + AND next_run_at IS NOT NULL + AND next_run_at <= NOW() + ORDER BY next_run_at ASC + LIMIT $1 + ` + + rows, err := r.db.Query(ctx, query, limit) + if err != nil { + return nil, fmt.Errorf("failed to get due jobs: %w", err) + } + defer rows.Close() + + jobs := []models.ClusteringJob{} + for rows.Next() { + var job models.ClusteringJob + var scheduleInterval *string + err := rows.Scan( + &job.ID, &job.TenantID, &job.Status, &scheduleInterval, &job.NextRunAt, &job.LastRunAt, + &job.LastJobID, &job.LastError, &job.TopicsGenerated, &job.RecordsProcessed, &job.CreatedAt, &job.UpdatedAt, + ) + if err != nil { + return nil, fmt.Errorf("failed to scan clustering job: %w", err) + } + + if scheduleInterval != nil { + si := models.ScheduleInterval(*scheduleInterval) + job.ScheduleInterval = &si + } + + jobs = append(jobs, job) + } + + return jobs, nil +} + +// MarkRunning marks a job as running. +func (r *ClusteringJobsRepository) MarkRunning(ctx context.Context, id uuid.UUID) error { + query := `UPDATE clustering_jobs SET status = 'running' WHERE id = $1` + result, err := r.db.Exec(ctx, query, id) + if err != nil { + return fmt.Errorf("failed to mark job running: %w", err) + } + if result.RowsAffected() == 0 { + return apperrors.NewNotFoundError("clustering_job", "job not found") + } + return nil +} + +// UpdateAfterRun updates a job after execution completes. 
+func (r *ClusteringJobsRepository) UpdateAfterRun(ctx context.Context, id uuid.UUID, req *models.UpdateClusteringJobRequest) error { + // Get current job to calculate next run + var scheduleInterval *string + err := r.db.QueryRow(ctx, `SELECT schedule_interval FROM clustering_jobs WHERE id = $1`, id).Scan(&scheduleInterval) + if err != nil { + return fmt.Errorf("failed to get job for update: %w", err) + } + + // Calculate next run time + var nextRunAt *time.Time + if scheduleInterval != nil && req.Status == models.JobStatusComplete { + next := calculateNextRun(models.ScheduleInterval(*scheduleInterval)) + nextRunAt = &next + } + + query := ` + UPDATE clustering_jobs + SET status = $1, + last_run_at = NOW(), + last_job_id = $2, + last_error = $3, + topics_generated = COALESCE($4, topics_generated), + records_processed = COALESCE($5, records_processed), + next_run_at = COALESCE($6, next_run_at) + WHERE id = $7 + ` + + result, err := r.db.Exec(ctx, query, + req.Status, + req.LastJobID, + req.LastError, + req.TopicsGenerated, + req.RecordsProcessed, + nextRunAt, + id, + ) + if err != nil { + return fmt.Errorf("failed to update job after run: %w", err) + } + if result.RowsAffected() == 0 { + return apperrors.NewNotFoundError("clustering_job", "job not found") + } + return nil +} + +// Delete removes a clustering job schedule. +func (r *ClusteringJobsRepository) Delete(ctx context.Context, tenantID string) error { + query := `DELETE FROM clustering_jobs WHERE tenant_id = $1` + result, err := r.db.Exec(ctx, query, tenantID) + if err != nil { + return fmt.Errorf("failed to delete clustering job: %w", err) + } + if result.RowsAffected() == 0 { + return apperrors.NewNotFoundError("clustering_job", "schedule not found") + } + return nil +} + +// List retrieves clustering jobs with optional filters. 
+func (r *ClusteringJobsRepository) List(ctx context.Context, filters *models.ListClusteringJobsFilters) ([]models.ClusteringJob, error) { + query := ` + SELECT id, tenant_id, status, schedule_interval, next_run_at, last_run_at, + last_job_id, last_error, topics_generated, records_processed, created_at, updated_at + FROM clustering_jobs + ` + + var conditions []string + var args []interface{} + argCount := 1 + + if filters.TenantID != nil { + conditions = append(conditions, fmt.Sprintf("tenant_id = $%d", argCount)) + args = append(args, *filters.TenantID) + argCount++ + } + + if filters.Status != nil { + conditions = append(conditions, fmt.Sprintf("status = $%d", argCount)) + args = append(args, *filters.Status) + argCount++ + } + + if len(conditions) > 0 { + query += " WHERE " + strings.Join(conditions, " AND ") + } + + query += " ORDER BY created_at DESC" + + if filters.Limit > 0 { + query += fmt.Sprintf(" LIMIT $%d", argCount) + args = append(args, filters.Limit) + argCount++ + } + + if filters.Offset > 0 { + query += fmt.Sprintf(" OFFSET $%d", argCount) + args = append(args, filters.Offset) + } + + rows, err := r.db.Query(ctx, query, args...) + if err != nil { + return nil, fmt.Errorf("failed to list clustering jobs: %w", err) + } + defer rows.Close() + + jobs := []models.ClusteringJob{} + for rows.Next() { + var job models.ClusteringJob + var scheduleInterval *string + err := rows.Scan( + &job.ID, &job.TenantID, &job.Status, &scheduleInterval, &job.NextRunAt, &job.LastRunAt, + &job.LastJobID, &job.LastError, &job.TopicsGenerated, &job.RecordsProcessed, &job.CreatedAt, &job.UpdatedAt, + ) + if err != nil { + return nil, fmt.Errorf("failed to scan clustering job: %w", err) + } + + if scheduleInterval != nil { + si := models.ScheduleInterval(*scheduleInterval) + job.ScheduleInterval = &si + } + + jobs = append(jobs, job) + } + + return jobs, nil +} + +// Count returns the total count of clustering jobs matching the filters. 
+func (r *ClusteringJobsRepository) Count(ctx context.Context, filters *models.ListClusteringJobsFilters) (int64, error) { + query := `SELECT COUNT(*) FROM clustering_jobs` + + var conditions []string + var args []interface{} + argCount := 1 + + if filters.TenantID != nil { + conditions = append(conditions, fmt.Sprintf("tenant_id = $%d", argCount)) + args = append(args, *filters.TenantID) + argCount++ + } + + if filters.Status != nil { + conditions = append(conditions, fmt.Sprintf("status = $%d", argCount)) + args = append(args, *filters.Status) + } + + if len(conditions) > 0 { + query += " WHERE " + strings.Join(conditions, " AND ") + } + + var count int64 + err := r.db.QueryRow(ctx, query, args...).Scan(&count) + if err != nil { + return 0, fmt.Errorf("failed to count clustering jobs: %w", err) + } + + return count, nil +} + +// calculateNextRun calculates the next run time based on the interval. +func calculateNextRun(interval models.ScheduleInterval) time.Time { + now := time.Now() + switch interval { + case models.ScheduleDaily: + return now.Add(24 * time.Hour) + case models.ScheduleWeekly: + return now.Add(7 * 24 * time.Hour) + case models.ScheduleMonthly: + return now.AddDate(0, 1, 0) + default: + return now.Add(24 * time.Hour) + } +} diff --git a/internal/repository/feedback_records_repository.go b/internal/repository/feedback_records_repository.go index 0fec798..2676e74 100644 --- a/internal/repository/feedback_records_repository.go +++ b/internal/repository/feedback_records_repository.go @@ -3,6 +3,7 @@ package repository import ( "context" "fmt" + "log/slog" "strings" "time" @@ -11,6 +12,7 @@ import ( "github.com/google/uuid" "github.com/jackc/pgx/v5" "github.com/jackc/pgx/v5/pgxpool" + "github.com/pgvector/pgvector-go" ) // FeedbackRecordsRepository handles data access for feedback records @@ -144,6 +146,8 @@ func buildFilterConditions(filters *models.ListFeedbackRecordsFilters) (string, argCount++ } + // Note: TopicID is handled separately via vector 
similarity search, not here + if filters.Since != nil { conditions = append(conditions, fmt.Sprintf("collected_at >= $%d", argCount)) args = append(args, *filters.Since) @@ -163,7 +167,7 @@ func buildFilterConditions(filters *models.ListFeedbackRecordsFilters) (string, return whereClause, args } -// List retrieves feedback records with optional filters +// List retrieves feedback records with optional filters (non-vector based) func (r *FeedbackRecordsRepository) List(ctx context.Context, filters *models.ListFeedbackRecordsFilters) ([]models.FeedbackRecord, error) { query := ` SELECT id, collected_at, created_at, updated_at, @@ -369,3 +373,346 @@ func (r *FeedbackRecordsRepository) BulkDelete(ctx context.Context, userIdentifi return result.RowsAffected(), nil } + +// UpdateEnrichment updates the embedding for a feedback record +func (r *FeedbackRecordsRepository) UpdateEnrichment(ctx context.Context, id uuid.UUID, req *models.UpdateFeedbackEnrichmentRequest) error { + query := ` + UPDATE feedback_records + SET embedding = $1, updated_at = $2 + WHERE id = $3 + ` + + var embeddingValue interface{} + if req.Embedding != nil { + embeddingValue = pgvector.NewVector(req.Embedding) + } + + result, err := r.db.Exec(ctx, query, embeddingValue, time.Now(), id) + if err != nil { + return fmt.Errorf("failed to update feedback record enrichment: %w", err) + } + + if result.RowsAffected() == 0 { + return apperrors.NewNotFoundError("feedback record", "feedback record not found") + } + + return nil +} + +// AssignTopic updates the topic assignment for a feedback record. +// Only assigns if topic_id is currently NULL (preserves manual overrides). +// This is used for real-time topic assignment after embedding generation. 
+func (r *FeedbackRecordsRepository) AssignTopic(ctx context.Context, id uuid.UUID, topicID uuid.UUID, confidence float64) error { + query := ` + UPDATE feedback_records + SET topic_id = $1, classification_confidence = $2, updated_at = $3 + WHERE id = $4 AND topic_id IS NULL + ` + + result, err := r.db.Exec(ctx, query, topicID, confidence, time.Now(), id) + if err != nil { + return fmt.Errorf("failed to assign topic to feedback record: %w", err) + } + + // RowsAffected = 0 means either record not found OR topic already assigned + // Both cases are acceptable - we don't want to overwrite manual corrections + if result.RowsAffected() == 0 { + slog.Debug("topic assignment skipped (already assigned or not found)", "record_id", id, "topic_id", topicID) + } + + return nil +} + +// ListBySimilarityWithDescendants finds feedback similar to a topic AND all its descendants. +// Uses a single optimized query with recursive CTE for efficiency. +// Returns the matching records and total count in one database round-trip. +func (r *FeedbackRecordsRepository) ListBySimilarityWithDescendants( + ctx context.Context, + topicID uuid.UUID, + levelThresholds map[int]float64, + defaultThreshold float64, + filters *models.ListFeedbackRecordsFilters, +) ([]models.FeedbackRecord, int64, error) { + // Extract threshold values for each level (1-5), using default for missing levels + getThreshold := func(level int) float64 { + if t, ok := levelThresholds[level]; ok { + return t + } + return defaultThreshold + } + + // Build additional filter conditions + filterConditions, filterArgs, nextArg := buildSimilarityFilterConditions(filters, 10) + + // Build the optimized query with recursive CTE + // This query: + // 1. Gets target topic + all descendants via recursive CTE + // 2. Computes similarity for each (topic, feedback) pair + // 3. Applies level-appropriate threshold using CASE + // 4. Keeps best match per feedback record using DISTINCT ON + // 5. 
Returns total count via window function + query := fmt.Sprintf(` + WITH RECURSIVE topic_tree AS ( + -- Base: target topic + SELECT id, level, embedding + FROM topics + WHERE id = $1 AND embedding IS NOT NULL + + UNION ALL + + -- Recursive: descendants with embeddings + SELECT t.id, t.level, t.embedding + FROM topics t + INNER JOIN topic_tree tt ON t.parent_id = tt.id + WHERE t.embedding IS NOT NULL + ), + all_matches AS ( + -- Find feedback similar to ANY topic in tree + SELECT + fr.id, fr.collected_at, fr.created_at, fr.updated_at, + fr.source_type, fr.source_id, fr.source_name, + fr.field_id, fr.field_label, fr.field_type, + fr.value_text, fr.value_number, fr.value_boolean, fr.value_date, + fr.metadata, fr.language, fr.user_identifier, fr.tenant_id, fr.response_id, + 1 - (fr.embedding <=> tt.embedding) as similarity + FROM feedback_records fr + CROSS JOIN topic_tree tt + WHERE fr.embedding IS NOT NULL + AND 1 - (fr.embedding <=> tt.embedding) >= + CASE tt.level + WHEN 1 THEN $2 + WHEN 2 THEN $3 + WHEN 3 THEN $4 + WHEN 4 THEN $5 + WHEN 5 THEN $6 + ELSE $7 + END + %s + ), + deduplicated AS ( + -- Keep only the highest similarity per feedback record + SELECT DISTINCT ON (id) + id, collected_at, created_at, updated_at, + source_type, source_id, source_name, + field_id, field_label, field_type, + value_text, value_number, value_boolean, value_date, + metadata, language, user_identifier, tenant_id, response_id, + similarity + FROM all_matches + ORDER BY id, similarity DESC + ) + SELECT + id, collected_at, created_at, updated_at, + source_type, source_id, source_name, + field_id, field_label, field_type, + value_text, value_number, value_boolean, value_date, + metadata, language, user_identifier, tenant_id, response_id, + similarity, + COUNT(*) OVER() as total_count + FROM deduplicated + ORDER BY similarity DESC + LIMIT $8 OFFSET $9 + `, filterConditions) + + // Build args: topicID, thresholds (1-5), default, limit, offset, then filter args + limit := filters.Limit + if 
limit <= 0 { + limit = 100 + } + offset := filters.Offset + + args := []interface{}{ + topicID, + getThreshold(1), + getThreshold(2), + getThreshold(3), + getThreshold(4), + getThreshold(5), + defaultThreshold, + limit, + offset, + } + args = append(args, filterArgs...) + _ = nextArg // unused but returned by helper + + rows, err := r.db.Query(ctx, query, args...) + if err != nil { + return nil, 0, fmt.Errorf("failed to list feedback records by similarity with descendants: %w", err) + } + defer rows.Close() + + records := []models.FeedbackRecord{} + var totalCount int64 + + for rows.Next() { + var record models.FeedbackRecord + var similarity float64 + var count int64 + + err := rows.Scan( + &record.ID, &record.CollectedAt, &record.CreatedAt, &record.UpdatedAt, + &record.SourceType, &record.SourceID, &record.SourceName, + &record.FieldID, &record.FieldLabel, &record.FieldType, + &record.ValueText, &record.ValueNumber, &record.ValueBoolean, &record.ValueDate, + &record.Metadata, &record.Language, &record.UserIdentifier, &record.TenantID, &record.ResponseID, + &similarity, &count, + ) + if err != nil { + return nil, 0, fmt.Errorf("failed to scan feedback record: %w", err) + } + + record.Similarity = &similarity + records = append(records, record) + totalCount = count // Same for all rows due to window function + } + + if err := rows.Err(); err != nil { + return nil, 0, fmt.Errorf("error iterating feedback records: %w", err) + } + + return records, totalCount, nil +} + +// ListByTopicWithDescendants retrieves feedback records assigned to a topic or its descendants. +// Uses the pre-computed topic_id column set during taxonomy generation. +// This is faster than similarity search and provides accurate cluster-based results. 
+func (r *FeedbackRecordsRepository) ListByTopicWithDescendants( + ctx context.Context, + topicID uuid.UUID, + filters *models.ListFeedbackRecordsFilters, +) ([]models.FeedbackRecord, int64, error) { + // Build additional filter conditions + filterConditions, filterArgs, _ := buildSimilarityFilterConditions(filters, 3) + + limit := filters.Limit + if limit <= 0 { + limit = 100 + } + offset := filters.Offset + + // Build query with recursive CTE to get topic and all descendants + // Then match feedback records by topic_id column + query := fmt.Sprintf(` + WITH RECURSIVE topic_tree AS ( + SELECT id + FROM topics + WHERE id = $1 + + UNION ALL + + SELECT t.id + FROM topics t + INNER JOIN topic_tree tt ON t.parent_id = tt.id + ) + SELECT + fr.id, fr.collected_at, fr.created_at, fr.updated_at, + fr.source_type, fr.source_id, fr.source_name, + fr.field_id, fr.field_label, fr.field_type, + fr.value_text, fr.value_number, fr.value_boolean, fr.value_date, + fr.metadata, fr.language, fr.user_identifier, fr.tenant_id, fr.response_id, + fr.classification_confidence, + COUNT(*) OVER() as total_count + FROM feedback_records fr + WHERE fr.topic_id IN (SELECT id FROM topic_tree) + %s + ORDER BY fr.collected_at DESC + LIMIT $2 OFFSET $3 + `, filterConditions) + + args := []interface{}{topicID, limit, offset} + args = append(args, filterArgs...) + + rows, err := r.db.Query(ctx, query, args...) 
+ if err != nil { + return nil, 0, fmt.Errorf("failed to list feedback records by topic: %w", err) + } + defer rows.Close() + + records := []models.FeedbackRecord{} + var totalCount int64 + + for rows.Next() { + var record models.FeedbackRecord + var confidence *float64 + var count int64 + + err := rows.Scan( + &record.ID, &record.CollectedAt, &record.CreatedAt, &record.UpdatedAt, + &record.SourceType, &record.SourceID, &record.SourceName, + &record.FieldID, &record.FieldLabel, &record.FieldType, + &record.ValueText, &record.ValueNumber, &record.ValueBoolean, &record.ValueDate, + &record.Metadata, &record.Language, &record.UserIdentifier, &record.TenantID, &record.ResponseID, + &confidence, &count, + ) + if err != nil { + return nil, 0, fmt.Errorf("failed to scan feedback record: %w", err) + } + + record.Similarity = confidence // Reuse similarity field for confidence + records = append(records, record) + totalCount = count + } + + if err := rows.Err(); err != nil { + return nil, 0, fmt.Errorf("error iterating feedback records: %w", err) + } + + return records, totalCount, nil +} + +// buildSimilarityFilterConditions builds WHERE clause conditions for similarity queries. +// Returns the conditions string (with AND prefix for each), the args slice, and the next arg index. +// startArg is the first parameter index to use. 
+func buildSimilarityFilterConditions(filters *models.ListFeedbackRecordsFilters, startArg int) (string, []interface{}, int) { + var conditions []string + var args []interface{} + argCount := startArg + + if filters.TenantID != nil { + conditions = append(conditions, fmt.Sprintf("AND fr.tenant_id = $%d", argCount)) + args = append(args, *filters.TenantID) + argCount++ + } + if filters.ResponseID != nil { + conditions = append(conditions, fmt.Sprintf("AND fr.response_id = $%d", argCount)) + args = append(args, *filters.ResponseID) + argCount++ + } + if filters.SourceType != nil { + conditions = append(conditions, fmt.Sprintf("AND fr.source_type = $%d", argCount)) + args = append(args, *filters.SourceType) + argCount++ + } + if filters.SourceID != nil { + conditions = append(conditions, fmt.Sprintf("AND fr.source_id = $%d", argCount)) + args = append(args, *filters.SourceID) + argCount++ + } + if filters.FieldID != nil { + conditions = append(conditions, fmt.Sprintf("AND fr.field_id = $%d", argCount)) + args = append(args, *filters.FieldID) + argCount++ + } + if filters.FieldType != nil { + conditions = append(conditions, fmt.Sprintf("AND fr.field_type = $%d", argCount)) + args = append(args, *filters.FieldType) + argCount++ + } + if filters.UserIdentifier != nil { + conditions = append(conditions, fmt.Sprintf("AND fr.user_identifier = $%d", argCount)) + args = append(args, *filters.UserIdentifier) + argCount++ + } + if filters.Since != nil { + conditions = append(conditions, fmt.Sprintf("AND fr.collected_at >= $%d", argCount)) + args = append(args, *filters.Since) + argCount++ + } + if filters.Until != nil { + conditions = append(conditions, fmt.Sprintf("AND fr.collected_at <= $%d", argCount)) + args = append(args, *filters.Until) + argCount++ + } + + return strings.Join(conditions, " "), args, argCount +} diff --git a/internal/repository/knowledge_records_repository.go b/internal/repository/knowledge_records_repository.go new file mode 100644 index 0000000..bed85bb 
--- /dev/null +++ b/internal/repository/knowledge_records_repository.go @@ -0,0 +1,237 @@ +package repository + +import ( + "context" + "fmt" + "strings" + "time" + + apperrors "github.com/formbricks/hub/internal/errors" + "github.com/formbricks/hub/internal/models" + "github.com/google/uuid" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + "github.com/pgvector/pgvector-go" +) + +// KnowledgeRecordsRepository handles data access for knowledge records +type KnowledgeRecordsRepository struct { + db *pgxpool.Pool +} + +// NewKnowledgeRecordsRepository creates a new knowledge records repository +func NewKnowledgeRecordsRepository(db *pgxpool.Pool) *KnowledgeRecordsRepository { + return &KnowledgeRecordsRepository{db: db} +} + +// normalizeTenantID converts empty string tenant_id to nil for consistency +func normalizeTenantID(tenantID *string) *string { + if tenantID != nil && *tenantID == "" { + return nil + } + return tenantID +} + +// Create inserts a new knowledge record +func (r *KnowledgeRecordsRepository) Create(ctx context.Context, req *models.CreateKnowledgeRecordRequest) (*models.KnowledgeRecord, error) { + tenantID := normalizeTenantID(req.TenantID) + + query := ` + INSERT INTO knowledge_records (content, tenant_id) + VALUES ($1, $2) + RETURNING id, content, tenant_id, created_at, updated_at + ` + + var record models.KnowledgeRecord + err := r.db.QueryRow(ctx, query, req.Content, tenantID).Scan( + &record.ID, &record.Content, &record.TenantID, &record.CreatedAt, &record.UpdatedAt, + ) + if err != nil { + return nil, fmt.Errorf("failed to create knowledge record: %w", err) + } + + return &record, nil +} + +// GetByID retrieves a single knowledge record by ID +func (r *KnowledgeRecordsRepository) GetByID(ctx context.Context, id uuid.UUID) (*models.KnowledgeRecord, error) { + query := ` + SELECT id, content, tenant_id, created_at, updated_at + FROM knowledge_records + WHERE id = $1 + ` + + var record models.KnowledgeRecord + err := 
r.db.QueryRow(ctx, query, id).Scan( + &record.ID, &record.Content, &record.TenantID, &record.CreatedAt, &record.UpdatedAt, + ) + if err != nil { + if err == pgx.ErrNoRows { + return nil, apperrors.NewNotFoundError("knowledge record", "knowledge record not found") + } + return nil, fmt.Errorf("failed to get knowledge record: %w", err) + } + + return &record, nil +} + +// buildKnowledgeRecordsFilterConditions builds WHERE clause conditions and arguments from filters +func buildKnowledgeRecordsFilterConditions(filters *models.ListKnowledgeRecordsFilters) (string, []interface{}) { + var conditions []string + var args []interface{} + argCount := 1 + + if filters.TenantID != nil { + conditions = append(conditions, fmt.Sprintf("tenant_id = $%d", argCount)) + args = append(args, *filters.TenantID) + } + + whereClause := "" + if len(conditions) > 0 { + whereClause = " WHERE " + strings.Join(conditions, " AND ") + } + + return whereClause, args +} + +// List retrieves knowledge records with optional filters +func (r *KnowledgeRecordsRepository) List(ctx context.Context, filters *models.ListKnowledgeRecordsFilters) ([]models.KnowledgeRecord, error) { + query := ` + SELECT id, content, tenant_id, created_at, updated_at + FROM knowledge_records + ` + + whereClause, args := buildKnowledgeRecordsFilterConditions(filters) + query += whereClause + argCount := len(args) + 1 + + query += " ORDER BY created_at DESC" + + if filters.Limit > 0 { + query += fmt.Sprintf(" LIMIT $%d", argCount) + args = append(args, filters.Limit) + argCount++ + } + + if filters.Offset > 0 { + query += fmt.Sprintf(" OFFSET $%d", argCount) + args = append(args, filters.Offset) + } + + rows, err := r.db.Query(ctx, query, args...) 
+ if err != nil { + return nil, fmt.Errorf("failed to list knowledge records: %w", err) + } + defer rows.Close() + + records := []models.KnowledgeRecord{} // Initialize as empty slice, not nil + for rows.Next() { + var record models.KnowledgeRecord + err := rows.Scan( + &record.ID, &record.Content, &record.TenantID, &record.CreatedAt, &record.UpdatedAt, + ) + if err != nil { + return nil, fmt.Errorf("failed to scan knowledge record: %w", err) + } + records = append(records, record) + } + + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("error iterating knowledge records: %w", err) + } + + return records, nil +} + +// Count returns the total count of knowledge records matching the filters +func (r *KnowledgeRecordsRepository) Count(ctx context.Context, filters *models.ListKnowledgeRecordsFilters) (int64, error) { + query := `SELECT COUNT(*) FROM knowledge_records` + + whereClause, args := buildKnowledgeRecordsFilterConditions(filters) + query += whereClause + + var count int64 + err := r.db.QueryRow(ctx, query, args...).Scan(&count) + if err != nil { + return 0, fmt.Errorf("failed to count knowledge records: %w", err) + } + + return count, nil +} + +// Update updates an existing knowledge record +// Only content can be updated +func (r *KnowledgeRecordsRepository) Update(ctx context.Context, id uuid.UUID, req *models.UpdateKnowledgeRecordRequest) (*models.KnowledgeRecord, error) { + // If no content provided, just return the existing record + if req.Content == nil { + return r.GetByID(ctx, id) + } + + query := ` + UPDATE knowledge_records + SET content = $1, updated_at = $2 + WHERE id = $3 + RETURNING id, content, tenant_id, created_at, updated_at + ` + + var record models.KnowledgeRecord + err := r.db.QueryRow(ctx, query, *req.Content, time.Now(), id).Scan( + &record.ID, &record.Content, &record.TenantID, &record.CreatedAt, &record.UpdatedAt, + ) + if err != nil { + if err == pgx.ErrNoRows { + return nil, apperrors.NewNotFoundError("knowledge record", 
"knowledge record not found") + } + return nil, fmt.Errorf("failed to update knowledge record: %w", err) + } + + return &record, nil +} + +// Delete removes a knowledge record +func (r *KnowledgeRecordsRepository) Delete(ctx context.Context, id uuid.UUID) error { + query := `DELETE FROM knowledge_records WHERE id = $1` + + result, err := r.db.Exec(ctx, query, id) + if err != nil { + return fmt.Errorf("failed to delete knowledge record: %w", err) + } + + if result.RowsAffected() == 0 { + return apperrors.NewNotFoundError("knowledge record", "knowledge record not found") + } + + return nil +} + +// BulkDelete deletes all knowledge records matching tenant_id +func (r *KnowledgeRecordsRepository) BulkDelete(ctx context.Context, tenantID string) (int64, error) { + query := `DELETE FROM knowledge_records WHERE tenant_id = $1` + + result, err := r.db.Exec(ctx, query, tenantID) + if err != nil { + return 0, fmt.Errorf("failed to bulk delete knowledge records: %w", err) + } + + return result.RowsAffected(), nil +} + +// UpdateEmbedding updates the embedding vector for a knowledge record +func (r *KnowledgeRecordsRepository) UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error { + query := ` + UPDATE knowledge_records + SET embedding = $1, updated_at = $2 + WHERE id = $3 + ` + + result, err := r.db.Exec(ctx, query, pgvector.NewVector(embedding), time.Now(), id) + if err != nil { + return fmt.Errorf("failed to update knowledge record embedding: %w", err) + } + + if result.RowsAffected() == 0 { + return apperrors.NewNotFoundError("knowledge record", "knowledge record not found") + } + + return nil +} diff --git a/internal/repository/topics_repository.go b/internal/repository/topics_repository.go new file mode 100644 index 0000000..2ba964c --- /dev/null +++ b/internal/repository/topics_repository.go @@ -0,0 +1,473 @@ +package repository + +import ( + "context" + "fmt" + "log/slog" + "strings" + "time" + + apperrors 
"github.com/formbricks/hub/internal/errors" + "github.com/formbricks/hub/internal/models" + "github.com/google/uuid" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + "github.com/pgvector/pgvector-go" +) + +// TopicsRepository handles data access for topics +type TopicsRepository struct { + db *pgxpool.Pool +} + +// NewTopicsRepository creates a new topics repository +func NewTopicsRepository(db *pgxpool.Pool) *TopicsRepository { + return &TopicsRepository{db: db} +} + +// Create inserts a new topic with the specified level +func (r *TopicsRepository) Create(ctx context.Context, req *models.CreateTopicRequest) (*models.Topic, error) { + tenantID := normalizeTenantID(req.TenantID) + + query := ` + INSERT INTO topics (title, level, parent_id, tenant_id) + VALUES ($1, $2, $3, $4) + RETURNING id, title, level, parent_id, tenant_id, created_at, updated_at + ` + + var topic models.Topic + err := r.db.QueryRow(ctx, query, req.Title, req.Level, req.ParentID, tenantID).Scan( + &topic.ID, &topic.Title, &topic.Level, &topic.ParentID, &topic.TenantID, &topic.CreatedAt, &topic.UpdatedAt, + ) + if err != nil { + // Check for unique constraint violation + if strings.Contains(err.Error(), "duplicate key value violates unique constraint") || + strings.Contains(err.Error(), "23505") { + return nil, apperrors.NewConflictError("topic", "topic with this title already exists") + } + return nil, fmt.Errorf("failed to create topic: %w", err) + } + + return &topic, nil +} + +// GetByID retrieves a single topic by ID +func (r *TopicsRepository) GetByID(ctx context.Context, id uuid.UUID) (*models.Topic, error) { + // Use recursive CTE to count feedback in entire subtree (this topic + all descendants) + query := ` + SELECT t.id, t.title, t.level, t.parent_id, t.tenant_id, t.created_at, t.updated_at, + ( + WITH RECURSIVE subtree AS ( + SELECT id FROM topics WHERE id = t.id + UNION ALL + SELECT c.id FROM topics c INNER JOIN subtree s ON c.parent_id = s.id + ) + SELECT COUNT(*) 
FROM feedback_records WHERE topic_id IN (SELECT id FROM subtree) + ) as feedback_count + FROM topics t + WHERE t.id = $1 + ` + + var topic models.Topic + var feedbackCount int64 + err := r.db.QueryRow(ctx, query, id).Scan( + &topic.ID, &topic.Title, &topic.Level, &topic.ParentID, &topic.TenantID, &topic.CreatedAt, &topic.UpdatedAt, &feedbackCount, + ) + if err != nil { + if err == pgx.ErrNoRows { + return nil, apperrors.NewNotFoundError("topic", "topic not found") + } + return nil, fmt.Errorf("failed to get topic: %w", err) + } + topic.FeedbackCount = &feedbackCount + + return &topic, nil +} + +// buildTopicsFilterConditions builds WHERE clause conditions and arguments from filters +func buildTopicsFilterConditions(filters *models.ListTopicsFilters) (string, []interface{}) { + return buildTopicsFilterConditionsWithAlias(filters, "") +} + +// buildTopicsFilterConditionsWithAlias builds WHERE clause conditions with an optional table alias +func buildTopicsFilterConditionsWithAlias(filters *models.ListTopicsFilters, alias string) (string, []interface{}) { + var conditions []string + var args []interface{} + argCount := 1 + + prefix := "" + if alias != "" { + prefix = alias + "." 
+ } + + if filters.Level != nil { + conditions = append(conditions, fmt.Sprintf("%slevel = $%d", prefix, argCount)) + args = append(args, *filters.Level) + argCount++ + } + + if filters.ParentID != nil { + conditions = append(conditions, fmt.Sprintf("%sparent_id = $%d", prefix, argCount)) + args = append(args, *filters.ParentID) + argCount++ + } + + if filters.Title != nil { + conditions = append(conditions, fmt.Sprintf("%stitle = $%d", prefix, argCount)) + args = append(args, *filters.Title) + argCount++ + } + + if filters.TenantID != nil { + conditions = append(conditions, fmt.Sprintf("%stenant_id = $%d", prefix, argCount)) + args = append(args, *filters.TenantID) + } + + whereClause := "" + if len(conditions) > 0 { + whereClause = " WHERE " + strings.Join(conditions, " AND ") + } + + return whereClause, args +} + +// List retrieves topics with optional filters +func (r *TopicsRepository) List(ctx context.Context, filters *models.ListTopicsFilters) ([]models.Topic, error) { + // Use recursive CTE to count feedback in entire subtree (this topic + all descendants) + query := ` + SELECT t.id, t.title, t.level, t.parent_id, t.tenant_id, t.created_at, t.updated_at, + ( + WITH RECURSIVE subtree AS ( + SELECT id FROM topics WHERE id = t.id + UNION ALL + SELECT c.id FROM topics c INNER JOIN subtree s ON c.parent_id = s.id + ) + SELECT COUNT(*) FROM feedback_records WHERE topic_id IN (SELECT id FROM subtree) + ) as feedback_count + FROM topics t + ` + + whereClause, args := buildTopicsFilterConditionsWithAlias(filters, "t") + query += whereClause + argCount := len(args) + 1 + + query += " ORDER BY t.created_at DESC" + + if filters.Limit > 0 { + query += fmt.Sprintf(" LIMIT $%d", argCount) + args = append(args, filters.Limit) + argCount++ + } + + if filters.Offset > 0 { + query += fmt.Sprintf(" OFFSET $%d", argCount) + args = append(args, filters.Offset) + } + + rows, err := r.db.Query(ctx, query, args...) 
+ if err != nil { + return nil, fmt.Errorf("failed to list topics: %w", err) + } + defer rows.Close() + + topics := []models.Topic{} // Initialize as empty slice, not nil + for rows.Next() { + var topic models.Topic + var feedbackCount int64 + err := rows.Scan( + &topic.ID, &topic.Title, &topic.Level, &topic.ParentID, &topic.TenantID, &topic.CreatedAt, &topic.UpdatedAt, &feedbackCount, + ) + if err != nil { + return nil, fmt.Errorf("failed to scan topic: %w", err) + } + topic.FeedbackCount = &feedbackCount + topics = append(topics, topic) + } + + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("error iterating topics: %w", err) + } + + return topics, nil +} + +// Count returns the total count of topics matching the filters +func (r *TopicsRepository) Count(ctx context.Context, filters *models.ListTopicsFilters) (int64, error) { + query := `SELECT COUNT(*) FROM topics` + + whereClause, args := buildTopicsFilterConditions(filters) + query += whereClause + + var count int64 + err := r.db.QueryRow(ctx, query, args...).Scan(&count) + if err != nil { + return 0, fmt.Errorf("failed to count topics: %w", err) + } + + return count, nil +} + +// Update updates an existing topic +// Only title can be updated +func (r *TopicsRepository) Update(ctx context.Context, id uuid.UUID, req *models.UpdateTopicRequest) (*models.Topic, error) { + // If no title provided, just return the existing topic + if req.Title == nil { + return r.GetByID(ctx, id) + } + + query := ` + UPDATE topics + SET title = $1, updated_at = $2 + WHERE id = $3 + RETURNING id, title, level, parent_id, tenant_id, created_at, updated_at + ` + + var topic models.Topic + err := r.db.QueryRow(ctx, query, *req.Title, time.Now(), id).Scan( + &topic.ID, &topic.Title, &topic.Level, &topic.ParentID, &topic.TenantID, &topic.CreatedAt, &topic.UpdatedAt, + ) + if err != nil { + if err == pgx.ErrNoRows { + return nil, apperrors.NewNotFoundError("topic", "topic not found") + } + // Check for unique constraint 
violation + if strings.Contains(err.Error(), "duplicate key value violates unique constraint") || + strings.Contains(err.Error(), "23505") { + return nil, apperrors.NewConflictError("topic", "topic with this title already exists") + } + return nil, fmt.Errorf("failed to update topic: %w", err) + } + + return &topic, nil +} + +// Delete removes a topic (CASCADE handled by FK constraint) +func (r *TopicsRepository) Delete(ctx context.Context, id uuid.UUID) error { + query := `DELETE FROM topics WHERE id = $1` + + result, err := r.db.Exec(ctx, query, id) + if err != nil { + return fmt.Errorf("failed to delete topic: %w", err) + } + + if result.RowsAffected() == 0 { + return apperrors.NewNotFoundError("topic", "topic not found") + } + + return nil +} + +// ExistsByTitleAndLevel checks if a topic with the given title exists at the given level and tenant +func (r *TopicsRepository) ExistsByTitleAndLevel(ctx context.Context, title string, level int, tenantID *string) (bool, error) { + var query string + var args []interface{} + + if tenantID == nil { + query = `SELECT EXISTS(SELECT 1 FROM topics WHERE title = $1 AND level = $2 AND tenant_id IS NULL)` + args = []interface{}{title, level} + } else { + query = `SELECT EXISTS(SELECT 1 FROM topics WHERE title = $1 AND level = $2 AND tenant_id = $3)` + args = []interface{}{title, level, *tenantID} + } + + var exists bool + err := r.db.QueryRow(ctx, query, args...).Scan(&exists) + if err != nil { + return false, fmt.Errorf("failed to check topic existence: %w", err) + } + + return exists, nil +} + +// ExistsByTitleAndLevelExcluding checks if a topic with the given title exists at the given level and tenant, +// excluding a specific topic ID (used for update uniqueness validation) +func (r *TopicsRepository) ExistsByTitleAndLevelExcluding(ctx context.Context, title string, level int, tenantID *string, excludeID uuid.UUID) (bool, error) { + var query string + var args []interface{} + + if tenantID == nil { + query = `SELECT 
EXISTS(SELECT 1 FROM topics WHERE title = $1 AND level = $2 AND tenant_id IS NULL AND id != $3)` + args = []interface{}{title, level, excludeID} + } else { + query = `SELECT EXISTS(SELECT 1 FROM topics WHERE title = $1 AND level = $2 AND tenant_id = $3 AND id != $4)` + args = []interface{}{title, level, *tenantID, excludeID} + } + + var exists bool + err := r.db.QueryRow(ctx, query, args...).Scan(&exists) + if err != nil { + return false, fmt.Errorf("failed to check topic existence: %w", err) + } + + return exists, nil +} + +// UpdateEmbedding updates the embedding vector for a topic +func (r *TopicsRepository) UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error { + query := ` + UPDATE topics + SET embedding = $1, updated_at = $2 + WHERE id = $3 + ` + + result, err := r.db.Exec(ctx, query, pgvector.NewVector(embedding), time.Now(), id) + if err != nil { + return fmt.Errorf("failed to update topic embedding: %w", err) + } + + if result.RowsAffected() == 0 { + return apperrors.NewNotFoundError("topic", "topic not found") + } + + return nil +} + +// GetEmbedding retrieves the embedding vector for a topic +// Returns nil if the topic has no embedding +func (r *TopicsRepository) GetEmbedding(ctx context.Context, id uuid.UUID) ([]float32, error) { + query := `SELECT embedding FROM topics WHERE id = $1` + + var embedding pgvector.Vector + err := r.db.QueryRow(ctx, query, id).Scan(&embedding) + if err != nil { + if err == pgx.ErrNoRows { + return nil, apperrors.NewNotFoundError("topic", "topic not found") + } + return nil, fmt.Errorf("failed to get topic embedding: %w", err) + } + + // embedding.Slice() returns []float32 + if embedding.Slice() == nil { + return nil, nil // Topic exists but has no embedding + } + + return embedding.Slice(), nil +} + +// FindSimilarTopic finds the most similar topic to the given embedding vector. +// Returns nil if no topics with embeddings exist or similarity is below threshold. 
+// If level is provided, only searches topics at that level. +func (r *TopicsRepository) FindSimilarTopic(ctx context.Context, embedding []float32, tenantID *string, level *int, minSimilarity float64) (*models.TopicMatch, error) { + query := ` + SELECT id, title, level, 1 - (embedding <=> $1::vector) as similarity + FROM topics + WHERE embedding IS NOT NULL + AND ($2::varchar IS NULL OR tenant_id = $2) + AND ($3::int IS NULL OR level = $3) + ORDER BY similarity DESC + LIMIT 1 + ` + + var match models.TopicMatch + err := r.db.QueryRow(ctx, query, pgvector.NewVector(embedding), tenantID, level).Scan( + &match.TopicID, &match.Title, &match.Level, &match.Similarity, + ) + if err != nil { + if err == pgx.ErrNoRows { + slog.Debug("no topics with embeddings found for similarity search", "level", level) + return nil, nil // No topics with embeddings found + } + return nil, fmt.Errorf("failed to find similar topic: %w", err) + } + + slog.Debug("best topic match found", + "topic_id", match.TopicID, + "topic_title", match.Title, + "level", match.Level, + "similarity", match.Similarity, + "min_similarity", minSimilarity, + "above_threshold", match.Similarity >= minSimilarity, + ) + + // Return nil if similarity is below threshold + if match.Similarity < minSimilarity { + return nil, nil + } + + return &match, nil +} + +// FindMostSpecificTopic finds the best matching leaf topic, falling back to parents. +// Returns the most specific (highest level) topic above the similarity threshold. +// This is used for real-time topic assignment after embedding generation. 
func (r *TopicsRepository) FindMostSpecificTopic(ctx context.Context, embedding []float32, tenantID *string, minSimilarity float64) (*models.TopicMatch, error) {
	// Query finds topics above threshold, ordered by level DESC (most specific first), then similarity.
	// Unlike FindSimilarTopic, the threshold is applied in SQL so a shallow
	// high-similarity topic cannot shadow a deeper acceptable one.
	query := `
		SELECT id, title, level, 1 - (embedding <=> $1::vector) as similarity
		FROM topics
		WHERE embedding IS NOT NULL
		AND ($2::varchar IS NULL OR tenant_id = $2)
		AND 1 - (embedding <=> $1::vector) >= $3
		ORDER BY level DESC, similarity DESC
		LIMIT 1
	`

	var match models.TopicMatch
	err := r.db.QueryRow(ctx, query, pgvector.NewVector(embedding), tenantID, minSimilarity).Scan(
		&match.TopicID, &match.Title, &match.Level, &match.Similarity,
	)
	if err != nil {
		if err == pgx.ErrNoRows {
			// No topic cleared the threshold; callers treat (nil, nil) as "unassigned".
			slog.Debug("no topics above threshold for assignment", "min_similarity", minSimilarity)
			return nil, nil
		}
		return nil, fmt.Errorf("failed to find most specific topic: %w", err)
	}

	slog.Debug("most specific topic match found",
		"topic_id", match.TopicID,
		"topic_title", match.Title,
		"level", match.Level,
		"similarity", match.Similarity,
	)

	return &match, nil
}

// GetChildTopics retrieves Level 2 topics that are children of a given Level 1 topic
func (r *TopicsRepository) GetChildTopics(ctx context.Context, parentID uuid.UUID, tenantID *string, limit int) ([]models.Topic, error) {
	// Guard against non-positive limits so the SQL LIMIT is always sane.
	if limit <= 0 {
		limit = 100
	}

	// Use recursive CTE to count feedback in entire subtree (this topic + all descendants)
	query := `
		SELECT t.id, t.title, t.level, t.parent_id, t.tenant_id, t.created_at, t.updated_at,
		(
			WITH RECURSIVE subtree AS (
				SELECT id FROM topics WHERE id = t.id
				UNION ALL
				SELECT c.id FROM topics c INNER JOIN subtree s ON c.parent_id = s.id
			)
			SELECT COUNT(*) FROM feedback_records WHERE topic_id IN (SELECT id FROM subtree)
		) as feedback_count
		FROM topics t
		WHERE t.parent_id = $1
		AND ($2::varchar IS NULL OR t.tenant_id = $2)
		ORDER BY t.title ASC
+ LIMIT $3 + ` + + rows, err := r.db.Query(ctx, query, parentID, tenantID, limit) + if err != nil { + return nil, fmt.Errorf("failed to get child topics: %w", err) + } + defer rows.Close() + + topics := []models.Topic{} + for rows.Next() { + var topic models.Topic + var feedbackCount int64 + if err := rows.Scan(&topic.ID, &topic.Title, &topic.Level, &topic.ParentID, &topic.TenantID, &topic.CreatedAt, &topic.UpdatedAt, &feedbackCount); err != nil { + return nil, fmt.Errorf("failed to scan child topic: %w", err) + } + topic.FeedbackCount = &feedbackCount + topics = append(topics, topic) + } + + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("error iterating child topics: %w", err) + } + + return topics, nil +} diff --git a/internal/service/feedback_records_service.go b/internal/service/feedback_records_service.go index e9948e9..feb6fe6 100644 --- a/internal/service/feedback_records_service.go +++ b/internal/service/feedback_records_service.go @@ -3,7 +3,10 @@ package service import ( "context" "fmt" + "log/slog" + "github.com/formbricks/hub/internal/embeddings" + "github.com/formbricks/hub/internal/jobs" "github.com/formbricks/hub/internal/models" "github.com/google/uuid" ) @@ -17,21 +20,112 @@ type FeedbackRecordsRepository interface { Update(ctx context.Context, id uuid.UUID, req *models.UpdateFeedbackRecordRequest) (*models.FeedbackRecord, error) Delete(ctx context.Context, id uuid.UUID) error BulkDelete(ctx context.Context, userIdentifier string, tenantID *string) (int64, error) + UpdateEnrichment(ctx context.Context, id uuid.UUID, req *models.UpdateFeedbackEnrichmentRequest) error + // ListByTopicWithDescendants finds feedback assigned to a topic or its descendants (direct lookup) + ListByTopicWithDescendants(ctx context.Context, topicID uuid.UUID, filters *models.ListFeedbackRecordsFilters) ([]models.FeedbackRecord, int64, error) + // ListBySimilarityWithDescendants finds feedback similar to a topic AND all its descendants (vector search) + 
ListBySimilarityWithDescendants(ctx context.Context, topicID uuid.UUID, levelThresholds map[int]float64, defaultThreshold float64, filters *models.ListFeedbackRecordsFilters) ([]models.FeedbackRecord, int64, error)
}

// FeedbackRecordsService handles business logic for feedback records.
// Embedding generation is optional: with a nil embeddingClient nothing is
// generated; with a client but nil jobInserter, generation falls back to a
// fire-and-forget goroutine instead of the River queue.
type FeedbackRecordsService struct {
	repo            FeedbackRecordsRepository
	embeddingClient embeddings.Client // nil if embeddings are disabled
	jobInserter     jobs.JobInserter  // nil if River is disabled (falls back to goroutines)
}

// NewFeedbackRecordsService creates a new feedback records service without embeddings
func NewFeedbackRecordsService(repo FeedbackRecordsRepository) *FeedbackRecordsService {
	return &FeedbackRecordsService{repo: repo}
}

// NewFeedbackRecordsServiceWithEmbeddings creates a service with embedding support via River job queue
func NewFeedbackRecordsServiceWithEmbeddings(repo FeedbackRecordsRepository, embeddingClient embeddings.Client, jobInserter jobs.JobInserter) *FeedbackRecordsService {
	return &FeedbackRecordsService{
		repo:            repo,
		embeddingClient: embeddingClient,
		jobInserter:     jobInserter,
	}
}

// CreateFeedbackRecord creates a new feedback record
func (s *FeedbackRecordsService) CreateFeedbackRecord(ctx context.Context, req *models.CreateFeedbackRecordRequest) (*models.FeedbackRecord, error) {
	record, err := s.repo.Create(ctx, req)
	if err != nil {
		return nil, err
	}

	// Generate embedding for text feedback asynchronously
	if s.shouldGenerateEmbedding(req.FieldType, req.ValueText) {
		s.enqueueEmbeddingJob(ctx, record.ID, *req.ValueText, record.TenantID)
	}

	return record, nil
}

// shouldGenerateEmbedding checks if embedding should be generated for the given field.
// Requires an embedding client, a "text" field type, and non-empty text.
func (s *FeedbackRecordsService) shouldGenerateEmbedding(fieldType string, valueText *string) bool {
	if s.embeddingClient == nil {
		return false
	}
	if fieldType != "text" {
		return false
	}
	if valueText == nil || *valueText == "" {
		return false
	}
	return true
}

// enqueueEmbeddingJob enqueues an embedding job or falls back to sync generation
func (s *FeedbackRecordsService) enqueueEmbeddingJob(ctx context.Context, id uuid.UUID, text string, tenantID *string) {
	// If job inserter is available, use River job queue
	if s.jobInserter != nil {
		err := s.jobInserter.InsertEmbeddingJob(ctx, jobs.EmbeddingJobArgs{
			RecordID:   id,
			RecordType: jobs.RecordTypeFeedback,
			Text:       text,
			TenantID:   tenantID,
		})
		if err != nil {
			slog.Error("failed to enqueue embedding job",
				"record_type", "feedback_record",
				"id", id,
				"error", err,
			)
			// Don't fail the request - embedding can be backfilled later
		}
		return
	}

	// Fallback to sync generation in a goroutine (legacy behavior for tests or when River is disabled)
	if s.embeddingClient != nil {
		go s.generateEmbeddingSync(id, text)
	}
}

// generateEmbeddingSync generates and stores embedding synchronously (used as fallback).
// Runs detached from the request context (context.Background) so the work
// survives the originating HTTP request; errors are logged, never returned.
func (s *FeedbackRecordsService) generateEmbeddingSync(id uuid.UUID, text string) {
	ctx := context.Background()

	slog.Debug("generating embedding for feedback (sync)", "id", id, "text_length", len(text))

	embedding, err := s.embeddingClient.GetEmbedding(ctx, text)
	if err != nil {
		slog.Error("failed to generate embedding", "record_type", "feedback_record", "id", id, "error", err)
		return
	}

	enrichReq := &models.UpdateFeedbackEnrichmentRequest{
		Embedding: embedding,
	}

	if err := s.repo.UpdateEnrichment(ctx, id, enrichReq); err != nil {
		slog.Error("failed to store embedding", "record_type", "feedback_record", "id", id, "error", err)
		return
	}

	slog.Info("embedding generated successfully", "record_type", "feedback_record", "id", id)
}

// GetFeedbackRecord retrieves a single feedback record by ID
@@ -40,12 +134,19 @@ func (s *FeedbackRecordsService)
GetFeedbackRecord(ctx context.Context, id uuid. } // ListFeedbackRecords retrieves a list of feedback records with optional filters +// If TopicID filter is provided, uses vector similarity search with hierarchical aggregation func (s *FeedbackRecordsService) ListFeedbackRecords(ctx context.Context, filters *models.ListFeedbackRecordsFilters) (*models.ListFeedbackRecordsResponse, error) { - // Set default limit if not provided (validation ensures it's within bounds if provided) + // Set default limit if not provided if filters.Limit <= 0 { - filters.Limit = 100 // Default limit + filters.Limit = 100 + } + + // If topic_id filter is provided, use direct lookup (or similarity if explicitly requested) + if filters.TopicID != nil { + return s.listByTopic(ctx, *filters.TopicID, filters) } + // Standard listing without vector search records, err := s.repo.List(ctx, filters) if err != nil { return nil, err @@ -64,9 +165,96 @@ func (s *FeedbackRecordsService) ListFeedbackRecords(ctx context.Context, filter }, nil } +// listByTopic retrieves feedback records assigned to a topic or its descendants. +// Uses direct topic_id lookup (fast, pre-computed during taxonomy generation). +// Falls back to similarity search if UseSimilarity filter is set. 
+func (s *FeedbackRecordsService) listByTopic(ctx context.Context, topicID uuid.UUID, filters *models.ListFeedbackRecordsFilters) (*models.ListFeedbackRecordsResponse, error) { + // Check if similarity search is explicitly requested + if filters.UseSimilarity { + return s.listByTopicSimilarity(ctx, topicID, filters) + } + + // Default: Use direct topic_id lookup (faster, uses pre-computed assignments) + slog.Debug("using direct topic_id lookup", + "topic_id", topicID, + ) + + records, total, err := s.repo.ListByTopicWithDescendants(ctx, topicID, filters) + if err != nil { + return nil, fmt.Errorf("failed to list feedback by topic: %w", err) + } + + return &models.ListFeedbackRecordsResponse{ + Data: records, + Total: total, + Limit: filters.Limit, + Offset: filters.Offset, + }, nil +} + +// listByTopicSimilarity retrieves feedback records similar to a topic AND all its descendants. +// Uses optimized single-query approach with level-based thresholds. +// This is slower but can find matches for unclassified feedback. 
func (s *FeedbackRecordsService) listByTopicSimilarity(ctx context.Context, topicID uuid.UUID, filters *models.ListFeedbackRecordsFilters) (*models.ListFeedbackRecordsResponse, error) {
	// Determine thresholds to use
	var levelThresholds map[int]float64
	if filters.MinSimilarity != nil {
		// Custom threshold overrides level-based thresholds
		// Apply same threshold to all levels
		threshold := *filters.MinSimilarity
		levelThresholds = map[int]float64{
			1: threshold,
			2: threshold,
			3: threshold,
			4: threshold,
			5: threshold,
		}
		slog.Debug("using custom similarity threshold for all levels",
			"topic_id", topicID,
			"threshold", threshold,
		)
	} else {
		// Use level-based thresholds from models
		levelThresholds = models.LevelThresholds
		slog.Debug("using level-based similarity thresholds",
			"topic_id", topicID,
			"thresholds", levelThresholds,
		)
	}

	// Perform optimized similarity search with descendants
	records, total, err := s.repo.ListBySimilarityWithDescendants(
		ctx,
		topicID,
		levelThresholds,
		models.DefaultThreshold,
		filters,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to search feedback by topic similarity: %w", err)
	}

	return &models.ListFeedbackRecordsResponse{
		Data:   records,
		Total:  total,
		Limit:  filters.Limit,
		Offset: filters.Offset,
	}, nil
}

// UpdateFeedbackRecord updates an existing feedback record
func (s *FeedbackRecordsService) UpdateFeedbackRecord(ctx context.Context, id uuid.UUID, req *models.UpdateFeedbackRecordRequest) (*models.FeedbackRecord, error) {
	record, err := s.repo.Update(ctx, id, req)
	if err != nil {
		return nil, err
	}

	// Regenerate embedding if text was updated
	// NOTE(review): the FieldType == "text" check duplicates the one inside
	// shouldGenerateEmbedding; harmless but redundant.
	if record.FieldType == "text" && s.shouldGenerateEmbedding(record.FieldType, req.ValueText) {
		s.enqueueEmbeddingJob(ctx, id, *req.ValueText, record.TenantID)
	}

	return record, nil
}

// DeleteFeedbackRecord deletes a feedback record by ID
diff --git
a/internal/service/knowledge_records_service.go b/internal/service/knowledge_records_service.go
new file mode 100644
index 0000000..2bde57d
--- /dev/null
+++ b/internal/service/knowledge_records_service.go
@@ -0,0 +1,165 @@
package service

import (
	"context"
	"fmt"
	"log/slog"

	"github.com/formbricks/hub/internal/embeddings"
	"github.com/formbricks/hub/internal/jobs"
	"github.com/formbricks/hub/internal/models"
	"github.com/google/uuid"
)

// KnowledgeRecordsRepository defines the interface for knowledge records data access.
type KnowledgeRecordsRepository interface {
	Create(ctx context.Context, req *models.CreateKnowledgeRecordRequest) (*models.KnowledgeRecord, error)
	GetByID(ctx context.Context, id uuid.UUID) (*models.KnowledgeRecord, error)
	List(ctx context.Context, filters *models.ListKnowledgeRecordsFilters) ([]models.KnowledgeRecord, error)
	Count(ctx context.Context, filters *models.ListKnowledgeRecordsFilters) (int64, error)
	Update(ctx context.Context, id uuid.UUID, req *models.UpdateKnowledgeRecordRequest) (*models.KnowledgeRecord, error)
	Delete(ctx context.Context, id uuid.UUID) error
	BulkDelete(ctx context.Context, tenantID string) (int64, error)
	UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error
}

// KnowledgeRecordsService handles business logic for knowledge records.
// Embedding generation mirrors FeedbackRecordsService: nil client disables it,
// nil jobInserter falls back to a goroutine instead of the River queue.
type KnowledgeRecordsService struct {
	repo            KnowledgeRecordsRepository
	embeddingClient embeddings.Client // nil if embeddings are disabled
	jobInserter     jobs.JobInserter  // nil if River is disabled (falls back to goroutines)
}

// NewKnowledgeRecordsService creates a new knowledge records service without embeddings
func NewKnowledgeRecordsService(repo KnowledgeRecordsRepository) *KnowledgeRecordsService {
	return &KnowledgeRecordsService{repo: repo}
}

// NewKnowledgeRecordsServiceWithEmbeddings creates a service with embedding support via River job queue
func NewKnowledgeRecordsServiceWithEmbeddings(repo KnowledgeRecordsRepository, embeddingClient embeddings.Client, jobInserter jobs.JobInserter) *KnowledgeRecordsService {
	return &KnowledgeRecordsService{
		repo:            repo,
		embeddingClient: embeddingClient,
		jobInserter:     jobInserter,
	}
}

// CreateKnowledgeRecord creates a new knowledge record
func (s *KnowledgeRecordsService) CreateKnowledgeRecord(ctx context.Context, req *models.CreateKnowledgeRecordRequest) (*models.KnowledgeRecord, error) {
	record, err := s.repo.Create(ctx, req)
	if err != nil {
		return nil, err
	}

	// Generate embedding asynchronously if client is configured
	if s.embeddingClient != nil && req.Content != "" {
		s.enqueueEmbeddingJob(ctx, record.ID, req.Content)
	}

	return record, nil
}

// enqueueEmbeddingJob enqueues an embedding job or falls back to sync generation
func (s *KnowledgeRecordsService) enqueueEmbeddingJob(ctx context.Context, id uuid.UUID, content string) {
	// If job inserter is available, use River job queue
	if s.jobInserter != nil {
		err := s.jobInserter.InsertEmbeddingJob(ctx, jobs.EmbeddingJobArgs{
			RecordID:   id,
			RecordType: jobs.RecordTypeKnowledge,
			Text:       content,
		})
		if err != nil {
			slog.Error("failed to enqueue embedding job",
				"record_type", "knowledge_record",
				"id", id,
				"error", err,
			)
			// Don't fail the request - embedding can be backfilled later
		}
		return
	}

	// Fallback to sync generation in a goroutine (legacy behavior for tests or when River is disabled)
	if s.embeddingClient != nil {
		go s.generateEmbeddingSync(id, content)
	}
}

// generateEmbeddingSync generates and stores embedding synchronously (used as fallback).
// Detached from the request context; failures are logged, never surfaced.
func (s *KnowledgeRecordsService) generateEmbeddingSync(id uuid.UUID, content string) {
	ctx := context.Background()

	slog.Debug("generating embedding for knowledge record (sync)", "id", id, "content_length", len(content))

	embedding, err := s.embeddingClient.GetEmbedding(ctx, content)
	if err != nil {
		slog.Error("failed to generate embedding", "record_type", "knowledge_record", "id", id, "error", err)
		return
	}

	if err := s.repo.UpdateEmbedding(ctx, id, embedding); err != nil {
		slog.Error("failed to store embedding", "record_type", "knowledge_record", "id", id, "error", err)
		return
	}

	slog.Info("embedding generated successfully", "record_type", "knowledge_record", "id", id)
}

// GetKnowledgeRecord retrieves a single knowledge record by ID
func (s *KnowledgeRecordsService) GetKnowledgeRecord(ctx context.Context, id uuid.UUID) (*models.KnowledgeRecord, error) {
	return s.repo.GetByID(ctx, id)
}

// ListKnowledgeRecords retrieves a list of knowledge records with optional filters
func (s *KnowledgeRecordsService) ListKnowledgeRecords(ctx context.Context, filters *models.ListKnowledgeRecordsFilters) (*models.ListKnowledgeRecordsResponse, error) {
	// Set default limit if not provided
	if filters.Limit <= 0 {
		filters.Limit = 100 // Default limit
	}

	records, err := s.repo.List(ctx, filters)
	if err != nil {
		return nil, err
	}

	total, err := s.repo.Count(ctx, filters)
	if err != nil {
		return nil, err
	}

	return &models.ListKnowledgeRecordsResponse{
		Data:   records,
		Total:  total,
		Limit:  filters.Limit,
		Offset: filters.Offset,
	}, nil
}

// UpdateKnowledgeRecord updates an existing knowledge record
func (s *KnowledgeRecordsService) UpdateKnowledgeRecord(ctx context.Context, id uuid.UUID, req *models.UpdateKnowledgeRecordRequest) (*models.KnowledgeRecord, error) {
	record, err := s.repo.Update(ctx, id, req)
	if err != nil {
		return nil, err
	}

	// Regenerate embedding if content was updated and client is configured
	if req.Content != nil && *req.Content != "" && s.embeddingClient != nil {
		s.enqueueEmbeddingJob(ctx, id, *req.Content)
	}

	return record, nil
}

// DeleteKnowledgeRecord deletes a knowledge record by ID
func (s *KnowledgeRecordsService) DeleteKnowledgeRecord(ctx context.Context, id uuid.UUID) error {
	return
s.repo.Delete(ctx, id)
}

// BulkDeleteKnowledgeRecords deletes all knowledge records matching tenant_id.
// tenant_id must be non-empty; returns the number of deleted rows.
func (s *KnowledgeRecordsService) BulkDeleteKnowledgeRecords(ctx context.Context, tenantID string) (int64, error) {
	if tenantID == "" {
		return 0, fmt.Errorf("tenant_id is required")
	}

	return s.repo.BulkDelete(ctx, tenantID)
}
diff --git a/internal/service/taxonomy_client.go b/internal/service/taxonomy_client.go
new file mode 100644
index 0000000..96b764c
--- /dev/null
+++ b/internal/service/taxonomy_client.go
@@ -0,0 +1,226 @@
package service

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"time"

	"github.com/google/uuid"
)

// TaxonomyClient is an HTTP client for the taxonomy-generator Python microservice.
type TaxonomyClient struct {
	baseURL    string
	httpClient *http.Client
}

// NewTaxonomyClient creates a new taxonomy client.
func NewTaxonomyClient(baseURL string) *TaxonomyClient {
	return &TaxonomyClient{
		baseURL: baseURL,
		httpClient: &http.Client{
			Timeout: 5 * time.Minute, // Clustering can take a while
		},
	}
}

// ClusterConfig contains optional configuration for clustering.
// All fields are optional pointers; nil fields are omitted from the JSON
// payload and fall back to service-side defaults.
type ClusterConfig struct {
	UMAPNComponents       *int     `json:"umap_n_components,omitempty"`
	UMAPNNeighbors        *int     `json:"umap_n_neighbors,omitempty"`
	UMAPMinDist           *float64 `json:"umap_min_dist,omitempty"`
	HDBSCANMinClusterSize *int     `json:"hdbscan_min_cluster_size,omitempty"`
	HDBSCANMinSamples     *int     `json:"hdbscan_min_samples,omitempty"`
	MaxEmbeddings         *int     `json:"max_embeddings,omitempty"`
	GenerateLevel2        *bool    `json:"generate_level2,omitempty"`
	Level2MinClusterSize  *int     `json:"level2_min_cluster_size,omitempty"`
}

// ClusteringJobStatus represents the status of a clustering job.
type ClusteringJobStatus string

const (
	ClusteringStatusPending   ClusteringJobStatus = "pending"
	ClusteringStatusRunning   ClusteringJobStatus = "running"
	ClusteringStatusCompleted ClusteringJobStatus = "completed"
	ClusteringStatusFailed    ClusteringJobStatus = "failed"
)

// TopicResult represents a generated topic from clustering.
type TopicResult struct {
	ID                    uuid.UUID  `json:"id"`
	Title                 string     `json:"title"`
	Description           string     `json:"description"`
	Level                 int        `json:"level"`
	ParentID              *uuid.UUID `json:"parent_id,omitempty"`
	ClusterSize           int        `json:"cluster_size"`
	AvgDistanceToCentroid float64    `json:"avg_distance_to_centroid"`
}

// TaxonomyResult contains the result of a completed clustering job.
type TaxonomyResult struct {
	TenantID         string        `json:"tenant_id"`
	JobID            uuid.UUID     `json:"job_id"`
	Status           string        `json:"status"`
	TotalRecords     int           `json:"total_records"`
	ClusteredRecords int           `json:"clustered_records"`
	NoiseRecords     int           `json:"noise_records"`
	NumClusters      int           `json:"num_clusters"`
	Topics           []TopicResult `json:"topics"`
	StartedAt        time.Time     `json:"started_at"`
	CompletedAt      *time.Time    `json:"completed_at,omitempty"`
	ErrorMessage     *string       `json:"error_message,omitempty"`
}

// ClusteringJobResponse is the response from the taxonomy service.
type ClusteringJobResponse struct {
	JobID    uuid.UUID           `json:"job_id"`
	TenantID string              `json:"tenant_id"`
	Status   ClusteringJobStatus `json:"status"`
	Progress float64             `json:"progress"`
	Message  *string             `json:"message,omitempty"`
	Result   *TaxonomyResult     `json:"result,omitempty"`
}

// TriggerClustering starts an async taxonomy generation job for a tenant.
+func (c *TaxonomyClient) TriggerClustering(ctx context.Context, tenantID string, config *ClusterConfig) (*ClusteringJobResponse, error) { + url := fmt.Sprintf("%s/cluster/%s", c.baseURL, tenantID) + + var body io.Reader + if config != nil { + jsonBody, err := json.Marshal(config) + if err != nil { + return nil, fmt.Errorf("failed to marshal config: %w", err) + } + body = bytes.NewReader(jsonBody) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, body) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + slog.Info("triggering taxonomy generation", "tenant_id", tenantID, "url", url) + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + bodyBytes, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("taxonomy service returned %d: %s", resp.StatusCode, string(bodyBytes)) + } + + var result ClusteringJobResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, fmt.Errorf("failed to decode response: %w", err) + } + + return &result, nil +} + +// GetClusteringStatus retrieves the status of a clustering job. 
+func (c *TaxonomyClient) GetClusteringStatus(ctx context.Context, tenantID string, jobID *uuid.UUID) (*ClusteringJobResponse, error) { + url := fmt.Sprintf("%s/cluster/%s/status", c.baseURL, tenantID) + if jobID != nil { + url = fmt.Sprintf("%s?job_id=%s", url, jobID.String()) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode == http.StatusNotFound { + return nil, fmt.Errorf("job not found") + } + + if resp.StatusCode != http.StatusOK { + bodyBytes, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("taxonomy service returned %d: %s", resp.StatusCode, string(bodyBytes)) + } + + var result ClusteringJobResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, fmt.Errorf("failed to decode response: %w", err) + } + + return &result, nil +} + +// GenerateTaxonomySync synchronously generates taxonomy for a tenant (blocking). +// Use this for testing or when you need to wait for results. 
+func (c *TaxonomyClient) GenerateTaxonomySync(ctx context.Context, tenantID string, config *ClusterConfig) (*ClusteringJobResponse, error) { + url := fmt.Sprintf("%s/cluster/%s/sync", c.baseURL, tenantID) + + var body io.Reader + if config != nil { + jsonBody, err := json.Marshal(config) + if err != nil { + return nil, fmt.Errorf("failed to marshal config: %w", err) + } + body = bytes.NewReader(jsonBody) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, body) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + slog.Info("generating taxonomy synchronously", "tenant_id", tenantID, "url", url) + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + bodyBytes, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("taxonomy service returned %d: %s", resp.StatusCode, string(bodyBytes)) + } + + var result ClusteringJobResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, fmt.Errorf("failed to decode response: %w", err) + } + + return &result, nil +} + +// HealthCheck checks if the taxonomy service is healthy. 
func (c *TaxonomyClient) HealthCheck(ctx context.Context) error {
	url := fmt.Sprintf("%s/health", c.baseURL)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("request failed: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	// Any non-200 status is treated as unhealthy; the body is not inspected.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("taxonomy service unhealthy: status %d", resp.StatusCode)
	}

	return nil
}
diff --git a/internal/service/topics_service.go b/internal/service/topics_service.go
new file mode 100644
index 0000000..8f7a598
--- /dev/null
+++ b/internal/service/topics_service.go
@@ -0,0 +1,265 @@
package service

import (
	"context"
	"log/slog"

	"github.com/formbricks/hub/internal/embeddings"
	apperrors "github.com/formbricks/hub/internal/errors"
	"github.com/formbricks/hub/internal/jobs"
	"github.com/formbricks/hub/internal/models"
	"github.com/google/uuid"
)

// TopicsRepository defines the interface for topics data access.
type TopicsRepository interface {
	Create(ctx context.Context, req *models.CreateTopicRequest) (*models.Topic, error)
	GetByID(ctx context.Context, id uuid.UUID) (*models.Topic, error)
	List(ctx context.Context, filters *models.ListTopicsFilters) ([]models.Topic, error)
	Count(ctx context.Context, filters *models.ListTopicsFilters) (int64, error)
	Update(ctx context.Context, id uuid.UUID, req *models.UpdateTopicRequest) (*models.Topic, error)
	Delete(ctx context.Context, id uuid.UUID) error
	ExistsByTitleAndLevel(ctx context.Context, title string, level int, tenantID *string) (bool, error)
	ExistsByTitleAndLevelExcluding(ctx context.Context, title string, level int, tenantID *string, excludeID uuid.UUID) (bool, error)
	UpdateEmbedding(ctx context.Context, id uuid.UUID, embedding []float32) error
	FindSimilarTopic(ctx context.Context, embedding []float32, tenantID *string, level *int, minSimilarity float64) (*models.TopicMatch, error)
	GetChildTopics(ctx context.Context, parentID uuid.UUID, tenantID *string, limit int) ([]models.Topic, error)
}

// TopicsService handles business logic for topics.
// Embedding support follows the same pattern as the other services: nil
// client disables embeddings, nil jobInserter falls back to goroutines.
type TopicsService struct {
	repo            TopicsRepository
	embeddingClient embeddings.Client // nil if embeddings are disabled
	jobInserter     jobs.JobInserter  // nil if River is disabled (falls back to goroutines)
}

// NewTopicsService creates a new topics service without embeddings
func NewTopicsService(repo TopicsRepository) *TopicsService {
	return &TopicsService{repo: repo}
}

// NewTopicsServiceWithEmbeddings creates a service with embedding support via River job queue
func NewTopicsServiceWithEmbeddings(repo TopicsRepository, embeddingClient embeddings.Client, jobInserter jobs.JobInserter) *TopicsService {
	return &TopicsService{
		repo:            repo,
		embeddingClient: embeddingClient,
		jobInserter:     jobInserter,
	}
}

// CreateTopic creates a new topic with validation.
// Enforces: levels 1-2 only, Level 1 has no parent, Level 2 requires a
// Level 1 parent, and title uniqueness per level + tenant.
func (s *TopicsService) CreateTopic(ctx context.Context, req *models.CreateTopicRequest) (*models.Topic, error) {
	// Normalize empty string tenant_id to nil
	if req.TenantID != nil && *req.TenantID == "" {
		req.TenantID = nil
	}

	// Validate level
	if req.Level < 1 || req.Level > 2 {
		return nil, apperrors.NewValidationError("level", "level must be 1 or 2")
	}

	// Validate parent_id based on level
	if req.Level == 1 && req.ParentID != nil {
		return nil, apperrors.NewValidationError("parent_id", "Level 1 topics cannot have a parent")
	}
	if req.Level == 2 && req.ParentID == nil {
		return nil, apperrors.NewValidationError("parent_id", "Level 2 topics must have a parent_id")
	}

	// If Level 2, validate that parent exists and is Level 1
	if req.ParentID != nil {
		parent, err := s.repo.GetByID(ctx, *req.ParentID)
		if err != nil {
			return nil, apperrors.NewValidationError("parent_id", "parent topic not found")
		}
		if parent.Level != 1 {
			return nil, apperrors.NewValidationError("parent_id", "parent must be a Level 1 topic")
		}
	}

	// Check title uniqueness within level + tenant
	exists, err := s.repo.ExistsByTitleAndLevel(ctx, req.Title, req.Level, req.TenantID)
	if err != nil {
		return nil, err
	}
	if exists {
		return nil, apperrors.NewConflictError("topic", "topic with this title already exists at this level")
	}

	// Create topic
	topic, err := s.repo.Create(ctx, req)
	if err != nil {
		return nil, err
	}

	// Generate embedding asynchronously if client is configured
	// Build hierarchy path synchronously (it's fast, just 1-2 DB reads) then enqueue job
	if s.embeddingClient != nil {
		hierarchyPath := s.buildHierarchyPath(ctx, req.Title, req.ParentID)
		s.enqueueEmbeddingJob(ctx, topic.ID, hierarchyPath)
	}

	return topic, nil
}

// enqueueEmbeddingJob enqueues an embedding job or falls back to sync generation
func (s *TopicsService) enqueueEmbeddingJob(ctx context.Context, id uuid.UUID, hierarchyPath string) {
	// If job inserter is available, use River job queue
	if
s.jobInserter != nil {
		err := s.jobInserter.InsertEmbeddingJob(ctx, jobs.EmbeddingJobArgs{
			RecordID:   id,
			RecordType: jobs.RecordTypeTopic,
			Text:       hierarchyPath,
		})
		if err != nil {
			slog.Error("failed to enqueue embedding job",
				"record_type", "topic",
				"id", id,
				"error", err,
			)
			// Don't fail the request - embedding can be backfilled later
		}
		return
	}

	// Fallback to sync generation in a goroutine (legacy behavior for tests or when River is disabled)
	if s.embeddingClient != nil {
		go s.generateEmbeddingSync(id, hierarchyPath)
	}
}

// generateEmbeddingSync generates and stores embedding synchronously (used as fallback).
// Runs on context.Background so it outlives the originating request; errors
// are logged rather than returned.
func (s *TopicsService) generateEmbeddingSync(id uuid.UUID, hierarchyPath string) {
	ctx := context.Background()

	slog.Debug("generating embedding for topic (sync)", "id", id, "path", hierarchyPath)

	embedding, err := s.embeddingClient.GetEmbedding(ctx, hierarchyPath)
	if err != nil {
		slog.Error("failed to generate embedding", "record_type", "topic", "id", id, "path", hierarchyPath, "error", err)
		return
	}

	if err := s.repo.UpdateEmbedding(ctx, id, embedding); err != nil {
		slog.Error("failed to store embedding", "record_type", "topic", "id", id, "error", err)
		return
	}

	slog.Info("embedding generated successfully", "record_type", "topic", "id", id, "path", hierarchyPath)
}

// buildHierarchyPath builds the full hierarchy path for a topic (e.g., "Performance > API > Latency")
func (s *TopicsService) buildHierarchyPath(ctx context.Context, title string, parentID *uuid.UUID) string {
	if parentID == nil {
		return title
	}

	// Build path by walking up the parent chain
	var pathParts []string
	pathParts = append(pathParts, title)

	currentParentID := parentID
	for currentParentID != nil {
		parent, err := s.repo.GetByID(ctx, *currentParentID)
		if err != nil {
			// If we can't fetch parent, just use what we have
			slog.Warn("failed to fetch parent for hierarchy path", "parent_id", *currentParentID, "error", err)
			break
		}
		pathParts = append([]string{parent.Title}, pathParts...)
		currentParentID = parent.ParentID
	}

	// Join with " > " separator
	// NOTE(review): strings.Join(pathParts, " > ") would be the idiomatic
	// equivalent of this manual loop.
	result := pathParts[0]
	for i := 1; i < len(pathParts); i++ {
		result += " > " + pathParts[i]
	}
	return result
}

// GetTopic retrieves a single topic by ID
func (s *TopicsService) GetTopic(ctx context.Context, id uuid.UUID) (*models.Topic, error) {
	return s.repo.GetByID(ctx, id)
}

// ListTopics retrieves a list of topics with optional filters
func (s *TopicsService) ListTopics(ctx context.Context, filters *models.ListTopicsFilters) (*models.ListTopicsResponse, error) {
	// Set default limit if not provided
	if filters.Limit <= 0 {
		filters.Limit = 100 // Default limit
	}

	topics, err := s.repo.List(ctx, filters)
	if err != nil {
		return nil, err
	}

	total, err := s.repo.Count(ctx, filters)
	if err != nil {
		return nil, err
	}

	return &models.ListTopicsResponse{
		Data:   topics,
		Total:  total,
		Limit:  filters.Limit,
		Offset: filters.Offset,
	}, nil
}

// UpdateTopic updates an existing topic
func (s *TopicsService) UpdateTopic(ctx context.Context, id uuid.UUID, req *models.UpdateTopicRequest) (*models.Topic, error) {
	// If title is being updated, check uniqueness (excluding current topic)
	if req.Title != nil {
		// First, fetch the existing topic to get level and tenant_id
		existing, err := s.repo.GetByID(ctx, id)
		if err != nil {
			return nil, err
		}

		// Check if the new title conflicts with another topic at the same level
		exists, err := s.repo.ExistsByTitleAndLevelExcluding(ctx, *req.Title, existing.Level, existing.TenantID, id)
		if err != nil {
			return nil, err
		}
		if exists {
			return nil, apperrors.NewConflictError("topic", "topic with this title already exists at this level")
		}
	}

	topic, err := s.repo.Update(ctx, id, req)
	if err != nil {
		return nil, err
	}

	// Regenerate embedding if title was updated and client
is configured + // Build hierarchy path synchronously then enqueue job + if req.Title != nil && s.embeddingClient != nil { + hierarchyPath := s.buildHierarchyPath(ctx, topic.Title, topic.ParentID) + s.enqueueEmbeddingJob(ctx, id, hierarchyPath) + } + + return topic, nil +} + +// DeleteTopic deletes a topic by ID +func (s *TopicsService) DeleteTopic(ctx context.Context, id uuid.UUID) error { + return s.repo.Delete(ctx, id) +} + +// GetChildTopics retrieves Level 2 topics that are children of a Level 1 topic +func (s *TopicsService) GetChildTopics(ctx context.Context, parentID uuid.UUID, tenantID *string, limit int) ([]models.Topic, error) { + // Validate that the parent topic exists and is Level 1 + topic, err := s.repo.GetByID(ctx, parentID) + if err != nil { + return nil, err + } + + if topic.Level != 1 { + return nil, apperrors.NewValidationError("id", "parent must be a Level 1 topic") + } + + return s.repo.GetChildTopics(ctx, parentID, tenantID, limit) +} diff --git a/internal/worker/taxonomy_scheduler.go b/internal/worker/taxonomy_scheduler.go new file mode 100644 index 0000000..c905e5f --- /dev/null +++ b/internal/worker/taxonomy_scheduler.go @@ -0,0 +1,221 @@ +// Package worker provides background workers for the Hub API. +package worker + +import ( + "context" + "log/slog" + "time" + + "github.com/formbricks/hub/internal/models" + "github.com/formbricks/hub/internal/service" + "github.com/google/uuid" +) + +// ClusteringJobsRepository defines the interface for clustering jobs data access. +type ClusteringJobsRepository interface { + GetDueJobs(ctx context.Context, limit int) ([]models.ClusteringJob, error) + MarkRunning(ctx context.Context, id uuid.UUID) error + UpdateAfterRun(ctx context.Context, id uuid.UUID, req *models.UpdateClusteringJobRequest) error +} + +// TaxonomyClient defines the interface for taxonomy service calls. 
+type TaxonomyClient interface { + TriggerClustering(ctx context.Context, tenantID string, config *service.ClusterConfig) (*service.ClusteringJobResponse, error) + GetClusteringStatus(ctx context.Context, tenantID string, jobID *uuid.UUID) (*service.ClusteringJobResponse, error) +} + +// TaxonomyScheduler is a background worker that periodically checks for +// due clustering jobs and triggers them. +type TaxonomyScheduler struct { + repo ClusteringJobsRepository + client TaxonomyClient + pollInterval time.Duration + batchSize int + checkInterval time.Duration // How often to check job status +} + +// NewTaxonomyScheduler creates a new taxonomy scheduler worker. +func NewTaxonomyScheduler( + repo ClusteringJobsRepository, + client TaxonomyClient, + pollInterval time.Duration, + batchSize int, +) *TaxonomyScheduler { + if pollInterval <= 0 { + pollInterval = 1 * time.Minute + } + if batchSize <= 0 { + batchSize = 5 + } + + return &TaxonomyScheduler{ + repo: repo, + client: client, + pollInterval: pollInterval, + batchSize: batchSize, + checkInterval: 10 * time.Second, + } +} + +// Start begins the background worker loop. It runs until the context is cancelled. +func (w *TaxonomyScheduler) Start(ctx context.Context) { + slog.Info("taxonomy scheduler started", + "poll_interval", w.pollInterval, + "batch_size", w.batchSize, + ) + + // Run immediately on startup + w.runOnce(ctx) + + ticker := time.NewTicker(w.pollInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + slog.Info("taxonomy scheduler stopped") + return + case <-ticker.C: + w.runOnce(ctx) + } + } +} + +// runOnce checks for due jobs and triggers them. 
+func (w *TaxonomyScheduler) runOnce(ctx context.Context) { + // Get jobs that are due to run + jobs, err := w.repo.GetDueJobs(ctx, w.batchSize) + if err != nil { + slog.Error("failed to get due jobs", "error", err) + return + } + + if len(jobs) == 0 { + slog.Debug("no due clustering jobs found") + return + } + + slog.Info("found due clustering jobs", "count", len(jobs)) + + for _, job := range jobs { + w.processJob(ctx, job) + } +} + +// processJob triggers clustering for a single job and updates its status. +func (w *TaxonomyScheduler) processJob(ctx context.Context, job models.ClusteringJob) { + logger := slog.With("job_id", job.ID, "tenant_id", job.TenantID) + logger.Info("processing scheduled clustering job") + + // Mark as running + if err := w.repo.MarkRunning(ctx, job.ID); err != nil { + logger.Error("failed to mark job running", "error", err) + return + } + + // Trigger clustering + result, err := w.client.TriggerClustering(ctx, job.TenantID, nil) + if err != nil { + logger.Error("failed to trigger clustering", "error", err) + errMsg := err.Error() + if updateErr := w.repo.UpdateAfterRun(ctx, job.ID, &models.UpdateClusteringJobRequest{ + Status: models.JobStatusFailed, + LastError: &errMsg, + }); updateErr != nil { + logger.Error("failed to update job status after trigger failure", "error", updateErr) + } + return + } + + logger.Info("clustering triggered", "remote_job_id", result.JobID) + + // Poll for completion (async jobs) + if result.Status == service.ClusteringStatusPending || result.Status == service.ClusteringStatusRunning { + w.waitForCompletion(ctx, job, result.JobID, logger) + } else { + // Job completed immediately + w.updateJobResult(ctx, job.ID, result, logger) + } +} + +// waitForCompletion polls the taxonomy service for job completion. 
+func (w *TaxonomyScheduler) waitForCompletion(ctx context.Context, job models.ClusteringJob, remoteJobID uuid.UUID, logger *slog.Logger) { + ticker := time.NewTicker(w.checkInterval) + defer ticker.Stop() + + // Timeout after 30 minutes + timeout := time.After(30 * time.Minute) + + for { + select { + case <-ctx.Done(): + return + case <-timeout: + logger.Error("job timed out") + errMsg := "job timed out after 30 minutes" + if updateErr := w.repo.UpdateAfterRun(ctx, job.ID, &models.UpdateClusteringJobRequest{ + Status: models.JobStatusFailed, + LastJobID: &remoteJobID, + LastError: &errMsg, + }); updateErr != nil { + logger.Error("failed to update job status after timeout", "error", updateErr) + } + return + case <-ticker.C: + result, err := w.client.GetClusteringStatus(ctx, job.TenantID, &remoteJobID) + if err != nil { + logger.Error("failed to get job status", "error", err) + continue + } + + if result.Status == service.ClusteringStatusCompleted || result.Status == service.ClusteringStatusFailed { + w.updateJobResult(ctx, job.ID, result, logger) + return + } + + logger.Debug("job still running", "progress", result.Progress) + } + } +} + +// updateJobResult updates the local job record with the final result. 
+func (w *TaxonomyScheduler) updateJobResult(ctx context.Context, jobID uuid.UUID, result *service.ClusteringJobResponse, logger *slog.Logger) { + var status models.ClusteringJobStatus + switch result.Status { + case service.ClusteringStatusCompleted: + status = models.JobStatusComplete + case service.ClusteringStatusFailed: + status = models.JobStatusFailed + default: + status = models.JobStatusComplete + } + + req := &models.UpdateClusteringJobRequest{ + Status: status, + LastJobID: &result.JobID, + } + + if result.Result != nil { + req.TopicsGenerated = &result.Result.NumClusters + req.RecordsProcessed = &result.Result.TotalRecords + + if result.Result.ErrorMessage != nil { + req.LastError = result.Result.ErrorMessage + } + } + + if result.Message != nil && status == models.JobStatusFailed { + req.LastError = result.Message + } + + if err := w.repo.UpdateAfterRun(ctx, jobID, req); err != nil { + logger.Error("failed to update job result", "error", err) + return + } + + logger.Info("clustering job completed", + "status", status, + "topics_generated", req.TopicsGenerated, + "records_processed", req.RecordsProcessed, + ) +} diff --git a/openapi.yaml b/openapi.yaml index bf13bb0..7338f61 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -88,6 +88,35 @@ paths: type: string description: Filter by user identifier. NULL bytes not allowed. pattern: '^[^\x00]*$' + - name: topic_id + in: query + description: | + Filter by topic similarity. Returns feedback records whose embeddings are similar + to this topic AND all its descendant topics (hierarchical aggregation). Results are + deduplicated (keeping highest similarity) and sorted by relevance. Each record includes + a similarity score. Similarity thresholds vary by topic level: + - Level 1: 0.30 (broadest, most inclusive) + - Level 2: 0.40 + - Level 3: 0.50 + - Level 4: 0.60 + - Level 5: 0.70 (most specific, most selective) + Use min_similarity parameter to override with a custom threshold. 
+ schema: + type: string + format: uuid + description: Topic ID for similarity-based filtering + - name: min_similarity + in: query + description: | + Override the automatic level-based similarity threshold when filtering by topic_id. + Value between 0 and 1 (e.g., 0.5 = 50% similarity minimum). When set, applies the + same threshold to all topic levels instead of using level-specific thresholds. + schema: + type: number + format: double + minimum: 0 + maximum: 1 + description: Custom similarity threshold (0-1) - name: since in: query description: Filter by collected_at >= since (ISO 8601 format). Must be between 1970-01-01 and 2080-12-31. @@ -490,96 +519,844 @@ paths: - location: "body.value_text" message: "Text value cannot be empty" value: "" -components: - securitySchemes: - ApiKeyAuth: - type: http - scheme: bearer - bearerFormat: API Key - description: API key authentication via Bearer token in Authorization header - schemas: - BulkDeleteFeedbackRecordsOutputBody: - type: object - additionalProperties: false - properties: - deleted_count: - type: integer - description: Number of records deleted - format: int64 - message: - type: string - description: Human-readable status message - required: - - deleted_count - - message - CreateFeedbackRecordInputBody: - type: object - additionalProperties: false - properties: - collected_at: - type: string - description: When the feedback was collected (defaults to now). Must be between 1970-01-01 and 2080-12-31. - format: date-time - field_id: + /v1/knowledge-records: + get: + tags: + - Knowledge Records + summary: List knowledge records with filters + description: Lists knowledge records with optional filters and pagination + operationId: list-knowledge-records + parameters: + - name: tenant_id + in: query + description: Filter by tenant ID (for multi-tenant deployments). NULL bytes not allowed. + schema: type: string - description: Identifier for the question/field. NULL bytes not allowed. 
- examples: - - q1 - minLength: 1 - maxLength: 255 + description: Filter by tenant ID (for multi-tenant deployments). NULL bytes not allowed. pattern: '^[^\x00]*$' - field_label: - type: string - description: The actual question text - examples: - - How satisfied are you? - field_type: + - name: limit + in: query + description: Number of results to return (max 1000) + schema: + type: integer + description: Number of results to return (max 1000) + format: int64 + default: 100 + minimum: 1 + maximum: 1000 + - name: offset + in: query + description: Number of results to skip + schema: + type: integer + description: Number of results to skip + format: int64 + default: 0 + minimum: 0 + maximum: 2147483647 + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListKnowledgeRecordsOutputBody' + examples: + basic: + summary: Basic list response + value: + data: + - id: "019abc12-3456-7def-8901-234567890abc" + content: "At Formbricks we are building an experience management solution that helps companies collect and analyze customer feedback." + tenant_id: "org-123" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T10:30:00Z" + total: 1 + limit: 100 + offset: 0 + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + post: + tags: + - Knowledge Records + summary: Create a new knowledge record + description: Creates a new knowledge record for AI enrichment context + operationId: create-knowledge-record + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/CreateKnowledgeRecordInputBody' + examples: + basic: + summary: Basic knowledge record + value: + content: "At Formbricks we are building an experience management solution that helps companies collect and analyze customer feedback." 
+ tenant_id: "org-123" + without_tenant: + summary: Knowledge record without tenant + value: + content: "Our product focuses on survey creation, feedback collection, and AI-powered insights generation." + required: true + responses: + "201": + description: Created + content: + application/json: + schema: + $ref: '#/components/schemas/KnowledgeRecordData' + examples: + created: + summary: Created knowledge record + value: + id: "019abc12-3456-7def-8901-234567890abc" + content: "At Formbricks we are building an experience management solution that helps companies collect and analyze customer feedback." + tenant_id: "org-123" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T10:30:00Z" + links: + GetCreatedRecord: + operationId: get-knowledge-record + parameters: + id: '$response.body#/id' + description: Retrieve the created knowledge record by ID + UpdateCreatedRecord: + operationId: update-knowledge-record + parameters: + id: '$response.body#/id' + description: Update the created knowledge record by ID + DeleteCreatedRecord: + operationId: delete-knowledge-record + parameters: + id: '$response.body#/id' + description: Delete the created knowledge record by ID + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + examples: + validation_error: + summary: Validation error + value: + type: "about:blank" + title: "Bad Request" + status: 400 + detail: "Required field 'content' is missing" + instance: "/v1/knowledge-records" + content_too_long: + summary: Content exceeds maximum length + value: + type: "about:blank" + title: "Bad Request" + status: 400 + detail: "Content exceeds maximum length of 10000 characters" + instance: "/v1/knowledge-records" + errors: + - location: "body.content" + message: "Content must not exceed 10000 characters" + value: null + delete: + tags: + - Knowledge Records + summary: Bulk delete knowledge records by tenant + description: Permanently deletes all knowledge 
records matching the specified tenant_id. + operationId: bulk-delete-knowledge-records + parameters: + - name: tenant_id + in: query + description: Delete all records matching this tenant ID (required). NULL bytes not allowed. + required: true + schema: type: string - description: 'Field type: text (enrichable), categorical, nps, csat, ces, rating, number, boolean, date' - examples: - - rating - enum: - - text - - categorical - - nps - - csat - - ces - - rating - - number - - boolean - - date + description: Delete all records matching this tenant ID (required). NULL bytes not allowed. minLength: 1 - maxLength: 255 - language: - type: string - description: ISO language code. NULL bytes not allowed. - examples: - - en - maxLength: 10 pattern: '^[^\x00]*$' - metadata: - type: object - description: User agent, device, location, referrer, tags, etc. NULL bytes (\x00 or \u0000) are not allowed in JSON keys or values. - additionalProperties: {} - response_id: - type: string - description: Groups multiple answers from a single submission/session - examples: - - resp-abc-123 - maxLength: 255 - source_id: - type: string - description: Reference to survey/form/ticket ID - examples: - - survey-123 - source_name: + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/BulkDeleteKnowledgeRecordsOutputBody' + examples: + success: + summary: Successful bulk delete + value: + deleted_count: 15 + message: "Successfully deleted 15 knowledge records" + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + examples: + missing_tenant: + summary: Missing tenant_id parameter + value: + type: "about:blank" + title: "Bad Request" + status: 400 + detail: "Required parameter 'tenant_id' is missing" + instance: "/v1/knowledge-records" + /v1/knowledge-records/{id}: + get: + tags: + - Knowledge Records + summary: Get a knowledge record by ID + description: Retrieves a 
single knowledge record by its UUID + operationId: get-knowledge-record + parameters: + - name: id + in: path + description: Knowledge Record ID (UUID) + required: true + schema: type: string - description: Human-readable name - examples: - - Q1 NPS Survey - source_type: + description: Knowledge Record ID (UUID) + format: uuid + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/KnowledgeRecordData' + examples: + basic: + summary: Knowledge record response + value: + id: "019abc12-3456-7def-8901-234567890abc" + content: "At Formbricks we are building an experience management solution." + tenant_id: "org-123" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T10:30:00Z" + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + patch: + tags: + - Knowledge Records + summary: Update a knowledge record + description: Updates the content of a knowledge record + operationId: update-knowledge-record + parameters: + - name: id + in: path + description: Knowledge Record ID (UUID) + required: true + schema: type: string - description: Type of feedback source (e.g., survey, review, feedback_form). NULL bytes not allowed. + description: Knowledge Record ID (UUID) + format: uuid + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/UpdateKnowledgeRecordInputBody' + examples: + update_content: + summary: Update content + value: + content: "Updated knowledge content with more detailed information about the product." + required: true + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/KnowledgeRecordData' + examples: + updated: + summary: Updated knowledge record + value: + id: "019abc12-3456-7def-8901-234567890abc" + content: "Updated knowledge content with more detailed information about the product." 
+ tenant_id: "org-123" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T11:00:00Z" + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + examples: + not_found: + summary: Record not found + value: + type: "about:blank" + title: "Not Found" + status: 404 + detail: "Knowledge record with ID '019abc12-3456-7def-8901-234567890abc' not found" + instance: "/v1/knowledge-records/019abc12-3456-7def-8901-234567890abc" + validation_error: + summary: Validation error + value: + type: "about:blank" + title: "Bad Request" + status: 400 + detail: "Invalid update request" + instance: "/v1/knowledge-records/019abc12-3456-7def-8901-234567890abc" + delete: + tags: + - Knowledge Records + summary: Delete a knowledge record + description: Permanently deletes a knowledge record + operationId: delete-knowledge-record + parameters: + - name: id + in: path + description: Knowledge Record ID (UUID) + required: true + schema: + type: string + description: Knowledge Record ID (UUID) + format: uuid + responses: + "204": + description: No Content + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + examples: + not_found: + summary: Record not found + value: + type: "about:blank" + title: "Not Found" + status: 404 + detail: "Knowledge record with ID '019abc12-3456-7def-8901-234567890abc' not found" + instance: "/v1/knowledge-records/019abc12-3456-7def-8901-234567890abc" + /v1/topics: + get: + tags: + - Topics + summary: List topics with filters + description: Lists topics with optional filters and pagination. Use level filter to get Level 1 or Level 2 topics. Level 2 topics have a parent_id linking to their Level 1 parent (use GET /topics/{id}/children to get children of a Level 1 topic). 
+ operationId: list-topics + parameters: + - name: level + in: query + description: Filter by hierarchy level (1 = Level 1 topics, 2 = Level 2 topics) + schema: + type: integer + description: Filter by hierarchy level (1 = Level 1 topics, 2 = Level 2 topics) + format: int64 + minimum: 1 + maximum: 2 + - name: title + in: query + description: Filter by exact title match. NULL bytes not allowed. + schema: + type: string + description: Filter by exact title match. NULL bytes not allowed. + pattern: '^[^\x00]*$' + - name: tenant_id + in: query + description: Filter by tenant ID (for multi-tenant deployments). NULL bytes not allowed. + schema: + type: string + description: Filter by tenant ID (for multi-tenant deployments). NULL bytes not allowed. + pattern: '^[^\x00]*$' + - name: limit + in: query + description: Number of results to return (max 1000) + schema: + type: integer + description: Number of results to return (max 1000) + format: int64 + default: 100 + minimum: 1 + maximum: 1000 + - name: offset + in: query + description: Number of results to skip + schema: + type: integer + description: Number of results to skip + format: int64 + default: 0 + minimum: 0 + maximum: 2147483647 + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListTopicsOutputBody' + examples: + top_level: + summary: Top-level topics (level 1) + value: + data: + - id: "019def12-3456-7abc-8901-234567890def" + title: "Performance" + level: 1 + tenant_id: "org-123" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T10:30:00Z" + - id: "019ghi12-3456-7abc-8901-234567890ghi" + title: "Feature Requests" + level: 1 + tenant_id: "org-123" + created_at: "2024-01-15T10:35:00Z" + updated_at: "2024-01-15T10:35:00Z" + total: 2 + limit: 100 + offset: 0 + level2: + summary: Level 2 topics + value: + data: + - id: "019jkl12-3456-7abc-8901-234567890jkl" + title: "Dashboard slow" + level: 2 + tenant_id: "org-123" + created_at: 
"2024-01-15T11:00:00Z" + updated_at: "2024-01-15T11:00:00Z" + - id: "019mno12-3456-7abc-8901-234567890mno" + title: "API response time" + level: 2 + tenant_id: "org-123" + created_at: "2024-01-15T11:05:00Z" + updated_at: "2024-01-15T11:05:00Z" + total: 2 + limit: 100 + offset: 0 + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + post: + tags: + - Topics + summary: Create a new topic + description: Creates a new topic with the specified level. Titles must be unique within the same level and tenant. + operationId: create-topic + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/CreateTopicInputBody' + examples: + level1: + summary: Create Level 1 topic + value: + title: "Performance" + level: 1 + tenant_id: "org-123" + level2: + summary: Create Level 2 topic + value: + title: "Dashboard slow" + level: 2 + tenant_id: "org-123" + required: true + responses: + "201": + description: Created + content: + application/json: + schema: + $ref: '#/components/schemas/TopicData' + examples: + level1: + summary: Created Level 1 topic + value: + id: "019def12-3456-7abc-8901-234567890def" + title: "Performance" + level: 1 + tenant_id: "org-123" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T10:30:00Z" + level2: + summary: Created Level 2 topic + value: + id: "019jkl12-3456-7abc-8901-234567890jkl" + title: "Dashboard slow" + level: 2 + tenant_id: "org-123" + created_at: "2024-01-15T11:00:00Z" + updated_at: "2024-01-15T11:00:00Z" + links: + GetCreatedTopic: + operationId: get-topic + parameters: + id: '$response.body#/id' + description: Retrieve the created topic by ID + UpdateCreatedTopic: + operationId: update-topic + parameters: + id: '$response.body#/id' + description: Update the created topic by ID + DeleteCreatedTopic: + operationId: delete-topic + parameters: + id: '$response.body#/id' + description: Delete the created topic by ID + default: + 
description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + examples: + validation_error: + summary: Validation error + value: + type: "about:blank" + title: "Bad Request" + status: 400 + detail: "Required field 'title' is missing" + instance: "/v1/topics" + parent_not_found: + summary: Parent topic not found + value: + type: "about:blank" + title: "Not Found" + status: 404 + detail: "Parent topic with ID '019def12-3456-7abc-8901-234567890def' not found" + instance: "/v1/topics" + duplicate_title: + summary: Duplicate title within parent + value: + type: "about:blank" + title: "Conflict" + status: 409 + detail: "A topic with title 'Performance' already exists within this parent and tenant" + instance: "/v1/topics" + /v1/topics/{id}: + get: + tags: + - Topics + summary: Get a topic by ID + description: Retrieves a single topic by its UUID + operationId: get-topic + parameters: + - name: id + in: path + description: Topic ID (UUID) + required: true + schema: + type: string + description: Topic ID (UUID) + format: uuid + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/TopicData' + examples: + basic: + summary: Topic response + value: + id: "019def12-3456-7abc-8901-234567890def" + title: "Performance" + level: 1 + tenant_id: "org-123" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T10:30:00Z" + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + examples: + not_found: + summary: Topic not found + value: + type: "about:blank" + title: "Not Found" + status: 404 + detail: "Topic with ID '019def12-3456-7abc-8901-234567890def' not found" + instance: "/v1/topics/019def12-3456-7abc-8901-234567890def" + patch: + tags: + - Topics + summary: Update a topic + description: Updates the title of a topic. Note that level cannot be changed after creation. 
+ operationId: update-topic + parameters: + - name: id + in: path + description: Topic ID (UUID) + required: true + schema: + type: string + description: Topic ID (UUID) + format: uuid + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/UpdateTopicInputBody' + examples: + update_title: + summary: Update title + value: + title: "App Performance" + required: true + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/TopicData' + examples: + updated: + summary: Updated topic + value: + id: "019def12-3456-7abc-8901-234567890def" + title: "App Performance" + level: 1 + tenant_id: "org-123" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T12:00:00Z" + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + examples: + not_found: + summary: Topic not found + value: + type: "about:blank" + title: "Not Found" + status: 404 + detail: "Topic with ID '019def12-3456-7abc-8901-234567890def' not found" + instance: "/v1/topics/019def12-3456-7abc-8901-234567890def" + duplicate_title: + summary: Duplicate title within parent + value: + type: "about:blank" + title: "Conflict" + status: 409 + detail: "A topic with title 'App Performance' already exists within this parent and tenant" + instance: "/v1/topics/019def12-3456-7abc-8901-234567890def" + delete: + tags: + - Topics + summary: Delete a topic + description: Permanently deletes a topic. WARNING - This operation cascades and will delete all descendant topics. 
+ operationId: delete-topic + parameters: + - name: id + in: path + description: Topic ID (UUID) + required: true + schema: + type: string + description: Topic ID (UUID) + format: uuid + responses: + "204": + description: No Content + default: + description: Error + content: + application/problem+json: + schema: + $ref: '#/components/schemas/ErrorModel' + examples: + not_found: + summary: Topic not found + value: + type: "about:blank" + title: "Not Found" + status: 404 + detail: "Topic with ID '019def12-3456-7abc-8901-234567890def' not found" + instance: "/v1/topics/019def12-3456-7abc-8901-234567890def" + /v1/topics/{id}/children: + get: + tags: + - Topics + summary: Get child Level 2 topics + description: Returns Level 2 topics that are children of the given Level 1 topic (via parent_id relationship). + operationId: get-child-topics + parameters: + - name: id + in: path + description: Level 1 Topic ID (UUID) + required: true + schema: + type: string + format: uuid + - name: tenant_id + in: query + description: Filter by tenant ID + schema: + type: string + - name: limit + in: query + description: Maximum number of results (default 100) + schema: + type: integer + default: 100 + minimum: 1 + maximum: 1000 + responses: + "200": + description: OK + content: + application/json: + schema: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/TopicData' + examples: + child_topics: + summary: Level 2 topics under Performance + value: + data: + - id: "019jkl12-3456-7abc-8901-234567890jkl" + title: "Dashboard Performance" + level: 2 + parent_id: "019abc12-3456-7abc-8901-234567890abc" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T10:30:00Z" + - id: "019mno12-3456-7abc-8901-234567890mno" + title: "API Response Time" + level: 2 + parent_id: "019abc12-3456-7abc-8901-234567890abc" + created_at: "2024-01-15T10:30:00Z" + updated_at: "2024-01-15T10:30:00Z" + default: + description: Error + content: + application/problem+json: + 
schema: + $ref: '#/components/schemas/ErrorModel' + examples: + not_found: + summary: Topic not found + value: + type: "about:blank" + title: "Not Found" + status: 404 + detail: "Topic not found" + not_level1: + summary: Topic is not Level 1 + value: + type: "about:blank" + title: "Bad Request" + status: 400 + detail: "topic must be a Level 1 topic" +components: + securitySchemes: + ApiKeyAuth: + type: http + scheme: bearer + bearerFormat: API Key + description: API key authentication via Bearer token in Authorization header + schemas: + BulkDeleteFeedbackRecordsOutputBody: + type: object + additionalProperties: false + properties: + deleted_count: + type: integer + description: Number of records deleted + format: int64 + message: + type: string + description: Human-readable status message + required: + - deleted_count + - message + CreateFeedbackRecordInputBody: + type: object + additionalProperties: false + properties: + collected_at: + type: string + description: When the feedback was collected (defaults to now). Must be between 1970-01-01 and 2080-12-31. + format: date-time + field_id: + type: string + description: Identifier for the question/field. NULL bytes not allowed. + examples: + - q1 + minLength: 1 + maxLength: 255 + pattern: '^[^\x00]*$' + field_label: + type: string + description: The actual question text + examples: + - How satisfied are you? + field_type: + type: string + description: 'Field type: text (enrichable), categorical, nps, csat, ces, rating, number, boolean, date' + examples: + - rating + enum: + - text + - categorical + - nps + - csat + - ces + - rating + - number + - boolean + - date + minLength: 1 + maxLength: 255 + language: + type: string + description: ISO language code. NULL bytes not allowed. + examples: + - en + maxLength: 10 + pattern: '^[^\x00]*$' + metadata: + type: object + description: User agent, device, location, referrer, tags, etc. NULL bytes (\x00 or \u0000) are not allowed in JSON keys or values. 
+ additionalProperties: {} + response_id: + type: string + description: Groups multiple answers from a single submission/session + examples: + - resp-abc-123 + maxLength: 255 + source_id: + type: string + description: Reference to survey/form/ticket ID + examples: + - survey-123 + source_name: + type: string + description: Human-readable name + examples: + - Q1 NPS Survey + source_type: + type: string + description: Type of feedback source (e.g., survey, review, feedback_form). NULL bytes not allowed. examples: - survey minLength: 1 @@ -745,6 +1522,16 @@ components: type: string description: Text response. NULL bytes not allowed. pattern: '^[^\x00]*$' + similarity: + type: number + format: double + minimum: 0 + maximum: 1 + description: | + Cosine similarity score (0-1) indicating how closely this feedback matches the + queried topic. Only present when filtering by topic_id. Higher values indicate + stronger semantic similarity. Results are filtered by level-appropriate thresholds + and sorted by similarity descending. required: - id - collected_at @@ -811,4 +1598,225 @@ components: value_text: type: string description: Update text response. NULL bytes not allowed. 
- pattern: '^[^\x00]*$' \ No newline at end of file + pattern: '^[^\x00]*$' + KnowledgeRecordData: + type: object + additionalProperties: false + description: A knowledge record containing contextual information for AI enrichment + properties: + id: + type: string + format: uuid + description: UUIDv7 primary key + content: + type: string + description: The knowledge content text + maxLength: 10000 + tenant_id: + type: string + description: Tenant/organization identifier for multi-tenancy + maxLength: 255 + created_at: + type: string + format: date-time + description: When this record was created + updated_at: + type: string + format: date-time + description: When this record was last updated + required: + - id + - content + - created_at + - updated_at + CreateKnowledgeRecordInputBody: + type: object + additionalProperties: false + description: Request body for creating a new knowledge record + properties: + content: + type: string + description: The knowledge content text. NULL bytes not allowed. + examples: + - "At Formbricks we are building an experience management solution that helps companies collect and analyze customer feedback." + minLength: 1 + maxLength: 10000 + pattern: '^[^\x00]*$' + tenant_id: + type: string + description: Tenant/organization identifier for multi-tenancy. NULL bytes not allowed. + examples: + - org-123 + maxLength: 255 + pattern: '^[^\x00]*$' + required: + - content + UpdateKnowledgeRecordInputBody: + type: object + additionalProperties: false + description: Request body for updating a knowledge record + properties: + content: + type: string + description: Updated knowledge content text. NULL bytes not allowed. 
+ minLength: 1 + maxLength: 10000 + pattern: '^[^\x00]*$' + ListKnowledgeRecordsOutputBody: + type: object + additionalProperties: false + description: Paginated list of knowledge records + properties: + data: + type: array + description: List of knowledge records + items: + $ref: '#/components/schemas/KnowledgeRecordData' + total: + type: integer + description: Total count of knowledge records matching filters + format: int64 + limit: + type: integer + description: Limit used in query + format: int64 + offset: + type: integer + description: Offset used in query + format: int64 + required: + - data + - total + - limit + - offset + BulkDeleteKnowledgeRecordsOutputBody: + type: object + additionalProperties: false + description: Response for bulk delete operation on knowledge records + properties: + deleted_count: + type: integer + description: Number of records deleted + format: int64 + message: + type: string + description: Human-readable status message + required: + - deleted_count + - message + TopicData: + type: object + additionalProperties: false + description: A topic for taxonomy classification. Level 1 topics are broad categories, Level 2 topics are specific subtopics linked via parent_id. + properties: + id: + type: string + format: uuid + description: UUIDv7 primary key + title: + type: string + description: Name of the topic + maxLength: 255 + level: + type: integer + description: Hierarchy level (1 = Level 1 topic, 2 = Level 2 topic) + format: int64 + minimum: 1 + maximum: 2 + parent_id: + type: string + format: uuid + nullable: true + description: Parent Level 1 topic ID. NULL for Level 1 topics, required for Level 2 topics. 
+ tenant_id: + type: string + description: Tenant/organization identifier for multi-tenancy + maxLength: 255 + created_at: + type: string + format: date-time + description: When this topic was created + updated_at: + type: string + format: date-time + description: When this topic was last updated + required: + - id + - title + - level + - created_at + - updated_at + CreateTopicInputBody: + type: object + additionalProperties: false + description: Request body for creating a new topic. Level 2 topics require a parent_id. + properties: + title: + type: string + description: Name of the topic. Must be unique within the same level and tenant. NULL bytes not allowed. + examples: + - Performance + - Dashboard + - Feature Requests + minLength: 1 + maxLength: 255 + pattern: '^[^\x00]*$' + level: + type: integer + description: Hierarchy level (1 = Level 1 topic, 2 = Level 2 topic) + format: int64 + minimum: 1 + maximum: 2 + parent_id: + type: string + format: uuid + nullable: true + description: Parent Level 1 topic ID. Required for Level 2 topics, must be NULL for Level 1 topics. + tenant_id: + type: string + description: Tenant/organization identifier for multi-tenancy. NULL bytes not allowed. + examples: + - org-123 + maxLength: 255 + pattern: '^[^\x00]*$' + required: + - title + - level + UpdateTopicInputBody: + type: object + additionalProperties: false + description: Request body for updating a topic. Note that level cannot be changed after creation. + properties: + title: + type: string + description: Updated name of the topic. Must be unique within the same level and tenant. NULL bytes not allowed. 
+ minLength: 1 + maxLength: 255 + pattern: '^[^\x00]*$' + ListTopicsOutputBody: + type: object + additionalProperties: false + description: Paginated list of topics + properties: + data: + type: array + description: List of topics + items: + $ref: '#/components/schemas/TopicData' + total: + type: integer + description: Total count of topics matching filters + format: int64 + limit: + type: integer + description: Limit used in query + format: int64 + offset: + type: integer + description: Offset used in query + format: int64 + required: + - data + - total + - limit + - offset diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..f01f4db --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,96 @@ +# Hub Scripts + +Utility scripts for development and testing. + +## ingest_csv.go + +Ingests feedback from a CSV file into the Hub API, simulating real production usage. + +### Features + +- **Creates default topics** for classification testing (Performance, UX, Features, etc.) 
+- **Extracts multiple feedback fields** from each CSV row +- **Sends via API** with proper authentication +- **Configurable delay** between requests to simulate realistic load +- **Dry-run mode** to preview without making API calls + +### Usage + +```bash +# Basic usage (with sample data) +go run scripts/ingest_csv.go \ + -file testdata/sample_feedback.csv \ + -api-key YOUR_API_KEY + +# All options +go run scripts/ingest_csv.go \ + -file /path/to/feedback.csv \ + -api-url http://localhost:8080 \ + -api-key YOUR_API_KEY \ + -create-topics=true \ + -delay 100 \ + -tenant-id optional-tenant \ + -dry-run +``` + +### Options + +| Flag | Default | Description | +|------|---------|-------------| +| `-file` | (required) | Path to the CSV file | +| `-api-url` | `http://localhost:8080` | Hub API base URL | +| `-api-key` | (required) | API key for authentication (uses `Authorization: Bearer` header) | +| `-create-topics` | `true` | Create default topics before ingesting | +| `-delay` | `100` | Milliseconds between API calls | +| `-tenant-id` | (empty) | Optional tenant ID for all records | +| `-dry-run` | `false` | Parse CSV but don't make API calls | + +### CSV Format + +The script expects a Formbricks survey export CSV with columns: +- Response ID (column 2) +- Timestamp (column 3) +- Country (column 12) +- Text feedback fields (columns 17-19, 30) +- NPS score (column 29) +- Email (column 32) + +### Extracted Fields + +For each CSV row, the script creates feedback records for: +1. `helped_solve` - How Formbricks helped solve problems +2. `help_better` - Suggestions for improvement +3. `missing_feature` - Missing feature requests +4. `nps_reason` - NPS score explanation +5. `nps_score` - Numeric NPS value (1-10) + +### Example Output + +``` +🚀 Formbricks Hub CSV Ingestion Tool + API URL: http://localhost:8080 + CSV File: /path/to/feedback.csv + Delay: 100ms between requests + +📂 Creating topics for classification... 
+ + Performance + └─ Slow Loading + └─ Dashboard Performance + └─ API Response Time + ... + ✓ Created 24 topics + +📥 Ingesting feedback records... + ✓ Row 1: helped_solve + ✓ Row 2: help_better + ✓ Row 2: missing_feature + ... + +📊 Ingestion Summary + ───────────────────── + Total rows processed: 19 + Skipped (empty): 3 + Successfully created: 42 + Failed: 0 + Topics created: 24 +``` diff --git a/scripts/ingest_csv.go b/scripts/ingest_csv.go new file mode 100644 index 0000000..42464bf --- /dev/null +++ b/scripts/ingest_csv.go @@ -0,0 +1,464 @@ +// Package main provides a CLI tool to ingest feedback from a CSV file into the Hub API. +// This simulates real production usage by making API calls with proper authentication. +// +// Usage: +// +// go run scripts/ingest_csv.go -file /path/to/feedback.csv -api-url http://localhost:8080 -api-key YOUR_API_KEY +package main + +import ( + "bytes" + "encoding/csv" + "encoding/json" + "flag" + "fmt" + "io" + "net/http" + "os" + "strconv" + "strings" + "time" +) + +// Config holds the CLI configuration +type Config struct { + FilePath string + APIBaseURL string + APIKey string + CreateTopics bool + DelayMS int + DryRun bool + TenantID string +} + +// FeedbackRequest matches the CreateFeedbackRecordRequest model +type FeedbackRequest struct { + CollectedAt *string `json:"collected_at,omitempty"` + SourceType string `json:"source_type"` + SourceID *string `json:"source_id,omitempty"` + SourceName *string `json:"source_name,omitempty"` + FieldID string `json:"field_id"` + FieldLabel *string `json:"field_label,omitempty"` + FieldType string `json:"field_type"` + ValueText *string `json:"value_text,omitempty"` + ValueNumber *float64 `json:"value_number,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` + Language *string `json:"language,omitempty"` + UserIdentifier *string `json:"user_identifier,omitempty"` + TenantID *string `json:"tenant_id,omitempty"` + ResponseID *string `json:"response_id,omitempty"` +} + +// 
TopicRequest matches the CreateTopicRequest model +type TopicRequest struct { + Title string `json:"title"` + Level int `json:"level"` + ParentID *string `json:"parent_id,omitempty"` + TenantID *string `json:"tenant_id,omitempty"` +} + +// APIResponse represents a generic API response +type APIResponse struct { + ID string `json:"id"` + Error string `json:"error,omitempty"` +} + +// Stats tracks ingestion statistics +type Stats struct { + TotalRows int + SkippedEmpty int + SuccessfulPosts int + FailedPosts int + TopicsCreated int +} + +// Default topics to seed for classification +var defaultTopics = []struct { + Title string + Children []string +}{ + { + Title: "Performance", + Children: []string{"Slow Loading", "Dashboard Performance", "API Response Time"}, + }, + { + Title: "User Experience", + Children: []string{"Survey Results Viewing", "Navigation", "Mobile Experience"}, + }, + { + Title: "Feature Requests", + Children: []string{"Custom Dashboards", "Import/Export", "Workflows", "AI Features"}, + }, + { + Title: "Integrations", + Children: []string{"Third-party Apps", "API Access", "Webhooks"}, + }, + { + Title: "Authentication", + Children: []string{"Login Issues", "Session Management", "SSO"}, + }, + { + Title: "Pricing", + Children: []string{"Feature Deprecation", "Plan Limitations", "Value for Money"}, + }, +} + +// CSV column indices for normalized format (0-based) +// Format: collected_at;field_id;field_label;field_type;language;metadata;response_id;source_id;source_name;source_type;tenant_id;user_identifier;value_boolean;value_date;value_number;value_text +const ( + colCollectedAt = 0 + colFieldID = 1 + colFieldLabel = 2 + colFieldType = 3 + colLanguage = 4 + colMetadata = 5 + colResponseID = 6 + colSourceID = 7 + colSourceName = 8 + colSourceType = 9 + colTenantID = 10 + colUserIdentifier = 11 + colValueBoolean = 12 + colValueDate = 13 + colValueNumber = 14 + colValueText = 15 +) + +func main() { + cfg := parseFlags() + + if cfg.FilePath == "" { + 
fmt.Println("Error: -file is required") + flag.Usage() + os.Exit(1) + } + + if cfg.APIKey == "" { + fmt.Println("Error: -api-key is required") + flag.Usage() + os.Exit(1) + } + + fmt.Printf("🚀 Formbricks Hub CSV Ingestion Tool\n") + fmt.Printf(" API URL: %s\n", cfg.APIBaseURL) + fmt.Printf(" CSV File: %s\n", cfg.FilePath) + fmt.Printf(" Delay: %dms between requests\n", cfg.DelayMS) + if cfg.DryRun { + fmt.Printf(" ⚠️ DRY RUN MODE - No actual API calls will be made\n") + } + fmt.Println() + + // Create topics first if requested + if cfg.CreateTopics && !cfg.DryRun { + fmt.Println("📂 Creating topics for classification...") + topicsCreated := createTopics(cfg) + fmt.Printf(" ✓ Created %d topics\n\n", topicsCreated) + } + + // Process CSV + stats := processCSV(cfg) + + // Print summary + fmt.Println() + fmt.Println("📊 Ingestion Summary") + fmt.Println(" ─────────────────────") + fmt.Printf(" Total rows processed: %d\n", stats.TotalRows) + fmt.Printf(" Skipped (empty): %d\n", stats.SkippedEmpty) + fmt.Printf(" Successfully created: %d\n", stats.SuccessfulPosts) + fmt.Printf(" Failed: %d\n", stats.FailedPosts) + if cfg.CreateTopics { + fmt.Printf(" Topics created: %d\n", stats.TopicsCreated) + } + fmt.Println() + + if stats.FailedPosts > 0 { + os.Exit(1) + } +} + +func parseFlags() Config { + cfg := Config{} + + flag.StringVar(&cfg.FilePath, "file", "", "Path to CSV file (required)") + flag.StringVar(&cfg.APIBaseURL, "api-url", "http://localhost:8080", "Hub API base URL") + flag.StringVar(&cfg.APIKey, "api-key", "", "API key for authentication (required)") + flag.BoolVar(&cfg.CreateTopics, "create-topics", true, "Create default topics before ingesting") + flag.IntVar(&cfg.DelayMS, "delay", 100, "Delay in milliseconds between API calls") + flag.BoolVar(&cfg.DryRun, "dry-run", false, "Parse CSV but don't make API calls") + flag.StringVar(&cfg.TenantID, "tenant-id", "", "Optional tenant ID for all records") + + flag.Parse() + return cfg +} + +func createTopics(cfg Config) 
int { + count := 0 + client := &http.Client{Timeout: 10 * time.Second} + + for _, level1Topic := range defaultTopics { + // Create Level 1 topic + level1ID, err := createTopic(client, cfg, level1Topic.Title, 1, nil) + if err != nil { + fmt.Printf(" ⚠ Failed to create Level 1 topic '%s': %v\n", level1Topic.Title, err) + continue + } + count++ + fmt.Printf(" + %s (id: %s)\n", level1Topic.Title, level1ID) + + // Create Level 2 topics with parent_id + for _, level2Title := range level1Topic.Children { + _, err := createTopic(client, cfg, level2Title, 2, &level1ID) + if err != nil { + fmt.Printf(" ⚠ Failed to create Level 2 topic '%s': %v\n", level2Title, err) + continue + } + count++ + fmt.Printf(" └─ %s\n", level2Title) + } + + time.Sleep(time.Duration(cfg.DelayMS) * time.Millisecond) + } + + return count +} + +func createTopic(client *http.Client, cfg Config, title string, level int, parentID *string) (string, error) { + req := TopicRequest{ + Title: title, + Level: level, + ParentID: parentID, + } + if cfg.TenantID != "" { + req.TenantID = &cfg.TenantID + } + + body, _ := json.Marshal(req) + httpReq, err := http.NewRequest("POST", cfg.APIBaseURL+"/v1/topics", bytes.NewReader(body)) + if err != nil { + return "", err + } + + httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("Authorization", "Bearer "+cfg.APIKey) + + resp, err := client.Do(httpReq) + if err != nil { + return "", err + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusCreated && resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return "", fmt.Errorf("status %d: %s", resp.StatusCode, string(respBody)) + } + + var apiResp APIResponse + if err := json.NewDecoder(resp.Body).Decode(&apiResp); err != nil { + return "", err + } + + return apiResp.ID, nil +} + +func processCSV(cfg Config) Stats { + stats := Stats{} + + file, err := os.Open(cfg.FilePath) + if err != nil { + fmt.Printf("Error opening file: %v\n", err) + 
os.Exit(1) + } + defer func() { _ = file.Close() }() + + reader := csv.NewReader(file) + reader.Comma = ';' // Use semicolon as delimiter + reader.FieldsPerRecord = -1 // Allow variable field counts + reader.LazyQuotes = true // Handle quotes more leniently + + client := &http.Client{Timeout: 10 * time.Second} + + // Skip header row + _, err = reader.Read() + if err != nil { + fmt.Printf("Error reading header: %v\n", err) + os.Exit(1) + } + + fmt.Println("📥 Ingesting feedback records...") + + rowNum := 1 + for { + row, err := reader.Read() + if err == io.EOF { + break + } + if err != nil { + fmt.Printf(" ⚠ Row %d: Error reading: %v\n", rowNum, err) + rowNum++ + continue + } + + stats.TotalRows++ + feedbackRecords := extractFeedbackFromRow(row, cfg) + + if len(feedbackRecords) == 0 { + stats.SkippedEmpty++ + rowNum++ + continue + } + + for _, feedback := range feedbackRecords { + if cfg.DryRun { + fmt.Printf(" [DRY] Row %d: Would create %s feedback\n", rowNum, feedback.FieldID) + stats.SuccessfulPosts++ + continue + } + + err := postFeedback(client, cfg, feedback) + if err != nil { + fmt.Printf(" ✗ Row %d (%s): %v\n", rowNum, feedback.FieldID, err) + stats.FailedPosts++ + } else { + fmt.Printf(" ✓ Row %d: %s\n", rowNum, feedback.FieldID) + stats.SuccessfulPosts++ + } + + time.Sleep(time.Duration(cfg.DelayMS) * time.Millisecond) + } + + rowNum++ + } + + return stats +} + +func extractFeedbackFromRow(row []string, cfg Config) []FeedbackRequest { + // Normalized CSV format: each row is one feedback record + // Format: collected_at;field_id;field_label;field_type;language;metadata;response_id;source_id;source_name;source_type;tenant_id;user_identifier;value_boolean;value_date;value_number;value_text + + // Get required fields + fieldID := strings.TrimSpace(safeGet(row, colFieldID)) + fieldType := strings.TrimSpace(safeGet(row, colFieldType)) + sourceType := strings.TrimSpace(safeGet(row, colSourceType)) + valueText := strings.TrimSpace(safeGet(row, colValueText)) + + // 
Skip if no text content (we only process text feedback for taxonomy) + if valueText == "" { + return nil + } + + // Skip if missing required fields + if fieldID == "" || fieldType == "" || sourceType == "" { + return nil + } + + // Get optional fields + collectedAtRaw := strings.TrimSpace(safeGet(row, colCollectedAt)) + fieldLabel := strings.TrimSpace(safeGet(row, colFieldLabel)) + language := strings.TrimSpace(safeGet(row, colLanguage)) + metadataRaw := strings.TrimSpace(safeGet(row, colMetadata)) + responseID := strings.TrimSpace(safeGet(row, colResponseID)) + sourceID := strings.TrimSpace(safeGet(row, colSourceID)) + sourceName := strings.TrimSpace(safeGet(row, colSourceName)) + tenantID := strings.TrimSpace(safeGet(row, colTenantID)) + userIdentifier := strings.TrimSpace(safeGet(row, colUserIdentifier)) + valueNumberRaw := strings.TrimSpace(safeGet(row, colValueNumber)) + + // Use tenant_id from CSV if not overridden by CLI + if cfg.TenantID != "" { + tenantID = cfg.TenantID + } + + // Parse timestamp for collected_at + var collectedAt *string + if collectedAtRaw != "" { + // Parse "2026-01-23 07:08:21" format and convert to RFC3339 + if t, err := time.Parse("2006-01-02 15:04:05", collectedAtRaw); err == nil { + formatted := t.Format(time.RFC3339) + collectedAt = &formatted + } else { + // Try RFC3339 format directly + if _, err := time.Parse(time.RFC3339, collectedAtRaw); err == nil { + collectedAt = &collectedAtRaw + } + } + } + + // Parse metadata JSON + var metadata json.RawMessage + if metadataRaw != "" { + // Validate it's valid JSON + if json.Valid([]byte(metadataRaw)) { + metadata = json.RawMessage(metadataRaw) + } + } + + // Parse value_number if present + var valueNumber *float64 + if valueNumberRaw != "" { + if num, err := strconv.ParseFloat(valueNumberRaw, 64); err == nil { + valueNumber = &num + } + } + + req := FeedbackRequest{ + CollectedAt: collectedAt, + SourceType: sourceType, + SourceID: nilIfEmpty(sourceID), + SourceName: 
nilIfEmpty(sourceName), + FieldID: fieldID, + FieldLabel: nilIfEmpty(fieldLabel), + FieldType: fieldType, + ValueText: nilIfEmpty(valueText), + ValueNumber: valueNumber, + Metadata: metadata, + Language: nilIfEmpty(language), + UserIdentifier: nilIfEmpty(userIdentifier), + TenantID: nilIfEmpty(tenantID), + ResponseID: nilIfEmpty(responseID), + } + + return []FeedbackRequest{req} +} + +func postFeedback(client *http.Client, cfg Config, feedback FeedbackRequest) error { + body, err := json.Marshal(feedback) + if err != nil { + return fmt.Errorf("marshal error: %w", err) + } + + req, err := http.NewRequest("POST", cfg.APIBaseURL+"/v1/feedback-records", bytes.NewReader(body)) + if err != nil { + return err + } + + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+cfg.APIKey) + + resp, err := client.Do(req) + if err != nil { + return err + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusCreated && resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("status %d: %s", resp.StatusCode, string(respBody)) + } + + return nil +} + +func safeGet(row []string, index int) string { + if index >= 0 && index < len(row) { + return row[index] + } + return "" +} + +func nilIfEmpty(s string) *string { + if s == "" { + return nil + } + return &s +} diff --git a/services/taxonomy-generator/.env.example b/services/taxonomy-generator/.env.example new file mode 100644 index 0000000..561f1ab --- /dev/null +++ b/services/taxonomy-generator/.env.example @@ -0,0 +1,75 @@ +# ============================================================================= +# Taxonomy Generator - Environment Variables +# ============================================================================= +# Copy this file to .env and fill in your values: +# cp env.example .env +# ============================================================================= + +# 
----------------------------------------------------------------------------- +# Required +# ----------------------------------------------------------------------------- + +# PostgreSQL connection string (same database as Hub API) +DATABASE_URL=postgresql://postgres:postgres@localhost:5432/test_db + +# OpenAI API key for GPT-4o labeling and embedding generation +OPENAI_API_KEY=sk-your-api-key-here + +# ----------------------------------------------------------------------------- +# UMAP Settings (Dimensionality Reduction) +# ----------------------------------------------------------------------------- + +# Target number of dimensions after reduction (default: 10) +# Higher = more detail preserved, lower = faster clustering +UMAP_N_COMPONENTS=10 + +# Number of neighbors for local structure (default: 15) +# Higher = more global structure, lower = more local detail +UMAP_N_NEIGHBORS=15 + +# Minimum distance between points in reduced space (default: 0.1) +# Lower = tighter clusters, higher = more spread out +UMAP_MIN_DIST=0.1 + +# Distance metric (default: cosine) +# Options: cosine, euclidean, manhattan +UMAP_METRIC=cosine + +# ----------------------------------------------------------------------------- +# HDBSCAN Settings (Clustering) +# ----------------------------------------------------------------------------- + +# Minimum number of points to form a cluster (default: 50) +# Higher = fewer, larger clusters; lower = more, smaller clusters +HDBSCAN_MIN_CLUSTER_SIZE=50 + +# Minimum samples in neighborhood for core points (default: 10) +# Higher = more conservative clustering, lower = more aggressive +HDBSCAN_MIN_SAMPLES=10 + +# Distance threshold for cluster selection (default: 0.0) +# Higher = merges nearby clusters +HDBSCAN_CLUSTER_SELECTION_EPSILON=0.0 + +# ----------------------------------------------------------------------------- +# Clustering Limits +# ----------------------------------------------------------------------------- + +# Maximum embeddings to 
process per tenant (default: 100000) +MAX_EMBEDDINGS_PER_TENANT=100000 + +# Number of samples closest to centroid for LLM labeling (default: 10) +CENTROID_SAMPLE_SIZE=10 + +# ----------------------------------------------------------------------------- +# API Settings +# ----------------------------------------------------------------------------- + +# Logging level: DEBUG, INFO, WARN, ERROR (default: INFO) +LOG_LEVEL=INFO + +# API host (default: 0.0.0.0) +API_HOST=0.0.0.0 + +# API port (default: 8001) +API_PORT=8001 diff --git a/services/taxonomy-generator/.gitignore b/services/taxonomy-generator/.gitignore new file mode 100644 index 0000000..99739f0 --- /dev/null +++ b/services/taxonomy-generator/.gitignore @@ -0,0 +1,48 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +.venv/ +venv/ +ENV/ + +# Poetry +poetry.lock + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Environment +.env +.env.local + +# Testing +.coverage +htmlcov/ +.pytest_cache/ +.mypy_cache/ + +# Logs +*.log diff --git a/services/taxonomy-generator/Dockerfile b/services/taxonomy-generator/Dockerfile new file mode 100644 index 0000000..3b8383a --- /dev/null +++ b/services/taxonomy-generator/Dockerfile @@ -0,0 +1,63 @@ +# syntax=docker/dockerfile:1 + +FROM python:3.11-slim AS builder + +# Install build dependencies for numpy/hdbscan +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy dependency files +COPY pyproject.toml poetry.lock* ./ + +# Install dependencies using pip (more reliable for binary wheels) +# First install the packages with native extensions using binary wheels only +RUN pip install --no-cache-dir --upgrade pip setuptools wheel \ + && pip install --no-cache-dir --only-binary=:all: numpy numba llvmlite \ 
+ && pip install --no-cache-dir \ + fastapi[standard] \ + uvicorn[standard] \ + pydantic \ + pydantic-settings \ + asyncpg \ + pgvector \ + umap-learn \ + hdbscan \ + openai \ + httpx \ + structlog + +# Production stage +FROM python:3.11-slim AS runtime + +# Install runtime dependencies for numpy/hdbscan +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy installed packages from builder +COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Copy application code +COPY src/ ./src/ + +# Create non-root user +RUN useradd --create-home --shell /bin/bash appuser +USER appuser + +# Expose port +EXPOSE 8001 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python -c "import httpx; httpx.get('http://localhost:8001/health')" || exit 1 + +# Run the application +CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8001"] diff --git a/services/taxonomy-generator/README.md b/services/taxonomy-generator/README.md new file mode 100644 index 0000000..d29a172 --- /dev/null +++ b/services/taxonomy-generator/README.md @@ -0,0 +1,77 @@ +# Taxonomy Generator + +A Python microservice for generating per-tenant taxonomies using UMAP dimensionality reduction, HDBSCAN clustering, and GPT-4o labeling. 
+ +## Features + +- **UMAP Dimensionality Reduction**: Reduces 1536-dimensional embeddings to 10 dimensions for better clustering +- **HDBSCAN Clustering**: Automatically discovers natural clusters without specifying K +- **GPT-4o Labeling**: Generates human-readable titles and descriptions for each cluster +- **Per-Tenant Isolation**: Each tenant gets their own taxonomy +- **Noise Detection**: Identifies feedback that doesn't fit any cluster + +## Development + +### Prerequisites + +- Python 3.11+ +- Poetry +- PostgreSQL with pgvector extension + +### Setup + +```bash +cd services/taxonomy-generator + +# Install dependencies +poetry install + +# Run development server +poetry run uvicorn src.main:app --reload --port 8001 +``` + +### Running Tests + +```bash +poetry run pytest +``` + +### Linting + +```bash +poetry run ruff check src/ +poetry run mypy src/ +``` + +## API Endpoints + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/health` | Health check | +| POST | `/cluster/{tenant_id}` | Trigger taxonomy generation for a tenant | +| GET | `/cluster/{tenant_id}/status` | Get clustering job status | + +## Configuration + +Environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `DATABASE_URL` | PostgreSQL connection string | Required | +| `OPENAI_API_KEY` | OpenAI API key for GPT-4o | Required | +| `UMAP_N_COMPONENTS` | Target dimensions after UMAP | `10` | +| `HDBSCAN_MIN_CLUSTER_SIZE` | Minimum cluster size | `50` | +| `LOG_LEVEL` | Logging level | `INFO` | + +## Docker + +```bash +# Build +docker build -t taxonomy-generator . + +# Run +docker run -p 8001:8001 \ + -e DATABASE_URL="postgresql://..." \ + -e OPENAI_API_KEY="sk-..." 
\ + taxonomy-generator +``` diff --git a/services/taxonomy-generator/pyproject.toml b/services/taxonomy-generator/pyproject.toml new file mode 100644 index 0000000..d19ad15 --- /dev/null +++ b/services/taxonomy-generator/pyproject.toml @@ -0,0 +1,49 @@ +[tool.poetry] +name = "taxonomy-generator" +version = "0.1.0" +description = "Taxonomy generation microservice using UMAP, HDBSCAN, and GPT-4o" +authors = ["Formbricks "] +readme = "README.md" +packages = [{include = "src"}] + +[tool.poetry.dependencies] +python = "^3.11" +fastapi = "^0.115.0" +uvicorn = {extras = ["standard"], version = "^0.34.0"} +pydantic = "^2.10.0" +pydantic-settings = "^2.7.0" +asyncpg = "^0.30.0" +pgvector = "^0.3.6" +numpy = "^2.2.0" +umap-learn = "^0.5.7" +hdbscan = "^0.8.40" +openai = "^1.59.0" +httpx = "^0.28.0" +structlog = "^24.4.0" + +[tool.poetry.group.dev.dependencies] +pytest = "^8.3.0" +pytest-asyncio = "^0.25.0" +pytest-cov = "^6.0.0" +ruff = "^0.8.0" +mypy = "^1.14.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff] +target-version = "py311" +line-length = 100 + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W", "UP"] + +[tool.mypy] +python_version = "3.11" +strict = true +ignore_missing_imports = true + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] diff --git a/services/taxonomy-generator/requirements.txt b/services/taxonomy-generator/requirements.txt new file mode 100644 index 0000000..af1d88d --- /dev/null +++ b/services/taxonomy-generator/requirements.txt @@ -0,0 +1,21 @@ +# Generated from pyproject.toml - for pip-based installation +# Requires Python 3.11+ + +fastapi>=0.115.0 +uvicorn[standard]>=0.34.0 +pydantic>=2.10.0 +pydantic-settings>=2.7.0 +asyncpg>=0.30.0 +pgvector>=0.3.6 +numpy>=2.2.0 +umap-learn>=0.5.7 +hdbscan>=0.8.40 +openai>=1.59.0 +httpx>=0.28.0 +structlog>=24.4.0 + +# Dev dependencies (optional) +# pytest>=8.3.0 +# pytest-asyncio>=0.25.0 +# ruff>=0.8.0 +# mypy>=1.14.0 diff 
--git a/services/taxonomy-generator/src/__init__.py b/services/taxonomy-generator/src/__init__.py new file mode 100644 index 0000000..c32a70a --- /dev/null +++ b/services/taxonomy-generator/src/__init__.py @@ -0,0 +1 @@ +"""Taxonomy Generator - UMAP + HDBSCAN + GPT-4o clustering service.""" diff --git a/services/taxonomy-generator/src/clustering/__init__.py b/services/taxonomy-generator/src/clustering/__init__.py new file mode 100644 index 0000000..6db0169 --- /dev/null +++ b/services/taxonomy-generator/src/clustering/__init__.py @@ -0,0 +1,6 @@ +"""Clustering algorithms - UMAP and HDBSCAN.""" + +from src.clustering.hdbscan_clusterer import HDBSCANClusterer +from src.clustering.umap_reducer import UMAPReducer + +__all__ = ["HDBSCANClusterer", "UMAPReducer"] diff --git a/services/taxonomy-generator/src/clustering/hdbscan_clusterer.py b/services/taxonomy-generator/src/clustering/hdbscan_clusterer.py new file mode 100644 index 0000000..02e8686 --- /dev/null +++ b/services/taxonomy-generator/src/clustering/hdbscan_clusterer.py @@ -0,0 +1,183 @@ +"""HDBSCAN clustering for automatic cluster discovery.""" + +from dataclasses import dataclass +from uuid import UUID + +import hdbscan +import numpy as np +import structlog + +from src.config import settings + +logger = structlog.get_logger() + + +@dataclass +class Cluster: + """Represents a discovered cluster.""" + + label: int + member_indices: np.ndarray + member_ids: list[UUID] + member_texts: list[str] + centroid: np.ndarray + size: int + avg_distance_to_centroid: float + + +@dataclass +class ClusteringResult: + """Result of HDBSCAN clustering.""" + + clusters: list[Cluster] + noise_indices: np.ndarray + noise_ids: list[UUID] + labels: np.ndarray + probabilities: np.ndarray + + +class HDBSCANClusterer: + """Discovers natural clusters using HDBSCAN algorithm.""" + + def __init__( + self, + min_cluster_size: int | None = None, + min_samples: int | None = None, + cluster_selection_epsilon: float | None = None, + ): + """ + 
Initialize HDBSCAN clusterer. + + Args: + min_cluster_size: Minimum number of points to form a cluster (default: 50) + min_samples: Number of samples in neighborhood for core points (default: 10) + cluster_selection_epsilon: Distance threshold for cluster selection (default: 0.0) + """ + self.min_cluster_size = min_cluster_size or settings.hdbscan_min_cluster_size + self.min_samples = min_samples or settings.hdbscan_min_samples + self.cluster_selection_epsilon = ( + cluster_selection_epsilon or settings.hdbscan_cluster_selection_epsilon + ) + + def fit_predict( + self, + embeddings: np.ndarray, + record_ids: list[UUID], + texts: list[str], + ) -> ClusteringResult: + """ + Cluster embeddings and return discovered clusters. + + Args: + embeddings: Reduced embeddings (after UMAP) + record_ids: List of record UUIDs corresponding to embeddings + texts: List of text content for each record + + Returns: + ClusteringResult with clusters and noise points + """ + if embeddings.shape[0] == 0: + return ClusteringResult( + clusters=[], + noise_indices=np.array([]), + noise_ids=[], + labels=np.array([]), + probabilities=np.array([]), + ) + + logger.info( + "Starting HDBSCAN clustering", + n_samples=embeddings.shape[0], + min_cluster_size=self.min_cluster_size, + ) + + clusterer = hdbscan.HDBSCAN( + min_cluster_size=self.min_cluster_size, + min_samples=self.min_samples, + cluster_selection_epsilon=self.cluster_selection_epsilon, + metric="euclidean", # UMAP output works well with euclidean + cluster_selection_method="eom", # Excess of Mass - better for varying densities + prediction_data=True, + ) + + labels = clusterer.fit_predict(embeddings) + probabilities = clusterer.probabilities_ + + # Separate noise from clusters + noise_mask = labels == -1 + noise_indices = np.where(noise_mask)[0] + noise_ids = [record_ids[i] for i in noise_indices] + + # Build cluster objects + unique_labels = set(labels) - {-1} + clusters: list[Cluster] = [] + + for label in sorted(unique_labels): + 
mask = labels == label + indices = np.where(mask)[0] + cluster_embeddings = embeddings[mask] + + # Compute centroid + centroid = cluster_embeddings.mean(axis=0) + + # Compute distances to centroid + distances = np.linalg.norm(cluster_embeddings - centroid, axis=1) + avg_distance = float(distances.mean()) + + cluster = Cluster( + label=int(label), + member_indices=indices, + member_ids=[record_ids[i] for i in indices], + member_texts=[texts[i] for i in indices], + centroid=centroid, + size=len(indices), + avg_distance_to_centroid=avg_distance, + ) + clusters.append(cluster) + + logger.info( + "HDBSCAN clustering complete", + num_clusters=len(clusters), + noise_count=len(noise_indices), + cluster_sizes=[c.size for c in clusters], + ) + + return ClusteringResult( + clusters=clusters, + noise_indices=noise_indices, + noise_ids=noise_ids, + labels=labels, + probabilities=probabilities, + ) + + def get_closest_to_centroid( + self, + cluster: Cluster, + embeddings: np.ndarray, + n: int = 10, + ) -> list[tuple[int, str, float]]: + """ + Get the N points closest to the cluster centroid. 
+ + Args: + cluster: The cluster to analyze + embeddings: Full embeddings array + n: Number of points to return + + Returns: + List of (index, text, distance) tuples sorted by distance + """ + cluster_embeddings = embeddings[cluster.member_indices] + distances = np.linalg.norm(cluster_embeddings - cluster.centroid, axis=1) + + # Sort by distance and take top N + sorted_indices = np.argsort(distances)[:n] + + result = [] + for local_idx in sorted_indices: + global_idx = cluster.member_indices[local_idx] + result.append( + (int(global_idx), cluster.member_texts[local_idx], float(distances[local_idx])) + ) + + return result diff --git a/services/taxonomy-generator/src/clustering/umap_reducer.py b/services/taxonomy-generator/src/clustering/umap_reducer.py new file mode 100644 index 0000000..e40e0e0 --- /dev/null +++ b/services/taxonomy-generator/src/clustering/umap_reducer.py @@ -0,0 +1,96 @@ +"""UMAP dimensionality reduction for embeddings.""" + +import warnings + +import numpy as np +import structlog +import umap + +from src.config import settings + +# Suppress UMAP warning about n_jobs when random_state is set (expected behavior) +warnings.filterwarnings("ignore", message="n_jobs value .* overridden to 1 by setting random_state") + +logger = structlog.get_logger() + + +class UMAPReducer: + """Reduces high-dimensional embeddings to lower dimensions using UMAP.""" + + def __init__( + self, + n_components: int | None = None, + n_neighbors: int | None = None, + min_dist: float | None = None, + metric: str | None = None, + ): + """ + Initialize UMAP reducer. 
+ + Args: + n_components: Target number of dimensions (default: 10) + n_neighbors: Number of neighbors for local structure (default: 15) + min_dist: Minimum distance between points (default: 0.1) + metric: Distance metric (default: cosine) + """ + self.n_components = n_components or settings.umap_n_components + self.n_neighbors = n_neighbors or settings.umap_n_neighbors + self.min_dist = min_dist or settings.umap_min_dist + self.metric = metric or settings.umap_metric + + self._reducer: umap.UMAP | None = None + + def fit_transform(self, embeddings: np.ndarray) -> np.ndarray: + """ + Fit UMAP and transform embeddings to lower dimensions. + + Args: + embeddings: Array of shape (n_samples, n_features) + Typically 1536-dimensional OpenAI embeddings + + Returns: + Reduced embeddings of shape (n_samples, n_components) + """ + if embeddings.shape[0] == 0: + return np.array([]) + + logger.info( + "Starting UMAP reduction", + input_dim=embeddings.shape[1], + output_dim=self.n_components, + n_samples=embeddings.shape[0], + ) + + self._reducer = umap.UMAP( + n_components=self.n_components, + n_neighbors=min(self.n_neighbors, embeddings.shape[0] - 1), + min_dist=self.min_dist, + metric=self.metric, + random_state=42, # For reproducibility + low_memory=True, # Better for large datasets + verbose=False, + ) + + reduced = self._reducer.fit_transform(embeddings) + + logger.info( + "UMAP reduction complete", + output_shape=reduced.shape, + ) + + return reduced + + def transform(self, embeddings: np.ndarray) -> np.ndarray: + """ + Transform new embeddings using a fitted UMAP model. + + Args: + embeddings: New embeddings to transform + + Returns: + Reduced embeddings + """ + if self._reducer is None: + raise ValueError("UMAP model not fitted. 
Call fit_transform first.") + + return self._reducer.transform(embeddings) diff --git a/services/taxonomy-generator/src/config.py b/services/taxonomy-generator/src/config.py new file mode 100644 index 0000000..ad37814 --- /dev/null +++ b/services/taxonomy-generator/src/config.py @@ -0,0 +1,42 @@ +"""Application configuration using pydantic-settings.""" + +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """Application settings loaded from environment variables.""" + + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + case_sensitive=False, + ) + + # Database + database_url: str + + # OpenAI + openai_api_key: str + + # UMAP settings + umap_n_components: int = 10 + umap_n_neighbors: int = 15 + umap_min_dist: float = 0.1 + umap_metric: str = "cosine" + + # HDBSCAN settings + hdbscan_min_cluster_size: int = 50 + hdbscan_min_samples: int = 10 + hdbscan_cluster_selection_epsilon: float = 0.0 + + # Clustering limits + max_embeddings_per_tenant: int = 100000 + centroid_sample_size: int = 10 + + # API settings + log_level: str = "INFO" + api_host: str = "0.0.0.0" + api_port: int = 8001 + + +settings = Settings() # type: ignore[call-arg] diff --git a/services/taxonomy-generator/src/db/__init__.py b/services/taxonomy-generator/src/db/__init__.py new file mode 100644 index 0000000..029cf94 --- /dev/null +++ b/services/taxonomy-generator/src/db/__init__.py @@ -0,0 +1,5 @@ +"""Database connection and operations.""" + +from src.db.postgres import get_db_pool, close_db_pool + +__all__ = ["get_db_pool", "close_db_pool"] diff --git a/services/taxonomy-generator/src/db/postgres.py b/services/taxonomy-generator/src/db/postgres.py new file mode 100644 index 0000000..6f4564a --- /dev/null +++ b/services/taxonomy-generator/src/db/postgres.py @@ -0,0 +1,164 @@ +"""PostgreSQL database connection using asyncpg.""" + +from contextlib import asynccontextmanager +from typing import Any, AsyncGenerator +from 
uuid import UUID + +import asyncpg +import numpy as np +import structlog +from asyncpg import Pool +from pgvector.asyncpg import register_vector + +from src.config import settings + +logger = structlog.get_logger() + +_pool: Pool | None = None + + +async def get_db_pool() -> Pool: + """Get or create the database connection pool.""" + global _pool + if _pool is None: + _pool = await asyncpg.create_pool( + settings.database_url, + min_size=2, + max_size=10, + init=_init_connection, + ) + logger.info("Database pool created") + return _pool + + +async def _init_connection(conn: asyncpg.Connection) -> None: + """Initialize connection with pgvector support.""" + await register_vector(conn) + + +async def close_db_pool() -> None: + """Close the database connection pool.""" + global _pool + if _pool is not None: + await _pool.close() + _pool = None + logger.info("Database pool closed") + + +@asynccontextmanager +async def get_connection() -> AsyncGenerator[asyncpg.Connection, None]: + """Get a database connection from the pool.""" + pool = await get_db_pool() + async with pool.acquire() as conn: + yield conn + + +async def load_embeddings_for_tenant( + tenant_id: str, limit: int = 100000 +) -> tuple[list[UUID], np.ndarray, list[str]]: + """ + Load embeddings for a specific tenant. 
+ + Returns: + Tuple of (record_ids, embeddings_array, texts) + """ + async with get_connection() as conn: + rows = await conn.fetch( + """ + SELECT id, embedding, value_text + FROM feedback_records + WHERE tenant_id = $1 + AND embedding IS NOT NULL + AND value_text IS NOT NULL + ORDER BY created_at DESC + LIMIT $2 + """, + tenant_id, + limit, + ) + + if not rows: + return [], np.array([]), [] + + record_ids = [row["id"] for row in rows] + texts = [row["value_text"] or "" for row in rows] + + # Convert pgvector arrays to numpy + embeddings = np.array([np.array(row["embedding"]) for row in rows], dtype=np.float32) + + logger.info( + "Loaded embeddings", + tenant_id=tenant_id, + count=len(record_ids), + embedding_dim=embeddings.shape[1] if len(embeddings) > 0 else 0, + ) + + return record_ids, embeddings, texts + + +async def save_topic( + tenant_id: str, + title: str, + description: str, + level: int, + parent_id: UUID | None, + embedding: np.ndarray, +) -> UUID: + """Save a generated topic to the database.""" + async with get_connection() as conn: + row = await conn.fetchrow( + """ + INSERT INTO topics (title, level, parent_id, tenant_id, embedding) + VALUES ($1, $2, $3, $4, $5) + RETURNING id + """, + title, + level, + parent_id, + tenant_id, + embedding.tolist(), + ) + return row["id"] # type: ignore[return-value] + + +async def update_feedback_topic( + record_id: UUID, topic_id: UUID, confidence: float +) -> None: + """Update a feedback record with its classified topic.""" + async with get_connection() as conn: + await conn.execute( + """ + UPDATE feedback_records + SET topic_id = $1, classification_confidence = $2, updated_at = NOW() + WHERE id = $3 + """, + topic_id, + confidence, + record_id, + ) + + +async def clear_tenant_topics(tenant_id: str) -> int: + """Delete all topics for a tenant (before regenerating taxonomy).""" + async with get_connection() as conn: + # First, clear topic_id from feedback_records + await conn.execute( + """ + UPDATE 
feedback_records + SET topic_id = NULL, classification_confidence = NULL + WHERE tenant_id = $1 + """, + tenant_id, + ) + # Then delete topics + result = await conn.execute( + """ + DELETE FROM topics + WHERE tenant_id = $1 + """, + tenant_id, + ) + # Parse "DELETE X" result + count = int(result.split()[-1]) if result else 0 + logger.info("Cleared tenant topics", tenant_id=tenant_id, deleted=count) + return count diff --git a/services/taxonomy-generator/src/labeling/__init__.py b/services/taxonomy-generator/src/labeling/__init__.py new file mode 100644 index 0000000..1b5a195 --- /dev/null +++ b/services/taxonomy-generator/src/labeling/__init__.py @@ -0,0 +1,5 @@ +"""LLM-based cluster labeling.""" + +from src.labeling.openai_labeler import OpenAILabeler + +__all__ = ["OpenAILabeler"] diff --git a/services/taxonomy-generator/src/labeling/openai_labeler.py b/services/taxonomy-generator/src/labeling/openai_labeler.py new file mode 100644 index 0000000..cc80819 --- /dev/null +++ b/services/taxonomy-generator/src/labeling/openai_labeler.py @@ -0,0 +1,159 @@ +"""GPT-4o based cluster labeling.""" + +import json +from dataclasses import dataclass + +import structlog +from openai import AsyncOpenAI + +from src.config import settings + +logger = structlog.get_logger() + + +@dataclass +class TopicLabel: + """Generated label for a cluster.""" + + title: str + description: str + + +class OpenAILabeler: + """Generates human-readable labels for clusters using GPT-4o.""" + + def __init__(self, api_key: str | None = None): + """ + Initialize OpenAI labeler. + + Args: + api_key: OpenAI API key (defaults to settings) + """ + self.client = AsyncOpenAI(api_key=api_key or settings.openai_api_key) + self.model = "gpt-4o" + + async def label_cluster( + self, + representative_texts: list[str], + cluster_size: int, + parent_title: str | None = None, + level: int = 1, + ancestor_titles: list[str] | None = None, + ) -> TopicLabel: + """ + Generate a title and description for a cluster. 
+ + Args: + representative_texts: 10 texts closest to centroid + cluster_size: Total number of items in cluster + parent_title: If generating sub-level, the parent topic title + level: The level being generated (1, 2, 3, 4, etc.) + ancestor_titles: List of all ancestor titles from root to parent + + Returns: + TopicLabel with title and description + """ + # Build level-specific context + level_descriptions = { + 1: "broad categories", + 2: "sub-categories", + 3: "specific themes", + 4: "detailed sub-themes", + 5: "granular topics", + } + level_desc = level_descriptions.get(level, f"level {level} topics") + + if level == 1: + context = f"""You are categorizing user feedback into {level_desc} (Level 1 topics). +These are the broadest groupings of feedback themes.""" + else: + # Build hierarchy context + if ancestor_titles: + hierarchy = " > ".join(ancestor_titles + [parent_title or ""]) + context = f"""You are categorizing user feedback within the hierarchy: {hierarchy} +This is a Level {level} topic ({level_desc}), which should be more specific than its parent "{parent_title}".""" + else: + context = f"""You are categorizing user feedback within the category "{parent_title}". +This is a Level {level} topic ({level_desc}).""" + + # Adjust title length guidance based on level + if level == 1: + title_guidance = "2-4 word title for this broad category" + elif level == 2: + title_guidance = "2-3 word title for this sub-category" + else: + title_guidance = "2-4 word specific title for this theme" + + prompt = f"""{context} + +Analyze these {len(representative_texts)} representative feedback items from a cluster of {cluster_size} total items. + +Feedback items: +{chr(10).join(f'- {text[:500]}' for text in representative_texts)} + +Based on the common theme in these items, provide: +1. A concise {title_guidance} +2. 
A single sentence description (max 100 characters) + +Important: The title should be distinct from the parent category and capture what makes this sub-group unique. + +Respond ONLY with valid JSON in this exact format: +{{"title": "Example Title", "description": "Brief description of what this category contains."}}""" + + try: + response = await self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + response_format={"type": "json_object"}, + temperature=0.3, # Lower temperature for consistency + max_tokens=150, + ) + + content = response.choices[0].message.content + if not content: + raise ValueError("Empty response from OpenAI") + + result = json.loads(content) + + label = TopicLabel( + title=result.get("title", "Unnamed Category")[:255], + description=result.get("description", "")[:500], + ) + + logger.info( + "Generated cluster label", + title=label.title, + cluster_size=cluster_size, + level=level, + ) + + return label + + except json.JSONDecodeError as e: + logger.error("Failed to parse OpenAI response", error=str(e), content=content) + return TopicLabel( + title=f"Cluster ({cluster_size} items)", + description="Auto-generated cluster", + ) + except Exception as e: + logger.error("OpenAI labeling failed", error=str(e)) + return TopicLabel( + title=f"Cluster ({cluster_size} items)", + description="Auto-generated cluster", + ) + + async def generate_embedding(self, text: str) -> list[float]: + """ + Generate embedding for a topic title. 
+ + Args: + text: Text to embed (usually the topic title) + + Returns: + Embedding vector + """ + response = await self.client.embeddings.create( + model="text-embedding-3-small", + input=text, + ) + return response.data[0].embedding diff --git a/services/taxonomy-generator/src/main.py b/services/taxonomy-generator/src/main.py new file mode 100644 index 0000000..eea7101 --- /dev/null +++ b/services/taxonomy-generator/src/main.py @@ -0,0 +1,197 @@ +"""FastAPI application entry point.""" + +from contextlib import asynccontextmanager +from typing import AsyncGenerator + +import structlog +from fastapi import BackgroundTasks, FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware + +from src.config import settings +from src.db import close_db_pool, get_db_pool +from src.models import ClusterConfig, ClusteringJobResponse, ClusteringJobStatus, HealthResponse +from src.service import TaxonomyService + +# Configure structured logging +structlog.configure( + processors=[ + structlog.stdlib.filter_by_level, + structlog.stdlib.add_logger_name, + structlog.stdlib.add_log_level, + structlog.stdlib.PositionalArgumentsFormatter(), + structlog.processors.TimeStamper(fmt="iso"), + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + structlog.processors.UnicodeDecoder(), + structlog.processors.JSONRenderer(), + ], + wrapper_class=structlog.stdlib.BoundLogger, + context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, +) + +logger = structlog.get_logger() + +# In-memory job tracking (replace with Redis/DB for production) +_jobs: dict[str, ClusteringJobResponse] = {} + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: + """Application lifespan - startup and shutdown events.""" + # Startup + logger.info("Starting taxonomy-generator service", port=settings.api_port) + await get_db_pool() + yield + # Shutdown + logger.info("Shutting down 
taxonomy-generator service") + await close_db_pool() + + +app = FastAPI( + title="Taxonomy Generator", + description="Microservice for generating per-tenant taxonomies using UMAP, HDBSCAN, and GPT-4o", + version="0.1.0", + lifespan=lifespan, +) + +# CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # Configure appropriately for production + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Service instance +taxonomy_service = TaxonomyService() + + +@app.get("/health", response_model=HealthResponse) +async def health_check() -> HealthResponse: + """Health check endpoint.""" + return HealthResponse() + + +@app.post("/cluster/{tenant_id}", response_model=ClusteringJobResponse) +async def trigger_clustering( + tenant_id: str, + config: ClusterConfig | None = None, + background_tasks: BackgroundTasks = BackgroundTasks(), +) -> ClusteringJobResponse: + """ + Trigger taxonomy generation for a tenant. + + This starts a background job and returns immediately. + Use GET /cluster/{tenant_id}/status to check progress. + """ + from uuid import uuid4 + + job_id = uuid4() + job_key = f"{tenant_id}:{job_id}" + + # Create initial job response + job_response = ClusteringJobResponse( + job_id=job_id, + tenant_id=tenant_id, + status=ClusteringJobStatus.PENDING, + progress=0.0, + message="Job queued", + ) + _jobs[job_key] = job_response + + # Start background task + async def run_clustering() -> None: + try: + _jobs[job_key].status = ClusteringJobStatus.RUNNING + _jobs[job_key].progress = 0.1 + _jobs[job_key].message = "Loading embeddings..." 
+ + result = await taxonomy_service.generate_taxonomy( + tenant_id=tenant_id, + config=config, + ) + + _jobs[job_key].status = result.status + _jobs[job_key].progress = 1.0 + _jobs[job_key].result = result + _jobs[job_key].message = f"Generated {len(result.topics)} topics" + + except Exception as e: + logger.error("Clustering job failed", error=str(e), tenant_id=tenant_id) + _jobs[job_key].status = ClusteringJobStatus.FAILED + _jobs[job_key].message = str(e) + + background_tasks.add_task(run_clustering) + + logger.info("Clustering job queued", tenant_id=tenant_id, job_id=str(job_id)) + return job_response + + +@app.get("/cluster/{tenant_id}/status", response_model=ClusteringJobResponse) +async def get_clustering_status(tenant_id: str, job_id: str | None = None) -> ClusteringJobResponse: + """ + Get status of a clustering job. + + If job_id is not provided, returns the most recent job for the tenant. + """ + if job_id: + job_key = f"{tenant_id}:{job_id}" + if job_key not in _jobs: + raise HTTPException(status_code=404, detail="Job not found") + return _jobs[job_key] + + # Find most recent job for tenant + tenant_jobs = [ + (key, job) for key, job in _jobs.items() if key.startswith(f"{tenant_id}:") + ] + if not tenant_jobs: + raise HTTPException(status_code=404, detail="No jobs found for tenant") + + # Return most recent (last in dict, assuming insertion order) + return tenant_jobs[-1][1] + + +@app.post("/cluster/{tenant_id}/sync") +async def sync_clustering( + tenant_id: str, + config: ClusterConfig | None = None, +) -> ClusteringJobResponse: + """ + Synchronously generate taxonomy (blocking). + + Use this for testing or when you need to wait for results. + For production, prefer the async POST /cluster/{tenant_id} endpoint. 
+ """ + from uuid import uuid4 + + job_id = uuid4() + + logger.info("Starting synchronous clustering", tenant_id=tenant_id, job_id=str(job_id)) + + result = await taxonomy_service.generate_taxonomy( + tenant_id=tenant_id, + config=config, + ) + + return ClusteringJobResponse( + job_id=job_id, + tenant_id=tenant_id, + status=result.status, + progress=1.0, + message=f"Generated {len(result.topics)} topics", + result=result, + ) + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run( + "src.main:app", + host=settings.api_host, + port=settings.api_port, + reload=True, + ) diff --git a/services/taxonomy-generator/src/models/__init__.py b/services/taxonomy-generator/src/models/__init__.py new file mode 100644 index 0000000..7f90c7c --- /dev/null +++ b/services/taxonomy-generator/src/models/__init__.py @@ -0,0 +1,19 @@ +"""Pydantic models for request/response schemas.""" + +from src.models.schemas import ( + ClusterConfig, + ClusteringJobResponse, + ClusteringJobStatus, + ClusterResult, + HealthResponse, + TopicResult, +) + +__all__ = [ + "ClusterConfig", + "ClusteringJobResponse", + "ClusteringJobStatus", + "ClusterResult", + "HealthResponse", + "TopicResult", +] diff --git a/services/taxonomy-generator/src/models/schemas.py b/services/taxonomy-generator/src/models/schemas.py new file mode 100644 index 0000000..963eaac --- /dev/null +++ b/services/taxonomy-generator/src/models/schemas.py @@ -0,0 +1,118 @@ +"""Pydantic schemas for API requests and responses.""" + +from datetime import datetime +from enum import Enum +from uuid import UUID + +from pydantic import BaseModel, Field + + +class HealthResponse(BaseModel): + """Health check response.""" + + status: str = "healthy" + version: str = "0.1.0" + + +class ClusterConfig(BaseModel): + """Configuration for clustering operation.""" + + # UMAP settings (optional overrides) + umap_n_components: int | None = Field(None, ge=2, le=50) + umap_n_neighbors: int | None = Field(None, ge=2, le=200) + umap_min_dist: float | 
None = Field(None, ge=0.0, le=1.0) + + # HDBSCAN settings (optional overrides) + hdbscan_min_cluster_size: int | None = Field(None, ge=5, le=1000) + hdbscan_min_samples: int | None = Field(None, ge=1, le=100) + + # Limit on embeddings to process + max_embeddings: int | None = Field(None, ge=100, le=500000) + + # Maximum depth of taxonomy hierarchy (1-4) + # For datasets <10k records, 3 levels is usually sufficient + max_levels: int = Field(default=3, ge=1, le=10) + + # Minimum cluster sizes per level for subdivision + # Key: level number, Value: minimum cluster size to attempt creating sub-topics + # Lower values = more topics get children, higher values = only large topics subdivide + level_min_cluster_sizes: dict[int, int] = Field( + default_factory=lambda: { + 1: 40, # Level 1: need 40+ items to create L2 children + 2: 20, # Level 2: need 20+ items to create L3 children + 3: 10, # Level 3: need 10+ items to create L4 children + 4: 10, # Level 4: terminal level (no children) + } + ) + + # HDBSCAN min_cluster_size per level (smaller clusters at deeper levels) + # This controls the minimum points HDBSCAN needs to form a cluster + # Lower values = more smaller clusters, higher values = fewer larger clusters + level_hdbscan_min_cluster_sizes: dict[int, int] = Field( + default_factory=lambda: { + 1: 30, # Level 1: need 30+ similar items to form a topic + 2: 15, # Level 2: need 15+ similar items + 3: 8, # Level 3: need 8+ similar items + 4: 5, # Level 4: need 5+ similar items + } + ) + + # DEPRECATED: kept for backwards compatibility + generate_level2: bool = True + level2_min_cluster_size: int = 50 + + def get_min_cluster_size_for_level(self, level: int) -> int: + """Get the minimum cluster size required to subdivide at this level.""" + return self.level_min_cluster_sizes.get(level, 25) + + def get_hdbscan_min_cluster_size_for_level(self, level: int) -> int: + """Get the HDBSCAN min_cluster_size for clustering at this level.""" + return 
self.level_hdbscan_min_cluster_sizes.get(level, 10) + + +class ClusteringJobStatus(str, Enum): + """Status of a clustering job.""" + + PENDING = "pending" + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + + +class TopicResult(BaseModel): + """Result of a generated topic.""" + + id: UUID + title: str + description: str + level: int + parent_id: UUID | None = None + cluster_size: int + avg_distance_to_centroid: float + + +class ClusterResult(BaseModel): + """Result of clustering operation.""" + + tenant_id: str + job_id: UUID + status: ClusteringJobStatus + total_records: int + clustered_records: int + noise_records: int + num_clusters: int + topics: list[TopicResult] + started_at: datetime + completed_at: datetime | None = None + error_message: str | None = None + + +class ClusteringJobResponse(BaseModel): + """Response for clustering job status.""" + + job_id: UUID + tenant_id: str + status: ClusteringJobStatus + progress: float = Field(ge=0.0, le=1.0, description="Progress from 0.0 to 1.0") + message: str | None = None + result: ClusterResult | None = None diff --git a/services/taxonomy-generator/src/service.py b/services/taxonomy-generator/src/service.py new file mode 100644 index 0000000..9d0ed3e --- /dev/null +++ b/services/taxonomy-generator/src/service.py @@ -0,0 +1,325 @@ +"""Taxonomy generation service - orchestrates the full pipeline.""" + +from datetime import datetime +from uuid import UUID, uuid4 + +import numpy as np +import structlog + +from src.clustering import HDBSCANClusterer, UMAPReducer +from src.config import settings +from src.db.postgres import ( + clear_tenant_topics, + load_embeddings_for_tenant, + save_topic, + update_feedback_topic, +) +from src.labeling import OpenAILabeler +from src.models import ClusterConfig, ClusteringJobStatus, ClusterResult, TopicResult + +logger = structlog.get_logger() + + +class TaxonomyService: + """Orchestrates the taxonomy generation pipeline.""" + + def __init__(self): + """Initialize the 
service with default components.""" + self.labeler = OpenAILabeler() + + async def generate_taxonomy( + self, + tenant_id: str, + config: ClusterConfig | None = None, + ) -> ClusterResult: + """ + Generate taxonomy for a tenant. + + Pipeline: + 1. Load embeddings for tenant + 2. Reduce dimensions with UMAP + 3. Cluster with HDBSCAN + 4. Label clusters with GPT-4o + 5. Save topics to database + 6. Recursively generate sub-topics up to max_levels + + Args: + tenant_id: Tenant ID to generate taxonomy for + config: Optional clustering configuration overrides + + Returns: + ClusterResult with generated topics + """ + config = config or ClusterConfig() + job_id = uuid4() + started_at = datetime.utcnow() + + logger.info( + "Starting taxonomy generation", + tenant_id=tenant_id, + job_id=str(job_id), + max_levels=config.max_levels, + ) + + try: + # 1. Clear existing topics for this tenant + await clear_tenant_topics(tenant_id) + + # 2. Load embeddings + max_embeddings = config.max_embeddings or settings.max_embeddings_per_tenant + record_ids, embeddings, texts = await load_embeddings_for_tenant( + tenant_id, limit=max_embeddings + ) + + if len(record_ids) == 0: + logger.warning("No embeddings found for tenant", tenant_id=tenant_id) + return ClusterResult( + tenant_id=tenant_id, + job_id=job_id, + status=ClusteringJobStatus.COMPLETED, + total_records=0, + clustered_records=0, + noise_records=0, + num_clusters=0, + topics=[], + started_at=started_at, + completed_at=datetime.utcnow(), + ) + + # 3. 
Generate topics recursively starting at level 1 + topics = await self._cluster_recursive( + tenant_id=tenant_id, + record_ids=record_ids, + embeddings=embeddings, + texts=texts, + parent_id=None, + parent_title=None, + ancestor_titles=[], + current_level=1, + config=config, + ) + + result = ClusterResult( + tenant_id=tenant_id, + job_id=job_id, + status=ClusteringJobStatus.COMPLETED, + total_records=len(record_ids), + clustered_records=len(record_ids), # Updated by recursive process + noise_records=0, + num_clusters=len([t for t in topics if t.level == 1]), + topics=topics, + started_at=started_at, + completed_at=datetime.utcnow(), + ) + + logger.info( + "Taxonomy generation complete", + tenant_id=tenant_id, + job_id=str(job_id), + num_topics=len(topics), + level_counts={ + level: len([t for t in topics if t.level == level]) + for level in range(1, config.max_levels + 1) + }, + duration_seconds=(datetime.utcnow() - started_at).total_seconds(), + ) + + return result + + except Exception as e: + logger.error( + "Taxonomy generation failed", + tenant_id=tenant_id, + job_id=str(job_id), + error=str(e), + ) + return ClusterResult( + tenant_id=tenant_id, + job_id=job_id, + status=ClusteringJobStatus.FAILED, + total_records=0, + clustered_records=0, + noise_records=0, + num_clusters=0, + topics=[], + started_at=started_at, + completed_at=datetime.utcnow(), + error_message=str(e), + ) + + async def _cluster_recursive( + self, + tenant_id: str, + record_ids: list[str], + embeddings: np.ndarray, + texts: list[str], + parent_id: UUID | None, + parent_title: str | None, + ancestor_titles: list[str], + current_level: int, + config: ClusterConfig, + ) -> list[TopicResult]: + """ + Recursively cluster feedback into hierarchical topics. 
+ + Args: + tenant_id: Tenant ID + record_ids: List of feedback record IDs + embeddings: Original high-dimensional embeddings + texts: Feedback text content + parent_id: Parent topic ID (None for Level 1) + parent_title: Parent topic title (None for Level 1) + ancestor_titles: List of ancestor titles from root to parent + current_level: Current level being generated (1-based) + config: Clustering configuration + + Returns: + List of generated topics at this level and all child levels + """ + # Stop if we've reached max depth or not enough data + if current_level > config.max_levels: + return [] + + if len(record_ids) < 5: + return [] + + logger.info( + f"Clustering at level {current_level}", + parent_title=parent_title, + num_records=len(record_ids), + ) + + # Get level-specific HDBSCAN settings + min_cluster_size = config.get_hdbscan_min_cluster_size_for_level(current_level) + min_samples = max(min_cluster_size // 5, 3) + + # Adjust UMAP parameters for smaller datasets at deeper levels + n_neighbors = min( + config.umap_n_neighbors or 15, + max(len(record_ids) - 1, 2), + ) + n_components = min( + config.umap_n_components or 10, + max(5, 12 - current_level * 2), # Fewer dims at deeper levels + ) + + # UMAP dimensionality reduction + reducer = UMAPReducer( + n_components=n_components, + n_neighbors=n_neighbors, + min_dist=config.umap_min_dist or 0.1, + ) + + try: + reduced_embeddings = reducer.fit_transform(embeddings) + except Exception as e: + logger.warning( + f"UMAP failed at level {current_level}", + error=str(e), + num_records=len(record_ids), + ) + return [] + + # HDBSCAN clustering + clusterer = HDBSCANClusterer( + min_cluster_size=min_cluster_size, + min_samples=min_samples, + ) + + try: + clustering_result = clusterer.fit_predict(reduced_embeddings, record_ids, texts) + except Exception as e: + logger.warning( + f"HDBSCAN failed at level {current_level}", + error=str(e), + num_records=len(record_ids), + ) + return [] + + if len(clustering_result.clusters) 
== 0: + logger.info(f"No clusters found at level {current_level}") + return [] + + topics: list[TopicResult] = [] + + for cluster in clustering_result.clusters: + # Get representative samples for labeling + closest = clusterer.get_closest_to_centroid( + cluster, reduced_embeddings, n=settings.centroid_sample_size + ) + representative_texts = [text for _, text, _ in closest] + + # Generate label with GPT-4o + label = await self.labeler.label_cluster( + representative_texts=representative_texts, + cluster_size=cluster.size, + parent_title=parent_title, + level=current_level, + ancestor_titles=ancestor_titles if ancestor_titles else None, + ) + + # Generate embedding for the topic title + topic_embedding = await self.labeler.generate_embedding(label.title) + + # Save to database + topic_id = await save_topic( + tenant_id=tenant_id, + title=label.title, + description=label.description, + level=current_level, + parent_id=parent_id, + embedding=np.array(topic_embedding, dtype=np.float32), + ) + + # Update feedback records with topic classification + for member_id in cluster.member_ids: + confidence = 1.0 - min(cluster.avg_distance_to_centroid, 1.0) + await update_feedback_topic(member_id, topic_id, confidence) + + topic_result = TopicResult( + id=topic_id, + title=label.title, + description=label.description, + level=current_level, + parent_id=parent_id, + cluster_size=cluster.size, + avg_distance_to_centroid=cluster.avg_distance_to_centroid, + ) + topics.append(topic_result) + + logger.info( + f"Created Level {current_level} topic", + topic_id=str(topic_id), + title=label.title, + parent_title=parent_title, + cluster_size=cluster.size, + ) + + # Recursively create child topics if cluster is large enough + min_size_for_children = config.get_min_cluster_size_for_level(current_level) + if ( + current_level < config.max_levels + and cluster.size >= min_size_for_children + ): + # Extract this cluster's data for sub-clustering + cluster_embeddings = 
embeddings[cluster.member_indices] + cluster_texts = cluster.member_texts + cluster_ids = cluster.member_ids + + # Build ancestor chain for context + new_ancestors = ancestor_titles + [parent_title] if parent_title else [] + + child_topics = await self._cluster_recursive( + tenant_id=tenant_id, + record_ids=cluster_ids, + embeddings=cluster_embeddings, + texts=cluster_texts, + parent_id=topic_id, + parent_title=label.title, + ancestor_titles=new_ancestors, + current_level=current_level + 1, + config=config, + ) + topics.extend(child_topics) + + return topics diff --git a/services/taxonomy-generator/tests/__init__.py b/services/taxonomy-generator/tests/__init__.py new file mode 100644 index 0000000..af2e546 --- /dev/null +++ b/services/taxonomy-generator/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for taxonomy-generator service.""" diff --git a/services/taxonomy-generator/tests/test_clustering.py b/services/taxonomy-generator/tests/test_clustering.py new file mode 100644 index 0000000..64af9e9 --- /dev/null +++ b/services/taxonomy-generator/tests/test_clustering.py @@ -0,0 +1,97 @@ +"""Tests for clustering algorithms.""" + +import numpy as np +import pytest + +from src.clustering import HDBSCANClusterer, UMAPReducer + + +class TestUMAPReducer: + """Tests for UMAP dimensionality reduction.""" + + def test_reduce_dimensions(self) -> None: + """Test that UMAP reduces dimensions correctly.""" + # Create random high-dimensional data + np.random.seed(42) + embeddings = np.random.randn(100, 1536).astype(np.float32) + + reducer = UMAPReducer(n_components=10, n_neighbors=10) + reduced = reducer.fit_transform(embeddings) + + assert reduced.shape == (100, 10) + + def test_empty_input(self) -> None: + """Test handling of empty input.""" + reducer = UMAPReducer(n_components=10) + reduced = reducer.fit_transform(np.array([])) + + assert len(reduced) == 0 + + +class TestHDBSCANClusterer: + """Tests for HDBSCAN clustering.""" + + def test_find_clusters(self) -> None: + """Test that 
HDBSCAN finds natural clusters.""" + from uuid import uuid4 + + np.random.seed(42) + + # Create 3 distinct clusters + cluster1 = np.random.randn(50, 10) + np.array([5, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + cluster2 = np.random.randn(50, 10) + np.array([0, 5, 0, 0, 0, 0, 0, 0, 0, 0]) + cluster3 = np.random.randn(50, 10) + np.array([0, 0, 5, 0, 0, 0, 0, 0, 0, 0]) + + embeddings = np.vstack([cluster1, cluster2, cluster3]).astype(np.float32) + record_ids = [uuid4() for _ in range(150)] + texts = [f"text {i}" for i in range(150)] + + clusterer = HDBSCANClusterer(min_cluster_size=10, min_samples=5) + result = clusterer.fit_predict(embeddings, record_ids, texts) + + # Should find approximately 3 clusters + assert len(result.clusters) >= 2 + assert len(result.clusters) <= 5 + + def test_noise_detection(self) -> None: + """Test that HDBSCAN identifies noise points.""" + from uuid import uuid4 + + np.random.seed(42) + + # Create one tight cluster and some scattered points + cluster = np.random.randn(50, 10) * 0.1 + noise = np.random.randn(20, 10) * 10 # Scattered noise + + embeddings = np.vstack([cluster, noise]).astype(np.float32) + record_ids = [uuid4() for _ in range(70)] + texts = [f"text {i}" for i in range(70)] + + clusterer = HDBSCANClusterer(min_cluster_size=10, min_samples=5) + result = clusterer.fit_predict(embeddings, record_ids, texts) + + # Should have some noise points + assert len(result.noise_ids) > 0 + + def test_get_closest_to_centroid(self) -> None: + """Test centroid representative selection.""" + from uuid import uuid4 + + np.random.seed(42) + + # Create a cluster + embeddings = np.random.randn(100, 10).astype(np.float32) + record_ids = [uuid4() for _ in range(100)] + texts = [f"text {i}" for i in range(100)] + + clusterer = HDBSCANClusterer(min_cluster_size=20, min_samples=5) + result = clusterer.fit_predict(embeddings, record_ids, texts) + + if result.clusters: + closest = clusterer.get_closest_to_centroid( + result.clusters[0], embeddings, n=5 + ) + assert 
len(closest) <= 5 + # Distances should be sorted ascending + distances = [d for _, _, d in closest] + assert distances == sorted(distances) diff --git a/sql/002_knowledge_and_topics.sql b/sql/002_knowledge_and_topics.sql new file mode 100644 index 0000000..ebf426d --- /dev/null +++ b/sql/002_knowledge_and_topics.sql @@ -0,0 +1,37 @@ +-- Knowledge records and topics schema + +-- Knowledge records table +CREATE TABLE knowledge_records ( + id UUID PRIMARY KEY DEFAULT uuidv7(), + content TEXT NOT NULL, + tenant_id VARCHAR(255), + created_at TIMESTAMP NOT NULL DEFAULT NOW(), + updated_at TIMESTAMP NOT NULL DEFAULT NOW() +); + +-- Indexes for knowledge_records +CREATE INDEX idx_knowledge_records_tenant_id ON knowledge_records(tenant_id); +CREATE INDEX idx_knowledge_records_created_at ON knowledge_records(created_at); + +-- Topics table +CREATE TABLE topics ( + id UUID PRIMARY KEY DEFAULT uuidv7(), + title VARCHAR(255) NOT NULL, + level INTEGER NOT NULL DEFAULT 1, + parent_id UUID REFERENCES topics(id) ON DELETE CASCADE, + tenant_id VARCHAR(255), + created_at TIMESTAMP NOT NULL DEFAULT NOW(), + updated_at TIMESTAMP NOT NULL DEFAULT NOW() +); + +-- Indexes for topics +CREATE INDEX idx_topics_tenant_id ON topics(tenant_id); +CREATE INDEX idx_topics_parent_id ON topics(parent_id); +CREATE INDEX idx_topics_level ON topics(level); + +-- Partial unique indexes for title uniqueness within (parent_id, tenant_id) +-- Handle NULL parent_id separately since NULL != NULL in PostgreSQL +CREATE UNIQUE INDEX idx_topics_title_parent_tenant + ON topics(tenant_id, parent_id, title) WHERE parent_id IS NOT NULL; +CREATE UNIQUE INDEX idx_topics_title_root_tenant + ON topics(tenant_id, title) WHERE parent_id IS NULL; diff --git a/sql/003_embeddings.sql b/sql/003_embeddings.sql new file mode 100644 index 0000000..aa07bda --- /dev/null +++ b/sql/003_embeddings.sql @@ -0,0 +1,30 @@ +-- Embeddings schema for AI enrichment +-- Adds vector columns to existing tables for semantic search and 
classification + +-- Add embedding column to knowledge_records +ALTER TABLE knowledge_records ADD COLUMN IF NOT EXISTS embedding vector(1536); + +-- Create vector similarity index for knowledge_records +-- Using HNSW (Hierarchical Navigable Small World) for approximate nearest neighbor search +-- HNSW works well on empty tables and has better query performance than ivfflat +CREATE INDEX IF NOT EXISTS idx_knowledge_records_embedding + ON knowledge_records USING hnsw (embedding vector_cosine_ops); + +-- Add embedding column to topics +ALTER TABLE topics ADD COLUMN IF NOT EXISTS embedding vector(1536); + +-- Create vector similarity index for topics +CREATE INDEX IF NOT EXISTS idx_topics_embedding + ON topics USING hnsw (embedding vector_cosine_ops); + +-- Add embedding and topic link to feedback_records +ALTER TABLE feedback_records ADD COLUMN IF NOT EXISTS embedding vector(1536); +ALTER TABLE feedback_records ADD COLUMN IF NOT EXISTS topic_id UUID REFERENCES topics(id) ON DELETE SET NULL; +ALTER TABLE feedback_records ADD COLUMN IF NOT EXISTS classification_confidence DOUBLE PRECISION; + +-- Create vector similarity index for feedback_records +CREATE INDEX IF NOT EXISTS idx_feedback_records_embedding + ON feedback_records USING hnsw (embedding vector_cosine_ops); + +-- Create index for topic lookups on feedback_records +CREATE INDEX IF NOT EXISTS idx_feedback_records_topic_id ON feedback_records(topic_id); diff --git a/sql/004_remove_sentiment_fields.sql b/sql/004_remove_sentiment_fields.sql new file mode 100644 index 0000000..0169895 --- /dev/null +++ b/sql/004_remove_sentiment_fields.sql @@ -0,0 +1,10 @@ +-- Remove sentiment fields that require separate LLM calls +-- These were added prematurely; keeping only embedding-based enrichment + +ALTER TABLE feedback_records DROP COLUMN IF EXISTS sentiment; +ALTER TABLE feedback_records DROP COLUMN IF EXISTS sentiment_score; +ALTER TABLE feedback_records DROP COLUMN IF EXISTS emotion; + +-- Drop the indexes that were 
created for these columns +DROP INDEX IF EXISTS idx_feedback_records_sentiment; +DROP INDEX IF EXISTS idx_feedback_records_emotion; diff --git a/sql/005_add_theme_id.sql b/sql/005_add_theme_id.sql new file mode 100644 index 0000000..79a5a07 --- /dev/null +++ b/sql/005_add_theme_id.sql @@ -0,0 +1,8 @@ +-- Migration to add theme_id column to feedback_records +-- This enables hierarchical classification: theme (level 1) + topic (level 2) + +-- Add theme_id column (references level-1 topics) +ALTER TABLE feedback_records ADD COLUMN IF NOT EXISTS theme_id UUID REFERENCES topics(id) ON DELETE SET NULL; + +-- Create index for theme lookups +CREATE INDEX IF NOT EXISTS idx_feedback_records_theme_id ON feedback_records(theme_id); diff --git a/sql/006_indexes_and_constraints.sql b/sql/006_indexes_and_constraints.sql new file mode 100644 index 0000000..c2609ec --- /dev/null +++ b/sql/006_indexes_and_constraints.sql @@ -0,0 +1,23 @@ +-- Migration: Add composite indexes and constraints for topics hierarchy +-- This improves query performance for tenant-scoped hierarchy queries +-- and prevents topics from referencing themselves. + +-- Composite index for tenant-scoped hierarchy queries +-- Optimizes: queries filtering by tenant_id AND parent_id together +CREATE INDEX IF NOT EXISTS idx_topics_tenant_parent + ON topics(tenant_id, parent_id); + +-- Prevent topics from referencing themselves (circular reference protection) +-- Note: This only prevents direct self-reference. Deeper cycles are prevented +-- by the application logic (parent_id is immutable after creation). 
+DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_constraint + WHERE conname = 'chk_topics_no_self_reference' + ) THEN + ALTER TABLE topics + ADD CONSTRAINT chk_topics_no_self_reference + CHECK (parent_id IS NULL OR parent_id != id); + END IF; +END $$; diff --git a/sql/007_river_job_queue.sql b/sql/007_river_job_queue.sql new file mode 100644 index 0000000..86ea9ea --- /dev/null +++ b/sql/007_river_job_queue.sql @@ -0,0 +1,141 @@ +-- River Job Queue Schema +-- This migration adds the River job queue tables for reliable async processing +-- Source: https://riverqueue.com/docs/migrations (v0.30.x) + +-- Migration tracking table +CREATE TABLE IF NOT EXISTS river_migration( + line TEXT NOT NULL, + version bigint NOT NULL, + created_at timestamptz NOT NULL DEFAULT NOW(), + CONSTRAINT line_length CHECK (char_length(line) > 0 AND char_length(line) < 128), + CONSTRAINT version_gte_1 CHECK (version >= 1), + PRIMARY KEY (line, version) +); + +-- Job state enum +CREATE TYPE river_job_state AS ENUM( + 'available', + 'cancelled', + 'completed', + 'discarded', + 'pending', + 'retryable', + 'running', + 'scheduled' +); + +-- Main job table +CREATE TABLE river_job( + id bigserial PRIMARY KEY, + state river_job_state NOT NULL DEFAULT 'available', + attempt smallint NOT NULL DEFAULT 0, + max_attempts smallint NOT NULL, + attempted_at timestamptz, + created_at timestamptz NOT NULL DEFAULT NOW(), + finalized_at timestamptz, + scheduled_at timestamptz NOT NULL DEFAULT NOW(), + priority smallint NOT NULL DEFAULT 1, + args jsonb NOT NULL, + attempted_by text[], + errors jsonb[], + kind text NOT NULL, + metadata jsonb NOT NULL DEFAULT '{}', + queue text NOT NULL DEFAULT 'default', + tags varchar(255)[] NOT NULL DEFAULT '{}', + unique_key bytea, + unique_states BIT(8), + CONSTRAINT finalized_or_finalized_at_null CHECK ( + (finalized_at IS NULL AND state NOT IN ('cancelled', 'completed', 'discarded')) OR + (finalized_at IS NOT NULL AND state IN ('cancelled', 'completed', 'discarded')) + ), 
+ CONSTRAINT max_attempts_is_positive CHECK (max_attempts > 0), + CONSTRAINT priority_in_range CHECK (priority >= 1 AND priority <= 4), + CONSTRAINT queue_length CHECK (char_length(queue) > 0 AND char_length(queue) < 128), + CONSTRAINT kind_length CHECK (char_length(kind) > 0 AND char_length(kind) < 128) +); + +-- Indexes for job fetching and querying +CREATE INDEX river_job_kind ON river_job USING btree(kind); +CREATE INDEX river_job_state_and_finalized_at_index ON river_job USING btree(state, finalized_at) WHERE finalized_at IS NOT NULL; +CREATE INDEX river_job_prioritized_fetching_index ON river_job USING btree(state, queue, priority, scheduled_at, id); +CREATE INDEX river_job_args_index ON river_job USING GIN(args); +CREATE INDEX river_job_metadata_index ON river_job USING GIN(metadata); + +-- Function to check job state in bitmask (for unique jobs) +CREATE OR REPLACE FUNCTION river_job_state_in_bitmask(bitmask BIT(8), state river_job_state) +RETURNS boolean +LANGUAGE SQL +IMMUTABLE +AS $$ + SELECT CASE state + WHEN 'available' THEN get_bit(bitmask, 7) + WHEN 'cancelled' THEN get_bit(bitmask, 6) + WHEN 'completed' THEN get_bit(bitmask, 5) + WHEN 'discarded' THEN get_bit(bitmask, 4) + WHEN 'pending' THEN get_bit(bitmask, 3) + WHEN 'retryable' THEN get_bit(bitmask, 2) + WHEN 'running' THEN get_bit(bitmask, 1) + WHEN 'scheduled' THEN get_bit(bitmask, 0) + ELSE 0 + END = 1; +$$; + +-- Unique index for deduplication +CREATE UNIQUE INDEX river_job_unique_idx ON river_job (unique_key) + WHERE unique_key IS NOT NULL + AND unique_states IS NOT NULL + AND river_job_state_in_bitmask(unique_states, state); + +-- Leader election table (unlogged for performance) +CREATE UNLOGGED TABLE river_leader( + elected_at timestamptz NOT NULL, + expires_at timestamptz NOT NULL, + leader_id text NOT NULL, + name text PRIMARY KEY DEFAULT 'default', + CONSTRAINT name_length CHECK (name = 'default'), + CONSTRAINT leader_id_length CHECK (char_length(leader_id) > 0 AND char_length(leader_id) 
< 128) +); + +-- Queue configuration table +CREATE TABLE river_queue ( + name text PRIMARY KEY NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + metadata jsonb NOT NULL DEFAULT '{}' ::jsonb, + paused_at timestamptz, + updated_at timestamptz NOT NULL +); + +-- Client tracking table (unlogged for performance) +CREATE UNLOGGED TABLE river_client ( + id text PRIMARY KEY NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + metadata jsonb NOT NULL DEFAULT '{}', + paused_at timestamptz, + updated_at timestamptz NOT NULL, + CONSTRAINT name_length CHECK (char_length(id) > 0 AND char_length(id) < 128) +); + +-- Client queue tracking (unlogged for performance) +CREATE UNLOGGED TABLE river_client_queue ( + river_client_id text NOT NULL REFERENCES river_client (id) ON DELETE CASCADE, + name text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + max_workers bigint NOT NULL DEFAULT 0, + metadata jsonb NOT NULL DEFAULT '{}', + num_jobs_completed bigint NOT NULL DEFAULT 0, + num_jobs_running bigint NOT NULL DEFAULT 0, + updated_at timestamptz NOT NULL, + PRIMARY KEY (river_client_id, name), + CONSTRAINT name_length CHECK (char_length(name) > 0 AND char_length(name) < 128), + CONSTRAINT num_jobs_completed_zero_or_positive CHECK (num_jobs_completed >= 0), + CONSTRAINT num_jobs_running_zero_or_positive CHECK (num_jobs_running >= 0) +); + +-- Mark River migrations as applied (so River doesn't try to re-run them) +INSERT INTO river_migration (line, version) VALUES + ('main', 1), + ('main', 2), + ('main', 3), + ('main', 4), + ('main', 5), + ('main', 6); diff --git a/sql/009_clustering_jobs.sql b/sql/009_clustering_jobs.sql new file mode 100644 index 0000000..9b43b4e --- /dev/null +++ b/sql/009_clustering_jobs.sql @@ -0,0 +1,48 @@ +-- Clustering jobs table for tracking taxonomy generation jobs and schedules +CREATE TABLE IF NOT EXISTS clustering_jobs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + tenant_id VARCHAR(255) NOT NULL, + status VARCHAR(50) NOT 
NULL DEFAULT 'pending', + + -- Scheduling + schedule_interval VARCHAR(50), -- 'daily', 'weekly', 'monthly', NULL for one-time + next_run_at TIMESTAMP WITH TIME ZONE, + last_run_at TIMESTAMP WITH TIME ZONE, + + -- Job result tracking + last_job_id UUID, -- Most recent job ID from taxonomy service + last_error TEXT, + topics_generated INT DEFAULT 0, + records_processed INT DEFAULT 0, + + -- Timestamps + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + + -- Ensure one schedule per tenant + CONSTRAINT unique_tenant_schedule UNIQUE (tenant_id) +); + +-- Index for finding jobs that need to run +CREATE INDEX IF NOT EXISTS idx_clustering_jobs_next_run + ON clustering_jobs (next_run_at) + WHERE status != 'disabled' AND next_run_at IS NOT NULL; + +-- Index for tenant lookups +CREATE INDEX IF NOT EXISTS idx_clustering_jobs_tenant_id + ON clustering_jobs (tenant_id); + +-- Add trigger to update updated_at +CREATE OR REPLACE FUNCTION update_clustering_jobs_updated_at() +RETURNS TRIGGER AS $$ +BEGIN + NEW.updated_at = NOW(); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +DROP TRIGGER IF EXISTS trigger_clustering_jobs_updated_at ON clustering_jobs; +CREATE TRIGGER trigger_clustering_jobs_updated_at + BEFORE UPDATE ON clustering_jobs + FOR EACH ROW + EXECUTE FUNCTION update_clustering_jobs_updated_at(); diff --git a/testdata/README.md b/testdata/README.md new file mode 100644 index 0000000..be3c965 --- /dev/null +++ b/testdata/README.md @@ -0,0 +1,21 @@ +# Test Data + +Sample data files for testing the Hub API. + +## sample_feedback.csv + +Export from the Formbricks Enterprise Dream Survey containing real feedback responses. 
+ +**Fields extracted by the ingestion script:** +- How Formbricks helped solve problems +- Suggestions for improvement +- Missing feature requests +- NPS scores and reasons + +**Usage:** + +```bash +go run scripts/ingest_csv.go \ + -file testdata/sample_feedback.csv \ + -api-key YOUR_API_KEY +``` diff --git a/testdata/sample_feedback.csv b/testdata/sample_feedback.csv new file mode 100644 index 0000000..7e41f76 --- /dev/null +++ b/testdata/sample_feedback.csv @@ -0,0 +1,19 @@ +"No.","Response ID","Timestamp","Finished","Quotas","Survey ID","Formbricks ID (internal)","User ID","Notes","Tags","url","country","userAgent - os","userAgent - device","userAgent - browser","1. Formbricks has helped solve my problems in surveys and experience management.","2. Can you share with us how we helped to solve your problems?","3. Can you share with us how we can help to solve your problems better?","4. What's the ONE feature you are missing at Formbricks? Why?","5. We also have some ideas! Please rank these features by importance for you:","5. We also have some ideas! Please rank these features by importance for you: - Option ID","6. We want to focus on the most essential and retire areas that do not bring value to our users. Choose 2 possible features to deprecate in exchange for a reduced Formbricks price by 20%.","6. We want to focus on the most essential and retire areas that do not bring value to our users. Choose 2 possible features to deprecate in exchange for a reduced Formbricks price by 20%. - Option ID","7. Would you be open to a short follow-up conversation about your suggestions?","7. Would you be open to a short follow-up conversation about your suggestions? - Option ID","8. Would you like to test our up and coming features?","8. Would you like to test our up and coming features? - Option ID","9. Your contact details","10. How likely would you recommend Formbricks to people you know?","11. 
Why did you choose this number?","delivery","plan","email" +1,"cmkqjhe8a0wrrad01nkcgwgc6","2026-01-23 07:08:21","No","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e?email=j.vanwees@nelson.nl&delivery=cloud","NL","iOS","mobile","Mobile Safari","4","We use Formbricks to automatically collect feedback From customers after different kinds of interactions (after visiting a store - we are a retailer -, contacting our customer service, etc.). We use te API to create unique invite links and to pull responses. Webhooks are used to trigger follow-up actions (e.g. issue a reward).","","","","","","","","","","","","","","cloud","","j.vanwees@nelson.nl" +2,"cmkpl3uda1oz0ad01pnmfwa7i","2026-01-22 15:06:02","Yes","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e?email=kees@boplicity.nl&delivery=cloud","NL","macOS","desktop","Firefox","4","","- crank up the performance, loading of a survey list is now very slow +- allow to download data from multiple surveys in one file","","Survey Import & Export; Custom dashboards; Workflows; AI Survey Translation; AI Topic Detection; AI Sentiment Analysis","xvxxzrqwkdrx1m84c7l5bm1s, k1f88dt0vg8nha6ea4jikrbk, s5yvsup0qoym9yrdbdp0ap6n, higymjieiks7xigzmfjp7bsi, m9y2kc4mhdo9uy0luet0pyej, snln527i78inmzzl6y7881rz","Airtable Integration; Activepieces Integration","lwyxqbvpbxulkcree05krwse, d57d7usk860o2df73a2zm1kd","No, thank you","ewyvoy65m7lf4mrcuw9chtoq","No, thank you","wy1nzt2fzhjb41iw77z90wul","","","","cloud","","kees@boplicity.nl" +3,"cmkp69oni1i5iad01t8b8kd8p","2026-01-22 08:10:40","Yes","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e?email=peter@zeitkapsl.eu&delivery=cloud","AT","iOS","mobile","Mobile Safari","3","","We are a bit special in that regard. Due to e2ee promise we can’t include arbitrary js snippets from third parties. 
hence we had to remove or not even include in some parts of our apps.","easy integration of forms via npm i and forwarding the responses via our backend, not directly from frontend. but again, i guess this won’t be relevant for a lot of other customers","Survey Import & Export; Custom dashboards; AI Sentiment Analysis; Workflows; AI Topic Detection; AI Survey Translation","xvxxzrqwkdrx1m84c7l5bm1s, k1f88dt0vg8nha6ea4jikrbk, snln527i78inmzzl6y7881rz, s5yvsup0qoym9yrdbdp0ap6n, m9y2kc4mhdo9uy0luet0pyej, higymjieiks7xigzmfjp7bsi","Airtable Integration; Activepieces Integration","lwyxqbvpbxulkcree05krwse, d57d7usk860o2df73a2zm1kd, other","Yes, I would be keen to chat about it","zfh9l8f48yf37ouop0dk1oov","Yes please!","na6k8m2q9dqmfh5x3t7u071m","Peter; Spiess-Knafl; peter@zeitkapsl.eu; +436803250100; zeitkapsl.eu / hardcode GmbH","7","feeling of gut","cloud","","peter@zeitkapsl.eu" +4,"cmkiv9awd85owad010pvk3t4q","2026-01-17 22:15:50","Yes","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","FR","macOS","desktop","Firefox","4","We chose Formbricks in 2024 for its open-source nature. 
We use it occasionally (once a year) to collect feedback from users of our application.","","Be able to hide a question without having to isolate it in a dedicated block ","Survey Import & Export; Custom dashboards; AI Sentiment Analysis; AI Survey Translation; AI Topic Detection; Workflows","xvxxzrqwkdrx1m84c7l5bm1s, k1f88dt0vg8nha6ea4jikrbk, snln527i78inmzzl6y7881rz, higymjieiks7xigzmfjp7bsi, m9y2kc4mhdo9uy0luet0pyej, s5yvsup0qoym9yrdbdp0ap6n","Airtable Integration; Activepieces Integration","lwyxqbvpbxulkcree05krwse, d57d7usk860o2df73a2zm1kd","Yes, I would be happy to answer by e-mail or message","pej0b64yozxnem1dnnwurxmp","No, thank you","wy1nzt2fzhjb41iw77z90wul","Mathieu; HIREL; ran.tigr@gmail.com; DStash Foundation","7","Some fields lack customization, for example the ranking, which requires sorting through all the options, whereas we would like to have a minimum and maximum number of options to choose from. +On some fields (e.g. matrix), the “Bulk Edit” option is missing. +We would like the possibility to indicate the number of question (and the progression ""{current} / {total}""","","","" +5,"cmkiej81l245oad01b5x6gizv","2026-01-17 14:27:39","Yes","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","GB","macOS","desktop","Chrome","4","Simple way to toss up ""sign up"" surveys, ""interest in X"" surveys. Plan at some point to add a ""give feedback"" button.","","App log-in token only lasts 1 day! 
Each day I want to check status of an ongoing survey, I need to log in again!","Custom dashboards; AI Sentiment Analysis; Survey Import & Export; AI Topic Detection; AI Survey Translation; Workflows","k1f88dt0vg8nha6ea4jikrbk, snln527i78inmzzl6y7881rz, xvxxzrqwkdrx1m84c7l5bm1s, m9y2kc4mhdo9uy0luet0pyej, higymjieiks7xigzmfjp7bsi, s5yvsup0qoym9yrdbdp0ap6n","Activepieces Integration; Environments (Dev & Prod)","d57d7usk860o2df73a2zm1kd, gxxc24nq7wqi63nfkgyvi60o","Yes, I would be happy to answer by e-mail or message","pej0b64yozxnem1dnnwurxmp","No, thank you","wy1nzt2fzhjb41iw77z90wul","George; Dunlap; gwd@laleolanguage.com; Laleo Language Ltd","8","","","","" +6,"cmki8s5ythy4vad01rosuxmcm","2026-01-17 11:46:39","Yes","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","DE","Android","mobile","Mobile Chrome","4","","Have a view whete I can see each answer, not just the cumulate answers.","Single answers, not cumulative ones.","Custom dashboards; Survey Import & Export; Workflows; AI Topic Detection; AI Sentiment Analysis; AI Survey Translation","k1f88dt0vg8nha6ea4jikrbk, xvxxzrqwkdrx1m84c7l5bm1s, s5yvsup0qoym9yrdbdp0ap6n, m9y2kc4mhdo9uy0luet0pyej, snln527i78inmzzl6y7881rz, higymjieiks7xigzmfjp7bsi","Airtable Integration; Activepieces Integration","lwyxqbvpbxulkcree05krwse, d57d7usk860o2df73a2zm1kd","Yes, I would be happy to answer by e-mail or message","pej0b64yozxnem1dnnwurxmp","Yes please!","na6k8m2q9dqmfh5x3t7u071m","Stefan; Bricker; survey-formbricks.com@derstefan.com","8","","","","" +7,"cmkh58rwrlfemad011dddxcqh","2026-01-16 17:19:49","No","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","US","macOS","desktop","Firefox","4","","","","","","","","","","","","","","","","","" +8,"cmkh1ghjlds85ad01qc1535nj","2026-01-16 
15:33:50","Yes","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","US","Windows","desktop","Chrome","3","","I like formbricks so far but it could use so work on viewing the survey results","Better results viewing... Individual viewing and final results in a clean way. I am used to using google forms so this is an upgrade overall but I like how I was able to view results on that platform better than Formbricks.","Survey Import & Export; Custom dashboards; AI Survey Translation; Workflows; AI Sentiment Analysis; AI Topic Detection","xvxxzrqwkdrx1m84c7l5bm1s, k1f88dt0vg8nha6ea4jikrbk, higymjieiks7xigzmfjp7bsi, s5yvsup0qoym9yrdbdp0ap6n, snln527i78inmzzl6y7881rz, m9y2kc4mhdo9uy0luet0pyej","Airtable Integration; Activepieces Integration","lwyxqbvpbxulkcree05krwse, d57d7usk860o2df73a2zm1kd","No, thank you","ewyvoy65m7lf4mrcuw9chtoq","Yes please!","na6k8m2q9dqmfh5x3t7u071m","Elizabeth; Kohl; ekohl@mote.org; 9413884441; Mote Marine Lab & Aquarium ","9","","","","" +9,"cmkgxyrzp5azuad019rhu8qyu","2026-01-16 13:56:05","No","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","FR","macOS","desktop","Firefox","4","","","","","","","","","","","","","","","","","" +10,"cmkgq2slmiah4ad0170p4lpc5","2026-01-16 10:15:16","No","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","IN","iOS","mobile","Mobile Chrome","5","","","","","","","","","","","","","","","","","" +11,"cmkgpuitihs4wad01cgqydvm2","2026-01-16 10:08:50","Yes","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","DK","macOS","desktop","Firefox","5","I've been in contact with Johannes regarding some issues with the React Native SDK. This resulted in a quick workaround and the issue being prioritized for an upcoming SDK release.","","We would like to have the option of filtering survey responses by segments and not individual attributes. 
+Based on filtering by segment, we would like to be able to compare survey responses by segment/and or other attributes. +Breakdown charts, pie charts and more would also be super helpful for our UX team.","Workflows; Custom dashboards; AI Topic Detection; AI Survey Translation; AI Sentiment Analysis; Survey Import & Export","s5yvsup0qoym9yrdbdp0ap6n, k1f88dt0vg8nha6ea4jikrbk, m9y2kc4mhdo9uy0luet0pyej, higymjieiks7xigzmfjp7bsi, snln527i78inmzzl6y7881rz, xvxxzrqwkdrx1m84c7l5bm1s","Activepieces Integration; Airtable Integration","d57d7usk860o2df73a2zm1kd, lwyxqbvpbxulkcree05krwse","Yes, I would be happy to answer by e-mail or message","pej0b64yozxnem1dnnwurxmp","Yes please!","na6k8m2q9dqmfh5x3t7u071m","Michael; Jensen; michael@nabogo.com; +45 20 23 77 35; nabogo ApS","10","We've been using Formbricks for almost 2 years now and it's helped our UX team a lot!","","","" +12,"cmkgpojkmhetrad01mj6xj1yb","2026-01-16 10:04:11","Yes","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","GB","Linux","desktop","Chrome","5","in-app surveys, forms, feedback collection","","I liked the custom redirect URL for the final button at the end of the survey (this functionality was deprecated from the free plan recently)","Workflows; Survey Import & Export; AI Sentiment Analysis; AI Topic Detection; AI Survey Translation; Custom dashboards","s5yvsup0qoym9yrdbdp0ap6n, xvxxzrqwkdrx1m84c7l5bm1s, snln527i78inmzzl6y7881rz, m9y2kc4mhdo9uy0luet0pyej, higymjieiks7xigzmfjp7bsi, k1f88dt0vg8nha6ea4jikrbk","Activepieces Integration","d57d7usk860o2df73a2zm1kd","Yes, I would be happy to answer by e-mail or message","pej0b64yozxnem1dnnwurxmp","No, thank you","wy1nzt2fzhjb41iw77z90wul","Stanley; Modrak; s.modrak@imperial.ac.uk; Imperial","9","best solution I've found so far at good price","","","" +13,"cmkfwucuy3tpgad01br3g64vt","2026-01-15 
20:36:53","Yes","","cmk5jsfev2d8wad0182oiq84e","","",,"","https://app.formbricks.com/s/cmk5jsfev2d8wad0182oiq84e","FR","iOS","mobile","Mobile Safari","5","I can find good templates and get good data","","I cant think of any","Workflows; Survey Import & Export; AI Survey Translation; AI Sentiment Analysis; Custom dashboards; AI Topic Detection","s5yvsup0qoym9yrdbdp0ap6n, xvxxzrqwkdrx1m84c7l5bm1s, higymjieiks7xigzmfjp7bsi, snln527i78inmzzl6y7881rz, k1f88dt0vg8nha6ea4jikrbk, m9y2kc4mhdo9uy0luet0pyej","Environments (Dev & Prod)","gxxc24nq7wqi63nfkgyvi60o","No, thank you","ewyvoy65m7lf4mrcuw9chtoq","No, thank you","wy1nzt2fzhjb41iw77z90wul","","","","","","" \ No newline at end of file diff --git a/tests/helpers.go b/tests/helpers.go index 7e02ee5..73a3933 100644 --- a/tests/helpers.go +++ b/tests/helpers.go @@ -26,4 +26,12 @@ func CleanupTestData(t *testing.T) { // Be careful with this in production! _, err = db.Exec(ctx, "DELETE FROM feedback_records WHERE source_type = 'formbricks'") require.NoError(t, err) + + // Delete all knowledge records created during tests + _, err = db.Exec(ctx, "DELETE FROM knowledge_records") + require.NoError(t, err) + + // Delete all topics created during tests + _, err = db.Exec(ctx, "DELETE FROM topics") + require.NoError(t, err) } diff --git a/tests/integration_test.go b/tests/integration_test.go index a1cd3dc..baf1d8b 100644 --- a/tests/integration_test.go +++ b/tests/integration_test.go @@ -17,6 +17,7 @@ import ( "github.com/formbricks/hub/internal/repository" "github.com/formbricks/hub/internal/service" "github.com/formbricks/hub/pkg/database" + "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -36,10 +37,22 @@ func setupTestServer(t *testing.T) (*httptest.Server, func()) { db, err := database.NewPostgresPool(ctx, cfg.DatabaseURL) require.NoError(t, err, "Failed to connect to database") + // Clean up database before each test + CleanupTestData(t) + // Initialize repository, 
service, and handler layers feedbackRecordsRepo := repository.NewFeedbackRecordsRepository(db) feedbackRecordsService := service.NewFeedbackRecordsService(feedbackRecordsRepo) feedbackRecordsHandler := handlers.NewFeedbackRecordsHandler(feedbackRecordsService) + + knowledgeRecordsRepo := repository.NewKnowledgeRecordsRepository(db) + knowledgeRecordsService := service.NewKnowledgeRecordsService(knowledgeRecordsRepo) + knowledgeRecordsHandler := handlers.NewKnowledgeRecordsHandler(knowledgeRecordsService) + + topicsRepo := repository.NewTopicsRepository(db) + topicsService := service.NewTopicsService(topicsRepo) + topicsHandler := handlers.NewTopicsHandler(topicsService) + healthHandler := handlers.NewHealthHandler() // Set up public endpoints @@ -57,6 +70,19 @@ func setupTestServer(t *testing.T) (*httptest.Server, func()) { protectedMux.HandleFunc("DELETE /v1/feedback-records/{id}", feedbackRecordsHandler.Delete) protectedMux.HandleFunc("DELETE /v1/feedback-records", feedbackRecordsHandler.BulkDelete) + protectedMux.HandleFunc("POST /v1/knowledge-records", knowledgeRecordsHandler.Create) + protectedMux.HandleFunc("GET /v1/knowledge-records", knowledgeRecordsHandler.List) + protectedMux.HandleFunc("GET /v1/knowledge-records/{id}", knowledgeRecordsHandler.Get) + protectedMux.HandleFunc("PATCH /v1/knowledge-records/{id}", knowledgeRecordsHandler.Update) + protectedMux.HandleFunc("DELETE /v1/knowledge-records/{id}", knowledgeRecordsHandler.Delete) + protectedMux.HandleFunc("DELETE /v1/knowledge-records", knowledgeRecordsHandler.BulkDelete) + + protectedMux.HandleFunc("POST /v1/topics", topicsHandler.Create) + protectedMux.HandleFunc("GET /v1/topics", topicsHandler.List) + protectedMux.HandleFunc("GET /v1/topics/{id}", topicsHandler.Get) + protectedMux.HandleFunc("PATCH /v1/topics/{id}", topicsHandler.Update) + protectedMux.HandleFunc("DELETE /v1/topics/{id}", topicsHandler.Delete) + var protectedHandler http.Handler = protectedMux protectedHandler = 
middleware.Auth(cfg.APIKey)(protectedHandler) @@ -511,3 +537,931 @@ func TestDeleteFeedbackRecord(t *testing.T) { assert.Equal(t, http.StatusNotFound, resp.StatusCode) }) } + +// ============================================================================= +// Knowledge Records Tests +// ============================================================================= + +func TestCreateKnowledgeRecord(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + t.Run("Success with valid API key", func(t *testing.T) { + reqBody := map[string]interface{}{ + "content": "This is a test knowledge record content.", + "tenant_id": "test-tenant", + } + body, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", server.URL+"/v1/knowledge-records", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusCreated, resp.StatusCode) + + var result models.KnowledgeRecord + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.NotEmpty(t, result.ID) + assert.Equal(t, "This is a test knowledge record content.", result.Content) + assert.NotNil(t, result.TenantID) + assert.Equal(t, "test-tenant", *result.TenantID) + }) + + t.Run("Bad request with missing content", func(t *testing.T) { + reqBody := map[string]interface{}{ + "tenant_id": "test-tenant", + } + body, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", server.URL+"/v1/knowledge-records", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusBadRequest, resp.StatusCode) + }) + + t.Run("Bad request with content too long", func(t 
*testing.T) { + // Create content longer than 10000 characters + longContent := make([]byte, 10001) + for i := range longContent { + longContent[i] = 'a' + } + + reqBody := map[string]interface{}{ + "content": string(longContent), + } + body, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", server.URL+"/v1/knowledge-records", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusBadRequest, resp.StatusCode) + }) +} + +func TestListKnowledgeRecords(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create a test knowledge record first + reqBody := map[string]interface{}{ + "content": "Test content for listing", + "tenant_id": "list-test-tenant", + } + body, _ := json.Marshal(reqBody) + req, _ := http.NewRequest("POST", server.URL+"/v1/knowledge-records", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + _, _ = client.Do(req) + + t.Run("List all knowledge records", func(t *testing.T) { + req, _ := http.NewRequest("GET", server.URL+"/v1/knowledge-records", nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.ListKnowledgeRecordsResponse + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.NotEmpty(t, result.Data) + }) + + t.Run("List with tenant_id filter", func(t *testing.T) { + req, _ := http.NewRequest("GET", server.URL+"/v1/knowledge-records?tenant_id=list-test-tenant&limit=10", nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer 
func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.ListKnowledgeRecordsResponse + err = decodeData(resp, &result) + require.NoError(t, err) + + for _, record := range result.Data { + assert.NotNil(t, record.TenantID) + assert.Equal(t, "list-test-tenant", *record.TenantID) + } + }) +} + +func TestGetKnowledgeRecord(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create a test knowledge record + reqBody := map[string]interface{}{ + "content": "Test content for get", + } + body, _ := json.Marshal(reqBody) + req, _ := http.NewRequest("POST", server.URL+"/v1/knowledge-records", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + createResp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = createResp.Body.Close() }() + + var created models.KnowledgeRecord + err = decodeData(createResp, &created) + require.NoError(t, err) + + t.Run("Get existing knowledge record", func(t *testing.T) { + req, _ := http.NewRequest("GET", fmt.Sprintf("%s/v1/knowledge-records/%s", server.URL, created.ID), nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.KnowledgeRecord + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.Equal(t, created.ID, result.ID) + assert.Equal(t, "Test content for get", result.Content) + }) + + t.Run("Get non-existent knowledge record", func(t *testing.T) { + req, _ := http.NewRequest("GET", server.URL+"/v1/knowledge-records/00000000-0000-0000-0000-000000000000", nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, 
http.StatusNotFound, resp.StatusCode) + }) +} + +func TestUpdateKnowledgeRecord(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create a test knowledge record + reqBody := map[string]interface{}{ + "content": "Initial content", + } + body, _ := json.Marshal(reqBody) + req, _ := http.NewRequest("POST", server.URL+"/v1/knowledge-records", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + createResp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = createResp.Body.Close() }() + + var created models.KnowledgeRecord + err = decodeData(createResp, &created) + require.NoError(t, err) + + t.Run("Update knowledge record", func(t *testing.T) { + updateBody := map[string]interface{}{ + "content": "Updated content", + } + body, _ := json.Marshal(updateBody) + + req, _ := http.NewRequest("PATCH", fmt.Sprintf("%s/v1/knowledge-records/%s", server.URL, created.ID), bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.KnowledgeRecord + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.Equal(t, created.ID, result.ID) + assert.Equal(t, "Updated content", result.Content) + }) +} + +func TestDeleteKnowledgeRecord(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create a test knowledge record + reqBody := map[string]interface{}{ + "content": "To be deleted", + } + body, _ := json.Marshal(reqBody) + req, _ := http.NewRequest("POST", server.URL+"/v1/knowledge-records", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", 
"application/json") + + createResp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = createResp.Body.Close() }() + + var created models.KnowledgeRecord + err = decodeData(createResp, &created) + require.NoError(t, err) + + t.Run("Delete knowledge record", func(t *testing.T) { + req, _ := http.NewRequest("DELETE", fmt.Sprintf("%s/v1/knowledge-records/%s", server.URL, created.ID), nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusNoContent, resp.StatusCode) + }) + + t.Run("Verify deletion", func(t *testing.T) { + req, _ := http.NewRequest("GET", fmt.Sprintf("%s/v1/knowledge-records/%s", server.URL, created.ID), nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusNotFound, resp.StatusCode) + }) +} + +func TestBulkDeleteKnowledgeRecords(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create test knowledge records with specific tenant_id + for i := 0; i < 3; i++ { + reqBody := map[string]interface{}{ + "content": fmt.Sprintf("Bulk delete test content %d", i), + "tenant_id": "bulk-delete-tenant", + } + body, _ := json.Marshal(reqBody) + req, _ := http.NewRequest("POST", server.URL+"/v1/knowledge-records", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + resp, _ := client.Do(req) + _ = resp.Body.Close() + } + + t.Run("Bulk delete knowledge records by tenant_id", func(t *testing.T) { + req, _ := http.NewRequest("DELETE", server.URL+"/v1/knowledge-records?tenant_id=bulk-delete-tenant", nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = 
resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.BulkDeleteKnowledgeRecordsResponse + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.Equal(t, int64(3), result.DeletedCount) + }) + + t.Run("Bulk delete with no matches returns 0", func(t *testing.T) { + req, _ := http.NewRequest("DELETE", server.URL+"/v1/knowledge-records?tenant_id=non-existent-tenant", nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.BulkDeleteKnowledgeRecordsResponse + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.Equal(t, int64(0), result.DeletedCount) + }) +} + +// ============================================================================= +// Topics Tests +// ============================================================================= + +func TestCreateTopic(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + t.Run("Success with valid API key", func(t *testing.T) { + reqBody := map[string]interface{}{ + "title": "Test Topic", + "level": 1, + "tenant_id": "test-tenant", + } + body, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusCreated, resp.StatusCode) + + var result models.Topic + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.NotEmpty(t, result.ID) + assert.Equal(t, "Test Topic", result.Title) + assert.Equal(t, 1, result.Level) + assert.NotNil(t, result.TenantID) + assert.Equal(t, "test-tenant", *result.TenantID) + }) + + t.Run("Bad request 
with missing title", func(t *testing.T) { + reqBody := map[string]interface{}{ + "level": 1, + "tenant_id": "test-tenant", + } + body, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusBadRequest, resp.StatusCode) + }) +} + +func TestCreateTopicWithLevel(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + t.Run("Create Level 1 topic", func(t *testing.T) { + reqBody := map[string]interface{}{ + "title": "Level 1 Test Topic", + "level": 1, + "tenant_id": "level-test-tenant", + } + body, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusCreated, resp.StatusCode) + + var result models.Topic + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.Equal(t, "Level 1 Test Topic", result.Title) + assert.Equal(t, 1, result.Level) + }) + + t.Run("Create Level 2 topic", func(t *testing.T) { + // First create a parent topic + parentBody := map[string]interface{}{ + "title": "Parent for Level 2", + "level": 1, + "tenant_id": "level-test-tenant", + } + pBody, _ := json.Marshal(parentBody) + pReq, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(pBody)) + pReq.Header.Set("Authorization", "Bearer "+testAPIKey) + pReq.Header.Set("Content-Type", "application/json") + pResp, err := client.Do(pReq) + require.NoError(t, err) + defer func() { _ = pResp.Body.Close() }() + var parent models.Topic + err = 
decodeData(pResp, &parent) + require.NoError(t, err) + + reqBody := map[string]interface{}{ + "title": "Level 2 Test Topic", + "level": 2, + "parent_id": parent.ID, + "tenant_id": "level-test-tenant", + } + body, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusCreated, resp.StatusCode) + + var result models.Topic + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.Equal(t, "Level 2 Test Topic", result.Title) + assert.Equal(t, 2, result.Level) + assert.NotNil(t, result.ParentID) + assert.Equal(t, parent.ID, *result.ParentID) + }) + + t.Run("Create topic with invalid level returns 400", func(t *testing.T) { + reqBody := map[string]interface{}{ + "title": "Invalid Level Topic", + "level": 3, + "tenant_id": "level-test-tenant", + } + body, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusBadRequest, resp.StatusCode) + }) +} + +func TestTopicTitleUniqueness(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create first Level 1 topic + firstReq := map[string]interface{}{ + "title": "Unique Title L1", + "level": 1, + "tenant_id": "uniqueness-test-tenant", + } + body, _ := json.Marshal(firstReq) + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + 
firstResp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = firstResp.Body.Close() }() + + var firstTopic models.Topic + err = decodeData(firstResp, &firstTopic) + require.NoError(t, err) + + t.Run("Create duplicate title at same level returns 409", func(t *testing.T) { + duplicateReq := map[string]interface{}{ + "title": "Unique Title L1", // Same title + "level": 1, + "tenant_id": "uniqueness-test-tenant", + } + body, _ := json.Marshal(duplicateReq) + + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusConflict, resp.StatusCode) + }) + + t.Run("Create same title at different level succeeds", func(t *testing.T) { + // Create Level 2 topic with same title as first Level 1 topic + // Need a parent for the Level 2 topic + parentReq := map[string]interface{}{ + "title": "Another Parent", + "level": 1, + "tenant_id": "uniqueness-test-tenant", + } + pBody, _ := json.Marshal(parentReq) + pReq, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(pBody)) + pReq.Header.Set("Authorization", "Bearer "+testAPIKey) + pReq.Header.Set("Content-Type", "application/json") + pResp, err := client.Do(pReq) + require.NoError(t, err) + defer func() { _ = pResp.Body.Close() }() + var parent models.Topic + err = decodeData(pResp, &parent) + require.NoError(t, err) + + level2Req := map[string]interface{}{ + "title": "Unique Title L1", // Same title as first topic, but different level + "level": 2, + "parent_id": parent.ID, + "tenant_id": "uniqueness-test-tenant", + } + body, _ := json.Marshal(level2Req) + + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", 
"application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusCreated, resp.StatusCode) + }) +} + +func TestListTopics(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create a test topic + reqBody := map[string]interface{}{ + "title": "List Test Topic", + "level": 1, + "tenant_id": "list-test-tenant", + } + body, _ := json.Marshal(reqBody) + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + _, _ = client.Do(req) + + t.Run("List all topics", func(t *testing.T) { + req, _ := http.NewRequest("GET", server.URL+"/v1/topics", nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.ListTopicsResponse + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.NotEmpty(t, result.Data) + }) + + t.Run("List with level filter", func(t *testing.T) { + req, _ := http.NewRequest("GET", server.URL+"/v1/topics?level=1&limit=10", nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.ListTopicsResponse + err = decodeData(resp, &result) + require.NoError(t, err) + + for _, topic := range result.Data { + assert.Equal(t, 1, topic.Level) + } + }) +} + +func TestGetTopic(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create a test topic + reqBody := map[string]interface{}{ + "title": "Get Test Topic", + "level": 1, + } + body, _ := json.Marshal(reqBody) + req, _ := 
http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + createResp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = createResp.Body.Close() }() + + var created models.Topic + err = decodeData(createResp, &created) + require.NoError(t, err) + + t.Run("Get existing topic", func(t *testing.T) { + req, _ := http.NewRequest("GET", fmt.Sprintf("%s/v1/topics/%s", server.URL, created.ID), nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.Topic + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.Equal(t, created.ID, result.ID) + assert.Equal(t, "Get Test Topic", result.Title) + }) + + t.Run("Get non-existent topic", func(t *testing.T) { + req, _ := http.NewRequest("GET", server.URL+"/v1/topics/00000000-0000-0000-0000-000000000000", nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusNotFound, resp.StatusCode) + }) +} + +func TestUpdateTopic(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create a test topic + reqBody := map[string]interface{}{ + "title": "Initial Title", + "level": 1, + "tenant_id": "update-test-tenant", + } + body, _ := json.Marshal(reqBody) + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + createResp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = createResp.Body.Close() }() + + var created models.Topic + err = decodeData(createResp, 
&created) + require.NoError(t, err) + + t.Run("Update topic title", func(t *testing.T) { + updateBody := map[string]interface{}{ + "title": "Updated Title", + } + body, _ := json.Marshal(updateBody) + + req, _ := http.NewRequest("PATCH", fmt.Sprintf("%s/v1/topics/%s", server.URL, created.ID), bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var result models.Topic + err = decodeData(resp, &result) + require.NoError(t, err) + + assert.Equal(t, created.ID, result.ID) + assert.Equal(t, "Updated Title", result.Title) + }) + + t.Run("Update to same title (idempotent) succeeds", func(t *testing.T) { + updateBody := map[string]interface{}{ + "title": "Updated Title", + } + body, _ := json.Marshal(updateBody) + + req, _ := http.NewRequest("PATCH", fmt.Sprintf("%s/v1/topics/%s", server.URL, created.ID), bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + }) + + t.Run("Update title to duplicate returns 409", func(t *testing.T) { + // Create another topic + otherReq := map[string]interface{}{ + "title": "Other Topic", + "level": 1, + "tenant_id": "update-test-tenant", + } + body, _ := json.Marshal(otherReq) + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + otherResp, _ := client.Do(req) + _ = otherResp.Body.Close() + + // Try to update first topic to have same title as other topic + updateBody := map[string]interface{}{ + "title": "Other Topic", + } + body, _ = 
json.Marshal(updateBody) + + req, _ = http.NewRequest("PATCH", fmt.Sprintf("%s/v1/topics/%s", server.URL, created.ID), bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusConflict, resp.StatusCode) + }) +} + +func TestDeleteTopic(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create a test topic + reqBody := map[string]interface{}{ + "title": "To be deleted", + "level": 1, + } + body, _ := json.Marshal(reqBody) + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + createResp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = createResp.Body.Close() }() + + var created models.Topic + err = decodeData(createResp, &created) + require.NoError(t, err) + + t.Run("Delete topic", func(t *testing.T) { + req, _ := http.NewRequest("DELETE", fmt.Sprintf("%s/v1/topics/%s", server.URL, created.ID), nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusNoContent, resp.StatusCode) + }) + + t.Run("Verify deletion", func(t *testing.T) { + req, _ := http.NewRequest("GET", fmt.Sprintf("%s/v1/topics/%s", server.URL, created.ID), nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusNotFound, resp.StatusCode) + }) +} + +func TestDeleteTopicIndependently(t *testing.T) { + server, cleanup := setupTestServer(t) + defer cleanup() + + client := &http.Client{} + + // Create Level 1 topic + 
level1Req := map[string]interface{}{ + "title": "Level 1 to Delete", + "level": 1, + "tenant_id": "delete-test-tenant", + } + body, _ := json.Marshal(level1Req) + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + level1Resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = level1Resp.Body.Close() }() + + var level1Topic models.Topic + err = decodeData(level1Resp, &level1Topic) + require.NoError(t, err) + + // Create Level 2 topic + level2Req := map[string]interface{}{ + "title": "Level 2 Independent", + "level": 2, + "tenant_id": "delete-test-tenant", + } + body, _ = json.Marshal(level2Req) + req, _ = http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + + level2Resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = level2Resp.Body.Close() }() + + var level2Topic models.Topic + err = decodeData(level2Resp, &level2Topic) + require.NoError(t, err) + + t.Run("Delete Level 1 topic deletes its Level 2 children", func(t *testing.T) { + // Create Level 1 topic + level1Req := map[string]interface{}{ + "title": "Unique L1 to Delete " + uuid.New().String(), + "level": 1, + "tenant_id": "delete-test-tenant", + } + body, _ := json.Marshal(level1Req) + req, _ := http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + level1Resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = level1Resp.Body.Close() }() + assert.Equal(t, http.StatusCreated, level1Resp.StatusCode) + var level1Topic models.Topic + err = decodeData(level1Resp, &level1Topic) + require.NoError(t, err) + assert.NotEqual(t, uuid.Nil, level1Topic.ID) + + // Create 
Level 2 topic + level2Req := map[string]interface{}{ + "title": "Level 2 Child", + "level": 2, + "parent_id": level1Topic.ID, + "tenant_id": "delete-test-tenant", + } + body, _ = json.Marshal(level2Req) + req, _ = http.NewRequest("POST", server.URL+"/v1/topics", bytes.NewBuffer(body)) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + req.Header.Set("Content-Type", "application/json") + level2Resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = level2Resp.Body.Close() }() + var level2Topic models.Topic + err = decodeData(level2Resp, &level2Topic) + require.NoError(t, err) + + // Delete Level 1 topic + req, _ = http.NewRequest("DELETE", fmt.Sprintf("%s/v1/topics/%s", server.URL, level1Topic.ID), nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + resp, err := client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + assert.Equal(t, http.StatusNoContent, resp.StatusCode) + + // Verify Level 2 topic is also deleted (cascade) + req, _ = http.NewRequest("GET", fmt.Sprintf("%s/v1/topics/%s", server.URL, level2Topic.ID), nil) + req.Header.Set("Authorization", "Bearer "+testAPIKey) + resp, err = client.Do(req) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + assert.Equal(t, http.StatusNotFound, resp.StatusCode) + }) +}