Draft
Changes from all commits (27 commits)
dbcdf66
feat(api): add knowledge records and topics to openapi spec
mattinannt Jan 26, 2026
f755b16
feat(api): implement knowledge records and topics Go API
mattinannt Jan 26, 2026
dfb5009
docs: update enrichment.md roadmap to reflect completed implementation
mattinannt Jan 26, 2026
34109ca
fix: update init-db to apply all schema files
BhagyaAmarasinghe Jan 26, 2026
652e5be
feat: add AI enrichment with pgvector embeddings
BhagyaAmarasinghe Jan 26, 2026
bf71d09
refactor: remove sentiment fields that require LLM calls
BhagyaAmarasinghe Jan 26, 2026
7d85640
feat: add topic classification for feedback records
BhagyaAmarasinghe Jan 26, 2026
ec50968
chore: add cors support
mattinannt Jan 26, 2026
60d6156
chore: update openapi yml
mattinannt Jan 26, 2026
5e74cef
feat: implement hierarchical classification with theme and subtopic s…
BhagyaAmarasinghe Jan 27, 2026
6227faf
feat: restore parent_id for topic hierarchy, use embeddings for class…
BhagyaAmarasinghe Jan 27, 2026
f7ca4be
chore: dynamic feedback-records retrieval by topic id
mattinannt Jan 27, 2026
dec18a0
feat: add classification retry worker for unclassified feedback records
BhagyaAmarasinghe Jan 28, 2026
c765902
chore: improve topic and level logic
mattinannt Jan 28, 2026
45490b6
merge: resolve conflicts and remove periodic clustering logic
mattinannt Jan 28, 2026
075cfcd
chore: fix tests
mattinannt Jan 28, 2026
95357f0
feat: add taxonomy generation microservice with UMAP, HDBSCAN, and GP…
BhagyaAmarasinghe Jan 28, 2026
92e6c55
merge: integrate remote changes, keep taxonomy scheduler for per-tena…
BhagyaAmarasinghe Jan 28, 2026
f40a6ba
feat: use direct topic_id lookup for feedback queries (faster)
BhagyaAmarasinghe Jan 28, 2026
1fcfa81
feat: add river background job queue
mattinannt Jan 28, 2026
682516d
Merge remote-tracking branch 'origin/feat/taxonomies' into feat/taxon…
mattinannt Jan 28, 2026
20da745
fix: adjust minimum cluster size for Level 2 subdivision
BhagyaAmarasinghe Jan 28, 2026
e99129e
chore: update agents.md file
mattinannt Jan 28, 2026
5c79c87
chore: update dev environment changes
mattinannt Jan 28, 2026
e941b0d
feat: add configurable multi-level taxonomy depth (1-4 levels)
BhagyaAmarasinghe Jan 28, 2026
4b6fc4c
feat: add real-time topic assignment for new feedback
mattinannt Jan 29, 2026
2bc056e
chore: add feedback_count field and tune clustering parameters
mattinannt Jan 29, 2026
32 changes: 32 additions & 0 deletions .env.example
@@ -19,3 +19,35 @@ PORT=8080
# Default: info
# Valid values: debug, info, warn, error
LOG_LEVEL=info


# OpenAI API key for AI enrichment features
OPENAI_API_KEY=sk-your-openai-api-key-here

# River Job Queue Configuration
# River handles async embedding generation with retries and rate limiting

# Enable River job queue (optional)
# Default: true (when OPENAI_API_KEY is set)
# Set to false to use fire-and-forget goroutines (legacy behavior)
# RIVER_ENABLED=true

# Number of concurrent embedding workers (optional)
# Default: 10
# Controls how many embedding jobs can run simultaneously
# RIVER_WORKERS=10

# Maximum retry attempts for failed jobs (optional)
# Default: 5
# Jobs that fail will be retried with exponential backoff
# RIVER_MAX_RETRIES=5

# Embedding rate limit - OpenAI requests per second (optional)
# Default: 50
# Adjust based on your OpenAI tier limits
# EMBEDDING_RATE_LIMIT=50

# Optional - Taxonomy Service (Python microservice for clustering)
TAXONOMY_SERVICE_URL=http://localhost:8001 # Python microservice URL (default)
TAXONOMY_SCHEDULER_ENABLED=true # Enable periodic scheduler (default: true)
TAXONOMY_POLL_INTERVAL=1m # Scheduler poll frequency (default: 1m)
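
For reference, a minimal sketch of how these River settings could be read into a Go config struct with the defaults documented above. The package, struct, and helper names are assumptions for illustration, not the PR's actual `internal/config` code.

```go
// Hypothetical loader for the River settings documented in .env.example.
// Names (RiverConfig, LoadRiverConfig, getInt, getBool) are assumptions.
package config

import (
	"os"
	"strconv"
)

type RiverConfig struct {
	Enabled            bool // RIVER_ENABLED
	Workers            int  // RIVER_WORKERS
	MaxRetries         int  // RIVER_MAX_RETRIES
	EmbeddingRateLimit int  // EMBEDDING_RATE_LIMIT (OpenAI requests/second)
}

func LoadRiverConfig() RiverConfig {
	return RiverConfig{
		// Default: enabled whenever an OpenAI key is configured.
		Enabled:            getBool("RIVER_ENABLED", os.Getenv("OPENAI_API_KEY") != ""),
		Workers:            getInt("RIVER_WORKERS", 10),
		MaxRetries:         getInt("RIVER_MAX_RETRIES", 5),
		EmbeddingRateLimit: getInt("EMBEDDING_RATE_LIMIT", 50),
	}
}

func getInt(key string, def int) int {
	if v := os.Getenv(key); v != "" {
		if n, err := strconv.Atoi(v); err == nil {
			return n
		}
	}
	return def
}

func getBool(key string, def bool) bool {
	if v := os.Getenv(key); v != "" {
		if b, err := strconv.ParseBool(v); err == nil {
			return b
		}
	}
	return def
}
```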
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
.vscode
.cursor
bin/
api
.env
*.out
*.html
20 changes: 19 additions & 1 deletion AGENTS.md
@@ -2,9 +2,10 @@

## Project Structure & Module Organization
- `cmd/api/` holds the API server entrypoint (`main.go`).
- `internal/` contains core application layers: `api/handlers`, `api/middleware`, `service`, `repository`, `models`, and `config`.
- `internal/` contains core application layers: `api/handlers`, `api/middleware`, `service`, `repository`, `models`, `worker`, and `config`.
- `pkg/` provides shared utilities (currently `pkg/database`).
- `sql/` stores SQL schema files (e.g., `sql/001_initial_schema.sql`).
- `services/` contains microservices (e.g., `services/taxonomy-generator/` Python service).
- `tests/` contains integration tests.

## Build, Test, and Development Commands
@@ -36,3 +37,20 @@
## Security & Configuration Tips
- Configure `API_KEY` and `DATABASE_URL` via `.env` or environment variables.
- Do not commit `.env` or secrets; use `.env.example` as the base.

## Taxonomy Service Architecture
The taxonomy feature uses a Python microservice for ML clustering:

- **Go API** triggers jobs via HTTP to the taxonomy-generator service
- **Python service** writes results directly to Postgres (topics table, feedback_records.topic_id)
- **TaxonomyScheduler** (`internal/worker/`) polls for scheduled jobs and tracks completion
- Config: `TAXONOMY_SERVICE_URL`, `TAXONOMY_SCHEDULER_ENABLED`, `TAXONOMY_POLL_INTERVAL`

To run the taxonomy service:
```bash
cd services/taxonomy-generator
pip install -r requirements.txt
uvicorn src.main:app --port 8001
```

Key endpoints: `POST /v1/taxonomy/{tenant_id}/generate`, `GET /v1/taxonomy/{tenant_id}/status`
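
To make the scheduler flow concrete, here is a rough sketch of a poll loop along the lines described above. It is not the PR's implementation (which lives in `internal/worker/`); the type, field, and method names are assumptions.

```go
// Illustrative sketch of the TaxonomyScheduler poll loop. Field and method
// names are assumptions, not the PR's actual code.
package worker

import (
	"context"
	"fmt"
	"net/http"
	"time"
)

type TaxonomyScheduler struct {
	ServiceURL   string        // TAXONOMY_SERVICE_URL
	PollInterval time.Duration // TAXONOMY_POLL_INTERVAL
	Client       *http.Client

	// ListScheduledTenants returns tenants whose taxonomy jobs are due.
	ListScheduledTenants func(ctx context.Context) ([]string, error)
}

// Run polls on a fixed interval and asks the Python service to regenerate the
// taxonomy for each due tenant. The service writes the resulting topics and
// feedback_records.topic_id values directly to Postgres.
func (s *TaxonomyScheduler) Run(ctx context.Context) {
	ticker := time.NewTicker(s.PollInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			tenants, err := s.ListScheduledTenants(ctx)
			if err != nil {
				continue // transient errors are retried on the next tick
			}
			for _, tenantID := range tenants {
				url := fmt.Sprintf("%s/v1/taxonomy/%s/generate", s.ServiceURL, tenantID)
				req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil)
				if err != nil {
					continue
				}
				if resp, err := s.Client.Do(req); err == nil {
					resp.Body.Close()
				}
			}
		}
	}
}
```

Triggering runs per tenant keeps clustering jobs isolated, which lines up with the per-tenant scheduling mentioned in the commit history.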
47 changes: 47 additions & 0 deletions Dockerfile
@@ -0,0 +1,47 @@
# syntax=docker/dockerfile:1

# Build stage
FROM golang:1.24-alpine AS builder

# Install build dependencies
RUN apk add --no-cache git ca-certificates

WORKDIR /app

# Copy go mod files first for better caching
COPY go.mod go.sum ./

# Allow Go to download the required toolchain version
ENV GOTOOLCHAIN=auto
RUN go mod download

# Copy source code
COPY . .

# Build the binary (GOTOOLCHAIN=auto ensures correct version is used)
RUN CGO_ENABLED=0 GOOS=linux GOTOOLCHAIN=auto go build -ldflags="-w -s" -o /app/bin/api ./cmd/api

# Runtime stage
FROM alpine:3.19 AS runtime

# Install runtime dependencies
RUN apk add --no-cache ca-certificates wget

WORKDIR /app

# Copy binary from builder
COPY --from=builder /app/bin/api /app/api

# Create non-root user
RUN adduser -D -u 1000 appuser
USER appuser

# Expose port
EXPOSE 8080

# Health check
HEALTHCHECK --interval=10s --timeout=5s --start-period=5s --retries=3 \
CMD wget -q --spider http://localhost:8080/health || exit 1

# Run the application
CMD ["/app/api"]
115 changes: 96 additions & 19 deletions Makefile
@@ -1,28 +1,42 @@
.PHONY: help tests tests-coverage build run init-db clean docker-up docker-down docker-clean deps install-tools fmt fmt-check lint dev-setup test-all test-unit schemathesis install-hooks
.PHONY: help tests tests-coverage build run init-db clean docker-up docker-down docker-clean deps install-tools fmt fmt-check lint dev-setup test-all test-unit schemathesis install-hooks backfill-embeddings prod-up prod-down prod-logs taxonomy-dev

# Default target - show help
help:
@echo "Available targets:"
@echo " make help - Show this help message"
@echo " make dev-setup - Set up development environment (docker, deps, tools, schema, hooks)"
@echo " make build - Build the API server"
@echo " make run - Run the API server"
@echo ""
@echo "Development:"
@echo " make dev-setup - Set up dev environment (postgres, deps, tools, schema, hooks)"
@echo " make run - Run Go API server locally"
@echo " make taxonomy-dev - Run Python taxonomy service locally"
@echo " make docker-up - Start dev infrastructure (postgres, pgadmin)"
@echo " make docker-down - Stop dev infrastructure"
@echo " make docker-clean - Stop and remove volumes"
@echo ""
@echo "Production:"
@echo " make prod-up - Start full stack (postgres, api, taxonomy-generator)"
@echo " make prod-down - Stop full stack"
@echo " make prod-logs - View logs from production stack"
@echo ""
@echo "Build & Test:"
@echo " make build - Build API binaries"
@echo " make test-unit - Run unit tests (fast, no database)"
@echo " make tests - Run integration tests"
@echo " make test-all - Run all tests (unit + integration)"
@echo " make tests-coverage - Run tests with coverage report"
@echo " make init-db - Initialize database schema"
@echo " make schemathesis - Run Schemathesis API tests"
@echo ""
@echo "Code Quality:"
@echo " make fmt - Format code with gofumpt"
@echo " make fmt-check - Check if code is formatted"
@echo " make lint - Run linter"
@echo ""
@echo "Utilities:"
@echo " make init-db - Initialize database schema"
@echo " make backfill-embeddings - Backfill embeddings for existing records"
@echo " make deps - Install Go dependencies"
@echo " make install-tools - Install development tools (gofumpt, golangci-lint)"
@echo " make install-tools - Install dev tools (gofumpt, golangci-lint)"
@echo " make install-hooks - Install git hooks"
@echo " make docker-up - Start Docker containers"
@echo " make docker-down - Stop Docker containers"
@echo " make docker-clean - Stop Docker containers and remove volumes"
@echo " make clean - Clean build artifacts"
@echo " make schemathesis - Run Schemathesis API tests (requires API server running)"

# Run all tests (integration tests in tests/ directory)
tests:
@@ -49,7 +63,19 @@ tests-coverage:
build:
@echo "Building API server..."
go build -o bin/api cmd/api/main.go
@echo "Binary created: bin/api"
go build -o bin/backfill cmd/backfill/main.go
@echo "Binaries created: bin/api, bin/backfill"

# Backfill embeddings for existing records
# This enqueues River jobs for all records missing embeddings
backfill-embeddings:
@echo "Backfilling embeddings for existing records..."
@if [ -f .env ]; then \
export $$(grep -v '^#' .env | xargs) && \
go run cmd/backfill/main.go; \
else \
go run cmd/backfill/main.go; \
fi

# Run the API server
run:
@@ -81,25 +107,34 @@ init-db:
echo "Error: DATABASE_URL not found in .env file"; \
exit 1; \
fi && \
psql "$$DATABASE_URL" -f sql/001_initial_schema.sql; \
for f in sql/*.sql; do \
echo "Applying $$f..."; \
psql "$$DATABASE_URL" -f "$$f"; \
done; \
else \
if [ -z "$$DATABASE_URL" ]; then \
echo "Error: DATABASE_URL environment variable is not set"; \
echo "Please set it or create a .env file with DATABASE_URL"; \
exit 1; \
fi && \
psql "$$DATABASE_URL" -f sql/001_initial_schema.sql; \
for f in sql/*.sql; do \
echo "Applying $$f..."; \
psql "$$DATABASE_URL" -f "$$f"; \
done; \
fi
@echo "Database schema initialized successfully"


# Start Docker containers
# Start dev infrastructure (postgres, pgadmin)
docker-up:
@echo "Starting Docker containers..."
@echo "Starting dev infrastructure (postgres, pgadmin)..."
docker compose up -d
@echo "Waiting for services to be ready..."
@echo "Waiting for postgres to be ready..."
@sleep 3
@docker compose ps
@echo ""
@echo "Postgres: localhost:5432"
@echo "pgAdmin: localhost:5050 (admin@formbricks.com / admin)"

# Stop Docker containers
docker-down:
@@ -176,9 +211,51 @@ install-hooks:

# Run everything needed for development
dev-setup: docker-up deps install-tools init-db install-hooks
@echo ""
@echo "Development environment ready!"
@echo "Set API_KEY environment variable for authentication"
@echo "Run 'make run' to start the API server"
@echo ""
@echo "Next steps:"
@echo " Terminal 1: make run # Go API on :8080"
@echo " Terminal 2: make taxonomy-dev # Python service on :8001 (optional)"

# Run Python taxonomy service locally (uses venv + pip, requires Python 3.11+)
# Prefers python3.11 if available, falls back to python3
TAXONOMY_PYTHON := $(shell command -v python3.11 2>/dev/null || command -v python3 2>/dev/null)

taxonomy-dev:
@echo "Starting taxonomy-generator service..."
@$(TAXONOMY_PYTHON) -c "import sys; exit(0 if sys.version_info >= (3, 11) else 1)" 2>/dev/null || \
{ echo "Error: Python 3.11+ required. Install with: brew install python@3.11"; exit 1; }
@cd services/taxonomy-generator && \
if [ ! -f .env ]; then \
echo "Creating .env from .env.example..."; \
cp .env.example .env; \
echo "⚠️ Edit services/taxonomy-generator/.env to set OPENAI_API_KEY"; \
fi && \
if [ ! -d .venv ]; then \
echo "Creating virtual environment..."; \
$(TAXONOMY_PYTHON) -m venv .venv; \
fi && \
. .venv/bin/activate && \
pip install -q -r requirements.txt && \
uvicorn src.main:app --reload --port 8001

# Production: start full stack
prod-up:
@echo "Starting production stack..."
docker compose -f docker-compose.prod.yml up -d --build
@echo "Waiting for services..."
@sleep 5
@docker compose -f docker-compose.prod.yml ps

# Production: stop full stack
prod-down:
@echo "Stopping production stack..."
docker compose -f docker-compose.prod.yml down

# Production: view logs
prod-logs:
docker compose -f docker-compose.prod.yml logs -f

# Run Schemathesis API tests (all phases for thorough local testing)
# Phases: examples (schema examples), coverage (boundary values), stateful (API sequences), fuzzing (random)
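
For context on the `backfill-embeddings` target above: it runs `cmd/backfill/main.go`, which enqueues one River job per feedback record that is still missing an embedding. A hedged sketch of what such a command could look like, assuming River's insert-only client, a `generate_embedding` job kind, and a `feedback_records.embedding` column; none of these names are confirmed by the diff shown here.

```go
// Hypothetical sketch of the backfill command: enqueue one River job per
// record without an embedding. Job kind, args fields, and SQL are assumptions.
package main

import (
	"context"
	"log"
	"os"

	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/riverqueue/river"
	"github.com/riverqueue/river/riverdriver/riverpgxv5"
)

// EmbeddingArgs identifies the record to embed; the worker registered in the
// API server generates the embedding via OpenAI and stores it in Postgres.
type EmbeddingArgs struct {
	FeedbackRecordID string `json:"feedback_record_id"`
}

func (EmbeddingArgs) Kind() string { return "generate_embedding" }

func main() {
	ctx := context.Background()

	pool, err := pgxpool.New(ctx, os.Getenv("DATABASE_URL"))
	if err != nil {
		log.Fatal(err)
	}
	defer pool.Close()

	// Insert-only River client: no workers are registered here; the API
	// server's worker pool picks the jobs up.
	client, err := river.NewClient(riverpgxv5.New(pool), &river.Config{})
	if err != nil {
		log.Fatal(err)
	}

	rows, err := pool.Query(ctx, `SELECT id FROM feedback_records WHERE embedding IS NULL`)
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	for rows.Next() {
		var id string
		if err := rows.Scan(&id); err != nil {
			log.Fatal(err)
		}
		if _, err := client.Insert(ctx, EmbeddingArgs{FeedbackRecordID: id}, nil); err != nil {
			log.Printf("enqueue %s: %v", id, err)
		}
	}
	if err := rows.Err(); err != nil {
		log.Fatal(err)
	}
}
```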
60 changes: 45 additions & 15 deletions README.md
@@ -25,11 +25,12 @@ An open-source Experience Management (XM) database service. Hub is a headless AP
### Current Features

- ✅ **RESTful API** for feedback record CRUD operations
- ✅ **PostgreSQL** for data persistence with optimized schema
- ✅ **PostgreSQL** with pgvector for data persistence and vector search
- ✅ **AI-Powered Taxonomy** - automatic topic clustering with UMAP, HDBSCAN, and GPT-4o
- ✅ **Embedding Generation** - async embedding generation via River job queue
- ✅ **API Key Authentication** via environment variable
- ✅ **Clean Architecture** with repository, service, and handler layers
- ✅ **Docker Compose** for local development
- ✅ **Database Schema** initialization
- ✅ **Docker Compose** for local development and production
- ✅ **Swagger/OpenAPI** documentation
- ✅ **Health Check** endpoints

@@ -47,20 +48,25 @@ An open-source Experience Management (XM) database service. Hub is a headless AP
```
.
├── cmd/
│ └── api/ # API server entrypoint
│ ├── api/ # API server entrypoint
│ └── backfill/ # CLI tool for backfilling embeddings
├── internal/
│ ├── api/
│ │ ├── handlers/ # HTTP request handlers
│ │ └── middleware/ # HTTP middleware (auth, CORS, logging)
│ ├── service/ # Business logic layer
│ ├── repository/ # Data access layer
│ ├── models/ # Domain models and DTOs
│ ├── worker/ # Background workers (taxonomy scheduler)
│ ├── jobs/ # River job queue workers
│ └── config/ # Configuration management
├── pkg/
│ └── database/ # Database utilities and connection pooling
├── services/
│ └── taxonomy-generator/ # Python microservice for ML clustering
├── sql/ # SQL schema files
├── tests/ # Integration tests
└── docs/ # API documentation (Swagger)
├── tests/ # Integration tests
└── docs/ # API documentation (Swagger)
```

## Getting Started
@@ -199,15 +205,25 @@ Authorization: Bearer <api-key>
### Available Make Commands

```bash
make help # Show all available commands
make dev-setup # Set up development environment (docker, deps, tools, schema)
make build # Build all binaries
make run # Run the API server
make tests # Run all tests
make init-db # Initialize database schema
make docker-up # Start Docker containers
make docker-down # Stop Docker containers
make clean # Clean build artifacts
make help # Show all available commands

# Development
make dev-setup # Set up dev environment (postgres, deps, tools, schema, hooks)
make run # Run Go API server locally (port 8080)
make taxonomy-dev # Run Python taxonomy service locally (port 8001)
make docker-up # Start dev infrastructure (postgres, pgadmin)
make docker-down # Stop dev infrastructure

# Production
make prod-up # Start full containerized stack
make prod-down # Stop full stack
make prod-logs # View production logs

# Build & Test
make build # Build all binaries
make test-unit # Fast unit tests (no database)
make tests # Integration tests (requires database)
make test-all # Run all tests
```

### Running Tests
@@ -218,6 +234,20 @@ make tests # Integration tests (requires database)
make test-all # Run all tests
```

### Running with Taxonomy Service

For full AI-powered topic clustering, run both services:

```bash
# Terminal 1: Go API
make run

# Terminal 2: Python taxonomy service
make taxonomy-dev
```

The taxonomy service requires `OPENAI_API_KEY` in `services/taxonomy-generator/.env`.
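
To trigger or inspect a run by hand instead of waiting for the scheduler, you can call the endpoints listed in AGENTS.md (`POST /v1/taxonomy/{tenant_id}/generate`, `GET /v1/taxonomy/{tenant_id}/status`). A small illustrative Go snippet with a placeholder tenant ID; the request and response bodies are not specified in this diff, so treat them as assumptions:

```go
// Illustrative only: fire a taxonomy run for one tenant and print its status.
// "tenant-123" is a placeholder; the response format is whatever the service returns.
package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	base := "http://localhost:8001/v1/taxonomy/tenant-123"

	// Kick off clustering for this tenant.
	resp, err := http.Post(base+"/generate", "application/json", nil)
	if err != nil {
		panic(err)
	}
	resp.Body.Close()

	// Poll the job status; results are written directly to Postgres.
	status, err := http.Get(base + "/status")
	if err != nil {
		panic(err)
	}
	defer status.Body.Close()

	body, _ := io.ReadAll(status.Body)
	fmt.Println(string(body))
}
```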

### Git Hooks

The repository includes pre-commit hooks for code quality. To install them:
Binary file added backfill
Binary file not shown.