diff --git a/src/.codespellignore b/src/.codespellignore index 1b6ede1602..f672c87772 100644 --- a/src/.codespellignore +++ b/src/.codespellignore @@ -13,4 +13,5 @@ SAIs iTerm SOM AKS +aks ACI diff --git a/src/docs.json b/src/docs.json index 019d2ca5ed..51a4c42484 100644 --- a/src/docs.json +++ b/src/docs.json @@ -840,6 +840,42 @@ "langsmith/gcp-self-hosted" ] }, + { + "group": "Deploy with Terraform", + "pages": [ + "langsmith/self-host-terraform", + { + "group": "AWS", + "pages": [ + "langsmith/self-host-terraform-aws-deploy", + "langsmith/self-host-terraform-aws-architecture", + "langsmith/self-host-terraform-aws-variables", + "langsmith/self-host-terraform-aws-quick-reference", + "langsmith/self-host-terraform-aws-troubleshooting" + ] + }, + { + "group": "GCP", + "pages": [ + "langsmith/self-host-terraform-gcp-deploy", + "langsmith/self-host-terraform-gcp-architecture", + "langsmith/self-host-terraform-gcp-variables", + "langsmith/self-host-terraform-gcp-quick-reference", + "langsmith/self-host-terraform-gcp-troubleshooting" + ] + }, + { + "group": "Azure", + "pages": [ + "langsmith/self-host-terraform-azure-deploy", + "langsmith/self-host-terraform-azure-architecture", + "langsmith/self-host-terraform-azure-variables", + "langsmith/self-host-terraform-azure-quick-reference", + "langsmith/self-host-terraform-azure-troubleshooting" + ] + } + ] + }, { "group": "Setup guides", "pages": [ diff --git a/src/images/self-hosted-terraform/aws-architecture.png b/src/images/self-hosted-terraform/aws-architecture.png new file mode 100644 index 0000000000..befcd65675 Binary files /dev/null and b/src/images/self-hosted-terraform/aws-architecture.png differ diff --git a/src/images/self-hosted-terraform/aws-deployment-flow.png b/src/images/self-hosted-terraform/aws-deployment-flow.png new file mode 100644 index 0000000000..c03ca1e0e2 Binary files /dev/null and b/src/images/self-hosted-terraform/aws-deployment-flow.png differ diff --git 
a/src/images/self-hosted-terraform/azure-architecture-light.png b/src/images/self-hosted-terraform/azure-architecture-light.png new file mode 100644 index 0000000000..958a95e1c7 Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture-light.png differ diff --git a/src/images/self-hosted-terraform/azure-architecture-pass2.png b/src/images/self-hosted-terraform/azure-architecture-pass2.png new file mode 100644 index 0000000000..8b096cc3ca Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture-pass2.png differ diff --git a/src/images/self-hosted-terraform/azure-architecture-pass3.png b/src/images/self-hosted-terraform/azure-architecture-pass3.png new file mode 100644 index 0000000000..0fc27a0282 Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture-pass3.png differ diff --git a/src/images/self-hosted-terraform/azure-architecture-pass4-5.png b/src/images/self-hosted-terraform/azure-architecture-pass4-5.png new file mode 100644 index 0000000000..6e930ae17e Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture-pass4-5.png differ diff --git a/src/images/self-hosted-terraform/azure-architecture.png b/src/images/self-hosted-terraform/azure-architecture.png new file mode 100644 index 0000000000..47d3d6439b Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture.png differ diff --git a/src/images/self-hosted-terraform/gcp-architecture.png b/src/images/self-hosted-terraform/gcp-architecture.png new file mode 100644 index 0000000000..80d2a01de2 Binary files /dev/null and b/src/images/self-hosted-terraform/gcp-architecture.png differ diff --git a/src/images/self-hosted-terraform/langsmith-components.png b/src/images/self-hosted-terraform/langsmith-components.png new file mode 100644 index 0000000000..bbe3cece0a Binary files /dev/null and b/src/images/self-hosted-terraform/langsmith-components.png differ diff --git a/src/langsmith/aws-self-hosted.mdx 
b/src/langsmith/aws-self-hosted.mdx index 17659da2ab..bfdcb5e112 100644 --- a/src/langsmith/aws-self-hosted.mdx +++ b/src/langsmith/aws-self-hosted.mdx @@ -14,9 +14,7 @@ This page provides: - [AWS Well-Architected best practices](#aws-well-architected-best-practices) for operational excellence, security, and reliability. -LangChain provides Terraform modules specifically for AWS to help provision infrastructure for LangSmith. These modules can quickly set up EKS clusters, RDS, ElastiCache, S3, and networking resources. - -View the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws) for documentation and examples. +LangChain publishes production-ready [Terraform modules for AWS](https://github.com/langchain-ai/terraform/tree/main/modules/aws) that provision EKS, RDS, ElastiCache, S3, and networking in a single workflow. Start with the [Deploy with Terraform overview](/langsmith/self-host-terraform) to choose between the Terraform and Helm-only paths. ## Initial setup diff --git a/src/langsmith/azure-self-hosted.mdx b/src/langsmith/azure-self-hosted.mdx index 43326bb3fc..136948f81a 100644 --- a/src/langsmith/azure-self-hosted.mdx +++ b/src/langsmith/azure-self-hosted.mdx @@ -14,9 +14,7 @@ This page provides: - [Security and access control](#security-and-access-control) recommendations for Azure deployments. -LangChain provides Terraform modules specifically for Azure to help provision infrastructure for LangSmith. These modules can quickly set up AKS clusters, Azure Database for PostgreSQL, Azure Managed Redis, Blob Storage, and networking resources. - -View the [Azure Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/azure) for documentation and examples. 
+LangChain publishes production-ready [Terraform modules for Azure](https://github.com/langchain-ai/terraform/tree/main/modules/azure) that provision AKS, Azure Database for PostgreSQL, Azure Managed Redis, Blob Storage, and Key Vault in a single workflow. Start with the [Deploy with Terraform overview](/langsmith/self-host-terraform) to choose between the Terraform and Helm-only paths. ## Initial setup diff --git a/src/langsmith/gcp-self-hosted.mdx b/src/langsmith/gcp-self-hosted.mdx index 5324258572..d9a0cd7ff8 100644 --- a/src/langsmith/gcp-self-hosted.mdx +++ b/src/langsmith/gcp-self-hosted.mdx @@ -14,9 +14,7 @@ This page provides: - [Google Cloud Well-Architected best practices](#google-cloud-well-architected-best-practices) for operational excellence, security, and reliability. -LangChain provides Terraform modules specifically for GCP to help provision infrastructure for LangSmith. These modules can quickly set up GKE clusters, Cloud SQL, Memorystore Redis, Cloud Storage, and networking resources. - -View the [GCP Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/gcp) for documentation and examples. +LangChain publishes production-ready [Terraform modules for GCP](https://github.com/langchain-ai/terraform/tree/main/modules/gcp) that provision GKE, Cloud SQL, Memorystore, Cloud Storage, and networking in a single workflow. Start with the [Deploy with Terraform overview](/langsmith/self-host-terraform) to choose between the Terraform and Helm-only paths. ## Initial setup diff --git a/src/langsmith/kubernetes.mdx b/src/langsmith/kubernetes.mdx index 8355c6d3de..8138132719 100644 --- a/src/langsmith/kubernetes.mdx +++ b/src/langsmith/kubernetes.mdx @@ -25,15 +25,9 @@ LangChain has successfully tested LangSmith on the following Kubernetes distribu - OpenShift (4.14+) - Minikube and Kind (for development purposes) - -LangChain provides Terraform modules to help provision infrastructure for LangSmith. 
These modules can quickly set up Kubernetes clusters, storage, and networking for your deployment. - -Available modules: -- [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws) -- [Azure Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/azure) - -View the [full Terraform repository](https://github.com/langchain-ai/terraform) for documentation and additional resources. - + +**Prefer infrastructure as code?** [Deploy with Terraform](/langsmith/self-host-terraform) bundles cluster provisioning, secrets wiring, and the Helm release for AWS, Azure, and GCP into one workflow. The page below covers the Helm-only path against any conformant cluster you already manage. + ## Prerequisites diff --git a/src/langsmith/self-host-terraform-aws-architecture.mdx b/src/langsmith/self-host-terraform-aws-architecture.mdx new file mode 100644 index 0000000000..e9e187c251 --- /dev/null +++ b/src/langsmith/self-host-terraform-aws-architecture.mdx @@ -0,0 +1,326 @@ +--- +title: AWS Terraform architecture +sidebarTitle: Architecture +description: Platform layers, services, IRSA roles, networking, and module dependencies for LangSmith self-hosted on AWS EKS. +--- + +This page documents what the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws) provision and how the modules wire the resulting deployment together. + +## Platform layers + +LangSmith on AWS deploys in two stages with one optional add-on. The infrastructure stage provisions the cloud foundation. The application stage installs the LangSmith Helm chart. The LangSmith Deployment add-on is opt-in and adds the host-backend, listener, and operator services for managing LangGraph applications from the UI. 
+ +LangSmith on AWS service layout + +```txt +LangSmith Deployment add-on (enable_langsmith_deployments = true) + host-backend, listener, operator + Per deployed graph: api-server, queue, redis, postgres (operator-managed) + Requires: KEDA (installed alongside infrastructure via k8s-bootstrap) + +LangSmith application (deploy_langsmith = true) + backend, frontend, playground, queue, ace-backend, clickhouse + Storage: RDS PostgreSQL (metadata) + S3 (trace blobs via VPC endpoint) + Ingress: AWS ALB | NGINX | Envoy Gateway | Istio + +AWS infrastructure + VPC + private/public subnets + single NAT gateway + EKS cluster + managed node group + cluster autoscaler + RDS PostgreSQL (private subnets) + ElastiCache Redis (private subnets) + S3 bucket + VPC Gateway Endpoint (no public route) + ALB controller + EBS CSI driver + metrics server + k8s-bootstrap: KEDA, ESO, optional Envoy Gateway + Optional: Network Firewall, WAF, CloudTrail, ALB access logs +``` + +## Component to storage mapping + +| Component | Storage backend | Access method | +|---|---|---| +| `backend` | RDS PostgreSQL | Private subnet, security group | +| `backend` | S3 bucket | IRSA + VPC Gateway Endpoint | +| `clickhouse` | EBS volume (GP3, EKS PVC) | Local | +| `redis` | ElastiCache or in-cluster | Private subnet, security group | +| LGP operator | RDS PostgreSQL (shared) | Private subnet, security group | + +## Application core services + +These pods run on every deployment. All write logs and metrics; the busier components (backend, queue, ingest-queue) scale horizontally. 
+ +| Service | Purpose | Port | HPA | IRSA | Depends on | +|---|---|---|---|---|---| +| `langsmith-frontend` | React UI | 3000 | 1 to 10 | No | `backend`, `platform-backend` | +| `langsmith-backend` | Main API (traces, runs, projects, API keys, feedback) | 1984 | 3 to 10 | Yes (S3) | Postgres, Redis, ClickHouse, S3 | +| `langsmith-platform-backend` | Org and user management, auth, billing, settings | 1986 | 1 to 10 | Yes (S3) | Postgres, Redis, S3 | +| `langsmith-playground` | LLM prompt playground UI | 3001 | 1 to 10 | No | `backend` | +| `langsmith-queue` | Trace ingestion worker (Redis to ClickHouse + S3) | — | 3 to 10 + KEDA | Yes | Redis, ClickHouse, S3 | +| `langsmith-ingest-queue` | Dedicated high-throughput ingestion worker | — | 3 to 10 + KEDA | Yes | Redis, S3 | +| `langsmith-ace-backend` | Async compute (dataset runs, evaluations, background jobs) | — | 1 to 5 | No | Postgres, Redis | +| `langsmith-clickhouse` | Columnar store (trace spans, run metadata, eval results) | — | StatefulSet, single replica | No | EBS GP3 PVC | + + +In-cluster ClickHouse is dev/POC only (single pod, no replication, no backups). For production use [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse) or a self-managed external cluster. + + +### One-time jobs + +The Helm chart runs three jobs at install and upgrade time: + +| Job | Purpose | +|---|---| +| `langsmith-backend-migrations` | PostgreSQL schema migrations | +| `langsmith-backend-ch-migrations` | ClickHouse schema migrations | +| `langsmith-backend-auth-bootstrap` | Creates the initial org and admin account from `initial_org_admin_password` in `langsmith-config` | + +## LangSmith Deployment add-on + +When `enable_langsmith_deployments = true`, three additional services are installed and a `LangGraphPlatform` CRD is registered. Each deployment the user creates in the LangSmith UI produces a Kubernetes Deployment in the `langsmith` namespace, managed by the operator. 
+ +| Service | Purpose | +|---|---| +| `langsmith-host-backend` | LangGraph control plane API. Manages deployment lifecycle, serves deployment metadata. IRSA for S3 access. | +| `langsmith-listener` | Watches host-backend for deployment state changes, creates and updates `LangGraphPlatform` CRDs. IRSA for S3 access. | +| `langsmith-operator` | Kubernetes operator. Reconciles `LangGraphPlatform` CRDs, creates and deletes Deployments and Services for each agent. | + +## AWS managed services + +When `postgres_source = "external"` and `redis_source = "external"` (the recommended production setting), Terraform provisions the following AWS managed services: + +### RDS PostgreSQL + +- Default size: `db.t3.large`, private subnets, port 5432. +- Holds orgs, users, projects, API keys, settings. +- Secret flow: SSM `/langsmith/{base_name}/postgres-password` → ESO → `langsmith-config`. + +### ElastiCache Redis + +- Default size: `cache.m5.large`, private subnets, TLS port 6379. +- Trace ingestion queue, pub/sub, short-lived cache. +- Secret flow: SSM `/langsmith/{base_name}/redis-auth-token` → ESO → `langsmith-config`. + +### S3 bucket + +- Trace payloads: large inputs and outputs, attachments. +- IRSA via `langsmith_irsa_role` (no static keys). VPC Gateway Endpoint, no public internet. +- Prefixes: `ttl_s/` (short TTL) and `ttl_l/` (long TTL). +- Always required. Disabling blob storage breaks the cluster on large payloads. + +### SSM Parameter Store + +- Centralized secret store for all LangSmith secrets. +- Flow: `source infra/scripts/setup-env.sh` writes secrets to SSM. The ESO `ClusterSecretStore` reads them and projects a `langsmith-config` Kubernetes Secret that the Helm chart mounts via `config.existingSecretName`. +- Prefix: `/langsmith/{name_prefix}-{environment}/`. 
+ +## Cluster infrastructure + +The `k8s-bootstrap` Terraform module installs the cluster-level services that LangSmith depends on: + +| Service | Namespace | IRSA | Purpose | +|---|---|---|---| +| `aws-load-balancer-controller` | `kube-system` | Yes | Provisions the AWS ALB from Kubernetes Ingress objects. Deleting the Ingress deprovisions the ALB and assigns a new DNS name on recreate, which breaks DNS records and OIDC redirect URIs. | +| `cluster-autoscaler` | `kube-system` | Yes | Scales EC2 node groups based on pod scheduling pressure. | +| `ebs-csi-driver` | `kube-system` | Yes | Provisions EBS volumes for PersistentVolumeClaims (used by ClickHouse). | +| KEDA | `keda` | No | Kubernetes Event-driven Autoscaling. Scales `queue` and `ingest-queue` on Redis queue depth. Required for the LangSmith Deployment add-on. | +| cert-manager | `cert-manager` | Optional (Route 53 IRSA when `letsencrypt`) | Automates TLS certificate issuance. Installed always; active for Let's Encrypt only. | +| External Secrets Operator | `external-secrets` | Yes | Syncs SSM parameters into the `langsmith-config` Kubernetes Secret. | + +## IRSA roles + +IRSA replaces static credentials. The EKS cluster's OIDC issuer is the trust anchor; service accounts in `langsmith` and `kube-system` are annotated with role ARNs and pods receive temporary credentials via the EKS token webhook. 
+ +| Role | Defined in | Used by | Permissions | +|---|---|---|---| +| `langsmith_irsa_role` | `modules/eks` | `backend`, `platform-backend`, `queue`, `ingest-queue`, host-backend, listener | `s3:GetObject`, `s3:PutObject`, `s3:DeleteObject`, `s3:ListBucket` on the LangSmith bucket | +| `aws_iam_role.eso` | `aws/infra/main.tf` | ESO controller | `ssm:GetParameter`, `ssm:GetParameters` on `/langsmith/*` | + +## Network topology + +### Default — ALB ingress + +```txt +Internet + → AWS Application Load Balancer (port 80 or 443, TLS via ACM or Let's Encrypt) + → EKS Cluster (private subnets) + • kube-system: aws-load-balancer-controller, cluster-autoscaler, ebs-csi-driver, keda + • langsmith: backend, frontend, playground, queue, clickhouse + redis (in-cluster) OR ElastiCache (private subnet) + RDS PostgreSQL (private subnet) + S3 bucket (VPC Gateway Endpoint, no public route) +``` + +### Envoy Gateway — opt-in + +```txt +Internet + → AWS Network Load Balancer (NLB, ACM TLS termination at 443) + → envoy-gateway-system: Envoy proxy (GatewayClass: eg, Gateway: langsmith-gateway) + → langsmith namespace: backend, frontend, playground, queue, clickhouse, ... + → langsmith-agents namespace (optional dataplane): langgraph-dataplane listener + operator + agent pods + (HTTPRoute attaches to shared langsmith-gateway via allowedRoutes: All) +``` + +### Egress path with Network Firewall + +When `create_firewall = true`, all outbound internet traffic from private subnets is inspected before reaching the NAT gateway: + +```txt +EKS pods / RDS / ElastiCache (private subnets) + → AWS Network Firewall (TLS SNI + HTTP Host inspection) + ALLOWLIST: firewall_allowed_fqdns (default: beacon.langchain.com) + DROP: all other established connections + → NAT Gateway (public subnet) + → Internet +``` + +Pod-to-pod, pod-to-RDS, and pod-to-ElastiCache traffic uses the local VPC route and never touches the firewall. 
+ +## Ingress options + +Four mutually exclusive ingress options ship with the modules. The choice determines whether split dataplane (agent pods in a separate namespace) is supported. + +| Option | Variable | Split dataplane | Traffic path | When to use | +|---|---|---|---|---| +| ALB (AWS LBC) | _default_ | No | `ALB → frontend NodePort` | Default. Single-namespace deployments, POC, simplest TLS via ACM. | +| NGINX Ingress | `enable_nginx_ingress = true` | No | `ALB → TGB → NGINX controller → frontend ClusterIP` | When NGINX is the standard ingress in your organization. | +| Envoy Gateway | `enable_envoy_gateway = true` | Yes | `ALB → TGB → Envoy proxy:10080 → HTTPRoute → services` | Cross-namespace HTTPRoute routing. Recommended for split dataplane on new AWS deployments. | +| Istio | `enable_istio_gateway = true` | Yes | `ALB → TGB → istio-ingressgateway:80 → VirtualService → services` | Clusters with Istio already installed, or when an mTLS mesh is required. | + +### Why ALB cannot support split dataplane + +Standard Kubernetes Ingress is namespace-scoped. The ALB controller routes only to services in the same namespace as the Ingress resource. Agent pods in `langsmith-agents` are invisible to an Ingress in `langsmith`. Envoy Gateway and Istio both support cross-namespace routing via the Kubernetes Gateway API. + +### ALB plus Envoy Gateway (chained) + +When the existing ALB already provides SSO (Okta or Cognito OIDC), WAF, and TLS, Envoy Gateway slots in behind it instead of replacing it: + +```txt +Internet + → ALB (unchanged: WAF, SSO, TLS, DNS) + → Envoy Gateway NLB (internal-scheme, auto-provisioned by k8s-bootstrap) + → HTTPRoute → langsmith namespace (control plane) + → HTTPRoute → langsmith-agents namespace (split dataplane) +``` + +The only change from the default ALB path is retargeting the ALB target group to the Envoy NLB. See `helm/values/examples/langsmith-values-ingress-envoy-gateway.yaml` in the modules repo for the values overlay. 
+ +## TLS and DNS + +The `tls_certificate_source` variable controls the certificate strategy: + +| Mode | Behavior | Compatible gateways | +|---|---|---| +| `none` | HTTP only, no certificate | Any | +| `acm` | HTTPS:443 with HTTP→HTTPS redirect. ACM certificate, auto-provisioned or BYO. | ALB, NGINX | +| `letsencrypt` | HTTPS via cert-manager + Let's Encrypt DNS-01 (Route 53 IRSA) | Istio, Envoy | + +### Why ACM versus cert-manager + +ACM certificates are non-exportable. AWS attaches them directly to the ALB, which makes ACM the right choice when TLS terminates at the ALB. ACM cannot be used when TLS terminates inside the cluster (Istio Gateway, Envoy Gateway) because those gateways require the certificate material as a Kubernetes Secret. + +cert-manager handles in-cluster TLS for Istio and Envoy. The `letsencrypt` value is a reference implementation: it installs cert-manager and a Let's Encrypt ACME `ClusterIssuer`. In production, swap the `ClusterIssuer` for any cert-manager-compatible issuer. + +| Issuer | When to use | +|---|---| +| Let's Encrypt _(default)_ | Public domain, internet access, free | +| ACM Private CA (`aws-privateca-issuer`) | AWS-native, air-gap friendly, private domains, paid | +| Venafi (`cert-manager-venafi`) | Enterprise PKI, regulated environments | +| HashiCorp Vault (`cert-manager-vault`) | Self-hosted PKI | +| DigiCert, Sectigo, others | ACME or custom issuer plugins | + +The Terraform module provisions the cert-manager IRSA role and Route 53 permissions. Only the `ClusterIssuer` manifest changes between issuers. + +### Auto-provisioned DNS + +When `langsmith_domain` is set and `acm_certificate_arn` is empty, Terraform activates the `dns` module which creates: + +- A Route 53 hosted zone for the domain. +- An ACM certificate with DNS validation records. +- A Route 53 alias record pointing the domain to the ALB. + +**Staged deploy pattern:** Set `langsmith_domain` with `tls_certificate_source = "none"` first. 
Terraform creates the hosted zone and certificate without blocking on validation. Delegate the NS records at your registrar, then flip to `tls_certificate_source = "acm"` in a later apply. Terraform blocks until the certificate validates and wires it into the HTTPS listener. + +### Bring your own certificate + +Set `acm_certificate_arn` directly to skip the `dns` module. For in-cluster gateways, create a Kubernetes TLS Secret manually and reference it in the Gateway or VirtualService. + +## Module dependency graph + +```txt +vpc ─► firewall (optional, create_firewall = true) +│ +├─► eks ─► k8s-bootstrap (KEDA, ESO, Envoy Gateway [opt-in]) +│ └─► cert-manager (Let's Encrypt DNS-01 via Route 53 IRSA) +│ +├─► postgres (RDS, private subnets from VPC) +├─► redis (ElastiCache, private subnets from VPC) +├─► storage (S3 bucket + VPC Gateway Endpoint) +├─► alb (pre-provisioned ALB, public subnets) +│ └─► alb_access_logs (S3 bucket for access logs, opt-in) +├─► dns (Route 53 zone + ACM cert, optional) +├─► bastion (jump host for private EKS access, optional) +├─► cloudtrail (audit logging, optional) +├─► waf (WAF ACL on ALB, optional) +└─► firewall (Network Firewall egress filter, optional) + all ─► langsmith (root module) +``` + +### Opt-in security modules + +| Module | Variable | Default | Purpose | +|---|---|---|---| +| Network Firewall | `create_firewall` | `false` | FQDN-based egress filtering. Allows only domains in `firewall_allowed_fqdns` (TLS SNI + HTTP Host). Requires `create_vpc = true`. Cost ≈ `$0.40/hr/endpoint + $0.065/GB processed`. | +| ALB access logs | `alb_access_logs_enabled` | `false` | Traffic analysis and compliance | +| CloudTrail | `create_cloudtrail` | `false` | API call logging. Skip if an organization trail already exists. 
| +| WAF | `create_waf` | `false` | WAFv2 Web ACL — OWASP Top 10, IP reputation, known bad inputs | + +## Default resource sizes + +| Resource | Default | vCPU | Memory | +|---|---|---|---| +| EKS node | `m5.4xlarge` | 16 | 64 GB | +| RDS PostgreSQL | `db.t3.large` | 2 | 8 GB | +| ElastiCache Redis | `cache.m6g.xlarge` | 4 | 13.07 GB | +| RDS storage | 10 GB | — | — | + +For production sizing recommendations, see the [scaling guide](/langsmith/self-host-scale) and the [AWS deployment guide](/langsmith/self-host-terraform-aws-deploy#cluster-sizing-reference). + +## Validated behaviors and known constraints + +These constraints were validated during the April 2026 gateway permutation test run. + +| # | Area | Constraint or fix | +|---|---|---| +| 1 | ACM wildcard SANs | `langchain.com` has `0 issue "amazon.com"` CAA but not `0 issuewild "amazon.com"`. Wildcard SANs fail with `CAA_ERROR`. The `dns` module requests only the apex domain. | +| 2 | In-cluster Redis | The LangSmith Helm chart deploys Redis without `requirepass`. The `k8s_bootstrap` module writes `redis://langsmith-redis:6379`. Do not add an auth token unless you also configure the Helm chart Redis values. | +| 3 | `name_prefix` length | Maximum 15 characters. Names like `dz-nginx-tst` (12 characters) are valid. | +| 4 | Istio port | Istio 1.23+ ingressgateway listens on port 80 via `NET_BIND_SERVICE`, not port 8080. ALB TGB health check and security group rules must target port 80. | +| 5 | NGINX TGB port | NGINX ingress-nginx controller pods listen on port 80. The TargetGroupBinding target type is `ip`. | +| 6 | Envoy proxy port | Envoy proxy pods listen on port 10080 (not 80) when running as non-root. The TGB `servicePort` must be 10080. | +| 7 | Destroy order | Always run `terraform destroy` first and let Terraform handle namespace and Helm release lifecycle. Pre-deleting namespaces causes the `helm_release` resource to time out because Helm cannot uninstall cleanly into a terminating namespace. 
| +| 8 | Stuck terminating namespaces | KEDA's stale `external.metrics.k8s.io/v1beta1` API group causes `NamespaceDeletionDiscoveryFailure`. Fix: `kubectl delete apiservice v1beta1.external.metrics.k8s.io` before re-running `terraform destroy`. | + +## Verification commands + +```bash +# EKS cluster status +aws eks describe-cluster --name <cluster-name> --query "cluster.status" + +# Node health +kubectl get nodes -o wide + +# ALB status +kubectl get ingress -n langsmith + +# RDS status +aws rds describe-db-instances \ + --query "DBInstances[?DBInstanceIdentifier=='<db-instance-id>'].DBInstanceStatus" + +# ElastiCache status +aws elasticache describe-replication-groups \ + --query "ReplicationGroups[?ReplicationGroupId=='<replication-group-id>'].Status" + +# S3 access from a pod (via VPC endpoint) +kubectl run s3-test --rm -it --image=amazon/aws-cli -n langsmith -- \ + aws s3 ls s3://<bucket-name> +``` diff --git a/src/langsmith/self-host-terraform-aws-deploy.mdx b/src/langsmith/self-host-terraform-aws-deploy.mdx new file mode 100644 index 0000000000..7c4afec8ac --- /dev/null +++ b/src/langsmith/self-host-terraform-aws-deploy.mdx @@ -0,0 +1,425 @@ +--- +title: Deploy LangSmith on AWS with Terraform +sidebarTitle: Deploy +description: End-to-end walkthrough for provisioning LangSmith self-hosted on AWS EKS using the LangChain Terraform modules. +--- + +Provision the AWS cloud foundation and install LangSmith with the public Terraform modules at [github.com/langchain-ai/terraform/tree/main/modules/aws](https://github.com/langchain-ai/terraform/tree/main/modules/aws). Plan for 30 to 40 minutes end to end on a clean account. + +The deployment runs in two stages: infrastructure (Terraform provisions VPC, EKS, RDS, ElastiCache, S3, IAM) and application (Helm installs the LangSmith chart against the cluster). Add-ons are enabled with a flag and a redeploy.
+ +## Prerequisites + +### Required tools + +| Tool | Version | Purpose | +|---|---|---| +| AWS CLI | v2 | Authenticate, query AWS resources, manage EKS kubeconfig | +| Terraform | 1.5 | Run the infrastructure modules | +| `kubectl` | 1.28 | Inspect the EKS cluster | +| Helm | 3.12 | Install and manage the LangSmith chart | +| `eksctl` | latest | Optional, handy for kubeconfig and debugging | + +Install on macOS: + +```bash +brew install awscli kubectl helm eksctl +brew tap hashicorp/tap && brew install hashicorp/tap/terraform +``` + +Verify each tool is on `PATH`: + +```bash +aws --version +terraform version +kubectl version --client +helm version +``` + +For Linux, follow the [AWS CLI install guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) and use your distribution's package manager for the remaining tools. + +### Required AWS IAM permissions + +The IAM user or role running Terraform needs permission to create and manage the cloud foundation. The following managed policies cover the full surface area. Use them as a starting point and trim down to least-privilege once the deployment is stable. + +| Policy | Purpose | +|---|---| +| `AmazonEKSClusterPolicy` | Create and manage EKS clusters | +| `AmazonVPCFullAccess` | Create VPC, subnets, route tables, and NAT | +| `AmazonRDSFullAccess` | Create and manage RDS PostgreSQL instances | +| `AmazonElastiCacheFullAccess` | Create ElastiCache Redis clusters | +| `AmazonS3FullAccess` | Create S3 buckets and VPC endpoints | +| `IAMFullAccess` | Create IRSA roles and policies | + + +Run `make preflight` from `modules/aws/` after authenticating. The preflight script confirms that the active credentials can perform each required action and reports the first missing permission, which is faster than discovering gaps mid-`terraform apply`. 
+ + +### Authenticate + +Configure AWS credentials with the CLI: + +```bash +aws configure +``` + +Or export environment variables: + +```bash +export AWS_ACCESS_KEY_ID="..." +export AWS_SECRET_ACCESS_KEY="..." +export AWS_DEFAULT_REGION="us-west-2" +``` + +Confirm the credentials work and the target region is enabled in the account: + +```bash +aws sts get-caller-identity +aws ec2 describe-availability-zones --query 'AvailabilityZones[].ZoneName' --output table +``` + +### License key and domain + +Two non-AWS items must be ready before `terraform apply`: + +- **LangSmith license key.** [Contact sales](https://www.langchain.com/contact-sales) to request one. The key is stored in AWS SSM Parameter Store by the setup script, not in `tfvars`. +- **Domain or subdomain** that you control and can point at the LangSmith load balancer, plus an ACM certificate covering it (or set the `tls_certificate_source` variable to `letsencrypt` or `none` instead). + +### Cluster sizing reference + +The Terraform modules pick instance types and node counts based on `sizing_profile`. Plan capacity for the target tier before deploying. + +| Profile | EKS nodes | RDS instance | ElastiCache | Use case | +|---|---|---|---|---| +| `dev` | 2 × `m5.xlarge` | `db.t4g.medium` | `cache.t4g.small` | Demos, CI, short-lived POCs | +| `production` | 3 × `m5.2xlarge` (HPA on) | `db.m6g.large` | `cache.m6g.large` | Standard production | +| `production-large` | 6 × `m5.4xlarge` (HPA on) | `db.m6g.2xlarge` | `cache.m6g.xlarge` | High-volume, multi-tenant | + + +For `production` and `production-large`, also plan to provision external [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse) or a self-managed external ClickHouse cluster. In-cluster ClickHouse is supported for `dev` only. + + +## Rapid path + +For the fastest path from zero to a running LangSmith instance, run these commands in order: + +```bash +# 1. Clone the public modules +git clone https://github.com/langchain-ai/terraform.git +cd terraform/modules/aws + +# 2.
Generate terraform.tfvars interactively (Enter accepts current values) +make quickstart + +# 3. Load secrets into SSM Parameter Store +# Must be sourced, not executed +source infra/scripts/setup-env.sh + +# 4. Provision infrastructure (~20 to 25 min) +make init +make plan +make apply + +# 5. Configure kubectl +make kubeconfig +kubectl get nodes + +# 6. Deploy LangSmith via Helm (~5 to 10 min) +make init-values +make deploy + +# 7. Confirm +kubectl get pods -n langsmith +kubectl get ingress -n langsmith +``` + +To chain infrastructure and application in one command: + +```bash +make quickdeploy # interactive, prompts before terraform apply +make quickdeploy-auto # non-interactive, auto-approves terraform +``` + +`make quickdeploy` runs `terraform apply` → `kubeconfig` → `init-values` → `helm deploy` in sequence. If any step fails, the command exits with instructions for resuming from that step. + +The sections below cover each phase in detail. + +## Provision infrastructure + +Provisioning the AWS cloud foundation takes 20 to 25 minutes on a clean account. Do not interrupt the apply. + +### What gets provisioned + +| Resource | Purpose | +|---|---| +| VPC + subnets + NAT | Private network for the cluster and managed services | +| EKS cluster + node groups | Kubernetes compute | +| RDS PostgreSQL | LangSmith operational data | +| ElastiCache Redis | Queue and cache | +| S3 bucket + VPC endpoint | Trace payload blob storage | +| ALB + listeners | Public ingress with TLS | +| SSM Parameter Store entries | Application secrets, synced into the cluster by External Secrets Operator | +| IRSA roles + IAM policies | Per-service AWS access | +| KEDA, cert-manager, ESO | Bootstrap workloads installed alongside infrastructure | + +### Clone and configure + +```bash +git clone https://github.com/langchain-ai/terraform.git +cd terraform/modules/aws +``` + +All subsequent commands run from `modules/aws/`. Run `make help` for the full target list. 
+ +Generate `terraform.tfvars` with the interactive wizard: + +```bash +make quickstart +``` + +The wizard prompts for naming prefix, region, EKS sizing, TLS source, external vs in-cluster services, and the optional add-on flags. It writes `infra/terraform.tfvars`. Re-running the wizard pre-selects existing values; press Enter at each prompt to keep the current config. + +Prefer to edit by hand? Copy the example and fill in the required fields: + +```bash +cp infra/terraform.tfvars.example infra/terraform.tfvars +vi infra/terraform.tfvars +``` + +The minimum required variables: + +```hcl +name_prefix = "acme" +environment = "prod" +region = "us-west-2" + +eks_cluster_version = "1.31" +eks_managed_node_groups = { + default = { + name = "node-group-default" + instance_types = ["m5.4xlarge"] + min_size = 3 + max_size = 10 + } +} + +postgres_source = "external" +redis_source = "external" + +tls_certificate_source = "acm" +acm_certificate_arn = "arn:aws:acm:us-west-2:<account-id>:certificate/<certificate-id>" +langsmith_domain = "langsmith.example.com" +``` + +See the [AWS variables reference](/langsmith/self-host-terraform-aws-variables) for every input variable. + + +Configure a remote state backend before applying. Edit `infra/backend.tf` to point at an S3 bucket and DynamoDB lock table you control. The Terraform repo ships a local backend by default for first-time evaluations. + + +### Load secrets into SSM Parameter Store + +```bash +source infra/scripts/setup-env.sh +``` + +The script reads `terraform.tfvars`, derives the SSM path `/langsmith/{name_prefix}-{environment}/`, then for each secret either reuses an exported value, reads the existing SSM parameter, auto-generates one (for salts and tokens), or prompts you. Values the script cannot derive or generate, such as the license key and admin password, are supplied interactively. The script must be sourced (not executed, and not run through `make`) because a child process cannot export environment variables back to the parent shell.
+ +The script manages the following SSM parameters: + +| SSM key | How it is set | Notes | +|---|---|---| +| `postgres-password` | Prompt | RDS uses this password | +| `redis-auth-token` | Auto-generated (`openssl rand -hex 32`) | ElastiCache requires hex | +| `langsmith-api-key-salt` | Auto-generated (`openssl rand -base64 32`) | Never rotate, breaks all API keys | +| `langsmith-jwt-secret` | Auto-generated (`openssl rand -base64 32`) | Never rotate, invalidates all sessions | +| `langsmith-license-key` | Prompt | From your LangChain account team | +| `langsmith-admin-password` | Prompt | Must contain a symbol | +| `deployments-encryption-key` | Auto-generated Fernet key | LangSmith Deployment add-on | +| `agent-builder-encryption-key` | Auto-generated Fernet key | Agent Builder add-on | +| `insights-encryption-key` | Auto-generated Fernet key | Insights add-on | +| `polly-encryption-key` | Auto-generated Fernet key | Polly add-on | + +Verify the secrets are present and the `TF_VAR_*` environment variables are exported: + +```bash +make secrets +``` + +### Apply + +```bash +make init +make plan +make apply +``` + +`make plan` shows the proposed diff. Review the output before applying. `make apply` provisions in dependency order: VPC and security groups, then EKS (about 12 minutes) and RDS (about 8 minutes, in parallel), then node groups, ElastiCache, S3, and the ALB. + +### Configure kubectl + +```bash +make kubeconfig +kubectl get nodes +kubectl get pods -n kube-system +``` + +All nodes should report `Ready` and the core add-ons (CoreDNS, kube-proxy, VPC CNI, KEDA, cert-manager, ESO) should be `Running`. + +## Deploy LangSmith + +Two deployment paths are supported. Pick one. + +### Script-driven Helm deploy (recommended) + +Best for most deployments. Interactive prompts guide you through sizing and product choices. 
+ +```bash +cd modules/aws + +make init-values +make deploy +``` + +`init-values.sh` prompts for the admin email, then reads `sizing_profile` and the `enable_*` flags from `terraform.tfvars` and copies the matching values files from `helm/values/examples/` into `helm/values/`. On re-runs it preserves your choices and refreshes Terraform outputs. + +`make deploy` runs `helm/scripts/deploy.sh`, which: + +1. Refreshes the kubeconfig. +2. Runs preflight checks (AWS credentials, cluster reachability, the `langchain` Helm repo). +3. Applies the External Secrets Operator `ClusterSecretStore` and `ExternalSecret` so the cluster reads secrets directly from SSM. +4. Installs the LangSmith Helm chart with the layered values files. + +Expect 5 to 10 minutes for the chart to install and pods to become ready. + +#### Verify + +```bash +kubectl get pods -n langsmith +kubectl get ingress -n langsmith +``` + +When all pods are `Running` and the ingress shows the ALB DNS name, the deployment is ready. Use the domain you configured in `langsmith_domain` (or the ALB DNS name) to reach the UI. + +### Terraform-managed Helm deploy + +Best for teams that want the full deployment in Terraform state, or for "bring your own infrastructure" scenarios. The `app/` module manages the External Secrets Operator wiring, the `helm_release`, and feature toggles directly. 
+ +```bash +cd modules/aws + +# Generate Helm values files from templates (required, the app module reads these) +make init-values + +# Pull infra outputs into app/infra.auto.tfvars.json +make init-app + +# Configure app-specific settings +cp app/terraform.tfvars.example app/terraform.tfvars +# Edit app/terraform.tfvars, set admin_email, sizing, and feature toggles + +# Deploy +make plan-app +make apply-app +``` + +The `app/terraform.tfvars` file controls the application configuration: + +```hcl +admin_email = "admin@example.com" +sizing = "production" # production | production-large | dev | none +enable_agent_deploys = true +enable_agent_builder = true +enable_insights = true +enable_polly = true +clickhouse_host = "clickhouse.example.com" +``` + + +`make init-values` is required before `make plan-app`. The app module reads the values files from `helm/values/` and `init-values` populates them from `helm/values/examples/` based on the sizing and add-on choices in `infra/terraform.tfvars`. + + +For "bring your own infrastructure", skip `make init-app` and set all variables manually in `app/terraform.tfvars`. + +## Enable add-ons + +Each add-on is gated by a flag in `infra/terraform.tfvars`. Set the flag, re-run `make init-values` to copy the matching values file, then re-run `make deploy`. + +```hcl +enable_deployments = true # LangGraph Platform (required for Agent Builder and Polly) +enable_agent_builder = true # Agent Builder UI +enable_insights = true # ClickHouse-backed analytics +enable_polly = true # Polly AI eval and monitoring +enable_usage_telemetry = false # Extended usage telemetry +``` + +```bash +make init-values +make deploy +``` + +For details on each add-on, see [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform). + +## Optional: private EKS cluster with bastion + +For deployments that must run a fully private EKS API endpoint, the modules ship a bastion host pattern: + +1. 
First, run from your workstation with `create_bastion = true` and `enable_public_eks_cluster = true` so the bastion can be created. +2. After the initial deployment, set `enable_public_eks_cluster = false` and re-apply. The EKS API endpoint becomes private only. +3. All subsequent Terraform work happens on the bastion. SSM into it, clone the repo, copy your `terraform.tfvars` and SSM secrets, then run the deployment from there. + +```hcl +enable_public_eks_cluster = false +create_bastion = true + +# Optional SSH access (SSM is the default and requires no key): +# bastion_key_name = "my-keypair" +# bastion_enable_ssh = true +# bastion_ssh_allowed_cidrs = ["203.0.113.0/24"] +``` + +Connect via SSM Session Manager: + +```bash +terraform output bastion_ssm_command +aws ssm start-session --target --region us-west-2 +``` + + +The bastion lives in a public subnet for SSM agent connectivity but does not need a public IP if your VPC has the SSM, SSMMessages, and EC2Messages VPC endpoints. The bastion comes preinstalled with `kubectl`, `helm`, `terraform`, `git`, and `jq`, with kubeconfig already configured for the EKS cluster. Install the [Session Manager plugin](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html) for the AWS CLI on your workstation. + + +## Optional: Envoy Gateway ingress + +The default ingress is the AWS Load Balancer Controller (ALB). Set `enable_envoy_gateway = true` in `terraform.tfvars` to install [Envoy Gateway](https://gateway.envoyproxy.io/) instead. Envoy Gateway is required for multi-namespace dataplane deployments where the `langgraph-dataplane` chart runs in its own namespace. 
+ +```hcl +# infra/terraform.tfvars +enable_envoy_gateway = true +``` + +```bash +source infra/scripts/setup-env.sh +make apply + +make init-values +cp helm/values/examples/langsmith-values-ingress-envoy-gateway.yaml helm/values/ +make deploy +``` + +The deploy script annotates the Envoy Gateway NLB service with the ACM certificate ARN automatically when `tls_certificate_source = "acm"`. TLS terminates at the NLB; Envoy sees plain HTTP internally. + +When running the dataplane chart in a separate namespace, apply the RBAC manifest once per dataplane namespace: + +```bash +kubectl apply -f helm/values/dataplane-rbac.yaml +``` + +This grants the `langsmith-host-backend` ServiceAccount read access to pods, pod logs, deployments, and ReplicaSets in the dataplane namespace. Without it, agent run logs do not stream in the LangSmith UI. + +## Next steps + +- Reference the [AWS variables](/langsmith/self-host-terraform-aws-variables) and the [quick reference](/langsmith/self-host-terraform-aws-quick-reference). +- Review the [AWS architecture](/langsmith/self-host-terraform-aws-architecture) for platform layers, IRSA, and module dependencies. +- When something breaks, check the [AWS troubleshooting guide](/langsmith/self-host-terraform-aws-troubleshooting). +- Enable agent deployment in the UI with [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform). diff --git a/src/langsmith/self-host-terraform-aws-quick-reference.mdx b/src/langsmith/self-host-terraform-aws-quick-reference.mdx new file mode 100644 index 0000000000..40a17ce372 --- /dev/null +++ b/src/langsmith/self-host-terraform-aws-quick-reference.mdx @@ -0,0 +1,301 @@ +--- +title: AWS Terraform quick reference +sidebarTitle: Quick reference +description: Make targets, Terraform commands, kubectl, AWS CLI, and Helm operations for LangSmith self-hosted on AWS EKS. 
+--- + +Command cheat sheet for day-to-day operations against an AWS LangSmith deployment provisioned with the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws). All `make` targets run from `modules/aws/`. Run `make help` for an inline summary. + +For the full deployment walkthrough, see the [AWS deployment guide](/langsmith/self-host-terraform-aws-deploy). + +## First-time setup + +```bash +cd terraform/modules/aws + +# 1. Generate terraform.tfvars (interactive wizard) +make quickstart + +# 2. Load secrets into SSM Parameter Store and export TF_VAR_* into your shell. +# Must use `source` — Make runs each target in a subshell. +source infra/scripts/setup-env.sh + +# 2a. Confirm secrets and TF_VAR_* are set (optional but recommended) +make secrets + +# 3. Provision infrastructure (~20–25 min) +make init +make plan # review — confirm no unexpected destroy/replace actions +make apply + +# 3a. Verify post-infra state (optional) +make preflight-post + +# 4. Update kubeconfig for the EKS cluster +make kubeconfig + +# 5. Generate Helm values from Terraform outputs +make init-values + +# 6. 
Deploy LangSmith (~10 min) +make deploy +``` + +Fast path once `make quickstart` and `source infra/scripts/setup-env.sh` are complete: + +```bash +make quickdeploy # interactive (prompts before terraform apply) +make quickdeploy-auto # non-interactive (auto-approves terraform) +``` + +## Day-2 operations + +```bash +# Check deployment state across all layers; print next-step guidance +make status + +# Re-deploy after editing Helm values or upgrading +make deploy + +# Re-generate Helm values after Terraform changes +make init-values + +# Re-sync ESO secrets without redeploying +make apply-eso + +# Check SSM secrets and TF_VAR_* export status (read-only) +make secrets + +# List all SSM parameters with last-modified timestamps +make secrets-list + +# Manage SSM secrets interactively (view, set, rotate, diff vs cluster) +make ssm + +# Update kubeconfig for the EKS cluster +make kubeconfig +``` + +## Preflight checks + +```bash +# Pre-Terraform: AWS credentials + IAM permissions +make preflight + +# Post-apply: kubectl, SSM params, Helm values, TLS config +make preflight-post + +# SSM only — confirm all parameters are populated (after sourcing infra/scripts/setup-env.sh) +make preflight-ssm +``` + +## Add-ons + +Add-ons are controlled by `enable_*` flags in `infra/terraform.tfvars`. Set the flags, re-run `make init-values` to copy the matching values files, then re-deploy. + +```hcl +# infra/terraform.tfvars +enable_deployments = true # LangGraph Platform (required for Agent Builder and Polly) +enable_agent_builder = true # Agent Builder UI +enable_insights = true # ClickHouse-backed analytics +enable_polly = true # Polly AI eval/monitoring +enable_usage_telemetry = false # Extended usage telemetry +``` + +```bash +make init-values +make deploy +``` + +## Sizing profiles + +Set `sizing_profile` in `terraform.tfvars`, then re-run `make init-values && make deploy`.
+ +```hcl +sizing_profile = "production" # multi-replica with HPA (recommended) +sizing_profile = "production-large" # high-volume (~50 users, ~1000 traces/sec) +sizing_profile = "dev" # single-replica, minimal resources +sizing_profile = "default" # chart defaults (no sizing file) +``` + +## Make targets + +### Setup and secrets + +| Command | Description | +|---|---| +| `make quickstart` | Interactive wizard. Generates `infra/terraform.tfvars` (region, node size, TLS method, add-ons). | +| `make setup-env` | Prints the exact `source` command for loading secrets into your shell. Cannot export variables directly. | +| `make secrets` | Show SSM secrets status (`✓ SET` / `✗ MISSING`) per parameter, check `TF_VAR_*` exports, give next steps. | +| `make secrets-list` | List all SSM parameters for this deployment with last-modified timestamps. | +| `make ssm` | Interactive SSM parameter manager. View, set, rotate, validate, diff vs the cluster Secret. | + +### Preflight + +| Command | Description | +|---|---| +| `make preflight` | Verify AWS credentials, IAM permissions, and required CLI tools before Terraform runs. | +| `make preflight-post` | Run after `make apply`. Checks kubectl context, cluster reachability, SSM params populated, Helm values present, TLS config. | +| `make preflight-ssm` | Check SSM params only. Narrower scope than `preflight-post`. | + +### Infrastructure + +| Command | Description | +|---|---| +| `make init` | `terraform init`. Downloads providers and modules. Safe to re-run. | +| `make plan` | `terraform plan`. Preview changes. Review before every apply. | +| `make apply` | `terraform apply`. Provisions VPC, EKS, RDS, ElastiCache, S3, ALB, IRSA. 20 to 25 minutes. | +| `make destroy` | `terraform destroy`. Tears down all infrastructure. Run `make uninstall` first. | + +### Helm deploy + +| Command | Description | +|---|---| +| `make init-values` | Generate `helm/values/langsmith-values-overrides.yaml` from Terraform outputs. 
Copy add-on values files based on `enable_*` flags. | +| `make deploy` | Deploy or upgrade LangSmith via Helm. Runs preflight, ESO sync, layered values build, and core readiness checks. | +| `make apply-eso` | Re-apply ESO `ClusterSecretStore` and `ExternalSecret` only. Use after rotating secrets without a full Helm redeploy. | +| `make uninstall` | Uninstall the LangSmith Helm release. Terraform infrastructure stays intact. | + +### Terraform-managed Helm + +| Command | Description | +|---|---| +| `make init-app` | Pull live infra Terraform outputs into `app/infra.auto.tfvars.json`. | +| `make plan-app` | `terraform plan` for the `app/` module. Auto-runs `init-app` first. | +| `make apply-app` | Deploy LangSmith Helm release via Terraform (`app/` module). | +| `make destroy-app` | Destroy the Helm release via Terraform. Infrastructure stays intact. | + +### Fast path + +| Command | Description | +|---|---| +| `make quickdeploy` | Full deploy in one command. Chains `terraform apply` → `kubeconfig` → `init-values` → `helm deploy` with gates. | +| `make quickdeploy-auto` | Same as `quickdeploy` but non-interactive. Passes `-auto-approve` to terraform. | +| `make deploy-all` | `make apply` → `make kubeconfig` → `make init-values` → `make deploy` in sequence. | +| `make deploy-all-tf` | `make apply` → `make init-values` → Terraform `app/` plan and apply in sequence. | + +### Utilities + +| Command | Description | +|---|---| +| `make status` | Check deployment state across all layers, print what to run next. | +| `make status-quick` | Same as `status` but skips SSM and Kubernetes queries (faster). | +| `make kubeconfig` | Update `~/.kube/config` with EKS cluster credentials (`aws eks update-kubeconfig`). | +| `make tls` | BYO ACM cert + Route 53 A alias. Use when `langsmith_domain` is set and you need DNS wiring. | +| `make clean` | Remove all local generated and sensitive files. Run after `make destroy`. 
| + +### Testing + +| Command | Description | +|---|---| +| `make test-e2e` | End-to-end gateway tests (ALB or Envoy Gateway) against the current cluster. | +| `make test-permutations` | Permutation tests sequentially on the current cluster. Use `ARGS="1 2 5"` for a subset. | +| `make test-parallel` | Permutation tests in parallel across isolated clusters. Your cluster is untouched. | + +## kubectl + +```bash +# Pod health +kubectl get pods -n langsmith +kubectl get pods -n langsmith -w +kubectl describe pod -n langsmith +kubectl logs -n langsmith --tail=100 -f +kubectl logs -n langsmith --previous --tail=50 + +# ALB and ingress +kubectl get ingress -n langsmith +kubectl describe ingress -n langsmith + +# External Secrets Operator sync status +kubectl get externalsecret langsmith-config -n langsmith + +# TLS +kubectl get certificate -n langsmith +kubectl get challenges -n langsmith +kubectl describe certificate -n langsmith + +# Helm +helm status langsmith -n langsmith +helm history langsmith -n langsmith +helm get values langsmith -n langsmith + +# IRSA — check per-component service account annotations +kubectl get sa -n langsmith -o yaml | grep eks.amazonaws.com + +# LangSmith Deployment (LangGraph Platform) +kubectl get lgp -n langsmith +kubectl get crd | grep langchain +kubectl get pods -n keda +``` + +## AWS CLI + +```bash +# EKS +aws eks list-clusters --region +aws eks describe-cluster --name --region +aws eks update-kubeconfig --region --name + +# RDS +aws rds describe-db-instances \ + --query "DBInstances[?contains(DBInstanceIdentifier,'langsmith')]" + +# ElastiCache +aws elasticache describe-cache-clusters \ + --query "CacheClusters[?contains(CacheClusterId,'langsmith')]" + +# S3 +aws s3 ls s3:// +aws s3api get-bucket-location --bucket + +# ALB +aws elbv2 describe-load-balancers \ + --query "LoadBalancers[?contains(LoadBalancerName,'langsmith')]" + +# VPC endpoint +aws ec2 describe-vpc-endpoints \ + --filters "Name=service-name,Values=com.amazonaws..s3" \ 
+ --query "VpcEndpoints[].State" + +# SSM secrets +aws ssm get-parameters-by-path --path "/langsmith//" --with-decryption + +# IAM role +aws iam get-role --role-name +``` + +## Terraform + +```bash +cd modules/aws/infra + +terraform init +terraform plan +terraform apply +terraform apply -target=module.eks +terraform output +terraform output -raw cluster_name +terraform output -raw alb_dns_name +terraform output -raw langsmith_irsa_role_arn +terraform output -raw bucket_name +terraform state list +``` + +## Teardown + +```bash +cd terraform/modules/aws + +# Option A: script-driven deploy +make uninstall + +# Option B: Terraform-managed deploy +make destroy-app + +# Then destroy infrastructure: +# 1. Set postgres_deletion_protection = false in infra/terraform.tfvars +# 2. Apply the change, then destroy +cd infra +terraform apply +terraform destroy +``` diff --git a/src/langsmith/self-host-terraform-aws-troubleshooting.mdx b/src/langsmith/self-host-terraform-aws-troubleshooting.mdx new file mode 100644 index 0000000000..716907d3ae --- /dev/null +++ b/src/langsmith/self-host-terraform-aws-troubleshooting.mdx @@ -0,0 +1,435 @@ +--- +title: AWS Terraform troubleshooting +sidebarTitle: Troubleshooting +description: Common issues, fixes, and diagnostic commands for LangSmith self-hosted on AWS EKS deployed with the LangChain Terraform modules. +--- + +This page documents common issues, fixes, and diagnostic commands for LangSmith deployments provisioned with the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws). + + +Before upgrading, review the [LangSmith self-hosted changelog](/langsmith/self-hosted-changelog) for breaking changes and required variable updates. Run `aws eks update-kubeconfig --region --name ` before running any `kubectl` commands. 
+ + +## Automated diagnostics + +Before running individual commands, try the bundled scripts: + +```bash +# Deployment status across all layers + next-step guidance +make status + +# SSM parameter validation +./infra/scripts/manage-ssm.sh validate +``` + +## Known issues + +### EKS node group creation fails: CREATE_FAILED + +**Symptom** + +``` +Error: waiting for EKS Node Group creation: unexpected state 'CREATE_FAILED' +``` + +**Cause:** The EKS control plane is not yet fully active when node group creation begins. Common after an interrupted apply. + +**Fix** + +```bash +aws eks wait cluster-active --name --region + +aws eks describe-nodegroup \ + --cluster-name \ + --nodegroup-name \ + --region \ + --query "nodegroup.health" + +terraform apply -var-file=terraform.tfvars +``` + +### kubectl fails: "You must be logged in to the server" + +**Symptom:** All `kubectl` commands fail with `error: You must be logged in to the server (Unauthorized)`. + +**Cause:** The kubeconfig is stale, the AWS credentials differ from those that created the cluster, or the token has expired. + +**Fix** + +```bash +aws eks update-kubeconfig --region --name +kubectl cluster-info + +aws sts get-caller-identity +``` + +If the cluster was created with a different IAM role, grant access via the `aws-auth` ConfigMap: + +```bash +kubectl edit configmap aws-auth -n kube-system +# Add your IAM user or role under mapUsers / mapRoles +``` + +### ALB not created after Helm install + +**Symptom:** `kubectl get ingress -n langsmith` shows no ADDRESS after several minutes. + +**Cause:** AWS Load Balancer Controller is not running or lacks IRSA permissions, the Terraform-provisioned ALB is not referenced correctly, or `alb_scheme = "internal"` is set (internal ALBs have no public address — see [ALB has no public address](#alb-has-no-public-address-internal-scheme)). 
+ +**Fix** + +```bash +kubectl get pods -n kube-system | grep aws-load-balancer +kubectl logs -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller --tail=50 +kubectl get sa -n kube-system aws-load-balancer-controller -o yaml | grep eks.amazonaws.com + +terraform output alb_dns_name +aws elbv2 describe-load-balancers --query "LoadBalancers[?DNSName=='<alb-dns-name>'].State" +``` + +### RDS connection refused from EKS pods + +**Symptom:** Backend logs show `connection refused` or `timeout` for the RDS endpoint. + +**Cause:** The RDS security group does not allow inbound TCP 5432 from the EKS node or cluster security group. + +**Fix** + +```bash +aws eks describe-cluster --name <cluster-name> \ + --query "cluster.resourcesVpcConfig.clusterSecurityGroupId" + +aws rds describe-db-instances \ + --db-instance-identifier <db-instance-id> \ + --query "DBInstances[0].VpcSecurityGroups" + +aws ec2 describe-security-group-rules \ + --filters "Name=group-id,Values=<rds-security-group-id>" +``` + +The `postgres` module sets up the security group automatically. If the rule is missing, re-apply: + +```bash +terraform apply -var-file=terraform.tfvars -target=module.postgres +``` + +### S3 access denied from pods (IRSA not configured) + +**Symptom:** Backend logs show `AccessDenied` when reading or writing S3. + +**Cause:** IRSA annotation missing from the LangSmith service account, or the S3 VPC Gateway Endpoint is not routing correctly. + +**Fix** + +```bash +kubectl get sa langsmith -n langsmith -o yaml | grep eks.amazonaws.com + +aws ec2 describe-vpc-endpoints \ + --filters "Name=service-name,Values=com.amazonaws.<region>.s3" \ + --query "VpcEndpoints[].State" + +kubectl run s3-test --rm -it --image=amazon/aws-cli -n langsmith -- \ + s3 ls s3://<bucket-name> +``` + +If the IRSA annotation is missing, verify `create_langsmith_irsa_role = true` in `terraform.tfvars` and that the service account name in the Helm values matches `langsmith`. + +### ElastiCache Redis connection timeout + +**Symptom:** Pods cannot connect to Redis.
Logs show `dial tcp: i/o timeout`. + +**Cause:** ElastiCache security group does not allow inbound TCP 6379 from the EKS node security group. + +**Fix** + +```bash +aws elasticache describe-cache-clusters \ + --cache-cluster-id <cache-cluster-id> \ + --query "CacheClusters[0].SecurityGroups" + +kubectl run redis-test --rm -it --image=redis:7 -n langsmith -- \ + redis-cli -h <redis-endpoint> -a <auth-token> ping +``` + +### EKS nodes not autoscaling + +**Symptom:** Pods remain `Pending`. Node count does not increase. + +**Cause:** Cluster Autoscaler lacks IAM permissions, targets the wrong ASG, or `min_size = max_size` on the node group. + +**Fix** + +```bash +kubectl logs -n kube-system -l app=cluster-autoscaler --tail=50 + +aws autoscaling describe-auto-scaling-groups \ + --query "AutoScalingGroups[?contains(Tags[].Key, 'k8s.io/cluster-autoscaler/enabled')].[AutoScalingGroupName]" \ + --output table +``` + +### cert-manager fails to issue Let's Encrypt certificate + +**Symptom:** `kubectl get certificate -n langsmith` shows `READY=False`. HTTP01 challenge is failing. + +**Cause:** The ALB is not forwarding port 80 to the cert-manager solver pod, or the DNS record for the domain does not point to the ALB. + +**Fix** + +```bash +kubectl describe certificate -n langsmith +kubectl get challenges -n langsmith + +aws elbv2 describe-listeners --load-balancer-arn <alb-arn> + +dig +short <langsmith-domain> +# Expected: CNAME to the ALB DNS name +``` + +### postgres_deletion_protection blocks terraform destroy + +**Symptom** + +``` +Error: deleting RDS DB Instance: InvalidParameterCombination: +Cannot delete, DeletionProtection is enabled. +``` + +**Fix:** Disable deletion protection in `terraform.tfvars`, apply, then destroy: + +```hcl +postgres_deletion_protection = false +``` + +```bash +terraform apply -var-file=terraform.tfvars +terraform destroy +``` + +### ESO fails to sync: langsmith-config secret missing + +**Symptom:** Pods stuck in `CreateContainerConfigError`. `kubectl get secret langsmith-config -n langsmith` returns `NotFound`.
+ +**Cause:** ESO sync is all-or-nothing. If any single SSM parameter referenced by the `ExternalSecret` is missing, ESO refuses to create the Kubernetes Secret. All pods fail, not just the feature that needs the missing parameter. + +**Fix** + +```bash +kubectl get externalsecret langsmith-config -n langsmith +kubectl describe externalsecret langsmith-config -n langsmith + +./infra/scripts/manage-ssm.sh validate + +source ./infra/scripts/setup-env.sh +./helm/scripts/apply-eso.sh +``` + +The `describe` output shows which `remoteRef.key` failed. Match it against the SSM prefix `/langsmith/{name_prefix}-{environment}/`. + +### SSM parameter prefix mismatch + +**Symptom:** `manage-ssm.sh validate` passes but ESO still cannot sync. Or `setup-env.sh` wrote parameters under a different prefix than ESO expects. + +**Cause:** The SSM prefix is derived from `name_prefix` and `environment` in `terraform.tfvars`. If these changed after initial setup, the old parameters live under the old prefix and ESO looks under the new one. + +**Fix** + +```bash +kubectl get externalsecret langsmith-config -n langsmith -o yaml | grep 'key:' + +./infra/scripts/manage-ssm.sh list + +./infra/scripts/migrate-ssm.sh +``` + + +Never change `name_prefix` or `environment` on an existing deployment. + + +### Postgres password rejected by Terraform validation + +**Symptom** + +``` +Error: Invalid value for variable "postgres_password" +RDS master password must not contain '/', '@', '"', single quotes, or spaces. +``` + +**Cause:** The password contains characters RDS does not allow in the master password. + +**Fix:** Re-generate without restricted characters. 
`setup-env.sh` produces a compliant password automatically; to update manually: + +```bash +./infra/scripts/manage-ssm.sh set postgres-password "$(openssl rand -base64 24 | tr -d '/+= ')" +source ./infra/scripts/setup-env.sh +terraform apply -var-file=terraform.tfvars +``` + +### Private EKS cluster unreachable (bastion required) + +**Symptom:** `kubectl` and `terraform apply` time out when `enable_public_eks_cluster = false`. + +**Cause:** The EKS API endpoint is private. Commands must run from within the VPC, either via the bastion host or a VPN connection. + +**Fix** + +```bash +# If the bastion was provisioned (create_bastion = true) +aws ssm start-session --target + +# From the bastion +aws eks update-kubeconfig --region --name +kubectl get nodes +``` + +If no bastion was provisioned, set `create_bastion = true` and re-apply, or temporarily set `enable_public_eks_cluster = true`. + +### ALB has no public address (internal scheme) + +**Symptom:** `kubectl get ingress -n langsmith` shows an ADDRESS, but it resolves only within the VPC. + +**Cause:** `alb_scheme = "internal"` was set in `terraform.tfvars`. Internal ALBs are only reachable from within the VPC (VPN, peering, or PrivateLink). + +**Fix:** Intentional for private deployments. To make the ALB publicly reachable: + +```hcl +alb_scheme = "internet-facing" +``` + +```bash +terraform apply -var-file=terraform.tfvars +# Then redeploy Helm to pick up the new ALB +``` + +### ALB hostname changed after ingress recreation + +**Symptom:** The LangSmith URL stops working. Agent deployments stuck in `DEPLOYING`. DNS records or bookmarks point to an old ALB hostname that no longer resolves. + +**Cause:** Deleting the Kubernetes ingress (via `helm uninstall`, `kubectl delete ingress`, or namespace deletion) deprovisions the ALB. When the ingress is recreated, a new ALB with a different hostname is issued. 
The `config.deployment.url` in Helm values still points to the old hostname, so the operator's health checks fail and deployments stay stuck. + +This also happens if the ALB controller creates a new ALB instead of reusing the Terraform pre-provisioned one. The `group.name` annotation is required alongside `load-balancer-arn` to prevent this. + +**Prevention** + +- Ensure `group.name` and `load-balancer-arn` annotations are both set. `init-values.sh` does this automatically when a pre-provisioned ALB exists. +- Do not delete the ingress unless you plan to update all hostname-dependent config. +- Avoid `helm rollback` without `--server-side=false`. The ingress SSA conflict can trigger a delete/recreate cycle. + +**Fix** + +```bash +# 1. Check what hostname the ingress currently has +kubectl get ingress langsmith-ingress -n langsmith \ + -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' + +# 2. Check what Terraform expects +terraform output alb_dns_name + +# 3. If they differ, re-run init-values.sh and redeploy +make init-values +make deploy +``` + +### Node group scaling changes not applied by Terraform + +**Symptom:** Changing `min_size` or `max_size` in `terraform.tfvars` shows "No changes" on `terraform plan`. + +**Cause:** The ASG was changed out-of-band (AWS CLI, console, or cluster autoscaler) and the Terraform state already reflects the new values. The community EKS module ignores `desired_size` changes so the autoscaler can manage it; `min_size` and `max_size` should propagate normally. 
+ +**Fix** + +```bash +terraform refresh +terraform plan + +# For an immediate change, use the AWS CLI directly +aws eks update-nodegroup-config \ + --cluster-name <cluster-name> \ + --nodegroup-name <nodegroup-name> \ + --scaling-config minSize=3,maxSize=8,desiredSize=5 \ + --region <region> +``` + +## Diagnostic commands + +### Cluster access + +```bash +aws eks update-kubeconfig --region <region> --name <cluster-name> +kubectl config current-context +kubectl get nodes -o wide +aws sts get-caller-identity +``` + +### Pods + +```bash +kubectl get pods -n langsmith +kubectl get pods -n langsmith -w +kubectl describe pod <pod-name> -n langsmith +kubectl logs <pod-name> -n langsmith --tail=50 +kubectl logs <pod-name> -n langsmith --previous --tail=50 +kubectl logs -n langsmith deploy/langsmith-backend --tail=100 -f +``` + +### ALB and ingress + +```bash +kubectl get ingress -n langsmith +kubectl describe ingress -n langsmith +aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(LoadBalancerName, 'langsmith')]" +``` + +### TLS and certificates + +```bash +kubectl get certificate -n langsmith +kubectl describe certificate -n langsmith +kubectl get challenges -n langsmith +kubectl get clusterissuer +``` + +### ESO and secrets + +```bash +kubectl get externalsecret -n langsmith +kubectl describe externalsecret langsmith-config -n langsmith +kubectl get clustersecretstore langsmith-ssm +kubectl get secret langsmith-config -n langsmith -o jsonpath='{.data}' | jq 'keys' +./infra/scripts/manage-ssm.sh validate +./infra/scripts/manage-ssm.sh diff +``` + +### Helm + +```bash +helm status langsmith -n langsmith +helm history langsmith -n langsmith +helm get values langsmith -n langsmith +``` + +### IRSA and IAM + +```bash +kubectl get sa langsmith -n langsmith -o yaml | grep eks.amazonaws.com +terraform output langsmith_irsa_role_arn +aws iam get-role --role-name <role-name> +``` + +### LangSmith Deployment + +```bash +kubectl get pods -n langsmith | grep -E "host-backend|listener|operator" +kubectl get lgp -n langsmith +kubectl get crd | grep langchain +kubectl get pods 
-n keda +``` + +### Quick health check + +```bash +echo "=== Context ===" && kubectl config current-context +echo "=== Nodes ===" && kubectl get nodes +echo "=== Pods ===" && kubectl get pods -n langsmith +echo "=== Ingress ===" && kubectl get ingress -n langsmith +echo "=== Helm ===" && helm status langsmith -n langsmith 2>/dev/null | grep -E "STATUS|LAST DEPLOYED" +``` diff --git a/src/langsmith/self-host-terraform-aws-variables.mdx b/src/langsmith/self-host-terraform-aws-variables.mdx new file mode 100644 index 0000000000..a094ad8250 --- /dev/null +++ b/src/langsmith/self-host-terraform-aws-variables.mdx @@ -0,0 +1,145 @@ +--- +title: AWS Terraform variables reference +sidebarTitle: Variables +description: Complete reference of Terraform variables for LangSmith self-hosted on AWS EKS. +--- + +Reference for every input variable exposed by the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws). Set non-sensitive variables in `infra/terraform.tfvars`. For sensitive variables (license key, passwords, encryption keys), `make setup-env` writes them to AWS SSM Parameter Store and `External Secrets Operator` syncs them into the cluster. + +## Core + +| Variable | Default | Required | Description | +|---|---|---|---| +| `name_prefix` | — | yes | Prefix for all resource names (1 to 11 chars, lowercase). | +| `environment` | `dev` | no | Environment tag: `dev`, `staging`, `prod`, `test`, `uat`. | +| `region` | `us-west-2` | no | AWS region for all resources. | +| `owner` | `""` | no | Owner tag applied to all resources. | +| `cost_center` | `""` | no | Cost center tag for billing. | +| `tags` | `{}` | no | Additional tags applied to all resources. | + +## Networking + +| Variable | Default | Required | Description | +|---|---|---|---| +| `create_vpc` | `true` | no | Create a new VPC. Set `false` to use an existing one. | +| `vpc_id` | `null` | when `!create_vpc` | Existing VPC ID. 
| +| `private_subnets` | `[]` | when `!create_vpc` | Existing private subnet IDs. | +| `public_subnets` | `[]` | when `!create_vpc` | Existing public subnet IDs. | +| `vpc_cidr_block` | `null` | when `!create_vpc` | Existing VPC CIDR block. | + +## EKS + +| Variable | Default | Required | Description | +|---|---|---|---| +| `enable_public_eks_cluster` | `true` | no | Enable the public EKS API endpoint. Set `false` for a private cluster (requires `create_bastion`). | +| `eks_public_access_cidrs` | `["0.0.0.0/0"]` | no | CIDRs allowed to reach the public EKS API endpoint. | +| `eks_cluster_version` | `1.31` | no | EKS Kubernetes version. | +| `eks_managed_node_group_defaults` | `{ami_type: AL2023}` | no | Default config for managed node groups. | +| `eks_managed_node_groups` | `{default: m5.4xlarge}` | no | Managed node group definitions. | +| `create_gp3_storage_class` | `true` | no | Create and set `gp3` as the default `StorageClass`. | +| `eks_cluster_enabled_log_types` | `["api", "audit", ...]` | no | EKS control plane log types sent to CloudWatch. | +| `eks_addons` | `{}` | no | EKS managed add-on configurations. | +| `create_langsmith_irsa_role` | `true` | no | Create the IRSA role for LangSmith pods (S3 access). | + +## PostgreSQL (RDS) + +| Variable | Default | Required | Description | +|---|---|---|---| +| `postgres_source` | `external` | no | `external` (RDS) or `in-cluster` (Helm). | +| `postgres_instance_type` | `db.t3.large` | no | RDS instance class. | +| `postgres_storage_gb` | `10` | no | Initial RDS storage in GB. | +| `postgres_max_storage_gb` | `100` | no | Maximum RDS storage in GB (autoscaling). | +| `postgres_username` | `langsmith` | no | RDS database username. | +| `postgres_engine_version` | `16` | no | PostgreSQL engine version for RDS. | +| `postgres_password` | `""` | when external | RDS password. Use `TF_VAR_postgres_password`. | +| `postgres_iam_database_authentication_enabled` | `true` | no | Enable IAM database authentication on RDS. 
| +| `postgres_deletion_protection` | `true` | no | Enable deletion protection on RDS. | +| `postgres_backup_retention_period` | `7` | no | Days to retain automated RDS backups (0 = disabled). | + +## Redis (ElastiCache) + +| Variable | Default | Required | Description | +|---|---|---|---| +| `redis_source` | `external` | no | `external` (ElastiCache) or `in-cluster` (Helm). | +| `redis_instance_type` | `cache.m6g.xlarge` | no | ElastiCache node type. | +| `redis_auth_token` | `""` | when external | ElastiCache auth token (min 16 chars). Use `TF_VAR_redis_auth_token`. | + +## S3 + +| Variable | Default | Required | Description | +|---|---|---|---| +| `s3_ttl_enabled` | `true` | no | Enable S3 lifecycle rules for trace TTL. | +| `s3_ttl_short_days` | `14` | no | TTL for `ttl_s/` prefix in days. | +| `s3_ttl_long_days` | `400` | no | TTL for `ttl_l/` prefix in days. | +| `s3_kms_key_arn` | `""` | no | KMS CMK ARN for S3 encryption (empty = SSE-S3). | +| `s3_versioning_enabled` | `false` | no | Enable S3 bucket versioning. | + +## TLS and DNS + +| Variable | Default | Required | Description | +|---|---|---|---| +| `tls_certificate_source` | `acm` | no | `acm`, `letsencrypt`, or `none`. | +| `acm_certificate_arn` | `""` | when `acm` | ACM certificate ARN. | +| `letsencrypt_email` | `""` | when `letsencrypt` | Email for Let's Encrypt notifications. | +| `langsmith_domain` | `""` | no | Custom hostname (empty = use ALB DNS name). | +| `langsmith_namespace` | `langsmith` | no | Kubernetes namespace for LangSmith. | + +## ClickHouse and ingress + +| Variable | Default | Required | Description | +|---|---|---|---| +| `clickhouse_source` | `in-cluster` | no | `in-cluster` or `external`. | +| `alb_scheme` | `internet-facing` | no | ALB scheme: `internet-facing` or `internal`. | +| `alb_access_logs_enabled` | `false` | no | Enable ALB access logging to S3. | +| `enable_envoy_gateway` | `false` | no | Install Envoy Gateway instead of ALB. 
Required for multi-namespace dataplane deployments. | + +## Bastion (private cluster) + +| Variable | Default | Required | Description | +|---|---|---|---| +| `create_bastion` | `false` | no | Create an EC2 bastion host for private cluster access (SSM or SSH). | +| `bastion_instance_type` | `t3.micro` | no | EC2 instance type for the bastion. | +| `bastion_key_name` | `null` | no | EC2 key pair for SSH (empty = SSM only). | +| `bastion_enable_ssh` | `false` | no | Open port 22 on the bastion security group. | +| `bastion_ssh_allowed_cidrs` | `[]` | no | CIDRs allowed to SSH to the bastion. | +| `bastion_root_volume_size_gb` | `20` | no | Root EBS volume size for the bastion. | + +## Security and audit + +| Variable | Default | Required | Description | +|---|---|---|---| +| `create_cloudtrail` | `false` | no | Create a CloudTrail trail for AWS API audit. | +| `cloudtrail_multi_region` | `true` | no | Record API calls across all regions. | +| `cloudtrail_log_retention_days` | `365` | no | Days to retain CloudTrail logs. | +| `create_waf` | `false` | no | Attach a WAFv2 Web ACL to the ALB. | +| `create_firewall` | `false` | no | Deploy AWS Network Firewall for FQDN-based egress filtering. Requires `create_vpc = true`. Cost: about `$0.395/hr/endpoint` plus `$0.065/GB`. | +| `firewall_allowed_fqdns` | `["beacon.langchain.com"]` | no | Domains allowed for outbound internet traffic when `create_firewall = true`. Matched against TLS SNI (HTTPS) and HTTP Host header. All other destinations are dropped. | +| `firewall_subnet_cidr` | `"10.0.64.0/21"` | no | CIDR for the firewall subnet. Must not overlap with private (`10.0.0.0/21` to `10.0.32.0/21`) or public (`10.0.40.0/21` to `10.0.56.0/21`) subnets. | + +## Sizing and feature flags + +`sizing_profile` and the `enable_*` flags are read by `init-values.sh` and `deploy.sh`; Terraform ignores them. They affect which Helm overlay files the scripts generate. 
+ +| Variable | Default | Required | Description | +|---|---|---|---| +| `sizing_profile` | `default` | no | Helm sizing: `production`, `production-large`, `dev`, `minimum`, `default`. | +| `enable_deployments` | `false` | no | Enable LangSmith Deployment (listener, operator, host-backend). | +| `enable_agent_builder` | `false` | no | Enable Agent Builder. Requires `enable_deployments = true`. | +| `enable_insights` | `false` | no | Enable ClickHouse-backed analytics. | +| `enable_polly` | `false` | no | Enable Polly AI eval and monitoring. Requires `enable_deployments = true`. | +| `enable_usage_telemetry` | `false` | no | Enable extended usage telemetry reporting. | + +## Sensitive variables (set with `setup-env.sh`) + +`make setup-env` writes these to AWS SSM Parameter Store. External Secrets Operator syncs them into the cluster as Kubernetes secrets. Never set these inline in `terraform.tfvars`. + +| Variable | Description | +|---|---| +| `langsmith_license_key` | LangSmith enterprise license key. | +| `langsmith_admin_password` | Initial org admin password. | +| `langsmith_api_key_salt` | Salt for hashing API keys. Must stay stable after first deploy. | +| `langsmith_jwt_secret` | JWT secret for Basic Auth sessions. | +| `langsmith_deployments_encryption_key` | Fernet key for LangSmith Deployment. Must never change. | +| `langsmith_agent_builder_encryption_key` | Fernet key for Agent Builder. Must never change. | +| `langsmith_insights_encryption_key` | Fernet key for Insights. Must never change. | +| `langsmith_polly_encryption_key` | Fernet key for Polly. Must never change. 
| diff --git a/src/langsmith/self-host-terraform-azure-architecture.mdx b/src/langsmith/self-host-terraform-azure-architecture.mdx new file mode 100644 index 0000000000..cf56401c8a --- /dev/null +++ b/src/langsmith/self-host-terraform-azure-architecture.mdx @@ -0,0 +1,311 @@ +--- +title: Azure Terraform architecture +sidebarTitle: Architecture +description: Platform layers, services, Workload Identity, networking, ingress options, and module dependencies for LangSmith self-hosted on AKS. +--- + +This page documents what the [Azure Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/azure) provision and how the modules wire the resulting deployment together. + +## Platform layers + +LangSmith on Azure deploys in stages. Each stage adds a capability layer on top of the previous. All layers share the same AKS cluster and `langsmith` namespace. + +LangSmith on Azure service layout +LangSmith on Azure service layout + +| Stage | Layer | What it adds | +|---|---|---| +| Infrastructure | Azure infrastructure | VNet, AKS, Postgres, Redis, Blob, Key Vault, cert-manager, KEDA, ingress controller | +| Application | LangSmith base | frontend, backend, platform-backend, queue, ingest-queue, ace-backend, clickhouse, playground | +| LangSmith Deployment add-on | LangSmith Deployment | host-backend, listener, operator + per-deployment pods | +| Agent Builder add-on | Agent Builder | agent-builder-tool-server, agent-builder-trigger-server + deep-agent LGP | +| Insights + Polly add-on | Insights + Polly | Clio analytics (ClickHouse-backed), Polly eval agent (operator-managed, dynamic) | + +## Application deployment paths + +| Path | How | When to use | +|---|---|---| +| Helm path | `make init-values && make deploy` | Default. Shell script, interactive, reads TF outputs dynamically. Best for first deploys and day-2 re-deploys. | +| Terraform path | `make init-app && make apply-app` | Declarative. 
Kubernetes Secrets + `langsmith-ksa` SA + Helm release in Terraform state. Best for GitOps and CI/CD pipelines. | + +The Terraform path uses the `app/` module. `make init-app` calls `app/scripts/pull-infra-outputs.sh` to read all infra outputs and write them into `app/infra.auto.tfvars.json`. + +## Deployment tiers + +### Light deploy (all in-cluster) + +```txt +AKS Cluster +├── langsmith namespace +│ ├── frontend, backend, platform-backend, playground, queue, ace-backend +│ ├── clickhouse (in-cluster pod) +│ ├── postgres (in-cluster pod) +│ └── redis (in-cluster pod) +├── ingress-nginx (Azure Load Balancer → NGINX) +└── cert-manager (Let's Encrypt TLS) + +Azure +├── Azure Blob Storage (trace payloads — always external) +└── Azure Key Vault (secrets) +``` + +Set in `terraform.tfvars`: + +```hcl +postgres_source = "in-cluster" +redis_source = "in-cluster" +clickhouse_source = "in-cluster" +``` + +For the full all-in-cluster walkthrough (Front Door TLS, all-in-cluster DBs), see `BUILDING_LIGHT_LANGSMITH.md` in the [Azure module repo](https://github.com/langchain-ai/terraform/blob/main/modules/azure/BUILDING_LIGHT_LANGSMITH.md). 
+ +### Production (external managed services) + +```txt +AKS Cluster +├── langsmith namespace +│ ├── frontend, backend, platform-backend, playground, queue, ingest-queue, ace-backend +│ └── clickhouse (in-cluster — use LangChain Managed for production scale) +└── ingress-nginx + cert-manager + +Azure Managed Services +├── Azure DB for PostgreSQL Flexible Server (private VNet) +├── Azure Cache for Redis Premium (private VNet) +├── Azure Blob Storage (Workload Identity — no static keys) +└── Azure Key Vault +``` + +## Networking + +### Light deploy + +```txt +langsmith-vnet +└── subnet-0 (AKS nodes only) + No Postgres/Redis subnets — chart-managed pods handle both +``` + +### Production + +```txt +langsmith-vnet +├── subnet-0 (AKS nodes) +├── subnet-postgres (Azure DB for PostgreSQL Flexible Server) +└── subnet-redis (Azure Cache for Redis Premium) +``` + +All subnets are private. Postgres and Redis are accessible only from within the VNet via private DNS resolution. No public endpoints. + +## Application core services + +| Service | Purpose | Port | HPA | Workload Identity | +|---|---|---|---|---| +| `langsmith-frontend` | React UI | 3000 | 1 to 10 | No | +| `langsmith-backend` | Main API (traces, runs, projects, API keys, feedback) | 1984 | 3 to 10 | Yes (Blob) | +| `langsmith-platform-backend` | Org and user management, auth, billing, settings | 1986 | 1 to 10 | Yes (Blob) | +| `langsmith-playground` | LLM prompt playground UI | 3001 | 1 to 10 | No | +| `langsmith-queue` | Trace ingestion worker (Redis → ClickHouse + Blob) | — | 3 to 10 + KEDA | Yes | +| `langsmith-ingest-queue` | Dedicated high-throughput ingestion worker | — | 3 to 10 + KEDA | Yes | +| `langsmith-ace-backend` | Async compute (dataset runs, evaluations, background jobs) | — | 1 to 5 | No | +| `langsmith-clickhouse` | Columnar store (trace spans, run metadata, eval results) | — | StatefulSet, single replica, 500Gi PVC | No | + + +In-cluster ClickHouse is dev/POC only (single pod, no replication, 
no backups). For production, use [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse) or a self-managed external cluster. + + +### One-time jobs + +| Job | Purpose | +|---|---| +| `langsmith-backend-migrations` | PostgreSQL schema migrations | +| `langsmith-backend-ch-migrations` | ClickHouse schema migrations | +| `langsmith-backend-auth-bootstrap` | Creates the initial org and admin account from `initial_org_admin_password` in `langsmith-config-secret` | + +## LangSmith Deployment add-on + +| Service | Purpose | Workload Identity | +|---|---|---| +| `langsmith-host-backend` | LangGraph control plane API. Manages deployment lifecycle, serves deployment metadata. | Yes | +| `langsmith-listener` | Watches host-backend for state changes, creates and updates `LangGraphPlatform` CRDs. | Yes | +| `langsmith-operator` | Kubernetes operator. Azure-specific: injects `azure.workload.identity/use: "true"` + `langsmith-ksa` so every agent pod accesses Blob Storage via Workload Identity. | No | + +## Agent Builder add-on + +| Pod | Type | Role | Workload Identity | +|---|---|---|---| +| `langsmith-agent-builder-tool-server` | Static | MCP tool execution server | Yes | +| `langsmith-agent-builder-trigger-server` | Static | Webhook receiver and scheduled trigger engine | Yes | +| `langsmith-agent-bootstrap` | Job | Registers the bundled Agent Builder agent | — | +| `agent-builder-<id>` + queue + redis + `lg-<id>-0` | Dynamic | Agent Builder deployment, operator-managed | Inherited | + +## Insights and Polly add-on + +**Insights/Clio:** No static pods. Deploys lazily as a dynamic LangGraph deployment via the operator on first UI invocation. Reads `insights_encryption_key` from `langsmith-config-secret`. Never rotate this key — rotating it permanently breaks existing Insights data. + +**Polly:** Runs as a dynamic LangGraph deployment. Resources: 2 CPU / 4 Gi request and 4 CPU / 8 Gi limit; scales from 1 to 5 replicas. Reads `polly_encryption_key` from `langsmith-config-secret`. 
Same rotation warning as Insights. + +## Azure managed services + +When `postgres_source = "external"` and `redis_source = "external"` (the recommended production setting), Terraform provisions: + +### Azure DB for PostgreSQL Flexible Server + +- Holds orgs, users, projects, API keys, settings. +- PostgreSQL ≥ 14 required (Azure Flexible Server defaults to 16). +- Extensions enabled automatically by the `postgres` module: `btree_gin`, `btree_gist`, `pgcrypto`, `citext`, `ltree`, `pg_trgm`. +- Private VNet only (`subnet-postgres`), SSL port 5432. +- Secret: `langsmith-postgres-secret`, created by the `k8s-bootstrap` Terraform module. + +### Azure Cache for Redis Premium + +- Trace ingestion queue, pub/sub, short-lived cache. +- Redis ≥ 5 required (Premium tier defaults to Redis 6). +- Each LangSmith installation must use its own dedicated Redis. Shared instances cause deployment tasks to route incorrectly. +- Private VNet only (`subnet-redis`), TLS port 6380. +- Secret: `langsmith-redis-secret`, created by the `k8s-bootstrap` Terraform module. + +### Azure Blob Storage + +- Trace payloads: large inputs and outputs, attachments. +- Workload Identity (no static keys) via the `k8s-app-identity` Managed Identity. +- Always required. Disabling blob storage breaks the cluster on large payloads. +- Prefixes: `ttl_s/` (14-day TTL), `ttl_l/` (400-day TTL). + +### Azure Key Vault + +- Centralized secret store for all LangSmith secrets. +- Secret flow: `az keyvault secret show` → `kubectl create secret generic langsmith-config-secret`. + +## Workload Identity + +Azure AD token exchange happens via the AKS OIDC issuer. Pods access Blob Storage without static keys. 
+ +```txt +AKS OIDC issuer + → Federated credential on Azure Managed Identity (one per Kubernetes ServiceAccount) + → Kubernetes ServiceAccount annotated with azure.workload.identity/client-id + → Pod labeled with azure.workload.identity/use: "true" + → Azure AD issues a short-lived token — no storage keys in any Secret or env var +``` + +Workload Identity is centralized in `modules/k8s-cluster/` alongside the managed identity and OIDC issuer, which avoids circular dependencies and simplifies adding new ServiceAccounts. + +### Which pods need Workload Identity + +Every pod that reads blob storage env vars must have: + +1. A federated credential registered in Terraform (`modules/k8s-cluster/main.tf`). +2. The `azure.workload.identity/use: "true"` label on the Deployment. +3. The `azure.workload.identity/client-id` annotation on the ServiceAccount. + +| Pod | Stage | Needs WI | +|---|---|---| +| `langsmith-backend` | Application | Yes | +| `langsmith-platform-backend` | Application | Yes | +| `langsmith-queue` | Application | Yes | +| `langsmith-ingest-queue` | Application | Yes | +| `langsmith-host-backend` | LangSmith Deployment add-on | Yes | +| `langsmith-listener` | LangSmith Deployment add-on | Yes | +| `langsmith-agent-builder-tool-server` | Agent Builder add-on | Yes | +| `langsmith-agent-builder-trigger-server` | Agent Builder add-on | Yes | +| `langsmith-frontend` | Application | No | +| `langsmith-playground` | Application | No | +| `langsmith-ace-backend` | Application | No | +| `langsmith-clickhouse` | Application | No | +| `langsmith-operator` | LangSmith Deployment add-on | No | + +All federated credentials are registered in `modules/k8s-cluster/main.tf` under `service_accounts_for_workload_identity`. Adding a new pod that accesses blob storage requires adding its ServiceAccount name to that list and running `terraform apply -target=module.aks`. 
+ +### What breaks without it + +```txt +panic: blob-storage health-check failed: get container properties failed: +DefaultAzureCredential: failed to acquire a token. +WorkloadIdentityCredential authentication failed. + AADSTS700213: No matching federated identity record found for presented assertion subject +``` + +The pod panics on startup — the ServiceAccount has no registered federated credential so Azure AD rejects the token exchange. + +## Secret flow + +```txt +Infrastructure stage + + ./setup-env.sh (read-only against Key Vault — never writes to KV directly) + First run: prompts for postgres password, license key, admin password. + Generates api_key_salt, jwt_secret, Fernet keys locally. + Key Vault does not exist yet → writes to local dot-files + secrets.auto.tfvars. + Subsequent: Key Vault exists → reads all secrets from KV → writes to secrets.auto.tfvars. + No prompts, no generation, no KV writes. + Output: secrets.auto.tfvars (gitignored, chmod 600) + Terraform picks this up automatically — no shell session coupling. + + terraform apply + Reads: terraform.tfvars (non-sensitive config) + secrets.auto.tfvars (sensitive values — sole input for KV secret creation) + Creates: Azure Key Vault + all secrets as KV secrets (Terraform is the sole KV writer) + +Application stage + + ./setup-env.sh (re-run on any machine to refresh secrets.auto.tfvars from Key Vault) + + kubectl create secret generic langsmith-config-secret + Reads: Key Vault secrets + Terraform outputs (postgres/redis URLs, blob account) + Writes: K8s secrets — langsmith-config-secret, langsmith-postgres-secret, + langsmith-redis-secret + + helm upgrade --install langsmith ... + Chart reads config.existingSecretName = "langsmith-config-secret". + No secrets inline in any YAML file. +``` + +**Key rule:** `secrets.auto.tfvars` is never committed. It is regenerated from Key Vault on any machine by running `./setup-env.sh`. 
Terraform is the sole writer to Key Vault; `setup-env.sh` only reads from it after the first apply. + +## Ingress options + +| Controller | Variable | DNS label support | Notes | +|---|---|---|---| +| `nginx` _(default)_ | `ingress_controller = "nginx"` | Yes | NGINX via Helm, standard Kubernetes Ingress. | +| `istio-addon` | `ingress_controller = "istio-addon"` | Yes | AKS managed Istio service mesh. Use `istio_addon_revision` to pin revision. | +| `istio` | `ingress_controller = "istio"` | Yes | Self-managed Istio via Helm. Full control over revision and config. | +| `agic` | `ingress_controller = "agic"` | Yes | Azure Application Gateway v2 + AGIC Helm chart. Native L7 WAF. HTTP-only or dns01 + custom domain. | +| `envoy-gateway` | `ingress_controller = "envoy-gateway"` | Yes | Gateway API native. Uses `envoyproxy/gateway-helm`. | +| `none` | `ingress_controller = "none"` | — | Bring your own ingress. | + +Azure Public IP DNS labels (`dns_label`) work with all controllers. `deploy.sh` applies the `service.beta.kubernetes.io/azure-dns-label-name` annotation to the correct LoadBalancer service based on the chosen controller. + +For the full TLS compatibility matrix and per-controller setup, see `INGRESS_CONTROLLERS.md` in the [Azure module repo](https://github.com/langchain-ai/terraform/blob/main/modules/azure/INGRESS_CONTROLLERS.md). + +## Resource sizing + +Four sizing profiles are available. 
+ +| Profile | Use case | Set via | +|---|---|---| +| `minimum` | Cost parking, CI smoke tests, single-user demos | `sizing_profile = "minimum"` in `terraform.tfvars` | +| `dev` | Developer use, integration tests, POCs | `sizing_profile = "dev"` | +| `production` | Real traffic — multi-replica + HPA | `sizing_profile = "production"` _(recommended)_ | +| `production-large` | ~50 users, ~1000 traces/sec | `sizing_profile = "production-large"` | + +### AKS node pools + +| Pool | VM Size | vCPU | RAM | Min | Max | Purpose | +|---|---|---|---|---|---|---| +| default | `Standard_D8s_v3` | 8 | 32 GB | 3 | 10 | Core LangSmith, system pods | +| large | `Standard_D16s_v3` | 16 | 64 GB | 0 | 2 | ClickHouse (in-cluster), LGP agent pods | + + +ClickHouse (when in-cluster) requests 2 to 4 CPU and 8 to 15 GB RAM depending on profile. With [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse), the `large` pool is only needed for LGP operator-spawned agent pods. + + +## Optional modules + +Each module is count-controlled (`0` disabled, `1` enabled). Enable any combination; the core deployment (Passes 1 to 5) works without them. + +| Module | Variable | Use case | +|---|---|---| +| `waf` | `create_waf = true` | Azure WAF policy (OWASP 3.2 + bot protection). Attach to Application Gateway. | +| `diagnostics` | `create_diagnostics = true` | Log Analytics workspace + diagnostic settings for AKS, Key Vault, Blob. Recommended for production observability. | +| `bastion` | `create_bastion = true` | Azure Bastion (Standard tier). Browser-based SSH to node VMs without a public IP. | +| `dns` | `create_dns_zone = true` | Azure DNS zone + A record. Required for DNS-01 cert issuance with a custom domain. 
| diff --git a/src/langsmith/self-host-terraform-azure-deploy.mdx b/src/langsmith/self-host-terraform-azure-deploy.mdx new file mode 100644 index 0000000000..7e3bcbf661 --- /dev/null +++ b/src/langsmith/self-host-terraform-azure-deploy.mdx @@ -0,0 +1,683 @@ +--- +title: Deploy LangSmith on Azure with Terraform +sidebarTitle: Deploy +description: End-to-end walkthrough for provisioning LangSmith self-hosted on Azure AKS using the LangChain Terraform modules. +--- + +Provision the Azure cloud foundation and install LangSmith with the public Terraform modules at [github.com/langchain-ai/terraform/tree/main/modules/azure](https://github.com/langchain-ai/terraform/tree/main/modules/azure). Plan for 40 to 50 minutes end to end on a clean subscription. + +The deployment runs in two stages: infrastructure (Terraform provisions AKS, Postgres, Redis, Blob Storage, Key Vault, cert-manager, KEDA, ingress) and application (Helm installs the LangSmith chart against the cluster). Three add-ons (LangSmith Deployment, Agent Builder, Insights and Polly) are enabled with flags and a redeploy. + +## Prerequisites + +### Required tools + +| Tool | Version | Purpose | +|---|---|---| +| Azure CLI (`az`) | 2.50 | Authenticate, query Azure resources, manage AKS credentials | +| Terraform | 1.5 | Run the infrastructure modules | +| `kubectl` | 1.28 | Inspect the AKS cluster | +| Helm | 3.12 | Install and manage the LangSmith chart | + +```bash +brew install azure-cli kubectl helm +brew tap hashicorp/tap && brew install hashicorp/tap/terraform + +az --version +terraform version +kubectl version --client +helm version +``` + +### Required Azure RBAC + +The identity running Terraform needs the following roles on the subscription: + +| Role | Purpose | +|---|---| +| `Contributor` | Create and manage all Azure resources | +| `User Access Administrator` | Create role assignments for Key Vault, Blob, cert-manager managed identities | + +`Owner` includes both. 
`Contributor` alone is insufficient because creating role assignments requires the `User Access Administrator` role. + +### Authenticate + +```bash +az login +az account set --subscription <subscription-id> +az account show +``` + +You also need a LangSmith license key ([contact sales](https://www.langchain.com/contact-sales)) and either a `dns_label` (Azure subdomain, no DNS setup needed) or a custom `langsmith_domain`. + +## Rapid path + +For the fastest path from zero to a running LangSmith instance: + +```bash +# 1. Clone the public modules +git clone https://github.com/langchain-ai/terraform.git +cd terraform/modules/azure + +# 2. Generate terraform.tfvars interactively +make quickstart + +# 3. Bootstrap secrets (writes infra/secrets.auto.tfvars, chmod 600, gitignored) +make setup-env + +# 4. Validate environment +make preflight + +# 5. Provision infrastructure (~15 to 20 min) +make init +make apply + +# 6. Get cluster credentials and push secrets into the cluster +make kubeconfig +make k8s-secrets + +# 7. Deploy LangSmith via Helm (~10 min) +make init-values +make deploy +``` + +Or run steps 5 through 7 in one shot: + +```bash +make deploy-all # apply → kubeconfig → k8s-secrets → init-values → deploy +``` + +The sections below cover each phase in detail. + +## Provision infrastructure + +Provisioning the Azure cloud foundation takes 15 to 20 minutes on a clean subscription. Do not interrupt the apply. 
+ +### What gets provisioned + +| Resource | Type | Purpose | +|---|---|---| +| Resource Group | `azurerm_resource_group` | Container for all resources | +| Virtual Network | `azurerm_virtual_network` | Isolated network (10.0.0.0/17) | +| AKS Cluster | `azurerm_kubernetes_cluster` | Kubernetes, all workloads run here | +| Ingress Controller | Helm | External load balancer + TLS termination (nginx by default) | +| PostgreSQL Flexible Server | `azurerm_postgresql_flexible_server` | Org config, run metadata (external tier) | +| Redis Cache Premium | `azurerm_redis_cache` | Trace ingestion queue, pub/sub (external tier) | +| Blob Storage | `azurerm_storage_account` | Raw trace objects, always required | +| Managed Identity | `azurerm_user_assigned_identity` | Workload Identity for pod-to-Blob auth | +| Azure Key Vault | `azurerm_key_vault` | Stores all LangSmith secrets | +| cert-manager | Helm | Automated TLS certificate management | +| KEDA | Helm | Event-driven autoscaling for workers | + +### Clone and configure + +```bash +git clone https://github.com/langchain-ai/terraform.git +cd terraform/modules/azure +``` + +All subsequent commands run from `modules/azure/`. Run `make help` for the full target list. + +Generate `terraform.tfvars` with the interactive wizard: + +```bash +make quickstart +``` + +The wizard runs a multi-section questionnaire covering profile, subscription, naming, networking, AKS sizing, ingress controller, DNS/TLS, backend services, Key Vault, sizing profile, and security add-ons. Each section includes explanatory context, cost estimates, and trade-offs. Re-running is safe: existing values are preselected at each prompt, so press Enter to keep them. 
+ +Prefer manual editing: + +```bash +cp infra/terraform.tfvars.example infra/terraform.tfvars +vi infra/terraform.tfvars +``` + +Minimum required values: + +```hcl +# Identity +subscription_id = "" + +# Location +location = "eastus" + +# Naming + tagging +identifier = "-prod" # suffix on all resource names +environment = "prod" + +# Deployment tier, production recommended +postgres_source = "external" # Azure DB for PostgreSQL +redis_source = "external" # Azure Cache for Redis Premium +clickhouse_source = "in-cluster" # use "external" + LangChain Managed for production + +# DNS + TLS (HTTPS via Let's Encrypt on a free Azure subdomain) +dns_label = "langsmith-prod" # → langsmith-prod.eastus.cloudapp.azure.com +tls_certificate_source = "letsencrypt" +letsencrypt_email = "ops@example.com" + +# Sizing +sizing_profile = "production" # minimum | dev | production | production-large +``` + + +In-cluster ClickHouse runs as a single pod with no replication or backups, dev/POC only. For production, use [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse). + + + +Blob Storage is always required, regardless of tier. Trace payloads must go to Azure Blob, never to ClickHouse. + + +For all variables, see the [Azure variables reference](/langsmith/self-host-terraform-azure-variables). + +### Bootstrap secrets + +```bash +make setup-env +``` + +`setup-env.sh` writes `infra/secrets.auto.tfvars` (gitignored, `chmod 600`). Terraform picks this file up automatically, no shell exports needed. + +- **First run:** prompts for PostgreSQL password, LangSmith license key, admin password, and admin email. Generates `api_key_salt`, `jwt_secret`, and four Fernet encryption keys locally. +- **Subsequent runs:** reads everything silently from Azure Key Vault. + + +Never commit `secrets.auto.tfvars`. It is gitignored. Regenerate on any machine by running `make setup-env`. 
+ + +### Preflight + +```bash +make preflight +``` + +Validates Azure CLI auth, the active subscription, 11 required resource providers, RBAC (Contributor + User Access Administrator), `terraform.tfvars` and `secrets.auto.tfvars` presence, and `terraform`/`kubectl`/`helm` on PATH. + +### Apply + +```bash +make init +make apply # ~15 to 20 min on first run +``` + + +Skip `make plan` on a fresh deploy. `kubernetes_manifest` resources require a live cluster API during plan, which does not exist yet. `make apply` handles resource ordering in three internal stages: Azure resources → AKS → Kubernetes bootstrap. + + +### Cluster credentials and Kubernetes Secrets + +After `make apply` completes, get cluster credentials and push secrets into the cluster: + +```bash +make kubeconfig # fetches AKS credentials, merges into ~/.kube/config +make k8s-secrets # Key Vault → langsmith-config-secret in the langsmith namespace +``` + +`make k8s-secrets` reads 8 secrets from Key Vault and creates or updates `langsmith-config-secret`. Safe to re-run; uses `--dry-run=client | kubectl apply` to update in place. 
+ +### Verify infrastructure + +```bash +# All nodes Ready +kubectl get nodes + +# Bootstrap components, all Running +kubectl get pods -n cert-manager # 3 pods +kubectl get pods -n keda # 3 pods +kubectl get pods -n ingress-nginx # 1 pod (if using nginx) + +# NGINX LoadBalancer, save the EXTERNAL-IP +kubectl get svc ingress-nginx-controller -n ingress-nginx + +# Workload Identity ServiceAccount, should have client-id annotation +kubectl get sa langsmith-ksa -n langsmith \ + -o jsonpath='{.metadata.annotations}' + +# Terraform outputs +terraform -chdir=infra output + +# Key outputs consumed by Helm scripts +terraform -chdir=infra output -raw keyvault_name +terraform -chdir=infra output -raw storage_account_name +terraform -chdir=infra output -raw storage_container_name +terraform -chdir=infra output -raw storage_account_k8s_managed_identity_client_id +``` + +## Deploy LangSmith + +Two deployment paths are supported. Pick one. + +| Path | Command | When to use | +|---|---|---| +| Helm path _(default)_ | `make init-values && make deploy` | Interactive output, kubeconfig refresh, preflight checks. Best for first-time deploys and day-2 re-deploys. | +| Terraform path | `make init-app && make apply-app` | Helm release + Kubernetes Secrets + Workload Identity SA managed in Terraform state. Best for GitOps and CI/CD pipelines. | + +### Helm path (recommended) + +#### Generate Helm values + +```bash +cd terraform/modules/azure +make init-values +``` + +`make init-values` reads `terraform output` and `terraform.tfvars` and generates `helm/values/values-overrides.yaml` with all fields populated: + +- `config.hostname`, your FQDN (from `dns_label` or `langsmith_domain`). +- `config.initialOrgAdminEmail`, the first org admin account. +- `config.existingSecretName: langsmith-config-secret`, secrets reference. +- `config.blobStorage`, storage account name + container + Workload Identity client ID. 
+- Workload Identity annotations for 5 ServiceAccounts (backend, platform-backend, queue, ingest-queue, host-backend). +- Ingress + TLS block (cert-manager annotation, TLS secret name). +- Postgres and Redis external secret references (when `postgres_source = "external"` / `redis_source = "external"`). + +Also copies the sizing overlay and any enabled add-on overlays from `helm/values/examples/` into `helm/values/`. + + +The admin email is read from `langsmith_admin_email` in `terraform.tfvars` (set during `make setup-env`) and written into `values-overrides.yaml` automatically. No manual editing needed. + + +#### Deploy + +```bash +make deploy # ~10 min +``` + +`make deploy` handles: + +1. Validates `values-overrides.yaml` exists. +2. Refreshes kubeconfig via `az aks get-credentials`. +3. Annotates the LoadBalancer service with `service.beta.kubernetes.io/azure-dns-label-name`, required for Azure to assign the DNS label to the public IP. +4. Creates the `letsencrypt-prod` cert-manager `ClusterIssuer` if `tls_certificate_source = "letsencrypt"` (idempotent). +5. Runs preflight checks (tools, cluster connectivity, Helm repo). +6. Verifies `langsmith-config-secret` exists; auto-creates from Key Vault if missing. +7. Builds and logs the values chain. +8. Auto-recovers any stuck `pending-upgrade` Helm release before proceeding. +9. Runs `helm upgrade --install langsmith langchain/langsmith --timeout 20m`. +10. Waits for core deployments to roll out. +11. Annotates the `langsmith-ksa` ServiceAccount with the Workload Identity client ID. +12. Prints the access URL and login credentials location. + + +Why `--timeout 20m`? The `langsmith-backend-auth-bootstrap` Job runs DB migrations and org initialization as a post-install hook. This takes up to 5 minutes on first install. Without a long timeout, Helm may report failure even though the install eventually succeeds. 
+ + + +**Watch pods in a second terminal:** + +```bash +# macOS +brew install watch +watch kubectl get pods -n langsmith + +# Without watch +while true; do clear; kubectl get pods -n langsmith; sleep 3; done +``` + + +### Terraform path + +Use this path when you want the Helm release, Kubernetes Secrets, and Workload Identity ServiceAccount managed in Terraform state. + +```bash +# Copy and configure app vars +cp app/terraform.tfvars.example app/terraform.tfvars +vi app/terraform.tfvars # set admin_email at minimum + +# Pull infra outputs into app/infra.auto.tfvars.json + terraform init +make init-app + +# Deploy Helm release + K8s Secrets + WI ServiceAccount via Terraform +make apply-app +``` + +Feature flags in `app/terraform.tfvars`: + +```hcl +sizing = "production" # minimum | dev | production | production-large +enable_agent_deploys = true # LangSmith Deployment add-on +enable_agent_builder = true # Agent Builder add-on (requires agent_deploys) +enable_insights = true # Insights / ClickHouse add-on +enable_polly = true # Polly add-on (requires agent_deploys) +``` + +End-to-end via Terraform (infrastructure + application): + +```bash +make deploy-all-tf # apply → init-values → init-app → apply-app +``` + +### Verify the deployment + +```bash +# All pods Running or Completed (~17 pods) +kubectl get pods -n langsmith + +# Ingress host + TLS assigned +kubectl get ingress -n langsmith + +# TLS certificate issued +kubectl get certificate -n langsmith # READY: True + +# Helm release status +helm list -n langsmith +``` + +Expected pod state (all Running after ~5 minutes): + +```txt +langsmith-ace-backend-xxxxx 1/1 Running 0 5m +langsmith-backend-xxxxx 1/1 Running 0 5m +langsmith-backend-auth-bootstrap-xxxxx 0/1 Completed 0 5m +langsmith-backend-ch-migrations-xxxxx 0/1 Completed 0 5m +langsmith-backend-migrations-xxxxx 0/1 Completed 0 5m +langsmith-clickhouse-0 1/1 Running 0 5m +langsmith-frontend-xxxxx 1/1 Running 0 5m +langsmith-ingest-queue-xxxxx 1/1 Running 0 5m 
+langsmith-platform-backend-xxxxx 1/1 Running 0 5m +langsmith-playground-xxxxx 1/1 Running 0 5m +langsmith-queue-xxxxx 1/1 Running 0 5m +``` + +Open `https://<your-hostname>` and log in with the admin email and password from Key Vault: + +```bash +az keyvault secret show \ + --vault-name $(terraform -chdir=infra output -raw keyvault_name) \ + --name langsmith-admin-password \ + --query value -o tsv +``` + +### Values chain + +`make deploy` applies Helm values files in this order (last file wins on conflicts): + +```txt +1. helm/values/values.yaml ← Azure base (NGINX, Blob WI, no Istio) +2. helm/values/values-overrides.yaml ← hostname, WI client-id, auth, postgres/redis +3. helm/values/langsmith-values-sizing-<profile>.yaml ← resource requests + HPA settings +4. (add-on files when enable_* flags are set) +``` + +All files in `helm/values/` are gitignored (generated or contain live secrets). Source templates live in `helm/values/examples/` and are copied by `make init-values`. + +### Day-2 operations + +```bash +make status # 10-section health check +make status-quick # skip Key Vault + K8s secret queries (faster) +make deploy # re-deploy after any Helm value changes +make init-values # re-generate values after Terraform changes +make kubeconfig # refresh cluster credentials +make k8s-secrets # re-create langsmith-config-secret from Key Vault +``` + +## Enable add-ons + +Each add-on is gated by a flag in `infra/terraform.tfvars`. Set the flag, re-run `make init-values` to regenerate values, then re-run `make deploy`. + +### LangSmith Deployment + +Enables [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform), which lets you deploy and manage LangGraph graphs as API servers directly from the LangSmith UI. Adds three new pods. + +| Pod | Role | Workload Identity | +|---|---|---| +| `langsmith-host-backend` | LangSmith Deployment control plane API. Manages deployment lifecycle, stores state in shared PostgreSQL. 
| Yes | +| `langsmith-listener` | Watches host-backend, creates and updates `LangGraphPlatform` CRDs in Kubernetes. | Yes | +| `langsmith-operator` | Reconciles CRDs. Creates per-deployment Deployments, StatefulSets, and Services. | No | + +#### Scale the node pool first + +Before enabling, bump `default_node_pool_min_count` to at least 5. The operator spawns agent deployment pods on demand and needs node headroom: + +```hcl +# infra/terraform.tfvars +default_node_pool_min_count = 5 # operator pods need headroom +enable_deployments = true +``` + + +Without sufficient node capacity, operator-spawned agent pods stay in `Pending` state indefinitely. Scale the node pool first, then enable. + + +#### Apply, regenerate values, deploy + +```bash +cd terraform/modules/azure +make apply # scale up node pool (~5 min) +make init-values # picks up enable_deployments = true → generates add-on overlay +make deploy # rolls out host-backend + listener + operator +``` + +`make init-values` appends the LangSmith Deployment add-on overlay (`langsmith-values-agent-deploys.yaml`) to the values chain. It automatically injects: + +```yaml +config: + deployment: + enabled: true # REQUIRED, without this listener and operator are skipped silently + url: "https://<your-hostname>" # must match config.hostname (with protocol) + tlsEnabled: true # set based on tls_certificate_source +``` + + +**`config.deployment.url` must include `https://`.** Missing the protocol causes operator-deployed agents to stay stuck in `DEPLOYING` state indefinitely. The URL is injected automatically by `make init-values`. Do not set it manually in the overlay file; it will be overwritten. + + + +**`config.deployment.enabled: true` is required.** Setting only `config.deployment.url` without `enabled: true` causes the chart to silently skip creating `listener` and `operator`. No error is reported; the pods just never appear. 
+ + +#### Verify + +```bash +# All three pods Running +kubectl get pods -n langsmith | grep -E "host-backend|listener|operator" + +# LangSmith Deployment CRDs registered +kubectl get crd | grep langchain + +# List LangSmith Deployments (empty on first deploy, populated when you create a deployment) +kubectl get lgp -n langsmith +``` + +Expected: `langsmith-host-backend`, `langsmith-listener`, and `langsmith-operator` all Running. Total pod count: ~20 Running + 3 Completed jobs. + +KEDA is already installed alongside infrastructure. With `enable_deployments = true`, the operator creates KEDA `ScaledObject` resources for each agent deployment's worker queue. Worker pods scale down to zero when idle and scale up based on Redis queue depth. + +### Agent Builder + +Provides visual AI-assisted creation and management of LangGraph agents from the LangSmith UI. No `terraform apply` needed; just `make init-values && make deploy`. + +**Prerequisite:** LangSmith Deployment enabled (`enable_deployments = true`). Enabling Agent Builder without it causes a preflight error. 
+ +| Pod | Type | Role | +|---|---|---| +| `langsmith-agent-builder-tool-server` | Static | MCP tool execution server, code/file editing tools for the AI | +| `langsmith-agent-builder-trigger-server` | Static | Webhook receiver and scheduled trigger engine | +| `langsmith-agent-bootstrap` | Job (Completed) | Registers the bundled Agent Builder agent through the operator, runs once | +| `agent-builder-<id>` + queue + redis + `lg-<id>-0` | Dynamic (operator-managed) | Agent Builder deployment, created by the operator when the bootstrap Job runs | + +Enable: + +```hcl +# infra/terraform.tfvars +enable_deployments = true # required prerequisite +enable_agent_builder = true +``` + +```bash +cd terraform/modules/azure +make init-values # appends langsmith-values-agent-builder.yaml to values chain +make deploy # rolling update, ~10 min for bootstrap Job to complete +``` + +`make init-values` appends the Agent Builder add-on overlay (`langsmith-values-agent-builder.yaml`) to the values chain. The overlay enables the Agent Builder UI and supporting services, sets `backend.agentBootstrap: true` (the post-install job that registers Agent Builder as a LangSmith Deployment and creates the required ConfigMap), and sets conservative agent worker pod resources (1 CPU / 1 Gi) instead of the chart's default 4 CPU / 8 Gi. + +Verify: + +```bash +# Static pods Running, bootstrap Job Completed +kubectl get pods -n langsmith | grep -E "tool-server|trigger-server|agent-bootstrap" + +# Operator-managed dynamic pods (4 pods, api-server, queue, redis, postgres StatefulSet) +kubectl get pods -n langsmith | grep agent-builder + +# Operator-managed LangSmith Deployment for Agent Builder +kubectl get lgp -n langsmith +``` + +Expected: 3 static pods (tool-server, trigger-server, bootstrap Job) + 4 dynamic pods. Total: ~26 pods. After `make deploy`, an **Agent Builder** section appears in the LangSmith UI navigation. 
+ + +**Roll the frontend after `agentBootstrap` completes.** The `agentBootstrap` Job creates the `langsmith-polly-config` ConfigMap that the frontend reads for the Polly UI. If the frontend was running when bootstrap completed, Polly shows "Unable to connect to LangGraph server". Fix: + +```bash +kubectl rollout restart deployment langsmith-frontend -n langsmith +``` + + + +**Encryption key is read from `langsmith-config-secret`.** Do not set `config.agentBuilder.encryptionKey` inline in `values-overrides.yaml`. The chart reads it from `langsmith-config-secret` via `existingSecretName`. Setting it inline overrides the secret reference and creates a mismatch. + + +Both `langsmith-agent-builder-tool-server` and `langsmith-agent-builder-trigger-server` need Workload Identity to access Azure Blob Storage. Their federated credentials are pre-registered in `modules/k8s-cluster/main.tf`; no additional setup is needed. + +### Insights and Polly + +Two features, both of which require LangSmith Deployment. They are independent of each other; enable either one without the other. + +- **Insights:** AI-powered trace analytics (Clio). Surfaces patterns and anomalies in LangSmith traces. Clio deploys as a dynamic LangGraph deployment through the operator on first UI invocation. Adds no new static pods. +- **Polly:** AI-powered evaluation and monitoring agent. Runs as a dynamic LangGraph deployment. Sets resource limits for the Polly worker (2 CPU / 4 Gi request, 4 CPU / 8 Gi limit, scales 1 to 5 replicas). + +No `terraform apply` needed; just `make init-values && make deploy`. 
+ +```hcl +# infra/terraform.tfvars +enable_deployments = true # required prerequisite +enable_insights = true # Insights / Clio analytics +enable_polly = true # Polly AI evaluation agent +``` + +Enable just one: + +```hcl +enable_insights = true # Insights only +# or +enable_polly = true # Polly only +``` + +```bash +cd terraform/modules/azure +make init-values # appends insights + polly add-on overlays to the values chain +make deploy # rolling update, ~5 min +``` + +`make init-values` appends the add-on overlays based on `clickhouse_source` in `terraform.tfvars`: + +- `clickhouse_source = "in-cluster"`, generates a minimal overlay (`config.insights.enabled: true` only). The Helm chart manages ClickHouse internally. +- `clickhouse_source = "external"`, generates a full overlay with `clickhouse.external.enabled: true` and a `langsmith-clickhouse` secret reference. Create this secret with the ClickHouse host and credentials before deploying. + + +**Do not manually copy the Insights example file for in-cluster ClickHouse.** The example `helm/values/examples/langsmith-values-insights.yaml` has `clickhouse.external.enabled: true` and `existingSecretName: langsmith-clickhouse`. Copying it manually when using in-cluster ClickHouse causes `CreateContainerConfigError` because the secret does not exist. Always use `make init-values` to generate the correct file. + + +Verify: + +```bash +# ClickHouse already running from base install +# Insights and Polly deploy as dynamic pods when first invoked from the UI +kubectl get pods -n langsmith | grep -E "clickhouse|polly|clio" + +# Watch for dynamic pods on first Insights use +kubectl get pods -n langsmith -w + +# Confirm Insights is enabled in Helm values +helm get values langsmith -n langsmith | grep -A3 insights +# Expected: enabled: true +``` + + +**Encryption keys must never change after first enable.** `insights_encryption_key` and `polly_encryption_key` must never change after first enable. 
Changing either permanently corrupts all existing encrypted data. There is no recovery path. These keys live in Key Vault and never rotate automatically. + + + +**Roll the frontend after first Polly enable.** If the Polly UI shows "Unable to connect to LangGraph server" after enabling, the frontend started before the bootstrap ConfigMap was ready. Fix: + +```bash +kubectl rollout restart deployment langsmith-frontend -n langsmith +``` + + +### Add-on summary + +| Phase | New pods | Total ~running | +|---|---|---| +| Base install | Core LangSmith (backend, frontend, queue, ingest-queue, clickhouse, etc.) | ~17 | +| LangSmith Deployment | `host-backend`, `listener`, `operator` | ~20 | +| Agent Builder | `tool-server`, `trigger-server`, `bootstrap` Job + 4 dynamic Agent Builder pods | ~26 | +| Insights and Polly | No new static pods (Clio + Polly appear dynamically on first use) | ~22 at rest | + +## Ingress controllers + +Set `ingress_controller` in `terraform.tfvars` before `make apply`. For the full TLS compatibility matrix, see `INGRESS_CONTROLLERS.md` in the [Azure module repo](https://github.com/langchain-ai/terraform/blob/main/modules/azure/INGRESS_CONTROLLERS.md). + +| Value | What Terraform installs | Best for | +|---|---|---| +| `nginx` _(default)_ | `ingress-nginx` Helm chart with Azure LB | Standard deployments. Simplest setup. | +| `istio-addon` | AKS Service Mesh add-on (Azure-managed Istio) | Azure-managed Istio mesh, multi-dataplane, mTLS. | +| `istio` | `istio-base` + `istiod` + `istio-ingressgateway` | Self-managed Istio. Full mesh and sidecar injection. | +| `agic` | Azure Application Gateway v2 + AGIC Helm chart | Enterprise Azure, native L7 WAF, HTTP-only or dns01 + custom domain. | +| `envoy-gateway` | `gateway-helm` OCI chart, Kubernetes Gateway API | Gateway API native, modern alternative to Ingress. | + + +`letsencrypt` (HTTP-01) only works with `nginx`, `istio` (self-managed), and `envoy-gateway`. 
`istio-addon` and `agic` do not create an IngressClass, so the ACME solver cannot receive traffic. For those controllers, use `dns01` with a custom domain, or `none` for HTTP-only. + + +## DNS and TLS + +`dns_label` gives you a free Azure subdomain, `