diff --git a/src/.codespellignore b/src/.codespellignore
index 1b6ede1602..f672c87772 100644
--- a/src/.codespellignore
+++ b/src/.codespellignore
@@ -13,4 +13,5 @@ SAIs
 iTerm
 SOM
 AKS
+aks
 ACI
diff --git a/src/docs.json b/src/docs.json
index 019d2ca5ed..51a4c42484 100644
--- a/src/docs.json
+++ b/src/docs.json
@@ -840,6 +840,42 @@
                         "langsmith/gcp-self-hosted"
                       ]
                     },
+                    {
+                      "group": "Deploy with Terraform",
+                      "pages": [
+                        "langsmith/self-host-terraform",
+                        {
+                          "group": "AWS",
+                          "pages": [
+                            "langsmith/self-host-terraform-aws-deploy",
+                            "langsmith/self-host-terraform-aws-architecture",
+                            "langsmith/self-host-terraform-aws-variables",
+                            "langsmith/self-host-terraform-aws-quick-reference",
+                            "langsmith/self-host-terraform-aws-troubleshooting"
+                          ]
+                        },
+                        {
+                          "group": "GCP",
+                          "pages": [
+                            "langsmith/self-host-terraform-gcp-deploy",
+                            "langsmith/self-host-terraform-gcp-architecture",
+                            "langsmith/self-host-terraform-gcp-variables",
+                            "langsmith/self-host-terraform-gcp-quick-reference",
+                            "langsmith/self-host-terraform-gcp-troubleshooting"
+                          ]
+                        },
+                        {
+                          "group": "Azure",
+                          "pages": [
+                            "langsmith/self-host-terraform-azure-deploy",
+                            "langsmith/self-host-terraform-azure-architecture",
+                            "langsmith/self-host-terraform-azure-variables",
+                            "langsmith/self-host-terraform-azure-quick-reference",
+                            "langsmith/self-host-terraform-azure-troubleshooting"
+                          ]
+                        }
+                      ]
+                    },
                     {
                       "group": "Setup guides",
                       "pages": [
diff --git a/src/images/self-hosted-terraform/aws-architecture.png b/src/images/self-hosted-terraform/aws-architecture.png
new file mode 100644
index 0000000000..befcd65675
Binary files /dev/null and b/src/images/self-hosted-terraform/aws-architecture.png differ
diff --git a/src/images/self-hosted-terraform/aws-deployment-flow.png b/src/images/self-hosted-terraform/aws-deployment-flow.png
new file mode 100644
index 0000000000..c03ca1e0e2
Binary files /dev/null and b/src/images/self-hosted-terraform/aws-deployment-flow.png differ
diff --git a/src/images/self-hosted-terraform/azure-architecture-light.png b/src/images/self-hosted-terraform/azure-architecture-light.png
new file mode 100644
index 0000000000..958a95e1c7
Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture-light.png differ
diff --git a/src/images/self-hosted-terraform/azure-architecture-pass2.png b/src/images/self-hosted-terraform/azure-architecture-pass2.png
new file mode 100644
index 0000000000..8b096cc3ca
Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture-pass2.png differ
diff --git a/src/images/self-hosted-terraform/azure-architecture-pass3.png b/src/images/self-hosted-terraform/azure-architecture-pass3.png
new file mode 100644
index 0000000000..0fc27a0282
Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture-pass3.png differ
diff --git a/src/images/self-hosted-terraform/azure-architecture-pass4-5.png b/src/images/self-hosted-terraform/azure-architecture-pass4-5.png
new file mode 100644
index 0000000000..6e930ae17e
Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture-pass4-5.png differ
diff --git a/src/images/self-hosted-terraform/azure-architecture.png b/src/images/self-hosted-terraform/azure-architecture.png
new file mode 100644
index 0000000000..47d3d6439b
Binary files /dev/null and b/src/images/self-hosted-terraform/azure-architecture.png differ
diff --git a/src/images/self-hosted-terraform/gcp-architecture.png b/src/images/self-hosted-terraform/gcp-architecture.png
new file mode 100644
index 0000000000..80d2a01de2
Binary files /dev/null and b/src/images/self-hosted-terraform/gcp-architecture.png differ
diff --git a/src/images/self-hosted-terraform/langsmith-components.png b/src/images/self-hosted-terraform/langsmith-components.png
new file mode 100644
index 0000000000..bbe3cece0a
Binary files /dev/null and b/src/images/self-hosted-terraform/langsmith-components.png differ
diff --git a/src/langsmith/aws-self-hosted.mdx b/src/langsmith/aws-self-hosted.mdx
index 17659da2ab..bfdcb5e112 100644
--- a/src/langsmith/aws-self-hosted.mdx
+++ b/src/langsmith/aws-self-hosted.mdx
@@ -14,9 +14,7 @@ This page provides:
 - [AWS Well-Architected best practices](#aws-well-architected-best-practices) for operational excellence, security, and reliability.
 
 <Note>
-LangChain provides Terraform modules specifically for AWS to help provision infrastructure for LangSmith. These modules can quickly set up EKS clusters, RDS, ElastiCache, S3, and networking resources.
-
-View the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws) for documentation and examples.
+LangChain publishes production-ready [Terraform modules for AWS](https://github.com/langchain-ai/terraform/tree/main/modules/aws) that provision EKS, RDS, ElastiCache, S3, and networking in a single workflow. Start with the [Deploy with Terraform overview](/langsmith/self-host-terraform) to choose between the Terraform and Helm-only paths.
 </Note>
 
 ## Initial setup
diff --git a/src/langsmith/azure-self-hosted.mdx b/src/langsmith/azure-self-hosted.mdx
index 43326bb3fc..136948f81a 100644
--- a/src/langsmith/azure-self-hosted.mdx
+++ b/src/langsmith/azure-self-hosted.mdx
@@ -14,9 +14,7 @@ This page provides:
 - [Security and access control](#security-and-access-control) recommendations for Azure deployments.
 
 <Note>
-LangChain provides Terraform modules specifically for Azure to help provision infrastructure for LangSmith. These modules can quickly set up AKS clusters, Azure Database for PostgreSQL, Azure Managed Redis, Blob Storage, and networking resources.
-
-View the [Azure Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/azure) for documentation and examples.
+LangChain publishes production-ready [Terraform modules for Azure](https://github.com/langchain-ai/terraform/tree/main/modules/azure) that provision AKS, Azure Database for PostgreSQL, Azure Managed Redis, Blob Storage, and Key Vault in a single workflow. Start with the [Deploy with Terraform overview](/langsmith/self-host-terraform) to choose between the Terraform and Helm-only paths.
 </Note>
 
 ## Initial setup
diff --git a/src/langsmith/gcp-self-hosted.mdx b/src/langsmith/gcp-self-hosted.mdx
index 5324258572..d9a0cd7ff8 100644
--- a/src/langsmith/gcp-self-hosted.mdx
+++ b/src/langsmith/gcp-self-hosted.mdx
@@ -14,9 +14,7 @@ This page provides:
 - [Google Cloud Well-Architected best practices](#google-cloud-well-architected-best-practices) for operational excellence, security, and reliability.
 
 <Note>
-LangChain provides Terraform modules specifically for GCP to help provision infrastructure for LangSmith. These modules can quickly set up GKE clusters, Cloud SQL, Memorystore Redis, Cloud Storage, and networking resources.
-
-View the [GCP Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/gcp) for documentation and examples.
+LangChain publishes production-ready [Terraform modules for GCP](https://github.com/langchain-ai/terraform/tree/main/modules/gcp) that provision GKE, Cloud SQL, Memorystore, Cloud Storage, and networking in a single workflow. Start with the [Deploy with Terraform overview](/langsmith/self-host-terraform) to choose between the Terraform and Helm-only paths.
 </Note>
 
 ## Initial setup
diff --git a/src/langsmith/kubernetes.mdx b/src/langsmith/kubernetes.mdx
index 8355c6d3de..8138132719 100644
--- a/src/langsmith/kubernetes.mdx
+++ b/src/langsmith/kubernetes.mdx
@@ -25,15 +25,9 @@ LangChain has successfully tested LangSmith on the following Kubernetes distribu
 - OpenShift (4.14+)
 - Minikube and Kind (for development purposes)
 
-<Note>
-LangChain provides Terraform modules to help provision infrastructure for LangSmith. These modules can quickly set up Kubernetes clusters, storage, and networking for your deployment.
-
-Available modules:
-- [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws)
-- [Azure Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/azure)
-
-View the [full Terraform repository](https://github.com/langchain-ai/terraform) for documentation and additional resources.
-</Note>
+<Tip>
+**Prefer infrastructure as code?** [Deploy with Terraform](/langsmith/self-host-terraform) bundles cluster provisioning, secrets wiring, and the Helm release for AWS, Azure, and GCP into one workflow. The rest of this page covers the Helm-only path for any conformant cluster you already manage.
+</Tip>
 
 ## Prerequisites
 
diff --git a/src/langsmith/self-host-terraform-aws-architecture.mdx b/src/langsmith/self-host-terraform-aws-architecture.mdx
new file mode 100644
index 0000000000..e9e187c251
--- /dev/null
+++ b/src/langsmith/self-host-terraform-aws-architecture.mdx
@@ -0,0 +1,326 @@
+---
+title: AWS Terraform architecture
+sidebarTitle: Architecture
+description: Platform layers, services, IRSA roles, networking, and module dependencies for LangSmith self-hosted on AWS EKS.
+---
+
+This page documents what the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws) provision and how the modules wire the resulting deployment together.
+
+## Platform layers
+
+LangSmith on AWS deploys in two stages with one optional add-on. The infrastructure stage provisions the cloud foundation. The application stage installs the LangSmith Helm chart. The LangSmith Deployment add-on is opt-in and adds the host-backend, listener, and operator services for managing LangGraph applications from the UI.
+
+<img src="/images/self-hosted-terraform/aws-architecture.png" alt="LangSmith on AWS service layout" />
+
+```txt
+LangSmith Deployment add-on  (enable_langsmith_deployments = true)
+  host-backend, listener, operator
+  Per deployed graph: api-server, queue, redis, postgres (operator-managed)
+  Requires: KEDA (installed alongside infrastructure via k8s-bootstrap)
+
+LangSmith application  (deploy_langsmith = true)
+  backend, frontend, playground, queue, ace-backend, clickhouse
+  Storage: RDS PostgreSQL (metadata) + S3 (trace blobs via VPC endpoint)
+  Ingress: AWS ALB | NGINX | Envoy Gateway | Istio
+
+AWS infrastructure
+  VPC + private/public subnets + single NAT gateway
+  EKS cluster + managed node group + cluster autoscaler
+  RDS PostgreSQL (private subnets)
+  ElastiCache Redis (private subnets)
+  S3 bucket + VPC Gateway Endpoint (no public route)
+  ALB controller + EBS CSI driver + metrics server
+  k8s-bootstrap: KEDA, ESO, optional Envoy Gateway
+  Optional: Network Firewall, WAF, CloudTrail, ALB access logs
+```
+
+## Component to storage mapping
+
+| Component | Storage backend | Access method |
+|---|---|---|
+| `backend` | RDS PostgreSQL | Private subnet, security group |
+| `backend` | S3 bucket | IRSA + VPC Gateway Endpoint |
+| `clickhouse` | EBS volume (GP3, EKS PVC) | Local |
+| `redis` | ElastiCache or in-cluster | Private subnet, security group |
+| `operator` (LangSmith Deployment add-on) | RDS PostgreSQL (shared) | Private subnet, security group |
+
+## Application core services
+
+These pods run on every deployment. All write logs and metrics; the busier components (backend, queue, ingest-queue) scale horizontally.
+
+| Service | Purpose | Port | HPA | IRSA | Depends on |
+|---|---|---|---|---|---|
+| `langsmith-frontend` | React UI | 3000 | 1 to 10 | No | `backend`, `platform-backend` |
+| `langsmith-backend` | Main API (traces, runs, projects, API keys, feedback) | 1984 | 3 to 10 | Yes (S3) | Postgres, Redis, ClickHouse, S3 |
+| `langsmith-platform-backend` | Org and user management, auth, billing, settings | 1986 | 1 to 10 | Yes (S3) | Postgres, Redis, S3 |
+| `langsmith-playground` | LLM prompt playground UI | 3001 | 1 to 10 | No | `backend` |
+| `langsmith-queue` | Trace ingestion worker (Redis to ClickHouse + S3) | — | 3 to 10 + KEDA | Yes | Redis, ClickHouse, S3 |
+| `langsmith-ingest-queue` | Dedicated high-throughput ingestion worker | — | 3 to 10 + KEDA | Yes | Redis, S3 |
+| `langsmith-ace-backend` | Async compute (dataset runs, evaluations, background jobs) | — | 1 to 5 | No | Postgres, Redis |
+| `langsmith-clickhouse` | Columnar store (trace spans, run metadata, eval results) | — | StatefulSet, single replica | No | EBS GP3 PVC |
+
+<Warning>
+In-cluster ClickHouse is dev/POC only (single pod, no replication, no backups). For production, use [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse) or a self-managed external cluster.
+</Warning>
+
+### One-time jobs
+
+The Helm chart runs three jobs at install and upgrade time:
+
+| Job | Purpose |
+|---|---|
+| `langsmith-backend-migrations` | PostgreSQL schema migrations |
+| `langsmith-backend-ch-migrations` | ClickHouse schema migrations |
+| `langsmith-backend-auth-bootstrap` | Creates the initial org and admin account from `initial_org_admin_password` in `langsmith-config` |
+
+## LangSmith Deployment add-on
+
+When `enable_langsmith_deployments = true`, three additional services are installed and a `LangGraphPlatform` CRD is registered. Each deployment the user creates in the LangSmith UI produces a Kubernetes Deployment in the `langsmith` namespace, managed by the operator.
+
+| Service | Purpose |
+|---|---|
+| `langsmith-host-backend` | LangGraph control plane API. Manages deployment lifecycle, serves deployment metadata. IRSA for S3 access. |
+| `langsmith-listener` | Watches host-backend for deployment state changes, creates and updates `LangGraphPlatform` CRDs. IRSA for S3 access. |
+| `langsmith-operator` | Kubernetes operator. Reconciles `LangGraphPlatform` CRDs, creates and deletes Deployments and Services for each agent. |
+
+## AWS managed services
+
+When `postgres_source = "external"` and `redis_source = "external"` (the recommended production setting), Terraform provisions the following AWS managed services:
+
+### RDS PostgreSQL
+
+- Default size: `db.t3.large`, private subnets, port 5432.
+- Holds orgs, users, projects, API keys, settings.
+- Secret flow: SSM `/langsmith/{base_name}/postgres-password` → ESO → `langsmith-config`.
+
+### ElastiCache Redis
+
+- Default size: `cache.m6g.xlarge`, private subnets, TLS port 6379.
+- Trace ingestion queue, pub/sub, short-lived cache.
+- Secret flow: SSM `/langsmith/{base_name}/redis-auth-token` → ESO → `langsmith-config`.
+
+### S3 bucket
+
+- Trace payloads: large inputs and outputs, attachments.
+- IRSA via `langsmith_irsa_role` (no static keys). VPC Gateway Endpoint, no public internet.
+- Prefixes: `ttl_s/` (short TTL) and `ttl_l/` (long TTL).
+- Always required. Disabling blob storage breaks the cluster on large payloads.
+
+### SSM Parameter Store
+
+- Centralized secret store for all LangSmith secrets.
+- Flow: running `source infra/scripts/setup-env.sh` writes the secrets to SSM. The ESO `ClusterSecretStore` reads them and projects a `langsmith-config` Kubernetes Secret that the Helm chart mounts via `config.existingSecretName`.
+- Prefix: `/langsmith/{name_prefix}-{environment}/`.
+
+## Cluster infrastructure
+
+The `k8s-bootstrap` Terraform module installs the cluster-level services that LangSmith depends on:
+
+| Service | Namespace | IRSA | Purpose |
+|---|---|---|---|
+| `aws-load-balancer-controller` | `kube-system` | Yes | Provisions the AWS ALB from Kubernetes Ingress objects. Deleting the Ingress deprovisions the ALB and assigns a new DNS name on recreate, which breaks DNS records and OIDC redirect URIs. |
+| `cluster-autoscaler` | `kube-system` | Yes | Scales EC2 node groups based on pod scheduling pressure. |
+| `ebs-csi-driver` | `kube-system` | Yes | Provisions EBS volumes for PersistentVolumeClaims (used by ClickHouse). |
+| KEDA | `keda` | No | Kubernetes Event-driven Autoscaling. Scales `queue` and `ingest-queue` on Redis queue depth. Required for the LangSmith Deployment add-on. |
+| cert-manager | `cert-manager` | Optional (Route 53 IRSA when `letsencrypt`) | Automates TLS certificate issuance. Always installed, but only issues certificates when `tls_certificate_source = "letsencrypt"`. |
+| External Secrets Operator | `external-secrets` | Yes | Syncs SSM parameters into the `langsmith-config` Kubernetes Secret. |
+
+## IRSA roles
+
+IRSA replaces static credentials. The EKS cluster's OIDC issuer is the trust anchor; service accounts in `langsmith` and `kube-system` are annotated with role ARNs and pods receive temporary credentials via the EKS token webhook.
+
+| Role | Defined in | Used by | Permissions |
+|---|---|---|---|
+| `langsmith_irsa_role` | `modules/eks` | `backend`, `platform-backend`, `queue`, `ingest-queue`, `host-backend`, `listener` | `s3:GetObject`, `s3:PutObject`, `s3:DeleteObject`, `s3:ListBucket` on the LangSmith bucket |
+| `aws_iam_role.eso` | `aws/infra/main.tf` | ESO controller | `ssm:GetParameter`, `ssm:GetParameters` on `/langsmith/*` |
+
+## Network topology
+
+### Default — ALB ingress
+
+```txt
+Internet
+  → AWS Application Load Balancer (port 80 or 443, TLS via ACM or Let's Encrypt)
+    → EKS Cluster (private subnets)
+      • kube-system: aws-load-balancer-controller, cluster-autoscaler, ebs-csi-driver (KEDA runs in its own keda namespace)
+      • langsmith:   backend, frontend, playground, queue, clickhouse
+                     redis (in-cluster) OR ElastiCache (private subnet)
+                     RDS PostgreSQL (private subnet)
+                     S3 bucket (VPC Gateway Endpoint, no public route)
+```
+
+### Envoy Gateway — opt-in
+
+```txt
+Internet
+  → AWS Network Load Balancer (NLB, ACM TLS termination at 443)
+    → envoy-gateway-system: Envoy proxy (GatewayClass: eg, Gateway: langsmith-gateway)
+      → langsmith namespace:        backend, frontend, playground, queue, clickhouse, ...
+      → langsmith-agents namespace (optional dataplane): langgraph-dataplane listener + operator + agent pods
+         (HTTPRoute attaches to shared langsmith-gateway via allowedRoutes: All)
+```
+
+### Egress path with Network Firewall
+
+When `create_firewall = true`, all outbound internet traffic from private subnets is inspected before reaching the NAT gateway:
+
+```txt
+EKS pods / RDS / ElastiCache (private subnets)
+  → AWS Network Firewall (TLS SNI + HTTP Host inspection)
+     ALLOWLIST: firewall_allowed_fqdns (default: beacon.langchain.com)
+     DROP: all other established connections
+  → NAT Gateway (public subnet)
+  → Internet
+```
+
+Pod-to-pod, pod-to-RDS, and pod-to-ElastiCache traffic uses the local VPC route and never touches the firewall.
+
+## Ingress options
+
+Four mutually exclusive ingress options ship with the modules. The choice determines whether split dataplane (agent pods in a separate namespace) is supported.
+
+| Option | Variable | Split dataplane | Traffic path | When to use |
+|---|---|---|---|---|
+| ALB (AWS LBC) | _default_ | No | `ALB → frontend NodePort` | Default. Single-namespace deployments, POC, simplest TLS via ACM. |
+| NGINX Ingress | `enable_nginx_ingress = true` | No | `ALB → TGB → NGINX controller → frontend ClusterIP` | When NGINX is the standard ingress in your organization. |
+| Envoy Gateway | `enable_envoy_gateway = true` | Yes | `ALB → TGB → Envoy proxy:10080 → HTTPRoute → services` | Cross-namespace HTTPRoute routing. Recommended for split dataplane on new AWS deployments. |
+| Istio | `enable_istio_gateway = true` | Yes | `ALB → TGB → istio-ingressgateway:80 → VirtualService → services` | Clusters with Istio already installed, or when an mTLS mesh is required. |
+
+### Why ALB cannot support split dataplane
+
+Standard Kubernetes Ingress is namespace-scoped. The ALB controller routes only to services in the same namespace as the Ingress resource. Agent pods in `langsmith-agents` are invisible to an Ingress in `langsmith`. Envoy Gateway and Istio both support cross-namespace routing via the Kubernetes Gateway API.
+
+### ALB plus Envoy Gateway (chained)
+
+When the existing ALB already provides SSO (Okta or Cognito OIDC), WAF, and TLS, Envoy Gateway slots in behind it instead of replacing it:
+
+```txt
+Internet
+  → ALB (unchanged: WAF, SSO, TLS, DNS)
+    → Envoy Gateway NLB (internal-scheme, auto-provisioned by k8s-bootstrap)
+       → HTTPRoute → langsmith namespace        (control plane)
+       → HTTPRoute → langsmith-agents namespace (split dataplane)
+```
+
+The only change from the default ALB path is retargeting the ALB target group to the Envoy NLB. See `helm/values/examples/langsmith-values-ingress-envoy-gateway.yaml` in the modules repo for the values overlay.
+
+## TLS and DNS
+
+The `tls_certificate_source` variable controls the certificate strategy:
+
+| Mode | Behavior | Compatible gateways |
+|---|---|---|
+| `none` | HTTP only, no certificate | Any |
+| `acm` | HTTPS:443 with HTTP→HTTPS redirect. ACM certificate, auto-provisioned or BYO. | ALB, NGINX |
+| `letsencrypt` | HTTPS via cert-manager + Let's Encrypt DNS-01 (Route 53 IRSA) | Istio, Envoy |
+
+### Why ACM versus cert-manager
+
+ACM certificates are non-exportable. AWS attaches them directly to the ALB, which makes ACM the right choice when TLS terminates at the ALB. ACM cannot be used when TLS terminates inside the cluster (Istio Gateway, Envoy Gateway) because those gateways require the certificate material as a Kubernetes Secret.
+
+cert-manager handles in-cluster TLS for Istio and Envoy. The `letsencrypt` value is a reference implementation: it installs cert-manager and a Let's Encrypt ACME `ClusterIssuer`. In production, swap the `ClusterIssuer` for any cert-manager-compatible issuer.
+
+| Issuer | When to use |
+|---|---|
+| Let's Encrypt _(default)_ | Public domain, internet access, free |
+| ACM Private CA (`aws-privateca-issuer`) | AWS-native, air-gap friendly, private domains, paid |
+| Venafi (`cert-manager-venafi`) | Enterprise PKI, regulated environments |
+| HashiCorp Vault (`cert-manager-vault`) | Self-hosted PKI |
+| DigiCert, Sectigo, others | ACME or custom issuer plugins |
+
+The Terraform module provisions the cert-manager IRSA role and Route 53 permissions. Only the `ClusterIssuer` manifest changes between issuers.
+
+### Auto-provisioned DNS
+
+When `langsmith_domain` is set and `acm_certificate_arn` is empty, Terraform activates the `dns` module, which creates:
+
+- A Route 53 hosted zone for the domain.
+- An ACM certificate with DNS validation records.
+- A Route 53 alias record pointing the domain to the ALB.
+
+**Staged deploy pattern:** Set `langsmith_domain` with `tls_certificate_source = "none"` first. Terraform creates the hosted zone and certificate without blocking on validation. Delegate the NS records at your registrar, then flip to `tls_certificate_source = "acm"` in a later apply. Terraform blocks until the certificate validates and wires it into the HTTPS listener.
+
+### Bring your own certificate
+
+Set `acm_certificate_arn` directly to skip the `dns` module. For in-cluster gateways, create a Kubernetes TLS Secret manually and reference it from the Gateway resource (the Gateway API `Gateway` listener for Envoy Gateway, or the Istio `Gateway` server TLS settings).
+
+## Module dependency graph
+
+```txt
+vpc ─► firewall (optional, create_firewall = true)
+│
+├─► eks ─► k8s-bootstrap (KEDA, ESO, Envoy Gateway [opt-in])
+│            └─► cert-manager (Let's Encrypt DNS-01 via Route 53 IRSA)
+│
+├─► postgres    (RDS, private subnets from VPC)
+├─► redis       (ElastiCache, private subnets from VPC)
+├─► storage     (S3 bucket + VPC Gateway Endpoint)
+├─► alb         (pre-provisioned ALB, public subnets)
+│     └─► alb_access_logs (S3 bucket for access logs, opt-in)
+├─► dns         (Route 53 zone + ACM cert, optional)
+├─► bastion     (jump host for private EKS access, optional)
+├─► cloudtrail  (audit logging, optional)
+├─► waf         (WAF ACL on ALB, optional)
+└─► firewall    (Network Firewall egress filter, optional)
+       all ─► langsmith (root module)
+```
+
+### Opt-in security modules
+
+| Module | Variable | Default | Purpose |
+|---|---|---|---|
+| Network Firewall | `create_firewall` | `false` | FQDN-based egress filtering. Allows only domains in `firewall_allowed_fqdns` (TLS SNI + HTTP Host). Requires `create_vpc = true`. Cost ≈ `$0.40/hr/endpoint + $0.065/GB processed`. |
+| ALB access logs | `alb_access_logs_enabled` | `false` | Traffic analysis and compliance |
+| CloudTrail | `create_cloudtrail` | `false` | API call logging. Skip if an organization trail already exists. |
+| WAF | `create_waf` | `false` | WAFv2 Web ACL — OWASP Top 10, IP reputation, known bad inputs |
+
+## Default resource sizes
+
+| Resource | Default | vCPU | Memory |
+|---|---|---|---|
+| EKS node | `m5.4xlarge` | 16 | 64 GB |
+| RDS PostgreSQL | `db.t3.large` | 2 | 8 GB |
+| ElastiCache Redis | `cache.m6g.xlarge` | 4 | 13.07 GB |
+| RDS storage | 10 GB | — | — |
+
+For production sizing recommendations, see the [scaling guide](/langsmith/self-host-scale) and the [AWS deployment guide](/langsmith/self-host-terraform-aws-deploy#cluster-sizing-reference).
+
+## Validated behaviors and known constraints
+
+These constraints were validated during the April 2026 gateway permutation test run.
+
+| # | Area | Constraint or fix |
+|---|---|---|
+| 1 | ACM wildcard SANs | `langchain.com` has `0 issue "amazon.com"` CAA but not `0 issuewild "amazon.com"`. Wildcard SANs fail with `CAA_ERROR`. The `dns` module requests only the apex domain. |
+| 2 | In-cluster Redis | The LangSmith Helm chart deploys Redis without `requirepass`. The `k8s-bootstrap` module writes `redis://langsmith-redis:6379`. Do not add an auth token unless you also configure the Helm chart Redis values. |
+| 3 | `name_prefix` length | Maximum 15 characters. Names like `dz-nginx-tst` (12 characters) are valid. |
+| 4 | Istio port | Istio 1.23+ ingressgateway listens on port 80 via `NET_BIND_SERVICE`, not port 8080. ALB TGB health check and security group rules must target port 80. |
+| 5 | NGINX TGB port | NGINX ingress-nginx controller pods listen on port 80. The TargetGroupBinding target type is `ip`. |
+| 6 | Envoy proxy port | Envoy proxy pods listen on port 10080 (not 80) when running as non-root. The TGB `servicePort` must be 10080. |
+| 7 | Destroy order | Always run `terraform destroy` first and let Terraform handle namespace and Helm release lifecycle. Pre-deleting namespaces causes the `helm_release` resource to time out because Helm cannot uninstall cleanly into a terminating namespace. |
+| 8 | Stuck terminating namespaces | KEDA's stale `external.metrics.k8s.io/v1beta1` API group causes `NamespaceDeletionDiscoveryFailure`. Fix: `kubectl delete apiservice v1beta1.external.metrics.k8s.io` before re-running `terraform destroy`. |
+
+## Verification commands
+
+```bash
+# EKS cluster status
+aws eks describe-cluster --name <cluster-name> --query "cluster.status"
+
+# Node health
+kubectl get nodes -o wide
+
+# ALB status
+kubectl get ingress -n langsmith
+
+# RDS status
+aws rds describe-db-instances \
+  --query "DBInstances[?DBInstanceIdentifier=='<db-id>'].DBInstanceStatus"
+
+# ElastiCache status
+aws elasticache describe-replication-groups \
+  --query "ReplicationGroups[?ReplicationGroupId=='<group-id>'].Status"
+
+# S3 access from a pod (via VPC endpoint)
+kubectl run s3-test --rm -it --image=amazon/aws-cli -n langsmith -- \
+  aws s3 ls s3://<bucket-name>
+```
diff --git a/src/langsmith/self-host-terraform-aws-deploy.mdx b/src/langsmith/self-host-terraform-aws-deploy.mdx
new file mode 100644
index 0000000000..7c4afec8ac
--- /dev/null
+++ b/src/langsmith/self-host-terraform-aws-deploy.mdx
@@ -0,0 +1,425 @@
+---
+title: Deploy LangSmith on AWS with Terraform
+sidebarTitle: Deploy
+description: End-to-end walkthrough for provisioning LangSmith self-hosted on AWS EKS using the LangChain Terraform modules.
+---
+
+Provision the AWS cloud foundation and install LangSmith with the public Terraform modules at [github.com/langchain-ai/terraform/tree/main/modules/aws](https://github.com/langchain-ai/terraform/tree/main/modules/aws). Plan for 30 to 40 minutes end to end on a clean account.
+
+The deployment runs in two stages: infrastructure (Terraform provisions VPC, EKS, RDS, ElastiCache, S3, IAM) and application (Helm installs the LangSmith chart against the cluster). Add-ons are enabled with a flag and a redeploy.
+
+## Prerequisites
+
+### Required tools
+
+| Tool | Version | Purpose |
+|---|---|---|
+| AWS CLI | v2 | Authenticate, query AWS resources, manage EKS kubeconfig |
+| Terraform | 1.5+ | Run the infrastructure modules |
+| `kubectl` | 1.28+ | Inspect the EKS cluster |
+| Helm | 3.12+ | Install and manage the LangSmith chart |
+| `eksctl` | latest | Optional, handy for kubeconfig and debugging |
+
+Install on macOS:
+
+```bash
+brew install awscli kubectl helm eksctl
+brew tap hashicorp/tap && brew install hashicorp/tap/terraform
+```
+
+Verify each tool is on `PATH`:
+
+```bash
+aws --version
+terraform version
+kubectl version --client
+helm version
+```
+
+For Linux, follow the [AWS CLI install guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) and use your distribution's package manager for the remaining tools.
+
+### Required AWS IAM permissions
+
+The IAM user or role running Terraform needs permission to create and manage the cloud foundation. The following managed policies cover the full surface area. Use them as a starting point and trim down to least-privilege once the deployment is stable.
+
+| Policy | Purpose |
+|---|---|
+| `AmazonEKSClusterPolicy` | Create and manage EKS clusters |
+| `AmazonVPCFullAccess` | Create VPC, subnets, route tables, and NAT |
+| `AmazonRDSFullAccess` | Create and manage RDS PostgreSQL instances |
+| `AmazonElastiCacheFullAccess` | Create ElastiCache Redis clusters |
+| `AmazonS3FullAccess` | Create S3 buckets and VPC endpoints |
+| `IAMFullAccess` | Create IRSA roles and policies |
+
+<Tip>
+Run `make preflight` from `modules/aws/` after authenticating. The preflight script confirms that the active credentials can perform each required action and reports the first missing permission, which is faster than discovering gaps mid-`terraform apply`.
+</Tip>
+
+### Authenticate
+
+Configure AWS credentials with the CLI:
+
+```bash
+aws configure
+```
+
+Or export environment variables:
+
+```bash
+export AWS_ACCESS_KEY_ID="..."
+export AWS_SECRET_ACCESS_KEY="..."
+export AWS_DEFAULT_REGION="us-west-2"
+```
+
+Confirm the credentials work and the target region is enabled in the account:
+
+```bash
+aws sts get-caller-identity
+aws ec2 describe-availability-zones --query 'AvailabilityZones[].ZoneName' --output table
+```
+
+### License key and domain
+
+Two non-AWS items must be ready before `terraform apply`:
+
+- **LangSmith license key.** [Contact sales](https://www.langchain.com/contact-sales) to request one. The key is stored in AWS SSM Parameter Store by the setup script, not in `tfvars`.
+- **Domain or subdomain** that you control and can point at the deployment's load balancer, plus an ACM certificate covering it (or set the `tls_certificate_source` variable to `letsencrypt` or `none`).
+
+### Cluster sizing reference
+
+The Terraform modules pick instance types and node counts based on `sizing_profile`. Plan capacity for the target tier before deploying.
+
+| Profile | EKS nodes | RDS instance | ElastiCache | Use case |
+|---|---|---|---|---|
+| `dev` | 2 × `m5.xlarge` | `db.t4g.medium` | `cache.t4g.small` | Demos, CI, short-lived POCs |
+| `production` | 3 × `m5.2xlarge` (HPA on) | `db.m6g.large` | `cache.m6g.large` | Standard production |
+| `production-large` | 6 × `m5.4xlarge` (HPA on) | `db.m6g.2xlarge` | `cache.m6g.xlarge` | High-volume, multi-tenant |
+
+<Note>
+For `production` and `production-large`, also plan to provision external [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse) or a self-managed external ClickHouse cluster. In-cluster ClickHouse is supported for `dev` only.
+</Note>
+
+## Rapid path
+
+For the fastest path from zero to a running LangSmith instance, run these commands in order:
+
+```bash
+# 1. Clone the public modules
+git clone https://github.com/langchain-ai/terraform.git
+cd terraform/modules/aws
+
+# 2. Generate terraform.tfvars interactively (Enter accepts current values)
+make quickstart
+
+# 3. Load secrets into SSM Parameter Store
+#    Must be sourced, not executed
+source infra/scripts/setup-env.sh
+
+# 4. Provision infrastructure (~20 to 25 min)
+make init
+make plan
+make apply
+
+# 5. Configure kubectl
+make kubeconfig
+kubectl get nodes
+
+# 6. Deploy LangSmith via Helm (~5 to 10 min)
+make init-values
+make deploy
+
+# 7. Confirm
+kubectl get pods -n langsmith
+kubectl get ingress -n langsmith
+```
+
+To chain infrastructure and application in one command:
+
+```bash
+make quickdeploy          # interactive, prompts before terraform apply
+make quickdeploy-auto     # non-interactive, auto-approves terraform
+```
+
+`make quickdeploy` runs `terraform apply` → `kubeconfig` → `init-values` → `helm deploy` in sequence. If any step fails, the command exits with instructions for resuming from that step.
+
+The sections below cover each phase in detail.
+
+## Provision infrastructure
+
+Provisioning the AWS cloud foundation takes 20 to 25 minutes on a clean account. Do not interrupt the apply.
+
+### What gets provisioned
+
+| Resource | Purpose |
+|---|---|
+| VPC + subnets + NAT | Private network for the cluster and managed services |
+| EKS cluster + node groups | Kubernetes compute |
+| RDS PostgreSQL | LangSmith operational data |
+| ElastiCache Redis | Queue and cache |
+| S3 bucket + VPC endpoint | Trace payload blob storage |
+| ALB + listeners | Public ingress with TLS |
+| SSM Parameter Store entries | Application secrets, synced into the cluster by External Secrets Operator |
+| IRSA roles + IAM policies | Per-service AWS access |
+| KEDA, cert-manager, ESO | Bootstrap workloads installed alongside infrastructure |
+
+### Clone and configure
+
+```bash
+git clone https://github.com/langchain-ai/terraform.git
+cd terraform/modules/aws
+```
+
+All subsequent commands run from `modules/aws/`. Run `make help` for the full target list.
+
+Generate `terraform.tfvars` with the interactive wizard:
+
+```bash
+make quickstart
+```
+
+The wizard prompts for naming prefix, region, EKS sizing, TLS source, external vs in-cluster services, and the optional add-on flags. It writes `infra/terraform.tfvars`. Re-running the wizard pre-selects existing values; press Enter at each prompt to keep the current config.
+
+Prefer to edit by hand? Copy the example and fill in the required fields:
+
+```bash
+cp infra/terraform.tfvars.example infra/terraform.tfvars
+vi infra/terraform.tfvars
+```
+
+The minimum required variables:
+
+```hcl
+name_prefix = "acme"
+environment = "prod"
+region      = "us-west-2"
+
+eks_cluster_version = "1.31"
+eks_managed_node_groups = {
+  default = {
+    name           = "node-group-default"
+    instance_types = ["m5.4xlarge"]
+    min_size       = 3
+    max_size       = 10
+  }
+}
+
+postgres_source = "external"
+redis_source    = "external"
+
+tls_certificate_source = "acm"
+acm_certificate_arn    = "arn:aws:acm:us-west-2:<account-id>:certificate/<cert-id>"
+langsmith_domain       = "langsmith.example.com"
+```
+
+See the [AWS variables reference](/langsmith/self-host-terraform-aws-variables) for every input variable.
+
+<Tip>
+Configure a remote state backend before applying. Edit `infra/backend.tf` to point at an S3 bucket and DynamoDB lock table you control. The Terraform repo ships a local backend by default for first-time evaluations.
+</Tip>
+
+### Load secrets into SSM Parameter Store
+
+```bash
+source infra/scripts/setup-env.sh
+```
+
+The script reads `terraform.tfvars`, derives the SSM path `/langsmith/{name_prefix}-{environment}/`, then for each secret either reuses an exported value, reads the existing SSM parameter, auto-generates one (for salts, tokens, and encryption keys), or prompts you. The PostgreSQL password, license key, and admin password are the values you supply interactively. The script must be sourced (not executed) because a child process, including one started by `make`, cannot export environment variables back to the parent shell.
+
+The script manages the following SSM parameters:
+
+| SSM key | How it is set | Notes |
+|---|---|---|
+| `postgres-password` | Prompt | RDS uses this password |
+| `redis-auth-token` | Auto-generated (`openssl rand -hex 32`) | ElastiCache requires hex |
+| `langsmith-api-key-salt` | Auto-generated (`openssl rand -base64 32`) | Never rotate; rotating it breaks all API keys |
+| `langsmith-jwt-secret` | Auto-generated (`openssl rand -base64 32`) | Never rotate; rotating it invalidates all sessions |
+| `langsmith-license-key` | Prompt | From your LangChain account team |
+| `langsmith-admin-password` | Prompt | Must contain a symbol |
+| `deployments-encryption-key` | Auto-generated Fernet key | LangSmith Deployment add-on |
+| `agent-builder-encryption-key` | Auto-generated Fernet key | Agent Builder add-on |
+| `insights-encryption-key` | Auto-generated Fernet key | Insights add-on |
+| `polly-encryption-key` | Auto-generated Fernet key | Polly add-on |
+
+Verify the secrets are present and the `TF_VAR_*` environment variables are exported:
+
+```bash
+make secrets
+```
+
+### Apply
+
+```bash
+make init
+make plan
+make apply
+```
+
+`make plan` shows the proposed diff. Review the output before applying. `make apply` provisions in dependency order: VPC and security groups, then EKS (about 12 minutes) and RDS (about 8 minutes, in parallel), then node groups, ElastiCache, S3, and the ALB.
+
+### Configure kubectl
+
+```bash
+make kubeconfig
+kubectl get nodes
+kubectl get pods -n kube-system
+```
+
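+All nodes should report `Ready` and the core add-ons in `kube-system` (CoreDNS, kube-proxy, VPC CNI) should be `Running`. Bootstrap workloads such as KEDA, cert-manager, and ESO may run in their own namespaces (for example, `kubectl get pods -n keda`); run `kubectl get pods -A` to confirm they are `Running` as well.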
+
+## Deploy LangSmith
+
+Two deployment paths are supported. Pick one.
+
+### Script-driven Helm deploy (recommended)
+
+Best for most deployments. Interactive prompts guide you through sizing and product choices.
+
+```bash
+cd modules/aws
+
+make init-values
+make deploy
+```
+
+`init-values.sh` prompts for the admin email, then reads `sizing_profile` and the `enable_*` flags from `terraform.tfvars` and copies the matching values files from `helm/values/examples/` into `helm/values/`. On re-runs it preserves your choices and refreshes Terraform outputs.
+
+`make deploy` runs `helm/scripts/deploy.sh`, which:
+
+1. Refreshes the kubeconfig.
+2. Runs preflight checks (AWS credentials, cluster reachability, the `langchain` Helm repo).
+3. Applies the External Secrets Operator `ClusterSecretStore` and `ExternalSecret` so the cluster reads secrets directly from SSM.
+4. Installs the LangSmith Helm chart with the layered values files.
+
+Expect 5 to 10 minutes for the chart to install and pods to become ready.
+
+#### Verify
+
+```bash
+kubectl get pods -n langsmith
+kubectl get ingress -n langsmith
+```
+
+When all pods are `Running` and the ingress shows the ALB DNS name, the deployment is ready. Use the domain you configured in `langsmith_domain` (or the ALB DNS name) to reach the UI.
+
+### Terraform-managed Helm deploy
+
+Best for teams that want the full deployment in Terraform state, or for "bring your own infrastructure" scenarios. The `app/` module manages the External Secrets Operator wiring, the `helm_release`, and feature toggles directly.
+
+```bash
+cd modules/aws
+
+# Generate Helm values files from templates (required, the app module reads these)
+make init-values
+
+# Pull infra outputs into app/infra.auto.tfvars.json
+make init-app
+
+# Configure app-specific settings
+cp app/terraform.tfvars.example app/terraform.tfvars
+# Edit app/terraform.tfvars, set admin_email, sizing, and feature toggles
+
+# Deploy
+make plan-app
+make apply-app
+```
+
+The `app/terraform.tfvars` file controls the application configuration:
+
+```hcl
+admin_email          = "admin@example.com"
+sizing               = "production"   # production | production-large | dev | none
+enable_agent_deploys = true
+enable_agent_builder = true
+enable_insights      = true
+enable_polly         = true
+clickhouse_host      = "clickhouse.example.com"
+```
+
+<Warning>
+`make init-values` is required before `make plan-app`. The app module reads the values files from `helm/values/`, which `init-values` populates from `helm/values/examples/` based on the sizing and add-on choices in `infra/terraform.tfvars`.
+</Warning>
+
+For "bring your own infrastructure", skip `make init-app` and set all variables manually in `app/terraform.tfvars`.
+
+## Enable add-ons
+
+Each add-on is gated by a flag in `infra/terraform.tfvars`. Set the flag, re-run `make init-values` to copy the matching values file, then re-run `make deploy`.
+
+```hcl
+enable_deployments     = true   # LangGraph Platform (required for Agent Builder and Polly)
+enable_agent_builder   = true   # Agent Builder UI
+enable_insights        = true   # ClickHouse-backed analytics
+enable_polly           = true   # Polly AI eval and monitoring
+enable_usage_telemetry = false  # Extended usage telemetry
+```
+
+```bash
+make init-values
+make deploy
+```
+
+For details on each add-on, see [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform).
+
+## Optional: private EKS cluster with bastion
+
+For deployments that must run a fully private EKS API endpoint, the modules ship a bastion host pattern:
+
+1. Run the initial apply from your workstation with `create_bastion = true` and `enable_public_eks_cluster = true` so the bastion can be created.
+2. After the initial deployment, set `enable_public_eks_cluster = false` (the final configuration shown in the following snippet) and re-apply. The EKS API endpoint becomes private only.
+3. All subsequent Terraform work happens on the bastion. Connect to it with SSM Session Manager, clone the repo, copy your `terraform.tfvars` and SSM secrets, then run the deployment from there.
+
+```hcl
+enable_public_eks_cluster = false
+create_bastion            = true
+
+# Optional SSH access (SSM is the default and requires no key):
+# bastion_key_name          = "my-keypair"
+# bastion_enable_ssh        = true
+# bastion_ssh_allowed_cidrs = ["203.0.113.0/24"]
+```
+
+Connect via SSM Session Manager:
+
+```bash
+terraform output bastion_ssm_command
+aws ssm start-session --target <instance-id> --region <region>
+```
+
+<Note>
+The bastion lives in a public subnet for SSM agent connectivity but does not need a public IP if your VPC has the SSM, SSMMessages, and EC2Messages VPC endpoints. The bastion comes preinstalled with `kubectl`, `helm`, `terraform`, `git`, and `jq`, with kubeconfig already configured for the EKS cluster. Install the [Session Manager plugin](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html) for the AWS CLI on your workstation.
+</Note>
+
+## Optional: Envoy Gateway ingress
+
+The default ingress is the AWS Load Balancer Controller (ALB). Set `enable_envoy_gateway = true` in `infra/terraform.tfvars` to install [Envoy Gateway](https://gateway.envoyproxy.io/) instead. Envoy Gateway is required for multi-namespace dataplane deployments where the `langgraph-dataplane` chart runs in its own namespace.
+
+```hcl
+# infra/terraform.tfvars
+enable_envoy_gateway = true
+```
+
+```bash
+source infra/scripts/setup-env.sh
+make apply
+
+make init-values
+cp helm/values/examples/langsmith-values-ingress-envoy-gateway.yaml helm/values/
+make deploy
+```
+
+The deploy script annotates the Envoy Gateway NLB service with the ACM certificate ARN automatically when `tls_certificate_source = "acm"`. TLS terminates at the NLB; Envoy sees plain HTTP internally.
+
+When running the dataplane chart in a separate namespace, apply the RBAC manifest once per dataplane namespace:
+
+```bash
+kubectl apply -f helm/values/dataplane-rbac.yaml
+```
+
+This grants the `langsmith-host-backend` ServiceAccount read access to pods, pod logs, deployments, and ReplicaSets in the dataplane namespace. Without it, agent run logs do not stream in the LangSmith UI.
+
+## Next steps
+
+- Reference the [AWS variables](/langsmith/self-host-terraform-aws-variables) and the [quick reference](/langsmith/self-host-terraform-aws-quick-reference).
+- Review the [AWS architecture](/langsmith/self-host-terraform-aws-architecture) for platform layers, IRSA, and module dependencies.
+- When something breaks, check the [AWS troubleshooting guide](/langsmith/self-host-terraform-aws-troubleshooting).
+- Enable agent deployment in the UI with [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform).
diff --git a/src/langsmith/self-host-terraform-aws-quick-reference.mdx b/src/langsmith/self-host-terraform-aws-quick-reference.mdx
new file mode 100644
index 0000000000..40a17ce372
--- /dev/null
+++ b/src/langsmith/self-host-terraform-aws-quick-reference.mdx
@@ -0,0 +1,301 @@
+---
+title: AWS Terraform quick reference
+sidebarTitle: Quick reference
+description: Make targets, Terraform commands, kubectl, AWS CLI, and Helm operations for LangSmith self-hosted on AWS EKS.
+---
+
+Command cheat sheet for day-to-day operations against an AWS LangSmith deployment provisioned with the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws). All `make` targets run from `modules/aws/`. Run `make help` for an inline summary.
+
+For the full deployment walkthrough, see the [AWS deployment guide](/langsmith/self-host-terraform-aws-deploy).
+
+## First-time setup
+
+```bash
+cd terraform/modules/aws
+
+# 1. Generate terraform.tfvars (interactive wizard)
+make quickstart
+
+# 2. Load secrets into SSM Parameter Store and export TF_VAR_* into your shell.
+#    Must use `source` — Make runs each target in a subshell.
+source infra/scripts/setup-env.sh
+
+# 2a. Confirm secrets and TF_VAR_* are set (optional but recommended)
+make secrets
+
+# 3. Provision infrastructure (~20–25 min)
+make init
+make plan       # review — confirm no unexpected destroy/replace actions
+make apply
+
+# 3a. Verify post-infra state (optional)
+make preflight-post
+
+# 4. Update kubeconfig for the EKS cluster
+make kubeconfig
+
+# 5. Generate Helm values from Terraform outputs
+make init-values
+
+# 6. Deploy LangSmith (~5–10 min)
+make deploy
+```
+
+Fast path once `make quickstart` and `source infra/scripts/setup-env.sh` are complete:
+
+```bash
+make quickdeploy        # interactive (prompts before terraform apply)
+make quickdeploy-auto   # non-interactive (auto-approves terraform)
+```
+
+## Day-2 operations
+
+```bash
+# Check deployment state across all layers; print next-step guidance
+make status
+
+# Re-deploy after editing Helm values or upgrading
+make deploy
+
+# Re-generate Helm values after Terraform changes
+make init-values
+
+# Re-sync ESO secrets without redeploying
+make apply-eso
+
+# Check SSM secrets and TF_VAR_* export status (read-only)
+make secrets
+
+# List all SSM parameters with last-modified timestamps
+make secrets-list
+
+# Manage SSM secrets interactively (view, set, rotate, diff vs cluster)
+make ssm
+
+# Update kubeconfig for the EKS cluster
+make kubeconfig
+```
+
+## Preflight checks
+
+```bash
+# Pre-Terraform: AWS credentials + IAM permissions
+make preflight
+
+# Post-apply: kubectl, SSM params, Helm values, TLS config
+make preflight-post
+
+# SSM only — confirm all parameters are populated (after `source infra/scripts/setup-env.sh`)
+make preflight-ssm
+```
+
+## Add-ons
+
+Add-ons are controlled by `enable_*` flags in `infra/terraform.tfvars`. Set the flags, re-run `init-values` to copy the matching values files, then re-deploy.
+
+```hcl
+# infra/terraform.tfvars
+enable_deployments     = true   # LangGraph Platform (required for Agent Builder and Polly)
+enable_agent_builder   = true   # Agent Builder UI
+enable_insights        = true   # ClickHouse-backed analytics
+enable_polly           = true   # Polly AI eval/monitoring
+enable_usage_telemetry = false  # Extended usage telemetry
+```
+
+```bash
+make init-values
+make deploy
+```
+
+## Sizing profiles
+
+Set `sizing_profile` to one of the following values in `infra/terraform.tfvars`, then re-run `make init-values && make deploy`.
+
+```hcl
+sizing_profile = "production"        # multi-replica with HPA (recommended)
+sizing_profile = "production-large"  # high-volume (~50 users, ~1000 traces/sec)
+sizing_profile = "dev"               # single-replica, minimal resources
+sizing_profile = "default"           # chart defaults (no sizing file)
+```
+
+## Make targets
+
+### Setup and secrets
+
+| Command | Description |
+|---|---|
+| `make quickstart` | Interactive wizard. Generates `infra/terraform.tfvars` (region, node size, TLS method, add-ons). |
+| `make setup-env` | Prints the exact `source` command for loading secrets into your shell. Cannot export variables directly. |
+| `make secrets` | Show SSM secrets status (`✓ SET` / `✗ MISSING`) per parameter, check `TF_VAR_*` exports, give next steps. |
+| `make secrets-list` | List all SSM parameters for this deployment with last-modified timestamps. |
+| `make ssm` | Interactive SSM parameter manager. View, set, rotate, validate, diff vs the cluster Secret. |
+
+### Preflight
+
+| Command | Description |
+|---|---|
+| `make preflight` | Verify AWS credentials, IAM permissions, and required CLI tools before Terraform runs. |
+| `make preflight-post` | Run after `make apply`. Checks kubectl context, cluster reachability, SSM params populated, Helm values present, TLS config. |
+| `make preflight-ssm` | Check SSM params only. Narrower scope than `preflight-post`. |
+
+### Infrastructure
+
+| Command | Description |
+|---|---|
+| `make init` | `terraform init`. Downloads providers and modules. Safe to re-run. |
+| `make plan` | `terraform plan`. Preview changes. Review before every apply. |
+| `make apply` | `terraform apply`. Provisions VPC, EKS, RDS, ElastiCache, S3, ALB, IRSA. 20 to 25 minutes. |
+| `make destroy` | `terraform destroy`. Tears down all infrastructure. Run `make uninstall` (or `make destroy-app` for a Terraform-managed deploy) first. |
+
+### Helm deploy
+
+| Command | Description |
+|---|---|
+| `make init-values` | Generate `helm/values/langsmith-values-overrides.yaml` from Terraform outputs. Copy add-on values files based on `enable_*` flags. |
+| `make deploy` | Deploy or upgrade LangSmith via Helm. Runs preflight, ESO sync, layered values build, and core readiness checks. |
+| `make apply-eso` | Re-apply ESO `ClusterSecretStore` and `ExternalSecret` only. Use after rotating secrets without a full Helm redeploy. |
+| `make uninstall` | Uninstall the LangSmith Helm release. Terraform infrastructure stays intact. |
+
+### Terraform-managed Helm
+
+| Command | Description |
+|---|---|
+| `make init-app` | Pull live infra Terraform outputs into `app/infra.auto.tfvars.json`. |
+| `make plan-app` | `terraform plan` for the `app/` module. Auto-runs `init-app` first. |
+| `make apply-app` | Deploy LangSmith Helm release via Terraform (`app/` module). |
+| `make destroy-app` | Destroy the Helm release via Terraform. Infrastructure stays intact. |
+
+### Fast path
+
+| Command | Description |
+|---|---|
+| `make quickdeploy` | Full deploy in one command. Chains `terraform apply` → `kubeconfig` → `init-values` → `helm deploy` with gates. |
+| `make quickdeploy-auto` | Same as `quickdeploy` but non-interactive. Passes `-auto-approve` to terraform. |
+| `make deploy-all` | `make apply` → `make kubeconfig` → `make init-values` → `make deploy` in sequence. |
+| `make deploy-all-tf` | `make apply` → `make init-values` → Terraform `app/` plan and apply in sequence. |
+
+### Utilities
+
+| Command | Description |
+|---|---|
+| `make status` | Check deployment state across all layers, print what to run next. |
+| `make status-quick` | Same as `status` but skips SSM and Kubernetes queries (faster). |
+| `make kubeconfig` | Update `~/.kube/config` with EKS cluster credentials (`aws eks update-kubeconfig`). |
+| `make tls` | BYO ACM cert + Route 53 A alias. Use when `langsmith_domain` is set and you need DNS wiring. |
+| `make clean` | Remove all local generated and sensitive files. Run after `make destroy`. |
+
+### Testing
+
+| Command | Description |
+|---|---|
+| `make test-e2e` | End-to-end gateway tests (ALB or Envoy Gateway) against the current cluster. |
+| `make test-permutations` | Permutation tests sequentially on the current cluster. Use `ARGS="1 2 5"` for a subset. |
+| `make test-parallel` | Permutation tests in parallel across isolated clusters. Your cluster is untouched. |
+
+## kubectl
+
+```bash
+# Pod health
+kubectl get pods -n langsmith
+kubectl get pods -n langsmith -w
+kubectl describe pod <pod-name> -n langsmith
+kubectl logs <pod-name> -n langsmith --tail=100 -f
+kubectl logs <pod-name> -n langsmith --previous --tail=50
+
+# ALB and ingress
+kubectl get ingress -n langsmith
+kubectl describe ingress -n langsmith
+
+# External Secrets Operator sync status
+kubectl get externalsecret langsmith-config -n langsmith
+
+# TLS
+kubectl get certificate -n langsmith
+kubectl get challenges -n langsmith
+kubectl describe certificate <cert-name> -n langsmith
+
+# Helm
+helm status langsmith -n langsmith
+helm history langsmith -n langsmith
+helm get values langsmith -n langsmith
+
+# IRSA — check per-component service account annotations
+kubectl get sa -n langsmith -o yaml | grep eks.amazonaws.com
+
+# LangSmith Deployment (LangGraph Platform)
+kubectl get lgp -n langsmith
+kubectl get crd | grep langchain
+kubectl get pods -n keda
+```
+
+## AWS CLI
+
+```bash
+# EKS
+aws eks list-clusters --region <region>
+aws eks describe-cluster --name <cluster-name> --region <region>
+aws eks update-kubeconfig --region <region> --name <cluster-name>
+
+# RDS
+aws rds describe-db-instances \
+  --query "DBInstances[?contains(DBInstanceIdentifier,'langsmith')]"
+
+# ElastiCache
+aws elasticache describe-cache-clusters \
+  --query "CacheClusters[?contains(CacheClusterId,'langsmith')]"
+
+# S3
+aws s3 ls s3://<bucket-name>
+aws s3api get-bucket-location --bucket <bucket-name>
+
+# ALB
+aws elbv2 describe-load-balancers \
+  --query "LoadBalancers[?contains(LoadBalancerName,'langsmith')]"
+
+# VPC endpoint
+aws ec2 describe-vpc-endpoints \
+  --filters "Name=service-name,Values=com.amazonaws.<region>.s3" \
+  --query "VpcEndpoints[].State"
+
+# SSM secrets
+aws ssm get-parameters-by-path --path "/langsmith/<name_prefix>-<environment>/" --with-decryption
+
+# IAM role
+aws iam get-role --role-name <irsa-role-name>
+```
+
+## Terraform
+
+```bash
+cd terraform/modules/aws/infra
+
+terraform init
+terraform plan
+terraform apply
+terraform apply -target=module.eks
+terraform output
+terraform output -raw cluster_name
+terraform output -raw alb_dns_name
+terraform output -raw langsmith_irsa_role_arn
+terraform output -raw bucket_name
+terraform state list
+```
+
+## Teardown
+
+```bash
+cd terraform/modules/aws
+
+# Uninstall LangSmith first. Run only the option that matches how you deployed.
+# Option A: script-driven deploy
+make uninstall
+
+# Option B: Terraform-managed deploy
+make destroy-app
+
+# Then destroy infrastructure:
+# 1. Set postgres_deletion_protection = false in infra/terraform.tfvars
+# 2. Apply the change, then destroy
+cd infra
+terraform apply
+terraform destroy
+```
diff --git a/src/langsmith/self-host-terraform-aws-troubleshooting.mdx b/src/langsmith/self-host-terraform-aws-troubleshooting.mdx
new file mode 100644
index 0000000000..716907d3ae
--- /dev/null
+++ b/src/langsmith/self-host-terraform-aws-troubleshooting.mdx
@@ -0,0 +1,435 @@
+---
+title: AWS Terraform troubleshooting
+sidebarTitle: Troubleshooting
+description: Common issues, fixes, and diagnostic commands for LangSmith self-hosted on AWS EKS deployed with the LangChain Terraform modules.
+---
+
+This page documents common issues, fixes, and diagnostic commands for LangSmith deployments provisioned with the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws).
+
+<Tip>
+Before upgrading, review the [LangSmith self-hosted changelog](/langsmith/self-hosted-changelog) for breaking changes and required variable updates. Run `aws eks update-kubeconfig --region <region> --name <cluster-name>` before running any `kubectl` commands.
+</Tip>
+
+## Automated diagnostics
+
+Before running individual commands, try the bundled scripts. Run them from the AWS module root (`modules/aws` in the Terraform repository):
+
+```bash
+# Deployment status across all layers + next-step guidance
+make status
+
+# SSM parameter validation
+./infra/scripts/manage-ssm.sh validate
+```
+
+## Known issues
+
+### EKS node group creation fails: CREATE_FAILED
+
+**Symptom**
+
+```
+Error: waiting for EKS Node Group creation: unexpected state 'CREATE_FAILED'
+```
+
+**Cause:** The EKS control plane is not yet fully active when node group creation begins. Common after an interrupted apply.
+
+**Fix**
+
+```bash
+aws eks wait cluster-active --name <cluster-name> --region <region>
+
+aws eks describe-nodegroup \
+  --cluster-name <cluster-name> \
+  --nodegroup-name <nodegroup-name> \
+  --region <region> \
+  --query "nodegroup.health"
+
+terraform apply -var-file=terraform.tfvars
+```
+
+### kubectl fails: "You must be logged in to the server"
+
+**Symptom:** All `kubectl` commands fail with `error: You must be logged in to the server (Unauthorized)`.
+
+**Cause:** The kubeconfig is stale, the AWS credentials differ from those that created the cluster, or the token has expired.
+
+**Fix**
+
+```bash
+aws eks update-kubeconfig --region <region> --name <cluster-name>
+kubectl cluster-info
+
+aws sts get-caller-identity
+```
+
+If the cluster was created with a different IAM role, grant access via the `aws-auth` ConfigMap:
+
+```bash
+kubectl edit configmap aws-auth -n kube-system
+# Add your IAM user or role under mapUsers / mapRoles
+```
+
+### ALB not created after Helm install
+
+**Symptom:** `kubectl get ingress -n langsmith` shows no ADDRESS after several minutes.
+
+**Cause:** AWS Load Balancer Controller is not running or lacks IRSA permissions, the Terraform-provisioned ALB is not referenced correctly, or `alb_scheme = "internal"` is set (internal ALBs have no public address — see [ALB has no public address](#alb-has-no-public-address-internal-scheme)).
+
+**Fix**
+
+```bash
+kubectl get pods -n kube-system | grep aws-load-balancer
+kubectl logs -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller --tail=50
+kubectl get sa -n kube-system aws-load-balancer-controller -o yaml | grep eks.amazonaws.com
+
+terraform output alb_dns_name
+aws elbv2 describe-load-balancers --query "LoadBalancers[?DNSName=='<alb-dns-name>'].State"
+```
+
+### RDS connection refused from EKS pods
+
+**Symptom:** Backend logs show `connection refused` or `timeout` for the RDS endpoint.
+
+**Cause:** The RDS security group does not allow inbound TCP 5432 from the EKS node or cluster security group.
+
+**Fix**
+
+```bash
+aws eks describe-cluster --name <cluster-name> \
+  --query "cluster.resourcesVpcConfig.clusterSecurityGroupId"
+
+aws rds describe-db-instances \
+  --db-instance-identifier <db-id> \
+  --query "DBInstances[0].VpcSecurityGroups"
+
+aws ec2 describe-security-group-rules \
+  --filters "Name=group-id,Values=<rds-sg-id>"
+```
+
+The `postgres` module sets up the security group automatically. If the rule is missing, re-apply:
+
+```bash
+terraform apply -var-file=terraform.tfvars -target=module.postgres
+```
+
+### S3 access denied from pods (IRSA not configured)
+
+**Symptom:** Backend logs show `AccessDenied` when reading or writing S3.
+
+**Cause:** IRSA annotation missing from the LangSmith service account, or the S3 VPC Gateway Endpoint is not routing correctly.
+
+**Fix**
+
+```bash
+kubectl get sa langsmith -n langsmith -o yaml | grep eks.amazonaws.com
+
+aws ec2 describe-vpc-endpoints \
+  --filters "Name=service-name,Values=com.amazonaws.<region>.s3" \
+  --query "VpcEndpoints[].State"
+
+kubectl run s3-test --rm -it --image=amazon/aws-cli -n langsmith -- \
+  s3 ls s3://<bucket-name>
+```
+
+If the IRSA annotation is missing, verify `create_langsmith_irsa_role = true` in `terraform.tfvars` and that the service account name in the Helm values matches `langsmith`.
+
+### ElastiCache Redis connection timeout
+
+**Symptom:** Pods cannot connect to Redis. Logs show `dial tcp: i/o timeout`.
+
+**Cause:** ElastiCache security group does not allow inbound TCP 6379 from the EKS node security group.
+
+**Fix**
+
+```bash
+aws elasticache describe-cache-clusters \
+  --cache-cluster-id <cluster-id> \
+  --query "CacheClusters[0].SecurityGroups"
+
+kubectl run redis-test --rm -it --image=redis:7 -n langsmith -- \
+  redis-cli --tls -h <elasticache-endpoint> -a <auth-token> ping
+```
+
+### EKS nodes not autoscaling
+
+**Symptom:** Pods remain `Pending`. Node count does not increase.
+
+**Cause:** Cluster Autoscaler lacks IAM permissions, targets the wrong ASG, or `min_size = max_size` on the node group.
+
+**Fix**
+
+```bash
+kubectl logs -n kube-system -l app=cluster-autoscaler --tail=50
+
+aws autoscaling describe-auto-scaling-groups \
+  --query "AutoScalingGroups[?contains(Tags[].Key, 'k8s.io/cluster-autoscaler/<cluster-name>')].[AutoScalingGroupName]" \
+  --output table
+```
+
+### cert-manager fails to issue Let's Encrypt certificate
+
+**Symptom:** `kubectl get certificate -n langsmith` shows `READY=False`. HTTP01 challenge is failing.
+
+**Cause:** The ALB is not forwarding port 80 to the cert-manager solver pod, or the DNS record for the domain does not point to the ALB.
+
+**Fix**
+
+```bash
+kubectl describe certificate <cert-name> -n langsmith
+kubectl get challenges -n langsmith
+
+aws elbv2 describe-listeners --load-balancer-arn <alb-arn>
+
+dig +short <your-langsmith-domain>
+# Expected: CNAME to the ALB DNS name
+```
+
+### postgres_deletion_protection blocks terraform destroy
+
+**Symptom**
+
+```
+Error: deleting RDS DB Instance: InvalidParameterCombination:
+Cannot delete, DeletionProtection is enabled.
+```
+
+**Fix:** Disable deletion protection in `terraform.tfvars`, apply, then destroy:
+
+```hcl
+postgres_deletion_protection = false
+```
+
+```bash
+terraform apply -var-file=terraform.tfvars
+terraform destroy
+```
+
+### ESO fails to sync: langsmith-config secret missing
+
+**Symptom:** Pods stuck in `CreateContainerConfigError`. `kubectl get secret langsmith-config -n langsmith` returns `NotFound`.
+
+**Cause:** ESO sync is all-or-nothing. If any single SSM parameter referenced by the `ExternalSecret` is missing, ESO refuses to create the Kubernetes Secret. All pods fail, not just the feature that needs the missing parameter.
+
+**Fix**
+
+```bash
+kubectl get externalsecret langsmith-config -n langsmith
+kubectl describe externalsecret langsmith-config -n langsmith
+
+./infra/scripts/manage-ssm.sh validate
+
+source ./infra/scripts/setup-env.sh
+./helm/scripts/apply-eso.sh
+```
+
+The `describe` output shows which `remoteRef.key` failed. Match it against the SSM prefix `/langsmith/{name_prefix}-{environment}/`.
+
+### SSM parameter prefix mismatch
+
+**Symptom:** `manage-ssm.sh validate` passes but ESO still cannot sync, or `setup-env.sh` wrote parameters under a different prefix than ESO expects.
+
+**Cause:** The SSM prefix is derived from `name_prefix` and `environment` in `terraform.tfvars`. If these changed after initial setup, the old parameters live under the old prefix and ESO looks under the new one.
+
+**Fix**
+
+```bash
+kubectl get externalsecret langsmith-config -n langsmith -o yaml | grep 'key:'
+
+./infra/scripts/manage-ssm.sh list
+
+./infra/scripts/migrate-ssm.sh
+```
+
+<Warning>
+Never change `name_prefix` or `environment` on an existing deployment.
+</Warning>
+
+### Postgres password rejected by Terraform validation
+
+**Symptom**
+
+```
+Error: Invalid value for variable "postgres_password"
+RDS master password must not contain '/', '@', '"', single quotes, or spaces.
+```
+
+**Cause:** The password contains characters RDS does not allow in the master password.
+
+**Fix:** Re-generate without restricted characters. `setup-env.sh` produces a compliant password automatically; to update manually:
+
+```bash
+./infra/scripts/manage-ssm.sh set postgres-password "$(openssl rand -base64 24 | tr -d '/+= ')"
+source ./infra/scripts/setup-env.sh
+terraform apply -var-file=terraform.tfvars
+```
+
+### Private EKS cluster unreachable (bastion required)
+
+**Symptom:** `kubectl` and `terraform apply` time out when `enable_public_eks_cluster = false`.
+
+**Cause:** The EKS API endpoint is private. Commands must run from within the VPC, either via the bastion host or a VPN connection.
+
+**Fix**
+
+```bash
+# If the bastion was provisioned (create_bastion = true)
+aws ssm start-session --target <bastion-instance-id>
+
+# From the bastion
+aws eks update-kubeconfig --region <region> --name <cluster-name>
+kubectl get nodes
+```
+
+If no bastion was provisioned, set `create_bastion = true` and re-apply, or temporarily set `enable_public_eks_cluster = true`.
+
+### ALB has no public address (internal scheme)
+
+**Symptom:** `kubectl get ingress -n langsmith` shows an ADDRESS, but it is reachable only from within the VPC.
+
+**Cause:** `alb_scheme = "internal"` was set in `terraform.tfvars`. Internal ALBs are only reachable from within the VPC (VPN, peering, or PrivateLink).
+
+**Fix:** Intentional for private deployments. To make the ALB publicly reachable:
+
+```hcl
+alb_scheme = "internet-facing"
+```
+
+```bash
+terraform apply -var-file=terraform.tfvars
+# Then redeploy Helm to pick up the new ALB
+```
+
+### ALB hostname changed after ingress recreation
+
+**Symptom:** The LangSmith URL stops working. Agent deployments stuck in `DEPLOYING`. DNS records or bookmarks point to an old ALB hostname that no longer resolves.
+
+**Cause:** Deleting the Kubernetes ingress (via `helm uninstall`, `kubectl delete ingress`, or namespace deletion) deprovisions the ALB. When the ingress is recreated, a new ALB with a different hostname is issued. The `config.deployment.url` in Helm values still points to the old hostname, so the operator's health checks fail and deployments stay stuck.
+
+This also happens if the ALB controller creates a new ALB instead of reusing the Terraform pre-provisioned one. The `group.name` annotation is required alongside `load-balancer-arn` to prevent this.
+
+**Prevention**
+
+- Ensure `group.name` and `load-balancer-arn` annotations are both set. `init-values.sh` does this automatically when a pre-provisioned ALB exists.
+- Do not delete the ingress unless you plan to update all hostname-dependent config.
+- Always pass `--server-side=false` to `helm rollback`. Without it, a server-side apply (SSA) conflict on the ingress can trigger a delete/recreate cycle.
+
+**Fix**
+
+```bash
+# 1. Check what hostname the ingress currently has
+kubectl get ingress langsmith-ingress -n langsmith \
+  -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
+
+# 2. Check what Terraform expects
+terraform output alb_dns_name
+
+# 3. If they differ, re-run init-values.sh and redeploy
+make init-values
+make deploy
+```
+
+### Node group scaling changes not applied by Terraform
+
+**Symptom:** Changing `min_size` or `max_size` in `terraform.tfvars` shows "No changes" on `terraform plan`.
+
+**Cause:** The ASG was changed out-of-band (AWS CLI, console, or cluster autoscaler) and the Terraform state already reflects the new values. The community EKS module ignores `desired_size` changes so the autoscaler can manage it; `min_size` and `max_size` should propagate normally.
+
+**Fix**
+
+```bash
+terraform apply -refresh-only
+terraform plan
+
+# For an immediate change, use the AWS CLI directly
+aws eks update-nodegroup-config \
+  --cluster-name <cluster> \
+  --nodegroup-name <nodegroup> \
+  --scaling-config minSize=3,maxSize=8,desiredSize=5 \
+  --region <region>
+```
+
+## Diagnostic commands
+
+### Cluster access
+
+```bash
+aws eks update-kubeconfig --region <region> --name <cluster-name>
+kubectl config current-context
+kubectl get nodes -o wide
+aws sts get-caller-identity
+```
+
+### Pods
+
+```bash
+kubectl get pods -n langsmith
+kubectl get pods -n langsmith -w
+kubectl describe pod <pod-name> -n langsmith
+kubectl logs <pod-name> -n langsmith --tail=50
+kubectl logs <pod-name> -n langsmith --previous --tail=50
+kubectl logs -n langsmith deploy/langsmith-backend --tail=100 -f
+```
+
+### ALB and ingress
+
+```bash
+kubectl get ingress -n langsmith
+kubectl describe ingress -n langsmith
+aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(LoadBalancerName, 'langsmith')]"
+```
+
+### TLS and certificates
+
+```bash
+kubectl get certificate -n langsmith
+kubectl describe certificate <cert-name> -n langsmith
+kubectl get challenges -n langsmith
+kubectl get clusterissuer
+```
+
+### ESO and secrets
+
+```bash
+kubectl get externalsecret -n langsmith
+kubectl describe externalsecret langsmith-config -n langsmith
+kubectl get clustersecretstore langsmith-ssm
+kubectl get secret langsmith-config -n langsmith -o jsonpath='{.data}' | jq 'keys'
+./infra/scripts/manage-ssm.sh validate
+./infra/scripts/manage-ssm.sh diff
+```
+
+### Helm
+
+```bash
+helm status langsmith -n langsmith
+helm history langsmith -n langsmith
+helm get values langsmith -n langsmith
+```
+
+### IRSA and IAM
+
+```bash
+kubectl get sa langsmith -n langsmith -o yaml | grep eks.amazonaws.com
+terraform output langsmith_irsa_role_arn
+aws iam get-role --role-name <irsa-role-name>
+```
+
+### LangSmith Deployment
+
+```bash
+kubectl get pods -n langsmith | grep -E "host-backend|listener|operator"
+kubectl get lgp -n langsmith
+kubectl get crd | grep langchain
+kubectl get pods -n keda
+```
+
+### Quick health check
+
+```bash
+echo "=== Context ===" && kubectl config current-context
+echo "=== Nodes ===" && kubectl get nodes
+echo "=== Pods ===" && kubectl get pods -n langsmith
+echo "=== Ingress ===" && kubectl get ingress -n langsmith
+echo "=== Helm ===" && helm status langsmith -n langsmith 2>/dev/null | grep -E "STATUS|LAST DEPLOYED"
+```
diff --git a/src/langsmith/self-host-terraform-aws-variables.mdx b/src/langsmith/self-host-terraform-aws-variables.mdx
new file mode 100644
index 0000000000..a094ad8250
--- /dev/null
+++ b/src/langsmith/self-host-terraform-aws-variables.mdx
@@ -0,0 +1,145 @@
+---
+title: AWS Terraform variables reference
+sidebarTitle: Variables
+description: Complete reference of Terraform variables for LangSmith self-hosted on AWS EKS.
+---
+
+Reference for every input variable exposed by the [AWS Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/aws). Set non-sensitive variables in `infra/terraform.tfvars`. For sensitive variables (license key, passwords, encryption keys), `make setup-env` writes them to AWS SSM Parameter Store and External Secrets Operator (ESO) syncs them into the cluster.
+
+## Core
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `name_prefix` | — | yes | Prefix for all resource names (1 to 11 chars, lowercase). |
+| `environment` | `dev` | no | Environment tag: `dev`, `staging`, `prod`, `test`, `uat`. |
+| `region` | `us-west-2` | no | AWS region for all resources. |
+| `owner` | `""` | no | Owner tag applied to all resources. |
+| `cost_center` | `""` | no | Cost center tag for billing. |
+| `tags` | `{}` | no | Additional tags applied to all resources. |
+
+## Networking
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `create_vpc` | `true` | no | Create a new VPC. Set `false` to use an existing one. |
+| `vpc_id` | `null` | when `!create_vpc` | Existing VPC ID. |
+| `private_subnets` | `[]` | when `!create_vpc` | Existing private subnet IDs. |
+| `public_subnets` | `[]` | when `!create_vpc` | Existing public subnet IDs. |
+| `vpc_cidr_block` | `null` | when `!create_vpc` | Existing VPC CIDR block. |
+
+## EKS
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `enable_public_eks_cluster` | `true` | no | Enable the public EKS API endpoint. Set `false` for a private cluster (requires `create_bastion`). |
+| `eks_public_access_cidrs` | `["0.0.0.0/0"]` | no | CIDRs allowed to reach the public EKS API endpoint. |
+| `eks_cluster_version` | `1.31` | no | EKS Kubernetes version. |
+| `eks_managed_node_group_defaults` | `{ami_type: AL2023}` | no | Default config for managed node groups. |
+| `eks_managed_node_groups` | `{default: m5.4xlarge}` | no | Managed node group definitions. |
+| `create_gp3_storage_class` | `true` | no | Create and set `gp3` as the default `StorageClass`. |
+| `eks_cluster_enabled_log_types` | `["api", "audit", ...]` | no | EKS control plane log types sent to CloudWatch. |
+| `eks_addons` | `{}` | no | EKS managed add-on configurations. |
+| `create_langsmith_irsa_role` | `true` | no | Create the IRSA role for LangSmith pods (S3 access). |
+
+## PostgreSQL (RDS)
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `postgres_source` | `external` | no | `external` (RDS) or `in-cluster` (Helm). |
+| `postgres_instance_type` | `db.t3.large` | no | RDS instance class. |
+| `postgres_storage_gb` | `10` | no | Initial RDS storage in GB. |
+| `postgres_max_storage_gb` | `100` | no | Maximum RDS storage in GB (autoscaling). |
+| `postgres_username` | `langsmith` | no | RDS database username. |
+| `postgres_engine_version` | `16` | no | PostgreSQL engine version for RDS. |
+| `postgres_password` | `""` | when external | RDS password. Use `TF_VAR_postgres_password`. |
+| `postgres_iam_database_authentication_enabled` | `true` | no | Enable IAM database authentication on RDS. |
+| `postgres_deletion_protection` | `true` | no | Enable deletion protection on RDS. |
+| `postgres_backup_retention_period` | `7` | no | Days to retain automated RDS backups (0 = disabled). |
+
+## Redis (ElastiCache)
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `redis_source` | `external` | no | `external` (ElastiCache) or `in-cluster` (Helm). |
+| `redis_instance_type` | `cache.m6g.xlarge` | no | ElastiCache node type. |
+| `redis_auth_token` | `""` | when external | ElastiCache auth token (min 16 chars). Use `TF_VAR_redis_auth_token`. |
+
+## S3
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `s3_ttl_enabled` | `true` | no | Enable S3 lifecycle rules for trace TTL. |
+| `s3_ttl_short_days` | `14` | no | TTL for `ttl_s/` prefix in days. |
+| `s3_ttl_long_days` | `400` | no | TTL for `ttl_l/` prefix in days. |
+| `s3_kms_key_arn` | `""` | no | KMS CMK ARN for S3 encryption (empty = SSE-S3). |
+| `s3_versioning_enabled` | `false` | no | Enable S3 bucket versioning. |
+
+## TLS and DNS
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `tls_certificate_source` | `acm` | no | `acm`, `letsencrypt`, or `none`. |
+| `acm_certificate_arn` | `""` | when `acm` | ACM certificate ARN. |
+| `letsencrypt_email` | `""` | when `letsencrypt` | Email for Let's Encrypt notifications. |
+| `langsmith_domain` | `""` | no | Custom hostname (empty = use ALB DNS name). |
+| `langsmith_namespace` | `langsmith` | no | Kubernetes namespace for LangSmith. |
+
+## ClickHouse and ingress
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `clickhouse_source` | `in-cluster` | no | `in-cluster` or `external`. |
+| `alb_scheme` | `internet-facing` | no | ALB scheme: `internet-facing` or `internal`. |
+| `alb_access_logs_enabled` | `false` | no | Enable ALB access logging to S3. |
+| `enable_envoy_gateway` | `false` | no | Install Envoy Gateway instead of ALB. Required for multi-namespace dataplane deployments. |
+
+## Bastion (private cluster)
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `create_bastion` | `false` | no | Create an EC2 bastion host for private cluster access (SSM or SSH). |
+| `bastion_instance_type` | `t3.micro` | no | EC2 instance type for the bastion. |
+| `bastion_key_name` | `null` | no | EC2 key pair for SSH (empty = SSM only). |
+| `bastion_enable_ssh` | `false` | no | Open port 22 on the bastion security group. |
+| `bastion_ssh_allowed_cidrs` | `[]` | no | CIDRs allowed to SSH to the bastion. |
+| `bastion_root_volume_size_gb` | `20` | no | Root EBS volume size for the bastion. |
+
+## Security and audit
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `create_cloudtrail` | `false` | no | Create a CloudTrail trail for AWS API audit. |
+| `cloudtrail_multi_region` | `true` | no | Record API calls across all regions. |
+| `cloudtrail_log_retention_days` | `365` | no | Days to retain CloudTrail logs. |
+| `create_waf` | `false` | no | Attach a WAFv2 Web ACL to the ALB. |
+| `create_firewall` | `false` | no | Deploy AWS Network Firewall for FQDN-based egress filtering. Requires `create_vpc = true`. Cost: about `$0.395/hr/endpoint` plus `$0.065/GB`. |
+| `firewall_allowed_fqdns` | `["beacon.langchain.com"]` | no | Domains allowed for outbound internet traffic when `create_firewall = true`. Matched against TLS SNI (HTTPS) and HTTP Host header. All other destinations are dropped. |
+| `firewall_subnet_cidr` | `"10.0.64.0/21"` | no | CIDR for the firewall subnet. Must not overlap with private (`10.0.0.0/21` to `10.0.32.0/21`) or public (`10.0.40.0/21` to `10.0.56.0/21`) subnets. |
+
+## Sizing and feature flags
+
+`sizing_profile` and the `enable_*` flags listed in this section are read by `init-values.sh` and `deploy.sh`; Terraform ignores them. They affect which Helm overlay files the scripts generate.
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `sizing_profile` | `default` | no | Helm sizing: `production`, `production-large`, `dev`, `minimum`, `default`. |
+| `enable_deployments` | `false` | no | Enable LangSmith Deployment (listener, operator, host-backend). |
+| `enable_agent_builder` | `false` | no | Enable Agent Builder. Requires `enable_deployments = true`. |
+| `enable_insights` | `false` | no | Enable ClickHouse-backed analytics. |
+| `enable_polly` | `false` | no | Enable Polly AI eval and monitoring. Requires `enable_deployments = true`. |
+| `enable_usage_telemetry` | `false` | no | Enable extended usage telemetry reporting. |
+
+## Sensitive variables (set with `setup-env.sh`)
+
+`make setup-env` writes these to AWS SSM Parameter Store. External Secrets Operator syncs them into the cluster as Kubernetes secrets. Never set these inline in `terraform.tfvars`.
+
+| Variable | Description |
+|---|---|
+| `langsmith_license_key` | LangSmith enterprise license key. |
+| `langsmith_admin_password` | Initial org admin password. |
+| `langsmith_api_key_salt` | Salt for hashing API keys. Must stay stable after first deploy. |
+| `langsmith_jwt_secret` | JWT secret for Basic Auth sessions. |
+| `langsmith_deployments_encryption_key` | Fernet key for LangSmith Deployment. Must never change. |
+| `langsmith_agent_builder_encryption_key` | Fernet key for Agent Builder. Must never change. |
+| `langsmith_insights_encryption_key` | Fernet key for Insights. Must never change. |
+| `langsmith_polly_encryption_key` | Fernet key for Polly. Must never change. |
diff --git a/src/langsmith/self-host-terraform-azure-architecture.mdx b/src/langsmith/self-host-terraform-azure-architecture.mdx
new file mode 100644
index 0000000000..cf56401c8a
--- /dev/null
+++ b/src/langsmith/self-host-terraform-azure-architecture.mdx
@@ -0,0 +1,311 @@
+---
+title: Azure Terraform architecture
+sidebarTitle: Architecture
+description: Platform layers, services, Workload Identity, networking, ingress options, and module dependencies for LangSmith self-hosted on AKS.
+---
+
+This page documents what the [Azure Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/azure) provision and how the modules wire the resulting deployment together.
+
+## Platform layers
+
+LangSmith on Azure deploys in stages. Each stage adds a capability layer on top of the previous one. All layers share the same AKS cluster and `langsmith` namespace.
+
+<img src="/images/self-hosted-terraform/azure-architecture.png" alt="LangSmith on Azure service layout" className="block dark:hidden" />
+<img src="/images/self-hosted-terraform/azure-architecture-light.png" alt="LangSmith on Azure service layout" className="hidden dark:block" />
+
+| Stage | Layer | What it adds |
+|---|---|---|
+| Infrastructure | Azure infrastructure | VNet, AKS, Postgres, Redis, Blob, Key Vault, cert-manager, KEDA, ingress controller |
+| Application | LangSmith base | frontend, backend, platform-backend, queue, ingest-queue, ace-backend, clickhouse, playground |
+| LangSmith Deployment add-on | LangSmith Deployment | host-backend, listener, operator + per-deployment pods |
+| Agent Builder add-on | Agent Builder | agent-builder-tool-server, agent-builder-trigger-server + deep-agent LGP |
+| Insights + Polly add-on | Insights + Polly | Clio analytics (ClickHouse-backed), Polly eval agent (operator-managed, dynamic) |
+
+## Application deployment paths
+
+| Path | How | When to use |
+|---|---|---|
+| Helm path | `make init-values && make deploy` | Default. Shell script, interactive, reads TF outputs dynamically. Best for first deploys and day-2 re-deploys. |
+| Terraform path | `make init-app && make apply-app` | Declarative. Kubernetes Secrets + `langsmith-ksa` SA + Helm release in Terraform state. Best for GitOps and CI/CD pipelines. |
+
+The Terraform path uses the `app/` module. `make init-app` calls `app/scripts/pull-infra-outputs.sh` to read all infra outputs and write them into `app/infra.auto.tfvars.json`.
+
+## Deployment tiers
+
+### Light deploy (all in-cluster)
+
+```txt
+AKS Cluster
+├── langsmith namespace
+│   ├── frontend, backend, platform-backend, playground, queue, ace-backend
+│   ├── clickhouse (in-cluster pod)
+│   ├── postgres   (in-cluster pod)
+│   └── redis      (in-cluster pod)
+├── ingress-nginx (Azure Load Balancer → NGINX)
+└── cert-manager  (Let's Encrypt TLS)
+
+Azure
+├── Azure Blob Storage  (trace payloads — always external)
+└── Azure Key Vault     (secrets)
+```
+
+Set in `terraform.tfvars`:
+
+```hcl
+postgres_source   = "in-cluster"
+redis_source      = "in-cluster"
+clickhouse_source = "in-cluster"
+```
+
+For the full all-in-cluster walkthrough (Front Door TLS, all-in-cluster DBs), see `BUILDING_LIGHT_LANGSMITH.md` in the [Azure module repo](https://github.com/langchain-ai/terraform/blob/main/modules/azure/BUILDING_LIGHT_LANGSMITH.md).
+
+### Production (external managed services)
+
+```txt
+AKS Cluster
+├── langsmith namespace
+│   ├── frontend, backend, platform-backend, playground, queue, ingest-queue, ace-backend
+│   └── clickhouse (in-cluster — use LangChain Managed for production scale)
+└── ingress-nginx + cert-manager
+
+Azure Managed Services
+├── Azure DB for PostgreSQL Flexible Server (private VNet)
+├── Azure Cache for Redis Premium (private VNet)
+├── Azure Blob Storage (Workload Identity — no static keys)
+└── Azure Key Vault
+```
+
+## Networking
+
+### Light deploy
+
+```txt
+langsmith-vnet<identifier>
+└── subnet-0    (AKS nodes only)
+    No Postgres/Redis subnets — chart-managed pods handle both
+```
+
+### Production
+
+```txt
+langsmith-vnet<identifier>
+├── subnet-0              (AKS nodes)
+├── subnet-postgres       (Azure DB for PostgreSQL Flexible Server)
+└── subnet-redis          (Azure Cache for Redis Premium)
+```
+
+All subnets are private. Postgres and Redis are accessible only from within the VNet via private DNS resolution. No public endpoints.
+
+## Application core services
+
+| Service | Purpose | Port | HPA | Workload Identity |
+|---|---|---|---|---|
+| `langsmith-frontend` | React UI | 3000 | 1 to 10 | No |
+| `langsmith-backend` | Main API (traces, runs, projects, API keys, feedback) | 1984 | 3 to 10 | Yes (Blob) |
+| `langsmith-platform-backend` | Org and user management, auth, billing, settings | 1986 | 1 to 10 | Yes (Blob) |
+| `langsmith-playground` | LLM prompt playground UI | 3001 | 1 to 10 | No |
+| `langsmith-queue` | Trace ingestion worker (Redis → ClickHouse + Blob) | — | 3 to 10 + KEDA | Yes |
+| `langsmith-ingest-queue` | Dedicated high-throughput ingestion worker | — | 3 to 10 + KEDA | Yes |
+| `langsmith-ace-backend` | Async compute (dataset runs, evaluations, background jobs) | — | 1 to 5 | No |
+| `langsmith-clickhouse` | Columnar store (trace spans, run metadata, eval results) | — | StatefulSet, single replica, 500Gi PVC | No |
+
+<Warning>
+In-cluster ClickHouse is dev/POC only (single pod, no replication, no backups). For production use [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse) or a self-managed external cluster.
+</Warning>
+
+### One-time jobs
+
+| Job | Purpose |
+|---|---|
+| `langsmith-backend-migrations` | PostgreSQL schema migrations |
+| `langsmith-backend-ch-migrations` | ClickHouse schema migrations |
+| `langsmith-backend-auth-bootstrap` | Creates the initial org and admin account from `initial_org_admin_password` in `langsmith-config-secret` |
+
+## LangSmith Deployment add-on
+
+| Service | Purpose | Workload Identity |
+|---|---|---|
+| `langsmith-host-backend` | LangGraph control plane API. Manages deployment lifecycle, serves deployment metadata. | Yes |
+| `langsmith-listener` | Watches host-backend for state changes, creates and updates `LangGraphPlatform` CRDs. | Yes |
+| `langsmith-operator` | Kubernetes operator. Azure-specific: injects `azure.workload.identity/use: "true"` + `langsmith-ksa` so every agent pod accesses Blob Storage via Workload Identity. | No |
+
+## Agent Builder add-on
+
+| Pod | Type | Role | Workload Identity |
+|---|---|---|---|
+| `langsmith-agent-builder-tool-server` | Static | MCP tool execution server | Yes |
+| `langsmith-agent-builder-trigger-server` | Static | Webhook receiver and scheduled trigger engine | Yes |
+| `langsmith-agent-bootstrap` | Job | Registers the bundled Agent Builder agent | — |
+| `agent-builder-<hash>` + queue + redis + `lg-<hash>-0` | Dynamic | Agent Builder deployment, operator-managed | Inherited |
+
+## Insights and Polly add-on
+
+**Insights/Clio:** No static pods. Deploys lazily as a dynamic LangGraph deployment via the operator on first UI invocation. Reads `insights_encryption_key` from `langsmith-config-secret`. Never rotate this key: rotating it permanently breaks existing Insights data.
+
+**Polly:** Runs as a dynamic LangGraph deployment. Resources: 2 CPU / 4 Gi requests and 4 CPU / 8 Gi limits; scales from 1 to 5 replicas. Reads `polly_encryption_key` from `langsmith-config-secret`. Same rotation warning as Insights.
+
+## Azure managed services
+
+When `postgres_source = "external"` and `redis_source = "external"` (the recommended production setting), Terraform provisions:
+
+### Azure DB for PostgreSQL Flexible Server
+
+- Holds orgs, users, projects, API keys, settings.
+- PostgreSQL ≥ 14 required (Azure Flexible Server defaults to 16).
+- Extensions enabled automatically by the `postgres` module: `btree_gin`, `btree_gist`, `pgcrypto`, `citext`, `ltree`, `pg_trgm`.
+- Private VNet only (`subnet-postgres`), SSL port 5432.
+- Secret: `langsmith-postgres-secret`, created by the `k8s-bootstrap` Terraform module.
+
+### Azure Cache for Redis Premium
+
+- Trace ingestion queue, pub/sub, short-lived cache.
+- Redis ≥ 5 required (Premium tier defaults to Redis 6).
+- Each LangSmith installation must use its own dedicated Redis. Shared instances cause deployment tasks to route incorrectly.
+- Private VNet only (`subnet-redis`), TLS port 6380.
+- Secret: `langsmith-redis-secret`, created by the `k8s-bootstrap` Terraform module.
+
+### Azure Blob Storage
+
+- Trace payloads: large inputs and outputs, attachments.
+- Workload Identity (no static keys) via the `k8s-app-identity` Managed Identity.
+- Always required. Disabling blob storage breaks the cluster on large payloads.
+- Prefixes: `ttl_s/` (14-day TTL), `ttl_l/` (400-day TTL).
+
+### Azure Key Vault
+
+- Centralized secret store for all LangSmith secrets.
+- Secret flow: `az keyvault secret show` → `kubectl create secret generic langsmith-config-secret`.
+
+## Workload Identity
+
+Azure AD token exchange happens via the AKS OIDC issuer. Pods access Blob Storage without static keys.
+
+```txt
+AKS OIDC issuer
+  → Federated credential on Azure Managed Identity (one per Kubernetes ServiceAccount)
+  → Kubernetes ServiceAccount annotated with azure.workload.identity/client-id
+  → Pod labeled with azure.workload.identity/use: "true"
+  → Azure AD issues a short-lived token — no storage keys in any Secret or env var
+```
+
+Workload Identity is centralized in `modules/k8s-cluster/` alongside the managed identity and OIDC issuer, which avoids circular dependencies and simplifies adding new ServiceAccounts.
+
+### Which pods need Workload Identity
+
+Every pod that reads blob storage env vars must have:
+
+1. A federated credential registered in Terraform (`modules/k8s-cluster/main.tf`).
+2. The `azure.workload.identity/use: "true"` label on the Deployment.
+3. The `azure.workload.identity/client-id` annotation on the ServiceAccount.
+
+| Pod | Stage | Needs WI |
+|---|---|---|
+| `langsmith-backend` | Application | Yes |
+| `langsmith-platform-backend` | Application | Yes |
+| `langsmith-queue` | Application | Yes |
+| `langsmith-ingest-queue` | Application | Yes |
+| `langsmith-host-backend` | LangSmith Deployment add-on | Yes |
+| `langsmith-listener` | LangSmith Deployment add-on | Yes |
+| `langsmith-agent-builder-tool-server` | Agent Builder add-on | Yes |
+| `langsmith-agent-builder-trigger-server` | Agent Builder add-on | Yes |
+| `langsmith-frontend` | Application | No |
+| `langsmith-playground` | Application | No |
+| `langsmith-ace-backend` | Application | No |
+| `langsmith-clickhouse` | Application | No |
+| `langsmith-operator` | LangSmith Deployment add-on | No |
+
+All federated credentials are registered in `modules/k8s-cluster/main.tf` under `service_accounts_for_workload_identity`. Adding a new pod that accesses blob storage requires adding its ServiceAccount name to that list and running `terraform apply -target=module.aks`.
+
+### What breaks without it
+
+```txt
+panic: blob-storage health-check failed: get container properties failed:
+DefaultAzureCredential: failed to acquire a token.
+WorkloadIdentityCredential authentication failed.
+  AADSTS700213: No matching federated identity record found for presented assertion subject
+```
+
+The pod panics on startup — the ServiceAccount has no registered federated credential so Azure AD rejects the token exchange.
+
+## Secret flow
+
+```txt
+Infrastructure stage
+
+  ./setup-env.sh   (read-only against Key Vault — never writes to KV directly)
+    First run:  prompts for postgres password, license key, admin password.
+                Generates api_key_salt, jwt_secret, Fernet keys locally.
+                Key Vault does not exist yet → writes to local dot-files + secrets.auto.tfvars.
+    Subsequent: Key Vault exists → reads all secrets from KV → writes to secrets.auto.tfvars.
+                No prompts, no generation, no KV writes.
+    Output:     secrets.auto.tfvars  (gitignored, chmod 600)
+                Terraform picks this up automatically — no shell session coupling.
+
+  terraform apply
+    Reads:  terraform.tfvars (non-sensitive config)
+            secrets.auto.tfvars (sensitive values — sole input for KV secret creation)
+    Creates: Azure Key Vault + all secrets as KV secrets (Terraform is the sole KV writer)
+
+Application stage
+
+  ./setup-env.sh   (re-run on any machine to refresh secrets.auto.tfvars from Key Vault)
+
+  kubectl create secret generic langsmith-config-secret
+    Reads:  Key Vault secrets + Terraform outputs (postgres/redis URLs, blob account)
+    Writes: K8s secrets — langsmith-config-secret, langsmith-postgres-secret,
+                          langsmith-redis-secret
+
+  helm upgrade --install langsmith ...
+    Chart reads config.existingSecretName = "langsmith-config-secret".
+    No secrets inline in any YAML file.
+```
+
+**Key rule:** `secrets.auto.tfvars` is never committed. It is regenerated from Key Vault on any machine by running `./setup-env.sh`. Terraform is the sole writer to Key Vault; `setup-env.sh` only reads from it after the first apply.
+
+## Ingress options
+
+| Controller | Variable | DNS label support | Notes |
+|---|---|---|---|
+| `nginx` _(default)_ | `ingress_controller = "nginx"` | Yes | NGINX via Helm, standard Kubernetes Ingress. |
+| `istio-addon` | `ingress_controller = "istio-addon"` | Yes | AKS managed Istio service mesh. Use `istio_addon_revision` to pin revision. |
+| `istio` | `ingress_controller = "istio"` | Yes | Self-managed Istio via Helm. Full control over revision and config. |
+| `agic` | `ingress_controller = "agic"` | Yes | Azure Application Gateway v2 + AGIC Helm chart. Native L7 WAF. HTTP-only or dns01 + custom domain. |
+| `envoy-gateway` | `ingress_controller = "envoy-gateway"` | Yes | Gateway API native. Uses `envoyproxy/gateway-helm`. |
+| `none` | `ingress_controller = "none"` | — | Bring your own ingress. |
+
+Azure Public IP DNS labels (`dns_label`) work with all controllers. `deploy.sh` applies the `service.beta.kubernetes.io/azure-dns-label-name` annotation to the correct LoadBalancer service based on the chosen controller.
+
+For the full TLS compatibility matrix and per-controller setup, see `INGRESS_CONTROLLERS.md` in the [Azure module repo](https://github.com/langchain-ai/terraform/blob/main/modules/azure/INGRESS_CONTROLLERS.md).
+
+## Resource sizing
+
+Four sizing profiles are available.
+
+| Profile | Use case | Set via |
+|---|---|---|
+| `minimum` | Cost parking, CI smoke tests, single-user demos | `sizing_profile = "minimum"` in `terraform.tfvars` |
+| `dev` | Developer use, integration tests, POCs | `sizing_profile = "dev"` |
+| `production` | Real traffic — multi-replica + HPA | `sizing_profile = "production"` _(recommended)_ |
+| `production-large` | ~50 users, ~1000 traces/sec | `sizing_profile = "production-large"` |
+
+### AKS node pools
+
+| Pool | VM Size | vCPU | RAM | Min | Max | Purpose |
+|---|---|---|---|---|---|---|
+| default | `Standard_D8s_v3` | 8 | 32 GB | 3 | 10 | Core LangSmith, system pods |
+| large | `Standard_D16s_v3` | 16 | 64 GB | 0 | 2 | ClickHouse (in-cluster), LGP agent pods |
+
+<Note>
+ClickHouse (when in-cluster) requests 2 to 4 CPU and 8 to 15 GB RAM depending on profile. With [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse), the `large` pool is only needed for the agent pods that the LangGraph Platform (LGP) operator spawns.
+</Note>
+
+## Optional modules
+
+Each module is count-controlled (`0` disabled, `1` enabled). Enable any combination; the core deployment (Passes 1 to 5) works without them.
+
+| Module | Variable | Use case |
+|---|---|---|
+| `waf` | `create_waf = true` | Azure WAF policy (OWASP 3.2 + bot protection). Attach to Application Gateway. |
+| `diagnostics` | `create_diagnostics = true` | Log Analytics workspace + diagnostic settings for AKS, Key Vault, Blob. Recommended for production observability. |
+| `bastion` | `create_bastion = true` | Azure Bastion (Standard tier). Browser-based SSH to node VMs without a public IP. |
+| `dns` | `create_dns_zone = true` | Azure DNS zone + A record. Required for DNS-01 cert issuance with a custom domain. |
diff --git a/src/langsmith/self-host-terraform-azure-deploy.mdx b/src/langsmith/self-host-terraform-azure-deploy.mdx
new file mode 100644
index 0000000000..7e3bcbf661
--- /dev/null
+++ b/src/langsmith/self-host-terraform-azure-deploy.mdx
@@ -0,0 +1,683 @@
+---
+title: Deploy LangSmith on Azure with Terraform
+sidebarTitle: Deploy
+description: End-to-end walkthrough for provisioning LangSmith self-hosted on Azure AKS using the LangChain Terraform modules.
+---
+
+Provision the Azure cloud foundation and install LangSmith with the public Terraform modules at [github.com/langchain-ai/terraform/tree/main/modules/azure](https://github.com/langchain-ai/terraform/tree/main/modules/azure). Plan for 40 to 50 minutes end to end on a clean subscription.
+
+The deployment runs in two stages: infrastructure (Terraform provisions AKS, Postgres, Redis, Blob Storage, Key Vault, cert-manager, KEDA, ingress) and application (Helm installs the LangSmith chart against the cluster). Three add-ons (LangSmith Deployment, Agent Builder, Insights and Polly) are enabled with flags and a redeploy.
+
+## Prerequisites
+
+### Required tools
+
+| Tool | Minimum version | Purpose |
+|---|---|---|
+| Azure CLI (`az`) | 2.50 | Authenticate, query Azure resources, manage AKS credentials |
+| Terraform | 1.5 | Run the infrastructure modules |
+| `kubectl` | 1.28 | Inspect the AKS cluster |
+| Helm | 3.12 | Install and manage the LangSmith chart |
+
+```bash
+brew install azure-cli kubectl helm
+brew tap hashicorp/tap && brew install hashicorp/tap/terraform
+
+az --version
+terraform version
+kubectl version --client
+helm version
+```
+
+### Required Azure RBAC
+
+The identity running Terraform needs the following roles on the subscription:
+
+| Role | Purpose |
+|---|---|
+| `Contributor` | Create and manage all Azure resources |
+| `User Access Administrator` | Create role assignments for Key Vault, Blob, cert-manager managed identities |
+
+`Owner` includes both. `Contributor` alone is insufficient because role assignments require User Access Administrator.
+
+### Authenticate
+
+```bash
+az login
+az account set --subscription <your-subscription-id>
+az account show
+```
+
+You also need a LangSmith license key ([contact sales](https://www.langchain.com/contact-sales)) and either a `dns_label` (Azure subdomain, no DNS setup needed) or a custom `langsmith_domain`.
+
+## Rapid path
+
+For the fastest path from zero to a running LangSmith instance:
+
+```bash
+# 1. Clone the public modules
+git clone https://github.com/langchain-ai/terraform.git
+cd terraform/modules/azure
+
+# 2. Generate terraform.tfvars interactively
+make quickstart
+
+# 3. Bootstrap secrets (writes infra/secrets.auto.tfvars, chmod 600, gitignored)
+make setup-env
+
+# 4. Validate environment
+make preflight
+
+# 5. Provision infrastructure (~15 to 20 min)
+make init
+make apply
+
+# 6. Get cluster credentials and push secrets into the cluster
+make kubeconfig
+make k8s-secrets
+
+# 7. Deploy LangSmith via Helm (~10 min)
+make init-values
+make deploy
+```
+
+Or run steps 5 through 7 in one shot:
+
+```bash
+make deploy-all   # apply → kubeconfig → k8s-secrets → init-values → deploy
+```
+
+The sections below cover each phase in detail.
+
+## Provision infrastructure
+
+Provisioning the Azure cloud foundation takes 15 to 20 minutes on a clean subscription. Do not interrupt the apply.
+
+### What gets provisioned
+
+| Resource | Type | Purpose |
+|---|---|---|
+| Resource Group | `azurerm_resource_group` | Container for all resources |
+| Virtual Network | `azurerm_virtual_network` | Isolated network (10.0.0.0/17) |
+| AKS Cluster | `azurerm_kubernetes_cluster` | Kubernetes, all workloads run here |
+| Ingress Controller | Helm | External load balancer + TLS termination (nginx by default) |
+| PostgreSQL Flexible Server | `azurerm_postgresql_flexible_server` | Org config, run metadata (external tier) |
+| Redis Cache Premium | `azurerm_redis_cache` | Trace ingestion queue, pub/sub (external tier) |
+| Blob Storage | `azurerm_storage_account` | Raw trace objects, always required |
+| Managed Identity | `azurerm_user_assigned_identity` | Workload Identity for pod-to-Blob auth |
+| Azure Key Vault | `azurerm_key_vault` | Stores all LangSmith secrets |
+| cert-manager | Helm | Automated TLS certificate management |
+| KEDA | Helm | Event-driven autoscaling for workers |
+
+### Clone and configure
+
+```bash
+git clone https://github.com/langchain-ai/terraform.git
+cd terraform/modules/azure
+```
+
+All subsequent commands run from `modules/azure/`. Run `make help` for the full target list.
+
+Generate `terraform.tfvars` with the interactive wizard:
+
+```bash
+make quickstart
+```
+
+The wizard runs a 10-section questionnaire covering profile, subscription, naming, networking, AKS sizing, ingress controller, DNS/TLS, backend services, Key Vault, sizing profile, and security add-ons. Each section includes explanatory context, cost estimates, and trade-offs. Re-running is safe: existing values are preselected at each prompt, so press Enter to keep them.
+
+To edit the file manually instead:
+
+```bash
+cp infra/terraform.tfvars.example infra/terraform.tfvars
+vi infra/terraform.tfvars
+```
+
+Minimum required values:
+
+```hcl
+# Identity
+subscription_id = "<your-azure-subscription-id>"
+
+# Location
+location = "eastus"
+
+# Naming + tagging
+identifier  = "-prod"      # suffix on all resource names
+environment = "prod"
+
+# Deployment tier, production recommended
+postgres_source   = "external"   # Azure DB for PostgreSQL
+redis_source      = "external"   # Azure Cache for Redis Premium
+clickhouse_source = "in-cluster" # use "external" + LangChain Managed for production
+
+# DNS + TLS (HTTPS via Let's Encrypt on a free Azure subdomain)
+dns_label              = "langsmith-prod"   # → langsmith-prod.eastus.cloudapp.azure.com
+tls_certificate_source = "letsencrypt"
+letsencrypt_email      = "ops@example.com"
+
+# Sizing
+sizing_profile = "production"   # minimum | dev | production | production-large
+```
+
+<Warning>
+In-cluster ClickHouse runs as a single pod with no replication or backups, so it is suitable for dev/POC only. For production, use [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse).
+</Warning>
+
+<Info>
+Blob Storage is always required, regardless of tier. Trace payloads must go to Azure Blob, never to ClickHouse.
+</Info>
+
+For all variables, see the [Azure variables reference](/langsmith/self-host-terraform-azure-variables).
+
+### Bootstrap secrets
+
+```bash
+make setup-env
+```
+
+`setup-env.sh` writes `infra/secrets.auto.tfvars` (gitignored, `chmod 600`). Terraform picks this file up automatically; no shell exports are needed.
+
+- **First run:** prompts for PostgreSQL password, LangSmith license key, admin password, and admin email. Generates `api_key_salt`, `jwt_secret`, and four Fernet encryption keys locally.
+- **Subsequent runs:** reads everything silently from Azure Key Vault.
+
+<Warning>
+Never commit `secrets.auto.tfvars`. It is gitignored. Regenerate on any machine by running `make setup-env`.
+</Warning>
+
+### Preflight
+
+```bash
+make preflight
+```
+
+Validates Azure CLI auth, the active subscription, 11 required resource providers, RBAC (Contributor + User Access Administrator), `terraform.tfvars` and `secrets.auto.tfvars` presence, and `terraform`/`kubectl`/`helm` on PATH.
+
+### Apply
+
+```bash
+make init
+make apply   # ~15 to 20 min on first run
+```
+
+<Note>
+Skip `make plan` on a fresh deploy. `kubernetes_manifest` resources require a live cluster API during plan, which does not exist yet. `make apply` handles resource ordering in three internal stages: Azure resources → AKS → Kubernetes bootstrap.
+</Note>
+
+### Cluster credentials and Kubernetes Secrets
+
+After `make apply` completes, get cluster credentials and push secrets into the cluster:
+
+```bash
+make kubeconfig    # fetches AKS credentials, merges into ~/.kube/config
+make k8s-secrets   # Key Vault → langsmith-config-secret in the langsmith namespace
+```
+
+`make k8s-secrets` reads 8 secrets from Key Vault and creates or updates `langsmith-config-secret`. It is safe to re-run: it uses the `kubectl create secret ... --dry-run=client -o yaml | kubectl apply -f -` pattern to update the secret in place.
+
+### Verify infrastructure
+
+```bash
+# All nodes Ready
+kubectl get nodes
+
+# Bootstrap components, all Running
+kubectl get pods -n cert-manager     # 3 pods
+kubectl get pods -n keda             # 3 pods
+kubectl get pods -n ingress-nginx    # 1 pod (if using nginx)
+
+# NGINX LoadBalancer, save the EXTERNAL-IP
+kubectl get svc ingress-nginx-controller -n ingress-nginx
+
+# Workload Identity ServiceAccount, should have client-id annotation
+kubectl get sa langsmith-ksa -n langsmith \
+  -o jsonpath='{.metadata.annotations}'
+
+# Terraform outputs
+terraform -chdir=infra output
+
+# Key outputs consumed by Helm scripts
+terraform -chdir=infra output -raw keyvault_name
+terraform -chdir=infra output -raw storage_account_name
+terraform -chdir=infra output -raw storage_container_name
+terraform -chdir=infra output -raw storage_account_k8s_managed_identity_client_id
+```
+
+## Deploy LangSmith
+
+Two deployment paths are supported. Pick one.
+
+| Path | Command | When to use |
+|---|---|---|
+| Helm path _(default)_ | `make init-values && make deploy` | Interactive output, kubeconfig refresh, preflight checks. Best for first-time deploys and day-2 re-deploys. |
+| Terraform path | `make init-app && make apply-app` | Helm release + Kubernetes Secrets + Workload Identity SA managed in Terraform state. Best for GitOps and CI/CD pipelines. |
+
+### Helm path (recommended)
+
+#### Generate Helm values
+
+```bash
+cd terraform/modules/azure
+make init-values
+```
+
+`make init-values` reads `terraform output` and `terraform.tfvars` and generates `helm/values/values-overrides.yaml` with all fields populated:
+
+- `config.hostname`, your FQDN (from `dns_label` or `langsmith_domain`).
+- `config.initialOrgAdminEmail`, the first org admin account.
+- `config.existingSecretName: langsmith-config-secret`, secrets reference.
+- `config.blobStorage`, storage account name + container + Workload Identity client ID.
+- Workload Identity annotations for 5 ServiceAccounts (backend, platform-backend, queue, ingest-queue, host-backend).
+- Ingress + TLS block (cert-manager annotation, TLS secret name).
+- Postgres and Redis external secret references (when `postgres_source = "external"` / `redis_source = "external"`).
+
+`make init-values` also copies the sizing overlay and any enabled add-on overlays from `helm/values/examples/` into `helm/values/`.
+
+<Info>
+The admin email is read from `langsmith_admin_email` in `terraform.tfvars` (set during `make setup-env`) and written into `values-overrides.yaml` automatically. No manual editing needed.
+</Info>
+
+#### Deploy
+
+```bash
+make deploy   # ~10 min
+```
+
+`make deploy` performs the following steps:
+
+1. Validates `values-overrides.yaml` exists.
+2. Refreshes kubeconfig via `az aks get-credentials`.
+3. Annotates the LoadBalancer service with `service.beta.kubernetes.io/azure-dns-label-name`, required for Azure to assign the DNS label to the public IP.
+4. Creates the `letsencrypt-prod` cert-manager `ClusterIssuer` if `tls_certificate_source = "letsencrypt"` (idempotent).
+5. Runs preflight checks (tools, cluster connectivity, Helm repo).
+6. Verifies `langsmith-config-secret` exists; auto-creates from Key Vault if missing.
+7. Builds and logs the values chain.
+8. Auto-recovers any stuck `pending-upgrade` Helm release before proceeding.
+9. Runs `helm upgrade --install langsmith langchain/langsmith --timeout 20m`.
+10. Waits for core deployments to roll out.
+11. Annotates the `langsmith-ksa` ServiceAccount with the Workload Identity client ID.
+12. Prints the access URL and login credentials location.
+
+<Info>
+Why `--timeout 20m`? The `langsmith-backend-auth-bootstrap` Job runs DB migrations and org initialization as a post-install hook. This takes up to 5 minutes on first install. Without a long timeout, Helm may report failure even though the install eventually succeeds.
+</Info>
+
+<Tip>
+**Watch pods in a second terminal:**
+
+```bash
+# macOS
+brew install watch
+watch kubectl get pods -n langsmith
+
+# Without watch
+while true; do clear; kubectl get pods -n langsmith; sleep 3; done
+```
+</Tip>
+
+### Terraform path
+
+Use this path when you want the Helm release, Kubernetes Secrets, and Workload Identity ServiceAccount managed in Terraform state.
+
+```bash
+# Copy and configure app vars
+cp app/terraform.tfvars.example app/terraform.tfvars
+vi app/terraform.tfvars   # set admin_email at minimum
+
+# Pull infra outputs into app/infra.auto.tfvars.json + terraform init
+make init-app
+
+# Deploy Helm release + K8s Secrets + WI ServiceAccount via Terraform
+make apply-app
+```
+
+Feature flags in `app/terraform.tfvars`:
+
+```hcl
+sizing                = "production"   # minimum | dev | production | production-large
+enable_agent_deploys  = true           # LangSmith Deployment add-on
+enable_agent_builder  = true           # Agent Builder add-on (requires agent_deploys)
+enable_insights       = true           # Insights / ClickHouse add-on
+enable_polly          = true           # Polly add-on (requires agent_deploys)
+```
+
+End-to-end via Terraform (infrastructure + application):
+
+```bash
+make deploy-all-tf   # apply → init-values → init-app → apply-app
+```
+
+### Verify the deployment
+
+```bash
+# All pods Running or Completed (~17 pods)
+kubectl get pods -n langsmith
+
+# Ingress host + TLS assigned
+kubectl get ingress -n langsmith
+
+# TLS certificate issued
+kubectl get certificate -n langsmith   # READY: True
+
+# Helm release status
+helm list -n langsmith
+```
+
+Expected pod state (all Running or Completed after ~5 minutes):
+
+```txt
+langsmith-ace-backend-xxxxx              1/1   Running     0   5m
+langsmith-backend-xxxxx                  1/1   Running     0   5m
+langsmith-backend-auth-bootstrap-xxxxx   0/1   Completed   0   5m
+langsmith-backend-ch-migrations-xxxxx    0/1   Completed   0   5m
+langsmith-backend-migrations-xxxxx       0/1   Completed   0   5m
+langsmith-clickhouse-0                   1/1   Running     0   5m
+langsmith-frontend-xxxxx                 1/1   Running     0   5m
+langsmith-ingest-queue-xxxxx             1/1   Running     0   5m
+langsmith-platform-backend-xxxxx         1/1   Running     0   5m
+langsmith-playground-xxxxx               1/1   Running     0   5m
+langsmith-queue-xxxxx                    1/1   Running     0   5m
+```
+
+Open `https://<HOSTNAME>` and log in with the admin email and password from Key Vault:
+
+```bash
+az keyvault secret show \
+  --vault-name $(terraform -chdir=infra output -raw keyvault_name) \
+  --name langsmith-admin-password \
+  --query value -o tsv
+```
+
+### Values chain
+
+`make deploy` applies Helm values files in this order (last file wins on conflicts):
+
+```txt
+1. helm/values/values.yaml                              ← Azure base (NGINX, Blob WI, no Istio)
+2. helm/values/values-overrides.yaml                    ← hostname, WI client-id, auth, postgres/redis
+3. helm/values/langsmith-values-sizing-<profile>.yaml   ← resource requests + HPA settings
+4. (add-on files when enable_* flags are set)
+```
+
+All files in `helm/values/` are gitignored (generated or contain live secrets). Source templates live in `helm/values/examples/` and are copied by `make init-values`.
+
+### Day-2 operations
+
+```bash
+make status         # 10-section health check
+make status-quick   # skip Key Vault + K8s secret queries (faster)
+make deploy         # re-deploy after any Helm value changes
+make init-values    # re-generate values after Terraform changes
+make kubeconfig     # refresh cluster credentials
+make k8s-secrets    # re-create langsmith-config-secret from Key Vault
+```
+
+## Enable add-ons
+
+Each add-on is gated by a flag in `infra/terraform.tfvars`. Set the flag, re-run `make init-values` to regenerate values, then re-run `make deploy`.
+
+### LangSmith Deployment
+
+Enables [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform), which lets you deploy and manage LangGraph graphs as API servers directly from the LangSmith UI. Adds three new pods.
+
+| Pod | Role | Workload Identity |
+|---|---|---|
+| `langsmith-host-backend` | LangSmith Deployment control plane API. Manages deployment lifecycle, stores state in shared PostgreSQL. | Yes |
+| `langsmith-listener` | Watches host-backend, creates and updates `LangGraphPlatform` custom resources in Kubernetes. | Yes |
+| `langsmith-operator` | Reconciles `LangGraphPlatform` custom resources. Creates per-deployment Deployments, StatefulSets, and Services. | No |
+
+#### Scale the node pool first
+
+Before enabling, bump `default_node_pool_min_count` to at least 5. The operator spawns agent deployment pods on demand and needs node headroom:
+
+```hcl
+# infra/terraform.tfvars
+default_node_pool_min_count = 5      # operator pods need headroom
+enable_deployments          = true
+```
+
+<Warning>
+Without sufficient node capacity, operator-spawned agent pods stay in `Pending` state indefinitely. Scale the node pool first, then enable.
+</Warning>
+
+#### Apply, regenerate values, deploy
+
+```bash
+cd terraform/modules/azure
+make apply          # scale up node pool (~5 min)
+make init-values    # picks up enable_deployments = true → generates add-on overlay
+make deploy         # rolls out host-backend + listener + operator
+```
+
+`make init-values` appends the LangSmith Deployment add-on overlay (`langsmith-values-agent-deploys.yaml`) to the values chain. It automatically injects:
+
+```yaml
+config:
+  deployment:
+    enabled: true                          # REQUIRED, without this listener and operator are skipped silently
+    url: "https://<your-hostname>"         # must match config.hostname (with protocol)
+    tlsEnabled: true                       # set based on tls_certificate_source
+```
+
+<Warning>
+**`config.deployment.url` must include `https://`.** Missing the protocol causes operator-deployed agents to stay stuck in `DEPLOYING` state indefinitely. `make init-values` injects the URL automatically. Do not set it manually in the overlay file; it will be overwritten.
+</Warning>
+
+<Warning>
+**`config.deployment.enabled: true` is required.** Setting only `config.deployment.url` without `enabled: true` causes the chart to silently skip creating `listener` and `operator`. No error, they just never appear.
+</Warning>
+
+#### Verify
+
+```bash
+# All three pods Running
+kubectl get pods -n langsmith | grep -E "host-backend|listener|operator"
+
+# LangSmith Deployment CRDs registered
+kubectl get crd | grep langchain
+
+# List LangSmith Deployments (empty on first deploy, populated when you create a deployment)
+kubectl get lgp -n langsmith
+```
+
+Expected: `langsmith-host-backend`, `langsmith-listener`, and `langsmith-operator` all Running. Total pod count: ~20 Running + 3 Completed jobs.
+
+KEDA is already installed alongside infrastructure. With `enable_deployments = true`, the operator creates KEDA `ScaledObject` resources for each agent deployment's worker queue. Worker pods scale down to zero when idle and scale up based on Redis queue depth.
+
+### Agent Builder
+
+Provides visual AI-assisted creation and management of LangGraph agents from the LangSmith UI. No `terraform apply` needed; just `make init-values && make deploy`.
+
+**Prerequisite:** LangSmith Deployment enabled (`enable_deployments = true`). Enabling Agent Builder without it causes a preflight error.
+
+| Pod | Type | Role |
+|---|---|---|
+| `langsmith-agent-builder-tool-server` | Static | MCP tool execution server, code/file editing tools for the AI |
+| `langsmith-agent-builder-trigger-server` | Static | Webhook receiver and scheduled trigger engine |
+| `langsmith-agent-bootstrap` | Job (Completed) | Registers the bundled Agent Builder agent through the operator, runs once |
+| `agent-builder-<hash>` + queue + redis + `lg-<hash>-0` | Dynamic (operator-managed) | Agent Builder deployment, created by the operator when the bootstrap Job runs |
+
+Enable:
+
+```hcl
+# infra/terraform.tfvars
+enable_deployments   = true    # required prerequisite
+enable_agent_builder = true
+```
+
+```bash
+cd terraform/modules/azure
+make init-values    # appends langsmith-values-agent-builder.yaml to values chain
+make deploy         # rolling update, ~10 min for bootstrap Job to complete
+```
+
+`make init-values` appends the Agent Builder add-on overlay (`langsmith-values-agent-builder.yaml`) to the values chain. The overlay enables the Agent Builder UI and supporting services, sets `backend.agentBootstrap: true` (the post-install job that registers Agent Builder as a LangSmith Deployment and creates the required ConfigMap), and sets conservative agent worker pod resources (1 CPU / 1 Gi) instead of the chart's default 4 CPU / 8 Gi.
+
+Verify:
+
+```bash
+# Static pods Running, bootstrap Job Completed
+kubectl get pods -n langsmith | grep -E "tool-server|trigger-server|agent-bootstrap"
+
+# Operator-managed dynamic pods (4 pods, api-server, queue, redis, postgres StatefulSet)
+kubectl get pods -n langsmith | grep agent-builder
+
+# Operator-managed LangSmith Deployment for Agent Builder
+kubectl get lgp -n langsmith
+```
+
+Expected: 3 static pods (tool-server, trigger-server, bootstrap Job) + 4 dynamic pods. Total: ~26 pods. After `make deploy`, an **Agent Builder** section appears in the LangSmith UI navigation.
+
+<Warning>
+**Roll the frontend after `agentBootstrap` completes.** The `agentBootstrap` Job creates the `langsmith-polly-config` ConfigMap that the frontend reads for the Polly UI. If the frontend was running when bootstrap completed, Polly shows "Unable to connect to LangGraph server". Fix:
+
+```bash
+kubectl rollout restart deployment langsmith-frontend -n langsmith
+```
+</Warning>
+
+<Warning>
+**Encryption key is read from `langsmith-config-secret`.** Do not set `config.agentBuilder.encryptionKey` inline in `values-overrides.yaml`. The chart reads it from `langsmith-config-secret` via `existingSecretName`. Setting it inline overrides the secret reference and creates a mismatch.
+</Warning>
+
+Both `langsmith-agent-builder-tool-server` and `langsmith-agent-builder-trigger-server` need Workload Identity to access Azure Blob Storage. Their federated credentials are pre-registered in `modules/k8s-cluster/main.tf`; no additional setup is needed.
+
+### Insights and Polly
+
+Two features, both of which require LangSmith Deployment. They are independent of each other; enable either one without the other.
+
+- **Insights:** AI-powered trace analytics (Clio). Surfaces patterns and anomalies in LangSmith traces. Clio deploys as a dynamic LangGraph deployment through the operator on first UI invocation. Adds no new static pods.
+- **Polly:** AI-powered evaluation and monitoring agent. Runs as a dynamic LangGraph deployment. Sets resource limits for the Polly worker (2 CPU / 4 Gi request, 4 CPU / 8 Gi limit, scales 1 to 5 replicas).
+
+No `terraform apply` needed; just `make init-values && make deploy`.
+
+```hcl
+# infra/terraform.tfvars
+enable_deployments = true    # required prerequisite
+enable_insights    = true    # Insights / Clio analytics
+enable_polly       = true    # Polly AI evaluation agent
+```
+
+Enable just one:
+
+```hcl
+enable_insights = true    # Insights only
+# or
+enable_polly    = true    # Polly only
+```
+
+```bash
+cd terraform/modules/azure
+make init-values    # appends insights + polly add-on overlays to the values chain
+make deploy         # rolling update, ~5 min
+```
+
+`make init-values` appends the add-on overlays based on `clickhouse_source` in `terraform.tfvars`:
+
+- `clickhouse_source = "in-cluster"`, generates a minimal overlay (`config.insights.enabled: true` only). The Helm chart manages ClickHouse internally.
+- `clickhouse_source = "external"`, generates a full overlay with `clickhouse.external.enabled: true` and a `langsmith-clickhouse` secret reference. Create this secret with the ClickHouse host and credentials before deploying.
+
+<Warning>
+**Do not manually copy the Insights example file for in-cluster ClickHouse.** The example `helm/values/examples/langsmith-values-insights.yaml` has `clickhouse.external.enabled: true` and `existingSecretName: langsmith-clickhouse`. Copying it manually when using in-cluster ClickHouse causes `CreateContainerConfigError` because the secret does not exist. Always use `make init-values` to generate the correct file.
+</Warning>
+
+Verify:
+
+```bash
+# ClickHouse already running from base install
+# Insights and Polly deploy as dynamic pods when first invoked from the UI
+kubectl get pods -n langsmith | grep -E "clickhouse|polly|clio"
+
+# Watch for dynamic pods on first Insights use
+kubectl get pods -n langsmith -w
+
+# Confirm Insights is enabled in Helm values
+helm get values langsmith -n langsmith | grep -A3 insights
+# Expected: enabled: true
+```
+
+<Warning>
+**Encryption keys must never change after first enable.** Changing `insights_encryption_key` or `polly_encryption_key` after the add-on is first enabled permanently corrupts all existing encrypted data. There is no recovery path. These keys live in Key Vault and never rotate automatically.
+</Warning>
+
+<Warning>
+**Roll the frontend after first Polly enable.** If the Polly UI shows "Unable to connect to LangGraph server" after enabling, the frontend started before the bootstrap ConfigMap was ready. Fix:
+
+```bash
+kubectl rollout restart deployment langsmith-frontend -n langsmith
+```
+</Warning>
+
+### Add-on summary
+
+| Phase | New pods | Total ~running |
+|---|---|---|
+| Base install | Core LangSmith (backend, frontend, queue, ingest-queue, clickhouse, etc.) | ~17 |
+| LangSmith Deployment | `host-backend`, `listener`, `operator` | ~20 |
+| Agent Builder | `tool-server`, `trigger-server`, `bootstrap` Job + 4 dynamic Agent Builder pods | ~26 |
+| Insights and Polly | No new static pods (Clio + Polly appear dynamically on first use) | ~22 at rest |
+
+## Ingress controllers
+
+Set `ingress_controller` in `terraform.tfvars` before `make apply`. For the full TLS compatibility matrix, see `INGRESS_CONTROLLERS.md` in the [Azure module repo](https://github.com/langchain-ai/terraform/blob/main/modules/azure/INGRESS_CONTROLLERS.md).
+
+| Value | What Terraform installs | Best for |
+|---|---|---|
+| `nginx` _(default)_ | `ingress-nginx` Helm chart with Azure LB | Standard deployments. Simplest setup. |
+| `istio-addon` | AKS Service Mesh add-on (Azure-managed Istio) | Azure-managed Istio mesh, multi-dataplane, mTLS. |
+| `istio` | `istio-base` + `istiod` + `istio-ingressgateway` | Self-managed Istio. Full mesh and sidecar injection. |
+| `agic` | Azure Application Gateway v2 + AGIC Helm chart | Enterprise Azure, native L7 WAF, HTTP-only or dns01 + custom domain. |
+| `envoy-gateway` | `gateway-helm` OCI chart, Kubernetes Gateway API | Gateway API native, modern alternative to Ingress. |
+
+<Warning>
+`letsencrypt` (HTTP-01) only works with `nginx`, `istio` (self-managed), and `envoy-gateway`. `istio-addon` and `agic` do not create an IngressClass, so the ACME solver cannot receive traffic. For those controllers, use `dns01` with a custom domain, or `none` for HTTP-only.
+</Warning>
+
+## DNS and TLS
+
+`dns_label` gives you a free Azure subdomain, `<label>.<region>.cloudapp.azure.com`, with no domain registration or DNS zone needed. `deploy.sh` annotates the correct LoadBalancer service automatically.
+
+**Quickstart default (HTTP, zero setup):**
+
+```hcl
+dns_label              = "langsmith-prod"
+tls_certificate_source = "none"
+```
+
+**Add HTTPS with Let's Encrypt (HTTP-01; `nginx`, self-managed `istio`, or `envoy-gateway` only):**
+
+```hcl
+dns_label              = "langsmith-prod"
+tls_certificate_source = "letsencrypt"
+letsencrypt_email      = "you@example.com"
+```
+
+**Custom domain + DNS-01 (all controllers, works behind firewalls):**
+
+```hcl
+langsmith_domain       = "langsmith.mycompany.com"
+tls_certificate_source = "dns01"
+letsencrypt_email      = "you@example.com"
+create_dns_zone        = true
+# After deploy: add ingress_ip = "<lb-ip>" and re-run make apply (creates A record)
+```
+
+**dns01 flow:**
+
+1. `make apply` creates the Azure DNS zone and outputs 4 nameservers.
+2. At your registrar, add NS records for the subdomain pointing to those 4 nameservers.
+3. Verify: `dig NS langsmith.mycompany.com @8.8.8.8`.
+4. `make deploy` issues the cert via DNS-01 automatically (Workload Identity writes the TXT record to Azure DNS).
+5. Get the LB IP, add `ingress_ip = "<ip>"` to `terraform.tfvars`, then `make apply` (creates the A record).
+6. `make status` shows exactly what NS and A records to add at each stage.
+
+<Note>
+**Why NS records, not CNAME:** cert-manager must write TXT records to the zone to prove ownership. That requires Azure DNS to be authoritative for the subdomain, and NS delegation grants that authority. A CNAME only aliases one name to another and does not delegate DNS authority, so the DNS-01 challenge fails.
+</Note>
+
+## Next steps
+
+- Reference the [Azure variables](/langsmith/self-host-terraform-azure-variables) and the [quick reference](/langsmith/self-host-terraform-azure-quick-reference).
+- Review the [Azure architecture](/langsmith/self-host-terraform-azure-architecture) for module structure, traffic flow, and Workload Identity.
+- When something breaks, check the [Azure troubleshooting guide](/langsmith/self-host-terraform-azure-troubleshooting).
+- Enable agent deployment in the UI with [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform).
diff --git a/src/langsmith/self-host-terraform-azure-quick-reference.mdx b/src/langsmith/self-host-terraform-azure-quick-reference.mdx
new file mode 100644
index 0000000000..3c073855a1
--- /dev/null
+++ b/src/langsmith/self-host-terraform-azure-quick-reference.mdx
@@ -0,0 +1,220 @@
+---
+title: Azure Terraform quick reference
+sidebarTitle: Quick reference
+description: Make targets, Terraform, kubectl, Azure CLI, and Helm commands for LangSmith self-hosted on AKS.
+---
+
+Command cheat sheet for day-to-day operations against an Azure LangSmith deployment provisioned with the [Azure Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/azure). All `make` targets run from `modules/azure/`. Run `make help` for an inline summary.
+
+For the full deployment walkthrough, see the [Azure deployment guide](/langsmith/self-host-terraform-azure-deploy).
+
+## Deployment overview
+
+| Stage | What gets deployed | Command |
+|---|---|---|
+| Infrastructure | AKS + Postgres + Redis + Blob + Key Vault + cert-manager + KEDA + ingress | `make apply` |
+| Cluster credentials | Kubeconfig + Kubernetes Secrets from Key Vault | `make kubeconfig && make k8s-secrets` |
+| LangSmith (Helm path) | LangSmith Helm (~17 pods) via shell scripts | `make init-values && make deploy` |
+| LangSmith (Terraform path) | Secrets + SA + Helm release managed in Terraform state | `make init-app && make apply-app` |
+| LangSmith Deployment add-on | host-backend, listener, operator. Bump `default_node_pool_min_count` to 5 first | `make apply && make init-values && make deploy` |
+| Agent Builder add-on | tool-server, trigger-server, agent-builder LGP | `make init-values && make deploy` |
+| Insights + Polly add-on | Clio analytics, Polly eval agent | `make init-values && make deploy` |
+
+## First-time setup
+
+```bash
+cd terraform/modules/azure
+
+# 1. Generate terraform.tfvars (interactive wizard)
+make quickstart
+
+# 2. Bootstrap secrets (prompts on first run, reads from Key Vault on repeat)
+make setup-env
+
+# 3. Preflight (Azure CLI, RBAC, providers, quotas)
+make preflight
+
+# 4. Deploy infrastructure (~15 to 20 min)
+#    Skip `make plan` on a fresh deploy — kubernetes_manifest needs a live cluster
+make init
+make apply
+
+# 5. Cluster credentials + Kubernetes Secrets
+make kubeconfig
+make k8s-secrets
+
+# 6. Generate Helm values from Terraform outputs
+make init-values
+
+# 7. Deploy LangSmith (~10 min)
+make deploy
+
+# 8. Health check
+make status
+```
+
+Or run everything after `make apply` in one shot:
+
+```bash
+make deploy-all      # kubeconfig → k8s-secrets → init-values → deploy
+make deploy-all-tf   # apply → init-values → init-app → apply-app (Terraform path)
+```
+
+## Day-2 operations
+
+```bash
+make status         # 10-section health check
+make status-quick   # skip Key Vault + K8s secret queries (faster)
+make deploy         # re-deploy after any Helm value changes
+make init-values    # re-generate values after Terraform changes
+make kubeconfig     # refresh cluster credentials
+make k8s-secrets    # re-create langsmith-config-secret from Key Vault
+
+# Manage Key Vault secrets interactively
+make keyvault                  # interactive menu
+make keyvault list             # all secrets with timestamps
+make keyvault get <secret>     # read a secret
+make keyvault set <key> <val>  # update a secret
+make keyvault validate         # check all required secrets exist
+make keyvault diff             # compare KV vs K8s secret
+make keyvault delete <key>     # soft-delete (recoverable 90 days)
+```
+
+## Add-ons
+
+Add-on passes (3 to 5) are controlled by flags in `infra/terraform.tfvars`. Set the flags, then re-run `make init-values && make deploy`. `init-values.sh` copies the matching example file into `helm/values/` automatically.
+
+```hcl
+# infra/terraform.tfvars
+sizing_profile       = "production"   # minimum | dev | production | production-large
+enable_deployments   = true           # LangSmith Deployment add-on (listener + operator + host-backend)
+enable_agent_builder = true           # Agent Builder add-on (requires enable_deployments)
+enable_insights      = true           # Insights / Clio analytics add-on (requires enable_deployments)
+enable_polly         = true           # Polly AI eval add-on (requires enable_deployments)
+```
+
+<Warning>
+The LangSmith Deployment add-on requires `default_node_pool_min_count = 5` first. Operator-spawned pods need node headroom; without it, agent pods stay in `Pending` indefinitely.
+</Warning>
+
+## Sizing profiles
+
+Set `sizing_profile` in `terraform.tfvars`, then re-run `make init-values && make deploy`.
+
+| Profile | When to use |
+|---|---|
+| `minimum` | Cost parking, CI smoke tests, single-user demos. Expect OOM under real traffic. |
+| `dev` | Light non-production for local dev, CI pipelines, integration tests, short-lived POCs. |
+| `production` | _Recommended_ for production. Multi-replica with HPA on all stateless components. |
+| `production-large` | High-volume starting point based on the scale guide (~50 concurrent users, ~1000 traces/sec). |
+
+## kubectl
+
+```bash
+# Pod health
+kubectl get pods -n langsmith
+kubectl get pods -n langsmith -w
+kubectl describe pod <pod-name> -n langsmith
+kubectl logs <pod-name> -n langsmith --tail=100 -f
+kubectl logs <pod-name> -n langsmith --previous --tail=50
+
+# Backend logs (live)
+kubectl logs -n langsmith deploy/langsmith-backend --tail=100 -f
+
+# Ingress
+kubectl get ingress -n langsmith
+kubectl describe ingress -n langsmith
+
+# NGINX LoadBalancer external IP
+kubectl get svc ingress-nginx-controller -n ingress-nginx
+
+# TLS
+kubectl get certificate -n langsmith
+kubectl get challenges -n langsmith
+kubectl describe certificate <cert-name> -n langsmith
+kubectl get clusterissuer
+
+# Workload Identity
+kubectl get serviceaccount langsmith-ksa -n langsmith -o yaml | grep annotation -A5
+
+# Helm
+helm status langsmith -n langsmith
+helm history langsmith -n langsmith
+helm get values langsmith -n langsmith
+
+# LangSmith Deployment
+kubectl get lgp -n langsmith
+kubectl get crd | grep langchain
+```
+
+## Azure CLI
+
+```bash
+# Re-auth
+az login
+az account set --subscription <subscription-id>
+az account show
+
+# AKS
+az aks list
+az aks show --name <cluster> --resource-group <rg>
+az aks get-credentials --name <cluster> --resource-group <rg>
+
+# PostgreSQL
+az postgres flexible-server list
+az postgres flexible-server show --name <server> --resource-group <rg>
+
+# Redis
+az redis list
+az redis show --name <cache> --resource-group <rg>
+
+# Blob Storage
+az storage account list
+az storage container list --account-name <account>
+
+# Key Vault
+az keyvault list
+az keyvault secret list --vault-name <vault>
+az keyvault secret show --vault-name <vault> --name <secret> --query value -o tsv
+
+# Application Gateway (AGIC)
+az network application-gateway list
+```
+
+## Terraform
+
+```bash
+cd modules/azure/infra
+
+terraform init
+terraform plan        # skip on first run, see deploy notes
+terraform apply
+terraform apply -target=module.aks
+
+terraform output
+terraform output -raw aks_cluster_name
+terraform output -raw keyvault_name
+terraform output -raw storage_account_name
+
+terraform state list
+```
+
+## Key watchouts
+
+- Skip `make plan` on a fresh deploy. `kubernetes_manifest` resources need a live cluster API. Use `make apply` directly.
+- Uninstall Helm before `terraform destroy`. The Azure Load Balancer holds a subnet reference; leaving it blocks VNet deletion. Run `make uninstall` first.
+- `config.deployment.url` must include `https://`. Without it, operator-spawned agents stay stuck in `DEPLOYING`.
+- `config.deployment.enabled: true` is required for the LangSmith Deployment add-on. Setting only the URL without `enabled: true` silently skips `listener` and `operator`.
+- Encryption keys must never change after first enable. Rotating `insights_encryption_key` or `polly_encryption_key` permanently breaks existing encrypted data.
+- Roll the frontend after first Polly enable. `agentBootstrap` creates `langsmith-polly-config` after registering; frontend pods started earlier do not pick it up.
+- `letsencrypt` (HTTP-01) only works with `nginx`, `istio` (self-managed), and `envoy-gateway`. For `istio-addon` or `agic`, use `dns01` with a custom domain, or `none` for HTTP-only.
+- Key Vault enters 90-day soft-delete after destroy. With `keyvault_purge_protection = false`, run `az keyvault purge` to reclaim the name immediately.
+
+## Teardown
+
+```bash
+make uninstall   # removes Helm releases + LGP CRD + namespaces
+make destroy     # destroys all Azure infrastructure via terraform destroy
+make clean       # removes generated secrets, helm values, tfstate lock
+```
+
diff --git a/src/langsmith/self-host-terraform-azure-troubleshooting.mdx b/src/langsmith/self-host-terraform-azure-troubleshooting.mdx
new file mode 100644
index 0000000000..002231c6d1
--- /dev/null
+++ b/src/langsmith/self-host-terraform-azure-troubleshooting.mdx
@@ -0,0 +1,597 @@
+---
+title: Azure Terraform troubleshooting
+sidebarTitle: Troubleshooting
+description: Common issues, fixes, and diagnostic commands for LangSmith self-hosted on Azure AKS deployed with the LangChain Terraform modules.
+---
+
+This page documents common issues, fixes, and diagnostic commands for LangSmith deployments provisioned with the [Azure Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/azure).
+
+<Tip>
+Before upgrading, review the [LangSmith self-hosted changelog](/langsmith/self-hosted-changelog) for breaking changes and required variable updates. Run `az aks get-credentials --name <cluster> --resource-group <rg>` before running any `kubectl` commands.
+</Tip>
+
+## Infrastructure stage
+
+### `K8sVersionNotSupported` — version is LTS-only
+
+**Symptom**
+
+```
+Error: creating Kubernetes Cluster ... unexpected status 400
+"code": "K8sVersionNotSupported"
+"message": "Managed cluster ... is on version 1.32.x, which is only available for Long-Term Support (LTS).
+If you intend to onboard to LTS, please ensure the cluster is in Premium tier ..."
+```
+
+**Cause:** Azure periodically retires minor versions from Standard tier support and moves them to LTS-only. As of April 2026, 1.32 and below are LTS-only in `eastus`. Standard tier clusters must use 1.33+.
+
+**Fix:** Update `kubernetes_version` to a version with `KubernetesOfficial` support:
+
+```bash
+az aks get-versions --location eastus -o table
+# Versions with KubernetesOfficial in SupportPlan column work on Standard tier
+```
+
+Remove or update any `kubernetes_version` pin in `terraform.tfvars`, then `make apply`. Existing clusters on 1.32 continue to run; this only blocks new cluster creation.
+
+### vCPU quota exceeded
+
+**Symptom — autoscaler backoff (pods Pending):**
+
+```
+Warning  FailedScheduling     pod/langsmith-backend-xxx  0/1 nodes are available: 1 Too many pods.
+Normal   NotTriggerScaleUp    pod/langsmith-backend-xxx  pod didn't trigger scale-up: 2 in backoff after failed scale-up
+```
+
+**Symptom — node pool rotation:**
+
+```
+Error: creating temporary Agent Pool ... "code": "ErrCode_InsufficientVCPUQuota",
+"message": "Insufficient vcpu quota requested 8, remaining 2 for family standardDSv3Family for region eastus."
+```
+
+**Cause:** Azure enforces vCPU quotas per VM family in each region. The default for `standardDSv3Family` in `eastus` is often 10 cores. One `Standard_D8s_v3` node uses 8, so only 2 remain.
+
+**Why `max_pods = 30` triggers it:** AKS default is 30 pods per node. The base LangSmith install alone deploys ~37 pods. The autoscaler tries to add a second node, hits quota, enters backoff. Fix: `default_node_pool_max_pods = 60` in `terraform.tfvars` so all pods fit on one node.
+
+**Recommended quota** for multi-dataplane (3 dataplanes): 32 cores.
+
+**Request a quota increase:**
+
+```bash
+# Azure portal usually auto-approves within minutes:
+# Portal → Subscriptions → <sub> → Usage + Quotas → search "DSv3" → eastus → Request increase → 32
+
+# Or via CLI
+az quota update \
+  --resource-name "standardDSv3Family" \
+  --scope /subscriptions/<sub-id>/providers/Microsoft.Compute/locations/eastus \
+  --limit-object value=32 limit-type=Independent \
+  --resource-type dedicated
+
+az vm list-usage --location eastus --query "[?contains(name.value,'DSv3')]" -o table
+```
+
+**Alternative, switch VM family if DSv3 quota is exhausted:** Use `Standard_DS4_v2` (default) + `Standard_DS5_v2` (large). Same vCPU, slightly less RAM. Validated for the full LangSmith install plus all add-ons.
+
+<Note>
+`max_pods` is immutable on an existing node pool. Set it before the first `terraform apply`.
+</Note>
+
+### Istio addon revision not supported
+
+**Symptom:** `terraform apply` rejects the Istio revision (`Revision asm-1-XX is not supported`). Azure retires old ASM revisions regularly.
+
+**Fix:** Check currently available revisions and update `istio_addon_revision`:
+
+```bash
+az aks mesh get-revisions --location eastus -o table
+```
+
+Set the value in `terraform.tfvars` and re-apply.
+
+### Key Vault purge protection cannot be disabled after enabling
+
+**Symptom**
+
+```
+Error: updating Key Vault "langsmith-kv-dz":
+once Purge Protection has been Enabled it's not possible to disable it
+```
+
+**Cause:** When a Key Vault is deleted via `terraform destroy`, Azure soft-deletes it for 90 days. The next `terraform apply` with the same name silently recovers the old Key Vault — including its original `purge_protection_enabled = true`. Purge protection is one-way (enabled → cannot be disabled).
+
+**Fix — accept purge protection (test environments):**
+
+```hcl
+keyvault_purge_protection = true
+```
+
+**Fix — need `purge_protection = false`:**
+
+```bash
+# 1. Remove KV from Terraform state (does not delete from Azure)
+terraform -chdir=infra state rm module.keyvault.azurerm_key_vault.langsmith
+
+# 2. Permanently purge the soft-deleted KV (irreversible!)
+az keyvault purge --name langsmith-kv<identifier> --location eastus
+
+# 3. Re-apply
+make apply
+```
+
+### Key Vault secrets already exist but are not in Terraform state
+
+**Symptom**
+
+```
+Error: a resource with the ID "https://langsmith-kv-<id>.vault.azure.net/secrets/.../..."
+already exists - to be managed via Terraform this resource needs to be imported into the State.
+```
+
+**Cause:** Older `setup-env.sh` versions wrote Fernet keys directly to Key Vault. Current `setup-env.sh` is read-only against Key Vault; Terraform is the sole writer.
+
+**Fix:** Import the conflicting secrets:
+
+```bash
+terraform import \
+  'module.keyvault.azurerm_key_vault_secret.deployments_encryption_key[0]' \
+  "$(az keyvault secret show --vault-name langsmith-kv<id> --name langsmith-deployments-encryption-key --query id -o tsv)"
+
+terraform import \
+  'module.keyvault.azurerm_key_vault_secret.agent_builder_encryption_key[0]' \
+  "$(az keyvault secret show --vault-name langsmith-kv<id> --name langsmith-agent-builder-encryption-key --query id -o tsv)"
+
+terraform import \
+  'module.keyvault.azurerm_key_vault_secret.insights_encryption_key[0]' \
+  "$(az keyvault secret show --vault-name langsmith-kv<id> --name langsmith-insights-encryption-key --query id -o tsv)"
+
+terraform apply
+```
+
+## Application stage
+
+### `dns_label` subdomain not resolving — TLS cert stuck pending
+
+**Symptom:** `nslookup langsmith-demo.eastus.cloudapp.azure.com` returns NXDOMAIN. The cert-manager ACME challenge cannot complete; TLS certificate stays `READY: False`.
+
+**Cause:** The `service.beta.kubernetes.io/azure-dns-label-name` annotation must be set on the NGINX LoadBalancer service so Azure assigns the DNS label to the public IP. `make deploy` sets it automatically via `deploy.sh`. If you ran `helm upgrade` directly, the annotation was never set.
+
+**Fix**
+
+```bash
+kubectl annotate svc ingress-nginx-controller -n ingress-nginx \
+  service.beta.kubernetes.io/azure-dns-label-name=<dns_label> \
+  --overwrite
+
+# Wait 1-2 minutes, verify DNS resolves
+nslookup <dns_label>.eastus.cloudapp.azure.com
+
+# Delete the stuck cert to trigger re-issue
+kubectl delete certificate langsmith-tls -n langsmith
+```
+
+### `istio-addon` — port 80/443 timeout, TLS handshake reset
+
+**Symptom:** Site unreachable after `make deploy` with `ingress_controller = "istio-addon"`. Port 80 times out, port 443 resets. ACME challenge stays `pending`.
+
+**Cause — three compounding issues:**
+
+1. **Wrong gateway label.** Kubernetes Ingress with `ingressClassName: istio` targets pods with label `istio: ingressgateway`. The AKS managed external gateway uses `istio: aks-istio-ingressgateway-external`.
+2. **`ClusterIssuer` created with `class: nginx`.** The ACME HTTP-01 solver ingress gets class `nginx`, not `istio`.
+3. **TLS secret in wrong namespace.** Istio SDS reads from the gateway pod namespace (`aks-istio-ingress`), not the app namespace (`langsmith`).
+
+**Fix:** `make deploy` handles all three automatically in the current scripts. If deploying manually, create an Istio `Gateway` targeting `istio: aks-istio-ingressgateway-external`, patch the `ClusterIssuer` solver to `ingressClassName: istio`, sync `langsmith-tls` to the `aks-istio-ingress` namespace, and create a `VirtualService` routing to the LangSmith frontend. See the [TROUBLESHOOTING.md source](https://github.com/langchain-ai/terraform/blob/main/modules/azure/TROUBLESHOOTING.md) for the full YAML.
+
+### `letsencrypt-prod` ClusterIssuer missing
+
+**Symptom:** `kubectl describe certificate langsmith-tls -n langsmith` shows `clusterissuers.cert-manager.io "letsencrypt-prod" not found`.
+
+**Cause:** Older versions of the `k8s-bootstrap` module did not create the `ClusterIssuer`. Current versions do; `make deploy` also applies it via `kubectl apply` (since `kubernetes_manifest` requires a live cluster API during plan).
+
+**Fix — apply manually:**
+
+```bash
+kubectl apply -f - <<EOF
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+  name: letsencrypt-prod
+spec:
+  acme:
+    server: https://acme-v02.api.letsencrypt.org/directory
+    email: you@example.com
+    privateKeySecretRef:
+      name: letsencrypt-prod-account-key
+    solvers:
+    - http01:
+        ingress:
+          ingressClassName: nginx   # use "istio" with istio-addon or istio
+EOF
+
+kubectl delete certificate langsmith-tls -n langsmith
+```
+
+### `database "langsmith" does not exist` — backend pods crashlooping
+
+**Symptom:** Backend pods crash immediately: `FATAL: database "langsmith" does not exist`.
+
+**Cause:** Azure DB for PostgreSQL Flexible Server does not auto-create application databases. The Terraform `postgres` module now creates the database via `azurerm_postgresql_flexible_server_database`. This error means you are on an older module version missing that resource.
+
+**Fix**
+
+```bash
+terraform apply
+kubectl rollout restart deployment -n langsmith
+```
+
+### `langsmith-backend-auth-bootstrap` stuck in `CreateContainerConfigError`
+
+**Cause:** The Job reads the admin password using key `initial_org_admin_password`. If the Secret was created with a different key name (for example `admin_password`), the container cannot start.
+
+**Fix**
+
+```bash
+kubectl delete secret langsmith-config-secret -n langsmith
+make k8s-secrets   # recreates with correct key names
+make deploy
+```
+
+### Cannot roll back to an older chart version
+
+**Cause:** LangSmith DB migrations are forward-only. Downgrading the chart leaves the DB at a revision the older app image cannot locate.
+
+**Fix:** Roll forward to the version you were on (or newer). Set `langsmith_helm_chart_version` in `terraform.tfvars` and re-deploy. Always test new chart versions in a separate environment before upgrading production.
+
+### Helm install times out
+
+**Cause:** `langsmith-backend-auth-bootstrap` runs DB migrations on every `helm upgrade`; first install takes up to 5 minutes. Without an extended `--timeout` (the Helm default is 5 minutes), Helm reports failure even though the install eventually succeeds.
+
+**Fix:** `make deploy` already uses `--timeout 20m`. When running Helm manually, always include `--timeout 20m`.
+
+## Add-ons
+
+### Pods stay in `DEPLOYING`, never reach `HEALTHY`
+
+**Cause:** `config.deployment.url` is empty, or `config.deployment.tlsEnabled` is `false` while TLS is enabled. The operator builds agent endpoint URLs from these values.
+
+**Fix:** `init-values.sh` automatically injects `url` and `tlsEnabled` after copying from examples. If deploying manually:
+
+```yaml
+config:
+  deployment:
+    enabled: true
+    url: "https://langsmith-demo.eastus.cloudapp.azure.com"   # must include https://
+    tlsEnabled: true   # must be true when tls_certificate_source = letsencrypt or dns01
+```
+
+### Insights add-on: `backend-ch-migrations` in `CreateContainerConfigError`
+
+**Symptom:** Multiple pods fail with `CreateContainerConfigError` after enabling `enable_insights = true`. Logs: `secret "langsmith-clickhouse" not found`.
+
+**Cause:** The example `langsmith-values-insights.yaml` sets `clickhouse.external.enabled: true` with `existingSecretName: langsmith-clickhouse`. This overrides the in-cluster ClickHouse configuration and expects an external secret that does not exist.
+
+**Fix:** `init-values.sh` now generates a minimal Insights file when `clickhouse_source = "in-cluster"`. For an existing deployment with this issue:
+
+```bash
+cat > helm/values/langsmith-values-insights.yaml << 'EOF'
+config:
+  insights:
+    enabled: true
+EOF
+make deploy
+```
+
+### Polly shows "Unable to connect to LangGraph server"
+
+**Symptom:** Polly chat widget shows connection error. Browser console: `POST http://localhost:8123/threads net::ERR_FAILED` and CORS error.
+
+**Cause A — Frontend started before `langsmith-polly-config` was created.** The bootstrap job creates the ConfigMap with `VITE_POLLY_DEPLOYMENT_URL` after Polly is registered. Env vars from ConfigMap load at pod start, not dynamically.
+
+**Fix**
+
+```bash
+kubectl rollout restart deployment langsmith-frontend -n langsmith
+kubectl exec -n langsmith deploy/langsmith-frontend -- env | grep POLLY
+# expect: VITE_POLLY_DEPLOYMENT_URL=https://<hostname>/lgp/smith-polly-<hash>
+```
+
+**Cause B — `LANGCHAIN_ENDPOINT` set in `polly.agent.extraEnv`.** `LANGCHAIN_ENDPOINT` is reserved. Setting it causes the bootstrap job to fail with `400 Bad Request: 'LANGCHAIN_ENDPOINT' is reserved`. Polly is never created.
+
+**Fix:** Remove the `polly.agent.extraEnv` block entirely. The operator injects `LANGCHAIN_ENDPOINT` automatically.
+
+### `listener` and `operator` pods never appear after enabling LangSmith Deployment
+
+**Cause:** `config.deployment.url` was set but `config.deployment.enabled: true` was omitted. The chart silently skips creating `listener` and `operator` when `enabled` is false (the default).
+
+**Fix:** Add `enabled: true` inside the `deployment` block:
+
+```yaml
+config:
+  deployment:
+    enabled: true          # required — url alone is not enough
+    url: "https://<your-hostname>"
+```
+
+### Duplicate top-level `config:` key silently drops values
+
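+**Cause:** YAML disallows duplicate keys in the same mapping. When a values file contains a second top-level `config:` block, the parser silently keeps only one of the two blocks and drops the other, along with every value in it.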
+
+**Fix:** Always add new config blocks inside the existing `config:` key. Verify with `helm get values langsmith -n langsmith`.
+
+### Encryption keys must not change after first deploy
+
+Changing `deployments_encryption_key`, `agent_builder_encryption_key`, `insights_encryption_key`, or `polly_encryption_key` after their first use permanently corrupts the data they protect. There is no recovery.
+
+- Do not rotate these keys.
+- Do not set `config.agentBuilder.encryptionKey` or `config.insights.encryptionKey` inline in `values-overrides.yaml`. The chart reads them from `langsmith-config-secret` via `existingSecretName`. Setting inline overrides the secret reference.
+
+### `agent-builder-tool-server` or `polly` in CrashLoopBackOff
+
+**Symptom:** Pod restarts indefinitely. No traceback. Logs show "Child process died" repeatedly.
+
+**Cause:** `lc_config.settings.SharedSettings` is instantiated at module import time inside the uvicorn worker. A pydantic `ValidationError` raised there exits the worker with code 0; uvicorn's parent prints "Child process died" but swallows the traceback. Common triggers: `BASIC_AUTH_ENABLED = true` but `BASIC_AUTH_JWT_SECRET` is empty, or a required feature-flag key absent from `langsmith-config`.
+
+**Diagnose** by running the server in a debug pod with `envFrom` pointing at `langsmith-config` and `PYTHONUNBUFFERED=1`. **Fix:** add the missing key to Key Vault, rerun `make k8s-secrets`, restart the deployment.
+
+## Workload Identity
+
+### Pod panics: `AADSTS700213: No matching federated identity record found`
+
+**Symptom**
+
+```
+panic: blob-storage health-check failed: get container properties failed:
+DefaultAzureCredential: failed to acquire a token.
+WorkloadIdentityCredential authentication failed.
+  AADSTS700213: No matching federated identity record found for presented assertion subject
+  'system:serviceaccount:langsmith:langsmith-<service>'
+```
+
+**Cause:** The pod's Kubernetes ServiceAccount has no federated credential on the Azure Managed Identity. Every pod that accesses Blob Storage needs one.
+
+**Fix:** Add the missing ServiceAccount to `service_accounts_for_workload_identity` in `modules/k8s-cluster/main.tf`:
+
+```hcl
+service_accounts_for_workload_identity = [
+  "${var.langsmith_release_name}-backend",
+  "${var.langsmith_release_name}-platform-backend",
+  "${var.langsmith_release_name}-queue",
+  "${var.langsmith_release_name}-ingest-queue",
+  "${var.langsmith_release_name}-host-backend",                 # LangSmith Deployment add-on
+  "${var.langsmith_release_name}-listener",                     # LangSmith Deployment add-on
+  "${var.langsmith_release_name}-agent-builder-tool-server",    # Agent Builder add-on
+  "${var.langsmith_release_name}-agent-builder-trigger-server", # Agent Builder add-on
+]
+```
+
+```bash
+terraform apply -target=module.aks
+kubectl rollout restart deployment/langsmith-<service> -n langsmith
+```
+
+See the [architecture page](/langsmith/self-host-terraform-azure-architecture#workload-identity) for the full pod-to-WI mapping.
+
+## Teardown and cleanup
+
+### `make clean` before `make destroy` orphans infrastructure
+
+**Symptom:** `make destroy` after `make clean` fails with `No state file was found!`. Azure resources still run but Terraform has lost tracking.
+
+**Cause:** `make clean` removes `terraform.tfvars` and `secrets.auto.tfvars`. Without them, Terraform cannot initialize the backend.
+
+**Correct teardown order**
+
+```txt
+1. make uninstall   ← Helm + namespace
+2. make destroy     ← Azure infra (needs tfstate + tfvars)
+3. make clean       ← local secrets and generated files (LAST)
+```
+
+**Recovery when tfstate is gone**
+
+```bash
+az group delete --name langsmith-rg<identifier> --yes --no-wait
+az group show --name langsmith-rg<identifier> 2>&1 | grep -E "provisioningState|ResourceGroupNotFound"
+```
+
+If you reuse the same `identifier` afterwards, Azure may recover the soft-deleted Key Vault on the next `terraform apply`. With `keyvault_purge_protection = false`, purge first: `az keyvault purge --name langsmith-kv<identifier> --location <region>`.
+
+### `terraform destroy` stalls on VNet/subnet deletion
+
+**Cause:** The Azure Load Balancer provisioned by `ingress-nginx-controller` is not tracked by Terraform. Azure blocks VNet deletion while the LB holds a subnet reference.
+
+**Fix:** Run `make uninstall` first.
+
+```bash
+make uninstall
+kubectl delete namespace langsmith --timeout=60s
+make destroy
+```
+
+### `langsmith-agent-bootstrap` hook times out
+
+**Symptom:** Helm post-upgrade hook times out (`context deadline exceeded`). Agents progress through `QUEUED → AWAITING_DEPLOY → DEPLOYING` but do not reach `HEALTHY` within 20 minutes.
+
+**Cause:** On a cold cluster, the three LangGraph Platform (LGP) agents (`agent-builder`, `clio`, `smith-polly`) can take longer than 20 minutes for first image pulls. The Helm hook waits synchronously.
+
+**Fix:** This is not an actual failure: the resources have been applied and the agents continue deploying. Wait until the pods stabilize, then re-run `make deploy`.
+
+### `listener` pods OOMKilled
+
+**Cause:** `langsmith-values-sizing-dev.yaml` sets `listener.deployment.resources.limits.memory: 512Mi`. With Deployments enabled, the listener exceeds this.
+
+**Fix:** The `langsmith-values-agent-deploys.yaml` overlay correctly sets `4Gi`. Re-run `make init-values` to regenerate overlays.
+
+<Note>
+The chart uses `listener.deployment.resources` for container limits, not `listener.resources`. Setting `listener.resources` in an overlay is silently ignored.
+</Note>
+
+### Stale HPA scales `listener` or `host-backend` to max replicas
+
+**Cause:** A prior Helm revision created an HPA. Helm does not clean it up on failed hooks. On re-deploy with `enabled: false`, the stale HPA remains and overrides `replicas`.
+
+**Fix**
+
+```bash
+kubectl delete hpa langsmith-listener langsmith-host-backend -n langsmith 2>/dev/null || true
+kubectl scale deployment langsmith-listener -n langsmith --replicas=1
+kubectl scale deployment langsmith-host-backend -n langsmith --replicas=1
+make deploy
+```
+
+## AGIC (Application Gateway Ingress Controller)
+
+### AGIC pod CrashLoopBackOff — 403 on AGW GET
+
+**Symptom:** `ingress-appgw-deployment` is CrashLoopBackOff. Logs: `ErrorApplicationGatewayForbidden: does not have authorization to perform action Microsoft.Network/applicationGateways/read`.
+
+**Cause:** AKS creates a managed identity for the AGIC add-on (`ingressapplicationgateway-<cluster>` in the `MC_` resource group). The identity is created during cluster provisioning but takes ~5 minutes to register in Azure AD before role assignments take effect.
+
+**Fix:** The `k8s-cluster` module waits 300s after cluster creation (`time_sleep.agic_identity_propagation`) and creates the three required role assignments automatically. If AGIC still returns 403 after `make apply`:
+
+```bash
+az aks update --name <CLUSTER> --resource-group <RG> --yes
+kubectl delete pod -n kube-system -l app=ingress-azure
+```
+
+For manual role assignments (Reader on RG, Contributor on AGW, Network Contributor on VNet), see the [TROUBLESHOOTING.md source](https://github.com/langchain-ai/terraform/blob/main/modules/azure/TROUBLESHOOTING.md#agic-pod-crashloopbackoff--403-on-agw-get).
+
+### AGIC — `ApplicationGatewayInsufficientPermissionOnSubnet`
+
+**Cause:** AGIC add-on identity missing Network Contributor on the VNet.
+
+**Fix**
+
+```bash
+AGIC_OID=$(az aks show -g <RG> -n <CLUSTER> \
+  --query "addonProfiles.ingressApplicationGateway.identity.objectId" -o tsv)
+VNET_ID=$(az network vnet show -g <RG> -n <VNET> --query id -o tsv)
+
+az role assignment create --role "Network Contributor" --scope "$VNET_ID" \
+  --assignee-object-id "$AGIC_OID" --assignee-principal-type ServicePrincipal
+
+kubectl rollout restart deployment/ingress-appgw-deployment -n kube-system
+```
+
+### AGIC — `SecretNotFound` for TLS secret
+
+**Cause:** AGIC saw the Ingress before cert-manager issued the TLS certificate.
+
+**Fix:** Touch the Ingress to trigger re-sync:
+
+```bash
+kubectl get certificate langsmith-tls -n langsmith   # verify cert is ready
+kubectl annotate ingress langsmith-ingress -n langsmith touch="$(date +%s)" --overwrite
+```
+
+### AGIC — `ingressClassName: azure/application-gateway` rejected
+
+**Cause:** The value `azure/application-gateway` (with a slash) comes from the legacy annotation `kubernetes.io/ingress.class: azure/application-gateway` and is not a valid `ingressClassName`. AKS creates the `IngressClass` as `azure-application-gateway` (with a hyphen).
+
+**Fix:** Use `ingressClassName: azure-application-gateway`. `make init-values` sets this automatically.
+
+## Istio (self-managed Helm)
+
+### Istio site returns connection refused / no routes
+
+**Symptom:** Connection refused. `pilot-agent request GET config_dump` shows `LDS: PUSH resources:0`.
+
+**Root causes (all three must be fixed):**
+
+1. `meshConfig.ingressControllerMode` not set. Default is `DEFAULT`, which ignores `ingressClassName`. Must be `STRICT`.
+2. `istio` IngressClass resource missing.
+3. `meshConfig.ingressClass` not set to `istio`.
+
+**Fix:** All three are automated: Terraform sets both `meshConfig` values in the istiod Helm release, and `deploy.sh` creates the IngressClass. Manual fix: create the IngressClass and restart istiod.
+
+### Istio HTTPS returns "no peer certificate available"
+
+**Cause:** istiod reads the TLS secret via SDS (`kubernetes://langsmith-tls`). The secret must exist in `istio-system` (the gateway pod namespace). cert-manager issues it to the `langsmith` namespace; it is not copied automatically.
+
+**Fix:** `deploy.sh` syncs the secret post-deploy. Manual fix: copy the secret to `istio-system`.
+
+### Leftover CRDs from `istio-addon` block self-managed Helm install
+
+**Symptom:** `terraform apply` fails: `CustomResourceDefinition "wasmplugins.extensions.istio.io" exists and cannot be imported into the current release: invalid ownership metadata`.
+
+**Fix**
+
+```bash
+kubectl get crd | grep "istio.io" | awk '{print $1}' | xargs kubectl delete crd
+terraform apply
+```
+
+## Diagnostic commands
+
+### Cluster access
+
+```bash
+az aks get-credentials --name <cluster> --resource-group <rg>
+kubectl config current-context
+kubectl get nodes -o wide
+```
+
+### Pods
+
+```bash
+kubectl get pods -n langsmith
+kubectl get pods -n langsmith -w
+kubectl describe pod <pod-name> -n langsmith
+kubectl logs <pod-name> -n langsmith --tail=100 -f
+kubectl logs <pod-name> -n langsmith --previous --tail=50
+```
+
+### Ingress and TLS
+
+```bash
+kubectl get ingress -n langsmith
+kubectl get svc ingress-nginx-controller -n ingress-nginx
+kubectl get certificate -n langsmith
+kubectl get challenges -n langsmith
+kubectl get clusterissuer
+```
+
+### Workload Identity
+
+```bash
+kubectl get serviceaccount langsmith-ksa -n langsmith \
+  -o jsonpath='{.metadata.annotations.azure\.workload\.identity/client-id}'
+
+kubectl get pod <pod> -n langsmith \
+  -o jsonpath='{.metadata.labels.azure\.workload\.identity/use}'
+```
+
+### Helm
+
+```bash
+helm status langsmith -n langsmith
+helm history langsmith -n langsmith
+helm get values langsmith -n langsmith
+```
+
+### LangSmith Deployment
+
+```bash
+kubectl get pods -n langsmith | grep -E "host-backend|listener|operator"
+kubectl get lgp -n langsmith
+kubectl get crd | grep langchain
+```
+
+### Key Vault and Kubernetes Secrets
+
+```bash
+make keyvault list
+make keyvault validate
+make keyvault diff
+
+kubectl get secrets -n langsmith
+kubectl get secret langsmith-config-secret -n langsmith -o jsonpath='{.data}' | jq 'keys'
+```
+
+### Quick health check
+
+```bash
+make status         # 10-section automated check
+make status-quick   # skip Key Vault + K8s secret queries
+```
diff --git a/src/langsmith/self-host-terraform-azure-variables.mdx b/src/langsmith/self-host-terraform-azure-variables.mdx
new file mode 100644
index 0000000000..4ee5aee1a6
--- /dev/null
+++ b/src/langsmith/self-host-terraform-azure-variables.mdx
@@ -0,0 +1,150 @@
+---
+title: Azure Terraform variables reference
+sidebarTitle: Variables
+description: Complete reference of Terraform variables for LangSmith self-hosted on Azure AKS.
+---
+
+Reference for every input variable exposed by the [Azure Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/azure). Set non-sensitive variables in `infra/terraform.tfvars`. For sensitive variables (license key, passwords, encryption keys), `make setup-env` writes them to `infra/secrets.auto.tfvars` and Terraform stores them in Azure Key Vault.
+
+## Core
+
+| Variable | Default | Description |
+|---|---|---|
+| `subscription_id` | — | Azure subscription ID. Required. |
+| `location` | `eastus` | Azure region. |
+| `identifier` | `""` | Suffix appended to all resource names (for example, `-prod`, `-dev-dz`). Must start with a hyphen or be empty. |
+| `environment` | `dev` | Environment tag on all resources. |
+| `owner` | `""` | Owner tag applied to all resources. |
+| `cost_center` | `""` | Cost center tag for billing attribution. |
+
+## Deployment tier
+
+| Variable | Default | Description |
+|---|---|---|
+| `postgres_source` | `external` | `external` — Azure DB for PostgreSQL (private VNet). `in-cluster` — Helm chart manages its own Postgres pod (dev/demo only). |
+| `redis_source` | `external` | `external` — Azure Cache for Redis (private VNet). `in-cluster` — Helm chart manages its own Redis pod (dev/demo only). |
+| `clickhouse_source` | `in-cluster` | `in-cluster` — ClickHouse deployed as a Helm pod (dev/POC only). `external` — [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse), recommended for production. |
+
+## PostgreSQL
+
+| Variable | Default | Description |
+|---|---|---|
+| `postgres_admin_username` | `langsmith` | PostgreSQL admin username. |
+| `postgres_admin_password` | `""` | PostgreSQL admin password (sensitive). Set with `setup-env.sh`. |
+| `postgres_subnet_address_prefix` | `["10.0.32.0/20"]` | CIDR for the PostgreSQL subnet. |
+| `postgres_deletion_protection` | `true` | Prevent accidental PostgreSQL server deletion. Set `false` for dev/test. |
+| `database_name` | `langsmith` | PostgreSQL database to create. Used in the `connection_url` output. |
+
+## Redis
+
+| Variable | Default | Description |
+|---|---|---|
+| `redis_subnet_address_prefix` | `["10.0.48.0/20"]` | CIDR for the Redis subnet. |
+| `redis_capacity` | `2` | Redis cache capacity (size code within the SKU family). The default `2` corresponds to P2 (13 GB). |
+
+## AKS node pools
+
+| Variable | Default | Description |
+|---|---|---|
+| `default_node_pool_vm_size` | `Standard_D8s_v3` | AKS node VM size (8 vCPU, 32 GB). Use `Standard_D4s_v3` for light/demo only. |
+| `default_node_pool_min_count` | `1` | Min nodes for the default pool. Set to 3 for production. Set to 5 before enabling the LangSmith Deployment add-on. |
+| `default_node_pool_max_count` | `10` | Max nodes for the autoscaler. |
+| `additional_node_pools` | `large: D16s_v3 0–2` | Extra node pools. Default includes a `large` pool (`Standard_D16s_v3`, 16 vCPU, 64 GB) scaled to zero when idle. Required for ClickHouse (15 GB RAM request). |
+| `aks_service_cidr` | `10.0.64.0/20` | Kubernetes ClusterIP range. Must not overlap the VNet. |
+| `aks_dns_service_ip` | `10.0.64.10` | CoreDNS service IP. Must be within `aks_service_cidr`. |
+| `aks_deletion_protection` | `true` | Prevent accidental AKS cluster deletion. Set `false` for dev/test. |
+| `availability_zones` | `["1"]` | Availability zones for AKS node pools (for example, `["1", "2", "3"]`). Set to `[]` to disable zone pinning. |
+
+## Ingress controller
+
+| Variable | Default | Description |
+|---|---|---|
+| `ingress_controller` | `nginx` | Ingress controller: `nginx` \| `istio-addon` \| `istio` \| `agic` \| `envoy-gateway`. See `INGRESS_CONTROLLERS.md` in the [module repo](https://github.com/langchain-ai/terraform/blob/main/modules/azure/INGRESS_CONTROLLERS.md) for the full TLS compatibility matrix. |
+
+## DNS and TLS
+
+| Variable | Default | Description |
+|---|---|---|
+| `dns_label` | `""` | Azure Public IP DNS label for the ingress LoadBalancer. Results in `<label>.<region>.cloudapp.azure.com`. Works with nginx, istio, istio-addon, envoy-gateway. |
+| `langsmith_domain` | `""` | Custom hostname for LangSmith (for example, `langsmith.example.com`). Takes priority over `dns_label`. |
+| `tls_certificate_source` | `letsencrypt` | `letsencrypt` — HTTP-01 with cert-manager. `dns01` — DNS-01 with Azure DNS + Workload Identity. `none` — no TLS. |
+| `letsencrypt_email` | `""` | Email for Let's Encrypt notifications. Required when `tls_certificate_source` is `letsencrypt` or `dns01`. |
+| `cert_manager_identity_client_id` | `""` | Client ID of the cert-manager Managed Identity. Wired automatically from the `k8s-cluster` output. Required when `tls_certificate_source = "dns01"`. |
+| `create_dns_zone` | `false` | Enable Azure DNS zone + A record. Required for DNS-01 cert issuance. |
+| `dns_zone_name` | `""` | Azure DNS zone name (for example, `langsmith.mycompany.com`). Required when `tls_certificate_source = "dns01"`. |
+| `dns_resource_group_name` | `""` | Resource group containing the Azure DNS zone. Required when `tls_certificate_source = "dns01"`. |
+
+## LangSmith application
+
+| Variable | Default | Description |
+|---|---|---|
+| `langsmith_namespace` | `langsmith` | Kubernetes namespace for LangSmith workloads. |
+| `langsmith_release_name` | `langsmith` | Helm release name (used for Workload Identity federated credential subjects). |
+| `langsmith_helm_chart_version` | `""` | Pin a specific Helm chart version. Empty = use latest. |
+| `sizing_profile` | `production` | Helm sizing overlay: `minimum` \| `dev` \| `production` \| `production-large`. Read by `init-values.sh`; Terraform ignores this value. |
+
+## Blob Storage
+
+| Variable | Default | Description |
+|---|---|---|
+| `blob_ttl_enabled` | `true` | Enable lifecycle TTL rules on the blob container. |
+| `blob_ttl_short_days` | `14` | TTL for short-lived trace blobs. |
+| `blob_ttl_long_days` | `400` | TTL for long-lived trace blobs. |
+
+## Key Vault
+
+| Variable | Default | Description |
+|---|---|---|
+| `keyvault_name` | `""` | Override Key Vault name (default: `langsmith-kv<identifier>`). |
+| `keyvault_purge_protection` | `true` | Enable Key Vault purge protection. Set `false` for dev/test to allow immediate name reuse after destroy. |
+
+## Network (BYO VNet)
+
+| Variable | Default | Description |
+|---|---|---|
+| `create_vnet` | `true` | Create a new VNet. Set `false` to bring your own. |
+| `vnet_id` | `""` | Existing VNet resource ID. Required when `create_vnet = false`. |
+
+## High availability
+
+| Variable | Default | Description |
+|---|---|---|
+| `postgres_high_availability_mode` | `""` | PostgreSQL HA mode (for example, `ZoneRedundant`). Requires `GeneralPurpose` or `MemoryOptimized` SKU. |
+| `postgres_standby_availability_zone` | `""` | Zone for the PostgreSQL standby replica. Set when enabling zone-redundant HA. |
+
+## Optional modules
+
+| Variable | Default | Description |
+|---|---|---|
+| `create_waf` | `false` | Enable Azure WAF policy (OWASP 3.2 + bot protection). Safe to add post-deploy. |
+| `create_diagnostics` | `false` | Enable Log Analytics workspace + diagnostic settings for AKS, Key Vault, and PostgreSQL. Recommended for production. |
+| `enable_aks_diag` | `true` | Create the AKS diagnostic setting inside the diagnostics module. |
+| `enable_keyvault_diag` | `true` | Create the Key Vault diagnostic setting inside the diagnostics module. |
+| `enable_postgres_diag` | `false` | Create the PostgreSQL diagnostic setting. Set `true` when `postgres_source = "external"`. |
+| `create_bastion` | `false` | Enable a jump VM for private AKS cluster access with `az ssh vm`. No public IP required. |
+
+## Add-on flags
+
+`init-values.sh` and `deploy.sh` read these flags; Terraform ignores them. The flags affect which Helm add-on overlay files the scripts generate.
+
+| Variable | Default | Description |
+|---|---|---|
+| `enable_deployments` | `false` | Enable LangSmith Deployment (host-backend, listener, operator). See the [LangSmith Deployment add-on](/langsmith/self-host-terraform-azure-deploy#langsmith-deployment). Scale `default_node_pool_min_count` to 5 first. |
+| `enable_agent_builder` | `false` | Enable Agent Builder UI. Requires `enable_deployments = true`. See the [Agent Builder add-on](/langsmith/self-host-terraform-azure-deploy#agent-builder). |
+| `enable_insights` | `false` | Enable Insights / Clio analytics. Requires `enable_deployments = true`. See the [Insights and Polly add-on](/langsmith/self-host-terraform-azure-deploy#insights-and-polly). |
+| `enable_polly` | `false` | Enable Polly AI eval agent. Requires `enable_deployments = true`. See the [Insights and Polly add-on](/langsmith/self-host-terraform-azure-deploy#insights-and-polly). |
+
+## Sensitive variables (set with `setup-env.sh`)
+
+`make setup-env` writes these to `secrets.auto.tfvars` and Terraform stores them in Azure Key Vault. Never set these inline in `terraform.tfvars`.
+
+| Variable | Description |
+|---|---|
+| `langsmith_license_key` | LangSmith enterprise license key. |
+| `langsmith_admin_password` | Initial org admin password. |
+| `langsmith_api_key_salt` | Salt for hashing API keys. Must stay stable after first deploy. |
+| `langsmith_jwt_secret` | JWT secret for Basic Auth sessions. |
+| `langsmith_deployments_encryption_key` | Fernet key for the LangSmith Deployment add-on. Must never change. |
+| `langsmith_agent_builder_encryption_key` | Fernet key for the Agent Builder add-on. Must never change. |
+| `langsmith_insights_encryption_key` | Fernet key for the Insights add-on. Must never change. |
+| `langsmith_polly_encryption_key` | Fernet key for Polly. Must never change. |
diff --git a/src/langsmith/self-host-terraform-gcp-architecture.mdx b/src/langsmith/self-host-terraform-gcp-architecture.mdx
new file mode 100644
index 0000000000..cbb24c2305
--- /dev/null
+++ b/src/langsmith/self-host-terraform-gcp-architecture.mdx
@@ -0,0 +1,305 @@
+---
+title: GCP Terraform architecture
+sidebarTitle: Architecture
+description: Platform layers, services, Workload Identity, networking, and module dependencies for LangSmith self-hosted on GKE.
+---
+
+This page documents what the [GCP Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/gcp) provision and how the modules wire the resulting deployment together.
+
+## Platform layers
+
+LangSmith on GCP deploys in up to five passes. Each pass adds a capability layer on top of the previous. All layers share the same GKE cluster and `langsmith` namespace.
+
+<img src="/images/self-hosted-terraform/gcp-architecture.png" alt="LangSmith on GCP deployment passes and service layout" />
+
+| Pass | Layer | What it adds |
+|---|---|---|
+| 1 | GCP infrastructure | VPC, GKE, Cloud SQL, Memorystore, GCS, K8s bootstrap, cert-manager, KEDA, Envoy Gateway |
+| 2 | LangSmith base | frontend, backend, platform-backend, queue, ace-backend, clickhouse, playground |
+| 3 | LangSmith Deployment | host-backend, listener, operator + per-deployment pods |
+| 4 | Agent Builder | agent-builder-tool-server, agent-builder-trigger-server + deep-agent LGP |
+| 5 | Insights + Polly | Clio analytics (ClickHouse-backed), Polly eval agent |
+
+## Module descriptions
+
+| Module | Path | Purpose |
+|---|---|---|
+| `networking` | `modules/networking/` | VPC, subnet with secondary ranges, Cloud Router, Cloud NAT, private service connection for Cloud SQL and Memorystore |
+| `k8s-cluster` | `modules/k8s-cluster/` | GKE Standard or Autopilot cluster, node pool with autoscaling, Workload Identity enabled |
+| `postgres` | `modules/postgres/` | Cloud SQL PostgreSQL instance, HA standby replica, private IP, deletion protection |
+| `redis` | `modules/redis/` | Memorystore Redis STANDARD_HA tier, private IP within VPC |
+| `storage` | `modules/storage/` | GCS bucket with lifecycle rules for `ttl_s/` (14 days) and `ttl_l/` (400 days) prefixes |
+| `k8s-bootstrap` | `modules/k8s-bootstrap/` | `langsmith` namespace, Kubernetes Secrets for Postgres and Redis URLs, cert-manager and KEDA Helm releases |
+| `ingress` | `modules/ingress/` | Envoy Gateway Helm release, GatewayClass, HTTPRoute, optional HTTPS Gateway listener |
+| `iam` | `modules/iam/` | GCP service accounts and Workload Identity bindings for GCS access (wired by default) |
+| `dns` | `modules/dns/` | Cloud DNS managed zone and managed cert (optional, enable with `enable_dns_module`) |
+| `secrets` | `modules/secrets/` | Secret Manager secret bundle (optional, enable with `enable_secret_manager_module`) |
+
+## Deployment tiers
+
+### Light deploy (all in-cluster)
+
+```txt
+VPC
+└── subnet (10.0.0.0/20 — GKE nodes only)
+    No Cloud SQL or Memorystore — chart pods handle both
+
+GKE Cluster
+├── langsmith namespace
+│   ├── frontend, backend, platform-backend, queue, ace-backend, playground
+│   ├── clickhouse (in-cluster)
+│   ├── postgres   (in-cluster)
+│   └── redis      (in-cluster)
+├── cert-manager
+├── keda
+└── envoy-gateway-system
+
+GCS Bucket (trace payloads, always external)
+```
+
+Set in `terraform.tfvars`:
+
+```hcl
+postgres_source   = "in-cluster"
+redis_source      = "in-cluster"
+clickhouse_source = "in-cluster"
+```
+
+### Production (external managed services)
+
+```txt
+VPC
+├── subnet (10.0.0.0/20 — GKE nodes, pods, services)
+│   └── Secondary ranges: pods 10.4.0.0/14, services 10.8.0.0/20
+└── Private service connection (VPC peering to Google managed network)
+    ├── Cloud SQL PostgreSQL  (private IP, regional standby)
+    └── Memorystore Redis     (private IP, STANDARD_HA tier)
+
+GKE Cluster
+├── langsmith namespace
+│   ├── frontend, backend, platform-backend, queue, ace-backend, playground
+│   └── clickhouse (in-cluster — use LangChain Managed for production scale)
+├── cert-manager
+├── keda
+└── envoy-gateway-system
+
+GCS Bucket (Workload Identity, no static keys)
+```
+
+## Application core services
+
+| Service | Purpose | Port | HPA | Workload Identity | Depends on |
+|---|---|---|---|---|---|
+| `langsmith-frontend` | React UI | 3000 | 1 to 10 | No | `backend`, `platform-backend` |
+| `langsmith-backend` | Main API (traces, runs, projects, API keys, feedback) | 1984 | 3 to 10 | Yes (GCS) | Postgres, Redis, ClickHouse, GCS |
+| `langsmith-platform-backend` | Org and user management, auth, billing, settings | 1986 | 1 to 10 | Yes (GCS) | Postgres, Redis, GCS |
+| `langsmith-playground` | LLM prompt playground UI | 3001 | 1 to 10 | No | `backend` |
+| `langsmith-queue` | Trace ingestion worker (Redis to ClickHouse + GCS) | — | 3 to 10 + KEDA | Yes (GCS) | Redis, ClickHouse, GCS |
+| `langsmith-ingest-queue` | Dedicated high-throughput ingestion worker | — | 3 to 10 + KEDA | Yes (GCS) | Redis, GCS |
+| `langsmith-ace-backend` | Async compute (dataset runs, evaluations, background jobs) | — | 1 to 5 | No | Postgres, Redis |
+| `langsmith-clickhouse` | Columnar store (trace spans, run metadata, eval results) | — | StatefulSet, single replica | No | 500Gi `premium-rwo` PVC |
+
+<Warning>
+In-cluster ClickHouse is dev/POC only (single pod, no replication, no backups). For production, use [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse) or a self-managed external cluster.
+</Warning>
+
+### One-time jobs
+
+| Job | Purpose |
+|---|---|
+| `langsmith-backend-migrations` | PostgreSQL schema migrations |
+| `langsmith-backend-ch-migrations` | ClickHouse schema migrations |
+| `langsmith-backend-auth-bootstrap` | Creates the initial org and admin account |
+
+## LangSmith Deployment add-on
+
+| Service | Purpose | Workload Identity |
+|---|---|---|
+| `langsmith-host-backend` | LangGraph control plane API. Manages deployment lifecycle, serves deployment metadata. | Yes (GCS) |
+| `langsmith-listener` | Watches host-backend for state changes, creates and updates `LangGraphPlatform` CRDs. | Yes (GCS) |
+| `langsmith-operator` | Kubernetes operator. Reconciles `LangGraphPlatform` CRDs, creates and deletes Deployments and Services. | RBAC for Deployments and Services |
+
+Each LangGraph deployment created in the UI produces a Kubernetes Deployment in the `langsmith` namespace, with pods running as the `langsmith-ksa` ServiceAccount. That ServiceAccount must carry the `iam.gke.io/gcp-service-account` annotation, which `deploy.sh` applies idempotently.
+
+## GCP managed services
+
+When `postgres_source = "external"` and `redis_source = "external"` (the recommended production setting), Terraform provisions:
+
+### Cloud SQL PostgreSQL
+
+- Default size `db-custom-2-8192` (2 vCPU, 8 GB), private IP, port 5432.
+- REGIONAL availability with automatic failover.
+- Holds orgs, users, projects, API keys, settings.
+- Terraform writes the connection URL directly to the `langsmith-postgres` Kubernetes Secret.
+
+### Memorystore Redis
+
+- Default 5 GB, STANDARD_HA tier, private IP, port 6379.
+- Trace ingestion queue, pub/sub, short-lived cache.
+- No auth token required. Access is controlled by VPC private IP only.
+- Terraform writes the connection URL directly to the `langsmith-redis` Kubernetes Secret.
+
+### Cloud Storage bucket
+
+- Trace payloads: large inputs and outputs, attachments.
+- Accessed via the S3-compatible API (`apiURL: https://storage.googleapis.com`, `engine: S3`).
+- HMAC keys are required for the S3-compatible API even with Workload Identity. Create an HMAC key under Cloud Storage → Settings → Interoperability and pass its access key and secret to Helm via `config.blobStorage.accessKey` and `config.blobStorage.accessKeySecret`.
+- Lifecycle rules: `ttl_s/` prefix (14 days default), `ttl_l/` prefix (400 days default).
+- Always required.
+
+### Secret Manager (optional module)
+
+- Stores Postgres password and generated secrets (LangSmith secret key, JWT secret) when `enable_secret_manager_module = true`.
+- Core secrets (`langsmith-postgres`, `langsmith-redis`) are always stored in Kubernetes Secrets by `k8s-bootstrap` regardless of this module. Secret Manager provides an additional durable store for secrets that must survive cluster recreation.
+
+## Cluster infrastructure
+
+| Service | Namespace | Installed by | Required for |
+|---|---|---|---|
+| Envoy Gateway | `envoy-gateway-system` | `ingress` module (`install_ingress = true`, default) | All ingress |
+| KEDA | `keda` | `k8s-bootstrap` module when `enable_langsmith_deployment = true` | LangSmith Deployment add-on and later passes |
+| cert-manager | `cert-manager` | `k8s-bootstrap` module when `tls_certificate_source = "letsencrypt"` or `install_cert_manager = true` | Let's Encrypt TLS |
+| External Secrets Operator | `external-secrets` | `k8s-bootstrap` module | Custom secret workflows (optional) |
+
+<Note>
+The `Gateway` resource is managed by Terraform; the `HTTPRoute` is managed by Helm. Do not delete the Gateway resource manually. GCP releases the external IP when the Gateway is deleted, and a new IP is issued on recreate.
+</Note>
+
+## Workload Identity
+
+GKE pods access GCS through Workload Identity. The Kubernetes ServiceAccount is bound to a GCP service account via an IAM binding; pods receive temporary credentials with no static keys in Secrets or environment variables.
+
+```txt
+GKE pod
+  └── Kubernetes ServiceAccount (annotated with iam.gke.io/gcp-service-account)
+        └── IAM binding: roles/iam.workloadIdentityUser
+              └── GCP Service Account
+                    └── roles/storage.objectAdmin on the GCS bucket
+```
+
+| Component | Annotation | Permissions |
+|---|---|---|
+| `langsmith-backend` | `iam.gke.io/gcp-service-account: <gsa>` | GCS `storage.objectAdmin` on the LangSmith bucket |
+| `langsmith-platform-backend` | Same | GCS `storage.objectAdmin` |
+| `langsmith-queue` | Same | GCS `storage.objectAdmin` |
+| `langsmith-ingest-queue` | Same | GCS `storage.objectAdmin` |
+| `langsmith-host-backend` | Same | GCS `storage.objectAdmin` |
+| `langsmith-listener` | Same | GCS `storage.objectAdmin` |
+| `langsmith-ksa` (pods of LangGraph deployments created by the operator) | Same | GCS `storage.objectAdmin` |
+
+The GCP service account (GSA) is defined by the `iam` module and output as `workload_identity_annotation`. `init-values.sh` writes these annotations into `values-overrides.yaml` automatically.
+
+GCS access via the S3-compatible API requires HMAC keys in addition to Workload Identity. Create the HMAC key under Cloud Storage → Settings → Interoperability and pass it to Helm.
+
+## Network topology
+
+| Range | CIDR | Used by |
+|---|---|---|
+| Subnet | `10.0.0.0/20` | GKE nodes |
+| Pods | `10.4.0.0/14` | GKE pod IPs (secondary range) |
+| Services | `10.8.0.0/20` | GKE ClusterIP services (secondary range) |
+| Private service connection | `/16` allocated by Google | Cloud SQL, Memorystore private IPs |
+
+Cloud SQL and Memorystore are accessed exclusively via private IP. The networking module establishes a private service connection (VPC peering to Google's managed network) whenever `postgres_source = "external"` or `redis_source = "external"`.
+
+## Traffic flow
+
+```txt
+Internet (HTTPS :443)
+  ↓
+Envoy Gateway  (envoy-gateway-system, external LoadBalancer IP)
+  TLS terminated — cert-manager + Let's Encrypt or existing certificate
+  │
+  ├── /                     → frontend:80
+  ├── /api/*                → backend:1984
+  └── /api/v1/deployments/* → host-backend:1985  (LangSmith Deployment add-on)
+
+Internal traffic (private IPs, never leaving VPC):
+  backend       → Cloud SQL:5432    via private IP
+  backend       → Memorystore:6379  via private IP
+  backend       → GCS               via Workload Identity + HMAC keys
+  host-backend  → K8s API           reads deployment pod status
+  listener      → K8s API           creates and updates LangGraphPlatform CRDs
+  operator      → K8s API           creates and manages deployment pods
+```
+
+## Component to storage mapping
+
+| Component | PostgreSQL | Redis | ClickHouse | GCS |
+|---|---|---|---|---|
+| `backend` | Org config, run metadata | Ingestion queue | — | Trace objects |
+| `platform-backend` | — | — | — | Blob routing |
+| `queue` | — | Pops jobs | — | Writes trace blobs |
+| `clickhouse` | — | — | Trace search index | — |
+| `host-backend` | Deployment lifecycle state | — | — | — |
+
+## Secret Manager integration
+
+Without Secret Manager:
+
+```txt
+terraform.tfvars → terraform apply → kubernetes_secret (postgres, redis)
+```
+
+With Secret Manager:
+
+```txt
+terraform.tfvars → terraform apply → Secret Manager secrets
+                                       → ESO (External Secrets Operator)
+                                         → kubernetes_secret (langsmith namespace)
+```
+
+## Terraform module graph
+
+```txt
+google_project_service (APIs enabled)
+  └── module.networking
+        ├── module.gke_cluster
+        │     └── null_resource.wait_for_cluster
+        │           ├── module.cloudsql      (count = postgres_source == "external")
+        │           ├── module.redis         (count = redis_source    == "external")
+        │           ├── module.storage
+        │           ├── module.iam           (count = enable_gcp_iam_module)
+        │           ├── module.secrets       (count = enable_secret_manager_module)
+        │           ├── module.dns           (count = enable_dns_module)
+        │           ├── module.k8s_bootstrap
+        │           └── module.ingress       (count = install_ingress)
+        └── (private_service_connection when external services)
+```
+
+LangSmith itself is not deployed by Terraform; the chart is installed in the application stage via `helm upgrade --install`.
+
+## Verification commands
+
+```bash
+# Cluster connectivity
+gcloud container clusters get-credentials <cluster-name> --region <region> --project <project-id>
+kubectl cluster-info
+kubectl get nodes -o wide
+
+# All LangSmith pods
+kubectl get pods -n langsmith
+
+# Envoy Gateway
+kubectl get pods -n envoy-gateway-system
+kubectl get svc -n envoy-gateway-system
+
+# cert-manager
+kubectl get pods -n cert-manager
+kubectl get certificate -n langsmith
+
+# KEDA (LangSmith Deployment add-on)
+kubectl get pods -n keda
+
+# Cloud SQL connectivity test
+kubectl run psql-test --rm -it --image=postgres:15 -n langsmith -- \
+  psql "postgresql://langsmith:<password>@<cloud-sql-private-ip>:5432/langsmith" -c "SELECT version();"
+
+# Memorystore connectivity test
+kubectl run redis-test --rm -it --image=redis:7 -n langsmith -- \
+  redis-cli -h <redis-private-ip> ping
+
+# GCS connectivity test
+kubectl run gcs-test --rm -it --image=google/cloud-sdk -n langsmith -- \
+  gsutil ls gs://<bucket-name>
+```
diff --git a/src/langsmith/self-host-terraform-gcp-deploy.mdx b/src/langsmith/self-host-terraform-gcp-deploy.mdx
new file mode 100644
index 0000000000..eaa674547b
--- /dev/null
+++ b/src/langsmith/self-host-terraform-gcp-deploy.mdx
@@ -0,0 +1,478 @@
+---
+title: Deploy LangSmith on GCP with Terraform
+sidebarTitle: Deploy
+description: End-to-end walkthrough for provisioning LangSmith self-hosted on GCP GKE using the LangChain Terraform modules.
+---
+
+Provision the GCP cloud foundation and install LangSmith with the public Terraform modules at [github.com/langchain-ai/terraform/tree/main/modules/gcp](https://github.com/langchain-ai/terraform/tree/main/modules/gcp). Plan for 35 to 45 minutes end to end on a clean project.
+
+The deployment runs in two stages: infrastructure (Terraform provisions VPC, GKE, Cloud SQL, Memorystore, GCS, Workload Identity) and application (Helm installs the LangSmith chart against the cluster). Add-ons are enabled with a flag and a redeploy.
+
+## Prerequisites
+
+### Required tools
+
+| Tool | Minimum version | Purpose |
+|---|---|---|
+| Google Cloud SDK (`gcloud`) | 450 | Authenticate, query GCP resources, manage GKE credentials |
+| Terraform | 1.5 | Run the infrastructure modules |
+| `kubectl` | 1.28 | Inspect the GKE cluster |
+| Helm | 3.12 | Install and manage the LangSmith chart |
+
+Install on macOS:
+
+```bash
+brew install --cask google-cloud-sdk
+brew install kubectl helm
+brew tap hashicorp/tap && brew install hashicorp/tap/terraform
+
+gcloud version
+terraform version
+kubectl version --client
+helm version
+```
+
+### Required GCP APIs
+
+Terraform enables these automatically on first apply, but `cloudresourcemanager.googleapis.com` must be enabled first so Terraform can enable the rest. To speed up the first run, enable all of them manually:
+
+```bash
+gcloud services enable \
+  container.googleapis.com \
+  sqladmin.googleapis.com \
+  redis.googleapis.com \
+  storage.googleapis.com \
+  iam.googleapis.com \
+  secretmanager.googleapis.com \
+  certificatemanager.googleapis.com \
+  servicenetworking.googleapis.com \
+  cloudresourcemanager.googleapis.com \
+  --project <your-project-id>
+```
+
+### Required IAM roles
+
+The principal running Terraform needs the following roles on the target project. Trim to least-privilege after the initial deployment is stable.
+
+| Role | Purpose |
+|---|---|
+| `roles/container.admin` | Create and manage GKE clusters |
+| `roles/compute.networkAdmin` | Create VPC, subnets, firewall rules |
+| `roles/iam.serviceAccountAdmin` | Create service accounts for Workload Identity |
+| `roles/cloudsql.admin` | Create and manage Cloud SQL instances |
+| `roles/redis.admin` | Create and manage Memorystore Redis |
+| `roles/storage.admin` | Create GCS buckets and lifecycle policies |
+| `roles/resourcemanager.projectIamAdmin` | Grant IAM bindings during provisioning |
+| `roles/servicenetworking.networksAdmin` | Create private service connections (required for Cloud SQL and Redis) |
+
+### Authenticate
+
+```bash
+gcloud auth login
+gcloud config set project <your-project-id>
+gcloud auth application-default login
+```
+
+You also need a LangSmith license key ([contact sales](https://www.langchain.com/contact-sales)) and a domain or subdomain whose DNS you control, so you can create an A record pointing at the Gateway's external IP after deployment.
+
+## Rapid path
+
+For the fastest path from zero to a running LangSmith instance, run these commands in order:
+
+```bash
+# 1. Clone the public modules
+git clone https://github.com/langchain-ai/terraform.git
+cd terraform/modules/gcp
+
+# 2. Generate terraform.tfvars interactively (Enter accepts current values)
+make quickstart
+
+# 3. Load secrets into Secret Manager
+#    Must be sourced, not executed
+source infra/scripts/setup-env.sh
+
+# 4. Validate environment
+make preflight
+
+# 5. Provision infrastructure (~25 to 35 min)
+make init
+make plan
+make apply
+
+# 6. Configure kubectl
+make kubeconfig
+kubectl get nodes
+
+# 7. Deploy LangSmith via Helm (~8 to 12 min)
+make init-values
+make deploy
+
+# 8. Get the Gateway IP for DNS
+kubectl get gateway -n langsmith \
+  -o jsonpath='{.items[0].status.addresses[0].value}'
+```
+
+The sections below cover each phase in detail.
+
+## Provision infrastructure
+
+Provisioning the GCP cloud foundation takes 25 to 35 minutes on a clean project. Do not interrupt the apply.
+
+### What gets provisioned
+
+| Resource | Purpose |
+|---|---|
+| VPC + subnet + Cloud NAT | Private network for the cluster and managed services |
+| Private service connection | VPC peering for Cloud SQL and Memorystore private IPs |
+| GKE cluster (Standard or Autopilot) | Kubernetes compute, Workload Identity enabled |
+| Cloud SQL PostgreSQL | LangSmith operational data, HA standby, private IP |
+| Memorystore Redis | Queue and cache, STANDARD_HA tier, private IP |
+| GCS bucket | Trace payload blob storage, lifecycle rules |
+| Workload Identity service account | Per-pod GCP access without static keys |
+| cert-manager, KEDA, Envoy Gateway | Bootstrap workloads installed alongside infrastructure |
+
+### Clone and configure
+
+```bash
+git clone https://github.com/langchain-ai/terraform.git
+cd terraform/modules/gcp
+```
+
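+All subsequent `make` commands run from `modules/gcp/`. Code blocks that begin with `cd modules/gcp` or `cd modules/gcp/infra` give the path relative to the repository root. Run `make help` for the full target list.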
+
+Generate `terraform.tfvars` with the interactive wizard:
+
+```bash
+make quickstart
+```
+
+The wizard prompts for project ID, naming prefix, region, GKE sizing, TLS source, external vs in-cluster services, and the optional add-on flags. It writes `infra/terraform.tfvars`. Re-running the wizard pre-selects the existing values; press Enter at each prompt to keep the current config.
+
+To edit the file manually instead, start from the example:
+
+```bash
+cp infra/terraform.tfvars.example infra/terraform.tfvars
+vi infra/terraform.tfvars
+```
+
+The minimum required variables:
+
+```hcl
+project_id            = "<your-gcp-project-id>"
+name_prefix           = "ls"
+environment           = "prod"
+langsmith_license_key = "<your-license-key>"
+langsmith_domain      = "langsmith.example.com"
+
+region = "us-west2"
+zone   = "us-west2-a"
+
+postgres_source   = "external"
+postgres_password = "<strong-password>"   # or: export TF_VAR_postgres_password=...
+
+redis_source = "external"
+
+clickhouse_source = "in-cluster"
+
+tls_certificate_source = "letsencrypt"
+letsencrypt_email      = "ops@example.com"
+
+enable_langsmith_deployment = true
+```
+
+See the [GCP variables reference](/langsmith/self-host-terraform-gcp-variables) for every input variable.
+
+<Tip>
+Configure a remote state backend before applying. Copy `infra/backend.tf.example` to `infra/backend.tf` and point it at a GCS bucket you control. Local state is fragile and easily lost during directory restructuring.
+</Tip>
+
+### Load secrets into Secret Manager
+
+```bash
+source infra/scripts/setup-env.sh
+```
+
+The script reads `terraform.tfvars`, derives the secret prefix, and for each secret either reuses an exported value, reads the existing Secret Manager secret, auto-generates one (for salts and Fernet keys), or prompts you. The license key and admin password are the two values you supply interactively. Source the script rather than executing it or wrapping it in a `make` target: a child process cannot export environment variables back to the parent shell, so the exported values would be lost.
+
+Verify the secrets are present:
+
+```bash
+make secrets
+```
+
+### Preflight checks
+
+```bash
+make preflight
+```
+
+`make preflight` validates that the active `gcloud` credentials can perform each required action, that the required GCP APIs are enabled, and that the target region has the SKUs the modules request. Catching gaps here is faster than discovering them mid-`terraform apply`.
+
+### Apply
+
+```bash
+make init
+make plan
+make apply
+```
+
+`make plan` shows the proposed diff. Review the output before applying. `make apply` provisions in dependency order: VPC and networking, then GKE (about 10 to 15 minutes), private service connection, Cloud SQL (about 10 minutes with HA), Memorystore, GCS, and the bootstrap workloads.
+
+Equivalent direct Terraform flow:
+
+```bash
+cd modules/gcp/infra
+
+terraform init
+terraform plan -var-file=terraform.tfvars
+terraform apply -var-file=terraform.tfvars
+```
+
+### Configure kubectl
+
+```bash
+make kubeconfig
+kubectl get nodes
+```
+
+All nodes should report `Ready`.
+
+### Verify bootstrap components
+
+```bash
+kubectl get pods -n cert-manager
+kubectl get pods -n keda
+kubectl get secrets -n langsmith
+```
+
+cert-manager, KEDA, and the LangSmith namespace secrets should all be in place.
+
+## Deploy LangSmith
+
+Two paths are supported. Pick one.
+
+### Script-driven Helm deploy (recommended)
+
+Two commands install the LangSmith chart with sensible defaults wired from Terraform outputs:
+
+```bash
+cd modules/gcp
+
+make init-values
+make deploy
+```
+
+`init-values.sh` prompts for the admin email, then reads `sizing_profile` and the `enable_*` flags from `terraform.tfvars` and copies matching values files from `helm/values/examples/` into `helm/values/`. It also generates `values-overrides.yaml` with your hostname, Workload Identity annotations, and GCS bucket name.
+
+`make deploy` runs `helm/scripts/deploy.sh`, which refreshes the kubeconfig, runs preflight checks, applies the layered values files, and runs `helm upgrade --install`.
+
+Expect 8 to 12 minutes for the chart to install and pods to become ready.
+
+### Manual Helm install
+
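+Best for teams running `helm` directly without the scripts. Run these commands from `modules/gcp/infra` so that `terraform output` and the relative `../helm/values/` paths resolve. Generate the required secrets first (the Fernet key commands require the Python `cryptography` package):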
+
+```bash
+export API_KEY_SALT=$(openssl rand -base64 32)
+export JWT_SECRET=$(openssl rand -base64 32)
+export AGENT_BUILDER_ENCRYPTION_KEY=$(python3 -c \
+  "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())")
+export INSIGHTS_ENCRYPTION_KEY=$(python3 -c \
+  "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())")
+export ADMIN_EMAIL="admin@example.com"
+export ADMIN_PASSWORD="<strong-password>"
+
+# GCS HMAC credentials (create in GCP Console: Storage > Settings > Interoperability)
+export GCS_ACCESS_KEY="<your-hmac-access-key>"
+export GCS_ACCESS_SECRET="<your-hmac-secret>"
+```
+
+```bash
+helm repo add langchain https://langchain-ai.github.io/helm
+helm repo update
+
+helm upgrade --install langsmith langchain/langsmith \
+  --namespace langsmith \
+  --create-namespace \
+  -f ../helm/values/values.yaml \
+  -f ../helm/values/values-overrides.yaml \
+  --set config.langsmithLicenseKey="<your-license-key>" \
+  --set config.apiKeySalt="$API_KEY_SALT" \
+  --set config.basicAuth.jwtSecret="$JWT_SECRET" \
+  --set config.hostname="<your-langsmith-domain>" \
+  --set config.basicAuth.initialOrgAdminEmail="$ADMIN_EMAIL" \
+  --set config.basicAuth.initialOrgAdminPassword="$ADMIN_PASSWORD" \
+  --set config.agentBuilder.encryptionKey="$AGENT_BUILDER_ENCRYPTION_KEY" \
+  --set config.insights.encryptionKey="$INSIGHTS_ENCRYPTION_KEY" \
+  --set config.blobStorage.bucketName="$(terraform output -raw storage_bucket_name)" \
+  --set config.blobStorage.accessKey="$GCS_ACCESS_KEY" \
+  --set config.blobStorage.accessKeySecret="$GCS_ACCESS_SECRET" \
+  --set gateway.enabled=true \
+  --set ingress.enabled=false \
+  --wait --timeout 15m
+```
+
+### Verify and configure DNS
+
+```bash
+kubectl get pods -n langsmith
+
+EXTERNAL_IP=$(kubectl get svc -n envoy-gateway-system \
+  -l gateway.envoyproxy.io/owning-gateway-name=langsmith-gateway \
+  -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}')
+
+echo "Create A record: $EXTERNAL_IP -> <your-langsmith-domain>"
+
+kubectl get certificate -n langsmith
+```
+
+cert-manager cannot issue the Let's Encrypt certificate until the DNS A record resolves to the Gateway IP. Create the record at your DNS provider, wait for propagation, then re-check the certificate status.
+
+### Sizing profiles
+
+Set `sizing_profile` in `terraform.tfvars`, then re-run `make init-values && make deploy`.
+
+```hcl
+sizing_profile = "production"   # default | minimum | dev | production | production-large
+```
+
+| Profile | When to use |
+|---|---|
+| `default` | Chart defaults, no overlay applied |
+| `minimum` | Absolute floor, fits `e2-standard-4`. Cost parking or CI smoke tests |
+| `dev` | Single replica, minimal resources |
+| `production` | Multi-replica with HPA. Recommended for real workloads |
+| `production-large` | High memory, high CPU. 50+ users or 1000+ traces/sec |
+
+<Tip>
+**Minimum profile with LangSmith Deployment?** Run `make patch-lgp` after deploy to right-size LangGraph Platform CRs. The operator overwrites Deployment patches, so the CRs must be targeted directly.
+</Tip>
+
+### Expected pods
+
+```txt
+langsmith-ace-backend-xxx          1/1  Running    0
+langsmith-backend-xxx              1/1  Running    0
+langsmith-backend-auth-bootstrap   0/1  Completed  0
+langsmith-backend-migrations       0/1  Completed  0
+langsmith-clickhouse-0             1/1  Running    0
+langsmith-frontend-xxx             1/1  Running    0
+langsmith-ingest-queue-xxx         1/1  Running    0
+langsmith-platform-backend-xxx     1/1  Running    0
+langsmith-playground-xxx           1/1  Running    0
+langsmith-queue-xxx                1/1  Running    0
+```
+
+## Enable add-ons
+
+Each add-on is gated by a flag in `infra/terraform.tfvars`. Set the flag, re-apply Terraform where the add-on's steps call for it (`make apply` for LangSmith Deployment), then re-run `make init-values && make deploy`.
+
+### LangSmith Deployment
+
+Adds `host-backend`, `listener`, and `operator`. Required before enabling Agent Builder or Insights. KEDA is installed automatically when `enable_langsmith_deployment = true`.
+
+```hcl
+# infra/terraform.tfvars
+enable_deployments = true
+```
+
+```bash
+cd modules/gcp
+
+make apply        # push the enable_deployments flag
+make init-values  # pick up the change
+make deploy       # roll out host-backend + listener + operator
+```
+
+Verify:
+
+```bash
+kubectl get pods -n langsmith | grep -E "host-backend|listener|operator"
+kubectl get lgp -n langsmith
+kubectl get crd | grep langchain
+kubectl get pods -n keda
+```
+
+<Warning>
+`config.deployment.url` must include `https://`. Without the protocol, operator-spawned agents stay stuck in `DEPLOYING` indefinitely.
+</Warning>
+
+### Agent Builder
+
+Prerequisite: LangSmith Deployment healthy. Adds `agent-builder-tool-server`, `agent-builder-trigger-server`, and an `agentBootstrap` Job that registers the Polly agent URL.
+
+```hcl
+# infra/terraform.tfvars
+enable_agent_builder = true
+```
+
+```bash
+make init-values
+make deploy
+```
+
+Verify:
+
+```bash
+kubectl get pods -n langsmith | grep -E "tool-server|trigger-server|bootstrap"
+```
+
+Roll the frontend after `agentBootstrap` completes so it picks up the `langsmith-polly-config` ConfigMap:
+
+```bash
+kubectl rollout restart deployment langsmith-frontend -n langsmith
+```
+
+<Warning>
+Skipping the frontend restart makes Polly show "Unable to connect to LangGraph server".
+</Warning>
+
+### Insights and Polly
+
+Prerequisite: Agent Builder healthy. Insights enables ClickHouse-backed trace analytics. Polly is the AI eval and monitoring agent. Enable both together.
+
+```hcl
+# infra/terraform.tfvars
+enable_insights = true
+enable_polly    = true
+```
+
+```bash
+make init-values
+make deploy
+```
+
+Verify:
+
+```bash
+kubectl get pods -n langsmith | grep -E "clio|polly"
+kubectl get pods -n langsmith -w
+```
+
+<Warning>
+`insights_encryption_key` and `polly_encryption_key` must never change after first enable. Rotating either permanently breaks existing encrypted data.
+</Warning>
+
+### Expected pods by add-on
+
+**LangSmith Deployment adds:** `langsmith-host-backend`, `langsmith-listener`, `langsmith-operator`.
+
+**Agent Builder adds:** `langsmith-agent-builder-tool-server`, `langsmith-agent-builder-trigger-server`, `langsmith-agent-builder-bootstrap` (Completed), `agent-builder-<hash>` (operator-spawned).
+
+**Insights and Polly add:** `clio-<hash>` (Insights analytics), `smith-polly-<hash>` (Polly agent), `lg-<hash>-0` (LangGraph StatefulSet).
+
+## Key watchouts
+
+- `config.deployment.url` must include `https://`. Without it, operator-spawned agents stay stuck in `DEPLOYING`.
+- `config.deployment.enabled: true` is required for LangSmith Deployment. Setting only the URL without `enabled: true` causes the chart to silently skip `listener` and `operator`.
+- Encryption keys must never change after first enable. Rotating `insights_encryption_key` or `polly_encryption_key` permanently breaks existing encrypted data.
+- Roll the frontend after first Polly enable. `agentBootstrap` creates the `langsmith-polly-config` ConfigMap after registering. Frontend pods started before bootstrap completes do not pick it up automatically.
+- Envoy Gateway IP changes on teardown. GCP releases the external IP when the Gateway is deleted. After a re-apply, a new IP is issued, so update your DNS A record.
+- `langsmith-ksa` annotation is not permanent. The operator creates `langsmith-ksa` at runtime; it does not survive namespace deletion. `deploy.sh` re-annotates it idempotently. Re-run `make deploy` if operator pods lose GCS access after a cluster rebuild.
+
+## Next steps
+
+- Reference the [GCP variables](/langsmith/self-host-terraform-gcp-variables) and the [quick reference](/langsmith/self-host-terraform-gcp-quick-reference).
+- Review the [GCP architecture](/langsmith/self-host-terraform-gcp-architecture) for module structure, traffic flow, and Workload Identity.
+- When something breaks, check the [GCP troubleshooting guide](/langsmith/self-host-terraform-gcp-troubleshooting).
+- Enable agent deployment in the UI with [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform).
diff --git a/src/langsmith/self-host-terraform-gcp-quick-reference.mdx b/src/langsmith/self-host-terraform-gcp-quick-reference.mdx
new file mode 100644
index 0000000000..2ef009b877
--- /dev/null
+++ b/src/langsmith/self-host-terraform-gcp-quick-reference.mdx
@@ -0,0 +1,242 @@
+---
+title: GCP Terraform quick reference
+sidebarTitle: Quick reference
+description: Make targets, Terraform, kubectl, gcloud, and Helm commands for LangSmith self-hosted on GKE.
+---
+
+Command cheat sheet for day-to-day operations against a GCP LangSmith deployment provisioned with the [GCP Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/gcp). All `make` targets run from `modules/gcp/`. Run `make help` for an inline summary.
+
+## Deployment overview
+
+| Stage | What gets deployed | Command |
+|---|---|---|
+| Infrastructure | VPC + GKE + Cloud SQL + Memorystore + GCS + IAM + cert-manager + KEDA + Envoy Gateway | `make apply` |
+| Cluster credentials | Kubeconfig wired to the new GKE cluster | `make kubeconfig` |
+| LangSmith base | Frontend, backend, ingest, queue, ClickHouse | `make init-values && make deploy` |
+| LangSmith Deployment add-on | host-backend, listener, operator | `make apply && make init-values && make deploy` |
+| Agent Builder add-on | tool-server, trigger-server + agent-builder LGP | `make init-values && make deploy` |
+| Insights + Polly add-on | Clio analytics, Polly eval agent | `make init-values && make deploy` |
+
+Each stage builds on the previous. Verify pods are healthy before enabling the next.
+
+## First-time setup
+
+```bash
+cd terraform/modules/gcp
+
+# Interactive wizard — generates terraform.tfvars
+make quickstart
+
+# Set up secrets in Secret Manager (auto-generates passwords + Fernet keys)
+# Must be sourced so it can export TF_VAR_* into your shell
+source infra/scripts/setup-env.sh
+
+# Verify secrets are stored correctly
+make secrets
+
+# Deploy infrastructure
+make init
+make plan
+make apply
+
+# Generate Helm values from Terraform outputs
+make init-values
+
+# Deploy LangSmith
+make deploy
+```
+
+## Day-2 operations
+
+```bash
+# Check deployment state and next-step guidance
+make status              # full check
+make status-quick        # skip Secret Manager and K8s queries
+
+# Re-deploy after changing Helm values or upgrading chart version
+make deploy
+
+# Re-generate Helm values after Terraform changes
+make init-values
+
+# Manage Secret Manager secrets interactively
+make secrets
+
+# Update kubeconfig for the GKE cluster
+make kubeconfig
+```
+
+## Add-ons
+
+Set flags in `terraform.tfvars`, then run `make init-values && make deploy` (for the LangSmith Deployment add-on, run `make apply` first). `init-values.sh` copies the matching example file into `helm/values/` automatically.
+
+```hcl
+# terraform.tfvars
+enable_deployments   = true
+enable_agent_builder = true   # requires enable_deployments = true
+enable_insights      = true
+enable_polly         = true   # requires enable_deployments = true + Polly license
+enable_usage_telemetry = true # extended usage telemetry
+```
+
+To add an add-on after the initial install without re-running `init-values.sh`, copy the matching example values files into `helm/values/` manually, then redeploy:
+
+```bash
+cp helm/values/examples/langsmith-values-agent-deploys.yaml helm/values/
+cp helm/values/examples/langsmith-values-agent-builder.yaml helm/values/
+cp helm/values/examples/langsmith-values-insights.yaml      helm/values/
+cp helm/values/examples/langsmith-values-polly.yaml         helm/values/
+
+make deploy
+```
+
+## Sizing profiles
+
+Set `sizing_profile` in `terraform.tfvars`, then re-run `make init-values && make deploy`.
+
+```hcl
+sizing_profile = "production"   # default | minimum | dev | production | production-large
+```
+
+| Profile | When to use |
+|---|---|
+| `default` | Chart defaults — quick tests, no overlay applied |
+| `minimum` | Absolute floor; fits `e2-standard-4`; use for cost parking or CI smoke tests |
+| `dev` | Single replica, minimal resources |
+| `production` | Multi-replica with HPA; recommended for real workloads |
+| `production-large` | High memory and CPU; 50+ users or 1000+ traces/sec |
+
+<Tip>
+**Minimum profile with LGP?** Run `make patch-lgp` after deploy to right-size LangGraph Platform CRs. The operator overwrites Deployment patches, so the CRs must be targeted directly.
+</Tip>
+
+## kubectl
+
+```bash
+# Pod health
+kubectl get pods -n langsmith
+kubectl get pods -n langsmith -w
+kubectl describe pod <pod-name> -n langsmith
+kubectl logs <pod-name> -n langsmith --tail=100 -f
+kubectl logs <pod-name> -n langsmith --previous --tail=50
+
+# Backend logs (live)
+kubectl logs -n langsmith deploy/langsmith-backend --tail=100 -f
+
+# Gateway and HTTPRoute
+kubectl get gateway -n langsmith
+kubectl get httproute -n langsmith
+kubectl get svc -n envoy-gateway-system
+
+# TLS
+kubectl get certificate -n langsmith
+kubectl get challenges -n langsmith
+kubectl describe certificate <cert-name> -n langsmith
+kubectl get clusterissuer
+
+# Workload Identity
+kubectl get serviceaccount langsmith-ksa -n langsmith -o yaml | grep annotation -A5
+
+# Helm
+helm status langsmith -n langsmith
+helm history langsmith -n langsmith
+helm get values langsmith -n langsmith
+
+# LangSmith Deployment
+kubectl get lgp -n langsmith
+kubectl get crd | grep langchain
+```
+
+## gcloud
+
+```bash
+# Re-auth if you hit oauth2 invalid_grant / invalid_rapt errors
+gcloud auth login
+gcloud auth application-default login
+
+# Cluster credentials
+gcloud container clusters get-credentials <cluster-name> --region <region> --project <project-id>
+
+# List clusters
+gcloud container clusters list --project <project-id>
+
+# Cluster status
+gcloud container clusters describe <cluster-name> --region <region> --format="value(status)"
+
+# Cloud SQL
+gcloud sql instances list --project <project-id>
+gcloud sql instances describe <instance-name> --format="value(ipAddresses)"
+
+# Memorystore Redis
+gcloud redis instances list --region <region>
+gcloud redis instances describe <instance-name> --region <region> --format="value(host)"
+
+# GCS bucket
+gsutil ls gs://<bucket-name>
+gsutil iam get gs://<bucket-name>
+
+# Workload Identity binding
+gcloud iam service-accounts get-iam-policy <gsa-email> --project <project-id>
+
+# Enabled APIs
+gcloud services list --enabled --project <project-id>
+
+# VPC peering
+gcloud services vpc-peerings list --network <vpc-name> --project <project-id>
+
+# Secret Manager
+gcloud secrets list --project <project-id> --filter="name:langsmith"
+gcloud secrets versions access latest --secret=<secret-id> --project <project-id>
+```
+
+## Terraform
+
+```bash
+cd modules/gcp/infra
+
+terraform init
+terraform plan -var-file=terraform.tfvars
+terraform apply -var-file=terraform.tfvars
+
+# Target a specific module
+terraform apply -var-file=terraform.tfvars -target=module.networking
+
+# Outputs
+terraform output
+terraform output -raw cluster_name
+terraform output -raw storage_bucket_name
+
+# State
+terraform state list
+terraform state show module.gke_cluster
+terraform refresh -var-file=terraform.tfvars
+```
+
+## Key watchouts
+
+- Uninstall the LangSmith Helm release before `terraform destroy`. The Envoy Gateway load balancer references the VPC; leaving it in place blocks network deletion. Always run `make uninstall` first.
+- `config.deployment.url` must include `https://`. Without the protocol, operator-spawned agents stay stuck in `DEPLOYING`.
+- `config.deployment.enabled: true` is required for the LangSmith Deployment add-on. Setting only the URL without `enabled: true` silently skips `listener` and `operator`.
+- Encryption keys must never change after first enable. Rotating `insights_encryption_key` or `polly_encryption_key` permanently breaks existing encrypted data.
+- Roll the frontend after first Polly enable. `agentBootstrap` creates the `langsmith-polly-config` ConfigMap after registering; frontend pods started earlier do not pick it up.
+- Envoy Gateway IP changes on teardown. GCP releases the external IP when the Gateway is deleted. After `terraform destroy` and re-apply, update your DNS A record.
+- `langsmith-ksa` annotation is not permanent. The operator creates the ServiceAccount at runtime and it does not survive namespace deletion. `deploy.sh` re-annotates it idempotently.
+
+## Teardown
+
+```bash
+# 1. Remove LangSmith Deployment resources (if the add-on was enabled)
+kubectl delete lgp --all -n langsmith 2>/dev/null || true
+
+# 2. Uninstall LangSmith
+make uninstall
+
+# 3. Set deletion protection = false in terraform.tfvars, then:
+make destroy
+```
+
+```hcl
+# terraform.tfvars
+gke_deletion_protection      = false
+postgres_deletion_protection = false
+```
diff --git a/src/langsmith/self-host-terraform-gcp-troubleshooting.mdx b/src/langsmith/self-host-terraform-gcp-troubleshooting.mdx
new file mode 100644
index 0000000000..712ac3465e
--- /dev/null
+++ b/src/langsmith/self-host-terraform-gcp-troubleshooting.mdx
@@ -0,0 +1,441 @@
+---
+title: GCP Terraform troubleshooting
+sidebarTitle: Troubleshooting
+description: Common issues, fixes, and diagnostic commands for LangSmith self-hosted on GKE deployed with the LangChain Terraform modules.
+---
+
+This page documents common issues, fixes, and diagnostic commands for LangSmith deployments provisioned with the [GCP Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/gcp).
+
+<Tip>
+Before upgrading, review the [LangSmith self-hosted changelog](/langsmith/self-hosted-changelog) for breaking changes and required variable updates. Run `gcloud container clusters get-credentials <cluster-name> --region <region> --project <project-id>` before running any `kubectl` commands.
+</Tip>
+
+## Automated diagnostics
+
+Before running individual commands, try the bundled scripts:
+
+```bash
+# Full deployment health check + next-step guidance
+make status
+
+# Secret Manager validation
+make secrets    # → manage-secrets.sh validate
+```
+
+## Known issues
+
+### terraform apply fails: GCP APIs not enabled
+
+**Symptom**
+
+```
+Error 403: ... has not been used in project <project-id> before or it is disabled.
+```
+
+**Cause:** Required GCP APIs are not enabled. Terraform enables them via `google_project_service`, but `cloudresourcemanager.googleapis.com` must already be enabled for Terraform to enable the others.
+
+**Fix**
+
+```bash
+gcloud services enable cloudresourcemanager.googleapis.com --project <project-id>
+cd modules/gcp/infra
+terraform apply -var-file=terraform.tfvars
+```
+
+### GKE cluster API server not accessible after apply
+
+**Symptom**
+
+```
+Error: Get "https://<cluster-endpoint>/api/v1/namespaces": dial tcp: connection refused
+```
+
+**Cause:** The GKE control plane takes 10 to 15 minutes to become fully operational. Terraform waits for `RUNNING` then adds a 90-second buffer. Cold-start API activation on slow projects can exceed the window.
+
+**Fix:** Wait for the cluster status to report `RUNNING`, then re-run the apply:
+
+```bash
+gcloud container clusters describe <cluster-name> \
+  --region <region> --project <project-id> --format="value(status)"
+
+terraform apply -var-file=terraform.tfvars
+```
+
+### GKE nodes not joining (NotReady)
+
+**Symptom:** `kubectl get nodes` shows no nodes or nodes stuck in `NotReady`.
+
+**Cause:** Node pool service account lacks `roles/container.nodeServiceAccount`, or VPC firewall rules block node-to-control-plane communication.
+
+**Fix**
+
+```bash
+gcloud container node-pools describe <pool-name> \
+  --cluster <cluster-name> --region <region> \
+  --format="value(config.serviceAccount)"
+
+gcloud projects add-iam-policy-binding <project-id> \
+  --member="serviceAccount:<node-sa-email>" \
+  --role="roles/container.nodeServiceAccount"
+
+gcloud compute firewall-rules list --filter="network:<vpc-name>"
+```
+
+### Cloud SQL connection refused from GKE pods
+
+**Symptom:** Backend logs show `connection refused` or `no route to host` for the Cloud SQL private IP.
+
+**Cause:** The private service connection (VPC peering) is not established, or the allocated IP range is too small. Often happens when `servicenetworking.googleapis.com` was not enabled before the networking module ran.
+
+**Fix**
+
+```bash
+gcloud services vpc-peerings list --network <vpc-name> --project <project-id>
+gcloud sql instances describe <instance-name> --format="value(ipAddresses)"
+gcloud compute networks peerings list --network <vpc-name>
+```
+
+If peering is missing, ensure `enable_private_service_connection = true` and re-apply:
+
+```bash
+terraform apply -var-file=terraform.tfvars -target=module.networking
+terraform apply -var-file=terraform.tfvars
+```
+
+### Memorystore Redis connection timeout
+
+**Symptom:** Pods cannot connect to Redis. Logs show `dial tcp: connection timed out` or `redis: connection refused`.
+
+**Cause:** The Memorystore `authorized_network` does not match the GKE VPC, or the Redis private IP is on a range not routable from the GKE subnet.
+
+**Fix**
+
+```bash
+gcloud redis instances describe <instance-name> --region <region> \
+  --format="value(host,authorizedNetwork)"
+
+kubectl run redis-test --rm -it --image=redis:7 -n langsmith -- \
+  redis-cli -h <redis-private-ip> ping
+# Expected: PONG
+```
+
+### cert-manager fails to issue Let's Encrypt certificate
+
+**Symptom:** `kubectl get certificate -n langsmith` shows `READY=False` and the HTTP01 challenge keeps failing.
+
+**Cause:** The DNS A record does not point to the Envoy Gateway IP, or port 80 is blocked on the load balancer.
+
+**Fix**
+
+```bash
+kubectl get svc -n envoy-gateway-system \
+  -l gateway.envoyproxy.io/owning-gateway-name=langsmith-gateway \
+  -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}'
+
+kubectl describe certificate <cert-name> -n langsmith
+kubectl get challenges -n langsmith
+kubectl describe challenge -n langsmith
+
+dig +short <your-langsmith-domain>
+```
+
+The DNS A record must resolve to the Gateway IP before the certificate can be issued. cert-manager's HTTP01 solver needs port 80 to be reachable from the internet.
+
+### GCS bucket access denied from LangSmith pods
+
+**Symptom:** Backend logs show `AccessDeniedException: 403 Insufficient Permission` or `403 Forbidden` when writing to GCS.
+
+**Cause:** HMAC credentials passed to Helm are incorrect, or the service account that owns the HMAC key lacks `roles/storage.objectAdmin` on the bucket.
+
+**Fix**
+
+```bash
+helm get values langsmith -n langsmith | grep bucketName
+
+gsutil config -a   # configure with your HMAC key
+gsutil ls gs://<bucket-name>
+
+gcloud storage buckets get-iam-policy gs://<bucket-name>
+```
+
+Create a new HMAC key in GCP Console under Storage → Settings → Interoperability. The key's service account must have `roles/storage.objectAdmin` on the bucket.
+
+### Envoy Gateway webhook blocking GKE operations
+
+**Symptom**
+
+```
+Error from server (InternalError): failed calling webhook "validate.gateway.envoyproxy.io"
+```
+
+**Cause:** The Envoy Gateway admission webhook is not ready or its `caBundle` is stale.
+
+**Fix**
+
+```bash
+kubectl get pods -n envoy-gateway-system
+
+kubectl rollout restart deployment/envoy-gateway -n envoy-gateway-system
+kubectl rollout status deployment/envoy-gateway -n envoy-gateway-system
+```
+
+### Envoy Gateway external IP changed after re-apply
+
+**Symptom:** DNS no longer resolves to the correct IP after Terraform re-apply, or existing firewall allowlists stop working.
+
+**Cause:** The Envoy Gateway external IP is tied to the `Gateway` Kubernetes resource managed by Terraform. If the resource is deleted and recreated (`terraform taint`, a module change that forces replacement, or `terraform destroy` + re-apply), GCP issues a new IP. There is no way to reserve the original IP without pre-allocating a static regional address.
+
+**Prevention**
+
+- Do not `terraform taint` or manually delete the `Gateway` resource.
+- Use `make destroy` + `make apply` only for full teardown and rebuild.
+- Before any operation that might recreate the Gateway, note the current IP.
+
+**Recovery:** Update your DNS A record to the new IP:
+
+```bash
+kubectl get gateway -n langsmith -o jsonpath='{.items[0].status.addresses[0].value}'
+
+gcloud dns record-sets update <your-domain>. \
+  --type=A --ttl=300 \
+  --rrdatas=<new-ip> \
+  --zone=<zone-name> \
+  --project=<project-id>
+```
+
+### `terraform destroy` fails: deletion protection enabled
+
+**Symptom**
+
+```
+Error: googleapi: Error 409: The instance is protected from deletion.
+```
+
+**Cause:** `gke_deletion_protection = true` or `postgres_deletion_protection = true` (both default to `true`) prevents Terraform from destroying the protected resources.
+
+**Fix**
+
+```hcl
+# terraform.tfvars
+gke_deletion_protection      = false
+postgres_deletion_protection = false
+```
+
+```bash
+terraform apply -var-file=terraform.tfvars
+terraform destroy
+```
+
+### Workload Identity not working (GCS permission denied)
+
+**Symptom**
+
+```
+AccessDeniedException: 403 <pod-sa>@<project>.iam.gserviceaccount.com
+  does not have storage.objects.create access to the Google Cloud Storage bucket.
+```
+
+**Cause:** The Kubernetes ServiceAccount used by LangSmith pods is missing the Workload Identity annotation, or the GCP SA is missing the GCS IAM binding.
+
+**Diagnosis**
+
+```bash
+kubectl get serviceaccount langsmith-backend -n langsmith \
+  -o jsonpath='{.metadata.annotations}' | python3 -m json.tool
+
+BUCKET=$(terraform -chdir=infra output -raw storage_bucket_name)
+gsutil iam get gs://$BUCKET | grep -A3 "serviceAccount"
+
+GSA=$(terraform -chdir=infra output -raw workload_identity_service_account_email)
+gcloud projects get-iam-policy <project-id> \
+  --flatten="bindings[].members" --filter="bindings.members:$GSA"
+```
+
+**Fix**
+
+```bash
+terraform -chdir=infra apply -target=module.iam
+make init-values
+make deploy
+```
+
+### `langsmith-ksa` missing Workload Identity annotation
+
+**Symptom:** Operator-spawned agent pods fail to start or get stuck in `Pending`. Logs show permission errors, or the agent bootstrap job hangs.
+
+**Cause:** `langsmith-ksa` is created by the LangSmith operator (not Helm) and does not survive namespace teardowns or fresh cluster rebuilds. `deploy.sh` re-annotates it post-deploy; if a previous deploy was interrupted, the annotation may be missing.
+
+**Diagnosis**
+
+```bash
+kubectl get serviceaccount langsmith-ksa -n langsmith \
+  -o jsonpath='{.metadata.annotations.iam\.gke\.io/gcp-service-account}'
+```
+
+**Fix**
+
+```bash
+# Re-run deploy — idempotently creates and annotates langsmith-ksa
+make deploy
+
+# Or annotate manually
+WI=$(terraform -chdir=infra output -raw workload_identity_annotation)
+kubectl create serviceaccount langsmith-ksa -n langsmith --dry-run=client -o yaml \
+  | kubectl apply -f -
+kubectl annotate serviceaccount langsmith-ksa -n langsmith \
+  iam.gke.io/gcp-service-account="$WI" --overwrite
+```
+
+### Helm release stuck in `pending-upgrade`
+
+**Symptom**
+
+```
+Error: UPGRADE FAILED: another operation (install/upgrade/rollback) is in progress
+```
+
+**Cause:** A previous `helm upgrade` was interrupted (Ctrl+C during `--wait`). Helm left the release locked.
+
+**Fix:** `deploy.sh` detects this state and recovers from it automatically. If you are running Helm manually:
+
+```bash
+helm rollback langsmith -n langsmith --wait --timeout 5m
+make deploy
+```
+
+### Secret Manager access denied
+
+**Symptom**
+
+```
+ERROR: PERMISSION_DENIED: Permission 'secretmanager.versions.access'
+  denied on resource 'projects/.../secrets/...'
+```
+
+**Cause:** Either `secretmanager.googleapis.com` is not enabled, or the operator account lacks `roles/secretmanager.admin`.
+
+**Fix**
+
+```bash
+gcloud services enable secretmanager.googleapis.com --project <project-id>
+
+gcloud projects add-iam-policy-binding <project-id> \
+  --member="user:$(gcloud config get account)" \
+  --role="roles/secretmanager.admin"
+```
+
+### `langsmith-postgres` or `langsmith-redis` Secret missing
+
+**Symptom:** Pods crash with database connection errors immediately after deploy, or `kubectl get secrets -n langsmith` does not list `langsmith-postgres` / `langsmith-redis`.
+
+**Cause:** The `k8s-bootstrap` module creates these Secrets. They are absent if `terraform apply` was not run, failed partway through, or the namespace was deleted out-of-band.
+
+**Fix**
+
+```bash
+terraform -chdir=infra apply -target=module.k8s_bootstrap
+
+kubectl get secret langsmith-postgres -n langsmith
+kubectl get secret langsmith-redis -n langsmith
+```
+
+## Diagnostic commands
+
+### Cluster access
+
+```bash
+gcloud container clusters get-credentials <cluster-name> --region <region> --project <project-id>
+kubectl config current-context
+kubectl get nodes -o wide
+```
+
+### Pods
+
+```bash
+kubectl get pods -n langsmith
+kubectl get pods -n langsmith -w
+kubectl describe pod <pod-name> -n langsmith
+kubectl logs <pod-name> -n langsmith --tail=50
+kubectl logs <pod-name> -n langsmith --previous --tail=50
+kubectl logs -n langsmith deploy/langsmith-backend --tail=100 -f
+```
+
+### TLS and certificates
+
+```bash
+kubectl get certificate -n langsmith
+kubectl describe certificate <cert-name> -n langsmith
+kubectl get challenges -n langsmith
+kubectl get clusterissuer
+```
+
+### Gateway and load balancer
+
+```bash
+kubectl get gateway -n langsmith
+kubectl get httproute -n langsmith
+kubectl get svc -n envoy-gateway-system -o wide
+kubectl get pods -n envoy-gateway-system
+```
+
+### Helm
+
+```bash
+helm status langsmith -n langsmith
+helm history langsmith -n langsmith
+helm get values langsmith -n langsmith
+```
+
+### LangSmith Deployment
+
+```bash
+kubectl get pods -n langsmith | grep -E "host-backend|listener|operator"
+kubectl get lgp -n langsmith
+kubectl get crd | grep langchain
+```
+
+### Workload Identity and IAM
+
+```bash
+kubectl get serviceaccount langsmith-backend -n langsmith \
+  -o jsonpath='{.metadata.annotations}' | python3 -m json.tool
+
+kubectl get serviceaccount langsmith-ksa -n langsmith \
+  -o jsonpath='{.metadata.annotations.iam\.gke\.io/gcp-service-account}'
+
+BUCKET=$(terraform -chdir=infra output -raw storage_bucket_name 2>/dev/null)
+gsutil iam get gs://$BUCKET
+
+gcloud iam service-accounts list --project <project-id> --filter="displayName:langsmith"
+```
+
+### Secrets and bootstrap
+
+```bash
+kubectl get secrets -n langsmith
+kubectl get secret langsmith-postgres -n langsmith
+kubectl get secret langsmith-redis -n langsmith
+
+kubectl get secret langsmith-postgres -n langsmith \
+  -o jsonpath='{.data.connection_url}' | base64 --decode
+
+gcloud secrets list --project <project-id> --filter="name:langsmith"
+
+gcloud secrets versions access latest \
+  --secret=langsmith-<prefix>-<env>-postgres-password \
+  --project <project-id>
+
+make secrets
+```
+
+### Quick health check
+
+```bash
+echo "=== Context ===" && kubectl config current-context
+echo "=== Nodes ===" && kubectl get nodes
+echo "=== Pods ===" && kubectl get pods -n langsmith
+echo "=== Certificate ===" && kubectl get certificate -n langsmith
+echo "=== Gateway ===" && kubectl get gateway -n langsmith
+echo "=== Secrets ===" && kubectl get secrets -n langsmith | grep -E "langsmith-postgres|langsmith-redis"
+echo "=== Helm ===" && helm status langsmith -n langsmith 2>/dev/null | grep -E "STATUS|LAST DEPLOYED"
+```
diff --git a/src/langsmith/self-host-terraform-gcp-variables.mdx b/src/langsmith/self-host-terraform-gcp-variables.mdx
new file mode 100644
index 0000000000..4ce624c8f9
--- /dev/null
+++ b/src/langsmith/self-host-terraform-gcp-variables.mdx
@@ -0,0 +1,135 @@
+---
+title: GCP Terraform variables reference
+sidebarTitle: Variables
+description: Complete reference of Terraform variables for LangSmith self-hosted on GCP GKE.
+---
+
+Reference for every input variable exposed by the [GCP Terraform modules](https://github.com/langchain-ai/terraform/tree/main/modules/gcp). Set non-sensitive variables in `infra/terraform.tfvars`. Sensitive variables (license key, passwords, encryption keys) are handled separately: `make setup-env` writes them to Secret Manager, and the deploy script wires them into the cluster.
+
+## Core
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `project_id` | — | yes | GCP project ID. |
+| `region` | `us-west2` | no | GCP region. |
+| `zone` | `us-west2-a` | no | GCP zone for zonal resources. |
+| `environment` | `prod` | no | Environment name. One of `dev`, `staging`, `prod`, `test`, or `uat`. |
+| `name_prefix` | `ls` | no | Resource name prefix (1 to 11 chars). |
+| `unique_suffix` | `true` | no | Append a random suffix to resource names. |
+| `owner` | `platform-team` | no | Owner label applied to all resources. |
+| `cost_center` | `""` | no | Cost center label for billing attribution. |
+| `labels` | `{}` | no | Additional labels applied to all resources. |
+
+## Networking
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `subnet_cidr` | `10.0.0.0/20` | no | CIDR for the GKE subnet. |
+| `pods_cidr` | `10.4.0.0/14` | no | CIDR for GKE pods. |
+| `services_cidr` | `10.8.0.0/20` | no | CIDR for GKE services. |
+
+## GKE
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `gke_use_autopilot` | `false` | no | Use GKE Autopilot mode. |
+| `gke_node_count` | `2` | no | Initial node count per zone (Standard mode). |
+| `gke_min_nodes` | `2` | no | Minimum nodes per zone for autoscaling. |
+| `gke_max_nodes` | `10` | no | Maximum nodes per zone for autoscaling. |
+| `gke_machine_type` | `e2-standard-4` | no | GKE node machine type. |
+| `gke_disk_size` | `100` | no | Node disk size in GB. |
+| `gke_release_channel` | `REGULAR` | no | GKE release channel: `RAPID`, `REGULAR`, `STABLE`. |
+| `gke_deletion_protection` | `true` | no | Enable deletion protection on the GKE cluster. |
+| `gke_network_policy_provider` | `DATA_PLANE_V2` | no | Network policy: `CALICO` or `DATA_PLANE_V2`. |
+
+## PostgreSQL (Cloud SQL)
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `postgres_source` | `external` | no | `external` (Cloud SQL) or `in-cluster` (Helm). |
+| `postgres_version` | `POSTGRES_15` | no | PostgreSQL version for Cloud SQL. |
+| `postgres_tier` | `db-custom-2-8192` | no | Cloud SQL machine tier. |
+| `postgres_disk_size` | `50` | no | Cloud SQL disk size in GB. |
+| `postgres_high_availability` | `true` | no | Enable Cloud SQL HA (regional standby). |
+| `postgres_deletion_protection` | `true` | no | Enable deletion protection on Cloud SQL. |
+| `postgres_password` | `""` | when external | PostgreSQL password. Use `TF_VAR_postgres_password`. |
+
+## Redis (Memorystore)
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `redis_source` | `external` | no | `external` (Memorystore) or `in-cluster` (Helm). |
+| `redis_version` | `REDIS_7_0` | no | Redis version for Memorystore. |
+| `redis_memory_size` | `5` | no | Memorystore Redis memory size in GB. |
+| `redis_high_availability` | `true` | no | Enable Memorystore HA tier (Standard HA). |
+| `redis_prevent_destroy` | `false` | no | Prevent accidental Terraform destroy of Redis. |
+
+## ClickHouse
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `clickhouse_source` | `in-cluster` | no | `in-cluster`, `langsmith-managed`, or `external`. |
+| `clickhouse_host` | `""` | when external | ClickHouse host (external or managed only). |
+| `clickhouse_port` | `9440` | no | ClickHouse native protocol port. |
+| `clickhouse_http_port` | `8443` | no | ClickHouse HTTP port. |
+| `clickhouse_user` | `default` | no | ClickHouse username. |
+| `clickhouse_tls` | `true` | no | Enable TLS for ClickHouse connections. |
+
+## GCS storage
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `storage_ttl_short_days` | `14` | no | GCS TTL for `ttl_s/` prefix. |
+| `storage_ttl_long_days` | `400` | no | GCS TTL for `ttl_l/` prefix. |
+| `storage_force_destroy` | `false` | no | Allow bucket deletion with objects inside. |
+
+## LangSmith application
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `langsmith_namespace` | `langsmith` | no | Kubernetes namespace for LangSmith. |
+| `langsmith_domain` | `langsmith.example.com` | no | Fully qualified domain name. |
+| `langsmith_license_key` | `""` | no | License key. Use `TF_VAR_langsmith_license_key`. |
+| `langsmith_helm_chart_version` | `""` | no | Pin Helm chart version (empty = latest). |
+
+## Ingress and TLS
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `install_ingress` | `true` | no | Install Envoy Gateway via Terraform. |
+| `ingress_type` | `envoy` | no | Ingress type: `envoy`, `istio`, or `other`. |
+| `tls_certificate_source` | `none` | no | `none`, `letsencrypt`, or `existing`. |
+| `letsencrypt_email` | `""` | when `letsencrypt` | Email for Let's Encrypt notifications. |
+| `tls_secret_name` | `langsmith-tls` | no | Name for the TLS secret in Kubernetes. |
+
+## Add-on pass flags
+
+| Variable | Default | Required | Description |
+|---|---|---|---|
+| `enable_langsmith_deployment` | `true` | no | Enable LangSmith Deployment. Installs KEDA automatically. |
+
+## Optional GCP modules
+
+| Variable | Default | Description |
+|---|---|---|
+| `enable_gcp_iam_module` | `true` | Wires `modules/iam` for Workload Identity and bucket IAM binding. |
+| `enable_secret_manager_module` | `false` | Wires `modules/secrets` for Secret Manager bootstrap secret. |
+| `enable_dns_module` | `false` | Wires `modules/dns` for Cloud DNS and managed cert. |
+| `dns_create_zone` | `true` | Create a DNS zone when the DNS module is enabled. |
+| `dns_existing_zone_name` | `""` | Existing zone to use when `dns_create_zone = false`. |
+| `dns_create_certificate` | `true` | Create a Google-managed cert when the DNS module is enabled. |
+
+## Sensitive variables (set with `setup-env.sh`)
+
+`make setup-env` writes these to Secret Manager. Never set these inline in `terraform.tfvars`.
+
+| Variable | Description |
+|---|---|
+| `langsmith_license_key` | LangSmith enterprise license key. |
+| `langsmith_admin_password` | Initial org admin password. |
+| `langsmith_api_key_salt` | Salt for hashing API keys. Must stay stable after first deploy. |
+| `langsmith_jwt_secret` | JWT secret for Basic Auth sessions. |
+| `langsmith_deployments_encryption_key` | Fernet key for LangSmith Deployment. Must never change. |
+| `langsmith_agent_builder_encryption_key` | Fernet key for Agent Builder. Must never change. |
+| `langsmith_insights_encryption_key` | Fernet key for Insights. Must never change. |
+| `langsmith_polly_encryption_key` | Fernet key for Polly. Must never change. |
diff --git a/src/langsmith/self-host-terraform.mdx b/src/langsmith/self-host-terraform.mdx
new file mode 100644
index 0000000000..b5ed0090b9
--- /dev/null
+++ b/src/langsmith/self-host-terraform.mdx
@@ -0,0 +1,95 @@
+---
+title: Deploy LangSmith with Terraform
+sidebarTitle: Overview
+description: Provision LangSmith self-hosted on AWS, Azure, or GCP using LangChain's production-ready Terraform modules.
+---
+
+<Info>
+Self-hosted LangSmith is an add-on to the Enterprise plan designed for LangChain's largest, most security-conscious customers. See [pricing](https://www.langchain.com/pricing) for details, or [contact sales](https://www.langchain.com/contact-sales) to request a license key for trial.
+</Info>
+
+LangChain publishes production-ready Terraform modules for [LangSmith self-hosted](/langsmith/self-hosted) at [github.com/langchain-ai/terraform](https://github.com/langchain-ai/terraform). The modules provision the cloud foundation (network, cluster, database, cache, object storage, secrets, DNS) and install the LangSmith Helm chart with sensible defaults.
+
+Use this path when you want infrastructure as code from day one. If you already manage cloud infrastructure with your own tooling and need just the application install, follow the [Helm installation guide](/langsmith/kubernetes) instead.
+
+<Tip>
+**Prefer Helm?** The [Kubernetes setup guide](/langsmith/kubernetes) walks through installing with Helm against any conformant cluster, no Terraform required. The Terraform path bundles cluster provisioning, secrets wiring, and the Helm release into one workflow.
+</Tip>
+
+## Choose a provider
+
+<CardGroup cols={3}>
+  <Card title="AWS (EKS)" icon="brand-aws" href="/langsmith/self-host-terraform-aws-deploy">
+    Provision EKS, RDS PostgreSQL, ElastiCache, S3, and networking.
+  </Card>
+  <Card title="Azure (AKS)" icon="brand-windows" href="/langsmith/self-host-terraform-azure-deploy">
+    Provision AKS, Azure Database for PostgreSQL, Azure Managed Redis, Blob Storage, and Key Vault.
+  </Card>
+  <Card title="GCP (GKE)" icon="brand-google" href="/langsmith/self-host-terraform-gcp-deploy">
+    Provision GKE, Cloud SQL, Memorystore, GCS, and Workload Identity.
+  </Card>
+</CardGroup>
+
+## Prerequisites
+
+Install the following tools before running the modules:
+
+| Tool | Version | Purpose |
+|---|---|---|
+| `terraform` | 1.5 or later | Run the modules |
+| `kubectl` | 1.28 or later | Inspect the cluster after provisioning |
+| `helm` | 3.12 or later | Manage the LangSmith chart release |
+| Cloud CLI | latest | `aws`, `az`, or `gcloud` for the target provider |
+
+You also need:
+
+- A LangSmith license key. [Contact sales](https://www.langchain.com/contact-sales) to request one.
+- Permissions in the target cloud account to create VPC or VNet networking, a managed Kubernetes cluster, managed databases, object storage, secrets, and IAM roles.
+- A registered domain (or subdomain) for the LangSmith UI endpoint.
+
+## Deployment tiers
+
+Pick a tier with a single Terraform variable. The modules size every dependent resource accordingly.
+
+| Tier | PostgreSQL | Redis | ClickHouse | Use case |
+|---|---|---|---|---|
+| `dev` | In-cluster | In-cluster | In-cluster | Demos, evaluations, short-lived POCs |
+| `production` | Cloud-managed (RDS, Cloud SQL, Azure Database) | Cloud-managed (ElastiCache, Memorystore, Azure Cache) | [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse) | Persistent, scalable production |
+| `production-large` | Cloud-managed, larger instance class | Cloud-managed, larger instance class | LangChain Managed ClickHouse | High-throughput production |
+
+<Warning>
+Use in-cluster ClickHouse only for development and proofs of concept, not for production. Production deployments must use [LangChain Managed ClickHouse](/langsmith/langsmith-managed-clickhouse) or a self-managed external ClickHouse cluster. Blob storage is always required because trace payloads must not live in ClickHouse.
+</Warning>
+
+## What the modules provision
+
+- **Networking:** VPC or VNet with public and private subnets, NAT, and security groups.
+- **Compute:** Managed Kubernetes (EKS, AKS, or GKE) with autoscaling node pools sized per tier.
+- **Data plane:** Managed PostgreSQL, managed Redis or cache, and a blob storage bucket for trace payloads.
+- **Secrets:** Cloud-native secret store (AWS SSM Parameter Store, Azure Key Vault, GCP Secret Manager) synced into Kubernetes by [External Secrets Operator](https://external-secrets.io/).
+- **Ingress:** Cloud-native load balancer by default. Envoy Gateway (Gateway API) is available for multi-namespace dataplane deployments.
+- **Optional hardening (AWS today):** AWS Network Firewall with FQDN egress allowlists, WAFv2, CloudTrail, and a private EKS API endpoint with SSM bastion access.
+
+## Enterprise feature toggles
+
+Each module exposes flags for the optional LangSmith add-ons:
+
+- [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform) (Agent Server and Fleet)
+- Agent Builder
+- Insights (ClickHouse-backed analytics)
+- Polly (AI evaluation and monitoring)
+
+Toggle each feature in the `tfvars` file before running `make apply`.
+
+## Need a tailored package?
+
+<Note>
+Enterprise customers can request a tailored Terraform package from their LangChain Solutions Architect. The tailored package adds account-specific defaults, customer-managed key wiring, and any custom hardening agreed during the engagement. The public modules linked above are the same source of truth, so a tailored package stays compatible with the upstream module structure.
+</Note>
+
+## Next steps
+
+- Pick a provider above and follow the deployment guide.
+- Review [required dependency versions](/langsmith/self-host-dependency-versions) for PostgreSQL, ClickHouse, Redis, and Kubernetes.
+- Plan capacity with the [scaling guide](/langsmith/self-host-scale).
+- After the application is running, enable [LangSmith Deployment](/langsmith/deploy-self-hosted-full-platform) to add agent deployment and management to the UI.