From a13c1a546b80ede6519e33ab004519386af20bbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Honeiser?= Date: Wed, 25 Feb 2026 15:26:59 +0100 Subject: [PATCH] feat(google_gke): make enabling the KubeRay operator addon possible --- google_gke/README.md | 229 +++++++++++++++++++++++++++++++++++++++- google_gke/cluster.tf | 16 +++ google_gke/variables.tf | 18 ++++ 3 files changed, 262 insertions(+), 1 deletion(-) diff --git a/google_gke/README.md b/google_gke/README.md index 156b428f..c17ea242 100644 --- a/google_gke/README.md +++ b/google_gke/README.md @@ -1,3 +1,227 @@ +# Shared VPC-based GKE Module + +Module creates an opinionated GKE cluster plus related resources within a Shared VPC context. + +## Simple Example + +This uses distinct networking variables and the (module) default node pool. + +```hcl +data "terraform_remote_state" "vpc" { + backend = "gcs" + + config = { + bucket = "my-state-bucket" + prefix = "projects/my-sharedvpc-project" + } +} + +module "gke" { + source = "github.com/mozilla/terraform-modules//google_gke?ref=main" + + name = "my-cluster" + project_id = "shared-clusters" + realm = "nonprod" + region = "us-west1" + + master_ipv4_cidr_block = "1.2.3.4/28" + network = "projects/my-vpc-project/global/networks/my-vpc-network" + pods_ip_cidr_range_name = "my-pods-or-cluster-secondary-range-name" + services_ip_cidr_range_name = "my-services-secondary-range-name" + subnetwork = "projects/my-vpc-project/regions/us-west1/subnetworks/my-subnetwork" + + # don't expect metrics to BQ + enable_resource_consumption_export = false + + # who can access the k8s control plane + # adds placeholder bastion network by default + master_authorized_networks = [ + { + cidr_block = "1.2.3.4/32" + display_name = "bastion" + } + ] +} +``` + +## Complex Example 1 + + This uses a Mozilla-internal Shared VPC Terraform outputs variable for networking. It also sets up cluster to be able to access GAR images in a different project. 
+ +```hcl +data "terraform_remote_state" "vpc" { + backend = "gcs" + + config = { + bucket = "my-state-bucket" + prefix = "projects/my-sharedvpc-project" + } +} + +module "gke" { + source = "github.com/mozilla/terraform-modules//google_gke?ref=main" + + name = "my-cluster" + project_id = "shared-clusters" + realm = "nonprod" + region = "us-west1" + shared_vpc_outputs = data.terraform_remote_state.projects.outputs.projects.shared.nonprod.id["shared-clusters"].regions["us-west1"] + + # export metrics to a module-created BigQuery dataset + create_resource_usage_export_dataset = true + + # access docker image GARs in another project + # (self-same cluster project id included by default) + registry_project_ids = [ + "team-app1" + ] + + # who can access the k8s control plane + # adds placeholder bastion network by default + master_authorized_networks = [ + { + cidr_block = "1.2.3.4/32" + display_name = "bastion" + } + ] +} + +``` + +## Complex Example 2 + + This uses a Mozilla-internal Shared VPC Terraform outputs variable for networking. It creates multiple node pools with some defaults changed per node pool. 
+ +```hcl +data "terraform_remote_state" "vpc" { + backend = "gcs" + + config = { + bucket = "my-state-bucket" + prefix = "projects/my-sharedvpc-project" + } +} + +module "gke" { + source = "github.com/mozilla/terraform-modules//google_gke?ref=main" + + name = "my-cluster" + project_id = "shared-clusters" + realm = "nonprod" + region = "us-west1" + shared_vpc_outputs = data.terraform_remote_state.projects.outputs.projects.shared.nonprod.id["shared-clusters"].regions["us-west1"] + + # export metrics to a pre-created BigQuery dataset + resource_usage_export_dataset_id = "cluster_metrics_dataset" + + # Don't use module-defaults node pool + # second node pool has special labels for np 2 only; + # see locals.tf for default values + node_pools = [ + { + name = "nodepool-1" + }, + { + name = "nodepool-2" + machine_type = "n2-standard-2" + max_count = 6 + } + ] + + node_pools_labels = { + nodepool-2 = { + "my-np2-label" = "some-value" + } + } + + # who can access the k8s control plane + # adds placeholder bastion network by default + master_authorized_networks = [ + { + cidr_block = "1.2.3.4/32" + display_name = "bastion" + } + ] +} +``` + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [autoscaling\_profile](#input\_autoscaling\_profile) | Specify the profile to be used for autoscaling. Defaults to 'BALANCED' | `string` | `"BALANCED"` | no | +| [create\_resource\_usage\_export\_dataset](#input\_create\_resource\_usage\_export\_dataset) | The ID of a BigQuery Dataset for using BigQuery as the destination of resource usage export. Defaults to empty string. | `bool` | `false` | no | +| [description](#input\_description) | The description of the cluster | `string` | `null` | no | +| [disable\_snat\_status](#input\_disable\_snat\_status) | Whether the cluster disables default in-node sNAT rules. Defaults to false. 
| `bool` | `false` | no | +| [dns\_cache](#input\_dns\_cache) | The status of the NodeLocal DNSCache addon. | `bool` | `true` | no | +| [enable\_config\_connector](#input\_enable\_config\_connector) | Enable Config Connector Add-On | `bool` | `false` | no | +| [enable\_cost\_allocation](#input\_enable\_cost\_allocation) | Enables Cost Allocation Feature and the cluster name and namespace of your GKE workloads appear in the labels field of the billing export to BigQuery | `bool` | `false` | no | +| [enable\_dataplane](#input\_enable\_dataplane) | Whether to enable dataplane v2 on the cluster. Sets DataPath field. Defaults to false. | `bool` | `false` | no | +| [enable\_dns\_endpoint](#input\_enable\_dns\_endpoint) | Enable external DNS endpoint for control plane access | `bool` | `false` | no | +| [enable\_gcfs](#input\_enable\_gcfs) | Enable Google Container File System (gcfs) image streaming. | `bool` | `true` | no | +| [enable\_high\_throughput\_logging](#input\_enable\_high\_throughput\_logging) | Whether to enable high throughput logging for all node pools. | `bool` | `false` | no | +| [enable\_k8s\_api\_proxy\_ip](#input\_enable\_k8s\_api\_proxy\_ip) | Whether we reserve an internal private ip for the k8s\_api\_proxy. Defaults to false. | `bool` | `false` | no | +| [enable\_network\_egress\_export](#input\_enable\_network\_egress\_export) | Whether to enable network egress metering for this cluster. If enabled, a daemonset will be created in the cluster to meter network egress traffic. Doesn't work with Shared VPC (https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-usage-metering). Defaults to false. | `bool` | `false` | no | +| [enable\_private\_cluster](#input\_enable\_private\_cluster) | Determines whether the cluster is private or public. Defaults to private | `bool` | `true` | no | +| [enable\_public\_cidrs\_access](#input\_enable\_public\_cidrs\_access) | Whether the control plane is open to Google public IPs. Defaults to false. 
| `bool` | `false` | no | +| [enable\_ray\_cluster\_logging](#input\_enable\_ray\_cluster\_logging) | enable ray operator logging | `bool` | `true` | no | +| [enable\_ray\_cluster\_monitoring](#input\_enable\_ray\_cluster\_monitoring) | enable ray operator monitoring | `bool` | `true` | no | +| [enable\_ray\_operator](#input\_enable\_ray\_operator) | enable the ray operator addon | `bool` | `false` | no | +| [enable\_resource\_consumption\_export](#input\_enable\_resource\_consumption\_export) | Whether to enable resource consumption metering on this cluster. When enabled, a table will be created in the resource export BigQuery dataset to store resource consumption data. The resulting table can be joined with the resource usage table or with BigQuery billing export. Defaults to true. | `bool` | `true` | no | +| [enable\_vertical\_pod\_autoscaling](#input\_enable\_vertical\_pod\_autoscaling) | Enables Vertical Pod Autoscaling in the cluster | `bool` | `false` | no | +| [filestore\_csi\_driver](#input\_filestore\_csi\_driver) | The status of the Filestore CSI driver addon, which allows the usage of filestore instance as volumes | `bool` | `false` | no | +| [fuse\_csi\_driver](#input\_fuse\_csi\_driver) | The status of the GCSFuse CSI driver addon, which allows the usage of a gcs bucket as volumes | `bool` | `false` | no | +| [gateway\_api\_enabled](#input\_gateway\_api\_enabled) | Enabled Gateway in the GKE Cluster | `bool` | `false` | no | +| [google\_group\_name](#input\_google\_group\_name) | Name of the Google security group for use with Kubernetes RBAC. Must be in format: gke-security-groups@yourdomain.com | `string` | `null` | no | +| [grant\_registry\_access](#input\_grant\_registry\_access) | Grants created cluster-specific service account storage.objectViewer and artifactregistry.reader roles. | `bool` | `true` | no | +| [kubernetes\_version](#input\_kubernetes\_version) | The Kubernetes version of the masters. 
If set to 'latest' it will pull latest available version. Defaults to 'latest'. | `string` | `"latest"` | no | +| [labels](#input\_labels) | The GCE resource labels (a map of key/value pairs) to be applied to the cluster & other cluster-related resources. Merged with default labels (see locals.tf). | `map(string)` | `{}` | no | +| [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three | `list(object({ name = string, start_time = string, end_time = string }))` | `[]` | no | +| [maintenance\_start\_time](#input\_maintenance\_start\_time) | Time window specified for daily or recurring maintenance operations in RFC3339 format | `string` | `"21:00"` | no | +| [master\_authorized\_networks](#input\_master\_authorized\_networks) | List of master authorized networks that can access the GKE Master Plane. If none are provided, it defaults to known Bastion hosts for the given realm. See locals.tf for defaults. | `list(object({ cidr_block = string, display_name = string }))` |
[
{
"cidr_block": "192.0.0.8/32",
"display_name": "tf module placeholder"
}
]
| no | +| [master\_ipv4\_cidr\_block](#input\_master\_ipv4\_cidr\_block) | The IP range in CIDR notation to use for the hosted master network. Overridden by shared\_vpc\_outputs. | `string` | `null` | no | +| [monitoring\_config\_enable\_components](#input\_monitoring\_config\_enable\_components) | Monitoring configuration for the cluster | `list(string)` |
[
"SYSTEM_COMPONENTS",
"SCHEDULER",
"CONTROLLER_MANAGER",
"STORAGE",
"HPA",
"POD",
"DAEMONSET",
"DEPLOYMENT",
"STATEFULSET"
]
| no | +| [monitoring\_enable\_managed\_prometheus](#input\_monitoring\_enable\_managed\_prometheus) | Configuration for Managed Service for Prometheus. Whether or not the managed collection is enabled. | `bool` | `false` | no | +| [name](#input\_name) | Name of the cluster or application (required). | `string` | n/a | yes | +| [network](#input\_network) | Shared VPC Network (formulated as a URL) wherein the cluster will be created. Overridden by shared\_vpc\_outputs. | `string` | `null` | no | +| [node\_pool\_sa\_roles](#input\_node\_pool\_sa\_roles) | n/a | `list` |
[
"roles/logging.logWriter",
"roles/monitoring.metricWriter",
"roles/monitoring.viewer",
"roles/stackdriver.resourceMetadata.writer"
]
| no | +| [node\_pools](#input\_node\_pools) | Map containing node pools, with each node pool's name (or name\_prefix if `use_name_prefix` is true) being the key and the values being that node pool's configurations. Configurable options per node pool include: `disk_size_gb` (string), `disk_type` (string), `machine_type` (string), `max_count` (number), `max_surge` (number), `max_unavailable` (number), `min_count` (number), `use_name_prefix` (bool). See locals.tf for defaults. | `list(map(string))` |
[
{
"name": "tf-default-node-pool"
}
]
| no | +| [node\_pools\_guest\_accelerator](#input\_node\_pools\_guest\_accelerator) | Map containing node pools guest accelerator. Each node pool's name is the key. See locals.tf for defaults. | `map(map(string))` |
{
"tf-default-node-pool": {}
}
| no | +| [node\_pools\_labels](#input\_node\_pools\_labels) | Map containing node pools non-default labels (as a map of strings). Each key is used as node pool's name prefix. See locals.tf for defaults. | `map(map(string))` |
{
"tf-default-node-pool": {}
}
| no | +| [node\_pools\_oauth\_scopes](#input\_node\_pools\_oauth\_scopes) | Map containing node pools non-default OAuth scopes (as a list). Each node pool's name is the key. See locals.tf for defaults. | `map(list(string))` |
{
"tf-default-node-pool": []
}
| no | +| [node\_pools\_spot\_enabled](#input\_node\_pools\_spot\_enabled) | Map containing node pools spot enabled. Each node pool's name is the key. See locals.tf for defaults. | `map(bool)` |
{
"tf-default-node-pool": false
}
| no | +| [node\_pools\_sysctls](#input\_node\_pools\_sysctls) | Map containing node pools non-default linux node config sysctls (as a map of maps). Each node pool's name is the key. | `map(map(any))` |
{
"tf-default-node-pool": {}
}
| no | +| [node\_pools\_tags](#input\_node\_pools\_tags) | Map containing node pools non-default tags (as a list). Each node pool's name is the key. See locals.tf for defaults. | `map(list(string))` |
{
"tf-default-node-pool": []
}
| no | +| [node\_pools\_taints](#input\_node\_pools\_taints) | Map containing node pools taints. Each node pool's name is the key. See locals.tf for defaults. | `map(list(map(string)))` |
{
"tf-default-node-pool": [
{}
]
}
| no | +| [pods\_ip\_cidr\_range\_name](#input\_pods\_ip\_cidr\_range\_name) | The Name of the IP address range for cluster pods IPs. Overridden by shared\_vpc\_outputs. | `string` | `null` | no | +| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | `null` | no | +| [project\_outputs](#input\_project\_outputs) | Sets cluster-related variables based on a homegrown Project outputs data structure. |
object({
id = string
name = string
number = string
zone_dns_name = string
zone_name = string
})
| `null` | no | +| [realm](#input\_realm) | Name of infrastructure realm (e.g. prod or nonprod). | `string` | n/a | yes | +| [region](#input\_region) | Region where cluster & other regional resources should be provisioned. Defaults to us-central1. | `string` | `null` | no | +| [registry\_project\_ids](#input\_registry\_project\_ids) | Projects holding Google Container Registries. If empty, we use the cluster project. If a service account is created and the `grant_registry_access` variable is set to `true`, the `storage.objectViewer` and `artifactregistry.reader` roles are assigned on these projects. | `list(string)` | `[]` | no | +| [release\_channel](#input\_release\_channel) | The release channel of this cluster. Accepted values are `UNSPECIFIED`, `RAPID`, `REGULAR` and `STABLE`. Defaults to `REGULAR`. | `string` | `"REGULAR"` | no | +| [resource\_usage\_export\_dataset\_id](#input\_resource\_usage\_export\_dataset\_id) | The ID of a BigQuery Dataset for using BigQuery as the destination of resource usage export. Defaults to null. | `string` | `null` | no | +| [service\_account\_id](#input\_service\_account\_id) | Id of the service account to be provisioned, overrides the default 'gke-cluster\_name' value | `string` | `null` | no | +| [service\_subnetworks](#input\_service\_subnetworks) | Service subnetworks associated with Shared VPC, segmented by region |
map(object({
ip_cidr_range = string
network = string
region = string
subnet_name = string
subnetwork = string
subnetwork_id = string
}))
| `null` | no | +| [services\_ip\_cidr\_range\_name](#input\_services\_ip\_cidr\_range\_name) | The Name of the IP address range for cluster services IPs. Overridden by shared\_vpc\_outputs. | `string` | `null` | no | +| [shared\_vpc\_outputs](#input\_shared\_vpc\_outputs) | Sets networking-related variables based on a homegrown Shared VPC Terraform outputs data structure. |
object({
ip_cidr_range = object({
master = string
pod = string
primary = string
service = string
additional = map(string)
})
network = string
project_id = string
region = string
secondary_ip_ranges = object({
pod = object({
ip_cidr_range = string
range_name = string
})
service = object({
ip_cidr_range = string
range_name = string
})
})
additional_ip_ranges = map(map(string))
subnet_name = string
subnetwork = string
subnetwork_id = string
})
| `null` | no | +| [subnetwork](#input\_subnetwork) | Shared VPC Subnetwork (formulated as a URL) wherein the cluster will be created. Overridden by shared\_vpc\_outputs. | `string` | `null` | no | +| [tags](#input\_tags) | The GCE resource tags (a list of strings) to be applied to the cluster & other cluster-related resources. Merged with default tags (see locals.tf). | `list(string)` | `[]` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [ca\_certificate](#output\_ca\_certificate) | CA Certificate for the Cluster | +| [endpoint](#output\_endpoint) | Cluster endpoint | +| [id](#output\_id) | Cluster id | +| [k8s\_api\_proxy\_dns\_name](#output\_k8s\_api\_proxy\_dns\_name) | K8s api proxy dns record | +| [location](#output\_location) | Cluster location (region) | +| [master\_version](#output\_master\_version) | Current Kubernetes master version | +| [name](#output\_name) | Cluster name | +| [node\_pools](#output\_node\_pools) | List of node pools | +| [service\_account](#output\_service\_account) | Cluster Service Account | + # Shared VPC-based GKE Module @@ -165,6 +389,9 @@ module "gke" { | [enable\_network\_egress\_export](#input\_enable\_network\_egress\_export) | Whether to enable network egress metering for this cluster. If enabled, a daemonset will be created in the cluster to meter network egress traffic. Doesn't work with Shared VPC (https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-usage-metering). Defaults to false. | `bool` | `false` | no | | [enable\_private\_cluster](#input\_enable\_private\_cluster) | Determines whether the cluster is private or public. Defaults to private | `bool` | `true` | no | | [enable\_public\_cidrs\_access](#input\_enable\_public\_cidrs\_access) | Whether the control plane is open to Google public IPs. Defaults to false. 
| `bool` | `false` | no | +| [enable\_ray\_cluster\_logging](#input\_enable\_ray\_cluster\_logging) | Enable logging for Ray clusters. Only takes effect when `enable_ray_operator` is true. | `bool` | `true` | no | +| [enable\_ray\_cluster\_monitoring](#input\_enable\_ray\_cluster\_monitoring) | Enable monitoring for Ray clusters. Only takes effect when `enable_ray_operator` is true. | `bool` | `true` | no | +| [enable\_ray\_operator](#input\_enable\_ray\_operator) | enable the ray operator addon | `bool` | `false` | no | | [enable\_resource\_consumption\_export](#input\_enable\_resource\_consumption\_export) | Whether to enable resource consumption metering on this cluster. When enabled, a table will be created in the resource export BigQuery dataset to store resource consumption data. The resulting table can be joined with the resource usage table or with BigQuery billing export. Defaults to true. | `bool` | `true` | no | | [enable\_vertical\_pod\_autoscaling](#input\_enable\_vertical\_pod\_autoscaling) | Enables Vertical Pod Autoscaling in the cluster | `bool` | `false` | no | | [filestore\_csi\_driver](#input\_filestore\_csi\_driver) | The status of the Filestore CSI driver addon, which allows the usage of filestore instance as volumes | `bool` | `false` | no | @@ -219,4 +446,4 @@ module "gke" { | [name](#output\_name) | Cluster name | | [node\_pools](#output\_node\_pools) | List of node pools | | [service\_account](#output\_service\_account) | Cluster Service Account | - + \ No newline at end of file diff --git a/google_gke/cluster.tf b/google_gke/cluster.tf index 2ab9057f..bcfb567c 100644 --- a/google_gke/cluster.tf +++ b/google_gke/cluster.tf @@ -168,6 +168,22 @@ resource "google_container_cluster" "primary" { dns_cache_config { enabled = var.dns_cache } + + dynamic "ray_operator_config" { + for_each = var.enable_ray_operator ? 
[1] : [] + + content { + enabled = var.enable_ray_operator + + ray_cluster_logging_config { + enabled = var.enable_ray_cluster_logging + } + + ray_cluster_monitoring_config { + enabled = var.enable_ray_cluster_monitoring + } + } + } } # Gateway-api diff --git a/google_gke/variables.tf b/google_gke/variables.tf index 57d1739a..ab773619 100644 --- a/google_gke/variables.tf +++ b/google_gke/variables.tf @@ -430,3 +430,21 @@ variable "autoscaling_profile" { type = string default = "BALANCED" } + +variable "enable_ray_operator" { + description = "enable the ray operator addon" + type = bool + default = false +} + +variable "enable_ray_cluster_logging" { + description = "Enable logging for Ray clusters. Only takes effect when `enable_ray_operator` is true." + type = bool + default = true +} + +variable "enable_ray_cluster_monitoring" { + description = "Enable monitoring for Ray clusters. Only takes effect when `enable_ray_operator` is true." + type = bool + default = true +}