Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ help: ## Display this help.

.PHONY: manifests
manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
$(CONTROLLER_GEN) rbac:roleName=manager-role crd:allowDangerousTypes=true,generateEmbeddedObjectMeta=true webhook paths="{./api/v1,./api/v2,./internal/controller/...,./internal/webhook/...}" output:crd:artifacts:config=config/crd/bases
$(CONTROLLER_GEN) rbac:roleName=manager-role crd:allowDangerousTypes=true,generateEmbeddedObjectMeta=true,maxDescLen=0 webhook paths="{./api/v1,./api/v2,./internal/controller/...,./internal/webhook/...}" output:crd:artifacts:config=config/crd/bases

.PHONY: generate
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
## Description
// TODO(user): An in-depth paragraph about your project and overview of use

## Documentation

- [Monitoring and Telemetry Guide](docs/monitoring.md)

## Development

### Prerequisites
Expand Down
188 changes: 81 additions & 107 deletions Tiltfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ GROUP = "apps"
VERSION = "v1"
KIND = "wandb"
IMG = 'controller:latest'
CONTROLLERGEN = 'rbac:roleName=manager-role crd:allowDangerousTypes=true,generateEmbeddedObjectMeta=true webhook paths="{./api/v1,./api/v2,./internal/controller/...}" output:crd:artifacts:config=config/crd/bases'
CONTROLLERGEN = 'rbac:roleName=manager-role crd:allowDangerousTypes=true,generateEmbeddedObjectMeta=true,maxDescLen=0 webhook paths="{./api/v1,./api/v2,./internal/controller/...}" output:crd:artifacts:config=config/crd/bases'
DISABLE_SECURITY_CONTEXT = True

def manifests():
Expand Down Expand Up @@ -92,15 +92,18 @@ local_resource(
labels=["Helm-Repos"],
)

third_party_operator_flags = [
'--set=wandb-operator.enabled=false',
'--set=telemetry.enabled=false',
'--create-namespace',
]

helm_resource(
'third-party-operators',
chart='./deploy/operator',
resource_deps=['helm-dep-update'],
namespace='wandb-operator',
flags=[
'--set=wandb-operator.enabled=false',
'--create-namespace',
],
flags=third_party_operator_flags,
labels=["Third-Party-Operators"],
)

Expand All @@ -119,17 +122,17 @@ k8s_resource(
labels=["Operator-Resources"],
)

k8s_resource(
new_name='Application CRD',
objects=['applications.apps.wandb.com:customresourcedefinition'],
local_resource(
'Application CRD',
'kustomize build config/crd | kubectl apply --server-side=true --force-conflicts -f -',
resource_deps=["manifests", "generate"],
labels=["Operator-Resources"],
)

k8s_resource(
new_name='Wandb CRD',
objects=['weightsandbiases.apps.wandb.com:customresourcedefinition'],
resource_deps=["manifests", "generate"],
local_resource(
'Wandb CRD',
'echo "Wandb CRD is applied by Application CRD server-side apply step."',
resource_deps=["Application CRD"],
# wandb-operator is disabled in this Tilt setup; label is for 3rd party operator CRD grouping only
labels=["Operator-Resources", "third-party-operators"],
)
Expand Down Expand Up @@ -195,8 +198,22 @@ local_resource(
)

if settings.get("installWandb"):
crdName = read_yaml('./hack/testing-manifests/wandb/' + settings.get('wandbCRD') + '.yaml')['metadata']['name']
k8s_yaml('./hack/testing-manifests/wandb/' + settings.get('wandbCRD') + '.yaml')
wandbManifestPath = './hack/testing-manifests/wandb/' + settings.get('wandbCRD') + '.yaml'
wandbManifest = read_yaml(wandbManifestPath)

# FIXME: This hack shouldn't be needed
internalServiceAuthEnabled = settings.get("internalServiceAuth")
if internalServiceAuthEnabled != None:
if not wandbManifest.get('spec'):
wandbManifest['spec'] = {}
if not wandbManifest['spec'].get('wandb'):
wandbManifest['spec']['wandb'] = {}
wandbManifest['spec']['wandb']['internalServiceAuth'] = {
'enabled': internalServiceAuthEnabled,
}

crdName = wandbManifest['metadata']['name']
k8s_yaml(encode_yaml(wandbManifest))
k8s_resource(
new_name='Wandb',
objects=[
Expand All @@ -205,6 +222,14 @@ if settings.get("installWandb"):
resource_deps=["webhook-ready"],
labels=["Operator-Resources"],
)
local_resource(
'Wandb-PortForward-Nginx',
cmd='echo "Ensuring W&B nginx endpoint is running"',
serve_cmd='sh -c "until kubectl get svc -n default ' + crdName + '-nginx-proxy >/dev/null 2>&1; do sleep 2; done; exec kubectl port-forward -n default svc/' + crdName + '-nginx-proxy 8080:8080"',
resource_deps=["Wandb"],
links=[link('http://localhost:8080', 'W&B nginx')],
labels=["Operator-Resources"],
)

if settings.get("installTelemetry"):
local_resource(
Expand All @@ -229,113 +254,62 @@ if settings.get("installTelemetry"):
labels=["Telemetry"],
)
local_resource(
'vm-operator-ready',
'kubectl rollout status deployment ' +
'-n wandb-operator ' +
'-l app.kubernetes.io/name=victoria-metrics-operator ' +
'--timeout=120s',
resource_deps=["vm-crds-ready"],
labels=["Telemetry"],
)
k8s_yaml('./hack/testing-manifests/telemetry/victoria-dev.yaml')
k8s_resource(
new_name='Victoria-Metrics',
objects=[
'victoria-instance:vmsingle',
'victoria-agent:vmagent',
],
resource_deps=["vm-operator-ready"],
labels=["Telemetry"],
)
k8s_resource(
new_name='Victoria-Logs',
objects=[
'victoria-logs:vlsingle',
],
resource_deps=["vm-operator-ready"],
labels=["Telemetry"],
)
k8s_resource(
new_name='Victoria-Traces',
objects=[
'victoria-traces:vtsingle',
],
resource_deps=["vm-operator-ready"],
labels=["Telemetry"],
)
k8s_yaml('./hack/testing-manifests/telemetry/wandb-otel-connection-dev.yaml')
k8s_resource(
new_name='OTEL-Connection-Secret',
objects=[
'wandb-otel-connection:secret',
],
resource_deps=["Victoria-Metrics", "Victoria-Logs", "Victoria-Traces"],
labels=["Telemetry"],
)
k8s_yaml('./hack/testing-manifests/telemetry/kube-metrics-dev.yaml')
k8s_resource(
new_name='Kubernetes-Metrics',
objects=[
'kubelet-cadvisor:vmnodescrape',
],
# vm-crds-ready transitively satisfied via Victoria-Metrics → vm-operator-ready → vm-crds-ready
resource_deps=["Victoria-Metrics"],
'Telemetry-Stack',
cmd='helm upgrade --install third-party-operators ./deploy/operator ' +
'--namespace wandb-operator --create-namespace ' +
'--set=wandb-operator.enabled=false ' +
'--set=telemetry.enabled=true ' +
'--set=telemetry.mode=managed ' +
'--set=telemetry.namespace=default ' +
'--set=telemetry.ui.grafana.enabled=true',
resource_deps=["vm-crds-ready", "grafana-crds-ready"],
labels=["Telemetry"],
)
k8s_yaml('./hack/testing-manifests/telemetry/operator-metrics-dev.yaml')
k8s_resource(
new_name='Operator-Metrics',
objects=[
'wandb-operator:vmservicescrape',
'clickhouse-operator:vmservicescrape',
'grafana-operator:vmservicescrape',
'victoria-metrics-operator:vmservicescrape',
],
# vm-crds-ready transitively satisfied via Victoria-Metrics → vm-operator-ready → vm-crds-ready
resource_deps=["Victoria-Metrics"],

local_resource(
'Telemetry-PortForward-Grafana',
cmd='echo "Ensuring Grafana port-forward is running"',
serve_cmd='sh -c "until kubectl get svc -n default grafana-service >/dev/null 2>&1; do sleep 2; done; exec kubectl port-forward -n default svc/grafana-service 3000:3000"',
resource_deps=["Telemetry-Stack"],
links=[link('http://localhost:3000', 'Grafana')],
labels=["Telemetry"],
)
k8s_yaml('./hack/testing-manifests/telemetry/infra-metrics-dev.yaml')
k8s_resource(
new_name='Infrastructure-Metrics',
objects=[
'mysql-pxc:vmpodscrape',
'mysql-proxysql:vmpodscrape',
'kafka-brokers:vmpodscrape',
'minio-tenant:vmpodscrape',
'redis:vmpodscrape',
'clickhouse-metrics:service',
'clickhouse:vmservicescrape',
],
# vm-crds-ready transitively satisfied via Victoria-Metrics → vm-operator-ready → vm-crds-ready
resource_deps=["Victoria-Metrics"],
local_resource(
'Telemetry-PortForward-VictoriaMetrics',
cmd='echo "Ensuring VictoriaMetrics port-forward is running"',
serve_cmd='sh -c "until kubectl get svc -n default vmsingle-victoria-instance >/dev/null 2>&1; do sleep 2; done; exec kubectl port-forward -n default svc/vmsingle-victoria-instance 8428:8428"',
resource_deps=["Telemetry-Stack"],
links=[link('http://localhost:8428/vmui/', 'VictoriaMetrics UI')],
labels=["Telemetry"],
)
k8s_yaml('./hack/testing-manifests/telemetry/grafana-dev.yaml')
k8s_resource(
new_name='Grafana',
objects=[
'grafana:grafana',
],
resource_deps=["grafana-crds-ready"],
port_forwards="3000:3000",
local_resource(
'Telemetry-PortForward-VictoriaLogs',
cmd='echo "Ensuring VictoriaLogs port-forward is running"',
serve_cmd='sh -c "until kubectl get svc -n default vlsingle-victoria-logs >/dev/null 2>&1; do sleep 2; done; exec kubectl port-forward -n default svc/vlsingle-victoria-logs 9428:9428"',
resource_deps=["Telemetry-Stack"],
links=[link('http://localhost:9428', 'VictoriaLogs')],
labels=["Telemetry"],
)
k8s_resource(
new_name='Grafana-Datasources',
objects=[
'victoria-metrics:grafanadatasource',
'victoria-logs:grafanadatasource',
'victoria-traces:grafanadatasource',
],
resource_deps=["grafana-crds-ready", "Grafana", "Victoria-Metrics", "Victoria-Logs", "Victoria-Traces"],
local_resource(
'Telemetry-PortForward-VictoriaTraces',
cmd='echo "Ensuring VictoriaTraces port-forward is running"',
serve_cmd='sh -c "until kubectl get svc -n default vtsingle-victoria-traces >/dev/null 2>&1; do sleep 2; done; exec kubectl port-forward -n default svc/vtsingle-victoria-traces 10428:10428"',
resource_deps=["Telemetry-Stack"],
links=[link('http://localhost:10428', 'VictoriaTraces')],
labels=["Telemetry"],
)

manager_entrypoint = ['/manager', '--log-format=' + settings['logFormat']]
if settings.get("installTelemetry"):
manager_entrypoint += [
'--telemetry-enabled=true',
'--telemetry-mode=managed',
]

docker_build_with_restart(
IMG, '.',
dockerfile_contents=DOCKERFILE,
entrypoint=['/manager', '--log-format=' + settings['logFormat']],
entrypoint=manager_entrypoint,
only=['./tilt_bin/manager', './hack/testing-manifests/server-manifest'],
live_update=[
sync('./tilt_bin/manager', '/manager'),
Expand Down
64 changes: 57 additions & 7 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ import (
ctrlwh "sigs.k8s.io/controller-runtime/pkg/webhook"

"github.com/wandb/operator/internal/controller"
controllerv2 "github.com/wandb/operator/internal/controller/v2"

appsv2 "github.com/wandb/operator/api/v2"
webhookv2 "github.com/wandb/operator/internal/webhook/v2"
Expand Down Expand Up @@ -92,6 +93,15 @@ func main() {
var tlsOpts []func(*tls.Config)
var deployerAPI, isolationNamespaces string
var debug, airgapped, enableV2, enableWebhooks, enableRollouts bool
var telemetryEnabled bool
var telemetryMode string
var telemetryMetricsEndpoint, telemetryLogsEndpoint, telemetryTracesEndpoint string
var telemetryManagedNamespace string
var telemetryManagedVMSingleName, telemetryManagedVLSingleName, telemetryManagedVTSingleName string
var telemetryManagedOTLPGatewayEnabled bool
var telemetryManagedOTLPGatewayName string
var telemetryManagedOTLPGatewayHTTPPort int
var telemetryOTelSecretName, telemetryOTelProtocol, telemetryOTelServiceName, telemetryOTelResourceAttributes string

flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
Expand Down Expand Up @@ -120,6 +130,22 @@ func main() {
flag.BoolVar(&enableV2, "enable-v2", true, "Use V2 of WandB CRD")
flag.BoolVar(&enableWebhooks, "enable-webhooks", true, "Enable webhooks")
flag.BoolVar(&enableRollouts, "enable-rollouts", false, "Enable Argo Rollout Support")
flag.BoolVar(&telemetryEnabled, "telemetry-enabled", false, "Enable telemetry endpoint reconciliation for W&B applications")
flag.StringVar(&telemetryMode, "telemetry-mode", "managed", "Telemetry mode: managed or external")
flag.StringVar(&telemetryMetricsEndpoint, "telemetry-metrics-endpoint", "", "OTEL metrics endpoint when using telemetry mode external")
flag.StringVar(&telemetryLogsEndpoint, "telemetry-logs-endpoint", "", "OTEL logs endpoint when using telemetry mode external")
flag.StringVar(&telemetryTracesEndpoint, "telemetry-traces-endpoint", "", "OTEL traces endpoint when using telemetry mode external")
flag.StringVar(&telemetryManagedNamespace, "telemetry-managed-namespace", "", "Namespace where managed telemetry services run")
flag.StringVar(&telemetryManagedVMSingleName, "telemetry-managed-vmsingle-name", "victoria-instance", "Name of the managed VMSingle resource")
flag.StringVar(&telemetryManagedVLSingleName, "telemetry-managed-vlsingle-name", "victoria-logs", "Name of the managed VLSingle resource")
flag.StringVar(&telemetryManagedVTSingleName, "telemetry-managed-vtsingle-name", "victoria-traces", "Name of the managed VTSingle resource")
flag.BoolVar(&telemetryManagedOTLPGatewayEnabled, "telemetry-managed-otlp-gateway-enabled", true, "Enable a managed OTLP gateway service for telemetry ingest")
flag.StringVar(&telemetryManagedOTLPGatewayName, "telemetry-managed-otlp-gateway-name", "victoria-otlp-gateway", "Name of the managed OTLP gateway Service")
flag.IntVar(&telemetryManagedOTLPGatewayHTTPPort, "telemetry-managed-otlp-gateway-http-port", 4318, "HTTP OTLP port exposed by the managed OTLP gateway Service")
flag.StringVar(&telemetryOTelSecretName, "telemetry-otel-secret-name", "wandb-otel-connection", "Name of the OTEL connection secret managed by the operator")
flag.StringVar(&telemetryOTelProtocol, "telemetry-otel-protocol", "http/protobuf", "OTEL exporter protocol written to the OTEL connection secret")
flag.StringVar(&telemetryOTelServiceName, "telemetry-otel-service-name", "wandb-service", "OTEL service name written to the OTEL connection secret")
flag.StringVar(&telemetryOTelResourceAttributes, "telemetry-otel-resource-attributes", "", "OTEL resource attributes written to the OTEL connection secret")

var logLevel = flag.String("log-level", "info", "Log level: debug, info, warn, error")
var logFormat = flag.String("log-format", "text", "Log format: text or json")
Expand Down Expand Up @@ -160,6 +186,29 @@ func main() {
utilruntime.Must(argov1alpha1.AddToScheme(scheme))
}

telemetryConfig := controllerv2.DefaultTelemetryRuntimeConfig()
telemetryConfig.Enabled = telemetryEnabled
telemetryConfig.Mode = controllerv2.TelemetryMode(strings.ToLower(telemetryMode))
telemetryConfig.External.MetricsEndpoint = telemetryMetricsEndpoint
telemetryConfig.External.LogsEndpoint = telemetryLogsEndpoint
telemetryConfig.External.TracesEndpoint = telemetryTracesEndpoint
telemetryConfig.Managed.Namespace = telemetryManagedNamespace
telemetryConfig.Managed.VMSingleName = telemetryManagedVMSingleName
telemetryConfig.Managed.VLSingleName = telemetryManagedVLSingleName
telemetryConfig.Managed.VTSingleName = telemetryManagedVTSingleName
telemetryConfig.Managed.OTLPGateway.Enabled = telemetryManagedOTLPGatewayEnabled
telemetryConfig.Managed.OTLPGateway.Name = telemetryManagedOTLPGatewayName
telemetryConfig.Managed.OTLPGateway.HTTPPort = telemetryManagedOTLPGatewayHTTPPort
telemetryConfig.OTel.SecretName = telemetryOTelSecretName
telemetryConfig.OTel.Protocol = telemetryOTelProtocol
telemetryConfig.OTel.ServiceName = telemetryOTelServiceName
telemetryConfig.OTel.ResourceAttributes = telemetryOTelResourceAttributes
telemetryConfig.Normalize()
if err := telemetryConfig.Validate(); err != nil {
setupLog.Error(err, "invalid telemetry configuration")
os.Exit(1)
}

// If the enable-http2 flag is false (the default), HTTP/2 is disabled
// because of its known vulnerabilities. More specifically, disabling HTTP/2
// protects against the HTTP/2 Stream Cancellation and
Expand Down Expand Up @@ -285,13 +334,14 @@ func main() {
}

if err = (&controller.WeightsAndBiasesReconciler{
IsAirgapped: airgapped,
Recorder: mgr.GetEventRecorderFor("weightsandbiases"),
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
DeployerClient: &deployer.DeployerClient{DeployerAPI: deployerAPI},
Debug: debug,
EnableV2: enableV2,
IsAirgapped: airgapped,
Recorder: mgr.GetEventRecorderFor("weightsandbiases"),
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
DeployerClient: &deployer.DeployerClient{DeployerAPI: deployerAPI},
Debug: debug,
EnableV2: enableV2,
TelemetryConfig: telemetryConfig,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "WeightsAndBiases")
os.Exit(1)
Expand Down
Loading
Loading