Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
## Description
// TODO(user): An in-depth paragraph about your project and overview of use

## Documentation

- [Monitoring and Telemetry Guide](docs/monitoring.md)

## Development

### Prerequisites
Expand Down
53 changes: 53 additions & 0 deletions deploy/operator/templates/_telemetry_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{{- /*
operator.telemetry.namespace renders the namespace used by telemetry resources.
Precedence: .Values.telemetry.namespace, then .Values.wandb.namespace, and
finally the release namespace when neither value is set.
*/ -}}
{{- define "operator.telemetry.namespace" -}}
{{- $ns := .Release.Namespace -}}
{{- if .Values.telemetry.namespace -}}
{{- $ns = .Values.telemetry.namespace -}}
{{- else if .Values.wandb.namespace -}}
{{- $ns = .Values.wandb.namespace -}}
{{- end -}}
{{- $ns -}}
{{- end -}}

{{- /*
operator.telemetry.vmsingleName renders the configured name of the managed
VMSingle instance (.Values.telemetry.managed.vmsingle.name).
*/ -}}
{{- define "operator.telemetry.vmsingleName" -}}
{{- $vmsingle := .Values.telemetry.managed.vmsingle -}}
{{- $vmsingle.name -}}
{{- end -}}

{{- /*
operator.telemetry.vmagentName renders the configured name of the managed
VMAgent instance (.Values.telemetry.managed.vmagent.name).
*/ -}}
{{- define "operator.telemetry.vmagentName" -}}
{{- $vmagent := .Values.telemetry.managed.vmagent -}}
{{- $vmagent.name -}}
{{- end -}}

{{- /*
operator.telemetry.vlsingleName renders the configured name of the managed
VLSingle instance (.Values.telemetry.managed.vlsingle.name).
*/ -}}
{{- define "operator.telemetry.vlsingleName" -}}
{{- $vlsingle := .Values.telemetry.managed.vlsingle -}}
{{- $vlsingle.name -}}
{{- end -}}

{{- /*
operator.telemetry.vtsingleName renders the configured name of the managed
VTSingle instance (.Values.telemetry.managed.vtsingle.name).
*/ -}}
{{- define "operator.telemetry.vtsingleName" -}}
{{- $vtsingle := .Values.telemetry.managed.vtsingle -}}
{{- $vtsingle.name -}}
{{- end -}}

{{- /*
operator.telemetry.otlpGatewayName renders the configured name of the managed
OTLP gateway (.Values.telemetry.managed.otlpGateway.name).
*/ -}}
{{- define "operator.telemetry.otlpGatewayName" -}}
{{- $otlpGateway := .Values.telemetry.managed.otlpGateway -}}
{{- $otlpGateway.name -}}
{{- end -}}

{{- /*
operator.telemetry.metricsEndpoint renders the OTLP metrics ingestion URL.
When telemetry is enabled in "external" mode the user-supplied
.Values.telemetry.external.metricsEndpoint is returned; in every other case
the URL of the in-cluster vmsingle-<name> service on port 8428 is built.
*/ -}}
{{- define "operator.telemetry.metricsEndpoint" -}}
{{- $external := and .Values.telemetry.enabled (eq .Values.telemetry.mode "external") -}}
{{- if $external -}}
{{- .Values.telemetry.external.metricsEndpoint -}}
{{- else -}}
{{- $host := include "operator.telemetry.vmsingleName" . | printf "vmsingle-%s" -}}
{{- printf "http://%s:8428/opentelemetry/v1/metrics" $host -}}
{{- end -}}
{{- end -}}

{{- /*
operator.telemetry.logsEndpoint renders the OTLP logs ingestion URL.
When telemetry is enabled in "external" mode the user-supplied
.Values.telemetry.external.logsEndpoint is returned; in every other case
the URL of the in-cluster vlsingle-<name> service on port 9428 is built.
*/ -}}
{{- define "operator.telemetry.logsEndpoint" -}}
{{- $external := and .Values.telemetry.enabled (eq .Values.telemetry.mode "external") -}}
{{- if $external -}}
{{- .Values.telemetry.external.logsEndpoint -}}
{{- else -}}
{{- $host := include "operator.telemetry.vlsingleName" . | printf "vlsingle-%s" -}}
{{- printf "http://%s:9428/insert/opentelemetry/v1/logs" $host -}}
{{- end -}}
{{- end -}}

{{- /*
operator.telemetry.tracesEndpoint renders the OTLP traces ingestion URL.
When telemetry is enabled in "external" mode the user-supplied
.Values.telemetry.external.tracesEndpoint is returned; in every other case
the URL of the in-cluster vtsingle-<name> service on port 10428 is built.
*/ -}}
{{- define "operator.telemetry.tracesEndpoint" -}}
{{- $external := and .Values.telemetry.enabled (eq .Values.telemetry.mode "external") -}}
{{- if $external -}}
{{- .Values.telemetry.external.tracesEndpoint -}}
{{- else -}}
{{- $host := include "operator.telemetry.vtsingleName" . | printf "vtsingle-%s" -}}
{{- printf "http://%s:10428/insert/opentelemetry/v1/traces" $host -}}
{{- end -}}
{{- end -}}
58 changes: 58 additions & 0 deletions deploy/operator/templates/telemetry-alerting.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{{- if and .Values.telemetry.enabled (eq .Values.telemetry.mode "managed") .Values.telemetry.alerting.enabled }}
{{- $telemetryNamespace := include "operator.telemetry.namespace" . }}
{{- $evalInterval := .Values.telemetry.alerting.evaluationInterval }}
---
# Default alerting rules for the W&B telemetry stack. The rule-group label is
# what the VMAlert ruleSelector in this file matches on.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: wandb-telemetry-default-rules
  namespace: {{ $telemetryNamespace }}
  labels:
    app.kubernetes.io/component: telemetry
    app.kubernetes.io/part-of: wandb
    telemetry.wandb.com/rule-group: default
spec:
  groups:
    - name: wandb-telemetry
      interval: {{ $evalInterval | quote }}
      rules:
        # Fires when a scrape target whose job name contains "wandb-operator"
        # has reported up == 0 for five minutes.
        - alert: WandbOperatorDown
          expr: 'up{job=~".*wandb-operator.*"} == 0'
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: W&B operator metrics target is down
            description: VictoriaMetrics cannot scrape the W&B operator metrics endpoint.
        # Fires when no "up" series exists for any job whose name contains
        # "kafka" for ten minutes.
        - alert: WandbNoKafkaMetrics
          expr: 'absent(up{job=~".*kafka.*"})'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kafka metrics are missing
            description: No Kafka metrics targets were discovered for at least 10 minutes.
{{- /* Base URL of the managed VMSingle service; shared by datasource, remoteRead and remoteWrite. */}}
{{- $vmsingleURL := printf "http://vmsingle-%s:8428" (include "operator.telemetry.vmsingleName" .) }}
---
# VMAlert instance that evaluates the VMRule objects labelled
# telemetry.wandb.com/rule-group=default against the managed VMSingle.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMAlert
metadata:
  name: wandb-vmalert
  namespace: {{ include "operator.telemetry.namespace" . }}
  labels:
    app.kubernetes.io/component: telemetry
    app.kubernetes.io/part-of: wandb
spec:
  datasource:
    url: {{ $vmsingleURL | quote }}
  remoteRead:
    url: {{ $vmsingleURL | quote }}
  remoteWrite:
    # vmalert appends /api/v1/write to the remote-write URL itself, so only the
    # base address is passed here; including the path would double it.
    url: {{ $vmsingleURL | quote }}
  evaluationInterval: {{ .Values.telemetry.alerting.evaluationInterval | quote }}
  # Only rules explicitly labelled for this stack are loaded.
  selectAllByDefault: false
  ruleSelector:
    matchLabels:
      telemetry.wandb.com/rule-group: default
  {{- if .Values.telemetry.alerting.notifier.enabled }}
  notifiers:
    - url: {{ .Values.telemetry.alerting.notifier.target | quote }}
  {{- end }}
{{- end }}
131 changes: 131 additions & 0 deletions deploy/operator/templates/telemetry-otlp-gateway.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
{{- if and .Values.telemetry.enabled (eq .Values.telemetry.mode "managed") .Values.telemetry.managed.otlpGateway.enabled }}
{{- $gateway := .Values.telemetry.managed.otlpGateway }}
{{- $gatewayName := include "operator.telemetry.otlpGatewayName" . }}
{{- $vmURL := printf "http://vmsingle-%s:8428" (include "operator.telemetry.vmsingleName" .) }}
{{- $vlURL := printf "http://vlsingle-%s:9428" (include "operator.telemetry.vlsingleName" .) }}
{{- $vtURL := printf "http://vtsingle-%s:10428" (include "operator.telemetry.vtsingleName" .) }}
---
# OpenTelemetry Collector configuration for the OTLP gateway: one OTLP receiver
# (gRPC + HTTP) fanned out to a dedicated otlphttp exporter per signal.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ $gatewayName }}-config
  namespace: {{ include "operator.telemetry.namespace" . }}
  labels:
    app.kubernetes.io/name: {{ $gatewayName }}
    app.kubernetes.io/component: telemetry
    app.kubernetes.io/part-of: wandb
data:
  collector.yaml: |
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:{{ $gateway.service.grpcPort }}
          http:
            endpoint: 0.0.0.0:{{ $gateway.service.httpPort }}

    processors:
      batch: {}

    exporters:
      otlphttp/vm:
        endpoint: {{ $vmURL }}
        metrics_endpoint: {{ $vmURL }}/opentelemetry/v1/metrics
      otlphttp/vl:
        endpoint: {{ $vlURL }}
        logs_endpoint: {{ $vlURL }}/insert/opentelemetry/v1/logs
      otlphttp/vt:
        endpoint: {{ $vtURL }}
        traces_endpoint: {{ $vtURL }}/insert/opentelemetry/v1/traces

    service:
      telemetry:
        metrics:
          address: 0.0.0.0:{{ $gateway.service.metricsPort }}
      pipelines:
        metrics:
          receivers: [otlp]
          processors: [batch]
          exporters: [otlphttp/vm]
        logs:
          receivers: [otlp]
          processors: [batch]
          exporters: [otlphttp/vl]
        traces:
          receivers: [otlp]
          processors: [batch]
          exporters: [otlphttp/vt]
{{- $gateway := .Values.telemetry.managed.otlpGateway }}
{{- $gatewayName := include "operator.telemetry.otlpGatewayName" . }}
---
# Single-replica OpenTelemetry Collector that reads its configuration from the
# <gateway name>-config ConfigMap mounted at /etc/otelcol.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ $gatewayName }}
  namespace: {{ include "operator.telemetry.namespace" . }}
  labels:
    app.kubernetes.io/name: {{ $gatewayName }}
    app.kubernetes.io/component: telemetry
    app.kubernetes.io/part-of: wandb
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ $gatewayName }}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ $gatewayName }}
        app.kubernetes.io/component: telemetry
        app.kubernetes.io/part-of: wandb
    spec:
      containers:
        - name: otel-collector
          image: {{ printf "%s:%s" $gateway.image.repository $gateway.image.tag | quote }}
          imagePullPolicy: {{ $gateway.image.pullPolicy }}
          args:
            - --config=/etc/otelcol/collector.yaml
          # Port names are referenced by the Service's targetPort fields.
          ports:
            - name: otlp-grpc
              containerPort: {{ $gateway.service.grpcPort }}
              protocol: TCP
            - name: otlp-http
              containerPort: {{ $gateway.service.httpPort }}
              protocol: TCP
            - name: metrics
              containerPort: {{ $gateway.service.metricsPort }}
              protocol: TCP
          volumeMounts:
            - name: config
              mountPath: /etc/otelcol
          {{- with $gateway.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- end }}
      volumes:
        - name: config
          configMap:
            name: {{ $gatewayName }}-config
            items:
              - key: collector.yaml
                path: collector.yaml
{{- $gatewayName := include "operator.telemetry.otlpGatewayName" . }}
{{- $gatewayPorts := .Values.telemetry.managed.otlpGateway.service }}
---
# Service exposing the gateway's OTLP gRPC, OTLP HTTP and self-metrics ports;
# targetPort values refer to the named container ports of the gateway pods.
apiVersion: v1
kind: Service
metadata:
  name: {{ $gatewayName }}
  namespace: {{ include "operator.telemetry.namespace" . }}
  labels:
    app.kubernetes.io/name: {{ $gatewayName }}
    app.kubernetes.io/component: telemetry
    app.kubernetes.io/part-of: wandb
spec:
  selector:
    app.kubernetes.io/name: {{ $gatewayName }}
  ports:
    - name: otlp-grpc
      port: {{ $gatewayPorts.grpcPort }}
      targetPort: otlp-grpc
      protocol: TCP
    - name: otlp-http
      port: {{ $gatewayPorts.httpPort }}
      targetPort: otlp-http
      protocol: TCP
    - name: metrics
      port: {{ $gatewayPorts.metricsPort }}
      targetPort: metrics
      protocol: TCP
{{- end }}
86 changes: 86 additions & 0 deletions deploy/operator/templates/telemetry-perses.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
{{- if and .Values.telemetry.enabled .Values.telemetry.perses.enabled }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: perses-config
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yank out the Perses stuff; we want to use Grafana directly.

namespace: {{ include "operator.telemetry.namespace" . }}
labels:
app.kubernetes.io/name: perses
app.kubernetes.io/component: telemetry
app.kubernetes.io/part-of: wandb
data:
config.yaml: |
http:
bind: 0.0.0.0:{{ .Values.telemetry.perses.service.port }}
database:
file:
folder: /perses
security:
enableAuth: false
telemetry:
metricsEndpoint: {{ include "operator.telemetry.metricsEndpoint" . }}
logsEndpoint: {{ include "operator.telemetry.logsEndpoint" . }}
tracesEndpoint: {{ include "operator.telemetry.tracesEndpoint" . }}
{{- $perses := .Values.telemetry.perses }}
---
# Single-replica Perses server. Configuration comes from the perses-config
# ConfigMap; the /perses data directory is backed by an emptyDir volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: perses
  namespace: {{ include "operator.telemetry.namespace" . }}
  labels:
    app.kubernetes.io/name: perses
    app.kubernetes.io/component: telemetry
    app.kubernetes.io/part-of: wandb
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: perses
  template:
    metadata:
      labels:
        app.kubernetes.io/name: perses
        app.kubernetes.io/component: telemetry
        app.kubernetes.io/part-of: wandb
    spec:
      containers:
        - name: perses
          image: {{ printf "%s:%s" $perses.image.repository $perses.image.tag | quote }}
          imagePullPolicy: {{ $perses.image.pullPolicy }}
          args:
            - serve
            - --config=/etc/perses/config/config.yaml
          ports:
            - name: http
              containerPort: {{ $perses.service.port }}
          volumeMounts:
            - name: config
              mountPath: /etc/perses/config
            - name: data
              mountPath: /perses
      volumes:
        - name: config
          configMap:
            name: perses-config
        - name: data
          emptyDir: {}
{{- $persesPort := .Values.telemetry.perses.service.port }}
---
# Service exposing the Perses HTTP port; targetPort refers to the container
# port named "http" on pods labelled app.kubernetes.io/name=perses.
apiVersion: v1
kind: Service
metadata:
  name: perses
  namespace: {{ include "operator.telemetry.namespace" . }}
  labels:
    app.kubernetes.io/name: perses
    app.kubernetes.io/component: telemetry
    app.kubernetes.io/part-of: wandb
spec:
  selector:
    app.kubernetes.io/name: perses
  ports:
    - name: http
      port: {{ $persesPort }}
      targetPort: http
      protocol: TCP
{{- end }}
Loading
Loading