From 1db8c205c4b2141cf517d4bba82e7c9476e9f9f2 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Mon, 6 Oct 2025 10:09:15 +0200 Subject: [PATCH 01/34] feat(promtail): add remote write support to Promtail role The Promtail role now always executes its setup tasks and the config template conditionally includes a Loki `remote_write` client when `loki_remote_write` is defined, enabling log pushing to a remote Loki instance. Additionally, the `force` parameter in the download task was changed from the string "no" to the boolean `false` for correct usage. --- .../paas/roles/promtail/tasks/build.yml | 2 +- .../paas/roles/promtail/tasks/main.yml | 71 +++++++++---------- .../roles/promtail/templates/config.yaml.j2 | 4 ++ 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/ansible/playbooks/paas/roles/promtail/tasks/build.yml b/ansible/playbooks/paas/roles/promtail/tasks/build.yml index c5b28439..f3b6b120 100644 --- a/ansible/playbooks/paas/roles/promtail/tasks/build.yml +++ b/ansible/playbooks/paas/roles/promtail/tasks/build.yml @@ -14,7 +14,7 @@ url: "{{ upstream_file_url }}" dest: "{{ build_work_dir }}/download/" mode: '0644' - force: no + force: false register: download_result - name: Promtail | Unarchive GitHub release diff --git a/ansible/playbooks/paas/roles/promtail/tasks/main.yml b/ansible/playbooks/paas/roles/promtail/tasks/main.yml index 69fd20f6..21864a15 100644 --- a/ansible/playbooks/paas/roles/promtail/tasks/main.yml +++ b/ansible/playbooks/paas/roles/promtail/tasks/main.yml @@ -1,43 +1,40 @@ --- -- name: Promtail | Service is enabled - when: loki_remote_write is not defined - block: - - name: Promtail | Include upstream variables - ansible.builtin.include_vars: upstream.yml +- name: Promtail | Include upstream variables + ansible.builtin.include_vars: upstream.yml - - name: Promtail | Set custom variables - ansible.builtin.set_fact: - image_version: "{{ latest_version }}" - image_name: "{{ image.name }}" +- name: Promtail | Set custom variables + 
ansible.builtin.set_fact: + image_version: "{{ latest_version }}" + image_name: "{{ image.name }}" - - name: Promtail | Get binary - include_tasks: build.yml - when: ansible_local[image.name] is not defined or ansible_local[image.name] != latest_version +- name: Promtail | Get binary + ansible.builtin.include_tasks: build.yml + when: ansible_local[image.name] is not defined or ansible_local[image.name] != latest_version - - name: Promtail | Create custom directories - ansible.builtin.file: - dest: "{{ item }}" - state: directory - recurse: true - with_items: - - /etc/promtail - - /var/lib/promtail +- name: Promtail | Create custom directories + ansible.builtin.file: + dest: "{{ item }}" + state: directory + recurse: true + with_items: + - /etc/promtail + - /var/lib/promtail - - name: Promtail | Copy templates - ansible.builtin.template: - src: "{{ item.src }}" - dest: "{{ item.dest }}" - mode: 0644 - owner: root - group: root - loop: - - src: default.j2 - dest: /etc/default/promtail - - src: config.yaml.j2 - dest: /etc/promtail/config.yaml - - src: service.j2 - dest: /etc/systemd/system/promtail.service - notify: Restart promtail +- name: Promtail | Copy templates + ansible.builtin.template: + src: "{{ item.src }}" + dest: "{{ item.dest }}" + mode: '0644' + owner: root + group: root + loop: + - src: default.j2 + dest: /etc/default/promtail + - src: config.yaml.j2 + dest: /etc/promtail/config.yaml + - src: service.j2 + dest: /etc/systemd/system/promtail.service + notify: Restart promtail - - name: Promtail | Flush handlers - ansible.builtin.meta: flush_handlers +- name: Promtail | Flush handlers + ansible.builtin.meta: flush_handlers diff --git a/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 b/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 index ac2220d3..7e99073a 100644 --- a/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 +++ b/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 @@ -7,11 +7,15 @@ server: 
positions: filename: /var/lib/promtail/positions.yaml +{% if loki_remote_write is defined %} clients: - url: {{ loki_remote_write.url }}/api/prom/push basic_auth: username: {{ loki_remote_write.login }} password: {{ loki_remote_write.password }} +{% else %} +clients: [] +{% endif %} scrape_configs: - job_name: system From 837186d42bc327f422455e4aff519147f943de57 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 15:53:28 +0200 Subject: [PATCH 02/34] feat(coredns): add nomad cluster mode support to coredns role Switch the coredns role to use official CoreDNS releases instead of building from source, create a dedicated system user and group, and update the Corefile template to obtain the Nomad management token from the primary Nomad master node. Documentation is extended with a Nomad cluster mode section and the golang role is removed from the playbook. --- ansible/playbooks/paas/coredns.yml | 1 - .../playbooks/paas/roles/coredns/README.md | 4 +++ .../paas/roles/coredns/tasks/build.yml | 23 ++-------------- .../paas/roles/coredns/tasks/main.yml | 18 ++++++++++--- .../paas/roles/coredns/templates/Corefile.j2 | 5 ++-- .../paas/roles/coredns/templates/postinst.j2 | 27 ------------------- .../paas/roles/coredns/templates/prerm.j2 | 13 --------- .../paas/roles/coredns/vars/main.yml | 12 ++++----- .../paas/roles/coredns/vars/upstream.yml | 2 +- 9 files changed, 30 insertions(+), 75 deletions(-) delete mode 100644 ansible/playbooks/paas/roles/coredns/templates/postinst.j2 delete mode 100644 ansible/playbooks/paas/roles/coredns/templates/prerm.j2 diff --git a/ansible/playbooks/paas/coredns.yml b/ansible/playbooks/paas/coredns.yml index 284f0a85..85dff61f 100644 --- a/ansible/playbooks/paas/coredns.yml +++ b/ansible/playbooks/paas/coredns.yml @@ -5,5 +5,4 @@ gather_facts: true become: true roles: - - golang - coredns diff --git a/ansible/playbooks/paas/roles/coredns/README.md b/ansible/playbooks/paas/roles/coredns/README.md index e03dc9f4..55beefb1 100644 
--- a/ansible/playbooks/paas/roles/coredns/README.md +++ b/ansible/playbooks/paas/roles/coredns/README.md @@ -1,3 +1,7 @@ # Role: `coredns` ## How to use this Ansible role? + +### nomad cluster mode + +nomad_primary_master_node: Set a primary nomad master node to get nomad_management_token \ No newline at end of file diff --git a/ansible/playbooks/paas/roles/coredns/tasks/build.yml b/ansible/playbooks/paas/roles/coredns/tasks/build.yml index 86cc7946..fc9b0c1c 100644 --- a/ansible/playbooks/paas/roles/coredns/tasks/build.yml +++ b/ansible/playbooks/paas/roles/coredns/tasks/build.yml @@ -14,37 +14,18 @@ url: "{{ upstream_file_url }}" dest: "{{ build_work_dir }}/download/" mode: '0644' - force: no register: download_result -- name: Coredns | Git checkout - ansible.builtin.git: - repo: https://github.com/coredns/coredns - dest: "{{ build_work_dir }}/download/coredns" - version: master - force: true - - name: Coredns | Unarchive GitHub release ansible.builtin.unarchive: - src: "{{ build_work_dir }}/download/{{ image.upstream.repo }}-{{ upstream_file_name }}" + src: "{{ build_work_dir }}/download/{{ upstream_file_name }}" dest: "{{ build_work_dir }}/download" remote_src: true when: download_result.changed -- name: Coredns | Build binary - ansible.builtin.shell: - cmd: "{{ item }}" - chdir: "{{ build_work_dir }}/download/coredns" - environment: - PATH: "/usr/local/go/bin:{{ ansible_env.PATH }}" - loop: - - echo "nomad:github.com/ituoga/coredns-nomad" >> plugin.cfg - - go mod edit -replace github.com/ituoga/coredns-nomad={{ build_work_dir }}/download/coredns-nomad-{{ latest_version }} - - make gen coredns - - name: Coredns | Copy binary ansible.builtin.copy: - src: "{{ build_work_dir }}/download/coredns/{{ image.upstream.binary }}" + src: "{{ build_work_dir }}/download/coredns" dest: /usr/local/bin/coredns owner: root group: root diff --git a/ansible/playbooks/paas/roles/coredns/tasks/main.yml b/ansible/playbooks/paas/roles/coredns/tasks/main.yml index a7b94816..8f5799e0 
100644 --- a/ansible/playbooks/paas/roles/coredns/tasks/main.yml +++ b/ansible/playbooks/paas/roles/coredns/tasks/main.yml @@ -3,13 +3,25 @@ ansible.builtin.include_vars: upstream.yml - name: Coredns | Get binary - include_tasks: build.yml + ansible.builtin.include_tasks: build.yml when: ansible_local[image.name] is not defined or ansible_local[image.name] != latest_version +- name: Coredns | Create group + ansible.builtin.group: + name: coredns + system: true + +- name: Coredns | Create user + ansible.builtin.user: + name: coredns + create_home: false + system: true + - name: Coredns | Create custom directories ansible.builtin.file: dest: "{{ item.dest }}" state: directory + mode: '0755' owner: "{{ item.owner | default('root') }}" group: "{{ item.group | default('root') }}" loop: @@ -19,7 +31,7 @@ ansible.builtin.template: src: "{{ item.src }}" dest: "{{ item.dest }}" - mode: 0600 + mode: '0640' owner: coredns group: coredns loop: @@ -46,5 +58,5 @@ Cache=no DNSStubListenerExtra=172.17.0.1:53 dest: /etc/systemd/resolved.conf.d/coredns.conf - mode: 0644 + mode: '0644' notify: Restart systemd-resolved diff --git a/ansible/playbooks/paas/roles/coredns/templates/Corefile.j2 b/ansible/playbooks/paas/roles/coredns/templates/Corefile.j2 index 491b7723..5cf6dbf6 100644 --- a/ansible/playbooks/paas/roles/coredns/templates/Corefile.j2 +++ b/ansible/playbooks/paas/roles/coredns/templates/Corefile.j2 @@ -4,9 +4,8 @@ service.nomad.:1053 { #debug #log nomad { - zone service.nomad - address https://127.0.0.1:4646 - token {{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }} + address https://{{ hostvars[nomad_primary_master_node | default(inventory_hostname)]['ansible_ens3']['ipv4']['address'] }}:4646 + token {{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }} ttl 10 } prometheus 127.0.0.1:9153 diff --git 
a/ansible/playbooks/paas/roles/coredns/templates/postinst.j2 b/ansible/playbooks/paas/roles/coredns/templates/postinst.j2 deleted file mode 100644 index 6284ae86..00000000 --- a/ansible/playbooks/paas/roles/coredns/templates/postinst.j2 +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -set -e - -SERVICE_NAME="{{ image.name }}" -USER_NAME="coredns" -GROUP_NAME="coredns" - -if ! getent group "$GROUP_NAME" >/dev/null; then - echo "Creating group $GROUP_NAME..." - groupadd --system $GROUP_NAME -fi - -if ! id -u "$USER_NAME" >/dev/null 2>&1; then - echo "Creating user $USER_NAME..." - useradd --system --gid $GROUP_NAME --shell /usr/sbin/nologin --no-create-home $USER_NAME -fi - - -if command -v systemctl >/dev/null 2>&1; then - echo "Reloading systemd configuration..." - systemctl daemon-reload - - echo "Enabling and starting $SERVICE_NAME service..." - systemctl enable "$SERVICE_NAME" - systemctl start "$SERVICE_NAME" -fi diff --git a/ansible/playbooks/paas/roles/coredns/templates/prerm.j2 b/ansible/playbooks/paas/roles/coredns/templates/prerm.j2 deleted file mode 100644 index f21147ef..00000000 --- a/ansible/playbooks/paas/roles/coredns/templates/prerm.j2 +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -set -e - -# Variables -SERVICE_NAME="{{ image.name }}" - -# Stop and disable the service -if command -v systemctl >/dev/null 2>&1; then - echo "Stopping and disabling $SERVICE_NAME service..." 
- systemctl stop "$SERVICE_NAME" || true - systemctl disable "$SERVICE_NAME" || true -fi diff --git a/ansible/playbooks/paas/roles/coredns/vars/main.yml b/ansible/playbooks/paas/roles/coredns/vars/main.yml index 11175960..262e254c 100644 --- a/ansible/playbooks/paas/roles/coredns/vars/main.yml +++ b/ansible/playbooks/paas/roles/coredns/vars/main.yml @@ -3,14 +3,14 @@ image: build: false upstream: source: github - user: ituoga - repo: coredns-nomad - type: archive - format: tar.gz - file: VERSION.FORMAT + user: coredns + repo: coredns + type: release + format: tgz + file: coredns_VERSION_OS_ARCH.FORMAT os: linux binary: coredns labels: {} name: coredns -build_work_dir: /tmp/coredns-nomad +build_work_dir: /tmp/coredns diff --git a/ansible/playbooks/paas/roles/coredns/vars/upstream.yml b/ansible/playbooks/paas/roles/coredns/vars/upstream.yml index f6893ce2..036aa64c 100644 --- a/ansible/playbooks/paas/roles/coredns/vars/upstream.yml +++ b/ansible/playbooks/paas/roles/coredns/vars/upstream.yml @@ -1,4 +1,4 @@ --- latest_version: "{{ (lookup('url', 'https://api.github.com/repos/{{ image.upstream.user }}/{{ image.upstream.repo }}/releases/latest', headers={'Accept': 'application/vnd.github+json', 'Authorization': 'Bearer ' + lookup('ansible.builtin.env', 'GITHUB_API_TOKEN') }) | from_json).get('tag_name') | replace('v', '') }}" upstream_file_name: "{{ image.upstream.file | replace('REPO', image.upstream.repo) | replace('VERSION', latest_version) | replace('OS', image.upstream.os) | replace('ARCH', upstream_default_arch) | replace('FORMAT', image.upstream.format) }}" -upstream_file_url: "https://github.com/{{ image.upstream.user }}/{{ image.upstream.repo }}/archive/refs/tags/v{{ upstream_file_name }}" +upstream_file_url: "https://github.com/{{ image.upstream.user }}/{{ image.upstream.repo }}/releases/download/v{{ latest_version }}/{{ upstream_file_name }}" From a3a51a227aa0ef659dee14309eb2f5b626f51196 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 
15:56:27 +0200 Subject: [PATCH 03/34] feat(ansible): add safe checks for optional constraints --- ansible/playbooks/saas/roles/adguard/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/arangodb/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/caddy/templates/nomad.hcl | 6 ++++-- ansible/playbooks/saas/roles/dolibarr/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/forgejo/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/freshrss/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/grafana/templates/nomad.hcl | 2 +- .../playbooks/saas/roles/homeassistant/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/kresus/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/loki/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/mariadb/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/mimir/templates/nomad.hcl | 6 ++++-- ansible/playbooks/saas/roles/minio/templates/nomad.hcl | 5 ++++- ansible/playbooks/saas/roles/mosquitto/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/nextcloud/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/nginx/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/open-webui/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/postgresql/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/registry/templates/nomad.hcl | 4 +++- ansible/playbooks/saas/roles/rocketchat/templates/nomad.hcl | 4 +++- ansible/playbooks/saas/roles/traefik/templates/nomad.hcl | 2 +- ansible/playbooks/saas/roles/wordpress/templates/nomad.hcl | 2 +- .../playbooks/saas/roles/zigbee2mqtt/templates/nomad.hcl | 2 +- 23 files changed, 36 insertions(+), 25 deletions(-) diff --git a/ansible/playbooks/saas/roles/adguard/templates/nomad.hcl b/ansible/playbooks/saas/roles/adguard/templates/nomad.hcl index c3a315a5..3bb996f3 100644 --- a/ansible/playbooks/saas/roles/adguard/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/adguard/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ 
fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/arangodb/templates/nomad.hcl b/ansible/playbooks/saas/roles/arangodb/templates/nomad.hcl index e6956e43..ccd7cace 100644 --- a/ansible/playbooks/saas/roles/arangodb/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/arangodb/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/caddy/templates/nomad.hcl b/ansible/playbooks/saas/roles/caddy/templates/nomad.hcl index edbe32a0..1067d1f5 100644 --- a/ansible/playbooks/saas/roles/caddy/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/caddy/templates/nomad.hcl @@ -3,17 +3,19 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" } {% endif %} +{% if software.constraints is defined and software.constraints.instance is defined %} constraint { attribute = "${meta.instance}" - set_contains = "{{ software.instance }}" + set_contains = "{{ software.constraints.instance }}" } +{% endif %} group "{{ domain }}" { count = {{ software.scale | default(1) }} diff --git a/ansible/playbooks/saas/roles/dolibarr/templates/nomad.hcl b/ansible/playbooks/saas/roles/dolibarr/templates/nomad.hcl index a0faff30..a3c77c12 100644 --- 
a/ansible/playbooks/saas/roles/dolibarr/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/dolibarr/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/forgejo/templates/nomad.hcl b/ansible/playbooks/saas/roles/forgejo/templates/nomad.hcl index 1608d244..0a93e435 100644 --- a/ansible/playbooks/saas/roles/forgejo/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/forgejo/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/freshrss/templates/nomad.hcl b/ansible/playbooks/saas/roles/freshrss/templates/nomad.hcl index 5b924ce5..466b31ea 100644 --- a/ansible/playbooks/saas/roles/freshrss/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/freshrss/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/grafana/templates/nomad.hcl b/ansible/playbooks/saas/roles/grafana/templates/nomad.hcl index a17e4205..d5b438bb 100644 --- a/ansible/playbooks/saas/roles/grafana/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/grafana/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ 
domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/homeassistant/templates/nomad.hcl b/ansible/playbooks/saas/roles/homeassistant/templates/nomad.hcl index ec5ba3a6..68e101f1 100644 --- a/ansible/playbooks/saas/roles/homeassistant/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/homeassistant/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/kresus/templates/nomad.hcl b/ansible/playbooks/saas/roles/kresus/templates/nomad.hcl index 75da59ff..1e8d7e7d 100644 --- a/ansible/playbooks/saas/roles/kresus/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/kresus/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/loki/templates/nomad.hcl b/ansible/playbooks/saas/roles/loki/templates/nomad.hcl index ed695b61..d9364645 100644 --- a/ansible/playbooks/saas/roles/loki/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/loki/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is 
defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/mariadb/templates/nomad.hcl b/ansible/playbooks/saas/roles/mariadb/templates/nomad.hcl index c21e91aa..ffd8ebe5 100644 --- a/ansible/playbooks/saas/roles/mariadb/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/mariadb/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/mimir/templates/nomad.hcl b/ansible/playbooks/saas/roles/mimir/templates/nomad.hcl index 66c84ec5..7395e598 100644 --- a/ansible/playbooks/saas/roles/mimir/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/mimir/templates/nomad.hcl @@ -3,17 +3,19 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" } {% endif %} +{% if software.constraints is defined and software.constraints.instance is defined %} constraint { attribute = "${meta.instance}" - set_contains = "{{ software.instance }}" + set_contains = "{{ software.constraints.instance }}" } +{% endif %} group "{{ domain }}-minio" { count = 1 diff --git a/ansible/playbooks/saas/roles/minio/templates/nomad.hcl b/ansible/playbooks/saas/roles/minio/templates/nomad.hcl index db983a14..79b3934f 100644 --- a/ansible/playbooks/saas/roles/minio/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/minio/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters 
= ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" @@ -21,6 +21,9 @@ job "{{ domain }}" { network { port "minio" { to = 9000 +{% if software.static_port is defined %} + static = {{ software.static_port }} +{% endif %} } } diff --git a/ansible/playbooks/saas/roles/mosquitto/templates/nomad.hcl b/ansible/playbooks/saas/roles/mosquitto/templates/nomad.hcl index 600ce577..44264b10 100644 --- a/ansible/playbooks/saas/roles/mosquitto/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/mosquitto/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/nextcloud/templates/nomad.hcl b/ansible/playbooks/saas/roles/nextcloud/templates/nomad.hcl index 5fd428f3..4e8bf378 100644 --- a/ansible/playbooks/saas/roles/nextcloud/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/nextcloud/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/nginx/templates/nomad.hcl b/ansible/playbooks/saas/roles/nginx/templates/nomad.hcl index 2e3166fb..340167f2 100644 --- a/ansible/playbooks/saas/roles/nginx/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/nginx/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ 
domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/open-webui/templates/nomad.hcl b/ansible/playbooks/saas/roles/open-webui/templates/nomad.hcl index bc1e1a92..39a276cf 100644 --- a/ansible/playbooks/saas/roles/open-webui/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/open-webui/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/postgresql/templates/nomad.hcl b/ansible/playbooks/saas/roles/postgresql/templates/nomad.hcl index b751328a..8bce6575 100644 --- a/ansible/playbooks/saas/roles/postgresql/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/postgresql/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/registry/templates/nomad.hcl b/ansible/playbooks/saas/roles/registry/templates/nomad.hcl index cd2ec90f..be570e06 100644 --- a/ansible/playbooks/saas/roles/registry/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/registry/templates/nomad.hcl @@ -7,7 +7,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if 
software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" @@ -59,6 +59,8 @@ job "{{ domain }}" { REGISTRY_AUTH = "htpasswd" REGISTRY_AUTH_HTPASSWD_REALM = "Registry Realm" REGISTRY_AUTH_HTPASSWD_PATH = "/data/htpasswd" + REGISTRY_LOG_LEVEL = "info" + OTEL_TRACES_EXPORTER = "none" #There is a warning about the *HTTP secret*. It's only important if you have multiples registries behind a load balancer. REGISTRY_HTTP_SECRET = "Aumuu5ie-ieX7uwee-Aixah4ee" } diff --git a/ansible/playbooks/saas/roles/rocketchat/templates/nomad.hcl b/ansible/playbooks/saas/roles/rocketchat/templates/nomad.hcl index 2cc28193..95c78511 100644 --- a/ansible/playbooks/saas/roles/rocketchat/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/rocketchat/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" @@ -114,6 +114,8 @@ EOH task "{{ domain }}-rocketchat" { driver = "docker" + user = "42420:42420" + env { ROOT_URL = "https://{{ domain }}" MONGO_URL = "mongodb://${NOMAD_HOST_ADDR_mongodb}/parties?replicaSet=rs0&directConnection=true" diff --git a/ansible/playbooks/saas/roles/traefik/templates/nomad.hcl b/ansible/playbooks/saas/roles/traefik/templates/nomad.hcl index b8065a6c..cf901f86 100644 --- a/ansible/playbooks/saas/roles/traefik/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/traefik/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" 
set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/wordpress/templates/nomad.hcl b/ansible/playbooks/saas/roles/wordpress/templates/nomad.hcl index ab54a4e8..2e615550 100644 --- a/ansible/playbooks/saas/roles/wordpress/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/wordpress/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/zigbee2mqtt/templates/nomad.hcl b/ansible/playbooks/saas/roles/zigbee2mqtt/templates/nomad.hcl index 255ef6d9..4cf709e6 100644 --- a/ansible/playbooks/saas/roles/zigbee2mqtt/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/zigbee2mqtt/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" From ad3389b25dae2d59e68ecf029dd2df3d8de9579d Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 15:57:40 +0200 Subject: [PATCH 04/34] refactor(caddy): use Nomad template for Caddy config --- .../playbooks/saas/roles/caddy/tasks/main.yml | 11 --------- .../saas/roles/caddy/templates/Dockerfile.j2 | 2 +- .../saas/roles/caddy/templates/nomad.hcl | 24 +++++++++++-------- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/ansible/playbooks/saas/roles/caddy/tasks/main.yml b/ansible/playbooks/saas/roles/caddy/tasks/main.yml index b45d0b3e..f61aa1fa 100644 --- a/ansible/playbooks/saas/roles/caddy/tasks/main.yml +++ 
b/ansible/playbooks/saas/roles/caddy/tasks/main.yml @@ -1,15 +1,4 @@ --- -- name: Create default directory - ansible.builtin.file: - path: "{{ item }}" - state: directory - owner: root - group: root - mode: "0755" - loop: - - "{{ software_path }}/etc/caddy" - delegate_to: "{{ software.instance }}" - - name: Copy nomad job to destination ansible.builtin.template: src: nomad.hcl diff --git a/ansible/playbooks/saas/roles/caddy/templates/Dockerfile.j2 b/ansible/playbooks/saas/roles/caddy/templates/Dockerfile.j2 index 387b89ba..a423587b 100644 --- a/ansible/playbooks/saas/roles/caddy/templates/Dockerfile.j2 +++ b/ansible/playbooks/saas/roles/caddy/templates/Dockerfile.j2 @@ -15,4 +15,4 @@ RUN mkdir -p /var/log/caddy /var/lib/caddy /etc/caddy \ USER caddy -CMD ["caddy", "run", "--config", "/etc/caddy/Caddyfile"] +CMD ["caddy", "run"] diff --git a/ansible/playbooks/saas/roles/caddy/templates/nomad.hcl b/ansible/playbooks/saas/roles/caddy/templates/nomad.hcl index 1067d1f5..cb828014 100644 --- a/ansible/playbooks/saas/roles/caddy/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/caddy/templates/nomad.hcl @@ -51,22 +51,26 @@ job "{{ domain }}" { task "{{ domain }}-caddy" { driver = "docker" - config { - image = "{{ docker_private_registry.url }}/caddy:{{ softwares.caddy.version }}" - volumes = [ - "{{ software_path }}/etc/caddy:/etc/caddy:ro" - ] - ports = ["caddy", "metrics"] - } - template { - change_mode = "noop" - destination = "{{ software_path }}/etc/caddy/Caddyfile" + change_mode = "restart" + destination = "local/Caddyfile" + perms = "644" data = < Date: Tue, 21 Oct 2025 15:58:35 +0200 Subject: [PATCH 05/34] feat(loki): use Nomad template for Loki config --- ansible/playbooks/saas/roles/loki/tasks/main.yml | 10 +--------- .../playbooks/saas/roles/loki/templates/nomad.hcl | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/ansible/playbooks/saas/roles/loki/tasks/main.yml b/ansible/playbooks/saas/roles/loki/tasks/main.yml index 
e10eec4a..10a2e33a 100644 --- a/ansible/playbooks/saas/roles/loki/tasks/main.yml +++ b/ansible/playbooks/saas/roles/loki/tasks/main.yml @@ -8,15 +8,7 @@ mode: '0755' loop: - "{{ software_path }}/var/lib/loki" - - "{{ software_path }}/etc/loki" - -- name: Copy config file - ansible.builtin.template: - src: config.yaml.j2 - dest: "{{ software_path }}/etc/loki/local-config.yaml" - owner: 10001 - group: 10001 - mode: '0644' + delegate_to: "{{ software.instance }}" - name: Copy nomad job ansible.builtin.template: diff --git a/ansible/playbooks/saas/roles/loki/templates/nomad.hcl b/ansible/playbooks/saas/roles/loki/templates/nomad.hcl index d9364645..514509d3 100644 --- a/ansible/playbooks/saas/roles/loki/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/loki/templates/nomad.hcl @@ -40,12 +40,23 @@ job "{{ domain }}" { config { image = "grafana/loki:{{ softwares.loki.version }}" volumes = [ - "{{ software_path }}/var/lib/loki:/var/lib/loki:rw", - "{{ software_path }}/etc/loki:/etc/loki:ro" + "{{ software_path }}/var/lib/loki:/var/lib/loki:rw" + ] + args = [ + "-config.file", + "/local/config.yaml" ] ports = ["loki"] } + template { + change_mode = "restart" + destination = "local/config.yaml" + data = < Date: Tue, 21 Oct 2025 15:59:39 +0200 Subject: [PATCH 06/34] feat(mimir): add Nomad templates and dynamic config for mimir --- .../playbooks/saas/roles/mimir/tasks/main.yml | 35 ----------- .../saas/roles/mimir/templates/mimir.yaml.j2 | 56 +++++++++++------ .../saas/roles/mimir/templates/nomad.hcl | 61 +++++++++++++++++-- .../{files => templates}/prometheus.yaml | 2 +- 4 files changed, 97 insertions(+), 57 deletions(-) rename ansible/playbooks/saas/roles/mimir/{files => templates}/prometheus.yaml (93%) diff --git a/ansible/playbooks/saas/roles/mimir/tasks/main.yml b/ansible/playbooks/saas/roles/mimir/tasks/main.yml index 342e53e7..f4de5a9f 100644 --- a/ansible/playbooks/saas/roles/mimir/tasks/main.yml +++ b/ansible/playbooks/saas/roles/mimir/tasks/main.yml @@ -1,39 
+1,4 @@ --- -- name: Create data directories - ansible.builtin.file: - path: "{{ item.path }}" - state: directory - owner: "{{ item.owner | default('root') }}" - group: "{{ item.group | default('root') }}" - mode: '0755' - loop: - - path: /data/{{ domain }}/config - - path: /data/{{ domain }}/prometheus - owner: nobody - group: nogroup - -- name: Copy services configurations files - ansible.builtin.copy: - src: "{{ item }}" - dest: "/data/{{ domain }}/config/{{ item }}" - owner: root - group: root - mode: '0644' - loop: - - prometheus.yaml - - rules.yaml - -- name: Template services configurations files - ansible.builtin.template: - src: "{{ item }}.j2" - dest: "/data/{{ domain }}/config/{{ item }}" - owner: root - group: root - mode: '0644' - loop: - - mimir.yaml - - alertmanager-fallback-config.yaml - - name: Copy nomad job ansible.builtin.template: src: nomad.hcl diff --git a/ansible/playbooks/saas/roles/mimir/templates/mimir.yaml.j2 b/ansible/playbooks/saas/roles/mimir/templates/mimir.yaml.j2 index 8c13f86d..2083a297 100644 --- a/ansible/playbooks/saas/roles/mimir/templates/mimir.yaml.j2 +++ b/ansible/playbooks/saas/roles/mimir/templates/mimir.yaml.j2 @@ -4,13 +4,20 @@ target: all,alertmanager,overrides-exporter limits: compactor_blocks_retention_period: 90d +common: + storage: + backend: s3 + s3: + endpoint: minio-{{ service_name }}.default.service.nomad:9000 + access_key_id: "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='access_key_id', missing='error') }}" + secret_access_key: "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='secret_access_key', missing='error') }}" + insecure: true + +memberlist: + advertise_addr: {% raw %}{{env "attr.unique.network.ip-address"}}{% endraw %} + blocks_storage: - backend: s3 s3: - endpoint: 172.17.0.1:9000 - access_key_id: "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='access_key_id', missing='create', length=12) }}" - secret_access_key: "{{ lookup('simple-stack-ui', 
type='secret', key=domain, subkey='secret_access_key', missing='create', length=12) }}" - insecure: true bucket_name: mimir-blocks ruler: @@ -18,31 +25,46 @@ ruler: ring: heartbeat_period: 2s heartbeat_timeout: 10s + instance_addr: {% raw %}{{env "attr.unique.network.ip-address"}}{% endraw %} + ruler_storage: - backend: s3 s3: - endpoint: 172.17.0.1:9000 - access_key_id: "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='access_key_id', missing='error') }}" - secret_access_key: "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='secret_access_key', missing='error') }}" - insecure: true bucket_name: ruler alertmanager_storage: - backend: s3 s3: - endpoint: 172.17.0.1:9000 - access_key_id: "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='access_key_id', missing='error') }}" - secret_access_key: "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='secret_access_key', missing='error') }}" - insecure: true bucket_name: mimir alertmanager: - fallback_config_file: /config/alertmanager-fallback-config.yaml + fallback_config_file: /local/alertmanager-fallback-config.yaml external_url: http://127.0.0.1:8080/alertmanager + sharding_ring: + instance_addr: {% raw %}{{env "attr.unique.network.ip-address"}}{% endraw %} + + +compactor: + sharding_ring: + instance_addr: {% raw %}{{env "attr.unique.network.ip-address"}}{% endraw %} + + +distributor: + ring: + instance_addr: {% raw %}{{env "attr.unique.network.ip-address"}}{% endraw %} + server: - log_level: info + log_level: warn + +store_gateway: + sharding_ring: + instance_addr: {% raw %}{{env "attr.unique.network.ip-address"}}{% endraw %} + + +ingester: + ring: + instance_addr: {% raw %}{{env "attr.unique.network.ip-address"}}{% endraw %} + usage_stats: enabled: false diff --git a/ansible/playbooks/saas/roles/mimir/templates/nomad.hcl b/ansible/playbooks/saas/roles/mimir/templates/nomad.hcl index 7395e598..cfd71437 100644 --- 
a/ansible/playbooks/saas/roles/mimir/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/mimir/templates/nomad.hcl @@ -20,6 +20,13 @@ job "{{ domain }}" { group "{{ domain }}-minio" { count = 1 +{% if software.constraints is defined and software.constraints.minio_instance is defined %} + constraint { + attribute = "${meta.instance}" + set_contains = "{{ software.constraints.minio_instance }}" + } +{% endif %} + network { port "minio" { to = 9000 @@ -38,8 +45,8 @@ job "{{ domain }}" { driver = "docker" env { - MINIO_ROOT_USER = "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='access_key_id', missing='error') }}" - MINIO_ROOT_PASSWORD = "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='secret_access_key', missing='error') }}" + MINIO_ROOT_USER = "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='access_key_id', missing='create', length=12) }}" + MINIO_ROOT_PASSWORD = "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='secret_access_key', missing='create', length=12) }}" } config { @@ -63,15 +70,25 @@ job "{{ domain }}" { group "{{ domain }}-mimir" { count = 3 +{% if software.constraints is defined and software.constraints.distinct_hosts is defined %} + constraint { + operator = "distinct_hosts" + value = "{{ software.constraints.distinct_hosts | lower }}" + } +{% endif %} + network { port "mimir_7946" { to = 7946 + static = 7946 } port "mimir_9095" { to = 9095 + static = 9095 } port "mimir_8080" { to = 8080 + static = 8080 } } @@ -100,19 +117,55 @@ job "{{ domain }}" { driver = "docker" + env { + IP = "${attr.unique.network.ip-address}" + } + config { image = "grafana/mimir:{{ softwares.mimir.version }}" + network_mode = "host" volumes = [ - "{{ software_path }}/config:/config", "{{ software_path }}/mimir-${NOMAD_ALLOC_INDEX}:/data", ] args = [ - "-config.file=/config/mimir.yaml", + "-config.file=/local/mimir.yaml", "-memberlist.join=dnssrv+mimir.default.service.nomad" ] ports = ["mimir_7946", "mimir_9095", 
"mimir_8080"] } + template { + change_mode = "restart" + destination = "local/mimir.yaml" + data = < Date: Tue, 21 Oct 2025 16:01:32 +0200 Subject: [PATCH 07/34] refactor(minio): use project registry variable in Minio template --- ansible/playbooks/saas/roles/minio/tasks/main.yml | 11 ----------- .../playbooks/saas/roles/minio/templates/nomad.hcl | 2 +- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/ansible/playbooks/saas/roles/minio/tasks/main.yml b/ansible/playbooks/saas/roles/minio/tasks/main.yml index 80f982f7..f61aa1fa 100644 --- a/ansible/playbooks/saas/roles/minio/tasks/main.yml +++ b/ansible/playbooks/saas/roles/minio/tasks/main.yml @@ -1,15 +1,4 @@ --- -- name: Create default directory - ansible.builtin.file: - path: "{{ item.path }}" - state: directory - owner: "{{ item.owner | default('root') }}" - group: "{{ item.group | default('root') }}" - mode: '0755' - loop: - - path: "{{ software_path }}/data" - - path: "{{ software_path }}/var/backup" - - name: Copy nomad job to destination ansible.builtin.template: src: nomad.hcl diff --git a/ansible/playbooks/saas/roles/minio/templates/nomad.hcl b/ansible/playbooks/saas/roles/minio/templates/nomad.hcl index 79b3934f..e9a92136 100644 --- a/ansible/playbooks/saas/roles/minio/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/minio/templates/nomad.hcl @@ -56,7 +56,7 @@ job "{{ domain }}" { } config { - image = "{{ docker_private_registry.url }}/minio:{{ softwares.minio.version }}" + image = "{{ docker_private_registry.url_project | default(docker_private_registry.url) }}/minio:{{ softwares.minio.version }}" volumes = [ "{{ software_path }}/data:/data:rw", "{{ software_path }}/var/backup:/var/backup:rw" From e5cf489c2a14520c73b08a502af6f10803dea5d8 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 16:01:57 +0200 Subject: [PATCH 08/34] fix(nomad): handle missing constraints in Nomad templates --- ansible/playbooks/saas/roles/nomad/templates/backup.hcl.j2 | 2 +- 
ansible/playbooks/saas/roles/nomad/templates/restore.hcl.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/playbooks/saas/roles/nomad/templates/backup.hcl.j2 b/ansible/playbooks/saas/roles/nomad/templates/backup.hcl.j2 index d348356c..1c0e4b1a 100644 --- a/ansible/playbooks/saas/roles/nomad/templates/backup.hcl.j2 +++ b/ansible/playbooks/saas/roles/nomad/templates/backup.hcl.j2 @@ -3,7 +3,7 @@ job "{{ nomad_job_name }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "batch" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" diff --git a/ansible/playbooks/saas/roles/nomad/templates/restore.hcl.j2 b/ansible/playbooks/saas/roles/nomad/templates/restore.hcl.j2 index fb7cae8b..00419a79 100644 --- a/ansible/playbooks/saas/roles/nomad/templates/restore.hcl.j2 +++ b/ansible/playbooks/saas/roles/nomad/templates/restore.hcl.j2 @@ -3,7 +3,7 @@ job "{{ nomad_job_name }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "batch" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" From dda6fc1631697cb7b8703e173fb8f59d005c9cca Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 16:02:59 +0200 Subject: [PATCH 09/34] feat(valkey): add templated config and update Nomad job --- .../saas/roles/valkey/tasks/main.yml | 18 ------------------ .../saas/roles/valkey/templates/nomad.hcl | 19 +++++++++++++++---- .../valkey/{files => templates}/valkey.conf | 5 +++-- 3 files changed, 18 insertions(+), 24 deletions(-) rename ansible/playbooks/saas/roles/valkey/{files => templates}/valkey.conf (99%) diff --git a/ansible/playbooks/saas/roles/valkey/tasks/main.yml 
b/ansible/playbooks/saas/roles/valkey/tasks/main.yml index a3262df1..6bb83999 100644 --- a/ansible/playbooks/saas/roles/valkey/tasks/main.yml +++ b/ansible/playbooks/saas/roles/valkey/tasks/main.yml @@ -1,22 +1,4 @@ --- -- name: Create default directories - ansible.builtin.file: - path: "{{ item }}" - state: directory - owner: root - group: root - mode: '0755' - loop: - - "{{ software_path }}/data" - - "{{ software_path }}/etc/valkey" - - "{{ software_path }}/run/valkey" - -- name: Copy config file - ansible.builtin.copy: - src: valkey.conf - dest: "{{ software_path }}/etc/valkey/valkey.conf" - mode: '0644' - - name: Copy nomad job to destination ansible.builtin.template: src: nomad.hcl diff --git a/ansible/playbooks/saas/roles/valkey/templates/nomad.hcl b/ansible/playbooks/saas/roles/valkey/templates/nomad.hcl index 55dc611d..026e2b06 100644 --- a/ansible/playbooks/saas/roles/valkey/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/valkey/templates/nomad.hcl @@ -3,7 +3,7 @@ job "{{ domain }}" { datacenters = ["{{ fact_instance.datacenter }}"] type = "service" -{% if software.constraints.location %} +{% if software.constraints is defined and software.constraints.location is defined %} constraint { attribute = "${meta.location}" set_contains = "{{ software.constraints.location }}" @@ -21,6 +21,9 @@ job "{{ domain }}" { network { port "valkey" { to = 6379 +{% if software.static_port is defined %} + static = {{ software.static_port }} +{% endif %} } } @@ -35,7 +38,7 @@ job "{{ domain }}" { driver = "docker" config { - image = "{{ software }}/{{ software }}:{{ softwares.valkey.version }}" + image = "valkey/valkey:{{ softwares.valkey.version }}" volumes = [ "{{ software_path }}/data:/data:rw", "{{ software_path }}/etc/valkey:/etc/valkey:ro", @@ -44,8 +47,16 @@ job "{{ domain }}" { ports = ["valkey"] command = "/usr/local/bin/valkey-server" args = [ - "/etc/valkey/valkey.conf", -] + "/local/valkey.conf", + ] + } + + template { + change_mode = "restart" + destination 
= "local/valkey.conf" + data = < Date: Tue, 21 Oct 2025 16:49:12 +0200 Subject: [PATCH 10/34] feat(traefik): introduce metrics entrypoint on 8081 for Prometheus Add a dedicated entrypoint for Traefik's metrics at port 8081 and update the Prometheus scrape configuration to rewrite the target address. This separates metric traffic from the main HTTPS entrypoint. --- ansible/playbooks/paas/roles/prometheus/templates/config.j2 | 6 ++++++ ansible/playbooks/saas/roles/traefik/templates/traefik.toml | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ansible/playbooks/paas/roles/prometheus/templates/config.j2 b/ansible/playbooks/paas/roles/prometheus/templates/config.j2 index 360d93c9..6359f4ed 100644 --- a/ansible/playbooks/paas/roles/prometheus/templates/config.j2 +++ b/ansible/playbooks/paas/roles/prometheus/templates/config.j2 @@ -248,6 +248,12 @@ scrape_configs: - source_labels: [__tmp_fqdn] target_label: fqdn + # Traefik exposes its metrics on a different port + - source_labels: [__address__] + regex: '(.+):\d+' + target_label: __address__ + replacement: '${1}:8081' + - job_name: 'minio' metrics_path: /minio/v2/metrics/cluster scheme: http diff --git a/ansible/playbooks/saas/roles/traefik/templates/traefik.toml b/ansible/playbooks/saas/roles/traefik/templates/traefik.toml index 031c8bd1..efbc64b9 100644 --- a/ansible/playbooks/saas/roles/traefik/templates/traefik.toml +++ b/ansible/playbooks/saas/roles/traefik/templates/traefik.toml @@ -14,6 +14,8 @@ [entryPoints.http.http.redirections.entrypoint] to = "https" scheme = "https" + [entryPoints.metrics] + address = ":8081" [entryPoints.https] address = ":443" @@ -25,7 +27,7 @@ buckets = [0.1,0.3,1.2,5.0] addEntryPointsLabels = true addServicesLabels = true - entryPoint = "https" + entryPoint = "metrics" [log] level = "WARN" From e5e3ca432d5a627a28d74d1ee44e2508cac2cef6 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 16:52:26 +0200 Subject: [PATCH 11/34] feat(vector): add vector 
role with nomad job and config --- ansible/playbooks/saas/roles/vector/README.md | 1 + .../saas/roles/vector/defaults/main.yml | 1 + .../saas/roles/vector/tasks/backup.yml | 1 + .../saas/roles/vector/tasks/build.yml | 14 ++++ .../saas/roles/vector/tasks/destroy.yml | 11 +++ .../saas/roles/vector/tasks/main.yml | 14 ++++ .../saas/roles/vector/tasks/restore.yml | 1 + .../saas/roles/vector/templates/nomad.hcl | 75 +++++++++++++++++++ .../roles/vector/templates/vector.yaml.j2 | 25 +++++++ .../playbooks/saas/roles/vector/vars/main.yml | 13 ++++ .../saas/roles/vector/vars/upstream.yml | 3 + 11 files changed, 159 insertions(+) create mode 100644 ansible/playbooks/saas/roles/vector/README.md create mode 100644 ansible/playbooks/saas/roles/vector/defaults/main.yml create mode 100644 ansible/playbooks/saas/roles/vector/tasks/backup.yml create mode 100644 ansible/playbooks/saas/roles/vector/tasks/build.yml create mode 100644 ansible/playbooks/saas/roles/vector/tasks/destroy.yml create mode 100644 ansible/playbooks/saas/roles/vector/tasks/main.yml create mode 100644 ansible/playbooks/saas/roles/vector/tasks/restore.yml create mode 100644 ansible/playbooks/saas/roles/vector/templates/nomad.hcl create mode 100644 ansible/playbooks/saas/roles/vector/templates/vector.yaml.j2 create mode 100644 ansible/playbooks/saas/roles/vector/vars/main.yml create mode 100644 ansible/playbooks/saas/roles/vector/vars/upstream.yml diff --git a/ansible/playbooks/saas/roles/vector/README.md b/ansible/playbooks/saas/roles/vector/README.md new file mode 100644 index 00000000..86fc9973 --- /dev/null +++ b/ansible/playbooks/saas/roles/vector/README.md @@ -0,0 +1 @@ +# Role: `vector` diff --git a/ansible/playbooks/saas/roles/vector/defaults/main.yml b/ansible/playbooks/saas/roles/vector/defaults/main.yml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/ansible/playbooks/saas/roles/vector/defaults/main.yml @@ -0,0 +1 @@ +--- diff --git 
a/ansible/playbooks/saas/roles/vector/tasks/backup.yml b/ansible/playbooks/saas/roles/vector/tasks/backup.yml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/ansible/playbooks/saas/roles/vector/tasks/backup.yml @@ -0,0 +1 @@ +--- diff --git a/ansible/playbooks/saas/roles/vector/tasks/build.yml b/ansible/playbooks/saas/roles/vector/tasks/build.yml new file mode 100644 index 00000000..7421b506 --- /dev/null +++ b/ansible/playbooks/saas/roles/vector/tasks/build.yml @@ -0,0 +1,14 @@ +--- +- name: Include upstream variables + ansible.builtin.include_vars: upstream.yml + +- name: Set custom variables + ansible.builtin.set_fact: + image_version: "{{ latest_version }}" + image_name: "{{ image.name }}" + image_labels: "{{ image.labels }}" + image_build: "{{ image.build }}" + +- name: End playbook if no new version + ansible.builtin.meta: end_host + when: softwares[image.name] is defined and softwares[image.name] == image_version diff --git a/ansible/playbooks/saas/roles/vector/tasks/destroy.yml b/ansible/playbooks/saas/roles/vector/tasks/destroy.yml new file mode 100644 index 00000000..e00b7de1 --- /dev/null +++ b/ansible/playbooks/saas/roles/vector/tasks/destroy.yml @@ -0,0 +1,11 @@ +--- +- name: Stop nomad job + ansible.builtin.include_role: + name: nomad + tasks_from: job_stop.yml + +- name: Remove software directory + ansible.builtin.file: + path: "{{ software_path }}" + state: absent + delegate_to: "{{ software.instance }}" \ No newline at end of file diff --git a/ansible/playbooks/saas/roles/vector/tasks/main.yml b/ansible/playbooks/saas/roles/vector/tasks/main.yml new file mode 100644 index 00000000..148c7a41 --- /dev/null +++ b/ansible/playbooks/saas/roles/vector/tasks/main.yml @@ -0,0 +1,14 @@ +--- +- name: Copy nomad job + ansible.builtin.template: + src: nomad.hcl + dest: "/var/tmp/{{ domain }}.nomad" + owner: root + group: root + mode: '0600' + become: true + +- name: Run nomad job + ansible.builtin.include_role: + name: nomad + tasks_from: 
job_run.yml diff --git a/ansible/playbooks/saas/roles/vector/tasks/restore.yml b/ansible/playbooks/saas/roles/vector/tasks/restore.yml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/ansible/playbooks/saas/roles/vector/tasks/restore.yml @@ -0,0 +1 @@ +--- diff --git a/ansible/playbooks/saas/roles/vector/templates/nomad.hcl b/ansible/playbooks/saas/roles/vector/templates/nomad.hcl new file mode 100644 index 00000000..ed39381a --- /dev/null +++ b/ansible/playbooks/saas/roles/vector/templates/nomad.hcl @@ -0,0 +1,75 @@ +job "{{ domain }}" { + region = "{{ fact_instance.region }}" + datacenters = ["{{ fact_instance.datacenter }}"] + type = "service" + +{% if software.constraints is defined and software.constraints.location is defined %} + constraint { + attribute = "${meta.location}" + set_contains = "{{ software.constraints.location }}" + } +{% endif %} + + constraint { + attribute = "${meta.instance}" + set_contains = "{{ software.instance }}" + } + + group "{{ domain }}" { + count = 1 + + network { + port "vector" { + to = 8686 +{% if software.static_port is defined %} + static = {{ software.static_port }} +{% endif %} + } + port "vector_http" { + to = 8687 + static = 8687 + } + } + + // service name + service { + name = "{{ service_name }}" + port = "vector" + provider = "nomad" + tags = [ + {{ lookup('template', '../../traefik/templates/traefik_tag.j2') | indent(8) }} + ] + } + + task "{{ domain }}-vector" { + driver = "docker" + + template { + change_mode = "restart" + destination = "local/vector.yaml" + perms = "644" + data = < Date: Tue, 21 Oct 2025 17:10:07 +0200 Subject: [PATCH 12/34] feat(vllm): add vllm role with Nomad job and Docker support --- ansible/playbooks/saas/roles/vllm/README.md | 1 + .../saas/roles/vllm/defaults/main.yml | 1 + .../saas/roles/vllm/tasks/backup.yml | 1 + .../playbooks/saas/roles/vllm/tasks/build.yml | 14 +++ .../saas/roles/vllm/tasks/destroy.yml | 10 ++ .../playbooks/saas/roles/vllm/tasks/main.yml | 60 +++++++++ 
.../saas/roles/vllm/tasks/restore.yml | 1 + .../roles/vllm/templates/entrypoint-llm.sh | 10 ++ .../saas/roles/vllm/templates/nomad-llm.hcl | 67 ++++++++++ .../saas/roles/vllm/templates/nomad.hcl | 115 ++++++++++++++++++ .../playbooks/saas/roles/vllm/vars/main.yml | 13 ++ .../saas/roles/vllm/vars/upstream.yml | 4 + ui/public/img/vllm.png | Bin 0 -> 34523 bytes 13 files changed, 297 insertions(+) create mode 100644 ansible/playbooks/saas/roles/vllm/README.md create mode 100644 ansible/playbooks/saas/roles/vllm/defaults/main.yml create mode 100644 ansible/playbooks/saas/roles/vllm/tasks/backup.yml create mode 100644 ansible/playbooks/saas/roles/vllm/tasks/build.yml create mode 100644 ansible/playbooks/saas/roles/vllm/tasks/destroy.yml create mode 100644 ansible/playbooks/saas/roles/vllm/tasks/main.yml create mode 100644 ansible/playbooks/saas/roles/vllm/tasks/restore.yml create mode 100644 ansible/playbooks/saas/roles/vllm/templates/entrypoint-llm.sh create mode 100644 ansible/playbooks/saas/roles/vllm/templates/nomad-llm.hcl create mode 100644 ansible/playbooks/saas/roles/vllm/templates/nomad.hcl create mode 100644 ansible/playbooks/saas/roles/vllm/vars/main.yml create mode 100644 ansible/playbooks/saas/roles/vllm/vars/upstream.yml create mode 100644 ui/public/img/vllm.png diff --git a/ansible/playbooks/saas/roles/vllm/README.md b/ansible/playbooks/saas/roles/vllm/README.md new file mode 100644 index 00000000..371c3efa --- /dev/null +++ b/ansible/playbooks/saas/roles/vllm/README.md @@ -0,0 +1 @@ +# Role: `vllm` diff --git a/ansible/playbooks/saas/roles/vllm/defaults/main.yml b/ansible/playbooks/saas/roles/vllm/defaults/main.yml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/ansible/playbooks/saas/roles/vllm/defaults/main.yml @@ -0,0 +1 @@ +--- diff --git a/ansible/playbooks/saas/roles/vllm/tasks/backup.yml b/ansible/playbooks/saas/roles/vllm/tasks/backup.yml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ 
b/ansible/playbooks/saas/roles/vllm/tasks/backup.yml @@ -0,0 +1 @@ +--- diff --git a/ansible/playbooks/saas/roles/vllm/tasks/build.yml b/ansible/playbooks/saas/roles/vllm/tasks/build.yml new file mode 100644 index 00000000..7421b506 --- /dev/null +++ b/ansible/playbooks/saas/roles/vllm/tasks/build.yml @@ -0,0 +1,14 @@ +--- +- name: Include upstream variables + ansible.builtin.include_vars: upstream.yml + +- name: Set custom variables + ansible.builtin.set_fact: + image_version: "{{ latest_version }}" + image_name: "{{ image.name }}" + image_labels: "{{ image.labels }}" + image_build: "{{ image.build }}" + +- name: End playbook if no new version + ansible.builtin.meta: end_host + when: softwares[image.name] is defined and softwares[image.name] == image_version diff --git a/ansible/playbooks/saas/roles/vllm/tasks/destroy.yml b/ansible/playbooks/saas/roles/vllm/tasks/destroy.yml new file mode 100644 index 00000000..ce77a12b --- /dev/null +++ b/ansible/playbooks/saas/roles/vllm/tasks/destroy.yml @@ -0,0 +1,10 @@ +--- +- name: Stop nomad job + ansible.builtin.include_role: + name: nomad + tasks_from: job_stop.yml + +- name: Remove software directory + ansible.builtin.file: + path: "{{ software_path }}" + state: absent diff --git a/ansible/playbooks/saas/roles/vllm/tasks/main.yml b/ansible/playbooks/saas/roles/vllm/tasks/main.yml new file mode 100644 index 00000000..87410702 --- /dev/null +++ b/ansible/playbooks/saas/roles/vllm/tasks/main.yml @@ -0,0 +1,60 @@ +--- +- name: Copy llm nomad job to destination + ansible.builtin.template: + src: nomad-llm.hcl + dest: "/var/tmp/{{ domain }}.nomad" + owner: root + group: root + mode: '0600' + become: true + +- name: Run nomad llm job + ansible.builtin.include_role: + name: nomad + tasks_from: job_run.yml + +- name: Copy nomad job to destination + ansible.builtin.template: + src: nomad.hcl + dest: "/var/tmp/{{ domain }}.nomad" + owner: root + group: root + mode: '0600' + become: true + +- name: Run nomad job + 
ansible.builtin.include_role: + name: nomad + tasks_from: job_run.yml + + +- name: Check for endpoint to become available + ansible.builtin.uri: + url: "https://{{ software.domain }}/v1/models" + delegate_to: localhost + register: _result + until: _result.status == 200 + +- name: Warm up model with a simple query + ansible.builtin.uri: + url: https://{{ software.domain }}/v1/chat/completions + method: POST + body_format: json + body: + model: "{{ software.model }}" + messages: + - role: user + content: hello + max_tokens: 50 + headers: + Content-Type: "application/json" + return_content: true + register: _result + until: _result.status == 200 + retries: 10 + delay: 10 + delegate_to: localhost + +- name: Afficher la réponse du modèle + ansible.builtin.debug: + msg: "{{ _result.json }}" diff --git a/ansible/playbooks/saas/roles/vllm/tasks/restore.yml b/ansible/playbooks/saas/roles/vllm/tasks/restore.yml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/ansible/playbooks/saas/roles/vllm/tasks/restore.yml @@ -0,0 +1 @@ +--- diff --git a/ansible/playbooks/saas/roles/vllm/templates/entrypoint-llm.sh b/ansible/playbooks/saas/roles/vllm/templates/entrypoint-llm.sh new file mode 100644 index 00000000..a4d522d8 --- /dev/null +++ b/ansible/playbooks/saas/roles/vllm/templates/entrypoint-llm.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +apk update +apk add --no-cache python3 py3-pip +pip install --break-system-packages -U "huggingface_hub[cli]" hf-transfer + +mkdir -p /models/${MODEL} + +echo "Download model ..." 
+hf download meta-llama/${MODEL} --local-dir /models/${MODEL} \ No newline at end of file diff --git a/ansible/playbooks/saas/roles/vllm/templates/nomad-llm.hcl b/ansible/playbooks/saas/roles/vllm/templates/nomad-llm.hcl new file mode 100644 index 00000000..a69885ae --- /dev/null +++ b/ansible/playbooks/saas/roles/vllm/templates/nomad-llm.hcl @@ -0,0 +1,67 @@ +job "{{ domain }}-llm" { + region = "{{ fact_instance.region }}" + datacenters = ["{{ fact_instance.datacenter }}"] + type = "batch" + +{% if software.constraints is defined and software.constraints.location is defined %} + constraint { + attribute = "${meta.location}" + set_contains = "{{ software.constraints.location }}" + } +{% endif %} + +{% if software.constraints is defined and software.constraints.instance is defined %} + constraint { + attribute = "${meta.instance}" + set_contains = "{{ software.constraints.instance }}" + } +{% endif %} + + group "{{ domain }}-llm" { + count = 1 + + volume "models" { + type = "csi" + source = "juicefs-llm" + attachment_mode = "file-system" + access_mode = "multi-node-multi-writer" + } + + task "{{ domain }}-vllm" { + driver = "docker" + + template { + change_mode = "restart" + destination = "local/entrypoint.sh" + perms = "755" + data = <om|zOP0TDr&7Djw)TG5&d3kxjz-W@qjE&V~7-&XJ(2b2pCh2M6UA@$zqvOdxl9zNT+*JvkVM2aZxw>!5~@ivHZUe^}rT0pM*~Ir41vrsj0=LhTM>lPAM_b z(3uz;Pk=S2kITc0oD!4jQ_vDj7=oGNfD^%*JApUNlbvE1bE4i6HRdFdT9H0OBu!>Q zTC#_nZp%kCLW$T{c9N1FW~CIQRfAo1Mi!NoMNq{AlETWV(aVXnxCM$@s9O3;#e@P` zO4YCgu2XK^G2;;96!36Gwvbg{c~J@T<4mnkO<{vkbAi1*o84(4A$urd6Q!9UCLzcW z5N{gcSvh17VZlNW{aw7HWs(*uX1LjLc(|~a;J7Pt8Xq}vJwcqSw9t{YR8$0`1EpcX zpuo|=AV4W_&<_k89}McRG#Hp1IKjWtYTz_~!+oypt5`7bm9Z$41c!NT2y)Z4+{ z(T&erko+$QK2Z8So0**SFA#S-L2?~M6;g30R|`@uCKe_Zav^w9Qc?j|b4xxo38}x) zL2rWO*6!}ke9X*VUS3RI>`YFsR?MI|#>>pY#>~dX2!ddA^Ko=H@n&>%1OB6uf7T;m z;b!J)J;y?Egj)cXDucas)BLP29=R(#Fce^&cGhHymgz|C*-i7B2rWwttWC z`#iI^d8g{{)c?TyCn^X6#H#-Tg8l!50P*2}tK%QJ|Fe7l;`0ARoxoqCA!TpkW(^wP 
ze=!fVcl=$&e;56mz<;3rH7#`iJuSF+|HrKOkCMMJ|Iyw5Sgilp3H~Atv=s=!3o!p{ zTM>dc*=r&O0}}z0l@QhN20!hDOEZ_uz7KG6O#k9>D<|$H-ayI`+t!`%4BZB-XJ=`y zDkNtrakE^%f3?hW%DDc-p6`Mwsfvn;s=9^>qYAm8dNTqR8vOAFaHIE}quKuJ{C>cV zo$fmwJOAfz&2;Sa#h;x9;|VSYFy7kk|D0pg9Pl2XI>Yin5(1FHqk_eI$Tt6f_o_k< zn6*x$NS2YnIJT)1v-hVTj^t6kg?;F90@9~6=_rW9--?CL32!7`%-2W;DpR)kHuY#7 z(Ml=2`ZCm)&`GApafu^W;H$j|V4tX{$=JcrVZz4}A2fAf|KuBGt~$Qi`Eb>YU|xEWYN>vQ;@s(Uf^5X^a@Fl1(;O zhGOoB}NCF9mdgICYax0a!Ou>NY%`dr@_1`ob6F_@9IQ1=Znv6BM`H+z#Zwm=;# zq7-XVKsNb_5ArxVIDmFSo_KBkuDrl#^t#w^2Z0tYCt?A&EvvmOs7u*Q%6s&ozk$Yl zTLU(K2&Tw|1Cq(n<|f-{%{~Gax0$%EQnf5;CY^#J=o}iI<44?o3n+i-MNi71 znZqn1rk6ZXQeJfLS2lLxk-b@1idbE{OKw85k52E#6lh&oyW>C)PKOHL-uWthYaNZm zkYN0}B+g+hO(mU=Eck_U(B?G{xC!cUZHmuT`=j@`|5a*1$HR7PR8&;^(XCmMe(Mia zG2*zCJq@^DzkYfB@>p&^$mi$ewAf&v-1*+)3QM!8F`R9c|2W7T=wfdl2j7QQ0Mx#m z5W?H1U0?dbWdj6QI+1P?b#I&~X*VJ{61tyM>ON zICqWsa#rOlM+{e*p6)w)9Q=#9`{j!Ncbm8!{HR0^C$q2H5~1;zP3|@4{_FgEf!`*p z$1bZh@jB!`N_@S4{z%%c(ev_+aqb+Tt?%p%xc~iYWSJW)Guq%>8b~-(gx^*o9{e*Q z(Kp}6v5V7g$^J)UQAI_RU*}rt^hr5|^~jc*m6esYcBa`z3aTbaTz67`+S{ zb?gwGt9b6B6@G|cmgfy+h?GyIcmJHc>`Y8Qf#^^pULJUphEXrOzjp}hZD?#yM7CBa zaJJ@iycJCXy*Ml~Ud7!+d>g^`{beAEn2(R|Mzd@hA|%Jw(;tToA_PwdoRo~P3Ss@B zu-YjKj~PFlrjM)r);VU3yuA+atoN0S-QBqY8=Q15lZ{NIx)ox9(RO&Xx3xj|a-YXhHH)I|e1Wk(l4R7lXQ^rRfO|gj*r@B9X)%yB3 z?XM8-^A(RqKhIhT$6Vn55`b!!9ZUF$`bGEhOh|`NN&|zEK;KeKr$YYjht1hm=Z0S#Q ztv*T4(~}J*nkc4NnJ>6k_$R2mpafooxqpOcI97&rBpwHB<|L0ToL4+f7MHdr1k3kr z1Bl)au%2QvW7>?N3Y36D@k4OR!-Lb}qZSpsYqH|O_0KQ|7IopyrN@>C5>GgZ<_uh(GCdx;ZfFP|)actgL)Gbny#VWz zngFT*ad%KySf2GBdczGI@GjL~G;6e8Z;jq=A0BuHhtb3!6CU(8#Rt$X&;z?rv~REj zr$Be9j^EX0E_N1oCxPI$km#nnvny5w0ex`P#%O(AI)R@#bq&n-S-tkO8G#XdHyzhg zKU-`thzr5MrASkVV0GB?hu3fn(}0^1!yU73yZb|oJ>zAwA=VVH1|A%04&mPzM9M@2 z+V;jXgC6=x_N&4@M*P69K1j4U;2ph2gl&MfnavPiS|JtdmGn4$Fc|5cSnmo>-z7%p zAa1?^rEW_pV{l^BENHRf=TuIb1mQp4mJq458VeJRM222tBc#qBAMd_)zwV|gFU6ey z=o0CTW3a|WBTTPGVXhA`;{iBsj=5CxLLy5VgRehCSsqSd_A9WPc*lRS<{U+j#eq6$1o_R*fkS{PV;IA&T$bN)r<_zlr}WHx&^*pDcY 
zIJX{?v7DuWIN*A_DD>ussfGQ^_F+kpG$q6$UeKqsiX*Jitgs(lI0nw(HT}o-gh;!U zW2i++@C+R~U+j3h{@gZLSy^#>;kJsplSeMkHqk_0>J!y~8kBUGZFQ*%p>Mjnj@N4iVk@NTFC+9jrv z5p5T?jk3B)C}QS7SD5Q0H7xag(rJi^6bK~<#5FPy#Ilmg6PVqihqT(1LJ@0zEL?ht zHF`14=p!a1GzYJh1LJ(;l*fiShxdUjJYK>HSJx~IA29J7cI)}@ ztCg_J`AD_%R@+d~?FXtcPb-TU1}PUdZ8R;c>l|pJ_Y32=+S#ujzcd6e{#uXoFL!5D zsdrW30Wd{PsG%5T-H)qDw;x}9Kd!{D&iDvRMteRc?Sk54+lNOAU4R8%F2?{O(DyOpT*e{UxJ%Rvcjm_CZBpEnB=$7IlW(DV64AArA;x8d}6E zY=mkG*rK+EV~cEZX3;|U5se8M*H&X=8T9k#I`s2uBFZwP;PCfhN;_M~n`G-_a-KaR zfMPt@T7RW+PGJ)i@0Cn58zo~)?KrqFEqia*Qzh3qgnlir!>Kfp2PUM#(WcqQ6+i*b zo}-u*c&50eY}B=Y^Ss;1HQ44yl0?e*@olVpz0KEgRZVBkW*SYCOKPu|r{K-yW122K zCBYwQ*7!zM5ioV35z^*+WC^xN^Wbr3H9y&{VSTE?;a$g$K1Xj=*F`7^g05P@(K+ql zFmb_)Dli7vL$@M|cvOUQa_qXNS|V`m#PryszXWhG$TSw)2yuSLbjGx}l}ntH);vPj zCcLVs77G7f1jCDruN{)&Jai~74Ad3i{pqw{VC9CwS%&-uKVnrg4$ktZ*z>-U+05+7 ziA&|;i17u^{RTl z_*PF06cgsz@<{8n<6WvYb5N#2?2H%cj{(^9Yz4Jw@OR@;p)y7&G<4`FZ1Ut2f(a^^ z@MTi*K&RI{wAw<|HmbE4q%o`vW_7R)T_ZE)&a8Sx83mbtzl*u~t_8E-3Z7M7k!69Q zr19rqQCj=<1#hUN>uKQvBYYC@vIl^UW$Lx7-zIw+Qx;?mp~2W6 z6+D#K%e>#7Hd3SoqfMfDEQV?EQf)mah7esK*RpQsQ1vw1&I!c2 z%PPWak|V3H+cjYp#AsQbb=Dufx}QIxEAUQ{#+pK-OR<@?71GCTYIh!M{^*6(cuveh zicq$&poyldB|&uxyv{9ndpS6{pzp(?{Fn%|gi=i-qWNp3 zpr}s$Hl7V{RG}BI@UK4o1~6AKUg8bj{W=f4U-vt=01XLU`@i@$hf3#i&mB&z1Q#WO zKvp<8g$euHpw)gQ=pdcVob`z_kLRZdMj5(m`cCpdu9FvZ3qgcx%g~*hWu{QwUI91l+W&s*0GO zUmv)G({1bcptOfp$ahm5AfBFhQa>RK=t)q4B0+KTS$}w{Kd`1R%lRtoQmHu7R#Srj z#KrCnmkec#mkTRyK@8tnaEfqC7eO}_3_3L<*{R;M>0>mL9TU?_pW%_-Q(EX(@3=0+ zHcGUXhkO|TXXU(sqX@r3|MBbB>wLkVn?KLz%cJsKnmd@Eo2e3l>34SO`QG*15Fc?7 zUC1T#U9=@)t<$Mb{6QCCBt0ZEa0#$uOar~@OlCRF$s*>pL);x(NEP|MbP^W1&FT5F zvElGyV{k@|&JIaIQ*QZgOS)E7aQaMkjl~_9NyVc5n0YM{RJ+1&*TS^?Xd;QIgN=E4 zQV3pG~g#RTaDK9Q_S2R&AUr?w|C z@TSCDqZX=G%$2|jO)!bZ0gT3HlrppN7Vt!0k)bgX>i5yKh!MbW0dTOhtzPkKeZYRqVs8&8LJ;sVe>?n;i zw5TolUXwlg`nFIJLb%2LO(j6_W5Si(VREb?kgk6nV_eXDkT#9lV}+>hlhv|5FgpIMvUYQ=nJ;F z5FyZ`$uT@{+d-C+m@kLNq77KBp+rEIHDg1E;pAx+?Ek)g%Tz0irO)di8-hS6lT0=X z5`em&f1Q^zK74HRp*}D9XW9^%+vES&n^s 
zxOlv6*f!RaGP0KK{3QZCr{9q?HgNR4-=&P0>iMwum}mk&pOlEkPl#gV%8@Wp^hiOv zmMUV=MVPVJlI?F3TA4%M_wY%OVkPrNj4svc_6zk=?4nMbaG#a@b{lFd zBTAR1O;(&G0bTszyXObDNzOD?BqIyc6u0!Oc&%ML4{d+GOoB?D&g-vS07ZBDubz@^ zb6>asa@os9L9ZCf0=;>JKhl!ml}R!;3G(`iS!E;8CTb~UU^8qX2R8K<>mELfFY@27 zxn1hKUed!-E-vjx7dH{nliXQQp21)KdxgZ8r3eYfm(8m!4lPR7ED5Z|kd4o!wA429 z!T6W7erUgPrPjF{?59O08z%B1B&rEF#@U(>R8LHkm%aCB@cR{OctdVU zqH60tQ=@g@suuzmRshn!pC6}#@_BMv4R*_6LE8Z4V#|!&l(+TmrH(bFL7)Yy^P(zr z78P=2c1u8dk&fTpX>VXeWm*kgnw(Fj^-@@pJacDSms)zH4in>Vq>^vCL_Cu8Ib*aq zMu-oW&gmEMdxJkB0%AmF>%ag0*4r!k6xS3(=O7ADKh$M=-p;2%L0m?lc!UkKl0C}*6P;(xMKcWU3TP-B|DOBY+cq1mE#I|5L= z=w9G~7Pe!BTMpG|v@WvUM(u8BNW8Op!TRwn&?4Jw6eJQ8sZr06so+4~P-^1LDjk{v z3)5qhqn`V4x`HF3s-n8{3^V3F0!5shaWjQ3LQtH;i&10yjR+mcUUv;A(yB+Lg|5(U zb`iw+UEmg+$usAX{BU?VrZ(K?75P0+N%2R=B^`^MK?1LaE~Q5L@OT^gvVjT$nF!@q z6&nLpOY%a*0W>8(?+a3P1H6wOI^&w^HXJW&6b~!L4=wXb{O29l7y3=?*ZP%ii%3rr z7%z=f{3!V@LwJZq+ZtzjonK>ALeyholK%Yp^RCe!)=$*+;Z)Ojj1Hq+`1gMd(!aPjyY8m#kfUQ+h{7DI%Y(Z#-%&7*x^K)) z*B!f1tkD@YcW^*oNcMFhD5OWm@Rq4y)|u~8!M8M=Qq8r8XlhPDEF`@=^pD;=Do8PP zF~zXB4b06N3E=N=j0sC*IqA0tvt)6jUjvEqmw)@vcclNE?zlK#aul4r-nrCSD^15m z&t3^8v~w4L%L#>nNEED7jf)j3fIU2sJt_VnT;Ri_yGYjKCW^J0kn|(Tn0=r@uuzUM zQSj~uv`ly$AW9eCMeU)egHMW3_PFZP{1+E8*C;|p`u*xAW|egGBx?AMv+jB{ z9O&_TSAjw~h~%Hz{lDTE1s!MO7(O*Tqi`#iJnhG~B$ z@AQYX7K)Pbq@IiwiQv}0#vZe&hqcDw2=tElIMgIf23^pM6T5a4?6WC4h(4ufjbwa4{Rs1+zdmv4gYT7 zvcrUqx|4Kk@H=vgEdA0Wj>p&t4D`$fF7KET)PTo&;&7!yF}IeO^NHS~(;w77_08 zE&Pfm4r_dD+P+gnp3QcmaBjO+Q}&|>sg0~m!DU0>Exq0jZy8Vc$t-3ILo&{|y?Hlq z^Rb?YtP>Ecw*sjn+)=Zp(C=1su#DKfU0%YwTH7;)N|&Djxs&I&BzS)tbQ{#*yG;4k z>UkoMobaWrnJ5Iu3elb8+(JFo0nN$Z+UUINQ;hk~%|o8Ba=ey3KVH7<>~xj1gSF1s ztF@^QA2R32X2M~6)Se1O*(*^bx5##*gkQ)!8<7>s4JLcx6De8-G-*Bp+3=7Rxi1im z9{L7DCAy(!4w}cA`WqN0u)4@vf(KeijX7!_5vX{eLgr{yVm0d@$Cf!or*TrCc-Qz= zVL!mqJk{(@bZQn#+0Y?3^!LU!kFk%f91ORGwQ^E-Zelg?d;P3p&L8oNGv8iu4|g;s zrpNo0H-2G~Z67Org#qkT_W0`as8HU}K(fl&|0Rhs6sXGl6MlRUQ}nO{IYdhWYR6Q_?)Jp!?VI_FFN|2H&Y6?XxPJcQ 
zgcULhid_&(w$?-BCc094hUlikOXuT$t}MVooiI|$OvxD$D*av`SbJ(c_nujU)N=X)6?O>#%mLOqTtyV zm?88k()`2$R{v}a!cnLsH;OULGWYdqOSYXb7|#Y7siEg*w<_5T?rFblLdbaYjZ|>m zFZ{m)(Osql3%oHYCVoi#V^+SN?>;A6&6sIfku!`s8BxObLwnLl2|S5hg+^>_ukz^x z%`H2NqYoo{7jj$mZgKbS%v_MY@3tj5@wP5bipl4bI|eytxA?AE#+U}V$p@;#tr)2` zC^9=?e5dJyV$qiQSVokq@l|fY`?%P*dugG^!MbTaw>5o1j_l~xhLL9n-RK;=!z;Xz z@Nk7Gl&fSy4uN!he3BnO(I(5JBCQL^)w3ZLg#2Nrr880PU&)|csNZeV0P;f1jW5fF zu~DL$sJPkM-gLL<;pU&{o-Y%F@=zzFtr1lrt1BPr;2&+5s`K~j&EMYl^F3-!pF<3# zMU>bS-4d+4NwmAYT!`z;a}@e^b95w%VR&tVyAI`K>kJ<|C7;WPw{yQg=U;vr`TuVF>S_NG%7~oRn-8T4`sUG>)z?cm zWYWw~j0t~yOk=w$W-DMP>^U`g%{ho9po_}rDbc6rAv@nny3nJ_gnEIj5fzrEp;7iC zfr#JN*LMV+t6GGMy-iM;?APaMq33W6rQkPGNhRJsH+FIoS)VkQWMjcut=ZrV^(L<)j7{KrF*Uoyi^MRSmO3IN2j}3+Oyr@$;0ThuuY8Vw}PhL zThGSJxF)mzWV`xoi4^-8M!Q6$r7g4`RHKs+cKmgvx$fRa7yXdz80L}|>k}m;LjO4w z*A;t5WAA8xaX(EV=trXD+~KI}=o{x4B0&7d9)JKaiPDmusOJfPr6Rz-$T$cre~H63{Od7AHp!8eH2zwYuwk&q^dd6AX@~P~v%`A3YtHj=)49(k@J1^VM z+g<|RF3PmK_aJPXZbz({jz8z(61A;JNDJ&h{TO$DmaAcNyO7O8DE7(dGfl7Y&y%L* z_H=%$Px!qjj zntXS<(jN)Qf@iOx2raRb-@9e<80!Ucw-InVE{Hfl4jIMg(d{Imv3?04_`S@4`!XRy z$F$-cucbt#y|`=9w&Jn;VYoFedah+Jg~Ou_xam{hRm{JtLS_$p+spxN#ew$hK>Vvy zG&9q{@5*l7<*_^9Xwk_p$bS*86XQ~$Oe|*3K{1~TXaR}5q6LRIpQ^%= ziCS_@YmKC4jX8I~FSvXRThxI|`4qiSN(D)FEpKmbg{A&>clHzS%|PHGI}wh_<~NsX zlXQdW$@Q1s3>AZ7iLt%GS-qii*fv8=I9)c>6{}00qmy{44ld@oJ)aMT_KO+X9k<0n zuUEOukHb|KKkpI;huAJ5~$^&<11;9-~otd?5z zo$tzQV2B=z=qC{g{6Od&&vbOLF_+*H?3K2J!a;BkjOs~|F3hsT219I2>4GPI-+R z{}U|VtM^wDhwJPZ%oq!uXWdnOQy=W6QK$(q7V&btpy`|xa+cIdcqB7&_5@abPS!SV zw9*%UUvul|8Ub?+YN(9L-P1f!Sjo&TEX@a(Qdv)sUcB+=?yF5iy)mSnf6-bf1Q=k8 z^GPpnKv=rKNht5Y+T03nt5GBlhqx~ZTbyPbHZtzJ$ zQ&lR=RxO-@0%=laB}IlE74M-XI-=2f&0KVUg2j}QE)h)ww?~$_c_Ow}=$O~Z|L4Ys z$NF*MFZT1X_laLBu0G{Y^WnQ>D~Df>opyE%KWY7#6u`9yS8(GrD>N|~9r@8p*d7HT zoO{ncAz^Ah*ej^abnZw%c$zyQSA&@vRJQAh&W0X9MRimmqB!W8On?UN2$t7nLZI4B zS&rc$8p9Z`yU)*nII!Sb+stLckUM)I{N{uGH8gIEMFvw<3w*oM{I(K5+?te7%AadH z0A$gg=IYNXl~Czg$dV4!5J?JqmGi9q1HNG=RUI}fiCHcoF|(GQ$jK;OiL-BchStRu 
zU0D7(-G5h(D+hA@#}|Gx6FBCP@lOS7(Z>_xhRTza-lg?Z3N6Wmr6;KcmX^)Piz$Vn z=<~j|9n}eRae&>-$9tgL|q&?BfpbwInRy zEaqYJ!niWEP?YV}hR@3#q{HRlf>W}?7fL)23;($U8N7+5Se)<~^nM@=6Nt*|gs5!U zQ*0)H+;6c=-=ZJz*^FjpHskO|v?I2>hx~h+_u5h$!0YkjuO3vx)Cm=r{7no%wx)1W zIeBvg9-~#Rggkt>tdH1By@x=uMJ9FS$zT$xV|dQla9gha!EqJC;_Y5ttt=~)Z5^YJ zj0m_n;bL=VbUrzZ-X5@Nl~F0-)%fV+4>kNOjXaqbZnCTPr3^9^g;$y(l;omtiNB(8 zDS~I|`9}7;*`y2mW9u%i#xV$toQGiE&O#G|e6K%G+7P2Oc1?`Q=ilv~YTI2TGvl|oFVm=|Sw02J0&B@m)dsPKe#J`#9JJR*mUe+-OW zHw(YXcof1P;F}9Xu(KL1GS61K42>;kSfy7Kmpy)fR~Th1by-tcB9+5cXK4{hL^tHR5m0ax7?Y6VG*bTOPrwN$DYAqlwoxkY4@ z#fzJ%zrU%sIp*E~mL5f12L5is`vKqvEQBmygA;z;=k7MKBPCjc^fTeq1L~M%Fw_n8 zE`?*Rq#uGDAlx=Z7obQBY*>q8u-X_Mp8WQ4b8R(B04*}zma1^YG4&#rwm}FWuNvPV zCs#-@x!MzeZD?UpZci#*X4bDBs{KQb?jjqbv#?e(5eUCaP8=(~_kxH3BnUW!`1JdC zdU}Q=PJ40uf=Qj0Hg5|(3^4*jg|Pr_jlY_)?J(H~dlO&PW>@npECe1eXf%9H^!1!G z=Aioxqd$p8fp_EUyl6LZr4JKk>-ii65*=ADj-4S9{sCX%?K{whTWIr&x+w{TF8U?}~Rmwf9n)#E|8n|}#07L5auwxrC zVSQ+>AV&cz9>h1D!1fl+$dVK~XOLMx&vVsAg_ewR9$l#`=gUxUq83NpM&R=qgTEpj z=WqJ7%F^9(gxa$+=0W<%g_&BoId0+Q3mt4&=4B$xNotO-w*wh=dQmn;4)8HswX|`N zv}V?F{WneFnWx6KPDyt6zFe<^Kbc>DbQ|2huum%vF5SdSzw-1qtg_)n3Xgaqs4j(G z0&Sq##QWeLth1S6e%(i``1M5a07y{ zkuoq&o{yoE!fW5@a|oua)WTlFQrbj%$}iMhuYqaZQ@kL`72|^EP&6d)7RU#b`RhEB z(Tg+F=ElmH68g{wP!_NYjIvd8oJC^#XQv?zK6QV zl~BmS%|gLRCX;rCy2^`Dlmqsdt8@}o>g|Z*AMLMZ3r1yJHSyUhxF#fWkooeaWjF{U zrx69_F(bFLj0SSF&Zc>GPBXiX8SR#wNXA>wi4q{2C{j1s#Ks?W$VVa9X(NSMKExL3 zv1Q}b=nCPJPY9W|$NNvo)ZMYm2vLLz$UDc~3jZP=-)9%9nUE1>oY)pNs9N)QpS>%n z!EB2I8bcPPbNwj73i8ZSb&+P|`dYakcqok%lJGR7$MN_vo5cG;{?=bB$p@&Ra7rbt zIfozTS!l?3Tp4JQINRjV&!U#Bzuf}jcm4H~;K>Xql_FfzG1nzR7Q3Fl$Xvic1g>;R zXGmx~kIm$5ar(AEG2vz0Lf=`UsC?!c2zLeW=+6 z(WavBqMFaNT6^<&bu{`qJ#x@E&9(JN-`aKa>ELbr*2(8ym_oZ?wZN9d>_W6z9hw~1 zVk8~eylB;0;6VB^EK8@Fg=Wn@aE1k(@{1}ArZ~F*K{aPEY2ER^X=N%>ymD;E#v@p;&7n9$f>%gP9+D`>;hAGeAHy*%rM6lpNn_v6iBIo5L21hkMCPDDjV+8w0Iee*J1}MveVg9Wf&j+5Rmhn40E~C z-Le$&e^CA0tTb3l5@QJi^4S`Y`o8dL{!v}Y67!r8NXOM#;u$B zDnchMR{P_l=iU6|S@&y!$pj>;{Jg8v4|Q8bA%rzf)-CNSXYQU;?_tUu2!IY$7Niy0 
z!=fl+Qyz}ADnffB%+Ck_qT@z$(`{-hc|(D7pJ~6>6Qc%88nQ)y=W8`FK=S3G7nsix ztu*$A2sJulA^r6E3PQ=5-e7b-a-KV0AE9s%jP zgaoj&JbaigA2LTwNMl*3@G1MUKP?~4R9O+`^A=N+Fv>N;7IlW|aGn~)hvL{pA~=h@ zt_y>NS9ku2)sZil+0W8^_K!POx|!LccbFqq6W3RY)E%Us^o7`l#*%*pl77Eqqm{wl zLk-*;h9_vwOV>T5Lm*wdQq^iRb)nlbEwX+}!V~&1Ph-6lnE-*B`WE`f>ScK!IO8~t=bha;ids4Yel@}>IY`CD25@fXHD&WGDX@j zn(!+qEX_@>=T2pib5lOn;jCXoaSMRgI7c2fq-X;N^n14ol6(BCuH87Wh8BQz&w+{jwA$Yz_}6gg#825vQnP zGjagaTa5B}e8C{RUv}Lto1txK1`(qpJ|%_!HM@lbs9FBpxGt4+xYp>gH=8Tn?BLU_ zxIHpK4R*1u@R2(!i<~G&)mzG41J#ym%vFFRM|L#iyZ_w|X4$-VbI`@+RQ~U&%h@80 zkF;uO7+N^U$QG)+YH(M>yzl$VEo!3^v{sxc!(t>7!6tiF+`77TgSih}lw%C^ARVXc z>hiMJ5S~@XE+zBh%5>lm0!+oJ-QF~|Bw_t3khpX!dUS0gKcCn)6C^cCXp$P&AfjK? zJjO<8V7V65DZ7m1VPGTZ8V5N*PVnd7e`A3z&-WSJ(QeiMb@p1&?eiR&-v39#iETAw z++v3aE(e!IU~ro~&NR{6asg>k#P}>Bk@JXR#4OC!>S1DO@;d4~%$_QmIu&DORA-n~ z_`BDeRo6qT?A>lF>dmn{`M2%LRW9s%NMn-N+%t=<*X&+Z1IqJSR%Y z*?NT4+(A+p%02ORa-s68V#*F=ha8khUCM$0c5t-sFP7?^Iz1=fzZLlVbMwv4^&{DZ zD*l)+sfWZ9-q!H6=|W0O!`Y}62G zNli*B33$ojw^6Z@=v5OMk?n0CMk@-A3(vIeOkeo@a0uB&X@!Ldm*aPvuq9czCyY_9P>s_6qS> zZp*nlW2QFWmDuo;O@_Ur6tEuzDj%?L+>qBX`+b7SRJP13?Q_ZEE!}q{-FZN zop$$=X1F!KXZyoq6cEp5aoA`rixZt?iGYHF+51tPw(O`k(F)=StLs?atKhObqrqN2 zF)qQQWpKVXt>6>!tp8goTYQLb)uA{a);Z$s*XrcxLx33` z7+7rSdk)d*-7YQ5@-q^`G3TaCtA9d>_)a%+N8;Vgm{E`=gOnH-%K#W1B21#4R>TT=?Xv1CN}d>hVTK} zufF5+?uwWTB(ai!*J#z5+Ax+L;RLg_NYRwZ<3bCjUR(~7SQVIsj-*jwZGYXRG@0k+ z`Gi3qU$vFU2-gwtx_;)ho+(prV<$hyHZj8$C(b;^po2I|NN|3Xxb=NjMu_g`^a)CL z%V>_s%a@ld0t&cxlmUoGR@6B4>r}%PivEl_^$)+s6NF8E4JqoIbN|AZmw!3pU@Kfb z*ZE>HIIGX*BSjB6*TIx~KNb&>-25YbAd`Wj{L(s@&_q6P|67sL?Il_f?QY`3By*fq z8mH+l`E?}w>vMs@dU{qKVL_g0h{BP7#zgrZwt05i@4r?SIzTQk4fb1e+~9ryZgb0? 
zmbJ*zgIqj?H1Z3~?Y0h44qWf)r@}N@*kt^_0$z0A)7>O-`)6byv+tQC8YaggF}LkR;1Y=T9k4K7g4y; zt(gRp4-H@{D#&boj8|JPdCc_@zF_LF*VTdl%_>Dmp%Om9C<{HB`^e&P1A$-0AGI2_7yI^Q zNaGK}!oqTZ#g?TX;43)M8%4-jA~dY5>%i2xa-x-XJDmdGwz^mmji`RHnPgyF*m#$B z4bZTEHeD3Zyh2AtM@HjcV$l`V5r4(v3j?Tf{`RO_YuS51RNpK8qOVCsh|7CbvzVGsxDV$o6P#G76=Wes9k4G+qeN`E@2uu zA6z=A?|e4iE}crw>;jn4W?mG8X%U!n%5}+%ui4}J)v(6&DY#GVY3fPq?EfGHk)eW| zIZa1D6ZE>j*gewu1|4}JJJ|K-nvze4gbey>5e7@Bz?~Ui0kvn?ytq;@ZpKTWVea1A zYk;hLd|aHAiP^nlEEfiCMvy#ziMl|*4Aw;T0CbIL_$d%F@oI>a@Hs`y1&cmA34TpV z3>5dua@hMi-u)t9j!Dn2H3XSWK`~Wh^lDpRhpDvhE<@q?Hr;?-KrTdWBe285a@c$`la~jD+3PDqFeN0T^r(uEFaa8$>dar#8WSu{U5TA}8#!z)WgK}noR1L5JDF^`gUz?UA%J4R8Cx-PNDSNp_m>MieqdhI*BE9% zGqWB1FFoHq#LrlAzK_2li!&;V-+?!tK{n{5HS#Fy(-dFb7*>HgNo6 zU$Z)JOC~5H1DHE+=HQ*=uhg>R>c4@$uybJA+hw1FAC;0;MMv?OU8~dQ6&=b58<({{ zyU2ml8O2aZW31SC?JE+iwK?S87i=1S9%Uwu_o$CZ)eRg&y=b|$%hErUgs3kfY@+7n zUYl1*a&vP_l!0+Rtwhr2&uA!nPDk^HBAT4Bn$=riPOEfo`qo{(3B?Ny?o2A3@G=0E z?eA;ve<)(rr>q#qngHGyFK155S;+0;(cZ4}F5ASCf)%Ku69*Zh6Y^T^E{ZT^U2UUtMWhg8(Bvf>1pyqVK1S`2v#%vhh~s9) z1P61hj_CDP*Dk!=vZbz%i7vFRl}PQmmP-#Wq{>j!x~Qq+pPg$+HhkyUJ0-L;e1^9& zLv125O>5knD+O`LbWM!dKg;h{o-(9XipHJcT(xsSi=Yq-v{`xi2|V4TDqpG6I?Eb> z$NGKigJ3yO`yT#I^DAl=PV+2G0Z@rIuKa_Pgc;-soi@H zy)t36n&nU-u&^37Bu_I@*0SaFnLK>^<5Ki@J%{@NlqUG8J0PoaHxM30iF3F57L(e^ zC+G~4fjK}vWj=6Kmes>U1FKYo{VZSfod4@vd8gGG<_(rfEm3EGW0P*j%BHP?L|3*wWXG7yC$NcqwDbgkQdDbmO+#vU~cA+O>v{u1O?sb{p|uUKDBPDwskTA`_{e2>SH$Yv-9m;}@zXbW1DI`^G?cYr>JNc#^=|8l%aC zb`Eh{N@I&PoAobgK*5ZKuVhwT7LDQ3^Xp%o-pt;-DF}{kc4UQS4@sBmLZfN93bq(i zB?fSj_iLL7_2uQWvy8s@{#rHrTJ9F-pJCCU&^vg;IXNX3j{OlxA8cY|6Pi&iO4C9x z9)EZKb2eL{T+K!+gzGT0(8eJtDm3H=`tnNxgInh6`^jhtNjiSg(EX=rnAj{8qHDp2 zzb6@sLE{U*ti?K=Ddguu82;zEZh2Sqv6`ouNNXjNL(m7Hjs7rbhhp`{CjDJE*d<}M zVhZK*SlOtPaku`5hV$gj=55ERMjKCE!t$S6zqQ7pZ1C#by8CW{cEoapb5{X4Pn+YB$jQ}wG*+bRcNkZ zyROkz3G`5q2$!c)6p)aT%`8~&NR*YcnD2xnYx(~q2wy;m&Bn!%gy^@fYG{D)-$ix36d(b?33A0Gm40cb3TjKVaoslvO7nkADon z72u45B8PogDDKDESob1h6=?LeF%#7Fe11l7RwkrRpZnA|mte^GSArW*Le?86F~}bP 
zvAq>5OME~2Hi}+|Cka&$AerQUSpeKLi6)LoFtY(GEd}n|-Y3AA6=Fio=!0tHIX{^F zpw(zQKn6%LW+^C~Zw;MC(8raBI@>OaCl{6hdi0t-BjYuY8z_4sNe`(INv7XvQM=k7 zihT60Azhla6{J~Yx8Q~5aZD(q$oTLNQlj;xEX3+Iy+QL#JfsuyV~!Q9yaiT}K*Un> zt66u<|SzLZ7VI5CU(Uu){)_1ilRpN0akIIh5iKlE5p7*~BcEwrUy$t~NU;oHdNC}{go<~S2k}5^+@p@qvm|w*Ct@SU-FUl8*20xus2#lVU(yWh)7Mo>* z!I5F~Sqp@SlCp1*fS0+DZAA{08F|@xoSashpQ&;^Wet zrSk6M9NU6`H*1Z^m4XeIETNvsb(;zuop6)FhrE;Sr#eS+VmetaX5!g@9wvTVBXgBx zFy1VosU33Hc?A=)(@{w=4R~1z1Djx1BDip!fe_U@0 zi?1YGA{jQZm>MFots{Cxv5VLz0^l-2&NUH>i5B1o)F;IcghPeU_&qKr<_~2AH%3XO znX#J~5m?tgZi(fS=#VNq^o|Dne101-!n#JN(NSlor_C{capx1@njon7<_hxgh{d*A zn6xokDVbaRuM{VtqTI9R7J0iIcY1Rjd$prmM1rGhAEBpE;9WK7b;>3H?BJ)@58X0L zlHZ)u%IlizIiwB(u?BP;G9M^2;SM>*{S9Vj+ZXD7+ruxIHxc-pDp(8`s!#`LmnT zdXQylAjG^4Icka61yx-nyeAkh%uSs;XBMl1Lk~hf?eE5WLA!dqu%_-Dr1|uJi2k5P z3JPAju8nNhV5yOBQXmV1tVkg}jEv;eV*W<1jsQKP%Gbyc(05Fo%QjEx4wR3rvjea`Jt77!gi!A?=RTdc0WBP*$LKUn`AyZN zZun0dJc&9#&1o>7NSx8)B&=ePWtZ4+Yo6-Cxk|Sdi)mh^1mw#l*h>fZVN@7$WaA&N z>=VaS4&r$5o?>$QXrtX%|FOkoQ9x)=b9=jgC=31W%X1w}lo7_E6NU%EZwp<}AEl+Z zTNPZv4;;HOaXs+c9=KN-1NU7T~P=SsE+3fPX!R&hx-?!Rq8>P#9N`Yb21%8MS zDR`AMpgH;DauFJeS_|_h!<0>rEZAKdJ@!iXl(TG*JH36clV|A0{GO?{h7d@kfeYbo zxY~5ZaV=|hoX6&vwCsHT7Z0)IN7v;)SPl_ngboTma1Yk-2WKWGW?1!5m60&JCE})=oXcoFz^diwH9~73 zY*xGX&yhE|!-|Erw*ri=$qph3m7Bx}=mxdlwZT#L1Y8&O)}PQ>&;!_WiQc?_y#Q-{ zL&-#mmda$SqHIG2R#1d9SCnCZyE_Hz1nFT%@w7;;o(LI;&t0T+AAau7{H^WfA&}{2 za3;|yE3X8cEixFFTB?27NuB&7Cjpr_A+Yf{1|XSu0<7~un)M7U94wB5sd$w(i`{)* zKwpyu1wNJcMmQdN^aHRM4n0pAaYpuj5TGQxOT{Cq2hhMC$b|cJ6+g&nHYD*cW-Ou8z*J zvGo)JKrzs|PU0l6!pk4wnq|ZVgjc{K=WYnB?_8WLjl6{fyG3U=ge;mh$jAHty0k1N z;B7X}`4acW@Xnw|pVjNh@l+1UA&@m3LMVvm8`GQg44<1b=U?JpW0~vr^af4wE2LJ0 z5@bf7y$<3zpP!#ys?+RLxe=k33WLnO)XivU`3C~GuKqlyMsDz~Hd3(h|5alaBh@c3 zw!fDQe5%DMlA6few(tWVTwp7L0@e|6r0XKGg_#`TZ z8MILsR2wGC46PJWY{@LdyZp2MhoBL3VeK@`YN~kYKe81eI0ovrT!m-Y!u*c9o!ab` z#?B{riJPzv8Va5O!`E~fShn`_ z2r1vG_$Ft;TFcv-8-lNXBh1=CR)g#r@yf3^Y+*Cgei+D1B;6*l6nWzG(&adl0D)W_ z0kSLDW^gL{b+rJTC|o(m0IB)itfyGd;@?4*lklvB#z{!s28jEo+yl3@FJJeIn(d% 
zmH(~GgnwAh7G1i69eAyA~*vO>oGgiJLw$Y#k!y zFwO>6{mKP&t?ISI&~kQ{?2?MClY2n5 za)2}vfiGc#4_A@-08Ca}zBfIMKUA7?ie6}ig7J>j5{6gjrzZ0=tYV?Hw&lg)=ZC{= zo#Q}AF^_x|?rLPvn+;Sl&*l4*V21LBR@U{9N+9ar-%379q_wx1WmFkJbJorhHztR* zJmz~3=aB>;I-59`7@v2metOl@g3~M{J@)4V)plM z5Oql}UO81AD^yp{arnWyGs^CSf;Qat!|pIYIwTtjPAAIwEfgE+?4=M)5r#lsO{!Fe z>CxoJW|XX2gTVxJXGg!*n~4N5Hxoa;{EPuKzgjh<1BQR<1B74~s7A!^LDdKf@+kJo&KL*FNlrh#a!=}VY(~bmf9bTfZ-E}nx#37mPm|V-C#qV=%Z zqUD55h5e>1cy9yG*YQ=>)>9In!4WJK9V%3qL#h&C!abwy;S$8VdY55W z74LT*?L=bCy@-uV9UZqYgpspk909DfRTdFxy@rbqwDN3Lv*i$l1z2mjabu8K zz;+wnaO;@O?t%Z=Higik#$!jgo~cTOv#zk+xYAk7(Lt(X260p3m?&?JAcqD3p5=-f znT}Rcz(Myo#wVF-wjc*xnyBcj4L?^3At-!68t_e0%aT4T?UXBF_2^{M$L*{C(!AZv z3qd-$Y_b+)54^-RqDI}4{UP$!2H-~N<6^Kkb_dBS7y-+eLO!`iut90l4lEFC`g`+6 zE-(EFrGn{GhY*lsVH=|B9AoD#PRxL#29w7F-O%v>c7Bc0!4h4pnbQ@1k)X7oraI=3 zv(KK3`YwPJx^Q4DYWe3m@}%SPA)oH?5lb$reg`|w3;SggRywBBl)IS3r2?98KvLTmI9%ESBAMIzsizpx54PP zU;>5YccJ@Ul&vH%ymMlFIQ_rBzIxS2QqpvTW5fMJF5>*gdnqu{P@kq1LAe$dr+_}! z-t6a2ueC=wfqzzyB8{BYQE*fbc43W#7nZ`9E|&|CR~TZ0dGUc!br$vL+1NS#@^sYH zb+a-WXN#8wBVBcNvcK(I1HdgnQ2nu46}4A>029+4%Rn~fte*)fg&302*S6~%zntGb znn1xEoQ|?a542X?v`9r&Bhyli$E}*dG1&LVw%}OG7!yv8wUipcqVMfwfL|jvpH+Z~ zBg~^;?b)1CzI7eFIU2yhqZK2&Na!C`#*OW4jM8lbw}^5*mcV^IEaf7OK<|dui(L_4 zSl2UYrcAuk{5L_X4`}FsNcq;n)PRSTnTumVGW74-X;fn~h>}J`*Z;!dXo!X0S>2*O zgOIm&b`Y_r-dEU=HDxrO*waUmJ)f2}Q!zn{Y7~0hNLO-$gfVmuC^^dd?J!i+fJcPG z8cG;<9wS3h`3<^J7iJ*gJWLc+BIsSBsYFY#i$?IsrEqOh2~?g%e^+sVTQ8vKU0URA z$c9ZR{?d5a@`D=x8N}4tWD>q_!d8G3Ub7At+Q?D(_;IHfdDG&YnwgGZ!wLgyGb)2# zp4NA25gs` z-U@xVOZFcOHbe|0ZoOHX69-#fcqTK|LdOa=(V0>SOuOh%JNX0R4X<5U=KvKO@y=PK zPo$i~hf(V+hpmJMwUL;DT+lU7B3eVw>N(nfyUtS}>r^ilo+Mg~ z5Ap$7u~r!EAQG}dvP0zsTZM*)eLk(21}VL{A)1y&oNmh^KOqPsG_^Lf!Jx_Gz>FgT znL)YClq-&=Vo%X-ad0lCY6P^x^wE{J1lm`Xh%WPaVMr8UD025qh*vZNM-niEaoEs= za{k}V)4zwL^c>EJ4Km|&nw*Txl_De(lL}KpfTQAvE~uGan%-VyAaX0(6L)12hB;-> z?7_ANb;3p3!J8i}ln;&d(FQYIIpaguIDTp(a@ayWq;(LT=XpSnh)`af+-{C)T9x3H zOE}yeD6 zuixfFpSm$kuc=A_?hKeWx6Oyy1AoK}laGEH 
z*?Om}{w$cCELxE_2^p9&yRa2>GiB*o8G^Hx0tZLgYprX|cGG3hT9}Raqf)fQHq$^K zhRzlgiV)i%1B8+!xhdHH5%?j0eU%f)C$5u&S*+#Utn(kT3{a?mf!vZLYy-3>&n=hq zQAd-Jf(qV0#X_`&aa0qd_IUz+VF*Sr${OxP9xQhkq;G&vi~SIaWF0!W+PXETl8@uY z2PH?B{PRvXIWTBt_~lr(w*@b>#3(HaudN3gR%FV(b!|3Zol>i@}vaL#05%TS8a^g$9=U7Y%W$lJqv z?ah{>chzD^tWkkdYV9474To7mYG!2y^vl|s=PApRp~ev4Bo&Ah{k(lB5; z$6a@kn^|_EH{d18>H*!l)-MONd3_>z095hWZFp2tqI_Y2v;gzj{U0mi1T~Ehe5z!v zUS|Mi3^tUOkwt%|6GNY!eb0~?QC*Ts zO@>H4UuL!hZ!`nw~PVaBI=1uw71PU&vu6ukvsZzT8sT$s@k`H0Gw)MQmr z)LRF!Kbp#^lx>rikfwJ2fDa7+Ln44$)GcMQcvo>;LI<308Ra6p>hl@?-C;P9M>1A& z7M=Wg7F|?Xh>L?qCij>B(B|~ecHkd{2iQMwL6XY6XT~=nJipAx1 z`ThG88KC-gCq1MzDQaXnB4OINZ5rF9S`5%*ITlgva4Vxnk!YkIe=jXs#(AtTYbY@y z2uf=i#ePOCH&W2-Em4d%nW0``0bJ9hh9&`P=YZz9 z_iOyWZGUU?L)^@QsYXRxT*%)a)>|_I!X~+!PBiT33FPr7k=V^nm%P`0pe zS9Z<~=-l(MF?dUw&dX6uX<1p2np3_qYS_g^4PI27Y2!g@5!oP|5+s^(`Ubr# zbOy+X=7x3-SF^kL8$kA&oK5p{J$vA?szL&+CKzBbJp>7d>KGuE@mwIY=TbX)M5){Y zIaql9cyj#qHY%pQa6k4ZHq!Z#tvXs3a%UT*^(!l=LycrT8s}beG18J`D$%#8-jpMs zYeb9A8n@y}w|-@D9)YoB7R&v}k`m&tU`H`4vj8(oK@QO(5~H?z>F)$XMfE&8KxrFA z6B(^1dtL&hXUk&*zHkT;V$hKhSIg^G{fjooDDWT_?1(S9R$-|!s5W8i*81aiU>HF` zGf*h@$`dmO04T(m7eAoMVc4hUtw78!2Z3;EW1Y(QIren??g9!ajj2uH6f{nW;v zLKC-g^T?(NM@#i+G8_@_DdDUah~N*h#!hI4BWpjal}az2+r!BPPF3ETtDua4{43Z~ z%P+vkH$0=i)m>j!-&X0WG)n#-c>o0>laWM)=eMk`Dco-H}ZS;7acLJRZcfts5 z4f(KWOdqplsl8fbJX5Z(yULVCI5wGT4U`>9; z@{;91%pA;{4Q~(*7~KR{H3_GkT7t{40&BB!yiJ}&s{0eTdKUlNmC0j)*r4B4{Ms$o znoJ?_`~BWnTV+6H2M5*jt-y>IA*XE!#8iuq)XQfn)NtA>W?6BA!HJFb_+K6K?#ri7 z&`B7;Y%Q6lGbeo$uWkdrF2+o+BrQY14uZ_%RN#8k&Lkru$&dx0lRZvDqC_CtKG1m( z52jFl-_5c8Rwhe#) z^xAHu`OYJwEh}7zTaYUSfiWwGl8Iunb9ft%lK|+4Fi7# zx|uUH)19yiWrY{AHW6V?os=eqjMoy+B6()G8Mhs;o72D$clMxL83m8RB5<48zIp9A z2~PSD8Jyh5Zz^PuGwsS^pjt-Mo2k%VMcXrwj>$>rcy($VQX0Z9Dlj0kVoo%=TeuYm zn$0B~@&5@W6+Jj0Mbk>7je`12(HT*Ybr$%qDLIirQJDvv$r|!F?E-bdER4hTWw`Qe zWOE(9WY76UwEk|_nWl+!;yYhGNIqS;w?=#@Q!TuwZINq{4pxujpW@L9{(C5J(|7&x z$J=q6i=FTHe8G4*F2ns_+TXFnqR%Cc%ZlkV0585Y|hz8Yzq4<=In5f~$R94e)dJiXSN z&t|@>VmP{45x^Tt`VPm$Wh>XePg>;P>8#M199b5uV*f-`*qQ 
zj%qYxOQXb8)|_0=|EWKNM0cT(lI{w66ZK_5iA9?f`0m6aRA){n=`j!wOfnkrAg8pd zgJpf6;*hmQ?(+LsK;R|%BU6-~jupYg*qIm zC>ZjG^>fV^8gI4F&;iri&wFFTS88w!>y@OITQvD6$6bnUv61E)6(h}4c4@WuIePLX zPnbW5f(|v~r~Yz^U=vW*LBUKzKlb=vo#Ro1*Z`A)qxkPfQ_c#!+kI@5>Z!2#yXq|$ zX>D{#nz~P9F*M?U*-&KJgb^s?VY`##r9JoH_CAR1SdnC$LhT+B%@S2v8>A{MbH|4v zSj(*`L(BWs{#x$v`OLVQ28!@E|AUxe#BeTH-zg@M<%ff%=#{p#XzWssZ1zJWq<-)5 z1u&A*>0;pHV;AJ?J7B!)n#{Q6*jQBZdj1C-x{3n9YC();y^8#PP6-s;7@ z+Z5(-RW*vPuwesozEUaK)zuJWRMoJUz?;xQ87+o<%=JTl*;F4_HkLjvxUV>^N51?4`GZkX!y-B75&q)`zB9xQ8j& z(y2`RzZ!LTaG^QOa}jIevVdFgNe;4cNrNv7@7kZH5PQzD;_dxC%*gyx_EUgrI4^@m zcH9l^aUm&naWCbA+5$!(j8K{gtBOGO4?ylmsrYWrA#{M z6xFlRF<|Rb9%SsToXB@B+0(->zly#Gi%BCq(_O@C*Ul4yY-!j_k(xGSa<}%|C2Q)WuFGSy}JE%2NC6zFp2pcUJP=qg;JZ7 z8zG?1wx)DF?xQC&P+6bqL~WK1*j6YlnMUH+3urBWY!`D{uwV0+N{3fw3b*=*ZJvbU zoK9iLLmhY*lr4Oc45{?RtvE{=NAyP&tkPOQv?+T2m+kGdM{vOwTj(|JS;^0Kl3bWh zm-)8*5kmfp<|fU{7#0uUNm1Tscc7?4@27ADSh-^4=6)Hy8b{O02;z8lLwRHWRoFjw zZ*Yf`Tz)LAJYEEeb@|L9HecR@!^bu#X;pjP>h;Jzolf-pUW%r!yb)+_0wPYU_~5;}ZNhra z?4kKf+g#3~(hsg85$&i0sh7moy@pC%@wI%B#?>J;n^wnfRoaZUcsTe*c+j$4*nSQ% z*U{DB*JTZX|BCn@kb*-PJW6V$g*h8s(ue{sh+)7iajO8B^#oc?jyi+p<=hro$a1Ed z!{@i4Q#9->c=3DOeczkerDBcsrk3<*lc@(>iM)<}JN-f%R3p%HIMuV$TkORjF8&(7 z?LT!Qqjr<5HwCTWc+$%36EPE;2uJN+8#FC)tdrJt^MItM1*0wI)P6!}ux|1Sg z)x*?GbZBs3f0$AvP92{^_ySYnCiv6MOtEY8A; zU%~R^y|-@&|6X_eRlIvPWpP{a;ns*7J~F=%+&8txspp_> zjn5>)MkH4!qLreJNB<(?qoMF%bAD{nUGyIH5I8S}n7Gj$frlSn}(< zWVA-BCn`;ygBmk7zHz|B!l*b<(^A)uWkhi03q^7TFUBQJtsLru9>&Aqzu^UNG#4Yc80+{Bw)EoA$PiIW97{1J_3Z!gxtM{SMjuwtv+W6G;> zfzYz`wcn42BD>9GZt5hC~8cC<3~F`Xx5Hj5)Sp&W6+#I&~uGlGeIvYy+a;(_n=jkV*&1@6eOrq9SD zq9Af9SE)r(*4o*}ur=9JR|xV*) z?(86r$%3`I%Wt?YotB9bB1YJLXJyzx!H$_4iT=EM%2kLO<~1^8mJUu1(S%c`2AJOcUvSu1-Gs-)sV4 zhG3xBsDKv-^Tq<;={1KGYG?Pn*_O7w9vsJ5CWyB4b7Uw{>;nWwC8URkC1y>7ax5%W zSxw+$C0{hHrdwrkl|`&L!91=ofkR_69tt<h#u{+gD@-eS%*-0R&Um2N4u1V zn~@<0|26n}aBx7^x2E~tb@l%hZ7a&ZBM>X%jHw_Asc1l&*62K9yQp_8?FYMI#N(iR_EPkj_>W$M4bI zo4q9kz*}8#PRE5`kE~BqbFwUT)s)b)q|3y|#}y;awu#I^J7kXk@2jExcYsleaQJW} 
zWd*Nkqg=~qsk?W$nUEmQn`Ym;Bcs&l(lk3;%Y%J1na?6DWXe3eAuxrfgy$=TQPQetXphA( zfAR$Eu(FlLh%B`K%u4@A?7C>75@e>2vcAS|jTbqP;Wu+x!Glm5QdI7nH%C4mwgNNg z6R(i;7M7+cOj8i2m9+sAKMabcoJXzh4S@)8 zv(FC63SRkagL$ASVH!P=P<@;>(T&Yha|AjF%W~HA2?|MYW0y$Cw?-Qqj8&m)*DgoM zG_sc(vadB2Fro9>>D%#V$j@O@Z^egDr}b`ENpv8a=!PRGKIa=r3y2#iC>kNSgoQ=456 z(dY6tK9rkpmHK&f7^x(#@0VIzL7lFJ{QHN-@+5hnr~){0)v4I<`g$~VaZogWL5Kh@ z46T0TNwzetu(Vb>(>Ija1JWr|ct!7>m+hMf)#$j@Ss5IxUm`tX#^La$UO2ItLPOCm zOoA;45hs*C^IoR)h`JcDaM`D&Dp}Ni^pAZD|v; ztUr{H=snl!3Z9zSA;e~JV0F!=4Fkp0`fFYoaf)i>B~zO&korb*yK)a6nd6s67Uctq7W2PsofWGMhSwn z?nB}mZ|wsqYLdp*TUwr}N{D z$*0$|kNizfE4Z%ooHi#m8@A8(r?Hn3H>;b4c%zm3rNmw{JlWV5|8F2vr^d`rD1He% zJ7<#`gtTOv5N$p>|Agdpf?}B|Bcs(Z#Gl8S$U;f`Netx#rCmP|f&-AT*g-IkQ3{rT z)kOQ9v~nZe(K6$4%*4$zGA5S~PIYer-r5Ac_#Z7`gtL$EtE4WS&ED2ux}_?WGy>5~ zG&s!rH8V^KxZk%vOcfoPHQd&Yr`T+1ykT5r=%9bocKg|hv1x>H95*yFm-`a!u&w_5 zsll;wO(&4Csb@HZnp>&4mkp(eI~H7RW8kzRp`_a{XSP>F1^rF~^~3lG6j*muY^L3P^7h4Tfuf*UtCDmEb1s_WwL`3GFX|^NZder!edWHkee3|3NxmS z2DQrQr^FQFVRxbaUBIp4_owB~i;U?*OA^9S5IN&ouM)`b6S_OIGlVmpNg< zj>=SzA_vi;zT<=ngQOJUaM#OGA-nkch0^5uOaqI^w+6L&b3({i+#n7Q=n<%|7$8T( zkrL&ZST3f+8geQ=CF|}{H_0Fx>2F_%0$&?9{Ju8TSz_?o#w5Is)tjx^1{O2xsynfr4b(xX)=H>L1qq+}6?7 zCXTz`=?x|fKH1vZDwECp4P1kXLL%-3HP$Qk1oepRhJs;_hxY~}CHS%F*ysOxZ$86r z9@b~NBb*tY+J_mOOlqV)-&nh#np^sU%wXOi*y)n0LH5un(phv-w^Fg%J@x(cN*MUb zSzK2mTr*JK#d6lVjJ7e<$X?!(Y!a1Dt>Y)~vE;q|)j>H^RF%>B**JWf)IiaV*(~Zm z{2QYR9~3Xr*tiej++7=@It$3+?DMN<)o~o!%=8mDtd2cbe^WqaxowcDPA`pf4&2so_4gM zFm-3T;?5yvApV{fIgleCjG(CduuN_qEr{&h8WwJlBlXC!-n23 zuFIVTT%{i(@*|Pu&t2XV(W;3Kbu;d!ijsEDJOq?9e@Xj4olq8Ny_tD0m&#Z{3uZu$ zFRWP$QexwgT)-h4xWI1;_j3_Ai}t}f9bvik!IXUsh zR8ySU^&(}02v4%sHo7`Ga{>YQ+O}3S{<^Q=vMB=)e^y2L$WgzL-^sg9RqJ%vgtz~RC%9#wCGNw zY97H_$~9u`Ae32CCUn1+UNH?4bh1j-q^^7xvV}zo8Y`?L4=Y4RMoXsRhK@`GW!0Me z%@HtCqy&yptAAVDH%mT~@>Xl`yO@Qvjg z|1Hz`%r{;dLAQCVj4!$}N`8YRMV#ae5^)wLXeSSivvh-g8q0N^MmXA zy7jZOx*7Omzn5AN6Q9wIs`z)rXUMT%CobO~@@m32hyzJ}PgSyok)=_xE5wb6AlQ^f zMVxOL{uY8QOTTygzFV$F^9mW+wY<5FuiZ|^0>%|!P5fU+ObV#ADGNKex||SV*DYbF 
z7CWZW05y$LCUs zEJ)lMEsZ`T*uP1V`IeN={V;CQXyDn({rv;j^WIm4=mL!Hm{B#4Nk{wcCbeeUSMaTp z$+a9EP0NJz_5qk}n(dT~xuv}9=NmvEY!aA5kN1)yZKsZ1CqC{!HLb*uxnCCe2bAXi zaBiHa)9qi_EdELA`SUqETJ@@puCDI59OZ^T$~wGt?%<_`F&tZ=8NotkA~Tg(W_Xh@ z5ol^-!eFO-Ms4D0*5EgeVaqi@(lNKu)P`aavamol#QOJ&AzNcYI}#R%2nKy1g4gx zKu(u2is{lG4<)C2xA+g{?IRK7H8`EI@yuvdrw`q$eJhvbwp(Ym7~a<1vnRTr>K{)} zd>+{^@t+_~tRT2+4uXBJ2Z@=;z*u&`;SF&*Q_mFKURQLt4FmsL+8&+h1;k9uV`ag8 zHy?^;F>dQ``dm*LX6II*kFeN2@$ zpHt4?PyPwYhnCVmmUVUIL$p8BGp%4Gd3HDBgY|879TBy<8#89CtEk@DX|V~fJe-#p zwB~eyGAjt;Eo4tYHg2Q3bEx&X>HVAf@i-ARXQzDxQ$GSVi+)kJ|B7iC+wJFj-U7E= z?i&|P-@4ZBcT}9XZLIAvg~tLZce-OaC3>ve0=aOtUZv5aQKfd#43c@4bx9iB?)CS7 z0v|jOY)PUAuxS^q$y~IsxEG&ojUnH^gkO&CCE8T^WLo9;kMqg93f#iBZdx@R`Ad|9A+7$fiY%tqmecqzjfYwgE{pL?x zG0);aSWpKosPe)U>mw!AUzD#5jsLe>$3cRuQyMXo@39JS?ITKYDo$bf6JttNnSH=kSZR8O_|= z_$3bcIAdFe%Ie9vqI)?&!nT7j3dZdDZ*v8}sr_ao|88@V3E;)Ja2Dr@;#4%U3p+O5 zd-+>m!H>ehE!1=_&+|Livh~*}10&J;>68#s?HEz)-&(~YKL_In^Y@t~$5b8h-6h&< zyU1A^#9Q51#%6x`IQm{m5ED8$q>tqC=bY|I=t~rCp8Z6175opv?I&j9hBEUbMtGTD z>yfj986NR$$7-hMFimny0l8`D?fQ7jRc+g^dL481ZkMWl&{q-etZGsuuSYVnyl;Uc zg$?x>$HX~iDrx&8fT*$-ss!1@jj0LQvbq%o(8 zQ1&>WVtaA*=0Pa7+Z~t(53hCfXwg;s?pi16gV;%q2lSgwflxIV^7ndKd&!7#a+Zq0 zgo705sc4x-t`AKkQBm*moFEUoF^bS*j;QV()hd1oPoO62hb?+SE7BqGNr#hh=}PjH zy4!Y+?^Ri`X$h|4kBI~IBMYAe_0cC7gl|v^+h)_NxGC_`(U zGwbAt^tNSF9hbF6`}Lo!T!b}HuA*m>NhPMqNd=_ib0FiPyA=*9IMEF#`>l$K3h_At zZd0y}M1$(q7NLsDjq*N3_GsOPRc=yy8=Dx`D}<5K?uE$yiD(cd7)Og-b@s1u7rZ?}fs|Z>xZ6Tms(gEk}Qe|%ZiI@Vuqyz4xg#Rd4q6l6j zhBT;;UUrX|Qo>Uic6DWcbPx){`%Q}cgNRG0Ku1FOHe0$%ZP3|lSRXhF#Cvf?S<$vF zM~17L%2WFEMg?d6k|1JK%NhnD)IXdjj9;$6i+V5-KiQ=3ao@J@d!_LE$ilB0c{*o( zE_8~Sy;~jXf!m<`fPzT z^U)ugl5slev2hvrz4y{F10L8JCBGAE7x;dJ@}PwQKhzKE5x^)^w$^@kzlu{^>3z)= zrqQ}HFF~7eKoc+@|ACKYX|l!iI9PaZTd^$L-ZqtmOBcEj(k;w*pwf-`Vk4~5!eG!P z;oI&J*wcv0fITT7Tib8p>BpYDCJk5DNNe)a3IF@HrW?syY1mDmoP!o0G3-3fyS}&3 z*`0gEVD+$IWq>a{eP5l;Z|+ojloG(h7*!wmCEU{PvbR5?k(%BN;iY7uyk+tLX%Wgy zl*d*>`LO*w3TVKYYT^D(oALXn0}WsBN+khe7pY^}pk(QB@5pR-&XxVGP&!S{L6p}9 
zn^5*>T&T!m$jujgcTAqJK_V4Ji_;ZoUS1tm`oGVEz;vmsvn9`qqEW(Iy->KKSWY#* zr0SY+g^EDsVnX$-z@>RRr*X)O4;-Gc8xt7FA2IEIm@O5`@DGdlnzLf79sO=esF{q7 zgKKJjLTabt%m(jiZ4LA0`NcOyvd{?|db3(=!K5ywGoyl{K6SJ_Lw~FvZBxvK(V1<} z?{isG7boC%zT%2lwjUx81r*<<;ni}`b}Q4hyBnkK!MUr+k(K*}!T4}^se%#mIHcghTpW90>h>QG`Q)nr_|3hUq#Bk& zq2W*9PiD;8exOuLjL(o#`$v_T;cQcVy5-d^7MnP`?7Q14hsWu-+LV%pEc;*RiVSs% zij$lBQe)?eYC3~qzy5;TI7a`zCqRfoD=;>>;Ntt(?!x(#QC{w4*6Dl1MHm|dzWDZd zb=sx%uGn;wRf05sPo+m8QU7?7BcShaYK-p#`o+qRp1+Csnb|R;5~=G4D!x$8^WKEi zq^yXCuMSNOnc8@nIW{_!|9|AB5ebr&F8BIO4GQktko_CyQumsI(_3%6h!rWAZPw2c z;!F+ov0hPn#>`*;^=vF05PK6WowqSLt`Sp~dYjVhJN4JuNLuYR7}CYH!mLug$}+!c zGsGB$xn^dc8JDGMTxklru;EyQ)MhhOsQRO_;2-yX Date: Tue, 21 Oct 2025 17:11:13 +0200 Subject: [PATCH 13/34] feat(traefik): parameterize nomad endpoint in traefik Replace the static Nomad address with a Jinja2 expression that pulls the address from Ansible hostvars, defaulting to 127.0.0.1. This makes the Traefik configuration dynamic and adaptable to different environments. --- ansible/playbooks/saas/roles/traefik/templates/traefik.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/playbooks/saas/roles/traefik/templates/traefik.toml b/ansible/playbooks/saas/roles/traefik/templates/traefik.toml index efbc64b9..a6bfce55 100644 --- a/ansible/playbooks/saas/roles/traefik/templates/traefik.toml +++ b/ansible/playbooks/saas/roles/traefik/templates/traefik.toml @@ -49,7 +49,7 @@ prefix = "traefik" exposedByDefault = false [providers.nomad.endpoint] - address = "https://172.17.0.1:4646" + address = "https://{{ hostvars[nomad_primary_master_node]['ansible_ens3']['ipv4']['address'] | default('127.0.0.1') }}:4646" token = "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_traefik_token', missing='error') }}" [providers.nomad.endpoint.tls] ca = "/etc/ssl/simplestack/simplestack-ca.pem" From 4add7472d6f79d4482fe0cf8296e3fd1670953ce Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:16:15 
+0200 Subject: [PATCH 14/34] fix(grafana): add guard for undefined prometheus remote write --- .../templates/provisioning/datasources/prometheus.yaml.j2 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ansible/playbooks/saas/roles/grafana/templates/provisioning/datasources/prometheus.yaml.j2 b/ansible/playbooks/saas/roles/grafana/templates/provisioning/datasources/prometheus.yaml.j2 index 81abd7e0..539aa307 100644 --- a/ansible/playbooks/saas/roles/grafana/templates/provisioning/datasources/prometheus.yaml.j2 +++ b/ansible/playbooks/saas/roles/grafana/templates/provisioning/datasources/prometheus.yaml.j2 @@ -1,5 +1,6 @@ apiVersion: 1 +{% if prometheus_remote_write is defined %} datasources: - name: Mimir uid: prometheus @@ -47,3 +48,6 @@ datasources: basicAuthPassword: "{{ loki_remote_write.password }}" isDefault: false {% endif %} +{% else %} +datasources: [] +{% endif %} \ No newline at end of file From fc348d125037527d3d6d47c2ceea7224f9695c13 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:17:26 +0200 Subject: [PATCH 15/34] feat(grafana): add NVIDIA and vLLM dashboards and update Nomad dashboard --- .../grafana/files/dashboards/nomad/nomad.json | 779 +++--- .../files/dashboards/nvidia/nvidia.json | 2226 +++++++++++++++++ .../dashboards/vllm/official-dashboard.json | 1353 ++++++++++ .../grafana/files/dashboards/vllm/vllm.json | 1561 ++++++++++++ .../plugins/grafana-piechart-panel.yaml | 15 - 5 files changed, 5506 insertions(+), 428 deletions(-) create mode 100644 ansible/playbooks/saas/roles/grafana/files/dashboards/nvidia/nvidia.json create mode 100644 ansible/playbooks/saas/roles/grafana/files/dashboards/vllm/official-dashboard.json create mode 100644 ansible/playbooks/saas/roles/grafana/files/dashboards/vllm/vllm.json delete mode 100644 ansible/playbooks/saas/roles/grafana/files/provisioning/plugins/grafana-piechart-panel.yaml diff --git a/ansible/playbooks/saas/roles/grafana/files/dashboards/nomad/nomad.json 
b/ansible/playbooks/saas/roles/grafana/files/dashboards/nomad/nomad.json index 0a22af36..2c2798fe 100644 --- a/ansible/playbooks/saas/roles/grafana/files/dashboards/nomad/nomad.json +++ b/ansible/playbooks/saas/roles/grafana/files/dashboards/nomad/nomad.json @@ -18,11 +18,9 @@ "description": "Nomad Jobs metrics", "editable": true, "fiscalYearStartMonth": 0, - "gnetId": 6281, "graphTooltip": 0, - "id": 35, + "id": 45, "links": [], - "liveNow": false, "panels": [ { "datasource": { @@ -41,6 +39,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -57,6 +56,7 @@ "type": "linear" }, "showPoints": "auto", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -72,7 +72,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "red", @@ -100,11 +100,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "12.2.0", "targets": [ { "datasource": { @@ -112,7 +113,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "nomad_client_allocs_failed{host=~\"$host\"} ", + "expr": "nomad_client_allocs_failed{instance=~\"$instance\"} ", "instant": false, "legendFormat": "{{exported_job}}", "range": true, @@ -139,6 +140,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -155,6 +157,7 @@ "type": "linear" }, "showPoints": "auto", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -170,7 +173,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "red", @@ -198,11 +201,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "12.2.0", "targets": [ { "datasource": { @@ -210,7 +214,7 @@ "uid": "prometheus" }, 
"editorMode": "code", - "expr": "sum(nomad_client_allocations_pending{host=~\"$host\",host=~\"$host\"}) by(project, node_status)", + "expr": "sum(nomad_client_allocations_pending{instance=~\"$instance\"}) by(project, node_status)", "instant": false, "legendFormat": "{{project}} {{node_status}}", "range": true, @@ -250,6 +254,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -266,6 +271,7 @@ "type": "linear" }, "showPoints": "never", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -282,7 +288,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "red", @@ -301,7 +307,6 @@ "y": 9 }, "id": 2, - "links": [], "options": { "legend": { "calcs": [ @@ -311,14 +316,17 @@ ], "displayMode": "table", "placement": "right", - "showLegend": true + "showLegend": true, + "sortBy": "Max", + "sortDesc": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "12.2.0", "repeat": "host", "repeatDirection": "v", "targets": [ @@ -327,134 +335,140 @@ "datasource": "$datasource", "uid": "$datasource" }, - "expr": "avg(nomad_client_allocs_cpu_total_percent{host=~\"$host\"}) by(exported_job, task)", + "editorMode": "code", + "expr": "avg(nomad_client_allocs_cpu_total_percent{instance=~\"$instance\"}) by(exported_job, task)", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "{{task}}", + "range": true, "refId": "A" } ], - "title": "CPU Usage Percent from $project / $host", + "title": "CPU Usage Percent from $project / $instance", "type": "timeseries" }, { - "collapsed": true, + "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 90 + "y": 18 }, "id": 9, - "panels": [ - { - "datasource": { - "datasource": "$datasource", - "uid": "$datasource" + "panels": [], + "title": "CPU total ticks", + "type": "row" + }, + { + 
"datasource": { + "datasource": "$datasource", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 3, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "timeticks" + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 10 - }, - "id": 3, - "links": [], - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "tooltip": { - "mode": "multi", - "sort": "none" + "thresholdsStyle": { + "mode": "off" } }, - "pluginVersion": 
"10.2.2", - "repeat": "host", - "repeatDirection": "v", - "targets": [ - { - "datasource": { - "datasource": "$datasource", - "uid": "$datasource" + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 }, - "expr": "avg(nomad_client_allocs_cpu_total_ticks{host=~\"$host\"}) by(exported_job, task)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{task}}", - "refId": "A" - } + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "timeticks" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "min" ], - "title": "CPU Total Ticks from $project / $host", - "type": "timeseries" + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "repeat": "host", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "datasource": "$datasource", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "avg(nomad_client_allocs_cpu_total_ticks{instance=~\"$instance\"}) by(exported_job, task)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task}}", + "range": true, + "refId": "A" } ], - "title": "CPU total ticks", - "type": "row" + "title": "CPU Total Ticks from $project / $instance", + "type": "timeseries" }, { "collapsed": false, @@ -462,7 +476,7 @@ "h": 1, "w": 24, "x": 0, - "y": 91 + "y": 27 }, "id": 13, "panels": [], @@ -486,6 +500,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -502,6 +517,7 @@ "type": "linear" }, "showPoints": "never", + "showValues": false, "spanNulls": false, "stacking": { "group": "A", @@ -517,7 +533,7 @@ "steps": [ { "color": "green", - 
"value": null + "value": 0 }, { "color": "red", @@ -533,11 +549,10 @@ "h": 7, "w": 24, "x": 0, - "y": 92 + "y": 28 }, "id": 12, "interval": "1m", - "links": [], "options": { "legend": { "calcs": [ @@ -550,11 +565,12 @@ "sortDesc": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "12.2.0", "repeat": "host", "repeatDirection": "v", "targets": [ @@ -564,7 +580,7 @@ "uid": "$datasource" }, "editorMode": "code", - "expr": "avg(nomad_client_allocs_memory_usage{host=~\"$host\"}) by(exported_job, task)", + "expr": "avg(nomad_client_allocs_memory_usage{instance=~\"$instance\"}) by(exported_job, task)", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -573,275 +589,276 @@ "refId": "A" } ], - "title": "Memory usage from $project / $host", + "title": "Memory usage from $project / $instance", "type": "timeseries" }, { - "collapsed": true, + "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 155 + "y": 35 }, "id": 11, - "panels": [ - { - "datasource": { - "datasource": "$datasource", - "uid": "$datasource" + "panels": [], + "title": "Memory cache", + "type": "row" + }, + { + "datasource": { + "datasource": "$datasource", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - 
"mode": "off" - } - }, - "decimals": 3, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decbytes" + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 12 - }, - "id": 7, - "links": [], - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "tooltip": { - "mode": "multi", - "sort": "none" + "thresholdsStyle": { + "mode": "off" } }, - "pluginVersion": "10.2.2", - "repeat": "host", - "repeatDirection": "v", - "targets": [ - { - "datasource": { - "datasource": "$datasource", - "uid": "$datasource" + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 }, - "expr": "avg(nomad_client_allocs_memory_cache{host=~\"$host\"}) by(exported_job, task)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{task}}", - "refId": "A" - } + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "min" ], - "title": "Memory Cache from $project / $host", - "type": "timeseries" + 
"displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "repeat": "host", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "datasource": "$datasource", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "avg(nomad_client_allocs_memory_cache{instance=~\"$instance\"}) by(exported_job, task)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task}}", + "range": true, + "refId": "A" } ], - "title": "Memory cache", - "type": "row" + "title": "Memory Cache from $project / $instance", + "type": "timeseries" }, { - "collapsed": true, + "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 156 + "y": 43 }, "id": 10, - "panels": [ - { - "datasource": { - "datasource": "$datasource", - "uid": "$datasource" + "panels": [], + "title": "RSS", + "type": "row" + }, + { + "datasource": { + "datasource": "$datasource", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 3, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decbytes" + 
"custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 13 - }, - "id": 6, - "links": [], - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "tooltip": { - "mode": "multi", - "sort": "none" + "thresholdsStyle": { + "mode": "off" } }, - "pluginVersion": "10.2.2", - "repeat": "host", - "repeatDirection": "v", - "targets": [ - { - "datasource": { - "datasource": "$datasource", - "uid": "$datasource" + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 }, - "expr": "avg(nomad_client_allocs_memory_rss{host=~\"$host\"}) by(exported_job, task)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{task}}", - "refId": "A" - } + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "min" ], - "title": "RSS from $project / $host", - "type": "timeseries" + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "repeat": "host", + "repeatDirection": "v", + 
"targets": [ + { + "datasource": { + "datasource": "$datasource", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "avg(nomad_client_allocs_memory_rss{instance=~\"$instance\"}) by(exported_job, task)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task}}", + "range": true, + "refId": "A" } ], - "title": "RSS", - "type": "row" + "title": "RSS from $project / $instance", + "type": "timeseries" } ], + "preload": false, "refresh": "30s", - "schemaVersion": 38, + "schemaVersion": 42, "tags": [], "templating": { "list": [ { "current": { - "selected": false, - "text": "Mimir", - "value": "prometheus" + "text": "prometheus", + "value": "cf0jbbq5bcwsgb" }, - "hide": 0, "includeAll": false, - "multi": false, "name": "datasource", "options": [], "query": "prometheus", - "queryValue": "", "refresh": 1, "regex": "", - "skipUrlSync": false, "type": "datasource" }, { "current": { - "selected": true, - "text": [ - "All" - ], + "text": "All", "value": [ "$__all" ] @@ -851,7 +868,6 @@ "uid": "${datasource}" }, "definition": "label_values(project)", - "hide": 0, "includeAll": true, "multi": true, "name": "project", @@ -863,43 +879,11 @@ }, "refresh": 1, "regex": "", - "skipUrlSync": false, - "sort": 0, "type": "query" }, { "current": { - "selected": false, - "text": "dc1", - "value": "dc1" - }, - "datasource": { - "datasource": "$datasource", - "uid": "$datasource" - }, - "definition": "", - "hide": 0, - "includeAll": false, - "label": "DC", - "multi": false, - "name": "datacenter", - "options": [], - "query": "label_values(nomad_client_uptime, datacenter)", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "selected": true, - "text": [ - "All" - ], + "text": "All", "value": [ "$__all" ] @@ -908,61 +892,30 @@ "datasource": "$datasource", "uid": "$datasource" }, - "definition": 
"label_values(nomad_client_uptime{project=~\"$project\"},host)", - "hide": 0, + "definition": "label_values(nomad_client_uptime{project=~\"$project\"},instance)", "includeAll": true, - "label": "Host", + "label": "instance", "multi": true, - "name": "host", + "name": "instance", "options": [], "query": { "qryType": 1, - "query": "label_values(nomad_client_uptime{project=~\"$project\"},host)", + "query": "label_values(nomad_client_uptime{project=~\"$project\"},instance)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" } ] }, "time": { - "from": "now-24h", + "from": "now-30m", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, + "timepicker": {}, "timezone": "", - "title": "Nomad", - "uid": "ad9aff6e-226f-42ef-aa3f-8be3c80662ce", - "version": 1, - "weekStart": "" -} + "title": "Nomad Copy2", + "uid": "hrvzvk7", + "version": 1 +} \ No newline at end of file diff --git a/ansible/playbooks/saas/roles/grafana/files/dashboards/nvidia/nvidia.json b/ansible/playbooks/saas/roles/grafana/files/dashboards/nvidia/nvidia.json new file mode 100644 index 00000000..a5d821e7 --- /dev/null +++ b/ansible/playbooks/saas/roles/grafana/files/dashboards/nvidia/nvidia.json @@ -0,0 +1,2226 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + 
"name": "Grafana", + "version": "11.2.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Nvidia GPU Metrics based on the prometheus metrics from github.com/utkuozdemir/nvidia_gpu_exporter", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 14574, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The official product name of the GPU. This is an alphanumeric string. For all products.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 23, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "Name", + "type": "stat" + 
}, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "": { + "text": "" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "prefix:P" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 4, + "y": 0 + }, + "id": 22, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_pstate{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "P-State", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 6, + "options": { + 
"minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "GPU Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts / The software power limit in watts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 21, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"} / nvidia_smi_power_default_limit_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Power Draw %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", 
+ "uid": "${DS_PROMETHEUS}" + }, + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Fan Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Core GPU temperature. 
in degrees C.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Temperature", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Memory Utilization %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The version of the installed NVIDIA display driver. 
This is an alphanumeric string.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{driver_version}}", + "refId": "A" + } + ], + "title": "Driver Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The BIOS of the GPU board.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 3, + "y": 3 + }, + "id": 34, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": 
"nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{vbios_version}}", + "refId": "A" + } + ], + "title": "Vbios Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Information about factors that are reducing the frequency of clocks. If all throttle reasons are returned as \"Not Active\" it means that clocks are running as high as possible.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "text": "Not Active" + }, + "1": { + "text": "Active" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 32, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_gpu_idle{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}", + "instant": false, + "interval": "", + "legendFormat": "Idle", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_hw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Thermal Slowdown", + "refId": "B" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sw_power_cap{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "SW Power Cap", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_applications_clocks_setting{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "App Clocks Setting", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Power Brake", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sw_thermal_slowdown{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "SW Thermal Slowdown", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_event_reasons_sync_boost{uuid=\"$gpu\"} or nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "Sync Boost", + "refId": "G" + } + ], + "title": "Throttle Reasons", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of graphics (shader) clock\n/\nMaximum frequency of graphics (shader) clock.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 5 + }, + "id": 20, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_graphics_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "GPU Clock Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of memory clock / Maximum frequency of memory clock", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 5 + }, + "id": 33, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_memory_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Clock Speed %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total memory allocated by active contexts / Total installed GPU memory.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 5 + }, + "id": 25, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"} / nvidia_smi_memory_total_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Allocation %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": 
"percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 5 + }, + "id": 7, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"transparent", + "value": null + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "GPU Utilization %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total memory allocated by active contexts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": 
"bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Memory Allocation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Core GPU temperature. in degrees C.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": 
"nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Power Draw", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "orange", + "value": 0.7 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + 
"refId": "A" + } + ], + "title": "Fan Speed %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of graphics (shader) clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 15 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Graphics Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of video encoder/decoder clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 15 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_video_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Video Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of SM (Streaming Multiprocessor) clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 15 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_sm_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "SM Clock Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current frequency of memory clock.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 15 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "Memory Clock Speed", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "nvidia", + "nvidia-smi", + "nvidia_gpu_exporter", + "prometheus" + ], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(nvidia_smi_index, job)", + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": { + "query": "label_values(nvidia_smi_index,job)", + "refId": "Prometheus-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(nvidia_smi_index{job=\"$job\"},instance)", + "hide": 0, + "includeAll": false, + "label": "Host", + "multi": false, + "name": "node", + "options": [], + "query": { + "query": "label_values(nvidia_smi_index{job=\"$job\"},instance)", + "refId": "Prometheus-node-Variable-Query" + }, + "refresh": 1, + "regex": 
"", + "skipUrlSync": false, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", + "hide": 0, + "includeAll": false, + "label": "GPU", + "multi": false, + "name": "gpu", + "options": [], + "query": { + "query": "label_values(nvidia_smi_index{instance=\"$node\"},uuid)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Nvidia GPU Metrics", + "uid": "vlvPlrgnk", + "version": 8, + "weekStart": "" +} \ No newline at end of file diff --git a/ansible/playbooks/saas/roles/grafana/files/dashboards/vllm/official-dashboard.json b/ansible/playbooks/saas/roles/grafana/files/dashboards/vllm/official-dashboard.json new file mode 100644 index 00000000..35b921de --- /dev/null +++ b/ansible/playbooks/saas/roles/grafana/files/dashboards/vllm/official-dashboard.json @@ -0,0 +1,1353 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0,211,255,1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 0, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Overview System Performance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Number of healthy vLLM instances (by instance usage)", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "interval": "60s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "count by(endpoint) (vllm:healthy_pods_total)", + "format": "time_series", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "vLLM instances", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Available vLLM instances", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Average end-to-end request latency in seconds", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 19, + "interval": "60s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "editorMode": "builder", + "expr": "avg(vllm:e2e_request_latency_seconds_sum) / 
avg(vllm:e2e_request_latency_seconds_count)", + "legendFormat": "Avg Latency", + "range": true, + "refId": "A" + } + ], + "title": "Average Latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 2, + "interval": "60s", + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "editorMode": "builder", + "expr": "sum by(le) (vllm:e2e_request_latency_seconds_bucket)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Request latency distribution", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 101, + "panels": [], + "title": "QoS Information", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 3, + "interval": "60s", + "options": { + "colorMode": "value", + "graphMode": "area", + 
"justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "editorMode": "builder", + "expr": "vllm:current_qps", + "legendFormat": "Current QPS", + "range": true, + "refId": "A" + } + ], + "title": "Current QPS", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Average Inter-Token Latency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 20, + "interval": "60s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "editorMode": "builder", + "expr": "avg(vllm:time_per_output_token_seconds_sum) / avg(vllm:time_per_output_token_seconds_count)", + "legendFormat": "Avg ITL", + "range": true, + "refId": "A" + } + ], + "title": "Average ITL", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 4, 
+ "interval": "60s", + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "editorMode": "builder", + "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", + "format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Request TTFT distribution", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 102, + "panels": [], + "title": "Serving Engine Load", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 31 + }, + "id": 10, + "interval": 
"60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "editorMode": "code", + "expr": "vllm:num_requests_running", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Number of Running Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 31 + }, + "id": 11, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "editorMode": "builder", + "expr": "vllm:num_requests_waiting", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Number of Pending Requests", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 31 + }, + "id": 12, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "editorMode": "builder", + "expr": "vllm:gpu_cache_usage_perc", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU KV Usage Percentage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 13, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:gpu_prefix_cache_hits_total{endpoint=\"service-port\"} / vllm:gpu_prefix_cache_queries_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU KV Cache Hit Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Requests moved from GPU to CPU", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 21, + "interval": "60s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": 
false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "editorMode": "builder", + "expr": "vllm:num_requests_swapped", + "legendFormat": "Swapped Requests", + "range": true, + "refId": "A" + } + ], + "title": "Number of Swapped Requests", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 47 + }, + "id": 103, + "panels": [], + "title": "Current Resource Usage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 48 + }, + "id": 14, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:gpu_cache_usage_perc", + "fullMetaSearch": false, + "includeNullMetadata": 
true, + "legendFormat": "GPU Usage", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Cache Usage (%)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 48 + }, + "id": 15, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "router_cpu_usage_percent", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "CPU Usage", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "CPU Usage (%)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 48 + }, + "id": 16, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "router_memory_usage_percent", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "Memory Usage", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Memory Usage (%)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 48 + }, + "id": 17, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "router_disk_usage_percent", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "Disk Usage", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Disk Usage (%)", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "auto", + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "vLLM Dashboard", + "uid": "750918234", + "version": 2 +} \ No newline at end of file diff --git a/ansible/playbooks/saas/roles/grafana/files/dashboards/vllm/vllm.json b/ansible/playbooks/saas/roles/grafana/files/dashboards/vllm/vllm.json new file mode 100644 index 00000000..7b5fc131 --- /dev/null +++ b/ansible/playbooks/saas/roles/grafana/files/dashboards/vllm/vllm.json @@ -0,0 +1,1561 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + 
"tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Monitoring vLLM Inference Server", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "End to end request latency measured in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 9, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + 
"instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "E2E Request Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "${DS_PROMETHEUS}" + }, + "description": "Number of tokens processed per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 8, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "Prompt Tokens/Sec", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, 
+ "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "Generation Tokens/Sec", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Token Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Inter token latency in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 10, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + 
"legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Mean", + "range": true, + "refId": "E" + } + ], + "title": "Inter Token Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + 
}, + "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_running{model_name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Num Running", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": 
false, + "legendFormat": "Num Waiting", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Scheduler State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "P50, P90, P95, and P99 TTFT latency in seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P99", + "range": true, + 
"refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "Time To First Token Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Percentage of used cache 
blocks by vLLM.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 4, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", + "instant": false, + "legendFormat": "GPU Cache Usage", + "range": true, + "refId": "A" + } + ], + "title": "Cache Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Heatmap of request prompt length", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, 
+ "x": 0, + "y": 24 + }, + "id": 12, + "interval": "60s", + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": { + "unit": "none" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisLabel": "Prompt Length", + "axisPlacement": "left", + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(le) (increase(vllm:request_prompt_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Prompt Length", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Heatmap of request generation length", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 13, + "interval": "60s", + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": { + "unit": "none" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": 
{ + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisLabel": "Generation Length", + "axisPlacement": "left", + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(le) (increase(vllm:request_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Generation Length", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 11, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(finished_reason) (increase(vllm:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Finish Reason", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 
12, + "y": 32 + }, + "id": 14, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_queue_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Queue Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 15, + "interval": "60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + 
"hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_prefill_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Prefill", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:request_decode_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Decode", + "range": true, + "refId": "B" + } + ], + "title": "Requests Prefill and Decode Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 16, + "interval": 
"60s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_max_num_generation_tokens_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Tokens", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Generation Token in Sequence Group", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "", + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "cf0jbbq5bcwsgb" + }, + "includeAll": false, + "label": "datasource", + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "meta-llama/Llama-3.2-3B-Instruct", + "value": "meta-llama/Llama-3.2-3B-Instruct" + }, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(model_name)", + "includeAll": false, + "label": "model_name", + "name": "model_name", + "options": [], + "query": { + "query": "label_values(model_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "vLLM", + "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b", + "version": 9 +} \ No newline at end of file diff --git a/ansible/playbooks/saas/roles/grafana/files/provisioning/plugins/grafana-piechart-panel.yaml b/ansible/playbooks/saas/roles/grafana/files/provisioning/plugins/grafana-piechart-panel.yaml deleted file 
mode 100644 index 10ee02f7..00000000 --- a/ansible/playbooks/saas/roles/grafana/files/provisioning/plugins/grafana-piechart-panel.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# apiVersion: 1 - -# apps: -# - type: grafana-piechart-panel -# org_id: 1 -# # org_name: Main Org. -# disabled: false -# # # fields that will be converted to json and stored in jsonData. Custom per app. -# # jsonData: -# # # key/value pairs of string to object -# # key: value -# # # fields that will be converted to json, encrypted and stored in secureJsonData. Custom per app. -# # secureJsonData: -# # # key/value pairs of string to string -# # key: value From 6a4a4406075f4d1a50b439a139e68d857f149ee4 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:18:44 +0200 Subject: [PATCH 16/34] feat(grafana): add LLM plugin provisioning --- .../saas/roles/grafana/tasks/destroy.yml | 1 + .../saas/roles/grafana/tasks/main.yml | 22 ++++++++++++++++--- .../saas/roles/grafana/templates/nomad.hcl | 2 +- .../provisioning/plugins/llm.yaml.j2 | 4 ++++ 4 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 ansible/playbooks/saas/roles/grafana/templates/provisioning/plugins/llm.yaml.j2 diff --git a/ansible/playbooks/saas/roles/grafana/tasks/destroy.yml b/ansible/playbooks/saas/roles/grafana/tasks/destroy.yml index ce77a12b..e00b7de1 100644 --- a/ansible/playbooks/saas/roles/grafana/tasks/destroy.yml +++ b/ansible/playbooks/saas/roles/grafana/tasks/destroy.yml @@ -8,3 +8,4 @@ ansible.builtin.file: path: "{{ software_path }}" state: absent + delegate_to: "{{ software.instance }}" \ No newline at end of file diff --git a/ansible/playbooks/saas/roles/grafana/tasks/main.yml b/ansible/playbooks/saas/roles/grafana/tasks/main.yml index 3211d5c1..32f26abd 100644 --- a/ansible/playbooks/saas/roles/grafana/tasks/main.yml +++ b/ansible/playbooks/saas/roles/grafana/tasks/main.yml @@ -1,4 +1,15 @@ --- +- name: Create default directory + ansible.builtin.file: + path: "{{ item }}" + state: directory + 
owner: root + group: root + mode: "0755" + loop: + - "{{ software_path }}" + delegate_to: "{{ software.instance }}" + - name: Copy Grafana content files ansible.builtin.copy: src: "{{ item }}" @@ -9,16 +20,21 @@ loop: - dashboards - provisioning + delegate_to: "{{ software.instance }}" - name: Copy Grafana content templates ansible.builtin.template: - src: "provisioning/datasources/{{ item }}.j2" - dest: "{{ software_path }}/provisioning/datasources/{{ item }}" + src: "provisioning/{{ item.path }}/{{ item.file }}.j2" + dest: "{{ software_path }}/provisioning/{{ item.path }}/{{ item.file }}" owner: root group: root mode: '0644' loop: - - prometheus.yaml + - path: datasources + file: prometheus.yaml + - path: plugins + file: llm.yaml + delegate_to: "{{ software.instance }}" - name: Copy nomad job ansible.builtin.template: diff --git a/ansible/playbooks/saas/roles/grafana/templates/nomad.hcl b/ansible/playbooks/saas/roles/grafana/templates/nomad.hcl index d5b438bb..2b68803f 100644 --- a/ansible/playbooks/saas/roles/grafana/templates/nomad.hcl +++ b/ansible/playbooks/saas/roles/grafana/templates/nomad.hcl @@ -41,7 +41,7 @@ job "{{ domain }}" { env { GF_LOG_MODE = "console" GF_SERVER_HTTP_PORT = "3000" - GF_INSTALL_PLUGINS = "grafana-piechart-panel" + GF_INSTALL_PLUGINS = "grafana-piechart-panel,grafana-llm-app" GF_SECURITY_ADMIN_USER = "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='user', missing='create', nosymbols=true, length=8) }}" GF_SECURITY_ADMIN_PASSWORD = "{{ lookup('simple-stack-ui', type='secret', key=domain, subkey='passwd', missing='create', length=12) }}" DS_PROMETHEUS = "prometheus" diff --git a/ansible/playbooks/saas/roles/grafana/templates/provisioning/plugins/llm.yaml.j2 b/ansible/playbooks/saas/roles/grafana/templates/provisioning/plugins/llm.yaml.j2 new file mode 100644 index 00000000..2d5cdd9e --- /dev/null +++ b/ansible/playbooks/saas/roles/grafana/templates/provisioning/plugins/llm.yaml.j2 @@ -0,0 +1,4 @@ +apiVersion: 1 + +apps: 
+{{ (lookup('simple-stack-ui', type='secret', key=domain, subkey='plugins', missing='error') | from_json) | to_nice_yaml }} \ No newline at end of file From b97bc417740d08b92598041204f882e78e484b9c Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:20:37 +0200 Subject: [PATCH 17/34] feat(ansible): bind services to node IP address --- ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 | 2 +- .../playbooks/paas/roles/systemd_exporter/templates/default.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 b/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 index 7e99073a..4d76945a 100644 --- a/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 +++ b/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 @@ -1,5 +1,5 @@ server: - http_listen_address: 127.0.0.1 + http_listen_address: {{ hostvars[inventory_hostname]['ansible_' + nomad_iface]['ipv4']['address'] }} http_listen_port: 9080 grpc_listen_port: 0 log_level: warn diff --git a/ansible/playbooks/paas/roles/systemd_exporter/templates/default.j2 b/ansible/playbooks/paas/roles/systemd_exporter/templates/default.j2 index 1a27d383..f64efe14 100644 --- a/ansible/playbooks/paas/roles/systemd_exporter/templates/default.j2 +++ b/ansible/playbooks/paas/roles/systemd_exporter/templates/default.j2 @@ -1,2 +1,2 @@ -ARGS="--web.listen-address=127.0.0.1:9558 \ +ARGS="--web.listen-address={{ hostvars[inventory_hostname]['ansible_' + nomad_iface]['ipv4']['address'] }}:9558 \ --systemd.collector.unit-include=docker.service|promtail.service|coredns.service|prometheus.service|blackbox_exporter.service|node_exporter.service|scan_exporter.service" From ea8f0724d0acdbb547687d16af94ba661d8098bb Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:21:44 +0200 Subject: [PATCH 18/34] feat(ansible): add nvidia_gpu_exporter role Introduce a new Ansible role to install and manage 
the NVIDIA GPU Exporter. The role includes defaults (disabled by default), handlers to restart the service, build tasks that download and install the binary, systemd service template, and upstream variable handling for version detection. This enables optional deployment of the exporter on hosts with NVIDIA GPUs. --- .../paas/roles/nvidia_gpu_exporter/README.md | 1 + .../nvidia_gpu_exporter/defaults/main.yml | 2 + .../nvidia_gpu_exporter/handlers/main.yml | 7 +++ .../roles/nvidia_gpu_exporter/tasks/build.yml | 57 +++++++++++++++++++ .../roles/nvidia_gpu_exporter/tasks/main.yml | 23 ++++++++ .../nvidia_gpu_exporter/templates/service.j2 | 16 ++++++ .../roles/nvidia_gpu_exporter/vars/main.yml | 23 ++++++++ .../nvidia_gpu_exporter/vars/upstream.yml | 4 ++ 8 files changed, 133 insertions(+) create mode 100644 ansible/playbooks/paas/roles/nvidia_gpu_exporter/README.md create mode 100644 ansible/playbooks/paas/roles/nvidia_gpu_exporter/defaults/main.yml create mode 100644 ansible/playbooks/paas/roles/nvidia_gpu_exporter/handlers/main.yml create mode 100644 ansible/playbooks/paas/roles/nvidia_gpu_exporter/tasks/build.yml create mode 100644 ansible/playbooks/paas/roles/nvidia_gpu_exporter/tasks/main.yml create mode 100644 ansible/playbooks/paas/roles/nvidia_gpu_exporter/templates/service.j2 create mode 100644 ansible/playbooks/paas/roles/nvidia_gpu_exporter/vars/main.yml create mode 100644 ansible/playbooks/paas/roles/nvidia_gpu_exporter/vars/upstream.yml diff --git a/ansible/playbooks/paas/roles/nvidia_gpu_exporter/README.md b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/README.md new file mode 100644 index 00000000..73adf688 --- /dev/null +++ b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/README.md @@ -0,0 +1 @@ +# Role: `nvidia_gpu_exporter` diff --git a/ansible/playbooks/paas/roles/nvidia_gpu_exporter/defaults/main.yml b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/defaults/main.yml new file mode 100644 index 00000000..bc455e64 --- /dev/null +++ 
b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/defaults/main.yml @@ -0,0 +1,2 @@ +--- +nvidia_gpu_exporter_enable: false diff --git a/ansible/playbooks/paas/roles/nvidia_gpu_exporter/handlers/main.yml b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/handlers/main.yml new file mode 100644 index 00000000..a017bade --- /dev/null +++ b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/handlers/main.yml @@ -0,0 +1,7 @@ +--- +- name: Restart nvidia_gpu_exporter + listen: Restart nvidia_gpu_exporter + ansible.builtin.service: + name: nvidia_gpu_exporter + state: restarted + enabled: true diff --git a/ansible/playbooks/paas/roles/nvidia_gpu_exporter/tasks/build.yml b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/tasks/build.yml new file mode 100644 index 00000000..2dacd7a4 --- /dev/null +++ b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/tasks/build.yml @@ -0,0 +1,57 @@ +--- +- name: Nvidia_gpu_exporter | Create temporary build directory + ansible.builtin.file: + path: "{{ item }}" + recurse: true + state: directory + mode: '0755' + loop: + - "{{ build_work_dir }}/download" + - "{{ build_work_dir }}/{{ upstream_default_arch }}" + +- name: Nvidia_gpu_exporter | Download Github release + ansible.builtin.get_url: + url: "{{ upstream_file_url }}" + dest: "{{ build_work_dir }}/download/" + mode: '0644' + register: download_result + +- name: Nvidia_gpu_exporter | Unarchive GitHub release + ansible.builtin.unarchive: + src: "{{ build_work_dir }}/download/{{ upstream_file_name }}" + dest: "{{ build_work_dir }}/download" + remote_src: true + when: download_result.changed + +- name: Nvidia_gpu_exporter | Find binary + ansible.builtin.include_role: + name: upstream + tasks_from: find-binary + loop: + - "{{ image.upstream.binary }}" + +- name: Nvidia_gpu_exporter | Copy binary + ansible.builtin.copy: + src: "{{ build_work_dir }}/{{ upstream_default_arch }}/{{ image.upstream.binary }}" + dest: /usr/local/bin/nvidia_gpu_exporter + owner: root + group: root + mode: '0755' + 
remote_src: true + +- name: Nvidia_gpu_exporter | Clean up + ansible.builtin.file: + path: "{{ build_work_dir }}" + state: absent + +- name: Nvidia_gpu_exporter | Backup software version + ansible.builtin.copy: + content: | + #!/bin/bash + cat << EOF + "{{ latest_version }}" + EOF + dest: "/etc/ansible/facts.d/{{ image.name }}.fact" + owner: root + group: root + mode: '0755' diff --git a/ansible/playbooks/paas/roles/nvidia_gpu_exporter/tasks/main.yml b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/tasks/main.yml new file mode 100644 index 00000000..8500b652 --- /dev/null +++ b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/tasks/main.yml @@ -0,0 +1,23 @@ +--- +- name: End the play for hosts that don't have nvidia gpu + ansible.builtin.meta: end_host + when: not nvidia_gpu_exporter_enable + +- name: Nvidia_gpu_exporter | Include upstream variables + ansible.builtin.include_vars: upstream.yml + +- name: Nvidia_gpu_exporter | Get binary + ansible.builtin.include_tasks: build.yml + when: ansible_local[image.name] is not defined or ansible_local[image.name] != latest_version + +- name: Nvidia_gpu_exporter | Copy templates + ansible.builtin.template: + src: "{{ item.src }}" + dest: "{{ item.dest }}" + mode: '0644' + owner: root + group: root + loop: + - src: service.j2 + dest: /etc/systemd/system/nvidia_gpu_exporter.service + notify: Restart nvidia_gpu_exporter diff --git a/ansible/playbooks/paas/roles/nvidia_gpu_exporter/templates/service.j2 b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/templates/service.j2 new file mode 100644 index 00000000..1b213d68 --- /dev/null +++ b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/templates/service.j2 @@ -0,0 +1,16 @@ +[Unit] +Description=Nvidia GPU Exporter +Documentation=https://github.com/utkuozdemir/nvidia_gpu_exporter +After=network-online.target + +[Service] +Type=simple +User=prometheus +Group=prometheus +ExecStart=/usr/local/bin/nvidia_gpu_exporter +SyslogIdentifier=nvidia_gpu_exporter +Restart=always
+RestartSec=10 + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/ansible/playbooks/paas/roles/nvidia_gpu_exporter/vars/main.yml b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/vars/main.yml new file mode 100644 index 00000000..8b1563b4 --- /dev/null +++ b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/vars/main.yml @@ -0,0 +1,23 @@ +--- +image: + build: false + upstream: + source: github + user: utkuozdemir + repo: nvidia_gpu_exporter + type: release + format: tar.gz + file: nvidia_gpu_exporter_VERSION_OS_ARCH.FORMAT + os: linux + binary: nvidia_gpu_exporter + labels: {} + name: nvidia_gpu_exporter + +build_work_dir: /tmp/nvidia_gpu_exporter + +architecture_map: + amd64: amd64 + x86_64: x86_64 + armv7l: arm + aarch64: arm64 + arm64: arm64 diff --git a/ansible/playbooks/paas/roles/nvidia_gpu_exporter/vars/upstream.yml b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/vars/upstream.yml new file mode 100644 index 00000000..036aa64c --- /dev/null +++ b/ansible/playbooks/paas/roles/nvidia_gpu_exporter/vars/upstream.yml @@ -0,0 +1,4 @@ +--- +latest_version: "{{ (lookup('url', 'https://api.github.com/repos/{{ image.upstream.user }}/{{ image.upstream.repo }}/releases/latest', headers={'Accept': 'application/vnd.github+json', 'Authorization': 'Bearer ' + lookup('ansible.builtin.env', 'GITHUB_API_TOKEN') }) | from_json).get('tag_name') | replace('v', '') }}" +upstream_file_name: "{{ image.upstream.file | replace('REPO', image.upstream.repo) | replace('VERSION', latest_version) | replace('OS', image.upstream.os) | replace('ARCH', upstream_default_arch) | replace('FORMAT', image.upstream.format) }}" +upstream_file_url: "https://github.com/{{ image.upstream.user }}/{{ image.upstream.repo }}/releases/download/v{{ latest_version }}/{{ upstream_file_name }}" From 8d19f44ed5997b33d0ddebbcda1d425adac2152e Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:23:20 +0200 Subject: [PATCH 19/34] feat(ansible): add dynamic node
discovery, auth, and vllm job --- .../paas/roles/prometheus/tasks/main.yml | 2 +- .../paas/roles/prometheus/templates/config.j2 | 134 ++++++++++++++---- 2 files changed, 104 insertions(+), 32 deletions(-) diff --git a/ansible/playbooks/paas/roles/prometheus/tasks/main.yml b/ansible/playbooks/paas/roles/prometheus/tasks/main.yml index 1911b2de..55e64042 100644 --- a/ansible/playbooks/paas/roles/prometheus/tasks/main.yml +++ b/ansible/playbooks/paas/roles/prometheus/tasks/main.yml @@ -33,7 +33,7 @@ ansible.builtin.template: src: "{{ item.src }}" dest: "{{ item.dest }}" - mode: 0644 + mode: '0644' owner: prometheus group: prometheus loop: diff --git a/ansible/playbooks/paas/roles/prometheus/templates/config.j2 b/ansible/playbooks/paas/roles/prometheus/templates/config.j2 index 6359f4ed..a4530264 100644 --- a/ansible/playbooks/paas/roles/prometheus/templates/config.j2 +++ b/ansible/playbooks/paas/roles/prometheus/templates/config.j2 @@ -15,9 +15,11 @@ remote_write: headers: X-Scope-OrgID: demo send_exemplars: true +{% if prometheus_remote_write.basic_auth %} basic_auth: username: "{{ prometheus_remote_write.login }}" password: "{{ prometheus_remote_write.password }}" +{% endif %} queue_config: capacity: 25000 # Capacité totale de la file d'attente max_shards: 10 # Nombre de shards parallèles (trop haut = surcharge CPU) @@ -64,14 +66,10 @@ scrape_configs: regex: "^(__tmp_keep_me)$" static_configs: - - targets: ['127.0.0.1:9100'] +{% for item in groups['infrastructure'] | default([]) if item.split('.')[4] == project %} + - targets: ['{{ hostvars[item]['ansible_' + hostvars[item].nomad_iface]['ipv4']['address'] }}:9100'] labels: - instance: "{{ inventory_hostname }}" - project: "{{ prometheus_project }}" -{% for item in prometheus_nodes_exporter | default([]) %} - - targets: ['{{ item.target }}:9100'] - labels: - instance: "{{ item.instance }}" + instance: "{{ item }}" project: "{{ prometheus_project }}" {% endfor %} scrape_interval: 60s @@ -169,11 +167,39 @@ 
scrape_configs: project: "{{ prometheus_project }}" - job_name: "systemd_exporter" + metric_relabel_configs: + - action: drop + regex: "^(go_|prometheus_|promhttp_).*" + source_labels: [__name__] + bearer_token: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" + params: + format: ['prometheus'] + metrics_path: /metrics + tls_config: + insecure_skip_verify: true static_configs: - - targets: ['127.0.0.1:9558'] +{% for item in groups['infrastructure'] | default([]) if item.split('.')[4] == project %} + - targets: ['{{ hostvars[item]['ansible_' + hostvars[item].nomad_iface]['ipv4']['address'] }}:9558'] labels: - instance: "{{ inventory_hostname }}" + instance: "{{ item }}" + project: "{{ prometheus_project }}" +{% endfor %} + + - job_name: "nvidia_gpu_exporter" + metric_relabel_configs: + - action: drop + regex: "^(go_|prometheus_|promhttp_).*" + source_labels: [__name__] + params: + format: ['prometheus'] + metrics_path: /metrics + static_configs: +{% for item in groups['infrastructure'] | default([]) if item.split('.')[4] == project %} + - targets: ['{{ hostvars[item]['ansible_' + hostvars[item].nomad_iface]['ipv4']['address'] }}:9835'] + labels: + instance: "{{ item }}" project: "{{ prometheus_project }}" +{% endfor %} - job_name: "nomad_exporter" metric_relabel_configs: @@ -181,38 +207,51 @@ scrape_configs: regex: "^(go_|prometheus_|promhttp_).*" source_labels: [__name__] scheme: https - bearer_token: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + bearer_token: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" params: format: ['prometheus'] metrics_path: /v1/metrics tls_config: insecure_skip_verify: true static_configs: - - targets: ['127.0.0.1:4646'] +{% for item in 
groups['infrastructure'] | default([]) if item.split('.')[4] == project %} + - targets: ['{{ hostvars[item]['ansible_' + hostvars[item].nomad_iface]['ipv4']['address'] }}:4646'] labels: - instance: "{{ inventory_hostname }}" + instance: "{{ item }}" project: "{{ prometheus_project }}" - +{% endfor %} - job_name: 'mimir_exporter' nomad_sd_configs: - - server: "https://127.0.0.1:4646" + - server: "https://{{ nomad_primary_master_address | default(inventory_hostname) }}:4646" region: "{{ fact_instance.region }}" tls_config: insecure_skip_verify: true authorization: - credentials: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + credentials: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" relabel_configs: - source_labels: ['__meta_nomad_service'] regex: 'mimir-exporter' action: keep - job_name: 'promtail' + metric_relabel_configs: + - action: drop + regex: "^(go_|prometheus_|promhttp_).*" + source_labels: [__name__] + bearer_token: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" + params: + format: ['prometheus'] + metrics_path: /metrics + tls_config: + insecure_skip_verify: true static_configs: - - targets: ['127.0.0.1:9080'] +{% for item in groups['infrastructure'] | default([]) if item.split('.')[4] == project %} + - targets: ['{{ hostvars[item]['ansible_' + hostvars[item].nomad_iface]['ipv4']['address'] }}:9080'] labels: - instance: "{{ inventory_hostname }}" + instance: "{{ item }}" project: "{{ prometheus_project }}" +{% endfor %} - job_name: 'blackbox' static_configs: @@ -223,12 +262,12 @@ scrape_configs: - job_name: 'traefik' nomad_sd_configs: - - server: "https://127.0.0.1:4646" + - server: "https://{{ nomad_primary_master_address | default(inventory_hostname) 
}}:4646" region: "{{ fact_instance.region }}" tls_config: insecure_skip_verify: true authorization: - credentials: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + credentials: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" relabel_configs: - target_label: instance replacement: "{{ inventory_hostname }}" @@ -258,12 +297,12 @@ scrape_configs: metrics_path: /minio/v2/metrics/cluster scheme: http nomad_sd_configs: - - server: "https://127.0.0.1:4646" + - server: "https://{{ nomad_primary_master_address | default(inventory_hostname) }}:4646" region: "{{ fact_instance.region }}" tls_config: insecure_skip_verify: true authorization: - credentials: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + credentials: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" relabel_configs: - target_label: instance replacement: "{{ inventory_hostname }}" @@ -285,12 +324,12 @@ scrape_configs: - job_name: 'caddy' nomad_sd_configs: - - server: "https://127.0.0.1:4646" + - server: "https://{{ nomad_primary_master_address | default(inventory_hostname) }}:4646" region: "{{ fact_instance.region }}" tls_config: insecure_skip_verify: true authorization: - credentials: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + credentials: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" relabel_configs: - target_label: instance replacement: "{{ inventory_hostname }}" @@ -314,14 +353,47 @@ scrape_configs: # replacement: '$1' # target_label: instance + + - 
job_name: 'vllm' + nomad_sd_configs: + - server: "https://{{ nomad_primary_master_address | default(inventory_hostname) }}:4646" + region: "{{ fact_instance.region }}" + tls_config: + insecure_skip_verify: true + authorization: + credentials: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" + relabel_configs: + - target_label: instance + replacement: "{{ inventory_hostname }}" + - target_label: project + replacement: "{{ prometheus_project }}" + + - source_labels: ['__meta_nomad_service'] + regex: 'vllm' + action: keep + + - source_labels: [__meta_nomad_tags] + regex: .*,fqdn:([^,]+),.* + target_label: __tmp_fqdn + replacement: $1 + action: replace + + - source_labels: [__tmp_fqdn] + target_label: fqdn + + # - source_labels: ['__meta_nomad_node'] + # replacement: '$1' + # target_label: instance + + - job_name: 'mysql_exporter' nomad_sd_configs: - - server: "https://127.0.0.1:4646" + - server: "https://{{ nomad_primary_master_address | default(inventory_hostname) }}:4646" region: "{{ fact_instance.region }}" tls_config: insecure_skip_verify: true authorization: - credentials: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + credentials: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" relabel_configs: - target_label: instance replacement: "{{ inventory_hostname }}" @@ -347,12 +419,12 @@ scrape_configs: - job_name: 'nginx_exporter' nomad_sd_configs: - - server: "https://127.0.0.1:4646" + - server: "https://{{ nomad_primary_master_address | default(inventory_hostname) }}:4646" region: "{{ fact_instance.region }}" tls_config: insecure_skip_verify: true authorization: - credentials: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, 
subkey='nomad_management_token', missing='error') }}" + credentials: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" relabel_configs: - target_label: instance replacement: "{{ inventory_hostname }}" @@ -378,12 +450,12 @@ scrape_configs: - job_name: 'phpfpm_exporter' nomad_sd_configs: - - server: "https://127.0.0.1:4646" + - server: "https://{{ nomad_primary_master_address | default(inventory_hostname) }}:4646" region: "{{ fact_instance.region }}" tls_config: insecure_skip_verify: true authorization: - credentials: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + credentials: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" relabel_configs: - target_label: instance replacement: "{{ inventory_hostname }}" @@ -412,12 +484,12 @@ scrape_configs: metrics_path: /api/prometheus scheme: http nomad_sd_configs: - - server: "https://127.0.0.1:4646" + - server: "https://{{ nomad_primary_master_address | default(inventory_hostname) }}:4646" region: "{{ fact_instance.region }}" tls_config: insecure_skip_verify: true authorization: - credentials: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + credentials: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" relabel_configs: - target_label: instance From 8c42f07192bf27c5afc69ad0afa140482b50abbf Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:26:19 +0200 Subject: [PATCH 20/34] refactor(nomad): standardize defaults and templates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Convert quoted 
string booleans to native boolean values across defaults and templates. - Add dynamic TLS SAN IP range generation and expose it via `nomad_tls_ip_range`. - Enable Docker private registry support and simplify Docker TLS handling. - Restructure certificate copy tasks to use loops for server and client nodes. - Comment out S3 storage plugin job templates and its handler flush. - Disable CNI installation task and update related conditionals. - Update various template files to use lower‑cased boolean rendering. BREAKING CHANGE: S3 storage plugin is disabled by default and boolean handling has changed; existing playbooks or roles that relied on the previous string representations or S3 plugin may need adjustment. --- .../paas/roles/nomad/defaults/main.yml | 114 ++++++++++-------- .../paas/roles/nomad/tasks/04_tls_certs.yml | 48 ++++---- .../roles/nomad/tasks/06_configuration.yml | 33 ++--- .../playbooks/paas/roles/nomad/tasks/main.yml | 5 +- .../paas/roles/nomad/templates/client.hcl.j2 | 8 +- .../paas/roles/nomad/templates/docker.hcl.j2 | 18 +-- .../paas/roles/nomad/templates/nomad.hcl.j2 | 38 +++--- .../paas/roles/nomad/templates/server.hcl.j2 | 6 +- 8 files changed, 139 insertions(+), 131 deletions(-) diff --git a/ansible/playbooks/paas/roles/nomad/defaults/main.yml b/ansible/playbooks/paas/roles/nomad/defaults/main.yml index b4945a05..86c61ac5 100644 --- a/ansible/playbooks/paas/roles/nomad/defaults/main.yml +++ b/ansible/playbooks/paas/roles/nomad/defaults/main.yml @@ -15,7 +15,7 @@ nomad_timezone: "Europe/Paris" nomad_group: simplestack # Configuration -nomad_dc_name: "dc1" +nomad_dc_name: dc1 nomad_project: "{{ fact_instance.project }}" nomad_region: "{{ fact_instance.region }}" @@ -43,15 +43,15 @@ nomad_leave_on_interrupt: true nomad_client_auto_join: true nomad_server_auto_join: true -nomad_s3_storage_enabled: true +nomad_s3_storage_enabled: false # Network nomad_http_scheme: https nomad_http_ip: "127.0.0.1" nomad_http_port: 4646 -nomad_cluster_bridge: "ens3" 
-nomad_iface: "ens3" +nomad_cluster_bridge: ens3 +nomad_iface: ens3 nomad_bind_address: "0.0.0.0" nomad_advertise_address: "{{ hostvars[inventory_hostname]['ansible_' + nomad_iface]['ipv4']['address'] }}" @@ -146,25 +146,38 @@ nomad_client_host_network_cluster: name: cluster interface: "{{ nomad_cluster_bridge }}" -nomad_client_meta_list: {"arch": "{{ architecture_map[ansible_facts.architecture] }}", "location": "{{ fact_instance.location }}", "instance": "{{ inventory_hostname }}"} +nomad_client_meta_list: >- + {"arch": "{{ architecture_map[ansible_facts.architecture] }}", + "location": "{{ fact_instance.location }}", + "instance": "{{ inventory_hostname }}"} + +nomad_server_join: >- + "{% if nomad_mode == 'single' %}127.0.0.1{% else %}{{ (groups[nomad_deploy_cluster_name] | + map('extract', hostvars) | + selectattr('nomad_node_role', 'equalto', 'both') | + map(attribute='ansible_br0.ipv4.address')) or + (groups[nomad_deploy_cluster_name] | + map('extract', hostvars) | + selectattr('nomad_node_role', 'equalto', 'both') | + map(attribute='ansible_br0.ipv4.address')) | + unique | list }}{% endif %}" -nomad_server_join: "{% if nomad_mode == 'single' %}127.0.0.1{% else %}{{ (groups[nomad_deploy_cluster_name] | map('extract', hostvars) | selectattr('nomad_node_role', 'equalto', 'both') | map(attribute='ansible_br0.ipv4.address')) or (groups[nomad_deploy_cluster_name] | map('extract', hostvars) | selectattr('nomad_node_role', 'equalto', 'both') | map(attribute='ansible_br0.ipv4.address')) | unique | list }}{% endif %}" nomad_server_join_retry_max: 3 -nomad_server_join_retry_interval: "15s" +nomad_server_join_retry_interval: 15s nomad_client_server_join_retry_max: 3 -nomad_client_server_join_retry_interval: "15s" +nomad_client_server_join_retry_interval: 15s -nomad_client_drain_on_shutdown_deadline: "1m" -nomad_client_drain_on_shutdown_force: "true" -nomad_client_drain_on_shutdown_ignore_system_jobs: "true" +nomad_client_drain_on_shutdown_deadline: 1m 
+nomad_client_drain_on_shutdown_force: true +nomad_client_drain_on_shutdown_ignore_system_jobs: true nomad_client_cpu_total_compute: 0 nomad_client_memory_total_mb: 0 nomad_client_disk_total_mb: 0 nomad_client_disk_free_mb: 0 -nomad_client_gc_interval: "1m" +nomad_client_gc_interval: 1m nomad_client_gc_disk_usage_threshold: 80 nomad_client_gc_inode_usage_threshold: 70 nomad_client_gc_parallel_destroys: 2 @@ -175,12 +188,15 @@ nomad_client_reserved_disk: 0 # TLS nomad_tls_ca_host: localhost -nomad_tls_ca_host_dir: "~/.simple-stack/tls" -nomad_tls_ca_pubkey: "simplestack-ca.pem" -nomad_tls_ca_privatekey: "simplestack-ca-key.pem" -nomad_tls_ca_provider: "ownca" -nomad_tls_host_certificate_dir: "/etc/ssl/simplestack" -nomad_tls_common_name: "nomad" +nomad_tls_ca_host_dir: ~/.simple-stack/tls +nomad_tls_ca_pubkey: simplestack-ca.pem +nomad_tls_ca_privatekey: simplestack-ca-key.pem +nomad_tls_ca_provider: ownca +nomad_tls_host_certificate_dir: /etc/ssl/simplestack + +nomad_tls_common_name: nomad +# IP range for 192.168.0.0/24 (all 256 addresses) +nomad_tls_ip_range: "{{ range(0,256) | map('regex_replace', '^', 'IP:192.168.0.') | list | join(',') }}" nomad_tls_check_delay: "+2w" # TLS Server @@ -188,36 +204,32 @@ nomad_tls_cert_server: "{{ nomad_dc_name }}-server-nomad.pem" nomad_tls_privatekey_server: "{{ nomad_dc_name }}-server-nomad.key" nomad_tls_common_name_server: "*.{{ nomad_dc_name }}.{{ nomad_tls_common_name }}" -# nomad_tls_subject_alt_name_server: "DNS:localhost,IP:127.0.0.1,DNS:server.global.{{ certificate_subject_alt_name }},DNS:server.{{ nomad_region }}.{{ certificate_subject_alt_name }},DNS:server.{{ nomad_dc_name }}.{{ certificate_subject_alt_name }},DNS:*.{{ nomad_dc_name }}.{{ certificate_subject_alt_name }},IP:172.26.64.1,IP:172.17.0.1,IP:172.18.0.1" -# nomad_tls_subject_alt_name_server: "DNS:localhost,IP:127.0.0.1,DNS:server.global.nomad,DNS:server.{{ nomad_region }}.nomad,DNS:server.{{ nomad_dc_name }}.nomad,DNS:*.{{ nomad_dc_name 
}}.nomad,IP:172.26.64.1,IP:172.17.0.1,IP:172.18.0.1" -nomad_tls_subject_alt_name_server: "DNS:localhost,IP:127.0.0.1,IP:172.17.0.1,DNS:server.global.nomad,DNS:server.{{ nomad_region }}.nomad,DNS:server.{{ nomad_dc_name }}.nomad,DNS:*.{{ nomad_dc_name }}.nomad" +nomad_tls_subject_alt_name_server: "DNS:localhost,IP:127.0.0.1,IP:172.17.0.1,{{ nomad_tls_ip_range }},DNS:server.global.nomad,DNS:server.{{ nomad_region }}.nomad,DNS:server.{{ nomad_dc_name }}.nomad,DNS:*.{{ nomad_dc_name }}.nomad" # TLS client nomad_tls_cert_client: "{{ inventory_hostname }}-{{ nomad_dc_name }}-client-nomad.pem" nomad_tls_privatekey_client: "{{ inventory_hostname }}-{{ nomad_dc_name }}-client-nomad.key" nomad_tls_common_name_client: "*.{{ nomad_dc_name }}.{{ nomad_tls_common_name }}" -# nomad_tls_subject_alt_name_client: "DNS:localhost,IP:127.0.0.1,DNS:client.global.{{ certificate_subject_alt_name }},DNS:client.{{ nomad_region }}.{{ nomad_tls_common_name }},DNS:client.{{ nomad_dc_name }}.{{ nomad_tls_common_name }},DNS:*.{{ nomad_dc_name }}.{{ nomad_tls_common_name }},IP:172.26.64.1,IP:172.17.0.1,IP:172.18.0.1" -# nomad_tls_subject_alt_name_client: "DNS:localhost,IP:127.0.0.1,DNS:client.global.nomad,DNS:client.{{ nomad_region }}.nomad,DNS:client.{{ nomad_dc_name }}.nomad,DNS:*.{{ nomad_dc_name }}.nomad,IP:172.26.64.1,IP:172.17.0.1,IP:172.18.0.1" -nomad_tls_subject_alt_name_client: "DNS:localhost,IP:127.0.0.1,IP:172.17.0.1,DNS:client.global.nomad,DNS:client.{{ nomad_region }}.nomad,DNS:client.{{ nomad_dc_name }}.nomad,DNS:*.{{ nomad_dc_name }}.nomad" +nomad_tls_subject_alt_name_client: "DNS:localhost,IP:127.0.0.1,IP:172.17.0.1,{{ nomad_tls_ip_range }},DNS:client.global.nomad,DNS:client.{{ nomad_region }}.nomad,DNS:client.{{ nomad_dc_name }}.nomad,DNS:*.{{ nomad_dc_name }}.nomad" -nomad_tls_rpc_upgrade_mode: "false" -nomad_tls_verify_server_hostname: "true" -nomad_tls_verify_https_client: "false" +nomad_tls_rpc_upgrade_mode: false +nomad_tls_verify_server_hostname: true 
+nomad_tls_verify_https_client: false # ACL nomad_acl_enabled: true nomad_acl_token_ttl: 30s nomad_acl_policy_ttl: 30s -nomad_acl_replication_token: "" +nomad_acl_replication_token: # Docker -nomad_docker_client_dc_name: "dc1" +nomad_docker_client_dc_name: "{{ nomad_dc_name }}" nomad_docker_tcp_listen_address: "127.0.0.1" nomad_docker_tcp_listen_port: 2376 -docker_tls_configuration: false +nomad_docker_tls_configuration: false nomad_docker_client_tls_host_certificate_dir: "/etc/ssl/docker" nomad_docker_client_tls_cert: "{{ nomad_docker_client_dc_name }}-client-docker.pem" @@ -241,40 +253,40 @@ nomad_docker_client_allow_caps: - sys_ptrace - sys_admin -nomad_docker_private_registry_state: false +nomad_docker_private_registry_state: true nomad_docker_private_registry_config: /etc/docker/config.json -nomad_docker_allow_privileged: "{% if nomad_s3_storage_enabled %}true{% else %}false{% endif %}" -nomad_docker_volume_enable: "true" -nomad_docker_gc_image: "true" -nomad_docker_gc_image_delay: "1h" -nomad_docker_gc_container: "true" -nomad_docker_gc_dangling_containers_enabled: "true" -nomad_docker_gc_dangling_containers_dry_run: "false" -nomad_docker_gc_dangling_containers_period: "5m" -nomad_docker_gc_dangling_containers_creation_grace: "5m" +nomad_docker_allow_privileged: true +nomad_docker_volume_enable: true +nomad_docker_gc_image: true +nomad_docker_gc_image_delay: 1h +nomad_docker_gc_container: true +nomad_docker_gc_dangling_containers_enabled: true +nomad_docker_gc_dangling_containers_dry_run: false +nomad_docker_gc_dangling_containers_period: 5m +nomad_docker_gc_dangling_containers_creation_grace: 5m # Telemetry -nomad_telemetry_disable_hostname: "false" -nomad_telemetry_collection_interval: "5s" -nomad_telemetry_use_node_name: "false" -nomad_telemetry_publish_allocation_metrics: "true" -nomad_telemetry_publish_node_metrics: "true" -nomad_telemetry_filter_default: "true" +nomad_telemetry_disable_hostname: false +nomad_telemetry_collection_interval: 5s 
+nomad_telemetry_use_node_name: false +nomad_telemetry_publish_allocation_metrics: true +nomad_telemetry_publish_node_metrics: true +nomad_telemetry_filter_default: true # nomad_telemetry_prefix_filter: -nomad_telemetry_disable_dispatched_job_summary_metrics: "false" +nomad_telemetry_disable_dispatched_job_summary_metrics: false # nomad_telemetry_statsite_address: "" # nomad_telemetry_statsd_address: "" # nomad_telemetry_datadog_address: "" # nomad_telemetry_datadog_tags: -nomad_telemetry_prometheus_metrics: "true" +nomad_telemetry_prometheus_metrics: true # nomad_telemetry_circonus_api_token: "" nomad_telemetry_circonus_api_app: "nomad" nomad_telemetry_circonus_api_url: "https://api.circonus.com/v2" nomad_telemetry_circonus_submission_interval: "10s" # nomad_telemetry_circonus_submission_url: "" # nomad_telemetry_circonus_check_id: "" -nomad_telemetry_circonus_check_force_metric_activation: "false" +nomad_telemetry_circonus_check_force_metric_activation: false # nomad_telemetry_circonus_check_instance_id: "" # nomad_telemetry_circonus_check_search_tag: "" # nomad_telemetry_circonus_check_display_name: "" @@ -299,5 +311,5 @@ nomad_ui_content_security_policy_script_src: "'self'" nomad_ui_content_security_policy_style_src: "" nomad_ui_label_text: "{{ inventory_hostname }}" -nomad_ui_label_background_color: "blue" -nomad_ui_label_text_color: "white" +nomad_ui_label_background_color: blue +nomad_ui_label_text_color: white diff --git a/ansible/playbooks/paas/roles/nomad/tasks/04_tls_certs.yml b/ansible/playbooks/paas/roles/nomad/tasks/04_tls_certs.yml index 54f88757..43fc09ae 100644 --- a/ansible/playbooks/paas/roles/nomad/tasks/04_tls_certs.yml +++ b/ansible/playbooks/paas/roles/nomad/tasks/04_tls_certs.yml @@ -45,24 +45,22 @@ run_once: true when: not cert_tls_server_present.stat.exists or (cert_tls_server_present.stat.exists and not tls_check_server.valid_at.delay) - - name: "Nomad | Copy cert private server key on nodes" + - name: "Nomad | Copy certificates on server 
nodes" ansible.builtin.copy: - src: "{{ nomad_tls_ca_host_dir }}/{{ nomad_tls_privatekey_server }}" - dest: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_privatekey_server }}" - owner: "root" - group: "{{ nomad_group }}" - mode: "0640" - - - name: "Nomad | Copy cert server on nodes" - ansible.builtin.copy: - src: "{{ nomad_tls_ca_host_dir }}/{{ nomad_tls_cert_server }}" - dest: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_cert_server }}" - owner: "root" + src: "{{ item.src }}" + dest: "{{ item.dest }}" + owner: root group: "{{ nomad_group }}" mode: "0640" + loop: + - src: "{{ nomad_tls_ca_host_dir }}/{{ nomad_tls_privatekey_server }}" + dest: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_privatekey_server }}" + - src: "{{ nomad_tls_ca_host_dir }}/{{ nomad_tls_cert_server }}" + dest: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_cert_server }}" + notify: Nomad_restart - name: Nomad | Copy certificate on client nodes - when: nomad_node_role == 'client' + when: nomad_node_role in ['client', 'both'] block: - name: "Nomad | Check if TLS cert exists for Client" ansible.builtin.stat: @@ -90,20 +88,18 @@ certificate_client_privatekey: "{{ nomad_tls_privatekey_client }}" certificate_common_name: "{{ nomad_tls_common_name_client }}" certificate_subject_alt_name: "{{ nomad_tls_subject_alt_name_client }}" - when: nomad_mode == 'cluster' - - - name: "Nomad | Copy cert client key on nodes" - ansible.builtin.copy: - src: "{{ nomad_tls_ca_host_dir }}/{{ nomad_tls_cert_client }}" - dest: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_cert_client }}" - owner: "root" - group: "{{ nomad_group }}" - mode: "0640" + # when: nomad_mode == 'cluster' - - name: "Nomad | Copy cert private client key on nodes" + - name: "Nomad | Copy certificates on client nodes" ansible.builtin.copy: - src: "{{ nomad_tls_ca_host_dir }}/{{ nomad_tls_privatekey_client }}" - dest: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_privatekey_client }}" - owner: "root" + src: "{{ item.src 
}}" + dest: "{{ item.dest }}" + owner: root group: "{{ nomad_group }}" mode: "0640" + loop: + - src: "{{ nomad_tls_ca_host_dir }}/{{ nomad_tls_cert_client }}" + dest: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_cert_client }}" + - src: "{{ nomad_tls_ca_host_dir }}/{{ nomad_tls_privatekey_client }}" + dest: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_privatekey_client }}" + notify: Nomad_restart diff --git a/ansible/playbooks/paas/roles/nomad/tasks/06_configuration.yml b/ansible/playbooks/paas/roles/nomad/tasks/06_configuration.yml index fc3057c1..b32cecc8 100644 --- a/ansible/playbooks/paas/roles/nomad/tasks/06_configuration.yml +++ b/ansible/playbooks/paas/roles/nomad/tasks/06_configuration.yml @@ -68,10 +68,11 @@ port: "{{ nomad_http_port }}" - name: Block + when: nomad_node_role in ['server', 'both'] block: - name: "Nomad Install | Read Nomad management token from UI" ansible.builtin.set_fact: - nomad_management_token: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + nomad_management_token: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='error') }}" rescue: - name: "Nomad ACL | Generate Bootstrap token" ansible.builtin.uri: @@ -88,7 +89,7 @@ - name: "Nomad Install | Set Nomad management token and insert in UI" ansible.builtin.set_fact: - nomad_management_token: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='create', userpass=nomad_management_token_result.json.SecretID) }}" + nomad_management_token: "{{ lookup('simple-stack-ui', type='secret', key=nomad_primary_master_node | default(inventory_hostname), subkey='nomad_management_token', missing='create', userpass=nomad_management_token_result.json.SecretID) }}" - name: "Nomad Configuration | Enable MemoryOversubscription" ansible.builtin.uri: @@ -109,18 +110,18 @@ 
register: nomad_memoryoversubscription ignore_errors: true -- name: "Nomad Configuration | Add S3 storage plugin job templates" - ansible.builtin.template: - src: "{{ item }}.j2" - dest: "{{ nomad_job_files_dir }}/{{ item }}" - mode: '0644' - loop: - - "plugin-s3-controller.hcl" - - "plugin-s3-node.hcl" - when: - - nomad_s3_storage_enabled - - nomad_node_role in ['client', 'both'] - notify: Nomad_s3_jobs +# - name: "Nomad Configuration | Add S3 storage plugin job templates" +# ansible.builtin.template: +# src: "{{ item }}.j2" +# dest: "{{ nomad_job_files_dir }}/{{ item }}" +# mode: '0644' +# loop: +# - "plugin-s3-controller.hcl" +# - "plugin-s3-node.hcl" +# when: +# - nomad_s3_storage_enabled +# - nomad_node_role in ['client', 'both'] +# notify: Nomad_s3_jobs -- name: "Nomad Configuration | Flush handlers" - ansible.builtin.meta: flush_handlers +# - name: "Nomad Configuration | Flush handlers" +# ansible.builtin.meta: flush_handlers diff --git a/ansible/playbooks/paas/roles/nomad/tasks/main.yml b/ansible/playbooks/paas/roles/nomad/tasks/main.yml index 75a72cd8..51957285 100644 --- a/ansible/playbooks/paas/roles/nomad/tasks/main.yml +++ b/ansible/playbooks/paas/roles/nomad/tasks/main.yml @@ -7,8 +7,8 @@ name: "{{ nomad_timezone }}" hwclock: local -- name: "Nomad | Install CNI" - ansible.builtin.include_tasks: "02_network.yml" +# - name: "Nomad | Install CNI" +# ansible.builtin.include_tasks: "02_network.yml" - name: "Nomad | Commons tasks" ansible.builtin.include_tasks: "03_commons_tasks.yml" @@ -27,4 +27,3 @@ - name: "Nomad | Change SystemD configuration" ansible.builtin.include_tasks: "08_systemd_tuning.yml" - diff --git a/ansible/playbooks/paas/roles/nomad/templates/client.hcl.j2 b/ansible/playbooks/paas/roles/nomad/templates/client.hcl.j2 index 184a5b9d..a7db1122 100644 --- a/ansible/playbooks/paas/roles/nomad/templates/client.hcl.j2 +++ b/ansible/playbooks/paas/roles/nomad/templates/client.hcl.j2 @@ -1,12 +1,12 @@ client { - enabled = {{ nomad_client_enabled | 
bool | lower }} + enabled = {{ nomad_client_enabled | lower }} state_dir = "{{ nomad_state_dir_client }}" node_class = "{{ nomad_client_node_class }}" node_pool = "{{ nomad_client_node_pool }}" - no_host_uuid = {{ nomad_client_no_host_uuid | bool | lower }} + no_host_uuid = {{ nomad_client_no_host_uuid | lower }} servers = [ {%- set comma = joiner(",") -%} @@ -51,8 +51,8 @@ client { {% if nomad_mode == 'cluster' %} drain_on_shutdown { deadline = "{{ nomad_client_drain_on_shutdown_deadline }}" - force = {{ nomad_client_drain_on_shutdown_force }} - ignore_system_jobs = {{ nomad_client_drain_on_shutdown_ignore_system_jobs }} + force = {{ nomad_client_drain_on_shutdown_force | lower }} + ignore_system_jobs = {{ nomad_client_drain_on_shutdown_ignore_system_jobs | lower }} } {% endif %} diff --git a/ansible/playbooks/paas/roles/nomad/templates/docker.hcl.j2 b/ansible/playbooks/paas/roles/nomad/templates/docker.hcl.j2 index 14043718..a82ce90c 100644 --- a/ansible/playbooks/paas/roles/nomad/templates/docker.hcl.j2 +++ b/ansible/playbooks/paas/roles/nomad/templates/docker.hcl.j2 @@ -1,6 +1,6 @@ plugin "docker" { config { -{% if docker_tls_configuration == true %} +{% if nomad_docker_tls_configuration %} endpoint = "tcp://{{ nomad_docker_tcp_listen_address }}:{{ nomad_docker_tcp_listen_port }}" tls { @@ -12,27 +12,27 @@ plugin "docker" { endpoint = "unix:///var/run/docker.sock" {% endif %} -{% if nomad_docker_private_registry_state == true %} +{% if nomad_docker_private_registry_state %} auth { config = "{{ nomad_docker_private_registry_config }}" } {% endif %} - allow_privileged = {{ nomad_docker_allow_privileged }} + allow_privileged = {{ nomad_docker_allow_privileged | lower }} volumes { - enabled = {{ nomad_docker_volume_enable }} + enabled = {{ nomad_docker_volume_enable | lower }} } allow_caps = [{% for item in nomad_docker_client_allow_caps %}"{{ item }}"{% if not loop.last %}, {% endif %}{% endfor %}] gc { - image = {{ nomad_docker_gc_image }} - image_delay = "{{ 
nomad_docker_gc_image_delay }}" - container = {{ nomad_docker_gc_container }} + image = {{ nomad_docker_gc_image | lower }} + image_delay = "{{ nomad_docker_gc_image_delay | lower }}" + container = {{ nomad_docker_gc_container | lower }} dangling_containers { - enabled = {{ nomad_docker_gc_dangling_containers_enabled }} - dry_run = {{ nomad_docker_gc_dangling_containers_dry_run }} + enabled = {{ nomad_docker_gc_dangling_containers_enabled | lower }} + dry_run = {{ nomad_docker_gc_dangling_containers_dry_run | lower }} period = "{{ nomad_docker_gc_dangling_containers_period }}" creation_grace = "{{ nomad_docker_gc_dangling_containers_creation_grace }}" } diff --git a/ansible/playbooks/paas/roles/nomad/templates/nomad.hcl.j2 b/ansible/playbooks/paas/roles/nomad/templates/nomad.hcl.j2 index 7d027483..c67aa932 100644 --- a/ansible/playbooks/paas/roles/nomad/templates/nomad.hcl.j2 +++ b/ansible/playbooks/paas/roles/nomad/templates/nomad.hcl.j2 @@ -2,8 +2,8 @@ name = "{{ nomad_node_name }}" region = "{{ nomad_region }}" datacenter = "{{ nomad_dc_name }}" -disable_anonymous_signature = {{ nomad_disable_anonymous_signature | bool | lower }} -disable_update_check = {{ nomad_disable_update_check | bool | lower }} +disable_anonymous_signature = {{ nomad_disable_anonymous_signature | lower }} +disable_update_check = {{ nomad_disable_update_check | lower }} data_dir = "{{ nomad_data_dir }}" @@ -20,15 +20,15 @@ ports { serf = {{ nomad_ports.serf }} } -enable_debug = {{ nomad_debug | bool | lower }} +enable_debug = {{ nomad_debug | lower }} log_file = "{{ nomad_log_file }}" log_level = "{{ nomad_log_level }}" log_rotate_bytes = {{ nomad_log_rotate_bytes }} log_rotate_duration = "{{ nomad_log_rotate_duration }}" log_rotate_max_files = {{ nomad_log_rotate_max_files }} -leave_on_terminate = {{ nomad_leave_on_terminate | bool | lower }} -leave_on_interrupt = {{ nomad_leave_on_interrupt | bool | lower }} +leave_on_terminate = {{ nomad_leave_on_terminate | lower }} +leave_on_interrupt 
= {{ nomad_leave_on_interrupt | lower }} tls { http = true @@ -36,39 +36,39 @@ tls { ca_file = "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_ca_pubkey }}" cert_file = "{{ nomad_tls_host_certificate_dir }}/{{ (nomad_node_role == 'client') | ternary(nomad_tls_cert_client, nomad_tls_cert_server) }}" key_file = "{{ nomad_tls_host_certificate_dir }}/{{ (nomad_node_role == 'client') | ternary(nomad_tls_privatekey_client, nomad_tls_privatekey_server) }}" - rpc_upgrade_mode = {{ nomad_tls_rpc_upgrade_mode }} - verify_server_hostname = "{{ nomad_tls_verify_server_hostname }}" - verify_https_client = "{{ nomad_tls_verify_https_client }}" + rpc_upgrade_mode = {{ nomad_tls_rpc_upgrade_mode | lower }} + verify_server_hostname = {{ nomad_tls_verify_server_hostname | lower }} + verify_https_client = {{ nomad_tls_verify_https_client | lower }} } acl { - enabled = {{ nomad_acl_enabled | bool | lower }} + enabled = {{ nomad_acl_enabled | lower }} token_ttl = "{{ nomad_acl_token_ttl }}" policy_ttl = "{{ nomad_acl_policy_ttl }}" replication_token = "{{ nomad_acl_replication_token }}" } telemetry { - disable_hostname = {{ nomad_telemetry_disable_hostname }} + disable_hostname = {{ nomad_telemetry_disable_hostname | lower }} collection_interval = "{{ nomad_telemetry_collection_interval }}" - use_node_name = {{ nomad_telemetry_use_node_name }} - publish_allocation_metrics = {{ nomad_telemetry_publish_allocation_metrics }} - publish_node_metrics = {{ nomad_telemetry_publish_node_metrics }} - filter_default = {{ nomad_telemetry_filter_default }} + use_node_name = {{ nomad_telemetry_use_node_name | lower }} + publish_allocation_metrics = {{ nomad_telemetry_publish_allocation_metrics | lower }} + publish_node_metrics = {{ nomad_telemetry_publish_node_metrics | lower }} + filter_default = {{ nomad_telemetry_filter_default | lower }} prefix_filter = [] - disable_dispatched_job_summary_metrics = {{ nomad_telemetry_disable_dispatched_job_summary_metrics }} + 
disable_dispatched_job_summary_metrics = {{ nomad_telemetry_disable_dispatched_job_summary_metrics | lower }} statsite_address = "" statsd_address = "" datadog_address = "" datadog_tags = [] - prometheus_metrics = {{ nomad_telemetry_prometheus_metrics }} + prometheus_metrics = {{ nomad_telemetry_prometheus_metrics | lower }} circonus_api_token = "" circonus_api_app = "{{ nomad_telemetry_circonus_api_app }}" circonus_api_url = "{{ nomad_telemetry_circonus_api_url }}" circonus_submission_interval = "{{ nomad_telemetry_circonus_submission_interval }}" circonus_submission_url = "" circonus_check_id = "" - circonus_check_force_metric_activation = {{ nomad_telemetry_circonus_check_force_metric_activation }} + circonus_check_force_metric_activation = {{ nomad_telemetry_circonus_check_force_metric_activation | lower }} circonus_check_instance_id = "" circonus_check_search_tag = "" circonus_check_display_name = "" @@ -78,7 +78,7 @@ telemetry { } autopilot { - cleanup_dead_servers = {{ nomad_autopilot_cleanup_dead_servers | bool | lower }} + cleanup_dead_servers = {{ nomad_autopilot_cleanup_dead_servers | lower }} last_contact_threshold = "{{ nomad_autopilot_last_contact_threshold }}" max_trailing_logs = {{ nomad_autopilot_max_trailing_logs }} server_stabilization_time = "{{ nomad_autopilot_server_stabilization_time }}" @@ -90,7 +90,7 @@ limits { } ui { - enabled = {{ nomad_ui_enabled | bool | lower }} + enabled = {{ nomad_ui_enabled | lower }} content_security_policy { connect_src = ["{{ nomad_ui_content_security_policy_connect_src }}"] diff --git a/ansible/playbooks/paas/roles/nomad/templates/server.hcl.j2 b/ansible/playbooks/paas/roles/nomad/templates/server.hcl.j2 index e9edfca8..7a8e8e17 100644 --- a/ansible/playbooks/paas/roles/nomad/templates/server.hcl.j2 +++ b/ansible/playbooks/paas/roles/nomad/templates/server.hcl.j2 @@ -1,11 +1,11 @@ server { - enabled = {{ nomad_server_enabled | bool | lower }} + enabled = {{ nomad_server_enabled | lower }} bootstrap_expect = {{ 
nomad_servers | length }} data_dir = "{{ nomad_data_dir_server }}" - {% if nomad_server_retry_join | bool -%} + {% if nomad_server_retry_join -%} retry_join = [ {%- set comma = joiner(",") -%} {% for server in nomad_servers_advertise_address -%} @@ -21,7 +21,7 @@ server { {%- endfor -%} ] {%- endif %} - rejoin_after_leave = {{ nomad_server_rejoin_after_leave | bool | lower }} + rejoin_after_leave = {{ nomad_server_rejoin_after_leave | lower }} enabled_schedulers = [ {%- set comma = joiner(",") -%} From 7748d97a085904d471fe09d27fb08d21a73cd5a6 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:27:39 +0200 Subject: [PATCH 21/34] feat(ansible): add Nomad JuiceFS CSI driver deployment --- ansible/playbooks/paas/nomad-juicefs.yml | 11 ++++ .../paas/roles/nomad/tasks/10_juicefs.yml | 59 +++++++++++++++++++ .../nomad/templates/juicefs-controller.hcl.j2 | 43 ++++++++++++++ .../roles/nomad/templates/juicefs-node.hcl.j2 | 48 +++++++++++++++ .../nomad/templates/juicefs-volume.hcl.j2 | 18 ++++++ 5 files changed, 179 insertions(+) create mode 100644 ansible/playbooks/paas/nomad-juicefs.yml create mode 100644 ansible/playbooks/paas/roles/nomad/tasks/10_juicefs.yml create mode 100644 ansible/playbooks/paas/roles/nomad/templates/juicefs-controller.hcl.j2 create mode 100644 ansible/playbooks/paas/roles/nomad/templates/juicefs-node.hcl.j2 create mode 100644 ansible/playbooks/paas/roles/nomad/templates/juicefs-volume.hcl.j2 diff --git a/ansible/playbooks/paas/nomad-juicefs.yml b/ansible/playbooks/paas/nomad-juicefs.yml new file mode 100644 index 00000000..f4de3129 --- /dev/null +++ b/ansible/playbooks/paas/nomad-juicefs.yml @@ -0,0 +1,11 @@ +--- +- name: Install nomad juicefs CSI driver + any_errors_fatal: true + hosts: "{{ hosts_limit | default('infrastructure') }}" + gather_facts: true + become: true + tasks: + - name: Install nomad juicefs CSI driver + ansible.builtin.import_role: + name: nomad + tasks_from: 10_juicefs diff --git 
a/ansible/playbooks/paas/roles/nomad/tasks/10_juicefs.yml b/ansible/playbooks/paas/roles/nomad/tasks/10_juicefs.yml new file mode 100644 index 00000000..d5414e97 --- /dev/null +++ b/ansible/playbooks/paas/roles/nomad/tasks/10_juicefs.yml @@ -0,0 +1,59 @@ +--- +- name: "Nomad Juicefs | Copy configuration" + ansible.builtin.template: + src: "{{ item }}.j2" + dest: "{{ nomad_job_files_dir }}/{{ item }}" + mode: '0644' + loop: + - juicefs-controller.hcl + - juicefs-node.hcl + - juicefs-volume.hcl + +- name: "Nomad Juicefs | Copy volume configuration" + ansible.builtin.template: + src: juicefs-volume.hcl.j2 + dest: "{{ nomad_job_files_dir }}/juicefs-volume-{{ item }}.hcl" + mode: '0644' + loop: + - volume + - test + - llm + +- name: Nomad Juicefs | Run jobs + ansible.builtin.command: "/usr/bin/nomad job run {{ nomad_job_files_dir }}/{{ item }}" + args: + chdir: "{{ nomad_job_files_dir }}" + environment: + NOMAD_ADDR: "https://{{ nomad_http_ip }}:4646" + NOMAD_TOKEN: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + NOMAD_CLIENT_CERT: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_cert_server }}" + NOMAD_CLIENT_KEY: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_privatekey_server }}" + NOMAD_CACERT: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_ca_pubkey }}" + register: nomad_job_start + loop: + - juicefs-controller.hcl + - juicefs-node.hcl + failed_when: nomad_job_start.rc >= 2 + changed_when: + - '"error" in nomad_job_start.stdout' + - nomad_job_start.rc >= 2 + +- name: Nomad Juicefs | Create volume + ansible.builtin.command: "/usr/bin/nomad volume create {{ nomad_job_files_dir }}/juicefs-volume-{{ item }}.hcl" + args: + chdir: "{{ nomad_job_files_dir }}" + environment: + NOMAD_ADDR: "https://{{ nomad_http_ip }}:4646" + NOMAD_TOKEN: "{{ lookup('simple-stack-ui', type='secret', key=inventory_hostname, subkey='nomad_management_token', missing='error') }}" + NOMAD_CLIENT_CERT: "{{ 
nomad_tls_host_certificate_dir }}/{{ nomad_tls_cert_server }}" + NOMAD_CLIENT_KEY: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_privatekey_server }}" + NOMAD_CACERT: "{{ nomad_tls_host_certificate_dir }}/{{ nomad_tls_ca_pubkey }}" + register: nomad_job_start + loop: + - volume + - test + - llm + failed_when: nomad_job_start.rc >= 2 + changed_when: + - '"error" in nomad_job_start.stdout' + - nomad_job_start.rc >= 2 diff --git a/ansible/playbooks/paas/roles/nomad/templates/juicefs-controller.hcl.j2 b/ansible/playbooks/paas/roles/nomad/templates/juicefs-controller.hcl.j2 new file mode 100644 index 00000000..c42da612 --- /dev/null +++ b/ansible/playbooks/paas/roles/nomad/templates/juicefs-controller.hcl.j2 @@ -0,0 +1,43 @@ +job "jfs-controller" { + datacenters = ["dc1"] + type = "system" + + group "controller" { + + constraint { + attribute = "${meta.instance}" + set_contains = "{{ nomad_constraints_juicefs_controller_instance }}" + } + + task "plugin" { + driver = "docker" + + config { + image = "juicedata/juicefs-csi-driver:v0.30.0" + + args = [ + "--endpoint=unix://csi/csi.sock", + "--logtostderr", + "--nodeid=test", + "--v=5", + "--by-process=true" + ] + + privileged = true + } + + csi_plugin { + id = "juicefs0" + type = "controller" + mount_dir = "/csi" + } + resources { + cpu = 100 + memory = 512 + } + env { + POD_NAME = "csi-controller" + } + } + } +} \ No newline at end of file diff --git a/ansible/playbooks/paas/roles/nomad/templates/juicefs-node.hcl.j2 b/ansible/playbooks/paas/roles/nomad/templates/juicefs-node.hcl.j2 new file mode 100644 index 00000000..f503ef6d --- /dev/null +++ b/ansible/playbooks/paas/roles/nomad/templates/juicefs-node.hcl.j2 @@ -0,0 +1,48 @@ +job "jfs-node" { + datacenters = ["dc1"] + type = "system" + + group "nodes" { + + constraint { + attribute = "${meta.location}" + operator = "set_contains_any" + value = "{{ nomad_constraints_juicefs_controller_nodes | join(',') }}" + } + + task "juicefs-plugin" { + driver = "docker" + + 
config { + image = "juicedata/juicefs-csi-driver:v0.30.0" + + args = [ + "--endpoint=unix://csi/csi.sock", + "--logtostderr", + "--v=5", + "--nodeid=test", + "--by-process=true", + #"--cache-dir", "/var/jfs-cache", + #"--cache-size", "50G", + #"--writeback", + #"--prefetch", "2" + ] + + privileged = true + } + + csi_plugin { + id = "juicefs0" + type = "node" + mount_dir = "/csi" + } + resources { + cpu = 1000 + memory = 1024 + } + env { + POD_NAME = "csi-node" + } + } + } +} \ No newline at end of file diff --git a/ansible/playbooks/paas/roles/nomad/templates/juicefs-volume.hcl.j2 b/ansible/playbooks/paas/roles/nomad/templates/juicefs-volume.hcl.j2 new file mode 100644 index 00000000..f45cf80f --- /dev/null +++ b/ansible/playbooks/paas/roles/nomad/templates/juicefs-volume.hcl.j2 @@ -0,0 +1,18 @@ +type = "csi" +id = "juicefs-{{ item }}" +name = "juicefs-{{ item }}" + +capability { + access_mode = "multi-node-multi-writer" + attachment_mode = "file-system" +} +plugin_id = "juicefs0" + +secrets { + name="juicefs-volume" + metaurl="redis://{{ nomad_juicefs_secrets.valkey.address }}:{{ nomad_juicefs_secrets.valkey.port }}/0" + bucket="http://{{ nomad_juicefs_secrets.minio.address }}:{{ nomad_juicefs_secrets.minio.port }}/minio/{{ item }}" + storage="minio" + access-key="{{ lookup('simple-stack-ui', type='secret', key=nomad_juicefs_secrets.minio.domain, subkey='user', missing='error') }}" + secret-key="{{ lookup('simple-stack-ui', type='secret', key=nomad_juicefs_secrets.minio.domain, subkey='passwd', missing='error') }}" +} \ No newline at end of file From 56a9fdbe5d7cb342c732ec0ac7ec0bf6cd459a41 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:28:13 +0200 Subject: [PATCH 22/34] refactor(ansible): use fully qualified debug module --- ansible/playbooks/saas/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/playbooks/saas/main.yml b/ansible/playbooks/saas/main.yml index 0097343b..07729907 100644 --- 
a/ansible/playbooks/saas/main.yml +++ b/ansible/playbooks/saas/main.yml @@ -30,8 +30,8 @@ software: "{{ lookup('simple-stack-ui', type='software', key=domain, subkey='', missing='warn') }}" - name: Debug software - debug: - msg: "{{ software }}" + ansible.builtin.debug: + msg: "{{ software }}" tasks: - name: Deploy service From 64dad41eee692a8e8fbd26162515ca57add46060 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:28:44 +0200 Subject: [PATCH 23/34] feat(ansible): add optional project prefix to image name --- ansible/playbooks/saas/image.yml | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/ansible/playbooks/saas/image.yml b/ansible/playbooks/saas/image.yml index 9f819d3d..63b71ebd 100644 --- a/ansible/playbooks/saas/image.yml +++ b/ansible/playbooks/saas/image.yml @@ -26,25 +26,9 @@ state: directory mode: '0755' loop: - - /root/.docker - "{{ build_work_dir }}/download" - "{{ build_work_dir }}/{{ upstream_default_arch }}" - - name: Copy docker config file - ansible.builtin.copy: - content: | - { - "auths": { - "{{ docker_private_registry.url }}": { - "auth": "{{ (docker_private_registry.username + ':' + docker_private_registry.password) | b64encode }}" - } - } - } - dest: /root/.docker/config.json - owner: root - group: root - mode: '0600' - tasks: - name: Install dependencies ansible.builtin.include_role: @@ -59,9 +43,9 @@ - name: Build when: image_build block: - - name: Build image + - name: Build and publish image community.docker.docker_image_build: - name: "{{ docker_private_registry.url }}/{{ image_name }}:{{ image_version }}" + name: "{{ docker_private_registry.url }}/{{ docker_private_registry.project is defined | ternary(docker_private_registry.project + '/', '') }}{{ image_name }}:{{ image_version }}" tag: latest path: "/tmp/{{ catalog }}" dockerfile: Dockerfile @@ -91,6 +75,7 @@ version: "{{ image_version }}" force_basic_auth: true status_code: 200 - ignore_errors: true + register: 
ui_update + failed_when: ui_update.status != 200 delegate_to: localhost - become: false \ No newline at end of file + become: false From a504975685c1d9bd71ae9e2a2c3d4f51b9ccb2b0 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:28:58 +0200 Subject: [PATCH 24/34] feat(ansible): add optional basic auth for Loki remote write --- ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 b/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 index 4d76945a..50a2def9 100644 --- a/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 +++ b/ansible/playbooks/paas/roles/promtail/templates/config.yaml.j2 @@ -10,9 +10,11 @@ positions: {% if loki_remote_write is defined %} clients: - url: {{ loki_remote_write.url }}/api/prom/push +{% if loki_remote_write.basic_auth %} basic_auth: username: {{ loki_remote_write.login }} password: {{ loki_remote_write.password }} +{% endif %} {% else %} clients: [] {% endif %} From ff318ab8e8c71120945c0f06697b8f1038760634 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:29:18 +0200 Subject: [PATCH 25/34] feat(ansible): bind node_exporter to host IP address --- ansible/playbooks/paas/roles/node_exporter/templates/default.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/playbooks/paas/roles/node_exporter/templates/default.j2 b/ansible/playbooks/paas/roles/node_exporter/templates/default.j2 index acb52c40..1289c4c7 100644 --- a/ansible/playbooks/paas/roles/node_exporter/templates/default.j2 +++ b/ansible/playbooks/paas/roles/node_exporter/templates/default.j2 @@ -1,5 +1,5 @@ ARGS="--log.level=info \ ---web.listen-address=127.0.0.1:9100 \ +--web.listen-address={{ hostvars[inventory_hostname]['ansible_' + nomad_iface]['ipv4']['address'] }}:9100 \ --web.telemetry-path=/metrics \ 
--collector.diskstats.ignored-devices='^(ram|loop|fd|(h|s|v|xv)d[a-z]|nbd|nvme[0-9]+n[0-9]+p|md|dm-)[0-9]+$' \ --collector.filesystem.mount-points-exclude='^/(dev(/shm)?|proc|run(/.+)?|sys|var/tmp|(var/lib|home)/(docker|kubelet)/.+)($|/)' \ From 4fa1730c2585a2961689a694d69fbc4da8492e9f Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:29:28 +0200 Subject: [PATCH 26/34] refactor(ansible): use fully qualified password_hash filter --- ansible/playbooks/saas/basic_auth.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/playbooks/saas/basic_auth.yml b/ansible/playbooks/saas/basic_auth.yml index 06e2d7af..cb400913 100644 --- a/ansible/playbooks/saas/basic_auth.yml +++ b/ansible/playbooks/saas/basic_auth.yml @@ -14,4 +14,4 @@ tasks: - name: Debug hash ansible.builtin.debug: - msg: "{{ login }}:{{ password | password_hash('blowfish') }}" + msg: "{{ login }}:{{ password | ansible.builtin.password_hash('blowfish') }}" From 78ac4bbb06a4d36a825bfedb2669b2d80add2d8d Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:29:55 +0200 Subject: [PATCH 27/34] feat(ansible): add unified Docker repo task and root .docker dir --- .../roles/ansible-docker/tasks/install.yml | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/ansible/playbooks/paas/roles/ansible-docker/tasks/install.yml b/ansible/playbooks/paas/roles/ansible-docker/tasks/install.yml index 01bb2add..dd13b02f 100644 --- a/ansible/playbooks/paas/roles/ansible-docker/tasks/install.yml +++ b/ansible/playbooks/paas/roles/ansible-docker/tasks/install.yml @@ -21,14 +21,7 @@ url: "https://download.docker.com/linux/{{ ansible_distribution | lower }}/gpg" dest: /etc/apt/keyrings/docker.asc -- name: Add Docker repository on ubuntu < 24.04 - ansible.builtin.apt_repository: - repo: "deb [arch={{ upstream_default_arch }} signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} 
stable" - state: present - filename: docker - when: ansible_distribution_version is version('24.04', '<') - -- name: Add Docker repository on ubuntu >= 24.04 +- name: Add Docker repository on ubuntu ansible.builtin.copy: content: | Components: stable @@ -42,7 +35,6 @@ owner: root group: root mode: '0644' - when: ansible_distribution_version is version('24.04', '>=') - name: Install Docker ansible.builtin.apt: @@ -68,14 +60,24 @@ append: true notify: Docker_restart +- name: Create home docker directory + ansible.builtin.file: + path: "{{ item }}" + recurse: false # set the mode on the directory only: recursing forced 0755 onto config.json, which the next task writes as 0600, so the two tasks undid each other on every run + state: directory + mode: '0755' + loop: + - /root/.docker + - name: Copy config.json ansible.builtin.template: src: config.json.j2 - dest: "{{ docker_private_registry_config }}" + dest: "{{ item }}" owner: root group: root mode: '0600' when: docker_private_registry_state + loop: "{{ docker_private_registry_config }}" notify: Docker_restart - name: Copy daemon.json for DNS resolution From 477dbe9cd32b90d7262387c3e6ea3f2a132e14fb Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:30:07 +0200 Subject: [PATCH 28/34] feat(ansible): add DNSStubListenerExtra to systemd-resolved config --- ansible/playbooks/paas/systemd-resolved.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/playbooks/paas/systemd-resolved.yml b/ansible/playbooks/paas/systemd-resolved.yml index f07ac0bb..9cfc4f6d 100644 --- a/ansible/playbooks/paas/systemd-resolved.yml +++ b/ansible/playbooks/paas/systemd-resolved.yml @@ -16,6 +16,7 @@ content: | [Resolve] DNSStubListener=yes + DNSStubListenerExtra=172.17.0.1:53 dest: /etc/systemd/resolved.conf.d/systemd-resolved.conf mode: '0644' notify: Restart systemd-resolved From 8080ffadc5458b5b9046bd88ba2478505d6eda09 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:30:30 +0200 Subject: [PATCH 29/34] feat(ansible): add partition-data playbook for sdb --- ansible/playbooks/paas/partition-data.yml | 32 +++++++++++++++++++++++ 1 file changed, 32
insertions(+) create mode 100644 ansible/playbooks/paas/partition-data.yml diff --git a/ansible/playbooks/paas/partition-data.yml b/ansible/playbooks/paas/partition-data.yml new file mode 100644 index 00000000..eff0d7cd --- /dev/null +++ b/ansible/playbooks/paas/partition-data.yml @@ -0,0 +1,32 @@ +--- +- name: Configure sdb partition + any_errors_fatal: true + hosts: "{{ hosts_limit | default('infrastructure') }}" + gather_facts: true + become: true + tasks: + - name: Create default directory + ansible.builtin.file: + path: /data + state: directory + owner: root + group: root + mode: '0755' + + - name: Create a new primary partition /dev/sdb1 + community.general.parted: + device: /dev/sdb + number: 1 + state: present + + - name: Create a ext4 filesystem on /dev/sdb1 + community.general.filesystem: + fstype: ext4 + dev: /dev/sdb1 + + - name: Mount up device + ansible.posix.mount: + path: /data + src: /dev/sdb1 + fstype: ext4 + state: mounted # mount /data now and keep the fstab entry; "present" only wrote fstab and left the device unmounted until the next reboot
From d578140b739df037560a9ea06ad6a95e3fe722a9 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:31:16 +0200 Subject: [PATCH 30/34] feat(ansible): add hostname task and import service playbooks --- ansible/playbooks/paas/main.yml | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ansible/playbooks/paas/main.yml b/ansible/playbooks/paas/main.yml index 85c0b7f3..b1c0b2e4 100644 --- a/ansible/playbooks/paas/main.yml +++ b/ansible/playbooks/paas/main.yml @@ -5,6 +5,12 @@ gather_facts: true become: true pre_tasks: + + - name: Set fqdn hostname + ansible.builtin.hostname: + name: "{{ inventory_hostname }}" + use: systemd + - name: Create ansible facts.d directory become: true ansible.builtin.file: @@ -14,20 +20,6 @@ group: "root" mode: '0755' - - name: Get ipinfo.io - ansible.builtin.uri: - url: https://ipinfo.io - http_agent: curl/7.81.0 - register: register_uri - check_mode: false - - - name: Set ipinfo local_fact - ansible.builtin.copy: - content: | - {{
register_uri.json | to_nice_json }} - dest: /etc/ansible/facts.d/ipinfo.fact - mode: '0644' - - name: Install mandatories packages ansible.builtin.apt: pkg: @@ -42,5 +34,17 @@ until: apt_status is success delay: 6 retries: 10 + roles: - unattended-upgrades + +- name: Configure systemd resolved + ansible.builtin.import_playbook: systemd-resolved.yml +- name: Configure docker + ansible.builtin.import_playbook: docker.yml +- name: Configure nomad + ansible.builtin.import_playbook: nomad.yml +- name: Configure coredns + ansible.builtin.import_playbook: coredns.yml +- name: Configure metrology + ansible.builtin.import_playbook: metrology.yml From 62eea20265053c9e043af472dd5f978cdfabd35b Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:32:44 +0200 Subject: [PATCH 31/34] feat(ansible): add nvidia_gpu_exporter exporter --- ansible/playbooks/paas/metrology.yml | 61 ++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/ansible/playbooks/paas/metrology.yml b/ansible/playbooks/paas/metrology.yml index 2f334268..56718234 100644 --- a/ansible/playbooks/paas/metrology.yml +++ b/ansible/playbooks/paas/metrology.yml @@ -4,16 +4,51 @@ hosts: "{{ hosts_limit | default('infrastructure') }}" gather_facts: true become: true - roles: - - prometheus - - promtail - - phpfpm_exporter - - node_exporter - - mysqld_exporter - - systemd_exporter - - mongodb_exporter - - blackbox_exporter - - nginx_exporter - - scan_exporter - - dns_exporter - - script_exporter + vars_prompt: + - name: project + prompt: project name + private: false + tasks: + - name: End the play for hosts that are not in admins group + ansible.builtin.meta: end_host + when: fact_instance.location != 'admins' + + - name: Install prometheus + ansible.builtin.import_role: + name: prometheus + +- name: Install exporters + any_errors_fatal: true + hosts: "{{ hosts_limit | default('infrastructure') }}" + gather_facts: true + become: true + tasks: + - name: Create 
prometheus group + ansible.builtin.group: + name: prometheus + system: true + + - name: Create prometheus user + ansible.builtin.user: + name: prometheus + create_home: false + system: true + + - name: Install exporters + ansible.builtin.include_role: + name: "{{ exporter }}" + loop: + - promtail + - phpfpm_exporter + - node_exporter + - mysqld_exporter + - systemd_exporter + - mongodb_exporter + - blackbox_exporter + - nginx_exporter + - scan_exporter + - dns_exporter + - script_exporter + - nvidia_gpu_exporter + loop_control: + loop_var: exporter From fd405e24a1fece095e43f7d5bb72b838d9b0be67 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:33:17 +0200 Subject: [PATCH 32/34] feat(ansible): add support for multiple Docker registry configs --- ansible/playbooks/paas/roles/ansible-docker/defaults/main.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ansible/playbooks/paas/roles/ansible-docker/defaults/main.yml b/ansible/playbooks/paas/roles/ansible-docker/defaults/main.yml index 53e0e926..5123c51c 100644 --- a/ansible/playbooks/paas/roles/ansible-docker/defaults/main.yml +++ b/ansible/playbooks/paas/roles/ansible-docker/defaults/main.yml @@ -13,7 +13,9 @@ docker_private_registry_state: false docker_private_registry_url: "" docker_private_registry_username: "" docker_private_registry_password: "" -docker_private_registry_config: /etc/docker/config.json +docker_private_registry_config: + - /etc/docker/config.json + - /root/.docker/config.json # DNS docker_dns_configuration: true From cb8adf003772af88787c1362ce790adf9fb7563d Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:33:59 +0200 Subject: [PATCH 33/34] feat(ansible): add nvidia plugin installation playbook --- ansible/playbooks/paas/nvidia.yml | 142 ++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 ansible/playbooks/paas/nvidia.yml diff --git a/ansible/playbooks/paas/nvidia.yml b/ansible/playbooks/paas/nvidia.yml 
new file mode 100644 index 00000000..d8f2314d --- /dev/null +++ b/ansible/playbooks/paas/nvidia.yml @@ -0,0 +1,142 @@ +--- +- name: Install nomad nvidia plugin + any_errors_fatal: true + hosts: "{{ hosts_limit | default('infrastructure') }}" + gather_facts: true + become: true + vars: + build_work_dir: /tmp + upstream_file_url: https://github.com/hashicorp/nomad-device-nvidia.git + nvidia_container_toolkit_version: "1.17.8-1" + nvidia_gpg_key_url: "https://nvidia.github.io/libnvidia-container/gpgkey" + nvidia_repo_list_url: "https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list" + nvidia_keyring_path: "/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg" + nvidia_list_path: "/etc/apt/sources.list.d/nvidia-container-toolkit.list" + + roles: + - golang + + pre_tasks: + + - name: Créer le répertoire du keyring s'il n'existe pas # Create the keyring directory if it does not exist + ansible.builtin.file: + path: "{{ nvidia_keyring_path | dirname }}" + state: directory + mode: "0755" + + - name: Télécharger et enregistrer la clé GPG NVIDIA # Download and store the NVIDIA GPG key + ansible.builtin.get_url: + url: "{{ nvidia_gpg_key_url }}" + dest: /tmp/nvidia-container-toolkit.gpg + mode: "0644" + + - name: Convertir la clé GPG en format keyring # Convert the GPG key to keyring format + ansible.builtin.command: + cmd: "gpg --dearmor -o {{ nvidia_keyring_path }} /tmp/nvidia-container-toolkit.gpg" + creates: "{{ nvidia_keyring_path }}" + + - name: Télécharger le fichier de dépôt NVIDIA et ajouter le signed-by # Download the NVIDIA repository list and add signed-by + ansible.builtin.shell: | + curl -s -L {{ nvidia_repo_list_url }} | \ + sed 's#deb https://#deb [signed-by={{ nvidia_keyring_path }}] https://#g' > {{ nvidia_list_path }} + args: + creates: "{{ nvidia_list_path }}" + + - name: Activer la section experimental (décommenter) # Enable the experimental section (uncomment) + ansible.builtin.replace: + path: "{{ nvidia_list_path }}" + regexp: '^#(.*experimental.*)$' + replace: '\1' + + - name: Mettre à jour la liste des paquets # Refresh the package index + ansible.builtin.apt: + update_cache: true + + - name: Installer les paquets NVIDIA Container Toolkit # Install the NVIDIA Container Toolkit packages + ansible.builtin.apt: + name: + -
"nvidia-container-toolkit={{ nvidia_container_toolkit_version }}" + - "nvidia-container-toolkit-base={{ nvidia_container_toolkit_version }}" + - "libnvidia-container-tools={{ nvidia_container_toolkit_version }}" + - "libnvidia-container1={{ nvidia_container_toolkit_version }}" + state: present + + tasks: + - name: Install dependencies + ansible.builtin.apt: + pkg: + - nvidia-utils-580 + - nvidia-driver-580 + - nvidia-container-runtime + - nomad-device-nvidia + state: present + install_recommends: true + update_cache: true + register: apt_status + until: apt_status is success + delay: 6 + retries: 10 + + - name: Nomad-nvidia-plugin | Git checkout + ansible.builtin.git: + repo: https://github.com/hashicorp/nomad-device-nvidia.git + dest: "{{ build_work_dir }}/nomad-device-nvidia" + version: main + force: true + + - name: Nomad-nvidia-plugin | Build binary + ansible.builtin.command: + cmd: make compile + chdir: "{{ build_work_dir }}/nomad-device-nvidia" + environment: + PATH: "/usr/local/go/bin:{{ ansible_env.PATH }}" + register: my_output + changed_when: my_output.rc == 0 # a successful build rewrites the binary, so report it as changed; the previous "rc != 0" test could only be true for a run that had already failed + + - name: Create nomad plugin directory + ansible.builtin.file: + path: /opt/nomad/plugins + state: directory + owner: root + group: root + mode: "0755" + + - name: Nomad-nvidia-plugin | Copy binary + ansible.builtin.copy: + src: /tmp/nomad-plugins/nomad-device-nvidia # NOTE(review): hard-coded and independent of build_work_dir; presumably where "make compile" writes the binary, to be confirmed + dest: /opt/nomad/plugins/nomad-device-nvidia + owner: root + group: root + mode: '0755' + remote_src: true + + + - name: Copy using inline content + ansible.builtin.copy: + content: | + plugin "nomad-device-nvidia" { + config { + enabled = true + fingerprint_period = "5s" + } + } + dest: /etc/nomad.d/nvidia.hcl + owner: root + group: root + mode: '0644' + + # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html + - name: Nomad-nvidia-plugin | Configure docker runtime + ansible.builtin.command: nvidia-ctk runtime configure --runtime=docker + + - name: Nomad-nvidia-plugin | Restart docker +
ansible.builtin.command: systemctl restart docker + + # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html + - name: Nomad-nvidia-plugin | Test nvidia support + ansible.builtin.command: docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi + register: docker_run + + - name: Nomad-nvidia-plugin | Debug + ansible.builtin.debug: + msg: "{{ docker_run }}" From 85d419aa844e71659bd736d502a9f6b718008091 Mon Sep 17 00:00:00 2001 From: Mathieu Garcia Date: Tue, 21 Oct 2025 17:34:43 +0200 Subject: [PATCH 34/34] feat(variables): Update UI --- ui/controllers/api.js | 11 ++++++----- ui/public/forms/catalogs.html | 6 ++++++ ui/public/forms/variables.html | 13 ++++++++++--- ui/public/img/generic.png | Bin 0 -> 15296 bytes ui/schemas/softwares.js | 32 ++++++++++++++------------------ 5 files changed, 36 insertions(+), 26 deletions(-) create mode 100644 ui/public/img/generic.png diff --git a/ui/controllers/api.js b/ui/controllers/api.js index d7c3d3ae..d75d2620 100644 --- a/ui/controllers/api.js +++ b/ui/controllers/api.js @@ -51,11 +51,12 @@ exports.install = function() { ROUTE('+API /api/ +softwares_execute/{id} --> Softwares/execute'); // Variables - ROUTE('+API /api/ +variables --> Variables/list'); - ROUTE('+API /api/ +variables_read/{id} --> Variables/read'); - ROUTE('+API /api/ +variables_create --> Variables/create'); - ROUTE('+API /api/ +variables_update/{id} --> Variables/update'); - ROUTE('+POST /api/secret --> Variables/secret'); + ROUTE('+API /api/ +variables --> Variables/list'); + ROUTE('+API /api/ +variables_read/{id} --> Variables/read'); + ROUTE('+API /api/ +variables_create --> Variables/create'); + ROUTE('+API /api/ +variables_update/{id} --> Variables/update'); + ROUTE('+API /api/ +variables_remove/{id} --> Variables/remove'); + ROUTE('+POST /api/secret --> Variables/secret'); // 3dForceGraph ROUTE('+API /api/ -graphs --> Graphs/list'); diff --git a/ui/public/forms/catalogs.html b/ui/public/forms/catalogs.html 
index c00feb78..bad80e7c 100644 --- a/ui/public/forms/catalogs.html +++ b/ui/public/forms/catalogs.html @@ -17,6 +17,7 @@