Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
1db8c20
feat(promtail): add remote write support to Promtail role
wiseflat Oct 6, 2025
837186d
feat(coredns): add nomad cluster mode support to coredns role
wiseflat Oct 21, 2025
a3a51a2
feat(ansible): add safe checks for optional constraints
wiseflat Oct 21, 2025
ad3389b
refactor(caddy): use Nomad template for Caddy config
wiseflat Oct 21, 2025
5747e58
feat(loki): use Nomad template for Loki config
wiseflat Oct 21, 2025
eb5c639
feat(mimir): add Nomad templates and dynamic config for mimir
wiseflat Oct 21, 2025
e671506
refactor(minio): use project registry variable in Minio template
wiseflat Oct 21, 2025
e5cf489
fix(nomad): handle missing constraints in Nomad templates
wiseflat Oct 21, 2025
dda6fc1
feat(valkey): add templated config and update Nomad job
wiseflat Oct 21, 2025
9e0fa1c
feat(traefik): introduce metrics entrypoint on 8081 for Prometheus
wiseflat Oct 21, 2025
e5e3ca4
feat(vector): add vector role with nomad job and config
wiseflat Oct 21, 2025
3e91b33
feat(vllm): add vllm role with Nomad job and Docker support
wiseflat Oct 21, 2025
a80d8da
feat(traefik): parameterize nomad endpoint in traefik
wiseflat Oct 21, 2025
4add747
fix(grafana): add guard for undefined prometheus remote write
wiseflat Oct 21, 2025
fc348d1
feat(grafana): add NVIDIA and vLLM dashboards and update Nomad dashboard
wiseflat Oct 21, 2025
6a4a440
feat(grafana): add LLM plugin provisioning
wiseflat Oct 21, 2025
b97bc41
feat(ansible): bind services to node IP address
wiseflat Oct 21, 2025
ea8f072
feat(ansible): add nvidia_gpu_exporter role
wiseflat Oct 21, 2025
8d19f44
feat(ansible): add dynamic node discovery, auth, and vllm job
wiseflat Oct 21, 2025
8c42f07
refactor(nomad): standardize defaults and templates
wiseflat Oct 21, 2025
7748d97
feat(ansible): add Nomad JuiceFS CSI driver deployment
wiseflat Oct 21, 2025
56a9fdb
refactor(ansible): use fully qualified debug module
wiseflat Oct 21, 2025
64dad41
feat(ansible): add optional project prefix to image name
wiseflat Oct 21, 2025
a504975
feat(ansible): add optional basic auth for Loki remote write
wiseflat Oct 21, 2025
ff318ab
feat(ansible): bind node_exporter to host IP address
wiseflat Oct 21, 2025
4fa1730
refactor(ansible): use fully qualified password_hash filter
wiseflat Oct 21, 2025
78ac4bb
feat(ansible): add unified Docker repo task and root .docker dir
wiseflat Oct 21, 2025
477dbe9
feat(ansible): add DNSStubListenerExtra to systemd-resolved config
wiseflat Oct 21, 2025
8080ffa
feat(ansible): add partition-data playbook for sdb
wiseflat Oct 21, 2025
d578140
feat(ansible): add hostname task and import service playbooks
wiseflat Oct 21, 2025
62eea20
feat(ansible): add nvidia_gpu_exporter exporter
wiseflat Oct 21, 2025
fd405e2
feat(ansible): add support for multiple Docker registry configs
wiseflat Oct 21, 2025
cb8adf0
feat(ansible): add nvidia plugin installation playbook
wiseflat Oct 21, 2025
85d419a
feat(variables): Update UI
wiseflat Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ansible/playbooks/paas/coredns.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,4 @@
gather_facts: true
become: true
roles:
- golang
- coredns
32 changes: 18 additions & 14 deletions ansible/playbooks/paas/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
gather_facts: true
become: true
pre_tasks:

- name: Set fqdn hostname
ansible.builtin.hostname:
name: "{{ inventory_hostname }}"
use: systemd

- name: Create ansible facts.d directory
become: true
ansible.builtin.file:
Expand All @@ -14,20 +20,6 @@
group: "root"
mode: '0755'

- name: Get ipinfo.io
ansible.builtin.uri:
url: https://ipinfo.io
http_agent: curl/7.81.0
register: register_uri
check_mode: false

- name: Set ipinfo local_fact
ansible.builtin.copy:
content: |
{{ register_uri.json | to_nice_json }}
dest: /etc/ansible/facts.d/ipinfo.fact
mode: '0644'

- name: Install mandatories packages
ansible.builtin.apt:
pkg:
Expand All @@ -42,5 +34,17 @@
until: apt_status is success
delay: 6
retries: 10

roles:
- unattended-upgrades

# Service playbooks, imported in the order listed.
- name: Configure systemd resolved
  ansible.builtin.import_playbook: systemd-resolved.yml

- name: Configure docker
  ansible.builtin.import_playbook: docker.yml

- name: Configure nomad
  ansible.builtin.import_playbook: nomad.yml

- name: Configure coredns
  ansible.builtin.import_playbook: coredns.yml

- name: Configure metrology
  ansible.builtin.import_playbook: metrology.yml
61 changes: 48 additions & 13 deletions ansible/playbooks/paas/metrology.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,51 @@
hosts: "{{ hosts_limit | default('infrastructure') }}"
gather_facts: true
become: true
roles:
- prometheus
- promtail
- phpfpm_exporter
- node_exporter
- mysqld_exporter
- systemd_exporter
- mongodb_exporter
- blackbox_exporter
- nginx_exporter
- scan_exporter
- dns_exporter
- script_exporter
vars_prompt:
- name: project
prompt: project name
private: false
tasks:
- name: End the play for hosts that are not in admins group
ansible.builtin.meta: end_host
when: fact_instance.location != 'admins'

- name: Install prometheus
ansible.builtin.import_role:
name: prometheus

# Second play: create the shared `prometheus` system account, then apply every
# exporter role in turn to the targeted hosts.
- name: Install exporters
  any_errors_fatal: true
  hosts: "{{ hosts_limit | default('infrastructure') }}"
  gather_facts: true
  become: true
  tasks:
    - name: Create prometheus group
      ansible.builtin.group:
        name: prometheus
        system: true

    # `group` is set explicitly: when a group named like the user already
    # exists, the user module does not create/assign a per-user group, so
    # without it the account would get the system default primary group
    # rather than the `prometheus` group created above.
    - name: Create prometheus user
      ansible.builtin.user:
        name: prometheus
        group: prometheus
        create_home: false
        system: true

    # One dynamic include per exporter; `exporter` replaces the default `item`
    # loop variable so it cannot collide with loops inside the roles.
    - name: Install exporters
      ansible.builtin.include_role:
        name: "{{ exporter }}"
      loop:
        - promtail
        - phpfpm_exporter
        - node_exporter
        - mysqld_exporter
        - systemd_exporter
        - mongodb_exporter
        - blackbox_exporter
        - nginx_exporter
        - scan_exporter
        - dns_exporter
        - script_exporter
        - nvidia_gpu_exporter
      loop_control:
        loop_var: exporter
11 changes: 11 additions & 0 deletions ansible/playbooks/paas/nomad-juicefs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# Runs only the `10_juicefs` task file of the `nomad` role on the targeted
# hosts (`hosts_limit`, falling back to the `infrastructure` group).
- name: Install nomad juicefs CSI driver
  hosts: "{{ hosts_limit | default('infrastructure') }}"
  become: true
  gather_facts: true
  any_errors_fatal: true
  tasks:
    - name: Install nomad juicefs CSI driver
      ansible.builtin.import_role:
        name: nomad
        tasks_from: 10_juicefs
142 changes: 142 additions & 0 deletions ansible/playbooks/paas/nvidia.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
---
# Installs the NVIDIA container toolkit and driver packages, builds and installs
# the Nomad NVIDIA device plugin, configures Docker for the NVIDIA runtime and
# finally runs `nvidia-smi` in a container as a smoke test.
#
# Ansible always executes pre_tasks -> roles -> tasks, regardless of the order
# in which these keys are written in the play.
- name: Install nomad nvidia plugin
  any_errors_fatal: true
  hosts: "{{ hosts_limit | default('infrastructure') }}"
  gather_facts: true
  become: true
  vars:
    # Directory in which the plugin sources are cloned and built.
    build_work_dir: /tmp
    # Git repository of the Nomad NVIDIA device plugin.
    upstream_file_url: https://github.com/hashicorp/nomad-device-nvidia.git
    # Exact apt version pinned for all NVIDIA container toolkit packages.
    nvidia_container_toolkit_version: "1.17.8-1"
    nvidia_gpg_key_url: "https://nvidia.github.io/libnvidia-container/gpgkey"
    nvidia_repo_list_url: "https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list"
    nvidia_keyring_path: "/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
    nvidia_list_path: "/etc/apt/sources.list.d/nvidia-container-toolkit.list"

  pre_tasks:
    - name: Créer le répertoire du keyring s'il n'existe pas
      ansible.builtin.file:
        path: "{{ nvidia_keyring_path | dirname }}"
        state: directory
        mode: "0755"

    - name: Télécharger et enregistrer la clé GPG NVIDIA
      ansible.builtin.get_url:
        url: "{{ nvidia_gpg_key_url }}"
        dest: /tmp/nvidia-container-toolkit.gpg
        mode: "0644"

    # `creates` makes the conversion run only while the keyring is missing.
    - name: Convertir la clé GPG en format keyring
      ansible.builtin.command:
        cmd: "gpg --dearmor -o {{ nvidia_keyring_path }} /tmp/nvidia-container-toolkit.gpg"
        creates: "{{ nvidia_keyring_path }}"

    # The list is rendered to a temporary file and only installed at its final
    # path when the whole pipeline succeeded. Because this task is guarded by
    # `creates`, writing straight to the final path would leave an empty or
    # partial file after a failed download and the task would never run again.
    # `pipefail` makes a curl failure fail the task; `-f` makes curl fail on
    # HTTP errors instead of saving the error page.
    - name: Télécharger le fichier de dépôt NVIDIA et ajouter le signed-by
      ansible.builtin.shell: |
        set -euo pipefail
        curl -fsSL {{ nvidia_repo_list_url }} | \
          sed 's#deb https://#deb [signed-by={{ nvidia_keyring_path }}] https://#g' \
          > /tmp/nvidia-container-toolkit.list
        install -m 0644 /tmp/nvidia-container-toolkit.list {{ nvidia_list_path }}
      args:
        creates: "{{ nvidia_list_path }}"
        executable: /bin/bash

    # Uncomments every line of the list file that mentions "experimental".
    - name: Activer la section experimental (décommenter)
      ansible.builtin.replace:
        path: "{{ nvidia_list_path }}"
        regexp: '^#(.*experimental.*)$'
        replace: '\1'

    - name: Mettre à jour la liste des paquets
      ansible.builtin.apt:
        update_cache: true

    - name: Installer les paquets NVIDIA Container Toolkit
      ansible.builtin.apt:
        name:
          - "nvidia-container-toolkit={{ nvidia_container_toolkit_version }}"
          - "nvidia-container-toolkit-base={{ nvidia_container_toolkit_version }}"
          - "libnvidia-container-tools={{ nvidia_container_toolkit_version }}"
          - "libnvidia-container1={{ nvidia_container_toolkit_version }}"
        state: present

  roles:
    # The plugin build below calls the Go toolchain from /usr/local/go/bin;
    # presumably this role installs it there — TODO confirm.
    - golang

  tasks:
    # Retried up to 10 times, 6 seconds apart, until apt succeeds.
    - name: Install dependencies
      ansible.builtin.apt:
        pkg:
          - nvidia-utils-580
          - nvidia-driver-580
          - nvidia-container-runtime
          - nomad-device-nvidia
        state: present
        install_recommends: true
        update_cache: true
      register: apt_status
      until: apt_status is success
      delay: 6
      retries: 10

    # Uses the `upstream_file_url` play variable rather than a second
    # hard-coded copy of the same URL. `force: true` discards local changes
    # in the checkout.
    - name: Nomad-nvidia-plugin | Git checkout
      ansible.builtin.git:
        repo: "{{ upstream_file_url }}"
        dest: "{{ build_work_dir }}/nomad-device-nvidia"
        version: main
        force: true

    # A non-zero rc already fails the task, so this `changed_when` expression
    # is false whenever it is evaluated on success: the build never reports
    # "changed".
    - name: Nomad-nvidia-plugin | Build binary
      ansible.builtin.command:
        cmd: make compile
        chdir: "{{ build_work_dir }}/nomad-device-nvidia"
      environment:
        PATH: "/usr/local/go/bin:{{ ansible_env.PATH }}"
      register: my_output
      changed_when: my_output.rc != 0

    - name: Create nomad plugin directory
      ansible.builtin.file:
        path: /opt/nomad/plugins
        state: directory
        owner: root
        group: root
        mode: "0755"

    # NOTE(review): the source path is presumably where `make compile` drops
    # the binary — confirm against the upstream makefile.
    - name: Nomad-nvidia-plugin | Copy binary
      ansible.builtin.copy:
        src: /tmp/nomad-plugins/nomad-device-nvidia
        dest: /opt/nomad/plugins/nomad-device-nvidia
        owner: root
        group: root
        mode: '0755'
        remote_src: true

    # Nomad agent configuration snippet enabling the device plugin.
    - name: Copy using inline content
      ansible.builtin.copy:
        content: |
          plugin "nomad-device-nvidia" {
            config {
              enabled = true
              fingerprint_period = "5s"
            }
          }
        dest: /etc/nomad.d/nvidia.hcl
        owner: root
        group: root
        mode: '0644'

    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
    # Named for what it does: it configures the Docker runtime, it does not
    # test anything. The command is run on every play, hence always "changed".
    - name: Nomad-nvidia-plugin | Configure docker runtime
      ansible.builtin.command: nvidia-ctk runtime configure --runtime=docker
      changed_when: true

    - name: Nomad-nvidia-plugin | Restart docker
      ansible.builtin.systemd:
        name: docker
        state: restarted

    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html
    # Smoke test only (`--rm` removes the container), so it never reports
    # "changed".
    - name: Nomad-nvidia-plugin | Test nvidia support
      ansible.builtin.command: docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
      register: docker_run
      changed_when: false

    - name: Nomad-nvidia-plugin | Debug
      ansible.builtin.debug:
        msg: "{{ docker_run }}"
32 changes: 32 additions & 0 deletions ansible/playbooks/paas/partition-data.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
# Prepares /dev/sdb as the data disk: creates partition 1, formats it as ext4
# and mounts it on /data.
- name: Configure sdb partition
  any_errors_fatal: true
  hosts: "{{ hosts_limit | default('infrastructure') }}"
  gather_facts: true
  become: true
  tasks:
    - name: Create default directory
      ansible.builtin.file:
        path: /data
        state: directory
        owner: root
        group: root
        mode: '0755'

    - name: Create a new primary partition /dev/sdb1
      community.general.parted:
        device: /dev/sdb
        number: 1
        state: present

    - name: Create a ext4 filesystem on /dev/sdb1
      community.general.filesystem:
        fstype: ext4
        dev: /dev/sdb1

    # `mounted` both records the entry in fstab and mounts the device now;
    # `present` would only write fstab and leave /data unmounted until the
    # next reboot or a manual mount.
    - name: Mount up device
      ansible.posix.mount:
        path: /data
        src: /dev/sdb1
        fstype: ext4
        state: mounted
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ docker_private_registry_state: false
docker_private_registry_url: ""
docker_private_registry_username: ""
docker_private_registry_password: ""
docker_private_registry_config: /etc/docker/config.json
docker_private_registry_config:
- /etc/docker/config.json
- /root/.docker/config.json

# DNS
docker_dns_configuration: true
Expand Down
22 changes: 12 additions & 10 deletions ansible/playbooks/paas/roles/ansible-docker/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,7 @@
url: "https://download.docker.com/linux/{{ ansible_distribution | lower }}/gpg"
dest: /etc/apt/keyrings/docker.asc

- name: Add Docker repository on ubuntu < 24.04
ansible.builtin.apt_repository:
repo: "deb [arch={{ upstream_default_arch }} signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
state: present
filename: docker
when: ansible_distribution_version is version('24.04', '<')

- name: Add Docker repository on ubuntu >= 24.04
- name: Add Docker repository on ubuntu
ansible.builtin.copy:
content: |
Components: stable
Expand All @@ -42,7 +35,6 @@
owner: root
group: root
mode: '0644'
when: ansible_distribution_version is version('24.04', '>=')

- name: Install Docker
ansible.builtin.apt:
Expand All @@ -68,14 +60,24 @@
append: true
notify: Docker_restart

# Ensures root's Docker client directory exists before config.json is
# templated into it. `recurse` is deliberately not set: it would re-apply this
# 0755 mode to files already inside the directory (such as the 0600
# config.json) on every run, making the play report changes each time.
# `state: directory` already creates missing parent directories.
- name: Create home docker directory
  ansible.builtin.file:
    path: "{{ item }}"
    state: directory
    mode: '0755'
  loop:
    - /root/.docker

- name: Copy config.json
ansible.builtin.template:
src: config.json.j2
dest: "{{ docker_private_registry_config }}"
dest: "{{ item }}"
owner: root
group: root
mode: '0600'
when: docker_private_registry_state
loop: "{{ docker_private_registry_config }}"
notify: Docker_restart

- name: Copy daemon.json for DNS resolution
Expand Down
4 changes: 4 additions & 0 deletions ansible/playbooks/paas/roles/coredns/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Role: `coredns`

## How to use this Ansible role?

### nomad cluster mode

`nomad_primary_master_node`: set this to the primary Nomad master node; it is used to get `nomad_management_token`.
Loading