From fe8270363e7cba07dae39f3b9e61265eff6a6aec Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Wed, 18 Feb 2026 10:53:18 +0100 Subject: [PATCH 1/5] feat: add Grafana Mimir metrics infrastructure - Add Mimir 2-node cluster (memberlist gossip, replication_factor:2) - Add Grafana dashboard service with sub-path routing (/grafana/) - Add authenticated metrics proxy endpoint (ANY /api/mimir/*) - HTTP Basic Auth: system_key + system_secret (no JWT) - Adds X-Scope-OrgID = organization_id for multi-tenancy - Streaming proxy (no buffering) for large metric payloads - Add MIMIR_URL config field (default: http://localhost:9009) - Update nginx to route /mimir/ -> backend and /grafana/ -> Grafana - Add mimir/docker-compose.yml for dedicated metrics VM (Server B) - Remove mimir/grafana from main docker-compose (moved to mimir/) - Update render.yaml: pserv mimir1+mimir2 (prod+qa), Grafana web service - Add OpenAPI documentation for /api/mimir/{path} endpoint - Add user documentation (EN+IT) and operator README Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- backend/openapi.yaml | 127 ++++++++++++++++++- collect/.env.example | 3 + collect/configuration/configuration.go | 17 +++ collect/main.go | 16 ++- collect/methods/mimir.go | 146 ++++++++++++++++++++++ docs/en/08-metrics.md | 165 +++++++++++++++++++++++++ docs/it/08-metrics.md | 165 +++++++++++++++++++++++++ mkdocs.yml | 2 + proxy/entrypoint.sh | 2 +- proxy/nginx.conf | 37 ++++++ proxy/nginx.conf.local | 29 +++++ render.yaml | 2 +- services/mimir/Containerfile | 3 + services/mimir/my.yaml | 7 ++ services/mimir/runtime_config.yaml | 9 ++ 15 files changed, 725 insertions(+), 5 deletions(-) create mode 100644 collect/methods/mimir.go create mode 100644 docs/en/08-metrics.md create mode 100644 docs/it/08-metrics.md create mode 100644 services/mimir/runtime_config.yaml diff --git a/backend/openapi.yaml b/backend/openapi.yaml index 25b413e79..4b6f794c6 100644 --- a/backend/openapi.yaml +++ 
b/backend/openapi.yaml @@ -62,7 +62,8 @@ tags: description: Collect service system management and inventory collection - name: Collect - Rebranding description: Collect service rebranding endpoints for systems - + - name: Collect - Metrics + description: Collect service metrics proxy to Mimir (Prometheus remote_write and query) security: - BearerAuth: [] @@ -8231,3 +8232,127 @@ paths: format: binary '404': $ref: '#/components/responses/NotFound' + + # =========================================== + # METRICS ENDPOINTS (Collect - Mimir Proxy) + # =========================================== + + /api/services/mimir/{path}: + parameters: + - name: path + in: path + required: true + schema: + type: string + description: Wildcard path forwarded to Mimir (e.g. `api/v1/push`, `prometheus/api/v1/query`) + get: + operationId: mimirProxyGet + tags: + - Collect - Metrics + summary: Proxy GET request to Mimir + description: | + Authenticates the system using HTTP Basic Auth (`system_key` as username, + `system_secret` as password), injects the `X-Scope-OrgID` header with the + system's `organization_id` for multi-tenant isolation, and reverse-proxies + the request to Mimir. Typical use: Grafana PromQL queries via + `GET /api/services/mimir/prometheus/api/v1/query`. + security: + - BasicAuth: [] + responses: + '200': + description: Proxied response from Mimir + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '500': + $ref: '#/components/responses/InternalServerError' + post: + operationId: mimirProxyPost + tags: + - Collect - Metrics + summary: Proxy POST request to Mimir + description: | + Authenticates the system using HTTP Basic Auth (`system_key` as username, + `system_secret` as password), injects the `X-Scope-OrgID` header with the + system's `organization_id` for multi-tenant isolation, and reverse-proxies + the request to Mimir. 
Primary use case: Prometheus `remote_write` ingestion + via `POST /api/services/mimir/api/v1/push` from NethServer systems. + security: + - BasicAuth: [] + requestBody: + description: Request body forwarded as-is to Mimir (e.g. Prometheus remote_write protobuf payload) + required: false + content: + application/x-protobuf: + schema: + type: string + format: binary + application/json: + schema: + type: object + responses: + '200': + description: Proxied response from Mimir + '204': + description: No content — Mimir acknowledged the write with no response body + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '500': + $ref: '#/components/responses/InternalServerError' + put: + operationId: mimirProxyPut + tags: + - Collect - Metrics + summary: Proxy PUT request to Mimir + description: | + Authenticates the system using HTTP Basic Auth (`system_key` as username, + `system_secret` as password), injects the `X-Scope-OrgID` header with the + system's `organization_id` for multi-tenant isolation, and reverse-proxies + the request to Mimir. + security: + - BasicAuth: [] + requestBody: + description: Request body forwarded as-is to Mimir + required: false + content: + application/json: + schema: + type: object + responses: + '200': + description: Proxied response from Mimir + '204': + description: No content + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '500': + $ref: '#/components/responses/InternalServerError' + delete: + operationId: mimirProxyDelete + tags: + - Collect - Metrics + summary: Proxy DELETE request to Mimir + description: | + Authenticates the system using HTTP Basic Auth (`system_key` as username, + `system_secret` as password), injects the `X-Scope-OrgID` header with the + system's `organization_id` for multi-tenant isolation, and reverse-proxies + the request to Mimir. 
+ security: + - BasicAuth: [] + responses: + '200': + description: Proxied response from Mimir + '204': + description: No content + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '500': + $ref: '#/components/responses/InternalServerError' + diff --git a/collect/.env.example b/collect/.env.example index 789f16aec..510c9717e 100644 --- a/collect/.env.example +++ b/collect/.env.example @@ -86,6 +86,9 @@ REDIS_URL=redis://localhost:6379 #CIRCUIT_BREAKER_THRESHOLD=10 #CIRCUIT_BREAKER_TIMEOUT=60s +# Mimir metrics storage (comma-separated list for HA) +#MIMIR_URLS=http://localhost:9009,http://localhost:9010 + # Logging configuration #LOG_LEVEL=info #LOG_FORMAT=json diff --git a/collect/configuration/configuration.go b/collect/configuration/configuration.go index 537d50126..fe8b56423 100644 --- a/collect/configuration/configuration.go +++ b/collect/configuration/configuration.go @@ -13,6 +13,7 @@ import ( "fmt" "os" "strconv" + "strings" "time" "github.com/nethesis/my/collect/logger" @@ -80,6 +81,9 @@ type Configuration struct { // Heartbeat monitoring configuration HeartbeatTimeoutMinutes int `json:"heartbeat_timeout_minutes"` + + // Mimir configuration + MimirURLs []string `json:"mimir_urls"` } var Config = Configuration{} @@ -161,6 +165,19 @@ func Init() { // Heartbeat monitoring configuration Config.HeartbeatTimeoutMinutes = parseIntWithDefault("HEARTBEAT_TIMEOUT_MINUTES", 10) + // Mimir configuration + if mimirURLs := os.Getenv("MIMIR_URLS"); mimirURLs != "" { + for _, u := range strings.Split(mimirURLs, ",") { + u = strings.TrimSpace(u) + if u != "" { + Config.MimirURLs = append(Config.MimirURLs, u) + } + } + } + if len(Config.MimirURLs) == 0 { + Config.MimirURLs = []string{"http://localhost:9009"} + } + // Log successful configuration load logger.LogConfigLoad("env", "configuration", true, nil) } diff --git a/collect/main.go b/collect/main.go index a015f2e20..26a05366c 100644 --- a/collect/main.go +++ 
b/collect/main.go @@ -105,8 +105,11 @@ func main() { // Add security monitoring middleware router.Use(logger.SecurityMiddleware()) - // Add compression - router.Use(gzip.Gzip(gzip.DefaultCompression)) + // Add compression (excluding Mimir proxy endpoints to avoid double-compression) + router.Use(gzip.Gzip( + gzip.DefaultCompression, + gzip.WithExcludedPathsRegexs([]string{"^/api/services/mimir"}), + )) // CORS configuration in debug mode if gin.Mode() == gin.DebugMode { @@ -158,6 +161,15 @@ func main() { systemsGroup.GET("/rebranding/:product_id/:asset", methods.GetSystemRebrandingAsset) } + // =========================================== + // EXTERNAL SERVICES PROXY + // =========================================== + servicesGroup := api.Group("/services", middleware.BasicAuthMiddleware()) + { + mimirProxy := servicesGroup.Group("/mimir") + mimirProxy.Any("/*path", methods.ProxyMimir) + } + // Handle missing endpoints router.NoRoute(func(c *gin.Context) { c.JSON(http.StatusNotFound, response.NotFound("api not found", nil)) diff --git a/collect/methods/mimir.go b/collect/methods/mimir.go new file mode 100644 index 000000000..b1519dfa1 --- /dev/null +++ b/collect/methods/mimir.go @@ -0,0 +1,146 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package methods + +import ( + "bytes" + "database/sql" + "fmt" + "io" + "math/rand" + "net/http" + + "github.com/gin-gonic/gin" + + "github.com/nethesis/my/collect/configuration" + "github.com/nethesis/my/collect/database" + "github.com/nethesis/my/collect/logger" + "github.com/nethesis/my/collect/response" +) + +// ProxyMimir handles ANY /api/services/mimir/* — the BasicAuthMiddleware has +// already validated system credentials and placed system_id in the context. +// This handler resolves the organization_id, sets X-Scope-OrgID, and +// reverse-proxies the request to Mimir with HA support across multiple instances. 
+func ProxyMimir(c *gin.Context) { + // Step 1: Get system_id from context (set by BasicAuthMiddleware) + systemID, ok := getAuthenticatedSystemID(c) + if !ok { + logger.Warn().Str("reason", "missing system_id in context").Msg("mimir proxy auth failed") + c.JSON(http.StatusUnauthorized, response.Unauthorized("unauthorized", nil)) + return + } + + // Step 2: Query organization_id for this system + var organizationID string + err := database.DB.QueryRow( + `SELECT organization_id FROM systems WHERE id = $1`, + systemID, + ).Scan(&organizationID) + + if err == sql.ErrNoRows { + logger.Warn().Str("system_id", systemID).Str("reason", "system not found").Msg("mimir proxy: system lookup failed") + c.JSON(http.StatusUnauthorized, response.Unauthorized("unauthorized", nil)) + return + } + if err != nil { + logger.Error().Err(err).Str("system_id", systemID).Msg("mimir proxy: db query failed") + c.JSON(http.StatusInternalServerError, response.InternalServerError("internal server error", nil)) + return + } + + // Step 3: Buffer request body once so it can be replayed across retry attempts + bodyBytes, err := io.ReadAll(c.Request.Body) + if err != nil { + logger.Error().Err(err).Msg("mimir proxy: failed to read request body") + c.JSON(http.StatusInternalServerError, response.InternalServerError("internal server error", nil)) + return + } + + subPath := c.Param("path") + rawQuery := c.Request.URL.RawQuery + + // Step 4: Try each Mimir instance starting from a random index + urls := configuration.Config.MimirURLs + n := len(urls) + start := rand.Intn(n) + + for i := 0; i < n; i++ { + base := urls[(start+i)%n] + targetURL := fmt.Sprintf("%s%s", base, subPath) + if rawQuery != "" { + targetURL += "?" 
+ rawQuery + } + + logger.Info().Str("target", targetURL).Int("attempt", i+1).Msg("mimir proxy: trying instance") + + req, err := http.NewRequest(c.Request.Method, targetURL, bytes.NewReader(bodyBytes)) + if err != nil { + logger.Warn().Err(err).Str("target", targetURL).Msg("mimir proxy: failed to create upstream request") + continue + } + + for _, header := range []string{"Content-Type", "Content-Encoding", "Accept", "User-Agent"} { + if val := c.GetHeader(header); val != "" { + req.Header.Set(header, val) + } + } + // Remove Accept-Encoding so Mimir sends plain JSON, not gzip + req.Header.Del("Accept-Encoding") + req.Header.Set("X-Scope-OrgID", organizationID) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + logger.Warn().Err(err).Str("target", targetURL).Msg("mimir proxy: network error, trying next instance") + continue + } + + // Return 4xx immediately — client errors are not retried + if resp.StatusCode >= 400 && resp.StatusCode < 500 { + defer func() { + if err := resp.Body.Close(); err != nil { + logger.Error().Err(err).Msg("mimir proxy: failed to close upstream response body") + } + }() + if ct := resp.Header.Get("Content-Type"); ct != "" { + c.Header("Content-Type", ct) + } + c.Status(resp.StatusCode) + if _, err := io.Copy(c.Writer, resp.Body); err != nil { + logger.Error().Err(err).Msg("mimir proxy: error streaming response body") + } + return + } + + // Retry on 5xx + if resp.StatusCode >= 500 { + if err := resp.Body.Close(); err != nil { + logger.Error().Err(err).Msg("mimir proxy: failed to close upstream response body") + } + logger.Warn().Str("target", targetURL).Int("status", resp.StatusCode).Msg("mimir proxy: 5xx response, trying next instance") + continue + } + + // Success: stream response back to client + defer func() { + if err := resp.Body.Close(); err != nil { + logger.Error().Err(err).Msg("mimir proxy: failed to close upstream response body") + } + }() + if ct := resp.Header.Get("Content-Type"); ct != "" { + 
c.Header("Content-Type", ct) + } + c.Status(resp.StatusCode) + if _, err := io.Copy(c.Writer, resp.Body); err != nil { + logger.Error().Err(err).Msg("mimir proxy: error streaming response body") + } + return + } + + // All instances failed + logger.Error().Int("instances_tried", n).Msg("mimir proxy: all instances failed") + c.JSON(http.StatusBadGateway, response.InternalServerError("all mimir instances are unavailable", nil)) +} diff --git a/docs/en/08-metrics.md b/docs/en/08-metrics.md new file mode 100644 index 000000000..20e2ce556 --- /dev/null +++ b/docs/en/08-metrics.md @@ -0,0 +1,165 @@ +# Metrics + +Learn how external systems push Prometheus metrics to My platform and how to visualize them in Grafana. + +## Overview + +My platform supports Prometheus metrics collection via [Grafana Mimir](https://grafana.com/oss/mimir/). Any registered NethServer or NethSecurity system can push metrics using the standard Prometheus `remote_write` protocol. Metrics are isolated per organization and visible in Grafana dashboards. + +## How It Works + +### Metrics Ingestion Flow + +``` +┌─────────────────┐ ┌──────────────┐ +│ NethServer / │ POST /services/mimir/api/v1/push │ │ +│ NethSecurity │ ─────────────────────────────────> │ nginx │ +│ │ Basic Auth: system_key:system_secret │ │ +└─────────────────┘ └──────┬───────┘ + │ + │ /api/services/mimir/ + v + ┌──────────────┐ + │ Collect │ + │ │ + │ 1. Validate │ + │ Basic │ + │ Auth │ + │ 2. Set │ + │ X-Scope- │ + │ OrgID: │ + │ │ + └──────┬───────┘ + │ + v + ┌──────────────┐ + │ Mimir │ + │ (private) │ + └──────────────┘ +``` + +### Grafana Access Flow + +``` +┌─────────────┐ ┌──────────────┐ ┌──────────────┐ +│ Browser │ /grafana/ │ nginx │ │ Grafana │ +│ │ ─────────────> │ │ --> │ │ +│ │ └──────────────┘ └──────┬───────┘ +└─────────────┘ │ + │ queries Mimir + │ X-Scope-OrgID + v + ┌──────────────┐ + │ Mimir │ + │ (private) │ + └──────────────┘ +``` + +### Multi-Tenancy + +Each system belongs to an organization. 
The collect service resolves the system's `organization_id` from its credentials and injects it as the `X-Scope-OrgID` header before forwarding to Mimir. This ensures metrics are fully isolated between organizations — each organization only sees its own data. + +## Authentication + +Metrics push uses the same credentials as system registration and inventory: + +| Field | Value | +|-------|-------| +| **Username** | `system_key` (e.g. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | +| **Password** | `system_secret` (e.g. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | +| **Method** | HTTP Basic Auth | + +No separate registration is needed — any system that has completed registration can immediately push metrics. See [System Registration](05-system-registration.md) for how to obtain credentials. + +## Configuring Prometheus `remote_write` + +Add the following block to your Prometheus configuration (`/etc/prometheus/prometheus.yml` or equivalent): + +```yaml +remote_write: + - url: https://my.nethesis.it/services/mimir/api/v1/push + basic_auth: + username: <system_key> + password: <system_secret> +``` + +Replace `<system_key>` and `<system_secret>` with the actual credentials stored on the system. + +**Example with real-looking values:** +```yaml +remote_write: + - url: https://my.nethesis.it/services/mimir/api/v1/push + basic_auth: + username: NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE + password: my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0 +``` + +After updating the configuration, reload Prometheus: +```bash +systemctl reload prometheus +# or send SIGHUP +kill -HUP $(pidof prometheus) +``` + +!!! tip + Prometheus will start forwarding all scraped metrics to My. Use `prometheus_remote_storage_samples_total` in your local Prometheus to verify metrics are being sent. 
+ +## Accessing Grafana + +Grafana is available at: + +``` +https://my.nethesis.it/grafana/ +``` + +Dashboards are **per-organization**: each organization's users can only see metrics collected from systems belonging to their organization. The tenant isolation is enforced automatically via `X-Scope-OrgID`. + +!!! note + Grafana access is managed by your platform administrator. Contact them to get access or to request custom dashboards for your organization. + +## Troubleshooting + +### HTTP 401 Unauthorized + +**Cause:** Incorrect `system_key` or `system_secret`. + +**Solutions:** +1. Verify credentials match what is stored on the system +2. Ensure the system has completed registration (see [System Registration](05-system-registration.md)) +3. Check for leading/trailing spaces in the credentials +4. Test manually: + ```bash + curl -X POST https://my.nethesis.it/services/mimir/api/v1/push \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" \ + -H "Content-Type: application/x-protobuf" \ + --data-binary @/dev/null + ``` + A `400 Bad Request` (not 401) confirms authentication is working. + +### HTTP 500 Internal Server Error / 502 Bad Gateway + +**Cause:** The collect service hit an internal error (500), or the Mimir backend is unreachable or misconfigured (502). + +**Solutions:** +1. This is a platform-side issue — contact your administrator +2. Check the platform status page or monitoring alerts +3. Retry after a few minutes; Mimir may be restarting + +### Metrics Not Appearing in Grafana + +**Cause:** Metrics are being sent but not yet visible. + +**Solutions:** +1. Wait 1–2 minutes — Mimir has an ingestion delay +2. Verify `remote_write` is enabled in Prometheus and the configuration is correct +3. Check Prometheus logs for remote write errors: + ```bash + journalctl -u prometheus -n 50 | grep remote_write + ``` +4. 
Confirm you are logged in to Grafana with an account that belongs to the correct organization + +## Related Documentation + +- [System Registration](05-system-registration.md) +- [Inventory and Heartbeat](06-inventory-heartbeat.md) +- [Systems Management](04-systems.md) diff --git a/docs/it/08-metrics.md b/docs/it/08-metrics.md new file mode 100644 index 000000000..a07c9c668 --- /dev/null +++ b/docs/it/08-metrics.md @@ -0,0 +1,165 @@ +# Metriche + +Scopri come i sistemi esterni inviano metriche Prometheus alla piattaforma My e come visualizzarle in Grafana. + +## Panoramica + +La piattaforma My supporta la raccolta di metriche Prometheus tramite [Grafana Mimir](https://grafana.com/oss/mimir/). Qualsiasi sistema NethServer o NethSecurity registrato può inviare metriche usando il protocollo standard Prometheus `remote_write`. Le metriche sono isolate per organizzazione e visibili nelle dashboard Grafana. + +## Come Funziona + +### Flusso di Acquisizione Metriche + +``` +┌─────────────────┐ ┌──────────────┐ +│ NethServer / │ POST /services/mimir/api/v1/push │ │ +│ NethSecurity │ ─────────────────────────────────> │ nginx │ +│ │ Basic Auth: system_key:system_secret │ │ +└─────────────────┘ └──────┬───────┘ + │ + │ /api/services/mimir/ + v + ┌──────────────┐ + │ Collect │ + │ │ + │ 1. Validate │ + │ Basic │ + │ Auth │ + │ 2. Set │ + │ X-Scope- │ + │ OrgID: │ + │ │ + └──────┬───────┘ + │ + v + ┌──────────────┐ + │ Mimir │ + │ (private) │ + └──────────────┘ +``` + +### Flusso di Accesso Grafana + +``` +┌─────────────┐ ┌──────────────┐ ┌──────────────┐ +│ Browser │ /grafana/ │ nginx │ │ Grafana │ +│ │ ─────────────> │ │ --> │ │ +│ │ └──────────────┘ └──────┬───────┘ +└─────────────┘ │ + │ queries Mimir + │ X-Scope-OrgID + v + ┌──────────────┐ + │ Mimir │ + │ (private) │ + └──────────────┘ +``` + +### Multi-Tenancy + +Ogni sistema appartiene a un'organizzazione. 
Il servizio collect risolve l'`organization_id` del sistema dalle sue credenziali e lo inietta come header `X-Scope-OrgID` prima di inoltrare la richiesta a Mimir. Questo garantisce che le metriche siano completamente isolate tra le organizzazioni — ogni organizzazione vede solo i propri dati. + +## Autenticazione + +L'invio delle metriche usa le stesse credenziali della registrazione del sistema e dell'inventario: + +| Campo | Valore | +|-------|--------| +| **Username** | `system_key` (es. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | +| **Password** | `system_secret` (es. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | +| **Metodo** | HTTP Basic Auth | + +Non è necessaria una registrazione separata — qualsiasi sistema che ha completato la registrazione può inviare metriche immediatamente. Consulta [Registrazione Sistema](05-system-registration.md) per come ottenere le credenziali. + +## Configurazione di Prometheus `remote_write` + +Aggiungi il seguente blocco alla configurazione di Prometheus (`/etc/prometheus/prometheus.yml` o equivalente): + +```yaml +remote_write: + - url: https://my.nethesis.it/services/mimir/api/v1/push + basic_auth: + username: <system_key> + password: <system_secret> +``` + +Sostituisci `<system_key>` e `<system_secret>` con le credenziali effettive memorizzate sul sistema. + +**Esempio con valori realistici:** +```yaml +remote_write: + - url: https://my.nethesis.it/services/mimir/api/v1/push + basic_auth: + username: NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE + password: my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0 +``` + +Dopo aver aggiornato la configurazione, ricarica Prometheus: +```bash +systemctl reload prometheus +# oppure invia SIGHUP +kill -HUP $(pidof prometheus) +``` + +!!! tip + Prometheus inizierà a inoltrare tutte le metriche raccolte a My. Usa `prometheus_remote_storage_samples_total` nel tuo Prometheus locale per verificare che le metriche vengano inviate. 
+ +## Accesso a Grafana + +Grafana è disponibile all'indirizzo: + +``` +https://my.nethesis.it/grafana/ +``` + +Le dashboard sono **per organizzazione**: gli utenti di ciascuna organizzazione possono vedere solo le metriche raccolte dai sistemi appartenenti alla propria organizzazione. L'isolamento del tenant è applicato automaticamente tramite `X-Scope-OrgID`. + +!!! note + L'accesso a Grafana è gestito dall'amministratore della piattaforma. Contattalo per ottenere l'accesso o per richiedere dashboard personalizzate per la tua organizzazione. + +## Risoluzione Problemi + +### HTTP 401 Unauthorized + +**Causa:** `system_key` o `system_secret` non corretti. + +**Soluzioni:** +1. Verifica che le credenziali corrispondano a quelle memorizzate sul sistema +2. Assicurati che il sistema abbia completato la registrazione (vedi [Registrazione Sistema](05-system-registration.md)) +3. Controlla eventuali spazi iniziali o finali nelle credenziali +4. Testa manualmente: + ```bash + curl -X POST https://my.nethesis.it/services/mimir/api/v1/push \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" \ + -H "Content-Type: application/x-protobuf" \ + --data-binary @/dev/null + ``` + Una risposta `400 Bad Request` (non 401) conferma che l'autenticazione funziona. + +### HTTP 500 Internal Server Error / 502 Bad Gateway + +**Causa:** Il servizio collect ha riscontrato un errore interno (500), oppure il backend Mimir non è raggiungibile o è configurato in modo errato (502). + +**Soluzioni:** +1. Si tratta di un problema lato piattaforma — contatta il tuo amministratore +2. Controlla la pagina di stato della piattaforma o gli avvisi di monitoraggio +3. Riprova dopo qualche minuto; Mimir potrebbe essere in fase di riavvio + +### Metriche Non Visibili in Grafana + +**Causa:** Le metriche vengono inviate ma non sono ancora visibili. + +**Soluzioni:** +1. Attendi 1–2 minuti — Mimir ha un ritardo di acquisizione +2. Verifica che `remote_write` sia abilitato in Prometheus e che la configurazione sia corretta +3. 
Controlla i log di Prometheus per errori di remote write: + ```bash + journalctl -u prometheus -n 50 | grep remote_write + ``` +4. Conferma di essere connesso a Grafana con un account appartenente all'organizzazione corretta + +## Documentazione Correlata + +- [Registrazione Sistema](05-system-registration.md) +- [Inventario e Heartbeat](06-inventory-heartbeat.md) +- [Gestione Sistemi](04-systems.md) diff --git a/mkdocs.yml b/mkdocs.yml index 60966e595..ee6a92a96 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -131,6 +131,7 @@ plugins: Systems Management: Gestione Sistemi System Registration: Registrazione Sistema Inventory & Heartbeat: Inventario e Heartbeat + Metrics: Metriche Developer Documentation: Documentazione Sviluppatori Main Project: Progetto Principale Backend API: API Backend @@ -172,6 +173,7 @@ nav: - Systems Management: 04-systems.md - System Registration: 05-system-registration.md - Inventory & Heartbeat: 06-inventory-heartbeat.md + - Metrics: 08-metrics.md - Developer Documentation: - Main Project: https://github.com/NethServer/my/blob/main/README.md - Backend API: https://github.com/NethServer/my/blob/main/backend/README.md diff --git a/proxy/entrypoint.sh b/proxy/entrypoint.sh index f32d6afff..18187b8cf 100644 --- a/proxy/entrypoint.sh +++ b/proxy/entrypoint.sh @@ -29,7 +29,7 @@ else fi echo '==> Substituting nginx config...' 
-envsubst '$PORT $BACKEND_SERVICE_NAME $COLLECT_SERVICE_NAME $FRONTEND_SERVICE_NAME' < /etc/nginx/nginx.conf > /tmp/nginx.conf +envsubst '$PORT $BACKEND_SERVICE_NAME $COLLECT_SERVICE_NAME $FRONTEND_SERVICE_NAME $GRAFANA_SERVICE_NAME' < /etc/nginx/nginx.conf > /tmp/nginx.conf echo '==> Generated upstream URLs:' grep -E 'set.*upstream' /tmp/nginx.conf || true diff --git a/proxy/nginx.conf b/proxy/nginx.conf index b65793d6c..23ed06e19 100644 --- a/proxy/nginx.conf +++ b/proxy/nginx.conf @@ -87,6 +87,43 @@ http { proxy_read_timeout 30s; } + # Mimir metrics - proxied through collect for auth + X-Scope-OrgID + location /services/mimir/ { + set $mimir_collect_upstream https://${COLLECT_SERVICE_NAME}.onrender.com; + proxy_pass $mimir_collect_upstream/api/services/mimir/; + proxy_ssl_server_name on; + proxy_ssl_verify off; + proxy_set_header Host ${COLLECT_SERVICE_NAME}.onrender.com; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + + # Higher timeouts for metrics writes and queries + proxy_connect_timeout 30s; + proxy_send_timeout 60s; + proxy_read_timeout 60s; + # Disable request buffering - critical for streaming remote_write + proxy_request_buffering off; + } + + # Grafana dashboard + location /grafana/ { + set $grafana_upstream https://${GRAFANA_SERVICE_NAME}.onrender.com; + proxy_pass $grafana_upstream/; + proxy_ssl_server_name on; + proxy_ssl_verify off; + proxy_set_header Host ${GRAFANA_SERVICE_NAME}.onrender.com; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + + proxy_connect_timeout 30s; + proxy_send_timeout 30s; + proxy_read_timeout 60s; + } + # Frontend routes - everything else location / { set $frontend_upstream https://${FRONTEND_SERVICE_NAME}.onrender.com; diff --git 
a/proxy/nginx.conf.local b/proxy/nginx.conf.local index 39ec91f27..252f9ac9a 100644 --- a/proxy/nginx.conf.local +++ b/proxy/nginx.conf.local @@ -77,6 +77,35 @@ http { proxy_read_timeout 30s; } + # Mimir metrics - proxied through collect for auth + X-Scope-OrgID + location /services/mimir/ { + proxy_pass http://collect-full:8080/api/services/mimir/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + + proxy_connect_timeout 30s; + proxy_send_timeout 60s; + proxy_read_timeout 60s; + proxy_request_buffering off; + } + + # Grafana dashboard + location /grafana/ { + proxy_pass http://grafana-full:3000/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + + proxy_connect_timeout 30s; + proxy_send_timeout 30s; + proxy_read_timeout 60s; + } + # Frontend routes - everything else location / { proxy_pass http://frontend-full:8080; diff --git a/render.yaml b/render.yaml index e2e9ca8c7..c367ec50c 100644 --- a/render.yaml +++ b/render.yaml @@ -397,4 +397,4 @@ services: property: host autoDeploy: true # Auto-deploy on every commit branch: main - pullRequestPreviewsEnabled: true # PR previews enabled \ No newline at end of file + pullRequestPreviewsEnabled: true # PR previews enabled diff --git a/services/mimir/Containerfile b/services/mimir/Containerfile index 7996785b0..3b4b631da 100644 --- a/services/mimir/Containerfile +++ b/services/mimir/Containerfile @@ -15,6 +15,9 @@ COPY .render-build-trigger /tmp/build-trigger # Copy Mimir config template COPY my.yaml /etc/mimir/my.yaml.template +# Copy default runtime configuration (per-tenant overrides, reloaded every 10s) +COPY runtime_config.yaml /etc/mimir/runtime_config.yaml + # Copy 
entrypoint script (must be executable in the repo) COPY entrypoint.sh /entrypoint.sh diff --git a/services/mimir/my.yaml b/services/mimir/my.yaml index 769b6dead..206e46ab9 100644 --- a/services/mimir/my.yaml +++ b/services/mimir/my.yaml @@ -35,3 +35,10 @@ store_gateway: server: http_listen_port: ${PORT} log_level: info + +limits: + compactor_blocks_retention_period: 24h + +runtime_config: + file: /etc/mimir/runtime_config.yaml + period: 10s diff --git a/services/mimir/runtime_config.yaml b/services/mimir/runtime_config.yaml new file mode 100644 index 000000000..ba1404072 --- /dev/null +++ b/services/mimir/runtime_config.yaml @@ -0,0 +1,9 @@ +# Mimir runtime configuration — reloaded every 10s without restart. +# Use this file to set per-tenant limit overrides. +# +# Example: +# overrides: +# my-tenant-id: +# ingestion_rate: 10000 +# max_label_names_per_series: 30 +# compactor_blocks_retention_period: 48h From 357ce224a5c5609b565ac7e64dd2f40e19cd5bbc Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Fri, 20 Feb 2026 10:04:59 +0100 Subject: [PATCH 2/5] feat: enable alertmanager --- services/mimir/my.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/services/mimir/my.yaml b/services/mimir/my.yaml index 206e46ab9..43280b832 100644 --- a/services/mimir/my.yaml +++ b/services/mimir/my.yaml @@ -21,6 +21,16 @@ ruler_storage: s3: bucket_name: ${MIMIR_S3_RULER_BUCKET} +alertmanager: + enabled: true + data_dir: /tmp/mimir/alertmanager + poll_interval: 15s + max_config_size_bytes: 1048576 # 1MB + enable_api: true + persist_interval: 15m + retention: 120h + external_url: https://my-collect-qa-pr-41.onrender.com/api/services/mimir/alertmanager + compactor: data_dir: /tmp/mimir/compactor From e10563022074f2fdace5115f6718ee5b3f15c94a Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Fri, 20 Feb 2026 11:40:56 +0100 Subject: [PATCH 3/5] fix --- .github/workflows/ci-main.yml | 5 +- collect/.env.example | 4 +- collect/configuration/configuration.go | 17 
+-- collect/methods/mimir.go | 108 ++++++---------- docs/en/08-alerting.md | 93 ++++++++++++++ docs/en/08-metrics.md | 165 ------------------------- docs/it/08-alerting.md | 107 ++++++++++++++++ docs/it/08-metrics.md | 165 ------------------------- mkdocs.yml | 4 +- render.yaml | 8 -- services/mimir/.env.example | 4 +- services/mimir/README.md | 16 ++- services/mimir/docker-compose.yml | 6 +- services/mimir/entrypoint.sh | 4 +- services/mimir/my.yaml | 36 +----- 15 files changed, 264 insertions(+), 478 deletions(-) create mode 100644 docs/en/08-alerting.md delete mode 100644 docs/en/08-metrics.md create mode 100644 docs/it/08-alerting.md delete mode 100644 docs/it/08-metrics.md diff --git a/.github/workflows/ci-main.yml b/.github/workflows/ci-main.yml index 86ef14f47..c4b32703c 100644 --- a/.github/workflows/ci-main.yml +++ b/.github/workflows/ci-main.yml @@ -137,6 +137,7 @@ jobs: context: proxy - component: mimir context: services/mimir + image_description: Mimir alertmanager backend for My Nethesis steps: - uses: actions/checkout@v4 @@ -170,7 +171,9 @@ jobs: platforms: linux/amd64,linux/arm64 push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + labels: | + ${{ steps.meta.outputs.labels }} + ${{ matrix.image_description != '' && format('org.opencontainers.image.description={0}', matrix.image_description) || '' }} cache-from: type=gha,scope=${{ matrix.component }} cache-to: type=gha,mode=max,scope=${{ matrix.component }} build-args: | diff --git a/collect/.env.example b/collect/.env.example index 510c9717e..f7950befa 100644 --- a/collect/.env.example +++ b/collect/.env.example @@ -86,8 +86,8 @@ REDIS_URL=redis://localhost:6379 #CIRCUIT_BREAKER_THRESHOLD=10 #CIRCUIT_BREAKER_TIMEOUT=60s -# Mimir metrics storage (comma-separated list for HA) -#MIMIR_URLS=http://localhost:9009,http://localhost:9010 +# Mimir metrics storage +#MIMIR_URL=http://localhost:9009 # Logging configuration 
#LOG_LEVEL=info diff --git a/collect/configuration/configuration.go b/collect/configuration/configuration.go index fe8b56423..b8d172b6c 100644 --- a/collect/configuration/configuration.go +++ b/collect/configuration/configuration.go @@ -13,7 +13,6 @@ import ( "fmt" "os" "strconv" - "strings" "time" "github.com/nethesis/my/collect/logger" @@ -83,7 +82,7 @@ type Configuration struct { HeartbeatTimeoutMinutes int `json:"heartbeat_timeout_minutes"` // Mimir configuration - MimirURLs []string `json:"mimir_urls"` + MimirURL string `json:"mimir_url"` } var Config = Configuration{} @@ -166,16 +165,10 @@ func Init() { Config.HeartbeatTimeoutMinutes = parseIntWithDefault("HEARTBEAT_TIMEOUT_MINUTES", 10) // Mimir configuration - if mimirURLs := os.Getenv("MIMIR_URLS"); mimirURLs != "" { - for _, u := range strings.Split(mimirURLs, ",") { - u = strings.TrimSpace(u) - if u != "" { - Config.MimirURLs = append(Config.MimirURLs, u) - } - } - } - if len(Config.MimirURLs) == 0 { - Config.MimirURLs = []string{"http://localhost:9009"} + if mimirURL := os.Getenv("MIMIR_URL"); mimirURL != "" { + Config.MimirURL = mimirURL + } else { + Config.MimirURL = "http://localhost:9009" } // Log successful configuration load diff --git a/collect/methods/mimir.go b/collect/methods/mimir.go index b1519dfa1..7586c674c 100644 --- a/collect/methods/mimir.go +++ b/collect/methods/mimir.go @@ -10,7 +10,6 @@ import ( "database/sql" "fmt" "io" - "math/rand" "net/http" "github.com/gin-gonic/gin" @@ -63,84 +62,47 @@ func ProxyMimir(c *gin.Context) { subPath := c.Param("path") rawQuery := c.Request.URL.RawQuery - // Step 4: Try each Mimir instance starting from a random index - urls := configuration.Config.MimirURLs - n := len(urls) - start := rand.Intn(n) - - for i := 0; i < n; i++ { - base := urls[(start+i)%n] - targetURL := fmt.Sprintf("%s%s", base, subPath) - if rawQuery != "" { - targetURL += "?" 
+ rawQuery - } - - logger.Info().Str("target", targetURL).Int("attempt", i+1).Msg("mimir proxy: trying instance") - - req, err := http.NewRequest(c.Request.Method, targetURL, bytes.NewReader(bodyBytes)) - if err != nil { - logger.Warn().Err(err).Str("target", targetURL).Msg("mimir proxy: failed to create upstream request") - continue - } - - for _, header := range []string{"Content-Type", "Content-Encoding", "Accept", "User-Agent"} { - if val := c.GetHeader(header); val != "" { - req.Header.Set(header, val) - } - } - // Remove Accept-Encoding so Mimir sends plain JSON, not gzip - req.Header.Del("Accept-Encoding") - req.Header.Set("X-Scope-OrgID", organizationID) + // Step 4: Forward request to Mimir + targetURL := fmt.Sprintf("%s%s", configuration.Config.MimirURL, subPath) + if rawQuery != "" { + targetURL += "?" + rawQuery + } - resp, err := http.DefaultClient.Do(req) - if err != nil { - logger.Warn().Err(err).Str("target", targetURL).Msg("mimir proxy: network error, trying next instance") - continue - } + logger.Info().Str("target", targetURL).Msg("mimir proxy: forwarding request") - // Return 4xx immediately — client errors are not retried - if resp.StatusCode >= 400 && resp.StatusCode < 500 { - defer func() { - if err := resp.Body.Close(); err != nil { - logger.Error().Err(err).Msg("mimir proxy: failed to close upstream response body") - } - }() - if ct := resp.Header.Get("Content-Type"); ct != "" { - c.Header("Content-Type", ct) - } - c.Status(resp.StatusCode) - if _, err := io.Copy(c.Writer, resp.Body); err != nil { - logger.Error().Err(err).Msg("mimir proxy: error streaming response body") - } - return - } + req, err := http.NewRequest(c.Request.Method, targetURL, bytes.NewReader(bodyBytes)) + if err != nil { + logger.Error().Err(err).Str("target", targetURL).Msg("mimir proxy: failed to create upstream request") + c.JSON(http.StatusInternalServerError, response.InternalServerError("internal server error", nil)) + return + } - // Retry on 5xx - if 
resp.StatusCode >= 500 { - if err := resp.Body.Close(); err != nil { - logger.Error().Err(err).Msg("mimir proxy: failed to close upstream response body") - } - logger.Warn().Str("target", targetURL).Int("status", resp.StatusCode).Msg("mimir proxy: 5xx response, trying next instance") - continue + for _, header := range []string{"Content-Type", "Content-Encoding", "Accept", "User-Agent"} { + if val := c.GetHeader(header); val != "" { + req.Header.Set(header, val) } + } + // Remove Accept-Encoding so Mimir sends plain JSON, not gzip + req.Header.Del("Accept-Encoding") + req.Header.Set("X-Scope-OrgID", organizationID) - // Success: stream response back to client - defer func() { - if err := resp.Body.Close(); err != nil { - logger.Error().Err(err).Msg("mimir proxy: failed to close upstream response body") - } - }() - if ct := resp.Header.Get("Content-Type"); ct != "" { - c.Header("Content-Type", ct) - } - c.Status(resp.StatusCode) - if _, err := io.Copy(c.Writer, resp.Body); err != nil { - logger.Error().Err(err).Msg("mimir proxy: error streaming response body") - } + resp, err := http.DefaultClient.Do(req) + if err != nil { + logger.Error().Err(err).Str("target", targetURL).Msg("mimir proxy: network error") + c.JSON(http.StatusBadGateway, response.InternalServerError("mimir is unavailable", nil)) return } + defer func() { + if err := resp.Body.Close(); err != nil { + logger.Error().Err(err).Msg("mimir proxy: failed to close upstream response body") + } + }() - // All instances failed - logger.Error().Int("instances_tried", n).Msg("mimir proxy: all instances failed") - c.JSON(http.StatusBadGateway, response.InternalServerError("all mimir instances are unavailable", nil)) + if ct := resp.Header.Get("Content-Type"); ct != "" { + c.Header("Content-Type", ct) + } + c.Status(resp.StatusCode) + if _, err := io.Copy(c.Writer, resp.Body); err != nil { + logger.Error().Err(err).Msg("mimir proxy: error streaming response body") + } } diff --git a/docs/en/08-alerting.md 
b/docs/en/08-alerting.md new file mode 100644 index 000000000..76b0ba696 --- /dev/null +++ b/docs/en/08-alerting.md @@ -0,0 +1,93 @@ +# Alerting + +Learn how My platform manages alert rules and sends notifications per organization using Grafana Mimir's multi-tenant Alertmanager. + +## Overview + +My platform uses [Grafana Mimir](https://grafana.com/oss/mimir/)'s built-in multi-tenant Alertmanager to manage alert rules and route notifications. Each organization has its own isolated Alertmanager configuration — alert rules and notification receivers (e.g. email, PagerDuty, webhook) are fully scoped to the organization that owns them. + +## How It Works + +### Multi-Tenancy + +Each system belongs to an organization. The collect service resolves the system's `organization_id` from its credentials and injects it as the `X-Scope-OrgID` header before forwarding to Mimir. This ensures alert rules and notifications are fully isolated between organizations — each organization only manages and receives its own alerts. + +## Authentication + +Alertmanager API calls use the same credentials as system registration and inventory: + +| Field | Value | +|-------|-------| +| **Username** | `system_key` (e.g. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | +| **Password** | `system_secret` (e.g. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | +| **Method** | HTTP Basic Auth | + +No separate registration is needed — any system that has completed registration can immediately interact with the Alertmanager API. See [System Registration](05-system-registration.md) for how to obtain credentials. + +## Alertmanager API + +The collect service proxies Alertmanager API calls and automatically injects the `X-Scope-OrgID` header based on the authenticated system's organization. 
The base path is: + +``` +https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/ +``` + +This maps directly to the [Alertmanager v2 API](https://github.com/prometheus/alertmanager/blob/main/api/v2/openapi.yaml). All standard endpoints are available. + +### Example: List Active Alerts + +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" +``` + +### Example: Get Alertmanager Status + +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/status \ + -u "<system_key>:<system_secret>" +``` + +### Example: Create or Update a Silence + +```bash +curl -X POST https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences \ + -u "<system_key>:<system_secret>" \ + -H "Content-Type: application/json" \ + -d '{ + "matchers": [{"name": "alertname", "value": "WatchdogDown", "isRegex": false}], + "startsAt": "2024-01-01T00:00:00Z", + "endsAt": "2024-01-02T00:00:00Z", + "createdBy": "admin", + "comment": "Planned maintenance" + }' +``` + +!!! tip + Replace `<system_key>` and `<system_secret>` with the actual credentials stored on the system. The `X-Scope-OrgID` header is injected automatically by the collect service — do not set it manually. + +## Troubleshooting + +### HTTP 401 Unauthorized + +**Cause:** Incorrect `system_key` or `system_secret`. + +**Solutions:** +1. Verify credentials match what is stored on the system +2. Ensure the system has completed registration (see [System Registration](05-system-registration.md)) +3. Check for leading/trailing spaces in the credentials + +### HTTP 500 Internal Server Error + +**Cause:** Mimir Alertmanager backend is unreachable or misconfigured. + +**Solutions:** +1. This is a platform-side issue — contact your administrator +2. Check platform status page or monitoring alerts +3.
Retry after a few minutes; Mimir may be restarting + +## Related Documentation + +- [System Registration](05-system-registration.md) +- [Inventory and Heartbeat](06-inventory-heartbeat.md) +- [Systems Management](04-systems.md) diff --git a/docs/en/08-metrics.md b/docs/en/08-metrics.md deleted file mode 100644 index 20e2ce556..000000000 --- a/docs/en/08-metrics.md +++ /dev/null @@ -1,165 +0,0 @@ -# Metrics - -Learn how external systems push Prometheus metrics to My platform and how to visualize them in Grafana. - -## Overview - -My platform supports Prometheus metrics collection via [Grafana Mimir](https://grafana.com/oss/mimir/). Any registered NethServer or NethSecurity system can push metrics using the standard Prometheus `remote_write` protocol. Metrics are isolated per organization and visible in Grafana dashboards. - -## How It Works - -### Metrics Ingestion Flow - -``` -┌─────────────────┐ ┌──────────────┐ -│ NethServer / │ POST /services/mimir/api/v1/push │ │ -│ NethSecurity │ ─────────────────────────────────> │ nginx │ -│ │ Basic Auth: system_key:system_secret │ │ -└─────────────────┘ └──────┬───────┘ - │ - │ /api/services/mimir/ - v - ┌──────────────┐ - │ Collect │ - │ │ - │ 1. Validate │ - │ Basic │ - │ Auth │ - │ 2. Set │ - │ X-Scope- │ - │ OrgID: │ - │ │ - └──────┬───────┘ - │ - v - ┌──────────────┐ - │ Mimir │ - │ (private) │ - └──────────────┘ -``` - -### Grafana Access Flow - -``` -┌─────────────┐ ┌──────────────┐ ┌──────────────┐ -│ Browser │ /grafana/ │ nginx │ │ Grafana │ -│ │ ─────────────> │ │ --> │ │ -│ │ └──────────────┘ └──────┬───────┘ -└─────────────┘ │ - │ queries Mimir - │ X-Scope-OrgID - v - ┌──────────────┐ - │ Mimir │ - │ (private) │ - └──────────────┘ -``` - -### Multi-Tenancy - -Each system belongs to an organization. The collect service resolves the system's `organization_id` from its credentials and injects it as the `X-Scope-OrgID` header before forwarding to Mimir. 
This ensures metrics are fully isolated between organizations — each organization only sees its own data. - -## Authentication - -Metrics push uses the same credentials as system registration and inventory: - -| Field | Value | -|-------|-------| -| **Username** | `system_key` (e.g. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | -| **Password** | `system_secret` (e.g. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | -| **Method** | HTTP Basic Auth | - -No separate registration is needed — any system that has completed registration can immediately push metrics. See [System Registration](05-system-registration.md) for how to obtain credentials. - -## Configuring Prometheus `remote_write` - -Add the following block to your Prometheus configuration (`/etc/prometheus/prometheus.yml` or equivalent): - -```yaml -remote_write: - - url: https://my.nethesis.it/services/mimir/api/v1/push - basic_auth: - username: <system_key> - password: <system_secret> -``` - -Replace `<system_key>` and `<system_secret>` with the actual credentials stored on the system. - -**Example with real-looking values:** -```yaml -remote_write: - - url: https://my.nethesis.it/services/mimir/api/v1/push - basic_auth: - username: NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE - password: my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0 -``` - -After updating the configuration, reload Prometheus: -```bash -systemctl reload prometheus -# or send SIGHUP -kill -HUP $(pidof prometheus) -``` - -!!! tip - Prometheus will start forwarding all scraped metrics to My. Use `remote_write_queue_samples_total` in your local Prometheus to verify metrics are being sent. - -## Accessing Grafana - -Grafana is available at: - -``` -https://my.nethesis.it/grafana/ -``` - -Dashboards are **per-organization**: each organization's users can only see metrics collected from systems belonging to their organization. The tenant isolation is enforced automatically via `X-Scope-OrgID`. - -!!!
note - Grafana access is managed by your platform administrator. Contact them to get access or to request custom dashboards for your organization. - -## Troubleshooting - -### HTTP 401 Unauthorized - -**Cause:** Incorrect `system_key` or `system_secret`. - -**Solutions:** -1. Verify credentials match what is stored on the system -2. Ensure the system has completed registration (see [System Registration](05-system-registration.md)) -3. Check for leading/trailing spaces in the credentials -4. Test manually: - ```bash - curl -X POST https://my.nethesis.it/services/mimir/api/v1/push \ - -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" \ - -H "Content-Type: application/x-protobuf" \ - --data-binary @/dev/null - ``` - A `400 Bad Request` (not 401) confirms authentication is working. - -### HTTP 500 Internal Server Error - -**Cause:** Mimir backend is unreachable or misconfigured. - -**Solutions:** -1. This is a platform-side issue — contact your administrator -2. Check platform status page or monitoring alerts -3. Retry after a few minutes; Mimir may be restarting - -### Metrics Not Appearing in Grafana - -**Cause:** Metrics are being sent but not yet visible. - -**Solutions:** -1. Wait 1–2 minutes — Mimir has an ingestion delay -2. Verify `remote_write` is enabled in Prometheus and the configuration is correct -3. Check Prometheus logs for remote write errors: - ```bash - journalctl -u prometheus -n 50 | grep remote_write - ``` -4. 
Confirm you are logged in to Grafana with an account that belongs to the correct organization - -## Related Documentation - -- [System Registration](05-system-registration.md) -- [Inventory and Heartbeat](06-inventory-heartbeat.md) -- [Systems Management](04-systems.md) diff --git a/docs/it/08-alerting.md b/docs/it/08-alerting.md new file mode 100644 index 000000000..8bd177cda --- /dev/null +++ b/docs/it/08-alerting.md @@ -0,0 +1,107 @@ +# Alerting + +Scopri come la piattaforma My gestisce le regole di alerting e le notifiche per organizzazione tramite Grafana Mimir Alertmanager. + +## Panoramica + +La piattaforma My utilizza l'Alertmanager multi-tenant di [Grafana Mimir](https://grafana.com/oss/mimir/) per gestire regole di alert e inviare notifiche. Ogni organizzazione dispone di un proprio insieme isolato di regole di alerting e configurazioni di notifica: nessuna organizzazione può vedere o modificare le regole delle altre. + +## Come Funziona + +### Multi-Tenancy + +Ogni sistema appartiene a un'organizzazione. Il servizio collect risolve l'`organization_id` del sistema dalle sue credenziali e lo inietta come header `X-Scope-OrgID` prima di inoltrare la richiesta a Mimir. Questo garantisce che le regole di alert e le notifiche siano completamente isolate tra le organizzazioni — ogni organizzazione gestisce e riceve solo i propri alert. + +## Autenticazione + +L'accesso all'API Alertmanager usa le stesse credenziali della registrazione del sistema e dell'inventario: + +| Campo | Valore | +|-------|--------| +| **Username** | `system_key` (es. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | +| **Password** | `system_secret` (es. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | +| **Metodo** | HTTP Basic Auth | + +Non è necessaria una registrazione separata — qualsiasi sistema che ha completato la registrazione può interagire con l'API Alertmanager immediatamente. 
Consulta [Registrazione Sistema](05-system-registration.md) per come ottenere le credenziali. + +## API Alertmanager + +L'Alertmanager è esposto tramite il proxy della piattaforma al percorso: + +``` +/api/services/mimir/alertmanager/api/v2/ +``` + +È compatibile con l'[API standard di Alertmanager v2](https://github.com/prometheus/alertmanager/blob/main/api/v2/openapi.yaml). + +### Esempi di utilizzo + +**Recuperare gli alert attivi:** +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" +``` + +**Recuperare i gruppi di alert:** +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts/groups \ + -u "<system_key>:<system_secret>" +``` + +**Creare un silenzio:** +```bash +curl -X POST https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences \ + -u "<system_key>:<system_secret>" \ + -H "Content-Type: application/json" \ + -d '{ + "matchers": [{"name": "alertname", "value": "HighCPU", "isRegex": false}], + "startsAt": "2024-01-01T00:00:00Z", + "endsAt": "2024-01-02T00:00:00Z", + "createdBy": "admin", + "comment": "Manutenzione pianificata" + }' +``` + +**Recuperare i silenzi attivi:** +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences \ + -u "<system_key>:<system_secret>" +``` + +**Eliminare un silenzio:** +```bash +curl -X DELETE https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silence/<silence_id> \ + -u "<system_key>:<system_secret>" +``` + +## Risoluzione Problemi + +### HTTP 401 Unauthorized + +**Causa:** `system_key` o `system_secret` non corretti. + +**Soluzioni:** +1. Verifica che le credenziali corrispondano a quelle memorizzate sul sistema +2. Assicurati che il sistema abbia completato la registrazione (vedi [Registrazione Sistema](05-system-registration.md)) +3. Controlla eventuali spazi iniziali o finali nelle credenziali +4.
Testa manualmente: + ```bash + curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" + ``` + Una risposta `200 OK` o `404 Not Found` (non 401) conferma che l'autenticazione funziona. + +### HTTP 500 Internal Server Error + +**Causa:** Il backend Mimir non è raggiungibile o è configurato in modo errato. + +**Soluzioni:** +1. Si tratta di un problema lato piattaforma — contatta il tuo amministratore +2. Controlla la pagina di stato della piattaforma o gli avvisi di monitoraggio +3. Riprova dopo qualche minuto; Mimir potrebbe essere in fase di riavvio + +## Documentazione Correlata + +- [Registrazione Sistema](05-system-registration.md) +- [Inventario e Heartbeat](06-inventory-heartbeat.md) +- [Gestione Sistemi](04-systems.md) diff --git a/docs/it/08-metrics.md b/docs/it/08-metrics.md deleted file mode 100644 index a07c9c668..000000000 --- a/docs/it/08-metrics.md +++ /dev/null @@ -1,165 +0,0 @@ -# Metriche - -Scopri come i sistemi esterni inviano metriche Prometheus alla piattaforma My e come visualizzarle in Grafana. - -## Panoramica - -La piattaforma My supporta la raccolta di metriche Prometheus tramite [Grafana Mimir](https://grafana.com/oss/mimir/). Qualsiasi sistema NethServer o NethSecurity registrato può inviare metriche usando il protocollo standard Prometheus `remote_write`. Le metriche sono isolate per organizzazione e visibili nelle dashboard Grafana. - -## Come Funziona - -### Flusso di Acquisizione Metriche - -``` -┌─────────────────┐ ┌──────────────┐ -│ NethServer / │ POST /services/mimir/api/v1/push │ │ -│ NethSecurity │ ─────────────────────────────────> │ nginx │ -│ │ Basic Auth: system_key:system_secret │ │ -└─────────────────┘ └──────┬───────┘ - │ - │ /api/services/mimir/ - v - ┌──────────────┐ - │ Collect │ - │ │ - │ 1. Validate │ - │ Basic │ - │ Auth │ - │ 2. 
Set │ - │ X-Scope- │ - │ OrgID: │ - │ │ - └──────┬───────┘ - │ - v - ┌──────────────┐ - │ Mimir │ - │ (private) │ - └──────────────┘ -``` - -### Flusso di Accesso Grafana - -``` -┌─────────────┐ ┌──────────────┐ ┌──────────────┐ -│ Browser │ /grafana/ │ nginx │ │ Grafana │ -│ │ ─────────────> │ │ --> │ │ -│ │ └──────────────┘ └──────┬───────┘ -└─────────────┘ │ - │ queries Mimir - │ X-Scope-OrgID - v - ┌──────────────┐ - │ Mimir │ - │ (private) │ - └──────────────┘ -``` - -### Multi-Tenancy - -Ogni sistema appartiene a un'organizzazione. Il servizio collect risolve l'`organization_id` del sistema dalle sue credenziali e lo inietta come header `X-Scope-OrgID` prima di inoltrare la richiesta a Mimir. Questo garantisce che le metriche siano completamente isolate tra le organizzazioni — ogni organizzazione vede solo i propri dati. - -## Autenticazione - -L'invio delle metriche usa le stesse credenziali della registrazione del sistema e dell'inventario: - -| Campo | Valore | -|-------|--------| -| **Username** | `system_key` (es. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | -| **Password** | `system_secret` (es. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | -| **Metodo** | HTTP Basic Auth | - -Non è necessaria una registrazione separata — qualsiasi sistema che ha completato la registrazione può inviare metriche immediatamente. Consulta [Registrazione Sistema](05-system-registration.md) per come ottenere le credenziali. - -## Configurazione di Prometheus `remote_write` - -Aggiungi il seguente blocco alla configurazione di Prometheus (`/etc/prometheus/prometheus.yml` o equivalente): - -```yaml -remote_write: - - url: https://my.nethesis.it/services/mimir/api/v1/push - basic_auth: - username: <system_key> - password: <system_secret> -``` - -Sostituisci `<system_key>` e `<system_secret>` con le credenziali effettive memorizzate sul sistema.
- -**Esempio con valori realistici:** -```yaml -remote_write: - - url: https://my.nethesis.it/services/mimir/api/v1/push - basic_auth: - username: NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE - password: my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0 -``` - -Dopo aver aggiornato la configurazione, ricarica Prometheus: -```bash -systemctl reload prometheus -# oppure invia SIGHUP -kill -HUP $(pidof prometheus) -``` - -!!! tip - Prometheus inizierà a inoltrare tutte le metriche raccolte a My. Usa `remote_write_queue_samples_total` nel tuo Prometheus locale per verificare che le metriche vengano inviate. - -## Accesso a Grafana - -Grafana è disponibile all'indirizzo: - -``` -https://my.nethesis.it/grafana/ -``` - -Le dashboard sono **per organizzazione**: gli utenti di ciascuna organizzazione possono vedere solo le metriche raccolte dai sistemi appartenenti alla propria organizzazione. L'isolamento del tenant è applicato automaticamente tramite `X-Scope-OrgID`. - -!!! note - L'accesso a Grafana è gestito dall'amministratore della piattaforma. Contattalo per ottenere l'accesso o per richiedere dashboard personalizzate per la tua organizzazione. - -## Risoluzione Problemi - -### HTTP 401 Unauthorized - -**Causa:** `system_key` o `system_secret` non corretti. - -**Soluzioni:** -1. Verifica che le credenziali corrispondano a quelle memorizzate sul sistema -2. Assicurati che il sistema abbia completato la registrazione (vedi [Registrazione Sistema](05-system-registration.md)) -3. Controlla eventuali spazi iniziali o finali nelle credenziali -4. Testa manualmente: - ```bash - curl -X POST https://my.nethesis.it/services/mimir/api/v1/push \ - -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" \ - -H "Content-Type: application/x-protobuf" \ - --data-binary @/dev/null - ``` - Una risposta `400 Bad Request` (non 401) conferma che l'autenticazione funziona. 
- -### HTTP 500 Internal Server Error - -**Causa:** Il backend Mimir non è raggiungibile o è configurato in modo errato. - -**Soluzioni:** -1. Si tratta di un problema lato piattaforma — contatta il tuo amministratore -2. Controlla la pagina di stato della piattaforma o gli avvisi di monitoraggio -3. Riprova dopo qualche minuto; Mimir potrebbe essere in fase di riavvio - -### Metriche Non Visibili in Grafana - -**Causa:** Le metriche vengono inviate ma non sono ancora visibili. - -**Soluzioni:** -1. Attendi 1–2 minuti — Mimir ha un ritardo di acquisizione -2. Verifica che `remote_write` sia abilitato in Prometheus e che la configurazione sia corretta -3. Controlla i log di Prometheus per errori di remote write: - ```bash - journalctl -u prometheus -n 50 | grep remote_write - ``` -4. Conferma di essere connesso a Grafana con un account appartenente all'organizzazione corretta - -## Documentazione Correlata - -- [Registrazione Sistema](05-system-registration.md) -- [Inventario e Heartbeat](06-inventory-heartbeat.md) -- [Gestione Sistemi](04-systems.md) diff --git a/mkdocs.yml b/mkdocs.yml index ee6a92a96..864f3bc73 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -131,7 +131,7 @@ plugins: Systems Management: Gestione Sistemi System Registration: Registrazione Sistema Inventory & Heartbeat: Inventario e Heartbeat - Metrics: Metriche + Alerting: Alerting Developer Documentation: Documentazione Sviluppatori Main Project: Progetto Principale Backend API: API Backend @@ -173,7 +173,7 @@ nav: - Systems Management: 04-systems.md - System Registration: 05-system-registration.md - Inventory & Heartbeat: 06-inventory-heartbeat.md - - Metrics: 08-metrics.md + - Alerting: 08-alerting.md - Developer Documentation: - Main Project: https://github.com/NethServer/my/blob/main/README.md - Backend API: https://github.com/NethServer/my/blob/main/backend/README.md diff --git a/render.yaml b/render.yaml index c367ec50c..041def0c6 100644 --- a/render.yaml +++ b/render.yaml @@ -52,12 +52,8 @@ 
services: sync: false - key: MIMIR_S3_SECRET_KEY sync: false - - key: MIMIR_S3_BUCKET - sync: false - key: MIMIR_S3_ALERTMANAGER_BUCKET sync: false - - key: MIMIR_S3_RULER_BUCKET - sync: false # Production Backend API Server (Private Service) - type: web @@ -227,12 +223,8 @@ services: sync: false - key: MIMIR_S3_SECRET_KEY sync: false - - key: MIMIR_S3_BUCKET - sync: false - key: MIMIR_S3_ALERTMANAGER_BUCKET sync: false - - key: MIMIR_S3_RULER_BUCKET - sync: false autoDeploy: true branch: main pullRequestPreviewsEnabled: true diff --git a/services/mimir/.env.example b/services/mimir/.env.example index 14910d081..1d15ba8ef 100644 --- a/services/mimir/.env.example +++ b/services/mimir/.env.example @@ -1,10 +1,8 @@ -# Mimir / Metrics Stack - environment variables +# Mimir / Alerting Stack - environment variables # Copy to mimir/.env and fill in actual values # S3-compatible storage credentials (DigitalOcean Spaces or AWS S3) MIMIR_S3_ENDPOINT=ams3.digitaloceanspaces.com MIMIR_S3_ACCESS_KEY=your-access-key MIMIR_S3_SECRET_KEY=your-secret-key -MIMIR_S3_BUCKET=your-mimir-blocks-bucket MIMIR_S3_ALERTMANAGER_BUCKET=your-mimir-alertmanager-bucket -MIMIR_S3_RULER_BUCKET=your-mimir-ruler-bucket diff --git a/services/mimir/README.md b/services/mimir/README.md index d6e77f08a..5360b6a49 100644 --- a/services/mimir/README.md +++ b/services/mimir/README.md @@ -1,16 +1,16 @@ -# Mimir — Metrics Infrastructure +# Mimir — Alerting Infrastructure -Grafana Mimir provides long-term metrics storage for the MY platform, deployed as a single node on a dedicated VM (Server B). The collect service on Server A writes metrics to Mimir and proxies read queries. +Grafana Mimir runs as a multi-tenant **Alertmanager** (`-target=alertmanager`) for the MY platform, deployed on a dedicated VM (Server B). It does **not** ingest metrics. The collect service on Server A routes alert notifications through Mimir's Alertmanager API. 
## Topology ``` ┌──────────────────────────────────┐ ┌──────────────────────────────────┐ -│ Server A (main app) │ │ Server B (metrics VM) │ +│ Server A (main app) │ │ Server B (alerting VM) │ │ │ │ │ -│ collect ──/api/services/mimir──►│◄────│ mimir (port 19009) │ -│ backend │ │ └── S3 storage │ -│ frontend │ │ │ +│ collect ──/api/services/mimir──►──────► mimir (port 19009) │ +│ backend │ │ -target=alertmanager │ +│ frontend │ │ └── S3 alertmanager state │ │ nginx proxy │ │ │ └──────────────────────────────────┘ └──────────────────────────────────┘ ``` @@ -55,15 +55,13 @@ Should return `ready`. | `MIMIR_S3_ENDPOINT` | S3-compatible storage endpoint | `ams3.digitaloceanspaces.com` | | `MIMIR_S3_ACCESS_KEY` | S3 access key | `your-access-key` | | `MIMIR_S3_SECRET_KEY` | S3 secret key | `your-secret-key` | -| `MIMIR_S3_BUCKET` | Bucket for blocks (TSDB chunks) | `my-mimir-blocks` | | `MIMIR_S3_ALERTMANAGER_BUCKET` | Bucket for Alertmanager state | `my-mimir-alertmanager` | -| `MIMIR_S3_RULER_BUCKET` | Bucket for recording/alert rules | `my-mimir-ruler` | Copy `services/mimir/.env.example` to `services/mimir/.env` and fill in every value before starting the stack. ## Architecture -Mimir runs as a single node with `replication_factor: 1`. It uses three S3 buckets (blocks, alertmanager, ruler) for persistent storage. Multitenancy is enabled; all writes from `collect` include the tenant ID resolved from the system's organization. +Mimir runs as an alertmanager-only target (`-target=alertmanager`). It uses a single S3 bucket for persistent Alertmanager state. Multitenancy is enabled; all requests from `collect` include the tenant ID resolved from the system's organization. The config template (`services/mimir/my.yaml`) uses `${VAR}` placeholders that are expanded at container startup by `entrypoint.sh` via `envsubst`. 
diff --git a/services/mimir/docker-compose.yml b/services/mimir/docker-compose.yml index 4fd62afc1..732662de0 100644 --- a/services/mimir/docker-compose.yml +++ b/services/mimir/docker-compose.yml @@ -1,4 +1,4 @@ -# Metrics Infrastructure - Dedicated VM deployment +# Alerting Infrastructure - Dedicated VM deployment # # Run on a separate server from the main application stack. # @@ -10,7 +10,7 @@ # # ⚙️ Required environment variables (set in mimir/.env or shell): # MIMIR_S3_ENDPOINT, MIMIR_S3_ACCESS_KEY, MIMIR_S3_SECRET_KEY -# MIMIR_S3_BUCKET, MIMIR_S3_ALERTMANAGER_BUCKET, MIMIR_S3_RULER_BUCKET +# MIMIR_S3_ALERTMANAGER_BUCKET version: '3.8' @@ -30,9 +30,7 @@ services: MIMIR_S3_ENDPOINT: ${MIMIR_S3_ENDPOINT} MIMIR_S3_ACCESS_KEY: ${MIMIR_S3_ACCESS_KEY} MIMIR_S3_SECRET_KEY: ${MIMIR_S3_SECRET_KEY} - MIMIR_S3_BUCKET: ${MIMIR_S3_BUCKET} MIMIR_S3_ALERTMANAGER_BUCKET: ${MIMIR_S3_ALERTMANAGER_BUCKET} - MIMIR_S3_RULER_BUCKET: ${MIMIR_S3_RULER_BUCKET} ports: - "19009:9009" networks: diff --git a/services/mimir/entrypoint.sh b/services/mimir/entrypoint.sh index 843c153b3..a3fc165e3 100755 --- a/services/mimir/entrypoint.sh +++ b/services/mimir/entrypoint.sh @@ -7,5 +7,5 @@ export PORT echo "==> Expanding Mimir config..." envsubst < /etc/mimir/my.yaml.template > /tmp/mimir-config.yaml -echo "==> Starting Mimir on port ${PORT}..." -exec /bin/mimir --config.file=/tmp/mimir-config.yaml +echo "==> Starting Mimir alertmanager on port ${PORT}..." 
+exec /bin/mimir -target=alertmanager -config.file=/tmp/mimir-config.yaml diff --git a/services/mimir/my.yaml b/services/mimir/my.yaml index 43280b832..021439cc4 100644 --- a/services/mimir/my.yaml +++ b/services/mimir/my.yaml @@ -1,28 +1,14 @@ multitenancy_enabled: true -target: all - -common: - storage: - backend: s3 - s3: - endpoint: ${MIMIR_S3_ENDPOINT} - secret_access_key: ${MIMIR_S3_SECRET_KEY} - access_key_id: ${MIMIR_S3_ACCESS_KEY} - -blocks_storage: - s3: - bucket_name: ${MIMIR_S3_BUCKET} alertmanager_storage: + backend: s3 s3: + endpoint: ${MIMIR_S3_ENDPOINT} + secret_access_key: ${MIMIR_S3_SECRET_KEY} + access_key_id: ${MIMIR_S3_ACCESS_KEY} bucket_name: ${MIMIR_S3_ALERTMANAGER_BUCKET} -ruler_storage: - s3: - bucket_name: ${MIMIR_S3_RULER_BUCKET} - alertmanager: - enabled: true data_dir: /tmp/mimir/alertmanager poll_interval: 15s max_config_size_bytes: 1048576 # 1MB @@ -31,24 +17,10 @@ alertmanager: retention: 120h external_url: https://my-collect-qa-pr-41.onrender.com/api/services/mimir/alertmanager -compactor: - data_dir: /tmp/mimir/compactor - -ingester: - ring: - replication_factor: 1 - -store_gateway: - sharding_ring: - replication_factor: 1 - server: http_listen_port: ${PORT} log_level: info -limits: - compactor_blocks_retention_period: 24h - runtime_config: file: /etc/mimir/runtime_config.yaml period: 10s From 29872416236ccf4cbcefa05b84c7ad68eb6d1fd4 Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Fri, 20 Feb 2026 11:43:46 +0100 Subject: [PATCH 4/5] fix --- proxy/entrypoint.sh | 4 ++-- proxy/nginx.conf | 19 +------------------ proxy/nginx.conf.local | 16 +--------------- services/mimir/my.yaml | 2 +- 4 files changed, 5 insertions(+), 36 deletions(-) diff --git a/proxy/entrypoint.sh b/proxy/entrypoint.sh index 18187b8cf..d53291687 100644 --- a/proxy/entrypoint.sh +++ b/proxy/entrypoint.sh @@ -29,10 +29,10 @@ else fi echo '==> Substituting nginx config...' 
-envsubst '$PORT $BACKEND_SERVICE_NAME $COLLECT_SERVICE_NAME $FRONTEND_SERVICE_NAME $GRAFANA_SERVICE_NAME' < /etc/nginx/nginx.conf > /tmp/nginx.conf +envsubst '$PORT $BACKEND_SERVICE_NAME $COLLECT_SERVICE_NAME $FRONTEND_SERVICE_NAME' < /etc/nginx/nginx.conf > /tmp/nginx.conf echo '==> Generated upstream URLs:' grep -E 'set.*upstream' /tmp/nginx.conf || true echo '==> Starting nginx...' -exec nginx -c /tmp/nginx.conf -g 'daemon off;' \ No newline at end of file +exec nginx -c /tmp/nginx.conf -g 'daemon off;' diff --git a/proxy/nginx.conf b/proxy/nginx.conf index 23ed06e19..058dbd77d 100644 --- a/proxy/nginx.conf +++ b/proxy/nginx.conf @@ -107,23 +107,6 @@ http { proxy_request_buffering off; } - # Grafana dashboard - location /grafana/ { - set $grafana_upstream https://${GRAFANA_SERVICE_NAME}.onrender.com; - proxy_pass $grafana_upstream/; - proxy_ssl_server_name on; - proxy_ssl_verify off; - proxy_set_header Host ${GRAFANA_SERVICE_NAME}.onrender.com; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_set_header X-Forwarded-Host $host; - - proxy_connect_timeout 30s; - proxy_send_timeout 30s; - proxy_read_timeout 60s; - } - # Frontend routes - everything else location / { set $frontend_upstream https://${FRONTEND_SERVICE_NAME}.onrender.com; @@ -142,4 +125,4 @@ http { proxy_read_timeout 30s; } } -} \ No newline at end of file +} diff --git a/proxy/nginx.conf.local b/proxy/nginx.conf.local index 252f9ac9a..c15eb77f0 100644 --- a/proxy/nginx.conf.local +++ b/proxy/nginx.conf.local @@ -92,20 +92,6 @@ http { proxy_request_buffering off; } - # Grafana dashboard - location /grafana/ { - proxy_pass http://grafana-full:3000/; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_set_header X-Forwarded-Host $host; - - 
proxy_connect_timeout 30s; - proxy_send_timeout 30s; - proxy_read_timeout 60s; - } - # Frontend routes - everything else location / { proxy_pass http://frontend-full:8080; @@ -121,4 +107,4 @@ http { proxy_read_timeout 30s; } } -} \ No newline at end of file +} diff --git a/services/mimir/my.yaml b/services/mimir/my.yaml index 021439cc4..9c4457fd1 100644 --- a/services/mimir/my.yaml +++ b/services/mimir/my.yaml @@ -23,4 +23,4 @@ server: runtime_config: file: /etc/mimir/runtime_config.yaml - period: 10s + period: 60s From 8b5324c9f43fb49b48d388c8503a563f63aa3313 Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Fri, 20 Feb 2026 11:45:46 +0100 Subject: [PATCH 5/5] fix --- proxy/nginx.conf | 7 ++----- proxy/nginx.conf.local | 6 +++--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/proxy/nginx.conf b/proxy/nginx.conf index 058dbd77d..e4b36a28a 100644 --- a/proxy/nginx.conf +++ b/proxy/nginx.conf @@ -99,12 +99,9 @@ http { proxy_set_header X-Forwarded-Proto $scheme; proxy_set_header X-Forwarded-Host $host; - # Higher timeouts for metrics writes and queries + # Timeouts proxy_connect_timeout 30s; - proxy_send_timeout 60s; - proxy_read_timeout 60s; - # Disable request buffering - critical for streaming remote_write - proxy_request_buffering off; + proxy_send_timeout 30s; } # Frontend routes - everything else diff --git a/proxy/nginx.conf.local b/proxy/nginx.conf.local index c15eb77f0..7eaa765b6 100644 --- a/proxy/nginx.conf.local +++ b/proxy/nginx.conf.local @@ -86,10 +86,10 @@ http { proxy_set_header X-Forwarded-Proto $scheme; proxy_set_header X-Forwarded-Host $host; + # Timeouts proxy_connect_timeout 30s; - proxy_send_timeout 60s; - proxy_read_timeout 60s; - proxy_request_buffering off; + proxy_send_timeout 30s; + proxy_read_timeout 30s; } # Frontend routes - everything else