diff --git a/.github/workflows/ci-main.yml b/.github/workflows/ci-main.yml index 5fa84e719..12ccd9130 100644 --- a/.github/workflows/ci-main.yml +++ b/.github/workflows/ci-main.yml @@ -137,6 +137,7 @@ jobs: context: proxy - component: mimir context: services/mimir + image_description: Mimir alertmanager backend for My Nethesis steps: - uses: actions/checkout@v4 @@ -170,7 +171,9 @@ jobs: platforms: linux/amd64 push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + labels: | + ${{ steps.meta.outputs.labels }} + ${{ matrix.image_description != '' && format('org.opencontainers.image.description={0}', matrix.image_description) || '' }} cache-from: type=gha,scope=${{ matrix.component }} cache-to: type=gha,mode=max,scope=${{ matrix.component }} build-args: | diff --git a/backend/.env.example b/backend/.env.example index 9e1c679e6..4f8c1cd40 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -78,6 +78,11 @@ REDIS_URL=redis://localhost:6379 # System configuration #SYSTEM_TYPES=ns8,nsec +# =========================================== +# MIMIR CONFIGURATION (Optional) +# =========================================== +#MIMIR_URL=http://localhost:9009 + # =========================================== # AUTO-DERIVED URLS (DO NOT SET MANUALLY) # =========================================== diff --git a/backend/.render-build-trigger b/backend/.render-build-trigger index 3bb7a5909..d461df4ae 100644 --- a/backend/.render-build-trigger +++ b/backend/.render-build-trigger @@ -2,9 +2,9 @@ # This file is used to force Docker service rebuilds in PR previews # Modify LAST_UPDATE to trigger rebuilds -LAST_UPDATE=2026-02-10T12:10:22Z +LAST_UPDATE=2026-02-26T15:22:50Z # Instructions: # 1. To force rebuild of Docker services in a PR, update LAST_UPDATE -# 2. Run: perl -i -pe "s/LAST_UPDATE=2026-02-10T12:10:22Z +# 2. Run: perl -i -pe "s/LAST_UPDATE=2026-02-26T15:22:50Z # 2. 
Commit and push changes to trigger Docker rebuilds \ No newline at end of file diff --git a/backend/configuration/configuration.go b/backend/configuration/configuration.go index ff355f94a..836a2219f 100644 --- a/backend/configuration/configuration.go +++ b/backend/configuration/configuration.go @@ -67,6 +67,8 @@ type Configuration struct { SMTPFrom string `json:"smtp_from"` SMTPFromName string `json:"smtp_from_name"` SMTPTLS bool `json:"smtp_tls"` + // Mimir configuration + MimirURL string `json:"mimir_url"` } var Config = Configuration{} @@ -196,6 +198,13 @@ func Init() { } Config.SMTPTLS = parseBoolWithDefault("SMTP_TLS", true) + // Mimir configuration + if mimirURL := os.Getenv("MIMIR_URL"); mimirURL != "" { + Config.MimirURL = mimirURL + } else { + Config.MimirURL = "http://localhost:9009" + } + // Log successful configuration load logger.LogConfigLoad("env", "configuration", true, nil) } diff --git a/backend/main.go b/backend/main.go index 93d599ee2..06e555a72 100644 --- a/backend/main.go +++ b/backend/main.go @@ -243,6 +243,17 @@ func main() { systemsGroup.GET("/:id/inventory/diffs/latest", methods.GetSystemLatestInventoryDiff) // Get latest diff } + // =========================================== + // ALERTING - manage alert routing configuration via Mimir + // =========================================== + alertingGroup := customAuthWithAudit.Group("/alerting", middleware.RequirePermission("manage:systems")) + { + alertingGroup.POST("/config", methods.ConfigureAlerts) + alertingGroup.DELETE("/config", methods.DisableAlerts) + alertingGroup.GET("/config", methods.GetAlertingConfig) + alertingGroup.GET("/alerts", methods.GetAlerts) + } + // =========================================== // FILTERS - For UI dropdowns // =========================================== diff --git a/backend/methods/alerting.go b/backend/methods/alerting.go new file mode 100644 index 000000000..d66cc818f --- /dev/null +++ b/backend/methods/alerting.go @@ -0,0 +1,219 @@ +/* +Copyright (C) 
2026 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package methods + +import ( + "encoding/json" + "net/http" + "strings" + + "github.com/gin-gonic/gin" + + "github.com/nethesis/my/backend/configuration" + "github.com/nethesis/my/backend/helpers" + "github.com/nethesis/my/backend/models" + "github.com/nethesis/my/backend/response" + "github.com/nethesis/my/backend/services/alerting" + "github.com/nethesis/my/backend/services/local" +) + +// resolveOrgID extracts the target organization ID. +// Owner/Distributor/Reseller must pass organization_id query param. +// Customer uses their own organization from JWT. +func resolveOrgID(c *gin.Context, user *models.User) (string, bool) { + orgID := c.Query("organization_id") + orgRole := strings.ToLower(user.OrgRole) + + if orgRole == "customer" { + // Customer always uses their own organization + return user.OrganizationID, true + } + + // Owner, Distributor, Reseller must provide organization_id + if orgID == "" { + c.JSON(http.StatusBadRequest, response.BadRequest("organization_id query parameter is required", nil)) + return "", false + } + + // Validate hierarchical access to the target organization + userService := local.NewUserService() + if !userService.IsOrganizationInHierarchy(orgRole, user.OrganizationID, orgID) { + c.JSON(http.StatusForbidden, response.Forbidden("access denied: organization not in your hierarchy", nil)) + return "", false + } + + return orgID, true +} + +// ConfigureAlerts handles POST /api/alerting/config +func ConfigureAlerts(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + + orgID, ok := resolveOrgID(c, user) + if !ok { + return + } + + var req models.AlertingConfigRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid request body: "+err.Error(), nil)) + return + } + + // Validate severity keys + validSeverities := map[string]bool{"critical": true, "warning": true, "info": 
true} + for key := range req { + if !validSeverities[key] { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid severity level: "+key+". allowed: critical, warning, info", nil)) + return + } + } + + cfg := configuration.Config + yamlConfig, err := alerting.RenderConfig( + cfg.SMTPHost, cfg.SMTPPort, cfg.SMTPUsername, cfg.SMTPPassword, cfg.SMTPFrom, cfg.SMTPTLS, + req, + ) + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to render alertmanager config: "+err.Error(), nil)) + return + } + + if err := alerting.PushConfig(orgID, yamlConfig); err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to push config to mimir: "+err.Error(), nil)) + return + } + + c.JSON(http.StatusOK, response.OK("alerting configuration updated successfully", nil)) +} + +// DisableAlerts handles DELETE /api/alerting/config +func DisableAlerts(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + + orgID, ok := resolveOrgID(c, user) + if !ok { + return + } + + cfg := configuration.Config + yamlConfig, err := alerting.RenderConfig( + cfg.SMTPHost, cfg.SMTPPort, cfg.SMTPUsername, cfg.SMTPPassword, cfg.SMTPFrom, cfg.SMTPTLS, + nil, + ) + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to render blackhole config: "+err.Error(), nil)) + return + } + + if err := alerting.PushConfig(orgID, yamlConfig); err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to push config to mimir: "+err.Error(), nil)) + return + } + + c.JSON(http.StatusOK, response.OK("all alerts disabled successfully", nil)) +} + +// GetAlerts handles GET /api/alerting/alerts +func GetAlerts(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + + orgID, ok := resolveOrgID(c, user) + if !ok { + return + } + + body, err := alerting.GetAlerts(orgID) + if err != nil { + 
c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to fetch alerts from mimir: "+err.Error(), nil)) + return + } + + // Parse alerts for optional filtering + var alerts []map[string]interface{} + if err := json.Unmarshal(body, &alerts); err != nil { + // Return raw response if parsing fails + c.Data(http.StatusOK, "application/json", body) + return + } + + var params models.AlertQueryParams + if err := c.ShouldBindQuery(&params); err == nil { + alerts = filterAlerts(alerts, params) + } + + c.JSON(http.StatusOK, response.OK("alerts retrieved successfully", gin.H{ + "alerts": alerts, + })) +} + +// GetAlertingConfig handles GET /api/alerting/config +func GetAlertingConfig(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + + orgID, ok := resolveOrgID(c, user) + if !ok { + return + } + + body, err := alerting.GetConfig(orgID) + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to fetch alerting config from mimir: "+err.Error(), nil)) + return + } + + c.JSON(http.StatusOK, response.OK("alerting configuration retrieved successfully", gin.H{ + "config": string(body), + })) +} + +// filterAlerts applies optional query filters to the alerts list +func filterAlerts(alerts []map[string]interface{}, params models.AlertQueryParams) []map[string]interface{} { + if params.State == "" && params.Severity == "" && params.SystemKey == "" { + return alerts + } + + filtered := make([]map[string]interface{}, 0, len(alerts)) + for _, alert := range alerts { + if params.State != "" { + if status, ok := alert["status"].(map[string]interface{}); ok { + if state, ok := status["state"].(string); ok && state != params.State { + continue + } + } + } + + labels, _ := alert["labels"].(map[string]interface{}) + + if params.Severity != "" { + if sev, ok := labels["severity"].(string); ok && sev != params.Severity { + continue + } + } + + if params.SystemKey != "" { + if sk, ok :=
labels["system_key"].(string); ok && sk != params.SystemKey { + continue + } + } + + filtered = append(filtered, alert) + } + + return filtered +} diff --git a/backend/models/alerting.go b/backend/models/alerting.go new file mode 100644 index 000000000..3ea98d20b --- /dev/null +++ b/backend/models/alerting.go @@ -0,0 +1,31 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package models + +// WebhookConfig represents a named webhook receiver +type WebhookConfig struct { + Name string `json:"name" binding:"required"` + URL string `json:"url" binding:"required,url"` +} + +// SeverityConfig defines email and webhook receivers for a specific severity level, +// plus optional system_key exceptions that should be excluded from notifications +type SeverityConfig struct { + Emails []string `json:"emails" binding:"required,min=1,dive,email"` + Webhooks []WebhookConfig `json:"webhooks,omitempty"` + Exceptions []string `json:"exceptions,omitempty"` +} + +// AlertingConfigRequest is the JSON body for POST /api/alerting/config. +// Keys are severity levels: "critical", "warning", "info". +type AlertingConfigRequest map[string]SeverityConfig + +// AlertQueryParams holds optional query filters for GET /api/alerting/alerts +type AlertQueryParams struct { + State string `form:"state"` // e.g. "active", "suppressed", "unprocessed" + Severity string `form:"severity"` // e.g. "critical", "warning", "info" + SystemKey string `form:"system_key"` // filter by system_key label +} diff --git a/backend/openapi.yaml b/backend/openapi.yaml index de709d017..c257b7b63 100644 --- a/backend/openapi.yaml +++ b/backend/openapi.yaml @@ -2,7 +2,7 @@ openapi: 3.0.3 info: title: My Nethesis API description: REST API for My Nethesis with business hierarchy management and RBAC - version: 0.3.0 + version: 0.4.0 contact: name: Nethesis S.r.l.
url: http://www.nethesis.it @@ -56,13 +56,32 @@ tags: - name: Backend - Rebranding description: Backend rebranding management for organizations and products + - name: Backend - Alerting + description: Backend alertmanager configuration and alert monitoring + - name: Collect - Health description: Collect service health and monitoring - name: Collect - Systems description: Collect service system management and inventory collection - name: Collect - Rebranding description: Collect service rebranding endpoints for systems - + - name: Collect - Alerting + description: | + Collect service alerting proxy to Mimir Alertmanager. + + **Multi-tenant Alertmanager API** + + This proxy forwards requests to the Grafana Mimir Alertmanager with automatic multi-tenant isolation. + Each system's organization is automatically resolved and injected as the `X-Scope-OrgID` header, + ensuring complete isolation of alerts and silences between organizations. + + **Accessible endpoints:** + - `POST /api/services/mimir/alertmanager/api/v2/alerts` - Add alerts + - `GET /api/services/mimir/alertmanager/api/v2/alerts` - List alerts + - `POST /api/services/mimir/alertmanager/api/v2/silences` - Create silence + - `GET /api/services/mimir/alertmanager/api/v2/silences` - List silences + - `GET /api/services/mimir/alertmanager/api/v2/silences/{id}` - Get silence + - `DELETE /api/services/mimir/alertmanager/api/v2/silences/{id}` - Delete silence security: - BearerAuth: [] @@ -93,6 +112,64 @@ components: data: $ref: '#/components/schemas/ErrorData' + SuccessResponse: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + data: + type: object + nullable: true + + AlertingConfigRequest: + type: object + description: Alert routing configuration keyed by severity level (critical, warning, info) + additionalProperties: + $ref: '#/components/schemas/SeverityConfig' + example: + critical: + emails: ["oncall@example.com"] + webhooks: + - name: "slack" + url: 
"https://hooks.slack.com/services/T00/B00/XXX" + exceptions: ["NOC-SKIP-0001"] + warning: + emails: ["team@example.com"] + SeverityConfig: + type: object + required: + - emails + properties: + emails: + type: array + items: + type: string + format: email + minItems: 1 + webhooks: + type: array + items: + $ref: '#/components/schemas/WebhookConfig' + exceptions: + type: array + description: system_key values to exclude from notifications for this severity + items: + type: string + WebhookConfig: + type: object + required: + - name + - url + properties: + name: + type: string + url: + type: string + format: uri + ErrorData: type: object properties: @@ -8292,3 +8369,309 @@ paths: format: binary '404': $ref: '#/components/responses/NotFound' + + # =========================================== + # ALERTING ENDPOINTS (Backend - Configuration) + # =========================================== + + /api/alerting/config: + post: + operationId: configureAlerts + tags: + - Backend - Alerting + summary: Configure alert routing + description: | + Generates an Alertmanager configuration with per-severity routing and pushes it to Mimir. + SMTP settings are injected server-side. The user provides email recipients, webhooks, and optional system_key exceptions per severity. + security: + - BearerAuth: [] + parameters: + - name: organization_id + in: query + description: Target organization ID. Required for Owner, Distributor, and Reseller roles. Customer role uses their own organization automatically. 
+ schema: + type: string + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/AlertingConfigRequest' + responses: + '200': + description: Alerting configuration updated + content: + application/json: + schema: + $ref: '#/components/schemas/SuccessResponse' + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '500': + $ref: '#/components/responses/InternalServerError' + delete: + operationId: disableAlerts + tags: + - Backend - Alerting + summary: Disable all alerts + description: Replaces the Alertmanager config with a blackhole-only configuration, silencing all alerts for the organization. + security: + - BearerAuth: [] + parameters: + - name: organization_id + in: query + description: Target organization ID. Required for Owner, Distributor, and Reseller roles. + schema: + type: string + responses: + '200': + description: All alerts disabled + content: + application/json: + schema: + $ref: '#/components/schemas/SuccessResponse' + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '500': + $ref: '#/components/responses/InternalServerError' + get: + operationId: getAlertingConfig + tags: + - Backend - Alerting + summary: Get current alerting configuration + description: Retrieves the current Alertmanager YAML configuration from Mimir for the specified organization. + security: + - BearerAuth: [] + parameters: + - name: organization_id + in: query + description: Target organization ID. Required for Owner, Distributor, and Reseller roles. 
+ schema: + type: string + responses: + '200': + description: Current alerting configuration + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: alerting configuration retrieved successfully + data: + type: object + properties: + config: + type: string + description: Raw YAML alertmanager configuration + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '500': + $ref: '#/components/responses/InternalServerError' + + /api/alerting/alerts: + get: + operationId: getAlerts + tags: + - Backend - Alerting + summary: List active alerts + description: Retrieves active alerts from Mimir for the specified organization with optional filtering by state, severity, and system_key. + security: + - BearerAuth: [] + parameters: + - name: organization_id + in: query + description: Target organization ID. Required for Owner, Distributor, and Reseller roles. 
+ schema: + type: string + - name: state + in: query + description: Filter alerts by state + schema: + type: string + enum: [active, suppressed, unprocessed] + - name: severity + in: query + description: Filter alerts by severity label + schema: + type: string + enum: [critical, warning, info] + - name: system_key + in: query + description: Filter alerts by system_key label + schema: + type: string + responses: + '200': + description: List of active alerts + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: alerts retrieved successfully + data: + type: object + properties: + alerts: + type: array + items: + type: object + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '500': + $ref: '#/components/responses/InternalServerError' + + # =========================================== + # ALERTING ENDPOINTS (Collect - Mimir Proxy) + # =========================================== + + /api/services/mimir/alertmanager/api/v2/alerts: + get: + operationId: mimirAlertsGet + tags: + - Collect - Alerting + summary: List active alerts + description: Lists all active alerts for the authenticated system's organization. + security: + - BasicAuth: [] + responses: + '200': + description: List of active alerts + '401': + $ref: '#/components/responses/Unauthorized' + '502': + $ref: '#/components/responses/InternalServerError' + post: + operationId: mimirAlertsPost + tags: + - Collect - Alerting + summary: Push alerts + description: Pushes one or more alerts to the Alertmanager for the authenticated system's organization. 
+ security: + - BasicAuth: [] + requestBody: + required: true + content: + application/json: + schema: + type: array + items: + type: object + responses: + '200': + description: Alerts accepted + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '502': + $ref: '#/components/responses/InternalServerError' + + /api/services/mimir/alertmanager/api/v2/silences: + get: + operationId: mimirSilencesGet + tags: + - Collect - Alerting + summary: List silences + description: Lists all silences for the authenticated system's organization. + security: + - BasicAuth: [] + responses: + '200': + description: List of silences + '401': + $ref: '#/components/responses/Unauthorized' + '502': + $ref: '#/components/responses/InternalServerError' + post: + operationId: mimirSilencesPost + tags: + - Collect - Alerting + summary: Create or update a silence + description: Creates or updates a silence for the authenticated system's organization. + security: + - BasicAuth: [] + requestBody: + required: true + content: + application/json: + schema: + type: object + responses: + '200': + description: Silence created or updated + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '502': + $ref: '#/components/responses/InternalServerError' + + /api/services/mimir/alertmanager/api/v2/silences/{silence_id}: + parameters: + - name: silence_id + in: path + required: true + schema: + type: string + description: Silence UUID + get: + operationId: mimirSilenceGet + tags: + - Collect - Alerting + summary: Get a silence + description: Returns a specific silence by ID. 
+ security: + - BasicAuth: [] + responses: + '200': + description: Silence details + '401': + $ref: '#/components/responses/Unauthorized' + '404': + $ref: '#/components/responses/NotFound' + '502': + $ref: '#/components/responses/InternalServerError' + delete: + operationId: mimirSilenceDelete + tags: + - Collect - Alerting + summary: Delete a silence + description: Deletes a silence by ID. + security: + - BasicAuth: [] + responses: + '200': + description: Silence deleted + '401': + $ref: '#/components/responses/Unauthorized' + '404': + $ref: '#/components/responses/NotFound' + '502': + $ref: '#/components/responses/InternalServerError' + diff --git a/backend/pkg/version/VERSION b/backend/pkg/version/VERSION index 0d91a54c7..1d0ba9ea1 100644 --- a/backend/pkg/version/VERSION +++ b/backend/pkg/version/VERSION @@ -1 +1 @@ -0.3.0 +0.4.0 diff --git a/backend/services/alerting/client.go b/backend/services/alerting/client.go new file mode 100644 index 000000000..57517acad --- /dev/null +++ b/backend/services/alerting/client.go @@ -0,0 +1,138 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package alerting + +import ( + "fmt" + "io" + "net/http" + "strings" + "time" + + "github.com/nethesis/my/backend/configuration" +) + +var httpClient = &http.Client{Timeout: 30 * time.Second} + +// wrapForMimir wraps a raw Alertmanager YAML config in the Mimir multi-tenant +// format expected by POST /api/v1/alerts. +func wrapForMimir(yamlConfig string) string { + var sb strings.Builder + sb.WriteString("alertmanager_config: |\n") + for _, line := range strings.Split(yamlConfig, "\n") { + sb.WriteString(" ") + sb.WriteString(line) + sb.WriteString("\n") + } + return sb.String() +} + +// PushConfig uploads an alertmanager YAML configuration for the given tenant. 
+func PushConfig(orgID, yamlConfig string) error { + url := configuration.Config.MimirURL + "/api/v1/alerts" + + req, err := http.NewRequest(http.MethodPost, url, strings.NewReader(wrapForMimir(yamlConfig))) + if err != nil { + return fmt.Errorf("creating request: %w", err) + } + req.Header.Set("Content-Type", "application/yaml") + req.Header.Set("X-Scope-OrgID", orgID) + + resp, err := httpClient.Do(req) + if err != nil { + return fmt.Errorf("pushing config to mimir: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("mimir returned %d: %s", resp.StatusCode, string(body)) + } + + return nil +} + +// DeleteConfig removes the alertmanager configuration for the given tenant. +func DeleteConfig(orgID string) error { + url := configuration.Config.MimirURL + "/api/v1/alerts" + + req, err := http.NewRequest(http.MethodDelete, url, nil) + if err != nil { + return fmt.Errorf("creating request: %w", err) + } + req.Header.Set("X-Scope-OrgID", orgID) + + resp, err := httpClient.Do(req) + if err != nil { + return fmt.Errorf("deleting config from mimir: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("mimir returned %d: %s", resp.StatusCode, string(body)) + } + + return nil +} + +// GetAlerts fetches active alerts for the given tenant from Mimir. 
+func GetAlerts(orgID string) ([]byte, error) { + url := configuration.Config.MimirURL + "/alertmanager/api/v2/alerts" + + req, err := http.NewRequest(http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("creating request: %w", err) + } + req.Header.Set("X-Scope-OrgID", orgID) + req.Header.Set("Accept", "application/json") + + resp, err := httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("fetching alerts from mimir: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("reading response body: %w", err) + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, fmt.Errorf("mimir returned %d: %s", resp.StatusCode, string(body)) + } + + return body, nil +} + +// GetConfig fetches the current alertmanager configuration for the given tenant. +func GetConfig(orgID string) ([]byte, error) { + url := configuration.Config.MimirURL + "/api/v1/alerts" + + req, err := http.NewRequest(http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("creating request: %w", err) + } + req.Header.Set("X-Scope-OrgID", orgID) + req.Header.Set("Accept", "application/yaml") + + resp, err := httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("fetching config from mimir: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("reading response body: %w", err) + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, fmt.Errorf("mimir returned %d: %s", resp.StatusCode, string(body)) + } + + return body, nil +} diff --git a/backend/services/alerting/template.go b/backend/services/alerting/template.go new file mode 100644 index 000000000..4cb5fa912 --- /dev/null +++ b/backend/services/alerting/template.go @@ -0,0 +1,123 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. 
+SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package alerting + +import ( + "bytes" + "fmt" + "regexp" + "strconv" + "strings" + "text/template" + + "github.com/nethesis/my/backend/models" +) + +// yamlEscape sanitizes a string for safe inclusion in single-quoted YAML values. +// It strips newlines/carriage returns and doubles single quotes. +func yamlEscape(s string) string { + s = strings.ReplaceAll(s, "\n", "") + s = strings.ReplaceAll(s, "\r", "") + s = strings.ReplaceAll(s, "'", "''") + return s +} + +var validSeverityKey = regexp.MustCompile(`^[a-zA-Z0-9_]+$`) + +// templateData holds all values injected into the alertmanager YAML template +type templateData struct { + SmtpSmarthost string + SmtpFrom string + SmtpAuthUsername string + SmtpAuthPassword string + SmtpRequireTLS bool + Severities map[string]models.SeverityConfig +} + +const alertmanagerTemplate = `global: + resolve_timeout: 5m + smtp_smarthost: '{{ .SmtpSmarthost }}' + smtp_from: '{{ .SmtpFrom }}' + smtp_auth_username: '{{ .SmtpAuthUsername }}' + smtp_auth_password: '{{ .SmtpAuthPassword }}' + smtp_require_tls: {{ if .SmtpRequireTLS }}true{{ else }}false{{ end }} + +route: + receiver: 'blackhole' + group_by: ['alertname', 'system_key'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h +{{- if .Severities }} + + routes: +{{- range $severity, $cfg := .Severities }} + - matchers: + - severity="{{ $severity }}" +{{- range $cfg.Exceptions }} + - system_key!="{{ yamlEscape . }}" +{{- end }} + receiver: '{{ yamlEscape $severity }}-receiver' + continue: false +{{- end }} +{{- end }} + +receivers: + - name: 'blackhole' +{{- range $severity, $cfg := .Severities }} + + - name: '{{ yamlEscape $severity }}-receiver' + email_configs: +{{- range $cfg.Emails }} + - to: '{{ yamlEscape . 
}}' + send_resolved: true +{{- end }} +{{- range $cfg.Webhooks }} + webhook_configs: + - url: '{{ yamlEscape .URL }}' + send_resolved: true +{{- end }} +{{- end }} + +templates: [] +` + +// RenderConfig renders the alertmanager YAML configuration from the request body +// and SMTP settings. If severities is nil, it produces a blackhole-only config. +func RenderConfig(smtpHost string, smtpPort int, smtpUser, smtpPass, smtpFrom string, smtpTLS bool, severities models.AlertingConfigRequest) (string, error) { + for key := range severities { + if !validSeverityKey.MatchString(key) { + return "", fmt.Errorf("invalid severity key: %q", key) + } + } + + smarthost := smtpHost + if smtpPort > 0 { + smarthost = smtpHost + ":" + strconv.Itoa(smtpPort) + } + + data := templateData{ + SmtpSmarthost: smarthost, + SmtpFrom: smtpFrom, + SmtpAuthUsername: smtpUser, + SmtpAuthPassword: smtpPass, + SmtpRequireTLS: smtpTLS, + Severities: severities, + } + + funcMap := template.FuncMap{"yamlEscape": yamlEscape} + tmpl, err := template.New("alertmanager").Funcs(funcMap).Parse(alertmanagerTemplate) + if err != nil { + return "", err + } + + var buf bytes.Buffer + if err := tmpl.Execute(&buf, data); err != nil { + return "", err + } + + return buf.String(), nil +} diff --git a/collect/.env.example b/collect/.env.example index 789f16aec..f7950befa 100644 --- a/collect/.env.example +++ b/collect/.env.example @@ -86,6 +86,9 @@ REDIS_URL=redis://localhost:6379 #CIRCUIT_BREAKER_THRESHOLD=10 #CIRCUIT_BREAKER_TIMEOUT=60s +# Mimir metrics storage +#MIMIR_URL=http://localhost:9009 + # Logging configuration #LOG_LEVEL=info #LOG_FORMAT=json diff --git a/collect/.render-build-trigger b/collect/.render-build-trigger index 3bb7a5909..d461df4ae 100644 --- a/collect/.render-build-trigger +++ b/collect/.render-build-trigger @@ -2,9 +2,9 @@ # This file is used to force Docker service rebuilds in PR previews # Modify LAST_UPDATE to trigger rebuilds -LAST_UPDATE=2026-02-10T12:10:22Z 
+LAST_UPDATE=2026-02-26T15:22:50Z # Instructions: # 1. To force rebuild of Docker services in a PR, update LAST_UPDATE -# 2. Run: perl -i -pe "s/LAST_UPDATE=2026-02-10T12:10:22Z +# 2. Run: perl -i -pe "s/LAST_UPDATE=2026-02-26T15:22:50Z # 2. Commit and push changes to trigger Docker rebuilds \ No newline at end of file diff --git a/collect/configuration/configuration.go b/collect/configuration/configuration.go index 537d50126..b8d172b6c 100644 --- a/collect/configuration/configuration.go +++ b/collect/configuration/configuration.go @@ -80,6 +80,9 @@ type Configuration struct { // Heartbeat monitoring configuration HeartbeatTimeoutMinutes int `json:"heartbeat_timeout_minutes"` + + // Mimir configuration + MimirURL string `json:"mimir_url"` } var Config = Configuration{} @@ -161,6 +164,13 @@ func Init() { // Heartbeat monitoring configuration Config.HeartbeatTimeoutMinutes = parseIntWithDefault("HEARTBEAT_TIMEOUT_MINUTES", 10) + // Mimir configuration + if mimirURL := os.Getenv("MIMIR_URL"); mimirURL != "" { + Config.MimirURL = mimirURL + } else { + Config.MimirURL = "http://localhost:9009" + } + // Log successful configuration load logger.LogConfigLoad("env", "configuration", true, nil) } diff --git a/collect/main.go b/collect/main.go index a015f2e20..611987a6d 100644 --- a/collect/main.go +++ b/collect/main.go @@ -105,8 +105,11 @@ func main() { // Add security monitoring middleware router.Use(logger.SecurityMiddleware()) - // Add compression - router.Use(gzip.Gzip(gzip.DefaultCompression)) + // Add compression (excluding Mimir proxy endpoints to avoid double-compression) + router.Use(gzip.Gzip( + gzip.DefaultCompression, + gzip.WithExcludedPathsRegexs([]string{"^/api/services/mimir"}), + )) // CORS configuration in debug mode if gin.Mode() == gin.DebugMode { @@ -158,6 +161,19 @@ func main() { systemsGroup.GET("/rebranding/:product_id/:asset", methods.GetSystemRebrandingAsset) } + // =========================================== + // EXTERNAL SERVICES PROXY + // 
=========================================== + // Systems can access only the alertmanager alert and silence endpoints. + // All management APIs are reserved for future backend implementation. + mimirGroup := api.Group("/services/mimir", middleware.BasicAuthMiddleware()) + { + mimirGroup.Any("/alertmanager/api/v2/alerts", methods.ProxyMimir) + mimirGroup.Any("/alertmanager/api/v2/alerts/*subpath", methods.ProxyMimir) + mimirGroup.Any("/alertmanager/api/v2/silences", methods.ProxyMimir) + mimirGroup.Any("/alertmanager/api/v2/silences/*subpath", methods.ProxyMimir) + } + // Handle missing endpoints router.NoRoute(func(c *gin.Context) { c.JSON(http.StatusNotFound, response.NotFound("api not found", nil)) diff --git a/collect/methods/mimir.go b/collect/methods/mimir.go new file mode 100644 index 000000000..54b794dee --- /dev/null +++ b/collect/methods/mimir.go @@ -0,0 +1,109 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package methods + +import ( + "bytes" + "database/sql" + "fmt" + "io" + "net/http" + "strings" + + "github.com/gin-gonic/gin" + + "github.com/nethesis/my/collect/configuration" + "github.com/nethesis/my/collect/database" + "github.com/nethesis/my/collect/logger" + "github.com/nethesis/my/collect/response" +) + +// ProxyMimir forwards requests to Mimir on behalf of authenticated systems. +// BasicAuthMiddleware has already validated credentials and set "system_id" in the context. +// Route matching in main.go restricts access to /alertmanager/api/v2/alerts and +// /alertmanager/api/v2/silences; no further path checks are needed here. +// X-Scope-OrgID is always injected using the system's organization_id. 
+func ProxyMimir(c *gin.Context) { + subPath := strings.TrimPrefix(c.Request.URL.Path, "/api/services/mimir") + rawQuery := c.Request.URL.RawQuery + + // Resolve organization_id for X-Scope-OrgID injection + systemID, ok := getAuthenticatedSystemID(c) + if !ok { + logger.Warn().Str("reason", "missing system_id in context").Msg("mimir proxy auth failed") + c.JSON(http.StatusUnauthorized, response.Unauthorized("unauthorized", nil)) + return + } + + var orgID string + err := database.DB.QueryRow( + `SELECT organization_id FROM systems WHERE id = $1`, + systemID, + ).Scan(&orgID) + + if err == sql.ErrNoRows { + logger.Warn().Str("system_id", systemID).Str("reason", "system not found").Msg("mimir proxy: system lookup failed") + c.JSON(http.StatusUnauthorized, response.Unauthorized("unauthorized", nil)) + return + } + if err != nil { + logger.Error().Err(err).Str("system_id", systemID).Msg("mimir proxy: db query failed") + c.JSON(http.StatusInternalServerError, response.InternalServerError("internal server error", nil)) + return + } + + // Buffer request body once so it can be replayed across retry attempts + bodyBytes, err := io.ReadAll(c.Request.Body) + if err != nil { + logger.Error().Err(err).Msg("mimir proxy: failed to read request body") + c.JSON(http.StatusInternalServerError, response.InternalServerError("internal server error", nil)) + return + } + + // Forward request to Mimir + targetURL := fmt.Sprintf("%s%s", configuration.Config.MimirURL, subPath) + if rawQuery != "" { + targetURL += "?" 
+ rawQuery + } + + logger.Info().Str("target", targetURL).Str("org_id", orgID).Msg("mimir proxy: forwarding request") + + req, err := http.NewRequest(c.Request.Method, targetURL, bytes.NewReader(bodyBytes)) + if err != nil { + logger.Error().Err(err).Str("target", targetURL).Msg("mimir proxy: failed to create upstream request") + c.JSON(http.StatusInternalServerError, response.InternalServerError("internal server error", nil)) + return + } + + for _, header := range []string{"Content-Type", "Content-Encoding", "Accept", "User-Agent"} { + if val := c.GetHeader(header); val != "" { + req.Header.Set(header, val) + } + } + // Remove Accept-Encoding so Mimir sends plain JSON, not gzip + req.Header.Del("Accept-Encoding") + req.Header.Set("X-Scope-OrgID", orgID) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + logger.Error().Err(err).Str("target", targetURL).Msg("mimir proxy: network error") + c.JSON(http.StatusBadGateway, response.InternalServerError("mimir is unavailable", nil)) + return + } + defer func() { + if err := resp.Body.Close(); err != nil { + logger.Error().Err(err).Msg("mimir proxy: failed to close upstream response body") + } + }() + + if ct := resp.Header.Get("Content-Type"); ct != "" { + c.Header("Content-Type", ct) + } + c.Status(resp.StatusCode) + if _, err := io.Copy(c.Writer, resp.Body); err != nil { + logger.Error().Err(err).Msg("mimir proxy: error streaming response body") + } +} diff --git a/collect/pkg/version/VERSION b/collect/pkg/version/VERSION index 0d91a54c7..1d0ba9ea1 100644 --- a/collect/pkg/version/VERSION +++ b/collect/pkg/version/VERSION @@ -1 +1 @@ -0.3.0 +0.4.0 diff --git a/docs/en/08-alerting.md b/docs/en/08-alerting.md new file mode 100644 index 000000000..206f54d44 --- /dev/null +++ b/docs/en/08-alerting.md @@ -0,0 +1,227 @@ +# Alerting + +Learn how the My platform sends and manages alerts per organization using Grafana Mimir's multi-tenant Alertmanager. 
+ +## Overview + +My platform uses [Grafana Mimir](https://grafana.com/oss/mimir/)'s built-in multi-tenant Alertmanager. Each system belongs to an organization — the collect service resolves the system's `organization_id` from its credentials and injects it as the `X-Scope-OrgID` header before forwarding to Mimir, ensuring alerts are fully isolated between organizations. + +For complete API documentation, see the [Prometheus Alertmanager v2 OpenAPI Specification](https://github.com/prometheus/alertmanager/blob/main/api/v2/openapi.yaml). + +## Authentication + +Alertmanager API calls use the same credentials as system registration and inventory: + +| Field | Value | +|-------|-------| +| **Username** | `system_key` (e.g. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | +| **Password** | `system_secret` (e.g. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | +| **Method** | HTTP Basic Auth | + +No separate registration is needed — any system that has completed registration can immediately interact with the Alertmanager API. See [System Registration](05-system-registration.md) for how to obtain credentials. + +## Alertmanager API + +The collect service proxies Alertmanager API calls and automatically injects the `X-Scope-OrgID` header based on the authenticated system's organization. + +| Use Case | Path | +|----------|------| +| **Alerts** | `/api/services/mimir/alertmanager/api/v2/alerts` | +| **Silences** | `/api/services/mimir/alertmanager/api/v2/silences[/{silence_id}]` | + +## Common Examples + +### 1. 
Alert Management + +#### Inject an alert directly (Injection API) + +```bash +curl -X POST \ + -u "system_key:system_secret" \ + -H "Content-Type: application/json" \ + -H "Accept: application/json" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -d '[{ + "labels": { + "alertname": "HighCPU", + "severity": "critical", + "host": "server-01", + "system_key": "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE" + }, + "annotations": { + "summary": "CPU usage is too high", + "description": "CPU on server-01 is at 95%", + "runbook": "https://wiki.your-domain.com/high-cpu" + }, + "generatorURL": "https://prometheus.your-domain.com/graph", + "startsAt": "2024-01-15T10:30:00Z", + "endsAt": "0001-01-01T00:00:00Z" + }]' +``` + +**Response (200 OK)** - Alert successfully injected + +**Note on resolution:** Setting `endsAt` to `0001-01-01T00:00:00Z` means the alert remains active indefinitely until explicitly resolved. + +#### List active alerts + +```bash +curl -u "system_key:system_secret" \ + -H "Accept: application/json" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts +``` + +**Response (200 OK):** +```json +[ + { + "labels": { + "alertname": "HighCPU", + "severity": "critical", + "host": "server-01", + "system_key": "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE" + }, + "annotations": { + "summary": "CPU usage is too high", + "description": "CPU on server-01 is at 95%", + "runbook": "https://wiki.your-domain.com/high-cpu" + }, + "startsAt": "2024-01-15T10:30:00Z", + "endsAt": "0001-01-01T00:00:00Z", + "generatorURL": "https://prometheus.your-domain.com/graph", + "status": { + "state": "active", + "silencedBy": [], + "inhibitedBy": [] + } + } +] +``` + +#### Resolve an alert + +To resolve an alert, send the same alert with `endsAt` set to a past timestamp: + +```bash +curl -X POST \ + -u "system_key:system_secret" \ + -H "Content-Type: application/json" \ + -H "Accept: application/json" \ + 
https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -d '[{ + "labels": { + "alertname": "HighCPU", + "severity": "critical", + "host": "server-01", + "system_key": "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE" + }, + "annotations": { + "summary": "CPU usage is back to normal", + "description": "Issue resolved" + }, + "generatorURL": "https://prometheus.your-domain.com/graph", + "startsAt": "2024-01-15T10:30:00Z", + "endsAt": "2024-01-15T11:30:00Z" + }]' +``` + +--- + +### 2. Silence Management + +#### Create a silence + +```bash +curl -X POST \ + -u "system_key:system_secret" \ + -H "Content-Type: application/json" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences \ + -d '{ + "matchers": [ + { + "name": "alertname", + "value": "HighCPU", + "isRegex": false + }, + { + "name": "host", + "value": "server-01", + "isRegex": false + } + ], + "startsAt": "2024-01-15T10:00:00Z", + "endsAt": "2024-01-15T18:00:00Z", + "createdBy": "admin@your-domain.com", + "comment": "Planned maintenance on server-01" + }' +``` + +**Response (200 OK):** +```json +{ + "silenceID": "2b05304b-a71e-48c0-a877-bb4824e84969" +} +``` + +#### List active silences + +```bash +curl -u "system_key:system_secret" \ + -H "Accept: application/json" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences +``` + +**Response (200 OK):** List of all active silences and their configurations. + +#### Delete a silence + +```bash +curl -X DELETE \ + -u "system_key:system_secret" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silence/2b05304b-a71e-48c0-a877-bb4824e84969 +``` + +**Response (200 OK)** - Silence deleted + +## Troubleshooting + +### HTTP 401 Unauthorized + +**Cause:** Incorrect `system_key` or `system_secret`. + +**Solutions:** +1. Verify credentials match what is stored on the system +2. Ensure the system has completed registration (see [System Registration](05-system-registration.md)) +3. 
Check for leading/trailing spaces in the credentials +4. Test manually: + ```bash + curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" + ``` + A `200 OK` or `404 Not Found` response (not `401`) confirms authentication is working. + +### HTTP 500 Internal Server Error + +**Cause:** Mimir Alertmanager backend is unreachable or misconfigured. + +**Solutions:** +1. This is a platform-side issue — contact your administrator +2. Check platform status page or monitoring alerts +3. Retry after a few minutes; Mimir may be restarting + +### HTTP 400 Bad Request + +**Cause:** Request body is invalid (malformed JSON, missing required fields, etc.) + +**Solutions:** +1. Verify JSON is valid using an online tool like [jsonlint.com](https://www.jsonlint.com/) +2. Ensure all required fields are present +3. Check ISO 8601 date format (e.g. `2024-01-15T10:30:00Z`) + +## Related Documentation + +- [System Registration](05-system-registration.md) +- [Inventory and Heartbeat](06-inventory-heartbeat.md) +- [Systems Management](04-systems.md) +- [Mimir HTTP API Documentation](https://grafana.com/docs/mimir/latest/references/http-api/) +- [Prometheus Alertmanager v2 OpenAPI](https://github.com/prometheus/alertmanager/blob/main/api/v2/openapi.yaml) diff --git a/docs/en/index.md b/docs/en/index.md index 5f03c707c..79cdde6dc 100644 --- a/docs/en/index.md +++ b/docs/en/index.md @@ -174,7 +174,7 @@ graph LR ## Version Information -Current version: **0.3.0** (Pre-production) +Current version: **0.4.0** (Pre-production) This documentation is continuously updated. 
Last update: 2025-11-06 diff --git a/docs/it/08-alerting.md b/docs/it/08-alerting.md new file mode 100644 index 000000000..d4512625d --- /dev/null +++ b/docs/it/08-alerting.md @@ -0,0 +1,225 @@ +# Alerting + +Scopri come la piattaforma My gestisce alert e silenzi per organizzazione tramite Grafana Mimir Alertmanager. + +## Panoramica + +La piattaforma My utilizza l'Alertmanager multi-tenant di [Grafana Mimir](https://grafana.com/oss/mimir/) per gestire alert e silenzi. Ogni sistema appartiene a un'organizzazione: il servizio collect risolve l'`organization_id` dalle credenziali del sistema e lo inietta come header `X-Scope-OrgID` prima di inoltrare la richiesta a Mimir. Nessuna organizzazione può vedere o modificare i dati delle altre. + +## Autenticazione + +L'accesso all'API Alertmanager usa le stesse credenziali della registrazione del sistema e dell'inventario: + +| Campo | Valore | +|-------|--------| +| **Username** | `system_key` (es. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | +| **Password** | `system_secret` (es. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | +| **Metodo** | HTTP Basic Auth | + +Non è necessaria una registrazione separata — qualsiasi sistema che ha completato la registrazione può interagire con l'API Alertmanager immediatamente. Consulta [Registrazione Sistema](05-system-registration.md) per come ottenere le credenziali. + +## API Alertmanager + +I sistemi possono accedere solo ai seguenti endpoint: + +| Risorsa | Percorso | +|---------|----------| +| Alert | `/api/services/mimir/alertmanager/api/v2/alerts` | +| Silenzi | `/api/services/mimir/alertmanager/api/v2/silences[/{silence_id}]` | + +## Esempi Comuni + +### 1. 
Gestione Alert + +#### Aggiungere un alert direttamente (Injection API) + +```bash +curl -X POST \ + -u "system_key:system_secret" \ + -H "Content-Type: application/json" \ + -H "Accept: application/json" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -d '[{ + "labels": { + "alertname": "HighCPU", + "severity": "critical", + "host": "server-01", + "system_key": "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE" + }, + "annotations": { + "summary": "CPU usage è troppo alto", + "description": "CPU su server-01 è al 95%", + "runbook": "https://wiki.your-domain.com/high-cpu" + }, + "generatorURL": "https://prometheus.your-domain.com/graph", + "startsAt": "2024-01-15T10:30:00Z", + "endsAt": "0001-01-01T00:00:00Z" + }]' +``` + +**Risposta (200 OK)** - Alert aggiunto con successo + +**Nota sulla risoluzione:** Impostare `endsAt` su `0001-01-01T00:00:00Z` significa che l'alert rimane attivo indefinitamente finché non viene risolto esplicitamente. + +#### Recuperare gli alert attivi + +```bash +curl -u "system_key:system_secret" \ + -H "Accept: application/json" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts +``` + +**Risposta (200 OK):** +```json +[ + { + "labels": { + "alertname": "HighCPU", + "severity": "critical", + "host": "server-01", + "system_key": "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE" + }, + "annotations": { + "summary": "CPU usage è troppo alto", + "description": "CPU su server-01 è al 95%", + "runbook": "https://wiki.your-domain.com/high-cpu" + }, + "startsAt": "2024-01-15T10:30:00Z", + "endsAt": "0001-01-01T00:00:00Z", + "generatorURL": "https://prometheus.your-domain.com/graph", + "status": { + "state": "active", + "silencedBy": [], + "inhibitedBy": [] + } + } +] +``` + +#### Risolvere un alert + +Per risolvere un alert, invia lo stesso alert con `endsAt` impostato a un timestamp nel passato: + +```bash +curl -X POST \ + -u "system_key:system_secret" \ + -H "Content-Type: application/json" \ + -H 
"Accept: application/json" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -d '[{ + "labels": { + "alertname": "HighCPU", + "severity": "critical", + "host": "server-01", + "system_key": "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE" + }, + "annotations": { + "summary": "CPU usage è tornato alla norma", + "description": "Problema risolto" + }, + "generatorURL": "https://prometheus.your-domain.com/graph", + "startsAt": "2024-01-15T10:30:00Z", + "endsAt": "2024-01-15T11:30:00Z" + }]' +``` + +--- + +### 2. Gestione Silenzi + +#### Creare un silenzio + +```bash +curl -X POST \ + -u "system_key:system_secret" \ + -H "Content-Type: application/json" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences \ + -d '{ + "matchers": [ + { + "name": "alertname", + "value": "HighCPU", + "isRegex": false + }, + { + "name": "host", + "value": "server-01", + "isRegex": false + } + ], + "startsAt": "2024-01-15T10:00:00Z", + "endsAt": "2024-01-15T18:00:00Z", + "createdBy": "admin@your-domain.com", + "comment": "Manutenzione pianificata su server-01" + }' +``` + +**Risposta (200 OK):** +```json +{ + "silenceID": "2b05304b-a71e-48c0-a877-bb4824e84969" +} +``` + +#### Recuperare i silenzi attivi + +```bash +curl -u "system_key:system_secret" \ + -H "Accept: application/json" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences +``` + +**Risposta (200 OK):** Lista di tutti i silenzi attivi e le loro configurazioni. + +#### Eliminare un silenzio + +```bash +curl -X DELETE \ + -u "system_key:system_secret" \ + https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silence/2b05304b-a71e-48c0-a877-bb4824e84969 +``` + +**Risposta (200 OK)** - Silenzio eliminato + +## Risoluzione Problemi + +### HTTP 401 Unauthorized + +**Causa:** `system_key` o `system_secret` non corretti. + +**Soluzioni:** +1. Verifica che le credenziali corrispondano a quelle memorizzate sul sistema +2. 
Assicurati che il sistema abbia completato la registrazione (vedi [Registrazione Sistema](05-system-registration.md)) +3. Controlla eventuali spazi iniziali o finali nelle credenziali +4. Testa manualmente: + ```bash + curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" + ``` + Una risposta `200 OK` o `404 Not Found` (non 401) conferma che l'autenticazione funziona. + +### HTTP 500 Internal Server Error + +**Causa:** Il backend Mimir non è raggiungibile o è configurato in modo errato. + +**Soluzioni:** +1. Si tratta di un problema lato piattaforma — contatta il tuo amministratore +2. Controlla la pagina di stato della piattaforma o gli avvisi di monitoraggio +3. Riprova dopo qualche minuto; Mimir potrebbe essere in fase di riavvio + +### HTTP 400 Bad Request + +**Causa:** Il corpo della richiesta non è valido (JSON malformato, campi obbligatori mancanti, ecc.) + +**Soluzioni:** +1. Verifica che il JSON sia valido usando uno strumento online come [jsonlint.com](https://www.jsonlint.com/) +2. Controlla che tutti i campi obbligatori siano presenti +3. Verifica il formato delle date ISO 8601 (es. `2024-01-15T10:30:00Z`) + +## Documentazione Correlata + +- [Registrazione Sistema](05-system-registration.md) +- [Inventario e Heartbeat](06-inventory-heartbeat.md) +- [Gestione Sistemi](04-systems.md) +- [Mimir HTTP API Documentation](https://grafana.com/docs/mimir/latest/references/http-api/) +- [Prometheus Alertmanager v2 OpenAPI](https://github.com/prometheus/alertmanager/blob/main/api/v2/openapi.yaml) diff --git a/docs/it/index.md b/docs/it/index.md index b1890efc7..442f41ea1 100644 --- a/docs/it/index.md +++ b/docs/it/index.md @@ -174,7 +174,7 @@ graph LR ## Informazioni Versione -Versione corrente: **0.3.0** (Pre-produzione) +Versione corrente: **0.4.0** (Pre-produzione) Questa documentazione viene aggiornata continuamente. 
Ultimo aggiornamento: 2025-11-07 diff --git a/frontend/.render-build-trigger b/frontend/.render-build-trigger index 3bb7a5909..d461df4ae 100644 --- a/frontend/.render-build-trigger +++ b/frontend/.render-build-trigger @@ -2,9 +2,9 @@ # This file is used to force Docker service rebuilds in PR previews # Modify LAST_UPDATE to trigger rebuilds -LAST_UPDATE=2026-02-10T12:10:22Z +LAST_UPDATE=2026-02-26T15:22:50Z # Instructions: # 1. To force rebuild of Docker services in a PR, update LAST_UPDATE -# 2. Run: perl -i -pe "s/LAST_UPDATE=2026-02-10T12:10:22Z +# 2. Run: perl -i -pe "s/LAST_UPDATE=2026-02-26T15:22:50Z # 2. Commit and push changes to trigger Docker rebuilds \ No newline at end of file diff --git a/frontend/package.json b/frontend/package.json index c9390c277..0e92cf39e 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "my-nethesis-ui", - "version": "0.3.0", + "version": "0.4.0", "private": true, "type": "module", "scripts": { diff --git a/mkdocs.yml b/mkdocs.yml index 60966e595..864f3bc73 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -131,6 +131,7 @@ plugins: Systems Management: Gestione Sistemi System Registration: Registrazione Sistema Inventory & Heartbeat: Inventario e Heartbeat + Alerting: Alerting Developer Documentation: Documentazione Sviluppatori Main Project: Progetto Principale Backend API: API Backend @@ -172,6 +173,7 @@ nav: - Systems Management: 04-systems.md - System Registration: 05-system-registration.md - Inventory & Heartbeat: 06-inventory-heartbeat.md + - Alerting: 08-alerting.md - Developer Documentation: - Main Project: https://github.com/NethServer/my/blob/main/README.md - Backend API: https://github.com/NethServer/my/blob/main/backend/README.md diff --git a/proxy/.render-build-trigger b/proxy/.render-build-trigger index 3bb7a5909..d461df4ae 100644 --- a/proxy/.render-build-trigger +++ b/proxy/.render-build-trigger @@ -2,9 +2,9 @@ # This file is used to force Docker service rebuilds in PR previews # 
Modify LAST_UPDATE to trigger rebuilds -LAST_UPDATE=2026-02-10T12:10:22Z +LAST_UPDATE=2026-02-26T15:22:50Z # Instructions: # 1. To force rebuild of Docker services in a PR, update LAST_UPDATE -# 2. Run: perl -i -pe "s/LAST_UPDATE=2026-02-10T12:10:22Z +# 2. Run: perl -i -pe "s/LAST_UPDATE=2026-02-26T15:22:50Z # 2. Commit and push changes to trigger Docker rebuilds \ No newline at end of file diff --git a/proxy/nginx.conf b/proxy/nginx.conf index 0b900083c..a28b5f02a 100644 --- a/proxy/nginx.conf +++ b/proxy/nginx.conf @@ -99,4 +99,4 @@ http { proxy_read_timeout 30s; } } -} +} \ No newline at end of file diff --git a/render.yaml b/render.yaml index 2d9dae2b1..b1beaba7d 100644 --- a/render.yaml +++ b/render.yaml @@ -52,12 +52,8 @@ services: sync: false - key: MIMIR_S3_SECRET_KEY sync: false - - key: MIMIR_S3_BUCKET - sync: false - key: MIMIR_S3_ALERTMANAGER_BUCKET sync: false - - key: MIMIR_S3_RULER_BUCKET - sync: false # Production Backend API Server (Private Service) - type: web @@ -227,12 +223,8 @@ services: sync: false - key: MIMIR_S3_SECRET_KEY sync: false - - key: MIMIR_S3_BUCKET - sync: false - key: MIMIR_S3_ALERTMANAGER_BUCKET sync: false - - key: MIMIR_S3_RULER_BUCKET - sync: false autoDeploy: true branch: main pullRequestPreviewsEnabled: true @@ -284,6 +276,10 @@ services: - key: BACKEND_APP_SECRET sync: false + # Mimir Configuration + - key: MIMIR_URL + value: http://my-mimir-qa:9009 + # SMTP Configuration - key: SMTP_HOST sync: false @@ -399,4 +395,4 @@ services: property: host autoDeploy: true # Auto-deploy on every commit branch: main - pullRequestPreviewsEnabled: true # PR previews enabled \ No newline at end of file + pullRequestPreviewsEnabled: true # PR previews enabled diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 000000000..7a60b85e1 --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.pyc diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 000000000..e43ebaa2b --- 
/dev/null +++ b/scripts/README.md @@ -0,0 +1,292 @@ +# scripts/ + +Utility scripts for interacting with the MY platform. + +## alerting_config.py + +CLI to manage alerting configuration via the MY backend API (requires user credentials). +Handles the full Logto OIDC authentication flow automatically. + +### Requirements + +```bash +pip install requests +``` + +### Usage + +``` +python alerting_config.py --url URL --email EMAIL --password PASS [options] +``` + +**Common arguments:** + +| Argument | Description | +|----------|-------------| +| `--url` | Base URL of the MY proxy (e.g. `https://my.nethesis.it`) | +| `--email` | User email address | +| `--password` | User password | + +Owner, Distributor, and Reseller roles must pass `--org <ORG_ID>` to all commands. Customer role uses their own organization automatically. + +--- + +### Get current configuration + +```bash +python alerting_config.py --url https://my-proxy-qa-pr-42.onrender.com \ + --email admin@example.com --password 's3cr3t' \ + get --org veg2rx4p6lmo +``` + +--- + +### Set configuration + +Create a JSON file describing the per-severity routing: + +```json +{ + "critical": { + "emails": ["oncall@example.com"], + "webhooks": [{"name": "slack", "url": "https://hooks.slack.com/services/..."}], + "exceptions": ["NETH-XXXX-YYYY"] + }, + "warning": { + "emails": ["team@example.com"] + } +} +``` + +Then apply it: + +```bash +python alerting_config.py --url https://my-proxy-qa-pr-42.onrender.com \ + --email admin@example.com --password 's3cr3t' \ + set --org veg2rx4p6lmo --config my_config.json +``` + +| Option | Required | Description | +|--------|----------|-------------| +| `--org` | yes (non-Customer) | Target organization ID | +| `--config` | yes | Path to JSON config file | + +--- + +### Disable all alerts + +Replaces the Alertmanager config with a blackhole-only configuration: + +```bash +python alerting_config.py --url https://my-proxy-qa-pr-42.onrender.com \ + --email admin@example.com --password 's3cr3t' \ +
delete --org veg2rx4p6lmo +``` + +--- + +### List active alerts + +```bash +# All active alerts for the organization +python alerting_config.py --url https://my-proxy-qa-pr-42.onrender.com \ + --email admin@example.com --password 's3cr3t' \ + alerts --org veg2rx4p6lmo + +# Filter by severity and state +python alerting_config.py --url https://my-proxy-qa-pr-42.onrender.com \ + --email admin@example.com --password 's3cr3t' \ + alerts --org veg2rx4p6lmo --severity critical --state active +``` + +| Option | Description | +|--------|-------------| +| `--org` | Organization ID | +| `--state` | Filter by state: `active`, `suppressed`, `unprocessed` | +| `--severity` | Filter by severity: `critical`, `warning`, `info` | +| `--system-key` | Filter by system key label | + +--- + +### Full example workflow + +```bash +BASE="https://my-proxy-qa-pr-42.onrender.com" +EMAIL="admin@example.com" +PASS="s3cr3t" +ORG="veg2rx4p6lmo" + +# 1. Check current config +python alerting_config.py --url "$BASE" --email "$EMAIL" --password "$PASS" get --org "$ORG" + +# 2. Apply new config +python alerting_config.py --url "$BASE" --email "$EMAIL" --password "$PASS" \ + set --org "$ORG" --config my_config.json + +# 3. Verify it took effect +python alerting_config.py --url "$BASE" --email "$EMAIL" --password "$PASS" get --org "$ORG" + +# 4. Check for active alerts +python alerting_config.py --url "$BASE" --email "$EMAIL" --password "$PASS" alerts --org "$ORG" + +# 5. Disable alerts when done +python alerting_config.py --url "$BASE" --email "$EMAIL" --password "$PASS" delete --org "$ORG" +``` + +--- + +## alert.py + +CLI tool to push, resolve, silence, and list alerts via the Mimir Alertmanager proxy exposed by the collect service. 
+ +### Requirements + +```bash +pip install requests +``` + +### Usage + +``` +python alert.py --url URL --key KEY --secret SECRET [options] +``` + +**Common arguments:** + +| Argument | Description | +|----------|-------------| +| `--url` | Base URL of the Mimir proxy: `https://<host>/collect/api/services/mimir` | +| `--key` | System key (HTTP Basic Auth username) | +| `--secret` | System secret (HTTP Basic Auth password) | + +--- + +### Push an alert + +Fire an alert into Alertmanager. The alert stays active until resolved or it expires. + +```bash +python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NETH-XXXX-XXXX --secret 'my_pub.secret' \ + push \ + --alertname DiskFull \ + --severity critical \ + --labels host=prod-01 service=storage \ + --annotations "summary=Disk usage above 90%" "runbook=https://wiki/disk" +``` + +Options: + +| Option | Required | Description | +|--------|----------|-------------| +| `--alertname` | yes | Alert name | +| `--severity` | yes | `critical`, `warning`, or `info` | +| `--labels` | no | Extra labels as `key=value` pairs | +| `--annotations` | no | Annotations as `key=value` pairs | + +--- + +### Resolve an alert + +Resolve a previously pushed alert by sending it with an explicit end time. + +```bash +python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NETH-XXXX-XXXX --secret 'my_pub.secret' \ + resolve \ + --alertname DiskFull \ + --severity critical \ + --labels host=prod-01 +``` + +> Labels must match those used when the alert was pushed. + +Options: + +| Option | Required | Description | +|--------|----------|-------------| +| `--alertname` | yes | Alert name | +| `--severity` | yes | Must match the pushed alert | +| `--labels` | no | Must match the pushed alert labels | + +--- + +### Silence an alert + +Suppress notifications for a matching alert for a given duration. 
+ +```bash +python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NETH-XXXX-XXXX --secret 'my_pub.secret' \ + silence \ + --alertname DiskFull \ + --duration 120 \ + --comment "Planned maintenance window" \ + --created-by ops-team +``` + +Options: + +| Option | Required | Default | Description | +|--------|----------|---------|-------------| +| `--alertname` | yes | — | Alert name to silence | +| `--labels` | no | — | Extra label matchers as `key=value` pairs | +| `--duration` | no | 60 | Silence duration in minutes | +| `--comment` | no | `Silenced via alert.py` | Reason for the silence | +| `--created-by` | no | `alert.py` | Author of the silence | + +--- + +### List alerts + +List active alerts, with optional filters. + +```bash +# All active alerts +python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NETH-XXXX-XXXX --secret 'my_pub.secret' \ + list + +# Filter by severity +python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NETH-XXXX-XXXX --secret 'my_pub.secret' \ + list --severity critical + +# Filter by state +python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NETH-XXXX-XXXX --secret 'my_pub.secret' \ + list --state firing +``` + +Options: + +| Option | Description | +|--------|-------------| +| `--state` | Filter by alert state: `firing`, `pending`, `unprocessed` | +| `--severity` | Filter by severity label: `critical`, `warning`, `info` | + +--- + +### Full example workflow + +```bash +BASE="https://my-proxy-qa-pr-42.onrender.com/collect/api/services/mimir" +KEY="NETH-XXXX-XXXX" +SECRET="my_pub.secret" + +# 1. Fire a critical alert +python alert.py --url "$BASE" --key "$KEY" --secret "$SECRET" \ + push --alertname HighCPU --severity critical \ + --labels host=prod-01 --annotations "summary=CPU at 99%" + +# 2. 
List it +python alert.py --url "$BASE" --key "$KEY" --secret "$SECRET" list + +# 3. Silence it for 30 minutes +python alert.py --url "$BASE" --key "$KEY" --secret "$SECRET" \ + silence --alertname HighCPU --duration 30 --comment "Investigating" + +# 4. Resolve it +python alert.py --url "$BASE" --key "$KEY" --secret "$SECRET" \ + resolve --alertname HighCPU --severity critical --labels host=prod-01 +``` diff --git a/scripts/alert.py b/scripts/alert.py new file mode 100755 index 000000000..35b9633f9 --- /dev/null +++ b/scripts/alert.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +""" +Simple CLI to push, resolve, silence, and list alerts via the Mimir Alertmanager proxy. + +Usage: + python alert.py --url URL --key KEY --secret SECRET push --alertname NAME --severity SEV [--labels k=v ...] [--annotations k=v ...] + python alert.py --url URL --key KEY --secret SECRET resolve --alertname NAME --severity SEV [--labels k=v ...] + python alert.py --url URL --key KEY --secret SECRET silence --alertname NAME [--labels k=v ...] 
[--duration MINUTES] [--comment TEXT] [--created-by TEXT] + python alert.py --url URL --key KEY --secret SECRET list [--state STATE] [--severity SEV] + +Examples: + # Push a critical alert + python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NOC-XXXX-XXXX --secret 'my_pub.secretpart' \ + push --alertname HighCPU --severity critical \ + --labels host=server-01 --annotations summary="CPU too high" + + # Resolve it + python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NOC-XXXX-XXXX --secret 'my_pub.secretpart' \ + resolve --alertname HighCPU --severity critical \ + --labels host=server-01 + + # Silence an alert for 2 hours + python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NOC-XXXX-XXXX --secret 'my_pub.secretpart' \ + silence --alertname HighCPU --duration 120 --comment "Maintenance window" + + # List all alerts + python alert.py --url https://my.nethesis.it/collect/api/services/mimir \ + --key NOC-XXXX-XXXX --secret 'my_pub.secretpart' list +""" + +import argparse +import json +import sys +from datetime import datetime, timezone, timedelta + +try: + import requests +except ImportError: + print("Error: 'requests' library is required. 
Install it with: pip install requests", file=sys.stderr) + sys.exit(1) + + +def parse_kv(pairs): + """Parse a list of 'key=value' strings into a dict.""" + result = {} + for pair in pairs or []: + if "=" not in pair: + print(f"Error: invalid key=value pair: {pair}", file=sys.stderr) + sys.exit(1) + k, v = pair.split("=", 1) + result[k] = v + return result + + +def push_alert(args): + """Push (fire) an alert.""" + labels = { + "alertname": args.alertname, + "severity": args.severity, + "system_key": args.key, + } + labels.update(parse_kv(args.labels)) + + annotations = parse_kv(args.annotations) + + payload = [{ + "labels": labels, + "annotations": annotations, + "generatorURL": f"http://{args.key}/alert", + "startsAt": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "endsAt": "0001-01-01T00:00:00Z", + }] + + url = f"{args.url.rstrip('/')}/alertmanager/api/v2/alerts" + resp = requests.post( + url, + json=payload, + auth=(args.key, args.secret), + headers={"Accept": "application/json"}, + timeout=30, + ) + + if resp.ok: + print(f"Alert '{args.alertname}' pushed successfully (HTTP {resp.status_code})") + else: + print(f"Failed to push alert (HTTP {resp.status_code}): {resp.text}", file=sys.stderr) + sys.exit(1) + + +def resolve_alert(args): + """Resolve an alert by sending it with endsAt in the past.""" + labels = { + "alertname": args.alertname, + "severity": args.severity, + "system_key": args.key, + } + labels.update(parse_kv(args.labels)) + + now = datetime.now(timezone.utc) + payload = [{ + "labels": labels, + "annotations": {"summary": "resolved"}, + "generatorURL": f"http://{args.key}/alert", + "startsAt": (now - timedelta(hours=1)).strftime("%Y-%m-%dT%H:%M:%SZ"), + "endsAt": now.strftime("%Y-%m-%dT%H:%M:%SZ"), + }] + + url = f"{args.url.rstrip('/')}/alertmanager/api/v2/alerts" + resp = requests.post( + url, + json=payload, + auth=(args.key, args.secret), + headers={"Accept": "application/json"}, + timeout=30, + ) + + if resp.ok: + print(f"Alert 
'{args.alertname}' resolved successfully (HTTP {resp.status_code})") + else: + print(f"Failed to resolve alert (HTTP {resp.status_code}): {resp.text}", file=sys.stderr) + sys.exit(1) + + +def silence_alert(args): + """Create a silence for an alert.""" + now = datetime.now(timezone.utc) + ends_at = now + timedelta(minutes=args.duration) + + matchers = [{"name": "alertname", "value": args.alertname, "isRegex": False}] + for k, v in parse_kv(args.labels).items(): + matchers.append({"name": k, "value": v, "isRegex": False}) + + payload = { + "matchers": matchers, + "startsAt": now.strftime("%Y-%m-%dT%H:%M:%SZ"), + "endsAt": ends_at.strftime("%Y-%m-%dT%H:%M:%SZ"), + "comment": args.comment, + "createdBy": args.created_by, + } + + url = f"{args.url.rstrip('/')}/alertmanager/api/v2/silences" + resp = requests.post( + url, + json=payload, + auth=(args.key, args.secret), + headers={"Content-Type": "application/json", "Accept": "application/json"}, + timeout=30, + ) + + if resp.ok: + data = resp.json() + print(f"Silence created for '{args.alertname}' (ID: {data.get('silenceID', 'unknown')}, duration: {args.duration}m)") + else: + print(f"Failed to create silence (HTTP {resp.status_code}): {resp.text}", file=sys.stderr) + sys.exit(1) + + +def list_alerts(args): + """List active alerts.""" + url = f"{args.url.rstrip('/')}/alertmanager/api/v2/alerts" + resp = requests.get( + url, + auth=(args.key, args.secret), + headers={"Accept": "application/json"}, + timeout=30, + ) + + if not resp.ok: + print(f"Failed to list alerts (HTTP {resp.status_code}): {resp.text}", file=sys.stderr) + sys.exit(1) + + alerts = resp.json() + + # Optional filters + if args.state: + alerts = [a for a in alerts if a.get("status", {}).get("state") == args.state] + if args.severity: + alerts = [a for a in alerts if a.get("labels", {}).get("severity") == args.severity] + + if not alerts: + print("No alerts found.") + return + + print(json.dumps(alerts, indent=2)) + + +def main(): + parser = 
argparse.ArgumentParser(description="Manage Mimir Alertmanager alerts") + parser.add_argument("--url", required=True, help="Base URL of the Mimir proxy (e.g. https://my.nethesis.it/collect/api/services/mimir)") + parser.add_argument("--key", required=True, help="system_key for HTTP Basic Auth") + parser.add_argument("--secret", required=True, help="system_secret for HTTP Basic Auth") + + sub = parser.add_subparsers(dest="command", required=True) + + # push + push_parser = sub.add_parser("push", help="Push (fire) an alert") + push_parser.add_argument("--alertname", required=True, help="Alert name") + push_parser.add_argument("--severity", required=True, choices=["critical", "warning", "info"], help="Severity level") + push_parser.add_argument("--labels", nargs="*", help="Additional labels as key=value pairs") + push_parser.add_argument("--annotations", nargs="*", help="Annotations as key=value pairs") + + # resolve + resolve_parser = sub.add_parser("resolve", help="Resolve an alert") + resolve_parser.add_argument("--alertname", required=True, help="Alert name") + resolve_parser.add_argument("--severity", required=True, choices=["critical", "warning", "info"], help="Severity level") + resolve_parser.add_argument("--labels", nargs="*", help="Additional labels as key=value pairs (must match the fired alert)") + + # silence + silence_parser = sub.add_parser("silence", help="Silence an alert") + silence_parser.add_argument("--alertname", required=True, help="Alert name to silence") + silence_parser.add_argument("--labels", nargs="*", help="Additional label matchers as key=value pairs") + silence_parser.add_argument("--duration", type=int, default=60, help="Silence duration in minutes (default: 60)") + silence_parser.add_argument("--comment", default="Silenced via alert.py", help="Reason for the silence") + silence_parser.add_argument("--created-by", default="alert.py", dest="created_by", help="Author of the silence") + + # list + list_parser = sub.add_parser("list", 
help="List active alerts") + list_parser.add_argument("--state", help="Filter by state (e.g. firing, pending)") + list_parser.add_argument("--severity", help="Filter by severity label") + + args = parser.parse_args() + + if args.command == "push": + push_alert(args) + elif args.command == "resolve": + resolve_alert(args) + elif args.command == "silence": + silence_alert(args) + elif args.command == "list": + list_alerts(args) + + +if __name__ == "__main__": + main() diff --git a/scripts/alerting_config.py b/scripts/alerting_config.py new file mode 100644 index 000000000..166286845 --- /dev/null +++ b/scripts/alerting_config.py @@ -0,0 +1,328 @@ +#!/usr/bin/env python3 +""" +CLI to manage alerting configuration via the MY backend API. + +The backend requires a custom JWT obtained through the Logto OIDC flow. +This script handles authentication automatically given user credentials. + +Usage: + python alerting_config.py --url URL --email EMAIL --password PASS get [--org ORG_ID] + python alerting_config.py --url URL --email EMAIL --password PASS set [--org ORG_ID] --config FILE + python alerting_config.py --url URL --email EMAIL --password PASS delete [--org ORG_ID] + python alerting_config.py --url URL --email EMAIL --password PASS alerts [--org ORG_ID] [--state STATE] [--severity SEV] [--system-key KEY] + +Examples: + # Get current alerting config + python alerting_config.py \\ + --url https://my-proxy-qa-pr-42.onrender.com \\ + --email admin@example.com --password 's3cr3t' get + + # Configure alerts from a JSON file (owner/distributor must pass --org) + python alerting_config.py \\ + --url https://my-proxy-qa-pr-42.onrender.com \\ + --email admin@example.com --password 's3cr3t' \\ + set --org veg2rx4p6lmo --config config.json + + # Disable all alerts + python alerting_config.py \\ + --url https://my-proxy-qa-pr-42.onrender.com \\ + --email admin@example.com --password 's3cr3t' delete --org veg2rx4p6lmo + + # List active alerts + python alerting_config.py \\ + 
--url https://my-proxy-qa-pr-42.onrender.com \\ + --email admin@example.com --password 's3cr3t' alerts --org veg2rx4p6lmo + +Config file format (JSON): + { + "critical": { + "emails": ["oncall@example.com"], + "webhooks": [{"name": "slack", "url": "https://hooks.slack.com/..."}], + "exceptions": ["NETH-XXXX-XXXX"] + }, + "warning": { + "emails": ["team@example.com"] + } + } +""" + +import argparse +import base64 +import hashlib +import json +import secrets +import sys +from urllib.parse import parse_qs, urlparse + +try: + import requests +except ImportError: + print("Error: 'requests' library is required. Install it with: pip install requests", file=sys.stderr) + sys.exit(1) + +# Logto app configuration for the MY platform +_LOGTO_ENDPOINT = "https://qa.id.nethesis.it" +_LOGTO_APP_ID = "amz2744kof0iq3a6i7csu" +_LOGTO_REDIRECT_URI_PATH = "login-redirect" + + +def _logto_login(base_url: str, email: str, password: str) -> str: + """ + Authenticate via Logto OIDC + backend token exchange. + Returns the custom JWT for use with the backend API. 
+ """ + redirect_uri = f"{base_url.rstrip('/')}/{_LOGTO_REDIRECT_URI_PATH}" + backend_url = f"{base_url.rstrip('/')}/backend/api" + + session = requests.Session() + + # PKCE setup + code_verifier = secrets.token_urlsafe(64) + code_challenge = ( + base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode()).digest()) + .rstrip(b"=") + .decode() + ) + state = secrets.token_urlsafe(16) + + # Step 1: Start OIDC authorization flow + session.get( + f"{_LOGTO_ENDPOINT}/oidc/auth", + params={ + "client_id": _LOGTO_APP_ID, + "redirect_uri": redirect_uri, + "response_type": "code", + "scope": "openid profile email offline_access " + "urn:logto:scope:organizations urn:logto:scope:organization_roles", + "state": state, + "code_challenge": code_challenge, + "code_challenge_method": "S256", + }, + allow_redirects=False, + ) + + # Step 2: Logto interaction API — sign in with email/password + session.put(f"{_LOGTO_ENDPOINT}/api/interaction", json={"event": "SignIn"}) + r = session.patch( + f"{_LOGTO_ENDPOINT}/api/interaction/identifiers", + json={"email": email, "password": password}, + ) + if r.status_code == 422: + print(f"Authentication failed: {r.json().get('message', r.text)}", file=sys.stderr) + sys.exit(1) + if not r.ok: + print(f"Authentication error ({r.status_code}): {r.text}", file=sys.stderr) + sys.exit(1) + + # Step 3: Submit sign-in + r4 = session.post(f"{_LOGTO_ENDPOINT}/api/interaction/submit") + redirect_to = r4.json().get("redirectTo") + if not redirect_to: + print(f"Unexpected sign-in response: {r4.text}", file=sys.stderr) + sys.exit(1) + + # Step 4: Handle optional consent screen + r5 = session.get(redirect_to, allow_redirects=False) + if "consent" in r5.headers.get("Location", ""): + session.get(f"{_LOGTO_ENDPOINT}{r5.headers['Location']}", allow_redirects=False) + r_consent = session.post(f"{_LOGTO_ENDPOINT}/api/interaction/consent") + redirect_to = r_consent.json().get("redirectTo") + + # Step 5: Follow final redirect to get the auth code + r_final = 
session.get(redirect_to, allow_redirects=False) + callback_url = r_final.headers.get("Location", "") + qs = parse_qs(urlparse(callback_url).query) + code = qs.get("code", [None])[0] + if not code: + print(f"Failed to obtain authorization code from: {callback_url}", file=sys.stderr) + sys.exit(1) + + # Step 6: Exchange code for Logto access token + r6 = requests.post( + f"{_LOGTO_ENDPOINT}/oidc/token", + data={ + "grant_type": "authorization_code", + "code": code, + "redirect_uri": redirect_uri, + "client_id": _LOGTO_APP_ID, + "code_verifier": code_verifier, + }, + ) + logto_token = r6.json().get("access_token") + if not logto_token: + print(f"Failed to get Logto access token: {r6.text}", file=sys.stderr) + sys.exit(1) + + # Step 7: Exchange Logto token for custom backend JWT + r7 = requests.post( + f"{backend_url}/auth/exchange", + json={"access_token": logto_token}, + headers={"Content-Type": "application/json"}, + ) + token = r7.json().get("data", {}).get("token") + if not token: + print(f"Failed to exchange token: {r7.text}", file=sys.stderr) + sys.exit(1) + + return token + + +def _get_my_org(backend_url: str, headers: dict) -> str: + """Fetch the authenticated user's own organization_id.""" + r = requests.get(f"{backend_url}/me", headers=headers) + if not r.ok: + print(f"Failed to get user info ({r.status_code}): {r.text}", file=sys.stderr) + sys.exit(1) + return r.json()["data"]["organization_id"] + + +def _resolve_org(args, backend_url: str, headers: dict) -> str | None: + """ + Return the organization_id to use for the request. + If --org is given explicitly, use it. + If not given, return None so the backend resolves the organization (works for the + Customer role; Owner/Distributor/Reseller must provide --org explicitly). 
+ """ + if args.org: + return args.org + return None # Let the backend resolve it (works for Customer role) + + +def cmd_get(args): + jwt = _logto_login(args.url, args.email, args.password) + backend_url = f"{args.url.rstrip('/')}/backend/api" + headers = {"Authorization": f"Bearer {jwt}"} + + params = {} + if args.org: + params["organization_id"] = args.org + + r = requests.get(f"{backend_url}/alerting/config", headers=headers, params=params) + data = r.json() + if not r.ok: + print(f"Error ({r.status_code}): {data.get('message', r.text)}", file=sys.stderr) + sys.exit(1) + + config_yaml = data.get("data", {}).get("config", "") + print(config_yaml) + + +def cmd_set(args): + jwt = _logto_login(args.url, args.email, args.password) + backend_url = f"{args.url.rstrip('/')}/backend/api" + headers = {"Authorization": f"Bearer {jwt}", "Content-Type": "application/json"} + + try: + with open(args.config) as f: + config = json.load(f) + except (OSError, json.JSONDecodeError) as e: + print(f"Error reading config file: {e}", file=sys.stderr) + sys.exit(1) + + params = {} + if args.org: + params["organization_id"] = args.org + + r = requests.post(f"{backend_url}/alerting/config", headers=headers, params=params, json=config) + data = r.json() + if not r.ok: + print(f"Error ({r.status_code}): {data.get('message', r.text)}", file=sys.stderr) + sys.exit(1) + + print(f"Alerting configuration updated successfully.") + + +def cmd_delete(args): + jwt = _logto_login(args.url, args.email, args.password) + backend_url = f"{args.url.rstrip('/')}/backend/api" + headers = {"Authorization": f"Bearer {jwt}"} + + params = {} + if args.org: + params["organization_id"] = args.org + + r = requests.delete(f"{backend_url}/alerting/config", headers=headers, params=params) + data = r.json() + if not r.ok: + print(f"Error ({r.status_code}): {data.get('message', r.text)}", file=sys.stderr) + sys.exit(1) + + print("All alerts disabled successfully.") + + +def cmd_alerts(args): + jwt = 
_logto_login(args.url, args.email, args.password) + backend_url = f"{args.url.rstrip('/')}/backend/api" + headers = {"Authorization": f"Bearer {jwt}"} + + params = {} + if args.org: + params["organization_id"] = args.org + if args.state: + params["state"] = args.state + if args.severity: + params["severity"] = args.severity + if args.system_key: + params["system_key"] = args.system_key + + r = requests.get(f"{backend_url}/alerting/alerts", headers=headers, params=params) + data = r.json() + if not r.ok: + print(f"Error ({r.status_code}): {data.get('message', r.text)}", file=sys.stderr) + sys.exit(1) + + alerts = data.get("data", {}).get("alerts", []) + if not alerts: + print("No alerts found.") + return + + print(json.dumps(alerts, indent=2)) + + +def main(): + parser = argparse.ArgumentParser(description="Manage MY alerting configuration via the backend API") + parser.add_argument("--url", required=True, + help="Base URL of the MY proxy (e.g. https://my.nethesis.it)") + parser.add_argument("--email", required=True, help="User email for authentication") + parser.add_argument("--password", required=True, help="User password for authentication") + + sub = parser.add_subparsers(dest="command", required=True) + + # get + get_p = sub.add_parser("get", help="Get current alerting configuration") + get_p.add_argument("--org", help="Organization ID (required for Owner/Distributor/Reseller roles)") + + # set + set_p = sub.add_parser("set", help="Configure alerting from a JSON file") + set_p.add_argument("--org", help="Organization ID (required for Owner/Distributor/Reseller roles)") + set_p.add_argument("--config", required=True, metavar="FILE", + help="JSON file with the alerting configuration (see --help for format)") + + # delete + del_p = sub.add_parser("delete", help="Disable all alerts (blackhole config)") + del_p.add_argument("--org", help="Organization ID (required for Owner/Distributor/Reseller roles)") + + # alerts + alerts_p = sub.add_parser("alerts", help="List 
active alerts") + alerts_p.add_argument("--org", help="Organization ID (required for Owner/Distributor/Reseller roles)") + alerts_p.add_argument("--state", choices=["active", "suppressed", "unprocessed"], + help="Filter by alert state") + alerts_p.add_argument("--severity", choices=["critical", "warning", "info"], + help="Filter by severity label") + alerts_p.add_argument("--system-key", dest="system_key", + help="Filter by system_key label") + + args = parser.parse_args() + + if args.command == "get": + cmd_get(args) + elif args.command == "set": + cmd_set(args) + elif args.command == "delete": + cmd_delete(args) + elif args.command == "alerts": + cmd_alerts(args) + + +if __name__ == "__main__": + main() diff --git a/services/mimir/.env.example b/services/mimir/.env.example index 14910d081..1d15ba8ef 100644 --- a/services/mimir/.env.example +++ b/services/mimir/.env.example @@ -1,10 +1,8 @@ -# Mimir / Metrics Stack - environment variables +# Mimir / Alerting Stack - environment variables # Copy to mimir/.env and fill in actual values # S3-compatible storage credentials (DigitalOcean Spaces or AWS S3) MIMIR_S3_ENDPOINT=ams3.digitaloceanspaces.com MIMIR_S3_ACCESS_KEY=your-access-key MIMIR_S3_SECRET_KEY=your-secret-key -MIMIR_S3_BUCKET=your-mimir-blocks-bucket MIMIR_S3_ALERTMANAGER_BUCKET=your-mimir-alertmanager-bucket -MIMIR_S3_RULER_BUCKET=your-mimir-ruler-bucket diff --git a/services/mimir/.render-build-trigger b/services/mimir/.render-build-trigger index 9249691c6..d57bc5a20 100644 --- a/services/mimir/.render-build-trigger +++ b/services/mimir/.render-build-trigger @@ -2,7 +2,7 @@ # This file is used to force Docker service rebuilds in PR previews # Modify LAST_UPDATE to trigger rebuilds -LAST_UPDATE=2025-10-22T15:22:55Z +LAST_UPDATE=2026-02-26T15:22:50Z # Instructions: # 1. 
To force rebuild of Docker services in a PR, update LAST_UPDATE diff --git a/services/mimir/Containerfile b/services/mimir/Containerfile index 7996785b0..3b4b631da 100644 --- a/services/mimir/Containerfile +++ b/services/mimir/Containerfile @@ -15,6 +15,9 @@ COPY .render-build-trigger /tmp/build-trigger # Copy Mimir config template COPY my.yaml /etc/mimir/my.yaml.template +# Copy default runtime configuration (per-tenant overrides, reloaded every 10s) +COPY runtime_config.yaml /etc/mimir/runtime_config.yaml + # Copy entrypoint script (must be executable in the repo) COPY entrypoint.sh /entrypoint.sh diff --git a/services/mimir/README.md b/services/mimir/README.md index d6e77f08a..5360b6a49 100644 --- a/services/mimir/README.md +++ b/services/mimir/README.md @@ -1,16 +1,16 @@ -# Mimir — Metrics Infrastructure +# Mimir — Alerting Infrastructure -Grafana Mimir provides long-term metrics storage for the MY platform, deployed as a single node on a dedicated VM (Server B). The collect service on Server A writes metrics to Mimir and proxies read queries. +Grafana Mimir runs as a multi-tenant **Alertmanager** (`-target=alertmanager`) for the MY platform, deployed on a dedicated VM (Server B). It does **not** ingest metrics. The collect service on Server A routes alert notifications through Mimir's Alertmanager API. ## Topology ``` ┌──────────────────────────────────┐ ┌──────────────────────────────────┐ -│ Server A (main app) │ │ Server B (metrics VM) │ +│ Server A (main app) │ │ Server B (alerting VM) │ │ │ │ │ -│ collect ──/api/services/mimir──►│◄────│ mimir (port 19009) │ -│ backend │ │ └── S3 storage │ -│ frontend │ │ │ +│ collect ──/api/services/mimir──►──────► mimir (port 19009) │ +│ backend │ │ -target=alertmanager │ +│ frontend │ │ └── S3 alertmanager state │ │ nginx proxy │ │ │ └──────────────────────────────────┘ └──────────────────────────────────┘ ``` @@ -55,15 +55,13 @@ Should return `ready`. 
| `MIMIR_S3_ENDPOINT` | S3-compatible storage endpoint | `ams3.digitaloceanspaces.com` | | `MIMIR_S3_ACCESS_KEY` | S3 access key | `your-access-key` | | `MIMIR_S3_SECRET_KEY` | S3 secret key | `your-secret-key` | -| `MIMIR_S3_BUCKET` | Bucket for blocks (TSDB chunks) | `my-mimir-blocks` | | `MIMIR_S3_ALERTMANAGER_BUCKET` | Bucket for Alertmanager state | `my-mimir-alertmanager` | -| `MIMIR_S3_RULER_BUCKET` | Bucket for recording/alert rules | `my-mimir-ruler` | Copy `services/mimir/.env.example` to `services/mimir/.env` and fill in every value before starting the stack. ## Architecture -Mimir runs as a single node with `replication_factor: 1`. It uses three S3 buckets (blocks, alertmanager, ruler) for persistent storage. Multitenancy is enabled; all writes from `collect` include the tenant ID resolved from the system's organization. +Mimir runs as an alertmanager-only target (`-target=alertmanager`). It uses a single S3 bucket for persistent Alertmanager state. Multitenancy is enabled; all requests from `collect` include the tenant ID resolved from the system's organization. The config template (`services/mimir/my.yaml`) uses `${VAR}` placeholders that are expanded at container startup by `entrypoint.sh` via `envsubst`. diff --git a/services/mimir/VERSION b/services/mimir/VERSION index 0d91a54c7..1d0ba9ea1 100644 --- a/services/mimir/VERSION +++ b/services/mimir/VERSION @@ -1 +1 @@ -0.3.0 +0.4.0 diff --git a/services/mimir/docker-compose.yml b/services/mimir/docker-compose.yml index 4fd62afc1..732662de0 100644 --- a/services/mimir/docker-compose.yml +++ b/services/mimir/docker-compose.yml @@ -1,4 +1,4 @@ -# Metrics Infrastructure - Dedicated VM deployment +# Alerting Infrastructure - Dedicated VM deployment # # Run on a separate server from the main application stack. 
# @@ -10,7 +10,7 @@ # # ⚙️ Required environment variables (set in mimir/.env or shell): # MIMIR_S3_ENDPOINT, MIMIR_S3_ACCESS_KEY, MIMIR_S3_SECRET_KEY -# MIMIR_S3_BUCKET, MIMIR_S3_ALERTMANAGER_BUCKET, MIMIR_S3_RULER_BUCKET +# MIMIR_S3_ALERTMANAGER_BUCKET version: '3.8' @@ -30,9 +30,7 @@ services: MIMIR_S3_ENDPOINT: ${MIMIR_S3_ENDPOINT} MIMIR_S3_ACCESS_KEY: ${MIMIR_S3_ACCESS_KEY} MIMIR_S3_SECRET_KEY: ${MIMIR_S3_SECRET_KEY} - MIMIR_S3_BUCKET: ${MIMIR_S3_BUCKET} MIMIR_S3_ALERTMANAGER_BUCKET: ${MIMIR_S3_ALERTMANAGER_BUCKET} - MIMIR_S3_RULER_BUCKET: ${MIMIR_S3_RULER_BUCKET} ports: - "19009:9009" networks: diff --git a/services/mimir/entrypoint.sh b/services/mimir/entrypoint.sh index 843c153b3..077bbe224 100755 --- a/services/mimir/entrypoint.sh +++ b/services/mimir/entrypoint.sh @@ -7,5 +7,5 @@ export PORT echo "==> Expanding Mimir config..." envsubst < /etc/mimir/my.yaml.template > /tmp/mimir-config.yaml -echo "==> Starting Mimir on port ${PORT}..." -exec /bin/mimir --config.file=/tmp/mimir-config.yaml +echo "==> Starting Mimir with alertmanager only for alert support on port ${PORT}..." 
+exec /bin/mimir -config.file=/tmp/mimir-config.yaml diff --git a/services/mimir/my.yaml b/services/mimir/my.yaml index 769b6dead..f6adedb0a 100644 --- a/services/mimir/my.yaml +++ b/services/mimir/my.yaml @@ -1,5 +1,5 @@ multitenancy_enabled: true -target: all +target: alertmanager common: storage: @@ -9,29 +9,10 @@ common: secret_access_key: ${MIMIR_S3_SECRET_KEY} access_key_id: ${MIMIR_S3_ACCESS_KEY} -blocks_storage: - s3: - bucket_name: ${MIMIR_S3_BUCKET} - alertmanager_storage: s3: bucket_name: ${MIMIR_S3_ALERTMANAGER_BUCKET} -ruler_storage: - s3: - bucket_name: ${MIMIR_S3_RULER_BUCKET} - -compactor: - data_dir: /tmp/mimir/compactor - -ingester: - ring: - replication_factor: 1 - -store_gateway: - sharding_ring: - replication_factor: 1 - server: http_listen_port: ${PORT} - log_level: info + log_level: info \ No newline at end of file diff --git a/services/mimir/runtime_config.yaml b/services/mimir/runtime_config.yaml new file mode 100644 index 000000000..ba1404072 --- /dev/null +++ b/services/mimir/runtime_config.yaml @@ -0,0 +1,9 @@ +# Mimir runtime configuration — reloaded every 10s without restart. +# Use this file to set per-tenant Alertmanager limit overrides. +# +# Example: +# overrides: +# my-tenant-id: +# alertmanager_notification_rate_limit: 10 +# alertmanager_max_config_size_bytes: 1048576 +# alertmanager_max_alerts_count: 1000 diff --git a/sync/pkg/version/VERSION b/sync/pkg/version/VERSION index 0d91a54c7..1d0ba9ea1 100644 --- a/sync/pkg/version/VERSION +++ b/sync/pkg/version/VERSION @@ -1 +1 @@ -0.3.0 +0.4.0 diff --git a/version.json b/version.json index 5b61719be..b7baa9fa1 100644 --- a/version.json +++ b/version.json @@ -1,12 +1,12 @@ { - "version": "0.3.0", + "version": "0.4.0", "name": "My Nethesis", "components": { - "backend": "0.3.0", - "sync": "0.3.0", - "collect": "0.3.0", - "frontend": "0.3.0", - "proxy": "0.3.0", - "services/mimir": "0.3.0" + "backend": "0.4.0", + "sync": "0.4.0", + "collect": "0.4.0", + "frontend": "0.4.0", + "proxy": "0.4.0", + "services/mimir": "0.4.0" } }