Merge pull request #41 from bashbang/main

bashbang · web-flow · commit 5a8b4085be28 · 2024-10-29T11:05:01.000-07:00
release jq and yq into util pod.
diff --git a/tf-sysdig/.gitignore b/tf-sysdig/.gitignore
@@ -0,0 +1,36 @@
+# Local Terraform working directory (downloaded providers and modules)
+.terraform/
+# .terraform.lock.hcl is deliberately NOT ignored: commit it so that every
+# checkout runs `terraform init` against the same provider versions.
+
+# Terraform plan files
+*.tfplan
+
+# Crash logs
+crash.log
+
+# Terraform state files
+*.tfstate
+*.tfstate.*
+
+# Override files
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
+
+# Sensitive variable files
+*.tfvars
+*.tfvars.json
+
+# Terraform variable environment files
+.terraform.tfvars
+terraform.tfvars
+
+# Sensitive provider configuration files
+provider.tfvars
+
+# Generated files by IDEs or OS
+*.DS_Store
+*.log
+*.bak
+*.swp
diff --git a/tf-sysdig/alerts.tf b/tf-sysdig/alerts.tf
@@ -0,0 +1,43 @@
+# One Sysdig metric alert per entry of local.all_metrics. Every each.value.*
+# attribute below is read from that entry.
+resource "sysdig_monitor_alert_v2_metric" "pod_metrics" {
+  for_each = local.all_metrics
+
+  name              = each.value.name
+  description       = each.value.description
+  enabled           = each.value.enabled
+  severity          = each.value.severity
+  metric            = each.value.metric
+  group_aggregation = each.value.group_aggregation
+  time_aggregation  = each.value.time_aggregation
+  operator          = each.value.operator
+  threshold         = each.value.threshold
+  range_seconds     = each.value.range_seconds
+
+  # Restrict the alert to the namespace carried by this entry. Exactly one
+  # scope is ever emitted, so a plain block is used instead of a dynamic one.
+  scope {
+    label    = "kube_namespace_name"
+    operator = "equals"
+    values   = [each.value.namespace_name]
+  }
+
+  notification_channels {
+    # TODO: hard-coded channel id for now. This should be generated and referenced.
+    id                     = 238924
+    renotify_every_minutes = 60
+  }
+
+  # Subject comes from the entry; the prepend/append texts are the same for
+  # every alert.
+  custom_notification {
+    subject = each.value.notification_subject
+    prepend = "Alert Details:"
+    append  = "Please check the system immediately."
+  }
+}
diff --git a/tf-sysdig/locals.tf b/tf-sysdig/locals.tf
@@ -0,0 +1,141 @@
+locals {
+  # List of scope filters (label / operator / values): cluster "gold",
+  # namespace "abc123-dev" and deployment "project-wordpress".
+  # NOTE(review): nothing in the files shown with this change reads
+  # local.common_scope, and the namespace is pinned to dev - confirm whether
+  # it is still needed.
+  common_scope = [
+    {
+      label    = "kube_cluster_name"
+      operator = "equals"
+      values   = ["gold"]
+    },
+    {
+      label    = "kube_namespace_name"
+      operator = "equals"
+      values   = ["abc123-dev"]
+    },
+    {
+      label    = "kube_deployment_name"
+      operator = "equals"
+      values   = ["project-wordpress"]
+    }
+  ]
+
+  # Environment key => namespace name. all_metrics iterates this map: the key
+  # prefixes each alert's map key and display name, and the value becomes the
+  # entry's namespace_name.
+  environments = {
+    dev  = "abc123-dev"
+    test = "abc123-test"
+    prod = "abc123-prod"
+  }
+
+  # Alert templates keyed by display name. all_metrics expands every template
+  # once per environment, and the alert resource reads each field into the
+  # attribute of the same name (notification_subject becomes the subject of
+  # the custom notification).
+  pod_metrics = {
+    "Pod CPU Usage" = {
+      metric               = "sysdig_program_cpu_cores_used_percent"
+      description          = "Alert when 'sysdig_program_cpu_cores_used_percent' exceeds threshold"
+      enabled              = true
+      severity             = "high"
+      group_aggregation    = "avg"
+      time_aggregation     = "avg"
+      operator             = ">"
+      threshold            = 80
+      notification_subject = "Pod CPU Usage Alert Status"
+      range_seconds        = 60
+    }
+
+    "Pod Memory Usage" = {
+      metric               = "sysdig_program_memory_used_percent"
+      description          = "Alert when 'sysdig_program_memory_used_percent' exceeds threshold"
+      enabled              = true
+      severity             = "high"
+      group_aggregation    = "avg"
+      time_aggregation     = "avg"
+      operator             = ">"
+      threshold            = 80
+      notification_subject = "Pod Memory Usage Alert Status"
+      range_seconds        = 60
+    }
+
+    # The only template that uses group_aggregation "max".
+    "Pod Restarts" = {
+      metric               = "kube_pod_sysdig_restart_count"
+      description          = "Alert when 'kube_pod_sysdig_restart_count' exceeds threshold"
+      enabled              = true
+      severity             = "high"
+      group_aggregation    = "max"
+      time_aggregation     = "avg"
+      operator             = ">"
+      threshold            = 5
+      notification_subject = "Pod Restart Alert Status"
+      range_seconds        = 300
+    }
+
+    "HTTP Error Count" = {
+      metric               = "sysdig_container_net_http_error_count"
+      description          = "Alert when 'sysdig_container_net_http_error_count' exceeds the threshold"
+      enabled              = true
+      severity             = "high"
+      group_aggregation    = "avg"
+      time_aggregation     = "avg"
+      operator             = ">"
+      threshold            = 25
+      notification_subject = "Pod HTTP Error Count Alert"
+      range_seconds        = 300
+    }
+
+    # The same minimum of 3 is applied to every environment, because
+    # all_metrics copies this template unchanged for dev, test and prod.
+    "Replica Count Below Minimum" = {
+      metric               = "kube_deployment_status_replicas"
+      description          = "Alert when 'kube_deployment_status_replicas' falls below the threshold"
+      enabled              = true
+      severity             = "high"
+      group_aggregation    = "avg"
+      time_aggregation     = "avg"
+      operator             = "<"
+      threshold            = 3
+      notification_subject = "Replica Count Alert"
+      range_seconds        = 60
+    }
+
+    "Pod Ready Status" = {
+      metric               = "kube_pod_sysdig_status_ready"
+      description          = "Alert when 'kube_pod_sysdig_status_ready' falls below the threshold"
+      enabled              = true
+      severity             = "high"
+      group_aggregation    = "avg"
+      time_aggregation     = "avg"
+      operator             = "<"
+      threshold            = 1
+      notification_subject = "Pod Ready Status Alert"
+      range_seconds        = 60
+    }
+
+    # Same metric, operator and threshold as "Pod Ready Status"; only
+    # range_seconds (300 instead of 60) and the texts differ.
+    "Pod Unready Status" = {
+      metric               = "kube_pod_sysdig_status_ready"
+      description          = "Alert when 'kube_pod_sysdig_status_ready' is unready for more than 5 minutes"
+      enabled              = true
+      severity             = "high"
+      group_aggregation    = "avg"
+      time_aggregation     = "avg"
+      operator             = "<"
+      threshold            = 1
+      notification_subject = "Pod Unready Status Alert"
+      range_seconds        = 300
+    }
+  }
+
+  # Flat map with one entry per (environment, pod metric) pair, keyed
+  # "<env>-<metric name>". Each entry is the metric template from
+  # local.pod_metrics plus the alert display name and the namespace of that
+  # environment. merge() carries the template fields through, so they do not
+  # have to be copied one by one here.
+  all_metrics = merge([
+    for env, ns in local.environments : {
+      for metric_name, metric_info in local.pod_metrics :
+      "${env}-${metric_name}" => merge(metric_info, {
+        name           = "${env} - ${metric_name}"
+        namespace_name = ns
+      })
+    }
+  ]...)
+
+}
diff --git a/tf-sysdig/main.tf b/tf-sysdig/main.tf
@@ -0,0 +1,21 @@
+# Terraform settings: provider requirements and state backend.
+terraform {
+  required_providers {
+    sysdig = {
+      source = "sysdiglabs/sysdig"
+      # Pessimistic constraint: any 1.x release from 1.33 on, but never a new
+      # major version, which could change or remove the alert schema used here.
+      version = "~> 1.33"
+    }
+  }
+
+  # State is kept in a local file next to the configuration.
+  backend "local" {
+    path = "terraform.tfstate"
+  }
+}
+
+# Sysdig provider configuration: the Monitor URL is fixed to
+# https://app.sysdigcloud.com and the API token is taken from
+# var.sysdig_api_token.
+provider "sysdig" {
+  sysdig_monitor_url       = "https://app.sysdigcloud.com"
+  sysdig_monitor_api_token = var.sysdig_api_token
+}
+
+# Sysdig Monitor API token passed to the provider. Marked sensitive so that
+# Terraform redacts the value in plan and apply output.
+variable "sysdig_api_token" {
+  description = "Sysdig Monitor API token used to authenticate the sysdig provider."
+  type        = string
+  sensitive   = true
+}
diff --git a/tf-sysdig/notification_channel.tf b/tf-sysdig/notification_channel.tf
@@ -0,0 +1,9 @@
+# # Define a notification channel
+# resource "sysdig_monitor_notification_channel_email" "tf_sre" {
+#     name                    = "TF SRE"
+#     recipients              = ["chris@bashbang.com"]
+#     enabled                 = true
+#     notify_when_ok          = true
+#     notify_when_resolved    = true
+#     send_test_notification  = true
+# }
diff --git a/tf-sysdig/readme.md b/tf-sysdig/readme.md
@@ -0,0 +1,9 @@
+# How to use
+
+## This is currently a WIP and should only be used as a sample, i.e. a launching pad to expand on.
+
+Terraform v1.5.5
+on darwin_amd64
+
+```sh
+terraform init
+terraform apply
+```
diff --git a/tf-sysdig/terraform.tfvars.example b/tf-sysdig/terraform.tfvars.example
@@ -0,0 +1,2 @@
+# This token is a PAT in my user profile. In my case it was found here: https://app.sysdigcloud.com/#/settings/user under "Sysdig Monitor API"
+sysdig_api_token = "{THIS_IS_THE_GENERATED_TOKEN_FROM_SYSDIG}"
diff --git a/utility-pod/Dockerfile b/utility-pod/Dockerfile
@@ -14,6 +14,7 @@ RUN apt-get -y install \
     htop \
     iperf \
     iputils-ping \
+    jq \
     lynx \
     mysql-client \
     nmap \
@@ -27,6 +28,7 @@ RUN apt-get -y install \
     unzip \
     vim \
     wget \
+    yq \
     && \
     rm -rf /var/lib/apt/lists/*
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# This token is a PAT in my user profile. In my case it was found here: https://app.sysdigcloud.com/#/settings/user under "Sysdig Monitor API"`
	`2`	`+sysdig_api_token = "{THIS_IS_THE_GENERATED_TOKEN_FROM_SYSDIG}"`