Skip to content

Commit 5a8b408

Browse files
authored
Merge pull request #41 from bashbang/main
release jq and yq into util pod.
2 parents 4d3a4cd + a7120f2 commit 5a8b408

8 files changed

Lines changed: 263 additions & 0 deletions

File tree

tf-sysdig/.gitignore

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Local Terraform directories
2+
.terraform/
3+
.terraform.lock.hcl
4+
5+
# Terraform plan files
6+
*.tfplan
7+
8+
# Crash logs
9+
crash.log
10+
11+
# Terraform state files
12+
*.tfstate
13+
*.tfstate.*
14+
15+
# Override files
16+
override.tf
17+
override.tf.json
18+
*_override.tf
19+
*_override.tf.json
20+
21+
# Sensitive variable files
22+
*.tfvars
23+
*.tfvars.json
24+
25+
# Terraform variable environment files
26+
.terraform.tfvars
27+
terraform.tfvars
28+
29+
# Sensitive provider configuration files
30+
provider.tfvars
31+
32+
# Files generated by IDEs or the OS
33+
*.DS_Store
34+
*.log
35+
*.bak
36+
*.swp

tf-sysdig/alerts.tf

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Define Sysdig alerts with detailed attributes for all environments and metrics
2+
resource "sysdig_monitor_alert_v2_metric" "pod_metrics" {
3+
for_each = local.all_metrics
4+
5+
name = each.value.name
6+
description = each.value.description
7+
enabled = each.value.enabled
8+
severity = each.value.severity
9+
metric = each.value.metric
10+
group_aggregation = each.value.group_aggregation
11+
time_aggregation = each.value.time_aggregation
12+
operator = each.value.operator
13+
threshold = each.value.threshold
14+
15+
dynamic "scope" {
16+
for_each = [
17+
{
18+
label = "kube_namespace_name"
19+
operator = "equals"
20+
values = [each.value.namespace_name]
21+
}
22+
]
23+
content {
24+
label = scope.value.label
25+
operator = scope.value.operator
26+
values = scope.value.values
27+
}
28+
}
29+
30+
notification_channels {
31+
# TODO: hard-coded ID for now. This should be generated and referenced.
32+
id = 238924
33+
renotify_every_minutes = 60
34+
}
35+
36+
custom_notification {
37+
subject = each.value.notification_subject
38+
prepend = "Alert Details:"
39+
append = "Please check the system immediately."
40+
}
41+
42+
range_seconds = each.value.range_seconds
43+
}

tf-sysdig/locals.tf

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
locals {
2+
# Define common scope as a local variable
3+
common_scope = [
4+
{
5+
label = "kube_cluster_name"
6+
operator = "equals"
7+
values = ["gold"]
8+
},
9+
{
10+
label = "kube_namespace_name"
11+
operator = "equals"
12+
values = ["abc123-dev"]
13+
},
14+
{
15+
label = "kube_deployment_name"
16+
operator = "equals"
17+
values = ["project-wordpress"]
18+
}
19+
]
20+
21+
# Define environments
22+
environments = {
23+
dev = "abc123-dev"
24+
test = "abc123-test"
25+
prod = "abc123-prod"
26+
}
27+
28+
# Define pod metrics
29+
pod_metrics = {
30+
"Pod CPU Usage" = {
31+
metric = "sysdig_program_cpu_cores_used_percent"
32+
description = "Alert when 'sysdig_program_cpu_cores_used_percent' exceeds threshold"
33+
enabled = true
34+
severity = "high"
35+
group_aggregation = "avg"
36+
time_aggregation = "avg"
37+
operator = ">"
38+
threshold = 80
39+
notification_subject = "Pod CPU Usage Alert Status"
40+
range_seconds = 60
41+
}
42+
43+
"Pod Memory Usage" = {
44+
metric = "sysdig_program_memory_used_percent"
45+
description = "Alert when 'sysdig_program_memory_used_percent' exceeds threshold"
46+
enabled = true
47+
severity = "high"
48+
group_aggregation = "avg"
49+
time_aggregation = "avg"
50+
operator = ">"
51+
threshold = 80
52+
notification_subject = "Pod Memory Usage Alert Status"
53+
range_seconds = 60
54+
}
55+
56+
"Pod Restarts" = {
57+
metric = "kube_pod_sysdig_restart_count"
58+
description = "Alert when 'kube_pod_sysdig_restart_count' exceeds threshold"
59+
enabled = true
60+
severity = "high"
61+
group_aggregation = "max"
62+
time_aggregation = "avg"
63+
operator = ">"
64+
threshold = 5
65+
notification_subject = "Pod Restart Alert Status"
66+
range_seconds = 300
67+
}
68+
69+
"HTTP Error Count" = {
70+
metric = "sysdig_container_net_http_error_count"
71+
description = "Alert when 'sysdig_container_net_http_error_count' exceeds the threshold"
72+
enabled = true
73+
severity = "high"
74+
group_aggregation = "avg"
75+
time_aggregation = "avg"
76+
operator = ">"
77+
threshold = 25
78+
notification_subject = "Pod HTTP Error Count Alert"
79+
range_seconds = 300
80+
}
81+
82+
"Replica Count Below Minimum" = {
83+
metric = "kube_deployment_status_replicas"
84+
description = "Alert when 'kube_deployment_status_replicas' falls below the threshold"
85+
enabled = true
86+
severity = "high"
87+
group_aggregation = "avg"
88+
time_aggregation = "avg"
89+
operator = "<"
90+
threshold = 3
91+
notification_subject = "Replica Count Alert"
92+
range_seconds = 60
93+
}
94+
95+
"Pod Ready Status" = {
96+
metric = "kube_pod_sysdig_status_ready"
97+
description = "Alert when 'kube_pod_sysdig_status_ready' falls below the threshold"
98+
enabled = true
99+
severity = "high"
100+
group_aggregation = "avg"
101+
time_aggregation = "avg"
102+
operator = "<"
103+
threshold = 1
104+
notification_subject = "Pod Ready Status Alert"
105+
range_seconds = 60
106+
}
107+
108+
"Pod Unready Status" = {
109+
metric = "kube_pod_sysdig_status_ready"
110+
description = "Alert when 'kube_pod_sysdig_status_ready' is unready for more than 5 minutes"
111+
enabled = true
112+
severity = "high"
113+
group_aggregation = "avg"
114+
time_aggregation = "avg"
115+
operator = "<"
116+
threshold = 1
117+
notification_subject = "Pod Unready Status Alert"
118+
range_seconds = 300
119+
}
120+
}
121+
122+
all_metrics = merge([
123+
for env, ns in local.environments : {
124+
for metric_name, metric_info in local.pod_metrics : "${env}-${metric_name}" => {
125+
name = "${env} - ${metric_name}"
126+
namespace_name = ns
127+
metric = metric_info.metric
128+
description = metric_info.description
129+
enabled = metric_info.enabled
130+
severity = metric_info.severity
131+
group_aggregation = metric_info.group_aggregation
132+
time_aggregation = metric_info.time_aggregation
133+
operator = metric_info.operator
134+
threshold = metric_info.threshold
135+
notification_subject = metric_info.notification_subject
136+
range_seconds = metric_info.range_seconds
137+
}
138+
}
139+
]...)
140+
141+
}

tf-sysdig/main.tf

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
terraform {
2+
required_providers {
3+
sysdig = {
4+
source = "sysdiglabs/sysdig"
5+
version = ">=1.33.0"
6+
}
7+
}
8+
backend "local" {
9+
path = "terraform.tfstate"
10+
}
11+
}
12+
13+
provider "sysdig" {
14+
sysdig_monitor_url = "https://app.sysdigcloud.com"
15+
sysdig_monitor_api_token = var.sysdig_api_token
16+
}
17+
18+
# Define a variable for the Sysdig API token
19+
variable "sysdig_api_token" {
20+
type = string
21+
}

tf-sysdig/notification_channel.tf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# # Define a notification channel
2+
# resource "sysdig_monitor_notification_channel_email" "tf_sre" {
3+
# name = "TF SRE"
4+
# recipients = ["chris@bashbang.com"]
5+
# enabled = true
6+
# notify_when_ok = true
7+
# notify_when_resolved = true
8+
# send_test_notification = true
9+
# }

tf-sysdig/readme.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# How to use
2+
3+
## This is currently a WIP and should only be used as a sample or launching pad to expand on.
4+
5+
Terraform v1.5.5
6+
on darwin_amd64
7+
8+
terraform init
9+
terraform apply

tf-sysdig/terraform.tfvars.example

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# This token is a PAT in my user profile. In my case it was found here: https://app.sysdigcloud.com/#/settings/user under "Sysdig Monitor API"
2+
sysdig_api_token = "{THIS_IS_THE_GENERATED_TOKEN_FROM_SYSDIG}"

utility-pod/Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ RUN apt-get -y install \
1414
htop \
1515
iperf \
1616
iputils-ping \
17+
jq \
1718
lynx \
1819
mysql-client \
1920
nmap \
@@ -27,6 +28,7 @@ RUN apt-get -y install \
2728
unzip \
2829
vim \
2930
wget \
31+
yq \
3032
&& \
3133
rm -rf /var/lib/apt/lists/*
3234

0 commit comments

Comments
 (0)