Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
8e391fe
Working on ecs autoscaling
LDiazN Nov 19, 2025
0044944
Add autoscaling configs to ooniapi service
LDiazN Nov 19, 2025
7bfa937
Setting up ooniprobe to use autoscaling
LDiazN Nov 19, 2025
68768d6
Add autoscaling to oonimeasurements
LDiazN Nov 19, 2025
afccf07
Merge branch 'main' into ecs-autoscaling
LDiazN Nov 19, 2025
dd5bcd6
remove invalid parameter
LDiazN Nov 19, 2025
8bfefa1
remove unused parameter
LDiazN Nov 19, 2025
6d75bdf
move parameter
LDiazN Nov 19, 2025
054ba30
Testing cluster autoscaling with more task instances for oonimeasurem…
LDiazN Nov 20, 2025
889b3a2
Set desired count to previous value
LDiazN Nov 20, 2025
2319720
add capacity provider to ecs clusters
LDiazN Nov 20, 2025
f687ba2
Link capacity provider to ECS cluster
LDiazN Nov 21, 2025
258e0bd
test autoscaling with oonimeasurements
LDiazN Nov 21, 2025
3a6afe0
test autoscaling with oonimeasurements
LDiazN Nov 21, 2025
e10692b
adjusting target capacity
LDiazN Nov 21, 2025
9437fbc
Roll back oonimeasurements desired task count
LDiazN Nov 21, 2025
c15bfcd
Remove scale in protection to instances to allow auto scaler to reduc…
LDiazN Nov 21, 2025
0b75cfa
adjusting ooniprobe memory
LDiazN Nov 21, 2025
38e2228
Add more memory to oonimeasurements
LDiazN Dec 8, 2025
a6cbddd
Set target capacity to 100 to avoid unused machines
LDiazN Dec 9, 2025
3d763d2
Increasing capacity for services according to their mem usage in aws
LDiazN Dec 9, 2025
2279d27
Remove desired capacity parameter as it's replaced by capacity provider
LDiazN Dec 9, 2025
1a7cbd9
reduce min task count for oonimeasurements
LDiazN Dec 9, 2025
97af32d
remove deprecated parameters from prod tf
LDiazN Dec 9, 2025
438e22d
reduce min capacity of tier1 cluster to save costs
LDiazN Dec 9, 2025
9575ed2
mc
LDiazN Dec 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 28 additions & 11 deletions tf/environments/dev/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,6 @@ module "ooniapi_cluster" {

asg_min = 2
asg_max = 4
asg_desired = 2

instance_type = "t3a.micro"

Expand All @@ -331,9 +330,8 @@ module "oonitier1plus_cluster" {
vpc_id = module.network.vpc_id
subnet_ids = module.network.vpc_subnet_private[*].id

asg_min = 2
asg_min = 1
asg_max = 4
asg_desired = 2

instance_type = "t3a.micro"

Expand Down Expand Up @@ -402,9 +400,7 @@ module "ooniapi_ooniprobe_deployer" {
module "ooniapi_ooniprobe" {
source = "../../modules/ooniapi_service"

task_memory = 64

service_desired_count = 2
task_memory = 256

# first_run should be set to true on the first run to bootstrap the task definition
# first_run = true
Expand Down Expand Up @@ -436,6 +432,17 @@ module "ooniapi_ooniprobe" {
# module.ooniapi_cluster.web_security_group_id
]

use_autoscaling = true
service_desired_count = 1
max_desired_count = 4
autoscale_policies = [
{
resource_type = "memory"
name = "memory"
scaleout_treshold = 60
}
]

tags = merge(
local.tags,
{ Name = "ooni-tier0-ooniprobe" }
Expand Down Expand Up @@ -752,7 +759,7 @@ module "ooniapi_oonirun_deployer" {
module "ooniapi_oonirun" {
source = "../../modules/ooniapi_service"

task_memory = 64
task_memory = 256

vpc_id = module.network.vpc_id

Expand Down Expand Up @@ -801,7 +808,7 @@ module "ooniapi_oonifindings_deployer" {
module "ooniapi_oonifindings" {
source = "../../modules/ooniapi_service"

task_memory = 64
task_memory = 256

vpc_id = module.network.vpc_id

Expand Down Expand Up @@ -851,7 +858,7 @@ module "ooniapi_ooniauth_deployer" {
module "ooniapi_ooniauth" {
source = "../../modules/ooniapi_service"

task_memory = 64
task_memory = 128

vpc_id = module.network.vpc_id

Expand Down Expand Up @@ -918,7 +925,7 @@ module "ooniapi_oonimeasurements_deployer" {
module "ooniapi_oonimeasurements" {
source = "../../modules/ooniapi_service"

task_memory = 64
task_memory = 256

first_run = true
vpc_id = module.network.vpc_id
Expand All @@ -929,7 +936,6 @@ module "ooniapi_oonimeasurements" {
dns_zone_ooni_io = local.dns_zone_ooni_io
key_name = module.adm_iam_roles.oonidevops_key_name
ecs_cluster_id = module.oonitier1plus_cluster.cluster_id
service_desired_count = 2

task_secrets = {
POSTGRESQL_URL = data.aws_ssm_parameter.oonipg_url.arn
Expand All @@ -949,6 +955,17 @@ module "ooniapi_oonimeasurements" {
module.oonitier1plus_cluster.web_security_group_id
]

use_autoscaling = true
service_desired_count = 1
max_desired_count = 8
autoscale_policies = [
{
name = "memory"
resource_type = "memory"
scaleout_treshold = 60
}
]

tags = merge(
local.tags,
{ Name = "ooni-tier0-oonimeasurements" }
Expand Down
2 changes: 0 additions & 2 deletions tf/environments/prod/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,6 @@ module "ooniapi_cluster" {
# You need be careful how these are tweaked.
asg_min = 2
asg_max = 10
asg_desired = 5

instance_type = "t3a.medium"

Expand Down Expand Up @@ -556,7 +555,6 @@ module "oonitier1plus_cluster" {

asg_min = 2
asg_max = 5
asg_desired = 4

instance_type = "t3a.medium"

Expand Down
44 changes: 42 additions & 2 deletions tf/modules/ecs_cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ resource "aws_security_group" "container_host" {

security_groups = concat([
aws_security_group.web.id,
],
],
var.monitoring_sg_ids)
}

Expand Down Expand Up @@ -197,7 +197,9 @@ resource "aws_autoscaling_group" "container_host" {
vpc_zone_identifier = var.subnet_ids
min_size = var.asg_min
max_size = var.asg_max
desired_capacity = var.asg_desired
# desired_capacity is usually managed by the capacity provider
# defined below. Note that this is an ECS cluster, so
# cluster capacity is directed by task load demands

launch_template {
id = aws_launch_template.container_host.id
Expand All @@ -212,4 +214,42 @@ resource "aws_autoscaling_group" "container_host" {

triggers = ["tag"]
}

// This tag is required by the aws_ecs_capacity_provider resource
// See: https://registry.terraform.io/providers/hashicorp/aws/5.87.0/docs/resources/ecs_capacity_provider#example-usage
tag {
key = "AmazonECSManaged"
value = true
propagate_at_launch = true
}
}

// ECS capacity provider backed by the container-host Auto Scaling group.
// Managed scaling is enabled, so the size of the group is driven by this
// provider rather than by a fixed desired capacity on the group itself.
resource "aws_ecs_capacity_provider" "capacity_provider" {
  name = "${var.name}-capacity-provider"

  auto_scaling_group_provider {
    auto_scaling_group_arn = aws_autoscaling_group.container_host.arn

    managed_scaling {
      status = "ENABLED"

      // Target utilisation of the group, in percent.
      target_capacity = 100

      // Bounds on how many instances a single scaling action may add.
      minimum_scaling_step_size = 1
      maximum_scaling_step_size = 1000
    }

    // NOTE(review): managed termination protection is documented to require
    // scale-in protection on the Auto Scaling group as well - confirm the
    // group in this module enables it.
    managed_termination_protection = "ENABLED"
    # managed_draining = "ENABLED"
  }
}

// Creating a capacity provider is not enough on its own: it must also be
// attached to the cluster. This resource attaches it and makes it the
// cluster's default capacity provider strategy.
resource "aws_ecs_cluster_capacity_providers" "cluster_capacity_providers" {
  cluster_name       = aws_ecs_cluster.main.name
  capacity_providers = [aws_ecs_capacity_provider.capacity_provider.name]

  default_capacity_provider_strategy {
    capacity_provider = aws_ecs_capacity_provider.capacity_provider.name
    weight            = 100
    base              = 1
  }
}
7 changes: 1 addition & 6 deletions tf/modules/ecs_cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,6 @@ variable "asg_max" {
default = 6
}

variable "asg_desired" {
description = "Desired numbers of servers in ASG"
default = 1
}

variable "admin_cidr_ingress" {
default = "0.0.0.0/0"
}
Expand All @@ -75,4 +70,4 @@ variable "monitoring_active" {
description = "If the monitoring system should consider cluster machines. Set it to 'true' to activate it, anything else to deactivate it"
default = "true"
type = string
}
}
40 changes: 40 additions & 0 deletions tf/modules/ooniapi_service/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,43 @@ resource "aws_alb_target_group" "ooniapi_service" {

tags = var.tags
}

// Registers the ECS service's desired task count as a scalable target.
// `count` makes the resource conditional: it exists only when the caller
// sets `use_autoscaling`.
resource "aws_appautoscaling_target" "ecs_target" {
  count = var.use_autoscaling ? 1 : 0

  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"

  // Everything after the last ":" of the service id.
  // NOTE(review): presumably the id is the service ARN, whose last segment
  // looks like "service/<cluster>/<service>" - confirm.
  resource_id = reverse(split(":", aws_ecs_service.ooniapi_service.id))[0]

  // The task count is kept between these two bounds.
  min_capacity = var.service_desired_count
  max_capacity = var.max_desired_count
}

// One target-tracking scaling policy per entry of `var.autoscale_policies`,
// keyed by the policy name.
//
// Policies are only created when `var.use_autoscaling` is true. Without that
// guard, passing policies while autoscaling is disabled would reference
// `aws_appautoscaling_target.ecs_target[0]`, which has no instances in that
// case, and the plan would fail with an invalid index error.
resource "aws_appautoscaling_policy" "policies" {
  for_each = {
    for p in var.autoscale_policies :
    p.name => p
    if var.use_autoscaling
  }

  name               = each.value.name
  policy_type        = "TargetTrackingScaling"
  service_namespace  = "ecs"
  scalable_dimension = aws_appautoscaling_target.ecs_target[0].scalable_dimension
  resource_id        = aws_appautoscaling_target.ecs_target[0].resource_id

  target_tracking_scaling_policy_configuration {
    predefined_metric_specification {
      // Translate the short resource type into the predefined ECS metric.
      // Any value other than "cpu" or "memory" falls back to the memory metric.
      predefined_metric_type = lookup({
        cpu    = "ECSServiceAverageCPUUtilization"
        memory = "ECSServiceAverageMemoryUtilization"
        },
        each.value.resource_type,
        "ECSServiceAverageMemoryUtilization"
      )
    }

    // Value of the tracked metric that the policy tries to maintain.
    target_value = each.value.scaleout_treshold

    // Cooldowns, in seconds, between successive scaling activities.
    scale_in_cooldown  = 60
    scale_out_cooldown = 60
  }
}
30 changes: 29 additions & 1 deletion tf/modules/ooniapi_service/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ variable "tags" {
}

variable "service_desired_count" {
description = "Desired numbers of instances in the ecs service"
description = <<-EOF
Desired numbers of instances in the ecs service.
When `use_autoscaling == true` this will be the minimum amount of
spawned services
EOF
default = 1
}

Expand Down Expand Up @@ -65,3 +69,27 @@ variable "ooniapi_service_security_groups" {
description = "the shared web security group from the ecs cluster"
type = list(string)
}

// Autoscaling
// Opt-in switch for service autoscaling; disabled unless the caller sets it.
variable "use_autoscaling" {
  type        = bool
  default     = false
  description = "Whether this service should use autoscaling to modify task count at runtime"
}

// Upper bound for the autoscaling target of the service. The lower bound is
// `service_desired_count`.
variable "max_desired_count" {
  description = "Maximum number of instances of the ecs service that autoscaling may run. Only used when `use_autoscaling` == true"
  type        = number
  default     = 1
}

// Scaling policies for the service. Each entry becomes one target-tracking
// policy, keyed by `name`.
variable "autoscale_policies" {
  description = "Policies used for autoscaling resources, only valid if `use_autoscaling` == true"

  type = list(object({
    name              = string // policy name, used as the key of the policy
    resource_type     = string // memory | cpu
    scaleout_treshold = number // from 0 to 100, utilization value the policy uses as its tracking target
  }))

  default = []

  // Reject out-of-range thresholds at plan time instead of at apply time.
  validation {
    condition = alltrue([
      for p in var.autoscale_policies :
      p.scaleout_treshold > 0 && p.scaleout_treshold <= 100
    ])
    error_message = "Each scaleout_treshold must be greater than 0 and at most 100."
  }
}
Loading