diff --git a/tf/environments/dev/main.tf b/tf/environments/dev/main.tf index af37c92b..9f2e959d 100644 --- a/tf/environments/dev/main.tf +++ b/tf/environments/dev/main.tf @@ -304,7 +304,6 @@ module "ooniapi_cluster" { asg_min = 2 asg_max = 4 - asg_desired = 2 instance_type = "t3a.micro" @@ -331,9 +330,8 @@ module "oonitier1plus_cluster" { vpc_id = module.network.vpc_id subnet_ids = module.network.vpc_subnet_private[*].id - asg_min = 2 + asg_min = 1 asg_max = 4 - asg_desired = 2 instance_type = "t3a.micro" @@ -402,9 +400,7 @@ module "ooniapi_ooniprobe_deployer" { module "ooniapi_ooniprobe" { source = "../../modules/ooniapi_service" - task_memory = 64 - - service_desired_count = 2 + task_memory = 256 # First run should be set on first run to bootstrap the task definition # first_run = true @@ -436,6 +432,17 @@ module "ooniapi_ooniprobe" { # module.ooniapi_cluster.web_security_group_id ] + use_autoscaling = true + service_desired_count = 1 + max_desired_count = 4 + autoscale_policies = [ + { + resource_type = "memory" + name = "memory" + scaleout_treshold = 60 + } + ] + tags = merge( local.tags, { Name = "ooni-tier0-ooniprobe" } @@ -752,7 +759,7 @@ module "ooniapi_oonirun_deployer" { module "ooniapi_oonirun" { source = "../../modules/ooniapi_service" - task_memory = 64 + task_memory = 256 vpc_id = module.network.vpc_id @@ -801,7 +808,7 @@ module "ooniapi_oonifindings_deployer" { module "ooniapi_oonifindings" { source = "../../modules/ooniapi_service" - task_memory = 64 + task_memory = 256 vpc_id = module.network.vpc_id @@ -851,7 +858,7 @@ module "ooniapi_ooniauth_deployer" { module "ooniapi_ooniauth" { source = "../../modules/ooniapi_service" - task_memory = 64 + task_memory = 128 vpc_id = module.network.vpc_id @@ -918,7 +925,7 @@ module "ooniapi_oonimeasurements_deployer" { module "ooniapi_oonimeasurements" { source = "../../modules/ooniapi_service" - task_memory = 64 + task_memory = 256 first_run = true vpc_id = module.network.vpc_id @@ -929,7 +936,6 @@ module 
"ooniapi_oonimeasurements" { dns_zone_ooni_io = local.dns_zone_ooni_io key_name = module.adm_iam_roles.oonidevops_key_name ecs_cluster_id = module.oonitier1plus_cluster.cluster_id - service_desired_count = 2 task_secrets = { POSTGRESQL_URL = data.aws_ssm_parameter.oonipg_url.arn @@ -949,6 +955,17 @@ module "ooniapi_oonimeasurements" { module.oonitier1plus_cluster.web_security_group_id ] + use_autoscaling = true + service_desired_count = 1 + max_desired_count = 8 + autoscale_policies = [ + { + name = "memory" + resource_type = "memory" + scaleout_treshold = 60 + } + ] + tags = merge( local.tags, { Name = "ooni-tier0-oonimeasurements" } diff --git a/tf/environments/prod/main.tf b/tf/environments/prod/main.tf index ce3b24c5..2ec465ef 100644 --- a/tf/environments/prod/main.tf +++ b/tf/environments/prod/main.tf @@ -527,7 +527,6 @@ module "ooniapi_cluster" { # You need be careful how these are tweaked. asg_min = 2 asg_max = 10 - asg_desired = 5 instance_type = "t3a.medium" @@ -556,7 +555,6 @@ module "oonitier1plus_cluster" { asg_min = 2 asg_max = 5 - asg_desired = 4 instance_type = "t3a.medium" diff --git a/tf/modules/ecs_cluster/main.tf b/tf/modules/ecs_cluster/main.tf index fa0557ef..b95fb7ea 100644 --- a/tf/modules/ecs_cluster/main.tf +++ b/tf/modules/ecs_cluster/main.tf @@ -117,7 +117,7 @@ resource "aws_security_group" "container_host" { security_groups = concat([ aws_security_group.web.id, - ], + ], var.monitoring_sg_ids) } @@ -197,7 +197,9 @@ resource "aws_autoscaling_group" "container_host" { vpc_zone_identifier = var.subnet_ids min_size = var.asg_min max_size = var.asg_max - desired_capacity = var.asg_desired + # desired_capacity is usually managed by the capacity provider + # defined below. 
Note that this is an ECS cluster, so + # cluster capacity is directed by task load demands launch_template { id = aws_launch_template.container_host.id
@@ -212,4 +214,42 @@ resource "aws_autoscaling_group" "container_host" {
 
     triggers = ["tag"]
   }
+
+  // The aws_ecs_capacity_provider resource below requires this tag on the ASG
+  // See: https://registry.terraform.io/providers/hashicorp/aws/5.87.0/docs/resources/ecs_capacity_provider#example-usage
+  tag {
+    propagate_at_launch = true
+    key                 = "AmazonECSManaged"
+    value               = true
+  }
+}
+
+resource "aws_ecs_capacity_provider" "capacity_provider" {
+  name = "${var.name}-capacity-provider"
+
+  auto_scaling_group_provider {
+    auto_scaling_group_arn         = aws_autoscaling_group.container_host.arn
+    managed_termination_protection = "ENABLED" # NOTE(review): presumably needs protect_from_scale_in = true on the ASG - confirm
+    # managed_draining = "ENABLED"
+
+    managed_scaling {
+      status                    = "ENABLED"
+      target_capacity           = 100
+      minimum_scaling_step_size = 1
+      maximum_scaling_step_size = 1000
+    }
+  }
+}
+
+// Attach the capacity provider to the cluster and use it as the default strategy
+resource "aws_ecs_cluster_capacity_providers" "cluster_capacity_providers" {
+  cluster_name = aws_ecs_cluster.main.name
+
+  capacity_providers = [aws_ecs_capacity_provider.capacity_provider.name]
+
+  default_capacity_provider_strategy {
+    capacity_provider = aws_ecs_capacity_provider.capacity_provider.name
+    base              = 1
+    weight            = 100
+  }
 }
diff --git a/tf/modules/ecs_cluster/variables.tf b/tf/modules/ecs_cluster/variables.tf index 16e1592e..a9669ebf 100644 --- a/tf/modules/ecs_cluster/variables.tf +++ b/tf/modules/ecs_cluster/variables.tf @@ -45,11 +45,6 @@ variable "asg_max" { default = 6 } -variable "asg_desired" { - description = "Desired numbers of servers in ASG" - default = 1 -} - variable "admin_cidr_ingress" { default = "0.0.0.0/0" } @@ -75,4 +70,4 @@ variable "monitoring_active" { description = "If the monitoring system should consider cluster machines.
Set it to 'true' to activate it, anything else to deactivate it" default = "true" type = string -} \ No newline at end of file +}
diff --git a/tf/modules/ooniapi_service/main.tf b/tf/modules/ooniapi_service/main.tf
index f19e08f0..f0e459cc 100644
--- a/tf/modules/ooniapi_service/main.tf
+++ b/tf/modules/ooniapi_service/main.tf
@@ -145,3 +145,50 @@ resource "aws_alb_target_group" "ooniapi_service" {
 
   tags = var.tags
 }
+
+// Registers the ECS service's desired task count as a scalable target,
+// bounded by var.service_desired_count (min) and var.max_desired_count (max).
+resource "aws_appautoscaling_target" "ecs_target" {
+  // Use count to support conditional resource creation
+  count              = var.use_autoscaling ? 1 : 0
+  service_namespace  = "ecs"
+  scalable_dimension = "ecs:service:DesiredCount"
+  // Last ":"-separated field of the service id; presumably "service/<cluster>/<name>" - confirm
+  resource_id        = reverse(split(":", aws_ecs_service.ooniapi_service.id))[0]
+
+  min_capacity = var.service_desired_count
+  max_capacity = var.max_desired_count
+}
+
+// One target-tracking policy per entry of var.autoscale_policies, keyed by policy name.
+resource "aws_appautoscaling_policy" "policies" {
+  // Policies reference ecs_target[0], which only exists when autoscaling is
+  // enabled, so the map is left empty when var.use_autoscaling is false.
+  for_each = {
+    for p in var.autoscale_policies :
+    p.name => p if var.use_autoscaling
+  }
+
+  name               = each.value.name
+  service_namespace  = "ecs"
+  scalable_dimension = aws_appautoscaling_target.ecs_target[0].scalable_dimension
+  resource_id        = aws_appautoscaling_target.ecs_target[0].resource_id
+  policy_type        = "TargetTrackingScaling"
+
+  target_tracking_scaling_policy_configuration {
+    predefined_metric_specification {
+      // Any resource_type other than "cpu" or "memory" falls back to the memory metric
+      predefined_metric_type = lookup({
+        cpu    = "ECSServiceAverageCPUUtilization"
+        memory = "ECSServiceAverageMemoryUtilization"
+        },
+        each.value.resource_type,
+        "ECSServiceAverageMemoryUtilization"
+      )
+    }
+
+    target_value       = each.value.scaleout_treshold
+    scale_in_cooldown  = 60
+    scale_out_cooldown = 60
+  }
+}
diff --git a/tf/modules/ooniapi_service/variables.tf b/tf/modules/ooniapi_service/variables.tf index 0dfaf4bf..c0e42b66 100644 --- a/tf/modules/ooniapi_service/variables.tf +++ b/tf/modules/ooniapi_service/variables.tf @@ -30,7 +30,11 @@ variable "tags" { } variable "service_desired_count" { - description = "Desired numbers of instances in the ecs service" + description = <<-EOF + Desired numbers of instances in the
ecs service. + When `use_autoscaling == true` this will be the minimum amount of + spawned services + EOF default = 1 }
@@ -65,3 +69,28 @@ variable "ooniapi_service_security_groups" {
   description = "the shared web security group from the ecs cluster"
   type        = list(string)
 }
+
+// Autoscaling
+variable "use_autoscaling" {
+  description = "Whether this service should use autoscaling to modify task count at runtime"
+  type        = bool
+  default     = false
+}
+
+variable "max_desired_count" {
+  description = "Maximum number of instances in the ecs service, used as the upper bound when `use_autoscaling` == true"
+  type        = number
+  default     = 1
+}
+
+variable "autoscale_policies" {
+  description = "Policies used for autoscaling resources, only valid if `use_autoscaling` == true"
+
+  type = list(object({
+    resource_type     = string // memory | cpu
+    scaleout_treshold = number // from 0 to 100, average utilization used as the target-tracking value
+    name              = string // policy name, must be unique within the list
+  }))
+
+  default = []
+}