diff --git a/neuron-problem-detector/ecs-npd-cdk/.gitignore b/neuron-problem-detector/ecs-npd-cdk/.gitignore new file mode 100644 index 00000000..7039d498 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +cdk.out/ diff --git a/neuron-problem-detector/ecs-npd-cdk/README.md b/neuron-problem-detector/ecs-npd-cdk/README.md new file mode 100644 index 00000000..3d9ca669 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/README.md @@ -0,0 +1,100 @@ +# Overview + +This project contains CDK code to provision: + +* An ECS Cluster and one inf2.xlarge EC2 instance joining the cluster. +* An ECS Task Definition for Neuron Problem Detector and Recovery +* An ECS Service that runs the containers as a daemon on all instances +* Related IAM roles and log groups + + +This project is set up like a standard Python project. The initialization +process also creates a virtualenv within this project, stored under the `.venv` +directory. To create the virtualenv it assumes that there is a `python3` +(or `python` for Windows) executable in your path with access to the `venv` +package. If for any reason the automatic creation of the virtualenv fails, +you can create the virtualenv manually. + +The `cdk.json` file tells the CDK Toolkit how to execute your app. + +## Pre-requisites +Before you start, ensure that you have installed the latest version of the following tools on your machine: + +1. [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) +2. [AWS CDK](https://docs.aws.amazon.com/cdk/v2/guide/getting_started.html) +3. [Session Manager Plugin](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html) + + +## Environment Setup +To manually create a virtualenv on macOS and Linux: + +``` +$ python3 -m venv .venv +``` + +After the init process completes and the virtualenv is created, you can use the following +step to activate your virtualenv. 
+ +``` +$ source .venv/bin/activate +``` + +If you are on a Windows platform, you would activate the virtualenv like this: + +``` +% .venv\Scripts\activate.bat +``` + +Once the virtualenv is activated, you can install the required dependencies. + +``` +$ pip install -r requirements.txt +``` + +## Synthesize CloudFormation template +It is assumed that you have authenticated successfully to connect to your AWS environment. +At this point you can now synthesize the CloudFormation template for this code. + +``` +$ cdk synth +``` + +Bootstrap the CDK environment with the following command. +``` +cdk bootstrap [--profile <profile-name>] +``` +Deploy the stack in your AWS environment. + +``` +cdk deploy [--profile <profile-name>] +``` + +## Cleanup Instructions + +Destroy the stack in your AWS environment. + +``` +cdk destroy [--profile <profile-name>] +``` + +Delete the following log groups in CloudWatch. + +``` +/ecs/recovery +/ecs/npd +``` + +## Optional +To add additional dependencies, for example other CDK libraries, just add +them to your `setup.py` file and rerun the `pip install -r requirements.txt` +command. + +## Useful commands + + * `cdk ls` list all stacks in the app + * `cdk synth` emits the synthesized CloudFormation template + * `cdk deploy` deploy this stack to your default AWS account/region + * `cdk diff` compare deployed stack with current state + * `cdk docs` open CDK documentation + + diff --git a/neuron-problem-detector/ecs-npd-cdk/app.py b/neuron-problem-detector/ecs-npd-cdk/app.py new file mode 100644 index 00000000..185bfa2c --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/app.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import os + +import aws_cdk as cdk + +from neuron_problem_detector.neuron_problem_detector_stack import NeuronProblemDetectorStack + + +app = cdk.App() +NeuronProblemDetectorStack(app, "NeuronProblemDetectorStack", + # If you don't specify 'env', this stack will be environment-agnostic. 
+ # Account/Region-dependent features and context lookups will not work, + # but a single synthesized template can be deployed anywhere. + + # Uncomment the next line to specialize this stack for the AWS Account + # and Region that are implied by the current CLI configuration. + + #env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), + + # Uncomment the next line if you know exactly what Account and Region you + # want to deploy the stack to. + + # env=cdk.Environment(account='123456789012', region='us-east-1'), + + # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html + ) + +app.synth() diff --git a/neuron-problem-detector/ecs-npd-cdk/cdk.context.json b/neuron-problem-detector/ecs-npd-cdk/cdk.context.json new file mode 100644 index 00000000..6ab85676 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/cdk.context.json @@ -0,0 +1,10 @@ +{ + "availability-zones:account=464616699298:region=us-east-1": [ + "us-east-1a", + "us-east-1b", + "us-east-1c", + "us-east-1d", + "us-east-1e", + "us-east-1f" + ] +} diff --git a/neuron-problem-detector/ecs-npd-cdk/cdk.json b/neuron-problem-detector/ecs-npd-cdk/cdk.json new file mode 100644 index 00000000..20c5a8fe --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/cdk.json @@ -0,0 +1,69 @@ +{ + "app": "python3 app.py", + "watch": { + "include": [ + "**" + ], + "exclude": [ + "README.md", + "cdk*.json", + "requirements*.txt", + "source.bat", + "**/__init__.py", + "**/__pycache__", + "tests" + ] + }, + "context": { + "@aws-cdk/aws-lambda:recognizeLayerVersion": true, + "@aws-cdk/core:checkSecretUsage": true, + "@aws-cdk/core:target-partitions": [ + "aws", + "aws-cn" + ], + "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, + "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, + "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, + "@aws-cdk/aws-iam:minimizePolicies": true, 
"@aws-cdk/core:validateSnapshotRemovalPolicy": true, + "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, + "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, + "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, + "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, + "@aws-cdk/core:enablePartitionLiterals": true, + "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, + "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, + "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, + "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, + "@aws-cdk/aws-route53-patters:useCertificate": true, + "@aws-cdk/customresources:installLatestAwsSdkDefault": false, + "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, + "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, + "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, + "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, + "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, + "@aws-cdk/aws-redshift:columnId": true, + "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, + "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, + "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, + "@aws-cdk/aws-kms:aliasNameRef": true, + "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, + "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, + "@aws-cdk/aws-efs:denyAnonymousAccess": true, + "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, + "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, + "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, + "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, + "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, + "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, + 
"@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true, + "@aws-cdk/aws-cloudwatch-actions:changeLambdaPermissionLogicalIdForLambdaAction": true, + "@aws-cdk/aws-codepipeline:crossAccountKeysDefaultValueToFalse": true, + "@aws-cdk/aws-codepipeline:defaultPipelineTypeToV2": true, + "@aws-cdk/aws-kms:reduceCrossAccountRegionPolicyScope": true, + "@aws-cdk/aws-eks:nodegroupNameAttribute": true, + "@aws-cdk/aws-ec2:ebsDefaultGp3Volume": true, + "@aws-cdk/aws-ecs:removeDefaultDeploymentAlarm": true, + "@aws-cdk/custom-resources:logApiResponseDataPropertyTrueDefault": false + } +} diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron.yaml b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml new file mode 100644 index 00000000..fe40b03c --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/neuron.yaml @@ -0,0 +1,1122 @@ +Resources: + NeuronProblemDetectorVPC5F617726: + Type: AWS::EC2::VPC + Properties: + CidrBlock: 10.0.0.0/16 + EnableDnsHostnames: true + EnableDnsSupport: true + InstanceTenancy: default + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/Resource + NeuronProblemDetectorVPCPublicSubnet1Subnet842914BF: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + Fn::Select: + - 0 + - Fn::GetAZs: "" + CidrBlock: 10.0.0.0/18 + MapPublicIpOnLaunch: true + Tags: + - Key: aws-cdk:subnet-name + Value: Public + - Key: aws-cdk:subnet-type + Value: Public + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/Subnet + NeuronProblemDetectorVPCPublicSubnet1RouteTableC098CD6A: + Type: AWS::EC2::RouteTable + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1 + VpcId: + Ref: 
NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/RouteTable + NeuronProblemDetectorVPCPublicSubnet1RouteTableAssociation9EC2AFC5: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: + Ref: NeuronProblemDetectorVPCPublicSubnet1RouteTableC098CD6A + SubnetId: + Ref: NeuronProblemDetectorVPCPublicSubnet1Subnet842914BF + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/RouteTableAssociation + NeuronProblemDetectorVPCPublicSubnet1DefaultRoute5C4F3954: + Type: AWS::EC2::Route + Properties: + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: + Ref: NeuronProblemDetectorVPCIGW3EC7DAA5 + RouteTableId: + Ref: NeuronProblemDetectorVPCPublicSubnet1RouteTableC098CD6A + DependsOn: + - NeuronProblemDetectorVPCVPCGW5182937C + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/DefaultRoute + NeuronProblemDetectorVPCPublicSubnet1EIP71A9859B: + Type: AWS::EC2::EIP + Properties: + Domain: vpc + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/EIP + NeuronProblemDetectorVPCPublicSubnet1NATGateway34AE13E8: + Type: AWS::EC2::NatGateway + Properties: + AllocationId: + Fn::GetAtt: + - NeuronProblemDetectorVPCPublicSubnet1EIP71A9859B + - AllocationId + SubnetId: + Ref: NeuronProblemDetectorVPCPublicSubnet1Subnet842914BF + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1 + DependsOn: + - NeuronProblemDetectorVPCPublicSubnet1DefaultRoute5C4F3954 + - NeuronProblemDetectorVPCPublicSubnet1RouteTableAssociation9EC2AFC5 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet1/NATGateway + NeuronProblemDetectorVPCPublicSubnet2Subnet53E01F76: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + 
Fn::Select: + - 1 + - Fn::GetAZs: "" + CidrBlock: 10.0.64.0/18 + MapPublicIpOnLaunch: true + Tags: + - Key: aws-cdk:subnet-name + Value: Public + - Key: aws-cdk:subnet-type + Value: Public + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/Subnet + NeuronProblemDetectorVPCPublicSubnet2RouteTable01829BCC: + Type: AWS::EC2::RouteTable + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/RouteTable + NeuronProblemDetectorVPCPublicSubnet2RouteTableAssociation9AFE0962: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: + Ref: NeuronProblemDetectorVPCPublicSubnet2RouteTable01829BCC + SubnetId: + Ref: NeuronProblemDetectorVPCPublicSubnet2Subnet53E01F76 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/RouteTableAssociation + NeuronProblemDetectorVPCPublicSubnet2DefaultRoute80B8BD8F: + Type: AWS::EC2::Route + Properties: + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: + Ref: NeuronProblemDetectorVPCIGW3EC7DAA5 + RouteTableId: + Ref: NeuronProblemDetectorVPCPublicSubnet2RouteTable01829BCC + DependsOn: + - NeuronProblemDetectorVPCVPCGW5182937C + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/DefaultRoute + NeuronProblemDetectorVPCPublicSubnet2EIPEDE2DCF3: + Type: AWS::EC2::EIP + Properties: + Domain: vpc + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/EIP + NeuronProblemDetectorVPCPublicSubnet2NATGateway475CF308: + Type: AWS::EC2::NatGateway + 
Properties: + AllocationId: + Fn::GetAtt: + - NeuronProblemDetectorVPCPublicSubnet2EIPEDE2DCF3 + - AllocationId + SubnetId: + Ref: NeuronProblemDetectorVPCPublicSubnet2Subnet53E01F76 + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2 + DependsOn: + - NeuronProblemDetectorVPCPublicSubnet2DefaultRoute80B8BD8F + - NeuronProblemDetectorVPCPublicSubnet2RouteTableAssociation9AFE0962 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PublicSubnet2/NATGateway + NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + Fn::Select: + - 0 + - Fn::GetAZs: "" + CidrBlock: 10.0.128.0/18 + MapPublicIpOnLaunch: false + Tags: + - Key: aws-cdk:subnet-name + Value: Private + - Key: aws-cdk:subnet-type + Value: Private + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1/Subnet + NeuronProblemDetectorVPCPrivateSubnet1RouteTableC2B2760B: + Type: AWS::EC2::RouteTable + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1/RouteTable + NeuronProblemDetectorVPCPrivateSubnet1RouteTableAssociationE6D42BF0: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: + Ref: NeuronProblemDetectorVPCPrivateSubnet1RouteTableC2B2760B + SubnetId: + Ref: NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1/RouteTableAssociation + NeuronProblemDetectorVPCPrivateSubnet1DefaultRoute1AD8D623: + Type: AWS::EC2::Route + Properties: + DestinationCidrBlock: 0.0.0.0/0 + NatGatewayId: + Ref: 
NeuronProblemDetectorVPCPublicSubnet1NATGateway34AE13E8 + RouteTableId: + Ref: NeuronProblemDetectorVPCPrivateSubnet1RouteTableC2B2760B + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet1/DefaultRoute + NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437: + Type: AWS::EC2::Subnet + Properties: + AvailabilityZone: + Fn::Select: + - 1 + - Fn::GetAZs: "" + CidrBlock: 10.0.192.0/18 + MapPublicIpOnLaunch: false + Tags: + - Key: aws-cdk:subnet-name + Value: Private + - Key: aws-cdk:subnet-type + Value: Private + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2/Subnet + NeuronProblemDetectorVPCPrivateSubnet2RouteTableD4FE42D0: + Type: AWS::EC2::RouteTable + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2/RouteTable + NeuronProblemDetectorVPCPrivateSubnet2RouteTableAssociationCB083593: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: + Ref: NeuronProblemDetectorVPCPrivateSubnet2RouteTableD4FE42D0 + SubnetId: + Ref: NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2/RouteTableAssociation + NeuronProblemDetectorVPCPrivateSubnet2DefaultRoute7B853FC0: + Type: AWS::EC2::Route + Properties: + DestinationCidrBlock: 0.0.0.0/0 + NatGatewayId: + Ref: NeuronProblemDetectorVPCPublicSubnet2NATGateway475CF308 + RouteTableId: + Ref: NeuronProblemDetectorVPCPrivateSubnet2RouteTableD4FE42D0 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/PrivateSubnet2/DefaultRoute + NeuronProblemDetectorVPCIGW3EC7DAA5: + Type: 
AWS::EC2::InternetGateway + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorVPC + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/IGW + NeuronProblemDetectorVPCVPCGW5182937C: + Type: AWS::EC2::VPCGatewayAttachment + Properties: + InternetGatewayId: + Ref: NeuronProblemDetectorVPCIGW3EC7DAA5 + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/VPCGW + NeuronProblemDetectorVPCRestrictDefaultSecurityGroupCustomResource90BF6F18: + Type: Custom::VpcRestrictDefaultSG + Properties: + ServiceToken: + Fn::GetAtt: + - CustomVpcRestrictDefaultSGCustomResourceProviderHandlerDC833E5E + - Arn + DefaultSecurityGroupId: + Fn::GetAtt: + - NeuronProblemDetectorVPC5F617726 + - DefaultSecurityGroup + Account: + Ref: AWS::AccountId + UpdateReplacePolicy: Delete + DeletionPolicy: Delete + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorVPC/RestrictDefaultSecurityGroupCustomResource/Default + CustomVpcRestrictDefaultSGCustomResourceProviderRole26592FE0: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: lambda.amazonaws.com + ManagedPolicyArns: + - Fn::Sub: arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole + Policies: + - PolicyName: Inline + PolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - ec2:AuthorizeSecurityGroupIngress + - ec2:AuthorizeSecurityGroupEgress + - ec2:RevokeSecurityGroupIngress + - ec2:RevokeSecurityGroupEgress + Resource: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - ":ec2:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :security-group/ + - Fn::GetAtt: + - NeuronProblemDetectorVPC5F617726 + - DefaultSecurityGroup + Metadata: + aws:cdk:path: 
NeuronProblemDetectorStack/Custom::VpcRestrictDefaultSGCustomResourceProvider/Role + CustomVpcRestrictDefaultSGCustomResourceProviderHandlerDC833E5E: + Type: AWS::Lambda::Function + Properties: + Code: + S3Bucket: + Fn::Sub: cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region} + S3Key: ee7de53d64cc9d6248fa6aa550f92358f6c907b5efd6f3298aeab1b5e7ea358a.zip + Timeout: 900 + MemorySize: 128 + Handler: __entrypoint__.handler + Role: + Fn::GetAtt: + - CustomVpcRestrictDefaultSGCustomResourceProviderRole26592FE0 + - Arn + Runtime: + Fn::FindInMap: + - LatestNodeRuntimeMap + - Ref: AWS::Region + - value + Description: Lambda function for removing all inbound/outbound rules from the VPC default security group + DependsOn: + - CustomVpcRestrictDefaultSGCustomResourceProviderRole26592FE0 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/Custom::VpcRestrictDefaultSGCustomResourceProvider/Handler + aws:asset:path: asset.ee7de53d64cc9d6248fa6aa550f92358f6c907b5efd6f3298aeab1b5e7ea358a + aws:asset:property: Code + NeuronProblemDetectorClusterED21CFD2: + Type: AWS::ECS::Cluster + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceSecurityGroupC637EF03: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceSecurityGroup + SecurityGroupEgress: + - CidrIp: 0.0.0.0/0 + Description: Allow all outbound traffic by default + IpProtocol: "-1" + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceSecurityGroup/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRole4CDFA2E5: + Type: AWS::IAM::Role + Properties: + 
AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: ec2.amazonaws.com + Version: "2012-10-17" + ManagedPolicyArns: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - :iam::aws:policy/AmazonSSMManagedInstanceCore + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceRole/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - ecs:DeregisterContainerInstance + - ecs:RegisterContainerInstance + - ecs:Submit* + Effect: Allow + Resource: + Fn::GetAtt: + - NeuronProblemDetectorClusterED21CFD2 + - Arn + - Action: + - ecs:Poll + - ecs:StartTelemetrySession + Condition: + ArnEquals: + ecs:cluster: + Fn::GetAtt: + - NeuronProblemDetectorClusterED21CFD2 + - Arn + Effect: Allow + Resource: "*" + - Action: + - ecr:GetAuthorizationToken + - ecs:DiscoverPollEndpoint + - logs:CreateLogStream + - logs:PutLogEvents + Effect: Allow + Resource: "*" + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48 + Roles: + - Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRole4CDFA2E5 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceRole/DefaultPolicy/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceProfile11E4E5E2: + Type: AWS::IAM::InstanceProfile + Properties: + Roles: + - Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRole4CDFA2E5 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/InstanceProfile + 
NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLaunchTemplateF1F92126: + Type: AWS::EC2::LaunchTemplate + Properties: + LaunchTemplateData: + IamInstanceProfile: + Arn: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceProfile11E4E5E2 + - Arn + ImageId: + Ref: SsmParameterValueawsserviceecsoptimizedamiamazonlinux2infrecommendedimageidC96584B6F00A464EAD1953AFF4B05118Parameter + InstanceType: inf2.xlarge + Monitoring: + Enabled: false + SecurityGroupIds: + - Fn::GetAtt: + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceSecurityGroupC637EF03 + - GroupId + TagSpecifications: + - ResourceType: instance + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LaunchTemplate + - ResourceType: volume + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LaunchTemplate + UserData: + Fn::Base64: + Fn::Join: + - "" + - - |- + #!/bin/bash + echo ECS_CLUSTER= + - Ref: NeuronProblemDetectorClusterED21CFD2 + - " >> /etc/ecs/ecs.config" + TagSpecifications: + - ResourceType: launch-template + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LaunchTemplate + DependsOn: + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRoleDefaultPolicy1F8A3A48 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityInstanceRole4CDFA2E5 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LaunchTemplate/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityASGDE9EB8FF: + Type: AWS::AutoScaling::AutoScalingGroup + Properties: + DesiredCapacity: "1" + LaunchTemplate: + LaunchTemplateId: + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLaunchTemplateF1F92126 + Version: + Fn::GetAtt: + - 
NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLaunchTemplateF1F92126 + - LatestVersionNumber + MaxSize: "3" + MinSize: "1" + Tags: + - Key: Name + PropagateAtLaunch: true + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity + VPCZoneIdentifier: + - Ref: NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901 + - Ref: NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437 + UpdatePolicy: + AutoScalingReplacingUpdate: + WillReplace: true + AutoScalingScheduledAction: + IgnoreUnmodifiedGroupSizeProperties: true + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/ASG + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: lambda.amazonaws.com + Version: "2012-10-17" + ManagedPolicyArns: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - :iam::aws:policy/service-role/AWSLambdaBasicExecutionRole + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/ServiceRole/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - ec2:DescribeHosts + - ec2:DescribeInstanceAttribute + - ec2:DescribeInstanceStatus + - ec2:DescribeInstances + Effect: Allow + Resource: "*" + - Action: autoscaling:CompleteLifecycleAction + Effect: Allow + Resource: + Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - ":autoscaling:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :autoScalingGroup:*:autoScalingGroupName/ + - Ref: 
NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityASGDE9EB8FF + - Action: + - ecs:DescribeContainerInstances + - ecs:DescribeTasks + - ecs:ListTasks + - ecs:UpdateContainerInstancesState + Condition: + ArnEquals: + ecs:cluster: + Fn::GetAtt: + - NeuronProblemDetectorClusterED21CFD2 + - Arn + Effect: Allow + Resource: "*" + - Action: + - ecs:ListContainerInstances + - ecs:SubmitContainerStateChange + - ecs:SubmitTaskStateChange + Effect: Allow + Resource: + Fn::GetAtt: + - NeuronProblemDetectorClusterED21CFD2 + - Arn + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7 + Roles: + - Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/ServiceRole/DefaultPolicy/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunction1625CD7D: + Type: AWS::Lambda::Function + Properties: + Code: + ZipFile: | + import boto3, json, os, time + + ecs = boto3.client('ecs') + autoscaling = boto3.client('autoscaling') + + + def lambda_handler(event, context): + print(json.dumps(dict(event, ResponseURL='...'))) + cluster = os.environ['CLUSTER'] + snsTopicArn = event['Records'][0]['Sns']['TopicArn'] + lifecycle_event = json.loads(event['Records'][0]['Sns']['Message']) + instance_id = lifecycle_event.get('EC2InstanceId') + if not instance_id: + print('Got event without EC2InstanceId: %s', json.dumps(dict(event, ResponseURL='...'))) + return + + instance_arn = container_instance_arn(cluster, instance_id) + print('Instance %s has container instance ARN %s' % (lifecycle_event['EC2InstanceId'], instance_arn)) + + if not instance_arn: + return + + task_arns = container_instance_task_arns(cluster, instance_arn) + + if task_arns: + print('Instance ARN %s has task ARNs %s' % 
(instance_arn, ', '.join(task_arns))) + + while has_tasks(cluster, instance_arn, task_arns): + time.sleep(10) + + try: + print('Terminating instance %s' % instance_id) + autoscaling.complete_lifecycle_action( + LifecycleActionResult='CONTINUE', + **pick(lifecycle_event, 'LifecycleHookName', 'LifecycleActionToken', 'AutoScalingGroupName')) + except Exception as e: + # Lifecycle action may have already completed. + print(str(e)) + + + def container_instance_arn(cluster, instance_id): + """Turn an instance ID into a container instance ARN.""" + arns = ecs.list_container_instances(cluster=cluster, filter='ec2InstanceId==' + instance_id)['containerInstanceArns'] + if not arns: + return None + return arns[0] + + def container_instance_task_arns(cluster, instance_arn): + """Fetch tasks for a container instance ARN.""" + arns = ecs.list_tasks(cluster=cluster, containerInstance=instance_arn)['taskArns'] + return arns + + def has_tasks(cluster, instance_arn, task_arns): + """Return True if the instance is running tasks for the given cluster.""" + instances = ecs.describe_container_instances(cluster=cluster, containerInstances=[instance_arn])['containerInstances'] + if not instances: + return False + instance = instances[0] + + if instance['status'] == 'ACTIVE': + # Start draining, then try again later + set_container_instance_to_draining(cluster, instance_arn) + return True + + task_count = None + + if task_arns: + # Fetch details for tasks running on the container instance + tasks = ecs.describe_tasks(cluster=cluster, tasks=task_arns)['tasks'] + if tasks: + # Consider any non-stopped tasks as running + task_count = sum(task['lastStatus'] != 'STOPPED' for task in tasks) + instance['pendingTasksCount'] + + if not task_count: + # Fallback to instance task counts if detailed task information is unavailable + task_count = instance['runningTasksCount'] + instance['pendingTasksCount'] + + print('Instance %s has %s tasks' % (instance_arn, task_count)) + + return task_count > 0 + + 
def set_container_instance_to_draining(cluster, instance_arn): + ecs.update_container_instances_state( + cluster=cluster, + containerInstances=[instance_arn], status='DRAINING') + + + def pick(dct, *keys): + """Pick a subset of a dict.""" + return {k: v for k, v in dct.items() if k in keys} + Environment: + Variables: + CLUSTER: + Ref: NeuronProblemDetectorClusterED21CFD2 + Handler: index.lambda_handler + Role: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 + - Arn + Runtime: python3.9 + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity + Timeout: 310 + DependsOn: + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRoleDefaultPolicy91C029B7 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionServiceRole49BA6389 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionAllowInvokeNeuronProblemDetectorStackNeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicA5DCEF0A8A7A5064: + Type: AWS::Lambda::Permission + Properties: + Action: lambda:InvokeFunction + FunctionName: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunction1625CD7D + - Arn + Principal: sns.amazonaws.com + SourceArn: + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/AllowInvoke:NeuronProblemDetectorStackNeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicA5DCEF0A + 
NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunctionTopicBAF651D7: + Type: AWS::SNS::Subscription + Properties: + Endpoint: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityDrainECSHookFunction1625CD7D + - Arn + Protocol: lambda + TopicArn: + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/DrainECSHook/Function/Topic/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430: + Type: AWS::SNS::Topic + Properties: + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LifecycleHookDrainHook/Topic/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: autoscaling.amazonaws.com + Version: "2012-10-17" + Tags: + - Key: Name + Value: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LifecycleHookDrainHook/Role/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: sns:Publish + Effect: Allow + Resource: + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756 + Roles: 
+ - Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LifecycleHookDrainHook/Role/DefaultPolicy/Resource + NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookC7D53AF2: + Type: AWS::AutoScaling::LifecycleHook + Properties: + AutoScalingGroupName: + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityASGDE9EB8FF + DefaultResult: CONTINUE + HeartbeatTimeout: 300 + LifecycleTransition: autoscaling:EC2_INSTANCE_TERMINATING + NotificationTargetARN: + Ref: NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookTopicFB0CE430 + RoleARN: + Fn::GetAtt: + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 + - Arn + DependsOn: + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRoleDefaultPolicy30C24756 + - NeuronProblemDetectorClusterNeuronAutoScalingGroupCapacityLifecycleHookDrainHookRole7FF75B48 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorCluster/NeuronAutoScalingGroupCapacity/LifecycleHookDrainHook/Resource + NeuronProblemDetectorTaskExecutionRole563D2650: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: ecs-tasks.amazonaws.com + Version: "2012-10-17" + ManagedPolicyArns: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - :iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskExecutionRole/Resource + NeuronProblemDetectorTaskExecutionRoleDefaultPolicy8DBFC0EE: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - logs:CreateLogStream + - logs:PutLogEvents + Effect: Allow + Resource: + - Fn::GetAtt: + - NpdLogGroup39A02E3D 
+ - Arn + - Fn::GetAtt: + - RecoveryLogGroupF6D50671 + - Arn + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorTaskExecutionRoleDefaultPolicy8DBFC0EE + Roles: + - Ref: NeuronProblemDetectorTaskExecutionRole563D2650 + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskExecutionRole/DefaultPolicy/Resource + NeuronProblemDetectorTaskRole673752FB: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: ecs-tasks.amazonaws.com + Version: "2012-10-17" + Policies: + - PolicyDocument: + Statement: + - Action: + - autoscaling:DescribeAutoScalingInstances + - autoscaling:SetInstanceHealth + - cloudwatch:PutMetricData + - ec2:DescribeInstances + Effect: Allow + Resource: "*" + Version: "2012-10-17" + PolicyName: node-recovery + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskRole/Resource + NeuronProblemDetectorTaskRoleDefaultPolicyCFCDEF04: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - logs:CreateLogStream + - logs:DescribeLogGroups + - logs:DescribeLogStreams + - logs:PutLogEvents + - ssmmessages:CreateControlChannel + - ssmmessages:CreateDataChannel + - ssmmessages:OpenControlChannel + - ssmmessages:OpenDataChannel + Effect: Allow + Resource: "*" + Version: "2012-10-17" + PolicyName: NeuronProblemDetectorTaskRoleDefaultPolicyCFCDEF04 + Roles: + - Ref: NeuronProblemDetectorTaskRole673752FB + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronProblemDetectorTaskRole/DefaultPolicy/Resource + NpdLogGroup39A02E3D: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: /ecs/npd + RetentionInDays: 7 + UpdateReplacePolicy: Retain + DeletionPolicy: Retain + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NpdLogGroup/Resource + RecoveryLogGroupF6D50671: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: /ecs/recovery + RetentionInDays: 7 + UpdateReplacePolicy: Retain + 
DeletionPolicy: Retain + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/RecoveryLogGroup/Resource + NeuronNpdAndRecoveryDaemonService03BA6456: + Type: AWS::ECS::Service + Properties: + Cluster: + Ref: NeuronProblemDetectorClusterED21CFD2 + DeploymentConfiguration: + MaximumPercent: 100 + MinimumHealthyPercent: 0 + EnableECSManagedTags: false + EnableExecuteCommand: true + LaunchType: EC2 + NetworkConfiguration: + AwsvpcConfiguration: + AssignPublicIp: DISABLED + SecurityGroups: + - Fn::GetAtt: + - NeuronNpdAndRecoveryDaemonServiceSecurityGroupC5B1D29B + - GroupId + Subnets: + - Ref: NeuronProblemDetectorVPCPrivateSubnet1Subnet708A0901 + - Ref: NeuronProblemDetectorVPCPrivateSubnet2Subnet3B7C3437 + SchedulingStrategy: DAEMON + DependsOn: + - NeuronProblemDetectorTaskRoleDefaultPolicyCFCDEF04 + - NeuronProblemDetectorTaskRole673752FB + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronNpdAndRecoveryDaemonService/Service + NeuronNpdAndRecoveryDaemonServiceSecurityGroupC5B1D29B: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: NeuronProblemDetectorStack/NeuronNpdAndRecoveryDaemonService/SecurityGroup + SecurityGroupEgress: + - CidrIp: 0.0.0.0/0 + Description: Allow all outbound traffic by default + IpProtocol: "-1" + VpcId: + Ref: NeuronProblemDetectorVPC5F617726 + DependsOn: + - NeuronProblemDetectorTaskRoleDefaultPolicyCFCDEF04 + - NeuronProblemDetectorTaskRole673752FB + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/NeuronNpdAndRecoveryDaemonService/SecurityGroup/Resource + CDKMetadata: + Type: AWS::CDK::Metadata + Properties: + Analytics: 
v2:deflate64:H4sIAAAAAAAA/2VSXW/bMAz8LX1XtDkb+p5lXRcgK4Q46GtBK0zCRZYMfaQLDP/30vJcJ+0Tj2eaOh45l8X9d1ncwWuY6d1pZqiSbRlBnwRTLy3quWyfGy2We/uslkKlypAuU2Ux9tyENi5F3EJlcOInbhGC0wSRnH0v7sHDSvXhCeIjRHyFi1Cezgynxisb0TMeCwYl/7NFZK3HGm0UJerkKV4evUtN1nBDrCFZfdxi3RgY3r5lOoE6yHZpUuD3+u8j3EI4/cQ9WRrlf2ScjUAW/RW3Jpv+KfBQI/cI4kHPS/Rn0oM9A+wEpOiCBkP2INsFJ+WQvA/xiVvTHvVFG/zt3CmPcU10gqCW7cYNa8hROV5Z9m1CKxsiWI3Kuz0ZFmKgrnYg21/syTjmNVboawqBs04Ey0bxgoL21IwFN/nWNZRvJgPu7g78y9pNc42468QGg0u+N4b9dvWUsv4r/Aeahl34UMUDnGmH/gcEFHxlGPl6D7mORY/+53U6u8u76YS6xKOzX77JopDF17u/gWjmk41Uo9wM8Q0+wiv5GQMAAA== + Metadata: + aws:cdk:path: NeuronProblemDetectorStack/CDKMetadata/Default + Condition: CDKMetadataAvailable +Mappings: + LatestNodeRuntimeMap: + af-south-1: + value: nodejs20.x + ap-east-1: + value: nodejs20.x + ap-northeast-1: + value: nodejs20.x + ap-northeast-2: + value: nodejs20.x + ap-northeast-3: + value: nodejs20.x + ap-south-1: + value: nodejs20.x + ap-south-2: + value: nodejs20.x + ap-southeast-1: + value: nodejs20.x + ap-southeast-2: + value: nodejs20.x + ap-southeast-3: + value: nodejs20.x + ap-southeast-4: + value: nodejs20.x + ap-southeast-5: + value: nodejs20.x + ap-southeast-7: + value: nodejs20.x + ca-central-1: + value: nodejs20.x + ca-west-1: + value: nodejs20.x + cn-north-1: + value: nodejs18.x + cn-northwest-1: + value: nodejs18.x + eu-central-1: + value: nodejs20.x + eu-central-2: + value: nodejs20.x + eu-north-1: + value: nodejs20.x + eu-south-1: + value: nodejs20.x + eu-south-2: + value: nodejs20.x + eu-west-1: + value: nodejs20.x + eu-west-2: + value: nodejs20.x + eu-west-3: + value: nodejs20.x + il-central-1: + value: nodejs20.x + me-central-1: + value: nodejs20.x + me-south-1: + value: nodejs20.x + mx-central-1: + value: nodejs20.x + sa-east-1: + value: nodejs20.x + us-east-1: + value: nodejs20.x + us-east-2: + value: nodejs20.x + us-west-1: + value: nodejs20.x + us-west-2: + value: nodejs20.x +Parameters: + 
SsmParameterValueawsserviceecsoptimizedamiamazonlinux2infrecommendedimageidC96584B6F00A464EAD1953AFF4B05118Parameter: + Type: AWS::SSM::Parameter::Value + Default: /aws/service/ecs/optimized-ami/amazon-linux-2/inf/recommended/image_id + BootstrapVersion: + Type: AWS::SSM::Parameter::Value + Default: /cdk-bootstrap/hnb659fds/version + Description: Version of the CDK Bootstrap resources in this environment, automatically retrieved from SSM Parameter Store. [cdk:skip] +Conditions: + CDKMetadataAvailable: + Fn::Or: + - Fn::Or: + - Fn::Equals: + - Ref: AWS::Region + - af-south-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-east-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-northeast-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-northeast-2 + - Fn::Equals: + - Ref: AWS::Region + - ap-northeast-3 + - Fn::Equals: + - Ref: AWS::Region + - ap-south-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-south-2 + - Fn::Equals: + - Ref: AWS::Region + - ap-southeast-1 + - Fn::Equals: + - Ref: AWS::Region + - ap-southeast-2 + - Fn::Equals: + - Ref: AWS::Region + - ap-southeast-3 + - Fn::Or: + - Fn::Equals: + - Ref: AWS::Region + - ap-southeast-4 + - Fn::Equals: + - Ref: AWS::Region + - ca-central-1 + - Fn::Equals: + - Ref: AWS::Region + - ca-west-1 + - Fn::Equals: + - Ref: AWS::Region + - cn-north-1 + - Fn::Equals: + - Ref: AWS::Region + - cn-northwest-1 + - Fn::Equals: + - Ref: AWS::Region + - eu-central-1 + - Fn::Equals: + - Ref: AWS::Region + - eu-central-2 + - Fn::Equals: + - Ref: AWS::Region + - eu-north-1 + - Fn::Equals: + - Ref: AWS::Region + - eu-south-1 + - Fn::Equals: + - Ref: AWS::Region + - eu-south-2 + - Fn::Or: + - Fn::Equals: + - Ref: AWS::Region + - eu-west-1 + - Fn::Equals: + - Ref: AWS::Region + - eu-west-2 + - Fn::Equals: + - Ref: AWS::Region + - eu-west-3 + - Fn::Equals: + - Ref: AWS::Region + - il-central-1 + - Fn::Equals: + - Ref: AWS::Region + - me-central-1 + - Fn::Equals: + - Ref: AWS::Region + - me-south-1 + - Fn::Equals: + - Ref: AWS::Region + - sa-east-1 + - 
Fn::Equals: + - Ref: AWS::Region + - us-east-1 + - Fn::Equals: + - Ref: AWS::Region + - us-east-2 + - Fn::Equals: + - Ref: AWS::Region + - us-west-1 + - Fn::Equals: + - Ref: AWS::Region + - us-west-2 + diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__init__.py b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json new file mode 100644 index 00000000..a6a52cc5 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/ecs_task_definition.json @@ -0,0 +1,85 @@ +{ + "family": "neuron-npd-and-recovery", + "containerDefinitions": [ + { + "name": "npd", + "image": "registry.k8s.io/node-problem-detector/node-problem-detector:v0.8.20", + "cpu": 0, + "portMappings": [ + { + "name": "npd-80-tcp", + "containerPort": 80, + "hostPort": 80 + } + ], + "essential": true, + "entryPoint": [ + "/bin/sh", + "-c" + ], + "command": [ + "echo 
'{\"plugin\":\"kmsg\",\"logPath\":\"/dev/kmsg\",\"lookback\":\"5m\",\"bufferSize\":10,\"source\":\"kernel-monitor\",\"conditions\":[{\"type\":\"NeuronHealth\",\"reason\":\"NeuronHasNoError\",\"message\":\"Neuronhasnoerror\"}],\"rules\":[{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_SRAM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=SRAM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_NC_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=NC_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_HBM_UNCORRECTABLE_ERROR\",\"pattern\":\".*NEURON_HW_ERR=HBM_UNCORRECTABLE_ERROR.*\"},{\"type\":\"permanent\",\"condition\":\"NeuronHealth\",\"reason\":\"NeuronHasError_DMA_ERROR\",\"pattern\":\".*NEURON_HW_ERR=DMA_ERROR.*\"}]}' > /config/kernel-monitor.json && /node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false --config.system-log-monitor=/config/kernel-monitor.json" + ], + "environment": [], + "mountPoints": [], + "volumesFrom": [], + "linuxParameters": { + "devices": [ + { + "hostPath": "/dev/kmsg", + "containerPath": "/dev/kmsg", + "permissions": [ + "read", + "write" + ] + } + ] + }, + "privileged": true, + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": "/ecs/npd", + "awslogs-create-group": "true", + "awslogs-stream-prefix": "ecs" + }, + "secretOptions": [] + }, + "systemControls": [] + }, + { + "name": "recovery", + "image": "public.ecr.aws/neuron/neuron-node-recovery:1.3.0", + "cpu": 0, + "portMappings": [], + "essential": true, + "entryPoint": [ + "/bin/sh", + "-c" + ], + "command": [ + "python scripts/check-health.py" + ], + "environment": [ + { + "name": "ENABLE_RECOVERY", + "value": "true" + } + ], + "mountPoints": [], + "volumesFrom": [], + "readonlyRootFilesystem": true, + "logConfiguration": { + "logDriver": "awslogs", + "options": { + 
"awslogs-create-group": "true", + "awslogs-group": "/ecs/recovery", + "awslogs-stream-prefix": "ecs" + } + }, + "systemControls": [] + } + ], + "cpu": "1024", + "memory": "3072" +} diff --git a/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py new file mode 100644 index 00000000..09a19958 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/neuron_problem_detector/neuron_problem_detector_stack.py @@ -0,0 +1,178 @@ +from aws_cdk import ( + # Duration, + Stack, + # aws_sqs as sqs, + aws_ec2 as ec2, + aws_ecs as ecs, + aws_iam as iam, + aws_logs as logs, + aws_autoscaling as autoscaling, +) +from constructs import Construct +import json + + + +class NeuronProblemDetectorStack(Stack): + + def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: + super().__init__(scope, construct_id, **kwargs) + with open('neuron_problem_detector/ecs_task_definition.json', 'r') as f: + ecs_task_definition = json.load(f) + + vpc = ec2.Vpc(self, "NeuronProblemDetectorVPC", max_azs=2) + + ecs_cluster = ecs.Cluster(self, "NeuronProblemDetectorCluster", vpc=vpc) + + ecs_cluster.add_capacity( + id="NeuronAutoScalingGroupCapacity", + machine_image=ecs.EcsOptimizedImage.amazon_linux2( + ecs.AmiHardwareType.NEURON + ), + max_capacity=3, + min_capacity=1, + desired_capacity=1, + instance_type=ec2.InstanceType("inf2.xlarge"), + ssm_session_permissions=True, + can_containers_access_instance_role=True, + ) + + # Create the task execution role + task_execution_role = iam.Role( + self, + "NeuronProblemDetectorTaskExecutionRole", + assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"), + managed_policies=[ + iam.ManagedPolicy.from_aws_managed_policy_name( + "service-role/AmazonECSTaskExecutionRolePolicy" + ), + ], + ) + + iam_policy_document = iam.PolicyDocument( + statements=[ + iam.PolicyStatement( + actions=[ + 
"autoscaling:SetInstanceHealth", + "autoscaling:DescribeAutoScalingInstances", + ], + resources=["*"], + effect=iam.Effect.ALLOW, + ), + iam.PolicyStatement( + actions=["ec2:DescribeInstances"], + resources=["*"], + effect=iam.Effect.ALLOW, + ), + iam.PolicyStatement( + actions=["cloudwatch:PutMetricData"], + resources=["*"], + effect=iam.Effect.ALLOW + ), + ] + ) + + iam.PolicyStatement( + actions=[ + "autoscaling:SetInstanceHealth", + "autoscaling:DescribeAutoScalingInstances", + ], + resources=["*"], + effect=iam.Effect.ALLOW, + ) + + # Create a task role (if needed) + task_role = iam.Role( + self, + "NeuronProblemDetectorTaskRole", + assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"), + inline_policies={"node-recovery": iam_policy_document}, + ) + + # Create an ECS Task Definition + task_definition = ecs.TaskDefinition( + self, + "NeuronNpdAndRecoveryTaskDef", + family="neuron-npd-and-recovery", + network_mode=ecs.NetworkMode.AWS_VPC, + cpu=ecs_task_definition["cpu"], + memory_mib=ecs_task_definition["memory"], + compatibility=ecs.Compatibility.EC2, + execution_role=task_execution_role, + task_role=task_role + ) + + # Create the device mapping + device_mapping = ecs.Device( + host_path=ecs_task_definition["containerDefinitions"][0]["linuxParameters"]["devices"][0]["hostPath"], + container_path=ecs_task_definition["containerDefinitions"][0]["linuxParameters"]["devices"][0]["containerPath"], + permissions=[ecs.DevicePermission.READ, ecs.DevicePermission.WRITE], + ) + + linux_parameters = ecs.LinuxParameters( + self, + "NpdLinuxParameters", + ) + + linux_parameters.add_devices(device_mapping) + + npd_container = task_definition.add_container( + ecs_task_definition["containerDefinitions"][0]["name"], + image=ecs.ContainerImage.from_registry( + ecs_task_definition["containerDefinitions"][0]["image"] + ), + entry_point=ecs_task_definition["containerDefinitions"][0]["entryPoint"], + command=ecs_task_definition["containerDefinitions"][0]["command"], + 
privileged=True, + logging=ecs.AwsLogDriver( + stream_prefix=ecs_task_definition["containerDefinitions"][0]["logConfiguration"]["options"]["awslogs-stream-prefix"], + log_group=logs.LogGroup( + self, + "NpdLogGroup", + log_group_name=ecs_task_definition["containerDefinitions"][0]["logConfiguration"]["options"]["awslogs-group"], + retention=logs.RetentionDays.ONE_WEEK, + ), + ), + linux_parameters=linux_parameters, + ) + + npd_container.add_port_mappings( + ecs.PortMapping( + name=ecs_task_definition["containerDefinitions"][0]["portMappings"][0]["name"], + container_port=ecs_task_definition["containerDefinitions"][0]["portMappings"][0]["containerPort"], + host_port=ecs_task_definition["containerDefinitions"][0]["portMappings"][0]["hostPort"], + protocol=ecs.Protocol.TCP, + app_protocol=ecs.AppProtocol.http, + ) + ) + + recovery_container = task_definition.add_container( + ecs_task_definition["containerDefinitions"][1]["name"], + image=ecs.ContainerImage.from_registry( + ecs_task_definition["containerDefinitions"][1]["image"] + ), + entry_point=ecs_task_definition["containerDefinitions"][1]["entryPoint"], + command=ecs_task_definition["containerDefinitions"][1]["command"], + environment={ + ecs_task_definition["containerDefinitions"][1]["environment"][0]["name"]: ecs_task_definition["containerDefinitions"][1]["environment"][0]["value"] + }, + readonly_root_filesystem=ecs_task_definition["containerDefinitions"][1]["readonlyRootFilesystem"], + logging=ecs.AwsLogDriver( + stream_prefix=ecs_task_definition["containerDefinitions"][1]["logConfiguration"]["options"]["awslogs-stream-prefix"], + log_group=logs.LogGroup( + self, + "RecoveryLogGroup", + log_group_name=ecs_task_definition["containerDefinitions"][1]["logConfiguration"]["options"]["awslogs-group"], + retention=logs.RetentionDays.ONE_WEEK, + ), + ), + ) + + ec2_service = ecs.Ec2Service( + self, + "NeuronNpdAndRecoveryDaemonService", + cluster=ecs_cluster, + task_definition=task_definition, + daemon=True, + 
enable_execute_command=True, + ) diff --git a/neuron-problem-detector/ecs-npd-cdk/requirements-dev.txt b/neuron-problem-detector/ecs-npd-cdk/requirements-dev.txt new file mode 100644 index 00000000..92709451 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/requirements-dev.txt @@ -0,0 +1 @@ +pytest==6.2.5 diff --git a/neuron-problem-detector/ecs-npd-cdk/requirements.txt b/neuron-problem-detector/ecs-npd-cdk/requirements.txt new file mode 100644 index 00000000..54af265a --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/requirements.txt @@ -0,0 +1,2 @@ +aws-cdk-lib>=2.152.0 +constructs>=10.0.0,<11.0.0 diff --git a/neuron-problem-detector/ecs-npd-cdk/source.bat b/neuron-problem-detector/ecs-npd-cdk/source.bat new file mode 100644 index 00000000..9e1a8344 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/source.bat @@ -0,0 +1,13 @@ +@echo off + +rem The sole purpose of this script is to make the command +rem +rem source .venv/bin/activate +rem +rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. +rem On Windows, this command just runs this batch file (the argument is ignored). +rem +rem Now we don't need to document a Windows command for activating a virtualenv. 
+ +echo Executing .venv\Scripts\activate.bat for you +.venv\Scripts\activate.bat diff --git a/neuron-problem-detector/ecs-npd-cdk/tests/__init__.py b/neuron-problem-detector/ecs-npd-cdk/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neuron-problem-detector/ecs-npd-cdk/tests/unit/__init__.py b/neuron-problem-detector/ecs-npd-cdk/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neuron-problem-detector/ecs-npd-cdk/tests/unit/test_neuron_problem_detector_stack.py b/neuron-problem-detector/ecs-npd-cdk/tests/unit/test_neuron_problem_detector_stack.py new file mode 100644 index 00000000..d90e6b12 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/tests/unit/test_neuron_problem_detector_stack.py @@ -0,0 +1,18 @@ +import aws_cdk as core +import aws_cdk.assertions as assertions + +from neuron_problem_detector.neuron_problem_detector_stack import NeuronProblemDetectorStack + +# example tests. To run these tests, uncomment this file along with the example +# resource in neuron_problem_detector/neuron_problem_detector_stack.py +def test_sqs_queue_created(): + app = core.App() + stack = NeuronProblemDetectorStack(app, "neuron-problem-detector") + template = assertions.Template.from_stack(stack) + + template.has_resource_properties("AWS::ECS::Cluster",{}) + + +# template.has_resource_properties("AWS::SQS::Queue", { +# "VisibilityTimeout": 300 +# }) diff --git a/neuron-problem-detector/ecs-npd-cdk/tools/check-ecs-exec.sh b/neuron-problem-detector/ecs-npd-cdk/tools/check-ecs-exec.sh new file mode 100644 index 00000000..2a692a79 --- /dev/null +++ b/neuron-problem-detector/ecs-npd-cdk/tools/check-ecs-exec.sh @@ -0,0 +1,717 @@ +#!/usr/bin/env bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +# shellcheck disable=SC2059 + +CHECKER_VERSION=v0.7 + +# Script Name: check-ecs-exec.sh +# Usage : bash ./check-ecs-exec.sh + +set -euo pipefail + +## NOTE: Checks in this script are mainly based on: +## +## "Using Amazon ECS Exec for debugging - Amazon Elastic Container Service" +## https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-exec.html +## +## "NEW – Using Amazon ECS Exec to access your containers on AWS Fargate and Amazon EC2" +## https://aws.amazon.com/blogs/containers/new-using-amazon-ecs-exec-access-your-containers-fargate-ec2/ +## + +## NOTE: This script at least needs the following permissions. +## 1. If you use an IAM user with an assumed role to run the script, +## then you need to allow the "iam:ListRoles" action in addition to the following. +## 2. If you configured your ECS cluster to use KMS key for ECS Exec, +## then you need to allow the "kms:DescribeKey" action in addition to the following. +## { +## "Version": "2012-10-17", +## "Statement": [ +## { +## "Effect": "Allow", +## "Action": [ +## "iam:GetInstanceProfile", +## "iam:SimulatePrincipalPolicy", +## "ec2:DescribeSubnets", +## "ec2:DescribeVpcEndpoints", +## "ecs:DescribeClusters", +## "ecs:DescribeContainerInstances", +## "ecs:DescribeTaskDefinition", +## "ecs:DescribeTasks" +## ], +## "Resource": "*" +## } +## ] +## } + +# If you have multiple AWS CLI binaries, v1 and v2 for instance, you can choose which AWS CLI binary to use by setting the AWS_CLI_BIN env var. +# e.g. 
#      AWS_CLI_BIN=aws-v1 ./check-ecs-exec.sh YOUR_ECS_CLUSTER_NAME YOUR_ECS_TASK_ID
AWS_CLI_BIN=${AWS_CLI_BIN:-aws}

# Force AWS CLI output format to json to use jq to parse its output
export AWS_DEFAULT_OUTPUT=json

# Colors for output (ANSI escape sequences, interpreted by printf)
COLOR_DEFAULT='\033[0m'
COLOR_RED='\033[0;31m'
COLOR_YELLOW='\033[1;33m'
COLOR_GREEN='\033[0;32m'

# Validation for required parameters
# "None" is used as the sentinel for a missing positional argument.
CLUSTER_NAME=${1:-None} # A cluster name or a full ARN of the cluster
TASK_ID=${2:-None} # A task ID or a full ARN of the task
if [[ "${CLUSTER_NAME}" = "None" || "${TASK_ID}" = "None" ]]; then
  printf "${COLOR_RED}Usage:\n" >&2
  printf " ./check-ecs-exec.sh YOUR_ECS_CLUSTER_NAME YOUR_ECS_TASK_ID${COLOR_DEFAULT}\n" >&2
  exit 1
fi

#### Functions

# Print a horizontal separator line in the default color.
printSectionHeaderLine() {
  printf "${COLOR_DEFAULT}-------------------------------------------------------------\n"
}

# Compare two version strings using `sort -V` ordering.
# Arguments: $1 - required version, $2 - current version
# Returns:   0 when the required version sorts first (current >= required),
#            non-zero otherwise.
equalsOrGreaterVersion() {
  required=$1
  current=$2
  if [[ "$(printf '%s\n' "$required" "$current" | sort -V | head -n1)" = "$required" ]]; then
    return
  fi
  false
}

# Resolve the IAM role ARN behind an assumed-role caller identity.
# Arguments: $1 - JSON document with a ".UserId" field (the caller passes the
#                 output of `sts get-caller-identity`)
# Outputs:   the ARN(s) of the role(s) whose RoleId equals the part of
#            ".UserId" before the first ":" (text output, may be empty)
getRoleArnForAssumedRole() {
  callerIdentityJson=$1
  ROLE_ID=$(echo "${callerIdentityJson}" | jq -r ".UserId" | cut -d: -f1)
  # Use the configured CLI binary (not a hard-coded `aws`) so that the
  # AWS_CLI_BIN override is honored here like everywhere else in this script.
  ${AWS_CLI_BIN} iam list-roles --query "Roles[?RoleId=='${ROLE_ID}'].Arn" --output text
}

# For `iam simulate-principal-policy`
# Extract the decision for one action from a policy-simulation result.
# Arguments: $1 - JSON document containing ".EvaluationResults[]"
#            $2 - action name to look up (matched against ".EvalActionName")
# Outputs:   the matching ".EvalDecision" value(s), one per line
readEvalDecision() {
  evalResultsJson=$1
  actionName=$2
  echo "${evalResultsJson}" | jq -r --arg ACTION_NAME "$actionName" '.EvaluationResults[] | select(.EvalActionName==$ACTION_NAME) | .EvalDecision'
}

# Print "<action>: <decision>", green when the decision is "allowed" and red
# for anything else.
# Arguments: $1 - evaluation decision, $2 - action name
showEvalResult() {
  evalResult=$1
  actionName=$2
  printf "${COLOR_DEFAULT} ${actionName}: "
  if [[ "${evalResult}" = "allowed" ]]; then
    printf "${COLOR_GREEN}${evalResult}\n"
  else
    printf "${COLOR_RED}${evalResult}\n"
  fi
}

## 1.
## CHECK PREREQUISITES FOR check-ecs-exec.sh ##########################################
printSectionHeaderLine
printf "${COLOR_DEFAULT}Prerequisites for check-ecs-exec.sh ${CHECKER_VERSION}\n"
printSectionHeaderLine
##########################################################################################

# Check if jq command exists
# The exit status is captured on both branches so that a missing command does
# not abort the script (it runs under `set -e`) before the message is printed.
command -v jq >/dev/null 2>&1 && status="$?" || status="$?"
if [[ ! "${status}" = 0 ]]; then
  printf "${COLOR_RED}Pre-flight check failed: \`jq\` command is missing${COLOR_DEFAULT}\n" >&2
  exit 1
fi
# Report the resolved location with the `command -v` builtin rather than the
# external `which`, which is not guaranteed to be installed.
printf "${COLOR_DEFAULT} jq | ${COLOR_GREEN}OK ${COLOR_DEFAULT}($(command -v jq))\n"

# Check if aws command exists
command -v "${AWS_CLI_BIN}" >/dev/null 2>&1 && status="$?" || status="$?"
if [[ ! "${status}" = 0 ]]; then
  printf "${COLOR_RED}Pre-flight check failed: \`${AWS_CLI_BIN}\` command is missing${COLOR_DEFAULT}\n" >&2
  exit 1
fi
printf "${COLOR_DEFAULT} AWS CLI | ${COLOR_GREEN}OK ${COLOR_DEFAULT}($(command -v "${AWS_CLI_BIN}"))\n"

# Find AWS region
# Carriage returns are stripped from the CLI output; if the lookup fails the
# value falls back to an empty string instead of terminating the script.
REGION=$(${AWS_CLI_BIN} configure get region | tr -d "\r" || echo "")
# An already exported AWS_REGION takes precedence over the configured region.
export AWS_REGION=${AWS_REGION:-$REGION}
# Check region configuration in "source_profile" if the user uses MFA configurations
source_profile=$(${AWS_CLI_BIN} configure get source_profile || echo "")
if [ "${AWS_REGION}" = "" ] && [ "${source_profile}" != "" ]; then
  region=$(${AWS_CLI_BIN} configure get region --profile "${source_profile}" || echo "")
  export AWS_REGION="${region}"
fi
if [[ "${AWS_REGION}" = "" ]]; then
  printf "${COLOR_RED}Pre-flight check failed: Missing AWS region. Use the \`aws configure set default.region\` command or set the \"AWS_REGION\" environment variable.${COLOR_DEFAULT}\n" >&2
  exit 1
fi

## 2.
## CHECK PREREQUISITES FOR USING ECS EXEC FEATURE VIA AWS CLI #########################
printf "\n"
printSectionHeaderLine
printf "${COLOR_DEFAULT}Prerequisites for the AWS CLI to use ECS Exec\n"
printSectionHeaderLine
##########################################################################################

# MFA
# Prefer an MFA serial supplied through the environment; otherwise look it up
# in the CLI configuration (empty string when none is configured).
if [[ -z "${AWS_MFA_SERIAL:-}" ]]; then
  AWS_MFA_SERIAL=$(${AWS_CLI_BIN} configure get mfa_serial || echo "")
fi
ROLE_TO_BE_ASSUMED=$(${AWS_CLI_BIN} configure get role_arn || echo "")
SOURCE_PROFILE=$(${AWS_CLI_BIN} configure get source_profile || echo "")
# The AWS CLI normally asks for the MFA code by itself. An explicit prompt is
# only needed when an MFA serial is set but neither "role_arn" nor
# "source_profile" is configured.
if [[ -n "${AWS_MFA_SERIAL}" && -z "${ROLE_TO_BE_ASSUMED}" && -z "${SOURCE_PROFILE}" ]]; then
  # Prompt users to enter MFA code to obtain temporary credentials;
  # keep asking until a non-empty code has been typed.
  mfa_code=""
  while :; do
    printf "\n"
    printf "Type MFA code for ${AWS_MFA_SERIAL}: "
    read -rs mfa_code
    if [[ -n "${mfa_code}" ]]; then
      break
    fi
    printf "${COLOR_RED}MFA code cannot be empty${COLOR_DEFAULT}"
  done

  # Exchange the MFA code for temporary credentials and export them so that
  # every following AWS CLI call uses the session.
  tmpCreds=$(${AWS_CLI_BIN} sts get-session-token --serial-number "${AWS_MFA_SERIAL}" --token-code "${mfa_code}")
  accessKey=$(jq -r .Credentials.AccessKeyId <<<"${tmpCreds}")
  secretKey=$(jq -r .Credentials.SecretAccessKey <<<"${tmpCreds}")
  sessionToken=$(jq -r .Credentials.SessionToken <<<"${tmpCreds}")
  export AWS_ACCESS_KEY_ID="${accessKey}"
  export AWS_SECRET_ACCESS_KEY="${secretKey}"
  export AWS_SESSION_TOKEN="${sessionToken}"
fi

# Find caller identity
callerIdentityJson=$(${AWS_CLI_BIN} sts get-caller-identity)
ACCOUNT_ID=$(jq -r ".Account" <<<"${callerIdentityJson}")
CALLER_IAM_ARN=$(jq -r ".Arn" <<<"${callerIdentityJson}")
# User, role and group ARNs are used as-is; an assumed-role ARN is resolved to
# the ARN of the underlying role; any other ARN shape is rejected.
if [[ "${CALLER_IAM_ARN}" == *:user/* || "${CALLER_IAM_ARN}" == *:role/* || "${CALLER_IAM_ARN}" == *:group/* ]]; then
  MY_IAM_ARN="${CALLER_IAM_ARN}"
elif [[ "${CALLER_IAM_ARN}" == *:assumed-role/* ]]; then
  MY_IAM_ARN=$(getRoleArnForAssumedRole "${callerIdentityJson}")
else
  printf "${COLOR_RED}Pre-flight check failed: The ARN \"${CALLER_IAM_ARN}\" associated with the caller(=you) is not supported. Try again either with one of an IAM user, an IAM role, or an assumed IAM role.${COLOR_DEFAULT}\n" >&2 && exit 1
fi
if [[ -z "${MY_IAM_ARN}" ]]; then
  printf "${COLOR_RED}Unknown error: Failed to get the role ARN of the caller(=you).${COLOR_DEFAULT}\n" >&2
  exit 1
fi

# Check task existence
describedTaskJson=$(${AWS_CLI_BIN} ecs describe-tasks --cluster "${CLUSTER_NAME}" --tasks "${TASK_ID}" --output json)
existTask=$(jq -r ".tasks[0].taskDefinitionArn" <<<"${describedTaskJson}")
# jq prints the literal string "null" when no task was returned.
if [[ "${existTask}" = "null" ]]; then
  printf "${COLOR_RED}Pre-flight check failed: The specified ECS task does not exist.\n\
Make sure the parameters you have specified for cluster \"${CLUSTER_NAME}\" and task \"${TASK_ID}\" are both valid.${COLOR_DEFAULT}\n"
  exit 1
fi

# Check whether the AWS CLI v1.19.28/v2.1.30 or later exists
# A missing "enableExecuteCommand" field in the describe-tasks output is taken
# as the sign of a CLI that is too old.
executeCommandEnabled=$(jq -r ".tasks[0].enableExecuteCommand" <<<"${describedTaskJson}")
if [[ "${executeCommandEnabled}" = "null" ]]; then
  printf "${COLOR_RED}Pre-flight check failed: ECS Exec requires the AWS CLI v1.19.28/v2.1.30 or later.\n\
Please update the AWS CLI and try again?\n\
 For v2: https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html\n\
 For v1: https://docs.aws.amazon.com/cli/latest/userguide/install-cliv1.html${COLOR_DEFAULT}\n"
  exit 1
fi
awsCliVersion=$(${AWS_CLI_BIN} --version 2>&1 | tr -d "\r")
printf "${COLOR_DEFAULT} AWS CLI Version | ${COLOR_GREEN}OK ${COLOR_DEFAULT}(${awsCliVersion})\n"

# Check whether the Session Manager plugin exists
printf "${COLOR_DEFAULT} Session Manager Plugin | "
# Record the probe result in "status" without letting a missing plugin abort
# the script; the result is evaluated by the statement that follows.
status=0
command -v session-manager-plugin >/dev/null 2>&1 || status="$?"
# Report the Session Manager plugin lookup result held in `status`.
if [[ "${status}" = 0 ]]; then
  smpVersion=$(session-manager-plugin --version)
  printf "${COLOR_GREEN}OK ${COLOR_DEFAULT}(%s)\n" "${smpVersion}"
else
  # https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html
  printf "${COLOR_RED}Missing\n"
fi

## 3. CHECK CLUSTER AND TASK CONFIGURATIONS ##############################################
# Values that come from user input or from AWS API responses are printed through
# "%s" arguments below so that a "%" or backslash inside them cannot be
# interpreted as printf format text. Only the COLOR_* variables stay in the
# format string.
printf "\n"
printSectionHeaderLine
printf "${COLOR_DEFAULT}Checks on ECS task and other resources\n"
printSectionHeaderLine
printf "${COLOR_DEFAULT}Region : %s\n" "${AWS_REGION}"
printf "${COLOR_DEFAULT}Cluster: %s\n" "${CLUSTER_NAME}"
printf "${COLOR_DEFAULT}Task : %s\n" "${TASK_ID}"
printSectionHeaderLine
##########################################################################################

# 1. Checks on the cluster configurations (yellow)
describedClusterJson=$(${AWS_CLI_BIN} ecs describe-clusters \
  --clusters "${CLUSTER_NAME}" \
  --include CONFIGURATIONS \
  --output json)
executeCommandConfigurationJson=$(echo "${describedClusterJson}" \
  | jq ".clusters[0].configuration.executeCommandConfiguration")

printf "${COLOR_DEFAULT} Cluster Configuration |"

# The literal string "null" (what jq prints for a missing field) marks
# "not configured"; the permission checks later in the script test for it.
kmsKeyId="null"
kmsKeyArn="null"
logging="null"
s3BucketName="null"
s3KeyPrefix="null"
s3Encryption="null"
cloudWatchLogGroupName="null"
cloudWatchLogEncryptionEnabled="null"
if [[ "${executeCommandConfigurationJson}" = "null" ]]; then
  printf "${COLOR_YELLOW} Audit Logging Not Configured"
else
  printf "\n"

  kmsKeyId=$(echo "${executeCommandConfigurationJson}" | jq -r ".kmsKeyId")
  printf "${COLOR_DEFAULT} KMS Key : "
  if [[ "${kmsKeyId}" = "null" ]]; then
    printf "${COLOR_YELLOW}Not Configured"
  else
    printf "%s" "${kmsKeyId}"
    # Resolve the configured key id to its ARN for the policy simulations.
    kmsKeyArn=$(${AWS_CLI_BIN} kms describe-key --key-id "${kmsKeyId}" --query 'KeyMetadata.Arn' --output text)
  fi
  printf "\n"

  logging=$(echo "${executeCommandConfigurationJson}" | jq -r ".logging")
  printf "${COLOR_DEFAULT} Audit Logging : "
  if [[ "${logging}" = "null" ]]; then
    printf "${COLOR_YELLOW}Not Configured"
  elif [[ "${logging}" = "NONE" ]]; then
    printf "${COLOR_YELLOW}Disabled"
  else
    printf "%s" "${logging}"
  fi
  printf "\n"

  s3BucketName=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.s3BucketName")
  s3KeyPrefix=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.s3KeyPrefix")
  s3Encryption=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.s3EncryptionEnabled")
  printf "${COLOR_DEFAULT} S3 Bucket Name: "
  if [[ "${s3BucketName}" = "null" ]]; then
    printf "Not Configured"
  else
    printf "%s" "${s3BucketName}"
    if [[ ! "${s3KeyPrefix}" = "null" ]]; then
      printf ", Key Prefix: %s" "${s3KeyPrefix}"
    fi
    printf ", Encryption Enabled: %s" "${s3Encryption}"
  fi
  printf "\n"

  cloudWatchLogGroupName=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.cloudWatchLogGroupName")
  cloudWatchLogEncryptionEnabled=$(echo "${executeCommandConfigurationJson}" | jq -r ".logConfiguration.cloudWatchEncryptionEnabled")
  printf "${COLOR_DEFAULT} CW Log Group : "
  if [[ "${cloudWatchLogGroupName}" = "null" ]]; then
    printf "Not Configured"
  else
    printf "%s" "${cloudWatchLogGroupName}"
    printf ", Encryption Enabled: %s" "${cloudWatchLogEncryptionEnabled}"
  fi
fi
printf "\n"

# 2. Check whether "I" can call ecs:ExecuteCommand
printf "${COLOR_DEFAULT} Can I ExecuteCommand? | %s\n" "${MY_IAM_ARN}"
ecsExecuteCommand="ecs:ExecuteCommand"
ecsExecEvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \
  --policy-source-arn "${MY_IAM_ARN}" \
  --action-names "${ecsExecuteCommand}" \
  --resource-arns "arn:aws:ecs:${AWS_REGION}:${ACCOUNT_ID}:task/${CLUSTER_NAME}/${TASK_ID}" \
  --output json \
  | jq -r ".EvaluationResults[0].EvalDecision")
showEvalResult "${ecsExecEvalResult}" "${ecsExecuteCommand}"
# The KMS permission is only simulated when the cluster configures a key.
if [[ ! "${kmsKeyId}" = "null" ]]; then
  kmsGenerateDataKey="kms:GenerateDataKey"
  kmsGenerateDataKeyResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \
    --policy-source-arn "${MY_IAM_ARN}" \
    --action-names "${kmsGenerateDataKey}" \
    --resource-arns "${kmsKeyArn}" \
    --output json \
    | jq -r ".EvaluationResults[0].EvalDecision")
  showEvalResult "${kmsGenerateDataKeyResult}" "${kmsGenerateDataKey}"
fi
## Check for ensuring "I cannot" call ssm:StartSession (yellow)
### See the "Limiting access to the Start Session action" section at https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-exec.html#ecs-exec-limit-access-start-session
ssmStartSession="ssm:StartSession"
printf "${COLOR_DEFAULT} %s denied?: " "${ssmStartSession}"
ssmSessionEvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \
  --policy-source-arn "${MY_IAM_ARN}" \
  --action-names "${ssmStartSession}" \
  --resource-arns "arn:aws:ecs:${AWS_REGION}:${ACCOUNT_ID}:task/${CLUSTER_NAME}/${TASK_ID}" \
  --output json \
  | jq -r ".EvaluationResults[0].EvalDecision")
# Here "allowed" is the undesirable outcome, hence yellow instead of green.
if [[ "${ssmSessionEvalResult}" = "allowed" ]]; then
  printf "${COLOR_YELLOW}"
else
  printf "${COLOR_GREEN}"
fi
printf "%s\n" "${ssmSessionEvalResult}"

# 3. Check the task is in RUNNING state
printf "${COLOR_DEFAULT} Task Status | "
taskStatus=$(echo "${describedTaskJson}" | jq -r ".tasks[0].lastStatus")
stoppedReason=$(echo "${describedTaskJson}" | jq -r ".tasks[0].stoppedReason")
case "${taskStatus}" in
  RUNNING)
    printf "${COLOR_GREEN}%s" "${taskStatus}"
    ;;
  PROVISIONING | ACTIVATING | PENDING)
    printf "${COLOR_YELLOW}%s" "${taskStatus}"
    ;;
  DEACTIVATING | STOPPING | DEPROVISIONING)
    printf "${COLOR_RED}%s" "${taskStatus}"
    ;;
  STOPPED)
    printf "${COLOR_RED}%s (%s)" "${taskStatus}" "${stoppedReason}"
    ;;
  *)
    printf "${COLOR_RED}%s" "${taskStatus}"
    ;;
esac
printf "${COLOR_DEFAULT}\n"

# 4.
# Check the launch type, platform version, ecs-agent version
# Fargate tasks are checked against a minimum platform version, EC2 tasks
# against a minimum ECS agent version; any other launch type is reported as
# UNKNOWN. `describedContainerInstanceJson` stays empty unless the task runs on
# EC2.
launchType=$(jq -r ".tasks[0].launchType" <<<"${describedTaskJson}")
describedContainerInstanceJson=""
printf "${COLOR_DEFAULT} Launch Type | "
case "${launchType}" in
  FARGATE)
    # For FARGATE Launch Type
    printf "${COLOR_GREEN}Fargate\n"
    # Check the PV
    printf "${COLOR_DEFAULT} Platform Version | "

    # Detect platform family to use correct platform version required:
    # 1.0.0 minimum for windows, 1.4.0 for others
    pf=$(jq -r ".tasks[0].platformFamily" <<<"${describedTaskJson}")
    requiredPV="1.4.0"
    if [[ "${pf}" == *"Windows"* ]]; then
      requiredPV="1.0.0"
    fi

    pv=$(jq -r ".tasks[0].platformVersion" <<<"${describedTaskJson}")
    if ! equalsOrGreaterVersion "${requiredPV}" "${pv}"; then
      printf "${COLOR_RED}${pv} (Required: >= ${requiredPV})"
    else
      printf "${COLOR_GREEN}${pv}"
    fi
    printf "\n"
    ;;
  EC2)
    # For EC2 Launch Type
    printf "${COLOR_GREEN}EC2\n"
    # Check the ECS-Agent version
    containerInstanceArn=$(jq -r ".tasks[0].containerInstanceArn" <<<"${describedTaskJson}")
    requiredAgentVersion="1.50.2"
    describedContainerInstanceJson=$(${AWS_CLI_BIN} ecs describe-container-instances \
      --cluster "${CLUSTER_NAME}" \
      --container-instance "${containerInstanceArn}" \
      --output json)
    agentVersion=$(jq -r ".containerInstances[0].versionInfo.agentVersion" <<<"${describedContainerInstanceJson}")
    printf "${COLOR_DEFAULT} ECS Agent Version | "
    if ! equalsOrGreaterVersion "${requiredAgentVersion}" "${agentVersion}"; then
      printf "${COLOR_RED}${agentVersion} (Required: >= ${requiredAgentVersion})"
    else
      printf "${COLOR_GREEN}${agentVersion}"
    fi
    printf "\n"
    ;;
  *)
    printf "${COLOR_YELLOW}UNKNOWN\n"
    ;;
esac

# 5.
# Check whether the `execute-command` option is enabled for the task
printf "${COLOR_DEFAULT} Exec Enabled for Task | "
case "${executeCommandEnabled}" in
  true) printf "${COLOR_GREEN}OK" ;;
  *) printf "${COLOR_RED}NO" ;;
esac
printf "${COLOR_DEFAULT}\n"

# 6. Check the managed agents' status
# Skipped entirely when exec is disabled for the task. Otherwise one line is
# printed per managed-agent status found in the task description.
printf "${COLOR_DEFAULT} Container-Level Checks | \n"
printf "${COLOR_DEFAULT} ----------\n"
printf "${COLOR_DEFAULT} Managed Agent Status"
if [[ "${executeCommandEnabled}" != "false" ]]; then
  printf "\n"
  printf "${COLOR_DEFAULT} ----------\n"
  agentsStatus=$(jq -r ".tasks[0].containers[].managedAgents[].lastStatus" <<<"${describedTaskJson}")
  idx=0
  # NOTE(review): the loop runs once per managed-agent status but reads
  # container number `idx` and its first managed agent, so it presumably
  # expects exactly one managed agent per container - confirm.
  for _ in ${agentsStatus}; do
    containerPath=".tasks[0].containers[${idx}]"
    containerName=$(jq -r "${containerPath}.name" <<<"${describedTaskJson}")
    status=$(jq -r "${containerPath}.managedAgents[0].lastStatus" <<<"${describedTaskJson}")
    reason=$(jq -r "${containerPath}.managedAgents[0].reason" <<<"${describedTaskJson}")
    lastStartedAt=$(jq -r "${containerPath}.managedAgents[0].lastStartedAt" <<<"${describedTaskJson}")
    printf " $((idx+1)). "
    # Any status that is neither STOPPED nor PENDING is shown as RUNNING.
    if [[ "${status}" == *STOPPED* ]]; then
      printf "${COLOR_RED}STOPPED (Reason: ${reason})"
    elif [[ "${status}" == *PENDING* ]]; then
      printf "${COLOR_YELLOW}PENDING"
    else
      printf "${COLOR_GREEN}RUNNING"
    fi
    printf "${COLOR_DEFAULT} for \"${containerName}\""
    if [[ "${status}" = "STOPPED" ]]; then
      printf " - LastStartedAt: ${lastStartedAt}"
    fi
    printf "\n"
    idx=$((idx + 1))
  done
else
  printf " - ${COLOR_YELLOW}SKIPPED\n"
  printf "${COLOR_DEFAULT} ----------\n"
fi

# 7.
# Check the "initProcessEnabled" flag added in the task definition (yellow)
# The task definition is fetched once here; `taskDefJson`, `taskDefFamily` and
# `taskDefRevision` are reused by the read-only root filesystem check below.
taskDefArn=$(jq -r ".tasks[0].taskDefinitionArn" <<<"${describedTaskJson}")
taskDefJson=$(${AWS_CLI_BIN} ecs describe-task-definition \
  --task-definition "${taskDefArn}" \
  --output json)
taskDefFamily=$(jq -r ".taskDefinition.family" <<<"${taskDefJson}")
taskDefRevision=$(jq -r ".taskDefinition.revision" <<<"${taskDefJson}")
# One value per container definition: "true", "false", or "null" when unset.
initEnabledList=$(jq -r ".taskDefinition.containerDefinitions[].linuxParameters.initProcessEnabled" <<<"${taskDefJson}")
printf "${COLOR_DEFAULT} ----------\n"
printf "${COLOR_DEFAULT} Init Process Enabled (${taskDefFamily}:${taskDefRevision})\n"
printf "${COLOR_DEFAULT} ----------\n"
idx=0
for enabled in ${initEnabledList}; do
  containerName=$(jq -r ".taskDefinition.containerDefinitions[${idx}].name" <<<"${taskDefJson}")
  printf " $((idx+1)). "
  # Anything other than an explicit "true" is reported as Disabled.
  if [[ "${enabled}" == *true* ]]; then
    printf "${COLOR_GREEN}Enabled"
  else
    printf "${COLOR_YELLOW}Disabled"
  fi
  printf "${COLOR_DEFAULT} - \"${containerName}\"\n"
  idx=$((idx + 1))
done

# 8. Check the "readonlyRootFilesystem" flag added in the task definition (red)
readonlyRootFsList=$(jq -r ".taskDefinition.containerDefinitions[].readonlyRootFilesystem" <<<"${taskDefJson}")
printf "${COLOR_DEFAULT} ----------\n"
printf "${COLOR_DEFAULT} Read-Only Root Filesystem (${taskDefFamily}:${taskDefRevision})\n"
printf "${COLOR_DEFAULT} ----------\n"
idx=0
for enabled in ${readonlyRootFsList}; do
  containerName=$(jq -r ".taskDefinition.containerDefinitions[${idx}].name" <<<"${taskDefJson}")
  printf " $((idx+1)). "
  # Only an explicit "true" is flagged; "false" and unset are both fine.
  if [[ "${enabled}" != *false* && "${enabled}" == *true* ]]; then
    printf "${COLOR_RED}ReadOnly"
  else
    printf "${COLOR_GREEN}Disabled"
  fi
  printf "${COLOR_DEFAULT} - \"${containerName}\"\n"
  idx=$((idx + 1))
done

# 9.
# Check the task role permissions
# Picks the role to inspect in this order: the role overridden on the task, the
# task definition's role, and (EC2 launch type only) the first role of the
# instance profile attached to the container instance. It then simulates the
# permissions that role has.
overriddenTaskRole=true
taskRoleArn=$(echo "${describedTaskJson}" | jq -r ".tasks[0].overrides.taskRoleArn")
if [[ "${taskRoleArn}" = "null" ]]; then
  overriddenTaskRole=false
  taskRoleArn=$(echo "${taskDefJson}" | jq -r ".taskDefinition.taskRoleArn")
fi

hasRole=true
isEC2Role=false
if [[ "${taskRoleArn}" = "null" ]]; then
  ## When the task runs on EC2 without a task role then we should check the instance profile
  if [[ "${launchType}" = "EC2" ]]; then
    ec2InstanceId=$(echo "${describedContainerInstanceJson}" | jq -r ".containerInstances[0].ec2InstanceId")
    # JSON is requested explicitly because the response is parsed with jq.
    instanceProfileArn=$(${AWS_CLI_BIN} ec2 describe-instances \
      --instance-ids "${ec2InstanceId}" \
      --output json \
      | jq -r ".Reservations[0].Instances[0].IamInstanceProfile.Arn")
    if [[ "${instanceProfileArn}" = "null" ]]; then
      hasRole=false
    else
      # The profile name is whatever follows the last "/" of the ARN. This
      # works for any ARN partition and for profiles created with a path,
      # unlike matching a fixed "arn:aws:iam::...:instance-profile/" prefix.
      instanceProfileName="${instanceProfileArn##*/}"
      taskRoleArn=$(${AWS_CLI_BIN} iam get-instance-profile \
        --instance-profile-name "${instanceProfileName}" \
        --output json \
        | jq -r ".InstanceProfile.Roles[0].Arn")
      if [[ "${taskRoleArn}" = "null" ]]; then
        hasRole=false
      else
        isEC2Role=true
      fi
    fi
  else
    ## Fargate launch type doesn't support to use EC2 instance roles
    hasRole=false
  fi
fi

if [[ ! "${hasRole}" = "true" ]]; then
  printf "${COLOR_DEFAULT} EC2 or Task Role | ${COLOR_RED}Not Configured\n"
else
  if [[ "${isEC2Role}" = "true" ]]; then
    printf "${COLOR_DEFAULT} EC2 Role Permissions | "
  else
    printf "${COLOR_DEFAULT} Task Role Permissions | "
  fi
  # Printed through "%s" so the ARN is never interpreted as format text.
  printf "%s" "${taskRoleArn}"
  if [[ "${overriddenTaskRole}" = "true" ]]; then
    printf " (Overridden)"
  fi
  printf "\n"
  ## Required Permissions
  ### SSM
  ssm="ssmmessages:"
  ssmCreateControlChannel="${ssm}CreateControlChannel"
  ssmCreateDataChannel="${ssm}CreateDataChannel"
  ssmOpenControlChannel="${ssm}OpenControlChannel"
  ssmOpenDataChannel="${ssm}OpenDataChannel"

  # All four actions are simulated in one call; each decision is then read
  # out of the combined result.
  ssmEvalResultsJson=$(${AWS_CLI_BIN} iam simulate-principal-policy \
    --policy-source-arn "${taskRoleArn}" \
    --action-names "${ssmCreateControlChannel}" "${ssmCreateDataChannel}" "${ssmOpenControlChannel}" "${ssmOpenDataChannel}" \
    --output json)
  ssmCreateControlChannelResult=$(readEvalDecision "${ssmEvalResultsJson}" "${ssmCreateControlChannel}")
  showEvalResult "${ssmCreateControlChannelResult}" "${ssmCreateControlChannel}"
  ssmCreateDataChannelResult=$(readEvalDecision "${ssmEvalResultsJson}" "${ssmCreateDataChannel}")
  showEvalResult "${ssmCreateDataChannelResult}" "${ssmCreateDataChannel}"
  ssmOpenControlChannelResult=$(readEvalDecision "${ssmEvalResultsJson}" "${ssmOpenControlChannel}")
  showEvalResult "${ssmOpenControlChannelResult}" "${ssmOpenControlChannel}"
  ssmOpenDataChannelResult=$(readEvalDecision "${ssmEvalResultsJson}" "${ssmOpenDataChannel}")
  showEvalResult "${ssmOpenDataChannelResult}" "${ssmOpenDataChannel}"

  ## Optional Permissions (Might be required if audit-logging is enabled)
  ### KMS
  if [[ ! "${kmsKeyId}" = "null" ]]; then
    printf "${COLOR_DEFAULT} -----\n"
    kmsDecrypt="kms:Decrypt"
    kmsEvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \
      --policy-source-arn "${taskRoleArn}" \
      --action-names "${kmsDecrypt}" \
      --resource-arns "${kmsKeyArn}" \
      --output json \
      | jq -r ".EvaluationResults[0].EvalDecision")
    showEvalResult "${kmsEvalResult}" "${kmsDecrypt}"
  fi
  ### S3 Bucket
  if [[ ! "${s3BucketName}" = "null" ]]; then
    printf "${COLOR_DEFAULT} -----\n"
    s3PutObject="s3:PutObject"
    bucketArn="arn:aws:s3:::${s3BucketName}"
    # Objects are simulated under the configured key prefix when there is one.
    resourceArn=""
    if [[ ! "${s3KeyPrefix}" = "null" ]]; then
      resourceArn="${bucketArn}/${s3KeyPrefix}*"
    else
      resourceArn="${bucketArn}/*"
    fi
    s3EvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \
      --policy-source-arn "${taskRoleArn}" \
      --action-names "${s3PutObject}" \
      --resource-arns "${resourceArn}" \
      --output json \
      | jq -r ".EvaluationResults[0].EvalDecision")
    showEvalResult "${s3EvalResult}" "${s3PutObject}"
    if [[ "${s3Encryption}" = "true" ]]; then
      s3GetEncryptionConfiguration="s3:GetEncryptionConfiguration"
      s3EvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \
        --policy-source-arn "${taskRoleArn}" \
        --action-names "${s3GetEncryptionConfiguration}" \
        --resource-arns "${bucketArn}" \
        --output json \
        | jq -r ".EvaluationResults[0].EvalDecision")
      showEvalResult "${s3EvalResult}" "${s3GetEncryptionConfiguration}"
    fi
  fi
  ### CloudWatch Logs
  if [[ ! "${cloudWatchLogGroupName}" = "null" ]]; then
    printf "${COLOR_DEFAULT} -----\n"
    # For Resource "*"
    logsDescribeLogGroup="logs:DescribeLogGroups"
    logsDescribeLogGroupEvalResult=$(${AWS_CLI_BIN} iam simulate-principal-policy \
      --policy-source-arn "${taskRoleArn}" \
      --action-names "${logsDescribeLogGroup}" \
      --output json \
      | jq -r ".EvaluationResults[0].EvalDecision")
    showEvalResult "${logsDescribeLogGroupEvalResult}" "${logsDescribeLogGroup}"
    # For Resource "${cloudWatchLogGroupName}"
    cwlogGroupArn="arn:aws:logs:${AWS_REGION}:${ACCOUNT_ID}:log-group:${cloudWatchLogGroupName}:*"
    logsCreateLogStream="logs:CreateLogStream"
    logsDescribeLogStreams="logs:DescribeLogStreams"
    logsPutLogEvents="logs:PutLogEvents"
    logsEvalResultsJson=$(${AWS_CLI_BIN} iam simulate-principal-policy \
      --policy-source-arn "${taskRoleArn}" \
      --action-names "${logsCreateLogStream}" "${logsDescribeLogStreams}" "${logsPutLogEvents}" \
      --resource-arns "${cwlogGroupArn}" \
      --output json)
    logsCreateLogStreamResult=$(readEvalDecision "${logsEvalResultsJson}" "${logsCreateLogStream}")
    showEvalResult "${logsCreateLogStreamResult}" "${logsCreateLogStream}"
    logsDescribeLogStreamsResult=$(readEvalDecision "${logsEvalResultsJson}" "${logsDescribeLogStreams}")
    showEvalResult "${logsDescribeLogStreamsResult}" "${logsDescribeLogStreams}"
    logsPutLogEventsResult=$(readEvalDecision "${logsEvalResultsJson}" "${logsPutLogEvents}")
    showEvalResult "${logsPutLogEventsResult}" "${logsPutLogEvents}"
  fi
fi

# 10. Check existing VPC Endpoints (PrivateLinks) in the task VPC.
# If there is any VPC Endpoints configured for the task VPC, we assume you would need an additional SSM PrivateLink to be configured.
# (yellow)
# TODO: In the ideal world, the script should simply check if the task can reach to the internet or not :)
# Finds the VPC and subnet the task runs in, then lists the VPC's endpoints and
# reports whether one for the "ssmmessages" service is among them.
requiredEndpoint="com.amazonaws.${AWS_REGION}.ssmmessages"
taskNetworkingAttachment=$(echo "${describedTaskJson}" | jq -r ".tasks[0].attachments[0]")
if [[ "${taskNetworkingAttachment}" = "null" ]]; then
  ## bridge/host networking (only for EC2)
  taskVpcId=$(echo "${describedContainerInstanceJson}" \
    | jq -r '.containerInstances[0].attributes[] | select(.name=="ecs.vpc-id") | .value')
  taskSubnetId=$(echo "${describedContainerInstanceJson}" \
    | jq -r '.containerInstances[0].attributes[] | select(.name=="ecs.subnet-id") | .value')
  # JSON is requested explicitly because the response is parsed with jq.
  subnetJson=$(${AWS_CLI_BIN} ec2 describe-subnets --subnet-ids "${taskSubnetId}" --output json)
else
  ## awsvpc networking (for both EC2 and Fargate)
  taskSubnetId=$(echo "${describedTaskJson}" \
    | jq -r '.tasks[0].attachments[0].details[] | select(.name=="subnetId") | .value')
  subnetJson=$(${AWS_CLI_BIN} ec2 describe-subnets --subnet-ids "${taskSubnetId}" --output json)
  taskVpcId=$(echo "${subnetJson}" | jq -r ".Subnets[0].VpcId")
fi
## Obtain the ownerID of subnet's owner to check if the subnet is shared via AWS RAM (which check-ecs-exec.sh doesn't support today)
subnetOwnerId=$(echo "${subnetJson}" | jq -r ".Subnets[0].OwnerId")
printf "${COLOR_DEFAULT} VPC Endpoints | "
if [[ ! "${ACCOUNT_ID}" = "${subnetOwnerId}" ]]; then
  ## Shared Subnets (VPC) are not supported in Amazon ECS Exec Checker
  printf "${COLOR_RED}CHECK FAILED${COLOR_YELLOW}\n"
  printf " Amazon ECS Exec Checker doesn't support VPC endpoint validation for AWS RAM shared VPC/subnets.\n"
  printf " Check or contact your administrator to find if additional VPC endpoints are required by the following resources.\n"
  printf " - Resources: %s and %s\n" "${taskVpcId}" "${taskSubnetId}"
  printf " - VPC Endpoint: %s${COLOR_DEFAULT}\n" "${requiredEndpoint}"
else
  ## List Vpc Endpoints
  vpcEndpointsJson=$(${AWS_CLI_BIN} ec2 describe-vpc-endpoints \
    --filters Name=vpc-id,Values="${taskVpcId}" \
    --output json)
  # One service name per endpoint; the result is empty exactly when the VPC
  # has no endpoints at all.
  vpcEndpoints=$(echo "${vpcEndpointsJson}" | tr -d '\n' | jq -r ".VpcEndpoints[].ServiceName")
  if [[ "${vpcEndpoints}" = "" ]]; then
    printf "${COLOR_GREEN}SKIPPED ${COLOR_DEFAULT}(%s - No additional VPC endpoints required)\n" "${taskVpcId}"
  else
    # Check whether an ssmmessages VPC endpoint exists
    printf "\n"
    ssmsessionVpcEndpointExists=false
    printf " Found existing endpoints for %s:\n" "${taskVpcId}"
    # A single pass both lists the endpoints and records whether the required
    # one was seen.
    for vpe in ${vpcEndpoints}; do
      if [[ "${vpe}" = "${requiredEndpoint}" ]]; then
        ssmsessionVpcEndpointExists=true
        printf " - ${COLOR_GREEN}%s${COLOR_DEFAULT}\n" "${vpe}"
      else
        printf " - ${COLOR_DEFAULT}%s\n" "${vpe}"
      fi
    done
    if [[ "${ssmsessionVpcEndpointExists}" = "false" ]]; then
      # The trailing newline keeps the next section's heading from being glued
      # onto the end of this message.
      printf " SSM PrivateLink \"${COLOR_YELLOW}%s${COLOR_DEFAULT}\" not found. You must ensure your task has proper outbound internet connectivity.\n" "${requiredEndpoint}"
    fi
  fi
fi

# 11.
# Check task definition containers for environment variables AWS_ACCESS_KEY, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY
# if AWS_ACCESS_KEY, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY are defined in a container, they will be used by the SSM service
# if the key defined does not have requirement permissions, the execute-command will not work.
containerNameList=$(echo "${taskDefJson}" | jq -r ".taskDefinition.containerDefinitions[].name")
idx=0
printf "${COLOR_DEFAULT} Environment Variables | (%s:%s)\n" "${taskDefFamily}" "${taskDefRevision}"
for containerName in ${containerNameList}; do
  printf " ${COLOR_DEFAULT}$((idx+1)). container \"%s\"\n" "${containerName}"
  # The same lookup is run for each credential variable name instead of
  # repeating the stanza once per name.
  for credentialVarName in AWS_ACCESS_KEY AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
    printf " ${COLOR_DEFAULT}- %s" "${credentialVarName}"
    # `environment[]?` yields nothing (rather than a jq error) when the
    # container definition has no "environment" array; `--arg` passes the
    # name into the jq program as data.
    credentialVarFound=$(echo "${taskDefJson}" \
      | jq -r --arg name "${credentialVarName}" \
        ".taskDefinition.containerDefinitions[${idx}].environment[]? | select(.name == \$name) | .name")
    # The select() above is an exact-name match, so any output means the
    # variable is defined on this container.
    if [[ -n "${credentialVarFound}" ]]; then
      printf ": ${COLOR_YELLOW}defined${COLOR_DEFAULT}\n"
    else
      printf ": ${COLOR_GREEN}not defined${COLOR_DEFAULT}\n"
    fi
  done
  idx=$((idx + 1))
done

printf "\n"