diff --git a/cfn-template/deeplearning.template b/cfn-template/deeplearning.template index 8c3fdff..95a2549 100644 --- a/cfn-template/deeplearning.template +++ b/cfn-template/deeplearning.template @@ -2,6 +2,65 @@ "AWSTemplateFormatVersion" : "2010-09-09", "Description" : "Launches a Deep Learning Cluster with one Master and variable number of Workers.", "Parameters" : { + "S3Bucket" : { + "Description" : "S3 bucket name that contains training code, data and scripts, e.g. my-s3-bucket ", + "Type" : "String" + }, + "TarData" : { + "Description" : "Data tar file prefix in S3Bucket copied to EFS, or copied and extracted on worker EBS file system, e.g. data.tar", + "Type" : "String", + "AllowedPattern": ".+\\.tar(\\.gz)?" + }, + "TarSource" : { + "Description" : "Source tar file prefix in S3Bucket copied and extracted on worker EBS file system, e.g. src.tar" , + "Type" : "String", + "AllowedPattern": ".+\\.tar(\\.gz)?" + }, + "RunScript" : { + "Default": "run.sh", + "Description" : "Bash shell run script prefix in S3Bucket for starting training on master, e.g. run.sh", + "Type" : "String" + }, + "SetupScript" : { + "Default": "setup.sh", + "Description" : "Bash shell setup script prefix in S3Bucket for setting up training environment on each worker, e.g. setup.sh", + "Type" : "String" + }, + "MyVpcCIDR" : { + "Default": "192.168.0.0/26", + "Description" : "My VPC CIDR", + "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", + "Type" : "String" + }, + "MyAvailabilityZone" : { + "Description" : "My availability zone, e.g. 
us-east-1d", + "AllowedPattern": "[a-z]+-[a-z]+-[1-9]{1}[a-z]{1}", + "Type" : "String" + }, + "PublicSubnetCIDR" : { + "Default": "192.168.0.0/27", + "Description" : "Public Subnet CIDR in MyVpcCIDR", + "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", + "Type" : "String" + }, + "PrivateSubnetCIDR" : { + "Default": "192.168.0.32/27", + "Description" : "Private Subnet CIDR in MyVpcCIDR", + "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", + "Type" : "String" + }, + "EbsDeviceName" : { + "Default": "/dev/sda1", + "Description" : "Ebs device name", + "Type" : "String", + "AllowedValues": [ "/dev/sda1" ] + }, + "EbsVolumeSize" : { + "Default": "150", + "Description" : "Ebs volume size", + "Type" : "String", + "AllowedValues": [ "100", "150", "200", "250", "300", "350", "400", "450", "500"] + }, "KeyName" : { "Description" : "Name of an existing Amazon EC2 KeyPair to enable SSH access to the instances", "Type" : "AWS::EC2::KeyPair::KeyName" @@ -13,9 +72,9 @@ "Default" : "1" }, "InstanceType" : { - "Description" : "The EC2 instance type for workers.For GPUs choose g3.xx, p2.xx or p3.xx.", + "Description" : "The EC2 instance type for workers. For latest GPUs choose p3.xx", "Type" : "String", - "Default" : "p3.2xlarge", + "Default" : "p3.16xlarge", "AllowedValues" : [ "p2.16xlarge", "p2.8xlarge", @@ -23,9 +82,9 @@ "p3.2xlarge", "p3.8xlarge", "p3.16xlarge", - "g3.16xlarge", - "g3.8xlarge", - "g3.4xlarge", + "p3dn.24xlarge", + "g2.8xlarge", + "g2.2xlarge", "t2.small", "t2.medium", "t2.large", @@ -84,6 +143,11 @@ "AllowedValues" : [ "AmazonLinux", "Ubuntu" ], "ConstraintDescription" : "Amazon Supported Image Type" }, + "AMIOverride" : { + "Description" : "Advanced option to override Deep Learning AMI of specified ImageType available in region", + "Type" : "String", + "AllowedPattern": "(ami-[0-9a-z]{17})?" 
+ }, "SSHLocation": { "Description": "Restrict SSH access to a valid CIDR range, this should be a valid CIDR IP address range that you want to allow access to your Master and Stack.", "Type": "String", @@ -103,44 +167,57 @@ "Description" : "The Linux mount point for the EFS volume", "Type": "String", "MinLength": "1", - "Default": "myEFSvolume" + "Default": "efs" + }, + "EFSServesData" : { + "Description" : "Use EFS for serving data to workers", + "Type": "String", + "Default": "false", + "AllowedValues": ["true", "false"] + }, + "EBSOptimized" : { + "Description": "Is the instance EBS optimized? Not all instance types support EBS optimized option.", + "Type": "String", + "Default": "true", + "AllowedValues": [ "false", "true"] + }, + "ActivateCondaEnv" : { + "Description" : "Activate conda environment", + "Type": "String", + "Default": "tensorflow_p36", + "AllowedValues": ["base", "caffe2_p27", "caffe_p27", "caffe_p35", "chainer_p27", "chainer_p36", "cntk_p27", "cntk_p36", "mxnet_p27", "mxnet_p36", "python2", "python3", "pytorch_p27", "pytorch_p36", "tensorflow_p36", "tensorflow_p27", "theano_p27", "theano_p36" ] } }, "Conditions" : { - "CreateNewFileSystem" : { "Fn::Equals" : [{"Ref": "EFSFileSystemId"}, ""] } + "CreateNewFileSystem" : { "Fn::Equals" : [{"Ref": "EFSFileSystemId"}, ""] }, + "CopyDataToEFS" : { "Fn::Equals" : [{"Ref": "EFSServesData"}, "true"] }, + "OverrideAMI" : { "Fn::Not" : [{ "Fn::Equals" : [{"Ref": "AMIOverride"}, ""]} ]} }, "Mappings" : { - "AmazonLinux" : { - "us-east-1" : { "AMI" : "ami-0f5788229b53809c9" }, - "us-west-2" : { "AMI" : "ami-0c0c1a8d6a4695fdc" }, - "eu-west-1" : { "AMI" : "ami-088b2e2cc2498f3ca" }, - "us-east-2" : { "AMI" : "ami-001f9c1ca57fbc7a2" }, - "ap-southeast-2" : { "AMI" : "ami-02c907307d02dc462" }, - "ap-northeast-1" : { "AMI" : "ami-08a7740ff4d3fd90f" }, - "ap-northeast-2" : { "AMI" : "ami-07b22a7626892dd48" }, - "ap-south-1" : { "AMI" : "ami-074811debc0b11bdf" }, - "eu-central-1" : { "AMI" : "ami-055ab192b68ca4d2f" 
}, - "ap-southeast-1" : { "AMI" : "ami-044c38d8c0100ea15" }, - "us-west-1" : { "AMI" : "ami-0351f8fc8044b3dea" } + "Ubuntu" : { + "us-east-1" : { "AMI" : "ami-09a706a24845d0723" }, + "us-east-2" : { "AMI" : "ami-003ce277a8a9c0014" }, + "us-west-2" : { "AMI" : "ami-0b294f219d14e6a82" }, + "eu-west-1" : { "AMI" : "ami-086062166ec8340ac" }, + "eu-central-1" : { "AMI" : "ami-0f57552c8fc9e228f" }, + "ap-southeast-1" : { "AMI" : "ami-077b987c8b7a6462e" }, + "ap-southeast-2" : { "AMI" : "ami-0512a7cd86ea45901" }, + "ap-south-1" : { "AMI" : "ami-01e5f909b3c234383" }, + "ap-northeast-1" : { "AMI" : "ami-07a65197e224510c7" }, + "ap-northeast-2" : { "AMI" : "ami-098cb0cca04bdac5a" } }, - "Ubuntu" : { - "us-east-1" : { "AMI" : "ami-09a706a24845d0723" }, - "us-west-2" : { "AMI" : "ami-0b294f219d14e6a82" }, - "eu-west-1" : { "AMI" : "ami-086062166ec8340ac" }, - "us-east-2" : { "AMI" : "ami-003ce277a8a9c0014" }, - "ap-southeast-2" : { "AMI" : "ami-0512a7cd86ea45901" }, - "ap-northeast-1" : { "AMI" : "ami-07a65197e224510c7" }, - "ap-northeast-2" : { "AMI" : "ami-098cb0cca04bdac5a" }, - "ap-south-1" : { "AMI" : "ami-01e5f909b3c234383" }, - "eu-central-1" : { "AMI" : "ami-0f57552c8fc9e228f" }, - "ap-southeast-1" : { "AMI" : "ami-077b987c8b7a6462e" }, - "us-west-1" : { "AMI" : "ami-0f4a47e4242cb9816" } - }, - "SubnetConfig" : { - "VPC" : { "CIDR" : "10.0.0.0/16" }, - "Public" : { "CIDR" : "10.0.0.0/24" }, - "Private" : { "CIDR" : "10.0.1.0/24" } + "AmazonLinux" : { + "us-east-1" : { "AMI" : "ami-0f5788229b53809c9" }, + "us-east-2" : { "AMI" : "ami-001f9c1ca57fbc7a2" }, + "us-west-2" : { "AMI" : "ami-0c0c1a8d6a4695fdc" }, + "eu-west-1" : { "AMI" : "ami-088b2e2cc2498f3ca" }, + "eu-central-1" : { "AMI" : "ami-055ab192b68ca4d2f" }, + "ap-southeast-1" : { "AMI" : "ami-044c38d8c0100ea15" }, + "ap-southeast-2" : { "AMI" : "ami-02c907307d02dc462" }, + "ap-south-1" : { "AMI" : "ami-074811debc0b11bdf" }, + "ap-northeast-1" : { "AMI" : "ami-08a7740ff4d3fd90f" }, + "ap-northeast-2" : { "AMI" : 
"ami-07b22a7626892dd48" } }, "S3" : { "us-east-1" : { "URL" : "https://s3.amazonaws.com/" }, @@ -159,12 +236,18 @@ "S3SourceBucket" : { "BucketNameSuffix" : "-aws-dl-cfn" }, "Setup" : { "Filename" : "dl_cfn_setup_v2.py" }, "LambdaFunction" : { "FileName": "dl_cfn_setup_lambda.zip" }, - "TimeoutValues" : { "WaitConditionTimeout" : "3300", "MasterLaunchTimeout" : "600"}, + "TimeoutValues" : { "WaitConditionTimeout" : "3600", "MasterLaunchTimeout" : "1200"}, "DefaultUser" : {"AmazonLinux": "ec2-user", "Ubuntu": "ubuntu"}, - "CfnPath" : {"AmazonLinux": "/opt/aws/bin", "Ubuntu": "/usr/local/bin"} + "CfnPath" : {"Ubuntu": "/usr/local/bin"} } }, "Resources" : { + "ResourcePlacementGroup": { + "Type" : "AWS::EC2::PlacementGroup", + "Properties" : { + "Strategy" : "cluster" + } + }, "ResourceMetadataLambdaFunction": { "Type": "AWS::Lambda::Function", "DependsOn" : ["MasterQueue"], @@ -256,7 +339,33 @@ } ] }, "Path" : "/", - "Policies" : [ { + "Policies" : [ + { + "PolicyName": "s3-read-write", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:Get*", + "s3:List*" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:Put*" + ], + "Resource": [ + { "Fn::Join": [ "", [ "arn:aws:s3:::", { "Ref": "S3Bucket" } ] ] }, + { "Fn::Join": [ "", [ "arn:aws:s3:::", { "Ref": "S3Bucket" }, "/*" ] ] } + ] + } + ] + } + }, + { "PolicyName" : "instance", "PolicyDocument" : { "Statement" : [ { @@ -463,18 +572,27 @@ "WorkerLaunchConfig" : { "Type" : "AWS::AutoScaling::LaunchConfiguration", "Properties" : { - "ImageId" : { - "Fn::FindInMap" : [ {"Ref" : "ImageType" }, { "Ref" : "AWS::Region" }, "AMI" ] - }, + "ImageId" : { "Fn::If": [ "OverrideAMI", {"Ref": "AMIOverride"}, + {"Fn::FindInMap" : [ {"Ref" : "ImageType" }, { "Ref" : "AWS::Region" }, "AMI" ]} + ] }, "InstanceType" : { "Ref" : "InstanceType" }, + "EbsOptimized": { + "Ref": "EBSOptimized" + }, "IamInstanceProfile" : { "Ref" : "InstanceProfile" }, "SecurityGroups" : [ {"Ref" : 
"WorkerSecurityGroup"} ], + "BlockDeviceMappings" : [ + { + "DeviceName" : {"Ref": "EbsDeviceName" }, + "Ebs" : { "VolumeSize" : {"Ref": "EbsVolumeSize"}, "VolumeType" : "gp2" } + } + ], "UserData" : { "Fn::Base64" : { "Fn::Join" : [ "", @@ -489,6 +607,18 @@ "mkdir -p /opt/deeplearning", "\n", + "sudo ln -s /home/", + { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, + "/anaconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh", + "\n", + + "echo 'conda activate ", + {"Ref": "ActivateCondaEnv"}, + "' >> /home/", + { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, + "/.bash_login", + "\n", + "# run cfn-init. \n", { "Fn::FindInMap" : [ "Other", "CfnPath", {"Ref" : "ImageType" } ]}, "\\/cfn-init -v --region ", { "Ref" : "AWS::Region" }, @@ -520,8 +650,27 @@ "02_mount" : { "command" : {"Fn::Join" : [ "", [ "sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ", {"Fn::If" : [ "CreateNewFileSystem", {"Ref" : "FileSystem"}, {"Ref" : "EFSFileSystemId"} ]}, ".efs.", { "Ref" : "AWS::Region" }, ".amazonaws.com:/ /", {"Ref": "EFSMountPoint"} ]]} }, - "03_permissions" : { - "command" : {"Fn::Join" : [ "", [ "chown ",{ "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, ":", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " /", { "Ref" : "EFSMountPoint" }]]} + "03_data" : { + "command" : {"Fn::Join" : [ "", [ "touch /", {"Ref": "EFSMountPoint"}, "/", { "Fn::If" : [ "CopyDataToEFS", "data.txt", "nodata.txt" ] } ]]} + }, + "04_data_ebs" : { + "test": {"Fn::Join" : [ "", [ "test ! 
-e /", {"Ref": "EFSMountPoint"}, "/data.txt" ]]}, + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarData" } , " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "05_src_ebs" : { + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarSource" } , " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "06_ebs_tar" : { + "command" : {"Fn::Join" : [ "", [ "for file in /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, "/*.tar ; do tar -xf $file --directory ", " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " ; done"]]} + }, + "07_setup_script" : { + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "SetupScript" }, " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "08_permissions" : { + "command" : {"Fn::Join" : [ "", [ "chown ",{ "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, ":", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " -R /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "09_permissions" : { + "command" : {"Fn::Join" : [ "", [ "chmod u+x /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, "/*.sh" ]]} } } }, @@ -559,12 +708,15 @@ "Type" : "AWS::AutoScaling::LaunchConfiguration", "Properties" : { "AssociatePublicIpAddress" : "true", - "ImageId" : { - "Fn::FindInMap" : [ {"Ref" : "ImageType" }, { "Ref" : "AWS::Region" }, "AMI" ] - }, + "ImageId" : { "Fn::If": [ "OverrideAMI", {"Ref": "AMIOverride"}, + { "Fn::FindInMap" : [ {"Ref" : "ImageType" }, { "Ref" : "AWS::Region" }, "AMI" ]} + ] }, "InstanceType" : { "Ref" : "InstanceType" }, + "EbsOptimized": { + "Ref": "EBSOptimized" + }, "IamInstanceProfile" : { "Ref" : 
"InstanceProfile" }, @@ -572,6 +724,12 @@ { "Ref" : "MasterSecurityGroup" }, { "Ref" : "AdminSSHSecurityGroup" } ], + "BlockDeviceMappings" : [ + { + "DeviceName" : {"Ref": "EbsDeviceName" }, + "Ebs" : { "VolumeSize" : {"Ref": "EbsVolumeSize"}, "VolumeType" : "gp2" } + } + ], "UserData" : { "Fn::Base64" : { "Fn::Join" : [ "", @@ -585,6 +743,18 @@ "mkdir -p /opt/deeplearning", "\n", + "sudo ln -s /home/", + { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, + "/anaconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh", + "\n", + + "echo 'conda activate ", + {"Ref": "ActivateCondaEnv"}, + "' >> /home/", + { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, + "/.bash_login", + "\n", + "# run cfn-init. \n", { "Fn::FindInMap" : [ "Other", "CfnPath", {"Ref" : "ImageType" } ]}, "\\/cfn-init -v --region ", { "Ref" : "AWS::Region" }, @@ -616,7 +786,36 @@ "02_mount" : { "command" : {"Fn::Join" : [ "", [ "sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ", { "Fn::If" : [ "CreateNewFileSystem", {"Ref" : "FileSystem"}, {"Ref" : "EFSFileSystemId"} ] }, ".efs.", { "Ref" : "AWS::Region" }, ".amazonaws.com:/ /", {"Ref": "EFSMountPoint"} ]]} }, - "03_permissions" : { + "03_data" : { + "command" : {"Fn::Join" : [ "", [ "touch /", {"Ref": "EFSMountPoint"}, "/", { "Fn::If" : [ "CopyDataToEFS", "data.txt", "nodata.txt" ] } ]]} + }, + "04_data_efs" : { + "test": {"Fn::Join" : [ "", [ "test -e /", {"Ref": "EFSMountPoint"}, "/data.txt" ]]}, + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarData" } , " /", {"Ref": "EFSMountPoint"} ]]} + }, + "05_data_ebs" : { + "test": {"Fn::Join" : [ "", [ "test ! 
-e /", {"Ref": "EFSMountPoint"}, "/data.txt" ]]}, + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarData" } , " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "06_src_ebs" : { + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarSource" } , " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "07_ebs_tar" : { + "command" : {"Fn::Join" : [ "", [ "for file in /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, "/*.tar ; do tar -xf $file --directory ", " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " ; done"]]} + }, + "08_run_script" : { + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "RunScript" }, " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "09_setup_script" : { + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "SetupScript" }, " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "10_permissions" : { + "command" : {"Fn::Join" : [ "", [ "chown ",{ "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, ":", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " -R /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "11_permissions" : { + "command" : {"Fn::Join" : [ "", [ "chmod u+x /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, "/*.sh" ]]} + }, + "12_permissions" : { "command" : {"Fn::Join" : [ "", [ "chown ",{ "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, ":", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " /", { "Ref" : "EFSMountPoint" }]]} } } @@ -664,6 +863,9 @@ "DesiredCapacity" : "1", 
"MinSize" : "1", "MaxSize" : "1", + "PlacementGroup": { + "Ref": "ResourcePlacementGroup" + }, "LaunchConfigurationName" : { "Ref" : "MasterLaunchConfig"}, "VPCZoneIdentifier" : [{ "Ref" : "PublicSubnet"}], "NotificationConfiguration" : { @@ -697,6 +899,9 @@ "Properties" : { "MinSize" : "0", "MaxSize" : { "Ref" : "WorkerCount" }, + "PlacementGroup": { + "Ref": "ResourcePlacementGroup" + }, "DesiredCapacity" : { "Ref" : "WorkerCount" }, "LaunchConfigurationName" : { "Ref" : "WorkerLaunchConfig" @@ -773,7 +978,7 @@ "Vpc" : { "Type" : "AWS::EC2::VPC", "Properties" : { - "CidrBlock" : { "Fn::FindInMap" : [ "SubnetConfig", "VPC", "CIDR" ]}, + "CidrBlock" : {"Ref": "MyVpcCIDR"}, "EnableDnsSupport" : "true", "EnableDnsHostnames" : "true", "Tags" : [ @@ -803,7 +1008,7 @@ "Properties" : { "VpcId" : {"Ref" : "Vpc"}, "AvailabilityZone" : { "Fn::GetAtt" : [ "PrivateSubnet", "AvailabilityZone" ] } , - "CidrBlock": { "Fn::FindInMap" : [ "SubnetConfig", "Public", "CIDR" ]}, + "CidrBlock": { "Ref" : "PublicSubnetCIDR"}, "Tags" : [ { "Key" : "Network", "Value" : "Public" }, { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } } @@ -814,7 +1019,8 @@ "Type" : "AWS::EC2::Subnet", "Properties" : { "VpcId" : { "Ref" : "Vpc" }, - "CidrBlock" : { "Fn::FindInMap" : [ "SubnetConfig", "Private", "CIDR" ]}, + "CidrBlock" : { "Ref" : "PrivateSubnetCIDR"}, + "AvailabilityZone": { "Ref": "MyAvailabilityZone"}, "Tags" : [ { "Key" : "Network", "Value" : "Private" }, { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" }} diff --git a/cfn-template/private-deeplearning.template b/cfn-template/private-deeplearning.template new file mode 100644 index 0000000..107c24a --- /dev/null +++ b/cfn-template/private-deeplearning.template @@ -0,0 +1,2443 @@ +{ + "AWSTemplateFormatVersion": "2010-09-09", + "Description": "Launches a Deep Learning Cluster with one Master and variable number of Workers.", + "Parameters": { + "S3Bucket": { + "Description": "S3 bucket name that contains training code, data and 
scripts, e.g. my-s3-bucket ", + "Type": "String" + }, + "TarData": { + "Description": "Data tar file prefix in S3Bucket copied to EFS, or copied and extracted on worker EBS file system, e.g. data.tar", + "Type": "String", + "AllowedPattern": ".+\\.tar(\\.gz)?" + }, + "TarSource": { + "Description": "Source tar file prefix in S3Bucket copied and extracted on worker EBS file system, e.g. src.tar", + "Type": "String", + "AllowedPattern": ".+\\.tar(\\.gz)?" + }, + "RunScript": { + "Default": "run.sh", + "Description": "Bash shell run script prefix in S3Bucket for starting training on master, e.g. run.sh", + "Type": "String" + }, + "SetupScript": { + "Default": "setup.sh", + "Description": "Bash shell setup script prefix in S3Bucket for setting up training environment on each worker, e.g. setup.sh", + "Type": "String" + }, + "MyVpcId": { + "Description": "My VPC ID", + "Type": "AWS::EC2::VPC::Id" + }, + "PrivateSubnetId": { + "Description": "My Subnet ID", + "Type": "AWS::EC2::Subnet::Id" + }, + "EbsDeviceName": { + "Default": "/dev/sda1", + "Description": "Ebs device name", + "Type": "String", + "AllowedValues": [ + "/dev/sda1" + ] + }, + "EbsVolumeSize": { + "Default": "150", + "Description": "Ebs volume size", + "Type": "String", + "AllowedValues": [ + "100", + "150", + "200", + "250", + "300", + "350", + "400", + "450", + "500" + ] + }, + "KeyName": { + "Description": "Name of an existing Amazon EC2 KeyPair to enable SSH access to the instances", + "Type": "AWS::EC2::KeyPair::KeyName" + }, + "WorkerCount": { + "Description": "The number of worker instances (launches +1 instance for the Master).", + "Type": "Number", + "MinValue": "1", + "Default": "1" + }, + "InstanceType": { + "Description": "The EC2 instance type for workers. 
For latest GPUs choose p3.xx", + "Type": "String", + "Default": "p3.16xlarge", + "AllowedValues": [ + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3.16xlarge", + "p3dn.24xlarge", + "g2.8xlarge", + "g2.2xlarge", + "t2.small", + "t2.medium", + "t2.large", + "t2.xlarge", + "t2.2xlarge", + "m4.large", + "m4.xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m3.medium", + "m3.large", + "m3.xlarge", + "m3.2xlarge", + "c4.large", + "c4.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c3.large", + "c3.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "x1.16xlarge", + "x1.32xlarge", + "r4.large", + "r4.xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.16xlarge", + "r3.large", + "r3.xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "i2.xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "d2.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "f1.2xlarge", + "f1.16xlarge" + ], + "ConstraintDescription": "Must be a valid CPU optimized or GPU EC2 instance type." + }, + "ImageType": { + "Description": "Linux Flavor(Amazon Linux or Ubuntu)", + "Type": "String", + "Default": "Ubuntu", + "AllowedValues": [ + "AmazonLinux", + "Ubuntu" + ], + "ConstraintDescription": "Amazon Supported Image Type" + }, + "AMIOverride": { + "Description": "Advanced option to override Deep Learning AMI of specified ImageType available in region", + "Type": "String", + "AllowedPattern": "(ami-[0-9a-z]{17})?" 
+ }, + "SSHLocation": { + "Description": "Restrict SSH access to a valid CIDR range, this should be a valid CIDR IP address range that you want to allow access to your Master and Stack.", + "Type": "String", + "MinLength": "9", + "MaxLength": "18", + "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", + "ConstraintDescription": "Must be a valid CIDR range of the form x.x.x.x/x" + }, + "EFSFileSystemId": { + "Description": "Existing Amazon EFS File System Id or leave it blank to create a new EFS File System.", + "Type": "String", + "AllowedPattern": "(^fs-[0-9a-f]{8,8})$|()$", + "Default": "", + "ConstraintDescription": "Should be a Valid EFS File System Id" + }, + "EFSMountPoint": { + "Description": "The Linux mount point for the EFS volume", + "Type": "String", + "MinLength": "1", + "Default": "efs" + }, + "EFSServesData": { + "Description": "Use EFS for serving data to workers", + "Type": "String", + "Default": "false", + "AllowedValues": [ + "true", + "false" + ] + }, + "EBSOptimized": { + "Description": "Is the instance EBS optimized? 
Not all instance types support EBS optimized option.", + "Type": "String", + "Default": "true", + "AllowedValues": [ + "false", + "true" + ] + }, + "ActivateCondaEnv": { + "Description": "Activate conda environment", + "Type": "String", + "Default": "tensorflow_p36", + "AllowedValues": [ + "base", + "caffe2_p27", + "caffe_p27", + "caffe_p35", + "chainer_p27", + "chainer_p36", + "cntk_p27", + "cntk_p36", + "mxnet_p27", + "mxnet_p36", + "python2", + "python3", + "pytorch_p27", + "pytorch_p36", + "tensorflow_p36", + "tensorflow_p27", + "theano_p27", + "theano_p36" + ] + } + }, + "Conditions": { + "CreateNewFileSystem": { + "Fn::Equals": [ + { + "Ref": "EFSFileSystemId" + }, + "" + ] + }, + "CopyDataToEFS": { + "Fn::Equals": [ + { + "Ref": "EFSServesData" + }, + "true" + ] + }, + "OverrideAMI": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Ref": "AMIOverride" + }, + "" + ] + } + ] + } + }, + "Mappings": { + "Ubuntu": { + "us-east-1": { + "AMI": "ami-0f9e8c4a1305ecd22" + }, + "us-east-2": { + "AMI": "ami-0c9ae74667b049f59" + }, + "us-west-2": { + "AMI": "ami-0d0ff0945ae093aea" + }, + "eu-west-1": { + "AMI": "ami-0827ddd2d8e38aa56" + }, + "eu-central-1": { + "AMI": "ami-03580946c6347e2f8" + }, + "ap-southeast-1": { + "AMI": "ami-09dfb1478dc499b95" + }, + "ap-southeast-2": { + "AMI": "ami-0a8f8e89b02a21088" + }, + "ap-south-1": { + "AMI": "ami-07ffe4e02cf5c2bd0" + }, + "ap-northeast-1": { + "AMI": "ami-031ce7af929321d3a" + }, + "ap-northeast-2": { + "AMI": "ami-09c2b38b2fbb748ce" + } + }, + "AmazonLinux": { + "us-east-1": { + "AMI": "ami-0a4b759b63b333b0e" + }, + "us-east-2": { + "AMI": "ami-0f71284ab59a38265" + }, + "us-west-2": { + "AMI": "ami-0305a0d7a68489e58" + }, + "eu-west-1": { + "AMI": "ami-0d2e5838a2908742f" + }, + "eu-central-1": { + "AMI": "ami-09b5cb82b50e3c9e9" + }, + "ap-southeast-1": { + "AMI": "ami-0abbc7f71da968649" + }, + "ap-southeast-2": { + "AMI": "ami-0b6d01aebbf6a1490" + }, + "ap-south-1": { + "AMI": "ami-0e17a6861b2574143" + }, + "ap-northeast-1": 
{ + "AMI": "ami-0165fe49c30cad525" + }, + "ap-northeast-2": { + "AMI": "ami-0b54ee3b4c6e0b975" + } + }, + "S3": { + "us-east-1": { + "URL": "https://s3.amazonaws.com/" + }, + "us-west-2": { + "URL": "https://s3-us-west-2.amazonaws.com/" + }, + "eu-west-1": { + "URL": "https://s3-eu-west-1.amazonaws.com/" + }, + "us-east-2": { + "URL": "https://s3-us-east-2.amazonaws.com/" + }, + "ap-southeast-2": { + "URL": "https://s3-ap-southeast-2.amazonaws.com/" + }, + "ap-northeast-1": { + "URL": "https://s3-ap-northeast-1.amazonaws.com/" + }, + "ap-northeast-2": { + "URL": "https://s3-ap-northeast-2.amazonaws.com/" + }, + "ap-south-1": { + "URL": "https://s3-ap-south-1.amazonaws.com/" + }, + "eu-central-1": { + "URL": "https://s3-eu-central-1.amazonaws.com/" + }, + "ap-southeast-1": { + "URL": "https://s3-ap-southeast-1.amazonaws.com/" + } + }, + "Other": { + "S3SourceBucket": { + "BucketNameSuffix": "-aws-dl-cfn" + }, + "Setup": { + "Filename": "dl_cfn_setup_v2.py" + }, + "LambdaFunction": { + "FileName": "dl_cfn_setup_lambda.zip" + }, + "TimeoutValues": { + "WaitConditionTimeout": "3600", + "MasterLaunchTimeout": "1200" + }, + "DefaultUser": { + "AmazonLinux": "ec2-user", + "Ubuntu": "ubuntu" + }, + "CfnPath": { + "Ubuntu": "/usr/local/bin" + } + } + }, + "Resources": { + "ResourcePlacementGroup": { + "Type": "AWS::EC2::PlacementGroup", + "Properties": { + "Strategy": "cluster" + } + }, + "ResourceMetadataLambdaFunction": { + "Type": "AWS::Lambda::Function", + "DependsOn": [ + "MasterQueue" + ], + "Properties": { + "Handler": "lambda_function.lambda_handler", + "Role": { + "Fn::GetAtt": [ + "LambdaExecutionRole", + "Arn" + ] + }, + "Code": { + "S3Bucket": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::Region" + }, + { + "Fn::FindInMap": [ + "Other", + "S3SourceBucket", + "BucketNameSuffix" + ] + } + ] + ] + }, + "S3Key": { + "Fn::FindInMap": [ + "Other", + "LambdaFunction", + "FileName" + ] + } + }, + "MemorySize": "256", + "Timeout": "60", + "Runtime": "python2.7", + 
"Environment": { + "Variables": { + "AWS_DL_STACK_ID": { + "Ref": "AWS::StackName" + }, + "AWS_DL_MASTER_SQS_URL": { + "Ref": "MasterQueue" + } + } + } + } + }, + "LambdaExecutionRole": { + "Type": "AWS::IAM::Role", + "DependsOn": [ + "MasterQueue" + ], + "Properties": { + "ManagedPolicyArns": [ + "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + ], + "AssumeRolePolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "lambda.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + } + ] + }, + "Path": "/", + "Policies": [ + { + "PolicyName": "AWSDeepLearningLambdaExecutionRole", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:SetDesiredCapacity", + "autoscaling:SuspendProcesses", + "cloudformation:DescribeStackResource", + "cloudformation:SignalResource" + ], + "Resource": "*" + } + ] + } + }, + { + "PolicyName": "AllowLambdaSQSSend", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "sqs:sendmessage" + ], + "Resource": { + "Fn::GetAtt": [ + "MasterQueue", + "Arn" + ] + } + } + ] + } + } + ] + } + }, + "PermissionForSNSToInvokeLambda": { + "Type": "AWS::Lambda::Permission", + "Properties": { + "FunctionName": { + "Fn::GetAtt": [ + "ResourceMetadataLambdaFunction", + "Arn" + ] + }, + "Action": "lambda:InvokeFunction", + "Principal": "sns.amazonaws.com", + "SourceArn": { + "Ref": "ResourceMetadataSNSTopic" + } + } + }, + "InstanceRole": { + "Type": "AWS::IAM::Role", + "Properties": { + "RoleName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-InstanceRole" + ] + ] + }, + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + } + ] + }, + "Path": "/", + "Policies": [ + { + 
"PolicyName": "s3-read-write", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:Get*", + "s3:List*" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:Put*" + ], + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:aws:s3:::", + { + "Ref": "S3Bucket" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:aws:s3:::", + { + "Ref": "S3Bucket" + }, + "/*" + ] + ] + } + ] + } + ] + } + }, + { + "PolicyName": "instance", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "ec2:DescribeInstances", + "cloudformation:DescribeStackResource" + ], + "Resource": "*" + } + ] + } + }, + { + "PolicyName": "allow-sqs-receive-send-delete-master", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "sqs:DeleteMessage", + "sqs:ReceiveMessage", + "sqs:SendMessage", + "sqs:GetQueueUrl" + ], + "Resource": { + "Fn::GetAtt": [ + "MasterQueue", + "Arn" + ] + } + } + ] + } + }, + { + "PolicyName": "allow-sqs-receive-send-delete-worker", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "sqs:DeleteMessage", + "sqs:ReceiveMessage", + "sqs:SendMessage", + "sqs:GetQueueUrl" + ], + "Resource": { + "Fn::GetAtt": [ + "WorkerQueue", + "Arn" + ] + } + } + ] + } + }, + { + "PolicyName": "allow-to-send-signal-to-WaitConditionHandle", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:*" + ], + "Resource": { + "Fn::Join": [ + "", + [ + "arn:aws:s3:::", + "cloudformation-waitcondition-", + { + "Ref": "AWS::Region" + }, + "/*" + ] + ] + } + } + ] + } + } + ] + } + }, + "InstanceProfile": { + "Type": "AWS::IAM::InstanceProfile", + "DependsOn": "InstanceRole", + "Properties": { + "Path": "/", + "Roles": [ + { + "Ref": "InstanceRole" + } + ] + } + }, + "AdminSSHSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + 
"GroupDescription": "Security group that controls SSH access to the Master instance.", + "VpcId": { + "Ref": "MyVpcId" + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "_SSH" + ] + ] + } + } + ], + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "22", + "ToPort": "22", + "CidrIp": { + "Ref": "SSHLocation" + } + } + ], + "SecurityGroupEgress": [] + } + }, + "MasterSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "GroupDescription": "Enable Port access to and from the Master on the Private Interface.", + "VpcId": { + "Ref": "MyVpcId" + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "_Master" + ] + ] + } + } + ], + "SecurityGroupIngress": [], + "SecurityGroupEgress": [] + } + }, + "MasterSecurityIngress1": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "MasterSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "tcp", + "FromPort": "0", + "ToPort": "65535", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + } + } + }, + "MasterSecurityIngress2": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "MasterSecurityGroup", + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "icmp", + "FromPort": "-1", + "ToPort": "-1", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + } + } + }, + "MasterSecurityIngress3": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "MasterSecurityGroup", + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "tcp", + "FromPort": "0", + "ToPort": "65535", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + 
"WorkerSecurityGroup", + "GroupId" + ] + } + } + }, + "MasterSecurityIngress4": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "MasterSecurityGroup", + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "icmp", + "FromPort": "-1", + "ToPort": "-1", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + } + } + }, + "WorkerSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "DependsOn": [ + "MasterSecurityGroup" + ], + "Properties": { + "GroupDescription": "Enable Port access to and from the Worker on the Private Interface", + "VpcId": { + "Ref": "MyVpcId" + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "_Worker" + ] + ] + } + } + ], + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "0", + "ToPort": "65535", + "SourceSecurityGroupId": { + "Ref": "MasterSecurityGroup" + } + }, + { + "IpProtocol": "icmp", + "FromPort": "-1", + "ToPort": "-1", + "SourceSecurityGroupId": { + "Ref": "MasterSecurityGroup" + } + } + ], + "SecurityGroupEgress": [] + } + }, + "WorkerSecurityIngress3": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "tcp", + "FromPort": "0", + "ToPort": "65535", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + } + } + }, + "WorkerSecurityIngress4": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "icmp", + "FromPort": "-1", + "ToPort": "-1", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + } + } + }, + "MountTargetSecurityGroup": { + "Type": 
"AWS::EC2::SecurityGroup", + "DependsOn": [ + "MasterSecurityGroup", + "WorkerSecurityGroup" + ], + "Properties": { + "GroupDescription": "Security group for mount target", + "VpcId": { + "Ref": "MyVpcId" + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "_Master" + ] + ] + } + } + ], + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "2049", + "ToPort": "2049", + "SourceSecurityGroupId": { + "Ref": "MasterSecurityGroup" + } + }, + { + "IpProtocol": "tcp", + "FromPort": "2049", + "ToPort": "2049", + "SourceSecurityGroupId": { + "Ref": "WorkerSecurityGroup" + } + } + ], + "SecurityGroupEgress": [] + } + }, + "FileSystem": { + "Type": "AWS::EFS::FileSystem", + "Condition": "CreateNewFileSystem", + "DeletionPolicy": "Retain", + "Properties": { + "PerformanceMode": "generalPurpose", + "FileSystemTags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ] + } + }, + "MountTarget": { + "Type": "AWS::EFS::MountTarget", + "Properties": { + "FileSystemId": { + "Fn::If": [ + "CreateNewFileSystem", + { + "Ref": "FileSystem" + }, + { + "Ref": "EFSFileSystemId" + } + ] + }, + "SubnetId": { + "Ref": "PrivateSubnetId" + }, + "SecurityGroups": [ + { + "Ref": "MountTargetSecurityGroup" + } + ] + } + }, + "WorkerLaunchConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "ImageId": { + "Fn::If": [ + "OverrideAMI", + { + "Ref": "AMIOverride" + }, + { + "Fn::FindInMap": [ + { + "Ref": "ImageType" + }, + { + "Ref": "AWS::Region" + }, + "AMI" + ] + } + ] + }, + "InstanceType": { + "Ref": "InstanceType" + }, + "EbsOptimized": { + "Ref": "EBSOptimized" + }, + "IamInstanceProfile": { + "Ref": "InstanceProfile" + }, + "SecurityGroups": [ + { + "Ref": "WorkerSecurityGroup" + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": { + "Ref": "EbsDeviceName" + }, + "Ebs": { + "VolumeSize": { + "Ref": "EbsVolumeSize" + }, + "VolumeType": "gp2" + } + } + ], + 
"UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#!/bin/bash -xe", + "\n", + "# setup ssh-forwarding. ", + "sed -i \"s/^#\\(\\s\\+\\)ForwardAgent\\(\\s\\+\\)no/\\ \\1ForwardAgent\\2yes/g\" /etc/ssh/ssh_config", + "\n", + "mkdir -p /opt/deeplearning", + "\n", + "sudo ln -s /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/anaconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh", + "\n", + "echo 'conda activate ", + { + "Ref": "ActivateCondaEnv" + }, + "' >> /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/.bash_login", + "\n", + "# run cfn-init. \n", + { + "Fn::FindInMap": [ + "Other", + "CfnPath", + { + "Ref": "ImageType" + } + ] + }, + "\\/cfn-init -v --region ", + { + "Ref": "AWS::Region" + }, + " --configsets Setup ", + " -s ", + { + "Ref": "AWS::StackId" + }, + " -r WorkerLaunchConfig ", + "\n", + "" + ] + ] + } + }, + "KeyName": { + "Ref": "KeyName" + } + }, + "Metadata": { + "AWS::CloudFormation::Init": { + "configSets": { + "Setup": [ + "efs-config", + "download-setup", + "deeplearning-config" + ] + }, + "efs-config": { + "commands": { + "00_install_nfs": { + "command": { + "Fn::Join": [ + "", + [ + "if [ \"AmazonLinux\" = \"", + { + "Ref": "ImageType" + }, + "\" ];", + "then yum -y -q install nfs-utils; else apt-get -qq -y install nfs-common ; fi" + ] + ] + } + }, + "01_createdir": { + "command": { + "Fn::Join": [ + "", + [ + "mkdir -p /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "02_mount": { + "command": { + "Fn::Join": [ + "", + [ + "sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ", + { + "Fn::If": [ + "CreateNewFileSystem", + { + "Ref": "FileSystem" + }, + { + "Ref": "EFSFileSystemId" + } + ] + }, + ".efs.", + { + "Ref": "AWS::Region" + }, + ".amazonaws.com:/ /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "03_data": { + "command": { + "Fn::Join": [ + "", + [ + "touch /", + { 
+ "Ref": "EFSMountPoint" + }, + "/", + { + "Fn::If": [ + "CopyDataToEFS", + "data.txt", + "nodata.txt" + ] + } + ] + ] + } + }, + "04_data_ebs": { + "test": { + "Fn::Join": [ + "", + [ + "test ! -e /", + { + "Ref": "EFSMountPoint" + }, + "/data.txt" + ] + ] + }, + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarData" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "05_src_ebs": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarSource" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "06_ebs_tar": { + "command": { + "Fn::Join": [ + "", + [ + "for file in /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/*.tar ; do tar -xf $file --directory ", + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " ; done" + ] + ] + } + }, + "07_setup_script": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "SetupScript" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "08_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chown ", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + ":", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " -R /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "09_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chmod u+x /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/*.sh" + ] + ] + } + } + } + }, + 
"download-setup": { + "files": { + "/opt/deeplearning/dl_cfn_setup_v2.py": { + "source": { + "Fn::Join": [ + "", + [ + { + "Fn::FindInMap": [ + "S3", + { + "Ref": "AWS::Region" + }, + "URL" + ] + }, + { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::Region" + }, + { + "Fn::FindInMap": [ + "Other", + "S3SourceBucket", + "BucketNameSuffix" + ] + } + ] + ] + }, + "/", + { + "Fn::FindInMap": [ + "Other", + "Setup", + "Filename" + ] + } + ] + ] + } + } + } + }, + "deeplearning-config": { + "commands": { + "01_setup": { + "command": "python /opt/deeplearning/dl_cfn_setup_v2.py | tee -a /var/log/cloud-init-output.log", + "cwd": "/opt/deeplearning", + "env": { + "AWS_DL_NODE_TYPE": "Worker", + "AWS_DL_MASTER_QUEUE": { + "Fn::GetAtt": [ + "MasterQueue", + "QueueName" + ] + }, + "AWS_DL_WORKER_QUEUE": { + "Fn::GetAtt": [ + "WorkerQueue", + "QueueName" + ] + }, + "AWS_DL_WAITCONDITION_TIMEOUT": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "WaitConditionTimeout" + ] + }, + "AWS_DL_MASTERLAUNCH_TIMEOUT": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "MasterLaunchTimeout" + ] + }, + "AWS_DL_STACK_ID": { + "Ref": "AWS::StackId" + }, + "AWS_DL_WAIT_HANDLE": { + "Ref": "myWaitHandle" + }, + "AWS_DL_ROLE_NAME": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-InstanceRole" + ] + ] + }, + "AWS_DL_DEFAULT_USER": { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "AWS_REGION": { + "Ref": "AWS::Region" + }, + "EFS_MOUNT": { + "Fn::Join": [ + "", + [ + "/", + { + "Ref": "EFSMountPoint" + } + ] + ] + }, + "CFN_PATH": { + "Fn::FindInMap": [ + "Other", + "CfnPath", + { + "Ref": "ImageType" + } + ] + } + } + } + } + } + } + } + }, + "MasterLaunchConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "AssociatePublicIpAddress": "false", + "ImageId": { + "Fn::If": [ + "OverrideAMI", + { + "Ref": "AMIOverride" + }, + { + "Fn::FindInMap": [ + { + "Ref": "ImageType" + }, + { + "Ref": 
"AWS::Region" + }, + "AMI" + ] + } + ] + }, + "InstanceType": { + "Ref": "InstanceType" + }, + "EbsOptimized": { + "Ref": "EBSOptimized" + }, + "IamInstanceProfile": { + "Ref": "InstanceProfile" + }, + "SecurityGroups": [ + { + "Ref": "MasterSecurityGroup" + }, + { + "Ref": "AdminSSHSecurityGroup" + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": { + "Ref": "EbsDeviceName" + }, + "Ebs": { + "VolumeSize": { + "Ref": "EbsVolumeSize" + }, + "VolumeType": "gp2" + } + } + ], + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#!/bin/bash -xe", + "\n", + "# setup ssh-forwarding. \n", + "sed -i \"s/^#\\(\\s\\+\\)ForwardAgent\\(\\s\\+\\)no/\\ \\1ForwardAgent\\2yes/g\" /etc/ssh/ssh_config", + "\n", + "mkdir -p /opt/deeplearning", + "\n", + "sudo ln -s /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/anaconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh", + "\n", + "echo 'conda activate ", + { + "Ref": "ActivateCondaEnv" + }, + "' >> /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/.bash_login", + "\n", + "# run cfn-init. 
\n", + { + "Fn::FindInMap": [ + "Other", + "CfnPath", + { + "Ref": "ImageType" + } + ] + }, + "\\/cfn-init -v --region ", + { + "Ref": "AWS::Region" + }, + " --configsets Setup ", + " -s ", + { + "Ref": "AWS::StackId" + }, + " -r MasterLaunchConfig ", + "\n", + "" + ] + ] + } + }, + "KeyName": { + "Ref": "KeyName" + } + }, + "Metadata": { + "AWS::CloudFormation::Init": { + "configSets": { + "Setup": [ + "efs-config", + "download-setup", + "deeplearning-config" + ] + }, + "efs-config": { + "commands": { + "00_install_nfs": { + "command": { + "Fn::Join": [ + "", + [ + "if [ \"AmazonLinux\" = \"", + { + "Ref": "ImageType" + }, + "\" ];", + "then yum -y -q install nfs-utils; else apt-get -qq -y install nfs-common ; fi" + ] + ] + } + }, + "01_createdir": { + "command": { + "Fn::Join": [ + "", + [ + "mkdir -p /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "02_mount": { + "command": { + "Fn::Join": [ + "", + [ + "sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ", + { + "Fn::If": [ + "CreateNewFileSystem", + { + "Ref": "FileSystem" + }, + { + "Ref": "EFSFileSystemId" + } + ] + }, + ".efs.", + { + "Ref": "AWS::Region" + }, + ".amazonaws.com:/ /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "03_data": { + "command": { + "Fn::Join": [ + "", + [ + "touch /", + { + "Ref": "EFSMountPoint" + }, + "/", + { + "Fn::If": [ + "CopyDataToEFS", + "data.txt", + "nodata.txt" + ] + } + ] + ] + } + }, + "04_data_efs": { + "test": { + "Fn::Join": [ + "", + [ + "test -e /", + { + "Ref": "EFSMountPoint" + }, + "/data.txt" + ] + ] + }, + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarData" + }, + " /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "05_data_ebs": { + "test": { + "Fn::Join": [ + "", + [ + "test ! 
-e /", + { + "Ref": "EFSMountPoint" + }, + "/data.txt" + ] + ] + }, + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarData" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "06_src_ebs": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarSource" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "07_ebs_tar": { + "command": { + "Fn::Join": [ + "", + [ + "for file in /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/*.tar ; do tar -xf $file --directory ", + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " ; done" + ] + ] + } + }, + "08_run_script": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "RunScript" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "09_setup_script": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "SetupScript" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "10_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chown ", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + ":", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " -R /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "11_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chmod u+x /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + 
{ + "Ref": "ImageType" + } + ] + }, + "/*.sh" + ] + ] + } + }, + "12_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chown ", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + ":", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + } + } + }, + "download-setup": { + "files": { + "/opt/deeplearning/dl_cfn_setup_v2.py": { + "source": { + "Fn::Join": [ + "", + [ + { + "Fn::FindInMap": [ + "S3", + { + "Ref": "AWS::Region" + }, + "URL" + ] + }, + { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::Region" + }, + { + "Fn::FindInMap": [ + "Other", + "S3SourceBucket", + "BucketNameSuffix" + ] + } + ] + ] + }, + "/", + { + "Fn::FindInMap": [ + "Other", + "Setup", + "Filename" + ] + } + ] + ] + } + } + } + }, + "deeplearning-config": { + "commands": { + "01_setup": { + "command": "python /opt/deeplearning/dl_cfn_setup_v2.py | tee -a /var/log/cloud-init-output.log", + "cwd": "/opt/deeplearning", + "env": { + "AWS_DL_NODE_TYPE": "Master", + "AWS_DL_MASTER_QUEUE": { + "Fn::GetAtt": [ + "MasterQueue", + "QueueName" + ] + }, + "AWS_DL_WORKER_QUEUE": { + "Fn::GetAtt": [ + "WorkerQueue", + "QueueName" + ] + }, + "AWS_DL_WAITCONDITION_TIMEOUT": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "WaitConditionTimeout" + ] + }, + "AWS_DL_MASTERLAUNCH_TIMEOUT": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "MasterLaunchTimeout" + ] + }, + "AWS_DL_STACK_ID": { + "Ref": "AWS::StackId" + }, + "AWS_DL_WAIT_HANDLE": { + "Ref": "myWaitHandle" + }, + "AWS_DL_ROLE_NAME": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-InstanceRole" + ] + ] + }, + "AWS_DL_DEFAULT_USER": { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "AWS_REGION": { + "Ref": "AWS::Region" + }, + "EFS_MOUNT": { + "Fn::Join": [ + "", + [ + "/", + { + "Ref": "EFSMountPoint" + } + ] + ] + }, + "CFN_PATH": { + 
"Fn::FindInMap": [ + "Other", + "CfnPath", + { + "Ref": "ImageType" + } + ] + } + } + } + } + } + } + } + }, + "MasterAutoScalingGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "DependsOn": [ + "MasterLaunchConfig", + "MountTarget", + "MasterQueue", + "WorkerQueue" + ], + "CreationPolicy": { + "ResourceSignal": { + "Timeout": { + "Fn::Join": [ + "", + [ + "PT", + { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "MasterLaunchTimeout" + ] + }, + "S" + ] + ] + }, + "Count": "1" + } + }, + "Properties": { + "DesiredCapacity": "1", + "MinSize": "1", + "MaxSize": "1", + "PlacementGroup": { + "Ref": "ResourcePlacementGroup" + }, + "LaunchConfigurationName": { + "Ref": "MasterLaunchConfig" + }, + "VPCZoneIdentifier": [ + { + "Ref": "PrivateSubnetId" + } + ], + "NotificationConfiguration": { + "TopicARN": { + "Ref": "ResourceMetadataSNSTopic" + }, + "NotificationTypes": [ + "autoscaling:EC2_INSTANCE_LAUNCH", + "autoscaling:EC2_INSTANCE_LAUNCH_ERROR", + "autoscaling:EC2_INSTANCE_TERMINATE_ERROR", + "autoscaling:EC2_INSTANCE_TERMINATE" + ] + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-Master" + ] + ] + }, + "PropagateAtLaunch": true + }, + { + "Key": "NodeType", + "Value": "Master", + "PropagateAtLaunch": true + } + ] + } + }, + "WorkerAutoScalingGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "DependsOn": [ + "WorkerLaunchConfig", + "MountTarget", + "MasterQueue", + "WorkerQueue", + "MasterAutoScalingGroup" + ], + "Properties": { + "MinSize": "0", + "MaxSize": { + "Ref": "WorkerCount" + }, + "PlacementGroup": { + "Ref": "ResourcePlacementGroup" + }, + "DesiredCapacity": { + "Ref": "WorkerCount" + }, + "LaunchConfigurationName": { + "Ref": "WorkerLaunchConfig" + }, + "VPCZoneIdentifier": [ + { + "Ref": "PrivateSubnetId" + } + ], + "NotificationConfiguration": { + "TopicARN": { + "Ref": "ResourceMetadataSNSTopic" + }, + "NotificationTypes": [ + "autoscaling:EC2_INSTANCE_LAUNCH", 
+ "autoscaling:EC2_INSTANCE_LAUNCH_ERROR", + "autoscaling:EC2_INSTANCE_TERMINATE_ERROR", + "autoscaling:EC2_INSTANCE_TERMINATE" + ] + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-Worker" + ] + ] + }, + "PropagateAtLaunch": true + }, + { + "Key": "NodeType", + "Value": "Worker", + "PropagateAtLaunch": true + } + ] + } + }, + "MasterQueue": { + "Type": "AWS::SQS::Queue", + "Properties": { + "QueueName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-aws-dl-cfn-master" + ] + ] + } + } + }, + "WorkerQueue": { + "Type": "AWS::SQS::Queue", + "Properties": { + "QueueName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-aws-dl-cfn-worker" + ] + ] + } + } + }, + "ResourceMetadataSNSTopic": { + "Type": "AWS::SNS::Topic", + "Properties": { + "DisplayName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-aws-dl-cfn" + ] + ] + }, + "Subscription": [ + { + "Endpoint": { + "Fn::GetAtt": [ + "ResourceMetadataLambdaFunction", + "Arn" + ] + }, + "Protocol": "lambda" + } + ], + "TopicName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-aws-dl-cfn" + ] + ] + } + } + }, + "myWaitHandle": { + "Type": "AWS::CloudFormation::WaitConditionHandle", + "Properties": {} + }, + "myWaitCondition": { + "Type": "AWS::CloudFormation::WaitCondition", + "Properties": { + "Handle": { + "Ref": "myWaitHandle" + }, + "Timeout": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "WaitConditionTimeout" + ] + } + } + } + }, + "Outputs": { + "AdminSSHSecurityGroup": { + "Description": "Security Group that restricts Inbound IPs to SSH into the Master", + "Value": { + "Ref": "AdminSSHSecurityGroup" + } + }, + "MasterAutoScalingGroup": { + "Description": "Autoscaling Group that contains the Master Instance", + "Value": { + "Ref": "MasterAutoScalingGroup" + } + }, + "WorkerAutoScalingGroup": { + "Description": "Autoscaling Group that contains the Workers", + 
"Value": { + "Ref": "WorkerAutoScalingGroup" + } + }, + "MountTargetID": { + "Description": "EFS Mount target ID", + "Value": { + "Ref": "MountTarget" + } + } + } +} \ No newline at end of file