From c6ec91a4a63f2c71e2e6a6c9ac0dee5a6f579115 Mon Sep 17 00:00:00 2001 From: Mark Schreiber Date: Mon, 2 Feb 2026 14:23:39 -0500 Subject: [PATCH 1/5] feat: adds aws-healthomics official power --- README.md | 8 + aws-healthomics/POWER.md | 66 +++ aws-healthomics/mcp.json | 27 ++ .../steering/ecr-pull-through-cache.md | 370 +++++++++++++++++ .../steering/migration-guide-for-nextflow.md | 318 ++++++++++++++ .../steering/migration-guide-for-wdl.md | 389 ++++++++++++++++++ aws-healthomics/steering/troubleshooting.md | 15 + .../steering/workflow-development.md | 107 +++++ 8 files changed, 1300 insertions(+) create mode 100644 aws-healthomics/POWER.md create mode 100644 aws-healthomics/mcp.json create mode 100644 aws-healthomics/steering/ecr-pull-through-cache.md create mode 100644 aws-healthomics/steering/migration-guide-for-nextflow.md create mode 100644 aws-healthomics/steering/migration-guide-for-wdl.md create mode 100644 aws-healthomics/steering/troubleshooting.md create mode 100644 aws-healthomics/steering/workflow-development.md diff --git a/README.md b/README.md index 03c7d25..fc64009 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,14 @@ Documentation is available at https://kiro.dev/docs/powers/ --- +### aws-healthomics + +**Create and Manage Bioinformatics Workflows with AWS HealthOmics** - create, migrate, run, debug and identify optimization opportunities for bioinformatics workflows in AWS HealthOmics. + +**MCP Servers:** awslabs.aws-healthomics-mcp-server + +--- + ### aws-infrastructure-as-code **Build AWS infrastructure with CDK and CloudFormation** - Generate well-architected AWS infrastructure with CDK and CloudFormation - access latest documentation, validate templates, and troubleshoot deployments. 
diff --git a/aws-healthomics/POWER.md b/aws-healthomics/POWER.md new file mode 100644 index 0000000..2a59f8a --- /dev/null +++ b/aws-healthomics/POWER.md @@ -0,0 +1,66 @@ +--- +name: "aws-healthomics" +displayName: "AWS HealthOmics" +description: "Create, migrate, run, debug and optimize bioinformatics workflows in AWS HealthOmics" +keywords: ["healthomics", "WDL", "CWL", "Nextflow", "workflow", "genomics", "bioinformatics", "pipeline"] +author: "AWS" +--- + +# When to use this power + +When you want to create, migrate, run, debug and identify optimization opportunities for genomics workflows in AWS HealthOmics + +# When to Load Steering Files + +Whenever you are asked to perform a task related to any of the following scenarios - ensure you load and read the appropriate markdown file mentioned + +- Creating a new WDL, Nextflow or CWL workflow -> use `./steering/workflow-development.md` +- Onboarding an existing WDL workflow ensuring compatibility with HealthOmics -> use `./steering/migration-guide-for-wdl.md` +- Onboarding an existing Nextflow workflow ensuring compatibility with HealthOmics -> use `./steering/migration-guide-for-nextflow.md` +- Diagnosing workflow creation issues -> use `./steering/troubleshooting.md` +- Diagnosing run failures -> use `./steering/troubleshooting.md` +- Using public containers with HealthOmics via ECR Pullthrough Caches -> use `./steering/ecr-pull-through-cache.md` + + +# Onboarding + +1. **Ensure the user has valid AWS Credentials** These are used by the HealthOmics MCP server to interact with AWS Services. +2. **Obtain the current account number** Using `aws sts get-caller-identity` +3. **Create a `config.toml`** Create a `.healthomics/config.toml` file to specify run parameters. 
This helps you, the agent, create workflows and start runs with the correct settings: + + **config.toml:** + ```toml + # This is a service role used to start runs, it must have a trust policy for the omics principal + omics_iam_role = "arn:aws:iam::<account-id>:role/<role-name>" + # Outputs of runs are written here, the service role must have write permissions to this location + run_output_uri = "s3://<bucket-name>/healthomics-outputs/" + run_storage_type = "DYNAMIC" # Recommended for faster runs and automatic scaling + ``` + + - Ask the customer for the `omics_iam_role` and `run_output_uri` values. You may also offer to create them. Record the values by updating the toml + - ALWAYS use settings from `.healthomics/config.toml` when they are set +4. **Dependencies** The MCP server configured by this power requires [`uvx`](https://docs.astral.sh/uv/getting-started/installation/) + + +# Integrations + +This power integrates with [AWS HealthOmics MCP Server](https://github.com/awslabs/mcp/tree/main/src/aws-healthomics-mcp-server) (Apache-2.0 license). + + +# License + +``` +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+``` diff --git a/aws-healthomics/mcp.json b/aws-healthomics/mcp.json new file mode 100644 index 0000000..356ade8 --- /dev/null +++ b/aws-healthomics/mcp.json @@ -0,0 +1,27 @@ +{ + "mcpServers": { + "aws-healthomics": { + "command": "uvx", + "args": [ + "awslabs.aws-healthomics-mcp-server@latest" + ], + "timeout": 300000, + "env": { + "HEALTHOMICS_DEFAULT_MAX_RESULTS": "100" + }, + "disabled": false, + "autoApprove": [ + "DiagnoseAHORunFailure", + "GetAHOWorkflow", + "GetAHORun", + "GetAHORunTask", + "GetAHORunLogs", + "GetAHORunManifestLogs", + "GetAHORunEngineLogs", + "GetAHOTaskLogs", + "GetSupportedFileTypes", + "GetAHOSupportedRegions" + ] + } + } +} diff --git a/aws-healthomics/steering/ecr-pull-through-cache.md b/aws-healthomics/steering/ecr-pull-through-cache.md new file mode 100644 index 0000000..d78d818 --- /dev/null +++ b/aws-healthomics/steering/ecr-pull-through-cache.md @@ -0,0 +1,370 @@ +# AWS HealthOmics ECR Pullthrough and Container Registry Maps + +## Overview +Container Registry Maps are a feature in AWS HealthOmics that enable workflows to use ECR pull through caches to access public container registries without manually replicating containers into private ECR repositories. This feature provides automatic mapping between upstream registries (like Docker Hub and Quay.io) and your private ECR repositories. + +ECR Pull through cache setup and container registry mapping are two distinct but related concepts. If you setup pull through caches your +workflows will automatically pull containers from the upstream registries and cache them in your ECR repositories. You will reference these +containers using ECR private URIs in your workflow definitions. If you also add a container registry map then you can use the original +public registry URIs in your workflow definitions and HealthOmics will automatically map them to your ECR private URIs. 
+ +*When creating new workflows, container registry maps are usually not needed; ECR pull through caches are sufficient.* +*When updating existing workflows, container registry maps can be used to avoid changing all container URIs in the workflow.* + +## Prerequisites +- AWS CLI v2 installed and configured +- Appropriate IAM permissions for ECR and HealthOmics + +## Regions +You should configure your ECR registry and HealthOmics workflows in the same region. If you will use multiple regions then repeat these steps in each region. + +## Step 1: Create Secrets Manager Secrets (For Authenticated Registries) +Some registries such as Docker Hub or private registries will require authentication. To use pull through cache, you must create a secret in Secrets Manager that contains the credentials for the registry. In these examples the region us-east-1 is specified. You should change this as needed. + +To obtain a Docker Hub token refer to https://docs.docker.com/security/access-tokens/ + +**Docker Hub Secret** +``` +aws secretsmanager create-secret \ + --name "ecr-pullthroughcache/docker-hub" \ + --description "Docker Hub credentials for ECR pull through cache" \ + --secret-string '{ + "username": "your-docker-username", + "accessToken": "your-docker-access-token" + }' \ + --region us-east-1 +``` + +**Quay.io Secret (if using private repositories, not required for public repositories)** +``` +aws secretsmanager create-secret \ + --name "ecr-pullthroughcache/quay" \ + --description "Quay.io credentials for ECR pull through cache" \ + --secret-string '{ + "username": "your-quay-username", + "accessToken": "your-quay-access-token" + }' \ + --region us-east-1 +``` + +## Step 2: Create ECR Pull Through Cache Rules + +**Docker Hub Pull Through Cache** +``` +aws ecr create-pull-through-cache-rule \ + --ecr-repository-prefix docker-hub \ + --upstream-registry-url registry-1.docker.io \ + --credential-arn 
arn:aws:secretsmanager:us-east-1:123456789012:secret:ecr-pullthroughcache/docker-hub-AbCdEf \ + --region us-east-1 +``` + +**Quay.io Pull Through Cache** +``` +aws ecr create-pull-through-cache-rule \ + --ecr-repository-prefix quay \ + --upstream-registry-url quay.io \ + --region us-east-1 +``` + +**ECR Public Pull Through Cache** +``` +aws ecr create-pull-through-cache-rule \ + --ecr-repository-prefix ecr-public \ + --upstream-registry-url public.ecr.aws \ + --region us-east-1 +``` + +## Step 3: Configure Registry Permissions +Create a registry permissions policy to allow HealthOmics to use pull through cache: + +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "AllowPTCinRegPermissions", + "Effect": "Allow", + "Principal": { + "Service": "omics.amazonaws.com" + }, + "Action": [ + "ecr:CreateRepository", + "ecr:BatchImportUpstreamImage" + ], + "Resource": [ + "arn:aws:ecr:us-east-1:123456789012:repository/docker-hub/*", + "arn:aws:ecr:us-east-1:123456789012:repository/quay/*", + "arn:aws:ecr:us-east-1:123456789012:repository/ecr-public/*" + ] + } + ] +} +``` + +Apply the policy: + +``` +aws ecr put-registry-policy \ + --policy-text file://registry-policy.json \ + --region us-east-1 +``` + +## Step 4: Create Repository Creation Templates + +**Docker Hub Template** + +``` +aws ecr create-repository-creation-template \ + --prefix docker-hub \ + --applied-for PULL_THROUGH_CACHE \ + --repository-policy '{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "PTCRepoCreationTemplate", + "Effect": "Allow", + "Principal": { + "Service": "omics.amazonaws.com" + }, + "Action": [ + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer" + ], + "Resource": "*" + } + ] + }' \ + --region us-east-1 +``` + +**Quay.io Template** + +``` +aws ecr create-repository-creation-template \ + --prefix quay \ + --applied-for PULL_THROUGH_CACHE \ + --repository-policy '{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "PTCRepoCreationTemplate", + "Effect": "Allow", + 
"Principal": { + "Service": "omics.amazonaws.com" + }, + "Action": [ + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer" + ], + "Resource": "*" + } + ] + }' \ + --region us-east-1 +``` + +**ECR Public Template** + +``` +aws ecr create-repository-creation-template \ + --prefix ecr-public \ + --applied-for PULL_THROUGH_CACHE \ + --repository-policy '{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "PTCRepoCreationTemplate", + "Effect": "Allow", + "Principal": { + "Service": "omics.amazonaws.com" + }, + "Action": [ + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer" + ], + "Resource": "*" + } + ] + }' \ + --region us-east-1 +``` + +## Step 5: Create Container Registry Maps + +*This step is optional and generally only required when migrating a workflow. Otherwise we recommend using full private ECR URIs in your workflows* + +Registry mappings can be used to map specific upstream registries to your private ECR repositories. In the example here, containers from Docker Hub, Quay.io and ECR Public used in a workflow will be mapped to your private ECR pull through caches. + +Create a registry map file (registry-map.json): + +``` +{ + "registryMappings": [ + { + "upstreamRegistryUrl": "registry-1.docker.io", + "ecrRepositoryPrefix": "docker-hub" + }, + { + "upstreamRegistryUrl": "quay.io", + "ecrRepositoryPrefix": "quay" + }, + { + "upstreamRegistryUrl": "public.ecr.aws", + "ecrRepositoryPrefix": "ecr-public" + } + ] +} +``` + +**Image Mappings Example** +Image mappings can be used to map specific containers to your private ECR repositories. These mappings will take precedence over registryMappings if both are provided. 
+ +Create an image map file (image-map.json) for specific container overrides: + +``` +{ + "imageMappings": [ + { + "sourceImage": "broadinstitute/gatk:4.6.0.2", + "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/docker-hub/broadinstitute/gatk:latest" + }, + { + "sourceImage": "quay.io/biocontainers/samtools:1.17--h00cdaf9_0", + "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/quay/biocontainers/samtools:1.17--h00cdaf9_0" + } + ] +} +``` + +**Combined Registry and Image Map** + +Create a complete map file (container-registry-map.json): + +``` +{ + "registryMappings": [ + { + "upstreamRegistryUrl": "registry-1.docker.io", + "ecrRepositoryPrefix": "docker-hub" + }, + { + "upstreamRegistryUrl": "quay.io", + "ecrRepositoryPrefix": "quay" + } + ], + "imageMappings": [ + { + "sourceImage": "ubuntu", + "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/docker-hub/library/ubuntu:20.04" + }, + { + "sourceImage": "quay.io/biocontainers/bwa:0.7.17--hed695b0_7", + "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/quay/biocontainers/bwa:0.7.17--hed695b0_7" + } + ] +} +``` + +Container registry map files should be uploaded to S3 and referenced when creating a workflow using the CreateAHOWorkflow tool. + +## Step 6: Configure HealthOmics Service Role +The HealthOmics service role used during workflow runs must have ECR permissions to pull container images from your pull through cache repositories. 
+ +**Create Trust Policy File** + +``` +cat > trust-policy.json << 'EOF' +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "omics.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +EOF +``` + +**Create Service Role Policy File** + +``` +cat > service-role-policy.json << 'EOF' +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject" + ], + "Resource": [ + "arn:aws:s3:::your-workflow-bucket/*" + ] + }, + { + "Effect": "Allow", + "Action": [ + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::your-workflow-bucket" + ] + }, + { + "Effect": "Allow", + "Action": [ + "logs:DescribeLogStreams", + "logs:CreateLogStream", + "logs:PutLogEvents", + "logs:CreateLogGroup" + ], + "Resource": [ + "arn:aws:logs:us-east-1:123456789012:log-group:/aws/omics/WorkflowLog*" + ] + }, + { + "Effect": "Allow", + "Action": [ + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchCheckLayerAvailability" + ], + "Resource": [ + "arn:aws:ecr:us-east-1:123456789012:repository/docker-hub/*", + "arn:aws:ecr:us-east-1:123456789012:repository/quay/*", + "arn:aws:ecr:us-east-1:123456789012:repository/ecr-public/*" + ] + } + ] +} +EOF +``` + +**Create the Service Role** + +``` +aws iam create-role \ + --role-name HealthOmicsWorkflowRole \ + --assume-role-policy-document file://trust-policy.json \ + --description "Service role for HealthOmics workflows with container registry mappings" +``` + +**Create and Attach the Policy** + +``` +aws iam create-policy \ + --policy-name HealthOmicsWorkflowPolicy \ + --policy-document file://service-role-policy.json \ + --description "Policy for HealthOmics workflows with ECR pull through cache access" +``` + +``` +aws iam attach-role-policy \ + --role-name HealthOmicsWorkflowRole \ + --policy-arn arn:aws:iam::123456789012:policy/HealthOmicsWorkflowPolicy +``` diff --git a/aws-healthomics/steering/migration-guide-for-nextflow.md 
b/aws-healthomics/steering/migration-guide-for-nextflow.md new file mode 100644 index 0000000..a316479 --- /dev/null +++ b/aws-healthomics/steering/migration-guide-for-nextflow.md @@ -0,0 +1,318 @@ +# AWS HealthOmics Nextflow Migration + +## Overview + +This document covers 'on-boarding' a Nextflow workflow to be compatible with AWS HealthOmics. This involves container migration, resource configuration, storage migration, and output path standardization. + +## Background + +AWS HealthOmics requires specific configurations: +- All containers must be in ECR repositories accessible to HealthOmics +- All input files must be in S3 +- All processes must have explicit CPU and memory declarations +- Output directories must use `/mnt/workflow/pubdir/` prefix + +## Goals + +1. **Container Migration**: Identify all Docker/Singularity containers and migrate to ECR +2. **Resource Configuration**: Ensure all processes have CPU and memory declarations +3. **Storage Migration**: Move reference files and inputs to S3 +4. **Output Path Standardization**: Update all publishDir directives to use HealthOmics-compatible paths +5. **Validation**: Test the migrated workflow on HealthOmics + +## Non-Goals + +- Modifying the scientific logic of the workflow +- Changing the workflow structure or dependencies +- Performance optimization beyond HealthOmics requirements + +## Requirements + +### Phase 1: Container Inventory and Migration + +**Objective**: Identify all containers and create ECR migration plan + +**Tasks**: +1. Extract all unique container URIs +2. Generate container inventory CSV with columns: + - Module/Process name + - Original container URI + - Container registry + - Tool name and version + - Target ECR URI +3. 
Create `scripts/migrate_containers_to_ecr.sh` to: + - Find or create ECR repositories for each tool with access policies that allow the omics principal to read from the repository + - Pull each container from source registry ensuring x86 containers are pulled + - Tag for ECR with naming convention: `.dkr.ecr..amazonaws.com//:` + - Push to ECR repositories + - Handle authentication for different registries +4. Create `scripts/update_container_refs.sh` to: + - Replace all container URIs in module files + - Update to use ECR registry + - Preserve conditional logic for singularity vs docker +5. Create `conf/healthomics.config` with ECR registry base path and import this at the end of the top level nextflow.config + +**Acceptance Criteria**: +- `container_inventory.csv` with all containers documented +- Migration script successfully pushes all containers to ECR +- All module `main.nf` files updated with ECR URIs +- Zero references to external registries remain +- Test that at least 5 key containers are accessible from ECR +- Documentation of migration strategy and any challenges encountered + +### Phase 2: Resource Declaration Audit + +**Objective**: Ensure all processes have CPU and memory declarations + +**HealthOmics Requirements**: +- Minimum: 2 vCPUs, 4 GB memory +- Maximum: 96 vCPUs, 768 GB memory +- Must be explicit in process definition or config + +**Tasks**: +1. Inspect resource declarations: + - Scan all module files for resource declarations + - Identify processes relying only on labels + - Check if label-based resources are sufficient +2. Verify all processes in `conf/base.config` have explicit resources +3. Add HealthOmics-specific resource overrides in `conf/healthomics.config`: + - Ensure minimums are met + - Optimize for HealthOmics instance types + - Add retry strategy with increased resources +4. Document resource requirements per tool in `docs/healthomics_resources.md` +5. 
Create validation script to ensure no process lacks resources + +**Acceptance Criteria**: +- Resource audit report generated +- All processes have resources via direct declaration or label +- `conf/healthomics.config` includes resource overrides +- All resources meet HealthOmics minimums (≥2 vCPU, ≥4 GB) +- Documentation of resource rationale per tool +- Validation script confirms 100% coverage + +### Phase 3: Reference and Input File Migration + +**Objective**: Migrate all reference files and inputs to S3 + +**Tasks**: +1. Identify input files, samplesheets and any hard coded or configured reference genomes, databases etc.: + - Scan `*.config` files for all file references + - Extract all reference parameters from `nextflow.config` + - List files in `assets/` directory + - Identify files referenced in sample sheets + - Generate reference inventory with sizes + - Scan for hardcoded paths in helper scripts and shell scripts in processes +2. Design S3 bucket structure appropriate for the workflow. For example: + ``` + s3:/// + ├── references/ + │ ├── Homo_sapiens/ + │ │ ├── GATK/GRCh38/ + │ │ │ ├── Sequence/ + │ │ │ ├── Annotation/ + │ │ │ └── Variation/ + │ │ └── NCBI/GRCh38/ + │ └── Mus_musculus/ + ├── annotation/ + │ ├── snpeff_cache/ + │ └── vep_cache/ + └── assets/ + ``` +3. Create `scripts/migrate_references_to_s3.sh` to: + - Copy from existing S3 locations if available + - Upload local files if needed + - Obtain and upload http(s):// and ftp:// resources + - Set appropriate S3 storage class (Intelligent-Tiering) + - Validate checksums after upload +4. Update `conf/healthomics.config` with S3 paths: + - Set all reference parameters to S3 URIs +5. Update sample sheets to point to new S3 URIs +6. 
Update any hard coded paths to point to new S3 URIs + +**Acceptance Criteria**: +- Reference inventory CSV with all files and sizes +- S3 bucket created with proper structure +- All reference files accessible from S3 +- `conf/healthomics.config` uses S3 URIs exclusively +- Migration script with progress tracking +- Documentation of S3 structure and access +- Validation that workflow can access all S3 references + +### Phase 4: Output Path Standardization + +**Objective**: Update all publishDir directives for HealthOmics compatibility + +**HealthOmics Requirement**: +- All outputs must be under `/mnt/workflow/pubdir/` +- Structure: `/mnt/workflow/pubdir/` + +**Tasks**: +1. Identify publishDir directives: + - Find all publishDir declarations in modules and subworkflows and configs + - Extract current path patterns + - Identify hardcoded paths vs parameterized paths +2. Update paths: + - Update default publishDir in `conf/modules/modules.config` + - Update all process-specific publishDir overrides + - Replace `${params.outdir}` with `/mnt/workflow/pubdir` + - Preserve all other publishDir options (mode, pattern, saveAs) +3. Update `conf/healthomics.config`: + ```groovy + params { + outdir = '/mnt/workflow/pubdir' + } + ``` +4. Scan for hardcoded paths in: + - Shell scripts within process definitions + - Template files + - Helper scripts +5. 
Create `docs/healthomics_outputs.md` documenting: + - Output directory structure + - File organization + - How to retrieve outputs from HealthOmics + +**Acceptance Criteria**: +- Audit report of all publishDir declarations +- All publishDir paths use `/mnt/workflow/pubdir/` prefix +- No references to `${params.outdir}` outside of healthomics.config +- Relative path structure preserved +- All publishDir options (mode, pattern, saveAs) maintained +- No hardcoded absolute paths in scripts +- Documentation of output structure +- Test run confirms outputs written to correct location + +### Phase 5: Configuration and Testing + +**Objective**: Create HealthOmics-specific configuration and validate + +**Tasks**: +1. Create comprehensive `conf/healthomics.config` (for example): + ```groovy + params { + // Container registry + container_registry = '.dkr.ecr..amazonaws.com/' + + // S3 references + igenomes_base = 's3:///references' + snpeff_cache = 's3:///annotation/snpeff_cache' + vep_cache = 's3:///annotation/vep_cache' + + // Output + outdir = '/mnt/workflow/pubdir' + publish_dir_mode = 'copy' + + // HealthOmics optimizations + max_cpus = 96 + max_memory = 768.GB + max_time = 168.h + } + + process { + // Disable conda (not supported) + conda = null + + // Use ECR containers + container = { "${params.container_registry}/${task.process.tokenize(':')[-1].toLowerCase()}" } + + // Error handling for HealthOmics + errorStrategy = { task.exitStatus in [143,137,104,134,139,140] ? 'retry' : 'finish' } + maxRetries = 3 + } + ``` + +2. Create `conf/test/test_healthomics.config`: + - Use small test dataset (e.g., chr22 only) + - Minimal tools: `--tools haplotypecaller` + - Fast execution: `--skip_tools baserecalibrator` + - S3 test data location + - Expected runtime: <2 hours + +3. Update `nextflow.config`: + ```groovy + profiles { + healthomics { + includeConfig 'conf/healthomics.config' + } + test_healthomics { + includeConfig 'conf/test/test_healthomics.config' + } + } + ``` + +4. 
Create test execution plan: + - Stage 1: Validate configuration locally with `-profile healthomics,test_healthomics` + - Stage 2: Test on HealthOmics with minimal dataset + - Stage 3: Test with full-size dataset + - Stage 4: Resource optimization + +**Acceptance Criteria**: +- `conf/healthomics.config` complete with correct syntax +- `conf/test/test_healthomics.config` complete with correct syntax +- Workflow definition JSON validated +- Test profile completes successfully on HealthOmics +- Full migration guide documentation +- Known issues documented with workarounds +- Performance benchmarks recorded +- Resource analysis completed + + +## Technical Details + +### Container Registry Pattern +``` +Original: quay.io/biocontainers/bwa:0.7.17--h5bf99c6_8 +Target: .dkr.ecr..amazonaws.com/sarek/bwa:0.7.17--h5bf99c6_8 +``` + +### Resource Declaration Pattern +```groovy +process EXAMPLE { + cpus 4 + memory 8.GB + + // ... rest of process +} +``` + +### PublishDir Pattern +```groovy +// Before +publishDir "${params.outdir}/preprocessing/mapped", mode: params.publish_dir_mode + +// After +publishDir "/mnt/workflow/pubdir/preprocessing/mapped", mode: params.publish_dir_mode +``` + +### S3 Reference Pattern +```groovy +// Before +params.fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" + +// After +params.fasta = "s3:///references/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" +``` + +## Dependencies + +- AWS CLI configured with appropriate permissions +- ECR repositories created +- S3 bucket(s) created with appropriate permissions +- HealthOmics service access +- Docker/ Finch/ Podman installed for container operations + +## Success Metrics + +- 100% of containers migrated to ECR +- 100% of processes have resource declarations +- All reference files accessible from S3 +- All outputs written to `/mnt/workflow/pubdir/` +- Test workflow completes successfully on HealthOmics +- 
Documentation complete and accurate + + +## References + +- [AWS HealthOmics Documentation](https://docs.aws.amazon.com/omics/) +- [nf-core documentation](https://nf-co.re) +- [Nextflow on AWS HealthOmics](https://www.nextflow.io/docs/latest/aws.html#aws-omics) +- [ECR Documentation](https://docs.aws.amazon.com/ecr/) \ No newline at end of file diff --git a/aws-healthomics/steering/migration-guide-for-wdl.md b/aws-healthomics/steering/migration-guide-for-wdl.md new file mode 100644 index 0000000..e8a9e90 --- /dev/null +++ b/aws-healthomics/steering/migration-guide-for-wdl.md @@ -0,0 +1,389 @@ +# AWS HealthOmics WDL Migration + +## Overview + +This document covers migration of on-prem or Cromwell variant WDL workflows to run in HealthOmics. This involves container migration, runtime configuration, storage migration, and output path standardization. + +## Background + +AWS HealthOmics requires specific configurations: +- All containers must be in ECR repositories accessible to HealthOmics +- All input files must be in S3 +- All tasks must have explicit CPU and memory runtime attributes +- Output files are automatically collected from task outputs +- WDL 1.0+ syntax is required (draft-2 not supported) + +## Goals + +1. **Container Migration**: Identify all Docker containers and migrate to ECR +2. **Runtime Configuration**: Ensure all tasks have CPU and memory declarations +3. **Storage Migration**: Move reference files and inputs to S3 +4. **WDL Version Upgrade**: Ensure WDL 1.0+ compatibility +5. **Validation**: Lint and test the migrated workflow on HealthOmics + +## Non-Goals + +- Modifying the scientific logic of the workflow +- Changing the workflow structure or task dependencies +- Performance optimization beyond HealthOmics requirements + +## Requirements + +### Phase 1: Container Inventory and Migration + +**Objective**: Identify all containers and create ECR migration plan + +**Tasks**: +1. 
Extract all unique container URIs from runtime sections: + - Scan all WDL files for `docker:` and `container:` runtime attributes + - Check imported WDL files and sub-workflows + - Identify containers in struct/object definitions +2. Generate container inventory CSV with columns: + - Task name + - Original container URI + - Container registry + - Tool name and version + - Target ECR URI +3. Create `scripts/migrate_containers_to_ecr.sh` to: + - Find or create ECR repositories for each tool with access policies that allow the omics principal to read from the repository + - Pull each container from source registry ensuring x86 containers are pulled + - Tag for ECR with naming convention: `.dkr.ecr..amazonaws.com//:` + - Push to ECR repositories +4. Create `scripts/update_container_refs.sh` to: + - Replace all container URIs in WDL task runtime sections + - Update to use ECR registry + - Parameterize container references +5. Create `healthomics.inputs.json` with ECR registry base path parameter + +**Acceptance Criteria**: +- `container_inventory.csv` with all containers documented +- Migration script successfully pushes all containers to ECR +- All WDL task runtime sections updated with ECR URIs +- Zero references to external registries remain +- Test that at least 5 key containers are accessible from ECR +- Documentation of migration strategy and any challenges encountered + +### Phase 2: Runtime Attribute Audit + +**Objective**: Ensure all tasks have CPU and memory runtime declarations + +**HealthOmics Requirements**: +- Minimum: 2 vCPUs, 4 GiB memory +- Maximum: 96 vCPUs, 768 GiB memory +- Must be explicit in task runtime section + +**Tasks**: +1. Inspect runtime declarations: + - Scan all WDL files for runtime sections + - Identify tasks missing cpu, memory, or disks attributes + - Check for dynamic resource calculations +2. Add or update runtime attributes in all tasks: + ```wdl + runtime { + docker: "..." + cpu: 4 + memory: "8 GiB" + } + ``` +3. 
Document resource requirements per task in `docs/healthomics_resources.md` +4. Create validation script to ensure no task lacks runtime attributes + +**Acceptance Criteria**: +- Runtime audit report generated +- All tasks have docker (or container for WDL 1.1), cpu, and memory runtime attributes +- All resources meet HealthOmics minimums (≥2 vCPU, ≥4 GiB) +- Documentation of resource rationale per task +- Validation script confirms 100% coverage + +### Phase 3: WDL Version Compatibility + +**Objective**: Ensure WDL 1.0+ (or devel) compatibility (HealthOmics does not support draft-2) + +**Tasks**: +1. Check WDL version declarations: + - Scan all WDL files for version statements + - Identify draft-2 syntax usage + - List deprecated features in use +2. Upgrade syntax if needed: + - Update version declaration to `version 1.0` or `version 1.1` + - Replace `${}` with `~{}` for command interpolation + - Update type declarations + - Replace deprecated functions + - Update struct definitions if using WDL 1.1 + - Replace `command { ... }` syntax with `command <<< ... >>>` syntax for WDL 1.1+ +3. Validate imports: + - Ensure all imported WDL files are also 1.0+ and the same version as the main workflow + - Update import statements to use proper aliasing + - Check for circular dependencies +4. Test with linters: + - Use the `LintAHOWorkflowDefinition` or `LintAHOWorkflowBundle` tools to verify syntax and identify issues + - For large workflows use `miniwdl check` if available locally + - Resolve all issues and deprecations + +**Acceptance Criteria**: +- All WDL files declare version 1.0 or higher +- No draft-2 syntax remains +- Syntax validation passes for all WDL files +- All imports resolve correctly +- Documentation of syntax changes made + +### Phase 4: Reference and Input File Migration + +**Objective**: Migrate all reference files and inputs to S3 + +**Tasks**: +1. Identify input files and reference data: + - Extract all File and File? 
input parameters + - Scan for hardcoded file paths in command sections + - List reference files in workflow inputs + - Identify files in Array[File] inputs + - Generate reference inventory with sizes +2. Design S3 bucket structure appropriate for the workflow. For example: + ``` + s3:/// + ├── references/ + │ ├── Homo_sapiens/ + │ │ ├── GATK/GRCh38/ + │ │ │ ├── Sequence/ + │ │ │ ├── Annotation/ + │ │ │ └── Variation/ + │ │ └── NCBI/GRCh38/ + │ └── Mus_musculus/ + ├── annotation/ + │ ├── snpeff_db/ + │ └── vep_cache/ + └── inputs/ + └── samples/ + ``` +3. Create `scripts/migrate_references_to_s3.sh` to: + - Copy from existing S3 locations if available + - Upload local files if needed + - Obtain and upload http(s):// and ftp:// resources to S3 + - Set appropriate S3 storage class (Intelligent-Tiering) + - Validate checksums after upload +4. Create `healthomics.inputs.json` with S3 URIs: + - Set all File inputs to S3 URIs + - Update reference file paths + - Include sample input files +5. Update any hardcoded paths in command sections to use input variables + +**Acceptance Criteria**: +- Reference inventory CSV with all files and sizes +- S3 bucket created with proper structure +- All reference files accessible from S3 +- `healthomics.inputs.json` uses S3 URIs exclusively +- Migration script with progress tracking +- Documentation of S3 structure and access +- Validation that workflow can access all S3 references +- No hardcoded file paths in command sections + +### Phase 5: Output Collection Strategy + +**Objective**: Ensure all workflow outputs are properly declared + +**HealthOmics Behavior**: +- Outputs are automatically collected from workflow output section +- Task outputs must be explicitly declared in workflow outputs to be retained +- Intermediate files are automatically cleaned up at the end of a run unless declared as workflow outputs + +**Tasks**: +1. 
Audit workflow outputs: + - Identify all task outputs that should be retained + - Check workflow output section completeness + - Verify output types (File, Array[File], etc.) +2. Update workflow output section if needed: + ```wdl + output { + File final_vcf = CallVariants.vcf + File final_vcf_index = CallVariants.vcf_index + Array[File] bam_files = AlignReads.bam + File metrics_report = CollectMetrics.report + } + ``` +3. Document output structure: + - Create `docs/healthomics_outputs.md` + - List all workflow outputs with descriptions + - Explain output file organization + - Document how to retrieve outputs from HealthOmics +4. Verify task output declarations: + - Ensure all tasks declare their outputs + - Check glob patterns are correct + - Validate output file naming + +**Acceptance Criteria**: +- Audit report of all workflow outputs +- Workflow output section includes all desired outputs +- All task outputs properly declared +- Output types correctly specified +- Documentation of output structure +- Test run confirms expected outputs are collected + +### Phase 6: Configuration and Testing + +**Objective**: Create HealthOmics-specific configuration and validate + +**Tasks**: +1. Create comprehensive `healthomics.inputs.json`: + ```json + { + "WorkflowName.container_registry": ".dkr.ecr..amazonaws.com/", + "WorkflowName.reference_fasta": "s3:///references/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta", + "WorkflowName.reference_fasta_index": "s3:///references/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai", + "WorkflowName.dbsnp_vcf": "s3:///references/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz", + "WorkflowName.input_bam": "s3:///inputs/samples/sample1.bam" + } + ``` + +2. 
Create `test_healthomics.inputs.json`: + - Use small test dataset (e.g., chr22 only) + - Minimal sample set (1-2 samples) + - S3 test data location + - Expected runtime: <2 hours + - Use DYNAMIC storage for test runs + +3. Create test execution plan: + - Stage 1: Validate WDL syntax and lint + - Stage 2: Test on HealthOmics with minimal dataset + - Stage 3: Test with full-size dataset + - Stage 4: Resource optimization + + +**Acceptance Criteria**: +- `healthomics.inputs.json` complete with all required inputs +- `test_healthomics.inputs.json` with minimal test data +- WDL validation passes +- Test workflow completes successfully on HealthOmics +- Use the `DiagnoseAHORunFailure` tool to identify issues with the test run and remediate +- Documentation of test execution plan and any challenges encountered +- Full migration guide documentation +- Known issues documented with workarounds +- Performance benchmarks recorded +- Resource analysis completed + +## Technical Details + +### Container Runtime Pattern +```wdl +# Before +runtime { + docker: "quay.io/biocontainers/bwa:0.7.17--h5bf99c6_8" +} + +# After +runtime { + docker: ".dkr.ecr..amazonaws.com/workflow-name/bwa:0.7.17--h5bf99c6_8" + cpu: 4 + memory: "8 GB" +} +``` + +### WDL Version Upgrade Pattern +```wdl +# Before (draft-2) +workflow MyWorkflow { + call MyTask { input: file = input_file } +} + +# After (1.0+) +version 1.0 + +workflow MyWorkflow { + input { + File input_file + } + + call MyTask { input: file = input_file } + + output { + File result = MyTask.output_file + } +} +``` + +### S3 Input Pattern +```json +// Before (local paths) +{ + "WorkflowName.reference_fasta": "/path/to/reference.fasta" +} + +// After (S3 URIs) +{ + "WorkflowName.reference_fasta": "s3://bucket/references/Homo_sapiens/GATK/GRCh38/Sequence/reference.fasta" +} +``` + +### Task Output Declaration Pattern +```wdl +task AlignReads { + input { + File input_fastq + File reference_fasta + } + + command <<< + bwa mem ~{reference_fasta} 
~{input_fastq} > aligned.sam + samtools view -b aligned.sam > aligned.bam + >>> + + output { + File aligned_bam = "aligned.bam" + } + + runtime { + docker: ".dkr.ecr..amazonaws.com//bwa-samtools:latest" + cpu: 8 + memory: "16 GB" + } +} +``` + +## Dependencies + +- AWS CLI configured with appropriate permissions +- ECR repositories created +- S3 bucket(s) created with appropriate permissions +- HealthOmics service access +- HealthOmics MCP server +- Docker/Finch/Podman installed for container operations + +## Success Metrics + +- 100% of containers migrated to ECR +- 100% of tasks have runtime attributes (cpu, memory, disks) +- All WDL files are version 1.0 or higher +- All reference files accessible from S3 +- All workflow outputs properly declared +- Test workflow completes successfully on HealthOmics +- Documentation complete and accurate + +## Common WDL-Specific Considerations + +### Scatter-Gather Patterns +- Ensure scattered tasks have appropriate resources +- Verify Array[File] outputs are properly collected +- Test scatter parallelization limits + +### Sub-Workflows +- Ensure all imported WDL files are migrated +- Verify sub-workflow outputs are properly passed +- Check import paths resolve correctly + +### Optional Inputs +- Handle File? 
inputs gracefully +- Use select_first() or defined() appropriately +- Provide defaults where sensible + +### Command Section +- Use ~{} for variable interpolation (WDL 1.0+) +- Avoid hardcoded paths +- Use sep() for array joining +- Handle optional inputs with if/then/else + +## References + +- [AWS HealthOmics Documentation](https://docs.aws.amazon.com/omics/) +- [WDL 1.0 Specification](https://github.com/openwdl/wdl/blob/main/versions/1.0/SPEC.md) +- [WDL 1.1 Specification](https://github.com/openwdl/wdl/blob/main/versions/1.1/SPEC.md) +- [WDL on AWS HealthOmics](https://docs.aws.amazon.com/omics/latest/dev/workflows.html) +- [ECR Documentation](https://docs.aws.amazon.com/ecr/) \ No newline at end of file diff --git a/aws-healthomics/steering/troubleshooting.md b/aws-healthomics/steering/troubleshooting.md new file mode 100644 index 0000000..c776dfd --- /dev/null +++ b/aws-healthomics/steering/troubleshooting.md @@ -0,0 +1,15 @@ +# Troubleshooting Guide + +## Workflow Creation Failure + +If a workflow fails to reach a CREATED status in HealthOmics, the likely reasons are: + +1. The workflow zip package is corrupted or missing +2. The workflow zip package has multiple workflow definition files at the top level. There should only be one `main.wdl`, `main.nf` etc at the top level and dependencies should be packaged in sub-directories. +3. The workflow zip package is missing a dependency that is required by the workflow definition file or the dependency location is not consistent with the import path for the dependency +4. The workflow has invalid syntax. Use lint tools to verify the workflow definition file is valid. 
+ +## Run Failures + +- If a run fails with a service error (5xx error) then a transient error has occurred in the HealthOmics service and the run can be re-started +- If a workflow run fails with a customer error (4xx error) use the `DiagnoseAHORunFailure` tool to access important logs and run information \ No newline at end of file diff --git a/aws-healthomics/steering/workflow-development.md b/aws-healthomics/steering/workflow-development.md new file mode 100644 index 0000000..bbf91cc --- /dev/null +++ b/aws-healthomics/steering/workflow-development.md @@ -0,0 +1,107 @@ +# Workflow Development Guide + +## Overview + +This guide covers the complete process of developing genomics workflows for AWS HealthOmics including creation, deployment and running. + +## Creating a Workflow + **Language** + - Use WDL 1.1, Nextflow DSL2 or CWL 1.2 for workflows. + - Prefer WDL 1.1 unless otherwise instructed + + **Structure** + - Define a top level `main.wdl`, `main.nf` or `main.cwl` file + - Define a `tasks` folder with subfolders for each task + - Define a `workflows` folder with subfolders for each sub-workflow + + **Code Docs** + - Use comments to document the purpose of each task and workflow + - For WDL generate meta and parameter_meta blocks to document the workflow and parameters + - For Nextflow generate nf-schema.json to document the workflow and parameters + - Create a detailed README.md + + **Scripting** + - Use BASH best practices for the definition of the task/ process command/ script + - Use `set -eu` to prevent silent failures + - In WDL use the ~{var_name} interpolation syntax for variable substitution + - In WDL use <<< >>> syntax to delimit the command block + + **Parallelization** + - Use `scatter` patterns and Nextflow `Channels` to parallelize tasks + - Where possible scatter over samples and genomic intervals + - Consider computing intervals in reference genomes so they have approximately even sizes + - HealthOmics can support large scatters
but may require requesting increases to quota limits (Maximum concurrent tasks per run) + + **Task Parameters** + - All tasks (or processes) must declare CPU, memory and container requirements + - Use reasonable resource allocations with at least 1GB of memory and 1 CPU for all tasks + - Consider setting timeouts and retries for workflow tasks using language appropriate directives + + **Outputs** + - Final Workflow outputs must be declared. Intermediate task outputs will not be retained by HealthOmics. + - When using a Nextflow publishDir directive, the path must be a subdirectory of `/mnt/workflow/pubdir` + + **Containers** + - All workflow tasks run in containers which must contain all software used in the script/ command of the task + - Container images must be available in the user's AWS ECR private registry in repositories that are readable by HealthOmics + - ECR private registry URLs are of the form `123456789012.dkr.ecr.us-east-1.amazonaws.com/myrepo:tag` + - Use `aws sts get-caller-identity` to get the account number and replace the `123456789012` in the example above + - Note that ECR public gallery images are **not** private repositories and cannot be used by HealthOmics unless using Pull Through Caches + - HealthOmics can use ECR Pull Through Caches if the container image is not available in the user's private registry: + - The image must be available from a supported upstream registry + - Consult the [ECR Pull Through Cache](./ecr-pull-through-cache.md) steering documentation for more information + - Alternatively, use Docker (Podman, Finch etc) to pull, retag and push the container image to the user's private registry + + **`parameters.json`** + - Define an example `parameters.json` for the workflow + - Use the `SearchGenomicsFiles` tool from the HealthOmics MCP server to help identify suitable inputs + - Workflow parameters should **NOT** be namespaced: + **correct:** + ``` + { + "input_file": "s3://bucket/path/to/input.vcf" + } + ``` + + **wrong:** +
``` + { + "MyWorkflow.input_file": "s3://bucket/path/to/input.vcf" + } + ``` + + **Linting** + - Use the `LintAHOWorkflowDefinition` tool or `LintAHOWorkflowBundle` tool to validate the workflow definition + - Resolve any linting errors before deployment + +## Deploying a Workflow + **Packaging** + - If the workflow is a single file, use the `PackageAHOWorkflowDefinition` tool to package the workflow definition into a zip archive + - If the workflow is relatively small (< 15 files), use the `PackageAHOWorkflowBundle` tool to package the workflow definition into a zip archive + - If the workflow is large (> 15 files), make a local zip file and copy it to S3. + - Ensure the `main` entry point file is at the top level of the archive with required imports packaged relative to this file + + **Deploy to AWS HealthOmics** + - Use the `CreateAHOWorkflow` tool to create the new workflow in HealthOmics + - If you are updating an existing HealthOmics workflow, use the `CreateAHOWorkflowVersion` tool to create a new version of the workflow + - Use semantic versioning for the version name e.g. 
`1.0.0` or `1.0.1` + - Verify that the workflow was created successfully using the `GetAHOWorkflow` tool + +## Running a Workflow + **Pre-conditions** + - Ensure the workflow has been deployed successfully + - Ensure a parameters.json or inputs.json file has been created and that the inputs are valid and accessible + - All file inputs must come from S3 locations in the same region as the workflow run + - Verify all S3 objects exist + - ALWAYS read and use preferences/ defaults from `.healthomics/config.toml` if present + - A run requires an output location in S3 that is writable, ask the user where they want their outputs to be written + - A run requires a Service Role with a trust policy that allows HealthOmics to assume the role and that grants access to read the inputs and write to the output location, identify or create a suitable role and use the role's ARN when starting the workflow. + + **Run the workflow** + - Use the `RunAHOWorkflow` tool to run the workflow + - Use the `GetAHOWorkflowRun` tool to check the status of the workflow run + - Use the `GetAHO*Logs` tools to retrieve various logs for the run + - When the workflow completes outputs will be written to the location specified when starting the run + - If the workflow fails, use the `DiagnoseAHORunFailure` tool to get more information about the failure, then fix the workflow, create a new version of the workflow in HealthOmics and try again + + From 7bd62ac79053ad07b54806d6b468d929aa98aae7 Mon Sep 17 00:00:00 2001 From: Mark Schreiber Date: Tue, 3 Feb 2026 09:42:57 -0500 Subject: [PATCH 2/5] feat: adds steering docs for workflow-versioning --- aws-healthomics/POWER.md | 3 +- .../steering/workflow-versioning.md | 80 +++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 aws-healthomics/steering/workflow-versioning.md diff --git a/aws-healthomics/POWER.md b/aws-healthomics/POWER.md index 2a59f8a..7c4ce0c 100644 --- a/aws-healthomics/POWER.md +++
b/aws-healthomics/POWER.md @@ -1,7 +1,7 @@ --- name: "aws-healthomics" displayName: "AWS HealthOmics" -description: "Create, migrate, run, debug and optimize bioinformatics workflows in AWS HealthOmics" +description: "Create, migrate, run, debug and optimize genomics workflows in AWS HealthOmics" keywords: ["healthomics", "WDL", "CWL", "Nextflow", "workflow", "genomics", "bioinformatics", "pipeline"] author: "AWS" --- @@ -17,6 +17,7 @@ Whenever you are asked to perform a task related to any of the following scenari - Creating a new WDL, Nextflow or CWL workflow -> use `./steering_files/workflow-development.md` - Onboarding an existing WDL workflow ensuring compatibility with HealthOmics -> use `.\steering_files/migration-guide-for-wdl.md` - Onboarding an existing Nextflow workflow ensuring compatibility with HealthOmics -> use `./steering_files/migration-guide-for-wdl.md` +- Modifying, updating, or fixing an existing HealthOmics workflow -> use `./steering_files/workflow-versioning.md` - Diagnosing workflow creation issues -> use `./steering_files/troubleshooting.md` - Diagnosing run failures -> use `./steering_files/troubleshooting.md` - Using public containers with HealthOmics via ECR Pullthrough Caches -> use `./steering_files/ecr-pull-through-cache.md` diff --git a/aws-healthomics/steering/workflow-versioning.md b/aws-healthomics/steering/workflow-versioning.md new file mode 100644 index 0000000..5050421 --- /dev/null +++ b/aws-healthomics/steering/workflow-versioning.md @@ -0,0 +1,80 @@ +# Workflow Versioning Guide + +## Overview + +When a customer is modifying an existing HealthOmics workflow, **always use `CreateAHOWorkflowVersion`** to create a new version rather than creating an entirely new workflow. This preserves workflow history, maintains consistent workflow IDs for downstream integrations, and follows AWS HealthOmics best practices. 
+ +## When to Use Workflow Versioning + +**Use `CreateAHOWorkflowVersion` when:** +- Fixing bugs in an existing workflow +- Adding new features or tasks to a workflow +- Updating container images or versions +- Modifying resource allocations (CPU, memory) +- Changing workflow parameters or outputs +- Optimizing workflow performance after analyzing run metrics +- Applying fixes after diagnosing run failures + +**Use `CreateAHOWorkflow` only when:** +- Creating a brand new workflow that doesn't exist yet +- The workflow represents fundamentally different functionality +- The customer explicitly requests a new workflow ID + +## Workflow Modification Process + +1. **Identify the existing workflow** + - Use `ListAHOWorkflows` to find the workflow + - Use `GetAHOWorkflow` to retrieve current workflow details including the workflow ID + +2. **Make modifications locally** + - Edit the workflow definition files + - Use `LintAHOWorkflowDefinition` or `LintAHOWorkflowBundle` to validate changes + +3. **Package the updated workflow** + - Use `PackageAHOWorkflowDefinition` for single-file workflows + - Use `PackageAHOWorkflowBundle` for multi-file workflows + +4. **Create a new version** + - Use `CreateAHOWorkflowVersion` with the existing workflow ID + - Apply semantic versioning (e.g., `1.0.0` → `1.0.1` for patches, `1.1.0` for features) + - Include a meaningful description of changes + +5. 
**Verify the new version** + - Use `GetAHOWorkflow` to confirm the version was created successfully + - Check that the workflow status is `ACTIVE` + +## Version Naming Conventions + +Follow semantic versioning for workflow versions: +- **MAJOR.MINOR.PATCH** (e.g., `1.0.0`, `2.1.3`) +- **MAJOR**: Breaking changes to inputs/outputs +- **MINOR**: New features, backward compatible +- **PATCH**: Bug fixes, performance improvements + +## Benefits of Versioning + +- **Audit Trail**: Complete history of workflow changes +- **Rollback Capability**: Easy to revert to previous versions if issues arise +- **Consistent Integration**: Downstream systems can reference the same workflow ID +- **Cost Tracking**: All runs grouped under a single workflow for billing analysis +- **Compliance**: Maintains lineage for regulatory requirements in genomics workflows + +## Common Scenarios + +### After Diagnosing a Run Failure +When `DiagnoseAHORunFailure` identifies an issue: +1. Fix the workflow definition +2. Create a new version with `CreateAHOWorkflowVersion` +3. Re-run using the updated workflow version + +### After Performance Optimization +When `AnalyzeAHORunPerformance` suggests improvements: +1. Apply recommended resource adjustments +2. Create a new version with `CreateAHOWorkflowVersion` +3. Run the optimized version to validate improvements + +### Updating Container Images +When updating to newer container versions: +1. Update container references in task definitions +2. Test locally if possible +3. 
Create a new version with `CreateAHOWorkflowVersion` From 2b594b984973aa9718939b5b45fee0f6aa0afa1a Mon Sep 17 00:00:00 2001 From: Mark Schreiber Date: Tue, 3 Feb 2026 10:55:52 -0500 Subject: [PATCH 3/5] feat: adds steering docs for creating workflow from git --- aws-healthomics/POWER.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aws-healthomics/POWER.md b/aws-healthomics/POWER.md index 7c4ce0c..868c22a 100644 --- a/aws-healthomics/POWER.md +++ b/aws-healthomics/POWER.md @@ -14,7 +14,8 @@ When you want to create, migrate, run, debug and identify optimization opportuni Whenever you are asked to perform a task related to any of the following scenarios - ensure you load and read the appropriate markdown file mentioned -- Creating a new WDL, Nextflow or CWL workflow -> use `./steering_files/workflow-development.md` +- Creating a workflow from a remote Git repository URL (GitHub, GitLab, Bitbucket, Azure DevOps) -> use `./steering_files/git-integration.md` (takes precedence over workflow-development.md) +- Creating a new WDL, Nextflow or CWL workflow from local files -> use `./steering_files/workflow-development.md` - Onboarding an existing WDL workflow ensuring compatibility with HealthOmics -> use `.\steering_files/migration-guide-for-wdl.md` - Onboarding an existing Nextflow workflow ensuring compatibility with HealthOmics -> use `./steering_files/migration-guide-for-wdl.md` - Modifying, updating, or fixing an existing HealthOmics workflow -> use `./steering_files/workflow-versioning.md` From 81144deb8bfa1c56dda524e9b491c97f320a4828 Mon Sep 17 00:00:00 2001 From: Mark Schreiber Date: Tue, 3 Feb 2026 16:31:03 -0500 Subject: [PATCH 4/5] fix: fixes error in git integration, removes autoApproved list from mcp.json --- aws-healthomics/POWER.md | 2 +- aws-healthomics/mcp.json | 10 - aws-healthomics/steering/git-integration.md | 241 ++++++++++++++++++++ 3 files changed, 242 insertions(+), 11 deletions(-) create mode 100644 
aws-healthomics/steering/git-integration.md diff --git a/aws-healthomics/POWER.md b/aws-healthomics/POWER.md index 868c22a..3e87d73 100644 --- a/aws-healthomics/POWER.md +++ b/aws-healthomics/POWER.md @@ -14,7 +14,7 @@ When you want to create, migrate, run, debug and identify optimization opportuni Whenever you are asked to perform a task related to any of the following scenarios - ensure you load and read the appropriate markdown file mentioned -- Creating a workflow from a remote Git repository URL (GitHub, GitLab, Bitbucket, Azure DevOps) -> use `./steering_files/git-integration.md` (takes precedence over workflow-development.md) +- Creating a workflow from a remote Git repository URL (GitHub, GitLab, Bitbucket) -> use `./steering_files/git-integration.md` (takes precedence over workflow-development.md) - Creating a new WDL, Nextflow or CWL workflow from local files -> use `./steering_files/workflow-development.md` - Onboarding an existing WDL workflow ensuring compatibility with HealthOmics -> use `.\steering_files/migration-guide-for-wdl.md` - Onboarding an existing Nextflow workflow ensuring compatibility with HealthOmics -> use `./steering_files/migration-guide-for-wdl.md` diff --git a/aws-healthomics/mcp.json b/aws-healthomics/mcp.json index 356ade8..fb387b4 100644 --- a/aws-healthomics/mcp.json +++ b/aws-healthomics/mcp.json @@ -11,16 +11,6 @@ }, "disabled": false, "autoApprove": [ - "DiagnoseAHORunFailure", - "GetAHOWorkflow", - "GetAHORun", - "GetAHORunTask", - "GetAHORunLogs", - "GetAHORunManifestLogs", - "GetAHORunEngineLogs", - "GetAHOTaskLogs", - "GetSupportedFileTypes", - "GetAHOSupportedRegions" ] } } diff --git a/aws-healthomics/steering/git-integration.md b/aws-healthomics/steering/git-integration.md new file mode 100644 index 0000000..a4f6400 --- /dev/null +++ b/aws-healthomics/steering/git-integration.md @@ -0,0 +1,241 @@ +# Git Integration for HealthOmics Workflows + +## Overview + +When a user provides a Git repository URL to create a 
HealthOmics workflow, **use the `definitionRepository` parameter** with `CreateAHOWorkflow` or `CreateAHOWorkflowVersion` instead of manually cloning, packaging, and uploading the workflow. This approach: +- Eliminates manual download, zip, and S3 staging steps +- Enables direct workflow creation from public or private repositories +- Supports GitHub, GitLab, and Bitbucket + +## When to Use Git Integration + +**Use `definitionRepository` when:** +- User provides a GitHub, GitLab, or Bitbucket repository URL +- User wants to create a workflow from a specific branch, tag, or commit +- User references a public workflow repository (e.g., nf-core pipelines) +- User wants to keep their workflow definition in source control + +**Use traditional packaging when:** +- User has local workflow files not in a Git repository +- User provides an S3 URI for the workflow definition +- User explicitly requests local packaging + +## Supported Git Providers + +| Provider | Repository URL Format | +|----------|----------------------| +| GitHub | `https://github.com/owner/repo` | +| GitLab | `https://gitlab.com/owner/repo` | +| Bitbucket | `https://bitbucket.org/owner/repo` | +| GitLab Self-Managed | `https://gitlab.example.com/owner/repo` | +| GitHub Enterprise | `https://github.example.com/owner/repo` | + +## Workflow for Git-Based Workflow Creation + +### Step 1: Check for Existing Code Connections + +Use `ListCodeConnections` to find existing connections for the Git provider: + +``` +ListCodeConnections(provider_type_filter="GitHub") # or GitLab, Bitbucket, etc. +``` + +Look for a connection with status `AVAILABLE`. If found, use its `connection_arn`. + +### Step 2: Create Code Connection (If Needed) + +If no suitable connection exists: + +1. **Create the connection:** + ``` + CreateCodeConnection( + connection_name="my-github-connection", + provider_type="GitHub" # GitHub, GitLab, Bitbucket, GitHubEnterpriseServer, GitLabSelfManaged + ) + ``` + +2. 
**Inform the user** that they must complete OAuth authorization in the AWS Console: + - The tool returns a `console_url` for completing authorization + - Connection status will be `PENDING` until OAuth is completed + - User must authorize the connection before it can be used + +3. **Verify connection status:** + ``` + GetCodeConnection(connection_arn="arn:aws:codeconnections:...") + ``` + - Wait for status to become `AVAILABLE` before proceeding + +### Step 3: Parse Repository Information + +Extract from the user-provided URL: +- **fullRepositoryId**: `owner/repo` format (e.g., `nf-core/rnaseq`) +- **sourceReference**: Branch, tag, or commit + - Type: `BRANCH`, `TAG`, or `COMMIT` + - Value: The branch name, tag name, or commit SHA + +### Step 4: Check for Container Registry Map + +Before creating the workflow, check if the repository contains a container registry map file: +- Common locations: `container-registry-map.json`, `registry-map.json`, `.healthomics/container-registry-map.json` + +**If a container registry map exists in the repository:** +- Pass `container_registry_map_uri` pointing to the S3 location if uploaded +- Or use `container_registry_map` parameter with the map contents + +**If no container registry map exists:** +- Analyze the workflow definition for container references +- If containers reference public registries (Docker Hub, Quay.io, ECR Public): + - Follow the [ECR Pull Through Cache](./ecr-pull-through-cache.md) steering guide + - Use `CreateContainerRegistryMap` to generate a registry map + - Use `ValidateHealthOmicsECRConfig` to verify ECR configuration +- If containers reference private ECR repositories: + - Proceed without a container registry map (containers are already accessible) + +### Step 5: Create the Workflow + +Use `CreateAHOWorkflow` with the `definition_repository` parameter: + +``` +CreateAHOWorkflow( + name="my-workflow", + definition_repository={ + "connectionArn": 
"arn:aws:codeconnections:us-east-1:123456789012:connection/abc123", + "fullRepositoryId": "owner/repo", + "sourceReference": { + "type": "BRANCH", # or TAG, COMMIT + "value": "main" # branch name, tag, or commit SHA + }, + "excludeFilePatterns": ["test/*", "docs/*"] # optional + }, + description="Workflow created from Git repository", + parameter_template_path="parameters.json", # optional: path within repo + readme_path="README.md", # optional: path within repo + container_registry_map={...} # if needed +) +``` + +### Step 6: Verify Workflow Creation + +``` +GetAHOWorkflow(workflow_id="1234567") +``` + +Check that: +- Status is `ACTIVE` +- Workflow type matches expected engine (WDL, NEXTFLOW, CWL) + +## Parameter Reference + +### definitionRepository Object + +| Field | Required | Description | +|-------|----------|-------------| +| `connectionArn` | Yes | ARN of the CodeConnection to use | +| `fullRepositoryId` | Yes | Repository identifier in `owner/repo` format | +| `sourceReference.type` | Yes | `BRANCH`, `TAG`, or `COMMIT` | +| `sourceReference.value` | Yes | Branch name, tag name, or commit SHA | +| `excludeFilePatterns` | No | Glob patterns for files to exclude | + +### Additional Parameters for Git Workflows + +| Parameter | Description | +|-----------|-------------| +| `parameter_template_path` | Path to parameter template JSON within the repository | +| `readme_path` | Path to README markdown file within the repository | + +## Common Scenarios + +### Creating from nf-core Pipeline + +``` +# User: "Create a workflow from https://github.com/nf-core/rnaseq" + +1. ListCodeConnections(provider_type_filter="GitHub") +2. If no connection: CreateCodeConnection(connection_name="github", provider_type="GitHub") +3. 
CreateAHOWorkflow( + name="nf-core-rnaseq", + definition_repository={ + "connectionArn": "...", + "fullRepositoryId": "nf-core/rnaseq", + "sourceReference": {"type": "TAG", "value": "3.14.0"} + }, + container_registry_map={...} # Use ECR pull-through cache mappings + ) +``` + +### Creating from Specific Branch + +``` +# User: "Create workflow from my-org/my-workflow on the develop branch" + +CreateAHOWorkflow( + name="my-workflow-dev", + definition_repository={ + "connectionArn": "...", + "fullRepositoryId": "my-org/my-workflow", + "sourceReference": {"type": "BRANCH", "value": "develop"} + } +) +``` + +### Creating from Specific Commit + +``` +# User: "Create workflow from commit abc123 in owner/repo" + +CreateAHOWorkflow( + name="my-workflow", + definition_repository={ + "connectionArn": "...", + "fullRepositoryId": "owner/repo", + "sourceReference": {"type": "COMMIT", "value": "abc123def456"} + } +) +``` + +## Error Handling + +### Connection Not Available +If `GetCodeConnection` returns status `PENDING`: +- Remind user to complete OAuth authorization in AWS Console +- Provide the console URL from the connection creation response +- Wait for user confirmation before retrying + +### Repository Access Denied +If workflow creation fails with access errors: +- Verify the connection has appropriate repository permissions +- For private repositories, ensure OAuth scope includes repo access +- Check that `fullRepositoryId` is correct + +### Workflow Definition Not Found +If HealthOmics cannot find the workflow definition: +- Verify the repository contains a valid workflow file (main.wdl, main.nf, main.cwl) +- Check `excludeFilePatterns` isn't excluding the main workflow file +- Use `path_to_main` parameter if the main file isn't at the repository root + +## Required IAM Permissions + +Users need these permissions for Git integration: +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "codeconnections:CreateConnection", + 
"codeconnections:GetConnection", + "codeconnections:ListConnections", + "codeconnections:UseConnection" + ], + "Resource": "*" + } + ] +} +``` + +## References + +- [AWS HealthOmics Git Integration Documentation](https://docs.aws.amazon.com/omics/latest/dev/workflows-git-integration.html) +- [CreateWorkflow API Reference](https://docs.aws.amazon.com/omics/latest/api/API_CreateWorkflow.html) +- [ECR Pull Through Cache Guide](./ecr-pull-through-cache.md) From 4d89854aa4433b54449721c8e7fe2f5a6c6caac4 Mon Sep 17 00:00:00 2001 From: Mark Schreiber Date: Wed, 4 Feb 2026 11:12:38 -0500 Subject: [PATCH 5/5] feat: updates ptc steering docs to reference new MCP tools and provides example mappings --- .../steering/ecr-pull-through-cache.md | 506 ++++++++---------- 1 file changed, 229 insertions(+), 277 deletions(-) diff --git a/aws-healthomics/steering/ecr-pull-through-cache.md b/aws-healthomics/steering/ecr-pull-through-cache.md index d78d818..dbab4d7 100644 --- a/aws-healthomics/steering/ecr-pull-through-cache.md +++ b/aws-healthomics/steering/ecr-pull-through-cache.md @@ -1,370 +1,322 @@ # AWS HealthOmics ECR Pullthrough and Container Registry Maps ## Overview -Container Registry Maps are a feature in AWS HealthOmics that enable workflows to use ECR pull through caches to access public container registries without manually replicating containers into private ECR repositories. This feature provides automatic mapping between upstream registries (like Docker Hub and Quay.io) and your private ECR repositories. + +HealthOmics requires that containers used for workflow tasks come from PRIVATE ECR repositories with permissions correctly set to allow the HealthOmics service to access these containers. In many public workflow definitions it is common to use container images from public sources like Dockerhub, ECR Public Gallery, Quay.io, Seqera Wave etc. 
To use these containers, either clone them to an ECR private registry and update their URIs in the workflow definition, or use ECR pull-through caches to clone the images automatically, together with container registry maps that translate the public image URIs in the workflow definition to the corresponding ECR private URIs. + ECR Pull through cache setup and container registry mapping are two distinct but related concepts. If you setup pull through caches your workflows will automatically pull containers from the upstream registries and cache them in your ECR repositories. You will reference these -containers using ECR private URIs in your workflow definitions. If you also add a container registry map then you can use the original -public registry URIs in your workflow definitions and HealthOmics will automatically map them to your ECR private URIs. +containers using ECR private URIs in your workflow definitions. If you also add a container registry map (recommended) then you can use the original +public registry URIs in your workflow definitions and HealthOmics will automatically map them to your ECR private URIs. Container registry maps can be used to avoid changing all container URIs in the workflow. -*When creating new workflows container registry maps are usually not needed, ECR pull through caches are sufficient* -*When updating existing workflows container registry maps can be used to avoid changing all container URIs in the workflow* +**Key Concepts:** +- **ECR Pull-Through Cache**: Automatically pulls and caches containers from upstream registries. Workflows reference containers using ECR private URIs. +- **Container Registry Map**: Optional mapping that allows workflows to use original public registry URIs while HealthOmics automatically redirects to your ECR pull-through caches. 
+ +**When to use each approach:** +- **New workflows**: Use ECR pull-through caches with private ECR URIs (registry maps not needed) +- **Migrating existing workflows**: Use container registry maps to avoid changing container URIs in workflow definitions ## Prerequisites -- AWS CLI v2 installed and configured -- Appropriate IAM permissions for ECR and HealthOmics + +- AWS credentials configured with appropriate IAM permissions for ECR and HealthOmics +- For Docker Hub: A Docker Hub access token (obtain from https://docs.docker.com/security/access-tokens/) + +## MCP Tools Reference + +This guide uses the following MCP tools from the `aws-healthomics` server: + +| Tool | Purpose | +|------|---------| +| `ValidateHealthOmicsECRConfig` | Validate ECR configuration for HealthOmics | +| `ListPullThroughCacheRules` | List existing PTC rules with HealthOmics usability status | +| `CreatePullThroughCacheForHealthOmics` | Create PTC rules pre-configured for HealthOmics | +| `ListECRRepositories` | List available ECR private repositories with HealthOmics accessibility status | +| `CheckContainerAvailability` | Check if a container is available and accessible by HealthOmics | +| `CloneContainerToECR` | Clone containers to ECR with HealthOmics access permissions | +| `GrantHealthOmicsRepositoryAccess` | Grant HealthOmics access to an ECR repository | +| `CreateContainerRegistryMap` | Generate container registry maps for workflows | + ## Regions You should configure your ECR registry and HealthOmics workflows in the same region. If you will use multiple regions then repeat these steps in each region. -### Step 1: Create Secrets Manager Secrets (For Authenticated Registries) -Some registries such as Docker Hub or private registries will require authentication. To use pull through cache, you must create a secret in Secrets Manager that contains the credentials for the registry. In these examples the region us-east-1 is specified. You should change this as needed. 
+## Step 1: Validate Current ECR Configuration -To obtain a Docker Hub token refer to https://docs.docker.com/security/access-tokens/ +Before making changes, validate your current ECR setup: -**Docker Hub Secret** -``` +**Use `ValidateHealthOmicsECRConfig`** to check: +- Existing pull-through cache rules +- Registry permissions policy for HealthOmics +- Repository creation templates +- Required permissions for each prefix + +The tool returns a list of issues with specific remediation steps. + +--- + +## Step 2: Create Secrets for Authenticated Registries + +Some registries require authentication. Create secrets in AWS Secrets Manager before creating pull-through cache rules. + +**Docker Hub Secret** (required for Docker Hub): +```bash aws secretsmanager create-secret \ --name "ecr-pullthroughcache/docker-hub" \ --description "Docker Hub credentials for ECR pull through cache" \ - --secret-string '{ - "username": "your-docker-username", - "accessToken": "your-docker-access-token" - }' \ + --secret-string '{"username": "your-docker-username", "accessToken": "your-docker-access-token"}' \ --region us-east-1 ``` -**Quay.io Secret (if using private repositories, not required for public repositories)** -``` +**Quay.io Secret** (only for private Quay repositories): +```bash aws secretsmanager create-secret \ --name "ecr-pullthroughcache/quay" \ --description "Quay.io credentials for ECR pull through cache" \ - --secret-string '{ - "username": "your-quay-username", - "accessToken": "your-quay-access-token" - }' \ + --secret-string '{"username": "your-quay-username", "accessToken": "your-quay-access-token"}' \ --region us-east-1 ``` -## Step 2: Create ECR Pull Through Cache Rules +--- -**Docker Hub Pull Through Cache** -``` -aws ecr create-pull-through-cache-rule \ - --ecr-repository-prefix docker-hub \ - --upstream-registry-url registry-1.docker.io \ - --credential-arn arn:aws:secretsmanager:us-east-1:123456789012:secret:ecr-pullthroughcache/docker-hub-AbCdEf \ - --region 
us-east-1 -``` +## Step 3: Create Pull-Through Cache Rules -**Quay.io Pull Through Cache** -``` -aws ecr create-pull-through-cache-rule \ - --ecr-repository-prefix quay \ - --upstream-registry-url quay.io \ - --region us-east-1 -``` +**Use `ListPullThroughCacheRules`** to see existing pull-through cache rules and their HealthOmics usability status. If there is already a valid cache for the upstream registry you need then you can re-use it and don't need to create another one. -**ECR Public Pull Through Cache** -``` -aws ecr create-pull-through-cache-rule \ - --ecr-repository-prefix ecr-public \ - --upstream-registry-url public.ecr.aws \ - --region us-east-1 -``` +**Use `CreatePullThroughCacheForHealthOmics`** to create pull-through cache rules that are automatically configured for HealthOmics. This tool: -## Step 3: Configure Registry Permissions -Create a registry permissions policy to allow HealthOmics to use pull through cache: +1. Creates the pull-through cache rule +2. Updates the registry permissions policy to allow HealthOmics to create repositories and import images +3. 
Creates a repository creation template that grants HealthOmics image pull permissions -``` -{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "AllowPTCinRegPermissions", - "Effect": "Allow", - "Principal": { - "Service": "omics.amazonaws.com" - }, - "Action": [ - "ecr:CreateRepository", - "ecr:BatchImportUpstreamImage" - ], - "Resource": [ - "arn:aws:ecr:us-east-1:123456789012:repository/docker-hub/*", - "arn:aws:ecr:us-east-1:123456789012:repository/quay/*", - "arn:aws:ecr:us-east-1:123456789012:repository/ecr-public/*" - ] - } - ] -} -``` +**Parameters:** +- `upstream_registry`: Registry type (`docker-hub`, `quay`, or `ecr-public`) +- `ecr_repository_prefix`: Optional custom prefix (defaults to registry type name) +- `credential_arn`: Optional Secrets Manager ARN (required for `docker-hub`) -Apply the policy: +**Example configurations:** -``` -aws ecr put-registry-policy \ - --policy-text file://registry-policy.json \ - --region us-east-1 -``` +| Registry | upstream_registry | credential_arn | Notes | +|----------|------------------|----------------|-------| +| Docker Hub | `docker-hub` | Required | Use secret ARN from Step 2 | +| Quay.io | `quay` | Optional | Only needed for private repos | +| ECR Public | `ecr-public` | Not needed | Public access | -## Step 4: Create Repository Creation Templates +--- -**Docker Hub Template** +## Step 4: Verify Pull-Through Cache Configuration -``` -aws ecr create-repository-creation-template \ - --prefix docker-hub \ - --applied-for PULL_THROUGH_CACHE \ - --repository-policy '{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "PTCRepoCreationTemplate", - "Effect": "Allow", - "Principal": { - "Service": "omics.amazonaws.com" - }, - "Action": [ - "ecr:BatchGetImage", - "ecr:GetDownloadUrlForLayer" - ], - "Resource": "*" - } - ] - }' \ - --region us-east-1 -``` +**Use `ListPullThroughCacheRules`** to verify your pull-through cache rules are properly configured. 
The tool shows: -**Quay.io Template** +- All pull-through cache rules in the region +- HealthOmics usability status for each rule +- Missing permissions or configuration issues -``` -aws ecr create-repository-creation-template \ - --prefix quay \ - --applied-for PULL_THROUGH_CACHE \ - --repository-policy '{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "PTCRepoCreationTemplate", - "Effect": "Allow", - "Principal": { - "Service": "omics.amazonaws.com" - }, - "Action": [ - "ecr:BatchGetImage", - "ecr:GetDownloadUrlForLayer" - ], - "Resource": "*" - } - ] - }' \ - --region us-east-1 -``` +A rule is usable by HealthOmics when: +1. Registry permissions policy grants HealthOmics required permissions +2. Repository creation template exists for the prefix +3. Template grants HealthOmics image pull permissions -**ECR Public Template** +--- -``` -aws ecr create-repository-creation-template \ - --prefix ecr-public \ - --applied-for PULL_THROUGH_CACHE \ - --repository-policy '{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "PTCRepoCreationTemplate", - "Effect": "Allow", - "Principal": { - "Service": "omics.amazonaws.com" - }, - "Action": [ - "ecr:BatchGetImage", - "ecr:GetDownloadUrlForLayer" - ], - "Resource": "*" - } - ] - }' \ - --region us-east-1 -``` +## Step 5: Check Container Availability -## Step 5: Create Container Registry Maps +Before running workflows, verify containers are accessible: -*This step is optional and generally only required when migrating a workflow. Otherwise we recommend using full private ECR URIs in your workflows* +**Use `CheckContainerAvailability`** to check: +- Whether the container image exists in ECR +- Whether HealthOmics can access the image +- Pull-through cache status -Registry mappings can be used to map specific upstream registries to your private ECR repositories. In the example here, containers from Docker Hub, Quay.io and ECR Public used in a workflow will be mapped to your private ECR pull through caches. 
+**Parameters:** +- `repository_name`: ECR repository name (e.g., `docker-hub/library/ubuntu`) +- `image_tag`: Image tag (default: `latest`) +- `initiate_pull_through`: Set to `true` to trigger pull-through for missing images (recommended) -Create a registry map file (registry-map.json): +**Example repository names for pull-through caches:** +- Docker Hub official: `docker-hub/library/ubuntu` +- Docker Hub user: `docker-hub/broadinstitute/gatk` +- Quay.io: `quay/biocontainers/samtools` +- ECR Public: `ecr-public/lts/ubuntu` -``` +--- + +## Step 6: Clone Containers (Alternative Approach) + +If you need to copy containers without pull-through cache, or want to use containers from registries not supported by pull through cache such as Seqera Wave containers: + +**Use `CloneContainerToECR`** to: +1. Parse source image references (handles Docker Hub shorthand) +2. Use existing pull-through cache rules when available +3. Grant HealthOmics access permissions automatically +4. Return the ECR URI and digest for workflow use + +**Supported image reference formats:** +- `ubuntu:latest` → Docker Hub official image +- `myorg/myimage:v1` → Docker Hub user image +- `quay.io/biocontainers/samtools:1.17` → Quay.io image +- `public.ecr.aws/lts/ubuntu:22.04` → ECR Public image + +Image URIs with hashes (e.g., `sha256:...`) are also supported by the tool. 
+ +--- + +## Step 7: Grant HealthOmics Access to Existing Repositories + +To verify what repositories already exist and their accessibility to HealthOmics: + +**Use `ListECRRepositories`** to: +- List all ECR repositories in the region +- Check HealthOmics accessibility status for each repository +- Filter to show only HealthOmics-accessible repositories + +For repositories not created through pull-through cache: + +**Use `GrantHealthOmicsRepositoryAccess`** to add the required permissions to the repository: +- `ecr:BatchGetImage` +- `ecr:GetDownloadUrlForLayer` + +The tool preserves existing repository policies while adding HealthOmics permissions. + + +**Parameters for `ListECRRepositories`:** +- `filter_healthomics_accessible`: Set to `true` to only show accessible repositories + +--- + + +## Step 8: Create Container Registry Maps (Optional) + +Container registry maps are useful when migrating existing workflows that reference public container URIs. + +**Use `CreateContainerRegistryMap`** to generate a registry map that: +1. Discovers all HealthOmics-usable pull-through cache rules +2. Creates registry mappings for each discovered cache +3. Supports additional custom mappings and image overrides + +**Parameters:** +- `include_pull_through_caches`: Auto-discover and include PTC rules (default: `true`) +- `additional_registry_mappings`: Custom registry mappings +- `image_mappings`: Specific image overrides (take precedence over registry mappings) + +**Using the generated map:** +- Pass directly to `CreateAHOWorkflow` via `container_registry_map` parameter +- Or upload to S3 and reference via `container_registry_map_uri` parameter + +**Example container registry map for common upstream registries** + +The `ecrRepositoryPrefix` values in the registry map should match the `ecr_repository_prefix` values used when creating the pull-through cache rules. 
+ +```json { "registryMappings": [ { - "upstreamRegistryUrl": "registry-1.docker.io", - "ecrRepositoryPrefix": "docker-hub" + "upstreamRegistryUrl": "registry-1.docker.io", + "ecrRepositoryPrefix": "docker-hub" }, { - "upstreamRegistryUrl": "quay.io", - "ecrRepositoryPrefix": "quay" + "upstreamRegistryUrl": "quay.io", + "ecrRepositoryPrefix": "quay" }, { - "upstreamRegistryUrl": "public.ecr.aws", - "ecrRepositoryPrefix": "ecr-public" + "upstreamRegistryUrl": "public.ecr.aws", + "ecrRepositoryPrefix": "ecr-public" } - ] -} + ] + } ``` -**Image Mappings Example** -Image mappings can be used to map specific containers to your private ECR repositories. These mappings will take precedence over registryMappings if both are provided. - -Create an image map file (image-map.json) for specific container overrides: - -``` +**Example image mappings for specific overrides:** +```json { "imageMappings": [ { - "sourceImage": "broadinstitute/gatk:4.6.0.2", - "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/docker-hub/broadinstitute/gatk:latest" + "sourceImage": "ubuntu", + "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/docker-hub/library/ubuntu:20.04" }, { - "sourceImage": "quay.io/biocontainers/samtools:1.17--h00cdaf9_0", - "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/quay/biocontainers/samtools:1.17--h00cdaf9_0" + "sourceImage": "quay.io/biocontainers/bwa:0.7.17", + "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/quay/biocontainers/bwa:0.7.17" } ] } ``` -**Combined Registry and Image Map** +**Example combined mapping** -Create a complete map file (container-registry-map.json): +Registry mappings and image mappings can be combined. Image mappings will override any matching registry mapping. -``` +In the following example, Docker Hub images will use the registry mapping except for the `ubuntu` image which will use the custom +mapping. 
+ +```json { "registryMappings": [ { - "upstreamRegistryUrl": "registry-1.docker.io", - "ecrRepositoryPrefix": "docker-hub" - }, - { - "upstreamRegistryUrl": "quay.io", - "ecrRepositoryPrefix": "quay" + "upstreamRegistryUrl": "registry-1.docker.io", + "ecrRepositoryPrefix": "docker-hub" } - ], + ], + "imageMappings": [ { "sourceImage": "ubuntu", - "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/docker-hub/library/ubuntu:20.04" - }, - { - "sourceImage": "quay.io/biocontainers/bwa:0.7.17--hed695b0_7", - "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/quay/biocontainers/bwa:0.7.17--hed695b0_7" + "destinationImage": "123456789012.dkr.ecr.us-east-1.amazonaws.com/myrepo/library/ubuntu:20.04" } ] -} +} ``` -Container regitry map files should be loaded to S3 and referenced when creating a workflow using the CreateAHOWorlflow tool. -## Step 6: Configure HealthOmics Service Role -The HealthOmics service role used during workflow runs must have ECR permissions to pull container images from your pull through cache repositories. +--- -**Create Trust Policy File** +## Step 9: Configure HealthOmics Service Role -``` -cat > trust-policy.json << 'EOF' +The HealthOmics service role used during workflow runs must have ECR permissions. Add these permissions to your service role policy: + +```json { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": "omics.amazonaws.com" - }, - "Action": "sts:AssumeRole" - } + "Effect": "Allow", + "Action": [ + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchCheckLayerAvailability" + ], + "Resource": [ + "arn:aws:ecr:us-east-1:123456789012:repository/docker-hub/*", + "arn:aws:ecr:us-east-1:123456789012:repository/quay/*", + "arn:aws:ecr:us-east-1:123456789012:repository/ecr-public/*" ] } -EOF ``` -**Create Service Role Policy File** +Adjust the repository ARN patterns to match your pull-through cache prefixes. 
-``` -cat > service-role-policy.json << 'EOF' -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:GetObject", - "s3:PutObject" - ], - "Resource": [ - "arn:aws:s3:::your-workflow-bucket/*" - ] - }, - { - "Effect": "Allow", - "Action": [ - "s3:ListBucket" - ], - "Resource": [ - "arn:aws:s3:::your-workflow-bucket" - ] - }, - { - "Effect": "Allow", - "Action": [ - "logs:DescribeLogStreams", - "logs:CreateLogStream", - "logs:PutLogEvents", - "logs:CreateLogGroup" - ], - "Resource": [ - "arn:aws:logs:us-east-1:123456789012:log-group:/aws/omics/WorkflowLog*" - ] - }, - { - "Effect": "Allow", - "Action": [ - "ecr:BatchGetImage", - "ecr:GetDownloadUrlForLayer", - "ecr:BatchCheckLayerAvailability" - ], - "Resource": [ - "arn:aws:ecr:us-east-1:123456789012:repository/docker-hub/*", - "arn:aws:ecr:us-east-1:123456789012:repository/quay/*", - "arn:aws:ecr:us-east-1:123456789012:repository/ecr-public/*" - ] - } - ] -} -EOF -``` +--- -**Create the Service Role** +## Quick Reference: Common Workflows -``` -aws iam create-role \ - --role-name HealthOmicsWorkflowRole \ - --assume-role-policy-document file://trust-policy.json \ - --description "Service role for HealthOmics workflows with container registry mappings" -``` +### New Workflow with Pull-Through Cache -**Create and Attach the Policy** +1. `CreatePullThroughCacheForHealthOmics` - Create PTC rule for needed registries +2. `ValidateHealthOmicsECRConfig` - Verify configuration +3. 
Use ECR URIs in workflow (e.g., `123456789012.dkr.ecr.us-east-1.amazonaws.com/docker-hub/library/ubuntu:latest`) or use container registry mapping -``` -aws iam create-policy \ - --policy-name HealthOmicsWorkflowPolicy \ - --policy-document file://service-role-policy.json \ - --description "Policy for HealthOmics workflows with ECR pull through cache access" -``` +### Migrate Existing Workflow -``` -aws iam attach-role-policy \ - --role-name HealthOmicsWorkflowRole \ - --policy-arn arn:aws:iam::123456789012:policy/HealthOmicsWorkflowPolicy -``` +1. `CreatePullThroughCacheForHealthOmics` - Create required PTC rules +2. `CreateContainerRegistryMap` - Generate registry map +3. `CloneContainerToECR` - Clone any containers from registries not supported by pull-through caches +4. `CreateAHOWorkflow` with `container_registry_map` parameter + +### Verify Container Access + +1. `CheckContainerAvailability` with `initiate_pull_through: true` +2. `ListECRRepositories` with `filter_healthomics_accessible: true` + +### Troubleshoot Access Issues + +1. `ValidateHealthOmicsECRConfig` - Check for configuration issues +2. `ListPullThroughCacheRules` - Verify PTC rule status +3. `GrantHealthOmicsRepositoryAccess` - Fix repository permissions \ No newline at end of file