From 3f059bd740c9f85e62d79c72a5c68a1d08284a32 Mon Sep 17 00:00:00 2001 From: Girish Raman Date: Fri, 27 Mar 2026 15:14:00 -0700 Subject: [PATCH 01/15] feat(sagemaker-ai): Add model customization and hyperpod skills --- .../sagemaker-ai/.claude-plugin/plugin.json | 22 + plugins/sagemaker-ai/.mcp.json | 12 + .../skills/dataset-evaluation/SKILL.md | 54 + .../references/strategy_data_requirements.md | 173 ++ .../scripts/format_detector.py | 678 ++++++++ .../skills/dataset-transformation/SKILL.md | 220 +++ .../references/dataset_transformation_code.md | 135 ++ .../references/notebook_structure.md | 46 + .../references/notebook_writing_guide.md | 99 ++ .../references/sagemaker_dataset_formats.md | 146 ++ .../scripts/transformation_tools.py | 146 ++ .../skills/directory-management/SKILL.md | 32 + .../skills/finetuning-setup/SKILL.md | 73 + .../finetune_technique_selection_guide.md | 37 + .../scripts/get_model_names.py | 43 + .../finetuning-setup/scripts/get_recipes.py | 30 + .../sagemaker-ai/skills/finetuning/SKILL.md | 128 ++ .../finetuning/references/dpo_example.md | 159 ++ .../finetuning/references/eula_links.md | 19 + .../finetuning/references/rlvr_example.md | 169 ++ .../references/rlvr_reward_function.md | 43 + .../finetuning/references/sft_example.md | 159 ++ .../rlvr_reward_function_source_template.py | 250 +++ .../skills/hyperpod-issue-report/SKILL.md | 74 + .../references/collection-details.md | 105 ++ .../references/troubleshooting.md | 22 + .../scripts/hyperpod_issue_report.py | 1430 +++++++++++++++++ .../scripts/requirements.txt | 3 + .../sagemaker-ai/skills/hyperpod-ssm/SKILL.md | 96 ++ .../references/troubleshooting.md | 61 + .../hyperpod-ssm/scripts/get-cluster-info.sh | 20 + .../skills/hyperpod-ssm/scripts/list-nodes.sh | 37 + .../skills/hyperpod-ssm/scripts/ssm-exec.sh | 85 + .../skills/hyperpod-version-checker/SKILL.md | 74 + .../scripts/hyperpod_check_versions.sh | 545 +++++++ .../skills/model-deployment/SKILL.md | 122 ++ 
.../references/deploy-nova-bedrock.md | 119 ++ .../references/deploy-nova-sagemaker.md | 142 ++ .../references/deploy-oss-bedrock.md | 138 ++ .../references/deploy-oss-sagemaker.md | 149 ++ .../references/model-licenses.md | 23 + .../scripts/deploy-nova-bedrock.py | 51 + .../scripts/deploy-nova-sagemaker.py | 54 + .../scripts/deploy-oss-bedrock.py | 110 ++ .../scripts/deploy-oss-sagemaker.py | 55 + .../skills/model-evaluation/SKILL.md | 240 +++ .../references/builtin-metrics.md | 35 + .../references/llmaaj-builtin-evaluation.md | 23 + .../references/llmaaj-custom-evaluation.md | 63 + .../references/notebook_structure.md | 63 + .../references/supported-judge-models.md | 35 + .../scripts/notebook_cells.py | 83 + .../scripts/validate_custom_metrics.py | 124 ++ plugins/sagemaker-ai/skills/planning/SKILL.md | 121 ++ .../references/model-customization-plan.md | 15 + .../references/skill-routing-constraints.md | 32 + .../skills/use-case-specification/SKILL.md | 76 + 57 files changed, 7298 insertions(+) create mode 100644 plugins/sagemaker-ai/.claude-plugin/plugin.json create mode 100644 plugins/sagemaker-ai/.mcp.json create mode 100644 plugins/sagemaker-ai/skills/dataset-evaluation/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/dataset-evaluation/references/strategy_data_requirements.md create mode 100644 plugins/sagemaker-ai/skills/dataset-evaluation/scripts/format_detector.py create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/references/dataset_transformation_code.md create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_structure.md create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_writing_guide.md create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/references/sagemaker_dataset_formats.md create mode 100644 
plugins/sagemaker-ai/skills/dataset-transformation/scripts/transformation_tools.py create mode 100644 plugins/sagemaker-ai/skills/directory-management/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/finetuning-setup/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/finetuning-setup/references/finetune_technique_selection_guide.md create mode 100644 plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_model_names.py create mode 100644 plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_recipes.py create mode 100644 plugins/sagemaker-ai/skills/finetuning/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/dpo_example.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/eula_links.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/rlvr_example.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/rlvr_reward_function.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/sft_example.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/templates/rlvr_reward_function_source_template.py create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/references/collection-details.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/hyperpod_issue_report.py create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/requirements.txt create mode 100755 plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-ssm/references/troubleshooting.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/get-cluster-info.sh create mode 100755 plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/list-nodes.sh create mode 100755 
plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh create mode 100755 plugins/sagemaker-ai/skills/hyperpod-version-checker/SKILL.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh create mode 100644 plugins/sagemaker-ai/skills/model-deployment/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/deploy-nova-bedrock.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/deploy-nova-sagemaker.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/deploy-oss-bedrock.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/deploy-oss-sagemaker.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/model-licenses.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/scripts/deploy-nova-bedrock.py create mode 100644 plugins/sagemaker-ai/skills/model-deployment/scripts/deploy-nova-sagemaker.py create mode 100644 plugins/sagemaker-ai/skills/model-deployment/scripts/deploy-oss-bedrock.py create mode 100644 plugins/sagemaker-ai/skills/model-deployment/scripts/deploy-oss-sagemaker.py create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/builtin-metrics.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/llmaaj-builtin-evaluation.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/llmaaj-custom-evaluation.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/notebook_structure.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/supported-judge-models.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/scripts/notebook_cells.py create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/scripts/validate_custom_metrics.py create mode 100644 
plugins/sagemaker-ai/skills/planning/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/planning/references/model-customization-plan.md create mode 100644 plugins/sagemaker-ai/skills/planning/references/skill-routing-constraints.md create mode 100644 plugins/sagemaker-ai/skills/use-case-specification/SKILL.md diff --git a/plugins/sagemaker-ai/.claude-plugin/plugin.json b/plugins/sagemaker-ai/.claude-plugin/plugin.json new file mode 100644 index 00000000..ee1b60a1 --- /dev/null +++ b/plugins/sagemaker-ai/.claude-plugin/plugin.json @@ -0,0 +1,22 @@ +{ + "author": { + "name": "Amazon Web Services" + }, + "description": "Equip AI coding agents with skills to build, train, and deploy ML and generative AI workloads on Amazon SageMaker AI.", + "homepage": "https://github.com/awslabs/agent-plugins", + "keywords": [ + "sagemaker", + "machine-learning", + "generative-ai", + "fine-tuning", + "training", + "deployment", + "inference", + "mlops", + "aws" + ], + "license": "Apache-2.0", + "name": "sagemaker-ai", + "repository": "https://github.com/awslabs/agent-plugins", + "version": "1.0.0" +} diff --git a/plugins/sagemaker-ai/.mcp.json b/plugins/sagemaker-ai/.mcp.json new file mode 100644 index 00000000..573fb77e --- /dev/null +++ b/plugins/sagemaker-ai/.mcp.json @@ -0,0 +1,12 @@ +{ + "mcpServers": { + "aws-mcp": { + "command": "uvx", + "args": [ + "mcp-proxy-for-aws@latest", + "https://aws-mcp.us-east-1.api.aws/mcp" + ], + "disabled": false + } + } +} diff --git a/plugins/sagemaker-ai/skills/dataset-evaluation/SKILL.md b/plugins/sagemaker-ai/skills/dataset-evaluation/SKILL.md new file mode 100644 index 00000000..81d2d956 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-evaluation/SKILL.md @@ -0,0 +1,54 @@ +--- +name: dataset-evaluation +description: Validates dataset formatting and quality for SageMaker model fine-tuning (SFT, DPO, or RLVR). 
Use when the user says "is my dataset okay", "evaluate my data", "check my training data", "I have my own data", or before starting any fine-tuning job. Detects file format, checks schema compliance against the selected model and technique, and reports whether the data is ready for training or evaluation. +--- + +# Workflow Instruction + +Follow the workflow shown below. Locate the dataset, check the file type, and resolve any issues with missing files or wrong file types. Determine the fine-tuning model and fine-tuning strategy. Run scripts/format_detector.py to evaluate whether the file is formatted correctly for the currently selected model and strategy. Summarize the results: is the dataset ready for fine-tuning? + +## Workflow + +1. **Locate Dataset**: + - The full path may be a local file path, or an S3 URI + - Resolve the full path to the dataset file, make sure read permissions are available, and help the user if the file is not found + +2. **Determine strategy and model**: + - File formatting depends on the currently selected fine-tuning strategy and fine-tuning base model. + - If the strategy and model are already known from the conversation context (e.g., selected via the finetuning-setup skill), use them. + - If not available in context, activate the finetuning-setup skill to determine them before proceeding. + +3. **Check File Formatting**: Run the tool format_detector.py to make sure the file conforms to formatting requirements. + - Send the full path directly to the format_detector script as an argument + - Do not send the model and strategy as arguments + - Do not download data from S3 + - Do not make local copies of data + +4. 
**Summarize Results**: Tell the user if their data is ready + - Examine the output of format_detector and compare to the known strategy and model + - **Important: training datasets and evaluation datasets have different format requirements.** + - **Training datasets** must match the fine-tuning strategy format (SFT, DPO, RLVR) per `references/strategy_data_requirements.md` + - **Evaluation datasets** (for model evaluation) must match one of the [SageMaker evaluation dataset formats](https://docs.aws.amazon.com/sagemaker/latest/dg/model-customize-evaluation-dataset-formats.html). + - Report back to the user if their current dataset is valid for its intended purpose + - Warn the user if their dataset is valid, but for a different strategy or model + - Warn the user if their dataset is not valid for any strategy/model pair + +## Messages to the User + +- Introduction: "This skill checks the structure of your dataset for model fine-tuning." +- File types: This skill applies to files that are formatted according to the [Amazon SageMaker AI Developer Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/autopilot-llms-finetuning-data-format.html#autopilot-llms-finetuning-dataset-format) + +# Resources + +- scripts/format_detector.py is a self-contained format validation script that can be run independently +- finetuning-setup skill should have already determined the fine-tuning strategy and base model +- references/strategy_data_requirements.md contains data format requirements per strategy + +## Script Details + +- scripts/format_detector.py is a self-contained format validation script that can be run independently: + +```bash +# With the file path argument identified in workflow step 1 +python scripts/format_detector.py local_path/to/dataset +``` diff --git a/plugins/sagemaker-ai/skills/dataset-evaluation/references/strategy_data_requirements.md b/plugins/sagemaker-ai/skills/dataset-evaluation/references/strategy_data_requirements.md new file mode 100644 index 
00000000..7821b1ac --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-evaluation/references/strategy_data_requirements.md @@ -0,0 +1,173 @@ +# Finetuning Strategy Data Requirements + +**Critical** Nova models have a different set of formats than open weights models. Make sure you refer to the right section based on the user's base model. + +## Open Weights Models Data Format by Strategy (Llama, Qwen, GPT-OSS, etc.) + +### SFT (Supervised Fine-Tuning) + +**Required format:** + +```jsonl +{ + "prompt": "", + "completion": "" +} +``` + +**What it needs:** + +- Input-output pairs +- Single "correct" response per input +- Consistent quality across examples + +### DPO (Direct Preference Optimization) + +**Required format:** + +```jsonl +{ + "prompt": "", + "chosen": "", + "rejected": "" +} +``` + +**What it needs:** + +- Input with two responses: preferred (chosen) and dispreferred (rejected) +- Clear preference signal between responses +- Both responses should be plausible but one is better +- Avoiding unintentional length bias + +### RLVR (Reinforcement Learning from Verifiable Rewards) + +**Required format:** + +```jsonl +{ + "data_source": "", + "prompt": [ + { + "content": "", + "role": "" + } + ], + "ability": "", + "reward_model": { + "ground_truth": "", + "style": "" + } +} +``` + +**What it needs:** + +- user prompt +- Ground truth responses in `reward_model.ground_truth` field (leave empty if user data does not have responses) + +**How it works:** + +1. Model generates response for input +2. Lambda receives full user prompt + reward model fields +3. Lambda computes reward (uses ground_truth if included in verification logic) +4. 
Model learns to maximize rewards + +## Nova Models Data Format by Strategy + +### SFT (Supervised Fine-Tuning) + +```jsonl +{ + "schemaVersion": "bedrock-conversation-2024", + "system": [ + { + "text": "" + } + ], + "messages": [ + { + "role": "user", + "content": [ + { + "text": "" + } + ] + }, + { + "role": "assistant", + "content": [ + { + "text": "" + } + ] + } + ] +} +``` + +### DPO (Direct Preference Optimization) + +The format is the same as SFT for the first N-1 turns. The final assistant turn uses `candidates` with `preferenceLabel` instead of regular `content`. + +```jsonl +{ + "schemaVersion": "bedrock-conversation-2024", + "system": [ + { + "text": "" + } + ], + "messages": [ + { + "role": "user", + "content": [ + { + "text": "" + } + ] + }, + { + "role": "assistant", + "candidates": [ + { + "content": [ + { + "text": "" + } + ], + "preferenceLabel": "preferred" + }, + { + "content": [ + { + "text": "" + } + ], + "preferenceLabel": "non-preferred" + } + ] + } + ] +} +``` + +### RLVR + +```jsonl +{ + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + } + ], + "reference_answer": { + "solution": "49" + } +} +``` diff --git a/plugins/sagemaker-ai/skills/dataset-evaluation/scripts/format_detector.py b/plugins/sagemaker-ai/skills/dataset-evaluation/scripts/format_detector.py new file mode 100644 index 00000000..a9ed1fb6 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-evaluation/scripts/format_detector.py @@ -0,0 +1,678 @@ +"""Format detection for S3 JSONL files. + +This module provides functionality to detect and validate JSONL file formats +stored in S3. It samples the first 1MB of a file to determine the format type +across 11 supported formats: Nova SFT, Nova DPO, Nova RLVR, GPT-OSS SFT, +GPT-OSS DPO, Open Weights SFT, Open Weights SFT Conv, Open Weights DPO, +Verl, Verl Legacy, and SageMaker Eval. 
+ +Usage: + result = detect_format("s3://my-bucket/data.jsonl") + if result.is_valid: + print(f"Format: {result.format_type}") +""" + +from dataclasses import dataclass +from enum import Enum +import boto3 +import json +import logging + +logger = logging.getLogger(__name__) + +__all__ = ["FormatType", "ConfidenceLevel", "ValidationError", "FormatDetectionResult", "detect_format"] + + +class FormatType(Enum): + """Supported JSONL format types.""" + NOVA_SFT = "nova_sft" + NOVA_DPO = "nova_dpo" + NOVA_RLVR = "nova_rlvr" + GPT_OSS_SFT = "gpt_oss_sft" + GPT_OSS_DPO = "gpt_oss_dpo" + OPEN_WEIGHTS_SFT = "open_weights_sft" + OPEN_WEIGHTS_SFT_CONV = "open_weights_sft_conv" + OPEN_WEIGHTS_DPO = "open_weights_dpo" + VERL = "verl" + VERL_LEGACY = "verl_legacy" + SAGEMAKER_EVAL = "sagemaker_eval" + UNKNOWN = "unknown" + + +class ConfidenceLevel(Enum): + """Confidence level for format detection results.""" + HIGH = "high" + LOW = "low" + NONE = "none" + + +@dataclass +class ValidationError: + """Represents a validation error found during format detection.""" + line_number: int + error_type: str + message: str + + +@dataclass +class FormatDetectionResult: + """Result of format detection operation.""" + format_type: FormatType + is_valid: bool + lines_sampled: int + errors: list[ValidationError] + confidence: ConfidenceLevel + + +def _sample_local_file(file_path: str, sample_size: int) -> list[str]: + """Sample lines from local JSONL file. 
+ + Args: + file_path: Path to local file + sample_size: Maximum bytes to read + + Returns: + List of lines from file + + Raises: + FileNotFoundError: If file doesn't exist + IOError: If file can't be read + """ + logger.debug("Sampling local file: %s", file_path) + with open(file_path, "rb") as f: + data = f.read(sample_size) + + if not data: + return [] + + text = data.decode("utf-8") + + last_newline_idx = text.rfind("\n") + if last_newline_idx == -1: + return [] + + complete_text = text[:last_newline_idx + 1] + lines = [line for line in complete_text.split("\n") if line] + + return lines + + +def _sample_s3_file(s3_uri: str, sample_size_bytes: int, s3_client=None) -> list[str]: + """Sample the first N bytes of an S3 file and return complete lines. + + Reads the first sample_size_bytes from an S3 file using a Range request, + then truncates to the last complete newline to avoid partial lines. + + Args: + s3_uri: S3 URI in format "s3://bucket/key" + sample_size_bytes: Number of bytes to sample (default 1MB) + s3_client: Optional boto3 S3 client to reuse + + Returns: + List of complete JSONL lines (strings without trailing newlines) + + Raises: + ValueError: If S3 URI is invalid (missing "s3://", bucket, or key) + botocore.exceptions.ClientError: If S3 access fails + """ + logger.debug("Sampling S3 file: %s (%d bytes)", s3_uri, sample_size_bytes) + # Parse S3 URI + if not s3_uri.startswith("s3://"): + raise ValueError(f"Invalid S3 URI: must start with 's3://' (got: {s3_uri})") + + uri_without_prefix = s3_uri[5:] # Remove "s3://" + parts = uri_without_prefix.split("/", 1) + + if len(parts) != 2 or not parts[0] or not parts[1]: + raise ValueError(f"Invalid S3 URI: must contain bucket and key (got: {s3_uri})") + + bucket, key = parts + + # Read first sample_size_bytes using Range header + client = s3_client or boto3.client("s3") + range_header = f"bytes=0-{sample_size_bytes - 1}" + + response = client.get_object(Bucket=bucket, Key=key, Range=range_header) + data = 
response["Body"].read() + + # Handle empty file + if not data: + return [] + + # Decode bytes to string + text = data.decode("utf-8") + + # Find last complete newline to avoid truncated lines + last_newline_idx = text.rfind("\n") + if last_newline_idx == -1: + # No newlines found - return empty list if file is all one line + # (we can't be sure it's complete) + return [] + + # Keep only complete lines (up to and including last newline) + complete_text = text[:last_newline_idx + 1] + + # Split on newlines and filter empty strings + lines = [line for line in complete_text.split("\n") if line] + + return lines + + +def _classify_nova_format(record: dict) -> FormatType: + """Classify Nova-specific format by checking last message structure. + + Args: + record: Parsed JSON record with messages field + + Returns: + FormatType.NOVA_DPO if last message has candidates field, + FormatType.NOVA_SFT if last message has standard content field, + FormatType.UNKNOWN otherwise + """ + messages = record.get("messages", []) + if not messages: + return FormatType.UNKNOWN + + last_message = messages[-1] + if "candidates" in last_message: + return FormatType.NOVA_DPO + elif "content" in last_message and last_message["content"]: + return FormatType.NOVA_SFT + else: + return FormatType.UNKNOWN + + +def _classify_messages_format(record: dict) -> FormatType: + """Distinguish Nova vs GPT-OSS/HF by inspecting content structure. + + Nova has nested content arrays (list of dicts with 'text' field), + GPT-OSS/HF has flat content strings. 
+ + Args: + record: Parsed JSON record with messages field + + Returns: + FormatType value for the detected format + """ + messages = record.get("messages") + + # Critical type checking: messages must be a list + if not isinstance(messages, list): + return FormatType.UNKNOWN + + if not messages: + return FormatType.UNKNOWN + + first_message = messages[0] + + # Check if content field exists + if "content" not in first_message: + return FormatType.UNKNOWN + + content = first_message["content"] + + # Nova: nested content arrays (list of dicts with 'text' field) + if isinstance(content, list): + return _classify_nova_format(record) + # GPT-OSS/HF: flat content strings + elif isinstance(content, str): + return FormatType.GPT_OSS_SFT + else: + return FormatType.UNKNOWN + + +def _classify_schema(samples: list[dict]) -> FormatType: + """Top-level classifier that checks for all 11 supported formats. + + Args: + samples: List of parsed JSON records + + Returns: + FormatType value for the detected format + """ + if not samples: + return FormatType.UNKNOWN + + first = samples[0] + fields = set(first.keys()) + + # SageMaker Evaluation: query + response + if "query" in fields and "response" in fields: + return FormatType.SAGEMAKER_EVAL + + # Verl/RLVR: prompt + (reward_model or extra_info), no completion + if "prompt" in fields and ("reward_model" in fields or "extra_info" in fields): + if "completion" not in fields: + if isinstance(first["prompt"], list): + return FormatType.VERL + return FormatType.VERL_LEGACY + + # Messages-based formats: Nova RLVR, Nova, GPT-OSS + if "messages" in fields: + if "reference_answer" in fields: + return FormatType.NOVA_RLVR + return _classify_messages_format(first) + + # DPO: prompt/chosen/rejected + if {"prompt", "chosen", "rejected"}.issubset(fields): + if isinstance(first["prompt"], list): + return FormatType.GPT_OSS_DPO + return FormatType.OPEN_WEIGHTS_DPO + + # SFT: prompt/completion + if {"prompt", "completion"}.issubset(fields): + if 
isinstance(first["prompt"], list): + return FormatType.OPEN_WEIGHTS_SFT_CONV + return FormatType.OPEN_WEIGHTS_SFT + + return FormatType.UNKNOWN + + +def _validate_nova_messages(messages: list, line_num: int, is_dpo: bool) -> list[ValidationError]: + """Validate Nova SFT/DPO message structure.""" + errors = [] + for msg_idx, msg in enumerate(messages): + if "role" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'role'" + )) + elif msg["role"] not in ["user", "assistant", "system"]: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Invalid role '{msg['role']}' in message {msg_idx}" + )) + if "content" not in msg and "candidates" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing 'content' or 'candidates'" + )) + if "content" in msg and not isinstance(msg["content"], list): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Nova format content must be list, got {type(msg['content']).__name__}" + )) + if is_dpo and "candidates" in msg: + for cand_idx, candidate in enumerate(msg["candidates"]): + if "preferenceLabel" not in candidate: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"DPO message {msg_idx} candidate {cand_idx} missing 'preferenceLabel'" + )) + elif candidate["preferenceLabel"] not in ["preferred", "non-preferred"]: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Invalid preferenceLabel '{candidate['preferenceLabel']}' in message {msg_idx} candidate {cand_idx}" + )) + return errors + + +def _validate_gpt_messages(messages: list, line_num: int) -> list[ValidationError]: + """Validate GPT-OSS SFT message structure.""" + errors = [] + for msg_idx, msg in 
enumerate(messages): + if "role" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'role'" + )) + elif msg["role"] not in ["user", "assistant", "system"]: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Invalid role '{msg['role']}' in message {msg_idx}" + )) + if "content" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'content'" + )) + elif not isinstance(msg["content"], str): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"GPT-OSS format content must be string, got {type(msg['content']).__name__}" + )) + return errors + + +def _validate_rlvr_messages(messages: list, line_num: int) -> list[ValidationError]: + """Validate Nova RLVR message structure.""" + errors = [] + for msg_idx, msg in enumerate(messages): + if "role" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'role'" + )) + elif msg["role"] not in ["user", "assistant", "system"]: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Invalid role '{msg['role']}' in message {msg_idx}" + )) + if "content" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'content'" + )) + elif not isinstance(msg["content"], str): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Nova RLVR content must be string, got {type(msg['content']).__name__}" + )) + return errors + + +def _validate_verl_prompt(record: dict, line_num: int) -> list[ValidationError]: + """Validate Verl prompt structure (list of role/content 
dicts).""" + errors = [] + if "prompt" not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message="Missing required field 'prompt'" + )) + elif not isinstance(record["prompt"], list): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Verl field 'prompt' must be list, got {type(record['prompt']).__name__}" + )) + else: + for msg_idx, msg in enumerate(record["prompt"]): + if not isinstance(msg, dict) or "role" not in msg or "content" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Prompt message {msg_idx} must have 'role' and 'content'" + )) + if "reward_model" not in record and "extra_info" not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message="Missing required field 'reward_model' or 'extra_info'" + )) + return errors + + +def _validate_verl_legacy_prompt(record: dict, line_num: int) -> list[ValidationError]: + """Validate Verl Legacy prompt structure (string) and extra_info.""" + errors = [] + if "prompt" not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message="Missing required field 'prompt'" + )) + elif not isinstance(record["prompt"], str): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Verl Legacy field 'prompt' must be string, got {type(record['prompt']).__name__}" + )) + if "extra_info" not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message="Missing required field 'extra_info'" + )) + return errors + + +# Schema-driven format validation specs. +# Each entry defines required_fields (field->type mapping) and an optional +# message_validator or record_validator for complex per-record checks. 
+# - message_validator: called with (messages_list, line_num) -> list[ValidationError] +# Used for formats whose top-level required field is "messages" (list). +# - record_validator: called with (record, line_num) -> list[ValidationError] +# Used for formats needing whole-record access (verl, verl_legacy). +FORMAT_SCHEMAS = { + FormatType.NOVA_SFT: { + "required_fields": {"messages": list}, + "message_validator": lambda msgs, ln: _validate_nova_messages(msgs, ln, is_dpo=False), + }, + FormatType.NOVA_DPO: { + "required_fields": {"messages": list}, + "message_validator": lambda msgs, ln: _validate_nova_messages(msgs, ln, is_dpo=True), + }, + FormatType.NOVA_RLVR: { + "required_fields": {"messages": list, "reference_answer": dict}, + "message_validator": _validate_rlvr_messages, + }, + FormatType.GPT_OSS_SFT: { + "required_fields": {"messages": list}, + "message_validator": _validate_gpt_messages, + }, + FormatType.GPT_OSS_DPO: { + "required_fields": {"prompt": list, "chosen": list, "rejected": list}, + "field_error_prefix": "GPT-OSS DPO", + }, + FormatType.OPEN_WEIGHTS_SFT: { + "required_fields": {"prompt": str, "completion": str}, + "field_error_prefix": "Open Weights SFT", + }, + FormatType.OPEN_WEIGHTS_SFT_CONV: { + "required_fields": {"prompt": list, "completion": list}, + "field_error_prefix": "Open Weights SFT Conv", + }, + FormatType.OPEN_WEIGHTS_DPO: { + "required_fields": {"prompt": str, "chosen": str, "rejected": str}, + "field_error_prefix": "Open Weights DPO", + }, + FormatType.SAGEMAKER_EVAL: { + "required_fields": {"query": str, "response": str}, + "field_error_prefix": "SageMaker Eval", + }, + FormatType.VERL: { + "required_fields": {}, + "record_validator": _validate_verl_prompt, + }, + FormatType.VERL_LEGACY: { + "required_fields": {}, + "record_validator": _validate_verl_legacy_prompt, + }, +} + + +def _validate_samples(samples: list[dict], expected_format: FormatType, line_numbers: list[int]) -> tuple[bool, list[ValidationError]]: + """Validate 
that all samples conform to the expected format schema. + + Args: + samples: List of parsed JSON records + expected_format: Expected FormatType enum value + line_numbers: 1-based line numbers corresponding to each sample + + Returns: + Tuple of (is_valid, errors) where errors is a list of ValidationError objects + """ + errors = [] + schema = FORMAT_SCHEMAS.get(expected_format) + + for record, line_num in zip(samples, line_numbers): + # Check schema consistency + detected_format = _classify_schema([record]) + if detected_format != expected_format: + errors.append(ValidationError( + line_number=line_num, + error_type="schema_mismatch", + message=f"Expected {expected_format.value} but found {detected_format.value}" + )) + continue + + if schema is None: + continue + + # Record-level validator (verl, verl_legacy) handles everything + if "record_validator" in schema: + errors.extend(schema["record_validator"](record, line_num)) + continue + + # Check required fields exist with correct types + required = schema["required_fields"] + prefix = schema.get("field_error_prefix", "") + skip_messages = False + for field, expected_type in required.items(): + if field not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Missing required field '{field}'" + )) + if field == "messages": + skip_messages = True + elif not isinstance(record[field], expected_type): + actual = type(record[field]).__name__ + if field == "messages": + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Field 'messages' must be a list" + )) + skip_messages = True + elif prefix: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"{prefix} field '{field}' must be {expected_type.__name__}, got {actual}" + )) + else: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Field '{field}' must be 
{expected_type.__name__}, got {actual}" + )) + + if skip_messages: + continue + + # Message-level validator + if "message_validator" in schema: + errors.extend(schema["message_validator"](record["messages"], line_num)) + + logger.debug("Validation found %d error(s)", len(errors)) + return (len(errors) == 0, errors) + + +def detect_format(file_path: str, sample_size_bytes: int = 1_048_576, s3_client=None) -> FormatDetectionResult: + """Detect the format of a JSONL file in S3 or on local disk. + + Samples the first sample_size_bytes of the file and analyzes the structure + to determine if it matches one of the 11 supported formats. + + Args: + file_path: S3 URI (s3://bucket/key) or local file path + sample_size_bytes: Number of bytes to sample (default 1MB = 1,048,576 bytes) + s3_client: Optional boto3 S3 client to reuse (ignored for local files) + + Returns: + FormatDetectionResult with format type, validation status, and any errors + """ + if file_path.startswith("s3://"): + lines = _sample_s3_file(file_path, sample_size_bytes, s3_client=s3_client) + else: + lines = _sample_local_file(file_path, sample_size_bytes) + + # Parse JSON lines and collect parse errors + parsed_records = [] + line_numbers = [] + errors = [] + + for line_num, line in enumerate(lines, start=1): + try: + parsed_records.append(json.loads(line)) + line_numbers.append(line_num) + except json.JSONDecodeError as e: + errors.append(ValidationError( + line_number=line_num, + error_type="parse_error", + message=f"Invalid JSON: {str(e)}" + )) + + # If no successfully parsed records, return UNKNOWN with parse errors + if not parsed_records: + confidence = ConfidenceLevel.NONE if errors else ConfidenceLevel.HIGH + return FormatDetectionResult( + format_type=FormatType.UNKNOWN, + is_valid=len(errors) == 0, + lines_sampled=len(lines), + errors=errors, + confidence=confidence + ) + + # Classify schema using first successfully parsed record + format_type = _classify_schema(parsed_records) + + # Validate all 
parsed records against detected format + is_valid, validation_errors = _validate_samples(parsed_records, format_type, line_numbers) + errors.extend(validation_errors) + + # Calculate confidence level + if len(errors) == 0: + confidence = ConfidenceLevel.HIGH + elif any(err.error_type == "parse_error" for err in errors): + confidence = ConfidenceLevel.NONE + else: + confidence = ConfidenceLevel.LOW + + logger.debug("Detected format: %s (valid=%s, confidence=%s)", format_type.value, is_valid, confidence.value) + + return FormatDetectionResult( + format_type=format_type, + is_valid=len(errors) == 0, + lines_sampled=len(lines), + errors=errors, + confidence=confidence + ) + + +if __name__ == "__main__": + import argparse + import sys + + parser = argparse.ArgumentParser(description="Detect and validate JSONL file formats") + parser.add_argument("file_path", help="S3 URI (s3://bucket/key) or local file path") + parser.add_argument("--sample-size", type=int, default=1_048_576, help="Bytes to sample (default: 1MB)") + parser.add_argument("--json", action="store_true", help="Output as JSON instead of human-readable") + args = parser.parse_args() + + try: + result = detect_format(args.file_path, args.sample_size) + + if args.json: + output = { + "format_type": result.format_type.value, + "is_valid": result.is_valid, + "confidence": result.confidence.value, + "lines_sampled": result.lines_sampled, + "errors": [ + {"line_number": e.line_number, "error_type": e.error_type, "message": e.message} + for e in result.errors + ], + } + print(json.dumps(output, indent=2)) + else: + print(f"Format: {result.format_type.value}") + print(f"Valid: {'✓' if result.is_valid else '✗'}") + print(f"Confidence: {result.confidence.name}") + print(f"Lines sampled: {result.lines_sampled}") + if result.errors: + print("Errors:") + for err in result.errors: + print(f" Line {err.line_number}: {err.message}") + + sys.exit(0 if result.is_valid else 1) + except (FileNotFoundError, IOError, ValueError) as 
e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/SKILL.md b/plugins/sagemaker-ai/skills/dataset-transformation/SKILL.md new file mode 100644 index 00000000..91fef48c --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/SKILL.md @@ -0,0 +1,220 @@ +--- +name: dataset-transformation +description: Generates a Jupyter notebook that transforms datasets between ML schemas for model training or evaluation. Use when the user says "transform", "convert", "reformat", "change the format", or when a dataset's schema needs to change to match the target format — always use this skill for format changes rather than writing inline transformation code. Supports OpenAI chat, SageMaker SFT/DPO/RLVR, HuggingFace preference, Bedrock Nova, VERL, and custom JSONL formats from local files or S3. +--- + +# Dataset Transformation Agent + +Transforms a data set provided by the user into their desired format. All transformation code is delivered as a Jupyter notebook. + +## When to Use + +- User needs to generate code for transforming datasets for SageMaker model training or model evaluation. +- A dataset requires processing, cleaning, or formatting before training or evaluation. +- Workflow requires a formal review and approval cycle before execution. + +## Principles + +1. **One thing at a time.** Each response advances exactly one decision. Never combine multiple questions or recommendations in a single turn. +2. **Confirm before proceeding.** Wait for the user to agree before moving to the next step. You are a guide, not a runaway train. +3. **Don't read files until you need them.** Only read reference files when you've reached the workflow step that requires them and the user has confirmed the direction. Never read ahead. +4. **No narration.** Don't explain what you're about to do or what you just did. Share outcomes and ask questions. Keep responses short and focused. +5. 
**No repetition.** If you said something before a tool call, don't repeat it after. Only share new information. +6. **Do not deviate from the Workflow.** The steps listed in the workflow should be followed exactly as described. Progress from Step 1 to Step 10 to complete the task. Do not deviate from the workflow! +7. **Always end with a question.** Whenever you pause for user input, acknowledgment, or feedback, your response must end with a question. Never leave the user with a statement and expect them to know they need to respond. +8. **Never overwrite existing files — append instead.** If a target notebook already exists, do NOT overwrite it. Append new cells to the existing file. Notify the user that the file already exists and that you will be appending to it. +9. **Avoid filename collisions.** When creating a new file, check if a file with the same name already exists. If it does, rename the new file by appending a numeric suffix (e.g., `transform_dataset_2.ipynb`) before writing. +10. **Default output format is JSONL.** Unless the user explicitly requests a different file format, the transformed dataset should be written as `.jsonl` (JSON Lines — one JSON object per line). + +## Known Dataset Formats Reference + +This skill supports two transformation purposes — **training data** and **evaluation data** — each with its own format resolution path. The purpose is determined in Step 1 of the workflow. + +### Training Data Formats + +When the transformation is for **model training**, resolve the target format using the reference file `../dataset-evaluation/references/strategy_data_requirements.md`. The required format depends on both the **model type** (Open Weights like Llama/Qwen vs Nova) and the **finetuning technique** (SFT, DPO, RLVR) — make sure to match on both dimensions. If either the model type or technique is not yet known, ask the user before resolving the format. 
+ +### Evaluation Data Formats + +When the transformation is for **model evaluation**, resolve the target format using this order: + +1. Try fetching the live documentation at https://docs.aws.amazon.com/sagemaker/latest/dg/model-customize-evaluation-dataset-formats.html to get the latest evaluation dataset schema definitions. +2. **If the fetch fails** (e.g., no internet access, VPC environment), fall back to the offline copy at `references/sagemaker_dataset_formats.md`. Inform the user that the format schemas are from an offline copy and may be outdated. + +Use whichever source you successfully access as the source of truth for the target format. Do not rely on memorized schemas. + +## Workflow + +### Step 1: Determine transformation purpose + +Your first response should determine whether this transformation is for **model training** or **model evaluation**. If the context already makes this clear (e.g., the user said "I need to prep my training data" or "I need to format my eval dataset"), confirm your understanding and move on. Otherwise, ask: + +> "Is this dataset transformation for model training or model evaluation? This helps me look up the right target format for you." + +- **Training** → format resolution will use the local training data requirements reference (model type + finetuning technique dependent). +- **Evaluation** → format resolution will use the live AWS documentation (with offline fallback). + +Remember this choice — it determines how the target format is resolved in Step 3. + +⏸ Wait for user. + +### Step 2: Set expectations + +Acknowledge the user's request and state what this skill can do: + +> "I can help you transform your dataset's format! Here's my plan: I will first need to understand the format of your dataset and the transformation requirements. Once I have that, I will generate a dataset transformation function that we can refine together. 
After the dataset transformation function is refined to your liking, I will perform the transformation task and upload it to your desired location! Does this sound good?"
+
+⏸ Wait for user.
+
+### Step 3: Understand the dataset transformation task
+
+For this step, you need to know: **what dataset format the user would like to transform their dataset from and what dataset format they would like to transform it into.**
+If you know this already, skip this step. If not, ask the user:
+
+> "What's the dataset format you would like to transform it into?"
+
+Resolve the target format based on the purpose determined in Step 1:
+
+- **If training data**: Ask the user for the finetuning technique (SFT, DPO, RLVR) and model type (Open Weights like Llama/Qwen vs Nova) if not already known. Then look up the required format from the "Training Data Formats" section in the Known Dataset Formats Reference above.
+- **If evaluation data**: If the user mentions a well-known format name (e.g., "OpenAI format", "SageMaker format"), fetch the schema from the live documentation as described in the "Evaluation Data Formats" section above. If a well-known format is fetched, confirm with the user:
+
+> "I've found a SageMaker dataset format: {sagemaker-dataset-format-name} with schema: {sagemaker-dataset-format-schema}. Is this what you were referring to?"
+
+If the user describes a custom format not listed in the reference doc, ask them to provide a sample record of the desired output format.
+
+⏸ Wait for user.
+
+### Step 4: Get the dataset from the user
+
+For this step, you need: **the location of the user's dataset**.
+If you know this already, skip this step. If not, ask the user:
+
+> "Where can I find your dataset? Either a local directory or S3 location works!"
+
+⏸ Wait for user.
+
+### Step 5: Examine sample data
+
+Read 1–2 sample records from the user's dataset and show them so the user can confirm the source schema. 
Do not run format detection — that is handled by the planning skill before this skill is invoked.
+
+Do not show a side-by-side mapping to the target format here — the detailed mapping will be handled in Step 7 when generating the transformation function.
+
+⏸ Wait for user.
+
+### Step 6: Get the dataset output location
+
+For this step, you need: **to understand where to output the transformed dataset to. It could be an S3 URI or local directory**
+If you already know where the dataset is supposed to be output to, skip this step. If not, ask the user:
+
+> "Where should I output your transformed dataset to? Either a local directory or S3 location works!"
+
+If the user provides a directory (not a full file path), construct the output filename using the pattern `{original_name}_{target_format}.jsonl` (e.g., `gen_qa_100k_openai.jsonl`).
+
+⏸ Wait for user.
+
+### Step 7: Generate and validate the transformation function
+
+For this step, you need: **to generate a python function that transforms the dataset from the format in Step 5 to the format in Step 3**
+
+Read the reference guide at `references/dataset_transformation_code.md` and follow its skeleton exactly when generating the transformation function.
+
+The python function should be in the form of:
+
+```python
+def transform_dataset(df: pd.DataFrame) -> pd.DataFrame:
+```
+
+Add a `%%writefile <project_dir>/scripts/transform_fn.py` code cell to the notebook AND write the file to disk for testing. The `<project_dir>` is the project directory established by the directory-management skill (e.g., `dpo-to-rlvr-conversion`). All notebooks go in `<project_dir>/notebooks/` and all scripts go in `<project_dir>/scripts/`.
+
+Continue iterating with the user's feedback — update the notebook cell in place on each revision rather than showing code inline.
+
+**If sample data was collected in Step 5**, test the function against the sample records:
+
+1. Generate the transformation function.
+2. 
Write the sample data to a temporary JSONL file (e.g., `/tmp/test_input.jsonl`), then run:
+   `python3 -c "import sys; sys.path.insert(0, '<project_dir>/scripts'); from transform_fn import transform_dataset; import pandas as pd; df = pd.read_json('/tmp/test_input.jsonl', lines=True); result = transform_dataset(df); print(result.to_json(orient='records', lines=True))"`
+3. If the test fails, fix and re-test until it passes.
+4. Show the user the function and transformed sample output for review.
+
+**If no sample data**, present the function for review and refinement.
+
+⏸ Wait for user.
+
+### Step 8: Generate the execution cells in the notebook
+
+**Before writing the notebook, read:**
+
+- `references/notebook_structure.md` (cell order, placeholders, and content)
+- `references/notebook_writing_guide.md` (Jupyter notebook JSON formatting)
+
+Generate the execution logic as code cells in the notebook.
+
+- Add a `%%writefile <project_dir>/scripts/<script_name>.py` code cell to the notebook AND write the file to disk for testing.
+- The script must import `transform_dataset` from `transform_fn`.
+- Replace placeholders with the actual input/output paths.
+
+Read the reference guide at `references/dataset_transformation_code.md` and follow its execution script skeleton exactly.
+
+**If sample data was collected in Step 5**, test the full pipeline:
+
+1. Write the sample records to a temporary JSONL file (e.g., `/tmp/test_input.jsonl`).
+2. Run: `python3 <project_dir>/scripts/<script_name>.py --input /tmp/test_input.jsonl --output /tmp/test_output.jsonl`
+3. If it fails, debug and fix, then re-run until successful.
+4. Show the user the output for review.
+
+**If no sample data**, present the notebook for review and refinement.
+
+⏸ Wait for user.
+
+### Step 9: Determine and confirm execution mode
+
+Check the size of the input dataset:
+
+- If the dataset is in S3, use the AWS MCP tool `head-object` (S3 service) with the bucket and key to get `ContentLength`.
+- If the dataset is local, check the file size. 
+
+**Decision criteria:**
+
+- Dataset < 50 MB → recommend local execution
+- Dataset ≥ 50 MB → recommend SageMaker Processing Job
+
+Inform the user of the recommendation and get their approval:
+
+If local:
+
+> "Your dataset is {size} MB — since it's under 50 MB, I'd recommend running the transformation locally. Would you like to proceed with local execution, or would you prefer a SageMaker Processing Job instead?"
+
+If SageMaker Processing Job:
+
+> "Your dataset is {size} MB — since it's over 50 MB, I'd recommend running this as a SageMaker Processing Job for better performance. Would you like to proceed with a SageMaker Processing Job, or would you prefer to run it locally instead?"
+
+Do not execute until the user approves. If the user rejects the recommendation, switch to the alternative and get their explicit approval before proceeding.
+
+⏸ Wait for user.
+
+**After user confirms, add an execution cell to the notebook. Do NOT run the full transformation — only generate the cell for the user to execute themselves:**
+
+If local execution:
+
+- Add a cell that runs the transformation by importing from the `.py` files already on disk (written by the agent during Steps 7–8): import `transform_dataset` from `transform_fn`, load the dataset, transform, and save output. Scripts are located in `<project_dir>/scripts/`.
+
+If SageMaker Processing Job:
+
+- Add a cell that submits and monitors the Processing Job inline using the V3 SageMaker SDK directly (FrameworkProcessor, ProcessingInput, ProcessingOutput, etc.). Create a FrameworkProcessor with the SKLearn 1.2-1 image, configure inputs/outputs, and call `processor.run(wait=True, logs=True)` to block the cell and stream logs until the job completes. See `scripts/transformation_tools.py` for reference implementation details.
+- Inform the user they can run this cell to kick off and monitor the job.
+
+**Important:** The agent must NOT execute the full dataset transformation itself. 
The notebook cells are generated for the user to review and run. Only sample data (from Steps 7–8) should be transformed by the agent for validation purposes. + +> "I've added the execution cell to the notebook. You can run it to transform the full dataset. Would you like to review the notebook before running it?" + +⏸ Wait for user. + +### Step 10: Verify and confirm with the user + +For this step, you need: **to verify the output looks correct and confirm with the user.** + +- Read 1–2 sample records from the output to show the user. +- Report the total number of records transformed. +- Ask the user if the output looks good. + +⏸ Wait for user to confirm. diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/references/dataset_transformation_code.md b/plugins/sagemaker-ai/skills/dataset-transformation/references/dataset_transformation_code.md new file mode 100644 index 00000000..86adb243 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/references/dataset_transformation_code.md @@ -0,0 +1,135 @@ +# Dataset Transformation Code Reference + +## When to Reference + +When generating: + +- a dataset transformation function +- a dataset transformation execution script + +**follow the exact python skeletons** captured in this document. + +## Related Files + +- `scripts/transformation_tools.py` — contains `execute_transformation_job()` for running the generated script as a SageMaker Processing Job. Use this when the user wants remote execution instead of local. + +## Requirements + +- The dataset transformation function should: **ONLY transform the input DataFrame into the target output format. 
No I/O, no side effects.**
+- The dataset transformation execution script should: **ORCHESTRATE the full pipeline: load the dataset using `load_dataset_from`, apply the transformation function, and write the output using `output_dataset_to`.**
+- The script must work in two execution contexts:
+  - **Local execution**: paths may be S3 URIs or local file paths
+  - **SageMaker Processing Job**: inputs are mounted at `/opt/ml/processing/input/` and outputs go to `/opt/ml/processing/output/`
+
+## Generating a dataset transformation function
+
+The transformation function should be saved to its own file at `<project_dir>/scripts/transform_fn.py` so the user can view and edit it directly. The `<project_dir>` is the project directory established by the directory-management skill (e.g., `dpo-to-rlvr-conversion`).
+
+```python
+import pandas as pd
+
+def transform_dataset(df: pd.DataFrame) -> pd.DataFrame:
+    # Transform each row from source format to target format
+    # Return a DataFrame matching the target schema
+    transformed = {transformation logic}
+    return transformed
+```
+
+## Generating a dataset transformation execution script
+
+The execution script imports `transform_dataset` from `transform_fn.py` rather than embedding it inline. Both files must be in the same directory (`<project_dir>/scripts/`).
+
+```python
+import pandas as pd
+import json
+import subprocess
+import shutil
+import os
+import argparse
+from transform_fn import transform_dataset
+
+def load_dataset_from(input_location: str, to: str):
+    """
+    Load a dataset from S3 or local path.
+    - input_location: S3 URI or local file path (including SageMaker Processing mounted paths)
+    - to: local file path to save the dataset to
+    """
+    if input_location.startswith("s3://"):
+        subprocess.run(["aws", "s3", "cp", input_location, to], check=True)
+    else:
+        shutil.copy(input_location, to)
+
+def output_dataset_to(output_location: str, from_path: str):
+    """
+    Output a dataset to S3 or local path. 
+ - output_location: S3 URI or local directory/file path (including SageMaker Processing mounted paths) + - from_path: local file path of the transformed dataset to upload/move + """ + if output_location.startswith("s3://"): + subprocess.run(["aws", "s3", "cp", from_path, output_location], check=True) + else: + os.makedirs(os.path.dirname(output_location) or ".", exist_ok=True) + shutil.copy(from_path, output_location) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True, help="S3 URI, local path, or /opt/ml/processing/input/...") + parser.add_argument("--output", required=True, help="S3 URI, local path, or /opt/ml/processing/output/...") + args = parser.parse_args() + + # 1. Load dataset + local_input = "/tmp/input_dataset.jsonl" + load_dataset_from(args.input, to=local_input) + + # 2. Read into DataFrame + df = pd.read_json(local_input, lines=True) + print(f"Loaded {len(df)} records") + + # 3. Transform + df = transform_dataset(df) + + # 4. Write transformed output locally + local_output = "/tmp/output_dataset.jsonl" + df.to_json(local_output, orient="records", lines=True) + + # 5. Output to destination + output_dataset_to(args.output, from_path=local_output) + + print(f"Transformed {len(df)} records -> {args.output}") +``` + +## Execution Examples + +### Local execution + +```bash +python transform.py --input s3://my-bucket/data/input.jsonl --output s3://my-bucket/data/output.jsonl +``` + +### SageMaker Processing Job + +Use `execute_transformation_job` from `scripts/transformation_tools.py` to run the script as a SageMaker Processing Job. This function handles container setup, S3 input/output mounting, and job orchestration. Do not manually construct Processing Job logic — always delegate to this tool. + +The job is submitted asynchronously (`wait=False`). Use `describe_transformation_job` to check job status. 
+ +```python +from scripts.transformation_tools import execute_transformation_job, describe_transformation_job + +execute_transformation_job( + transform_script_path="transform.py", # Local path to the saved script + dataset_source_s3="s3://bucket/input.jsonl", # S3 URI of input dataset + dataset_output_s3="s3://bucket/output/", # S3 URI for output +) +``` + +After submitting, check status with: + +```python +from scripts.transformation_tools import describe_transformation_job + +status = describe_transformation_job(job_name="") +print(status) +# Returns: {"job_name": "...", "status": "InProgress|Completed|Failed|Stopped", ...} +``` + +Call `describe_transformation_job` repeatedly (every ~30 seconds) until `status` is `Completed`, `Failed`, or `Stopped`. diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_structure.md b/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_structure.md new file mode 100644 index 00000000..fe29349d --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_structure.md @@ -0,0 +1,46 @@ +# Dataset Transformation Notebook Structure + +Cell order, placeholders, and JSON formatting for the dataset transformation notebook. 
+ +## Cells + +| Cell | Label | Content | +| ---- | ------------------------------------------- | ----------------------------------------------------------------- | +| 0 | Markdown header: `# Dataset Transformation` | Description of the transformation (source format → target format) | +| 1 | Configuration | Input/output paths, region, any user-configurable parameters | +| 2 | Transformation Function | The approved `transform_dataset(df)` function from Step 6 | +| 3 | Load Dataset | Load dataset using `load_dataset_from` and read into DataFrame | +| 4 | Transform | Apply `transform_dataset(df)` and preview transformed records | +| 5 | Save Output | Write transformed DataFrame and upload using `output_dataset_to` | + +## Placeholders (Cell 1 only) + +| Placeholder | Description | Example | +| ------------------- | ------------------------------------- | --------------------------------------- | +| `[INPUT_LOCATION]` | S3 URI or local path to input dataset | `s3://bucket/data/input.jsonl` | +| `[OUTPUT_LOCATION]` | S3 URI or local path for output | `s3://bucket/output/input_openai.jsonl` | + +## JSON Formatting + +Each line of code is a separate string in `source`, ending with `\n` (except the last line): + +```json +{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "x = 5\n", + "print(x)" + ] +} +``` + +- Escape quotes inside strings: `\"` +- No trailing commas in arrays or objects +- 2-space indentation +- Use `fs_write` with `command: create` to write the complete notebook JSON +- Markdown cell 0: `"cell_type": "markdown"`, no `execution_count` or `outputs` +- Wrap all cells in `{"cells": [...], "metadata": {...}, "nbformat": 4, "nbformat_minor": 4}` diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_writing_guide.md b/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_writing_guide.md new file mode 100644 index 00000000..3b79be11 --- 
/dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_writing_guide.md @@ -0,0 +1,99 @@ +# Guide: Writing Jupyter Notebooks + +## Critical Differences from Regular Files + +Jupyter notebooks (.ipynb) are JSON files with a specific structure. Writing to them is fundamentally different from writing regular Python files. + +## The Problem + +When you write Python code to a regular .py file, you write it as plain text with newlines: + +```python +import os +x = 5 +print(x) +``` + +But in a Jupyter notebook, each line must be a separate string in a JSON array: + +```json +{ + "source": [ + "import os\n", + "x = 5\n", + "print(x)" + ] +} +``` + +## The Solution: Use fs_write with JSON Structure + +**ALWAYS use the `fs_write` tool with `command: create` to write notebooks.** + +### Correct Notebook Structure (Pretty-Print Format) + +Use **2-space indentation** (pretty-print format) for consistent, readable formatting: + +```json +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is line 1\n", + "import os\n", + "x = 5\n", + "print(x)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} +``` + +**CRITICAL**: Use exactly 2 spaces for each indentation level (standard pretty-print format). + +### Key Points + +1. **Each line ends with `\n`** - This is how newlines are represented in JSON strings +2. **Lines are separate array elements** - Each line is a string in the `source` array +3. **Use proper JSON escaping** - Quotes inside strings must be escaped: `\"text\"` +4. 
**No trailing comma** - Last element in arrays/objects should not have a comma + +## Common Mistakes to Avoid + +❌ **DON'T** use bash commands to generate JSON and pipe to file +❌ **DON'T** write code as a single string without line breaks +❌ **DON'T** forget to escape quotes in strings +❌ **DON'T** add trailing commas to last array elements + +✅ **DO** use fs_write with the complete JSON structure +✅ **DO** add `\n` to end of each line in source arrays +✅ **DO** validate JSON structure before writing +✅ **DO** use proper escaping for special characters + +## Validation Checklist + +Before writing the notebook, verify: + +- [ ] Each cell has proper structure (cell_type, execution_count, metadata, outputs, source) +- [ ] Source arrays have each line as a separate string ending in `\n` +- [ ] Quotes are properly escaped +- [ ] No trailing commas +- [ ] Metadata section is complete +- [ ] nbformat and nbformat_minor are set diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/references/sagemaker_dataset_formats.md b/plugins/sagemaker-ai/skills/dataset-transformation/references/sagemaker_dataset_formats.md new file mode 100644 index 00000000..b0d51d88 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/references/sagemaker_dataset_formats.md @@ -0,0 +1,146 @@ +# SageMaker Supported Dataset Formats (Offline Fallback) + +This is an offline copy of the supported dataset formats from: +https://docs.aws.amazon.com/sagemaker/latest/dg/model-customize-evaluation-dataset-formats.html + +**Note:** Always attempt to fetch the live documentation first. Only use this file as a fallback when internet access is unavailable (e.g., VPC environments). + +## Required Fields + +| Field | Required | +| ------------- | ---------------------- | +| User Prompt | Yes | +| System Prompt | No | +| Ground truth | Only for Custom Scorer | +| Category | No | + +## 1. 
OpenAI Format + +```json +{ + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + }, + { + "role": "assistant", + "content": "Hello to you!" + } + ] +} +``` + +- `system` role is optional (system prompt) +- `user` role is the query +- `assistant` role is the ground truth + +## 2. SageMaker Evaluation Format + +```json +{ + "system": "You are an English major with top marks in class who likes to give minimal word responses: ", + "query": "What is the symbol that ends the sentence as a question", + "response": "?", + "category": "Grammar" +} +``` + +- `system` and `category` are optional +- `response` is the ground truth + +## 3. HuggingFace Prompt Completion Format + +### Standard + +```json +{ + "prompt": "What is the symbol that ends the sentence as a question", + "completion": "?" +} +``` + +### Conversational + +```json +{ + "prompt": [ + { "role": "user", "content": "What is the symbol that ends the sentence as a question" } + ], + "completion": [ + { "role": "assistant", "content": "?" } + ] +} +``` + +- `completion` is the ground truth + +## 4. HuggingFace Preference Format + +### Standard + +```json +{ + "prompt": "The sky is", + "chosen": "blue", + "rejected": "green" +} +``` + +### Conversational + +```json +{ + "prompt": [ + { "role": "user", "content": "What color is the sky?" } + ], + "chosen": [ + { "role": "assistant", "content": "It is blue." } + ], + "rejected": [ + { "role": "assistant", "content": "It is green." } + ] +} +``` + +- `chosen` is the ground truth + +## 5. 
Verl Format + +### Current (prompt as messages array) + +```json +{ + "data_source": "openai/gsm8k", + "prompt": [ + { "content": "You are a helpful math tutor.", "role": "system" }, + { "content": "What is 2+2?", "role": "user" } + ], + "ability": "math", + "extra_info": { + "answer": "4" + }, + "reward_model": { + "ground_truth": "4" + } +} +``` + +### Legacy (prompt as string) + +```json +{ + "data_source": "openai/gsm8k", + "prompt": "What is 2+2?", + "extra_info": { + "answer": "4" + } +} +``` + +- Ground truth via `extra_info.answer` (preferred) or `reward_model.ground_truth` +- Preserves metadata fields: `id`, `data_source`, `ability`, `reward_model`, `extra_info`, `attributes`, `difficulty` diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/scripts/transformation_tools.py b/plugins/sagemaker-ai/skills/dataset-transformation/scripts/transformation_tools.py new file mode 100644 index 00000000..4cc38743 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/scripts/transformation_tools.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +import os + +import boto3 +from sagemaker.core.helper.session_helper import Session, get_execution_role +from sagemaker.core import image_uris +from sagemaker.core.processing import FrameworkProcessor +from sagemaker.core.shapes import ProcessingInput, ProcessingOutput, ProcessingS3Input, ProcessingS3Output +from sagemaker.core.resources import ProcessingJob +from sagemaker.core import Attribution, set_attribution + +set_attribution(Attribution.SAGEMAKER_AGENT_PLUGIN) + + +def _get_session(region=None): + """Create a SageMaker Session, optionally pinned to a region.""" + return Session( + boto_session=boto3.Session(region_name=region) if region else None + ) + + +def execute_transformation_job( + transform_script_path, + dataset_source_s3, + dataset_output_s3, + instance_type="ml.m5.xlarge", + region=None, + execution_role=None, + base_job_name="dataset-transformation", + image_uri=None, +): + """ + 
Execute a dataset transformation script as a SageMaker Processing Job + using the V3 SDK FrameworkProcessor. + + The entire directory containing the script is uploaded as source_dir, + so transform_fn.py (and any other dependencies) are included automatically. + + Args: + transform_script_path: Local path to the transformation script (e.g., "/scripts/transform.py") + dataset_source_s3: S3 URI of the input dataset + dataset_output_s3: S3 URI for the transformed output dataset + instance_type: ML instance type (default: ml.m5.xlarge) + region: AWS region (auto-detected if None) + execution_role: IAM role ARN (auto-detected if None) + base_job_name: Prefix for the Processing Job name + image_uri: Docker image URI for the processing container. + If None, uses the SKLearn processing image. + """ + if not execution_role: + execution_role = get_execution_role() + + sagemaker_session = _get_session(region) + + if not region: + region = sagemaker_session.boto_region_name + + # Use SKLearn processing image as default (includes pandas) + if not image_uri: + image_uri = image_uris.retrieve( + framework="sklearn", + region=region, + version="1.2-1", + instance_type=instance_type, + ) + + source_dir = os.path.dirname(os.path.abspath(transform_script_path)) + script_name = os.path.basename(transform_script_path) + + processor = FrameworkProcessor( + role=execution_role, + image_uri=image_uri, + command=["python3"], + instance_count=1, + instance_type=instance_type, + base_job_name=base_job_name, + sagemaker_session=sagemaker_session, + ) + + input_local_path = "/opt/ml/processing/input" + output_local_path = "/opt/ml/processing/output" + input_filename = os.path.basename(dataset_source_s3.rstrip("/")) + + processor.run( + code=script_name, + source_dir=source_dir, + arguments=[ + "--input", os.path.join(input_local_path, input_filename), + "--output", os.path.join(output_local_path, input_filename), + ], + inputs=[ + ProcessingInput( + input_name="dataset", + 
s3_input=ProcessingS3Input( + s3_uri=dataset_source_s3, + local_path=input_local_path, + s3_data_type="S3Prefix", + s3_input_mode="File", + ), + ) + ], + outputs=[ + ProcessingOutput( + output_name="transformed", + s3_output=ProcessingS3Output( + s3_uri=dataset_output_s3, + local_path=output_local_path, + s3_upload_mode="EndOfJob", + ), + ) + ], + wait=False, + ) + + print(f"Processing job submitted. Output will be at: {dataset_output_s3}") + + +def describe_transformation_job(job_name, region=None): + """ + Describe a SageMaker Processing Job by name. + + Args: + job_name: The name of the processing job to describe. + region: AWS region (auto-detected if None). + + Returns: + dict: Job details including status, inputs, outputs, and timing info. + """ + sagemaker_session = _get_session(region) + + job = ProcessingJob.get( + processing_job_name=job_name, + session=sagemaker_session.boto_session, + ) + + details = job.refresh().__dict__ + return { + "job_name": job_name, + "status": details.get("processing_job_status"), + "failure_reason": details.get("failure_reason"), + "creation_time": str(details.get("creation_time", "")), + "processing_end_time": str(details.get("processing_end_time", "")), + "inputs": details.get("processing_inputs", []), + "outputs": getattr(details.get("processing_output_config"), "outputs", []), + } diff --git a/plugins/sagemaker-ai/skills/directory-management/SKILL.md b/plugins/sagemaker-ai/skills/directory-management/SKILL.md new file mode 100644 index 00000000..8a7f92b5 --- /dev/null +++ b/plugins/sagemaker-ai/skills/directory-management/SKILL.md @@ -0,0 +1,32 @@ +--- +name: directory-management +description: Manages project directory setup and artifact organization. Use when starting a new project, resuming an existing one, or when a PLAN.md needs to be associated with a project directory. Creates the project folder structure (specs/, scripts/, notebooks/) and resolves project naming. 
+---
+
+# Directory Management
+
+## Project Setup
+
+Before any work begins, resolve the project name:
+
+1. If the project name is already known from conversation context, use it.
+2. Otherwise, scan for existing `*/PLAN.md` files in the current directory. If found, ask the user if they are resuming an existing project and load that `PLAN.md` into context.
+3. If no existing projects are found, recommend a ≤64-char lowercase slug based on what you know from the conversation (only `[a-z0-9-]`), or ask directly if there isn't enough context. Present the recommended name and wait for user confirmation.
+
+Once project name is resolved:
+
+1. Create and/or use the `<project-name>/` directory using the confirmed name for storing all the artifacts
+
+## Directory Structure
+
+When working with the agent, all generated files are organized under a project directory.
+
+```
+<project-name>/
+├── specs/
+│   ├── PLAN.md          # Your customization plan
+├── scripts/             # Generated Python scripts
+│   ├── <use-case>_transform_fn.py
+└── notebooks/           # Generated Jupyter notebooks
+    ├── <use-case>_training.ipynb
+```
diff --git a/plugins/sagemaker-ai/skills/finetuning-setup/SKILL.md b/plugins/sagemaker-ai/skills/finetuning-setup/SKILL.md
new file mode 100644
index 00000000..a13a0b38
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning-setup/SKILL.md
@@ -0,0 +1,73 @@
+---
+name: finetuning-setup
+description: Selects a base model and fine-tuning technique (SFT, DPO, or RLVR) for the user's use case by querying SageMaker Hub. Use when the user asks which model or technique to use, wants to start fine-tuning, or mentions a model name or family (e.g., "Llama", "Mistral") — always activate even for known model names because the exact Hub model ID must be resolved. Queries available models, validates technique compatibility, and confirms selections.
+---
+
+# Finetuning Setup
+
+Guides the user through selecting a base model and fine-tuning technique based on their use case.
+
+## When to Use
+
+- User asks which fine-tuning technique to use
+- User wants to select or change their base model
+- User mentions a model name or family (e.g., "Llama", "Mistral") — the exact Hub model ID still needs to be resolved
+
+## Prerequisites
+
+- A `use_case_spec.md` file exists. If not, activate the use-case-specification skill to generate it first.
+
+## Workflow
+
+### Step 1: Discover Hub
+
+1. List all available SageMaker Hubs in the user's region by calling the SageMaker `ListHubs` API using the `aws___call_aws` tool.
+2. From the results, filter out any hub whose `HubDescription` contains "AI Registry" — these do not contain JumpStart models.
+3. The remaining hubs are eligible (e.g., `SageMakerPublicHub` and any private hubs).
+4. If exactly one eligible hub exists, use it automatically — do not ask the user.
+5. If multiple eligible hubs exist, present them to the user and ask which one to use. Example:
+
+   ```
+   I found the following model hubs:
+   - SageMakerPublicHub — SageMaker Public Hub
+   - Private-Hub-XYZ — Private Hub models
+   Which hub would you like to use?
+   ```
+
+6. Store the selected hub name for use in subsequent steps.
+
+### Step 2: Select Base Model
+
+1. Read `use_case_spec.md` to understand the use case and success criteria.
+2. Restate the use case in one sentence.
+3. Always retrieve the full list of available SageMaker Hub model names by running: `python finetuning-setup/scripts/get_model_names.py <hub-name>` — even if the user has already mentioned a model name or family. Do not skip this step or filter the results.
+4. Present all available models to the user, grouped by model family (e.g., Llama, Mistral, Gemma) for readability.
+5. Ask the user to pick the exact model ID from the list.
+6. Validate the selected model exists in the retrieved list before proceeding.
+
+EXTREMELY IMPORTANT: NEVER recommend or suggest any particular model based on the context you have. 
YOU ARE ALLOWED ONLY to display the list of models
+as given by the script. DO NOT add your own recommendation or suggestion after displaying the list of models to tell which model is correct. Present this
+statement to the user: "Which model would you like to use? Please type the exact model name from the above list." and allow the user to select the model.
+
+### Step 3: Determine Finetuning Technique
+
+1. Consult `references/finetune_technique_selection_guide.md` and recommend the best-fit technique (SFT, DPO, or RLVR) for the use case. Present the recommendation and reasoning to the user.
+2. Ask the user if they'd like to go with the recommendation or prefer a different technique.
+3. Once the user confirms a technique, retrieve the finetuning techniques available for the selected model by running: `python finetuning-setup/scripts/get_recipes.py <model-name> <hub-name>`
+   - This returns only the techniques the model actually supports, filtered to SFT, DPO, and RLVR. Only these three techniques are supported — ignore any other techniques even if the model's recipes include them.
+4. If the chosen technique is available for the model, proceed to Step 4.
+5. If the chosen technique is not available for the model, explain that the selected model does not support it on SageMaker and offer to go back to Step 2 to pick a different model that supports the chosen technique.
+ +### Step 4: Confirm Selections + +Present a summary to the user: + +``` +Here's what we've selected: +- Base model: [model name] +- Fine-tuning technique: [SFT/DPO/RLVR] +``` + +## References + +- `references/finetune_technique_selection_guide.md` — Technique guidance diff --git a/plugins/sagemaker-ai/skills/finetuning-setup/references/finetune_technique_selection_guide.md b/plugins/sagemaker-ai/skills/finetuning-setup/references/finetune_technique_selection_guide.md new file mode 100644 index 00000000..38472471 --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning-setup/references/finetune_technique_selection_guide.md @@ -0,0 +1,37 @@ +# Finetuning Technique Selection Guide + +Not all models support all techniques. Always validate technique availability against the selected model's recipes before recommending. Only SFT, DPO, and RLVR are supported. + +## Technique Overview + +### SFT (Supervised Fine-Tuning) + +**Use when:** + +- Task has clear right/wrong answers +- Single optimal output per input +- Output represents exemplary responses +- Classification, extraction, structured generation + +### DPO (Direct Preference Optimization) + +**Use when:** + +- Multiple valid outputs, some better than others +- Subjective quality (tone, style, helpfulness) +- Creative tasks with preference judgments + +### RLVR (Reinforcement Learning from Verifiable Rewards) + +**Use when:** + +- Outputs can be verified programmatically +- Want to reward similarity to gold responses +- Code generation (passes tests = reward) +- Math problems (correct answer = reward) +- Constraint satisfaction (meets criteria = reward) + +**Key difference from SFT:** + +- SFT: Model learns to imitate gold responses directly +- RLVR: Model learns to maximize rewards (can be gold similarity or verification-based) diff --git a/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_model_names.py b/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_model_names.py new file mode 100644 index 
00000000..2b4fcf75
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_model_names.py
@@ -0,0 +1,43 @@
+import boto3
+import json
+import sys
+
+if len(sys.argv) < 2:
+    print("Usage: python get_model_names.py <hub_name> [region]")
+    sys.exit(1)
+
+hub_name = sys.argv[1]
+region_name = sys.argv[2] if len(sys.argv) > 2 else None
+
+sm_client = boto3.client("sagemaker", region_name=region_name)
+
+# Retrieve all models with pagination
+all_contents = []
+next_token = None
+
+while True:
+    params = {
+        "HubName": hub_name,
+        "HubContentType": "Model",
+        "MaxResults": 100
+    }
+
+    if next_token:
+        params["NextToken"] = next_token
+
+    response = sm_client.list_hub_contents(**params)
+    all_contents.extend(response.get("HubContentSummaries", []))
+
+    next_token = response.get("NextToken")
+    if not next_token:
+        break
+
+# Filter for customization-capable models
+customization_models = [
+    content for content in all_contents
+    if "@capability:customization" in content.get("HubContentSearchKeywords", [])
+]
+
+model_names = [m.get("HubContentName") for m in customization_models]
+
+print(json.dumps(model_names))
diff --git a/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_recipes.py b/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_recipes.py
new file mode 100644
index 00000000..7439954d
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_recipes.py
@@ -0,0 +1,30 @@
+import boto3
+import json
+import sys
+
+if len(sys.argv) < 3:
+    print("Usage: python get_recipes.py <model_name> <hub_name>")
+    sys.exit(1)
+
+model_name = sys.argv[1]
+hub_name = sys.argv[2]
+sm_client = boto3.client("sagemaker")
+
+detail = sm_client.describe_hub_content(
+    HubName=hub_name,
+    HubContentType="Model",
+    HubContentName=model_name
+)
+
+keywords = detail.get("HubContentSearchKeywords", [])
+
+# Only include SFT, DPO, and RLVR techniques
+supported = {"sft", "dpo", "rlvr"}
+techniques = sorted(
+    t.replace("@recipe:finetuning_", "").split("_")[0]
+    for t in 
keywords + if t.startswith("@recipe:finetuning_") +) +techniques = [t for t in dict.fromkeys(techniques) if t in supported] + +print(json.dumps(techniques)) diff --git a/plugins/sagemaker-ai/skills/finetuning/SKILL.md b/plugins/sagemaker-ai/skills/finetuning/SKILL.md new file mode 100644 index 00000000..f475769c --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning/SKILL.md @@ -0,0 +1,128 @@ +--- +name: finetuning +description: Generates a Jupyter notebook that fine-tunes a base model using SageMaker serverless training jobs. Use when the user says "start training", "fine-tune my model", "I'm ready to train", or when the plan reaches the finetuning step. Supports SFT, DPO, and RLVR trainers, including RLVR Lambda reward function creation. +--- + +# Prerequisites + +Before starting this workflow, verify: + +1. A `use_case_spec.md` file exists + - If missing: Activate the `use-case-specification` skill first, then resume + - DON'T EVER offer to create a use case spec without activating the use-case-specification skill. + +2. A fine-tuning technique (SFT, DPO, or RLVR) and base model have already been selected + - If missing: Activate the `finetuning-setup` skill to collect what's missing, then resume + - Don't make recommendations on the spot. You MUST activate the finetuning-setup skill. + +3. 
A base model name available on SageMakerHub has been identified + - If missing: Activate the `finetuning-setup` skill to get it + - **Important:** Only use the model name that `finetuning-setup` retrieves, as it may differ from other commonly used names for the same model + +# Critical Rules + +## Code Generation Rules + +- ✅ Use EXACTLY the imports shown in each cell template +- ❌ Do NOT add additional imports even if they seem helpful +- ❌ Do NOT create variables before they're needed in that cell +- 📋 Copy the code structure precisely - no improvisation +- 🎯 Follow the minimal code principle strictly +- ✅ When writing a notebook cell, make sure the indentation and f strings are correct + +## User Communication Rules + +- ❌ NEVER offer to run the notebook for the user (you don't have the tools) +- ❌ NEVER offer to move on to a downstream skill while training is in progress (logically impossible) +- ❌ NEVER set ACCEPT_EULA to True yourself (user must read and agree) +- ✅ Always mention both the number AND title of cells you reference +- ✅ If user asks how to run: Tell them to run cells one by one, mention ipykernel requirement + +--- + +# Workflow + +## 1. Notebook Setup + +### 1.1 Directory Setup + +1. Identify project directory from conversation context + - If unclear (multiple relevant directories exist) → Ask user which folder to use +2. Create Jupyter notebook: `[title]_finetuning.ipynb` + - `[title]` = snake_case name derived from use case + - Save under the identified directory + +### 1.2 Select Reference Template + +Read the example notebook matching the finetuning strategy: + +- SFT → `references/sft_example.md` +- DPO → `references/dpo_example.md` +- RLVR → `references/rlvr_example.md` + +### 1.3 Copy Notebook Structure + +1. Write the exact cells from the example to `[title]_finetuning.ipynb` +2. Use same order, dependencies, and imports as the example +3. 
DO NOT improvise or add extra code + +### 1.4 Auto-Generate Configuration Values + +**In the 'Setup & Credentials' cell, populate:** + +1. **BASE_MODEL** + - Use the exact SageMakerHub model name from context + +2. **MODEL_PACKAGE_GROUP_NAME** + - Generate from use case (read `use_case_spec.md` if needed) + - Format rules: + - Lowercase, alphanumeric with hyphens only + - 1-63 characters + - Pattern: `[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}` + - Example: "Customer Support Chatbot" → `customer-support-chatbot-v1` + +3. Save notebook + +## 2. RLVR Reward Function (for RLVR only, skip this section if technique is SFT or DPO) + +### 2.1 Check Reward Function Status + +- Ask if user has a reward function already, or would like help creating one. + - If user says they have one → Ask for the SageMaker Hub Evaluator ARN. Only proceed to Section 2.3 once the user provides a valid Evaluator ARN. If they don't have it registered as a SageMaker Hub Evaluator, continue to 2.2. + - If user says they do not have one → Continue to 2.2 + +### 2.2 Generate Reward Function From Template + +1. Follow workflow in `references/rlvr_reward_function.md` section "Helping Users Create Lambda Functions" + +### 2.3 Set CUSTOM_REWARD_FUNCTION value + +1. Set the value for `CUSTOM_REWARD_FUNCTION` in the Notebook with the ARN of the reward function (either given directly by the user, or from the function generation code as `evaluator.arn`). + +## 3. EULA review and acceptance + +1. Look up the official EULA link for the selected base model from references/eula_links.md +2. Display the EULA link(s) to the user in your message as clickable markdown links +3. Tell the user they must read and agree to the EULA before using this model (one sentence) +4. Ask them to manually change `ACCEPT_EULA` to `True` in the notebook after reviewing the license +5. **NEVER set ACCEPT_EULA to True yourself** + +## 4. Notebook Execution + +1. 
**Display the following to the user:**: `A Jupyter notebook has now been generated which will help you finetune your model. You are free to run it now. Please let me know once the training is complete.` +2. Wait for user's confirmation about training completion. Once the user has confirmed this, you are free to move to the next step of the plan. + +**CRITICAL:** + +- DON'T suggest moving to next steps before training completes +- DON'T elaborate on the next steps unless the user specifically asks you about them. + +--- + +# References + +- `rlvr_reward_function.md` - Lambda reward function creation guide (RLVR only) +- `templates/rlvr_reward_function_source_template.py` - Lambda reward function source template (RLVR only) +- `sft_example.md` - Complete notebook template for Supervised Fine-Tuning +- `dpo_example.md` - Complete notebook template for Direct Preference Optimization +- `rlvr_example.md` - Complete notebook template for Reinforcement Learning from Verifiable Rewards diff --git a/plugins/sagemaker-ai/skills/finetuning/references/dpo_example.md b/plugins/sagemaker-ai/skills/finetuning/references/dpo_example.md new file mode 100644 index 00000000..041a5ffd --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning/references/dpo_example.md @@ -0,0 +1,159 @@ +# DPO (Direct Preference Optimization) Notebook Template + +This template provides the complete cell structure for a DPO finetuning notebook. 
+ +--- + +## Cell 1: Install Dependencies + +```python +!pip install 'sagemaker>=3.7.0,<4.0' boto3 -q +``` + +--- + +## Cell 2: Setup & Credentials + +```python +import os +import boto3 +from sagemaker.ai_registry.dataset_utils import CustomizationTechnique +from botocore.exceptions import ClientError +from sagemaker.ai_registry.dataset import DataSet +from sagemaker.core.resources import ModelPackageGroup +from sagemaker.core.helper.session_helper import Session, get_execution_role +from sagemaker.core import Attribution, set_attribution + +set_attribution(Attribution.SAGEMAKER_AGENT_PLUGIN) + +# Setup +sm_client = boto3.Session().client("sagemaker") +sagemaker_session = Session(sagemaker_client=sm_client) +bucket = sagemaker_session.default_bucket() + +# Configuration - USER please fill in these fields with your information: + +BASE_MODEL = "" # e.g., "meta-textgeneration-llama-3-8b" +TRAINING_DATA_S3 = "" # S3 path +S3_OUTPUT_PATH = f"s3://{bucket}/finetuning-output/" +ROLE_ARN = get_execution_role() # You can change this to a specific role. 
+ACCEPT_EULA = False # Set to True to accept the base model's End-User License Agreement +MODEL_PACKAGE_GROUP_NAME = "" # Auto-generated based on use case +``` + +--- + +## Cell 3: Create Dataset and Model Package Group + +```python +# Create Model Package Group +try: + model_package_group = ModelPackageGroup.create( + model_package_group_name=MODEL_PACKAGE_GROUP_NAME, + model_package_group_description="", + ) + print(f"Created new model package group named {MODEL_PACKAGE_GROUP_NAME}") +except ClientError as e: + if e.response['Error']['Code'] in ('ResourceInUse', 'ValidationException'): + model_package_group = ModelPackageGroup.get(model_package_group_name=MODEL_PACKAGE_GROUP_NAME) + print(f"There is already a model package group with the name {MODEL_PACKAGE_GROUP_NAME}.\nIf you want to save your finetuned model under a different name, change the value of MODEL_PACKAGE_GROUP_NAME in the previous cell.") + else: + raise + +# Create Dataset +# Register dataset in SageMaker AI Registry. This creates a versioned dataset that can be referenced by ARN +dataset = DataSet.create( + name=MODEL_PACKAGE_GROUP_NAME, + source=TRAINING_DATA_S3, + wait=True +) + +TRAINING_DATASET_ARN = dataset.arn +print(f"Here is your model package group ARN: {model_package_group.model_package_group_arn}\n") +print(f"Here is your training dataset ARN: {dataset.arn}") +``` + +--- + +## Cell 4: Configure Trainer + +```python +from sagemaker.train.dpo_trainer import DPOTrainer +from sagemaker.train.common import TrainingType + +trainer = DPOTrainer( + model=BASE_MODEL, + training_type=TrainingType.LORA, + model_package_group=model_package_group, + training_dataset=TRAINING_DATASET_ARN, + s3_output_path=S3_OUTPUT_PATH, + sagemaker_session=sagemaker_session, + accept_eula=ACCEPT_EULA, + role=ROLE_ARN +) +print ("Here are the recommended hyperparameters for the current training job:") +print(f"Batch size: {trainer.hyperparameters.global_batch_size}") +print(f"Number of epochs: 
{trainer.hyperparameters.max_epochs}")
+print(f"Learning rate: {trainer.hyperparameters.learning_rate}")
+print(f"Learning rate warmup ratio: {trainer.hyperparameters.lr_warmup_ratio}")
+print(f"Adam Beta: {trainer.hyperparameters.adam_beta}")
+
+# To change a hyperparameter, uncomment its corresponding line and set the value you want.
+# Note: You might get an error if the value you choose is not supported for your model.
+# If that happens, simply choose from the allowed range that's indicated in the error.
+
+# Uncomment the following line to change the learning rate
+# trainer.hyperparameters.learning_rate = 0.0002
+
+# Uncomment the following line to change the batch size
+# trainer.hyperparameters.global_batch_size = 16
+
+# Uncomment the following line to change the number of epochs
+# trainer.hyperparameters.max_epochs = 5
+
+# Uncomment the following line to change the learning rate warmup ratio
+# trainer.hyperparameters.lr_warmup_ratio = 0.05
+
+# Uncomment the following line to change Adam Beta
+# trainer.hyperparameters.adam_beta = 0.01
+```
+
+---
+
+## Cell 5: Start Training
+
+```python
+# Start training
+training_job = trainer.train(wait=True)
+
+print(f"Training Job Name: {training_job.training_job_name}")
+print(f"Training Status: {training_job.training_job_status}")
+```
+
+---
+
+## Cell 6: Plot and Display Metrics
+
+```python
+import matplotlib.pyplot as plt
+import mlflow
+from mlflow.tracking import MlflowClient
+
+run_id = training_job.mlflow_details.mlflow_run_id
+mlflow.set_tracking_uri(training_job.mlflow_config.mlflow_resource_arn)
+client = MlflowClient()
+
+metrics = ["loss_per_batch", "rewards/chosen", "rewards/rejected", "rewards/margins", "acc_per_batch"]
+fig, axes = plt.subplots(1, len(metrics), figsize=(4 * len(metrics), 3))
+for idx, metric in enumerate(metrics):
+    history = client.get_metric_history(run_id, metric)
+    axes[idx].plot([h.step for h in history], [h.value for h in history], linewidth=2, marker='o', markersize=4)
+    axes[idx].set_xlabel('Step')
+    axes[idx].set_ylabel(metric.split('/')[-1])
+    axes[idx].set_title(metric, fontweight='bold')
+    axes[idx].grid(True, alpha=0.3)
+
+plt.suptitle(f'Training Metrics: {training_job.training_job_name}', fontweight='bold')
+plt.tight_layout()
+plt.show()
+```
diff --git a/plugins/sagemaker-ai/skills/finetuning/references/eula_links.md b/plugins/sagemaker-ai/skills/finetuning/references/eula_links.md
new file mode 100644
index 00000000..52c33b6b
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning/references/eula_links.md
@@ -0,0 +1,19 @@
+# Model License Information
+
+| SageMaker Hub Model ID                       | Model Name                   | License URL(s)                                                                                                                     |
+| -------------------------------------------- | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| `huggingface-reasoning-qwen3-8b`             | Qwen3-8B                     | https://huggingface.co/Qwen/Qwen3-8B/blob/main/LICENSE                                                                             |
+| `huggingface-reasoning-qwen3-32b`            | Qwen3-32B                    | https://huggingface.co/Qwen/Qwen3-32B/blob/main/LICENSE                                                                            |
+| `huggingface-reasoning-qwen3-06b`            | Qwen3-0.6B                   | https://huggingface.co/Qwen/Qwen3-0.6B/blob/main/LICENSE                                                                           |
+| `huggingface-llm-qwen2-5-7b-instruct`        | Qwen2.5-7B-Instruct          | https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/LICENSE                                                                  |
+| `huggingface-llm-qwen2-5-32b-instruct`       | Qwen2.5-32B-Instruct         | https://huggingface.co/Qwen/Qwen2.5-32B-Instruct/blob/main/LICENSE                                                                 |
+| `deepseek-llm-r1-distill-qwen-32b`           | DeepSeek-R1-Distill-Qwen-32B | https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/blob/main/LICENSE                                                  |
+| `openai-reasoning-gpt-oss-20b`               | GPT-OSS-20B                  | https://huggingface.co/openai/gpt-oss-20b/blob/main/LICENSE<br>
https://huggingface.co/openai/gpt-oss-20b/blob/main/USAGE_POLICY |
+| `openai-reasoning-gpt-oss-120b`              | GPT-OSS-120B                 | https://huggingface.co/openai/gpt-oss-120b/blob/main/LICENSE<br>
https://huggingface.co/openai/gpt-oss-120b/blob/main/USAGE_POLICY | +| `nova-textgeneration-pro` | Amazon Nova Pro | https://aws.amazon.com/service-terms/ | +| `nova-textgeneration-micro` | Amazon Nova Micro | https://aws.amazon.com/service-terms/ | +| `nova-textgeneration-lite` | Amazon Nova Lite | https://aws.amazon.com/service-terms/ | +| `nova-textgeneration-lite-v2` | Amazon Nova Lite v2 | https://aws.amazon.com/service-terms/ | +| `meta-textgeneration-llama-3-3-70b-instruct` | Llama 3.3 70B Instruct | https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/LICENSE | +| `meta-textgeneration-llama-3-2-1b-instruct` | Llama 3.2 1B Instruct | https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/LICENSE.txt | +| `meta-textgeneration-llama-3-1-8b-instruct` | Llama 3.1 8B Instruct | https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/LICENSE | diff --git a/plugins/sagemaker-ai/skills/finetuning/references/rlvr_example.md b/plugins/sagemaker-ai/skills/finetuning/references/rlvr_example.md new file mode 100644 index 00000000..2c983436 --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning/references/rlvr_example.md @@ -0,0 +1,169 @@ +# RLVR (Reinforcement Learning from Verifiable Rewards) Notebook Template + +This template provides the complete cell structure for an RLVR finetuning notebook. 
+ +--- + +## Cell 1: Install Dependencies + +```python +!pip install 'sagemaker>=3.7.0,<4.0' boto3 -q +``` + +--- + +## Cell 2: Setup & Credentials + +```python +import os +import boto3 +from sagemaker.ai_registry.dataset_utils import CustomizationTechnique +from botocore.exceptions import ClientError +from sagemaker.ai_registry.dataset import DataSet +from sagemaker.core.resources import ModelPackageGroup +from sagemaker.core.helper.session_helper import Session, get_execution_role +from sagemaker.core import Attribution, set_attribution + +set_attribution(Attribution.SAGEMAKER_AGENT_PLUGIN) + +# Setup +sm_client = boto3.Session().client("sagemaker") +sagemaker_session = Session(sagemaker_client=sm_client) +bucket = sagemaker_session.default_bucket() + +# Configuration - USER please fill in these fields with your information: + +BASE_MODEL = "" # e.g., "meta-textgeneration-llama-3-8b" +TRAINING_DATA_S3 = "" # S3 path +S3_OUTPUT_PATH = f"s3://{bucket}/finetuning-output/" +ROLE_ARN = get_execution_role() # You can change this to a specific role. 
+ACCEPT_EULA = False # Set to True to accept the base model's End-User License Agreement +MODEL_PACKAGE_GROUP_NAME = "" # Auto-generated based on use case +CUSTOM_REWARD_FUNCTION = "" # Reward Function ARN +``` + +--- + +## Cell 3: Create Dataset and Model Package Group + +```python +# Create Model Package Group +try: + model_package_group = ModelPackageGroup.create( + model_package_group_name=MODEL_PACKAGE_GROUP_NAME, + model_package_group_description="", + ) + print(f"Created new model package group named {MODEL_PACKAGE_GROUP_NAME}") +except ClientError as e: + if e.response['Error']['Code'] in ('ResourceInUse', 'ValidationException'): + model_package_group = ModelPackageGroup.get(model_package_group_name=MODEL_PACKAGE_GROUP_NAME) + print(f"There is already a model package group with the name {MODEL_PACKAGE_GROUP_NAME}.\nIf you want to save your finetuned model under a different name, change the value of MODEL_PACKAGE_GROUP_NAME in the previous cell.") + else: + raise + +# Create Dataset +# Register dataset in SageMaker AI Registry. 
This creates a versioned dataset that can be referenced by ARN +dataset = DataSet.create( + name=MODEL_PACKAGE_GROUP_NAME, + source=TRAINING_DATA_S3, + wait=True +) +TRAINING_DATASET_ARN = dataset.arn + +print(f"Here is your model package group ARN: {model_package_group.model_package_group_arn}\n") +print(f"Here is your training dataset ARN: {dataset.arn}") +``` + +--- + +## Cell 4: Configure Trainer + +```python +from sagemaker.train.rlvr_trainer import RLVRTrainer +from sagemaker.train.common import TrainingType + + +trainer = RLVRTrainer( + model=BASE_MODEL, + model_package_group=model_package_group, + training_dataset=TRAINING_DATASET_ARN, + s3_output_path=S3_OUTPUT_PATH, + sagemaker_session=sagemaker_session, + accept_eula=ACCEPT_EULA, + role=ROLE_ARN, + custom_reward_function=CUSTOM_REWARD_FUNCTION +) +print ("Here are the recommended hyperparameters for the current training job:") +print(f"Batch size: {trainer.hyperparameters.global_batch_size}") +print(f"Number of epochs: {trainer.hyperparameters.max_epochs}") +print(f"Learning rate: {trainer.hyperparameters.learning_rate}") + +# To change a hyperparameter, uncomment its corresponding line and set the value you want. +# Note: You might get an error if the value you choose is not supported for your model. +# If that happens, simply choose from the allowed range that's indicated in the error. 
+ +# Uncomment the following line to change the learning rate +# trainer.hyperparameters.learning_rate = 0.0002 + +# Uncomment the following line to change the batch size +# trainer.hyperparameters.global_batch_size = 16 + +# Uncomment the following line to change the number of epochs +# trainer.hyperparameters.max_epochs = 5 + +# Uncomment the following line to change Adam Beta +# trainer.hyperparameters.adam_beta = 0.01 +``` + +--- + +## Cell 5: Start Training + +```python +# Start training +training_job = trainer.train(wait=True) + +print(f"Training Job Name: {training_job.training_job_name}") +print(f"Training Status: {training_job.training_job_status}") +``` + +--- + +## Cell 6: Plot and Display Metrics + +```python +import matplotlib.pyplot as plt +import mlflow +from mlflow.tracking import MlflowClient + +run_id = training_job.mlflow_details.mlflow_run_id +mlflow.set_tracking_uri(training_job.mlflow_config.mlflow_resource_arn) +client = MlflowClient() + +# Core RL metrics - adjust val-core metric names based on your data source and reward function +metrics = [ +"critic/rewards/mean", +"response_length/mean", +"actor/entropy_loss", +"actor/grad_norm", +"critic/advantages/mean", +] +# Note: Validation reward metrics follow the pattern: val-core/{data_source}/reward(/acc)/mean@{k} +# Add your specific val-core metrics to the list above, e.g.: +# "val-core/my_dataset/reward/mean@1" +# ResponseQuality: Verl allows printing to a file. Check training job output for details. 
+
+fig, axes = plt.subplots(1, len(metrics), figsize=(4 * len(metrics), 3))
+for idx, metric in enumerate(metrics):
+    history = client.get_metric_history(run_id, metric)
+    if history:
+        axes[idx].plot([h.step for h in history], [h.value for h in history], linewidth=2, marker='o', markersize=4)
+    axes[idx].set_xlabel('Step')
+    axes[idx].set_ylabel(metric.split('/')[-1])
+    axes[idx].set_title(metric, fontweight='bold')
+    axes[idx].grid(True, alpha=0.3)
+
+plt.suptitle(f'Training Metrics: {training_job.training_job_name}', fontweight='bold')
+plt.tight_layout()
+plt.show()
+```
diff --git a/plugins/sagemaker-ai/skills/finetuning/references/rlvr_reward_function.md b/plugins/sagemaker-ai/skills/finetuning/references/rlvr_reward_function.md
new file mode 100644
index 00000000..8668bc6b
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning/references/rlvr_reward_function.md
@@ -0,0 +1,43 @@
+# RLVR Lambda Reward Function Guide
+
+## What is a Lambda Reward Function?
+
+For RLVR training, a Lambda reward function is an AWS Lambda that evaluates model outputs during training and returns
+numerical rewards. SageMaker calls this Lambda in the training loop to provide learning signals.
+
+## Helping Users Create Lambda Functions
+
+### Step 1: Copy Template to Project
+
+Copy the template file `templates/rlvr_reward_function_source_template.py` as `lambda_function.py` into the project's scripts directory.
+
+- Read the `directory-management` skill to determine the correct directory for storing scripts.
+
+### Step 2: Generate Notebook Cell
+
+Create a single notebook cell that registers the local file as a SageMaker Hub Evaluator. Set `reward_function_path` to the path where `lambda_function.py` was saved in Step 1.
+ +```python +from sagemaker.ai_registry.evaluator import Evaluator + +reward_function_path = "" # Path to lambda_function.py from Step 1 + +evaluator = Evaluator.create( + name="[GENERATE A NAME FOR THE EVALUATOR HERE]", + type="RewardFunction", + source=reward_function_path, +) +print(f"Reward Function ARN: {evaluator.arn}") +``` + +Remember to set an appropriate name for the Evaluator by yourself in the above code, based on the use case and the current context. + +- Format: lowercase, alphanumeric with hyphens only, 1-20 characters +- Pattern: `[a-zA-Z0-9](-*[a-zA-Z0-9]){0,20}` + +### Step 3: Inform User About TODOs + +After copying the template and generating the notebook cell, inform the user that `lambda_function.py` contains `TODO` sections that they +must customize for their use case before running the notebook. The sections that need customization include helper functions, +reward logic, input parsing, score computation, and the return statement. Direct the user to edit `lambda_function.py` directly. +Wait for the user's acknowledgment before proceeding. diff --git a/plugins/sagemaker-ai/skills/finetuning/references/sft_example.md b/plugins/sagemaker-ai/skills/finetuning/references/sft_example.md new file mode 100644 index 00000000..5695abde --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning/references/sft_example.md @@ -0,0 +1,159 @@ +# SFT (Supervised Fine-Tuning) Notebook Template + +This template provides the complete cell structure for an SFT finetuning notebook. 
+ +--- + +## Cell 1: Install Dependencies + +```python +!pip install 'sagemaker>=3.7.0,<4.0' boto3 -q +``` + +--- + +## Cell 2: Setup & Credentials + +```python +import os +import boto3 +from sagemaker.ai_registry.dataset_utils import CustomizationTechnique +from botocore.exceptions import ClientError +from sagemaker.ai_registry.dataset import DataSet +from sagemaker.core.resources import ModelPackageGroup +from sagemaker.core.helper.session_helper import Session, get_execution_role +from sagemaker.core import Attribution, set_attribution + +set_attribution(Attribution.SAGEMAKER_AGENT_PLUGIN) + +# Setup +sm_client = boto3.Session().client("sagemaker") +sagemaker_session = Session(sagemaker_client=sm_client) +bucket = sagemaker_session.default_bucket() + +# Configuration - USER please fill in these fields with your information: + +BASE_MODEL = "" # e.g., "meta-textgeneration-llama-3-8b" +TRAINING_DATA_S3 = "" # S3 path +S3_OUTPUT_PATH = f"s3://{bucket}/finetuning-output/" +ROLE_ARN = get_execution_role() # You can change this to a specific role. +ACCEPT_EULA = False # Set to True to accept the base model's End-User License Agreement +MODEL_PACKAGE_GROUP_NAME = "" # Auto-generated based on use case +``` + +--- + +## Cell 3: Create Dataset and Model Package Group + +```python +# Create Model Package Group +try: + model_package_group = ModelPackageGroup.create( + model_package_group_name=MODEL_PACKAGE_GROUP_NAME, + model_package_group_description="", + ) + print(f"Created new model package group named {MODEL_PACKAGE_GROUP_NAME}") +except ClientError as e: + if e.response['Error']['Code'] in ('ResourceInUse', 'ValidationException'): + model_package_group = ModelPackageGroup.get(model_package_group_name=MODEL_PACKAGE_GROUP_NAME) + print(f"There is already a model package group with the name {MODEL_PACKAGE_GROUP_NAME}. 
If you want to save your finetuned model under a different name, change the value of MODEL_PACKAGE_GROUP_NAME in the previous cell.") + else: + raise + +# Create Dataset +# Register dataset in SageMaker AI Registry. This creates a versioned dataset that can be referenced by ARN +dataset = DataSet.create( + name=MODEL_PACKAGE_GROUP_NAME, + source=TRAINING_DATA_S3, + wait=True +) + +TRAINING_DATASET_ARN = dataset.arn +print(f"Here is your model package group ARN: {model_package_group.model_package_group_arn}\n") +print(f"Here is your training dataset ARN: {dataset.arn}") +``` + +--- + +## Cell 4: Configure Trainer + +```python +from sagemaker.train.sft_trainer import SFTTrainer +from sagemaker.train.common import TrainingType + + +trainer = SFTTrainer( + model=BASE_MODEL, + training_type=TrainingType.LORA, + model_package_group=model_package_group, + training_dataset=TRAINING_DATASET_ARN, + s3_output_path=S3_OUTPUT_PATH, + sagemaker_session=sagemaker_session, + accept_eula=ACCEPT_EULA, + role=ROLE_ARN +) + +print ("Here are the recommended hyperparameters for the current training job:") +print(f"Batch size: {trainer.hyperparameters.global_batch_size}") +print(f"Number of epochs: {trainer.hyperparameters.max_epochs}") +print(f"Learning rate: {trainer.hyperparameters.learning_rate}") +print(f"Learning rate warmup ratio: {trainer.hyperparameters.lr_warmup_ratio}") + +# To change a hyperparameter, uncomment its corresponding line and set the value you want. +# Note: You might get an error if the value you choose is not supported for your model. +# If that happens, simply choose from the allowed range that's indicated in the error. 
+
+# Uncomment the following line to change the learning rate
+# trainer.hyperparameters.learning_rate = 0.0002
+
+# Uncomment the following line to change the batch size
+# trainer.hyperparameters.global_batch_size = 16
+
+# Uncomment the following line to change the number of epochs
+# trainer.hyperparameters.max_epochs = 5
+
+# Uncomment the following line to change the learning rate warmup ratio
+# trainer.hyperparameters.lr_warmup_ratio = 0.05
+
+# Uncomment the following line to change Adam Beta
+# trainer.hyperparameters.adam_beta = 0.01
+```
+
+---
+
+## Cell 5: Start Training
+
+```python
+# Start training
+training_job = trainer.train(wait=True)
+
+print(f"Training Job Name: {training_job.training_job_name}")
+print(f"Training Status: {training_job.training_job_status}")
+```
+
+---
+
+## Cell 6: Plot and Display Metrics
+
+```python
+import matplotlib.pyplot as plt
+import mlflow
+from mlflow.tracking import MlflowClient
+
+run_id = training_job.mlflow_details.mlflow_run_id
+mlflow.set_tracking_uri(training_job.mlflow_config.mlflow_resource_arn)
+client = MlflowClient()
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 3))
+for idx, metric in enumerate(["total_loss", "val_eval_total_loss"]):
+    history = client.get_metric_history(run_id, metric)
+    axes[idx].plot([h.step for h in history], [h.value for h in history], linewidth=2, marker='o', markersize=4)
+    axes[idx].set_xlabel('Step')
+    axes[idx].set_ylabel('Loss')
+    axes[idx].set_title(metric, fontweight='bold')
+    axes[idx].grid(True, alpha=0.3)
+
+plt.suptitle(f'Training Metrics: {training_job.training_job_name}', fontweight='bold')
+plt.tight_layout()
+plt.show()
+```
diff --git a/plugins/sagemaker-ai/skills/finetuning/templates/rlvr_reward_function_source_template.py b/plugins/sagemaker-ai/skills/finetuning/templates/rlvr_reward_function_source_template.py
new file mode 100644
index 00000000..32f17ea2
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning/templates/rlvr_reward_function_source_template.py
@@ -0,0 +1,250 @@
+"""
+Provide your custom reward function code below. Learn about the available libraries and templates that you can use
+at: https://docs.aws.amazon.com/sagemaker/latest/dg/customize-model.html.
+
+- You must add your evaluation logic in the reward_function() function
+- Do not remove the lambda_handler() function or modify its schema as it is required to create the reward function
+"""
+
+import json  # For JSON parsing - adjust imports based on your use case
+import re  # For pattern matching and validation
+from typing import Dict, Any, List, Optional  # For type hints
+# Add any other imports your use case requires
+
+# =========================================================================================
+# NOTE: INITIAL SUGGESTION ONLY - MUST BE CUSTOMIZED
+#
+# YOU MUST:
+# 1. Review and update each section per YOUR use case
+# 2. Customize the logic for YOUR SPECIFIC requirements
+# 3. Replace example values (field names, thresholds, etc.) with your actual values
+# 4. Test thoroughly before using
+#
+# DO NOT use this code as-is. It will not work until you review and customize it.
+# =========================================================================================
+
+
+# =========================================================================================
+# SECTION 1: Helper function 1
+# =========================================================================================
+# TODO: UPDATE or REMOVE the helper function as per YOUR use case
+# Note the below lines of code are examples and will not work for your use case
+# You MUST update them to match YOUR use case
+def extract_number(text: str) -> Optional[float]:
+    """
+    Extract numerical answer from text.
+    Looks for numbers after answer keywords, or returns the last number found.
+ + Args: + text: Text containing a numerical answer + + Returns: + Extracted number as float, or None if no number found + """ + if not text: + return None + + # Try to find numbers after common answer keywords + answer_patterns = [ + r'(?:equals|is|answer is|result is|=)\s*(-?\d+\.?\d*)', + r'(?:answer|result|solution):\s*(-?\d+\.?\d*)', + ] + + for pattern in answer_patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + try: + return float(match.group(1)) + except ValueError: + pass + + # Fallback: find all numbers and return the last one (likely the answer) + pattern = r'-?\d+\.?\d*' + matches = re.findall(pattern, text) + + if matches: + try: + return float(matches[-1]) # Return last number instead of first + except ValueError: + return None + + return None + +# ========================================================================================= +# SECTION 2: Helper function 2 +# ========================================================================================= +# TODO: UPDATE or REMOVE the helper function as per YOUR use case +# Note the below lines of code are examples and will not work for your use case +# You MUST update them to match YOUR use case +def compute_reasoning_quality(response: str) -> float: + """ + Compute reasoning quality score based on response characteristics. + This is a simple heuristic - customize based on your needs. 
+ + Args: + response: The model's response text + + Returns: + Quality score between 0.0 and 1.0 + """ + if not response: + return 0.0 + + score = 0.0 + + # Check for reasoning indicators (customize these for your use case) + reasoning_indicators = [ + 'because', 'therefore', 'thus', 'since', 'so', + 'first', 'second', 'then', 'finally', + 'step', 'calculate', 'compute', 'equals' + ] + + response_lower = response.lower() + + # Award points for reasoning indicators (max 0.55) + indicator_count = sum(1 for indicator in reasoning_indicators if indicator in response_lower) + score += min(indicator_count * 0.11, 0.55) + + # Award points for response length (indicates detailed reasoning, max 0.25) + if len(response) > 30: + score += 0.05 + if len(response) > 60: + score += 0.1 + if len(response) > 120: + score += 0.1 + + # Award points for structured response (max 0.2) + if '\n' in response or '.' in response: + score += 0.2 + + return min(score, 1.0) + +# ========================================================================================= +# SECTION 3: Sample reward function +# ========================================================================================= +# TODO: UPDATE or REMOVE the reward function as per YOUR use case +# Note the below lines of code are examples and will not work for your use case +# You MUST update them to match YOUR use case +def reward_function(sample: Dict[str, Any], index: int) -> Dict[str, Any]: + """ + Args: + sample: Dictionary containing messages and reference_answer + index: Sample index in batch + + Returns: + Dictionary with reward scores and metrics + """ + # ======================================================================== + # SECTION 4: Parse input + # ======================================================================== + # TODO: UPDATE logic to parse the input as per YOUR use case + # Note the below lines of code are examples and will not work for your use case + # You MUST update them to match YOUR use case 
+ # Extract the response and reference + messages = sample.get('messages', []) + reference_answer = sample.get('reference_answer', {}).get('text', '') + + # Get the question and assistant's response + question = "" + response = "" + for msg in messages: + if msg.get('role') == 'user': + question = msg.get('content', '') + elif msg.get('role') == 'assistant': + response = msg.get('content', '') + + # Extract numerical answers + predicted = extract_number(response) + expected = extract_number(reference_answer) + + # Compute metrics + exact_match = 0.0 + answer_present = 0.0 + reasoning_quality = compute_reasoning_quality(response) + + if predicted is not None and expected is not None: + exact_match = 1.0 if abs(predicted - expected) < 1e-6 else 0.0 + answer_present = 1.0 + + # ======================================================================== + # SECTION 5: Compute reward scores + # ======================================================================== + # TODO: UPDATE logic to compute aggregate score + # Note the below lines of code are examples and will not work for your use case + # You MUST update them to match YOUR use case + # Aggregate reward computation + aggregate_reward = 0.7 * exact_match + 0.3 * reasoning_quality + + # ======================================================================== + # SECTION 6: Form the metrics list + # ======================================================================== + # TODO: UPDATE logic to compute metrics list + # Note the below lines of code are examples and will not work for your use case + # You MUST update them to match YOUR use case + metrics = [ + { + 'name': 'exact_match', + 'value': float(exact_match), + 'type': 'Reward' + }, + { + 'name': 'answer_present', + 'value': float(answer_present), + 'type': 'Metric' + }, + { + 'name': 'reasoning_quality', + 'value': float(reasoning_quality), + 'type': 'Metric' + } + ] + + # ======================================================================== + # SECTION 
7: Return output
+    # ========================================================================
+    # TODO: UPDATE the return statement to return YOUR output
+    # UPDATE the key before creating the evaluator
+    # Note the below lines of code are examples and will not work for your use case
+    # You MUST update them to match YOUR use case
+
+    return {
+        'id': str(sample.get('my_key', f'sample-{index:03d}')),  # Use formatted index as fallback
+        'aggregate_reward_score': float(aggregate_reward),
+        'metrics_list': metrics
+    }
+
+def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
+    """
+    AWS Lambda Handler for reward function
+    """
+    try:
+        # Extract batch from event
+        batch = event.get('input', event) if isinstance(event, dict) else event
+        if 'batch' in event:
+            batch = event.get('batch', [])
+        elif 'body' in event:
+            body = json.loads(event.get('body', '{}'))
+            batch = body.get('batch', [])
+
+        if not batch:
+            return {"error":"Missing or empty batch"}
+
+        # Process each sample
+        results = []
+        for i, sample in enumerate(batch):
+            try:
+                result = reward_function(sample, i)
+                results.append(result)
+            except Exception as e:
+                return {"error": str(e)}
+
+        return {
+            'statusCode': 200,
+            'headers': {'Content-Type': 'application/json'},
+            'body': json.dumps(results)
+        }
+    except Exception as e:
+        return {
+            'statusCode': 400,
+            'body': json.dumps({"error": str(e)})
+        }
diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md
new file mode 100755
index 00000000..b15fd4c4
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md
@@ -0,0 +1,74 @@
+---
+name: hyperpod-issue-report
+description: Generate comprehensive issue reports from HyperPod clusters (EKS and Slurm) by collecting diagnostic logs and configurations for troubleshooting and AWS Support cases.
Use when users need to collect diagnostics from HyperPod cluster nodes, generate issue reports for AWS Support, investigate node failures or performance problems, document cluster state, or create diagnostic snapshots. Triggers on requests involving issue reports, diagnostic collection, support case preparation, or cluster troubleshooting that requires gathering logs and system information from multiple nodes.
+---
+
+# HyperPod Issue Report
+
+Collect diagnostic logs from HyperPod cluster nodes via SSM, store results in S3. Supports both EKS and Slurm clusters with auto-detection. Uses the bundled `scripts/hyperpod_issue_report.py` for reliable parallel collection.
+
+## Prerequisites
+
+- AWS CLI configured with permissions: `sagemaker:DescribeCluster`, `sagemaker:ListClusterNodes`, `ssm:StartSession`, `s3:PutObject`, `s3:GetObject`, `eks:DescribeCluster`
+- Python 3.8+ with `pip install -r scripts/requirements.txt`
+- SSM Agent running on target nodes; node IAM roles need `s3:GetObject`/`s3:PutObject` on the report bucket
+- For EKS clusters: kubectl installed and configured (see Workflow step 2)
+
+## Workflow
+
+### 1. Gather Information
+
+Collect from the user:
+
+- **Cluster identifier** (required): accepts cluster name or full cluster ARN (e.g., `arn:aws:sagemaker:us-west-2:123456789012:cluster/abc123`)
+- **AWS region** (required unless extractable from ARN)
+- **S3 path** for report storage (required, e.g. `s3://bucket/prefix`). If the user doesn't have a bucket, create one (e.g., `s3://hyperpod-diagnostics-<account-id>-<region>`)
+- **Issue description** (optional)
+- **Target scope**: all nodes, specific instance groups, or specific node IDs (optional)
+- **Additional commands** to run on nodes (optional)
+
+### 2.
Verify Environment
+
+```bash
+aws sts get-caller-identity
+aws sagemaker describe-cluster --cluster-name <cluster-name> --region <region>
+pip install -r scripts/requirements.txt
+```
+
+If the S3 bucket doesn't exist, create it:
+
+```bash
+aws s3 mb s3://<bucket-name> --region <region>
+```
+
+**For EKS clusters** (check `Orchestrator.Eks` in describe-cluster output):
+
+1. Ensure kubectl is installed (`which kubectl`). If missing, install it for the current platform.
+2. Configure kubeconfig using the EKS cluster name from the describe-cluster response:
+
+   ```bash
+   aws eks update-kubeconfig --name <eks-cluster-name> --region <region>
+   ```
+
+### 3. Run the Collection Script
+
+```bash
+python scripts/hyperpod_issue_report.py \
+  --cluster <cluster-name-or-arn> \
+  --region <region> \
+  --s3-path s3://<bucket>[/prefix]
+```
+
+Use `--help` for all options including `--instance-groups`, `--nodes`, `--command`, `--max-workers`, and `--debug`. Note: `--instance-groups` and `--nodes` are mutually exclusive. Node identifiers accept instance IDs (`i-*`), EKS names (`hyperpod-i-*`), or Slurm names (`ip-*`).
+
+### 4. Present Results
+
+After collection, the script shows statistics and offers interactive download. Report the S3 location and offer to:
+
+- Download the report locally
+- Help analyze collected diagnostics (see [references/collection-details.md](references/collection-details.md) for what's in each file)
+- Prepare a summary for AWS Support
+
+## Troubleshooting
+
+See [references/troubleshooting.md](references/troubleshooting.md) for error handling, large cluster tuning, and known limitations.
diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/collection-details.md b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/collection-details.md new file mode 100755 index 00000000..0e7c4505 --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/collection-details.md @@ -0,0 +1,105 @@ +# Collection Details + +## What Gets Collected + +### Common (Both EKS and Slurm) + +- `nvidia_smi.txt` — GPU status, utilization, memory, temperature +- `resource_config.json` — HyperPod resource config from `/opt/ml/config/resource_config.json` +- `cluster_logs/` — Contents of `/var/log/aws/clusters/` +- `systemd_services.txt` — All systemd service statuses +- `disk_usage.txt` — `df` output +- `hostname.txt`, `instance_group.txt`, `instance_id.txt`, `cluster_type.txt`, `timestamp.txt` + +### EKS-Specific (Per-Node) + +- `containerd_status.txt` — `systemctl status containerd` +- `kubelet_status.txt` — `systemctl status kubelet` +- `eks-log-collector-output.txt` — EKS log collector execution log +- `eks-logs/` — EKS log collector output subdirectories: + - `cni/` — CNI plugin logs and config + - `containerd/` — Runtime logs, config, version, images, containers, tasks, plugins + - `docker/` — Docker logs (if present) + - `gpu/` — GPU diagnostics + - `ipamd/` — AWS VPC CNI IPAMD logs + - `kernel/` — dmesg output, uname info + - `kubelet/` — Kubelet logs and config + - `modinfo/` — Kernel module info (lustre, ip_vs, etc.) 
+ - `networking/` — Network config, iptables, routes, interfaces + - `nodeadm/` — Node administration logs + - `sandbox-image/` — Sandbox image info + - `storage/` — Mounts, inodes, lsblk, LVM, fstab, XFS, pod local storage + - `sysctls/` — Kernel parameters + - `system/` — Services, systemd-analyze, top, ps, netstat, CPU/IO throttling + - `var_log/` — System logs from /var/log + +### EKS-Specific (kubectl — Collected Locally) + +Packaged as `kubectl_resources.tar.gz`, collected from the local machine (not from nodes). + +**High Priority:** + +- `nodes_describe.txt` — Detailed node descriptions (capacity, conditions, running pods) +- `pods_all_namespaces.txt` / `pods_describe_all_namespaces.txt` — All pods with details +- `events_all_namespaces.txt` — Cluster events sorted by timestamp +- `pvcs_all_namespaces.txt` / `pvcs_describe_all_namespaces.txt` — PersistentVolumeClaims +- `services_all_namespaces.txt` / `services_describe_all_namespaces.txt` — Network endpoints + +**Medium Priority:** + +- `deployments_all_namespaces.txt`, `statefulsets_all_namespaces.txt`, `daemonsets_all_namespaces.txt` +- `configmaps_all_namespaces.txt`, `secrets_all_namespaces.txt` (metadata only) +- `resourcequotas_all_namespaces.txt`, `networkpolicies_all_namespaces.txt` + +### Slurm-Specific + +- `sinfo.txt` — Node and partition information +- `sinfo_R.txt` — Reasons for node down/drain states +- `slurmctld_status.txt` — Slurm controller daemon status +- `slurmd_status.txt` — Slurm compute node daemon status +- `opt_slurm_etc/` — Slurm configuration from `/opt/slurm/etc/` +- `nvidia-bug-report.log.gz` — NVIDIA bug report (compressed) +- `syslog`, `kern.log` — System logs +- `dmesg_T.txt` — Kernel ring buffer with timestamps +- `var_log_slurm/` — Slurm logs from `/var/log/slurm/` + +### Custom Commands + +User-specified commands are saved as `command_01_.txt`, `command_02_...`, etc. 
+ +## Report Output Structure + +``` +s3://bucket/prefix/cluster-name/YYYYMMDD_HHMMSS/ +├── collector_script.sh +├── summary.json +├── kubectl_resources.tar.gz # EKS only +└── instances/ + ├── worker1_i-abc123.tar.gz + └── worker2_i-abc124.tar.gz +``` + +Tarball filename format: `{instance-group}_{instance-id}.tar.gz` + +## Summary JSON Format + +```json +{ + "cluster_name": "my-cluster", + "cluster_id": "abc123", + "report_id": "20260126_143022", + "timestamp": "2026-01-26T14:30:22.123456", + "total_nodes": 8, + "successful": 7, + "failed": 1, + "results": [ + { + "InstanceId": "i-0123456789abcdef0", + "NodeGroup": "worker-group", + "Success": true, + "Output": "...", + "ElapsedTime": 45.2 + } + ] +} +``` diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md new file mode 100755 index 00000000..9ab32540 --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md @@ -0,0 +1,22 @@ +# Troubleshooting + +## Error Handling + +| Issue | Cause | Fix | +| --------------------------------------------- | ------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `kubectl not found in PATH` | kubectl not installed | Install kubectl for the current platform, then re-run | +| `kubectl must be configured for EKS clusters` | kubectl missing or wrong context | Run `aws eks update-kubeconfig --name --region `. Get the EKS cluster name from `aws sagemaker describe-cluster` output (`Orchestrator.Eks.ClusterArn`) | +| Cluster name from ARN not found | ARN contains cluster ID, not name | Pass the full ARN to `--cluster` instead of extracting the ID portion. 
Alternatively, use `aws sagemaker list-clusters` to find the cluster name | +| No instance reports in S3 | Node IAM role missing S3 permissions | Add `s3:GetObject`/`s3:PutObject` to node role for the report bucket | +| SSM connectivity failed | SSM agent down, missing IAM, or network | Check `systemctl status amazon-ssm-agent`, verify `AmazonSSMManagedInstanceCore` policy | +| "Failed to detect shell prompt" | Custom SSM session config (custom `.bashrc`, SSM preferences) | Not compatible without modifying prompt detection; use manual SSM sessions as workaround | +| SSM throttling | Too many concurrent sessions | Reduce `--max-workers`; automatic retry handles transient throttling | +| Nodes unresponsive | Node completely down | Noted in report; other nodes' diagnostics may reveal pattern | +| EKS log collector fails | Script download or execution error | Check `eks-log-collector-output.txt`; verify disk space in `/var/log/` and `/tmp/` | + +## Large Cluster Handling + +- Default `--max-workers 16` tested up to 130 nodes (99.2% success rate, ~15 min) +- If throttled (`ThrottlingException`): reduce to `--max-workers 8` +- For 200+ nodes: batch by instance group or increase to `--max-workers 32` if no throttling +- kubectl collection may take 20-30 minutes for 1000+ node clusters diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/hyperpod_issue_report.py b/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/hyperpod_issue_report.py new file mode 100755 index 00000000..e68f2f28 --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/hyperpod_issue_report.py @@ -0,0 +1,1430 @@ +#!/usr/bin/env python3 +""" +HyperPod Issue Report Collector + +Collects diagnostic logs and configurations from multiple HyperPod nodes. +Supports both HyperPod EKS and HyperPod Slurm clusters. +Uses hyperpod_run_on_multi_nodes mechanism to execute collection scripts on nodes. +Downloads collection script from S3 and uploads results back to S3. 
+""" + +import argparse +import boto3 +import json +import os +import platform +import pexpect +import shutil +import signal +import subprocess # nosec B404 - required for kubectl CLI commands +import sys +import tarfile +import tempfile +import time +import traceback +import zipfile +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timezone +from typing import List, Dict, Optional + + +# ============================================================================ +# TIMEOUT CONFIGURATION +# ============================================================================ +# These timeouts are calibrated for large clusters (tested up to 130 nodes). +# Adjust these values if you experience timeouts with larger clusters. +# +# Test results (130-node cluster): +# - kubectl commands: 1-26s (longest: kubectl describe pods) +# - SSM node collection: 31-48s per node +# ============================================================================ + +# SSM session timeouts (seconds) +# These are passed explicitly to each pexpect expect() call +SSM_SCRIPT_EXECUTION_TIMEOUT = 900 # 15 minutes - script execution on nodes +SSM_PROMPT_TIMEOUT = 60 # 60 seconds - prompt detection and setup + +# kubectl command timeout (seconds) +KUBECTL_TIMEOUT = 600 # 10 minutes - all kubectl operations + + +class HyperPodIssueReportCollector: + def __init__(self, cluster_name: str, s3_path: str, region: Optional[str] = None, debug: bool = False): + self.cluster_name = cluster_name + self.debug = debug + + # Parse S3 path + self.s3_bucket, self.s3_prefix = self.parse_s3_path(s3_path) + + # Configure boto3 clients with optional region + client_kwargs = {} + if region: + client_kwargs['region_name'] = region + + self.sagemaker_client = boto3.client('sagemaker', **client_kwargs) + self.s3_client = boto3.client('s3', **client_kwargs) + self.eks_client = boto3.client('eks', **client_kwargs) + self.region = region + + self.cluster_arn = None + self.cluster_id = 
None + self.cluster_type = None # 'eks' or 'slurm' + self.eks_cluster_arn = None + self.eks_cluster_name = None + self.nodes = [] + + # Generate unique report ID using UTC time + self.report_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + self.report_s3_key = f"{self.s3_prefix}/{cluster_name}/{self.report_id}" + + def parse_s3_path(self, s3_path: str) -> tuple: + """Parse S3 path into bucket and prefix. + + Accepts formats: + - s3://bucket-name/prefix/path + - s3://bucket-name + """ + s3_path = s3_path.strip() + + # Require s3:// prefix + if not s3_path.startswith('s3://'): + raise ValueError( + f"S3 path must start with 's3://' prefix.\n" + f"Received: {s3_path}\n" + f"Expected format: s3://bucket-name or s3://bucket-name/custom-prefix" + ) + + # Remove s3:// prefix + s3_path = s3_path[5:] + + # Split into bucket and prefix + parts = s3_path.split('/', 1) + bucket = parts[0] + prefix = parts[1].rstrip('/') if len(parts) > 1 else 'hyperpod-issue-reports' + + return bucket, prefix + + def extract_cluster_id_from_arn(self, cluster_arn: str) -> str: + """Extract cluster ID from cluster ARN.""" + if cluster_arn: + if '/cluster/' in cluster_arn: + return cluster_arn.split('/cluster/')[-1] + elif ':cluster/' in cluster_arn: + return cluster_arn.split(':cluster/')[-1] + parts = cluster_arn.split(':') + if len(parts) > 0: + return parts[-1] + return None + + def get_slurm_node_name(self, instance_id: str) -> Optional[str]: + """Get Slurm node name (e.g. 
ip-10-1-104-161) for a node via describe_cluster_node API.""" + try: + response = self.sagemaker_client.describe_cluster_node( + ClusterName=self.cluster_name, + NodeId=instance_id + ) + + # Extract private DNS name from NodeDetails + node_details = response.get('NodeDetails', {}) + private_dns = node_details.get('PrivateDnsHostname', '') + + # Private DNS format is like: ip-10-1-104-161.us-west-2.compute.internal + # Extract the IP part (ip-10-1-104-161) + if private_dns and private_dns.startswith('ip-'): + # Get the first part before the first dot + slurm_node_name = private_dns.split('.')[0] + return slurm_node_name + + return None + + except Exception as e: + if self.debug: + print(f"Warning: Could not get private IP for {instance_id}: {e}") + return None + + def get_cluster_nodes(self) -> List[Dict]: + """Get all nodes in the HyperPod cluster and detect cluster type.""" + try: + print(f"Describing cluster: {self.cluster_name}") + response = self.sagemaker_client.describe_cluster(ClusterName=self.cluster_name) + + print(f"Cluster status: {response.get('ClusterStatus', 'Unknown')}") + + # Detect cluster type from Orchestrator field + orchestrator = response.get('Orchestrator', {}) + + if 'Eks' in orchestrator: + self.cluster_type = 'eks' + print(f"Detected cluster type: EKS") + # Extract EKS cluster ARN + eks_config = orchestrator.get('Eks', {}) + self.eks_cluster_arn = eks_config.get('ClusterArn') + if self.eks_cluster_arn: + # Extract cluster name from ARN: arn:aws:eks:region:account:cluster/cluster-name + self.eks_cluster_name = self.eks_cluster_arn.split('/')[-1] + print(f"EKS Cluster ARN: {self.eks_cluster_arn}") + print(f"EKS Cluster Name: {self.eks_cluster_name}") + else: + print("Warning: Could not extract EKS cluster ARN from orchestrator config") + elif 'Slurm' in orchestrator: + self.cluster_type = 'slurm' + print(f"Detected cluster type: Slurm") + else: + # If Orchestrator field is missing or doesn't contain Eks/Slurm, assume Slurm + 
self.cluster_type = 'slurm' + print(f"Orchestrator field not found or unrecognized, assuming cluster type: Slurm") + + self.cluster_arn = response.get('ClusterArn') + self.cluster_id = self.extract_cluster_id_from_arn(self.cluster_arn) + print(f"Cluster ID: {self.cluster_id}") + + if not self.cluster_id: + print("Warning: Could not extract cluster ID from ARN") + return [] + + # List all nodes with pagination + instance_ids = [] + next_token = None + page_count = 0 + + while True: + page_count += 1 + print(f"Fetching nodes page {page_count}...") + + list_params = {'ClusterName': self.cluster_name} + if next_token: + list_params['NextToken'] = next_token + + nodes_response = self.sagemaker_client.list_cluster_nodes(**list_params) + + current_page_nodes = nodes_response.get('ClusterNodeSummaries', []) + print(f"Found {len(current_page_nodes)} nodes on page {page_count}") + + for node in current_page_nodes: + instance_id = node.get('InstanceId') + if instance_id: + instance_ids.append({ + 'InstanceId': instance_id, + 'NodeGroup': node.get('InstanceGroupName', 'unknown'), + 'InstanceType': node.get('InstanceType', 'unknown'), + 'InstanceStatus': node.get('InstanceStatus', {}).get('Status', 'unknown') + }) + + next_token = nodes_response.get('NextToken') + if not next_token: + break + + print(f"Total instances found: {len(instance_ids)}") + return instance_ids + + except Exception as e: + print(f"Error getting cluster nodes: {e}") + return [] + + def resolve_node_identifiers(self, node_identifiers: List[str]) -> List[str]: + """Resolve node identifiers to instance IDs. + + Supports multiple formats: + - Instance IDs: i-0123456789abcdef0 (EKS and Slurm) + - Slurm node names: ip-10-1-104-161 (Slurm only) + - EKS node names: hyperpod-i-0123456789abcdef0 (EKS only) + + Returns list of instance IDs. 
+ """ + if not node_identifiers: + return [] + + # Separate different identifier types + instance_ids = [] + slurm_node_names = [] + eks_node_names = [] + + for identifier in node_identifiers: + if identifier.startswith('i-'): + # This is an instance ID + instance_ids.append(identifier) + elif identifier.startswith('ip-'): + # This looks like a Slurm node name + slurm_node_names.append(identifier) + elif identifier.startswith('hyperpod-i-'): + # This looks like an EKS node name (hyperpod-i-*) + eks_node_names.append(identifier) + else: + # Unknown format, treat as instance ID and let validation fail later + instance_ids.append(identifier) + + # Resolve EKS node names if present + if eks_node_names: + if self.cluster_type == 'eks': + print(f"Resolving EKS node names to instance IDs...") + for eks_name in eks_node_names: + # Extract instance ID from hyperpod-i-* format + # Format: hyperpod-i-0123456789abcdef0 + if eks_name.startswith('hyperpod-'): + extracted_id = eks_name[9:] # Remove 'hyperpod-' prefix + if extracted_id.startswith('i-'): + instance_ids.append(extracted_id) + print(f" {eks_name} -> {extracted_id}") + else: + print(f" Warning: Invalid EKS node name format '{eks_name}' (expected hyperpod-i-*)") + else: + print(f" Warning: Invalid EKS node name format '{eks_name}'") + else: + print(f"Warning: EKS node names provided but cluster type is {self.cluster_type}") + print(f" EKS node names (hyperpod-i-*) are only supported for EKS clusters") + print(f" Ignoring: {', '.join(eks_node_names)}") + + # Resolve Slurm node names if present + if slurm_node_names: + if self.cluster_type == 'slurm': + print(f"Resolving Slurm node names to instance IDs...") + + # Build a mapping of Slurm node name to instance ID + slurm_to_instance = {} + + for node in self.nodes: + instance_id = node.get('InstanceId') + if instance_id: + slurm_name = self.get_slurm_node_name(instance_id) + if slurm_name: + slurm_to_instance[slurm_name] = instance_id + + # Resolve the requested Slurm 
    def generate_collector_script(self, commands: List[str]) -> str:
        """Generate the bash script that will run on each node.

        Instance group and ID are passed as environment variables.
        Script content varies based on cluster type (EKS vs Slurm).

        Args:
            commands: Extra shell commands to run on each node; each command's
                output is captured into its own numbered file.

        Returns:
            The complete bash script as a single newline-joined string. The
            script collects diagnostics into a per-node /tmp directory,
            tarballs it, and uploads it to s3://<bucket>/<report key>/instances/.
        """
        # Common preamble and collections that apply to every cluster type.
        script_lines = [
            "#!/bin/bash",
            "# HyperPod Issue Report Collector Script",
            "# Auto-generated script to collect diagnostic information",
            "# Expects INSTANCE_GROUP, INSTANCE_ID, and CLUSTER_TYPE environment variables",
            "",
            "# Note: We don't use 'set -e' because some commands (like grep) may return non-zero",
            "# exit codes even when they succeed (e.g., grep returns 1 when no matches found)",
            "",
            "# Validate required environment variables",
            "if [ -z \"${INSTANCE_GROUP}\" ] || [ -z \"${INSTANCE_ID}\" ] || [ -z \"${CLUSTER_TYPE}\" ]; then",
            " echo \"Error: INSTANCE_GROUP, INSTANCE_ID, and CLUSTER_TYPE environment variables are required\"",
            " exit 1",
            "fi",
            "",
            "# Instance identification",
            "TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)",
            "OUTPUT_DIR=\"/tmp/hyperpod_report_${INSTANCE_GROUP}_${INSTANCE_ID}_${TIMESTAMP}\"",
            "",
            "echo \"Creating output directory: ${OUTPUT_DIR}\"",
            "mkdir -p \"${OUTPUT_DIR}\"",
            "if [ $? -ne 0 ]; then",
            " echo \"ERROR: Failed to create output directory\"",
            " exit 1",
            "fi",
            "",
            "# Collect system information",
            "echo \"Collecting system information...\"",
            "echo \"${INSTANCE_GROUP}\" > \"${OUTPUT_DIR}/instance_group.txt\"",
            "echo \"${INSTANCE_ID}\" > \"${OUTPUT_DIR}/instance_id.txt\"",
            "echo \"${CLUSTER_TYPE}\" > \"${OUTPUT_DIR}/cluster_type.txt\"",
            "hostname > \"${OUTPUT_DIR}/hostname.txt\"",
            "date -u > \"${OUTPUT_DIR}/timestamp.txt\"",
            "",
            "# Collect HyperPod resource config if available",
            "if [ -f /opt/ml/config/resource_config.json ]; then",
            " echo \"Collecting HyperPod resource config...\"",
            " cp /opt/ml/config/resource_config.json \"${OUTPUT_DIR}/resource_config.json\" 2>/dev/null || echo \"Could not copy resource_config.json\"",
            "fi",
            "",
            "# Collect cluster logs if available",
            "if [ -d /var/log/aws/clusters ]; then",
            " echo \"Collecting cluster logs...\"",
            " mkdir -p \"${OUTPUT_DIR}/cluster_logs\"",
            " cp -r /var/log/aws/clusters/* \"${OUTPUT_DIR}/cluster_logs/\" 2>/dev/null || echo \"Could not copy cluster logs\"",
            "fi",
            "",
            "# Collect systemd service status",
            "echo \"Collecting systemd service status...\"",
            "systemctl list-units --type=service --all --no-pager > \"${OUTPUT_DIR}/systemd_services.txt\" 2>&1 || echo \"Could not collect systemd services\"",
            "",
            "# Collect disk usage",
            "echo \"Collecting disk usage...\"",
            "df > \"${OUTPUT_DIR}/disk_usage.txt\" 2>&1 || echo \"Could not collect disk usage\"",
            "",
            "# Collect nvidia-smi output",
            "echo \"Collecting nvidia-smi output...\"",
            "nvidia-smi > \"${OUTPUT_DIR}/nvidia_smi.txt\" 2>&1 || echo \"nvidia-smi not available or failed\"",
            "",
        ]

        # Add cluster-type specific collections
        if self.cluster_type == 'eks':
            script_lines.extend([
                "# EKS-specific collections",
                "echo \"Collecting containerd service status...\"",
                "systemctl status containerd > \"${OUTPUT_DIR}/containerd_status.txt\" 2>&1 || echo \"containerd service not found or not running\"",
                "",
                "echo \"Collecting kubelet service status...\"",
                "systemctl status kubelet > \"${OUTPUT_DIR}/kubelet_status.txt\" 2>&1 || echo \"kubelet service not found or not running\"",
                "",
                "echo \"Running EKS log collector...\"",
                "EKS_LOG_COLLECTOR_URL=\"https://raw.githubusercontent.com/awslabs/amazon-eks-ami/main/log-collector-script/linux/eks-log-collector.sh\"",
                "curl -o /tmp/eks-log-collector.sh \"${EKS_LOG_COLLECTOR_URL}\"",  # nosec B108 - remote node shell script, not local Python
                "chmod +x /tmp/eks-log-collector.sh",
                "",
                "# Run the collector and capture its output",
                "/tmp/eks-log-collector.sh > \"${OUTPUT_DIR}/eks-log-collector-output.txt\" 2>&1 || echo \"EKS log collector completed with warnings\"",
                "",
                "# Find the generated tarball (it's created in /var/log/)",
                "EKS_TARBALL=$(ls -t /var/log/eks_*.tar.gz 2>/dev/null | head -1)",
                "if [ -n \"${EKS_TARBALL}\" ]; then",
                " echo \"Found EKS logs at ${EKS_TARBALL}\"",
                " echo \"Extracting EKS logs from ${EKS_TARBALL}\"",
                " mkdir -p \"${OUTPUT_DIR}/eks-logs\"",
                " tar -xzf \"${EKS_TARBALL}\" -C \"${OUTPUT_DIR}/eks-logs\" 2>/dev/null || echo \"Extracted EKS logs\"",
                " rm -f \"${EKS_TARBALL}\"",
                "else",
                " echo \"ERROR: No EKS log tarball found in /var/log/\" | tee -a \"${OUTPUT_DIR}/eks-log-collector-output.txt\"",
                " echo \"EKS log collector may have failed. Check eks-log-collector-output.txt for details.\" | tee -a \"${OUTPUT_DIR}/eks-log-collector-output.txt\"",
                " rm -f /tmp/eks-log-collector.sh",
                " exit 1",
                "fi",
                "",
                "# Clean up the collector script",
                "rm -f /tmp/eks-log-collector.sh",
                "",
            ])
        elif self.cluster_type == 'slurm':
            script_lines.extend([
                "# Slurm-specific collections",
                "echo \"Collecting Slurm information...\"",
                "",
                "# Slurm info commands",
                "sinfo > \"${OUTPUT_DIR}/sinfo.txt\" 2>&1 || echo \"sinfo not available\"",
                "sinfo -R > \"${OUTPUT_DIR}/sinfo_R.txt\" 2>&1 || echo \"sinfo -R not available\"",
                "",
                "# Slurm service status",
                "systemctl status slurmctld > \"${OUTPUT_DIR}/slurmctld_status.txt\" 2>&1 || echo \"slurmctld not running on this node\"",
                "systemctl status slurmd > \"${OUTPUT_DIR}/slurmd_status.txt\" 2>&1 || echo \"slurmd not running on this node\"",
                "",
                "# Slurm configuration",
                "if [ -d /opt/slurm/etc ]; then",
                " echo \"Collecting Slurm configuration...\"",
                " mkdir -p \"${OUTPUT_DIR}/opt_slurm_etc\"",
                " cp -r /opt/slurm/etc/* \"${OUTPUT_DIR}/opt_slurm_etc/\" 2>/dev/null || echo \"Could not copy Slurm config\"",
                "fi",
                "",
                "# NVIDIA bug report",
                "echo \"Running nvidia-bug-report.sh...\"",
                "nvidia-bug-report.sh --output-file \"${OUTPUT_DIR}/nvidia-bug-report.log.gz\" 2>&1 || echo \"nvidia-bug-report.sh not available or failed\"",
                "",
                "# System logs",
                "echo \"Collecting system logs...\"",
                "cp /var/log/syslog \"${OUTPUT_DIR}/syslog\" 2>/dev/null || echo \"Could not copy syslog\"",
                "cp /var/log/kern.log \"${OUTPUT_DIR}/kern.log\" 2>/dev/null || echo \"Could not copy kern.log\"",
                "dmesg -T > \"${OUTPUT_DIR}/dmesg_T.txt\" 2>&1 || echo \"Could not run dmesg -T\"",
                "",
                "# Slurm logs",
                "if [ -d /var/log/slurm ]; then",
                " echo \"Collecting Slurm logs...\"",
                " mkdir -p \"${OUTPUT_DIR}/var_log_slurm\"",
                " cp -r /var/log/slurm/* \"${OUTPUT_DIR}/var_log_slurm/\" 2>/dev/null || echo \"Could not copy Slurm logs\"",
                "fi",
                "",
            ])

        # Add each command to the script
        for i, cmd in enumerate(commands, 1):
            # Sanitize command for filename - replace problematic characters
            safe_name = cmd.replace(' ', '_').replace('/', '_').replace('|', '_').replace('>', '_').replace('<', '_').replace('&', '_').replace(';', '_').replace('(', '_').replace(')', '_').replace('$', '_').replace('`', '_').replace('"', '_').replace("'", '_')[:50]
            output_file = f"command_{i:02d}_{safe_name}.txt"

            # Use regular string (not f-string) to avoid any escaping issues with bash variables
            cmd_line = f"{cmd} > \"${{OUTPUT_DIR}}/{output_file}\" 2>&1 || echo \"Command failed with exit code $?\" >> \"${{OUTPUT_DIR}}/{output_file}\""

            script_lines.extend([
                f"# Command {i}: {cmd}",
                f"echo \"Running: {cmd}\"",
                cmd_line,
                "",
            ])

        # Add S3 upload logic with new filename format
        script_lines.extend([
            "# Upload results to S3",
            f"S3_BUCKET=\"{self.s3_bucket}\"",
            f"S3_PREFIX=\"{self.report_s3_key}/instances\"",
            "",
            "echo \"Creating tarball...\"",
            "TARBALL=\"/tmp/${INSTANCE_GROUP}_${INSTANCE_ID}.tar.gz\"",
            "tar -czf \"${TARBALL}\" -C /tmp \"$(basename ${OUTPUT_DIR})\"",
            "if [ $? -ne 0 ]; then",
            " echo \"ERROR: Failed to create tarball\"",
            " exit 1",
            "fi",
            "",
            "echo \"Uploading to S3...\"",
            "aws s3 cp \"${TARBALL}\" \"s3://${S3_BUCKET}/${S3_PREFIX}/$(basename ${TARBALL})\"",
            "",
            "if [ $? -eq 0 ]; then",
            " echo \"Successfully uploaded report to s3://${S3_BUCKET}/${S3_PREFIX}/$(basename ${TARBALL})\"",
            " rm -rf \"${OUTPUT_DIR}\" \"${TARBALL}\"",
            " echo \"Report collection completed for ${INSTANCE_GROUP}/${INSTANCE_ID}\"",
            " exit 0",
            "else",
            " echo \"ERROR: Failed to upload to S3\"",
            " exit 1",
            "fi",
        ])

        return '\n'.join(script_lines)
    def get_hyperpod_ssm_target(self, instance_id: str, instance_group_name: str) -> str:
        """Construct the HyperPod SSM target format.

        Raises:
            ValueError: If the cluster ID has not been resolved yet
                (get_cluster_nodes must have run successfully first).
        """
        if not self.cluster_id:
            raise ValueError("Cluster ID is required for HyperPod SSM targets")
        return f"sagemaker-cluster:{self.cluster_id}_{instance_group_name}-{instance_id}"

    def execute_collection_on_node(self, node: Dict, commands: List[str], script_s3_uri: str) -> Dict:
        """Execute the collection script on a single node via SSM using pexpect.

        Flow: spawn an interactive `aws ssm start-session`, wait for a shell
        prompt, install a sentinel PS1 so command boundaries are detectable,
        run the collector script with its required environment variables, and
        parse the exit code out of the captured output.

        Returns:
            A result dict with keys InstanceId, NodeGroup, Success and, on
            most paths, Error/Output and ElapsedTime (seconds).
        """
        instance_id = node['InstanceId']
        instance_group = node.get('NodeGroup', 'unknown')

        # Start timing
        start_time = time.time()

        try:
            ssm_target = self.get_hyperpod_ssm_target(instance_id, instance_group)
        except ValueError as e:
            return {
                'InstanceId': instance_id,
                'NodeGroup': instance_group,
                'Success': False,
                'Error': str(e),
                'ElapsedTime': time.time() - start_time
            }

        # Build the command to download and execute the script with environment variables
        commands_to_run = [
            f"aws s3 cp {script_s3_uri} /tmp/collector_script.sh",
            "chmod +x /tmp/collector_script.sh",
            f"INSTANCE_GROUP={instance_group} INSTANCE_ID={instance_id} CLUSTER_TYPE={self.cluster_type} /tmp/collector_script.sh"
        ]

        full_command = " && ".join(commands_to_run)

        print(f"Executing collection on {instance_id} ({instance_group})...")

        child = None
        # Sentinel prompt: unlikely to appear in command output, so it can be
        # used as a reliable expect() anchor.
        custom_prompt = "PEXPECT_READY# "

        try:
            ssm_command = f"aws ssm start-session --target {ssm_target}"

            if self.debug:
                print(f"[DEBUG] {instance_id}: SSM command: {ssm_command}")
                print(f"[DEBUG] {instance_id}: Full command: {full_command}")

            # Use pexpect to handle the interactive session
            # Note: No default timeout set - each expect() call has explicit timeout
            child = pexpect.spawn(ssm_command, encoding='utf-8')
            child.logfile_read = None

            # Wait for initial prompt (60 seconds to handle slow SSM session initialization)
            initial_prompt_patterns = [
                r'[\$#]\s+',  # Standard shell prompt
                r'sh-\d+\.\d+[\$#]\s*',  # sh prompt
                pexpect.TIMEOUT
            ]

            prompt_index = child.expect(initial_prompt_patterns, timeout=SSM_PROMPT_TIMEOUT)

            if prompt_index == len(initial_prompt_patterns) - 1:  # TIMEOUT
                # Get output for debugging
                output_sample = ""
                if child and hasattr(child, 'before') and child.before:
                    # Show more output to help diagnose the issue
                    output_sample = child.before.strip()
                    if len(output_sample) > 1000:
                        output_sample = output_sample[-1000:]  # Last 1000 chars

                error_msg = (
                    f"Failed to detect shell prompt after 60 seconds.\n"
                    f"This may indicate:\n"
                    f" - Custom SSM session configuration interfering with prompt detection\n"
                    f" - Non-standard shell prompt format\n"
                    f" - SSM session initialization issues\n"
                )

                if output_sample:
                    error_msg += f"\nSession output received:\n{output_sample}\n"
                    error_msg += (
                        f"\nExpected prompt patterns: $ or # followed by space\n"
                        f"If your cluster uses custom SSM session commands or non-standard prompts,\n"
                        f"this tool may not be compatible."
                    )
                else:
                    error_msg += "\nNo output received from SSM session."

                # NOTE(review): unlike every other failure path in this method,
                # this return omits 'ElapsedTime' — confirm whether that is
                # intentional (callers use result.get('ElapsedTime', 0)).
                return {
                    'InstanceId': instance_id,
                    'NodeGroup': instance_group,
                    'Success': False,
                    'Error': error_msg
                }

            # Set custom prompt
            child.sendline(f'export PS1="{custom_prompt}"')
            child.sendline('echo "PROMPT_SET_MARKER"')
            child.expect('PROMPT_SET_MARKER', timeout=SSM_PROMPT_TIMEOUT)
            child.expect(custom_prompt, timeout=SSM_PROMPT_TIMEOUT)

            if self.debug:
                print(f"[DEBUG] {instance_id}: Custom prompt set")

            # Execute the command and capture exit code immediately
            child.sendline(f'{full_command}; EXIT_CODE=$?; echo "EXIT_CODE:$EXIT_CODE"')

            # Wait for command completion (15 minutes for script execution)
            child.expect(custom_prompt, timeout=SSM_SCRIPT_EXECUTION_TIMEOUT)

            # Extract output
            output = child.before
            exit_code = 1  # Default to failure

            if output:
                lines = output.split('\n')
                cleaned_lines = []
                command_echo_removed = False

                for line in lines:
                    line_stripped = line.strip()

                    # Remove command echo
                    if not command_echo_removed and full_command in line:
                        command_echo_removed = True
                        continue

                    # Extract exit code
                    if line_stripped.startswith('EXIT_CODE:'):
                        try:
                            exit_code = int(line_stripped.split(':')[1].strip())
                        except (ValueError, IndexError):
                            pass
                        continue

                    if line_stripped:
                        cleaned_lines.append(line_stripped)

                output = '\n'.join(cleaned_lines)
            else:
                output = ""

            # Close session
            try:
                child.sendline('exit')
                child.expect(pexpect.EOF, timeout=5)
            except Exception:
                try:
                    child.kill(signal.SIGINT)
                except Exception:  # nosec B110 - best-effort cleanup
                    pass

            # Determine success based on exit code OR successful S3 upload message
            # Some nodes may not properly echo the EXIT_CODE line due to terminal issues
            success_indicators = [
                exit_code == 0,
                'Successfully uploaded report to s3://' in output,
                'upload: ../../tmp/' in output and '.tar.gz to s3://' in output
            ]

            if any(success_indicators):
                return {
                    'InstanceId': instance_id,
                    'NodeGroup': instance_group,
                    'Success': True,
                    'Output': output,
                    'ElapsedTime': time.time() - start_time
                }
            else:
                # Show last 15 lines of output which usually contain the error
                output_lines = output.split('\n')
                error_context = '\n'.join(output_lines[-15:]) if len(output_lines) > 15 else output

                return {
                    'InstanceId': instance_id,
                    'NodeGroup': instance_group,
                    'Success': False,
                    'Error': f"Script execution failed (exit code: {exit_code})\n{error_context}",
                    'Output': output,
                    'ElapsedTime': time.time() - start_time
                }

        except pexpect.TIMEOUT:
            # Show more context about where the timeout occurred
            output_sample = ""
            if child and hasattr(child, 'before') and child.before:
                output_sample = child.before.strip()
                if len(output_sample) > 1000:
                    output_sample = output_sample[-1000:]  # Last 1000 chars

            error_msg = (
                f"Operation timed out during command execution.\n"
                f"This may indicate:\n"
                f" - Command taking longer than expected to complete\n"
                f" - Custom shell configuration interfering with output detection\n"
                f" - Network or SSM session issues\n"
            )

            if output_sample:
                error_msg += f"\nLast output received:\n{output_sample}"
            else:
                error_msg += "\nNo output received."

            return {
                'InstanceId': instance_id,
                'NodeGroup': instance_group,
                'Success': False,
                'Error': error_msg,
                'ElapsedTime': time.time() - start_time
            }

        except pexpect.EOF:
            output_sample = ""
            if child and hasattr(child, 'before') and child.before:
                output_sample = child.before.strip()
                if len(output_sample) > 500:
                    output_sample = output_sample[-500:]  # Last 500 chars

            error_msg = "SSM session ended unexpectedly"
            if output_sample:
                error_msg += f"\nLast output:\n{output_sample}"

            return {
                'InstanceId': instance_id,
                'NodeGroup': instance_group,
                'Success': False,
                'Error': error_msg,
                'ElapsedTime': time.time() - start_time
            }

        except Exception as e:
            error_msg = f"Error executing command: {str(e)}"
            if self.debug:
                error_msg += f"\nTraceback: {traceback.format_exc()}"
            return {
                'InstanceId': instance_id,
                'NodeGroup': instance_group,
                'Success': False,
                'Error': error_msg,
                'ElapsedTime': time.time() - start_time
            }

        finally:
            # Always tear down the pexpect child so no SSM session leaks.
            if child and child.isalive():
                try:
                    child.terminate(force=True)
                except Exception:  # nosec B110 - best-effort cleanup
                    pass
+ + return { + 'InstanceId': instance_id, + 'NodeGroup': instance_group, + 'Success': False, + 'Error': error_msg, + 'ElapsedTime': time.time() - start_time + } + + except pexpect.EOF: + output_sample = "" + if child and hasattr(child, 'before') and child.before: + output_sample = child.before.strip() + if len(output_sample) > 500: + output_sample = output_sample[-500:] # Last 500 chars + + error_msg = "SSM session ended unexpectedly" + if output_sample: + error_msg += f"\nLast output:\n{output_sample}" + + return { + 'InstanceId': instance_id, + 'NodeGroup': instance_group, + 'Success': False, + 'Error': error_msg, + 'ElapsedTime': time.time() - start_time + } + + except Exception as e: + error_msg = f"Error executing command: {str(e)}" + if self.debug: + error_msg += f"\nTraceback: {traceback.format_exc()}" + return { + 'InstanceId': instance_id, + 'NodeGroup': instance_group, + 'Success': False, + 'Error': error_msg, + 'ElapsedTime': time.time() - start_time + } + + finally: + if child and child.isalive(): + try: + child.terminate(force=True) + except Exception: # nosec B110 - best-effort cleanup + pass + + def execute_with_retry(self, node: Dict, commands: List[str], script_s3_uri: str, max_retries: int = 3) -> Dict: + """Execute collection on a node with exponential backoff on throttling errors.""" + for attempt in range(max_retries): + result = self.execute_collection_on_node(node, commands, script_s3_uri) + + error_msg = result.get('Error', '') + if 'ThrottlingException' in error_msg or 'Rate exceeded' in error_msg: + if attempt < max_retries - 1: + wait_time = 2 ** attempt + if self.debug: + print(f"[DEBUG] {node['InstanceId']}: Throttled, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})") + time.sleep(wait_time) + continue + + return result + + return result + + def collect_reports(self, commands: List[str], instance_groups: Optional[List[str]] = None, instance_ids: Optional[List[str]] = None, max_workers: int = 16): + """Collect reports from 
    def collect_reports(self, commands: List[str], instance_groups: Optional[List[str]] = None, instance_ids: Optional[List[str]] = None, max_workers: int = 16):
        """Collect reports from all nodes, specific instance groups, or specific instance IDs.

        For Slurm clusters, instance_ids can be either:
        - Instance IDs: i-0123456789abcdef0
        - Slurm node names: ip-10-1-104-161

        Note: max_workers defaults to 16 to balance speed and avoid SSM throttling on large clusters.

        Args:
            commands: Extra shell commands to run on every node.
            instance_groups: Optional instance-group filter (case-insensitive);
                ignored when instance_ids is given.
            instance_ids: Optional node-identifier filter (takes precedence).
            max_workers: Thread-pool size for parallel SSM sessions.
        """
        # Get cluster nodes
        self.nodes = self.get_cluster_nodes()

        if not self.nodes:
            print("No nodes found in cluster")
            return

        # Collect kubectl information first (for EKS clusters)
        # NOTE(review): collect_kubectl_node_info calls sys.exit(1) when
        # kubectl is not configured, so execution may stop here for EKS.
        if self.cluster_type == 'eks':
            self.collect_kubectl_node_info()

        # Filter by specific instance IDs or Slurm node names if specified
        if instance_ids:
            # Resolve node identifiers (handles both instance IDs and Slurm node names)
            resolved_instance_ids = self.resolve_node_identifiers(instance_ids)

            if not resolved_instance_ids:
                print(f"No valid nodes found from specified identifiers: {', '.join(instance_ids)}")
                return

            self.nodes = [n for n in self.nodes if n.get('InstanceId') in resolved_instance_ids]
            if not self.nodes:
                print(f"No nodes found with specified identifiers: {', '.join(instance_ids)}")
                return

            # Show which requested identifiers were not found
            found_ids = {n.get('InstanceId') for n in self.nodes}
            missing_ids = set(resolved_instance_ids) - found_ids
            if missing_ids:
                print(f"Warning: Instance IDs not found in cluster: {', '.join(missing_ids)}")
        # Filter by instance groups if specified (only if instance_ids not specified)
        elif instance_groups:
            # Convert instance groups to lowercase for case-insensitive matching
            instance_groups_lower = [ig.lower() for ig in instance_groups]
            self.nodes = [n for n in self.nodes if n.get('NodeGroup', '').lower() in instance_groups_lower]
            if not self.nodes:
                print(f"No nodes found in instance groups: {', '.join(instance_groups)}")
                return
            print(f"Filtering to instance groups: {', '.join(instance_groups)}")

        print(f"\nCollecting reports from {len(self.nodes)} nodes")
        print(f"Cluster type: {self.cluster_type.upper()}")
        print(f"Report ID: {self.report_id}")
        print(f"S3 Location: s3://{self.s3_bucket}/{self.report_s3_key}/")

        # Show what will be collected based on cluster type
        if self.cluster_type == 'eks':
            print(f"Default collections: nvidia-smi, containerd status, kubelet status, EKS log collector, resource config, cluster logs, systemd services, disk usage")
        elif self.cluster_type == 'slurm':
            print(f"Default collections: nvidia-smi, nvidia-bug-report, sinfo, Slurm services, Slurm config, Slurm logs, system logs")

        if commands:
            print(f"Additional commands: {', '.join(commands)}")
        print("-" * 60)

        # Generate and upload the collector script once; every node downloads
        # the same script from S3 rather than receiving it inline over SSM.
        script_content = self.generate_collector_script(commands)
        script_key = f"{self.report_s3_key}/collector_script.sh"

        try:
            self.s3_client.put_object(
                Bucket=self.s3_bucket,
                Key=script_key,
                Body=script_content.encode('utf-8'),
                ContentType='text/x-shellscript'
            )
            script_s3_uri = f"s3://{self.s3_bucket}/{script_key}"
            print(f"Uploaded collector script to: {script_s3_uri}")
        except Exception as e:
            print(f"Error uploading collector script: {e}")
            return

        # Execute collection on all nodes using ThreadPoolExecutor
        results = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_node = {
                executor.submit(self.execute_with_retry, node, commands, script_s3_uri): node
                for node in self.nodes
            }

            for future in as_completed(future_to_node):
                node = future_to_node[future]
                try:
                    result = future.result()
                    results.append(result)

                    status = "✓" if result['Success'] else "✗"
                    elapsed = result.get('ElapsedTime', 0)
                    print(f"[{status}] {result['InstanceId']} ({result['NodeGroup']}) - {elapsed:.1f}s")

                    if not result['Success']:
                        error_msg = result.get('Error', 'Unknown error')
                        # Print error details with indentation for readability
                        for line in error_msg.split('\n'):
                            if line.strip():
                                print(f" {line}")

                except Exception as e:
                    # A worker raised instead of returning a result dict;
                    # record it as a failed node so the summary stays complete.
                    print(f"[✗] {node['InstanceId']}: Exception: {e}")
                    results.append({
                        'InstanceId': node['InstanceId'],
                        'NodeGroup': node.get('NodeGroup', 'unknown'),
                        'Success': False,
                        'Error': str(e),
                        'ElapsedTime': 0
                    })

        # Save summary
        self.save_summary(results)

        print("-" * 60)
        print(f"\nReport collection completed!")
        print(f"Instance reports uploaded to: s3://{self.s3_bucket}/{self.report_s3_key}/instances/")
        print(f"Summary: s3://{self.s3_bucket}/{self.report_s3_key}/summary.json")

        # Print statistics
        successful = sum(1 for r in results if r['Success'])
        failed = len(results) - successful
        print(f"\nStatistics:")
        print(f" Total nodes: {len(results)}")
        print(f" Successful: {successful}")
        print(f" Failed: {failed}")

        # Offer to download results
        self.offer_download_results()
print(f"[✗] {node['InstanceId']}: Exception: {e}") + results.append({ + 'InstanceId': node['InstanceId'], + 'NodeGroup': node.get('NodeGroup', 'unknown'), + 'Success': False, + 'Error': str(e), + 'ElapsedTime': 0 + }) + + # Save summary + self.save_summary(results) + + print("-" * 60) + print(f"\nReport collection completed!") + print(f"Instance reports uploaded to: s3://{self.s3_bucket}/{self.report_s3_key}/instances/") + print(f"Summary: s3://{self.s3_bucket}/{self.report_s3_key}/summary.json") + + # Print statistics + successful = sum(1 for r in results if r['Success']) + failed = len(results) - successful + print(f"\nStatistics:") + print(f" Total nodes: {len(results)}") + print(f" Successful: {successful}") + print(f" Failed: {failed}") + + # Offer to download results + self.offer_download_results() + + def offer_download_results(self): + """Ask user if they want to download results from S3.""" + print("\n" + "=" * 60) + print("Download Results") + print("=" * 60) + + try: + response = input("\nWould you like to download all results from S3 to the current directory? (y/n): ").strip().lower() + + if response in ['y', 'yes']: + download_dir = self.download_results_from_s3() + + if download_dir: + # Ask about creating zip archive + response = input("\nWould you like to create a zip archive of the downloaded results? (y/n): ").strip().lower() + + if response in ['y', 'yes']: + self.create_zip_archive(download_dir) + else: + print("\nSkipping download. You can download manually using:") + print(f" aws s3 sync s3://{self.s3_bucket}/{self.report_s3_key}/ ./{self.cluster_name}_{self.report_id}/") + + except KeyboardInterrupt: + print("\n\nDownload cancelled by user.") + except Exception as e: + print(f"\nError during download prompt: {e}") + + def download_results_from_s3(self) -> Optional[str]: + """Download all results from S3 to local directory. 
+ + Returns: + str: Path to download directory if successful, None otherwise + """ + # Create download directory + download_dir = f"{self.cluster_name}_{self.report_id}" + + print(f"\nDownloading results to: ./{download_dir}/") + print(f"Source: s3://{self.s3_bucket}/{self.report_s3_key}/") + + try: + # List all objects in the S3 prefix + paginator = self.s3_client.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=self.s3_bucket, Prefix=self.report_s3_key) + + files_to_download = [] + for page in pages: + if 'Contents' in page: + for obj in page['Contents']: + key = obj['Key'] + # Skip the prefix itself (directory marker) + if key != self.report_s3_key and key != f"{self.report_s3_key}/": + files_to_download.append(key) + + if not files_to_download: + print("No files found to download.") + return None + + print(f"Found {len(files_to_download)} files to download...") + + # Download each file + downloaded = 0 + failed = 0 + + for key in files_to_download: + # Calculate relative path (remove the report_s3_key prefix) + relative_path = key[len(self.report_s3_key):].lstrip('/') + local_path = os.path.join(download_dir, relative_path) + + # Create parent directory if needed + local_dir = os.path.dirname(local_path) + if local_dir: + os.makedirs(local_dir, exist_ok=True) + + try: + # Download file + self.s3_client.download_file(self.s3_bucket, key, local_path) + downloaded += 1 + + # Show progress for every 5 files or last file + if downloaded % 5 == 0 or downloaded == len(files_to_download): + print(f" Downloaded {downloaded}/{len(files_to_download)} files...") + + except Exception as e: + print(f" Failed to download {relative_path}: {e}") + failed += 1 + + print(f"\n✓ Download completed!") + print(f" Downloaded: {downloaded} files") + if failed > 0: + print(f" Failed: {failed} files") + print(f" Location: ./{download_dir}/") + + return download_dir + + except Exception as e: + print(f"\nError downloading results: {e}") + if self.debug: + 
traceback.print_exc() + return None + + def create_zip_archive(self, directory: str): + """Create a zip archive of the downloaded results. + + Args: + directory: Path to directory to archive + """ + zip_filename = f"{directory}.zip" + + print(f"\nCreating zip archive: {zip_filename}") + + try: + with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through directory + file_count = 0 + for root, dirs, files in os.walk(directory): + for file in files: + file_path = os.path.join(root, file) + # Calculate archive name (relative to directory) + arcname = os.path.relpath(file_path, os.path.dirname(directory)) + zipf.write(file_path, arcname) + file_count += 1 + + # Show progress + if file_count % 5 == 0: + print(f" Archived {file_count} files...") + + # Get zip file size + zip_size = os.path.getsize(zip_filename) + zip_size_mb = zip_size / (1024 * 1024) + + print(f"\n✓ Zip archive created!") + print(f" File: {zip_filename}") + print(f" Size: {zip_size_mb:.2f} MB") + print(f" Files: {file_count}") + + # Ask if user wants to delete the uncompressed directory + response = input(f"\nWould you like to delete the uncompressed directory '{directory}'? 
(y/n): ").strip().lower()

            # Only delete the local working directory when the user explicitly confirms.
            if response in ['y', 'yes']:
                shutil.rmtree(directory)
                print(f"✓ Deleted directory: {directory}")
            else:
                print(f"Keeping directory: {directory}")

        except Exception as e:
            # Best-effort: a failed zip must not abort the overall collection run.
            print(f"\nError creating zip archive: {e}")
            if self.debug:
                traceback.print_exc()

    def save_summary(self, results: List[Dict]):
        """Save collection summary to S3.

        Writes a single JSON document (cluster identity, report id, UTC
        timestamp, per-node success/failure counts, and the raw per-node
        results) to ``<report_s3_key>/summary.json`` in the report bucket.
        Failures are logged but never raised — the summary is auxiliary.
        """
        summary = {
            'cluster_name': self.cluster_name,
            'cluster_id': self.cluster_id,
            'report_id': self.report_id,
            # Timezone-aware UTC timestamp so summaries are comparable across machines.
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'total_nodes': len(results),
            'successful': sum(1 for r in results if r['Success']),
            'failed': sum(1 for r in results if not r['Success']),
            'results': results
        }

        summary_key = f"{self.report_s3_key}/summary.json"

        try:
            self.s3_client.put_object(
                Bucket=self.s3_bucket,
                Key=summary_key,
                Body=json.dumps(summary, indent=2).encode('utf-8'),
                ContentType='application/json'
            )
            print(f"Summary saved to: s3://{self.s3_bucket}/{summary_key}")
        except Exception as e:
            print(f"Error saving summary: {e}")

    def verify_kubectl_config(self) -> bool:
        """Verify kubectl is configured for the EKS cluster.

        Returns True only when kubectl is installed AND the current context
        name contains ``self.eks_cluster_name``; otherwise prints actionable
        guidance (including the ``aws eks update-kubeconfig`` command) and
        returns False. Never raises — all failures map to False.
        """
        if not self.eks_cluster_name:
            print("Warning: EKS cluster name not available, skipping kubectl verification")
            return False

        try:
            # Check if kubectl is installed
            result = subprocess.run(['kubectl', 'version', '--client'],  # nosec B603 B607
                                    capture_output=True, text=True, timeout=10)
            if result.returncode != 0:
                print("\n" + "!" * 60)
                print("ERROR: kubectl is not installed or not in PATH")
                print("!" * 60)
                return False

            # Extract just the version line
            version_line = result.stdout.strip().split('\n')[0] if result.stdout else "kubectl installed"
            print(f"kubectl version: {version_line}")

            # Check current context
            result = subprocess.run(['kubectl', 'config', 'current-context'],  # nosec B603 B607
                                    capture_output=True, text=True, timeout=10)
            if result.returncode == 0:
                current_context = result.stdout.strip()
                print(f"Current kubectl context: {current_context}")

                # Check if context matches EKS cluster.
                # NOTE(review): substring match — a context named for a different
                # cluster that happens to contain this name would pass; confirm
                # this looseness is intended.
                if self.eks_cluster_name in current_context:
                    print(f"✓ kubectl is configured for EKS cluster: {self.eks_cluster_name}")
                    return True
                else:
                    # Extract region from EKS cluster ARN (arn:aws:eks:<region>:...)
                    region = self.eks_cluster_arn.split(':')[3] if self.eks_cluster_arn else 'REGION'

                    print("\n" + "!" * 60)
                    print(f"ERROR: kubectl context does not match EKS cluster")
                    print(f"Current context: {current_context}")
                    print(f"Expected cluster: {self.eks_cluster_name}")
                    print("!" * 60)
                    print("\nTo configure kubectl for this EKS cluster, run:")
                    print(f" aws eks update-kubeconfig --name {self.eks_cluster_name} --region {region}")
                    return False
            else:
                # No current-context at all — same remediation hint as above.
                region = self.eks_cluster_arn.split(':')[3] if self.eks_cluster_arn else 'REGION'

                print("\n" + "!" * 60)
                print("ERROR: No kubectl context configured")
                print("!" * 60)
                print("\nTo configure kubectl for this EKS cluster, run:")
                print(f" aws eks update-kubeconfig --name {self.eks_cluster_name} --region {region}")
                return False

        except subprocess.TimeoutExpired:
            print("Warning: kubectl command timed out")
            return False
        except FileNotFoundError:
            print("\n" + "!" * 60)
            print("ERROR: kubectl not found in PATH")
            print("!" * 60)
            return False
        except Exception as e:
            print(f"Warning: Error verifying kubectl config: {e}")
            return False

    def collect_kubectl_node_info(self):
        """Collect kubectl describe node information for all nodes.

        EKS clusters only. Runs a fixed list of read-only kubectl commands,
        writes each output to a temp file, tars them up, and uploads the
        tarball to S3 under the report prefix. Exits the process (exit 1)
        if kubectl is not correctly configured for this cluster.
        """
        if self.cluster_type != 'eks':
            print("Skipping kubectl collection - not an EKS cluster")
            return

        if not self.eks_cluster_name:
            print("Skipping kubectl collection - EKS cluster name not available")
            return

        print("\n" + "=" * 60)
        print("Collecting kubectl node information...")
        print("=" * 60)

        # Verify kubectl configuration - exit if not configured
        if not self.verify_kubectl_config():
            print("\n" + "!" * 60)
            print("ERROR: kubectl must be configured for EKS clusters")
            print("!" * 60)
            print("\nPlease configure kubectl and re-run the tool.\n")
            sys.exit(1)

        try:
            # Create output directory (unique temp dir; removed at the end)
            kubectl_output_dir = tempfile.mkdtemp(prefix='kubectl_output_')

            # Define resources to collect — all read-only `get`/`describe` calls.
            collections = [
                # High Priority - Essential for troubleshooting
                {
                    'name': 'nodes_describe',
                    'command': ['kubectl', 'describe', 'nodes'],
                    'description': 'Node descriptions (capacity, conditions, pods)'
                },
                {
                    'name': 'pods_all_namespaces',
                    'command': ['kubectl', 'get', 'pods', '-A', '-o', 'wide'],
                    'description': 'All pods across namespaces (wide output)'
                },
                {
                    'name': 'pods_describe_all_namespaces',
                    'command': ['kubectl', 'describe', 'pods', '-A'],
                    'description': 'Detailed pod descriptions (all namespaces)'
                },
                {
                    'name': 'events_all_namespaces',
                    'command': ['kubectl', 'get', 'events', '-A', '--sort-by=.lastTimestamp'],
                    'description': 'Cluster events sorted by timestamp'
                },
                {
                    'name': 'pvcs_all_namespaces',
                    'command': ['kubectl', 'get', 'pvc', '-A', '-o', 'wide'],
                    'description': 'PersistentVolumeClaims (storage)'
                },
                {
                    'name': 'pvcs_describe_all_namespaces',
                    'command': ['kubectl', 'describe', 'pvc', '-A'],
                    'description': 'Detailed PVC descriptions'
                },
                {
                    'name': 
'services_all_namespaces',
                    'command': ['kubectl', 'get', 'svc', '-A', '-o', 'wide'],
                    'description': 'Services (network endpoints)'
                },
                {
                    'name': 'services_describe_all_namespaces',
                    'command': ['kubectl', 'describe', 'svc', '-A'],
                    'description': 'Detailed service descriptions'
                },

                # Medium Priority - Very useful
                {
                    'name': 'deployments_all_namespaces',
                    'command': ['kubectl', 'get', 'deployments', '-A', '-o', 'wide'],
                    'description': 'Deployments'
                },
                {
                    'name': 'statefulsets_all_namespaces',
                    'command': ['kubectl', 'get', 'statefulsets', '-A', '-o', 'wide'],
                    'description': 'StatefulSets'
                },
                {
                    'name': 'daemonsets_all_namespaces',
                    'command': ['kubectl', 'get', 'daemonsets', '-A', '-o', 'wide'],
                    'description': 'DaemonSets'
                },
                {
                    'name': 'configmaps_all_namespaces',
                    'command': ['kubectl', 'get', 'configmaps', '-A'],
                    'description': 'ConfigMaps (metadata only)'
                },
                {
                    # `get secrets` without -o lists names/types only — secret
                    # values are never collected.
                    'name': 'secrets_all_namespaces',
                    'command': ['kubectl', 'get', 'secrets', '-A'],
                    'description': 'Secrets (metadata only, no content)'
                },
                {
                    'name': 'resourcequotas_all_namespaces',
                    'command': ['kubectl', 'get', 'resourcequota', '-A'],
                    'description': 'Resource quotas'
                },
                {
                    'name': 'networkpolicies_all_namespaces',
                    'command': ['kubectl', 'get', 'networkpolicies', '-A'],
                    'description': 'Network policies'
                },
            ]

            print(f"Collecting {len(collections)} Kubernetes resource types...")
            successful = 0
            failed = 0

            # Every collection writes a file — even on failure/timeout the error
            # text is captured so the report shows what was attempted.
            for collection in collections:
                name = collection['name']
                command = collection['command']
                description = collection['description']

                print(f" Collecting: {description}...", end=' ', flush=True)

                try:
                    # Use unified timeout for all kubectl operations
                    timeout = KUBECTL_TIMEOUT

                    # Measure execution time
                    start_time = time.time()

                    result = subprocess.run(  # nosec B603
                        command,
                        capture_output=True,
                        text=True,
                        timeout=timeout
                    )

                    elapsed_time = time.time() - start_time

                    output_file = os.path.join(kubectl_output_dir, f'{name}.txt')

                    if result.returncode == 0:
                        if result.stdout.strip():
                            with open(output_file, 'w', encoding='utf-8') as f:
                                f.write(result.stdout)
                            print(f"✓ ({elapsed_time:.1f}s)")
                            successful += 1
                        else:
                            # Empty output (no resources of this type)
                            with open(output_file, 'w', encoding='utf-8') as f:
                                f.write("No resources found\n")
                            print(f"✓ (empty, {elapsed_time:.1f}s)")
                            successful += 1
                    else:
                        # Command failed — record stderr in the output file
                        with open(output_file, 'w', encoding='utf-8') as f:
                            f.write(f"Error: {result.stderr}\n")
                        print(f"✗ ({result.stderr.strip()[:50]}, {elapsed_time:.1f}s)")
                        failed += 1

                except subprocess.TimeoutExpired:
                    output_file = os.path.join(kubectl_output_dir, f'{name}.txt')
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write("Error: Command timed out\n")
                    print(f"✗ (timeout after {timeout}s)")
                    failed += 1

                except Exception as e:
                    output_file = os.path.join(kubectl_output_dir, f'{name}.txt')
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(f"Error: {str(e)}\n")
                    print(f"✗ ({str(e)[:50]})")
                    failed += 1

            print(f"\nCollection summary: {successful} successful, {failed} failed")

            # Create tarball with files at root level (no wrapper directory)
            # NOTE(review): fixed, predictable path in the shared temp dir —
            # concurrent runs on one machine would clobber each other's tarball.
            print("\nCreating kubectl output tarball...")
            tarball_path = os.path.join(tempfile.gettempdir(), 'kubectl_resources.tar.gz')

            with tarfile.open(tarball_path, 'w:gz') as tar:
                # Add each file directly to the tarball root (no parent directory)
                for filename in os.listdir(kubectl_output_dir):
                    file_path = os.path.join(kubectl_output_dir, filename)
                    tar.add(file_path, arcname=filename)

            print(f"Created tarball: {tarball_path}")

            # Upload to S3
            s3_key = f"{self.report_s3_key}/kubectl_resources.tar.gz"
            print(f"Uploading to S3: s3://{self.s3_bucket}/{s3_key}")

            self.s3_client.upload_file(tarball_path, self.s3_bucket, s3_key)

            print(f"✓ Successfully uploaded kubectl resource information to S3")
            print(f" 
Location: s3://{self.s3_bucket}/{s3_key}")

            # Cleanup temp artifacts (best-effort for the dir; tarball removal
            # only runs after a successful upload — earlier failures fall to the
            # except below and may leave the tarball behind).
            shutil.rmtree(kubectl_output_dir, ignore_errors=True)
            os.remove(tarball_path)

        except Exception as e:
            print(f"Error collecting kubectl information: {e}")
            if self.debug:
                traceback.print_exc()


def main():
    """CLI entry point: parse args, build the collector, and run collection.

    Exits 1 on Windows (pexpect behavior differs there), on conflicting
    options, on Ctrl-C, or on any collection error.
    """
    # Check platform compatibility
    if platform.system() == 'Windows':
        print("=" * 70)
        print("ERROR: Windows is not supported")
        print("=" * 70)
        print()
        print("This tool uses pexpect for interactive SSM sessions, which has")
        print("different behavior on Windows compared to macOS and Linux.")
        print()
        print("Supported platforms:")
        print(" - macOS")
        print(" - Linux")
        print()
        print("Please run this tool from a macOS or Linux machine, or use WSL")
        print("(Windows Subsystem for Linux) if you're on Windows.")
        print()
        sys.exit(1)

    parser = argparse.ArgumentParser(
        description='HyperPod Issue Report Collector - Supports both EKS and Slurm clusters',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage - auto-detects cluster type
  python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket

  # With custom prefix and additional commands
  python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket/diagnostics \\
    --command "df -h" --command "free -h"

  # Target specific instance groups
  python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket \\
    --instance-groups worker-group-1 worker-group-2

  # Target specific nodes (instance IDs, EKS names, or Slurm names)
  python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket \\
    --nodes i-abc123 hyperpod-i-044bbf66a68558e87 ip-10-1-104-161
  """
    )

    parser.add_argument('--cluster', '-c', required=True, help='HyperPod cluster name (EKS or Slurm)')
    parser.add_argument('--region', '-r', help='AWS region (uses default boto3 region if not specified)')
    parser.add_argument('--s3-path', '-s', required=True, help='S3 path for storing reports (e.g., s3://bucket-name/prefix or s3://bucket-name)')
    parser.add_argument('--command', '-cmd', action='append', help='Additional command to execute on nodes (can be specified multiple times)')
    parser.add_argument('--instance-groups', '-g', nargs='+', help='Target specific instance groups (e.g., --instance-groups worker1 worker2)')
    parser.add_argument('--max-workers', '-w', type=int, default=16, help='Maximum concurrent SSM sessions (default: 16, reduce if hitting throttling)')
    parser.add_argument('--nodes', '-n', nargs='+', help='Target specific nodes: instance IDs (i-*), EKS node names (hyperpod-i-*), or Slurm node names (ip-*)')
    parser.add_argument('--debug', '-d', action='store_true', help='Enable debug mode')

    args = parser.parse_args()

    # Validate mutually exclusive options
    if args.instance_groups and args.nodes:
        print("Error: --instance-groups and --nodes cannot be used together")
        sys.exit(1)

    try:
        collector = HyperPodIssueReportCollector(
            cluster_name=args.cluster,
            s3_path=args.s3_path,
            region=args.region,
            debug=args.debug
        )

        # User-specified commands
        commands = []

        # Add any user-specified commands
        if args.command:
            commands.extend(args.command)

        collector.collect_reports(
            commands=commands,
            instance_groups=args.instance_groups,
            instance_ids=args.nodes,
            max_workers=args.max_workers
        )

    except KeyboardInterrupt:
        print("\n\nInterrupted by user. 
Exiting...")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        if args.debug:
            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/requirements.txt b/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/requirements.txt
new file mode 100755
index 00000000..690613af
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/requirements.txt
@@ -0,0 +1,3 @@
boto3>=1.26.0
botocore>=1.29.0
pexpect>=4.8.0
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md
new file mode 100755
index 00000000..048d962d
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md
@@ -0,0 +1,96 @@
---
name: hyperpod-ssm
description: Remote command execution and file transfer on SageMaker HyperPod cluster nodes via AWS Systems Manager (SSM). This is the primary interface for accessing HyperPod nodes — direct SSH is not available. Use when any skill, workflow, or user request needs to execute commands on cluster nodes, upload files to nodes, read/download files from nodes, run diagnostics, install packages, or perform any operation requiring shell access to HyperPod instances. Other HyperPod skills depend on this skill for all node-level operations.
---

# HyperPod SSM Access

## SSM Target Format

Target: `sagemaker-cluster:<CLUSTER_ID>_<GROUP_NAME>-<INSTANCE_ID>`

- `CLUSTER_ID`: Last segment of cluster ARN (NOT the cluster name). Extract via `get-cluster-info.sh`.
- `GROUP_NAME`: Instance group name — retrieve via `list-nodes.sh`.
- `INSTANCE_ID`: EC2 instance ID (e.g., `i-0123456789abcdef0`)

## Scripts

Three scripts under `scripts/`. Resolve cluster info and nodes **once**, then execute per node.
+ +### get-cluster-info.sh — Resolve cluster name → ID (call once) + +```bash +scripts/get-cluster-info.sh CLUSTER_NAME [--region REGION] +# Output: {"cluster_id":"...","cluster_arn":"...","cluster_name":"...","region":"..."} +``` + +### list-nodes.sh — List all nodes with pagination (call once) + +```bash +scripts/list-nodes.sh CLUSTER_NAME [--region REGION] [--instance-group GROUP] [--instance-id ID] +# Output: JSON array of ClusterNodeSummaries (InstanceId, InstanceGroupName, InstanceStatus, etc.) +``` + +`list-cluster-nodes` paginates at 100 nodes. This script handles pagination automatically. + +### ssm-exec.sh — Execute command on a node (call per node) + +```bash +# Execute — with pre-built target +scripts/ssm-exec.sh --target "sagemaker-cluster:CLUSTERID_GROUP-INSTANCEID" 'command' [--region REGION] + +# Execute — with parts +scripts/ssm-exec.sh --cluster-id ID --group GROUP --instance-id INSTANCE_ID 'command' [--region REGION] + +# Upload +scripts/ssm-exec.sh --target TARGET --upload LOCAL_PATH REMOTE_PATH [--region REGION] + +# Read remote file +scripts/ssm-exec.sh --target TARGET --read REMOTE_PATH [--region REGION] +``` + +## Running Commands Across Many Nodes + +SSM `start-session` rate limit: **3 TPS** per account. Plan batch size and delay accordingly. + +`aws ssm send-command` does NOT support `sagemaker-cluster:` targets — only `start-session` works. + +## Manual SSM Commands + +When the scripts aren't suitable, use `aws ssm start-session` directly with `AWS-StartNonInteractiveCommand`: + +```bash +cat > /tmp/cmd.json << 'EOF' +{"command": ["bash -c 'echo hello && whoami'"]} +EOF + +aws ssm start-session \ + --target sagemaker-cluster:CLUSTERID_GROUPNAME-INSTANCEID \ + --region REGION \ + --document-name AWS-StartNonInteractiveCommand \ + --parameters file:///tmp/cmd.json +``` + +Always use a JSON file for `--parameters` — inline parameters break with special characters. 
+ +## Common Diagnostic Commands + +| Task | Command | +| ---------------- | -------------------------------------------------------------- | +| Lifecycle logs | `cat /var/log/provision/provisioning.log` | +| Memory | `free -h` | +| Disk/mounts | `df -h && lsblk` | +| GPU status | `nvidia-smi` | +| GPU memory | `nvidia-smi --query-gpu=memory.used,memory.total --format=csv` | +| EFA/network | `fi_info -p efa` | +| CloudWatch agent | `sudo systemctl status amazon-cloudwatch-agent` | +| Top processes | `ps aux --sort=-%mem \| head -20` | + +## Key Details + +- Default SSM non-interactive user is `root`. +- SSM rate limit: **3 TPS** per account. +- For interactive sessions (rare), omit `--document-name` to get a shell. +- Interactive commands (vim, top) are not supported via `AWS-StartNonInteractiveCommand`. +- Large outputs may be truncated by SSM. +- For troubleshooting common errors, see [references/troubleshooting.md](references/troubleshooting.md). diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/references/troubleshooting.md b/plugins/sagemaker-ai/skills/hyperpod-ssm/references/troubleshooting.md new file mode 100755 index 00000000..e8a098d8 --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/references/troubleshooting.md @@ -0,0 +1,61 @@ +# Troubleshooting + +## TargetNotConnected + +``` +An error occurred (TargetNotConnected) when calling the StartSession operation +``` + +Causes: + +- Wrong target format — verify underscore between cluster ID and group name, hyphen before instance ID +- Cluster ID is wrong — must be extracted from ARN, not the cluster name +- Node not in `Running` state — check with `list-cluster-nodes` +- SSM agent not running on the node + +Verify: + +```bash +aws sagemaker list-cluster-nodes --cluster-name CLUSTER --region REGION \ + --query 'ClusterNodeSummaries[?InstanceId==`INSTANCE_ID`].[InstanceGroupName,InstanceStatus.Status]' \ + --output text +``` + +## AccessDeniedException + +Ensure IAM permissions include: + +- 
`sagemaker:DescribeCluster`, `sagemaker:ListClusterNodes` +- `ssm:StartSession`, `ssm:TerminateSession` + +## Command Timeout / Hangs + +- Long-running commands without output can cause SSM to hang +- Add periodic output or redirect to file then cat: `bash -c 'cmd > /tmp/out.log 2>&1 && cat /tmp/out.log'` + +## Base64 Upload Corruption + +- Always use `base64 -w 0` (no line wrapping) +- For large files (>256KB), SSM parameter size limits may apply — split into chunks or use shared filesystem (FSx/EFS) instead + +## RunAs User Error + +``` +Unable to start command: failed to start pty since RunAs user does not exist +``` + +SSM Run-as-user is configured but user doesn't exist on the node. Use default (root) and `sudo -u USERNAME` explicitly. + +## ThrottlingException on StartSession + +``` +An error occurred (ThrottlingException) when calling the StartSession operation: Rate exceeded +``` + +Cause: Too many concurrent `start-session` calls. SSM has per-account rate limits. + +Fix: Use batched parallel execution with a delay between batches (see "Running Commands Across Many Nodes" in SKILL.md). A batch size of 20 with a 2-second delay between batches works reliably for clusters of 100+ nodes. + +## send-command Not Supported + +`aws ssm send-command` does not support `sagemaker-cluster:` targets and will return a `ValidationException`. Use `start-session` with `AWS-StartNonInteractiveCommand` instead. 
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/get-cluster-info.sh b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/get-cluster-info.sh
new file mode 100755
index 00000000..0412462b
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/get-cluster-info.sh
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Get HyperPod cluster ID and metadata
# Usage: ./get-cluster-info.sh CLUSTER_NAME [--region REGION]
# Output: JSON with cluster_id extracted from ARN
set -euo pipefail

# First positional argument is the cluster name; remaining args are flags.
CLUSTER="$1"; shift
# NOTE(review): silently defaults to us-west-2 when AWS_DEFAULT_REGION is
# unset and --region is not passed — confirm this default is intended.
REGION="${AWS_DEFAULT_REGION:-us-west-2}"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --region) REGION="$2"; shift 2 ;;
    *) shift ;;   # unknown flags are ignored
  esac
done

# The cluster ID is the last path segment of the cluster ARN
# (arn:aws:sagemaker:...:cluster/<ID>), hence the cut on '/'.
ARN=$(aws sagemaker describe-cluster --cluster-name "$CLUSTER" --region "$REGION" \
  --query 'ClusterArn' --output text)
CLUSTER_ID=$(echo "$ARN" | cut -d'/' -f2)

echo "{\"cluster_id\":\"${CLUSTER_ID}\",\"cluster_arn\":\"${ARN}\",\"cluster_name\":\"${CLUSTER}\",\"region\":\"${REGION}\"}"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/list-nodes.sh b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/list-nodes.sh
new file mode 100755
index 00000000..028df598
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/list-nodes.sh
@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# List all HyperPod cluster nodes with instance group info (handles pagination)
# Usage: ./list-nodes.sh CLUSTER_NAME [--region REGION] [--instance-group GROUP] [--instance-id ID]
# Output: JSON array of nodes with InstanceId, InstanceGroupName, InstanceStatus, etc.
set -euo pipefail

# First positional argument is the cluster name; remaining args are flags.
CLUSTER="$1"; shift
REGION="${AWS_DEFAULT_REGION:-us-west-2}"
FILTER_GROUP="" ; FILTER_ID=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --region) REGION="$2"; shift 2 ;;
    --instance-group) FILTER_GROUP="$2"; shift 2 ;;
    --instance-id) FILTER_ID="$2"; shift 2 ;;
    *) shift ;;
  esac
done

# Paginate to collect ALL nodes (list-cluster-nodes caps each page; loop
# follows NextToken until it is absent, accumulating into the NODES array).
NODES='[]'; NEXT=""
while :; do
  PAGE=$(aws sagemaker list-cluster-nodes --cluster-name "$CLUSTER" --region "$REGION" \
    ${NEXT:+--next-token "$NEXT"} --output json)
  NODES=$(echo "$NODES" "$PAGE" | jq -s '.[0] + .[1].ClusterNodeSummaries')
  NEXT=$(echo "$PAGE" | jq -r '.NextToken // empty')
  [[ -z "$NEXT" ]] && break
done

# Apply client-side filters after the full listing is assembled
if [[ -n "$FILTER_GROUP" ]]; then
  NODES=$(echo "$NODES" | jq --arg g "$FILTER_GROUP" '[.[] | select(.InstanceGroupName==$g)]')
fi
if [[ -n "$FILTER_ID" ]]; then
  NODES=$(echo "$NODES" | jq --arg id "$FILTER_ID" '[.[] | select(.InstanceId==$id)]')
fi

echo "$NODES"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh
new file mode 100755
index 00000000..b53e6b7f
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh
@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# Execute SSM command on a HyperPod node using a pre-resolved target
# Usage:
#   Execute: ./ssm-exec.sh --target TARGET 'command' [--region REGION]
#   Upload:  ./ssm-exec.sh --target TARGET --upload LOCAL_PATH REMOTE_PATH [--region REGION]
#   Read:    ./ssm-exec.sh --target TARGET --read REMOTE_PATH [--region REGION]
#
# Target format: sagemaker-cluster:<CLUSTER_ID>_<GROUP_NAME>-<INSTANCE_ID>
# Build target from parts: use --cluster-id, --group, --instance-id instead of --target
set -euo pipefail

REGION="${AWS_DEFAULT_REGION:-us-west-2}"
TARGET="" ; CLUSTER_ID="" ; GROUP="" ; INSTANCE_ID=""
MODE="exec" ; CMD="" ; LOCAL_PATH="" ; REMOTE_PATH=""

# Any argument that is not a recognized flag is treated as the command string.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --target) TARGET="$2"; shift 2 ;;
    --cluster-id) CLUSTER_ID="$2"; shift 2 ;;
    --group) GROUP="$2"; shift 2 ;;
    --instance-id) INSTANCE_ID="$2"; shift 2 ;;
    --upload) MODE="upload"; LOCAL_PATH="$2"; REMOTE_PATH="$3"; shift 3 ;;
    --read) MODE="read"; REMOTE_PATH="$2"; shift 2 ;;
    --region) REGION="$2"; shift 2 ;;
    *) CMD="$1"; shift ;;
  esac
done

# Build target from parts if --target not provided
if [[ -z "$TARGET" ]]; then
  [[ -z "$CLUSTER_ID" || -z "$GROUP" || -z "$INSTANCE_ID" ]] && \
    echo "Error: Provide --target or all of --cluster-id, --group, --instance-id" >&2 && exit 1
  TARGET="sagemaker-cluster:${CLUSTER_ID}_${GROUP}-${INSTANCE_ID}"
fi

# Parameters are always passed to SSM via a temp JSON file (inline parameters
# break with special characters); removed on exit.
TMPFILE=$(mktemp /tmp/ssm-cmd-XXXXXX.json)
trap "rm -f '$TMPFILE'" EXIT

# Cross-platform base64 encode with no line wrapping (GNU: -w0, macOS: -b0)
# Usage: b64_encode FILE or cmd | b64_encode
b64_encode() {
  if base64 --help 2>&1 | grep -q '\-w'; then
    if [[ $# -gt 0 ]]; then base64 -w 0 "$1"; else base64 -w 0; fi
  else
    if [[ $# -gt 0 ]]; then base64 -b 0 -i "$1"; else base64 -b 0; fi
  fi
}

# Emit the SSM parameters document {"command":[CMD]} — via jq when available,
# otherwise with a minimal hand-rolled escaper (backslash, quote, tab only).
json_cmd() {
  local cmd="$1"
  if command -v jq >/dev/null 2>&1; then
    jq -n --arg c "$cmd" '{"command":[$c]}'
  else
    local escaped
    escaped=$(printf '%s' "$cmd" | sed 's/\\/\\\\/g; s/"/\\"/g; s/\t/\\t/g')
    printf '{"command":["%s"]}\n' "$escaped"
  fi
}

case "$MODE" in
  exec)
    [[ -z "$CMD" ]] && echo "Error: No command specified" >&2 && exit 1
    # Escape embedded single quotes so the command survives the bash -c '...' wrapper
    INNER=$(printf '%s' "$CMD" | sed "s/'/'\\\\''/g")
    json_cmd "bash -c '${INNER}'" > "$TMPFILE"
    ;;
  upload)
    ENCODED=$(b64_encode "$LOCAL_PATH")
    # Compress large files to stay within SSM command limits (~64KB)
    if [[ ${#ENCODED} -gt 8000 ]]; then
      ENCODED=$(gzip -c "$LOCAL_PATH" | b64_encode)
      json_cmd "bash -c 'echo ${ENCODED} | base64 -d | gunzip > ${REMOTE_PATH}'" > "$TMPFILE"
    else
      json_cmd "bash -c 'echo ${ENCODED} | base64 -d > ${REMOTE_PATH}'" > "$TMPFILE"
    fi
    ;;
  read)
    json_cmd "cat '${REMOTE_PATH}'" > "$TMPFILE"
    ;;
esac

aws ssm start-session \
  --target "$TARGET" \
  --region "$REGION" \
  --document-name AWS-StartNonInteractiveCommand \
  --parameters "file://$TMPFILE"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-version-checker/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-version-checker/SKILL.md
new file mode 100755
index 00000000..aafcd08b
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-version-checker/SKILL.md
@@ -0,0 +1,74 @@
---
name: hyperpod-version-checker
description: Check and compare software component versions on SageMaker HyperPod cluster nodes - NVIDIA drivers, CUDA toolkit, cuDNN, NCCL, EFA, AWS OFI NCCL, GDRCopy, MPI, Neuron SDK (Trainium/Inferentia), Python, and PyTorch. Use when checking component versions, verifying CUDA/driver compatibility, detecting version mismatches across nodes, planning upgrades, documenting cluster configuration, or troubleshooting version-related issues on HyperPod. Triggers on requests about versions, compatibility, component checks, or upgrade planning for HyperPod clusters.
---

# HyperPod Version Checker

Upload to cluster nodes via `hyperpod-ssm` skill, then execute.
## Usage

```bash
# Text report to console + file
bash hyperpod_check_versions.sh

# JSON only to stdout (text report still saved to file) — best for piping/parsing
bash hyperpod_check_versions.sh --json

# Custom output file
bash hyperpod_check_versions.sh --output /tmp/versions.txt

# No color (for logging)
bash hyperpod_check_versions.sh --no-color
```

Output file: `component_versions_<hostname>_<timestamp>.txt` (default)

## What It Checks

| Component | Detection Method | Applicable When |
| ----------------- | ----------------------------------------------- | --------------------------------------------- |
| NVIDIA Driver | `nvidia-smi` | GPU instances (p3/p4/p5/g5) |
| CUDA Toolkit | `nvcc`, `/usr/local/cuda` symlink | GPU instances |
| cuDNN | Header file, packages | GPU instances doing deep learning |
| NCCL | Library filename, header, packages | Distributed GPU training |
| EFA | `/opt/amazon/efa_installed_packages`, `fi_info` | EFA-capable instances (p4d/p4de/p5/trn1/trn2) |
| AWS OFI NCCL | `efa_installed_packages`, library search | EFA + NCCL workloads |
| GDRCopy | rpm/dpkg, kernel module | GPU instances with RDMA (p4d+/p5) |
| MPI | `mpirun`, `/opt/amazon/openmpi` | Distributed training |
| Neuron SDK | `neuronx-cc`, `neuron-ls`, packages | Trainium/Inferentia (trn1/trn2/inf1/inf2) |
| Python/PyTorch | `python3`, `torch` import | ML workloads |
| Container runtime | `docker`, `containerd`, `kubectl`, `nvidia-ctk` | EKS clusters |

## Multi-Node Comparison

Run on each node and compare. With `--json`, stdout is clean JSON for easy diffing:

```bash
# Via hyperpod_run_on_multi_nodes.py (from hyperpod-diagnostics skill)
python hyperpod_run_on_multi_nodes.py --cluster <CLUSTER_NAME> \
  --command "bash hyperpod_check_versions.sh --json"
```

Or run individually via SSM on each node and diff the JSON outputs.

## Compatibility Reference

The script automatically analyzes CUDA/driver compatibility.
For reference: + +| Driver Series | Supported CUDA | +| ------------- | ----------------------------- | +| 580+ | 13.x, 12.x, 11.x | +| 570+ | 12.8+ (Blackwell), 12.x, 11.x | +| 545+ | 12.3-12.7, 11.x | +| 525-535 | 12.0-12.2, 11.x | +| 450+ | 11.x only | + +NCCL: Use 2.18+ for CUDA 12.x, 2.12+ for CUDA 11.x. Must be consistent across all nodes. + +| EFA Installer | AWS OFI NCCL | +| ------------- | --------------------- | +| 1.29+ | v1.7.3+ (recommended) | +| 1.26-1.28 | v1.7.0-v1.7.2 | +| 1.20-1.25 | v1.6.0+ | diff --git a/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh b/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh new file mode 100755 index 00000000..5bda095f --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh @@ -0,0 +1,545 @@ +#!/bin/bash +# HyperPod Version Checker - Detect software component versions on HyperPod cluster nodes +# +# Checks: NVIDIA driver, CUDA, cuDNN, NCCL, EFA, AWS OFI NCCL, GDRCopy, MPI, +# Neuron SDK, Python, PyTorch, container runtime +# Works on both EKS and Slurm HyperPod clusters. +# +# Usage: bash hyperpod_check_versions.sh [--json] [--no-color] [--output FILE] + +# --- Defaults --- +JSON_OUTPUT=false +USE_COLOR=true +OUTPUT_FILE="" + +# --- Parse args --- +while [[ $# -gt 0 ]]; do + case "$1" in + --json) JSON_OUTPUT=true; shift ;; + --no-color) USE_COLOR=false; shift ;; + --output|-o) OUTPUT_FILE="$2"; shift 2 ;; + -h|--help) + echo "Usage: bash hyperpod_check_versions.sh [--json] [--no-color] [--output FILE]" + echo " --json Output ONLY JSON to stdout (text report still saved to file)" + echo " --no-color Disable color output" + echo " --output/-o Write report to FILE (default: component_versions__