From 3f059bd740c9f85e62d79c72a5c68a1d08284a32 Mon Sep 17 00:00:00 2001 From: Girish Raman Date: Fri, 27 Mar 2026 15:14:00 -0700 Subject: [PATCH 01/15] feat(sagemaker-ai): Add model customization and hyperpod skills --- .../sagemaker-ai/.claude-plugin/plugin.json | 22 + plugins/sagemaker-ai/.mcp.json | 12 + .../skills/dataset-evaluation/SKILL.md | 54 + .../references/strategy_data_requirements.md | 173 ++ .../scripts/format_detector.py | 678 ++++++++ .../skills/dataset-transformation/SKILL.md | 220 +++ .../references/dataset_transformation_code.md | 135 ++ .../references/notebook_structure.md | 46 + .../references/notebook_writing_guide.md | 99 ++ .../references/sagemaker_dataset_formats.md | 146 ++ .../scripts/transformation_tools.py | 146 ++ .../skills/directory-management/SKILL.md | 32 + .../skills/finetuning-setup/SKILL.md | 73 + .../finetune_technique_selection_guide.md | 37 + .../scripts/get_model_names.py | 43 + .../finetuning-setup/scripts/get_recipes.py | 30 + .../sagemaker-ai/skills/finetuning/SKILL.md | 128 ++ .../finetuning/references/dpo_example.md | 159 ++ .../finetuning/references/eula_links.md | 19 + .../finetuning/references/rlvr_example.md | 169 ++ .../references/rlvr_reward_function.md | 43 + .../finetuning/references/sft_example.md | 159 ++ .../rlvr_reward_function_source_template.py | 250 +++ .../skills/hyperpod-issue-report/SKILL.md | 74 + .../references/collection-details.md | 105 ++ .../references/troubleshooting.md | 22 + .../scripts/hyperpod_issue_report.py | 1430 +++++++++++++++++ .../scripts/requirements.txt | 3 + .../sagemaker-ai/skills/hyperpod-ssm/SKILL.md | 96 ++ .../references/troubleshooting.md | 61 + .../hyperpod-ssm/scripts/get-cluster-info.sh | 20 + .../skills/hyperpod-ssm/scripts/list-nodes.sh | 37 + .../skills/hyperpod-ssm/scripts/ssm-exec.sh | 85 + .../skills/hyperpod-version-checker/SKILL.md | 74 + .../scripts/hyperpod_check_versions.sh | 545 +++++++ .../skills/model-deployment/SKILL.md | 122 ++ 
.../references/deploy-nova-bedrock.md | 119 ++ .../references/deploy-nova-sagemaker.md | 142 ++ .../references/deploy-oss-bedrock.md | 138 ++ .../references/deploy-oss-sagemaker.md | 149 ++ .../references/model-licenses.md | 23 + .../scripts/deploy-nova-bedrock.py | 51 + .../scripts/deploy-nova-sagemaker.py | 54 + .../scripts/deploy-oss-bedrock.py | 110 ++ .../scripts/deploy-oss-sagemaker.py | 55 + .../skills/model-evaluation/SKILL.md | 240 +++ .../references/builtin-metrics.md | 35 + .../references/llmaaj-builtin-evaluation.md | 23 + .../references/llmaaj-custom-evaluation.md | 63 + .../references/notebook_structure.md | 63 + .../references/supported-judge-models.md | 35 + .../scripts/notebook_cells.py | 83 + .../scripts/validate_custom_metrics.py | 124 ++ plugins/sagemaker-ai/skills/planning/SKILL.md | 121 ++ .../references/model-customization-plan.md | 15 + .../references/skill-routing-constraints.md | 32 + .../skills/use-case-specification/SKILL.md | 76 + 57 files changed, 7298 insertions(+) create mode 100644 plugins/sagemaker-ai/.claude-plugin/plugin.json create mode 100644 plugins/sagemaker-ai/.mcp.json create mode 100644 plugins/sagemaker-ai/skills/dataset-evaluation/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/dataset-evaluation/references/strategy_data_requirements.md create mode 100644 plugins/sagemaker-ai/skills/dataset-evaluation/scripts/format_detector.py create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/references/dataset_transformation_code.md create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_structure.md create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_writing_guide.md create mode 100644 plugins/sagemaker-ai/skills/dataset-transformation/references/sagemaker_dataset_formats.md create mode 100644 
plugins/sagemaker-ai/skills/dataset-transformation/scripts/transformation_tools.py create mode 100644 plugins/sagemaker-ai/skills/directory-management/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/finetuning-setup/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/finetuning-setup/references/finetune_technique_selection_guide.md create mode 100644 plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_model_names.py create mode 100644 plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_recipes.py create mode 100644 plugins/sagemaker-ai/skills/finetuning/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/dpo_example.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/eula_links.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/rlvr_example.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/rlvr_reward_function.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/references/sft_example.md create mode 100644 plugins/sagemaker-ai/skills/finetuning/templates/rlvr_reward_function_source_template.py create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/references/collection-details.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/hyperpod_issue_report.py create mode 100755 plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/requirements.txt create mode 100755 plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-ssm/references/troubleshooting.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/get-cluster-info.sh create mode 100755 plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/list-nodes.sh create mode 100755 
plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh create mode 100755 plugins/sagemaker-ai/skills/hyperpod-version-checker/SKILL.md create mode 100755 plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh create mode 100644 plugins/sagemaker-ai/skills/model-deployment/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/deploy-nova-bedrock.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/deploy-nova-sagemaker.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/deploy-oss-bedrock.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/deploy-oss-sagemaker.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/references/model-licenses.md create mode 100644 plugins/sagemaker-ai/skills/model-deployment/scripts/deploy-nova-bedrock.py create mode 100644 plugins/sagemaker-ai/skills/model-deployment/scripts/deploy-nova-sagemaker.py create mode 100644 plugins/sagemaker-ai/skills/model-deployment/scripts/deploy-oss-bedrock.py create mode 100644 plugins/sagemaker-ai/skills/model-deployment/scripts/deploy-oss-sagemaker.py create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/builtin-metrics.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/llmaaj-builtin-evaluation.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/llmaaj-custom-evaluation.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/notebook_structure.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/references/supported-judge-models.md create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/scripts/notebook_cells.py create mode 100644 plugins/sagemaker-ai/skills/model-evaluation/scripts/validate_custom_metrics.py create mode 100644 
plugins/sagemaker-ai/skills/planning/SKILL.md create mode 100644 plugins/sagemaker-ai/skills/planning/references/model-customization-plan.md create mode 100644 plugins/sagemaker-ai/skills/planning/references/skill-routing-constraints.md create mode 100644 plugins/sagemaker-ai/skills/use-case-specification/SKILL.md diff --git a/plugins/sagemaker-ai/.claude-plugin/plugin.json b/plugins/sagemaker-ai/.claude-plugin/plugin.json new file mode 100644 index 00000000..ee1b60a1 --- /dev/null +++ b/plugins/sagemaker-ai/.claude-plugin/plugin.json @@ -0,0 +1,22 @@ +{ + "author": { + "name": "Amazon Web Services" + }, + "description": "Equip AI coding agents with skills to build, train, and deploy ML and generative AI workloads on Amazon SageMaker AI.", + "homepage": "https://github.com/awslabs/agent-plugins", + "keywords": [ + "sagemaker", + "machine-learning", + "generative-ai", + "fine-tuning", + "training", + "deployment", + "inference", + "mlops", + "aws" + ], + "license": "Apache-2.0", + "name": "sagemaker-ai", + "repository": "https://github.com/awslabs/agent-plugins", + "version": "1.0.0" +} diff --git a/plugins/sagemaker-ai/.mcp.json b/plugins/sagemaker-ai/.mcp.json new file mode 100644 index 00000000..573fb77e --- /dev/null +++ b/plugins/sagemaker-ai/.mcp.json @@ -0,0 +1,12 @@ +{ + "mcpServers": { + "aws-mcp": { + "command": "uvx", + "args": [ + "mcp-proxy-for-aws@latest", + "https://aws-mcp.us-east-1.api.aws/mcp" + ], + "disabled": false + } + } +} diff --git a/plugins/sagemaker-ai/skills/dataset-evaluation/SKILL.md b/plugins/sagemaker-ai/skills/dataset-evaluation/SKILL.md new file mode 100644 index 00000000..81d2d956 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-evaluation/SKILL.md @@ -0,0 +1,54 @@ +--- +name: dataset-evaluation +description: Validates dataset formatting and quality for SageMaker model fine-tuning (SFT, DPO, or RLVR). 
Use when the user says "is my dataset okay", "evaluate my data", "check my training data", "I have my own data", or before starting any fine-tuning job. Detects file format, checks schema compliance against the selected model and technique, and reports whether the data is ready for training or evaluation. +--- + +# Workflow Instruction + +Follow the workflow shown below. Locate the dataset, check the file type, and resolve any issues with missing files or wrong file types. Determine the fine-tuning model and fine-tuning strategy. Run scripts/format_detector.py to evaluate whether the file is formatted correctly for the currently selected model and strategy. Summarize the results: is the dataset ready for fine-tuning? + +## Workflow + +1. **Locate Dataset**: + - The full path may be a local file path, or an S3 URI + - Resolve the full path to the dataset file, make sure read permissions are available, and help the user if the file is not found + +2. **Determine strategy and model**: + - File formatting depends on the currently selected fine-tuning strategy and fine-tuning base model. + - If the strategy and model are already known from the conversation context (e.g., selected via the finetuning-setup skill), use them. + - If not available in context, activate the finetuning-setup skill to determine them before proceeding. + +3. **Check File Formatting**: Run the tool format_detector.py to make sure the file conforms to formatting requirements. + - Send the full path directly to the format_detector script as an argument + - Do not send the model and strategy as arguments + - Do not download data from S3 + - Do not make local copies of data + +4. 
**Summarize Results**: Tell the user if their data is ready + - Examine the output of format_detector and compare to the known strategy and model + - **Important: training datasets and evaluation datasets have different format requirements.** + - **Training datasets** must match the fine-tuning strategy format (SFT, DPO, RLVR) per `references/strategy_data_requirements.md` + - **Evaluation datasets** (for model evaluation) must match one of the [SageMaker evaluation dataset formats](https://docs.aws.amazon.com/sagemaker/latest/dg/model-customize-evaluation-dataset-formats.html). + - Report back to the user if their current dataset is valid for its intended purpose + - Warn the user if their dataset is valid, but for a different strategy or model + - Warn the user if their dataset is not valid for any strategy/model pair + +## Messages to the User + +- Introduction: "This skill checks the structure of your dataset for model fine-tuning." +- File types: This skill applies to files that are formatted according to the [Amazon SageMaker AI Developer Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/autopilot-llms-finetuning-data-format.html#autopilot-llms-finetuning-dataset-format) + +# Resources + +- scripts/format_detector.py is a self-contained format validation script that can be run independently +- finetuning-setup skill should have already determined the fine-tuning strategy and base model +- references/strategy_data_requirements.md contains data format requirements per strategy + +## Script Details + +- scripts/format_detector.py is a self-contained format validation script that can be run independently: + +```bash +# With the file path argument identified in workflow step 1 +python scripts/format_detector.py local_path/to/dataset +``` diff --git a/plugins/sagemaker-ai/skills/dataset-evaluation/references/strategy_data_requirements.md b/plugins/sagemaker-ai/skills/dataset-evaluation/references/strategy_data_requirements.md new file mode 100644 index 
00000000..7821b1ac --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-evaluation/references/strategy_data_requirements.md @@ -0,0 +1,173 @@ +# Finetuning Strategy Data Requirements + +**Critical** Nova models have a different set of formats than open weights models. Make sure you refer to the right section based on the user's base model. + +## Open Weights Models Data Format by Strategy (Llama, Qwen, GPT-OSS, etc.) + +### SFT (Supervised Fine-Tuning) + +**Required format:** + +```jsonl +{ + "prompt": "", + "completion": "" +} +``` + +**What it needs:** + +- Input-output pairs +- Single "correct" response per input +- Consistent quality across examples + +### DPO (Direct Preference Optimization) + +**Required format:** + +```jsonl +{ + "prompt": "", + "chosen": "", + "rejected": "" +} +``` + +**What it needs:** + +- Input with two responses: preferred (chosen) and dispreferred (rejected) +- Clear preference signal between responses +- Both responses should be plausible but one is better +- Avoiding unintentional length bias + +### RLVR (Reinforcement Learning from Verifiable Rewards) + +**Required format:** + +```jsonl +{ + "data_source": "", + "prompt": [ + { + "content": "", + "role": "" + } + ], + "ability": "", + "reward_model": { + "ground_truth": "", + "style": "" + } +} +``` + +**What it needs:** + +- user prompt +- Ground truth responses in `reward_model.ground_truth` field (leave empty if user data does not have responses) + +**How it works:** + +1. Model generates response for input +2. Lambda receives full user prompt + reward model fields +3. Lambda computes reward (uses ground_truth if included in verification logic) +4. 
Model learns to maximize rewards + +## Nova Models Data Format by Strategy + +### SFT (Supervised Fine-Tuning) + +```jsonl +{ + "schemaVersion": "bedrock-conversation-2024", + "system": [ + { + "text": "" + } + ], + "messages": [ + { + "role": "user", + "content": [ + { + "text": "" + } + ] + }, + { + "role": "assistant", + "content": [ + { + "text": "" + } + ] + } + ] +} +``` + +### DPO (Direct Preference Optimization) + +The format is the same as SFT for the first N-1 turns. The final assistant turn uses `candidates` with `preferenceLabel` instead of regular `content`. + +```jsonl +{ + "schemaVersion": "bedrock-conversation-2024", + "system": [ + { + "text": "" + } + ], + "messages": [ + { + "role": "user", + "content": [ + { + "text": "" + } + ] + }, + { + "role": "assistant", + "candidates": [ + { + "content": [ + { + "text": "" + } + ], + "preferenceLabel": "preferred" + }, + { + "content": [ + { + "text": "" + } + ], + "preferenceLabel": "non-preferred" + } + ] + } + ] +} +``` + +### RLVR + +```jsonl +{ + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + } + ], + "reference_answer": { + "solution": "49" + } +} +``` diff --git a/plugins/sagemaker-ai/skills/dataset-evaluation/scripts/format_detector.py b/plugins/sagemaker-ai/skills/dataset-evaluation/scripts/format_detector.py new file mode 100644 index 00000000..a9ed1fb6 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-evaluation/scripts/format_detector.py @@ -0,0 +1,678 @@ +"""Format detection for S3 JSONL files. + +This module provides functionality to detect and validate JSONL file formats +stored in S3. It samples the first 1MB of a file to determine the format type +across 11 supported formats: Nova SFT, Nova DPO, Nova RLVR, GPT-OSS SFT, +GPT-OSS DPO, Open Weights SFT, Open Weights SFT Conv, Open Weights DPO, +Verl, Verl Legacy, and SageMaker Eval. 
+ +Usage: + result = detect_format("s3://my-bucket/data.jsonl") + if result.is_valid: + print(f"Format: {result.format_type}") +""" + +from dataclasses import dataclass +from enum import Enum +import boto3 +import json +import logging + +logger = logging.getLogger(__name__) + +__all__ = ["FormatType", "ConfidenceLevel", "ValidationError", "FormatDetectionResult", "detect_format"] + + +class FormatType(Enum): + """Supported JSONL format types.""" + NOVA_SFT = "nova_sft" + NOVA_DPO = "nova_dpo" + NOVA_RLVR = "nova_rlvr" + GPT_OSS_SFT = "gpt_oss_sft" + GPT_OSS_DPO = "gpt_oss_dpo" + OPEN_WEIGHTS_SFT = "open_weights_sft" + OPEN_WEIGHTS_SFT_CONV = "open_weights_sft_conv" + OPEN_WEIGHTS_DPO = "open_weights_dpo" + VERL = "verl" + VERL_LEGACY = "verl_legacy" + SAGEMAKER_EVAL = "sagemaker_eval" + UNKNOWN = "unknown" + + +class ConfidenceLevel(Enum): + """Confidence level for format detection results.""" + HIGH = "high" + LOW = "low" + NONE = "none" + + +@dataclass +class ValidationError: + """Represents a validation error found during format detection.""" + line_number: int + error_type: str + message: str + + +@dataclass +class FormatDetectionResult: + """Result of format detection operation.""" + format_type: FormatType + is_valid: bool + lines_sampled: int + errors: list[ValidationError] + confidence: ConfidenceLevel + + +def _sample_local_file(file_path: str, sample_size: int) -> list[str]: + """Sample lines from local JSONL file. 
+ + Args: + file_path: Path to local file + sample_size: Maximum bytes to read + + Returns: + List of lines from file + + Raises: + FileNotFoundError: If file doesn't exist + IOError: If file can't be read + """ + logger.debug("Sampling local file: %s", file_path) + with open(file_path, "rb") as f: + data = f.read(sample_size) + + if not data: + return [] + + text = data.decode("utf-8") + + last_newline_idx = text.rfind("\n") + if last_newline_idx == -1: + return [] + + complete_text = text[:last_newline_idx + 1] + lines = [line for line in complete_text.split("\n") if line] + + return lines + + +def _sample_s3_file(s3_uri: str, sample_size_bytes: int, s3_client=None) -> list[str]: + """Sample the first N bytes of an S3 file and return complete lines. + + Reads the first sample_size_bytes from an S3 file using a Range request, + then truncates to the last complete newline to avoid partial lines. + + Args: + s3_uri: S3 URI in format "s3://bucket/key" + sample_size_bytes: Number of bytes to sample (default 1MB) + s3_client: Optional boto3 S3 client to reuse + + Returns: + List of complete JSONL lines (strings without trailing newlines) + + Raises: + ValueError: If S3 URI is invalid (missing "s3://", bucket, or key) + botocore.exceptions.ClientError: If S3 access fails + """ + logger.debug("Sampling S3 file: %s (%d bytes)", s3_uri, sample_size_bytes) + # Parse S3 URI + if not s3_uri.startswith("s3://"): + raise ValueError(f"Invalid S3 URI: must start with 's3://' (got: {s3_uri})") + + uri_without_prefix = s3_uri[5:] # Remove "s3://" + parts = uri_without_prefix.split("/", 1) + + if len(parts) != 2 or not parts[0] or not parts[1]: + raise ValueError(f"Invalid S3 URI: must contain bucket and key (got: {s3_uri})") + + bucket, key = parts + + # Read first sample_size_bytes using Range header + client = s3_client or boto3.client("s3") + range_header = f"bytes=0-{sample_size_bytes - 1}" + + response = client.get_object(Bucket=bucket, Key=key, Range=range_header) + data = 
response["Body"].read() + + # Handle empty file + if not data: + return [] + + # Decode bytes to string + text = data.decode("utf-8") + + # Find last complete newline to avoid truncated lines + last_newline_idx = text.rfind("\n") + if last_newline_idx == -1: + # No newlines found - return empty list if file is all one line + # (we can't be sure it's complete) + return [] + + # Keep only complete lines (up to and including last newline) + complete_text = text[:last_newline_idx + 1] + + # Split on newlines and filter empty strings + lines = [line for line in complete_text.split("\n") if line] + + return lines + + +def _classify_nova_format(record: dict) -> FormatType: + """Classify Nova-specific format by checking last message structure. + + Args: + record: Parsed JSON record with messages field + + Returns: + FormatType.NOVA_DPO if last message has candidates field, + FormatType.NOVA_SFT if last message has standard content field, + FormatType.UNKNOWN otherwise + """ + messages = record.get("messages", []) + if not messages: + return FormatType.UNKNOWN + + last_message = messages[-1] + if "candidates" in last_message: + return FormatType.NOVA_DPO + elif "content" in last_message and last_message["content"]: + return FormatType.NOVA_SFT + else: + return FormatType.UNKNOWN + + +def _classify_messages_format(record: dict) -> FormatType: + """Distinguish Nova vs GPT-OSS/HF by inspecting content structure. + + Nova has nested content arrays (list of dicts with 'text' field), + GPT-OSS/HF has flat content strings. 
+ + Args: + record: Parsed JSON record with messages field + + Returns: + FormatType value for the detected format + """ + messages = record.get("messages") + + # Critical type checking: messages must be a list + if not isinstance(messages, list): + return FormatType.UNKNOWN + + if not messages: + return FormatType.UNKNOWN + + first_message = messages[0] + + # Check if content field exists + if "content" not in first_message: + return FormatType.UNKNOWN + + content = first_message["content"] + + # Nova: nested content arrays (list of dicts with 'text' field) + if isinstance(content, list): + return _classify_nova_format(record) + # GPT-OSS/HF: flat content strings + elif isinstance(content, str): + return FormatType.GPT_OSS_SFT + else: + return FormatType.UNKNOWN + + +def _classify_schema(samples: list[dict]) -> FormatType: + """Top-level classifier that checks for all 11 supported formats. + + Args: + samples: List of parsed JSON records + + Returns: + FormatType value for the detected format + """ + if not samples: + return FormatType.UNKNOWN + + first = samples[0] + fields = set(first.keys()) + + # SageMaker Evaluation: query + response + if "query" in fields and "response" in fields: + return FormatType.SAGEMAKER_EVAL + + # Verl/RLVR: prompt + (reward_model or extra_info), no completion + if "prompt" in fields and ("reward_model" in fields or "extra_info" in fields): + if "completion" not in fields: + if isinstance(first["prompt"], list): + return FormatType.VERL + return FormatType.VERL_LEGACY + + # Messages-based formats: Nova RLVR, Nova, GPT-OSS + if "messages" in fields: + if "reference_answer" in fields: + return FormatType.NOVA_RLVR + return _classify_messages_format(first) + + # DPO: prompt/chosen/rejected + if {"prompt", "chosen", "rejected"}.issubset(fields): + if isinstance(first["prompt"], list): + return FormatType.GPT_OSS_DPO + return FormatType.OPEN_WEIGHTS_DPO + + # SFT: prompt/completion + if {"prompt", "completion"}.issubset(fields): + if 
isinstance(first["prompt"], list): + return FormatType.OPEN_WEIGHTS_SFT_CONV + return FormatType.OPEN_WEIGHTS_SFT + + return FormatType.UNKNOWN + + +def _validate_nova_messages(messages: list, line_num: int, is_dpo: bool) -> list[ValidationError]: + """Validate Nova SFT/DPO message structure.""" + errors = [] + for msg_idx, msg in enumerate(messages): + if "role" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'role'" + )) + elif msg["role"] not in ["user", "assistant", "system"]: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Invalid role '{msg['role']}' in message {msg_idx}" + )) + if "content" not in msg and "candidates" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing 'content' or 'candidates'" + )) + if "content" in msg and not isinstance(msg["content"], list): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Nova format content must be list, got {type(msg['content']).__name__}" + )) + if is_dpo and "candidates" in msg: + for cand_idx, candidate in enumerate(msg["candidates"]): + if "preferenceLabel" not in candidate: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"DPO message {msg_idx} candidate {cand_idx} missing 'preferenceLabel'" + )) + elif candidate["preferenceLabel"] not in ["preferred", "non-preferred"]: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Invalid preferenceLabel '{candidate['preferenceLabel']}' in message {msg_idx} candidate {cand_idx}" + )) + return errors + + +def _validate_gpt_messages(messages: list, line_num: int) -> list[ValidationError]: + """Validate GPT-OSS SFT message structure.""" + errors = [] + for msg_idx, msg in 
enumerate(messages): + if "role" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'role'" + )) + elif msg["role"] not in ["user", "assistant", "system"]: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Invalid role '{msg['role']}' in message {msg_idx}" + )) + if "content" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'content'" + )) + elif not isinstance(msg["content"], str): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"GPT-OSS format content must be string, got {type(msg['content']).__name__}" + )) + return errors + + +def _validate_rlvr_messages(messages: list, line_num: int) -> list[ValidationError]: + """Validate Nova RLVR message structure.""" + errors = [] + for msg_idx, msg in enumerate(messages): + if "role" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'role'" + )) + elif msg["role"] not in ["user", "assistant", "system"]: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Invalid role '{msg['role']}' in message {msg_idx}" + )) + if "content" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Message {msg_idx} missing required field 'content'" + )) + elif not isinstance(msg["content"], str): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Nova RLVR content must be string, got {type(msg['content']).__name__}" + )) + return errors + + +def _validate_verl_prompt(record: dict, line_num: int) -> list[ValidationError]: + """Validate Verl prompt structure (list of role/content 
dicts).""" + errors = [] + if "prompt" not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message="Missing required field 'prompt'" + )) + elif not isinstance(record["prompt"], list): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Verl field 'prompt' must be list, got {type(record['prompt']).__name__}" + )) + else: + for msg_idx, msg in enumerate(record["prompt"]): + if not isinstance(msg, dict) or "role" not in msg or "content" not in msg: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Prompt message {msg_idx} must have 'role' and 'content'" + )) + if "reward_model" not in record and "extra_info" not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message="Missing required field 'reward_model' or 'extra_info'" + )) + return errors + + +def _validate_verl_legacy_prompt(record: dict, line_num: int) -> list[ValidationError]: + """Validate Verl Legacy prompt structure (string) and extra_info.""" + errors = [] + if "prompt" not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message="Missing required field 'prompt'" + )) + elif not isinstance(record["prompt"], str): + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Verl Legacy field 'prompt' must be string, got {type(record['prompt']).__name__}" + )) + if "extra_info" not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message="Missing required field 'extra_info'" + )) + return errors + + +# Schema-driven format validation specs. +# Each entry defines required_fields (field->type mapping) and an optional +# message_validator or record_validator for complex per-record checks. 
+# - message_validator: called with (messages_list, line_num) -> list[ValidationError] +# Used for formats whose top-level required field is "messages" (list). +# - record_validator: called with (record, line_num) -> list[ValidationError] +# Used for formats needing whole-record access (verl, verl_legacy). +FORMAT_SCHEMAS = { + FormatType.NOVA_SFT: { + "required_fields": {"messages": list}, + "message_validator": lambda msgs, ln: _validate_nova_messages(msgs, ln, is_dpo=False), + }, + FormatType.NOVA_DPO: { + "required_fields": {"messages": list}, + "message_validator": lambda msgs, ln: _validate_nova_messages(msgs, ln, is_dpo=True), + }, + FormatType.NOVA_RLVR: { + "required_fields": {"messages": list, "reference_answer": dict}, + "message_validator": _validate_rlvr_messages, + }, + FormatType.GPT_OSS_SFT: { + "required_fields": {"messages": list}, + "message_validator": _validate_gpt_messages, + }, + FormatType.GPT_OSS_DPO: { + "required_fields": {"prompt": list, "chosen": list, "rejected": list}, + "field_error_prefix": "GPT-OSS DPO", + }, + FormatType.OPEN_WEIGHTS_SFT: { + "required_fields": {"prompt": str, "completion": str}, + "field_error_prefix": "Open Weights SFT", + }, + FormatType.OPEN_WEIGHTS_SFT_CONV: { + "required_fields": {"prompt": list, "completion": list}, + "field_error_prefix": "Open Weights SFT Conv", + }, + FormatType.OPEN_WEIGHTS_DPO: { + "required_fields": {"prompt": str, "chosen": str, "rejected": str}, + "field_error_prefix": "Open Weights DPO", + }, + FormatType.SAGEMAKER_EVAL: { + "required_fields": {"query": str, "response": str}, + "field_error_prefix": "SageMaker Eval", + }, + FormatType.VERL: { + "required_fields": {}, + "record_validator": _validate_verl_prompt, + }, + FormatType.VERL_LEGACY: { + "required_fields": {}, + "record_validator": _validate_verl_legacy_prompt, + }, +} + + +def _validate_samples(samples: list[dict], expected_format: FormatType, line_numbers: list[int]) -> tuple[bool, list[ValidationError]]: + """Validate 
that all samples conform to the expected format schema. + + Args: + samples: List of parsed JSON records + expected_format: Expected FormatType enum value + line_numbers: 1-based line numbers corresponding to each sample + + Returns: + Tuple of (is_valid, errors) where errors is a list of ValidationError objects + """ + errors = [] + schema = FORMAT_SCHEMAS.get(expected_format) + + for record, line_num in zip(samples, line_numbers): + # Check schema consistency + detected_format = _classify_schema([record]) + if detected_format != expected_format: + errors.append(ValidationError( + line_number=line_num, + error_type="schema_mismatch", + message=f"Expected {expected_format.value} but found {detected_format.value}" + )) + continue + + if schema is None: + continue + + # Record-level validator (verl, verl_legacy) handles everything + if "record_validator" in schema: + errors.extend(schema["record_validator"](record, line_num)) + continue + + # Check required fields exist with correct types + required = schema["required_fields"] + prefix = schema.get("field_error_prefix", "") + skip_messages = False + for field, expected_type in required.items(): + if field not in record: + errors.append(ValidationError( + line_number=line_num, + error_type="missing_field", + message=f"Missing required field '{field}'" + )) + if field == "messages": + skip_messages = True + elif not isinstance(record[field], expected_type): + actual = type(record[field]).__name__ + if field == "messages": + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Field 'messages' must be a list" + )) + skip_messages = True + elif prefix: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"{prefix} field '{field}' must be {expected_type.__name__}, got {actual}" + )) + else: + errors.append(ValidationError( + line_number=line_num, + error_type="invalid_structure", + message=f"Field '{field}' must be 
{expected_type.__name__}, got {actual}" + )) + + if skip_messages: + continue + + # Message-level validator + if "message_validator" in schema: + errors.extend(schema["message_validator"](record["messages"], line_num)) + + logger.debug("Validation found %d error(s)", len(errors)) + return (len(errors) == 0, errors) + + +def detect_format(file_path: str, sample_size_bytes: int = 1_048_576, s3_client=None) -> FormatDetectionResult: + """Detect the format of a JSONL file in S3 or on local disk. + + Samples the first sample_size_bytes of the file and analyzes the structure + to determine if it matches one of the 11 supported formats. + + Args: + file_path: S3 URI (s3://bucket/key) or local file path + sample_size_bytes: Number of bytes to sample (default 1MB = 1,048,576 bytes) + s3_client: Optional boto3 S3 client to reuse (ignored for local files) + + Returns: + FormatDetectionResult with format type, validation status, and any errors + """ + if file_path.startswith("s3://"): + lines = _sample_s3_file(file_path, sample_size_bytes, s3_client=s3_client) + else: + lines = _sample_local_file(file_path, sample_size_bytes) + + # Parse JSON lines and collect parse errors + parsed_records = [] + line_numbers = [] + errors = [] + + for line_num, line in enumerate(lines, start=1): + try: + parsed_records.append(json.loads(line)) + line_numbers.append(line_num) + except json.JSONDecodeError as e: + errors.append(ValidationError( + line_number=line_num, + error_type="parse_error", + message=f"Invalid JSON: {str(e)}" + )) + + # If no successfully parsed records, return UNKNOWN with parse errors + if not parsed_records: + confidence = ConfidenceLevel.NONE if errors else ConfidenceLevel.HIGH + return FormatDetectionResult( + format_type=FormatType.UNKNOWN, + is_valid=len(errors) == 0, + lines_sampled=len(lines), + errors=errors, + confidence=confidence + ) + + # Classify schema using first successfully parsed record + format_type = _classify_schema(parsed_records) + + # Validate all 
parsed records against detected format + is_valid, validation_errors = _validate_samples(parsed_records, format_type, line_numbers) + errors.extend(validation_errors) + + # Calculate confidence level + if len(errors) == 0: + confidence = ConfidenceLevel.HIGH + elif any(err.error_type == "parse_error" for err in errors): + confidence = ConfidenceLevel.NONE + else: + confidence = ConfidenceLevel.LOW + + logger.debug("Detected format: %s (valid=%s, confidence=%s)", format_type.value, is_valid, confidence.value) + + return FormatDetectionResult( + format_type=format_type, + is_valid=len(errors) == 0, + lines_sampled=len(lines), + errors=errors, + confidence=confidence + ) + + +if __name__ == "__main__": + import argparse + import sys + + parser = argparse.ArgumentParser(description="Detect and validate JSONL file formats") + parser.add_argument("file_path", help="S3 URI (s3://bucket/key) or local file path") + parser.add_argument("--sample-size", type=int, default=1_048_576, help="Bytes to sample (default: 1MB)") + parser.add_argument("--json", action="store_true", help="Output as JSON instead of human-readable") + args = parser.parse_args() + + try: + result = detect_format(args.file_path, args.sample_size) + + if args.json: + output = { + "format_type": result.format_type.value, + "is_valid": result.is_valid, + "confidence": result.confidence.value, + "lines_sampled": result.lines_sampled, + "errors": [ + {"line_number": e.line_number, "error_type": e.error_type, "message": e.message} + for e in result.errors + ], + } + print(json.dumps(output, indent=2)) + else: + print(f"Format: {result.format_type.value}") + print(f"Valid: {'✓' if result.is_valid else '✗'}") + print(f"Confidence: {result.confidence.name}") + print(f"Lines sampled: {result.lines_sampled}") + if result.errors: + print("Errors:") + for err in result.errors: + print(f" Line {err.line_number}: {err.message}") + + sys.exit(0 if result.is_valid else 1) + except (FileNotFoundError, IOError, ValueError) as 
e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/SKILL.md b/plugins/sagemaker-ai/skills/dataset-transformation/SKILL.md new file mode 100644 index 00000000..91fef48c --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/SKILL.md @@ -0,0 +1,220 @@ +--- +name: dataset-transformation +description: Generates a Jupyter notebook that transforms datasets between ML schemas for model training or evaluation. Use when the user says "transform", "convert", "reformat", "change the format", or when a dataset's schema needs to change to match the target format — always use this skill for format changes rather than writing inline transformation code. Supports OpenAI chat, SageMaker SFT/DPO/RLVR, HuggingFace preference, Bedrock Nova, VERL, and custom JSONL formats from local files or S3. +--- + +# Dataset Transformation Agent + +Transforms a data set provided by the user into their desired format. All transformation code is delivered as a Jupyter notebook. + +## When to Use + +- User needs to generate code for transforming datasets for SageMaker model training or model evaluation. +- A dataset requires processing, cleaning, or formatting before training or evaluation. +- Workflow requires a formal review and approval cycle before execution. + +## Principles + +1. **One thing at a time.** Each response advances exactly one decision. Never combine multiple questions or recommendations in a single turn. +2. **Confirm before proceeding.** Wait for the user to agree before moving to the next step. You are a guide, not a runaway train. +3. **Don't read files until you need them.** Only read reference files when you've reached the workflow step that requires them and the user has confirmed the direction. Never read ahead. +4. **No narration.** Don't explain what you're about to do or what you just did. Share outcomes and ask questions. Keep responses short and focused. +5. 
**No repetition.** If you said something before a tool call, don't repeat it after. Only share new information. +6. **Do not deviate from the Workflow.** The steps listed in the workflow should be followed exactly as described. Progress from Step 1 to Step 10 to complete the task. Do not deviate from the workflow! +7. **Always end with a question.** Whenever you pause for user input, acknowledgment, or feedback, your response must end with a question. Never leave the user with a statement and expect them to know they need to respond. +8. **Never overwrite existing files — append instead.** If a target notebook already exists, do NOT overwrite it. Append new cells to the existing file. Notify the user that the file already exists and that you will be appending to it. +9. **Avoid filename collisions.** When creating a new file, check if a file with the same name already exists. If it does, rename the new file by appending a numeric suffix (e.g., `transform_dataset_2.ipynb`) before writing. +10. **Default output format is JSONL.** Unless the user explicitly requests a different file format, the transformed dataset should be written as `.jsonl` (JSON Lines — one JSON object per line). + +## Known Dataset Formats Reference + +This skill supports two transformation purposes — **training data** and **evaluation data** — each with its own format resolution path. The purpose is determined in Step 1 of the workflow. + +### Training Data Formats + +When the transformation is for **model training**, resolve the target format using the reference file `../dataset-evaluation/references/strategy_data_requirements.md`. The required format depends on both the **model type** (Open Weights like Llama/Qwen vs Nova) and the **finetuning technique** (SFT, DPO, RLVR) — make sure to match on both dimensions. If either the model type or technique is not yet known, ask the user before resolving the format. 
+ +### Evaluation Data Formats + +When the transformation is for **model evaluation**, resolve the target format using this order: + +1. Try fetching the live documentation at https://docs.aws.amazon.com/sagemaker/latest/dg/model-customize-evaluation-dataset-formats.html to get the latest evaluation dataset schema definitions. +2. **If the fetch fails** (e.g., no internet access, VPC environment), fall back to the offline copy at `references/sagemaker_dataset_formats.md`. Inform the user that the format schemas are from an offline copy and may be outdated. + +Use whichever source you successfully access as the source of truth for the target format. Do not rely on memorized schemas. + +## Workflow + +### Step 1: Determine transformation purpose + +Your first response should determine whether this transformation is for **model training** or **model evaluation**. If the context already makes this clear (e.g., the user said "I need to prep my training data" or "I need to format my eval dataset"), confirm your understanding and move on. Otherwise, ask: + +> "Is this dataset transformation for model training or model evaluation? This helps me look up the right target format for you." + +- **Training** → format resolution will use the local training data requirements reference (model type + finetuning technique dependent). +- **Evaluation** → format resolution will use the live AWS documentation (with offline fallback). + +Remember this choice — it determines how the target format is resolved in Step 3. + +⏸ Wait for user. + +### Step 2: Set expectations + +Acknowledge the user's request and state what this skill can do: + +> "I can help you transform your dataset's format! Here's my plan: I will first need to understand the format of your dataset and the transformation requirements. Once I have that, I will generate a dataset transformation function that we can refine together. 
After the dataset transformation function is refined to your liking, I will perform the transformation task and upload it to your desired location! Does this sound good?"
+
+⏸ Wait for user.
+
+### Step 3: Understand the dataset transformation task
+
+For this step, you need to know: **what dataset format the user would like to transform their dataset from and what dataset format they would like to transform it into.**
+If you know this already, skip this step. If not, ask the user:
+
+> "What's the dataset format you would like to transform it into?"
+
+Resolve the target format based on the purpose determined in Step 1:
+
+- **If training data**: Ask the user for the finetuning technique (SFT, DPO, RLVR) and model type (Open Weights like Llama/Qwen vs Nova) if not already known. Then look up the required format from the "Training Data Formats" section in the Known Dataset Formats Reference above.
+- **If evaluation data**: If the user mentions a well-known format name (e.g., "OpenAI format", "SageMaker format"), fetch the schema from the live documentation as described in the "Evaluation Data Formats" section above. If a well-known format is fetched, confirm with the user:
+
+> "I've found a SageMaker dataset format: {sagemaker-dataset-format-name} with schema: {sagemaker-dataset-format-schema}. Is this what you were referring to?"
+
+If the user describes a custom format not listed in the reference doc, ask them to provide a sample record of the desired output format.
+
+⏸ Wait for user.
+
+### Step 4: Get the dataset from the user
+
+For this step, you need: **the location of the user's dataset**.
+If you know this already, skip this step. If not, ask the user:
+
+> "Where can I find your dataset? Either a local directory or S3 location works!"
+
+⏸ Wait for user.
+
+### Step 5: Examine sample data
+
+Read 1–2 sample records from the user's dataset and show them so the user can confirm the source schema. 
Do not run format detection — that is handled by the planning skill before this skill is invoked.
+
+Do not show a side-by-side mapping to the target format here — the detailed mapping will be handled in Step 7 when generating the transformation function.
+
+⏸ Wait for user.
+
+### Step 6: Get the dataset output location
+
+For this step, you need: **to understand where to output the transformed dataset to. It could be an S3 URI or local directory**
+If you already know where the dataset is supposed to be output to, skip this step. If not, ask the user:
+
+> "Where should I output your transformed dataset to? Either a local directory or S3 location works!"
+
+If the user provides a directory (not a full file path), construct the output filename using the pattern `{original_name}_{target_format}.jsonl` (e.g., `gen_qa_100k_openai.jsonl`).
+
+⏸ Wait for user.
+
+### Step 7: Generate and validate the transformation function
+
+For this step, you need: **to generate a python function that transforms the dataset from the format in Step 5 to the format in Step 3**
+
+Read the reference guide at `references/dataset_transformation_code.md` and follow its skeleton exactly when generating the transformation function.
+
+The python function should be in the form of:
+
+```python
+def transform_dataset(df: pd.DataFrame) -> pd.DataFrame:
+```
+
+Add a `%%writefile <project_dir>/scripts/transform_fn.py` code cell to the notebook AND write the file to disk for testing. The `<project_dir>` is the project directory established by the directory-management skill (e.g., `dpo-to-rlvr-conversion`). All notebooks go in `<project_dir>/notebooks/` and all scripts go in `<project_dir>/scripts/`.
+
+Continue iterating with the user's feedback — update the notebook cell in place on each revision rather than showing code inline.
+
+**If sample data was collected in Step 5**, test the function against the sample records:
+
+1. Generate the transformation function.
+2. 
Write the sample data to a temporary JSONL file (e.g., `/tmp/test_input.jsonl`), then run:
+   `python3 -c "import sys; sys.path.insert(0, '<project_dir>/scripts'); from transform_fn import transform_dataset; import pandas as pd; df = pd.read_json('/tmp/test_input.jsonl', lines=True); result = transform_dataset(df); print(result.to_json(orient='records', lines=True))"`
+3. If the test fails, fix and re-test until it passes.
+4. Show the user the function and transformed sample output for review.
+
+**If no sample data**, present the function for review and refinement.
+
+⏸ Wait for user.
+
+### Step 8: Generate the execution cells in the notebook
+
+**Before writing the notebook, read:**
+
+- `references/notebook_structure.md` (cell order, placeholders, and content)
+- `references/notebook_writing_guide.md` (Jupyter notebook JSON formatting)
+
+Generate the execution logic as code cells in the notebook.
+
+- Add a `%%writefile <project_dir>/scripts/<script_name>.py` code cell to the notebook AND write the file to disk for testing.
+- The script must import `transform_dataset` from `transform_fn`.
+- Replace placeholders with the actual input/output paths.
+
+Read the reference guide at `references/dataset_transformation_code.md` and follow its execution script skeleton exactly.
+
+**If sample data was collected in Step 5**, test the full pipeline:
+
+1. Write the sample records to a temporary JSONL file (e.g., `/tmp/test_input.jsonl`).
+2. Run: `python3 <project_dir>/scripts/<script_name>.py --input /tmp/test_input.jsonl --output /tmp/test_output.jsonl`
+3. If it fails, debug and fix, then re-run until successful.
+4. Show the user the output for review.
+
+**If no sample data**, present the notebook for review and refinement.
+
+⏸ Wait for user.
+
+### Step 9: Determine and confirm execution mode
+
+Check the size of the input dataset:
+
+- If the dataset is in S3, use the AWS MCP tool `head-object` (S3 service) with the bucket and key to get `ContentLength`.
+- If the dataset is local, check the file size. 
+
+**Decision criteria:**
+
+- Dataset < 50 MB → recommend local execution
+- Dataset ≥ 50 MB → recommend SageMaker Processing Job
+
+Inform the user of the recommendation and get their approval:
+
+If local:
+
+> "Your dataset is {size} MB — since it's under 50 MB, I'd recommend running the transformation locally. Would you like to proceed with local execution, or would you prefer a SageMaker Processing Job instead?"
+
+If SageMaker Processing Job:
+
+> "Your dataset is {size} MB — since it's over 50 MB, I'd recommend running this as a SageMaker Processing Job for better performance. Would you like to proceed with a SageMaker Processing Job, or would you prefer to run it locally instead?"
+
+Do not execute until the user approves. If the user rejects the recommendation, switch to the alternative and get their explicit approval before proceeding.
+
+⏸ Wait for user.
+
+**After user confirms, add an execution cell to the notebook. Do NOT run the full transformation — only generate the cell for the user to execute themselves:**
+
+If local execution:
+
+- Add a cell that runs the transformation by importing from the `.py` files already on disk (written by the agent during Steps 7–8): import `transform_dataset` from `transform_fn`, load the dataset, transform, and save output. Scripts are located in `<project_dir>/scripts/`.
+
+If SageMaker Processing Job:
+
+- Add a cell that submits and monitors the Processing Job inline using the V3 SageMaker SDK directly (FrameworkProcessor, ProcessingInput, ProcessingOutput, etc.). Create a FrameworkProcessor with the SKLearn 1.2-1 image, configure inputs/outputs, and call `processor.run(wait=True, logs=True)` to block the cell and stream logs until the job completes. See `scripts/transformation_tools.py` for reference implementation details.
+- Inform the user they can run this cell to kick off and monitor the job.
+
+**Important:** The agent must NOT execute the full dataset transformation itself. 
The notebook cells are generated for the user to review and run. Only sample data (from Steps 7–8) should be transformed by the agent for validation purposes. + +> "I've added the execution cell to the notebook. You can run it to transform the full dataset. Would you like to review the notebook before running it?" + +⏸ Wait for user. + +### Step 10: Verify and confirm with the user + +For this step, you need: **to verify the output looks correct and confirm with the user.** + +- Read 1–2 sample records from the output to show the user. +- Report the total number of records transformed. +- Ask the user if the output looks good. + +⏸ Wait for user to confirm. diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/references/dataset_transformation_code.md b/plugins/sagemaker-ai/skills/dataset-transformation/references/dataset_transformation_code.md new file mode 100644 index 00000000..86adb243 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/references/dataset_transformation_code.md @@ -0,0 +1,135 @@ +# Dataset Transformation Code Reference + +## When to Reference + +When generating: + +- a dataset transformation function +- a dataset transformation execution script + +**follow the exact python skeletons** captured in this document. + +## Related Files + +- `scripts/transformation_tools.py` — contains `execute_transformation_job()` for running the generated script as a SageMaker Processing Job. Use this when the user wants remote execution instead of local. + +## Requirements + +- The dataset transformation function should: **ONLY transform the input DataFrame into the target output format. 
No I/O, no side effects.**
+- The dataset transformation execution script should: **ORCHESTRATE the full pipeline: load the dataset using `load_dataset_from`, apply the transformation function, and write the output using `output_dataset_to`.**
+- The script must work in two execution contexts:
+  - **Local execution**: paths may be S3 URIs or local file paths
+  - **SageMaker Processing Job**: inputs are mounted at `/opt/ml/processing/input/` and outputs go to `/opt/ml/processing/output/`
+
+## Generating a dataset transformation function
+
+The transformation function should be saved to its own file at `<project_dir>/scripts/transform_fn.py` so the user can view and edit it directly. The `<project_dir>` is the project directory established by the directory-management skill (e.g., `dpo-to-rlvr-conversion`).
+
+```python
+import pandas as pd
+
+def transform_dataset(df: pd.DataFrame) -> pd.DataFrame:
+    # Transform each row from source format to target format
+    # Return a DataFrame matching the target schema
+    transformed = {transformation logic}
+    return transformed
+```
+
+## Generating a dataset transformation execution script
+
+The execution script imports `transform_dataset` from `transform_fn.py` rather than embedding it inline. Both files must be in the same directory (`<project_dir>/scripts/`).
+
+```python
+import pandas as pd
+import json
+import subprocess
+import shutil
+import os
+import argparse
+from transform_fn import transform_dataset
+
+def load_dataset_from(input_location: str, to: str):
+    """
+    Load a dataset from S3 or local path.
+    - input_location: S3 URI or local file path (including SageMaker Processing mounted paths)
+    - to: local file path to save the dataset to
+    """
+    if input_location.startswith("s3://"):
+        subprocess.run(["aws", "s3", "cp", input_location, to], check=True)
+    else:
+        shutil.copy(input_location, to)
+
+def output_dataset_to(output_location: str, from_path: str):
+    """
+    Output a dataset to S3 or local path. 
+ - output_location: S3 URI or local directory/file path (including SageMaker Processing mounted paths) + - from_path: local file path of the transformed dataset to upload/move + """ + if output_location.startswith("s3://"): + subprocess.run(["aws", "s3", "cp", from_path, output_location], check=True) + else: + os.makedirs(os.path.dirname(output_location) or ".", exist_ok=True) + shutil.copy(from_path, output_location) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True, help="S3 URI, local path, or /opt/ml/processing/input/...") + parser.add_argument("--output", required=True, help="S3 URI, local path, or /opt/ml/processing/output/...") + args = parser.parse_args() + + # 1. Load dataset + local_input = "/tmp/input_dataset.jsonl" + load_dataset_from(args.input, to=local_input) + + # 2. Read into DataFrame + df = pd.read_json(local_input, lines=True) + print(f"Loaded {len(df)} records") + + # 3. Transform + df = transform_dataset(df) + + # 4. Write transformed output locally + local_output = "/tmp/output_dataset.jsonl" + df.to_json(local_output, orient="records", lines=True) + + # 5. Output to destination + output_dataset_to(args.output, from_path=local_output) + + print(f"Transformed {len(df)} records -> {args.output}") +``` + +## Execution Examples + +### Local execution + +```bash +python transform.py --input s3://my-bucket/data/input.jsonl --output s3://my-bucket/data/output.jsonl +``` + +### SageMaker Processing Job + +Use `execute_transformation_job` from `scripts/transformation_tools.py` to run the script as a SageMaker Processing Job. This function handles container setup, S3 input/output mounting, and job orchestration. Do not manually construct Processing Job logic — always delegate to this tool. + +The job is submitted asynchronously (`wait=False`). Use `describe_transformation_job` to check job status. 
+ +```python +from scripts.transformation_tools import execute_transformation_job, describe_transformation_job + +execute_transformation_job( + transform_script_path="transform.py", # Local path to the saved script + dataset_source_s3="s3://bucket/input.jsonl", # S3 URI of input dataset + dataset_output_s3="s3://bucket/output/", # S3 URI for output +) +``` + +After submitting, check status with: + +```python +from scripts.transformation_tools import describe_transformation_job + +status = describe_transformation_job(job_name="") +print(status) +# Returns: {"job_name": "...", "status": "InProgress|Completed|Failed|Stopped", ...} +``` + +Call `describe_transformation_job` repeatedly (every ~30 seconds) until `status` is `Completed`, `Failed`, or `Stopped`. diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_structure.md b/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_structure.md new file mode 100644 index 00000000..fe29349d --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_structure.md @@ -0,0 +1,46 @@ +# Dataset Transformation Notebook Structure + +Cell order, placeholders, and JSON formatting for the dataset transformation notebook. 
+ +## Cells + +| Cell | Label | Content | +| ---- | ------------------------------------------- | ----------------------------------------------------------------- | +| 0 | Markdown header: `# Dataset Transformation` | Description of the transformation (source format → target format) | +| 1 | Configuration | Input/output paths, region, any user-configurable parameters | +| 2 | Transformation Function | The approved `transform_dataset(df)` function from Step 6 | +| 3 | Load Dataset | Load dataset using `load_dataset_from` and read into DataFrame | +| 4 | Transform | Apply `transform_dataset(df)` and preview transformed records | +| 5 | Save Output | Write transformed DataFrame and upload using `output_dataset_to` | + +## Placeholders (Cell 1 only) + +| Placeholder | Description | Example | +| ------------------- | ------------------------------------- | --------------------------------------- | +| `[INPUT_LOCATION]` | S3 URI or local path to input dataset | `s3://bucket/data/input.jsonl` | +| `[OUTPUT_LOCATION]` | S3 URI or local path for output | `s3://bucket/output/input_openai.jsonl` | + +## JSON Formatting + +Each line of code is a separate string in `source`, ending with `\n` (except the last line): + +```json +{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "x = 5\n", + "print(x)" + ] +} +``` + +- Escape quotes inside strings: `\"` +- No trailing commas in arrays or objects +- 2-space indentation +- Use `fs_write` with `command: create` to write the complete notebook JSON +- Markdown cell 0: `"cell_type": "markdown"`, no `execution_count` or `outputs` +- Wrap all cells in `{"cells": [...], "metadata": {...}, "nbformat": 4, "nbformat_minor": 4}` diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_writing_guide.md b/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_writing_guide.md new file mode 100644 index 00000000..3b79be11 --- 
/dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/references/notebook_writing_guide.md @@ -0,0 +1,99 @@ +# Guide: Writing Jupyter Notebooks + +## Critical Differences from Regular Files + +Jupyter notebooks (.ipynb) are JSON files with a specific structure. Writing to them is fundamentally different from writing regular Python files. + +## The Problem + +When you write Python code to a regular .py file, you write it as plain text with newlines: + +```python +import os +x = 5 +print(x) +``` + +But in a Jupyter notebook, each line must be a separate string in a JSON array: + +```json +{ + "source": [ + "import os\n", + "x = 5\n", + "print(x)" + ] +} +``` + +## The Solution: Use fs_write with JSON Structure + +**ALWAYS use the `fs_write` tool with `command: create` to write notebooks.** + +### Correct Notebook Structure (Pretty-Print Format) + +Use **2-space indentation** (pretty-print format) for consistent, readable formatting: + +```json +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is line 1\n", + "import os\n", + "x = 5\n", + "print(x)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} +``` + +**CRITICAL**: Use exactly 2 spaces for each indentation level (standard pretty-print format). + +### Key Points + +1. **Each line ends with `\n`** - This is how newlines are represented in JSON strings +2. **Lines are separate array elements** - Each line is a string in the `source` array +3. **Use proper JSON escaping** - Quotes inside strings must be escaped: `\"text\"` +4. 
**No trailing comma** - Last element in arrays/objects should not have a comma + +## Common Mistakes to Avoid + +❌ **DON'T** use bash commands to generate JSON and pipe to file +❌ **DON'T** write code as a single string without line breaks +❌ **DON'T** forget to escape quotes in strings +❌ **DON'T** add trailing commas to last array elements + +✅ **DO** use fs_write with the complete JSON structure +✅ **DO** add `\n` to end of each line in source arrays +✅ **DO** validate JSON structure before writing +✅ **DO** use proper escaping for special characters + +## Validation Checklist + +Before writing the notebook, verify: + +- [ ] Each cell has proper structure (cell_type, execution_count, metadata, outputs, source) +- [ ] Source arrays have each line as a separate string ending in `\n` +- [ ] Quotes are properly escaped +- [ ] No trailing commas +- [ ] Metadata section is complete +- [ ] nbformat and nbformat_minor are set diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/references/sagemaker_dataset_formats.md b/plugins/sagemaker-ai/skills/dataset-transformation/references/sagemaker_dataset_formats.md new file mode 100644 index 00000000..b0d51d88 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/references/sagemaker_dataset_formats.md @@ -0,0 +1,146 @@ +# SageMaker Supported Dataset Formats (Offline Fallback) + +This is an offline copy of the supported dataset formats from: +https://docs.aws.amazon.com/sagemaker/latest/dg/model-customize-evaluation-dataset-formats.html + +**Note:** Always attempt to fetch the live documentation first. Only use this file as a fallback when internet access is unavailable (e.g., VPC environments). + +## Required Fields + +| Field | Required | +| ------------- | ---------------------- | +| User Prompt | Yes | +| System Prompt | No | +| Ground truth | Only for Custom Scorer | +| Category | No | + +## 1. 
OpenAI Format + +```json +{ + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + }, + { + "role": "assistant", + "content": "Hello to you!" + } + ] +} +``` + +- `system` role is optional (system prompt) +- `user` role is the query +- `assistant` role is the ground truth + +## 2. SageMaker Evaluation Format + +```json +{ + "system": "You are an English major with top marks in class who likes to give minimal word responses: ", + "query": "What is the symbol that ends the sentence as a question", + "response": "?", + "category": "Grammar" +} +``` + +- `system` and `category` are optional +- `response` is the ground truth + +## 3. HuggingFace Prompt Completion Format + +### Standard + +```json +{ + "prompt": "What is the symbol that ends the sentence as a question", + "completion": "?" +} +``` + +### Conversational + +```json +{ + "prompt": [ + { "role": "user", "content": "What is the symbol that ends the sentence as a question" } + ], + "completion": [ + { "role": "assistant", "content": "?" } + ] +} +``` + +- `completion` is the ground truth + +## 4. HuggingFace Preference Format + +### Standard + +```json +{ + "prompt": "The sky is", + "chosen": "blue", + "rejected": "green" +} +``` + +### Conversational + +```json +{ + "prompt": [ + { "role": "user", "content": "What color is the sky?" } + ], + "chosen": [ + { "role": "assistant", "content": "It is blue." } + ], + "rejected": [ + { "role": "assistant", "content": "It is green." } + ] +} +``` + +- `chosen` is the ground truth + +## 5. 
Verl Format + +### Current (prompt as messages array) + +```json +{ + "data_source": "openai/gsm8k", + "prompt": [ + { "content": "You are a helpful math tutor.", "role": "system" }, + { "content": "What is 2+2?", "role": "user" } + ], + "ability": "math", + "extra_info": { + "answer": "4" + }, + "reward_model": { + "ground_truth": "4" + } +} +``` + +### Legacy (prompt as string) + +```json +{ + "data_source": "openai/gsm8k", + "prompt": "What is 2+2?", + "extra_info": { + "answer": "4" + } +} +``` + +- Ground truth via `extra_info.answer` (preferred) or `reward_model.ground_truth` +- Preserves metadata fields: `id`, `data_source`, `ability`, `reward_model`, `extra_info`, `attributes`, `difficulty` diff --git a/plugins/sagemaker-ai/skills/dataset-transformation/scripts/transformation_tools.py b/plugins/sagemaker-ai/skills/dataset-transformation/scripts/transformation_tools.py new file mode 100644 index 00000000..4cc38743 --- /dev/null +++ b/plugins/sagemaker-ai/skills/dataset-transformation/scripts/transformation_tools.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +import os + +import boto3 +from sagemaker.core.helper.session_helper import Session, get_execution_role +from sagemaker.core import image_uris +from sagemaker.core.processing import FrameworkProcessor +from sagemaker.core.shapes import ProcessingInput, ProcessingOutput, ProcessingS3Input, ProcessingS3Output +from sagemaker.core.resources import ProcessingJob +from sagemaker.core import Attribution, set_attribution + +set_attribution(Attribution.SAGEMAKER_AGENT_PLUGIN) + + +def _get_session(region=None): + """Create a SageMaker Session, optionally pinned to a region.""" + return Session( + boto_session=boto3.Session(region_name=region) if region else None + ) + + +def execute_transformation_job( + transform_script_path, + dataset_source_s3, + dataset_output_s3, + instance_type="ml.m5.xlarge", + region=None, + execution_role=None, + base_job_name="dataset-transformation", + image_uri=None, +): + """ + 
Execute a dataset transformation script as a SageMaker Processing Job + using the V3 SDK FrameworkProcessor. + + The entire directory containing the script is uploaded as source_dir, + so transform_fn.py (and any other dependencies) are included automatically. + + Args: + transform_script_path: Local path to the transformation script (e.g., "/scripts/transform.py") + dataset_source_s3: S3 URI of the input dataset + dataset_output_s3: S3 URI for the transformed output dataset + instance_type: ML instance type (default: ml.m5.xlarge) + region: AWS region (auto-detected if None) + execution_role: IAM role ARN (auto-detected if None) + base_job_name: Prefix for the Processing Job name + image_uri: Docker image URI for the processing container. + If None, uses the SKLearn processing image. + """ + if not execution_role: + execution_role = get_execution_role() + + sagemaker_session = _get_session(region) + + if not region: + region = sagemaker_session.boto_region_name + + # Use SKLearn processing image as default (includes pandas) + if not image_uri: + image_uri = image_uris.retrieve( + framework="sklearn", + region=region, + version="1.2-1", + instance_type=instance_type, + ) + + source_dir = os.path.dirname(os.path.abspath(transform_script_path)) + script_name = os.path.basename(transform_script_path) + + processor = FrameworkProcessor( + role=execution_role, + image_uri=image_uri, + command=["python3"], + instance_count=1, + instance_type=instance_type, + base_job_name=base_job_name, + sagemaker_session=sagemaker_session, + ) + + input_local_path = "/opt/ml/processing/input" + output_local_path = "/opt/ml/processing/output" + input_filename = os.path.basename(dataset_source_s3.rstrip("/")) + + processor.run( + code=script_name, + source_dir=source_dir, + arguments=[ + "--input", os.path.join(input_local_path, input_filename), + "--output", os.path.join(output_local_path, input_filename), + ], + inputs=[ + ProcessingInput( + input_name="dataset", + 
s3_input=ProcessingS3Input( + s3_uri=dataset_source_s3, + local_path=input_local_path, + s3_data_type="S3Prefix", + s3_input_mode="File", + ), + ) + ], + outputs=[ + ProcessingOutput( + output_name="transformed", + s3_output=ProcessingS3Output( + s3_uri=dataset_output_s3, + local_path=output_local_path, + s3_upload_mode="EndOfJob", + ), + ) + ], + wait=False, + ) + + print(f"Processing job submitted. Output will be at: {dataset_output_s3}") + + +def describe_transformation_job(job_name, region=None): + """ + Describe a SageMaker Processing Job by name. + + Args: + job_name: The name of the processing job to describe. + region: AWS region (auto-detected if None). + + Returns: + dict: Job details including status, inputs, outputs, and timing info. + """ + sagemaker_session = _get_session(region) + + job = ProcessingJob.get( + processing_job_name=job_name, + session=sagemaker_session.boto_session, + ) + + details = job.refresh().__dict__ + return { + "job_name": job_name, + "status": details.get("processing_job_status"), + "failure_reason": details.get("failure_reason"), + "creation_time": str(details.get("creation_time", "")), + "processing_end_time": str(details.get("processing_end_time", "")), + "inputs": details.get("processing_inputs", []), + "outputs": getattr(details.get("processing_output_config"), "outputs", []), + } diff --git a/plugins/sagemaker-ai/skills/directory-management/SKILL.md b/plugins/sagemaker-ai/skills/directory-management/SKILL.md new file mode 100644 index 00000000..8a7f92b5 --- /dev/null +++ b/plugins/sagemaker-ai/skills/directory-management/SKILL.md @@ -0,0 +1,32 @@ +--- +name: directory-management +description: Manages project directory setup and artifact organization. Use when starting a new project, resuming an existing one, or when a PLAN.md needs to be associated with a project directory. Creates the project folder structure (specs/, scripts/, notebooks/) and resolves project naming. 
+---
+
+# Directory Management
+
+## Project Setup
+
+Before any work begins, resolve the project name:
+
+1. If the project name is already known from conversation context, use it.
+2. Otherwise, scan for existing `*/PLAN.md` files in the current directory. If found, ask the user if they are resuming an existing project and load that `PLAN.md` into context.
+3. If no existing projects are found, recommend a ≤64-char lowercase slug based on what you know from the conversation (only `[a-z0-9-]`), or ask directly if there isn't enough context. Present the recommended name and wait for user confirmation.
+
+Once project name is resolved:
+
+1. Create and/or use the `<project-name>/` directory using the confirmed name for storing all the artifacts
+
+## Directory Structure
+
+When working with the agent, all generated files are organized under a project directory.
+
+```
+<project-name>/
+├── specs/
+│   ├── PLAN.md          # Your customization plan
+├── scripts/             # Generated Python scripts
+│   ├── <use-case>_transform_fn.py
+└── notebooks/           # Generated Jupyter notebooks
+    ├── <use-case>_training.ipynb
+```
diff --git a/plugins/sagemaker-ai/skills/finetuning-setup/SKILL.md b/plugins/sagemaker-ai/skills/finetuning-setup/SKILL.md
new file mode 100644
index 00000000..a13a0b38
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning-setup/SKILL.md
@@ -0,0 +1,73 @@
+---
+name: finetuning-setup
+description: Selects a base model and fine-tuning technique (SFT, DPO, or RLVR) for the user's use case by querying SageMaker Hub. Use when the user asks which model or technique to use, wants to start fine-tuning, or mentions a model name or family (e.g., "Llama", "Mistral") — always activate even for known model names because the exact Hub model ID must be resolved. Queries available models, validates technique compatibility, and confirms selections.
+---
+
+# Finetuning Setup
+
+Guides the user through selecting a base model and fine-tuning technique based on their use case.
+
+## When to Use
+
+- User asks which fine-tuning technique to use
+- User wants to select or change their base model
+- User mentions a model name or family (e.g., "Llama", "Mistral") — the exact Hub model ID still needs to be resolved
+
+## Prerequisites
+
+- A `use_case_spec.md` file exists. If not, activate the use-case-specification skill to generate it first.
+
+## Workflow
+
+### Step 1: Discover Hub
+
+1. List all available SageMaker Hubs in the user's region by calling the SageMaker `ListHubs` API using the `aws___call_aws` tool.
+2. From the results, filter out any hub whose `HubDescription` contains "AI Registry" — these do not contain JumpStart models.
+3. The remaining hubs are eligible (e.g., `SageMakerPublicHub` and any private hubs).
+4. If exactly one eligible hub exists, use it automatically — do not ask the user.
+5. If multiple eligible hubs exist, present them to the user and ask which one to use. Example:
+
+   ```
+   I found the following model hubs:
+   - SageMakerPublicHub — SageMaker Public Hub
+   - Private-Hub-XYZ — Private Hub models
+   Which hub would you like to use?
+   ```
+
+6. Store the selected hub name for use in subsequent steps.
+
+### Step 2: Select Base Model
+
+1. Read `use_case_spec.md` to understand the use case and success criteria.
+2. Restate the use case in one sentence.
+3. Always retrieve the full list of available SageMaker Hub model names by running: `python finetuning-setup/scripts/get_model_names.py <hub-name>` — even if the user has already mentioned a model name or family. Do not skip this step or filter the results.
+4. Present all available models to the user, grouped by model family (e.g., Llama, Mistral, Gemma) for readability.
+5. Ask the user to pick the exact model ID from the list.
+6. Validate the selected model exists in the retrieved list before proceeding.
+
+EXTREMELY IMPORTANT: NEVER recommend or suggest any particular model based on the context you have. 
YOU ARE ALLOWED ONLY to display the list of models
+as given by the script. DO NOT add your own recommendation or suggestion after displaying the list of models to tell which model is correct. Present this
+statement to the user: "Which model would you like to use? Please type the exact model name from the above list." and allow the user to select the model.
+
+### Step 3: Determine Finetuning Technique
+
+1. Consult `references/finetune_technique_selection_guide.md` and recommend the best-fit technique (SFT, DPO, or RLVR) for the use case. Present the recommendation and reasoning to the user.
+2. Ask the user if they'd like to go with the recommendation or prefer a different technique.
+3. Once the user confirms a technique, retrieve the finetuning techniques available for the selected model by running: `python finetuning-setup/scripts/get_recipes.py <model-name> <hub-name>`
+   - This returns only the techniques the model actually supports, filtered to SFT, DPO, and RLVR. Only these three techniques are supported — ignore any other techniques even if the model's recipes include them.
+4. If the chosen technique is available for the model, proceed to Step 4.
+5. If the chosen technique is not available for the model, explain that the selected model does not support it on SageMaker and offer to go back to Step 2 to pick a different model that supports the chosen technique.
+ +### Step 4: Confirm Selections + +Present a summary to the user: + +``` +Here's what we've selected: +- Base model: [model name] +- Fine-tuning technique: [SFT/DPO/RLVR] +``` + +## References + +- `references/finetune_technique_selection_guide.md` — Technique guidance diff --git a/plugins/sagemaker-ai/skills/finetuning-setup/references/finetune_technique_selection_guide.md b/plugins/sagemaker-ai/skills/finetuning-setup/references/finetune_technique_selection_guide.md new file mode 100644 index 00000000..38472471 --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning-setup/references/finetune_technique_selection_guide.md @@ -0,0 +1,37 @@ +# Finetuning Technique Selection Guide + +Not all models support all techniques. Always validate technique availability against the selected model's recipes before recommending. Only SFT, DPO, and RLVR are supported. + +## Technique Overview + +### SFT (Supervised Fine-Tuning) + +**Use when:** + +- Task has clear right/wrong answers +- Single optimal output per input +- Output represents exemplary responses +- Classification, extraction, structured generation + +### DPO (Direct Preference Optimization) + +**Use when:** + +- Multiple valid outputs, some better than others +- Subjective quality (tone, style, helpfulness) +- Creative tasks with preference judgments + +### RLVR (Reinforcement Learning from Verifiable Rewards) + +**Use when:** + +- Outputs can be verified programmatically +- Want to reward similarity to gold responses +- Code generation (passes tests = reward) +- Math problems (correct answer = reward) +- Constraint satisfaction (meets criteria = reward) + +**Key difference from SFT:** + +- SFT: Model learns to imitate gold responses directly +- RLVR: Model learns to maximize rewards (can be gold similarity or verification-based) diff --git a/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_model_names.py b/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_model_names.py new file mode 100644 index 
00000000..2b4fcf75
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_model_names.py
@@ -0,0 +1,43 @@
+import boto3
+import json
+import sys
+
+if len(sys.argv) < 2:
+    print("Usage: python get_model_names.py <hub_name> [region]")
+    sys.exit(1)
+
+hub_name = sys.argv[1]
+region_name = sys.argv[2] if len(sys.argv) > 2 else None
+
+sm_client = boto3.client("sagemaker", region_name=region_name)
+
+# Retrieve all models with pagination
+all_contents = []
+next_token = None
+
+while True:
+    params = {
+        "HubName": hub_name,
+        "HubContentType": "Model",
+        "MaxResults": 100
+    }
+
+    if next_token:
+        params["NextToken"] = next_token
+
+    response = sm_client.list_hub_contents(**params)
+    all_contents.extend(response.get("HubContentSummaries", []))
+
+    next_token = response.get("NextToken")
+    if not next_token:
+        break
+
+# Filter for customization-capable models
+customization_models = [
+    content for content in all_contents
+    if "@capability:customization" in content.get("HubContentSearchKeywords", [])
+]
+
+model_names = [m.get("HubContentName") for m in customization_models]
+
+print(json.dumps(model_names))
diff --git a/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_recipes.py b/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_recipes.py
new file mode 100644
index 00000000..7439954d
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning-setup/scripts/get_recipes.py
@@ -0,0 +1,30 @@
+import boto3
+import json
+import sys
+
+if len(sys.argv) < 3:
+    print("Usage: python get_recipes.py <model_name> <hub_name>")
+    sys.exit(1)
+
+model_name = sys.argv[1]
+hub_name = sys.argv[2]
+sm_client = boto3.client("sagemaker")
+
+detail = sm_client.describe_hub_content(
+    HubName=hub_name,
+    HubContentType="Model",
+    HubContentName=model_name
+)
+
+keywords = detail.get("HubContentSearchKeywords", [])
+
+# Only include SFT, DPO, and RLVR techniques
+supported = {"sft", "dpo", "rlvr"}
+techniques = sorted(
+    t.replace("@recipe:finetuning_", "").split("_")[0]
+    for t in 
keywords + if t.startswith("@recipe:finetuning_") +) +techniques = [t for t in dict.fromkeys(techniques) if t in supported] + +print(json.dumps(techniques)) diff --git a/plugins/sagemaker-ai/skills/finetuning/SKILL.md b/plugins/sagemaker-ai/skills/finetuning/SKILL.md new file mode 100644 index 00000000..f475769c --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning/SKILL.md @@ -0,0 +1,128 @@ +--- +name: finetuning +description: Generates a Jupyter notebook that fine-tunes a base model using SageMaker serverless training jobs. Use when the user says "start training", "fine-tune my model", "I'm ready to train", or when the plan reaches the finetuning step. Supports SFT, DPO, and RLVR trainers, including RLVR Lambda reward function creation. +--- + +# Prerequisites + +Before starting this workflow, verify: + +1. A `use_case_spec.md` file exists + - If missing: Activate the `use-case-specification` skill first, then resume + - DON'T EVER offer to create a use case spec without activating the use-case-specification skill. + +2. A fine-tuning technique (SFT, DPO, or RLVR) and base model have already been selected + - If missing: Activate the `finetuning-setup` skill to collect what's missing, then resume + - Don't make recommendations on the spot. You MUST activate the finetuning-setup skill. + +3. 
A base model name available on SageMakerHub has been identified + - If missing: Activate the `finetuning-setup` skill to get it + - **Important:** Only use the model name that `finetuning-setup` retrieves, as it may differ from other commonly used names for the same model + +# Critical Rules + +## Code Generation Rules + +- ✅ Use EXACTLY the imports shown in each cell template +- ❌ Do NOT add additional imports even if they seem helpful +- ❌ Do NOT create variables before they're needed in that cell +- 📋 Copy the code structure precisely - no improvisation +- 🎯 Follow the minimal code principle strictly +- ✅ When writing a notebook cell, make sure the indentation and f strings are correct + +## User Communication Rules + +- ❌ NEVER offer to run the notebook for the user (you don't have the tools) +- ❌ NEVER offer to move on to a downstream skill while training is in progress (logically impossible) +- ❌ NEVER set ACCEPT_EULA to True yourself (user must read and agree) +- ✅ Always mention both the number AND title of cells you reference +- ✅ If user asks how to run: Tell them to run cells one by one, mention ipykernel requirement + +--- + +# Workflow + +## 1. Notebook Setup + +### 1.1 Directory Setup + +1. Identify project directory from conversation context + - If unclear (multiple relevant directories exist) → Ask user which folder to use +2. Create Jupyter notebook: `[title]_finetuning.ipynb` + - `[title]` = snake_case name derived from use case + - Save under the identified directory + +### 1.2 Select Reference Template + +Read the example notebook matching the finetuning strategy: + +- SFT → `references/sft_example.md` +- DPO → `references/dpo_example.md` +- RLVR → `references/rlvr_example.md` + +### 1.3 Copy Notebook Structure + +1. Write the exact cells from the example to `[title]_finetuning.ipynb` +2. Use same order, dependencies, and imports as the example +3. 
DO NOT improvise or add extra code + +### 1.4 Auto-Generate Configuration Values + +**In the 'Setup & Credentials' cell, populate:** + +1. **BASE_MODEL** + - Use the exact SageMakerHub model name from context + +2. **MODEL_PACKAGE_GROUP_NAME** + - Generate from use case (read `use_case_spec.md` if needed) + - Format rules: + - Lowercase, alphanumeric with hyphens only + - 1-63 characters + - Pattern: `[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}` + - Example: "Customer Support Chatbot" → `customer-support-chatbot-v1` + +3. Save notebook + +## 2. RLVR Reward Function (for RLVR only, skip this section if technique is SFT or DPO) + +### 2.1 Check Reward Function Status + +- Ask if user has a reward function already, or would like help creating one. + - If user says they have one → Ask for the SageMaker Hub Evaluator ARN. Only proceed to Section 2.3 once the user provides a valid Evaluator ARN. If they don't have it registered as a SageMaker Hub Evaluator, continue to 2.2. + - If user says they do not have one → Continue to 2.2 + +### 2.2 Generate Reward Function From Template + +1. Follow workflow in `references/rlvr_reward_function.md` section "Helping Users Create Lambda Functions" + +### 2.3 Set CUSTOM_REWARD_FUNCTION value + +1. Set the value for `CUSTOM_REWARD_FUNCTION` in the Notebook with the ARN of the reward function (either given directly by the user, or from the function generation code as `evaluator.arn`). + +## 3. EULA review and acceptance + +1. Look up the official EULA link for the selected base model from references/eula_links.md +2. Display the EULA link(s) to the user in your message as clickable markdown links +3. Tell the user they must read and agree to the EULA before using this model (one sentence) +4. Ask them to manually change `ACCEPT_EULA` to `True` in the notebook after reviewing the license +5. **NEVER set ACCEPT_EULA to True yourself** + +## 4. Notebook Execution + +1. 
**Display the following to the user:**: `A Jupyter notebook has now been generated which will help you finetune your model. You are free to run it now. Please let me know once the training is complete.` +2. Wait for user's confirmation about training completion. Once the user has confirmed this, you are free to move to the next step of the plan. + +**CRITICAL:** + +- DON'T suggest moving to next steps before training completes +- DON'T elaborate on the next steps unless the user specifically asks you about them. + +--- + +# References + +- `rlvr_reward_function.md` - Lambda reward function creation guide (RLVR only) +- `templates/rlvr_reward_function_source_template.py` - Lambda reward function source template (RLVR only) +- `sft_example.md` - Complete notebook template for Supervised Fine-Tuning +- `dpo_example.md` - Complete notebook template for Direct Preference Optimization +- `rlvr_example.md` - Complete notebook template for Reinforcement Learning from Verifiable Rewards diff --git a/plugins/sagemaker-ai/skills/finetuning/references/dpo_example.md b/plugins/sagemaker-ai/skills/finetuning/references/dpo_example.md new file mode 100644 index 00000000..041a5ffd --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning/references/dpo_example.md @@ -0,0 +1,159 @@ +# DPO (Direct Preference Optimization) Notebook Template + +This template provides the complete cell structure for a DPO finetuning notebook. 
+ +--- + +## Cell 1: Install Dependencies + +```python +!pip install 'sagemaker>=3.7.0,<4.0' boto3 -q +``` + +--- + +## Cell 2: Setup & Credentials + +```python +import os +import boto3 +from sagemaker.ai_registry.dataset_utils import CustomizationTechnique +from botocore.exceptions import ClientError +from sagemaker.ai_registry.dataset import DataSet +from sagemaker.core.resources import ModelPackageGroup +from sagemaker.core.helper.session_helper import Session, get_execution_role +from sagemaker.core import Attribution, set_attribution + +set_attribution(Attribution.SAGEMAKER_AGENT_PLUGIN) + +# Setup +sm_client = boto3.Session().client("sagemaker") +sagemaker_session = Session(sagemaker_client=sm_client) +bucket = sagemaker_session.default_bucket() + +# Configuration - USER please fill in these fields with your information: + +BASE_MODEL = "" # e.g., "meta-textgeneration-llama-3-8b" +TRAINING_DATA_S3 = "" # S3 path +S3_OUTPUT_PATH = f"s3://{bucket}/finetuning-output/" +ROLE_ARN = get_execution_role() # You can change this to a specific role. 
+ACCEPT_EULA = False # Set to True to accept the base model's End-User License Agreement +MODEL_PACKAGE_GROUP_NAME = "" # Auto-generated based on use case +``` + +--- + +## Cell 3: Create Dataset and Model Package Group + +```python +# Create Model Package Group +try: + model_package_group = ModelPackageGroup.create( + model_package_group_name=MODEL_PACKAGE_GROUP_NAME, + model_package_group_description="", + ) + print(f"Created new model package group named {MODEL_PACKAGE_GROUP_NAME}") +except ClientError as e: + if e.response['Error']['Code'] in ('ResourceInUse', 'ValidationException'): + model_package_group = ModelPackageGroup.get(model_package_group_name=MODEL_PACKAGE_GROUP_NAME) + print(f"There is already a model package group with the name {MODEL_PACKAGE_GROUP_NAME}.\nIf you want to save your finetuned model under a different name, change the value of MODEL_PACKAGE_GROUP_NAME in the previous cell.") + else: + raise + +# Create Dataset +# Register dataset in SageMaker AI Registry. This creates a versioned dataset that can be referenced by ARN +dataset = DataSet.create( + name=MODEL_PACKAGE_GROUP_NAME, + source=TRAINING_DATA_S3, + wait=True +) + +TRAINING_DATASET_ARN = dataset.arn +print(f"Here is your model package group ARN: {model_package_group.model_package_group_arn}\n") +print(f"Here is your training dataset ARN: {dataset.arn}") +``` + +--- + +## Cell 4: Configure Trainer + +```python +from sagemaker.train.dpo_trainer import DPOTrainer +from sagemaker.train.common import TrainingType + +trainer = DPOTrainer( + model=BASE_MODEL, + training_type=TrainingType.LORA, + model_package_group=model_package_group, + training_dataset=TRAINING_DATASET_ARN, + s3_output_path=S3_OUTPUT_PATH, + sagemaker_session=sagemaker_session, + accept_eula=ACCEPT_EULA, + role=ROLE_ARN +) +print ("Here are the recommended hyperparameters for the current training job:") +print(f"Batch size: {trainer.hyperparameters.global_batch_size}") +print(f"Number of epochs: 
{trainer.hyperparameters.max_epochs}")
+print(f"Learning rate: {trainer.hyperparameters.learning_rate}")
+print(f"Learning rate warmup ratio: {trainer.hyperparameters.lr_warmup_ratio}")
+print(f"Adam Beta: {trainer.hyperparameters.adam_beta}")
+
+# To change a hyperparameter, uncomment its corresponding line and set the value you want.
+# Note: You might get an error if the value you choose is not supported for your model.
+# If that happens, simply choose from the allowed range that's indicated in the error.
+
+# Uncomment the following line to change the learning rate
+# trainer.hyperparameters.learning_rate = 0.0002
+
+# Uncomment the following line to change the batch size
+# trainer.hyperparameters.global_batch_size = 16
+
+# Uncomment the following line to change the number of epochs
+# trainer.hyperparameters.max_epochs = 5
+
+# Uncomment the following line to change the learning rate warmup ratio
+# trainer.hyperparameters.lr_warmup_ratio = 0.05
+
+# Uncomment the following line to change Adam Beta
+# trainer.hyperparameters.adam_beta = 0.01
+```
+
+---
+
+## Cell 5: Start Training
+
+```python
+# Start training
+training_job = trainer.train(wait=True)
+
+print(f"Training Job Name: {training_job.training_job_name}")
+print(f"Training Status: {training_job.training_job_status}")
+```
+
+---
+
+## Cell 6: Plot and Display Metrics
+
+```python
+import matplotlib.pyplot as plt
+import mlflow
+from mlflow.tracking import MlflowClient
+
+run_id = training_job.mlflow_details.mlflow_run_id
+mlflow.set_tracking_uri(training_job.mlflow_config.mlflow_resource_arn)
+client = MlflowClient()
+
+metrics = ["loss_per_batch", "rewards/chosen", "rewards/rejected", "rewards/margins", "acc_per_batch"]
+fig, axes = plt.subplots(1, len(metrics), figsize=(4 * len(metrics), 3))
+for idx, metric in enumerate(metrics):
+    history = client.get_metric_history(run_id, metric)
+    axes[idx].plot([h.step for h in history], [h.value for h in history], linewidth=2, marker='o', markersize=4)
+    axes[idx].set_xlabel('Step')
+    axes[idx].set_ylabel(metric.split('/')[-1])
+    axes[idx].set_title(metric, fontweight='bold')
+    axes[idx].grid(True, alpha=0.3)
+
+plt.suptitle(f'Training Metrics: {training_job.training_job_name}', fontweight='bold')
+plt.tight_layout()
+plt.show()
+```
diff --git a/plugins/sagemaker-ai/skills/finetuning/references/eula_links.md b/plugins/sagemaker-ai/skills/finetuning/references/eula_links.md
new file mode 100644
index 00000000..52c33b6b
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning/references/eula_links.md
@@ -0,0 +1,19 @@
+# Model License Information
+
+| SageMaker Hub Model ID                       | Model Name                   | License URL(s)                                                                                                                     |
+| -------------------------------------------- | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| `huggingface-reasoning-qwen3-8b`             | Qwen3-8B                     | https://huggingface.co/Qwen/Qwen3-8B/blob/main/LICENSE                                                                             |
+| `huggingface-reasoning-qwen3-32b`            | Qwen3-32B                    | https://huggingface.co/Qwen/Qwen3-32B/blob/main/LICENSE                                                                            |
+| `huggingface-reasoning-qwen3-06b`            | Qwen3-0.6B                   | https://huggingface.co/Qwen/Qwen3-0.6B/blob/main/LICENSE                                                                           |
+| `huggingface-llm-qwen2-5-7b-instruct`        | Qwen2.5-7B-Instruct          | https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/LICENSE                                                                  |
+| `huggingface-llm-qwen2-5-32b-instruct`       | Qwen2.5-32B-Instruct         | https://huggingface.co/Qwen/Qwen2.5-32B-Instruct/blob/main/LICENSE                                                                 |
+| `deepseek-llm-r1-distill-qwen-32b`           | DeepSeek-R1-Distill-Qwen-32B | https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/blob/main/LICENSE                                                  |
+| `openai-reasoning-gpt-oss-20b`               | GPT-OSS-20B                  | https://huggingface.co/openai/gpt-oss-20b/blob/main/LICENSE<br>
https://huggingface.co/openai/gpt-oss-20b/blob/main/USAGE_POLICY |
+| `openai-reasoning-gpt-oss-120b`              | GPT-OSS-120B                 | https://huggingface.co/openai/gpt-oss-120b/blob/main/LICENSE<br>
https://huggingface.co/openai/gpt-oss-120b/blob/main/USAGE_POLICY | +| `nova-textgeneration-pro` | Amazon Nova Pro | https://aws.amazon.com/service-terms/ | +| `nova-textgeneration-micro` | Amazon Nova Micro | https://aws.amazon.com/service-terms/ | +| `nova-textgeneration-lite` | Amazon Nova Lite | https://aws.amazon.com/service-terms/ | +| `nova-textgeneration-lite-v2` | Amazon Nova Lite v2 | https://aws.amazon.com/service-terms/ | +| `meta-textgeneration-llama-3-3-70b-instruct` | Llama 3.3 70B Instruct | https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/LICENSE | +| `meta-textgeneration-llama-3-2-1b-instruct` | Llama 3.2 1B Instruct | https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/main/LICENSE.txt | +| `meta-textgeneration-llama-3-1-8b-instruct` | Llama 3.1 8B Instruct | https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/LICENSE | diff --git a/plugins/sagemaker-ai/skills/finetuning/references/rlvr_example.md b/plugins/sagemaker-ai/skills/finetuning/references/rlvr_example.md new file mode 100644 index 00000000..2c983436 --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning/references/rlvr_example.md @@ -0,0 +1,169 @@ +# RLVR (Reinforcement Learning from Verifiable Rewards) Notebook Template + +This template provides the complete cell structure for an RLVR finetuning notebook. 
+ +--- + +## Cell 1: Install Dependencies + +```python +!pip install 'sagemaker>=3.7.0,<4.0' boto3 -q +``` + +--- + +## Cell 2: Setup & Credentials + +```python +import os +import boto3 +from sagemaker.ai_registry.dataset_utils import CustomizationTechnique +from botocore.exceptions import ClientError +from sagemaker.ai_registry.dataset import DataSet +from sagemaker.core.resources import ModelPackageGroup +from sagemaker.core.helper.session_helper import Session, get_execution_role +from sagemaker.core import Attribution, set_attribution + +set_attribution(Attribution.SAGEMAKER_AGENT_PLUGIN) + +# Setup +sm_client = boto3.Session().client("sagemaker") +sagemaker_session = Session(sagemaker_client=sm_client) +bucket = sagemaker_session.default_bucket() + +# Configuration - USER please fill in these fields with your information: + +BASE_MODEL = "" # e.g., "meta-textgeneration-llama-3-8b" +TRAINING_DATA_S3 = "" # S3 path +S3_OUTPUT_PATH = f"s3://{bucket}/finetuning-output/" +ROLE_ARN = get_execution_role() # You can change this to a specific role. 
+ACCEPT_EULA = False # Set to True to accept the base model's End-User License Agreement +MODEL_PACKAGE_GROUP_NAME = "" # Auto-generated based on use case +CUSTOM_REWARD_FUNCTION = "" # Reward Function ARN +``` + +--- + +## Cell 3: Create Dataset and Model Package Group + +```python +# Create Model Package Group +try: + model_package_group = ModelPackageGroup.create( + model_package_group_name=MODEL_PACKAGE_GROUP_NAME, + model_package_group_description="", + ) + print(f"Created new model package group named {MODEL_PACKAGE_GROUP_NAME}") +except ClientError as e: + if e.response['Error']['Code'] in ('ResourceInUse', 'ValidationException'): + model_package_group = ModelPackageGroup.get(model_package_group_name=MODEL_PACKAGE_GROUP_NAME) + print(f"There is already a model package group with the name {MODEL_PACKAGE_GROUP_NAME}.\nIf you want to save your finetuned model under a different name, change the value of MODEL_PACKAGE_GROUP_NAME in the previous cell.") + else: + raise + +# Create Dataset +# Register dataset in SageMaker AI Registry. 
This creates a versioned dataset that can be referenced by ARN +dataset = DataSet.create( + name=MODEL_PACKAGE_GROUP_NAME, + source=TRAINING_DATA_S3, + wait=True +) +TRAINING_DATASET_ARN = dataset.arn + +print(f"Here is your model package group ARN: {model_package_group.model_package_group_arn}\n") +print(f"Here is your training dataset ARN: {dataset.arn}") +``` + +--- + +## Cell 4: Configure Trainer + +```python +from sagemaker.train.rlvr_trainer import RLVRTrainer +from sagemaker.train.common import TrainingType + + +trainer = RLVRTrainer( + model=BASE_MODEL, + model_package_group=model_package_group, + training_dataset=TRAINING_DATASET_ARN, + s3_output_path=S3_OUTPUT_PATH, + sagemaker_session=sagemaker_session, + accept_eula=ACCEPT_EULA, + role=ROLE_ARN, + custom_reward_function=CUSTOM_REWARD_FUNCTION +) +print ("Here are the recommended hyperparameters for the current training job:") +print(f"Batch size: {trainer.hyperparameters.global_batch_size}") +print(f"Number of epochs: {trainer.hyperparameters.max_epochs}") +print(f"Learning rate: {trainer.hyperparameters.learning_rate}") + +# To change a hyperparameter, uncomment its corresponding line and set the value you want. +# Note: You might get an error if the value you choose is not supported for your model. +# If that happens, simply choose from the allowed range that's indicated in the error. 
+ +# Uncomment the following line to change the learning rate +# trainer.hyperparameters.learning_rate = 0.0002 + +# Uncomment the following line to change the batch size +# trainer.hyperparameters.global_batch_size = 16 + +# Uncomment the following line to change the number of epochs +# trainer.hyperparameters.max_epochs = 5 + +# Uncomment the following line to change Adam Beta +# trainer.hyperparameters.adam_beta = 0.01 +``` + +--- + +## Cell 5: Start Training + +```python +# Start training +training_job = trainer.train(wait=True) + +print(f"Training Job Name: {training_job.training_job_name}") +print(f"Training Status: {training_job.training_job_status}") +``` + +--- + +## Cell 6: Plot and Display Metrics + +```python +import matplotlib.pyplot as plt +import mlflow +from mlflow.tracking import MlflowClient + +run_id = training_job.mlflow_details.mlflow_run_id +mlflow.set_tracking_uri(training_job.mlflow_config.mlflow_resource_arn) +client = MlflowClient() + +# Core RL metrics - adjust val-core metric names based on your data source and reward function +metrics = [ +"critic/rewards/mean", +"response_length/mean", +"actor/entropy_loss", +"actor/grad_norm", +"critic/advantages/mean", +] +# Note: Validation reward metrics follow the pattern: val-core/{data_source}/reward(/acc)/mean@{k} +# Add your specific val-core metrics to the list above, e.g.: +# "val-core/my_dataset/reward/mean@1" +# ResponseQuality: Verl allows printing to a file. Check training job output for details. 
+
+fig, axes = plt.subplots(1, len(metrics), figsize=(4 * len(metrics), 3))
+for idx, metric in enumerate(metrics):
+    history = client.get_metric_history(run_id, metric)
+    if history:
+        axes[idx].plot([h.step for h in history], [h.value for h in history], linewidth=2, marker='o', markersize=4)
+    axes[idx].set_xlabel('Step')
+    axes[idx].set_ylabel(metric.split('/')[-1])
+    axes[idx].set_title(metric, fontweight='bold')
+    axes[idx].grid(True, alpha=0.3)
+
+plt.suptitle(f'Training Metrics: {training_job.training_job_name}', fontweight='bold')
+plt.tight_layout()
+plt.show()
+```
diff --git a/plugins/sagemaker-ai/skills/finetuning/references/rlvr_reward_function.md b/plugins/sagemaker-ai/skills/finetuning/references/rlvr_reward_function.md
new file mode 100644
index 00000000..8668bc6b
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning/references/rlvr_reward_function.md
@@ -0,0 +1,43 @@
+# RLVR Lambda Reward Function Guide
+
+## What is a Lambda Reward Function?
+
+For RLVR training, a Lambda reward function is an AWS Lambda that evaluates model outputs during training and returns
+numerical rewards. SageMaker calls this Lambda in the training loop to provide learning signals.
+
+## Helping Users Create Lambda Functions
+
+### Step 1: Copy Template to Project
+
+Copy the template file `templates/rlvr_reward_function_source_template.py` as `lambda_function.py` into the project's scripts directory.
+
+- Read the `directory-management` skill to determine the correct directory for storing scripts.
+
+### Step 2: Generate Notebook Cell
+
+Create a single notebook cell that registers the local file as a SageMaker Hub Evaluator. Set `reward_function_path` to the path where `lambda_function.py` was saved in Step 1.
+ +```python +from sagemaker.ai_registry.evaluator import Evaluator + +reward_function_path = "" # Path to lambda_function.py from Step 1 + +evaluator = Evaluator.create( + name="[GENERATE A NAME FOR THE EVALUATOR HERE]", + type="RewardFunction", + source=reward_function_path, +) +print(f"Reward Function ARN: {evaluator.arn}") +``` + +Remember to set an appropriate name for the Evaluator by yourself in the above code, based on the use case and the current context. + +- Format: lowercase, alphanumeric with hyphens only, 1-20 characters +- Pattern: `[a-zA-Z0-9](-*[a-zA-Z0-9]){0,20}` + +### Step 3: Inform User About TODOs + +After copying the template and generating the notebook cell, inform the user that `lambda_function.py` contains `TODO` sections that they +must customize for their use case before running the notebook. The sections that need customization include helper functions, +reward logic, input parsing, score computation, and the return statement. Direct the user to edit `lambda_function.py` directly. +Wait for the user's acknowledgment before proceeding. diff --git a/plugins/sagemaker-ai/skills/finetuning/references/sft_example.md b/plugins/sagemaker-ai/skills/finetuning/references/sft_example.md new file mode 100644 index 00000000..5695abde --- /dev/null +++ b/plugins/sagemaker-ai/skills/finetuning/references/sft_example.md @@ -0,0 +1,159 @@ +# SFT (Supervised Fine-Tuning) Notebook Template + +This template provides the complete cell structure for an SFT finetuning notebook. 
+ +--- + +## Cell 1: Install Dependencies + +```python +!pip install 'sagemaker>=3.7.0,<4.0' boto3 -q +``` + +--- + +## Cell 2: Setup & Credentials + +```python +import os +import boto3 +from sagemaker.ai_registry.dataset_utils import CustomizationTechnique +from botocore.exceptions import ClientError +from sagemaker.ai_registry.dataset import DataSet +from sagemaker.core.resources import ModelPackageGroup +from sagemaker.core.helper.session_helper import Session, get_execution_role +from sagemaker.core import Attribution, set_attribution + +set_attribution(Attribution.SAGEMAKER_AGENT_PLUGIN) + +# Setup +sm_client = boto3.Session().client("sagemaker") +sagemaker_session = Session(sagemaker_client=sm_client) +bucket = sagemaker_session.default_bucket() + +# Configuration - USER please fill in these fields with your information: + +BASE_MODEL = "" # e.g., "meta-textgeneration-llama-3-8b" +TRAINING_DATA_S3 = "" # S3 path +S3_OUTPUT_PATH = f"s3://{bucket}/finetuning-output/" +ROLE_ARN = get_execution_role() # You can change this to a specific role. +ACCEPT_EULA = False # Set to True to accept the base model's End-User License Agreement +MODEL_PACKAGE_GROUP_NAME = "" # Auto-generated based on use case +``` + +--- + +## Cell 3: Create Dataset and Model Package Group + +```python +# Create Model Package Group +try: + model_package_group = ModelPackageGroup.create( + model_package_group_name=MODEL_PACKAGE_GROUP_NAME, + model_package_group_description="", + ) + print(f"Created new model package group named {MODEL_PACKAGE_GROUP_NAME}") +except ClientError as e: + if e.response['Error']['Code'] in ('ResourceInUse', 'ValidationException'): + model_package_group = ModelPackageGroup.get(model_package_group_name=MODEL_PACKAGE_GROUP_NAME) + print(f"There is already a model package group with the name {MODEL_PACKAGE_GROUP_NAME}. 
If you want to save your finetuned model under a different name, change the value of MODEL_PACKAGE_GROUP_NAME in the previous cell.") + else: + raise + +# Create Dataset +# Register dataset in SageMaker AI Registry. This creates a versioned dataset that can be referenced by ARN +dataset = DataSet.create( + name=MODEL_PACKAGE_GROUP_NAME, + source=TRAINING_DATA_S3, + wait=True +) + +TRAINING_DATASET_ARN = dataset.arn +print(f"Here is your model package group ARN: {model_package_group.model_package_group_arn}\n") +print(f"Here is your training dataset ARN: {dataset.arn}") +``` + +--- + +## Cell 4: Configure Trainer + +```python +from sagemaker.train.sft_trainer import SFTTrainer +from sagemaker.train.common import TrainingType + + +trainer = SFTTrainer( + model=BASE_MODEL, + training_type=TrainingType.LORA, + model_package_group=model_package_group, + training_dataset=TRAINING_DATASET_ARN, + s3_output_path=S3_OUTPUT_PATH, + sagemaker_session=sagemaker_session, + accept_eula=ACCEPT_EULA, + role=ROLE_ARN +) + +print ("Here are the recommended hyperparameters for the current training job:") +print(f"Batch size: {trainer.hyperparameters.global_batch_size}") +print(f"Number of epochs: {trainer.hyperparameters.max_epochs}") +print(f"Learning rate: {trainer.hyperparameters.learning_rate}") +print(f"Learning rate warmup ratio: {trainer.hyperparameters.lr_warmup_ratio}") + +# To change a hyperparameter, uncomment its corresponding line and set the value you want. +# Note: You might get an error if the value you choose is not supported for your model. +# If that happens, simply choose from the allowed range that's indicated in the error. 
+
+# Uncomment the following line to change the learning rate
+# trainer.hyperparameters.learning_rate = 0.0002
+
+# Uncomment the following line to change the batch size
+# trainer.hyperparameters.global_batch_size = 16
+
+# Uncomment the following line to change the number of epochs
+# trainer.hyperparameters.max_epochs = 5
+
+# Uncomment the following line to change the learning rate warmup ratio
+# trainer.hyperparameters.lr_warmup_ratio = 0.05
+
+# Uncomment the following line to change Adam Beta
+# trainer.hyperparameters.adam_beta = 0.01
+```
+
+---
+
+## Cell 5: Start Training
+
+```python
+# Start training
+training_job = trainer.train(wait=True)
+
+print(f"Training Job Name: {training_job.training_job_name}")
+print(f"Training Status: {training_job.training_job_status}")
+```
+
+---
+
+## Cell 6: Plot and Display Metrics
+
+```python
+import matplotlib.pyplot as plt
+import mlflow
+from mlflow.tracking import MlflowClient
+
+run_id = training_job.mlflow_details.mlflow_run_id
+mlflow.set_tracking_uri(training_job.mlflow_config.mlflow_resource_arn)
+client = MlflowClient()
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 3))
+for idx, metric in enumerate(["total_loss", "val_eval_total_loss"]):
+    history = client.get_metric_history(run_id, metric)
+    axes[idx].plot([h.step for h in history], [h.value for h in history], linewidth=2, marker='o', markersize=4)
+    axes[idx].set_xlabel('Step')
+    axes[idx].set_ylabel('Loss')
+    axes[idx].set_title(metric, fontweight='bold')
+    axes[idx].grid(True, alpha=0.3)
+
+plt.suptitle(f'Training Metrics: {training_job.training_job_name}', fontweight='bold')
+plt.tight_layout()
+plt.show()
+```
diff --git a/plugins/sagemaker-ai/skills/finetuning/templates/rlvr_reward_function_source_template.py b/plugins/sagemaker-ai/skills/finetuning/templates/rlvr_reward_function_source_template.py
new file mode 100644
index 00000000..32f17ea2
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/finetuning/templates/rlvr_reward_function_source_template.py
@@ -0,0 +1,250 @@
+"""
+Provide your custom reward function code below. Learn about the available libraries and templates that you can use
+at: https://docs.aws.amazon.com/sagemaker/latest/dg/customize-model.html.
+
+- You must add your evaluation logic in the reward_function() function
+- Do not remove the lambda_handler() function or modify its schema as it is required to create the reward function
+"""
+
+import json  # For JSON parsing - adjust imports based on your use case
+import re  # For pattern matching and validation
+from typing import Dict, Any, List, Optional  # For type hints
+# Add any other imports your use case requires
+
+# =========================================================================================
+# NOTE: INITIAL SUGGESTION ONLY - MUST BE CUSTOMIZED
+#
+# YOU MUST:
+# 1. Review and update each section per YOUR use case
+# 2. Customize the logic for YOUR SPECIFIC requirements
+# 3. Replace example values (field names, thresholds, etc.) with your actual values
+# 4. Test thoroughly before using
+#
+# DO NOT use this code as-is. It will not work until you review and customize it.
+# =========================================================================================
+
+
+# =========================================================================================
+# SECTION 1: Helper function 1
+# =========================================================================================
+# TODO: UPDATE or REMOVE the helper function as per YOUR use case
+# Note the below lines of code are examples and will not work for your use case
+# You MUST update them to match YOUR use case
+def extract_number(text: str) -> Optional[float]:
+    """
+    Extract numerical answer from text.
+    Looks for numbers after answer keywords, or returns the last number found.
+ + Args: + text: Text containing a numerical answer + + Returns: + Extracted number as float, or None if no number found + """ + if not text: + return None + + # Try to find numbers after common answer keywords + answer_patterns = [ + r'(?:equals|is|answer is|result is|=)\s*(-?\d+\.?\d*)', + r'(?:answer|result|solution):\s*(-?\d+\.?\d*)', + ] + + for pattern in answer_patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + try: + return float(match.group(1)) + except ValueError: + pass + + # Fallback: find all numbers and return the last one (likely the answer) + pattern = r'-?\d+\.?\d*' + matches = re.findall(pattern, text) + + if matches: + try: + return float(matches[-1]) # Return last number instead of first + except ValueError: + return None + + return None + +# ========================================================================================= +# SECTION 2: Helper function 2 +# ========================================================================================= +# TODO: UPDATE or REMOVE the helper function as per YOUR use case +# Note the below lines of code are examples and will not work for your use case +# You MUST update them to match YOUR use case +def compute_reasoning_quality(response: str) -> float: + """ + Compute reasoning quality score based on response characteristics. + This is a simple heuristic - customize based on your needs. 
+ + Args: + response: The model's response text + + Returns: + Quality score between 0.0 and 1.0 + """ + if not response: + return 0.0 + + score = 0.0 + + # Check for reasoning indicators (customize these for your use case) + reasoning_indicators = [ + 'because', 'therefore', 'thus', 'since', 'so', + 'first', 'second', 'then', 'finally', + 'step', 'calculate', 'compute', 'equals' + ] + + response_lower = response.lower() + + # Award points for reasoning indicators (max 0.55) + indicator_count = sum(1 for indicator in reasoning_indicators if indicator in response_lower) + score += min(indicator_count * 0.11, 0.55) + + # Award points for response length (indicates detailed reasoning, max 0.25) + if len(response) > 30: + score += 0.05 + if len(response) > 60: + score += 0.1 + if len(response) > 120: + score += 0.1 + + # Award points for structured response (max 0.2) + if '\n' in response or '.' in response: + score += 0.2 + + return min(score, 1.0) + +# ========================================================================================= +# SECTION 3: Sample reward function +# ========================================================================================= +# TODO: UPDATE or REMOVE the reward function as per YOUR use case +# Note the below lines of code are examples and will not work for your use case +# You MUST update them to match YOUR use case +def reward_function(sample: Dict[str, Any], index: int) -> Dict[str, Any]: + """ + Args: + sample: Dictionary containing messages and reference_answer + index: Sample index in batch + + Returns: + Dictionary with reward scores and metrics + """ + # ======================================================================== + # SECTION 4: Parse input + # ======================================================================== + # TODO: UPDATE logic to parse the input as per YOUR use case + # Note the below lines of code are examples and will not work for your use case + # You MUST update them to match YOUR use case 
+ # Extract the response and reference + messages = sample.get('messages', []) + reference_answer = sample.get('reference_answer', {}).get('text', '') + + # Get the question and assistant's response + question = "" + response = "" + for msg in messages: + if msg.get('role') == 'user': + question = msg.get('content', '') + elif msg.get('role') == 'assistant': + response = msg.get('content', '') + + # Extract numerical answers + predicted = extract_number(response) + expected = extract_number(reference_answer) + + # Compute metrics + exact_match = 0.0 + answer_present = 0.0 + reasoning_quality = compute_reasoning_quality(response) + + if predicted is not None and expected is not None: + exact_match = 1.0 if abs(predicted - expected) < 1e-6 else 0.0 + answer_present = 1.0 + + # ======================================================================== + # SECTION 5: Compute reward scores + # ======================================================================== + # TODO: UPDATE logic to compute aggregate score + # Note the below lines of code are examples and will not work for your use case + # You MUST update them to match YOUR use case + # Aggregate reward computation + aggregate_reward = 0.7 * exact_match + 0.3 * reasoning_quality + + # ======================================================================== + # SECTION 6: Form the metrics list + # ======================================================================== + # TODO: UPDATE logic to compute metrics list + # Note the below lines of code are examples and will not work for your use case + # You MUST update them to match YOUR use case + metrics = [ + { + 'name': 'exact_match', + 'value': float(exact_match), + 'type': 'Reward' + }, + { + 'name': 'answer_present', + 'value': float(answer_present), + 'type': 'Metric' + }, + { + 'name': 'reasoning_quality', + 'value': float(reasoning_quality), + 'type': 'Metric' + } + ] + + # ======================================================================== + # SECTION 
7: Return output
+    # ========================================================================
+    # TODO: UPDATE the return statement to return YOUR output
+    # UPDATE the key before creating the evaluator
+    # Note the below lines of code are examples and will not work for your use case
+    # You MUST update them to match YOUR use case
+
+    return {
+        'id': str(sample.get('my_key', f'sample-{index:03d}')),  # Use formatted index as fallback
+        'aggregate_reward_score': float(aggregate_reward),
+        'metrics_list': metrics
+    }
+
+def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
+    """
+    AWS Lambda Handler for reward function
+    """
+    try:
+        # Extract batch from event
+        batch = event.get('input', event) if isinstance(event, dict) else event
+        if 'batch' in event:
+            batch = event.get('batch', [])
+        elif 'body' in event:
+            body = json.loads(event.get('body', '{}'))
+            batch = body.get('batch', [])
+
+        if not batch:
+            return {"error":"Missing or empty batch"}
+
+        # Process each sample
+        results = []
+        for i, sample in enumerate(batch):
+            try:
+                result = reward_function(sample, i)
+                results.append(result)
+            except Exception as e:
+                return {"error": str(e)}
+
+        return {
+            'statusCode': 200,
+            'headers': {'Content-Type': 'application/json'},
+            'body': json.dumps(results)
+        }
+    except Exception as e:
+        return {
+            'statusCode': 400,
+            'body': json.dumps({"error": str(e)})
+        }
diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md
new file mode 100755
index 00000000..b15fd4c4
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/SKILL.md
@@ -0,0 +1,74 @@
+---
+name: hyperpod-issue-report
+description: Generate comprehensive issue reports from HyperPod clusters (EKS and Slurm) by collecting diagnostic logs and configurations for troubleshooting and AWS Support cases.
Use when users need to collect diagnostics from HyperPod cluster nodes, generate issue reports for AWS Support, investigate node failures or performance problems, document cluster state, or create diagnostic snapshots. Triggers on requests involving issue reports, diagnostic collection, support case preparation, or cluster troubleshooting that requires gathering logs and system information from multiple nodes.
+---
+
+# HyperPod Issue Report
+
+Collect diagnostic logs from HyperPod cluster nodes via SSM, store results in S3. Supports both EKS and Slurm clusters with auto-detection. Uses the bundled `scripts/hyperpod_issue_report.py` for reliable parallel collection.
+
+## Prerequisites
+
+- AWS CLI configured with permissions: `sagemaker:DescribeCluster`, `sagemaker:ListClusterNodes`, `ssm:StartSession`, `s3:PutObject`, `s3:GetObject`, `eks:DescribeCluster`
+- Python 3.8+ with `pip install -r scripts/requirements.txt`
+- SSM Agent running on target nodes; node IAM roles need `s3:GetObject`/`s3:PutObject` on the report bucket
+- For EKS clusters: kubectl installed and configured (see Workflow step 2)
+
+## Workflow
+
+### 1. Gather Information
+
+Collect from the user:
+
+- **Cluster identifier** (required): accepts cluster name or full cluster ARN (e.g., `arn:aws:sagemaker:us-west-2:123456789012:cluster/abc123`)
+- **AWS region** (required unless extractable from ARN)
+- **S3 path** for report storage (required, e.g. `s3://bucket/prefix`). If the user doesn't have a bucket, create one (e.g., `s3://hyperpod-diagnostics-<account-id>-<region>`)
+- **Issue description** (optional)
+- **Target scope**: all nodes, specific instance groups, or specific node IDs (optional)
+- **Additional commands** to run on nodes (optional)
+
+### 2.
Verify Environment
+
+```bash
+aws sts get-caller-identity
+aws sagemaker describe-cluster --cluster-name <cluster-name> --region <region>
+pip install -r scripts/requirements.txt
+```
+
+If the S3 bucket doesn't exist, create it:
+
+```bash
+aws s3 mb s3://<bucket-name> --region <region>
+```
+
+**For EKS clusters** (check `Orchestrator.Eks` in describe-cluster output):
+
+1. Ensure kubectl is installed (`which kubectl`). If missing, install it for the current platform.
+2. Configure kubeconfig using the EKS cluster name from the describe-cluster response:
+
+   ```bash
+   aws eks update-kubeconfig --name <eks-cluster-name> --region <region>
+   ```
+
+### 3. Run the Collection Script
+
+```bash
+python scripts/hyperpod_issue_report.py \
+  --cluster <cluster-name-or-arn> \
+  --region <region> \
+  --s3-path s3://<bucket>[/prefix]
+```
+
+Use `--help` for all options including `--instance-groups`, `--nodes`, `--command`, `--max-workers`, and `--debug`. Note: `--instance-groups` and `--nodes` are mutually exclusive. Node identifiers accept instance IDs (`i-*`), EKS names (`hyperpod-i-*`), or Slurm names (`ip-*`).
+
+### 4. Present Results
+
+After collection, the script shows statistics and offers interactive download. Report the S3 location and offer to:
+
+- Download the report locally
+- Help analyze collected diagnostics (see [references/collection-details.md](references/collection-details.md) for what's in each file)
+- Prepare a summary for AWS Support
+
+## Troubleshooting
+
+See [references/troubleshooting.md](references/troubleshooting.md) for error handling, large cluster tuning, and known limitations.
diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/collection-details.md b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/collection-details.md new file mode 100755 index 00000000..0e7c4505 --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/collection-details.md @@ -0,0 +1,105 @@ +# Collection Details + +## What Gets Collected + +### Common (Both EKS and Slurm) + +- `nvidia_smi.txt` — GPU status, utilization, memory, temperature +- `resource_config.json` — HyperPod resource config from `/opt/ml/config/resource_config.json` +- `cluster_logs/` — Contents of `/var/log/aws/clusters/` +- `systemd_services.txt` — All systemd service statuses +- `disk_usage.txt` — `df` output +- `hostname.txt`, `instance_group.txt`, `instance_id.txt`, `cluster_type.txt`, `timestamp.txt` + +### EKS-Specific (Per-Node) + +- `containerd_status.txt` — `systemctl status containerd` +- `kubelet_status.txt` — `systemctl status kubelet` +- `eks-log-collector-output.txt` — EKS log collector execution log +- `eks-logs/` — EKS log collector output subdirectories: + - `cni/` — CNI plugin logs and config + - `containerd/` — Runtime logs, config, version, images, containers, tasks, plugins + - `docker/` — Docker logs (if present) + - `gpu/` — GPU diagnostics + - `ipamd/` — AWS VPC CNI IPAMD logs + - `kernel/` — dmesg output, uname info + - `kubelet/` — Kubelet logs and config + - `modinfo/` — Kernel module info (lustre, ip_vs, etc.) 
+ - `networking/` — Network config, iptables, routes, interfaces + - `nodeadm/` — Node administration logs + - `sandbox-image/` — Sandbox image info + - `storage/` — Mounts, inodes, lsblk, LVM, fstab, XFS, pod local storage + - `sysctls/` — Kernel parameters + - `system/` — Services, systemd-analyze, top, ps, netstat, CPU/IO throttling + - `var_log/` — System logs from /var/log + +### EKS-Specific (kubectl — Collected Locally) + +Packaged as `kubectl_resources.tar.gz`, collected from the local machine (not from nodes). + +**High Priority:** + +- `nodes_describe.txt` — Detailed node descriptions (capacity, conditions, running pods) +- `pods_all_namespaces.txt` / `pods_describe_all_namespaces.txt` — All pods with details +- `events_all_namespaces.txt` — Cluster events sorted by timestamp +- `pvcs_all_namespaces.txt` / `pvcs_describe_all_namespaces.txt` — PersistentVolumeClaims +- `services_all_namespaces.txt` / `services_describe_all_namespaces.txt` — Network endpoints + +**Medium Priority:** + +- `deployments_all_namespaces.txt`, `statefulsets_all_namespaces.txt`, `daemonsets_all_namespaces.txt` +- `configmaps_all_namespaces.txt`, `secrets_all_namespaces.txt` (metadata only) +- `resourcequotas_all_namespaces.txt`, `networkpolicies_all_namespaces.txt` + +### Slurm-Specific + +- `sinfo.txt` — Node and partition information +- `sinfo_R.txt` — Reasons for node down/drain states +- `slurmctld_status.txt` — Slurm controller daemon status +- `slurmd_status.txt` — Slurm compute node daemon status +- `opt_slurm_etc/` — Slurm configuration from `/opt/slurm/etc/` +- `nvidia-bug-report.log.gz` — NVIDIA bug report (compressed) +- `syslog`, `kern.log` — System logs +- `dmesg_T.txt` — Kernel ring buffer with timestamps +- `var_log_slurm/` — Slurm logs from `/var/log/slurm/` + +### Custom Commands + +User-specified commands are saved as `command_01_.txt`, `command_02_...`, etc. 
+ +## Report Output Structure + +``` +s3://bucket/prefix/cluster-name/YYYYMMDD_HHMMSS/ +├── collector_script.sh +├── summary.json +├── kubectl_resources.tar.gz # EKS only +└── instances/ + ├── worker1_i-abc123.tar.gz + └── worker2_i-abc124.tar.gz +``` + +Tarball filename format: `{instance-group}_{instance-id}.tar.gz` + +## Summary JSON Format + +```json +{ + "cluster_name": "my-cluster", + "cluster_id": "abc123", + "report_id": "20260126_143022", + "timestamp": "2026-01-26T14:30:22.123456", + "total_nodes": 8, + "successful": 7, + "failed": 1, + "results": [ + { + "InstanceId": "i-0123456789abcdef0", + "NodeGroup": "worker-group", + "Success": true, + "Output": "...", + "ElapsedTime": 45.2 + } + ] +} +``` diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md new file mode 100755 index 00000000..9ab32540 --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/references/troubleshooting.md @@ -0,0 +1,22 @@ +# Troubleshooting + +## Error Handling + +| Issue | Cause | Fix | +| --------------------------------------------- | ------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `kubectl not found in PATH` | kubectl not installed | Install kubectl for the current platform, then re-run | +| `kubectl must be configured for EKS clusters` | kubectl missing or wrong context | Run `aws eks update-kubeconfig --name --region `. Get the EKS cluster name from `aws sagemaker describe-cluster` output (`Orchestrator.Eks.ClusterArn`) | +| Cluster name from ARN not found | ARN contains cluster ID, not name | Pass the full ARN to `--cluster` instead of extracting the ID portion. 
Alternatively, use `aws sagemaker list-clusters` to find the cluster name | +| No instance reports in S3 | Node IAM role missing S3 permissions | Add `s3:GetObject`/`s3:PutObject` to node role for the report bucket | +| SSM connectivity failed | SSM agent down, missing IAM, or network | Check `systemctl status amazon-ssm-agent`, verify `AmazonSSMManagedInstanceCore` policy | +| "Failed to detect shell prompt" | Custom SSM session config (custom `.bashrc`, SSM preferences) | Not compatible without modifying prompt detection; use manual SSM sessions as workaround | +| SSM throttling | Too many concurrent sessions | Reduce `--max-workers`; automatic retry handles transient throttling | +| Nodes unresponsive | Node completely down | Noted in report; other nodes' diagnostics may reveal pattern | +| EKS log collector fails | Script download or execution error | Check `eks-log-collector-output.txt`; verify disk space in `/var/log/` and `/tmp/` | + +## Large Cluster Handling + +- Default `--max-workers 16` tested up to 130 nodes (99.2% success rate, ~15 min) +- If throttled (`ThrottlingException`): reduce to `--max-workers 8` +- For 200+ nodes: batch by instance group or increase to `--max-workers 32` if no throttling +- kubectl collection may take 20-30 minutes for 1000+ node clusters diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/hyperpod_issue_report.py b/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/hyperpod_issue_report.py new file mode 100755 index 00000000..e68f2f28 --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/hyperpod_issue_report.py @@ -0,0 +1,1430 @@ +#!/usr/bin/env python3 +""" +HyperPod Issue Report Collector + +Collects diagnostic logs and configurations from multiple HyperPod nodes. +Supports both HyperPod EKS and HyperPod Slurm clusters. +Uses hyperpod_run_on_multi_nodes mechanism to execute collection scripts on nodes. +Downloads collection script from S3 and uploads results back to S3. 
+""" + +import argparse +import boto3 +import json +import os +import platform +import pexpect +import shutil +import signal +import subprocess # nosec B404 - required for kubectl CLI commands +import sys +import tarfile +import tempfile +import time +import traceback +import zipfile +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timezone +from typing import List, Dict, Optional + + +# ============================================================================ +# TIMEOUT CONFIGURATION +# ============================================================================ +# These timeouts are calibrated for large clusters (tested up to 130 nodes). +# Adjust these values if you experience timeouts with larger clusters. +# +# Test results (130-node cluster): +# - kubectl commands: 1-26s (longest: kubectl describe pods) +# - SSM node collection: 31-48s per node +# ============================================================================ + +# SSM session timeouts (seconds) +# These are passed explicitly to each pexpect expect() call +SSM_SCRIPT_EXECUTION_TIMEOUT = 900 # 15 minutes - script execution on nodes +SSM_PROMPT_TIMEOUT = 60 # 60 seconds - prompt detection and setup + +# kubectl command timeout (seconds) +KUBECTL_TIMEOUT = 600 # 10 minutes - all kubectl operations + + +class HyperPodIssueReportCollector: + def __init__(self, cluster_name: str, s3_path: str, region: Optional[str] = None, debug: bool = False): + self.cluster_name = cluster_name + self.debug = debug + + # Parse S3 path + self.s3_bucket, self.s3_prefix = self.parse_s3_path(s3_path) + + # Configure boto3 clients with optional region + client_kwargs = {} + if region: + client_kwargs['region_name'] = region + + self.sagemaker_client = boto3.client('sagemaker', **client_kwargs) + self.s3_client = boto3.client('s3', **client_kwargs) + self.eks_client = boto3.client('eks', **client_kwargs) + self.region = region + + self.cluster_arn = None + self.cluster_id = 
None + self.cluster_type = None # 'eks' or 'slurm' + self.eks_cluster_arn = None + self.eks_cluster_name = None + self.nodes = [] + + # Generate unique report ID using UTC time + self.report_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + self.report_s3_key = f"{self.s3_prefix}/{cluster_name}/{self.report_id}" + + def parse_s3_path(self, s3_path: str) -> tuple: + """Parse S3 path into bucket and prefix. + + Accepts formats: + - s3://bucket-name/prefix/path + - s3://bucket-name + """ + s3_path = s3_path.strip() + + # Require s3:// prefix + if not s3_path.startswith('s3://'): + raise ValueError( + f"S3 path must start with 's3://' prefix.\n" + f"Received: {s3_path}\n" + f"Expected format: s3://bucket-name or s3://bucket-name/custom-prefix" + ) + + # Remove s3:// prefix + s3_path = s3_path[5:] + + # Split into bucket and prefix + parts = s3_path.split('/', 1) + bucket = parts[0] + prefix = parts[1].rstrip('/') if len(parts) > 1 else 'hyperpod-issue-reports' + + return bucket, prefix + + def extract_cluster_id_from_arn(self, cluster_arn: str) -> str: + """Extract cluster ID from cluster ARN.""" + if cluster_arn: + if '/cluster/' in cluster_arn: + return cluster_arn.split('/cluster/')[-1] + elif ':cluster/' in cluster_arn: + return cluster_arn.split(':cluster/')[-1] + parts = cluster_arn.split(':') + if len(parts) > 0: + return parts[-1] + return None + + def get_slurm_node_name(self, instance_id: str) -> Optional[str]: + """Get Slurm node name (e.g. 
ip-10-1-104-161) for a node via describe_cluster_node API.""" + try: + response = self.sagemaker_client.describe_cluster_node( + ClusterName=self.cluster_name, + NodeId=instance_id + ) + + # Extract private DNS name from NodeDetails + node_details = response.get('NodeDetails', {}) + private_dns = node_details.get('PrivateDnsHostname', '') + + # Private DNS format is like: ip-10-1-104-161.us-west-2.compute.internal + # Extract the IP part (ip-10-1-104-161) + if private_dns and private_dns.startswith('ip-'): + # Get the first part before the first dot + slurm_node_name = private_dns.split('.')[0] + return slurm_node_name + + return None + + except Exception as e: + if self.debug: + print(f"Warning: Could not get private IP for {instance_id}: {e}") + return None + + def get_cluster_nodes(self) -> List[Dict]: + """Get all nodes in the HyperPod cluster and detect cluster type.""" + try: + print(f"Describing cluster: {self.cluster_name}") + response = self.sagemaker_client.describe_cluster(ClusterName=self.cluster_name) + + print(f"Cluster status: {response.get('ClusterStatus', 'Unknown')}") + + # Detect cluster type from Orchestrator field + orchestrator = response.get('Orchestrator', {}) + + if 'Eks' in orchestrator: + self.cluster_type = 'eks' + print(f"Detected cluster type: EKS") + # Extract EKS cluster ARN + eks_config = orchestrator.get('Eks', {}) + self.eks_cluster_arn = eks_config.get('ClusterArn') + if self.eks_cluster_arn: + # Extract cluster name from ARN: arn:aws:eks:region:account:cluster/cluster-name + self.eks_cluster_name = self.eks_cluster_arn.split('/')[-1] + print(f"EKS Cluster ARN: {self.eks_cluster_arn}") + print(f"EKS Cluster Name: {self.eks_cluster_name}") + else: + print("Warning: Could not extract EKS cluster ARN from orchestrator config") + elif 'Slurm' in orchestrator: + self.cluster_type = 'slurm' + print(f"Detected cluster type: Slurm") + else: + # If Orchestrator field is missing or doesn't contain Eks/Slurm, assume Slurm + 
self.cluster_type = 'slurm' + print(f"Orchestrator field not found or unrecognized, assuming cluster type: Slurm") + + self.cluster_arn = response.get('ClusterArn') + self.cluster_id = self.extract_cluster_id_from_arn(self.cluster_arn) + print(f"Cluster ID: {self.cluster_id}") + + if not self.cluster_id: + print("Warning: Could not extract cluster ID from ARN") + return [] + + # List all nodes with pagination + instance_ids = [] + next_token = None + page_count = 0 + + while True: + page_count += 1 + print(f"Fetching nodes page {page_count}...") + + list_params = {'ClusterName': self.cluster_name} + if next_token: + list_params['NextToken'] = next_token + + nodes_response = self.sagemaker_client.list_cluster_nodes(**list_params) + + current_page_nodes = nodes_response.get('ClusterNodeSummaries', []) + print(f"Found {len(current_page_nodes)} nodes on page {page_count}") + + for node in current_page_nodes: + instance_id = node.get('InstanceId') + if instance_id: + instance_ids.append({ + 'InstanceId': instance_id, + 'NodeGroup': node.get('InstanceGroupName', 'unknown'), + 'InstanceType': node.get('InstanceType', 'unknown'), + 'InstanceStatus': node.get('InstanceStatus', {}).get('Status', 'unknown') + }) + + next_token = nodes_response.get('NextToken') + if not next_token: + break + + print(f"Total instances found: {len(instance_ids)}") + return instance_ids + + except Exception as e: + print(f"Error getting cluster nodes: {e}") + return [] + + def resolve_node_identifiers(self, node_identifiers: List[str]) -> List[str]: + """Resolve node identifiers to instance IDs. + + Supports multiple formats: + - Instance IDs: i-0123456789abcdef0 (EKS and Slurm) + - Slurm node names: ip-10-1-104-161 (Slurm only) + - EKS node names: hyperpod-i-0123456789abcdef0 (EKS only) + + Returns list of instance IDs. 
+ """ + if not node_identifiers: + return [] + + # Separate different identifier types + instance_ids = [] + slurm_node_names = [] + eks_node_names = [] + + for identifier in node_identifiers: + if identifier.startswith('i-'): + # This is an instance ID + instance_ids.append(identifier) + elif identifier.startswith('ip-'): + # This looks like a Slurm node name + slurm_node_names.append(identifier) + elif identifier.startswith('hyperpod-i-'): + # This looks like an EKS node name (hyperpod-i-*) + eks_node_names.append(identifier) + else: + # Unknown format, treat as instance ID and let validation fail later + instance_ids.append(identifier) + + # Resolve EKS node names if present + if eks_node_names: + if self.cluster_type == 'eks': + print(f"Resolving EKS node names to instance IDs...") + for eks_name in eks_node_names: + # Extract instance ID from hyperpod-i-* format + # Format: hyperpod-i-0123456789abcdef0 + if eks_name.startswith('hyperpod-'): + extracted_id = eks_name[9:] # Remove 'hyperpod-' prefix + if extracted_id.startswith('i-'): + instance_ids.append(extracted_id) + print(f" {eks_name} -> {extracted_id}") + else: + print(f" Warning: Invalid EKS node name format '{eks_name}' (expected hyperpod-i-*)") + else: + print(f" Warning: Invalid EKS node name format '{eks_name}'") + else: + print(f"Warning: EKS node names provided but cluster type is {self.cluster_type}") + print(f" EKS node names (hyperpod-i-*) are only supported for EKS clusters") + print(f" Ignoring: {', '.join(eks_node_names)}") + + # Resolve Slurm node names if present + if slurm_node_names: + if self.cluster_type == 'slurm': + print(f"Resolving Slurm node names to instance IDs...") + + # Build a mapping of Slurm node name to instance ID + slurm_to_instance = {} + + for node in self.nodes: + instance_id = node.get('InstanceId') + if instance_id: + slurm_name = self.get_slurm_node_name(instance_id) + if slurm_name: + slurm_to_instance[slurm_name] = instance_id + + # Resolve the requested Slurm 
    def generate_collector_script(self, commands: List[str]) -> str:
        """Generate the bash script that will run on each node.

        Instance group and ID are passed as environment variables.
        Script content varies based on cluster type (EKS vs Slurm).

        Args:
            commands: Extra shell commands to run on each node; each command's
                output is captured into its own numbered file.

        Returns:
            The complete bash script as a single newline-joined string. The
            script collects diagnostics into a per-node /tmp directory,
            tarballs it, and uploads it to s3://<bucket>/<report key>/instances/.
        """
        # Common preamble and collections that apply to every cluster type.
        script_lines = [
            "#!/bin/bash",
            "# HyperPod Issue Report Collector Script",
            "# Auto-generated script to collect diagnostic information",
            "# Expects INSTANCE_GROUP, INSTANCE_ID, and CLUSTER_TYPE environment variables",
            "",
            "# Note: We don't use 'set -e' because some commands (like grep) may return non-zero",
            "# exit codes even when they succeed (e.g., grep returns 1 when no matches found)",
            "",
            "# Validate required environment variables",
            "if [ -z \"${INSTANCE_GROUP}\" ] || [ -z \"${INSTANCE_ID}\" ] || [ -z \"${CLUSTER_TYPE}\" ]; then",
            " echo \"Error: INSTANCE_GROUP, INSTANCE_ID, and CLUSTER_TYPE environment variables are required\"",
            " exit 1",
            "fi",
            "",
            "# Instance identification",
            "TIMESTAMP=$(date -u +%Y%m%d_%H%M%S)",
            "OUTPUT_DIR=\"/tmp/hyperpod_report_${INSTANCE_GROUP}_${INSTANCE_ID}_${TIMESTAMP}\"",
            "",
            "echo \"Creating output directory: ${OUTPUT_DIR}\"",
            "mkdir -p \"${OUTPUT_DIR}\"",
            "if [ $? -ne 0 ]; then",
            " echo \"ERROR: Failed to create output directory\"",
            " exit 1",
            "fi",
            "",
            "# Collect system information",
            "echo \"Collecting system information...\"",
            "echo \"${INSTANCE_GROUP}\" > \"${OUTPUT_DIR}/instance_group.txt\"",
            "echo \"${INSTANCE_ID}\" > \"${OUTPUT_DIR}/instance_id.txt\"",
            "echo \"${CLUSTER_TYPE}\" > \"${OUTPUT_DIR}/cluster_type.txt\"",
            "hostname > \"${OUTPUT_DIR}/hostname.txt\"",
            "date -u > \"${OUTPUT_DIR}/timestamp.txt\"",
            "",
            "# Collect HyperPod resource config if available",
            "if [ -f /opt/ml/config/resource_config.json ]; then",
            " echo \"Collecting HyperPod resource config...\"",
            " cp /opt/ml/config/resource_config.json \"${OUTPUT_DIR}/resource_config.json\" 2>/dev/null || echo \"Could not copy resource_config.json\"",
            "fi",
            "",
            "# Collect cluster logs if available",
            "if [ -d /var/log/aws/clusters ]; then",
            " echo \"Collecting cluster logs...\"",
            " mkdir -p \"${OUTPUT_DIR}/cluster_logs\"",
            " cp -r /var/log/aws/clusters/* \"${OUTPUT_DIR}/cluster_logs/\" 2>/dev/null || echo \"Could not copy cluster logs\"",
            "fi",
            "",
            "# Collect systemd service status",
            "echo \"Collecting systemd service status...\"",
            "systemctl list-units --type=service --all --no-pager > \"${OUTPUT_DIR}/systemd_services.txt\" 2>&1 || echo \"Could not collect systemd services\"",
            "",
            "# Collect disk usage",
            "echo \"Collecting disk usage...\"",
            "df > \"${OUTPUT_DIR}/disk_usage.txt\" 2>&1 || echo \"Could not collect disk usage\"",
            "",
            "# Collect nvidia-smi output",
            "echo \"Collecting nvidia-smi output...\"",
            "nvidia-smi > \"${OUTPUT_DIR}/nvidia_smi.txt\" 2>&1 || echo \"nvidia-smi not available or failed\"",
            "",
        ]

        # Add cluster-type specific collections
        if self.cluster_type == 'eks':
            script_lines.extend([
                "# EKS-specific collections",
                "echo \"Collecting containerd service status...\"",
                "systemctl status containerd > \"${OUTPUT_DIR}/containerd_status.txt\" 2>&1 || echo \"containerd service not found or not running\"",
                "",
                "echo \"Collecting kubelet service status...\"",
                "systemctl status kubelet > \"${OUTPUT_DIR}/kubelet_status.txt\" 2>&1 || echo \"kubelet service not found or not running\"",
                "",
                "echo \"Running EKS log collector...\"",
                "EKS_LOG_COLLECTOR_URL=\"https://raw.githubusercontent.com/awslabs/amazon-eks-ami/main/log-collector-script/linux/eks-log-collector.sh\"",
                "curl -o /tmp/eks-log-collector.sh \"${EKS_LOG_COLLECTOR_URL}\"",  # nosec B108 - remote node shell script, not local Python
                "chmod +x /tmp/eks-log-collector.sh",
                "",
                "# Run the collector and capture its output",
                "/tmp/eks-log-collector.sh > \"${OUTPUT_DIR}/eks-log-collector-output.txt\" 2>&1 || echo \"EKS log collector completed with warnings\"",
                "",
                "# Find the generated tarball (it's created in /var/log/)",
                "EKS_TARBALL=$(ls -t /var/log/eks_*.tar.gz 2>/dev/null | head -1)",
                "if [ -n \"${EKS_TARBALL}\" ]; then",
                " echo \"Found EKS logs at ${EKS_TARBALL}\"",
                " echo \"Extracting EKS logs from ${EKS_TARBALL}\"",
                " mkdir -p \"${OUTPUT_DIR}/eks-logs\"",
                " tar -xzf \"${EKS_TARBALL}\" -C \"${OUTPUT_DIR}/eks-logs\" 2>/dev/null || echo \"Extracted EKS logs\"",
                " rm -f \"${EKS_TARBALL}\"",
                "else",
                " echo \"ERROR: No EKS log tarball found in /var/log/\" | tee -a \"${OUTPUT_DIR}/eks-log-collector-output.txt\"",
                " echo \"EKS log collector may have failed. Check eks-log-collector-output.txt for details.\" | tee -a \"${OUTPUT_DIR}/eks-log-collector-output.txt\"",
                " rm -f /tmp/eks-log-collector.sh",
                " exit 1",
                "fi",
                "",
                "# Clean up the collector script",
                "rm -f /tmp/eks-log-collector.sh",
                "",
            ])
        elif self.cluster_type == 'slurm':
            script_lines.extend([
                "# Slurm-specific collections",
                "echo \"Collecting Slurm information...\"",
                "",
                "# Slurm info commands",
                "sinfo > \"${OUTPUT_DIR}/sinfo.txt\" 2>&1 || echo \"sinfo not available\"",
                "sinfo -R > \"${OUTPUT_DIR}/sinfo_R.txt\" 2>&1 || echo \"sinfo -R not available\"",
                "",
                "# Slurm service status",
                "systemctl status slurmctld > \"${OUTPUT_DIR}/slurmctld_status.txt\" 2>&1 || echo \"slurmctld not running on this node\"",
                "systemctl status slurmd > \"${OUTPUT_DIR}/slurmd_status.txt\" 2>&1 || echo \"slurmd not running on this node\"",
                "",
                "# Slurm configuration",
                "if [ -d /opt/slurm/etc ]; then",
                " echo \"Collecting Slurm configuration...\"",
                " mkdir -p \"${OUTPUT_DIR}/opt_slurm_etc\"",
                " cp -r /opt/slurm/etc/* \"${OUTPUT_DIR}/opt_slurm_etc/\" 2>/dev/null || echo \"Could not copy Slurm config\"",
                "fi",
                "",
                "# NVIDIA bug report",
                "echo \"Running nvidia-bug-report.sh...\"",
                "nvidia-bug-report.sh --output-file \"${OUTPUT_DIR}/nvidia-bug-report.log.gz\" 2>&1 || echo \"nvidia-bug-report.sh not available or failed\"",
                "",
                "# System logs",
                "echo \"Collecting system logs...\"",
                "cp /var/log/syslog \"${OUTPUT_DIR}/syslog\" 2>/dev/null || echo \"Could not copy syslog\"",
                "cp /var/log/kern.log \"${OUTPUT_DIR}/kern.log\" 2>/dev/null || echo \"Could not copy kern.log\"",
                "dmesg -T > \"${OUTPUT_DIR}/dmesg_T.txt\" 2>&1 || echo \"Could not run dmesg -T\"",
                "",
                "# Slurm logs",
                "if [ -d /var/log/slurm ]; then",
                " echo \"Collecting Slurm logs...\"",
                " mkdir -p \"${OUTPUT_DIR}/var_log_slurm\"",
                " cp -r /var/log/slurm/* \"${OUTPUT_DIR}/var_log_slurm/\" 2>/dev/null || echo \"Could not copy Slurm logs\"",
                "fi",
                "",
            ])

        # Add each command to the script
        for i, cmd in enumerate(commands, 1):
            # Sanitize command for filename - replace problematic characters
            safe_name = cmd.replace(' ', '_').replace('/', '_').replace('|', '_').replace('>', '_').replace('<', '_').replace('&', '_').replace(';', '_').replace('(', '_').replace(')', '_').replace('$', '_').replace('`', '_').replace('"', '_').replace("'", '_')[:50]
            output_file = f"command_{i:02d}_{safe_name}.txt"

            # Use regular string (not f-string) to avoid any escaping issues with bash variables
            cmd_line = f"{cmd} > \"${{OUTPUT_DIR}}/{output_file}\" 2>&1 || echo \"Command failed with exit code $?\" >> \"${{OUTPUT_DIR}}/{output_file}\""

            script_lines.extend([
                f"# Command {i}: {cmd}",
                f"echo \"Running: {cmd}\"",
                cmd_line,
                "",
            ])

        # Add S3 upload logic with new filename format
        script_lines.extend([
            "# Upload results to S3",
            f"S3_BUCKET=\"{self.s3_bucket}\"",
            f"S3_PREFIX=\"{self.report_s3_key}/instances\"",
            "",
            "echo \"Creating tarball...\"",
            "TARBALL=\"/tmp/${INSTANCE_GROUP}_${INSTANCE_ID}.tar.gz\"",
            "tar -czf \"${TARBALL}\" -C /tmp \"$(basename ${OUTPUT_DIR})\"",
            "if [ $? -ne 0 ]; then",
            " echo \"ERROR: Failed to create tarball\"",
            " exit 1",
            "fi",
            "",
            "echo \"Uploading to S3...\"",
            "aws s3 cp \"${TARBALL}\" \"s3://${S3_BUCKET}/${S3_PREFIX}/$(basename ${TARBALL})\"",
            "",
            "if [ $? -eq 0 ]; then",
            " echo \"Successfully uploaded report to s3://${S3_BUCKET}/${S3_PREFIX}/$(basename ${TARBALL})\"",
            " rm -rf \"${OUTPUT_DIR}\" \"${TARBALL}\"",
            " echo \"Report collection completed for ${INSTANCE_GROUP}/${INSTANCE_ID}\"",
            " exit 0",
            "else",
            " echo \"ERROR: Failed to upload to S3\"",
            " exit 1",
            "fi",
        ])

        return '\n'.join(script_lines)
    def get_hyperpod_ssm_target(self, instance_id: str, instance_group_name: str) -> str:
        """Construct the HyperPod SSM target format.

        Raises:
            ValueError: If the cluster ID has not been resolved yet
                (get_cluster_nodes must have run successfully first).
        """
        if not self.cluster_id:
            raise ValueError("Cluster ID is required for HyperPod SSM targets")
        return f"sagemaker-cluster:{self.cluster_id}_{instance_group_name}-{instance_id}"

    def execute_collection_on_node(self, node: Dict, commands: List[str], script_s3_uri: str) -> Dict:
        """Execute the collection script on a single node via SSM using pexpect.

        Flow: spawn an interactive `aws ssm start-session`, wait for a shell
        prompt, install a sentinel PS1 so command boundaries are detectable,
        run the collector script with its required environment variables, and
        parse the exit code out of the captured output.

        Returns:
            A result dict with keys InstanceId, NodeGroup, Success and, on
            most paths, Error/Output and ElapsedTime (seconds).
        """
        instance_id = node['InstanceId']
        instance_group = node.get('NodeGroup', 'unknown')

        # Start timing
        start_time = time.time()

        try:
            ssm_target = self.get_hyperpod_ssm_target(instance_id, instance_group)
        except ValueError as e:
            return {
                'InstanceId': instance_id,
                'NodeGroup': instance_group,
                'Success': False,
                'Error': str(e),
                'ElapsedTime': time.time() - start_time
            }

        # Build the command to download and execute the script with environment variables
        commands_to_run = [
            f"aws s3 cp {script_s3_uri} /tmp/collector_script.sh",
            "chmod +x /tmp/collector_script.sh",
            f"INSTANCE_GROUP={instance_group} INSTANCE_ID={instance_id} CLUSTER_TYPE={self.cluster_type} /tmp/collector_script.sh"
        ]

        full_command = " && ".join(commands_to_run)

        print(f"Executing collection on {instance_id} ({instance_group})...")

        child = None
        # Sentinel prompt: unlikely to appear in command output, so it can be
        # used as a reliable expect() anchor.
        custom_prompt = "PEXPECT_READY# "

        try:
            ssm_command = f"aws ssm start-session --target {ssm_target}"

            if self.debug:
                print(f"[DEBUG] {instance_id}: SSM command: {ssm_command}")
                print(f"[DEBUG] {instance_id}: Full command: {full_command}")

            # Use pexpect to handle the interactive session
            # Note: No default timeout set - each expect() call has explicit timeout
            child = pexpect.spawn(ssm_command, encoding='utf-8')
            child.logfile_read = None

            # Wait for initial prompt (60 seconds to handle slow SSM session initialization)
            initial_prompt_patterns = [
                r'[\$#]\s+',  # Standard shell prompt
                r'sh-\d+\.\d+[\$#]\s*',  # sh prompt
                pexpect.TIMEOUT
            ]

            prompt_index = child.expect(initial_prompt_patterns, timeout=SSM_PROMPT_TIMEOUT)

            if prompt_index == len(initial_prompt_patterns) - 1:  # TIMEOUT
                # Get output for debugging
                output_sample = ""
                if child and hasattr(child, 'before') and child.before:
                    # Show more output to help diagnose the issue
                    output_sample = child.before.strip()
                    if len(output_sample) > 1000:
                        output_sample = output_sample[-1000:]  # Last 1000 chars

                error_msg = (
                    f"Failed to detect shell prompt after 60 seconds.\n"
                    f"This may indicate:\n"
                    f" - Custom SSM session configuration interfering with prompt detection\n"
                    f" - Non-standard shell prompt format\n"
                    f" - SSM session initialization issues\n"
                )

                if output_sample:
                    error_msg += f"\nSession output received:\n{output_sample}\n"
                    error_msg += (
                        f"\nExpected prompt patterns: $ or # followed by space\n"
                        f"If your cluster uses custom SSM session commands or non-standard prompts,\n"
                        f"this tool may not be compatible."
                    )
                else:
                    error_msg += "\nNo output received from SSM session."

                # NOTE(review): unlike every other failure path in this method,
                # this return omits 'ElapsedTime' — confirm whether that is
                # intentional (callers use result.get('ElapsedTime', 0)).
                return {
                    'InstanceId': instance_id,
                    'NodeGroup': instance_group,
                    'Success': False,
                    'Error': error_msg
                }

            # Set custom prompt
            child.sendline(f'export PS1="{custom_prompt}"')
            child.sendline('echo "PROMPT_SET_MARKER"')
            child.expect('PROMPT_SET_MARKER', timeout=SSM_PROMPT_TIMEOUT)
            child.expect(custom_prompt, timeout=SSM_PROMPT_TIMEOUT)

            if self.debug:
                print(f"[DEBUG] {instance_id}: Custom prompt set")

            # Execute the command and capture exit code immediately
            child.sendline(f'{full_command}; EXIT_CODE=$?; echo "EXIT_CODE:$EXIT_CODE"')

            # Wait for command completion (15 minutes for script execution)
            child.expect(custom_prompt, timeout=SSM_SCRIPT_EXECUTION_TIMEOUT)

            # Extract output
            output = child.before
            exit_code = 1  # Default to failure

            if output:
                lines = output.split('\n')
                cleaned_lines = []
                command_echo_removed = False

                for line in lines:
                    line_stripped = line.strip()

                    # Remove command echo
                    if not command_echo_removed and full_command in line:
                        command_echo_removed = True
                        continue

                    # Extract exit code
                    if line_stripped.startswith('EXIT_CODE:'):
                        try:
                            exit_code = int(line_stripped.split(':')[1].strip())
                        except (ValueError, IndexError):
                            pass
                        continue

                    if line_stripped:
                        cleaned_lines.append(line_stripped)

                output = '\n'.join(cleaned_lines)
            else:
                output = ""

            # Close session
            try:
                child.sendline('exit')
                child.expect(pexpect.EOF, timeout=5)
            except Exception:
                try:
                    child.kill(signal.SIGINT)
                except Exception:  # nosec B110 - best-effort cleanup
                    pass

            # Determine success based on exit code OR successful S3 upload message
            # Some nodes may not properly echo the EXIT_CODE line due to terminal issues
            success_indicators = [
                exit_code == 0,
                'Successfully uploaded report to s3://' in output,
                'upload: ../../tmp/' in output and '.tar.gz to s3://' in output
            ]

            if any(success_indicators):
                return {
                    'InstanceId': instance_id,
                    'NodeGroup': instance_group,
                    'Success': True,
                    'Output': output,
                    'ElapsedTime': time.time() - start_time
                }
            else:
                # Show last 15 lines of output which usually contain the error
                output_lines = output.split('\n')
                error_context = '\n'.join(output_lines[-15:]) if len(output_lines) > 15 else output

                return {
                    'InstanceId': instance_id,
                    'NodeGroup': instance_group,
                    'Success': False,
                    'Error': f"Script execution failed (exit code: {exit_code})\n{error_context}",
                    'Output': output,
                    'ElapsedTime': time.time() - start_time
                }

        except pexpect.TIMEOUT:
            # Show more context about where the timeout occurred
            output_sample = ""
            if child and hasattr(child, 'before') and child.before:
                output_sample = child.before.strip()
                if len(output_sample) > 1000:
                    output_sample = output_sample[-1000:]  # Last 1000 chars

            error_msg = (
                f"Operation timed out during command execution.\n"
                f"This may indicate:\n"
                f" - Command taking longer than expected to complete\n"
                f" - Custom shell configuration interfering with output detection\n"
                f" - Network or SSM session issues\n"
            )

            if output_sample:
                error_msg += f"\nLast output received:\n{output_sample}"
            else:
                error_msg += "\nNo output received."

            return {
                'InstanceId': instance_id,
                'NodeGroup': instance_group,
                'Success': False,
                'Error': error_msg,
                'ElapsedTime': time.time() - start_time
            }

        except pexpect.EOF:
            output_sample = ""
            if child and hasattr(child, 'before') and child.before:
                output_sample = child.before.strip()
                if len(output_sample) > 500:
                    output_sample = output_sample[-500:]  # Last 500 chars

            error_msg = "SSM session ended unexpectedly"
            if output_sample:
                error_msg += f"\nLast output:\n{output_sample}"

            return {
                'InstanceId': instance_id,
                'NodeGroup': instance_group,
                'Success': False,
                'Error': error_msg,
                'ElapsedTime': time.time() - start_time
            }

        except Exception as e:
            error_msg = f"Error executing command: {str(e)}"
            if self.debug:
                error_msg += f"\nTraceback: {traceback.format_exc()}"
            return {
                'InstanceId': instance_id,
                'NodeGroup': instance_group,
                'Success': False,
                'Error': error_msg,
                'ElapsedTime': time.time() - start_time
            }

        finally:
            # Always tear down the pexpect child so no SSM session leaks.
            if child and child.isalive():
                try:
                    child.terminate(force=True)
                except Exception:  # nosec B110 - best-effort cleanup
                    pass
+ + return { + 'InstanceId': instance_id, + 'NodeGroup': instance_group, + 'Success': False, + 'Error': error_msg, + 'ElapsedTime': time.time() - start_time + } + + except pexpect.EOF: + output_sample = "" + if child and hasattr(child, 'before') and child.before: + output_sample = child.before.strip() + if len(output_sample) > 500: + output_sample = output_sample[-500:] # Last 500 chars + + error_msg = "SSM session ended unexpectedly" + if output_sample: + error_msg += f"\nLast output:\n{output_sample}" + + return { + 'InstanceId': instance_id, + 'NodeGroup': instance_group, + 'Success': False, + 'Error': error_msg, + 'ElapsedTime': time.time() - start_time + } + + except Exception as e: + error_msg = f"Error executing command: {str(e)}" + if self.debug: + error_msg += f"\nTraceback: {traceback.format_exc()}" + return { + 'InstanceId': instance_id, + 'NodeGroup': instance_group, + 'Success': False, + 'Error': error_msg, + 'ElapsedTime': time.time() - start_time + } + + finally: + if child and child.isalive(): + try: + child.terminate(force=True) + except Exception: # nosec B110 - best-effort cleanup + pass + + def execute_with_retry(self, node: Dict, commands: List[str], script_s3_uri: str, max_retries: int = 3) -> Dict: + """Execute collection on a node with exponential backoff on throttling errors.""" + for attempt in range(max_retries): + result = self.execute_collection_on_node(node, commands, script_s3_uri) + + error_msg = result.get('Error', '') + if 'ThrottlingException' in error_msg or 'Rate exceeded' in error_msg: + if attempt < max_retries - 1: + wait_time = 2 ** attempt + if self.debug: + print(f"[DEBUG] {node['InstanceId']}: Throttled, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})") + time.sleep(wait_time) + continue + + return result + + return result + + def collect_reports(self, commands: List[str], instance_groups: Optional[List[str]] = None, instance_ids: Optional[List[str]] = None, max_workers: int = 16): + """Collect reports from 
    def collect_reports(self, commands: List[str], instance_groups: Optional[List[str]] = None, instance_ids: Optional[List[str]] = None, max_workers: int = 16):
        """Collect reports from all nodes, specific instance groups, or specific instance IDs.

        For Slurm clusters, instance_ids can be either:
        - Instance IDs: i-0123456789abcdef0
        - Slurm node names: ip-10-1-104-161

        Note: max_workers defaults to 16 to balance speed and avoid SSM throttling on large clusters.

        Args:
            commands: Extra shell commands to run on every node.
            instance_groups: Optional instance-group filter (case-insensitive);
                ignored when instance_ids is given.
            instance_ids: Optional node-identifier filter (takes precedence).
            max_workers: Thread-pool size for parallel SSM sessions.
        """
        # Get cluster nodes
        self.nodes = self.get_cluster_nodes()

        if not self.nodes:
            print("No nodes found in cluster")
            return

        # Collect kubectl information first (for EKS clusters)
        # NOTE(review): collect_kubectl_node_info calls sys.exit(1) when
        # kubectl is not configured, so execution may stop here for EKS.
        if self.cluster_type == 'eks':
            self.collect_kubectl_node_info()

        # Filter by specific instance IDs or Slurm node names if specified
        if instance_ids:
            # Resolve node identifiers (handles both instance IDs and Slurm node names)
            resolved_instance_ids = self.resolve_node_identifiers(instance_ids)

            if not resolved_instance_ids:
                print(f"No valid nodes found from specified identifiers: {', '.join(instance_ids)}")
                return

            self.nodes = [n for n in self.nodes if n.get('InstanceId') in resolved_instance_ids]
            if not self.nodes:
                print(f"No nodes found with specified identifiers: {', '.join(instance_ids)}")
                return

            # Show which requested identifiers were not found
            found_ids = {n.get('InstanceId') for n in self.nodes}
            missing_ids = set(resolved_instance_ids) - found_ids
            if missing_ids:
                print(f"Warning: Instance IDs not found in cluster: {', '.join(missing_ids)}")
        # Filter by instance groups if specified (only if instance_ids not specified)
        elif instance_groups:
            # Convert instance groups to lowercase for case-insensitive matching
            instance_groups_lower = [ig.lower() for ig in instance_groups]
            self.nodes = [n for n in self.nodes if n.get('NodeGroup', '').lower() in instance_groups_lower]
            if not self.nodes:
                print(f"No nodes found in instance groups: {', '.join(instance_groups)}")
                return
            print(f"Filtering to instance groups: {', '.join(instance_groups)}")

        print(f"\nCollecting reports from {len(self.nodes)} nodes")
        print(f"Cluster type: {self.cluster_type.upper()}")
        print(f"Report ID: {self.report_id}")
        print(f"S3 Location: s3://{self.s3_bucket}/{self.report_s3_key}/")

        # Show what will be collected based on cluster type
        if self.cluster_type == 'eks':
            print(f"Default collections: nvidia-smi, containerd status, kubelet status, EKS log collector, resource config, cluster logs, systemd services, disk usage")
        elif self.cluster_type == 'slurm':
            print(f"Default collections: nvidia-smi, nvidia-bug-report, sinfo, Slurm services, Slurm config, Slurm logs, system logs")

        if commands:
            print(f"Additional commands: {', '.join(commands)}")
        print("-" * 60)

        # Generate and upload the collector script once; every node downloads
        # the same script from S3 rather than receiving it inline over SSM.
        script_content = self.generate_collector_script(commands)
        script_key = f"{self.report_s3_key}/collector_script.sh"

        try:
            self.s3_client.put_object(
                Bucket=self.s3_bucket,
                Key=script_key,
                Body=script_content.encode('utf-8'),
                ContentType='text/x-shellscript'
            )
            script_s3_uri = f"s3://{self.s3_bucket}/{script_key}"
            print(f"Uploaded collector script to: {script_s3_uri}")
        except Exception as e:
            print(f"Error uploading collector script: {e}")
            return

        # Execute collection on all nodes using ThreadPoolExecutor
        results = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_node = {
                executor.submit(self.execute_with_retry, node, commands, script_s3_uri): node
                for node in self.nodes
            }

            for future in as_completed(future_to_node):
                node = future_to_node[future]
                try:
                    result = future.result()
                    results.append(result)

                    status = "✓" if result['Success'] else "✗"
                    elapsed = result.get('ElapsedTime', 0)
                    print(f"[{status}] {result['InstanceId']} ({result['NodeGroup']}) - {elapsed:.1f}s")

                    if not result['Success']:
                        error_msg = result.get('Error', 'Unknown error')
                        # Print error details with indentation for readability
                        for line in error_msg.split('\n'):
                            if line.strip():
                                print(f" {line}")

                except Exception as e:
                    # A worker raised instead of returning a result dict;
                    # record it as a failed node so the summary stays complete.
                    print(f"[✗] {node['InstanceId']}: Exception: {e}")
                    results.append({
                        'InstanceId': node['InstanceId'],
                        'NodeGroup': node.get('NodeGroup', 'unknown'),
                        'Success': False,
                        'Error': str(e),
                        'ElapsedTime': 0
                    })

        # Save summary
        self.save_summary(results)

        print("-" * 60)
        print(f"\nReport collection completed!")
        print(f"Instance reports uploaded to: s3://{self.s3_bucket}/{self.report_s3_key}/instances/")
        print(f"Summary: s3://{self.s3_bucket}/{self.report_s3_key}/summary.json")

        # Print statistics
        successful = sum(1 for r in results if r['Success'])
        failed = len(results) - successful
        print(f"\nStatistics:")
        print(f" Total nodes: {len(results)}")
        print(f" Successful: {successful}")
        print(f" Failed: {failed}")

        # Offer to download results
        self.offer_download_results()
print(f"[✗] {node['InstanceId']}: Exception: {e}") + results.append({ + 'InstanceId': node['InstanceId'], + 'NodeGroup': node.get('NodeGroup', 'unknown'), + 'Success': False, + 'Error': str(e), + 'ElapsedTime': 0 + }) + + # Save summary + self.save_summary(results) + + print("-" * 60) + print(f"\nReport collection completed!") + print(f"Instance reports uploaded to: s3://{self.s3_bucket}/{self.report_s3_key}/instances/") + print(f"Summary: s3://{self.s3_bucket}/{self.report_s3_key}/summary.json") + + # Print statistics + successful = sum(1 for r in results if r['Success']) + failed = len(results) - successful + print(f"\nStatistics:") + print(f" Total nodes: {len(results)}") + print(f" Successful: {successful}") + print(f" Failed: {failed}") + + # Offer to download results + self.offer_download_results() + + def offer_download_results(self): + """Ask user if they want to download results from S3.""" + print("\n" + "=" * 60) + print("Download Results") + print("=" * 60) + + try: + response = input("\nWould you like to download all results from S3 to the current directory? (y/n): ").strip().lower() + + if response in ['y', 'yes']: + download_dir = self.download_results_from_s3() + + if download_dir: + # Ask about creating zip archive + response = input("\nWould you like to create a zip archive of the downloaded results? (y/n): ").strip().lower() + + if response in ['y', 'yes']: + self.create_zip_archive(download_dir) + else: + print("\nSkipping download. You can download manually using:") + print(f" aws s3 sync s3://{self.s3_bucket}/{self.report_s3_key}/ ./{self.cluster_name}_{self.report_id}/") + + except KeyboardInterrupt: + print("\n\nDownload cancelled by user.") + except Exception as e: + print(f"\nError during download prompt: {e}") + + def download_results_from_s3(self) -> Optional[str]: + """Download all results from S3 to local directory. 
+ + Returns: + str: Path to download directory if successful, None otherwise + """ + # Create download directory + download_dir = f"{self.cluster_name}_{self.report_id}" + + print(f"\nDownloading results to: ./{download_dir}/") + print(f"Source: s3://{self.s3_bucket}/{self.report_s3_key}/") + + try: + # List all objects in the S3 prefix + paginator = self.s3_client.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=self.s3_bucket, Prefix=self.report_s3_key) + + files_to_download = [] + for page in pages: + if 'Contents' in page: + for obj in page['Contents']: + key = obj['Key'] + # Skip the prefix itself (directory marker) + if key != self.report_s3_key and key != f"{self.report_s3_key}/": + files_to_download.append(key) + + if not files_to_download: + print("No files found to download.") + return None + + print(f"Found {len(files_to_download)} files to download...") + + # Download each file + downloaded = 0 + failed = 0 + + for key in files_to_download: + # Calculate relative path (remove the report_s3_key prefix) + relative_path = key[len(self.report_s3_key):].lstrip('/') + local_path = os.path.join(download_dir, relative_path) + + # Create parent directory if needed + local_dir = os.path.dirname(local_path) + if local_dir: + os.makedirs(local_dir, exist_ok=True) + + try: + # Download file + self.s3_client.download_file(self.s3_bucket, key, local_path) + downloaded += 1 + + # Show progress for every 5 files or last file + if downloaded % 5 == 0 or downloaded == len(files_to_download): + print(f" Downloaded {downloaded}/{len(files_to_download)} files...") + + except Exception as e: + print(f" Failed to download {relative_path}: {e}") + failed += 1 + + print(f"\n✓ Download completed!") + print(f" Downloaded: {downloaded} files") + if failed > 0: + print(f" Failed: {failed} files") + print(f" Location: ./{download_dir}/") + + return download_dir + + except Exception as e: + print(f"\nError downloading results: {e}") + if self.debug: + 
traceback.print_exc() + return None + + def create_zip_archive(self, directory: str): + """Create a zip archive of the downloaded results. + + Args: + directory: Path to directory to archive + """ + zip_filename = f"{directory}.zip" + + print(f"\nCreating zip archive: {zip_filename}") + + try: + with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through directory + file_count = 0 + for root, dirs, files in os.walk(directory): + for file in files: + file_path = os.path.join(root, file) + # Calculate archive name (relative to directory) + arcname = os.path.relpath(file_path, os.path.dirname(directory)) + zipf.write(file_path, arcname) + file_count += 1 + + # Show progress + if file_count % 5 == 0: + print(f" Archived {file_count} files...") + + # Get zip file size + zip_size = os.path.getsize(zip_filename) + zip_size_mb = zip_size / (1024 * 1024) + + print(f"\n✓ Zip archive created!") + print(f" File: {zip_filename}") + print(f" Size: {zip_size_mb:.2f} MB") + print(f" Files: {file_count}") + + # Ask if user wants to delete the uncompressed directory + response = input(f"\nWould you like to delete the uncompressed directory '{directory}'? 
(y/n): ").strip().lower()

            # Only delete the local working directory when the user explicitly confirms.
            if response in ['y', 'yes']:
                shutil.rmtree(directory)
                print(f"✓ Deleted directory: {directory}")
            else:
                print(f"Keeping directory: {directory}")

        except Exception as e:
            # Best-effort: a failed zip must not abort the overall collection run.
            print(f"\nError creating zip archive: {e}")
            if self.debug:
                traceback.print_exc()

    def save_summary(self, results: List[Dict]):
        """Save collection summary to S3.

        Writes a single JSON document (cluster identity, report id, UTC
        timestamp, per-node success/failure counts, and the raw per-node
        results) to ``<report_s3_key>/summary.json`` in the report bucket.
        Failures are logged but never raised — the summary is auxiliary.
        """
        summary = {
            'cluster_name': self.cluster_name,
            'cluster_id': self.cluster_id,
            'report_id': self.report_id,
            # Timezone-aware UTC timestamp so summaries are comparable across machines.
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'total_nodes': len(results),
            'successful': sum(1 for r in results if r['Success']),
            'failed': sum(1 for r in results if not r['Success']),
            'results': results
        }

        summary_key = f"{self.report_s3_key}/summary.json"

        try:
            self.s3_client.put_object(
                Bucket=self.s3_bucket,
                Key=summary_key,
                Body=json.dumps(summary, indent=2).encode('utf-8'),
                ContentType='application/json'
            )
            print(f"Summary saved to: s3://{self.s3_bucket}/{summary_key}")
        except Exception as e:
            print(f"Error saving summary: {e}")

    def verify_kubectl_config(self) -> bool:
        """Verify kubectl is configured for the EKS cluster.

        Returns True only when kubectl is installed AND the current context
        name contains ``self.eks_cluster_name``; otherwise prints actionable
        guidance (including the ``aws eks update-kubeconfig`` command) and
        returns False. Never raises — all failures map to False.
        """
        if not self.eks_cluster_name:
            print("Warning: EKS cluster name not available, skipping kubectl verification")
            return False

        try:
            # Check if kubectl is installed
            result = subprocess.run(['kubectl', 'version', '--client'],  # nosec B603 B607
                                    capture_output=True, text=True, timeout=10)
            if result.returncode != 0:
                print("\n" + "!" * 60)
                print("ERROR: kubectl is not installed or not in PATH")
                print("!" * 60)
                return False

            # Extract just the version line
            version_line = result.stdout.strip().split('\n')[0] if result.stdout else "kubectl installed"
            print(f"kubectl version: {version_line}")

            # Check current context
            result = subprocess.run(['kubectl', 'config', 'current-context'],  # nosec B603 B607
                                    capture_output=True, text=True, timeout=10)
            if result.returncode == 0:
                current_context = result.stdout.strip()
                print(f"Current kubectl context: {current_context}")

                # Check if context matches EKS cluster.
                # NOTE(review): substring match — a context named for a different
                # cluster that happens to contain this name would pass; confirm
                # this looseness is intended.
                if self.eks_cluster_name in current_context:
                    print(f"✓ kubectl is configured for EKS cluster: {self.eks_cluster_name}")
                    return True
                else:
                    # Extract region from EKS cluster ARN (arn:aws:eks:<region>:...)
                    region = self.eks_cluster_arn.split(':')[3] if self.eks_cluster_arn else 'REGION'

                    print("\n" + "!" * 60)
                    print(f"ERROR: kubectl context does not match EKS cluster")
                    print(f"Current context: {current_context}")
                    print(f"Expected cluster: {self.eks_cluster_name}")
                    print("!" * 60)
                    print("\nTo configure kubectl for this EKS cluster, run:")
                    print(f" aws eks update-kubeconfig --name {self.eks_cluster_name} --region {region}")
                    return False
            else:
                # No current-context at all — same remediation hint as above.
                region = self.eks_cluster_arn.split(':')[3] if self.eks_cluster_arn else 'REGION'

                print("\n" + "!" * 60)
                print("ERROR: No kubectl context configured")
                print("!" * 60)
                print("\nTo configure kubectl for this EKS cluster, run:")
                print(f" aws eks update-kubeconfig --name {self.eks_cluster_name} --region {region}")
                return False

        except subprocess.TimeoutExpired:
            print("Warning: kubectl command timed out")
            return False
        except FileNotFoundError:
            print("\n" + "!" * 60)
            print("ERROR: kubectl not found in PATH")
            print("!" * 60)
            return False
        except Exception as e:
            print(f"Warning: Error verifying kubectl config: {e}")
            return False

    def collect_kubectl_node_info(self):
        """Collect kubectl describe node information for all nodes.

        EKS clusters only. Runs a fixed list of read-only kubectl commands,
        writes each output to a temp file, tars them up, and uploads the
        tarball to S3 under the report prefix. Exits the process (exit 1)
        if kubectl is not correctly configured for this cluster.
        """
        if self.cluster_type != 'eks':
            print("Skipping kubectl collection - not an EKS cluster")
            return

        if not self.eks_cluster_name:
            print("Skipping kubectl collection - EKS cluster name not available")
            return

        print("\n" + "=" * 60)
        print("Collecting kubectl node information...")
        print("=" * 60)

        # Verify kubectl configuration - exit if not configured
        if not self.verify_kubectl_config():
            print("\n" + "!" * 60)
            print("ERROR: kubectl must be configured for EKS clusters")
            print("!" * 60)
            print("\nPlease configure kubectl and re-run the tool.\n")
            sys.exit(1)

        try:
            # Create output directory (unique temp dir; removed at the end)
            kubectl_output_dir = tempfile.mkdtemp(prefix='kubectl_output_')

            # Define resources to collect — all read-only `get`/`describe` calls.
            collections = [
                # High Priority - Essential for troubleshooting
                {
                    'name': 'nodes_describe',
                    'command': ['kubectl', 'describe', 'nodes'],
                    'description': 'Node descriptions (capacity, conditions, pods)'
                },
                {
                    'name': 'pods_all_namespaces',
                    'command': ['kubectl', 'get', 'pods', '-A', '-o', 'wide'],
                    'description': 'All pods across namespaces (wide output)'
                },
                {
                    'name': 'pods_describe_all_namespaces',
                    'command': ['kubectl', 'describe', 'pods', '-A'],
                    'description': 'Detailed pod descriptions (all namespaces)'
                },
                {
                    'name': 'events_all_namespaces',
                    'command': ['kubectl', 'get', 'events', '-A', '--sort-by=.lastTimestamp'],
                    'description': 'Cluster events sorted by timestamp'
                },
                {
                    'name': 'pvcs_all_namespaces',
                    'command': ['kubectl', 'get', 'pvc', '-A', '-o', 'wide'],
                    'description': 'PersistentVolumeClaims (storage)'
                },
                {
                    'name': 'pvcs_describe_all_namespaces',
                    'command': ['kubectl', 'describe', 'pvc', '-A'],
                    'description': 'Detailed PVC descriptions'
                },
                {
                    'name': 
'services_all_namespaces',
                    'command': ['kubectl', 'get', 'svc', '-A', '-o', 'wide'],
                    'description': 'Services (network endpoints)'
                },
                {
                    'name': 'services_describe_all_namespaces',
                    'command': ['kubectl', 'describe', 'svc', '-A'],
                    'description': 'Detailed service descriptions'
                },

                # Medium Priority - Very useful
                {
                    'name': 'deployments_all_namespaces',
                    'command': ['kubectl', 'get', 'deployments', '-A', '-o', 'wide'],
                    'description': 'Deployments'
                },
                {
                    'name': 'statefulsets_all_namespaces',
                    'command': ['kubectl', 'get', 'statefulsets', '-A', '-o', 'wide'],
                    'description': 'StatefulSets'
                },
                {
                    'name': 'daemonsets_all_namespaces',
                    'command': ['kubectl', 'get', 'daemonsets', '-A', '-o', 'wide'],
                    'description': 'DaemonSets'
                },
                {
                    'name': 'configmaps_all_namespaces',
                    'command': ['kubectl', 'get', 'configmaps', '-A'],
                    'description': 'ConfigMaps (metadata only)'
                },
                {
                    # `get secrets` without -o lists names/types only — secret
                    # values are never collected.
                    'name': 'secrets_all_namespaces',
                    'command': ['kubectl', 'get', 'secrets', '-A'],
                    'description': 'Secrets (metadata only, no content)'
                },
                {
                    'name': 'resourcequotas_all_namespaces',
                    'command': ['kubectl', 'get', 'resourcequota', '-A'],
                    'description': 'Resource quotas'
                },
                {
                    'name': 'networkpolicies_all_namespaces',
                    'command': ['kubectl', 'get', 'networkpolicies', '-A'],
                    'description': 'Network policies'
                },
            ]

            print(f"Collecting {len(collections)} Kubernetes resource types...")
            successful = 0
            failed = 0

            # Every collection writes a file — even on failure/timeout the error
            # text is captured so the report shows what was attempted.
            for collection in collections:
                name = collection['name']
                command = collection['command']
                description = collection['description']

                print(f" Collecting: {description}...", end=' ', flush=True)

                try:
                    # Use unified timeout for all kubectl operations
                    timeout = KUBECTL_TIMEOUT

                    # Measure execution time
                    start_time = time.time()

                    result = subprocess.run(  # nosec B603
                        command,
                        capture_output=True,
                        text=True,
                        timeout=timeout
                    )

                    elapsed_time = time.time() - start_time

                    output_file = os.path.join(kubectl_output_dir, f'{name}.txt')

                    if result.returncode == 0:
                        if result.stdout.strip():
                            with open(output_file, 'w', encoding='utf-8') as f:
                                f.write(result.stdout)
                            print(f"✓ ({elapsed_time:.1f}s)")
                            successful += 1
                        else:
                            # Empty output (no resources of this type)
                            with open(output_file, 'w', encoding='utf-8') as f:
                                f.write("No resources found\n")
                            print(f"✓ (empty, {elapsed_time:.1f}s)")
                            successful += 1
                    else:
                        # Command failed — record stderr in the output file
                        with open(output_file, 'w', encoding='utf-8') as f:
                            f.write(f"Error: {result.stderr}\n")
                        print(f"✗ ({result.stderr.strip()[:50]}, {elapsed_time:.1f}s)")
                        failed += 1

                except subprocess.TimeoutExpired:
                    output_file = os.path.join(kubectl_output_dir, f'{name}.txt')
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write("Error: Command timed out\n")
                    print(f"✗ (timeout after {timeout}s)")
                    failed += 1

                except Exception as e:
                    output_file = os.path.join(kubectl_output_dir, f'{name}.txt')
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(f"Error: {str(e)}\n")
                    print(f"✗ ({str(e)[:50]})")
                    failed += 1

            print(f"\nCollection summary: {successful} successful, {failed} failed")

            # Create tarball with files at root level (no wrapper directory)
            # NOTE(review): fixed, predictable path in the shared temp dir —
            # concurrent runs on one machine would clobber each other's tarball.
            print("\nCreating kubectl output tarball...")
            tarball_path = os.path.join(tempfile.gettempdir(), 'kubectl_resources.tar.gz')

            with tarfile.open(tarball_path, 'w:gz') as tar:
                # Add each file directly to the tarball root (no parent directory)
                for filename in os.listdir(kubectl_output_dir):
                    file_path = os.path.join(kubectl_output_dir, filename)
                    tar.add(file_path, arcname=filename)

            print(f"Created tarball: {tarball_path}")

            # Upload to S3
            s3_key = f"{self.report_s3_key}/kubectl_resources.tar.gz"
            print(f"Uploading to S3: s3://{self.s3_bucket}/{s3_key}")

            self.s3_client.upload_file(tarball_path, self.s3_bucket, s3_key)

            print(f"✓ Successfully uploaded kubectl resource information to S3")
            print(f" 
Location: s3://{self.s3_bucket}/{s3_key}")

            # Cleanup temp artifacts (best-effort for the dir; tarball removal
            # only runs after a successful upload — earlier failures fall to the
            # except below and may leave the tarball behind).
            shutil.rmtree(kubectl_output_dir, ignore_errors=True)
            os.remove(tarball_path)

        except Exception as e:
            print(f"Error collecting kubectl information: {e}")
            if self.debug:
                traceback.print_exc()


def main():
    """CLI entry point: parse args, build the collector, and run collection.

    Exits 1 on Windows (pexpect behavior differs there), on conflicting
    options, on Ctrl-C, or on any collection error.
    """
    # Check platform compatibility
    if platform.system() == 'Windows':
        print("=" * 70)
        print("ERROR: Windows is not supported")
        print("=" * 70)
        print()
        print("This tool uses pexpect for interactive SSM sessions, which has")
        print("different behavior on Windows compared to macOS and Linux.")
        print()
        print("Supported platforms:")
        print(" - macOS")
        print(" - Linux")
        print()
        print("Please run this tool from a macOS or Linux machine, or use WSL")
        print("(Windows Subsystem for Linux) if you're on Windows.")
        print()
        sys.exit(1)

    parser = argparse.ArgumentParser(
        description='HyperPod Issue Report Collector - Supports both EKS and Slurm clusters',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage - auto-detects cluster type
  python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket

  # With custom prefix and additional commands
  python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket/diagnostics \\
    --command "df -h" --command "free -h"

  # Target specific instance groups
  python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket \\
    --instance-groups worker-group-1 worker-group-2

  # Target specific nodes (instance IDs, EKS names, or Slurm names)
  python hyperpod_issue_report.py --cluster my-cluster --region us-west-2 --s3-path s3://my-bucket \\
    --nodes i-abc123 hyperpod-i-044bbf66a68558e87 ip-10-1-104-161
  """
    )

    parser.add_argument('--cluster', '-c', required=True, help='HyperPod cluster name (EKS or Slurm)')
    parser.add_argument('--region', '-r', help='AWS region (uses default boto3 region if not specified)')
    parser.add_argument('--s3-path', '-s', required=True, help='S3 path for storing reports (e.g., s3://bucket-name/prefix or s3://bucket-name)')
    parser.add_argument('--command', '-cmd', action='append', help='Additional command to execute on nodes (can be specified multiple times)')
    parser.add_argument('--instance-groups', '-g', nargs='+', help='Target specific instance groups (e.g., --instance-groups worker1 worker2)')
    parser.add_argument('--max-workers', '-w', type=int, default=16, help='Maximum concurrent SSM sessions (default: 16, reduce if hitting throttling)')
    parser.add_argument('--nodes', '-n', nargs='+', help='Target specific nodes: instance IDs (i-*), EKS node names (hyperpod-i-*), or Slurm node names (ip-*)')
    parser.add_argument('--debug', '-d', action='store_true', help='Enable debug mode')

    args = parser.parse_args()

    # Validate mutually exclusive options
    if args.instance_groups and args.nodes:
        print("Error: --instance-groups and --nodes cannot be used together")
        sys.exit(1)

    try:
        collector = HyperPodIssueReportCollector(
            cluster_name=args.cluster,
            s3_path=args.s3_path,
            region=args.region,
            debug=args.debug
        )

        # User-specified commands
        commands = []

        # Add any user-specified commands
        if args.command:
            commands.extend(args.command)

        collector.collect_reports(
            commands=commands,
            instance_groups=args.instance_groups,
            instance_ids=args.nodes,
            max_workers=args.max_workers
        )

    except KeyboardInterrupt:
        print("\n\nInterrupted by user. 
Exiting...")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        if args.debug:
            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
diff --git a/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/requirements.txt b/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/requirements.txt
new file mode 100755
index 00000000..690613af
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-issue-report/scripts/requirements.txt
@@ -0,0 +1,3 @@
boto3>=1.26.0
botocore>=1.29.0
pexpect>=4.8.0
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md
new file mode 100755
index 00000000..048d962d
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/SKILL.md
@@ -0,0 +1,96 @@
---
name: hyperpod-ssm
description: Remote command execution and file transfer on SageMaker HyperPod cluster nodes via AWS Systems Manager (SSM). This is the primary interface for accessing HyperPod nodes — direct SSH is not available. Use when any skill, workflow, or user request needs to execute commands on cluster nodes, upload files to nodes, read/download files from nodes, run diagnostics, install packages, or perform any operation requiring shell access to HyperPod instances. Other HyperPod skills depend on this skill for all node-level operations.
---

# HyperPod SSM Access

## SSM Target Format

Target: `sagemaker-cluster:<CLUSTER_ID>_<GROUP_NAME>-<INSTANCE_ID>`

- `CLUSTER_ID`: Last segment of cluster ARN (NOT the cluster name). Extract via `get-cluster-info.sh`.
- `GROUP_NAME`: Instance group name — retrieve via `list-nodes.sh`.
- `INSTANCE_ID`: EC2 instance ID (e.g., `i-0123456789abcdef0`)

## Scripts

Three scripts under `scripts/`. Resolve cluster info and nodes **once**, then execute per node.
+ +### get-cluster-info.sh — Resolve cluster name → ID (call once) + +```bash +scripts/get-cluster-info.sh CLUSTER_NAME [--region REGION] +# Output: {"cluster_id":"...","cluster_arn":"...","cluster_name":"...","region":"..."} +``` + +### list-nodes.sh — List all nodes with pagination (call once) + +```bash +scripts/list-nodes.sh CLUSTER_NAME [--region REGION] [--instance-group GROUP] [--instance-id ID] +# Output: JSON array of ClusterNodeSummaries (InstanceId, InstanceGroupName, InstanceStatus, etc.) +``` + +`list-cluster-nodes` paginates at 100 nodes. This script handles pagination automatically. + +### ssm-exec.sh — Execute command on a node (call per node) + +```bash +# Execute — with pre-built target +scripts/ssm-exec.sh --target "sagemaker-cluster:CLUSTERID_GROUP-INSTANCEID" 'command' [--region REGION] + +# Execute — with parts +scripts/ssm-exec.sh --cluster-id ID --group GROUP --instance-id INSTANCE_ID 'command' [--region REGION] + +# Upload +scripts/ssm-exec.sh --target TARGET --upload LOCAL_PATH REMOTE_PATH [--region REGION] + +# Read remote file +scripts/ssm-exec.sh --target TARGET --read REMOTE_PATH [--region REGION] +``` + +## Running Commands Across Many Nodes + +SSM `start-session` rate limit: **3 TPS** per account. Plan batch size and delay accordingly. + +`aws ssm send-command` does NOT support `sagemaker-cluster:` targets — only `start-session` works. + +## Manual SSM Commands + +When the scripts aren't suitable, use `aws ssm start-session` directly with `AWS-StartNonInteractiveCommand`: + +```bash +cat > /tmp/cmd.json << 'EOF' +{"command": ["bash -c 'echo hello && whoami'"]} +EOF + +aws ssm start-session \ + --target sagemaker-cluster:CLUSTERID_GROUPNAME-INSTANCEID \ + --region REGION \ + --document-name AWS-StartNonInteractiveCommand \ + --parameters file:///tmp/cmd.json +``` + +Always use a JSON file for `--parameters` — inline parameters break with special characters. 
+ +## Common Diagnostic Commands + +| Task | Command | +| ---------------- | -------------------------------------------------------------- | +| Lifecycle logs | `cat /var/log/provision/provisioning.log` | +| Memory | `free -h` | +| Disk/mounts | `df -h && lsblk` | +| GPU status | `nvidia-smi` | +| GPU memory | `nvidia-smi --query-gpu=memory.used,memory.total --format=csv` | +| EFA/network | `fi_info -p efa` | +| CloudWatch agent | `sudo systemctl status amazon-cloudwatch-agent` | +| Top processes | `ps aux --sort=-%mem \| head -20` | + +## Key Details + +- Default SSM non-interactive user is `root`. +- SSM rate limit: **3 TPS** per account. +- For interactive sessions (rare), omit `--document-name` to get a shell. +- Interactive commands (vim, top) are not supported via `AWS-StartNonInteractiveCommand`. +- Large outputs may be truncated by SSM. +- For troubleshooting common errors, see [references/troubleshooting.md](references/troubleshooting.md). diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/references/troubleshooting.md b/plugins/sagemaker-ai/skills/hyperpod-ssm/references/troubleshooting.md new file mode 100755 index 00000000..e8a098d8 --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/references/troubleshooting.md @@ -0,0 +1,61 @@ +# Troubleshooting + +## TargetNotConnected + +``` +An error occurred (TargetNotConnected) when calling the StartSession operation +``` + +Causes: + +- Wrong target format — verify underscore between cluster ID and group name, hyphen before instance ID +- Cluster ID is wrong — must be extracted from ARN, not the cluster name +- Node not in `Running` state — check with `list-cluster-nodes` +- SSM agent not running on the node + +Verify: + +```bash +aws sagemaker list-cluster-nodes --cluster-name CLUSTER --region REGION \ + --query 'ClusterNodeSummaries[?InstanceId==`INSTANCE_ID`].[InstanceGroupName,InstanceStatus.Status]' \ + --output text +``` + +## AccessDeniedException + +Ensure IAM permissions include: + +- 
`sagemaker:DescribeCluster`, `sagemaker:ListClusterNodes` +- `ssm:StartSession`, `ssm:TerminateSession` + +## Command Timeout / Hangs + +- Long-running commands without output can cause SSM to hang +- Add periodic output or redirect to file then cat: `bash -c 'cmd > /tmp/out.log 2>&1 && cat /tmp/out.log'` + +## Base64 Upload Corruption + +- Always use `base64 -w 0` (no line wrapping) +- For large files (>256KB), SSM parameter size limits may apply — split into chunks or use shared filesystem (FSx/EFS) instead + +## RunAs User Error + +``` +Unable to start command: failed to start pty since RunAs user does not exist +``` + +SSM Run-as-user is configured but user doesn't exist on the node. Use default (root) and `sudo -u USERNAME` explicitly. + +## ThrottlingException on StartSession + +``` +An error occurred (ThrottlingException) when calling the StartSession operation: Rate exceeded +``` + +Cause: Too many concurrent `start-session` calls. SSM has per-account rate limits. + +Fix: Use batched parallel execution with a delay between batches (see "Running Commands Across Many Nodes" in SKILL.md). A batch size of 20 with a 2-second delay between batches works reliably for clusters of 100+ nodes. + +## send-command Not Supported + +`aws ssm send-command` does not support `sagemaker-cluster:` targets and will return a `ValidationException`. Use `start-session` with `AWS-StartNonInteractiveCommand` instead. 
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/get-cluster-info.sh b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/get-cluster-info.sh
new file mode 100755
index 00000000..0412462b
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/get-cluster-info.sh
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Get HyperPod cluster ID and metadata
# Usage: ./get-cluster-info.sh CLUSTER_NAME [--region REGION]
# Output: JSON with cluster_id extracted from ARN
set -euo pipefail

# First positional argument is the cluster name; remaining args are flags.
CLUSTER="$1"; shift
# NOTE(review): silently defaults to us-west-2 when AWS_DEFAULT_REGION is
# unset and --region is not passed — confirm this default is intended.
REGION="${AWS_DEFAULT_REGION:-us-west-2}"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --region) REGION="$2"; shift 2 ;;
    *) shift ;;   # unknown flags are ignored
  esac
done

# The cluster ID is the last path segment of the cluster ARN
# (arn:aws:sagemaker:...:cluster/<ID>), hence the cut on '/'.
ARN=$(aws sagemaker describe-cluster --cluster-name "$CLUSTER" --region "$REGION" \
  --query 'ClusterArn' --output text)
CLUSTER_ID=$(echo "$ARN" | cut -d'/' -f2)

echo "{\"cluster_id\":\"${CLUSTER_ID}\",\"cluster_arn\":\"${ARN}\",\"cluster_name\":\"${CLUSTER}\",\"region\":\"${REGION}\"}"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/list-nodes.sh b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/list-nodes.sh
new file mode 100755
index 00000000..028df598
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/list-nodes.sh
@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# List all HyperPod cluster nodes with instance group info (handles pagination)
# Usage: ./list-nodes.sh CLUSTER_NAME [--region REGION] [--instance-group GROUP] [--instance-id ID]
# Output: JSON array of nodes with InstanceId, InstanceGroupName, InstanceStatus, etc.
set -euo pipefail

# First positional argument is the cluster name; remaining args are flags.
CLUSTER="$1"; shift
REGION="${AWS_DEFAULT_REGION:-us-west-2}"
FILTER_GROUP="" ; FILTER_ID=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --region) REGION="$2"; shift 2 ;;
    --instance-group) FILTER_GROUP="$2"; shift 2 ;;
    --instance-id) FILTER_ID="$2"; shift 2 ;;
    *) shift ;;
  esac
done

# Paginate to collect ALL nodes (list-cluster-nodes caps each page; loop
# follows NextToken until it is absent, accumulating into the NODES array).
NODES='[]'; NEXT=""
while :; do
  PAGE=$(aws sagemaker list-cluster-nodes --cluster-name "$CLUSTER" --region "$REGION" \
    ${NEXT:+--next-token "$NEXT"} --output json)
  NODES=$(echo "$NODES" "$PAGE" | jq -s '.[0] + .[1].ClusterNodeSummaries')
  NEXT=$(echo "$PAGE" | jq -r '.NextToken // empty')
  [[ -z "$NEXT" ]] && break
done

# Apply client-side filters after the full listing is assembled
if [[ -n "$FILTER_GROUP" ]]; then
  NODES=$(echo "$NODES" | jq --arg g "$FILTER_GROUP" '[.[] | select(.InstanceGroupName==$g)]')
fi
if [[ -n "$FILTER_ID" ]]; then
  NODES=$(echo "$NODES" | jq --arg id "$FILTER_ID" '[.[] | select(.InstanceId==$id)]')
fi

echo "$NODES"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh
new file mode 100755
index 00000000..b53e6b7f
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-ssm/scripts/ssm-exec.sh
@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# Execute SSM command on a HyperPod node using a pre-resolved target
# Usage:
#   Execute: ./ssm-exec.sh --target TARGET 'command' [--region REGION]
#   Upload:  ./ssm-exec.sh --target TARGET --upload LOCAL_PATH REMOTE_PATH [--region REGION]
#   Read:    ./ssm-exec.sh --target TARGET --read REMOTE_PATH [--region REGION]
#
# Target format: sagemaker-cluster:<CLUSTER_ID>_<GROUP_NAME>-<INSTANCE_ID>
# Build target from parts: use --cluster-id, --group, --instance-id instead of --target
set -euo pipefail

REGION="${AWS_DEFAULT_REGION:-us-west-2}"
TARGET="" ; CLUSTER_ID="" ; GROUP="" ; INSTANCE_ID=""
MODE="exec" ; CMD="" ; LOCAL_PATH="" ; REMOTE_PATH=""

# Any argument that is not a recognized flag is treated as the command string.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --target) TARGET="$2"; shift 2 ;;
    --cluster-id) CLUSTER_ID="$2"; shift 2 ;;
    --group) GROUP="$2"; shift 2 ;;
    --instance-id) INSTANCE_ID="$2"; shift 2 ;;
    --upload) MODE="upload"; LOCAL_PATH="$2"; REMOTE_PATH="$3"; shift 3 ;;
    --read) MODE="read"; REMOTE_PATH="$2"; shift 2 ;;
    --region) REGION="$2"; shift 2 ;;
    *) CMD="$1"; shift ;;
  esac
done

# Build target from parts if --target not provided
if [[ -z "$TARGET" ]]; then
  [[ -z "$CLUSTER_ID" || -z "$GROUP" || -z "$INSTANCE_ID" ]] && \
    echo "Error: Provide --target or all of --cluster-id, --group, --instance-id" >&2 && exit 1
  TARGET="sagemaker-cluster:${CLUSTER_ID}_${GROUP}-${INSTANCE_ID}"
fi

# Parameters are always passed to SSM via a temp JSON file (inline parameters
# break with special characters); removed on exit.
TMPFILE=$(mktemp /tmp/ssm-cmd-XXXXXX.json)
trap "rm -f '$TMPFILE'" EXIT

# Cross-platform base64 encode with no line wrapping (GNU: -w0, macOS: -b0)
# Usage: b64_encode FILE or cmd | b64_encode
b64_encode() {
  if base64 --help 2>&1 | grep -q '\-w'; then
    if [[ $# -gt 0 ]]; then base64 -w 0 "$1"; else base64 -w 0; fi
  else
    if [[ $# -gt 0 ]]; then base64 -b 0 -i "$1"; else base64 -b 0; fi
  fi
}

# Emit the SSM parameters document {"command":[CMD]} — via jq when available,
# otherwise with a minimal hand-rolled escaper (backslash, quote, tab only).
json_cmd() {
  local cmd="$1"
  if command -v jq >/dev/null 2>&1; then
    jq -n --arg c "$cmd" '{"command":[$c]}'
  else
    local escaped
    escaped=$(printf '%s' "$cmd" | sed 's/\\/\\\\/g; s/"/\\"/g; s/\t/\\t/g')
    printf '{"command":["%s"]}\n' "$escaped"
  fi
}

case "$MODE" in
  exec)
    [[ -z "$CMD" ]] && echo "Error: No command specified" >&2 && exit 1
    # Escape embedded single quotes so the command survives the bash -c '...' wrapper
    INNER=$(printf '%s' "$CMD" | sed "s/'/'\\\\''/g")
    json_cmd "bash -c '${INNER}'" > "$TMPFILE"
    ;;
  upload)
    ENCODED=$(b64_encode "$LOCAL_PATH")
    # Compress large files to stay within SSM command limits (~64KB)
    if [[ ${#ENCODED} -gt 8000 ]]; then
      ENCODED=$(gzip -c "$LOCAL_PATH" | b64_encode)
      json_cmd "bash -c 'echo ${ENCODED} | base64 -d | gunzip > ${REMOTE_PATH}'" > "$TMPFILE"
    else
      json_cmd "bash -c 'echo ${ENCODED} | base64 -d > ${REMOTE_PATH}'" > "$TMPFILE"
    fi
    ;;
  read)
    json_cmd "cat '${REMOTE_PATH}'" > "$TMPFILE"
    ;;
esac

aws ssm start-session \
  --target "$TARGET" \
  --region "$REGION" \
  --document-name AWS-StartNonInteractiveCommand \
  --parameters "file://$TMPFILE"
diff --git a/plugins/sagemaker-ai/skills/hyperpod-version-checker/SKILL.md b/plugins/sagemaker-ai/skills/hyperpod-version-checker/SKILL.md
new file mode 100755
index 00000000..aafcd08b
--- /dev/null
+++ b/plugins/sagemaker-ai/skills/hyperpod-version-checker/SKILL.md
@@ -0,0 +1,74 @@
---
name: hyperpod-version-checker
description: Check and compare software component versions on SageMaker HyperPod cluster nodes - NVIDIA drivers, CUDA toolkit, cuDNN, NCCL, EFA, AWS OFI NCCL, GDRCopy, MPI, Neuron SDK (Trainium/Inferentia), Python, and PyTorch. Use when checking component versions, verifying CUDA/driver compatibility, detecting version mismatches across nodes, planning upgrades, documenting cluster configuration, or troubleshooting version-related issues on HyperPod. Triggers on requests about versions, compatibility, component checks, or upgrade planning for HyperPod clusters.
---

# HyperPod Version Checker

Upload to cluster nodes via `hyperpod-ssm` skill, then execute.
## Usage

```bash
# Text report to console + file
bash hyperpod_check_versions.sh

# JSON only to stdout (text report still saved to file) — best for piping/parsing
bash hyperpod_check_versions.sh --json

# Custom output file
bash hyperpod_check_versions.sh --output /tmp/versions.txt

# No color (for logging)
bash hyperpod_check_versions.sh --no-color
```

Output file: `component_versions_<hostname>_<timestamp>.txt` (default)

## What It Checks

| Component | Detection Method | Applicable When |
| ----------------- | ----------------------------------------------- | --------------------------------------------- |
| NVIDIA Driver | `nvidia-smi` | GPU instances (p3/p4/p5/g5) |
| CUDA Toolkit | `nvcc`, `/usr/local/cuda` symlink | GPU instances |
| cuDNN | Header file, packages | GPU instances doing deep learning |
| NCCL | Library filename, header, packages | Distributed GPU training |
| EFA | `/opt/amazon/efa_installed_packages`, `fi_info` | EFA-capable instances (p4d/p4de/p5/trn1/trn2) |
| AWS OFI NCCL | `efa_installed_packages`, library search | EFA + NCCL workloads |
| GDRCopy | rpm/dpkg, kernel module | GPU instances with RDMA (p4d+/p5) |
| MPI | `mpirun`, `/opt/amazon/openmpi` | Distributed training |
| Neuron SDK | `neuronx-cc`, `neuron-ls`, packages | Trainium/Inferentia (trn1/trn2/inf1/inf2) |
| Python/PyTorch | `python3`, `torch` import | ML workloads |
| Container runtime | `docker`, `containerd`, `kubectl`, `nvidia-ctk` | EKS clusters |

## Multi-Node Comparison

Run on each node and compare. With `--json`, stdout is clean JSON for easy diffing:

```bash
# Via hyperpod_run_on_multi_nodes.py (from hyperpod-diagnostics skill)
python hyperpod_run_on_multi_nodes.py --cluster <CLUSTER_NAME> \
  --command "bash hyperpod_check_versions.sh --json"
```

Or run individually via SSM on each node and diff the JSON outputs.

## Compatibility Reference

The script automatically analyzes CUDA/driver compatibility.
For reference: + +| Driver Series | Supported CUDA | +| ------------- | ----------------------------- | +| 580+ | 13.x, 12.x, 11.x | +| 570+ | 12.8+ (Blackwell), 12.x, 11.x | +| 545+ | 12.3-12.7, 11.x | +| 525-535 | 12.0-12.2, 11.x | +| 450+ | 11.x only | + +NCCL: Use 2.18+ for CUDA 12.x, 2.12+ for CUDA 11.x. Must be consistent across all nodes. + +| EFA Installer | AWS OFI NCCL | +| ------------- | --------------------- | +| 1.29+ | v1.7.3+ (recommended) | +| 1.26-1.28 | v1.7.0-v1.7.2 | +| 1.20-1.25 | v1.6.0+ | diff --git a/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh b/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh new file mode 100755 index 00000000..5bda095f --- /dev/null +++ b/plugins/sagemaker-ai/skills/hyperpod-version-checker/scripts/hyperpod_check_versions.sh @@ -0,0 +1,545 @@ +#!/bin/bash +# HyperPod Version Checker - Detect software component versions on HyperPod cluster nodes +# +# Checks: NVIDIA driver, CUDA, cuDNN, NCCL, EFA, AWS OFI NCCL, GDRCopy, MPI, +# Neuron SDK, Python, PyTorch, container runtime +# Works on both EKS and Slurm HyperPod clusters. +# +# Usage: bash hyperpod_check_versions.sh [--json] [--no-color] [--output FILE] + +# --- Defaults --- +JSON_OUTPUT=false +USE_COLOR=true +OUTPUT_FILE="" + +# --- Parse args --- +while [[ $# -gt 0 ]]; do + case "$1" in + --json) JSON_OUTPUT=true; shift ;; + --no-color) USE_COLOR=false; shift ;; + --output|-o) OUTPUT_FILE="$2"; shift 2 ;; + -h|--help) + echo "Usage: bash hyperpod_check_versions.sh [--json] [--no-color] [--output FILE]" + echo " --json Output ONLY JSON to stdout (text report still saved to file)" + echo " --no-color Disable color output" + echo " --output/-o Write report to FILE (default: component_versions__