From 47a850c150a366f7e1b2c8bae7d6dd752ff8e2b7 Mon Sep 17 00:00:00 2001
From: daniellepintz
Date: Thu, 20 Nov 2025 16:07:29 +0000
Subject: [PATCH 1/4] add slurm batch script

---
Review notes (text between the "---" separator and the first "diff --git"
line is ignored by git am):
* The W&B API key in the commented-out export line of submit_grpo.sh is
  redacted in this copy (here and in the matching removal in patch 2/4).
  The original value was published with this series and should be
  revoked/rotated.
* Because that line changed, the blob ids on the submit_grpo.sh "index"
  lines of patches 1/4 and 2/4 no longer describe the patched content.
  Plain git am / git apply do not need them; a 3-way fallback (git am -3)
  cannot use them.
* Line breaks and indentation of this series were reconstructed from a
  whitespace-collapsed copy; hunk line counts were checked against the
  hunk headers, but run "git apply --check" before relying on it.
* NOTE(review): the job name says qwen3-32b while the command below uses
  qwen3_8b.yaml (switched to qwen3_32b.yaml in patch 3/4) - confirm.

 apps/grpo/qwen3_8b.yaml |  1 +
 submit_grpo.sh          | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100755 submit_grpo.sh

diff --git a/apps/grpo/qwen3_8b.yaml b/apps/grpo/qwen3_8b.yaml
index a2815c5c0..50ed08b68 100644
--- a/apps/grpo/qwen3_8b.yaml
+++ b/apps/grpo/qwen3_8b.yaml
@@ -12,6 +12,7 @@ off_by_n: 1 # Off by one by default
 # Observability configuration
 metric_logging:
   wandb:
+    entity: agentic-models
     project: grpo-training
     group: grpo_exp_${oc.env:USER}
     logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
diff --git a/submit_grpo.sh b/submit_grpo.sh
new file mode 100755
index 000000000..db44042dd
--- /dev/null
+++ b/submit_grpo.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#SBATCH --job-name=grpo-qwen3-32b
+#SBATCH --qos=h200_agentic-models_high
+#SBATCH --account=agentic-models
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --gpus-per-node=8
+#SBATCH --cpus-per-task=128
+#SBATCH --mem=500G
+#SBATCH --time=72:00:00
+
+echo "Starting GRPO training job"
+
+# Initialize conda
+eval "$(conda shell.bash hook)"
+
+# Activate the conda environment (replace 'forge' with your actual environment name if different)
+conda activate forge
+
+# # Option 1: Set wandb API key (replace with your actual API key)
+# export "WANDB_API_KEY=REDACTED"
+
+# export WANDB_MODE=offline
+# export WANDB_DIR="/mnt/wsfuse/teamforge/wandb/$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 5 | head -n 1)"
+# mkdir -p "$WANDB_DIR"
+
+# Change to the torchforge directory
+cd /storage/home/daniellepintz/torchforge
+
+# Run the GRPO training
+srun python -m apps.grpo.main --config apps/grpo/qwen3_8b.yaml

From 9eef5f488146e390e537427eaedd3c1afd595528 Mon Sep 17 00:00:00 2001
From: daniellepintz
Date: Thu, 20 Nov 2025 16:10:51 +0000
Subject: [PATCH 2/4] update

---
Review notes (ignored by git am):
* Removes the explanatory comments and the commented-out W&B setup from
  submit_grpo.sh. The removed API-key line is shown redacted so that it
  matches the redacted line added by patch 1/4.

 submit_grpo.sh | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/submit_grpo.sh b/submit_grpo.sh
index db44042dd..e64662b4a 100755
--- a/submit_grpo.sh
+++ b/submit_grpo.sh
@@ -11,21 +11,10 @@
 
 echo "Starting GRPO training job"
 
-# Initialize conda
 eval "$(conda shell.bash hook)"
 
-# Activate the conda environment (replace 'forge' with your actual environment name if different)
 conda activate forge
 
-# # Option 1: Set wandb API key (replace with your actual API key)
-# export "WANDB_API_KEY=REDACTED"
-
-# export WANDB_MODE=offline
-# export WANDB_DIR="/mnt/wsfuse/teamforge/wandb/$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 5 | head -n 1)"
-# mkdir -p "$WANDB_DIR"
-
-# Change to the torchforge directory
 cd /storage/home/daniellepintz/torchforge
 
-# Run the GRPO training
 srun python -m apps.grpo.main --config apps/grpo/qwen3_8b.yaml

From 6177f21c7ec78ea12391afd014ca716ccb1ed574 Mon Sep 17 00:00:00 2001
From: daniellepintz
Date: Fri, 21 Nov 2025 15:33:13 +0000
Subject: [PATCH 3/4] upd

---
Review notes (ignored by git am):
* Exports TORCHSTORE_RDMA_ENABLED=0 before launching and switches the
  launched config from qwen3_8b.yaml to qwen3_32b.yaml.

 submit_grpo.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/submit_grpo.sh b/submit_grpo.sh
index e64662b4a..1f18ea034 100755
--- a/submit_grpo.sh
+++ b/submit_grpo.sh
@@ -15,6 +15,8 @@ eval "$(conda shell.bash hook)"
 
 conda activate forge
 
+export TORCHSTORE_RDMA_ENABLED=0
+
 cd /storage/home/daniellepintz/torchforge
 
-srun python -m apps.grpo.main --config apps/grpo/qwen3_8b.yaml
+srun python -m apps.grpo.main --config apps/grpo/qwen3_32b.yaml

From 797989bfb3e90d096225fc5967003ea1e7b75141 Mon Sep 17 00:00:00 2001
From: daniellepintz
Date: Sun, 23 Nov 2025 13:22:52 +0000
Subject: [PATCH 4/4] upd

---
Review notes (ignored by git am):
* Lowers local_batch_size to 2, changes the hardcoded SLURM role
  resources, passes account/qos/time as scheduler_args, and launches
  python directly instead of through srun.
* A trailing comma was added after the last scheduler_args entry
  (multi-line literal style); the launcher.py "index" blob id therefore no
  longer describes the patched content.
* NOTE(review): submit_grpo.sh requests --qos=h200_agentic-models_high
  while scheduler_args uses "h100_lowest" - confirm this is intended.
* NOTE(review): account, qos and time are hardcoded in launcher.py;
  presumably these are site-specific - confirm before merging.

 apps/grpo/qwen3_32b.yaml         | 2 +-
 src/forge/controller/launcher.py | 9 +++++++--
 submit_grpo.sh                   | 4 +++-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index 639f6669e..e32dbd8b7 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -4,7 +4,7 @@
 
 # Global configuration
 group_size: 16
-local_batch_size: 32 # per-device batch size
+local_batch_size: 2 # per-device batch size
 max_req_tokens: 1024
 max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py
index c40982634..e39955aa0 100644
--- a/src/forge/controller/launcher.py
+++ b/src/forge/controller/launcher.py
@@ -136,14 +136,19 @@ async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]
         for role in appdef.roles:
             # Note - this is hardcoded to SLURM
             # We got this with sinfo
-            role.resource.memMB = 2062607
-            role.resource.cpu = 128
+            role.resource.memMB = 2047962
+            role.resource.cpu = 192
             role.resource.gpu = 8
 
         # Note - we cannot add in an empty workspace, so we create a fake temporary one
         temp_workspace = tempfile.mkdtemp(prefix="forge_workspace_")
         server_config = Config(
             scheduler="slurm",
+            scheduler_args={
+                "account": "agentic-models",
+                "qos": "h100_lowest",
+                "time": "72:00:00",
+            },
             appdef=appdef,
             workspace=monarch.tools.config.workspace.Workspace(dirs=[temp_workspace]),
         )
diff --git a/submit_grpo.sh b/submit_grpo.sh
index 1f18ea034..d99730ea2 100755
--- a/submit_grpo.sh
+++ b/submit_grpo.sh
@@ -15,8 +15,10 @@ eval "$(conda shell.bash hook)"
 
 conda activate forge
 
+export TORCH_COMPILE_DISABLE=1
+unset SLURM_MEM_PER_CPU SLURM_MEM_PER_GPU SLURM_MEM_PER_NODE
 export TORCHSTORE_RDMA_ENABLED=0
 
 cd /storage/home/daniellepintz/torchforge
 
-srun python -m apps.grpo.main --config apps/grpo/qwen3_32b.yaml
+python -m apps.grpo.main --config apps/grpo/qwen3_32b.yaml