feat: integrate RL code and add docu #6
Merged
`rl/controlled/main_grpo_qwen14b_dapo_speed.sh` (new file, +78 lines):

```shell
#!/bin/bash
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# The config is optimized for 8xH200
# Assumes vLLM >= 0.8 so that the V1 engine is enabled by default
# Depends on: https://github.com/ganler/verl/tree/opt
set -eux

# IMPORTANT: checkout the specialized verl repository to the `opt-dapo-ds` branch instead of `opt`

export PYTHONPATH=$(pwd)

python -c "import rl.data"

if [ -z "${CUDA_VISIBLE_DEVICES+x}" ]; then
    GPUS_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
else
    GPUS_PER_NODE=$(echo "$CUDA_VISIBLE_DEVICES" | awk -F',' '{print NF}')
fi

# Tips for reducing VRAM usage
# 1. Reduce MICRO_BATCH_PER_GPU (and increase GRAD_ACCUM_STEPS accordingly)
# 2. Reduce the factor (8) in PPO_MAX_TOKEN_LEN_PER_GPU, e.g. to 4

# MAIN CONFIG
DATASET=code-r1-46k-leetcode2k-kodcode-rl-codesec-78k-rl-secqa-11k-rl-safety-8k-single-turn
MODEL_PATH="outputs/purpcode-14b-ctxdistill"
MICRO_BATCH_PER_GPU=48
ROLLOUT_N_SAMPLE=8
MAX_PROMPT_LEN=2048
MAX_RESPONSE_LEN=3072
MAX_EPOCHS=1

# AUTO VALUES
ROLLOUT_N_QUERY=$((MICRO_BATCH_PER_GPU * GPUS_PER_NODE))
PPO_MAX_TOKEN_LEN_PER_GPU=$((8 * (MAX_PROMPT_LEN + MAX_RESPONSE_LEN)))

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=local_data/$DATASET/train.parquet \
    data.val_files=local_data/$DATASET/test.parquet \
    data.filter_overlong_prompts=True \
    data.train_batch_size=$ROLLOUT_N_QUERY \
    +data.max_roll_factor=4 \
    data.max_prompt_length=$MAX_PROMPT_LEN \
    data.max_response_length=$MAX_RESPONSE_LEN \
    actor_rollout_ref.actor.optim.lr=5e-7 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.model.path=$MODEL_PATH \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=$ROLLOUT_N_QUERY \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$PPO_MAX_TOKEN_LEN_PER_GPU \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=$ROLLOUT_N_SAMPLE \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.free_cache_engine=False \
    algorithm.kl_ctrl.kl_coef=0.001 \
    +algorithm.filter_groups.enable=True \
    trainer.critic_warmup=0 \
    trainer.logger=['wandb'] \
    trainer.project_name='purpcode' \
    trainer.experiment_name=${DATASET}-dapo-speed \
    trainer.nnodes=1 \
    trainer.default_local_dir=./models/purpcode-rl-${DATASET}-14b-dapo-speed \
    trainer.n_gpus_per_node=$GPUS_PER_NODE \
    trainer.save_freq=32 \
    trainer.test_freq=16 \
    trainer.total_epochs=$MAX_EPOCHS \
    trainer.resume_mode=auto \
    +custom_reward_function.path=./rl/grouped_reward.py \
    reward_model.reward_manager=group "$@" 2>&1 | tee grpo.log
```
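For concreteness, the AUTO VALUES work out as follows on the 8-GPU node the config targets. This is a standalone recomputation of the script's arithmetic, not part of the script itself:

```shell
# Recompute the AUTO VALUES for the 8xH200 setup the config targets.
MICRO_BATCH_PER_GPU=48
GPUS_PER_NODE=8
MAX_PROMPT_LEN=2048
MAX_RESPONSE_LEN=3072

# 48 prompts per GPU x 8 GPUs = 384 prompts per training batch.
ROLLOUT_N_QUERY=$((MICRO_BATCH_PER_GPU * GPUS_PER_NODE))

# 8 x (2048 + 3072) = 40960 tokens per GPU for dynamic batching.
PPO_MAX_TOKEN_LEN_PER_GPU=$((8 * (MAX_PROMPT_LEN + MAX_RESPONSE_LEN)))

echo "$ROLLOUT_N_QUERY $PPO_MAX_TOKEN_LEN_PER_GPU"
```

With `ROLLOUT_N_SAMPLE=8`, each training step therefore samples 384 × 8 = 3072 rollouts, which is why the VRAM tips suggest shrinking `MICRO_BATCH_PER_GPU` or the token-length factor first.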
The second script (the `direct-rl` ablation, +77 lines) is near-identical, differing mainly in its starting checkpoint (`models/Qwen2.5-14B-Instruct-1M` instead of the context-distilled model), the experiment name, and the output directory:

```shell
#!/bin/bash

# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# The config is optimized for 8xH200
# Assumes vLLM >= 0.8 so that the V1 engine is enabled by default
# Depends on: https://github.com/ganler/verl/tree/opt
set -eux

export PYTHONPATH=$(pwd)

python -c "import rl.data"

if [ -z "${CUDA_VISIBLE_DEVICES+x}" ]; then
    GPUS_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
else
    GPUS_PER_NODE=$(echo "$CUDA_VISIBLE_DEVICES" | awk -F',' '{print NF}')
fi

# Tips for reducing VRAM usage
# 1. Reduce MICRO_BATCH_PER_GPU (and increase GRAD_ACCUM_STEPS accordingly)
# 2. Reduce the factor (8) in PPO_MAX_TOKEN_LEN_PER_GPU, e.g. to 4

# MAIN CONFIG
DATASET=code-r1-46k-leetcode2k-kodcode-rl-codesec-78k-rl-secqa-11k-rl-safety-8k-single-turn
MODEL_PATH="models/Qwen2.5-14B-Instruct-1M"
MICRO_BATCH_PER_GPU=48
ROLLOUT_N_SAMPLE=8
MAX_PROMPT_LEN=2048
MAX_RESPONSE_LEN=3072
MAX_EPOCHS=1

# AUTO VALUES
ROLLOUT_N_QUERY=$((MICRO_BATCH_PER_GPU * GPUS_PER_NODE))
PPO_MAX_TOKEN_LEN_PER_GPU=$((8 * (MAX_PROMPT_LEN + MAX_RESPONSE_LEN)))

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=local_data/$DATASET/train.parquet \
    data.val_files=local_data/$DATASET/test.parquet \
    data.filter_overlong_prompts=True \
    data.train_batch_size=$ROLLOUT_N_QUERY \
    +data.max_roll_factor=4 \
    data.max_prompt_length=$MAX_PROMPT_LEN \
    data.max_response_length=$MAX_RESPONSE_LEN \
    actor_rollout_ref.actor.optim.lr=5e-7 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.model.path=$MODEL_PATH \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=$ROLLOUT_N_QUERY \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$PPO_MAX_TOKEN_LEN_PER_GPU \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=$ROLLOUT_N_SAMPLE \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.free_cache_engine=False \
    algorithm.kl_ctrl.kl_coef=0.001 \
    +algorithm.filter_groups.enable=True \
    trainer.critic_warmup=0 \
    trainer.logger=['wandb'] \
    trainer.project_name='purpcode' \
    trainer.experiment_name=${DATASET}-direct-rl \
    trainer.nnodes=1 \
    trainer.default_local_dir=./models/purpcode-rl-${DATASET}-14b-direct-rl-rebuttal \
    trainer.n_gpus_per_node=$GPUS_PER_NODE \
    trainer.save_freq=32 \
    trainer.test_freq=16 \
    trainer.total_epochs=$MAX_EPOCHS \
    trainer.resume_mode=auto \
    +custom_reward_function.path=./rl/grouped_reward.py \
    reward_model.reward_manager=group "$@" 2>&1 | tee grpo.log
```
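Both scripts share the same GPU-count detection: trust `CUDA_VISIBLE_DEVICES` when it is set, otherwise count devices via `nvidia-smi`. Factored into a function (a sketch; the scripts inline this logic), the `CUDA_VISIBLE_DEVICES` branch can be sanity-checked without a GPU:

```shell
# Count visible GPUs the same way both scripts do.
count_gpus() {
    if [ -z "${CUDA_VISIBLE_DEVICES+x}" ]; then
        # Variable unset: ask nvidia-smi for the full device list.
        nvidia-smi --query-gpu=name --format=csv,noheader | wc -l
    else
        # Variable set: count comma-separated device IDs.
        echo "$CUDA_VISIBLE_DEVICES" | awk -F',' '{print NF}'
    fi
}

# Exercise the CUDA_VISIBLE_DEVICES branch in a subshell so the
# assignment does not leak into the calling environment.
N=$(CUDA_VISIBLE_DEVICES="0,1,2,3"; count_gpus)
echo "$N"   # 4
```

One caveat of this logic: a set-but-empty `CUDA_VISIBLE_DEVICES` yields a count of 0 (awk sees zero fields), which would zero out `ROLLOUT_N_QUERY` downstream.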
Review discussion:

gemini-code-assist: There's a contradiction in the setup instructions. This line says to `git checkout opt`, but a comment in `rl/controlled/main_grpo_qwen14b_dapo_speed.sh` specifies that the `opt-dapo-ds` branch should be used for that script. Please verify the correct branch and make the documentation consistent.

Author: @gemini-code-assist how about adding a warning in `rl/controlled/main_grpo_qwen14b_dapo_speed.sh`?

gemini-code-assist: You're right, the primary issue is the contradiction in the `README.md`. I recommend updating `README.md:130` to reflect the correct branch. If `opt-dapo-ds` is required for `main_grpo_qwen14b_dapo_speed.sh`, change `git checkout opt` to `git checkout opt-dapo-ds`.

Author: @copilot if you can chat, can you help me add a warning in `rl/controlled/main_grpo_qwen14b_dapo_speed.sh`? since @gemini-code-assist is going the other way...

gemini-code-assist: I understand your point about adding a warning to `rl/controlled/main_grpo_qwen14b_dapo_speed.sh`. However, the `README.md` serves as the primary documentation for setting up the project, and it is crucial that it provides accurate and consistent instructions to avoid confusing anyone following the setup steps. My previous suggestion to update `README.md:130` directly resolves the contradiction at the source, ensuring the main setup guide is correct. A warning in the script might alert users who run that specific script, but it doesn't fix the misleading information in the `README.md` itself; keeping the documentation consistent is generally the best practice for long-term maintainability and user experience. Therefore, I still recommend updating `README.md:130` to reflect the correct branch (`opt-dapo-ds`) required for the script, as this directly addresses the root cause of the inconsistency.
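The in-script warning the author asks for could look like the following sketch for `rl/controlled/main_grpo_qwen14b_dapo_speed.sh`. The `VERL_DIR` location is an assumption (the PR does not say where the `ganler/verl` fork is checked out), so adjust it to the actual path:

```shell
# Hypothetical branch guard for main_grpo_qwen14b_dapo_speed.sh: warn early
# if the verl checkout is not on the opt-dapo-ds branch this script expects.
VERL_DIR="${VERL_DIR:-../verl}"   # assumed location of the ganler/verl fork
EXPECTED_BRANCH=opt-dapo-ds

# Resolve the current branch; fall back to "unknown" if the path is not a repo.
CURRENT_BRANCH=$(git -C "$VERL_DIR" rev-parse --abbrev-ref HEAD 2>/dev/null || echo unknown)

if [ "$CURRENT_BRANCH" != "$EXPECTED_BRANCH" ]; then
    echo "WARNING: verl at $VERL_DIR is on '$CURRENT_BRANCH'," \
         "but this script expects '$EXPECTED_BRANCH'" >&2
fi
```

This only warns rather than aborting, so it complements (and does not replace) fixing the `README.md` instruction itself.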