"""
Minimal SFT (Supervised Fine-Tuning) trainer using WorkerDispatch.

This script demonstrates SFT using the same forward_backward interface as RL training,
but with loss_fn="cross_entropy" to compute a simple negative log-likelihood loss.

Usage:
    # First, make sure you have Ray installed and a GPU available
    uv run --isolated --extra vllm python examples/sft/sft_trainer.py

This example:
1. Loads a small subset of the Alpaca dataset
2. Tokenizes examples into prompt + completion format
3. Uses WorkerDispatch.forward_backward(loss_fn="cross_entropy") for SFT
4. Demonstrates the Tinker-compatible API for supervised fine-tuning
"""

import hydra
import ray
import torch
from datasets import load_dataset
from loguru import logger
from omegaconf import DictConfig
from ray.util.placement_group import placement_group
from tqdm import tqdm
from transformers import AutoTokenizer

from skyrl_train.entrypoints.main_base import config_dir
from skyrl_train.training_batch import TrainingInputBatch
from skyrl_train.utils import get_ray_pg_ready_with_timeout
from skyrl_train.utils.utils import initialize_ray, validate_cfg
from skyrl_train.workers.fsdp.fsdp_worker import PolicyWorker
from skyrl_train.workers.worker import PPORayActorGroup
from skyrl_train.workers.worker_dispatch import WorkerDispatch


def get_sft_config() -> DictConfig:
    """Get config with SFT-specific overrides."""
    with hydra.initialize_config_dir(config_dir=config_dir):
        cfg = hydra.compose(config_name="ppo_base_config")

    # Use a small model for testing
    cfg.trainer.policy.model.path = "Qwen/Qwen2.5-0.5B-Instruct"
    cfg.trainer.placement.policy_num_gpus_per_node = 1
    cfg.generator.inference_engine_tensor_parallel_size = 1
    cfg.trainer.logger = "console"
    cfg.trainer.micro_train_batch_size_per_gpu = 2

    validate_cfg(cfg)
    return cfg


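# Illustrative helper, not called by main(): OmegaConf.to_yaml is the standard
# way to render a composed Hydra config, which is handy when adapting the
# overrides above to a larger model or a different GPU count.
def _log_config_summary(cfg: DictConfig) -> None:
    from omegaconf import OmegaConf

    # Dump only the policy sub-config that get_sft_config overrides.
    logger.info(OmegaConf.to_yaml(cfg.trainer.policy))

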
def tokenize_sft_example(example: dict, tokenizer, max_length: int = 512) -> dict | None:
    """Tokenize a single SFT example (instruction + output).

    Returns dict with input_ids, attention_mask, num_actions (response length),
    or None if the response was fully truncated away.
    """
    instruction = example.get("instruction", "")
    input_text = example.get("input", "")
    output = example.get("output", "")

    # Combine instruction and input
    if input_text:
        prompt = f"{instruction}\n\n{input_text}"
    else:
        prompt = instruction

    # Tokenize prompt and full sequence separately to find the prompt/response
    # boundary. This assumes the prompt's tokens form a prefix of the full
    # sequence's tokens; the "\n\n" separator makes that hold for typical BPE
    # tokenizers, though it can be off by a token if a merge crosses the boundary.
    prompt_tokens = tokenizer(prompt, add_special_tokens=True, truncation=True, max_length=max_length)
    full_text = f"{prompt}\n\n{output}"
    full_tokens = tokenizer(full_text, add_special_tokens=True, truncation=True, max_length=max_length)

    prompt_len = len(prompt_tokens["input_ids"])
    full_len = len(full_tokens["input_ids"])
    num_actions = full_len - prompt_len

    # Skip examples whose response was fully truncated away
    if num_actions <= 0:
        return None

    return {
        "input_ids": full_tokens["input_ids"],
        "attention_mask": full_tokens["attention_mask"],
        "num_actions": num_actions,
    }


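# A minimal sanity check for the boundary logic above. This helper is
# illustrative and is not called by main(); the Alpaca-style record is
# hypothetical, but the invariant it asserts (num_actions == full length minus
# prompt length) is exactly what tokenize_sft_example computes.
def _check_tokenize_example(tokenizer) -> None:
    example = {"instruction": "Add the numbers.", "input": "2 and 3", "output": "The sum is 5."}
    tok = tokenize_sft_example(example, tokenizer)
    assert tok is not None, "short examples should never be fully truncated"
    prompt_ids = tokenizer("Add the numbers.\n\n2 and 3", add_special_tokens=True)["input_ids"]
    # Loss should be computed only on the response tokens that follow the prompt.
    assert tok["num_actions"] == len(tok["input_ids"]) - len(prompt_ids)
    assert len(tok["input_ids"]) == len(tok["attention_mask"])

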
def collate_sft_batch(examples: list, tokenizer) -> TrainingInputBatch:
    """Collate tokenized examples into a TrainingInputBatch.

    Creates the batch format expected by forward_backward with cross_entropy loss:
    - sequences: [batch_size, seq_len] - token IDs (left-padded)
    - attention_mask: [batch_size, seq_len] - 1 for real tokens, 0 for padding
    - loss_mask: [batch_size, max_num_actions] - 1 for tokens to compute loss on
    """
    max_len = max(len(ex["input_ids"]) for ex in examples)
    max_num_actions = max(ex["num_actions"] for ex in examples)

    sequences = []
    attention_masks = []
    loss_masks = []

    for ex in examples:
        pad_len = max_len - len(ex["input_ids"])
        # Left-pad sequences (SkyRL convention)
        sequences.append([tokenizer.pad_token_id] * pad_len + ex["input_ids"])
        attention_masks.append([0] * pad_len + ex["attention_mask"])
        # Per-example loss_mask: 0s for padding, 1s only for this example's response
        # tokens. Because sequences are left-padded, every response ends at the
        # final position, so right-aligning the 1s keeps the mask in step with it.
        action_pad = max_num_actions - ex["num_actions"]
        loss_masks.append([0] * action_pad + [1] * ex["num_actions"])

    batch = TrainingInputBatch(
        {
            "sequences": torch.tensor(sequences, dtype=torch.long),
            "attention_mask": torch.tensor(attention_masks, dtype=torch.long),
            "loss_mask": torch.tensor(loss_masks, dtype=torch.long),
        }
    )
    batch.metadata = {"response_length": max_num_actions}
    return batch


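# A small illustrative check of the collation layout, not called by main().
# The token IDs and the stub tokenizer below are hypothetical placeholders, and
# dict-style indexing into TrainingInputBatch is assumed (its construction from
# a plain dict above suggests it). With left padding, every sequence ends at the
# same position, so the trailing loss_mask columns line up with the responses.
def _check_collate_alignment() -> None:
    class _StubTokenizer:
        pad_token_id = 0

    examples = [
        {"input_ids": [11, 12, 13, 14], "attention_mask": [1, 1, 1, 1], "num_actions": 2},
        {"input_ids": [21, 22, 23], "attention_mask": [1, 1, 1], "num_actions": 1},
    ]
    batch = collate_sft_batch(examples, _StubTokenizer())
    assert batch["sequences"].shape == (2, 4)  # padded to the longest sequence
    assert batch["loss_mask"].shape == (2, 2)  # padded to the largest num_actions
    # Second example: one pad token on the left, loss only on its final token.
    assert batch["sequences"][1].tolist() == [0, 21, 22, 23]
    assert batch["loss_mask"][1].tolist() == [0, 1]
    assert batch.metadata["response_length"] == 2

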
def main():
    """Run a minimal SFT training loop."""
    cfg = get_sft_config()
    initialize_ray(cfg)

    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.trainer.policy.model.path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    logger.info("Loading dataset...")
    # Use a small subset for demonstration
    dataset = load_dataset("yahma/alpaca-cleaned", split="train[:100]")

    logger.info("Tokenizing dataset...")
    tokenized = [tokenize_sft_example(ex, tokenizer) for ex in dataset]
    tokenized = [ex for ex in tokenized if ex is not None]  # Filter truncated examples
    logger.info(f"Kept {len(tokenized)} examples after filtering truncated ones")

    logger.info("Initializing policy worker...")
    num_gpus = cfg.trainer.placement.policy_num_gpus_per_node
    pg = placement_group([{"GPU": num_gpus, "CPU": num_gpus}], strategy="PACK")
    get_ray_pg_ready_with_timeout(pg, timeout=30)

    actor_group = PPORayActorGroup(
        cfg,
        num_nodes=1,
        num_gpus_per_node=num_gpus,
        ray_actor_type=PolicyWorker,
        pg=pg,
        num_gpus_per_actor=0.75,
        colocate_all=False,
        sequence_parallel_size=cfg.trainer.policy.sequence_parallel_size,
    )
    ray.get(actor_group.async_init_model(cfg.trainer.policy.model.path))

    dispatch = WorkerDispatch(cfg, policy_actor_group=actor_group)

    # Training loop
    batch_size = 4
    num_steps = 10
    logger.info(f"Starting SFT training for {num_steps} steps...")

    for step in tqdm(range(num_steps)):
        # Create batch from tokenized examples
        start_idx = (step * batch_size) % len(tokenized)
        batch_examples = tokenized[start_idx : start_idx + batch_size]
        if len(batch_examples) < batch_size:
            batch_examples = tokenized[:batch_size]  # Restart from the beginning

        batch = collate_sft_batch(batch_examples, tokenizer)

        # Forward-backward with cross-entropy loss (Tinker API style)
        metrics = dispatch.forward_backward("policy", batch, loss_fn="cross_entropy")

        # Optimizer step
        grad_norm = dispatch.optim_step("policy")

        if step % 5 == 0:
            # Guard the formatting: loss_val may be absent (None) depending on
            # which keys the worker reports.
            loss_val = metrics.get("final_loss", metrics.get("loss"))
            loss_str = "N/A" if loss_val is None else f"{float(loss_val):.4f}"
            logger.info(f"Step {step}: loss={loss_str}, grad_norm={grad_norm}")

    logger.info("SFT training complete!")
    ray.shutdown()


if __name__ == "__main__":
    main()