44 changes: 44 additions & 0 deletions examples/entropy/README.md
# Entropy dynamics of RL training

This example demonstrates the two algorithms **Clip_B** and **Clip_V** from the paper [On the Entropy Dynamics in Reinforcement Fine-Tuning of Large Language Models](https://arxiv.org/pdf/2602.03392).

NOTE: This example has only been tested with verl==0.7.0.

## Data Preparation

We use the [DAPO-Math-17k](https://huggingface.co/datasets/open-r1/DAPO-Math-17k-Processed) dataset as our training set, holding out 500 questions as a validation set (denoted dapo-validation-500).
From the remaining training set, we filter out samples with excessively high (≥ 15/16) or low (≤ 1/16) pass rates, as evaluated by Qwen2.5-7B-Instruct.
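The filtering step above can be sketched as follows. This is an illustrative sketch, not code from this repo: the `pass_rate` field and the sample layout are assumptions about how the per-question pass rates (out of 16 rollouts from Qwen2.5-7B-Instruct) might be stored.

```python
# Illustrative sketch of the pass-rate filter described above.
# Assumes each sample carries a precomputed `pass_rate` out of 16
# rollouts; field names are hypothetical.

def filter_by_pass_rate(samples, low=1 / 16, high=15 / 16):
    """Keep samples whose pass rate is strictly between the bounds,
    dropping questions that are too easy (>= high) or too hard (<= low)."""
    return [s for s in samples if low < s["pass_rate"] < high]

samples = [
    {"question": "q1", "pass_rate": 16 / 16},  # too easy -> dropped
    {"question": "q2", "pass_rate": 8 / 16},   # kept
    {"question": "q3", "pass_rate": 0 / 16},   # too hard -> dropped
]
kept = filter_by_pass_rate(samples)
```

Dropping near-saturated and near-zero pass rates keeps only questions whose group rewards have nonzero variance, which is what group-relative advantage estimation needs.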

## Clip_B Experiment

1. Apply the patch to keep entropy information in the trainer batch:

```bash
cd /path/to/Trinity-RFT
git apply examples/entropy/clipb_trainer.patch
```

2. Update the dataset paths and other configurations in the file [`clipb.yaml`](clipb.yaml) to point to your local data.

3. Run the experiment:

```bash
trinity run examples/entropy/clipb.yaml
```

## Clip_V Experiment

1. Apply the patch to keep entropy information in the trainer batch:

```bash
cd /path/to/Trinity-RFT
git apply examples/entropy/clipv_trainer.patch
```

2. Update the dataset paths and other configurations in the file [`clipv.yaml`](clipv.yaml) to point to your local data.

3. Run the experiment:

```bash
trinity run examples/entropy/clipv.yaml
```
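Both patches exist so that per-token entropies survive into the trainer batch, where the configured `advantage_fn` can read them. The exact Clip_B / Clip_V rules are defined in the paper; the sketch below only illustrates the interface shape — a GRPO-style group advantage that also consumes per-response entropies, with `mu` acting as a clipping threshold. The specific clipping rule shown is a hypothetical stand-in, not the paper's.

```python
# Illustrative only: shows the shape of an entropy-aware advantage_fn,
# not the actual Clip_B / Clip_V rules from the paper.
from statistics import mean, pstdev

def group_advantages(rewards):
    """GRPO-style group-normalized advantages for one prompt group."""
    m, s = mean(rewards), pstdev(rewards)
    return [(r - m) / (s + 1e-6) for r in rewards]

def clip_by_entropy(advantages, entropies, mu):
    """Zero out advantages on responses whose mean entropy exceeds
    mu times the group's mean entropy (hypothetical rule)."""
    bar = mean(entropies)
    return [
        0.0 if e > mu * bar else a
        for a, e in zip(advantages, entropies)
    ]

adv = group_advantages([1.0, 0.0, 1.0, 0.0])
clipped = clip_by_entropy(adv, [0.2, 0.3, 2.0, 0.4], mu=2.5)
```

The `mu: 2.5` (Clip_B) and `mu: 8.5` (Clip_V) values in the configs are passed through `advantage_fn_args` to whatever rule the registered `clipb` / `clipv` functions actually implement.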
100 changes: 100 additions & 0 deletions examples/entropy/clipb.yaml
project: math_dapo
name: clipb_example
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
model:
model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
max_prompt_tokens: 1024
max_response_tokens: 7168
algorithm:
algorithm_type: grpo_verl
advantage_fn: clipb
advantage_fn_args:
mu: 2.5
repeat_times: 16
kl_loss_fn_args:
kl_coef: 0.0
cluster:
node_num: 1
gpu_per_node: 8
buffer:
total_epochs: 20
batch_size: 64
explorer_input:
taskset:
name: dapo_235
storage_type: file
path: ${oc.env:TRINITY_TASKSET_PATH} # processed DAPO-Math-17k
format:
prompt_key: 'question'
response_key: 'ground_truth'
rollout_args:
temperature: 1.0
logprobs: 20
eval_tasksets:
- name: dapo-validation-500
storage_type: file
path: '/path/to/dapo-validation' # validation samples from DAPO-Math-17k
split: 'test'
repeat_times: 32
format:
prompt_key: 'question'
response_key: 'ground_truth'
rollout_args:
temperature: 0.7
- name: amc23
storage_type: file
path: math-ai/amc23 # Path to the AMC23 dataset
split: 'test'
repeat_times: 32
format:
prompt_key: 'question'
response_key: 'answer'
rollout_args:
temperature: 0.7
- name: aime24
storage_type: file
path: HuggingFaceH4/aime_2024 # Path to the AIME2024 dataset
split: 'train'
repeat_times: 32
format:
prompt_key: 'problem'
response_key: 'answer'
rollout_args:
temperature: 0.7
- name: aime25
storage_type: file
path: math-ai/aime25 # Path to the AIME2025 dataset
split: 'test'
repeat_times: 32
format:
prompt_key: 'problem'
response_key: 'answer'
rollout_args:
temperature: 0.7
default_workflow_type: 'async_math_workflow'
default_reward_fn_type: 'math_boxed_reward'
trainer_input:
experience_buffer:
name: math_buffer
storage_type: queue
max_read_timeout: 7200
explorer:
eval_interval: 20
eval_on_startup: true
runner_per_model: 8
rollout_model:
engine_type: vllm_async
engine_num: 4
tensor_parallel_size: 1
seed: 42
trainer:
trainer_type: 'verl'
save_interval: 200
trainer_config:
algorithm:
rollout_correction:
bypass_mode: false
synchronizer:
sync_method: 'nccl'
sync_interval: 1
sync_timeout: 3200
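The config above resolves several paths through OmegaConf `${oc.env:VAR,default}` interpolation, so they can be supplied as environment variables before launching. The paths below are placeholders — substitute your own:

```bash
# Placeholder paths -- point these at your own checkpoints and data.
export TRINITY_CHECKPOINT_ROOT_DIR=/data/checkpoints
export TRINITY_MODEL_PATH=Qwen/Qwen2.5-7B-Instruct
export TRINITY_TASKSET_PATH=/data/dapo-math-17k-filtered
# then launch the experiment:
# trinity run examples/entropy/clipb.yaml
```

Unset variables fall back to the defaults given after the comma in the config (e.g. `./checkpoints`); `TRINITY_TASKSET_PATH` has no default and must be set.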
11 changes: 11 additions & 0 deletions examples/entropy/clipb_trainer.patch
--- a/trinity/trainer/verl_trainer.py
+++ b/trinity/trainer/verl_trainer.py
@@ -501,7 +501,8 @@ class VerlPPOTrainerWrapper(RayPPOTrainer, TrainEngineWrapper):
}
metrics.update(old_log_prob_metrics)
- old_log_prob.batch.pop("entropys")
+ # Keep entropys in batch so advantage_fn (e.g. Clip_B) can use it
+ # old_log_prob.batch.pop("entropys")
batch = batch.union(old_log_prob)
if "rollout_log_probs" in batch.batch.keys():
# TODO: we may want to add diff of probs too.
103 changes: 103 additions & 0 deletions examples/entropy/clipv.yaml
project: math_dapo
name: clipv_example
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
model:
model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
max_prompt_tokens: 1024
max_response_tokens: 7168
algorithm:
algorithm_type: grpo_verl
advantage_fn: clipv
advantage_fn_args:
mu: 8.5
repeat_times: 8
kl_loss_fn_args:
kl_coef: 0.0
cluster:
node_num: 1
gpu_per_node: 8
buffer:
total_epochs: 20
batch_size: 64
explorer_input:
taskset:
name: dapo_235
storage_type: file
path: ${oc.env:TRINITY_TASKSET_PATH} # processed DAPO-Math-17k
format:
prompt_key: 'question'
response_key: 'ground_truth'
rollout_args:
temperature: 1.0
logprobs: 20
eval_tasksets:
- name: dapo-validation-500
storage_type: file
path: '/path/to/dapo-validation' # validation samples from DAPO-Math-17k
split: 'test'
repeat_times: 32
format:
prompt_key: 'question'
response_key: 'ground_truth'
rollout_args:
temperature: 0.7
- name: amc23
storage_type: file
path: math-ai/amc23 # Path to the AMC23 dataset
split: 'test'
repeat_times: 32
format:
prompt_key: 'question'
response_key: 'answer'
rollout_args:
temperature: 0.7
- name: aime24
storage_type: file
path: HuggingFaceH4/aime_2024 # Path to the AIME2024 dataset
split: 'train'
repeat_times: 32
format:
prompt_key: 'problem'
response_key: 'answer'
rollout_args:
temperature: 0.7
- name: aime25
storage_type: file
path: math-ai/aime25 # Path to the AIME2025 dataset
split: 'test'
repeat_times: 32
format:
prompt_key: 'problem'
response_key: 'answer'
rollout_args:
temperature: 0.7
default_workflow_type: 'async_math_workflow'
default_reward_fn_type: 'math_boxed_reward'
trainer_input:
experience_buffer:
name: math_buffer
storage_type: queue
max_read_timeout: 7200
explorer:
eval_interval: 20
eval_on_startup: true
runner_per_model: 8
rollout_model:
engine_type: vllm_async
engine_num: 4
tensor_parallel_size: 1
seed: 42
trainer:
trainer_type: 'verl'
save_interval: 100
trainer_config:
actor_rollout_ref:
actor:
log_prob_fn: clipv_entropy_nec
algorithm:
rollout_correction:
bypass_mode: false
synchronizer:
sync_method: 'nccl'
sync_interval: 1
sync_timeout: 3600