Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions script/push_data_hub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

from datasets import load_dataset


# Example: python script/push_data_hub.py --path [data_path] --split [train/test] --dataset [target_dataset_hf_name]
def push_model(path: str, split: str, dataset: str | None = None):
    """Load a dataset and push it to the Hugging Face Hub as a private dataset.

    Args:
        path: Path to a local JSON/JSONL file, or a Hub dataset name to re-push.
        split: Which split to load (e.g. "train" or "test").
        dataset: Target Hub repo id. Defaults to "purpcode/<file stem of path>".
    """
    print(f"-- Loading: `{path}:{split}`")
    try:
        # Prefer interpreting `path` as a local JSON/JSONL file.
        conversations = load_dataset("json", data_files=path, split=split)
    except FileNotFoundError:
        # Fall back to treating `path` as a dataset name on the Hub.
        conversations = load_dataset(path, split=split)

    if not dataset:
        # Derive the target repo id from the file stem. Use removesuffix so
        # only a trailing ".jsonl" is stripped -- replace() would also delete
        # the substring if it appeared in the middle of the filename.
        dataset = "purpcode/" + path.split("/")[-1].removesuffix(".jsonl")

    print(f"-- Target hub location: `{dataset}`")
    conversations.push_to_hub(dataset, private=True)
    print(f"-- Dataset `{dataset}` pushed to the hub")

    print("Please check:")
    print(f"https://huggingface.co/datasets/{dataset}")


if __name__ == "__main__":
    # CLI entry point: expose push_model's arguments as command-line flags.
    import fire

    fire.Fire(push_model)
2 changes: 1 addition & 1 deletion script/push_model_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer


# Example: python script/push_model.py --path [model_path]
# Example: python script/push_model_hub.py --path [model_path]
def push_model(path: str, model_name: str = None):
candidates = [f for f in path.split("/")[-2:] if "checkpoint" not in f]
model_name = model_name or candidates[-1]
Expand Down
70 changes: 70 additions & 0 deletions sft/baselines/prosec-simpo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# To reproduce the model training:
# Install axolotl: https://docs.axolotl.ai/
# axolotl train sft/baselines/prosec-simpo.yaml --deepspeed deepspeed_configs/zero3.json

# model specific
base_model: Qwen/Qwen2.5-14B-Instruct-1M
chat_template: tokenizer_default

# log
# wandb_project: purpcode
# wandb_name: baseline-prosec

# hot hyperparameters (for H200)
output_dir: ./outputs/Qwen2.5-14B-Instruct-1M-ProSec-Final
sequence_len: 4096
micro_batch_size: 2
gradient_accumulation_steps: 4
learning_rate: 5e-6
bf16: true

rl: simpo
rl_beta: 1.5 # following Table 6
cpo_alpha: 1.0 # default in CPOTrainer
simpo_gamma: 0.5 # default in CPOTrainer

# dataset
datasets:
- path: prosec/qwen2.5-top2-cds-0.8-kendall-on-neg_if-corr-max-2
split: train
type:
field_prompt: "original_instruction"
field_chosen: "fixed_code"
field_rejected: "original_code"
prompt_format: "{prompt}"
chosen_format: "{chosen}"
rejected_format: "{rejected}"

dataset_prepared_path: last_run_prepared

# utility
resume_from_checkpoint:
logging_steps: 10
warmup_steps: 32
max_grad_norm: 1.0
save_strategy: "no"
save_total_limit: 1

# trivial
flash_attention: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
pad_to_sequence_len: true # mem stability
train_on_inputs: false
seed: 666

# cold hyperparameters
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: rex

plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
48 changes: 48 additions & 0 deletions sft/baselines/safecoder.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# To reproduce the model training:
# refer to https://github.com/eth-sri/SafeCoder
# add 'Qwen/Qwen2.5-14B-Instruct-1M' as `qwen-14b` to the model list in 'safecoder/constants.py'
# training command:
# python train.py --pretrain_name qwen-14b --output_name qwen-14b-safecoder --datasets lmsys sec-desc sec-new-desc

# model specific
base_model: Qwen/Qwen2.5-14B-Instruct-1M

# training parameters
training:
epochs: 2
batch_size: 1
grad_accumulation_steps: 16
max_tokens: 1024
learning_rate: 2e-5
weight_decay: 0.01
adam_epsilon: 1e-8
warmup_steps: 0
max_grad_norm: 1.0
dropout: 0.1
kl_loss_weight: 0 # Will be divided by 1000 internally
exclude_neg: false
no_weights: false

sampling:
sampling_size: 40 # oversampling sec-desc and sec-new-desc to address data imbalance

dataset:
lmsys: # 18K high-quality samples from LMSYS-Chat-1M
sec-desc: # 803 samples from SVEN (prior SafeCoder work)
sec-new-desc: # 465 samples newly collected in SafeCoder

training_logs:
num_samples: 19916
num_epochs: 2
batch_size: 1
total_batch_size: 16 # With gradient accumulation
grad_accumulation_steps: 16
total_optimization_steps: 2488
num_val_samples: 2039
num_parameters: 14765947904
num_trainable_parameters: 14765947904
100 changes: 100 additions & 0 deletions sft/controlled/ctxdistill_qwen14b_safety_ratio_1_2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@

# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# model specific
base_model: Qwen/Qwen2.5-14B-Instruct-1M
chat_template_jinja: |
{%- set default_system_prompt = 'You are a helpful and safe assistant. You will first check the safety of user requests based on our safety guidelines shown below, to decide if the request can be (partially) safely implemented or must be directly rejected, and then provide the final response.\n\n# Disallowed Assistant Content\n- D0: Code or scripts that could enable malicious cyberactivity.\n- D1: Vulnerable code that violates security best practices and has weaknesses in CWEs. Note that all code snippets in your response will be checked by static analyzers; therefore no unsafe code are are allowed in any part of code, despite educational purposes or unreachable/unexecutable code parts.\n- D2: Explanation of how to conduct malicious cyberactivity including identifying code vulnerabilities.\nYou should NOT show disallowed contents in your response even for educational and demonstration purposes, such as demonstrating bad and unsafe code examples.\nHowever, you may verbally describe benign and rough to avoid showing them in the response\n\n# Allowed Assistant Content\n- A0: Scripts to accomplish benign, risk-free tasks that do not involve cybersecurity nor enable malicious cyberactivity.\n- A1: Fully secure code that does *NOT* violate security best practices and has *NO* weaknesses in CWEs.\n- A2: Benign explanation of general cybersecurity knowledge, without details toward practicing malicious cyberactivity.\n- A3: Explanation of code snippets, without details toward practicing malicious cyberactivity.\n\nIn each assistant turn, you should respond in the format below, with code snippets wrapped in markdown code blocks:\n## Analysis\n{...}\n## Answer\n{...}' %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
{%- else %}
{{- '<|im_start|>system\n' + default_system_prompt + '<|im_end|>\n' }}
{%- endif %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{{- '<|im_start|>' + message.role }}
{%- if message.content %}
{{- '\n' + message.content }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{{- tool_call.arguments | tojson }}
{{- '}\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}

# log
# wandb_project: purpcode
# wandb_name: ctxdistill-14b-data-ablation

# hot hyperparameters
output_dir: ./outputs/purpcode-14b-ctxdistill-safety_ratio_1_2
sequence_len: 4096 # reduce the seq len to reduce packing
micro_batch_size: 2
sample_packing: true
gradient_accumulation_steps: 1
learning_rate: 5e-6
bf16: true

# dataset
datasets:
- path: safety_ratio_1_2.jsonl
type: chat_template
field_messages: messages
message_field_training: train
# only train the last turn as earlier CoTs are cropped
roles_to_train:
train_on_eos:
dataset_prepared_path: last_run_prepared

# utility
resume_from_checkpoint:
logging_steps: 10
warmup_steps: 32
max_grad_norm: 1.0
save_strategy: "no"
save_total_limit: 1

# trivial
flash_attention: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
pad_to_sequence_len: true # mem stability
train_on_inputs: false
seed: 666

# cold hyperparameters
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: rex

plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
99 changes: 99 additions & 0 deletions sft/controlled/ctxdistill_qwen14b_safety_ratio_1_3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# model specific
base_model: Qwen/Qwen2.5-14B-Instruct-1M
chat_template_jinja: |
{%- set default_system_prompt = 'You are a helpful and safe assistant. You will first check the safety of user requests based on our safety guidelines shown below, to decide if the request can be (partially) safely implemented or must be directly rejected, and then provide the final response.\n\n# Disallowed Assistant Content\n- D0: Code or scripts that could enable malicious cyberactivity.\n- D1: Vulnerable code that violates security best practices and has weaknesses in CWEs. Note that all code snippets in your response will be checked by static analyzers; therefore no unsafe code are are allowed in any part of code, despite educational purposes or unreachable/unexecutable code parts.\n- D2: Explanation of how to conduct malicious cyberactivity including identifying code vulnerabilities.\nYou should NOT show disallowed contents in your response even for educational and demonstration purposes, such as demonstrating bad and unsafe code examples.\nHowever, you may verbally describe benign and rough to avoid showing them in the response\n\n# Allowed Assistant Content\n- A0: Scripts to accomplish benign, risk-free tasks that do not involve cybersecurity nor enable malicious cyberactivity.\n- A1: Fully secure code that does *NOT* violate security best practices and has *NO* weaknesses in CWEs.\n- A2: Benign explanation of general cybersecurity knowledge, without details toward practicing malicious cyberactivity.\n- A3: Explanation of code snippets, without details toward practicing malicious cyberactivity.\n\nIn each assistant turn, you should respond in the format below, with code snippets wrapped in markdown code blocks:\n## Analysis\n{...}\n## Answer\n{...}' %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
{%- else %}
{{- '<|im_start|>system\n' + default_system_prompt + '<|im_end|>\n' }}
{%- endif %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{{- '<|im_start|>' + message.role }}
{%- if message.content %}
{{- '\n' + message.content }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{{- tool_call.arguments | tojson }}
{{- '}\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}

# log
# wandb_project: purpcode
# wandb_name: ctxdistill-14b-data-ablation

# hot hyperparameters
output_dir: ./outputs/purpcode-14b-ctxdistill-safety_ratio_1_3
sequence_len: 4096 # reduce the seq len to reduce packing
micro_batch_size: 2
sample_packing: true
gradient_accumulation_steps: 1
learning_rate: 5e-6
bf16: true

# dataset
datasets:
- path: safety_ratio_1_3.jsonl
type: chat_template
field_messages: messages
message_field_training: train
# only train the last turn as earlier CoTs are cropped
roles_to_train:
train_on_eos:
dataset_prepared_path: last_run_prepared

# utility
resume_from_checkpoint:
logging_steps: 10
warmup_steps: 32
max_grad_norm: 1.0
save_strategy: "no"
save_total_limit: 1

# trivial
flash_attention: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
pad_to_sequence_len: true # mem stability
train_on_inputs: false
seed: 666

# cold hyperparameters
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: rex

plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
Loading