Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions script/push_data_hub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

from datasets import load_dataset


# Example: python script/push_data_hub.py --path [data_path] --split [train/test] --dataset [target_dataset_hf_name]
def push_model(path: str, split: str, dataset: str | None = None):
    """Load a dataset and push it to the Hugging Face Hub as a private dataset.

    Args:
        path: Path to a local JSON/JSONL file, or a Hub dataset name to re-push.
        split: Which split to load (e.g. "train" or "test").
        dataset: Target Hub repo id. Defaults to "purpcode/<file stem of path>".
    """
    print(f"-- Loading: `{path}:{split}`")
    try:
        # Prefer interpreting `path` as a local JSON/JSONL file.
        conversations = load_dataset("json", data_files=path, split=split)
    except FileNotFoundError:
        # Fall back to treating `path` as a dataset name on the Hub.
        conversations = load_dataset(path, split=split)

    if not dataset:
        # Derive the target repo id from the file stem. Use removesuffix so
        # only a trailing ".jsonl" is stripped -- replace() would also delete
        # the substring if it appeared in the middle of the filename.
        dataset = "purpcode/" + path.split("/")[-1].removesuffix(".jsonl")

    print(f"-- Target hub location: `{dataset}`")
    conversations.push_to_hub(dataset, private=True)
    print(f"-- Dataset `{dataset}` pushed to the hub")

    print("Please check:")
    print(f"https://huggingface.co/datasets/{dataset}")


if __name__ == "__main__":
    # CLI entry point: expose push_model's arguments as command-line flags.
    import fire

    fire.Fire(push_model)
2 changes: 1 addition & 1 deletion script/push_model_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer


# Example: python script/push_model.py --path [model_path]
# Example: python script/push_model_hub.py --path [model_path]
def push_model(path: str, model_name: str = None):
candidates = [f for f in path.split("/")[-2:] if "checkpoint" not in f]
model_name = model_name or candidates[-1]
Expand Down
70 changes: 70 additions & 0 deletions sft/baselines/prosec-simpo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# To reproduce the model training:
# Install axolotl: https://docs.axolotl.ai/
# axolotl train sft/baselines/prosec-simpo.yaml --deepspeed deepspeed_configs/zero3.json

# model specific
base_model: Qwen/Qwen2.5-14B-Instruct-1M
chat_template: tokenizer_default

# log
# wandb_project: purpcode
# wandb_name: baseline-prosec

# hot hyperparameters (for H200)
output_dir: ./outputs/Qwen2.5-14B-Instruct-1M-ProSec-Final
sequence_len: 4096
micro_batch_size: 2
gradient_accumulation_steps: 4
learning_rate: 5e-6
bf16: true

rl: simpo
rl_beta: 1.5 # following Table 6
cpo_alpha: 1.0 # default in CPOTrainer
simpo_gamma: 0.5 # default in CPOTrainer

# dataset
datasets:
- path: prosec/qwen2.5-top2-cds-0.8-kendall-on-neg_if-corr-max-2
split: train
type:
field_prompt: "original_instruction"
field_chosen: "fixed_code"
field_rejected: "original_code"
prompt_format: "{prompt}"
chosen_format: "{chosen}"
rejected_format: "{rejected}"

dataset_prepared_path: last_run_prepared

# utility
resume_from_checkpoint:
logging_steps: 10
warmup_steps: 32
max_grad_norm: 1.0
save_strategy: "no"
save_total_limit: 1

# trivial
flash_attention: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
pad_to_sequence_len: true # mem stability
train_on_inputs: false
seed: 666

# cold hyperparameters
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: rex

plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
48 changes: 48 additions & 0 deletions sft/baselines/safecoder.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# To reproduce the model training:
# refer to https://github.com/eth-sri/SafeCoder
# add 'Qwen/Qwen2.5-14B-Instruct-1M' as `qwen-14b` to the model list in 'safecoder/constants.py'
# training command:
# python train.py --pretrain_name qwen-14b --output_name qwen-14b-safecoder --datasets lmsys sec-desc sec-new-desc

# model specific
base_model: Qwen/Qwen2.5-14B-Instruct-1M

# training parameters
training:
epochs: 2
batch_size: 1
grad_accumulation_steps: 16
max_tokens: 1024
learning_rate: 2e-5
weight_decay: 0.01
adam_epsilon: 1e-8
warmup_steps: 0
max_grad_norm: 1.0
dropout: 0.1
kl_loss_weight: 0 # Will be divided by 1000 internally
exclude_neg: false
no_weights: false

sampling:
sampling_size: 40 # oversampling sec-desc and sec-new-desc to address data imbalance

dataset:
lmsys: # 18K high-quality samples from LMSYS-Chat-1M
sec-desc: # 803 samples from SVEN (prior SafeCoder work)
sec-new-desc: # 465 samples newly collected in SafeCoder

training_logs:
num_samples: 19916
num_epochs: 2
batch_size: 1
total_batch_size: 16 # With gradient accumulation
grad_accumulation_steps: 16
total_optimization_steps: 2488
num_val_samples: 2039
num_parameters: 14765947904
num_trainable_parameters: 14765947904
100 changes: 100 additions & 0 deletions sft/controlled/ctxdistill_qwen14b_safety_ratio_1_2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@

# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# model specific
base_model: Qwen/Qwen2.5-14B-Instruct-1M
chat_template_jinja: |
{%- set default_system_prompt = 'You are a helpful and safe assistant. You will first check the safety of user requests based on our safety guidelines shown below, to decide if the request can be (partially) safely implemented or must be directly rejected, and then provide the final response.\n\n# Disallowed Assistant Content\n- D0: Code or scripts that could enable malicious cyberactivity.\n- D1: Vulnerable code that violates security best practices and has weaknesses in CWEs. Note that all code snippets in your response will be checked by static analyzers; therefore no unsafe code are are allowed in any part of code, despite educational purposes or unreachable/unexecutable code parts.\n- D2: Explanation of how to conduct malicious cyberactivity including identifying code vulnerabilities.\nYou should NOT show disallowed contents in your response even for educational and demonstration purposes, such as demonstrating bad and unsafe code examples.\nHowever, you may verbally describe benign and rough to avoid showing them in the response\n\n# Allowed Assistant Content\n- A0: Scripts to accomplish benign, risk-free tasks that do not involve cybersecurity nor enable malicious cyberactivity.\n- A1: Fully secure code that does *NOT* violate security best practices and has *NO* weaknesses in CWEs.\n- A2: Benign explanation of general cybersecurity knowledge, without details toward practicing malicious cyberactivity.\n- A3: Explanation of code snippets, without details toward practicing malicious cyberactivity.\n\nIn each assistant turn, you should respond in the format below, with code snippets wrapped in markdown code blocks:\n## Analysis\n{...}\n## Answer\n{...}' %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
{%- else %}
{{- '<|im_start|>system\n' + default_system_prompt + '<|im_end|>\n' }}
{%- endif %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{{- '<|im_start|>' + message.role }}
{%- if message.content %}
{{- '\n' + message.content }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{{- tool_call.arguments | tojson }}
{{- '}\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}

# log
# wandb_project: purpcode
# wandb_name: ctxdistill-14b-data-ablation

# hot hyperparameters
output_dir: ./outputs/purpcode-14b-ctxdistill-safety_ratio_1_2
sequence_len: 4096 # reduce the seq len to reduce packing
micro_batch_size: 2
sample_packing: true
gradient_accumulation_steps: 1
learning_rate: 5e-6
bf16: true

# dataset
datasets:
- path: safety_ratio_1_2.jsonl
type: chat_template
field_messages: messages
message_field_training: train
# only train the last turn as earlier CoTs are cropped
roles_to_train:
train_on_eos:
dataset_prepared_path: last_run_prepared

# utility
resume_from_checkpoint:
logging_steps: 10
warmup_steps: 32
max_grad_norm: 1.0
save_strategy: "no"
save_total_limit: 1

# trivial
flash_attention: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
pad_to_sequence_len: true # mem stability
train_on_inputs: false
seed: 666

# cold hyperparameters
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: rex

plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
99 changes: 99 additions & 0 deletions sft/controlled/ctxdistill_qwen14b_safety_ratio_1_3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

# model specific
base_model: Qwen/Qwen2.5-14B-Instruct-1M
chat_template_jinja: |
{%- set default_system_prompt = 'You are a helpful and safe assistant. You will first check the safety of user requests based on our safety guidelines shown below, to decide if the request can be (partially) safely implemented or must be directly rejected, and then provide the final response.\n\n# Disallowed Assistant Content\n- D0: Code or scripts that could enable malicious cyberactivity.\n- D1: Vulnerable code that violates security best practices and has weaknesses in CWEs. Note that all code snippets in your response will be checked by static analyzers; therefore no unsafe code are are allowed in any part of code, despite educational purposes or unreachable/unexecutable code parts.\n- D2: Explanation of how to conduct malicious cyberactivity including identifying code vulnerabilities.\nYou should NOT show disallowed contents in your response even for educational and demonstration purposes, such as demonstrating bad and unsafe code examples.\nHowever, you may verbally describe benign and rough to avoid showing them in the response\n\n# Allowed Assistant Content\n- A0: Scripts to accomplish benign, risk-free tasks that do not involve cybersecurity nor enable malicious cyberactivity.\n- A1: Fully secure code that does *NOT* violate security best practices and has *NO* weaknesses in CWEs.\n- A2: Benign explanation of general cybersecurity knowledge, without details toward practicing malicious cyberactivity.\n- A3: Explanation of code snippets, without details toward practicing malicious cyberactivity.\n\nIn each assistant turn, you should respond in the format below, with code snippets wrapped in markdown code blocks:\n## Analysis\n{...}\n## Answer\n{...}' %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
{%- else %}
{{- '<|im_start|>system\n' + default_system_prompt + '<|im_end|>\n' }}
{%- endif %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{{- '<|im_start|>' + message.role }}
{%- if message.content %}
{{- '\n' + message.content }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{{- tool_call.arguments | tojson }}
{{- '}\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}

# log
# wandb_project: purpcode
# wandb_name: ctxdistill-14b-data-ablation

# hot hyperparameters
output_dir: ./outputs/purpcode-14b-ctxdistill-safety_ratio_1_3
sequence_len: 4096 # reduce the seq len to reduce packing
micro_batch_size: 2
sample_packing: true
gradient_accumulation_steps: 1
learning_rate: 5e-6
bf16: true

# dataset
datasets:
- path: safety_ratio_1_3.jsonl
type: chat_template
field_messages: messages
message_field_training: train
# only train the last turn as earlier CoTs are cropped
roles_to_train:
train_on_eos:
dataset_prepared_path: last_run_prepared

# utility
resume_from_checkpoint:
logging_steps: 10
warmup_steps: 32
max_grad_norm: 1.0
save_strategy: "no"
save_total_limit: 1

# trivial
flash_attention: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
pad_to_sequence_len: true # mem stability
train_on_inputs: false
seed: 666

# cold hyperparameters
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: rex

plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
Loading