Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
litellm
fire
datasets
tqdm
boto3
evalplus @ git+https://github.com/evalplus/evalplus.git@master
termcolor
python-dotenv
vllm
scikit-learn
bandit
tenacity
sandbox-fusion
rich
83 changes: 83 additions & 0 deletions script/cat_convo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

import json
import random

from datasets import load_dataset
from rich.console import Console
from rich.markup import escape
from rich.table import Table

console = Console()


def visualize_conversation(conversation):
    """Render a list of chat messages as a rich table, one row per message.

    Each message is a dict with (at least) "role" and "content"; undecodable
    bytes in the content are replaced rather than raised on.
    """
    # Per-role color accents; anything unrecognized falls back to white.
    role_styles = {"assistant": "green", "user": "magenta", "system": "yellow"}

    layout = Table(title="Conversation", show_lines=True)
    layout.add_column("Role", style="bold cyan", no_wrap=True)
    layout.add_column("Content", style="")

    for msg in conversation:
        who = msg.get("role", "unknown")
        # Round-trip through UTF-8 with replacement to sanitize bad bytes.
        body = msg.get("content", "").encode("utf-8", "replace").decode()
        tint = role_styles.get(who, "white")
        # escape() keeps literal brackets in content from being read as markup.
        layout.add_row(f"[{tint}]{who}[/{tint}]", escape(body.strip()))

    console.print(layout)


def main(
    path,
    shuffle: bool = False,
    multi: bool = False,
    prefix: str = None,
    split: str = "train",
    include_kw: str = "",
):
    """Interactively page through conversations from a JSONL file or HF dataset.

    Args:
        path: Local JSONL file of records; if the file does not exist, the
            value is treated as a Hugging Face dataset name instead.
        shuffle: Randomize display order (only meaningful for the local list).
        multi: Only show conversations with more than two non-system messages.
        prefix: If set, only show records whose ``task_id`` starts with it.
        split: Dataset split used when *path* is a HF dataset name.
        include_kw: If set, only show conversations whose concatenated message
            contents contain this substring.
    """
    try:
        with open(path, "r") as file:
            conversations = [json.loads(line) for line in file if line.strip()]
    except FileNotFoundError:
        # Fall back to interpreting the path as a Hugging Face dataset name.
        conversations = load_dataset(path, split=split)

    print(f"{len(conversations)} messages in the conversation:")

    if shuffle:
        # NOTE(review): random.shuffle only works on the local-JSONL list;
        # a datasets.Dataset would need .shuffle() instead — TODO confirm.
        random.shuffle(conversations)

    for data in conversations:
        if prefix and not data.get("task_id", "").startswith(prefix):
            continue

        # Records store their turns under either "conversation" or "messages".
        # Resolving this FIRST fixes a KeyError the old code hit on records
        # that only had "conversation" (it indexed data["messages"]
        # unconditionally in the keyword filter, even when include_kw was "").
        conversation = data.get("conversation", data.get("messages"))

        if include_kw and include_kw not in "".join(
            (m.get("content") or "") for m in (conversation or [])
        ):
            continue
        if multi and len([m for m in (conversation or []) if m["role"] != "system"]) <= 2:
            continue
        if "task_id" in data:
            console.print(f"[bold]Task ID:[/bold] {data['task_id']}")
        if conversation:
            visualize_conversation(conversation)
            console.print("\n" + "=" * 50 + "\n")
        else:
            console.print("[yellow]No conversation found in this line.[/yellow]")

        input()  # Wait for user input to continue to the next line


# Example:
# python scripts/cat_convo.py --path ./cwe2code_raw.jsonl
if __name__ == "__main__":
    # Fire turns main()'s signature into a CLI (--path, --shuffle, ...).
    import fire

    fire.Fire(main)
38 changes: 38 additions & 0 deletions script/push_model_hub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# Example: python script/push_model.py --path [model_path]
def push_model(path: str, model_name: str = None):
    """Push a local checkpoint's tokenizer and model to the Hub under purpcode/.

    Args:
        path: Local directory containing the checkpoint.
        model_name: Explicit Hub repo name; when omitted, it is inferred from
            the last path component that is not a "checkpoint-*" directory.
    """
    # rstrip("/") so a trailing slash does not produce an empty path component
    # (the old code could then infer "" as the model name).
    candidates = [f for f in path.rstrip("/").split("/")[-2:] if "checkpoint" not in f]
    assert candidates, f"Cannot infer a model name from path: {path!r}"
    model_name = model_name or candidates[-1]
    assert "/" not in model_name, "Model name should not contain slashes"
    repo_id = f"purpcode/{model_name}"
    print(f"-- Pushing `{repo_id}` to the hub")

    # tokenizer (local_files_only: never fall back to downloading from the Hub)
    AutoTokenizer.from_pretrained(
        path,
        local_files_only=True,
    ).push_to_hub(repo_id, private=True)
    print("-- Tokenizer pushed")

    # model — uploaded in bfloat16 to halve the artifact size.
    AutoModelForCausalLM.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
    ).push_to_hub(repo_id, private=True)
    print("-- Model pushed")

    print("Please check:")
    print(f"https://huggingface.co/{repo_id}")


if __name__ == "__main__":
    # Expose push_model() as a CLI: --path is required, --model_name optional.
    import fire

    fire.Fire(push_model)
49 changes: 49 additions & 0 deletions test/check_chat_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

from transformers import AutoTokenizer


def print_tokens(encoded, tokenizer):
    """Return each token id in *encoded* rendered as an "id: 'text'" string.

    Despite the name, this does not print anything — it builds the list of
    per-token strings for the caller to display.
    """
    return [
        f"{token_id}: '{tokenizer.decode([token_id])}'"
        for token_id in encoded.input_ids
    ]


def check_chat_template(model_name, messages):
    """Load *model_name*'s tokenizer and display its raw chat template, then
    the template applied to the given example *messages*."""
    bar = "=" * 60

    print(bar)
    print(f"Checking chat template for: {model_name}")
    print(bar)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"{tokenizer.chat_template}")
    print(bar)

    print("\n\n")
    print(bar)
    print("Example chat messages:")
    print(bar)
    # Render without tokenizing so the template markers stay human-readable.
    rendered = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print(rendered)
    print(bar)


def main(model_path: str):
    """Run check_chat_template on a fixed multi-turn placeholder conversation."""
    # system -> user -> assistant -> user, with matching placeholder bodies.
    turn_roles = ["system", "user", "assistant", "user"]
    example_messages = [
        {"role": role, "content": f"[...{role.upper()}_MESSAGES...]"}
        for role in turn_roles
    ]
    check_chat_template(model_path, example_messages)


if __name__ == "__main__":
    # CLI entry point: `python test/check_chat_template.py --model_path ...`
    import fire

    fire.Fire(main)
15 changes: 15 additions & 0 deletions test/test_bedrock_sonnet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

from dotenv import load_dotenv
from litellm import completion

# Pull AWS credentials / region for Bedrock from a local .env file.
load_dotenv()

# One-off smoke test: send a trivial prompt to Claude 3.7 Sonnet on Bedrock
# and print the reply, verifying credentials and model routing work.
reply = completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "What are you doing?"}],
)

print(reply.choices[0].message.content)
89 changes: 89 additions & 0 deletions test/test_hf_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
PreTrainedModel,
PreTrainedTokenizer,
)


def trim(ids):
    """Strip trailing zero (padding) ids from a 1-D tensor.

    Returns the prefix of *ids* up to and including the last non-zero element.
    Returns an empty tensor when every element is zero — the original indexed
    ``[-1]`` on an empty nonzero result and raised IndexError in that case.

    NOTE(review): assumes the pad token id is 0 — TODO confirm for the
    tokenizer in use.
    """
    non_zero_indices = torch.nonzero(ids, as_tuple=True)[0]
    if non_zero_indices.numel() == 0:
        return ids[:0]
    last_non_zero_index = non_zero_indices[-1].item()
    return ids[: last_non_zero_index + 1]


def main(model_path: str):
    """Smoke-test a local HF checkpoint by batch-generating on four chat prompts.

    Loads the model in bfloat16 on CUDA with flash-attention, renders four
    conversations (three single-turn, one multi-turn) through the chat
    template, generates greedily, and prints token ids plus decoded text for
    every prompt/output pair.

    NOTE(review): requires a CUDA device and the flash_attn package to be
    installed — TODO confirm this only runs on GPU hosts.
    """
    model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        attn_implementation="flash_attention_2",
    ).to("cuda")
    # Left padding so each sequence's generated tokens start right after the
    # prompt at a shared column, letting the slice below strip all prompts.
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, padding_side="left"
    )
    # NOTE(review): return_tensors="pt" is passed together with tokenize=False,
    # so it should have no effect — the call returns plain strings. Verify.
    prompts = tokenizer.apply_chat_template(
        [
            [
                {
                    "role": "user",
                    "content": "Simply reply 'hello' and stop generation immediately.",
                }
            ],
            [
                {
                    "role": "user",
                    "content": "Can you please write me a function to print hello world in Python.",
                }
            ],
            [
                {"role": "user", "content": "What's 1+1?"},
                {"role": "assistant", "content": "You want me to show you code?"},
                {"role": "user", "content": "Yes!"},
            ],
            [
                {
                    "role": "user",
                    "content": "Please just repeat 'Sorry, I cannot assist with that request'.",
                }
            ],
        ],
        return_tensors="pt",
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = tokenizer(
        prompts, return_tensors="pt", padding=True, return_token_type_ids=False
    ).to("cuda")
    # Greedy decoding (do_sample=False) so the run is deterministic.
    output = model.generate(
        **inputs,
        max_new_tokens=512,
        tokenizer=tokenizer,
        do_sample=False,
        use_cache=True,
    )
    # Drop the (left-padded) prompt columns; keep only newly generated tokens.
    output = output[:, inputs["input_ids"].size(-1) :]
    seqs = tokenizer.batch_decode(output, skip_special_tokens=False)

    for i in range(len(prompts)):
        # trim() strips the padding zeros before printing the raw id lists.
        input_ids = trim(inputs["input_ids"][i])
        print(f"Prompt {i}: {len(input_ids) = }")
        print(input_ids.tolist())
        print(prompts[i])
        print("+")
        output_ids = trim(output[i])
        print(f"Output {i}: {len(output_ids) = }")
        print(output_ids.tolist())
        print(seqs[i])
        print("---------------")


if __name__ == "__main__":
    # CLI entry point: `python test/test_hf_model.py --model_path ...`
    import fire

    fire.Fire(main)
55 changes: 55 additions & 0 deletions test/test_model_8k_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

from typing import List

from transformers import AutoTokenizer, PreTrainedTokenizer
from vllm import LLM, RequestOutput


def main(model_path: str):
    """Probe ~8k-token context handling: hide a single 'R' in the middle of
    a long filler block and ask the model whether it saw it."""
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, padding_side="left"
    )
    # 4096 "ZS" pairs, one 'R' needle, then 4096 "SY" pairs (~16k filler chars).
    needle_prompt = (
        "Not a programming problem but do you see any 'R' letter in the text block below?\n\n```\n"
        + ("ZS" * 4096)
        + "R"
        + ("SY" * 4096)
        + "\n```\n\nReminder of the questions, do you see any 'R' letter in the text block above?"
    )
    inputs = tokenizer.apply_chat_template(
        [
            [{"role": "user", "content": needle_prompt}],
        ],
        add_generation_prompt=True,
        tokenize=False,
    )
    llm = LLM(model=model_path, generation_config="auto", trust_remote_code=True)

    # Deterministic decode; keep special tokens visible for inspection.
    sampling_params = llm.get_default_sampling_params()
    sampling_params.temperature = 0.0
    sampling_params.max_tokens = 512
    sampling_params.skip_special_tokens = False

    outputs: List[RequestOutput] = llm.generate(inputs, sampling_params)
    # Keep the loop variable names: the f-string "=" specifiers below print
    # the expression text itself, so renaming would change the output.
    for i, output in enumerate(outputs):
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt {i}: {len(output.prompt_token_ids) = }")
        print(output.prompt_token_ids)
        print(prompt)
        print("+")
        print(f"Output {i}: {len(output.outputs[0].token_ids) = }")
        print(output.outputs[0].token_ids)
        print(generated_text)
        print("---------------")


if __name__ == "__main__":
    # CLI entry point: `python test/test_model_8k_prompt.py --model_path ...`
    import fire

    fire.Fire(main)
Loading