diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..420d375
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+litellm
+fire
+datasets
+tqdm
+boto3
+evalplus @ git+https://github.com/evalplus/evalplus.git@master
+termcolor
+python-dotenv
+vllm
+scikit-learn
+bandit
+tenacity
+sandbox-fusion
+rich
diff --git a/script/cat_convo.py b/script/cat_convo.py
new file mode 100644
index 0000000..cab447f
--- /dev/null
+++ b/script/cat_convo.py
@@ -0,0 +1,83 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import random
+
+from datasets import load_dataset
+from rich.console import Console
+from rich.markup import escape
+from rich.table import Table
+
+console = Console()
+
+
def visualize_conversation(conversation):
    """Render a chat transcript as a rich table, one row per message.

    Each message is expected to be a dict carrying "role" and "content";
    a missing role shows as "unknown" and missing content as empty.
    """
    # Fixed per-role colors; anything unrecognized falls back to white.
    role_styles = {"assistant": "green", "user": "magenta", "system": "yellow"}

    table = Table(title="Conversation", show_lines=True)
    table.add_column("Role", style="bold cyan", no_wrap=True)
    table.add_column("Content", style="")

    for msg in conversation:
        role = msg.get("role", "unknown")
        # Round-trip through UTF-8 with replacement to drop undecodable bytes.
        body = msg.get("content", "").encode("utf-8", "replace").decode()
        style = role_styles.get(role, "white")
        table.add_row(f"[{style}]{role}[/{style}]", escape(body.strip()))

    console.print(table)
+
+
def main(
    path,
    shuffle: bool = False,
    multi: bool = False,
    prefix: str = None,
    split: str = "train",
    include_kw: str = "",
):
    """Interactively page through conversations from a JSONL file or HF dataset.

    Args:
        path: Local JSONL path; if it does not exist, treated as a hub dataset name.
        shuffle: Randomize the display order.
        multi: Only show conversations with more than one non-system exchange.
        prefix: If set, only show rows whose task_id starts with this prefix.
        split: Dataset split used when loading from the hub.
        include_kw: Only show conversations whose concatenated message content
            contains this keyword (empty string matches everything).
    """
    try:
        with open(path, "r") as file:
            conversations = [json.loads(line) for line in file if line.strip()]
    except FileNotFoundError:
        # Not a local file: treat `path` as a dataset name on the HF hub.
        conversations = load_dataset(path, split=split)

    print(f"{len(conversations)} messages in the conversation:")

    if shuffle:
        # HF Datasets do not support in-place item assignment; copy to a
        # plain list before shuffling.
        conversations = list(conversations)
        random.shuffle(conversations)

    for data in conversations:
        if prefix and not data.get("task_id", "").startswith(prefix):
            continue

        # Rows may store the transcript under either key.
        conversation = data.get("conversation", data.get("messages"))

        # Keyword filter. The original indexed data["messages"] directly and
        # raised KeyError for rows that only carry "conversation"; it also
        # crashed on None content values.
        if include_kw and (
            not conversation
            or include_kw
            not in "".join(m.get("content") or "" for m in conversation)
        ):
            continue
        if multi and conversation and (
            len([m for m in conversation if m["role"] != "system"]) <= 2
        ):
            continue
        if "task_id" in data:
            console.print(f"[bold]Task ID:[/bold] {data['task_id']}")
        if conversation:
            visualize_conversation(conversation)
            console.print("\n" + "=" * 50 + "\n")
        else:
            console.print("[yellow]No conversation found in this line.[/yellow]")

        input()  # Wait for user input to continue to the next line
+
+
+# Example:
+# python script/cat_convo.py --path ./cwe2code_raw.jsonl
+if __name__ == "__main__":
+ from fire import Fire
+
+ Fire(main)
diff --git a/script/push_model_hub.py b/script/push_model_hub.py
new file mode 100644
index 0000000..50d70e7
--- /dev/null
+++ b/script/push_model_hub.py
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+# Example: python script/push_model_hub.py --path [model_path]
def push_model(path: str, model_name: str = None):
    """Push a local HF checkpoint (tokenizer + model) to the purpcode hub org.

    Args:
        path: Local directory of the saved model. The repo name is derived
            from the last path component that is not a "checkpoint*" dir.
        model_name: Optional explicit repo name; overrides the derivation.

    Raises:
        AssertionError: If no usable name can be derived or the name
            contains a slash.
    """
    # Strip trailing slashes first: "run/checkpoint-3/" would otherwise
    # produce an empty last component and a cryptic IndexError below.
    parts = path.rstrip("/").split("/")
    candidates = [f for f in parts[-2:] if "checkpoint" not in f]
    assert model_name or candidates, f"Cannot derive a model name from {path!r}"
    model_name = model_name or candidates[-1]
    assert "/" not in model_name, "Model name should not contain slashes"
    repo_id = f"purpcode/{model_name}"
    print(f"-- Pushing `{repo_id}` to the hub")

    # Tokenizer first: it is tiny and fails fast on auth/repo problems.
    AutoTokenizer.from_pretrained(
        path,
        local_files_only=True,
    ).push_to_hub(repo_id, private=True)
    print("-- Tokenizer pushed")

    # Weights are loaded (and therefore uploaded) in bfloat16.
    AutoModelForCausalLM.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
    ).push_to_hub(repo_id, private=True)
    print("-- Model pushed")

    print("Please check:")
    print(f"https://huggingface.co/{repo_id}")
+
+
+if __name__ == "__main__":
+ from fire import Fire
+
+ Fire(push_model)
diff --git a/test/check_chat_template.py b/test/check_chat_template.py
new file mode 100644
index 0000000..8705f7e
--- /dev/null
+++ b/test/check_chat_template.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from transformers import AutoTokenizer
+
+
def print_tokens(encoded, tokenizer):
    """Return one "id: 'decoded text'" string per token in `encoded.input_ids`.

    Note: despite the name, this builds and returns the list rather than
    printing it; callers decide how to display it.
    """
    return [f"{tid}: '{tokenizer.decode([tid])}'" for tid in encoded.input_ids]
+
+
def check_chat_template(model_name, messages):
    """Dump a model's raw chat template, then show it applied to `messages`.

    The rendered prompt includes a trailing generation prompt, matching how
    inference servers format the final assistant turn.
    """
    banner = "=" * 60

    print(banner)
    print(f"Checking chat template for: {model_name}")
    print(banner)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"{tokenizer.chat_template}")
    print(banner)

    print("\n\n")
    print(banner)
    print("Example chat messages:")
    print(banner)
    rendered = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print(rendered)
    print(banner)
+
+
def main(model_path: str):
    """Run the chat-template check against a fixed 4-turn example dialog."""
    # system -> user -> assistant -> user, with placeholder content per role.
    roles = ["system", "user", "assistant", "user"]
    example_messages = [
        {"role": role, "content": f"[...{role.upper()}_MESSAGES...]"}
        for role in roles
    ]
    check_chat_template(model_path, example_messages)
+
+
+if __name__ == "__main__":
+ from fire import Fire
+
+ Fire(main)
diff --git a/test/test_bedrock_sonnet.py b/test/test_bedrock_sonnet.py
new file mode 100644
index 0000000..d73273f
--- /dev/null
+++ b/test/test_bedrock_sonnet.py
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
from dotenv import load_dotenv
from litellm import completion

# Load AWS credentials/region for Bedrock from a local .env file.
load_dotenv()

# One-shot smoke test: a single user turn against Claude 3.7 Sonnet
# through Amazon Bedrock via litellm's unified completion API.
response = completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"content": "What are you doing?", "role": "user"}],
)

print(response.choices[0].message.content)
diff --git a/test/test_hf_model.py b/test/test_hf_model.py
new file mode 100644
index 0000000..a6f6d3d
--- /dev/null
+++ b/test/test_hf_model.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+from transformers import (
+ AutoModelForCausalLM,
+ AutoTokenizer,
+ PreTrainedModel,
+ PreTrainedTokenizer,
+)
+
+
def trim(ids):
    """Strip trailing zero tokens from a 1-D token-id tensor.

    Interior zeros are preserved; only the zero run at the end is removed.
    NOTE(review): assumes the pad token id is 0 — confirm against the
    tokenizer in use.

    Returns:
        A view of `ids` up to the last non-zero element, or an empty
        tensor when every element is zero (the original raised IndexError).
    """
    non_zero = torch.nonzero(ids, as_tuple=True)[0]
    if non_zero.numel() == 0:
        return ids[:0]
    return ids[: non_zero[-1].item() + 1]
+
+
def main(model_path: str):
    """Smoke-test a HF causal LM: batch four fixed chat prompts, greedily
    generate up to 512 new tokens on CUDA, and print raw token ids plus
    decoded text for manual inspection.

    Requires a CUDA device and a flash-attention-capable install.
    """
    model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        attn_implementation="flash_attention_2",
    ).to("cuda")
    # Left padding keeps prompts right-aligned so generation starts at the
    # same position for every row of the batch.
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, padding_side="left"
    )
    # Four probes: trivial reply, code request, multi-turn, refusal echo.
    prompts = tokenizer.apply_chat_template(
        [
            [
                {
                    "role": "user",
                    "content": "Simply reply 'hello' and stop generation immediately.",
                }
            ],
            [
                {
                    "role": "user",
                    "content": "Can you please write me a function to print hello world in Python.",
                }
            ],
            [
                {"role": "user", "content": "What's 1+1?"},
                {"role": "assistant", "content": "You want me to show you code?"},
                {"role": "user", "content": "Yes!"},
            ],
            [
                {
                    "role": "user",
                    "content": "Please just repeat 'Sorry, I cannot assist with that request'.",
                }
            ],
        ],
        return_tensors="pt",
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = tokenizer(
        prompts, return_tensors="pt", padding=True, return_token_type_ids=False
    ).to("cuda")
    # Greedy decoding (do_sample=False) keeps the output deterministic.
    output = model.generate(
        **inputs,
        max_new_tokens=512,
        tokenizer=tokenizer,
        do_sample=False,
        use_cache=True,
    )
    # Slice off the prompt prefix so `output` holds only generated tokens.
    output = output[:, inputs["input_ids"].size(-1) :]
    seqs = tokenizer.batch_decode(output, skip_special_tokens=False)

    for i in range(len(prompts)):
        # trim() drops the zero padding before printing raw ids.
        input_ids = trim(inputs["input_ids"][i])
        print(f"Prompt {i}: {len(input_ids) = }")
        print(input_ids.tolist())
        print(prompts[i])
        print("+")
        output_ids = trim(output[i])
        print(f"Output {i}: {len(output_ids) = }")
        print(output_ids.tolist())
        print(seqs[i])
        print("---------------")
+
+
+if __name__ == "__main__":
+ from fire import Fire
+
+ Fire(main)
diff --git a/test/test_model_8k_prompt.py b/test/test_model_8k_prompt.py
new file mode 100644
index 0000000..0f50e63
--- /dev/null
+++ b/test/test_model_8k_prompt.py
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List
+
+from transformers import AutoTokenizer, PreTrainedTokenizer
+from vllm import LLM, RequestOutput
+
+
def main(model_path: str):
    """Probe long-context recall: ask whether a single 'R' buried in ~16k
    characters of filler is visible, then print prompt/output token ids."""
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, padding_side="left"
    )
    # A lone 'R' sandwiched between two 8192-char runs of distractor pairs.
    needle_block = ("ZS" * 4096) + "R" + ("SY" * 4096)
    question = (
        "Not a programming problem but do you see any 'R' letter in the text block below?\n\n```\n"
        + needle_block
        + "\n```\n\nReminder of the questions, do you see any 'R' letter in the text block above?"
    )
    inputs = tokenizer.apply_chat_template(
        [[{"role": "user", "content": question}]],
        add_generation_prompt=True,
        tokenize=False,
    )
    llm = LLM(model=model_path, generation_config="auto", trust_remote_code=True)

    # Greedy, bounded decoding; keep special tokens visible for debugging.
    sampling_params = llm.get_default_sampling_params()
    sampling_params.temperature = 0.0
    sampling_params.max_tokens = 512
    sampling_params.skip_special_tokens = False

    outputs: List[RequestOutput] = llm.generate(inputs, sampling_params)
    for i, output in enumerate(outputs):
        print(f"Prompt {i}: {len(output.prompt_token_ids) = }")
        print(output.prompt_token_ids)
        print(output.prompt)
        print("+")
        print(f"Output {i}: {len(output.outputs[0].token_ids) = }")
        print(output.outputs[0].token_ids)
        print(output.outputs[0].text)
        print("---------------")
+
+
+if __name__ == "__main__":
+ from fire import Fire
+
+ Fire(main)
diff --git a/test/test_openai_api.py b/test/test_openai_api.py
new file mode 100644
index 0000000..6523987
--- /dev/null
+++ b/test/test_openai_api.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import openai
+
+
def measure_latency(base_url, messages, num_turns=5):
    """Run a multi-turn chat against an OpenAI-compatible server, timing each turn.

    Args:
        base_url: Base URL of the OpenAI-compatible endpoint.
        messages: Seed conversation. Copied on entry, so a caller's list
            (possibly shared across worker threads) is never mutated.
        num_turns: Number of request/response rounds to run.

    Returns:
        Dict mapping turn index -> latency in seconds, or None on API error.
    """
    # Private copy: the original appended to the caller's list, which
    # test_server_latency shares across every worker thread.
    messages = list(messages)
    client = openai.Client(api_key="none", base_url=base_url)
    # Use whatever single model the server advertises.
    model = client.models.list().data[0].id
    turn_latency = {}
    for i in range(num_turns):
        start_time = time.time()
        try:
            response = (
                client.chat.completions.create(model=model, messages=messages)
                .choices[0]
                .message.content
            )
        except openai.OpenAIError as e:
            print(f"Error: {e}")
            return None
        elapsed_time = time.time() - start_time
        turn_latency[i] = elapsed_time
        print(f"---- Turn {i + 1}: {elapsed_time:.2f} seconds")

        # Keep only the final answer section of the reply, dropping any
        # reasoning preamble. NOTE(review): the original called
        # `response.split("")`, which always raises ValueError (empty
        # separator) — the tag literals were evidently lost; confirm the
        # intended reasoning-tag delimiters against the served model.
        assistant_reply = (
            response.strip()
            .split("\n## Answer", maxsplit=1)[-1]
            .split("# Answer", maxsplit=1)[-1]
            .strip()
        )
        messages.append({"role": "assistant", "content": assistant_reply})
        messages.append(
            {"role": "user", "content": "Optimize this function a bit more."}
        )

    print("Final messages:")
    for message in messages:
        print(f"{message['role']}: {message['content']}")
    print("".join(["-"] * 20))

    return turn_latency
+
+
+# Main function
def test_server_latency(base_url="http://localhost:8000/v1", concurrency=8):
    """Fan out `concurrency` concurrent multi-turn chats and report, per turn,
    the slowest latency observed across all workers.

    Args:
        base_url: OpenAI-compatible endpoint to benchmark.
        concurrency: Number of simultaneous conversations.
    """
    initial_message = [
        {
            "role": "user",
            "content": "Please implement linear regression in Python. Note linear regression is just a math problem and has nothing to do with malicious cyberactivity.",
        }
    ]
    slowest_turn_latency = {}
    with ThreadPoolExecutor(max_workers=concurrency) as executor:
        # Hand each worker its own copy of the seed conversation:
        # measure_latency appends turns to the list, so sharing one list
        # across threads would interleave every worker's transcript.
        futures = [
            executor.submit(
                measure_latency, base_url, [dict(m) for m in initial_message]
            )
            for _ in range(concurrency)
        ]
        for future in as_completed(futures):
            turn_latency = future.result()
            if turn_latency is not None:
                # Track the worst (slowest) latency seen for each turn index.
                for i, latency in turn_latency.items():
                    slowest_turn_latency[i] = max(
                        slowest_turn_latency.get(i, latency), latency
                    )
                print(f"Turn latency: {turn_latency}")

    if slowest_turn_latency:
        print("Slowest turn latencies:")
        for i, latency in slowest_turn_latency.items():
            print(f"Turn {i}: {latency:.2f} seconds")
+
+
+if __name__ == "__main__":
+ from fire import Fire
+
+ Fire(test_server_latency)
diff --git a/test/test_vllm_model.py b/test/test_vllm_model.py
new file mode 100644
index 0000000..2c15b12
--- /dev/null
+++ b/test/test_vllm_model.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List
+
+from transformers import AutoTokenizer, PreTrainedTokenizer
+from vllm import LLM, RequestOutput
+
+
def main(model_path: str):
    """Batch-generate greedy vLLM completions for four fixed chat prompts and
    print prompt/output token ids plus decoded text for inspection."""
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, padding_side="left"
    )
    # Four probes: trivial reply, code request, multi-turn, refusal echo.
    conversations = [
        [
            {
                "role": "user",
                "content": "Simply reply 'hello' and stop generation immediately.",
            }
        ],
        [
            {
                "role": "user",
                "content": "Can you please write me a function to print hello world in Python.",
            }
        ],
        [
            {"role": "user", "content": "What's 1+1?"},
            {"role": "assistant", "content": "You want me to show you code?"},
            {"role": "user", "content": "Yes!"},
        ],
        [
            {
                "role": "user",
                "content": "Please just repeat 'Sorry, I cannot assist with that request'.",
            }
        ],
    ]
    inputs = tokenizer.apply_chat_template(
        conversations, add_generation_prompt=True, tokenize=False
    )
    llm = LLM(model=model_path, generation_config="auto", trust_remote_code=True)

    # Greedy, bounded decoding; keep special tokens visible for debugging.
    sampling_params = llm.get_default_sampling_params()
    sampling_params.temperature = 0.0
    sampling_params.max_tokens = 512
    sampling_params.skip_special_tokens = False

    outputs: List[RequestOutput] = llm.generate(inputs, sampling_params)
    for i, output in enumerate(outputs):
        print(f"Prompt {i}: {len(output.prompt_token_ids) = }")
        print(output.prompt_token_ids)
        print(output.prompt)
        print("+")
        print(f"Output {i}: {len(output.outputs[0].token_ids) = }")
        print(output.outputs[0].token_ids)
        print(output.outputs[0].text)
        print("---------------")
+
+
+if __name__ == "__main__":
+ from fire import Fire
+
+ Fire(main)
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..19540cd
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
def split_batch(iterable, n=1) -> list:
    """Split an indexable sequence into consecutive chunks of size `n`.

    The final chunk is shorter when len(iterable) is not a multiple of `n`.
    Every chunk is materialized as a list regardless of the input type
    (so strings yield lists of characters, tuples yield lists, etc.).
    """
    total = len(iterable)
    return [list(iterable[start : start + n]) for start in range(0, total, n)]
+
+
+from utils.litellm import *
diff --git a/utils/litellm.py b/utils/litellm.py
new file mode 100644
index 0000000..d50064a
--- /dev/null
+++ b/utils/litellm.py
@@ -0,0 +1,93 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from concurrent.futures import ThreadPoolExecutor
+from copy import deepcopy
+from typing import Callable, Dict, List
+
+from dotenv import load_dotenv
+from litellm import completion_with_retries
+from termcolor import cprint
+from tqdm import tqdm
+
+from utils import split_batch
+
+load_dotenv()
+
+
def log_costs(completions):
    """Print the total dollar cost of a batch of litellm responses.

    Stays silent when the batch is empty or any response lacks cost
    metadata (litellm reports None for models it cannot price).
    """
    costs = [c._hidden_params["response_cost"] for c in completions]
    if not costs or any(cost is None for cost in costs):
        return
    cprint(f"{len(costs)} requests costs ${sum(costs):.3f}", "yellow")
+
+
def mini_batch_completion(messages, parallel: int = 32, **kwargs):
    """Run litellm completions over `messages` in mini-batches of `parallel`.

    Each sample must carry a "messages" conversation; requests within a
    mini-batch are issued concurrently and retried with exponential backoff.
    Returns the raw litellm responses in input order.
    """
    outputs = []
    for chunk in tqdm(split_batch(messages, n=parallel)):
        with ThreadPoolExecutor(max_workers=len(chunk)) as pool:
            pending = [
                pool.submit(
                    completion_with_retries,
                    messages=sample["messages"],
                    num_retries=32,
                    retry_strategy="exponential_backoff_retry",
                    **kwargs,
                )
                for sample in chunk
            ]
            # Collect in submission order to keep outputs aligned with inputs.
            outputs.extend(job.result() for job in pending)
    return outputs
+
+
def run_batched_inference(
    batched_rows: List,  # each row includes at least "messages"
    row_transform: Callable[[Dict], Dict] = lambda x: x,
    max_new_tokens: int = None,
    temperature: float = None,
    model: str = "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    parallel: int = 12,
    **kwargs,
):
    """Run one LLM completion per row and append the reply to each row's messages.

    Args:
        batched_rows: Rows, each carrying at least a "messages" conversation.
        row_transform: Applied to every row before inference.
        max_new_tokens: Completion token cap; must be unset in thinking mode.
        temperature: Sampling temperature; defaults to 0.0 outside thinking mode.
        model: litellm model identifier.
        parallel: Mini-batch size for concurrent requests.
        **kwargs: Extra litellm parameters (e.g. "thinking").

    Returns:
        Deep-copied rows with the assistant reply (prefixed by the reasoning
        trace, when available) appended to "messages".
    """
    assert batched_rows and "messages" in batched_rows[0]
    batched_rows = [row_transform(row) for row in batched_rows]
    print("Running batched completion for LLM judge")
    parameters = {
        "model": model,
        "parallel": parallel,
        "messages": batched_rows,
        "max_tokens": max_new_tokens,
        "temperature": temperature,
        **kwargs,
    }
    if "thinking" in kwargs:
        # Thinking mode rejects explicit caps/temperatures.
        assert parameters["max_tokens"] is None
        assert parameters["temperature"] is None
    elif parameters["temperature"] is None:
        parameters["temperature"] = 0.0

    outputs = mini_batch_completion(**parameters)
    log_costs(outputs)
    outputs = [item.choices[0].message for item in outputs]

    output_rows = []
    for row, ext in zip(batched_rows, outputs):
        row = deepcopy(row)
        # Prefix the reply with the reasoning trace when one is present, or
        # when thinking was requested (then even an empty trace keeps the
        # wrapper newlines). The original relied on `and`/`or` precedence and
        # raised TypeError ("\n" + None) when thinking was on but
        # reasoning_content was missing or None.
        reasoning = getattr(ext, "reasoning_content", None)
        if reasoning or "thinking" in kwargs:
            reasoning_prefix = "\n" + (reasoning or "") + "\n\n"
        else:
            reasoning_prefix = ""
        row["messages"].append(
            {"role": "assistant", "content": reasoning_prefix + ext.content}
        )
        output_rows.append(row)
    return output_rows