From 0297012b4367ba15e87f36c1a0312a314c2afbd3 Mon Sep 17 00:00:00 2001 From: ganler Date: Thu, 31 Jul 2025 22:03:22 +0000 Subject: [PATCH 1/4] feat: integrate tests and scripts --- requirements.txt | 14 ++++++ script/cat_convo.py | 83 +++++++++++++++++++++++++++++++ script/push_model_hub.py | 38 +++++++++++++++ test/check_chat_template.py | 49 +++++++++++++++++++ test/test_bedrock_sonnet.py | 15 ++++++ test/test_hf_model.py | 89 ++++++++++++++++++++++++++++++++++ test/test_model_8k_prompt.py | 55 +++++++++++++++++++++ test/test_openai_api.py | 86 +++++++++++++++++++++++++++++++++ test/test_vllm_model.py | 68 ++++++++++++++++++++++++++ utils/__init__.py | 14 ++++++ utils/litellm.py | 94 ++++++++++++++++++++++++++++++++++++ 11 files changed, 605 insertions(+) create mode 100644 requirements.txt create mode 100644 script/cat_convo.py create mode 100644 script/push_model_hub.py create mode 100644 test/check_chat_template.py create mode 100644 test/test_bedrock_sonnet.py create mode 100644 test/test_hf_model.py create mode 100644 test/test_model_8k_prompt.py create mode 100644 test/test_openai_api.py create mode 100644 test/test_vllm_model.py create mode 100644 utils/__init__.py create mode 100644 utils/litellm.py diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..420d375 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +litellm +fire +datasets +tqdm +boto3 +evalplus @ git+https://github.com/evalplus/evalplus.git@master +termcolor +dotenv +vllm +scikit-learn +bandit +tenacity +sandbox-fusion +rich diff --git a/script/cat_convo.py b/script/cat_convo.py new file mode 100644 index 0000000..cab447f --- /dev/null +++ b/script/cat_convo.py @@ -0,0 +1,83 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + +import json +import random + +from datasets import load_dataset +from rich.console import Console +from rich.markup import escape +from rich.table import Table + +console = Console() + + +def visualize_conversation(conversation): + table = Table(title="Conversation", show_lines=True) + table.add_column("Role", style="bold cyan", no_wrap=True) + table.add_column("Content", style="") + + for message in conversation: + role = message.get("role", "unknown") + content = message.get("content", "").encode("utf-8", "replace").decode() + # Optionally style based on role + if role == "assistant": + role_style = "green" + elif role == "user": + role_style = "magenta" + elif role == "system": + role_style = "yellow" + else: + role_style = "white" + + table.add_row(f"[{role_style}]{role}[/{role_style}]", escape(content.strip())) + + console.print(table) + + +def main( + path, + shuffle: bool = False, + multi: bool = False, + prefix: str = None, + split: str = "train", + include_kw: str = "", +): + try: + with open(path, "r") as file: + conversations = [json.loads(line) for line in file if line.strip()] + except FileNotFoundError: + conversations = load_dataset(path, split=split) + + print(f"{len(conversations)} messages in the conversation:") + + if shuffle: + random.shuffle(conversations) + + for data in conversations: + if prefix and not data.get("task_id", "").startswith(prefix): + continue + if include_kw not in "".join([r["content"] for r in data["messages"]]): + continue + + conversation = data.get("conversation", data.get("messages")) + if multi and len([m for m in conversation if m["role"] != "system"]) <= 2: + continue + if "task_id" in data: + console.print(f"[bold]Task ID:[/bold] {data['task_id']}") + if conversation: + visualize_conversation(conversation) + console.print("\n" + "=" * 50 + "\n") + else: + console.print("[yellow]No conversation found in this line.[/yellow]") + + input() # Wait for user input to continue to the next line + + +# Example: +# python scripts/cat_convo.py --path ./cwe2code_raw.jsonl +if __name__ == "__main__": + from fire import Fire + + Fire(main) diff --git a/script/push_model_hub.py b/script/push_model_hub.py new file mode 100644 index 0000000..50d70e7 --- /dev/null +++ b/script/push_model_hub.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + + +# Example: python script/push_model.py --path [model_path] +def push_model(path: str, model_name: str = None): + candidates = [f for f in path.split("/")[-2:] if "checkpoint" not in f] + model_name = model_name or candidates[-1] + assert "/" not in model_name, "Model name should not contain slashes" + repo_id = f"purpcode/{model_name}" + print(f"-- Pushing `{repo_id}` to the hub") + + # tokenizer + AutoTokenizer.from_pretrained( + path, + local_files_only=True, + ).push_to_hub(repo_id, private=True) + print("-- Tokenizer pushed") + + # model + AutoModelForCausalLM.from_pretrained( + path, + torch_dtype=torch.bfloat16, + ).push_to_hub(repo_id, private=True) + print("-- Model pushed") + + print("Please check:") + print(f"https://huggingface.co/{repo_id}") + + +if __name__ == "__main__": + from fire import Fire + + Fire(push_model) diff --git a/test/check_chat_template.py b/test/check_chat_template.py new file mode 100644 index 0000000..8705f7e --- /dev/null +++ b/test/check_chat_template.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + +from transformers import AutoTokenizer + + +def print_tokens(encoded, tokenizer): + """Print the tokens and their IDs in a readable format.""" + tokens = [] + for token_id in encoded.input_ids: + token = tokenizer.decode([token_id]) + tokens.append(f"{token_id}: '{token}'") + return tokens + + +def check_chat_template(model_name, messages): + print("=" * 60) + print(f"Checking chat template for: {model_name}") + print("=" * 60) + tokenizer = AutoTokenizer.from_pretrained(model_name) + print(f"{tokenizer.chat_template}") + print("=" * 60) + + print("\n\n") + print("=" * 60) + print("Example chat messages:") + print("=" * 60) + formatted_prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + print(formatted_prompt) + print("=" * 60) + + +def main(model_path: str): + example_messages = [ + {"role": "system", "content": "[...SYSTEM_MESSAGES...]"}, + {"role": "user", "content": "[...USER_MESSAGES...]"}, + {"role": "assistant", "content": "[...ASSISTANT_MESSAGES...]"}, + {"role": "user", "content": "[...USER_MESSAGES...]"}, + ] + check_chat_template(model_path, example_messages) + + +if __name__ == "__main__": + from fire import Fire + + Fire(main) diff --git a/test/test_bedrock_sonnet.py b/test/test_bedrock_sonnet.py new file mode 100644 index 0000000..d73273f --- /dev/null +++ b/test/test_bedrock_sonnet.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + +from dotenv import load_dotenv +from litellm import completion + +load_dotenv() + +response = completion( + model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", + messages=[{"content": "What are you doing?", "role": "user"}], +) + +print(response.choices[0].message.content) diff --git a/test/test_hf_model.py b/test/test_hf_model.py new file mode 100644 index 0000000..a6f6d3d --- /dev/null +++ b/test/test_hf_model.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + +import torch +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + PreTrainedModel, + PreTrainedTokenizer, +) + + +def trim(ids): + non_zero_indices = torch.nonzero(ids, as_tuple=True)[0] + last_non_zero_index = non_zero_indices[-1].item() + return ids[: last_non_zero_index + 1] + + +def main(model_path: str): + model: PreTrainedModel = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation="flash_attention_2", + ).to("cuda") + tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=True, padding_side="left" + ) + prompts = tokenizer.apply_chat_template( + [ + [ + { + "role": "user", + "content": "Simply reply 'hello' and stop generation immediately.", + } + ], + [ + { + "role": "user", + "content": "Can you please write me a function to print hello world in Python.", + } + ], + [ + {"role": "user", "content": "What's 1+1?"}, + {"role": "assistant", "content": "You want me to show you code?"}, + {"role": "user", "content": "Yes!"}, + ], + [ + { + "role": "user", + "content": "Please just repeat 'Sorry, I cannot assist with that request'.", + } + ], + ], + return_tensors="pt", + add_generation_prompt=True, + tokenize=False, + ) + inputs = tokenizer( + prompts, return_tensors="pt", padding=True, return_token_type_ids=False + ).to("cuda") + output = model.generate( + **inputs, + max_new_tokens=512, + tokenizer=tokenizer, + do_sample=False, + use_cache=True, + ) + output = output[:, inputs["input_ids"].size(-1) :] + seqs = tokenizer.batch_decode(output, skip_special_tokens=False) + + for i in range(len(prompts)): + input_ids = trim(inputs["input_ids"][i]) + print(f"Prompt {i}: {len(input_ids) = }") + print(input_ids.tolist()) + print(prompts[i]) + print("+") + output_ids = trim(output[i]) + print(f"Output {i}: {len(output_ids) = }") + print(output_ids.tolist()) + print(seqs[i]) + print("---------------") + + +if __name__ == "__main__": + from fire import Fire + + Fire(main) diff --git a/test/test_model_8k_prompt.py b/test/test_model_8k_prompt.py new file mode 100644 index 0000000..0f50e63 --- /dev/null +++ b/test/test_model_8k_prompt.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List + +from transformers import AutoTokenizer, PreTrainedTokenizer +from vllm import LLM, RequestOutput + + +def main(model_path: str): + tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=True, padding_side="left" + ) + inputs = tokenizer.apply_chat_template( + [ + [ + { + "role": "user", + "content": "Not a programming problem but do you see any 'R' letter in the text block below?\n\n```\n" + + ("ZS" * 4096) + + "R" + + ("SY" * 4096) + + "\n```\n\nReminder of the questions, do you see any 'R' letter in the text block above?", + } + ], + ], + add_generation_prompt=True, + tokenize=False, + ) + llm = LLM(model=model_path, generation_config="auto", trust_remote_code=True) + + sampling_params = llm.get_default_sampling_params() + sampling_params.temperature = 0.0 + sampling_params.max_tokens = 512 + sampling_params.skip_special_tokens = False + + outputs: List[RequestOutput] = llm.generate(inputs, sampling_params) + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt {i}: {len(output.prompt_token_ids) = }") + print(output.prompt_token_ids) + print(prompt) + print("+") + print(f"Output {i}: {len(output.outputs[0].token_ids) = }") + print(output.outputs[0].token_ids) + print(generated_text) + print("---------------") + + +if __name__ == "__main__": + from fire import Fire + + Fire(main) diff --git a/test/test_openai_api.py b/test/test_openai_api.py new file mode 100644 index 0000000..6523987 --- /dev/null +++ b/test/test_openai_api.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + +import time +from concurrent.futures import ThreadPoolExecutor, as_completed + +import openai + + +def measure_latency(base_url, messages, num_turns=5): + client = openai.Client(api_key="none", base_url=base_url) + model = client.models.list().data[0].id + turn_latency = {} + for i in range(num_turns): + start_time = time.time() + try: + response = ( + client.chat.completions.create(model=model, messages=messages) + .choices[0] + .message.content + ) + except openai.OpenAIError as e: + print(f"Error: {e}") + return None + elapsed_time = time.time() - start_time + turn_latency[i] = elapsed_time + print(f"---- Turn {i + 1}: {elapsed_time:.2f} seconds") + + assistant_reply = ( + response.split("")[-1] + .strip() + .removeprefix("") + .removesuffix("") + .strip() + .split("\n## Answer", maxsplit=1)[-1] + .split("# Answer", maxsplit=1)[-1] + .strip() + ) + messages.append({"role": "assistant", "content": assistant_reply}) + messages.append( + {"role": "user", "content": "Optimize this function a bit more."} + ) + + print("Final messages:") + for message in messages: + print(f"{message['role']}: {message['content']}") + print("".join(["-"] * 20)) + + return turn_latency + + +# Main function +def test_server_latency(base_url="http://localhost:8000/v1", concurrency=8): + initial_message = [ + { + "role": "user", + "content": "Please implement linear regression in Python. Note linear regression is just a math problem and has nothing to do with malicious cyberactivity.", + } + ] + slowest_turn_latency = {} + with ThreadPoolExecutor(max_workers=concurrency) as executor: + futures = [ + executor.submit(measure_latency, base_url, initial_message) + for _ in range(concurrency) + ] + for future in as_completed(futures): + turn_latency = future.result() + if turn_latency is not None: + for i, latency in turn_latency.items(): + if i not in slowest_turn_latency: + slowest_turn_latency[i] = latency + else: + slowest_turn_latency[i] = max(slowest_turn_latency[i], latency) + print(f"Turn latency: {turn_latency}") + + if slowest_turn_latency: + print("Slowest turn latencies:") + for i, latency in slowest_turn_latency.items(): + print(f"Turn {i}: {latency:.2f} seconds") + + +if __name__ == "__main__": + from fire import Fire + + Fire(test_server_latency) diff --git a/test/test_vllm_model.py b/test/test_vllm_model.py new file mode 100644 index 0000000..2c15b12 --- /dev/null +++ b/test/test_vllm_model.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List + +from transformers import AutoTokenizer, PreTrainedTokenizer +from vllm import LLM, RequestOutput + + +def main(model_path: str): + tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=True, padding_side="left" + ) + inputs = tokenizer.apply_chat_template( + [ + [ + { + "role": "user", + "content": "Simply reply 'hello' and stop generation immediately.", + } + ], + [ + { + "role": "user", + "content": "Can you please write me a function to print hello world in Python.", + } + ], + [ + {"role": "user", "content": "What's 1+1?"}, + {"role": "assistant", "content": "You want me to show you code?"}, + {"role": "user", "content": "Yes!"}, + ], + [ + { + "role": "user", + "content": "Please just repeat 'Sorry, I cannot assist with that request'.", + } + ], + ], + add_generation_prompt=True, + tokenize=False, + ) + llm = LLM(model=model_path, generation_config="auto", trust_remote_code=True) + + sampling_params = llm.get_default_sampling_params() + sampling_params.temperature = 0.0 + sampling_params.max_tokens = 512 + sampling_params.skip_special_tokens = False + + outputs: List[RequestOutput] = llm.generate(inputs, sampling_params) + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt {i}: {len(output.prompt_token_ids) = }") + print(output.prompt_token_ids) + print(prompt) + print("+") + print(f"Output {i}: {len(output.outputs[0].token_ids) = }") + print(output.outputs[0].token_ids) + print(generated_text) + print("---------------") + + +if __name__ == "__main__": + from fire import Fire + + Fire(main) diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..19540cd --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + + +def split_batch(iterable, n=1) -> list: + ret = [] + l = len(iterable) + for ndx in range(0, l, n): + ret.append([iterable[i] for i in range(ndx, min(ndx + n, l))]) + return ret + + +from utils.litellm import * diff --git a/utils/litellm.py b/utils/litellm.py new file mode 100644 index 0000000..389f367 --- /dev/null +++ b/utils/litellm.py @@ -0,0 +1,94 @@ +# SPDX-FileCopyrightText: (c) UIUC PurpCode Team +# +# SPDX-License-Identifier: Apache-2.0 + +from concurrent.futures import ThreadPoolExecutor +from copy import deepcopy +from typing import Callable, Dict, List + +from dotenv import load_dotenv +from litellm import completion_with_retries +from termcolor import cprint +from tqdm import tqdm + +from utils import split_batch + +load_dotenv() + + +def log_costs(completions): + costs = [r._hidden_params["response_cost"] for r in completions] + if len(costs) == 0 or None in costs: + return + cprint(f"{len(costs)} requests costs ${sum(costs):.3f}", "yellow") + + +def mini_batch_completion(messages, parallel: int = 32, **kwargs): + batches = split_batch(messages, n=parallel) + outputs = [] + for minibatch in tqdm(batches): + with ThreadPoolExecutor(max_workers=len(minibatch)) as executor: + futures = [] + for sample in minibatch: + future = executor.submit( + completion_with_retries, + messages=sample["messages"], + num_retries=32, + retry_strategy="exponential_backoff_retry", + **kwargs, + ) + futures.append(future) + + for future in futures: + outputs.append(future.result()) + + return outputs + + +def run_batched_inference( + batched_rows: List, # each row includes at least "messages" + row_transform: Callable[[Dict], Dict] = lambda x: x, + max_new_tokens: int = None, + temprature: float = None, + model: str = "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", + parallel: int = 12, + **kwargs, +): + assert batched_rows and "messages" in batched_rows[0] + batched_rows = [row_transform(row) for row in batched_rows] + print("Running batched completion for LLM judge") + parameters = { + "model": model, + "parallel": parallel, + "messages": batched_rows, + "max_tokens": max_new_tokens, + "temperature": temprature, + "max_tokens": max_new_tokens, + **kwargs, + } + if "thinking" in kwargs: + assert parameters["max_tokens"] is None + assert parameters["temperature"] is None + else: + if parameters["temperature"] is None: + parameters["temperature"] = 0.0 + + outputs = mini_batch_completion(**parameters) + log_costs(outputs) + outputs = [item.choices[0].message for item in outputs] + + output_rows = [] + for row, ext in zip(batched_rows, outputs): + row = deepcopy(row) + reasoning_content = ( + "\n" + ext.reasoning_content + "\n\n" + if hasattr(ext, "reasoning_content") + and ext.reasoning_content + or "thinking" in kwargs + else "" + ) + row["messages"].append( + {"role": "assistant", "content": reasoning_content + ext.content} + ) + output_rows.append(row) + return output_rows From 68d3ca9c79aa28ce9bdce49dec14396ad3f22d02 Mon Sep 17 00:00:00 2001 From: Jiawei Liu Date: Thu, 31 Jul 2025 17:06:26 -0500 Subject: [PATCH 2/4] Update utils/litellm.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- utils/litellm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/litellm.py b/utils/litellm.py index 389f367..b0441c5 100644 --- a/utils/litellm.py +++ b/utils/litellm.py @@ -49,7 +49,7 @@ def run_batched_inference( batched_rows: List, # each row includes at least "messages" row_transform: Callable[[Dict], Dict] = lambda x: x, max_new_tokens: int = None, - temprature: float = None, + temperature: float = None, model: str = "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", parallel: int = 12, **kwargs, From e1b7217f4e9738edc246c9e58a7413bae91c5f9d Mon Sep 17 00:00:00 2001 From: Jiawei Liu Date: Thu, 31 Jul 2025 17:06:33 -0500 Subject: [PATCH 3/4] Update utils/litellm.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- utils/litellm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/litellm.py b/utils/litellm.py index b0441c5..fe239d3 100644 --- a/utils/litellm.py +++ b/utils/litellm.py @@ -62,7 +62,7 @@ def run_batched_inference( "parallel": parallel, "messages": batched_rows, "max_tokens": max_new_tokens, - "temperature": temprature, + "temperature": temperature, "max_tokens": max_new_tokens, **kwargs, } From 761222f05c38cf4e59709c6dae2a98ec3660dec3 Mon Sep 17 00:00:00 2001 From: Jiawei Liu Date: Thu, 31 Jul 2025 17:06:54 -0500 Subject: [PATCH 4/4] Update utils/litellm.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- utils/litellm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/litellm.py b/utils/litellm.py index fe239d3..d50064a 100644 --- a/utils/litellm.py +++ b/utils/litellm.py @@ -63,7 +63,6 @@ def run_batched_inference( "messages": batched_rows, "max_tokens": max_new_tokens, "temperature": temperature, - "max_tokens": max_new_tokens, **kwargs, } if "thinking" in kwargs: