Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
litellm
fire
datasets
tqdm
boto3
evalplus @ git+https://github.com/evalplus/evalplus.git@master
termcolor
python-dotenv
vllm
scikit-learn
bandit
tenacity
sandbox-fusion
rich
83 changes: 83 additions & 0 deletions script/cat_convo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

import json
import random

from datasets import load_dataset
from rich.console import Console
from rich.markup import escape
from rich.table import Table

console = Console()


def visualize_conversation(conversation):
    """Render a list of chat messages as a rich table, one row per message.

    Each message is a dict with (at least) "role" and "content"; undecodable
    bytes in the content are replaced rather than raised on.
    """
    # Per-role color accents; anything unrecognized falls back to white.
    role_styles = {"assistant": "green", "user": "magenta", "system": "yellow"}

    layout = Table(title="Conversation", show_lines=True)
    layout.add_column("Role", style="bold cyan", no_wrap=True)
    layout.add_column("Content", style="")

    for msg in conversation:
        who = msg.get("role", "unknown")
        # Round-trip through UTF-8 with replacement to sanitize bad bytes.
        body = msg.get("content", "").encode("utf-8", "replace").decode()
        tint = role_styles.get(who, "white")
        # escape() keeps literal brackets in content from being read as markup.
        layout.add_row(f"[{tint}]{who}[/{tint}]", escape(body.strip()))

    console.print(layout)


def main(
    path,
    shuffle: bool = False,
    multi: bool = False,
    prefix: str = None,
    split: str = "train",
    include_kw: str = "",
):
    """Interactively page through conversations from a JSONL file or HF dataset.

    Args:
        path: Local JSONL file of records; if the file does not exist, the
            value is treated as a Hugging Face dataset name instead.
        shuffle: Randomize display order (only meaningful for the local list).
        multi: Only show conversations with more than two non-system messages.
        prefix: If set, only show records whose ``task_id`` starts with it.
        split: Dataset split used when *path* is a HF dataset name.
        include_kw: If set, only show conversations whose concatenated message
            contents contain this substring.
    """
    try:
        with open(path, "r") as file:
            conversations = [json.loads(line) for line in file if line.strip()]
    except FileNotFoundError:
        # Fall back to interpreting the path as a Hugging Face dataset name.
        conversations = load_dataset(path, split=split)

    print(f"{len(conversations)} messages in the conversation:")

    if shuffle:
        # NOTE(review): random.shuffle only works on the local-JSONL list;
        # a datasets.Dataset would need .shuffle() instead — TODO confirm.
        random.shuffle(conversations)

    for data in conversations:
        if prefix and not data.get("task_id", "").startswith(prefix):
            continue

        # Records store their turns under either "conversation" or "messages".
        # Resolving this FIRST fixes a KeyError the old code hit on records
        # that only had "conversation" (it indexed data["messages"]
        # unconditionally in the keyword filter, even when include_kw was "").
        conversation = data.get("conversation", data.get("messages"))

        if include_kw and include_kw not in "".join(
            (m.get("content") or "") for m in (conversation or [])
        ):
            continue
        if multi and len([m for m in (conversation or []) if m["role"] != "system"]) <= 2:
            continue
        if "task_id" in data:
            console.print(f"[bold]Task ID:[/bold] {data['task_id']}")
        if conversation:
            visualize_conversation(conversation)
            console.print("\n" + "=" * 50 + "\n")
        else:
            console.print("[yellow]No conversation found in this line.[/yellow]")

        input()  # Wait for user input to continue to the next line


# Example:
# python scripts/cat_convo.py --path ./cwe2code_raw.jsonl
if __name__ == "__main__":
    # Fire turns main()'s signature into a CLI (--path, --shuffle, ...).
    import fire

    fire.Fire(main)
38 changes: 38 additions & 0 deletions script/push_model_hub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# Example: python script/push_model.py --path [model_path]
def push_model(path: str, model_name: str = None):
    """Push a local checkpoint's tokenizer and model to the Hub under purpcode/.

    Args:
        path: Local directory containing the checkpoint.
        model_name: Explicit Hub repo name; when omitted, it is inferred from
            the last path component that is not a "checkpoint-*" directory.
    """
    # rstrip("/") so a trailing slash does not produce an empty path component
    # (the old code could then infer "" as the model name).
    candidates = [f for f in path.rstrip("/").split("/")[-2:] if "checkpoint" not in f]
    assert candidates, f"Cannot infer a model name from path: {path!r}"
    model_name = model_name or candidates[-1]
    assert "/" not in model_name, "Model name should not contain slashes"
    repo_id = f"purpcode/{model_name}"
    print(f"-- Pushing `{repo_id}` to the hub")

    # tokenizer (local_files_only: never fall back to downloading from the Hub)
    AutoTokenizer.from_pretrained(
        path,
        local_files_only=True,
    ).push_to_hub(repo_id, private=True)
    print("-- Tokenizer pushed")

    # model — uploaded in bfloat16 to halve the artifact size.
    AutoModelForCausalLM.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
    ).push_to_hub(repo_id, private=True)
    print("-- Model pushed")

    print("Please check:")
    print(f"https://huggingface.co/{repo_id}")


if __name__ == "__main__":
    # Expose push_model() as a CLI: --path is required, --model_name optional.
    import fire

    fire.Fire(push_model)
49 changes: 49 additions & 0 deletions test/check_chat_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

from transformers import AutoTokenizer


def print_tokens(encoded, tokenizer):
    """Return each token id in *encoded* rendered as an "id: 'text'" string.

    Despite the name, this does not print anything — it builds the list of
    per-token strings for the caller to display.
    """
    return [
        f"{token_id}: '{tokenizer.decode([token_id])}'"
        for token_id in encoded.input_ids
    ]


def check_chat_template(model_name, messages):
    """Load *model_name*'s tokenizer and display its raw chat template, then
    the template applied to the given example *messages*."""
    bar = "=" * 60

    print(bar)
    print(f"Checking chat template for: {model_name}")
    print(bar)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"{tokenizer.chat_template}")
    print(bar)

    print("\n\n")
    print(bar)
    print("Example chat messages:")
    print(bar)
    # Render without tokenizing so the template markers stay human-readable.
    rendered = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print(rendered)
    print(bar)


def main(model_path: str):
    """Run check_chat_template on a fixed multi-turn placeholder conversation."""
    # system -> user -> assistant -> user, with matching placeholder bodies.
    turn_roles = ["system", "user", "assistant", "user"]
    example_messages = [
        {"role": role, "content": f"[...{role.upper()}_MESSAGES...]"}
        for role in turn_roles
    ]
    check_chat_template(model_path, example_messages)


if __name__ == "__main__":
    # CLI entry point: `python test/check_chat_template.py --model_path ...`
    import fire

    fire.Fire(main)
15 changes: 15 additions & 0 deletions test/test_bedrock_sonnet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

from dotenv import load_dotenv
from litellm import completion

# Pull AWS credentials / region for Bedrock from a local .env file.
load_dotenv()

# One-off smoke test: send a trivial prompt to Claude 3.7 Sonnet on Bedrock
# and print the reply, verifying credentials and model routing work.
reply = completion(
    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=[{"role": "user", "content": "What are you doing?"}],
)

print(reply.choices[0].message.content)
89 changes: 89 additions & 0 deletions test/test_hf_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
PreTrainedModel,
PreTrainedTokenizer,
)


def trim(ids):
    """Strip trailing zero (padding) ids from a 1-D tensor.

    Returns the prefix of *ids* up to and including the last non-zero element.
    Returns an empty tensor when every element is zero — the original indexed
    ``[-1]`` on an empty nonzero result and raised IndexError in that case.

    NOTE(review): assumes the pad token id is 0 — TODO confirm for the
    tokenizer in use.
    """
    non_zero_indices = torch.nonzero(ids, as_tuple=True)[0]
    if non_zero_indices.numel() == 0:
        return ids[:0]
    last_non_zero_index = non_zero_indices[-1].item()
    return ids[: last_non_zero_index + 1]


def main(model_path: str):
    """Smoke-test a local HF checkpoint by batch-generating on four chat prompts.

    Loads the model in bfloat16 on CUDA with flash-attention, renders four
    conversations (three single-turn, one multi-turn) through the chat
    template, generates greedily, and prints token ids plus decoded text for
    every prompt/output pair.

    NOTE(review): requires a CUDA device and the flash_attn package to be
    installed — TODO confirm this only runs on GPU hosts.
    """
    model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        attn_implementation="flash_attention_2",
    ).to("cuda")
    # Left padding so each sequence's generated tokens start right after the
    # prompt at a shared column, letting the slice below strip all prompts.
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, padding_side="left"
    )
    # NOTE(review): return_tensors="pt" is passed together with tokenize=False,
    # so it should have no effect — the call returns plain strings. Verify.
    prompts = tokenizer.apply_chat_template(
        [
            [
                {
                    "role": "user",
                    "content": "Simply reply 'hello' and stop generation immediately.",
                }
            ],
            [
                {
                    "role": "user",
                    "content": "Can you please write me a function to print hello world in Python.",
                }
            ],
            [
                {"role": "user", "content": "What's 1+1?"},
                {"role": "assistant", "content": "You want me to show you code?"},
                {"role": "user", "content": "Yes!"},
            ],
            [
                {
                    "role": "user",
                    "content": "Please just repeat 'Sorry, I cannot assist with that request'.",
                }
            ],
        ],
        return_tensors="pt",
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = tokenizer(
        prompts, return_tensors="pt", padding=True, return_token_type_ids=False
    ).to("cuda")
    # Greedy decoding (do_sample=False) so the run is deterministic.
    output = model.generate(
        **inputs,
        max_new_tokens=512,
        tokenizer=tokenizer,
        do_sample=False,
        use_cache=True,
    )
    # Drop the (left-padded) prompt columns; keep only newly generated tokens.
    output = output[:, inputs["input_ids"].size(-1) :]
    seqs = tokenizer.batch_decode(output, skip_special_tokens=False)

    for i in range(len(prompts)):
        # trim() strips the padding zeros before printing the raw id lists.
        input_ids = trim(inputs["input_ids"][i])
        print(f"Prompt {i}: {len(input_ids) = }")
        print(input_ids.tolist())
        print(prompts[i])
        print("+")
        output_ids = trim(output[i])
        print(f"Output {i}: {len(output_ids) = }")
        print(output_ids.tolist())
        print(seqs[i])
        print("---------------")


if __name__ == "__main__":
    # CLI entry point: `python test/test_hf_model.py --model_path ...`
    import fire

    fire.Fire(main)
55 changes: 55 additions & 0 deletions test/test_model_8k_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
#
# SPDX-License-Identifier: Apache-2.0

from typing import List

from transformers import AutoTokenizer, PreTrainedTokenizer
from vllm import LLM, RequestOutput


def main(model_path: str):
    """Probe ~8k-token context handling: hide a single 'R' in the middle of
    a long filler block and ask the model whether it saw it."""
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, padding_side="left"
    )
    # 4096 "ZS" pairs, one 'R' needle, then 4096 "SY" pairs (~16k filler chars).
    needle_prompt = (
        "Not a programming problem but do you see any 'R' letter in the text block below?\n\n```\n"
        + ("ZS" * 4096)
        + "R"
        + ("SY" * 4096)
        + "\n```\n\nReminder of the questions, do you see any 'R' letter in the text block above?"
    )
    inputs = tokenizer.apply_chat_template(
        [
            [{"role": "user", "content": needle_prompt}],
        ],
        add_generation_prompt=True,
        tokenize=False,
    )
    llm = LLM(model=model_path, generation_config="auto", trust_remote_code=True)

    # Deterministic decode; keep special tokens visible for inspection.
    sampling_params = llm.get_default_sampling_params()
    sampling_params.temperature = 0.0
    sampling_params.max_tokens = 512
    sampling_params.skip_special_tokens = False

    outputs: List[RequestOutput] = llm.generate(inputs, sampling_params)
    # Keep the loop variable names: the f-string "=" specifiers below print
    # the expression text itself, so renaming would change the output.
    for i, output in enumerate(outputs):
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt {i}: {len(output.prompt_token_ids) = }")
        print(output.prompt_token_ids)
        print(prompt)
        print("+")
        print(f"Output {i}: {len(output.outputs[0].token_ids) = }")
        print(output.outputs[0].token_ids)
        print(generated_text)
        print("---------------")


if __name__ == "__main__":
    # CLI entry point: `python test/test_model_8k_prompt.py --model_path ...`
    import fire

    fire.Fire(main)
Loading