From 0297012b4367ba15e87f36c1a0312a314c2afbd3 Mon Sep 17 00:00:00 2001
From: ganler <jaway.liu@gmail.com>
Date: Thu, 31 Jul 2025 22:03:22 +0000
Subject: [PATCH 1/4] feat: integrate tests and scripts

---
 requirements.txt             | 14 ++++++
 script/cat_convo.py          | 83 +++++++++++++++++++++++++++++++
 script/push_model_hub.py     | 38 +++++++++++++++
 test/check_chat_template.py  | 49 +++++++++++++++++++
 test/test_bedrock_sonnet.py  | 15 ++++++
 test/test_hf_model.py        | 89 ++++++++++++++++++++++++++++++++++
 test/test_model_8k_prompt.py | 55 +++++++++++++++++++++
 test/test_openai_api.py      | 86 +++++++++++++++++++++++++++++++++
 test/test_vllm_model.py      | 68 ++++++++++++++++++++++++++
 utils/__init__.py            | 14 ++++++
 utils/litellm.py             | 94 ++++++++++++++++++++++++++++++++++++
 11 files changed, 605 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 script/cat_convo.py
 create mode 100644 script/push_model_hub.py
 create mode 100644 test/check_chat_template.py
 create mode 100644 test/test_bedrock_sonnet.py
 create mode 100644 test/test_hf_model.py
 create mode 100644 test/test_model_8k_prompt.py
 create mode 100644 test/test_openai_api.py
 create mode 100644 test/test_vllm_model.py
 create mode 100644 utils/__init__.py
 create mode 100644 utils/litellm.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..420d375
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+litellm
+fire
+datasets
+tqdm
+boto3
+evalplus @ git+https://github.com/evalplus/evalplus.git@master
+termcolor
+dotenv
+vllm
+scikit-learn
+bandit
+tenacity
+sandbox-fusion
+rich
diff --git a/script/cat_convo.py b/script/cat_convo.py
new file mode 100644
index 0000000..cab447f
--- /dev/null
+++ b/script/cat_convo.py
@@ -0,0 +1,83 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import random
+
+from datasets import load_dataset
+from rich.console import Console
+from rich.markup import escape
+from rich.table import Table
+
+console = Console()
+
+
+def visualize_conversation(conversation):
+    table = Table(title="Conversation", show_lines=True)
+    table.add_column("Role", style="bold cyan", no_wrap=True)
+    table.add_column("Content", style="")
+
+    for message in conversation:
+        role = message.get("role", "unknown")
+        content = message.get("content", "").encode("utf-8", "replace").decode()
+        # Optionally style based on role
+        if role == "assistant":
+            role_style = "green"
+        elif role == "user":
+            role_style = "magenta"
+        elif role == "system":
+            role_style = "yellow"
+        else:
+            role_style = "white"
+
+        table.add_row(f"[{role_style}]{role}[/{role_style}]", escape(content.strip()))
+
+    console.print(table)
+
+
+def main(
+    path,
+    shuffle: bool = False,
+    multi: bool = False,
+    prefix: str = None,
+    split: str = "train",
+    include_kw: str = "",
+):
+    """Step through conversations from a JSONL file or a `datasets` dataset."""
+    try:
+        with open(path, "r", encoding="utf-8") as file:
+            conversations = [json.loads(line) for line in file if line.strip()]
+    except FileNotFoundError:
+        # Materialize the rows: random.shuffle below needs item assignment.
+        conversations = list(load_dataset(path, split=split))
+
+    print(f"{len(conversations)} messages in the conversation:")
+    if shuffle:
+        random.shuffle(conversations)
+
+    for data in conversations:
+        if prefix and not data.get("task_id", "").startswith(prefix):
+            continue
+        # Resolve the record once so filtering and display agree on the key used.
+        conversation = data.get("conversation", data.get("messages")) or []
+        if include_kw not in "".join([r["content"] for r in conversation]):
+            continue
+        if multi and len([m for m in conversation if m["role"] != "system"]) <= 2:
+            continue
+        if "task_id" in data:
+            console.print(f"[bold]Task ID:[/bold] {data['task_id']}")
+        if conversation:
+            visualize_conversation(conversation)
+            console.print("\n" + "=" * 50 + "\n")
+        else:
+            console.print("[yellow]No conversation found in this line.[/yellow]")
+        input()  # Wait for user input to continue to the next line
+
+
+# Example:
+# python script/cat_convo.py --path ./cwe2code_raw.jsonl
+if __name__ == "__main__":
+    from fire import Fire
+
+    Fire(main)
diff --git a/script/push_model_hub.py b/script/push_model_hub.py
new file mode 100644
index 0000000..50d70e7
--- /dev/null
+++ b/script/push_model_hub.py
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+# Example: python script/push_model_hub.py --path [model_path]
+def push_model(path: str, model_name: str = None):
+    candidates = [f for f in path.split("/")[-2:] if "checkpoint" not in f]
+    model_name = model_name or candidates[-1]
+    assert "/" not in model_name, "Model name should not contain slashes"
+    repo_id = f"purpcode/{model_name}"
+    print(f"-- Pushing `{repo_id}` to the hub")
+
+    # tokenizer
+    AutoTokenizer.from_pretrained(
+        path,
+        local_files_only=True,
+    ).push_to_hub(repo_id, private=True)
+    print("-- Tokenizer pushed")
+
+    # model
+    AutoModelForCausalLM.from_pretrained(
+        path,
+        torch_dtype=torch.bfloat16,
+    ).push_to_hub(repo_id, private=True)
+    print("-- Model pushed")
+
+    print("Please check:")
+    print(f"https://huggingface.co/{repo_id}")
+
+
+if __name__ == "__main__":
+    from fire import Fire
+
+    Fire(push_model)
diff --git a/test/check_chat_template.py b/test/check_chat_template.py
new file mode 100644
index 0000000..8705f7e
--- /dev/null
+++ b/test/check_chat_template.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from transformers import AutoTokenizer
+
+
+def print_tokens(encoded, tokenizer):
+    """Return a "<id>: '<token>'" string for each ID in `encoded.input_ids`."""
+    tokens = []
+    for token_id in encoded.input_ids:
+        token = tokenizer.decode([token_id])
+        tokens.append(f"{token_id}: '{token}'")
+    return tokens
+
+
+def check_chat_template(model_name, messages):
+    print("=" * 60)
+    print(f"Checking chat template for: {model_name}")
+    print("=" * 60)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    print(f"{tokenizer.chat_template}")
+    print("=" * 60)
+
+    print("\n\n")
+    print("=" * 60)
+    print("Example chat messages:")
+    print("=" * 60)
+    formatted_prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    print(formatted_prompt)
+    print("=" * 60)
+
+
+def main(model_path: str):
+    example_messages = [
+        {"role": "system", "content": "[...SYSTEM_MESSAGES...]"},
+        {"role": "user", "content": "[...USER_MESSAGES...]"},
+        {"role": "assistant", "content": "[...ASSISTANT_MESSAGES...]"},
+        {"role": "user", "content": "[...USER_MESSAGES...]"},
+    ]
+    check_chat_template(model_path, example_messages)
+
+
+if __name__ == "__main__":
+    from fire import Fire
+
+    Fire(main)
diff --git a/test/test_bedrock_sonnet.py b/test/test_bedrock_sonnet.py
new file mode 100644
index 0000000..d73273f
--- /dev/null
+++ b/test/test_bedrock_sonnet.py
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from dotenv import load_dotenv
+from litellm import completion
+
+load_dotenv()
+
+response = completion(
+    model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+    messages=[{"content": "What are you doing?", "role": "user"}],
+)
+
+print(response.choices[0].message.content)
diff --git a/test/test_hf_model.py b/test/test_hf_model.py
new file mode 100644
index 0000000..a6f6d3d
--- /dev/null
+++ b/test/test_hf_model.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+)
+
+
+def trim(ids):
+    non_zero_indices = torch.nonzero(ids, as_tuple=True)[0]
+    last_non_zero_index = non_zero_indices[-1].item()
+    return ids[: last_non_zero_index + 1]
+
+
+def main(model_path: str):
+    model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        attn_implementation="flash_attention_2",
+    ).to("cuda")
+    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
+        model_path, trust_remote_code=True, padding_side="left"
+    )
+    prompts = tokenizer.apply_chat_template(
+        [
+            [
+                {
+                    "role": "user",
+                    "content": "Simply reply 'hello' and stop generation immediately.",
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": "Can you please write me a function to print hello world in Python.",
+                }
+            ],
+            [
+                {"role": "user", "content": "What's 1+1?"},
+                {"role": "assistant", "content": "You want me to show you code?"},
+                {"role": "user", "content": "Yes!"},
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": "Please just repeat 'Sorry, I cannot assist with that request'.",
+                }
+            ],
+        ],
+        return_tensors="pt",
+        add_generation_prompt=True,
+        tokenize=False,
+    )
+    inputs = tokenizer(
+        prompts, return_tensors="pt", padding=True, return_token_type_ids=False
+    ).to("cuda")
+    output = model.generate(
+        **inputs,
+        max_new_tokens=512,
+        tokenizer=tokenizer,
+        do_sample=False,
+        use_cache=True,
+    )
+    output = output[:, inputs["input_ids"].size(-1) :]
+    seqs = tokenizer.batch_decode(output, skip_special_tokens=False)
+
+    for i in range(len(prompts)):
+        input_ids = trim(inputs["input_ids"][i])
+        print(f"Prompt {i}: {len(input_ids) = }")
+        print(input_ids.tolist())
+        print(prompts[i])
+        print("+")
+        output_ids = trim(output[i])
+        print(f"Output {i}: {len(output_ids) = }")
+        print(output_ids.tolist())
+        print(seqs[i])
+        print("---------------")
+
+
+if __name__ == "__main__":
+    from fire import Fire
+
+    Fire(main)
diff --git a/test/test_model_8k_prompt.py b/test/test_model_8k_prompt.py
new file mode 100644
index 0000000..0f50e63
--- /dev/null
+++ b/test/test_model_8k_prompt.py
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List
+
+from transformers import AutoTokenizer, PreTrainedTokenizer
+from vllm import LLM, RequestOutput
+
+
+def main(model_path: str):
+    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
+        model_path, trust_remote_code=True, padding_side="left"
+    )
+    inputs = tokenizer.apply_chat_template(
+        [
+            [
+                {
+                    "role": "user",
+                    "content": "Not a programming problem but do you see any 'R' letter in the text block below?\n\n```\n"
+                    + ("ZS" * 4096)
+                    + "R"
+                    + ("SY" * 4096)
+                    + "\n```\n\nReminder of the questions, do you see any 'R' letter in the text block above?",
+                }
+            ],
+        ],
+        add_generation_prompt=True,
+        tokenize=False,
+    )
+    llm = LLM(model=model_path, generation_config="auto", trust_remote_code=True)
+
+    sampling_params = llm.get_default_sampling_params()
+    sampling_params.temperature = 0.0
+    sampling_params.max_tokens = 512
+    sampling_params.skip_special_tokens = False
+
+    outputs: List[RequestOutput] = llm.generate(inputs, sampling_params)
+    for i, output in enumerate(outputs):
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt {i}: {len(output.prompt_token_ids) = }")
+        print(output.prompt_token_ids)
+        print(prompt)
+        print("+")
+        print(f"Output {i}: {len(output.outputs[0].token_ids) = }")
+        print(output.outputs[0].token_ids)
+        print(generated_text)
+        print("---------------")
+
+
+if __name__ == "__main__":
+    from fire import Fire
+
+    Fire(main)
diff --git a/test/test_openai_api.py b/test/test_openai_api.py
new file mode 100644
index 0000000..6523987
--- /dev/null
+++ b/test/test_openai_api.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import openai
+
+
+def measure_latency(base_url, messages, num_turns=5):
+    client = openai.Client(api_key="none", base_url=base_url)
+    model = client.models.list().data[0].id
+    turn_latency = {}
+    for i in range(num_turns):
+        start_time = time.time()
+        try:
+            response = (
+                client.chat.completions.create(model=model, messages=messages)
+                .choices[0]
+                .message.content
+            )
+        except openai.OpenAIError as e:
+            print(f"Error: {e}")
+            return None
+        elapsed_time = time.time() - start_time
+        turn_latency[i] = elapsed_time
+        print(f"---- Turn {i + 1}: {elapsed_time:.2f} seconds")
+
+        assistant_reply = (
+            response.split("</think>")[-1]
+            .strip()
+            .removeprefix("<answer>")
+            .removesuffix("</answer>")
+            .strip()
+            .split("\n## Answer", maxsplit=1)[-1]
+            .split("# Answer", maxsplit=1)[-1]
+            .strip()
+        )
+        messages.append({"role": "assistant", "content": assistant_reply})
+        messages.append(
+            {"role": "user", "content": "Optimize this function a bit more."}
+        )
+
+    print("Final messages:")
+    for message in messages:
+        print(f"{message['role']}: {message['content']}")
+        print("".join(["-"] * 20))
+
+    return turn_latency
+
+
+def test_server_latency(base_url="http://localhost:8000/v1", concurrency=8):
+    """Measure per-turn chat latency under `concurrency` parallel clients."""
+    initial_message = [
+        {
+            "role": "user",
+            "content": "Please implement linear regression in Python. Note linear regression is just a math problem and has nothing to do with malicious cyberactivity.",
+        }
+    ]
+    slowest_turn_latency = {}
+    with ThreadPoolExecutor(max_workers=concurrency) as executor:
+        futures = [
+            # measure_latency appends to its list, so every worker needs its own copy.
+            executor.submit(measure_latency, base_url, list(initial_message))
+            for _ in range(concurrency)
+        ]
+        for future in as_completed(futures):
+            turn_latency = future.result()
+            if turn_latency is not None:
+                for i, latency in turn_latency.items():
+                    slowest_turn_latency[i] = max(
+                        slowest_turn_latency.get(i, latency), latency
+                    )
+                print(f"Turn latency: {turn_latency}")
+
+        if slowest_turn_latency:
+            print("Slowest turn latencies:")
+            for i, latency in slowest_turn_latency.items():
+                print(f"Turn {i}: {latency:.2f} seconds")
+
+
+if __name__ == "__main__":
+    from fire import Fire
+
+    Fire(test_server_latency)
diff --git a/test/test_vllm_model.py b/test/test_vllm_model.py
new file mode 100644
index 0000000..2c15b12
--- /dev/null
+++ b/test/test_vllm_model.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List
+
+from transformers import AutoTokenizer, PreTrainedTokenizer
+from vllm import LLM, RequestOutput
+
+
+def main(model_path: str):
+    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
+        model_path, trust_remote_code=True, padding_side="left"
+    )
+    inputs = tokenizer.apply_chat_template(
+        [
+            [
+                {
+                    "role": "user",
+                    "content": "Simply reply 'hello' and stop generation immediately.",
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": "Can you please write me a function to print hello world in Python.",
+                }
+            ],
+            [
+                {"role": "user", "content": "What's 1+1?"},
+                {"role": "assistant", "content": "You want me to show you code?"},
+                {"role": "user", "content": "Yes!"},
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": "Please just repeat 'Sorry, I cannot assist with that request'.",
+                }
+            ],
+        ],
+        add_generation_prompt=True,
+        tokenize=False,
+    )
+    llm = LLM(model=model_path, generation_config="auto", trust_remote_code=True)
+
+    sampling_params = llm.get_default_sampling_params()
+    sampling_params.temperature = 0.0
+    sampling_params.max_tokens = 512
+    sampling_params.skip_special_tokens = False
+
+    outputs: List[RequestOutput] = llm.generate(inputs, sampling_params)
+    for i, output in enumerate(outputs):
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt {i}: {len(output.prompt_token_ids) = }")
+        print(output.prompt_token_ids)
+        print(prompt)
+        print("+")
+        print(f"Output {i}: {len(output.outputs[0].token_ids) = }")
+        print(output.outputs[0].token_ids)
+        print(generated_text)
+        print("---------------")
+
+
+if __name__ == "__main__":
+    from fire import Fire
+
+    Fire(main)
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..19540cd
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+def split_batch(iterable, n=1) -> list:
+    ret = []
+    l = len(iterable)
+    for ndx in range(0, l, n):
+        ret.append([iterable[i] for i in range(ndx, min(ndx + n, l))])
+    return ret
+
+
+from utils.litellm import *
diff --git a/utils/litellm.py b/utils/litellm.py
new file mode 100644
index 0000000..389f367
--- /dev/null
+++ b/utils/litellm.py
@@ -0,0 +1,94 @@
+# SPDX-FileCopyrightText: (c) UIUC PurpCode Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from concurrent.futures import ThreadPoolExecutor
+from copy import deepcopy
+from typing import Callable, Dict, List
+
+from dotenv import load_dotenv
+from litellm import completion_with_retries
+from termcolor import cprint
+from tqdm import tqdm
+
+from utils import split_batch
+
+load_dotenv()
+
+
+def log_costs(completions):
+    costs = [r._hidden_params["response_cost"] for r in completions]
+    if len(costs) == 0 or None in costs:
+        return
+    cprint(f"{len(costs)} requests costs ${sum(costs):.3f}", "yellow")
+
+
+def mini_batch_completion(messages, parallel: int = 32, **kwargs):
+    batches = split_batch(messages, n=parallel)
+    outputs = []
+    for minibatch in tqdm(batches):
+        with ThreadPoolExecutor(max_workers=len(minibatch)) as executor:
+            futures = []
+            for sample in minibatch:
+                future = executor.submit(
+                    completion_with_retries,
+                    messages=sample["messages"],
+                    num_retries=32,
+                    retry_strategy="exponential_backoff_retry",
+                    **kwargs,
+                )
+                futures.append(future)
+
+            for future in futures:
+                outputs.append(future.result())
+
+    return outputs
+
+
+def run_batched_inference(
+    batched_rows: List,  # each row includes at least "messages"
+    row_transform: Callable[[Dict], Dict] = lambda x: x,
+    max_new_tokens: int = None,
+    temprature: float = None,
+    model: str = "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+    parallel: int = 12,
+    **kwargs,
+):
+    assert batched_rows and "messages" in batched_rows[0]
+    batched_rows = [row_transform(row) for row in batched_rows]
+    print("Running batched completion for LLM judge")
+    parameters = {
+        "model": model,
+        "parallel": parallel,
+        "messages": batched_rows,
+        "max_tokens": max_new_tokens,
+        "temperature": temprature,
+        "max_tokens": max_new_tokens,
+        **kwargs,
+    }
+    if "thinking" in kwargs:
+        # "thinking" requests must leave max_tokens and temperature unset.
+        assert parameters["max_tokens"] is None
+        assert parameters["temperature"] is None
+    else:
+        if parameters["temperature"] is None:
+            parameters["temperature"] = 0.0
+
+    outputs = mini_batch_completion(**parameters)
+    log_costs(outputs)
+    outputs = [item.choices[0].message for item in outputs]
+
+    output_rows = []
+    for row, ext in zip(batched_rows, outputs):
+        # Copy so the append below does not mutate the (transformed) input row.
+        row = deepcopy(row)
+        # reasoning_content can be absent/None even when "thinking" was requested;
+        # normalize it so the concatenation below cannot raise TypeError.
+        thoughts = getattr(ext, "reasoning_content", None) or ""
+        wrap = bool(thoughts) or "thinking" in kwargs
+        reasoning_content = "<think>\n" + thoughts + "\n</think>\n" if wrap else ""
+        row["messages"].append(
+            {"role": "assistant", "content": reasoning_content + ext.content}
+        )
+        output_rows.append(row)
+    return output_rows

From 68d3ca9c79aa28ce9bdce49dec14396ad3f22d02 Mon Sep 17 00:00:00 2001
From: Jiawei Liu <jaway.liu@gmail.com>
Date: Thu, 31 Jul 2025 17:06:26 -0500
Subject: [PATCH 2/4] Update utils/litellm.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 utils/litellm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/litellm.py b/utils/litellm.py
index 389f367..b0441c5 100644
--- a/utils/litellm.py
+++ b/utils/litellm.py
@@ -49,7 +49,7 @@ def run_batched_inference(
     batched_rows: List,  # each row includes at least "messages"
     row_transform: Callable[[Dict], Dict] = lambda x: x,
     max_new_tokens: int = None,
-    temprature: float = None,
+    temperature: float = None,
     model: str = "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
     parallel: int = 12,
     **kwargs,

From e1b7217f4e9738edc246c9e58a7413bae91c5f9d Mon Sep 17 00:00:00 2001
From: Jiawei Liu <jaway.liu@gmail.com>
Date: Thu, 31 Jul 2025 17:06:33 -0500
Subject: [PATCH 3/4] Update utils/litellm.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 utils/litellm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/litellm.py b/utils/litellm.py
index b0441c5..fe239d3 100644
--- a/utils/litellm.py
+++ b/utils/litellm.py
@@ -62,7 +62,7 @@ def run_batched_inference(
         "parallel": parallel,
         "messages": batched_rows,
         "max_tokens": max_new_tokens,
-        "temperature": temprature,
+        "temperature": temperature,
         "max_tokens": max_new_tokens,
         **kwargs,
     }

From 761222f05c38cf4e59709c6dae2a98ec3660dec3 Mon Sep 17 00:00:00 2001
From: Jiawei Liu <jaway.liu@gmail.com>
Date: Thu, 31 Jul 2025 17:06:54 -0500
Subject: [PATCH 4/4] Update utils/litellm.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 utils/litellm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utils/litellm.py b/utils/litellm.py
index fe239d3..d50064a 100644
--- a/utils/litellm.py
+++ b/utils/litellm.py
@@ -63,7 +63,6 @@ def run_batched_inference(
         "messages": batched_rows,
         "max_tokens": max_new_tokens,
         "temperature": temperature,
-        "max_tokens": max_new_tokens,
         **kwargs,
     }
     if "thinking" in kwargs: