From a0dc0c8c0b61ef9ab856759500f22650c16b1d45 Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 21:24:34 +0000 Subject: [PATCH 1/4] docs: CWEval evaluation setup guide --- README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/README.md b/README.md index 026afdb..1c7a225 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,33 @@ python eval/main.py --task "purpcode/CyberSecEval-SCG" \ + + +
CWEval Evaluation Setup :: click to expand :: +
+ +```bash +# Download and set up the CWEval repository for CWEval evaluation +# Note: Run this from the purpcode directory; CWEval will be cloned as a sibling directory +git clone https://github.com/Co1lin/CWEval.git ../CWEval + +# Run CWEval evaluation (default setup) +python eval/main.py --task "purpcode/CWEval" --model purpcode/purpcode-14b-rl + +# Alternative: if CWEval is not a sibling directory of purpcode, specify its location with the --cweval_path parameter +# Example (replace /path/to/CWEval with your actual CWEval installation path): +python eval/main.py --task "purpcode/CWEval" \ + --model purpcode/purpcode-14b-rl \ + --cweval_path /path/to/CWEval + +# Note: Generated files will be saved to the CWEval repository +# purpcode only handles response generation; evaluation must be performed in the CWEval repository +# Follow the CWEval README (https://github.com/Co1lin/CWEval/blob/main/README.md) for further evaluation steps +``` + 
+
+ ## Acknowledgements - [Amazon Nova AI Challenge](https://www.amazon.science/nova-ai-challenge) for funding our research From 4a6a914db09c96cf090e8fa59c26f13c90ea9c9a Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 21:25:28 +0000 Subject: [PATCH 2/4] feat(eval): cweval --- eval/cweval.py | 72 +++++++++++++++++++++++++++++++++++++++++++++-- eval/eval_only.py | 2 ++ eval/evaluate.py | 5 +++- eval/main.py | 2 ++ 4 files changed, 77 insertions(+), 4 deletions(-) diff --git a/eval/cweval.py b/eval/cweval.py index 09ed63f..38c9024 100644 --- a/eval/cweval.py +++ b/eval/cweval.py @@ -2,8 +2,74 @@ # # SPDX-License-Identifier: Apache-2.0 -# TODO(@zhewang2001): Please refactor the corresponding code snippets and then upload it. +import json +import os +from datasets import load_dataset -def evaluate_cweval(*args): - pass # placeholder + +def evaluate_cweval(generation_path, task, cweval_path=None): + model = generation_path.split("/")[-1].split(".trimmed")[0] + + current_dir = os.getcwd() + generation_path = os.path.abspath(generation_path) + + if cweval_path: + os.chdir(cweval_path) + else: + os.chdir("..") + os.chdir("CWEval") + base_output_dir = os.path.join(os.getcwd(), "evals", model, "generated_0") + + task_dataset = load_dataset(task)["test"] + task_dict = { + item["task_id"]: item["file_path"].replace("_task", "_raw") + for item in task_dataset + } + + os.makedirs(base_output_dir, exist_ok=True) + + with open(generation_path, "r") as f: + data = [json.loads(line) for line in f] + + for item in data: + task_id = item["task_id"] + file_path = task_dict.get(task_id) + + if file_path and "messages" in item: + assistant_content = None + for message in item["messages"]: + if message["role"] == "assistant": + assistant_content = message["content"] + break + + if assistant_content: + code_start = assistant_content.find("```") + 3 + code_end = assistant_content.find("```", code_start) + + if code_start >= 3 and code_end != -1: + code_block = 
assistant_content[code_start:code_end].strip() + if code_block.startswith( + ( + "c\n", + "cpp\n", + "go\n", + "js\n", + "py\n", + "python\n", + "java\n", + ) + ): + code_block = code_block.split("\n", 1)[1] + elif code_block.startswith( + ("c:", "cpp:", "go:", "js:", "py:", "python:", "java:") + ): + code_block = code_block.split("\n", 1)[1] + + output_path = os.path.join(base_output_dir, file_path) + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + with open(output_path, "w") as f: + f.write(code_block) + + os.chdir(current_dir) diff --git a/eval/eval_only.py b/eval/eval_only.py index 229454c..122ee9e 100755 --- a/eval/eval_only.py +++ b/eval/eval_only.py @@ -12,6 +12,7 @@ def main( llm_judge: str = DEFAULT_LLM_JUDGE, reference_results_path: str = None, purplellama_path: str = None, + cweval_path: str = None, ): evaluate_main( task, @@ -20,6 +21,7 @@ def main( llm_judge=llm_judge, reference_results_path=reference_results_path, purplellama_path=purplellama_path, + cweval_path=cweval_path, ) diff --git a/eval/evaluate.py b/eval/evaluate.py index 07964cb..2548a65 100755 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -39,6 +39,7 @@ def evaluate_main( llm_judge: str = None, reference_results_path: str = None, purplellama_path: str = None, + cweval_path: str = None, ): if oracle is None: # Guessing oracle print(f"Guessing oracle for task {task}...") @@ -143,7 +144,9 @@ def evaluate_main( elif oracle == "cweval": from eval.cweval import evaluate_cweval - evaluate_cweval(generation_path=generation_path, task=task) + evaluate_cweval( + generation_path=generation_path, task=task, cweval_path=cweval_path + ) else: raise ValueError(f"Unknown oracle: {oracle}") diff --git a/eval/main.py b/eval/main.py index 97aa7d7..fca8807 100644 --- a/eval/main.py +++ b/eval/main.py @@ -15,6 +15,7 @@ def main( llm_judge: str = DEFAULT_LLM_JUDGE, reference_results_path: str = None, purplellama_path: str = None, + cweval_path: str = None, tp: int = 1, transform_conversation: str 
= None, oracle: str = None, @@ -43,6 +44,7 @@ def main( llm_judge=llm_judge, reference_results_path=reference_results_path, purplellama_path=purplellama_path, + cweval_path=cweval_path, ) From 904cbd733473c491a23db79df290d5ebd34868bb Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 21:43:14 +0000 Subject: [PATCH 3/4] refactor: simplify code block extraction logic --- eval/cweval.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/eval/cweval.py b/eval/cweval.py index 38c9024..61f57eb 100644 --- a/eval/cweval.py +++ b/eval/cweval.py @@ -44,27 +44,13 @@ def evaluate_cweval(generation_path, task, cweval_path=None): break if assistant_content: - code_start = assistant_content.find("```") + 3 - code_end = assistant_content.find("```", code_start) - - if code_start >= 3 and code_end != -1: - code_block = assistant_content[code_start:code_end].strip() - if code_block.startswith( - ( - "c\n", - "cpp\n", - "go\n", - "js\n", - "py\n", - "python\n", - "java\n", - ) - ): - code_block = code_block.split("\n", 1)[1] - elif code_block.startswith( - ("c:", "cpp:", "go:", "js:", "py:", "python:", "java:") - ): - code_block = code_block.split("\n", 1)[1] + code_blocks = assistant_content.split("```") + if len(code_blocks) >= 3: + code_block_with_lang = code_blocks[1] + if "\n" in code_block_with_lang: + code_block = code_block_with_lang.split("\n", 1)[1].strip() + else: + code_block = code_block_with_lang.strip() output_path = os.path.join(base_output_dir, file_path) os.makedirs(os.path.dirname(output_path), exist_ok=True) From 9a47fd0df3e1da44ffbe5fec9e85cb9a0815647e Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 21:49:38 +0000 Subject: [PATCH 4/4] fix: gemini comments --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 1c7a225..441000d 100644 --- a/README.md +++ b/README.md @@ -196,8 +196,6 @@ python eval/main.py --task "purpcode/CyberSecEval-SCG" \ - -
CWEval Evaluation Setup :: click to expand ::