diff --git a/README.md b/README.md index 026afdb..441000d 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,31 @@ python eval/main.py --task "purpcode/CyberSecEval-SCG" \ +
CWEval Evaluation Setup :: click to expand :: +
+ +```bash +# Download and set up the CWEval repository for CWEval evaluation +# Note: Run from purpcode directory, CWEval will be cloned as a sibling directory +git clone https://github.com/Co1lin/CWEval.git ../CWEval + +# Run CWEval evaluation (default setup) +python eval/main.py --task "purpcode/CWEval" --model purpcode/purpcode-14b-rl + +# Alternative: if CWEval is not at the same directory level as purpcode, specify its path with the --cweval_path parameter +# Example (replace with your actual CWEval installation path): +python eval/main.py --task "purpcode/CWEval" \ + --model purpcode/purpcode-14b-rl \ + --cweval_path ../CWEval + +# Note: Generated files will be saved to the CWEval repository +# purpcode only handles response generation; evaluation must be performed in the CWEval repository +# Follow the CWEval README (https://github.com/Co1lin/CWEval/blob/main/README.md) for further evaluation steps +``` + +
+
def _extract_first_code_block(content):
    """Return the body of the first fenced ``` code block in *content*.

    The language tag on the opening fence line (e.g. ```python) is stripped.
    Returns None when *content* contains no complete (opened and closed)
    fenced block; returns "" when the block exists but is empty.
    """
    parts = content.split("```")
    # A complete fenced block splits into at least: before / inside / after.
    if len(parts) < 3:
        return None
    block = parts[1]
    if "\n" in block:
        # First line of the fence is the language tag; code starts after it.
        return block.split("\n", 1)[1].strip()
    return block.strip()


def evaluate_cweval(generation_path, task, cweval_path=None):
    """Export model responses into a CWEval checkout for external scoring.

    Reads the JSONL generations at *generation_path*, extracts the first
    fenced code block from each assistant reply, and writes it to
    ``<CWEval>/evals/<model>/generated_0/<file_path>``, where ``file_path``
    comes from the *task* dataset's ``test`` split (with ``_task`` renamed
    to ``_raw`` to match CWEval's layout). Actual evaluation must then be
    run inside the CWEval repository itself (see its README).

    Args:
        generation_path: Path to a ``.jsonl`` generations file; the model
            name is derived from its basename (text before ``.trimmed``).
        task: Hugging Face dataset id whose ``test`` split maps ``task_id``
            to the target ``file_path``.
        cweval_path: Path to the CWEval checkout. Defaults to a ``CWEval``
            directory that is a sibling of the current working directory.
    """
    # Model name is the basename up to ".trimmed"
    # (e.g. "model.trimmed.jsonl" -> "model"). basename() is portable,
    # unlike splitting on "/".
    model = os.path.basename(generation_path).split(".trimmed")[0]

    # Resolve the CWEval root as an absolute path instead of os.chdir-ing
    # into it: the original chdir approach left the process stranded in a
    # foreign working directory whenever an exception fired before the
    # restoring chdir at the end.
    if cweval_path:
        cweval_root = os.path.abspath(cweval_path)
    else:
        cweval_root = os.path.abspath(os.path.join("..", "CWEval"))
    base_output_dir = os.path.join(cweval_root, "evals", model, "generated_0")

    # Map task_id -> output file path. CWEval expects "_raw" files while the
    # dataset records the "_task" variant of each path.
    task_dataset = load_dataset(task)["test"]
    task_dict = {
        item["task_id"]: item["file_path"].replace("_task", "_raw")
        for item in task_dataset
    }

    os.makedirs(base_output_dir, exist_ok=True)

    with open(generation_path, "r") as f:
        data = [json.loads(line) for line in f]

    for item in data:
        file_path = task_dict.get(item["task_id"])
        # Skip generations for unknown tasks or records without a transcript.
        if not file_path or "messages" not in item:
            continue

        # Use the first assistant turn; any later turns are ignored.
        assistant_content = next(
            (m["content"] for m in item["messages"] if m["role"] == "assistant"),
            None,
        )
        if not assistant_content:
            continue

        code_block = _extract_first_code_block(assistant_content)
        if code_block is None:
            continue

        output_path = os.path.join(base_output_dir, file_path)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w") as out:
            out.write(code_block)
229454c..122ee9e 100755 --- a/eval/eval_only.py +++ b/eval/eval_only.py @@ -12,6 +12,7 @@ def main( llm_judge: str = DEFAULT_LLM_JUDGE, reference_results_path: str = None, purplellama_path: str = None, + cweval_path: str = None, ): evaluate_main( task, @@ -20,6 +21,7 @@ def main( llm_judge=llm_judge, reference_results_path=reference_results_path, purplellama_path=purplellama_path, + cweval_path=cweval_path, ) diff --git a/eval/evaluate.py b/eval/evaluate.py index 07964cb..2548a65 100755 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -39,6 +39,7 @@ def evaluate_main( llm_judge: str = None, reference_results_path: str = None, purplellama_path: str = None, + cweval_path: str = None, ): if oracle is None: # Guessing oracle print(f"Guessing oracle for task {task}...") @@ -143,7 +144,9 @@ def evaluate_main( elif oracle == "cweval": from eval.cweval import evaluate_cweval - evaluate_cweval(generation_path=generation_path, task=task) + evaluate_cweval( + generation_path=generation_path, task=task, cweval_path=cweval_path + ) else: raise ValueError(f"Unknown oracle: {oracle}") diff --git a/eval/main.py b/eval/main.py index 97aa7d7..fca8807 100644 --- a/eval/main.py +++ b/eval/main.py @@ -15,6 +15,7 @@ def main( llm_judge: str = DEFAULT_LLM_JUDGE, reference_results_path: str = None, purplellama_path: str = None, + cweval_path: str = None, tp: int = 1, transform_conversation: str = None, oracle: str = None, @@ -43,6 +44,7 @@ def main( llm_judge=llm_judge, reference_results_path=reference_results_path, purplellama_path=purplellama_path, + cweval_path=cweval_path, )