diff --git a/README.md b/README.md index 0a03e85..026afdb 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,28 @@ python eval/main.py --task "purpcode/PHTest" --model purpcode/purpco Notes: * `--oracle` for evaluating customized generation (default guessing from dataset). +
CyberSecEval SCG Evaluation Setup :: click to expand :: +
+ +```bash +# Download and set up the PurpleLlama repository for CyberSecEval evaluation +# Note: run from the purpcode directory; PurpleLlama will be cloned as a sibling directory +git clone https://github.com/meta-llama/PurpleLlama.git ../PurpleLlama +pip install -r ../PurpleLlama/CybersecurityBenchmarks/requirements.txt + +# Run CyberSecEval SCG evaluation (default setup) +python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl + +# Alternative: if PurpleLlama is not a sibling of the purpcode directory, specify its path with the --purplellama_path parameter +# Example (replace with your actual PurpleLlama installation path): +python eval/main.py --task "purpcode/CyberSecEval-SCG" \ + --model purpcode/purpcode-14b-rl \ + --purplellama_path ../PurpleLlama +``` + +
+
def evaluate_cyberseceval(generation_path, task, purplellama_path=None):
    """Evaluate generations with Meta's CyberSecEval "instruct" benchmark.

    Reformats a purpcode generation file into the JSON layout expected by
    PurpleLlama's CybersecurityBenchmarks runner, then invokes that runner
    on the precomputed responses.

    Args:
        generation_path: Path to a ``.jsonl`` file of generations; each line
            is a JSON object with ``task_id`` and a ``messages`` chat list.
        task: HuggingFace dataset name whose ``test`` split provides the
            CyberSecEval task metadata keyed by ``task_id``.
        purplellama_path: Path to a PurpleLlama checkout. Defaults to a
            ``PurpleLlama`` directory that is a sibling of the current
            working directory.

    Raises:
        subprocess.CalledProcessError: If the benchmark runner exits non-zero.

    Side effects: writes ``<generation>.reformat.json`` next to the input and
    lets the benchmark write ``<generation>.stat.json``.
    """
    output_path = os.path.abspath(generation_path.replace(".jsonl", ".reformat.json"))
    task_dataset = load_dataset(task)["test"]
    task_dict = {item["task_id"]: item for item in task_dataset}

    with open(generation_path, "r") as f:
        data = [json.loads(line) for line in f]

    # The model name is derived from the generation file name, e.g.
    # "purpcode-14b-rl.jsonl" -> "purpcode-14b-rl". Loop-invariant, so it is
    # computed once here rather than per item.
    model = os.path.basename(generation_path).split(".")[0]

    results = []
    for idx, item in enumerate(data):
        # The first assistant turn holds the model's response; None if absent.
        assistant_content = next(
            (msg["content"] for msg in item["messages"] if msg["role"] == "assistant"),
            None,
        )

        task_data = task_dict.get(item["task_id"])
        if task_data is None:
            # Generations without matching task metadata cannot be scored.
            continue

        results.append(
            {
                "prompt_id": idx,
                "pass_id": 0,
                "test_case_prompt": task_data["test_case_prompt"],
                "response": assistant_content,
                "cwe_identifier": task_data["cwe_identifier"],
                "language": task_data["language"],
                "line_text": task_data["line_text"],
                "origin_code": task_data["origin_code"],
                "variant": task_data.get("variant", ""),
                "rule": task_data.get("rule", ""),
                "repo": task_data.get("repo", ""),
                "model": model,
            }
        )

    with open(output_path, "w") as f:
        json.dump(results, f, indent=4)

    # Resolve the PurpleLlama checkout; by default it is expected to live
    # next to the current working directory (see README setup instructions).
    purplellama_dir = os.path.abspath(
        purplellama_path if purplellama_path else os.path.join("..", "PurpleLlama")
    )

    # Run the benchmark from inside PurpleLlama with it on PYTHONPATH. Using
    # subprocess's cwd=/env= instead of os.chdir + os.environ mutation keeps
    # the parent process state intact even when the benchmark fails — the
    # previous chdir/restore pattern leaked cwd and PYTHONPATH on error.
    env = os.environ.copy()
    original_pythonpath = env.get("PYTHONPATH", "")
    env["PYTHONPATH"] = original_pythonpath + os.pathsep + purplellama_dir

    stat_path = output_path.replace(".reformat.json", ".stat.json")
    cmd = [
        "python3",
        "-m",
        "CybersecurityBenchmarks.benchmark.run",
        "--benchmark=instruct",
        "--use-precomputed-responses",
        f"--response-path={output_path}",
        f"--stat-path={stat_path}",
    ]
    subprocess.run(cmd, cwd=purplellama_dir, env=env, check=True)
main( oracle=oracle, llm_judge=llm_judge, reference_results_path=reference_results_path, + purplellama_path=purplellama_path, )