From 3e607930f37d7c7ee39e3626f4ab3d1481ea847c Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 16:34:03 +0000 Subject: [PATCH 1/6] feat(eval): cyberseceval --- eval/cyberseceval.py | 74 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/eval/cyberseceval.py b/eval/cyberseceval.py index 2983083..f5a0aab 100644 --- a/eval/cyberseceval.py +++ b/eval/cyberseceval.py @@ -2,8 +2,76 @@ # # SPDX-License-Identifier: Apache-2.0 -# TODO(@zhewang2001): Please refactor the corresponding code snippets and then upload it. +import json +import os +from datasets import load_dataset -def evaluate_cyberseceval(*args): - pass # placeholder + +def evaluate_cyberseceval(generation_path, task): + output_path = os.path.abspath(generation_path.replace(".jsonl", ".reformat.json")) + task_dataset = load_dataset(task)["test"] + task_dict = {item["task_id"]: item for item in task_dataset} + + with open(generation_path, "r") as f: + data = [] + for line in f: + data.append(json.loads(line)) + + results = [] + for idx, item in enumerate(data): + task_id = item["task_id"] + assistant_content = None + for msg in item["messages"]: + if msg["role"] == "assistant": + assistant_content = msg["content"] + break + + if task_id in task_dict: + task_data = task_dict[task_id] + model = generation_path.split("/")[-1].split(".")[0] + + result = { + "prompt_id": idx, + "pass_id": 0, + "test_case_prompt": task_data["test_case_prompt"], + "response": assistant_content, + "cwe_identifier": task_data["cwe_identifier"], + "language": task_data["language"], + "line_text": task_data["line_text"], + "origin_code": task_data["origin_code"], + "variant": task_data.get("variant", ""), + "rule": task_data.get("rule", ""), + "repo": task_data.get("repo", ""), + "model": model, + "original_code": task_data["origin_code"], + } + results.append(result) + + with open(output_path, "w") as f: + json.dump(results, f, indent=4) + + import subprocess + + original_pythonpath = os.environ.get("PYTHONPATH", "") + current_dir = os.getcwd() + + os.chdir("..") + os.chdir("PurpleLlama") + os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd() + + stat_path = output_path.replace(".reformat.json", ".stat.json") + + cmd = [ + "python3", + "-m", + "CybersecurityBenchmarks.benchmark.run", + "--benchmark=instruct", + "--use-precomputed-responses", + f"--response-path={output_path}", + f"--stat-path={stat_path}", + ] + + subprocess.run(cmd) + os.chdir(current_dir) + os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd() From cd90f2fa69b773aa11601d0f4b057547feb43162 Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 19:35:30 +0000 Subject: [PATCH 2/6] feat: add configurable purplellama_path parameter for cyberseceval --- eval/cyberseceval.py | 11 +++++++---- eval/eval_only.py | 2 ++ eval/evaluate.py | 7 ++++++- eval/main.py | 2 ++ 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/eval/cyberseceval.py b/eval/cyberseceval.py index f5a0aab..52fefbe 100644 --- a/eval/cyberseceval.py +++ b/eval/cyberseceval.py @@ -8,7 +8,7 @@ from datasets import load_dataset -def evaluate_cyberseceval(generation_path, task): +def evaluate_cyberseceval(generation_path, task, purplellama_path=None): output_path = os.path.abspath(generation_path.replace(".jsonl", ".reformat.json")) task_dataset = load_dataset(task)["test"] task_dict = {item["task_id"]: item for item in task_dataset} @@ -56,8 +56,11 @@ def evaluate_cyberseceval(generation_path, task): original_pythonpath = os.environ.get("PYTHONPATH", "") current_dir = os.getcwd() - os.chdir("..") - os.chdir("PurpleLlama") + if purplellama_path: + os.chdir(purplellama_path) + else: + os.chdir("..") + os.chdir("PurpleLlama") os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd() stat_path = output_path.replace(".reformat.json", ".stat.json") @@ -74,4 +77,4 @@ def evaluate_cyberseceval(generation_path, task): subprocess.run(cmd) os.chdir(current_dir) - os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd() + os.environ["PYTHONPATH"] = original_pythonpath diff --git a/eval/eval_only.py b/eval/eval_only.py index ac73083..229454c 100755 --- a/eval/eval_only.py +++ b/eval/eval_only.py @@ -11,6 +11,7 @@ def main( oracle: str = None, llm_judge: str = DEFAULT_LLM_JUDGE, reference_results_path: str = None, + purplellama_path: str = None, ): evaluate_main( task, @@ -18,6 +19,7 @@ def main( oracle=oracle, llm_judge=llm_judge, reference_results_path=reference_results_path, + purplellama_path=purplellama_path, ) diff --git a/eval/evaluate.py b/eval/evaluate.py index f202a9c..07964cb 100755 --- a/eval/evaluate.py +++ b/eval/evaluate.py @@ -38,6 +38,7 @@ def evaluate_main( oracle: str = None, llm_judge: str = None, reference_results_path: str = None, + purplellama_path: str = None, ): if oracle is None: # Guessing oracle print(f"Guessing oracle for task {task}...") @@ -89,7 +90,11 @@ def evaluate_main( elif oracle == "cyberseceval": from eval.cyberseceval import evaluate_cyberseceval - evaluate_cyberseceval(generation_path=generation_path, task=task) + evaluate_cyberseceval( + generation_path=generation_path, + task=task, + purplellama_path=purplellama_path, + ) elif oracle == "codeguru": from eval.oracles.secure_code_oracles import evaluate_secure_code_gen diff --git a/eval/main.py b/eval/main.py index 426dffb..97aa7d7 100644 --- a/eval/main.py +++ b/eval/main.py @@ -14,6 +14,7 @@ def main( model_id: str = None, llm_judge: str = DEFAULT_LLM_JUDGE, reference_results_path: str = None, + purplellama_path: str = None, tp: int = 1, transform_conversation: str = None, oracle: str = None, @@ -41,6 +42,7 @@ def main( oracle=oracle, llm_judge=llm_judge, reference_results_path=reference_results_path, + purplellama_path=purplellama_path, ) From 0a5ebccdc850895c2fdec13dd6963585cf701cb0 Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 19:58:41 +0000 Subject: [PATCH 3/6] docs: CyberSecEval SCG evaluation setup guide --- README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/README.md b/README.md index 0a03e85..44e0b79 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,25 @@ python eval/main.py --task "purpcode/PHTest" --model purpcode/purpco Notes: * `--oracle` for evaluating customized generation (default guessing from dataset). +
CyberSecEval SCG Evaluation Setup :: click to expand :: +
+ +```bash +# Download PurpleLlama repository for CyberSecEval evaluation +git clone https://github.com/meta-llama/PurpleLlama.git + +# Run CyberSecEval SCG evaluation with custom PurpleLlama path +python eval/main.py --task "purpcode/CyberSecEval-SCG" \ + --model purpcode/purpcode-14b-rl \ + --purplellama_path /path/to/PurpleLlama + +# Note: if PurpleLlama is cloned at the same directory level as purpcode +# (e.g., both /path/to/PurpleLlama and /path/to/purpcode), then the --purplellama_path parameter can be omitted +python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl +``` +
+
+ ## Acknowledgements - [Amazon Nova AI Challenge](https://www.amazon.science/nova-ai-challenge) for funding our research From 5b439211a8a8175803902b5414162c9cb24038e8 Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 20:12:19 +0000 Subject: [PATCH 4/6] docs: refine the guide for CyberSecEval SCG --- README.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 44e0b79..a453826 100644 --- a/README.md +++ b/README.md @@ -174,22 +174,26 @@ python eval/main.py --task "purpcode/PHTest" --model purpcode/purpco Notes: * `--oracle` for evaluating customized generation (default guessing from dataset). -
CyberSecEval SCG Evaluation Setup :: click to expand :: +
CyberSecEval SCG Evaluation Setup :: click to expand ::
```bash -# Download PurpleLlama repository for CyberSecEval evaluation +# Download and setup PurpleLlama repository for CyberSecEval evaluation +cd .. # assuming you are in purpcode directory git clone https://github.com/meta-llama/PurpleLlama.git +cd PurpleLlama +pip install -r CybersecurityBenchmarks/requirements.txt +cd ../purpcode -# Run CyberSecEval SCG evaluation with custom PurpleLlama path +# Run CyberSecEval SCG evaluation (default setup) +python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl + +# Alternative: if PurpleLlama is not at the same directory level as purpcode, please specify the custom path using --purplellama_path parameter python eval/main.py --task "purpcode/CyberSecEval-SCG" \ --model purpcode/purpcode-14b-rl \ --purplellama_path /path/to/PurpleLlama - -# Note: if PurpleLlama is cloned at the same directory level as purpcode -# (e.g., both /path/to/PurpleLlama and /path/to/purpcode), then the --purplellama_path parameter can be omitted -python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl ``` +
From a5b1a53b0e5d9b099429e9f12316d96894182902 Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 20:29:25 +0000 Subject: [PATCH 5/6] fix: gemini comments --- README.md | 8 +++----- eval/cyberseceval.py | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a453826..e349f03 100644 --- a/README.md +++ b/README.md @@ -179,11 +179,9 @@ Notes: ```bash # Download and setup PurpleLlama repository for CyberSecEval evaluation -cd .. # assuming you are in purpcode directory -git clone https://github.com/meta-llama/PurpleLlama.git -cd PurpleLlama -pip install -r CybersecurityBenchmarks/requirements.txt -cd ../purpcode +# Note: Run from purpcode directory, PurpleLlama will be cloned as a sibling directory +git clone https://github.com/meta-llama/PurpleLlama.git ../PurpleLlama +pip install -r ../PurpleLlama/CybersecurityBenchmarks/requirements.txt # Run CyberSecEval SCG evaluation (default setup) python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl diff --git a/eval/cyberseceval.py b/eval/cyberseceval.py index 52fefbe..c016a06 100644 --- a/eval/cyberseceval.py +++ b/eval/cyberseceval.py @@ -4,6 +4,7 @@ import json import os +import subprocess from datasets import load_dataset @@ -44,15 +45,12 @@ def evaluate_cyberseceval(generation_path, task, purplellama_path=None): "rule": task_data.get("rule", ""), "repo": task_data.get("repo", ""), "model": model, - "original_code": task_data["origin_code"], } results.append(result) with open(output_path, "w") as f: json.dump(results, f, indent=4) - import subprocess - original_pythonpath = os.environ.get("PYTHONPATH", "") current_dir = os.getcwd() @@ -61,7 +59,7 @@ def evaluate_cyberseceval(generation_path, task, purplellama_path=None): else: os.chdir("..") os.chdir("PurpleLlama") - os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd() + os.environ["PYTHONPATH"] = original_pythonpath + os.pathsep + os.getcwd() stat_path = output_path.replace(".reformat.json", ".stat.json") @@ -75,6 +73,6 @@ def evaluate_cyberseceval(generation_path, task, purplellama_path=None): f"--stat-path={stat_path}", ] - subprocess.run(cmd) + subprocess.run(cmd, check=True) os.chdir(current_dir) os.environ["PYTHONPATH"] = original_pythonpath From c29add7dfe55ebf65db70e2f3d505ad4ac58d095 Mon Sep 17 00:00:00 2001 From: zhewang2001 Date: Thu, 7 Aug 2025 20:58:14 +0000 Subject: [PATCH 6/6] fix: ganler comments --- README.md | 3 ++- eval/cyberseceval.py | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e349f03..026afdb 100644 --- a/README.md +++ b/README.md @@ -187,9 +187,10 @@ pip install -r ../PurpleLlama/CybersecurityBenchmarks/requirements.txt python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl # Alternative: if PurpleLlama is not at the same directory level as purpcode, please specify the custom path using --purplellama_path parameter +# Example (replace with your actual PurpleLlama installation path): python eval/main.py --task "purpcode/CyberSecEval-SCG" \ --model purpcode/purpcode-14b-rl \ - --purplellama_path /path/to/PurpleLlama + --purplellama_path ../PurpleLlama ``` diff --git a/eval/cyberseceval.py b/eval/cyberseceval.py index c016a06..5163be4 100644 --- a/eval/cyberseceval.py +++ b/eval/cyberseceval.py @@ -15,9 +15,7 @@ def evaluate_cyberseceval(generation_path, task, purplellama_path=None): task_dict = {item["task_id"]: item for item in task_dataset} with open(generation_path, "r") as f: - data = [] - for line in f: - data.append(json.loads(line)) + data = [json.loads(line) for line in f] results = [] for idx, item in enumerate(data):