From 3e607930f37d7c7ee39e3626f4ab3d1481ea847c Mon Sep 17 00:00:00 2001
From: zhewang2001 <zhewang1207@gmail.com>
Date: Thu, 7 Aug 2025 16:34:03 +0000
Subject: [PATCH 1/6] feat(eval): implement cyberseceval evaluation

---
 eval/cyberseceval.py | 74 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 3 deletions(-)

diff --git a/eval/cyberseceval.py b/eval/cyberseceval.py
index 2983083..f5a0aab 100644
--- a/eval/cyberseceval.py
+++ b/eval/cyberseceval.py
@@ -2,8 +2,76 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-# TODO(@zhewang2001): Please refactor the corresponding code snippets and then upload it.
+import json
+import os
 
+from datasets import load_dataset
 
-def evaluate_cyberseceval(*args):
-    pass  # placeholder
+
+def evaluate_cyberseceval(generation_path, task):
+    output_path = os.path.abspath(generation_path.replace(".jsonl", ".reformat.json"))
+    task_dataset = load_dataset(task)["test"]
+    task_dict = {item["task_id"]: item for item in task_dataset}
+
+    with open(generation_path, "r") as f:
+        data = []
+        for line in f:
+            data.append(json.loads(line))
+
+    results = []
+    for idx, item in enumerate(data):
+        task_id = item["task_id"]
+        assistant_content = None
+        for msg in item["messages"]:
+            if msg["role"] == "assistant":
+                assistant_content = msg["content"]
+                break
+
+        if task_id in task_dict:
+            task_data = task_dict[task_id]
+            model = generation_path.split("/")[-1].split(".")[0]
+
+            result = {
+                "prompt_id": idx,
+                "pass_id": 0,
+                "test_case_prompt": task_data["test_case_prompt"],
+                "response": assistant_content,
+                "cwe_identifier": task_data["cwe_identifier"],
+                "language": task_data["language"],
+                "line_text": task_data["line_text"],
+                "origin_code": task_data["origin_code"],
+                "variant": task_data.get("variant", ""),
+                "rule": task_data.get("rule", ""),
+                "repo": task_data.get("repo", ""),
+                "model": model,
+                "original_code": task_data["origin_code"],
+            }
+            results.append(result)
+
+    with open(output_path, "w") as f:
+        json.dump(results, f, indent=4)
+
+    import subprocess
+
+    original_pythonpath = os.environ.get("PYTHONPATH", "")
+    current_dir = os.getcwd()
+
+    os.chdir("..")
+    os.chdir("PurpleLlama")
+    os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd()
+
+    stat_path = output_path.replace(".reformat.json", ".stat.json")
+
+    cmd = [
+        "python3",
+        "-m",
+        "CybersecurityBenchmarks.benchmark.run",
+        "--benchmark=instruct",
+        "--use-precomputed-responses",
+        f"--response-path={output_path}",
+        f"--stat-path={stat_path}",
+    ]
+
+    subprocess.run(cmd)
+    os.chdir(current_dir)
+    os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd()

From cd90f2fa69b773aa11601d0f4b057547feb43162 Mon Sep 17 00:00:00 2001
From: zhewang2001 <zhewang1207@gmail.com>
Date: Thu, 7 Aug 2025 19:35:30 +0000
Subject: [PATCH 2/6] feat: add configurable purplellama_path parameter for
 cyberseceval

---
 eval/cyberseceval.py | 11 +++++++----
 eval/eval_only.py    |  2 ++
 eval/evaluate.py     |  7 ++++++-
 eval/main.py         |  2 ++
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/eval/cyberseceval.py b/eval/cyberseceval.py
index f5a0aab..52fefbe 100644
--- a/eval/cyberseceval.py
+++ b/eval/cyberseceval.py
@@ -8,7 +8,7 @@
 from datasets import load_dataset
 
 
-def evaluate_cyberseceval(generation_path, task):
+def evaluate_cyberseceval(generation_path, task, purplellama_path=None):
     output_path = os.path.abspath(generation_path.replace(".jsonl", ".reformat.json"))
     task_dataset = load_dataset(task)["test"]
     task_dict = {item["task_id"]: item for item in task_dataset}
@@ -56,8 +56,11 @@ def evaluate_cyberseceval(generation_path, task):
     original_pythonpath = os.environ.get("PYTHONPATH", "")
     current_dir = os.getcwd()
 
-    os.chdir("..")
-    os.chdir("PurpleLlama")
+    if purplellama_path:
+        os.chdir(purplellama_path)
+    else:
+        os.chdir("..")
+        os.chdir("PurpleLlama")
     os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd()
 
     stat_path = output_path.replace(".reformat.json", ".stat.json")
@@ -74,4 +77,4 @@ def evaluate_cyberseceval(generation_path, task):
 
     subprocess.run(cmd)
     os.chdir(current_dir)
-    os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd()
+    os.environ["PYTHONPATH"] = original_pythonpath
diff --git a/eval/eval_only.py b/eval/eval_only.py
index ac73083..229454c 100755
--- a/eval/eval_only.py
+++ b/eval/eval_only.py
@@ -11,6 +11,7 @@ def main(
     oracle: str = None,
     llm_judge: str = DEFAULT_LLM_JUDGE,
     reference_results_path: str = None,
+    purplellama_path: str = None,
 ):
     evaluate_main(
         task,
@@ -18,6 +19,7 @@ def main(
         oracle=oracle,
         llm_judge=llm_judge,
         reference_results_path=reference_results_path,
+        purplellama_path=purplellama_path,
     )
 
 
diff --git a/eval/evaluate.py b/eval/evaluate.py
index f202a9c..07964cb 100755
--- a/eval/evaluate.py
+++ b/eval/evaluate.py
@@ -38,6 +38,7 @@ def evaluate_main(
     oracle: str = None,
     llm_judge: str = None,
     reference_results_path: str = None,
+    purplellama_path: str = None,
 ):
     if oracle is None:  # Guessing oracle
         print(f"Guessing oracle for task {task}...")
@@ -89,7 +90,11 @@ def evaluate_main(
     elif oracle == "cyberseceval":
         from eval.cyberseceval import evaluate_cyberseceval
 
-        evaluate_cyberseceval(generation_path=generation_path, task=task)
+        evaluate_cyberseceval(
+            generation_path=generation_path,
+            task=task,
+            purplellama_path=purplellama_path,
+        )
     elif oracle == "codeguru":
         from eval.oracles.secure_code_oracles import evaluate_secure_code_gen
 
diff --git a/eval/main.py b/eval/main.py
index 426dffb..97aa7d7 100644
--- a/eval/main.py
+++ b/eval/main.py
@@ -14,6 +14,7 @@ def main(
     model_id: str = None,
     llm_judge: str = DEFAULT_LLM_JUDGE,
     reference_results_path: str = None,
+    purplellama_path: str = None,
     tp: int = 1,
     transform_conversation: str = None,
     oracle: str = None,
@@ -41,6 +42,7 @@ def main(
         oracle=oracle,
         llm_judge=llm_judge,
         reference_results_path=reference_results_path,
+        purplellama_path=purplellama_path,
     )
 
 

From 0a5ebccdc850895c2fdec13dd6963585cf701cb0 Mon Sep 17 00:00:00 2001
From: zhewang2001 <zhewang1207@gmail.com>
Date: Thu, 7 Aug 2025 19:58:41 +0000
Subject: [PATCH 3/6] docs: CyberSecEval SCG evaluation setup guide

---
 README.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/README.md b/README.md
index 0a03e85..44e0b79 100644
--- a/README.md
+++ b/README.md
@@ -174,6 +174,25 @@ python eval/main.py --task "purpcode/PHTest"             --model purpcode/purpco
 Notes:
 * `--oracle` for evaluating customized generation (default guessing from dataset).
 
+<details><summary>CyberSecEval SCG Evaluation Setup <i>:: click to expand ::</i></summary>
+<div>
+
+```bash
+# Download PurpleLlama repository for CyberSecEval evaluation
+git clone https://github.com/meta-llama/PurpleLlama.git
+
+# Run CyberSecEval SCG evaluation with custom PurpleLlama path
+python eval/main.py --task "purpcode/CyberSecEval-SCG" \
+                    --model purpcode/purpcode-14b-rl \
+                    --purplellama_path /path/to/PurpleLlama
+
+# Note: if PurpleLlama is cloned at the same directory level as purpcode
+# (e.g., both /path/to/PurpleLlama and /path/to/purpcode), then the --purplellama_path parameter can be omitted
+python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl
+```
+</div>
+</details>
+
 ## Acknowledgements
 
 - [Amazon Nova AI Challenge](https://www.amazon.science/nova-ai-challenge) for funding our research

From 5b439211a8a8175803902b5414162c9cb24038e8 Mon Sep 17 00:00:00 2001
From: zhewang2001 <zhewang1207@gmail.com>
Date: Thu, 7 Aug 2025 20:12:19 +0000
Subject: [PATCH 4/6] docs: refine the guide for CyberSecEval SCG

---
 README.md | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 44e0b79..a453826 100644
--- a/README.md
+++ b/README.md
@@ -174,22 +174,26 @@ python eval/main.py --task "purpcode/PHTest"             --model purpcode/purpco
 Notes:
 * `--oracle` for evaluating customized generation (default guessing from dataset).
 
-<details><summary>CyberSecEval SCG Evaluation Setup <i>:: click to expand ::</i></summary>
+<details><summary><b>CyberSecEval SCG Evaluation Setup</b> <i>:: click to expand ::</i></summary>
 <div>
 
 ```bash
-# Download PurpleLlama repository for CyberSecEval evaluation
+# Download and setup PurpleLlama repository for CyberSecEval evaluation
+cd .. # assuming you are in purpcode directory
 git clone https://github.com/meta-llama/PurpleLlama.git
+cd PurpleLlama
+pip install -r CybersecurityBenchmarks/requirements.txt
+cd ../purpcode
 
-# Run CyberSecEval SCG evaluation with custom PurpleLlama path
+# Run CyberSecEval SCG evaluation (default setup)
+python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl
+
+# Alternative: if PurpleLlama is not at the same directory level as purpcode, please specify the custom path using --purplellama_path parameter
 python eval/main.py --task "purpcode/CyberSecEval-SCG" \
                     --model purpcode/purpcode-14b-rl \
                     --purplellama_path /path/to/PurpleLlama
-
-# Note: if PurpleLlama is cloned at the same directory level as purpcode
-# (e.g., both /path/to/PurpleLlama and /path/to/purpcode), then the --purplellama_path parameter can be omitted
-python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl
 ```
+
 </div>
 </details>
 

From a5b1a53b0e5d9b099429e9f12316d96894182902 Mon Sep 17 00:00:00 2001
From: zhewang2001 <zhewang1207@gmail.com>
Date: Thu, 7 Aug 2025 20:29:25 +0000
Subject: [PATCH 5/6] fix: address gemini review comments

---
 README.md            | 8 +++-----
 eval/cyberseceval.py | 8 +++-----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index a453826..e349f03 100644
--- a/README.md
+++ b/README.md
@@ -179,11 +179,9 @@ Notes:
 
 ```bash
 # Download and setup PurpleLlama repository for CyberSecEval evaluation
-cd .. # assuming you are in purpcode directory
-git clone https://github.com/meta-llama/PurpleLlama.git
-cd PurpleLlama
-pip install -r CybersecurityBenchmarks/requirements.txt
-cd ../purpcode
+# Note: run from the purpcode directory; PurpleLlama will be cloned as a sibling directory
+git clone https://github.com/meta-llama/PurpleLlama.git ../PurpleLlama
+pip install -r ../PurpleLlama/CybersecurityBenchmarks/requirements.txt
 
 # Run CyberSecEval SCG evaluation (default setup)
 python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl
diff --git a/eval/cyberseceval.py b/eval/cyberseceval.py
index 52fefbe..c016a06 100644
--- a/eval/cyberseceval.py
+++ b/eval/cyberseceval.py
@@ -4,6 +4,7 @@
 
 import json
 import os
+import subprocess
 
 from datasets import load_dataset
 
@@ -44,15 +45,12 @@ def evaluate_cyberseceval(generation_path, task, purplellama_path=None):
                 "rule": task_data.get("rule", ""),
                 "repo": task_data.get("repo", ""),
                 "model": model,
-                "original_code": task_data["origin_code"],
             }
             results.append(result)
 
     with open(output_path, "w") as f:
         json.dump(results, f, indent=4)
 
-    import subprocess
-
     original_pythonpath = os.environ.get("PYTHONPATH", "")
     current_dir = os.getcwd()
 
@@ -61,7 +59,7 @@ def evaluate_cyberseceval(generation_path, task, purplellama_path=None):
     else:
         os.chdir("..")
         os.chdir("PurpleLlama")
-    os.environ["PYTHONPATH"] = original_pythonpath + ":" + os.getcwd()
+    os.environ["PYTHONPATH"] = original_pythonpath + os.pathsep + os.getcwd()
 
     stat_path = output_path.replace(".reformat.json", ".stat.json")
 
@@ -75,6 +73,6 @@ def evaluate_cyberseceval(generation_path, task, purplellama_path=None):
         f"--stat-path={stat_path}",
     ]
 
-    subprocess.run(cmd)
+    subprocess.run(cmd, check=True)
     os.chdir(current_dir)
     os.environ["PYTHONPATH"] = original_pythonpath

From c29add7dfe55ebf65db70e2f3d505ad4ac58d095 Mon Sep 17 00:00:00 2001
From: zhewang2001 <zhewang1207@gmail.com>
Date: Thu, 7 Aug 2025 20:58:14 +0000
Subject: [PATCH 6/6] fix: address ganler review comments

---
 README.md            | 3 ++-
 eval/cyberseceval.py | 4 +---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index e349f03..026afdb 100644
--- a/README.md
+++ b/README.md
@@ -187,9 +187,10 @@ pip install -r ../PurpleLlama/CybersecurityBenchmarks/requirements.txt
 python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl
 
 # Alternative: if PurpleLlama is not at the same directory level as purpcode, please specify the custom path using --purplellama_path parameter
+# Example (replace with your actual PurpleLlama installation path):
 python eval/main.py --task "purpcode/CyberSecEval-SCG" \
                     --model purpcode/purpcode-14b-rl \
-                    --purplellama_path /path/to/PurpleLlama
+                    --purplellama_path ../PurpleLlama
 ```
 
 </div>
diff --git a/eval/cyberseceval.py b/eval/cyberseceval.py
index c016a06..5163be4 100644
--- a/eval/cyberseceval.py
+++ b/eval/cyberseceval.py
@@ -15,9 +15,7 @@ def evaluate_cyberseceval(generation_path, task, purplellama_path=None):
     task_dict = {item["task_id"]: item for item in task_dataset}
 
     with open(generation_path, "r") as f:
-        data = []
-        for line in f:
-            data.append(json.loads(line))
+        data = [json.loads(line) for line in f]
 
     results = []
     for idx, item in enumerate(data):