Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,28 @@ python eval/main.py --task "purpcode/PHTest" --model purpcode/purpco
Notes:
* `--oracle` for evaluating customized generation (default guessing from dataset).

<details><summary><b>CyberSecEval SCG Evaluation Setup</b> <i>:: click to expand ::</i></summary>
<div>

```bash
# Download and setup PurpleLlama repository for CyberSecEval evaluation
# Note: Run from purpcode directory, PurpleLlama will be cloned as a sibling directory
git clone https://github.com/meta-llama/PurpleLlama.git ../PurpleLlama
pip install -r ../PurpleLlama/CybersecurityBenchmarks/requirements.txt

# Run CyberSecEval SCG evaluation (default setup)
python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl

# Alternative: if PurpleLlama is not a sibling of the purpcode directory, point to it explicitly with the --purplellama_path flag
# Example (replace with your actual PurpleLlama installation path):
python eval/main.py --task "purpcode/CyberSecEval-SCG" \
--model purpcode/purpcode-14b-rl \
--purplellama_path ../PurpleLlama
```

</div>
</details>

## Acknowledgements

- [Amazon Nova AI Challenge](https://www.amazon.science/nova-ai-challenge) for funding our research
Expand Down
73 changes: 70 additions & 3 deletions eval/cyberseceval.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,75 @@
#
# SPDX-License-Identifier: Apache-2.0

# TODO(@zhewang2001): Please refactor the corresponding code snippets and then upload it.
import json
import os
import subprocess

from datasets import load_dataset

# NOTE(review): dead placeholder — immediately shadowed by the full
# `evaluate_cyberseceval` definition below. This looks like diff residue
# (the removed side of the hunk); confirm and delete if so.
def evaluate_cyberseceval(*args):
    pass # placeholder

def evaluate_cyberseceval(generation_path, task, purplellama_path=None):
    """Reformat model generations and score them with PurpleLlama's CyberSecEval.

    Converts the ``.jsonl`` generations at ``generation_path`` into the JSON
    layout expected by ``CybersecurityBenchmarks.benchmark.run``, then invokes
    that benchmark (instruct mode, precomputed responses) from inside a
    PurpleLlama checkout.

    Args:
        generation_path: Path to a ``.jsonl`` file of chat generations; each
            line holds a dict with ``task_id`` and a ``messages`` list.
        task: Dataset id whose ``test`` split carries the CyberSecEval
            metadata (prompt, CWE, language, ...), keyed by ``task_id``.
        purplellama_path: Optional path to the PurpleLlama repository;
            defaults to a ``PurpleLlama`` directory that is a sibling of the
            current working directory.

    Raises:
        subprocess.CalledProcessError: If the benchmark run exits non-zero.
    """
    output_path = os.path.abspath(generation_path.replace(".jsonl", ".reformat.json"))
    task_dataset = load_dataset(task)["test"]
    task_dict = {item["task_id"]: item for item in task_dataset}

    with open(generation_path, "r") as f:
        data = [json.loads(line) for line in f]

    # The model name is derived from the generation filename, e.g.
    # "results/purpcode-14b-rl.jsonl" -> "purpcode-14b-rl". It is
    # loop-invariant, so compute it once instead of per item.
    model = os.path.basename(generation_path).split(".")[0]

    results = []
    for idx, item in enumerate(data):
        # The first assistant turn is the response to be scored; stays None
        # if the conversation has no assistant message.
        assistant_content = next(
            (msg["content"] for msg in item["messages"] if msg["role"] == "assistant"),
            None,
        )

        task_data = task_dict.get(item["task_id"])
        if task_data is None:
            # Generation has no matching benchmark row; skip it (matches the
            # original behavior of only emitting rows found in the dataset).
            continue

        results.append(
            {
                "prompt_id": idx,
                "pass_id": 0,
                "test_case_prompt": task_data["test_case_prompt"],
                "response": assistant_content,
                "cwe_identifier": task_data["cwe_identifier"],
                "language": task_data["language"],
                "line_text": task_data["line_text"],
                "origin_code": task_data["origin_code"],
                "variant": task_data.get("variant", ""),
                "rule": task_data.get("rule", ""),
                "repo": task_data.get("repo", ""),
                "model": model,
            }
        )

    with open(output_path, "w") as f:
        json.dump(results, f, indent=4)

    original_pythonpath = os.environ.get("PYTHONPATH", "")
    current_dir = os.getcwd()

    try:
        # The benchmark must run from inside the PurpleLlama checkout with
        # the repo root on PYTHONPATH. Default to a sibling "PurpleLlama"
        # directory when no explicit path is given.
        os.chdir(purplellama_path if purplellama_path else os.path.join("..", "PurpleLlama"))
        os.environ["PYTHONPATH"] = original_pythonpath + os.pathsep + os.getcwd()

        stat_path = output_path.replace(".reformat.json", ".stat.json")

        cmd = [
            "python3",
            "-m",
            "CybersecurityBenchmarks.benchmark.run",
            "--benchmark=instruct",
            "--use-precomputed-responses",
            f"--response-path={output_path}",
            f"--stat-path={stat_path}",
        ]

        subprocess.run(cmd, check=True)
    finally:
        # Always restore process-global state (CWD and PYTHONPATH), even if
        # chdir or the benchmark subprocess fails — the original leaked both
        # on any exception.
        os.chdir(current_dir)
        os.environ["PYTHONPATH"] = original_pythonpath
2 changes: 2 additions & 0 deletions eval/eval_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@ def main(
oracle: str = None,
llm_judge: str = DEFAULT_LLM_JUDGE,
reference_results_path: str = None,
purplellama_path: str = None,
):
evaluate_main(
task,
generation_path,
oracle=oracle,
llm_judge=llm_judge,
reference_results_path=reference_results_path,
purplellama_path=purplellama_path,
)


Expand Down
7 changes: 6 additions & 1 deletion eval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def evaluate_main(
oracle: str = None,
llm_judge: str = None,
reference_results_path: str = None,
purplellama_path: str = None,
):
if oracle is None: # Guessing oracle
print(f"Guessing oracle for task {task}...")
Expand Down Expand Up @@ -89,7 +90,11 @@ def evaluate_main(
elif oracle == "cyberseceval":
from eval.cyberseceval import evaluate_cyberseceval

evaluate_cyberseceval(generation_path=generation_path, task=task)
evaluate_cyberseceval(
generation_path=generation_path,
task=task,
purplellama_path=purplellama_path,
)
elif oracle == "codeguru":
from eval.oracles.secure_code_oracles import evaluate_secure_code_gen

Expand Down
2 changes: 2 additions & 0 deletions eval/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def main(
model_id: str = None,
llm_judge: str = DEFAULT_LLM_JUDGE,
reference_results_path: str = None,
purplellama_path: str = None,
tp: int = 1,
transform_conversation: str = None,
oracle: str = None,
Expand Down Expand Up @@ -41,6 +42,7 @@ def main(
oracle=oracle,
llm_judge=llm_judge,
reference_results_path=reference_results_path,
purplellama_path=purplellama_path,
)


Expand Down