From c41742c2881b5f5f5b8ae5494fb1517359001c7a Mon Sep 17 00:00:00 2001 From: yueliu14 Date: Sun, 8 Mar 2026 21:52:57 -0500 Subject: [PATCH 01/84] add geak benchmark and repo tasks --- agents/geak_benchmark/__init__.py | 4 + agents/geak_benchmark/agent_config.yaml | 11 + agents/geak_benchmark/geak.yaml | 31 ++ agents/geak_benchmark/geak_pre_process.py | 152 +++++++ agents/geak_benchmark/launch_agent.py | 389 ++++++++++++++++++ src/module_registration.py | 5 +- src/preprocessing.py | 91 +++- .../rocprim/block_histogram/config.yaml | 10 + .../block_histogram/scripts/task_runner.py | 326 +++++++++++++++ .../rocprim/block_radix_rank/config.yaml | 10 + .../block_radix_rank/scripts/task_runner.py | 326 +++++++++++++++ .../block_run_length_decode/config.yaml | 10 + .../scripts/task_runner.py | 326 +++++++++++++++ .../rocprim/device_binary_search/config.yaml | 10 + .../scripts/task_runner.py | 326 +++++++++++++++ .../rocprim/device_merge_sort/config.yaml | 10 + .../device_merge_sort/scripts/task_runner.py | 326 +++++++++++++++ .../rocprim/device_nth_element/config.yaml | 10 + .../device_nth_element/scripts/task_runner.py | 326 +++++++++++++++ 19 files changed, 2697 insertions(+), 2 deletions(-) create mode 100644 agents/geak_benchmark/__init__.py create mode 100644 agents/geak_benchmark/agent_config.yaml create mode 100644 agents/geak_benchmark/geak.yaml create mode 100644 agents/geak_benchmark/geak_pre_process.py create mode 100644 agents/geak_benchmark/launch_agent.py create mode 100644 tasks/repository/rocprim/block_histogram/config.yaml create mode 100644 tasks/repository/rocprim/block_histogram/scripts/task_runner.py create mode 100644 tasks/repository/rocprim/block_radix_rank/config.yaml create mode 100644 tasks/repository/rocprim/block_radix_rank/scripts/task_runner.py create mode 100644 tasks/repository/rocprim/block_run_length_decode/config.yaml create mode 100644 tasks/repository/rocprim/block_run_length_decode/scripts/task_runner.py create mode 100644 
tasks/repository/rocprim/device_binary_search/config.yaml create mode 100644 tasks/repository/rocprim/device_binary_search/scripts/task_runner.py create mode 100644 tasks/repository/rocprim/device_merge_sort/config.yaml create mode 100644 tasks/repository/rocprim/device_merge_sort/scripts/task_runner.py create mode 100644 tasks/repository/rocprim/device_nth_element/config.yaml create mode 100644 tasks/repository/rocprim/device_nth_element/scripts/task_runner.py diff --git a/agents/geak_benchmark/__init__.py b/agents/geak_benchmark/__init__.py new file mode 100644 index 00000000..7f1b3d7a --- /dev/null +++ b/agents/geak_benchmark/__init__.py @@ -0,0 +1,4 @@ +# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved. +from agents.geak_benchmark.launch_agent import launch_agent + +__all__ = ["launch_agent"] diff --git a/agents/geak_benchmark/agent_config.yaml b/agents/geak_benchmark/agent_config.yaml new file mode 100644 index 00000000..ec35bcf7 --- /dev/null +++ b/agents/geak_benchmark/agent_config.yaml @@ -0,0 +1,11 @@ +version: 0 + +# Agent timeout settings +timeout_seconds: 36000 +python_path: python3 + +run: + # Output folder is relative to AIG-Eval root unless absolute. + # cmd can be mini or geak + cmd: mini + configs: '-c geak.yaml --num-parallel=2 --gpu-ids=2,3 --yolo' \ No newline at end of file diff --git a/agents/geak_benchmark/geak.yaml b/agents/geak_benchmark/geak.yaml new file mode 100644 index 00000000..73237743 --- /dev/null +++ b/agents/geak_benchmark/geak.yaml @@ -0,0 +1,31 @@ +agent: + step_limit: 0. + cost_limit: 0. 
def simple_prompt_builder(task_config_dir: str, workspace: str, logger: logging.Logger) -> str:
    """
    Build a simple prompt for the geak_benchmark agent.

    Only essential information from the task config is included: source
    files, target kernel functions, a combined test command, optional custom
    instructions, and the workspace location.

    Args:
        task_config_dir: Path to the task's config.yaml
        workspace: Workspace directory path
        logger: Logger instance

    Returns:
        str: The simplified prompt
    """
    task_config_path = Path(task_config_dir)
    with open(task_config_path, 'r') as f:
        # BUG FIX: `or {}` guards against an empty YAML file (safe_load
        # returns None), matching how launch_agent loads its own config.
        task_config = yaml.safe_load(f) or {}

    prompt_sections = []

    # 1. Task info from config
    source_files = task_config.get('source_file_path', [])
    target_kernels = task_config.get('target_kernel_functions', [])
    compile_cmd = task_config.get('compile_command', [])
    correctness_cmd = task_config.get('correctness_command', [])
    performance_cmd = task_config.get('performance_command', [])

    # Format as list strings
    def format_list(items):
        if isinstance(items, list):
            return '\n'.join(f' - {item}' for item in items)
        return f' - {items}'

    # Build test command: compile_command && correctness_command &&
    # performance_command, de-duplicating identical commands while
    # preserving first-seen order.
    def build_test_command(compile_cmds, correctness_cmds, perf_cmds):
        def normalize(cmds):
            if cmds is None:
                return []
            raw = cmds if isinstance(cmds, list) else [cmds]
            out = []
            for c in raw:
                s = str(c).strip()
                if s:
                    out.append(s)
            return out

        ordered = []
        seen = set()
        for cmd in normalize(compile_cmds) + normalize(correctness_cmds) + normalize(perf_cmds):
            if cmd in seen:
                continue
            seen.add(cmd)
            ordered.append(cmd)
        return " && ".join(ordered)

    test_command = build_test_command(compile_cmd, correctness_cmd, performance_cmd)

    task_info = f"""## Task Info

**Source files:**
{format_list(source_files)}

**Target kernel functions:**
{format_list(target_kernels)}

**Test command:**
 - `{test_command}`
"""
    prompt_sections.append(task_info)

    # 2. Custom instructions from task config (if provided).
    # BUG FIX: a `prompt:` key that is present but null in YAML yields None,
    # so `.get('prompt', {})` would return None and `.get('instructions')`
    # would raise AttributeError; `or {}` handles both missing and null.
    instructions = (task_config.get('prompt') or {}).get('instructions')
    if instructions:
        prompt_sections.append(f"## Instructions\n\n{instructions}")
    else:
        prompt_sections.append("Optimize the kernel in the workspace directory.")

    # 3. Workspace directory info
    workspace_info = f"""
### Workspace Directory
Your working directory is: `{workspace}`
"""
    prompt_sections.append(workspace_info)

    final_prompt = "\n\n".join(prompt_sections)
    logger.info(f"Simple prompt built, length: {len(final_prompt)} characters")

    return final_prompt
def integrate_agent_config(prompt: str, agent_config: dict[str, Any]) -> str:
    """
    Append agent-config-driven directives to the end of a prompt.

    Args:
        prompt: The base prompt string
        agent_config: Agent configuration dictionary

    Returns:
        str: Updated prompt with agent config integrated
    """
    extras = []

    max_iters = agent_config.get("max_iterations")
    # `is not None` (rather than truthiness) so an explicit 0 is still emitted.
    if max_iters is not None:
        extras.append(f"For this optimization, you must iterate up to {max_iters} versions.")

    interpreter = agent_config.get("python_path")
    if interpreter:
        extras.append(f"Use this Python interpreter: `{interpreter}`.")

    for extra in extras:
        prompt = prompt.rstrip() + "\n\n" + extra
    return prompt
def copy_python_bindings(task_config_dir: str, workspace: str, logger: logging.Logger) -> None:
    """
    Copy the task's python_bindings/ directory into the workspace, if present.

    Args:
        task_config_dir: Path to the task's config.yaml
        workspace: Workspace directory path
        logger: Logger instance
    """
    src_dir = Path(task_config_dir).parent / "python_bindings"

    # Tasks without bindings are a normal case: silently do nothing.
    if not (src_dir.exists() and src_dir.is_dir()):
        return

    dst_dir = Path(workspace) / "python_bindings"
    dst_dir.mkdir(parents=True, exist_ok=True)

    for entry in src_dir.iterdir():
        target = dst_dir / entry.name
        if entry.is_dir():
            shutil.copytree(entry, target, dirs_exist_ok=True)
        else:
            shutil.copy2(entry, target)

    logger.info(f"Copied python_bindings from {src_dir} to {dst_dir}")
@register_agent("geak_benchmark")
def launch_agent(eval_config: dict[str, Any], task_config_dir: str, workspace: str) -> str:
    """
    Launch geak_benchmark agent using mini-SWE-agent with real-time output streaming.

    Args:
        eval_config: Evaluator settings passed from main (includes task metadata like task_type)
        task_config_dir: Path to the task configuration used to build the prompt
        workspace: Workspace directory where the agent will run and read/write files

    Returns:
        str: Combined agent output (stdout plus stderr summary if present)
    """
    # Hoisted once so the nested stream helpers can close over these instead
    # of re-importing on every call (the module top-level does not import them).
    import json
    import ast

    # Load agent config
    config_path = Path(__file__).with_name("agent_config.yaml")
    with config_path.open("r") as f:
        agent_config = yaml.safe_load(f) or {}
    logger = logging.getLogger(__name__)

    # Get run configuration
    run_config = agent_config.get("run", {})

    # Get command (mini or geak)
    AGENT = run_config.get("cmd", "mini")

    # Get configs string directly (e.g., '-c geak.yaml --num-parallel=2 --gpu-ids=2,3 --yolo')
    OPTIONS = run_config.get("configs", "")

    # Replace relative config file path with absolute path
    # (e.g., '-c geak.yaml' -> '-c /abs/path/geak.yaml')
    agent_dir = Path(__file__).parent

    def replace_config_path(match):
        config_file = match.group(1)
        abs_path = agent_dir / config_file
        return f"-c {abs_path!s}"

    OPTIONS = re.sub(r'-c\s+(\S+)', replace_config_path, OPTIONS)

    # Check if the command exists
    if not shutil.which(AGENT):
        raise RuntimeError(
            f"Command '{AGENT}' not found. Please ensure it is installed and in your PATH."
        )

    # Convert the workspace path to an absolute path relative to the repo root.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
    workspace = os.path.abspath(os.path.join(project_root, workspace))

    # Setup repo from config if repo_url is present
    repo_path = setup_repo_from_config(task_config_dir, Path(workspace), logger)
    if repo_path:
        logger.info(f"Repository cloned to: {repo_path}")
        # Add --repo argument for parallel mode support
        OPTIONS += f" --repo={shlex.quote(str(repo_path))}"

    # Copy python_bindings to workspace
    copy_python_bindings(task_config_dir, workspace, logger)

    # Build simplified prompt (only instructions + workspace info)
    prompt = simple_prompt_builder(task_config_dir, workspace, logger)
    prompt = integrate_agent_config(prompt, agent_config)

    # Write prompt to a temporary file (mini agent reads from file if path exists)
    prompt_file = Path(workspace) / "task_prompt.md"
    prompt_file.write_text(prompt, encoding="utf-8")
    logger.info(f"Wrote task prompt to: {prompt_file}")

    # Put optimization_logs outside workspace to avoid recursive copying when
    # creating worktrees. Use a sibling directory: <workspace>_logs/
    workspace_path = Path(workspace)
    logs_dir = workspace_path.parent / f"{workspace_path.name}_logs"
    logs_dir.mkdir(parents=True, exist_ok=True)

    # BUG FIX: the --output path was previously interpolated unquoted while
    # every sibling path went through shlex.quote; a workspace path with
    # spaces or shell metacharacters would have broken this shell=True command.
    output_path = shlex.quote(f"{workspace}/output.traj.json")
    cmd = (
        f"{AGENT} {OPTIONS} -t {shlex.quote(str(prompt_file))} "
        f"--output {output_path} --patch-output {shlex.quote(str(logs_dir))}"
    )

    # Debug replay: set GEAK_DEBUG_SCRIPT=1 to save the command to a shell
    # script and skip the live run (replaces the old dead `if False:` toggle;
    # default behavior is unchanged).
    if os.environ.get("GEAK_DEBUG_SCRIPT") == "1":
        write_debug_script(workspace, cmd, AGENT)
        logger.info("Debug script written; skipping live run.")
        return ""

    logger.info(f"Running command: {cmd}")
    logger.info("=" * 80)
    logger.info("Agent Output (streaming):")
    logger.info("=" * 80)

    # Give the agent a hard stop to avoid blocking downstream tasks
    timeout_seconds = int(agent_config.get("timeout_seconds", 3600))

    # Use Popen for real-time output streaming
    process = subprocess.Popen(
        cmd,
        shell=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        cwd=workspace,
        bufsize=1,  # line-buffered so output streams as it is produced
    )

    # Close stdin immediately so the agent cannot block waiting for input.
    if process.stdin:
        process.stdin.close()

    # Collect output while streaming
    stdout_lines = []
    stderr_lines = []

    def format_agent_event(data):
        """Convert cursor stream-json payloads into a readable single-line string."""
        if not isinstance(data, dict):
            return str(data)

        event_type = data.get("type")
        if event_type == "assistant":
            content = data.get("message", {}).get("content", [])
            texts = []
            for part in content:
                if isinstance(part, dict) and part.get("type") == "text":
                    texts.append(part.get("text", ""))
            text = " ".join(t.strip() for t in texts if t and t.strip())
            return f"assistant: {text}" if text else "assistant (no text)"

        if event_type == "thinking":
            text = " ".join((data.get("text") or "").split())
            subtype = data.get("subtype")
            # Returning None tells the caller to fall back to the raw line.
            if not text:
                return None
            return f"thinking[{subtype}] {text}" if subtype else f"thinking {text}"

        if event_type == "tool_call":
            subtype = data.get("subtype")
            call = data.get("tool_call") or {}
            call_name = next(iter(call.keys()), "unknown_tool")
            args = call.get(call_name, {}).get("args", {}) if isinstance(call, dict) else {}
            summary = []
            if isinstance(args, dict):
                if "path" in args:
                    summary.append(f"path={args.get('path')}")
                if "command" in args:
                    summary.append(f"cmd={args.get('command')}")
            details = " ".join(summary)
            return f"tool_call[{subtype}] {call_name} {details}".strip()

        if event_type == "user":
            message = data.get("message", {}).get("content", [])
            texts = []
            for part in message:
                if isinstance(part, dict) and part.get("type") == "text":
                    texts.append(part.get("text", ""))
            text = " ".join(t.strip() for t in texts if t and t.strip())
            if not text:
                return "user (no text)"
            text = " ".join(text.split())
            return f"user: {text[:160]}{'...' if len(text) > 160 else ''}"

        if event_type == "system":
            model = data.get("model")
            cwd = data.get("cwd")
            return f"system init model={model} cwd={cwd}"

        # Fallback: compact json
        return json.dumps(data, ensure_ascii=False, separators=(",", ":"))

    def read_stream(stream, output_list, prefix, log_func):
        """Read from stream in a separate thread to avoid blocking."""
        try:
            for line in iter(stream.readline, ''):
                if not line:
                    break
                raw_line = line.rstrip()

                # Try to parse as JSON (stream-json format); if the event
                # formats to a falsy value we deliberately fall through and
                # log the raw line instead.
                try:
                    data = json.loads(raw_line)
                    formatted = format_agent_event(data)
                    if formatted:
                        output_list.append(formatted)
                        log_func(f"{prefix} {formatted}")
                        continue
                except json.JSONDecodeError:
                    # Some payloads arrive as Python-literal dicts, not JSON.
                    try:
                        data = ast.literal_eval(raw_line)
                        formatted = format_agent_event(data)
                        if formatted:
                            output_list.append(formatted)
                            log_func(f"{prefix} {formatted}")
                            continue
                    except Exception:
                        pass

                if raw_line.strip():
                    output_list.append(raw_line)
                    log_func(f"{prefix} {raw_line}")
        finally:
            stream.close()

    # Create threads to read stdout and stderr concurrently
    stdout_thread = threading.Thread(
        target=read_stream,
        args=(process.stdout, stdout_lines, "[AGENT]", logger.info),
        daemon=True,
    )
    stderr_thread = threading.Thread(
        target=read_stream,
        args=(process.stderr, stderr_lines, "[AGENT STDERR]", logger.warning),
        daemon=True,
    )

    # Start reading threads
    stdout_thread.start()
    stderr_thread.start()

    # Wait for process to complete, enforcing the hard timeout.
    try:
        process.wait(timeout=timeout_seconds)
    except subprocess.TimeoutExpired:
        logger.warning(f"Agent timed out after {timeout_seconds}s; terminating process")
        process.terminate()
        try:
            process.wait(timeout=10)
        except subprocess.TimeoutExpired:
            logger.warning("Force killing agent process")
            process.kill()

    # Wait for output threads to finish reading
    stdout_thread.join(timeout=1)
    stderr_thread.join(timeout=1)

    # Log stderr summary if present
    if stderr_lines:
        logger.warning("=" * 80)
        logger.warning(f"Agent STDERR captured {len(stderr_lines)} lines")
        logger.warning("=" * 80)

    logger.info("=" * 80)
    logger.info(f"Agent completed with exit code: {process.returncode}")
    logger.info("=" * 80)

    # Apply best patch to original workspace so evaluator sees optimized code
    _apply_best_patch_to_workspace(workspace, logs_dir, logger)

    # Return combined output
    output = "\n".join(stdout_lines)
    if stderr_lines:
        output += "\n=== STDERR ===\n" + "\n".join(stderr_lines)

    return output
+ + Args: + workspace: Original workspace directory + logs_dir: Logs directory containing best_results.json and patch files + logger: Logger instance + + Returns: + True if patch was applied successfully, False otherwise + """ + import json + + # Find best_results.json + best_results_file = logs_dir / "best_results.json" + if not best_results_file.exists(): + # Check parallel subdirs + candidates = list(logs_dir.glob("parallel_*/best_results.json")) + if candidates: + candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True) + best_results_file = candidates[0] + else: + logger.warning("No best_results.json found, skipping patch application") + return False + + try: + with open(best_results_file, 'r') as f: + best_results = json.load(f) + + patch_file = best_results.get('best_patch_file') + if not patch_file or not Path(patch_file).exists(): + logger.warning(f"Best patch file not found: {patch_file}") + return False + + logger.info("=" * 80) + logger.info(f"Applying best patch to workspace: {patch_file}") + logger.info("=" * 80) + + # Try git apply first (works if workspace is a git repo) + result = subprocess.run( + ["git", "apply", "--check", str(patch_file)], + cwd=workspace, + capture_output=True, + text=True + ) + + if result.returncode == 0: + # Patch can be applied cleanly with git + result = subprocess.run( + ["git", "apply", str(patch_file)], + cwd=workspace, + capture_output=True, + text=True + ) + if result.returncode == 0: + logger.info(f"Successfully applied patch with git apply") + return True + else: + logger.warning(f"git apply failed: {result.stderr}") + + # Fallback to patch command + result = subprocess.run( + ["patch", "-p1", "--dry-run", "-i", str(patch_file)], + cwd=workspace, + capture_output=True, + text=True + ) + + if result.returncode == 0: + result = subprocess.run( + ["patch", "-p1", "-i", str(patch_file)], + cwd=workspace, + capture_output=True, + text=True + ) + if result.returncode == 0: + logger.info(f"Successfully applied patch 
with patch command") + return True + else: + logger.warning(f"patch command failed: {result.stderr}") + else: + logger.warning(f"Patch dry-run failed: {result.stderr}") + + return False + + except Exception as e: + logger.error(f"Error applying best patch: {e}") + return False diff --git a/src/module_registration.py b/src/module_registration.py index 314ae116..86ec75f7 100755 --- a/src/module_registration.py +++ b/src/module_registration.py @@ -17,6 +17,7 @@ class AgentType(Enum): GEAK_HIP = "geak_hip" OURLLM_KERNEL2KERNEL = "geak_ourllm_kernel2kernel" TASK_VALIDATOR = "task_validator" + GEAK_BENCHMARK = "geak_benchmark" @classmethod def from_string(cls, agent_string: str) -> 'AgentType': @@ -80,6 +81,8 @@ def load_agent_launcher(agent_type: AgentType, logger: logging.Logger) -> Callab from agents.geak_ourllm_kernel2kernel import launch_agent # noqa: F401 elif agent_type == AgentType.TASK_VALIDATOR: from agents.task_validator import launch_agent # noqa: F401 + elif agent_type == AgentType.GEAK_BENCHMARK: + from agents.geak_benchmark import launch_agent # noqa: F401 except ImportError as e: logger.error(f"Failed to import agent {agent_name}: {e}") raise @@ -115,7 +118,7 @@ def load_post_processing_handler(agent_type: AgentType, logger: logging.Logger) from agents.task_validator.validation_postprocessing import validation_post_processing logger.info(f"Using validation_post_processing for agent: {agent_name}") return validation_post_processing - elif agent_type in [AgentType.CURSOR, AgentType.CLAUDE_CODE, AgentType.CODEX, AgentType.SWE_AGENT, AgentType.GEAK_OPTIMAGENTV2, AgentType.GEAK_HIP, AgentType.OPENEVOLVE, AgentType.SINGLE_LLM_CALL, AgentType.OURLLM_KERNEL2KERNEL]: + elif agent_type in [AgentType.CURSOR, AgentType.CLAUDE_CODE, AgentType.CODEX, AgentType.SWE_AGENT, AgentType.GEAK_BENCHMARK, AgentType.GEAK_OPTIMAGENTV2, AgentType.GEAK_HIP, AgentType.OPENEVOLVE, AgentType.SINGLE_LLM_CALL, AgentType.OURLLM_KERNEL2KERNEL]: logger.info(f"Using general_post_processing 
for agent: {agent_name}") return general_post_processing else: diff --git a/src/preprocessing.py b/src/preprocessing.py index 1c224648..39fba554 100755 --- a/src/preprocessing.py +++ b/src/preprocessing.py @@ -2,10 +2,12 @@ # This script will setup environment tools and dependencies. It will also provide duplicated workspace for the agent import os import shutil +import subprocess import logging from pathlib import Path import yaml +from typing import Optional def _resolve_gfx_arch(target_gpu_model: str) -> str | None: @@ -64,10 +66,82 @@ def check_environment() -> None: pass +def _extract_repo_name(repo_url: str) -> str: + """Extract repository name from URL (e.g. 'https://github.com/ROCm/rocPRIM.git' -> 'rocPRIM').""" + # Remove trailing slashes and .git suffix + url = repo_url.rstrip("/") + if url.endswith(".git"): + url = url[:-4] + # Extract last path component + return url.rsplit("/", 1)[-1] + + +def _clone_repo_to_workspace( + repo_url: str, workspace_path: Path, logger: logging.Logger, subdir_name: Optional[str] = None +) -> Path: + """ + Clone repo into a subdirectory under workspace (not tasks/ folder). + + This keeps tasks/ directory clean (only config + scripts) and clones + fresh repo into each workspace. + + Args: + repo_url: Git repository URL + workspace_path: Workspace directory (e.g. workspace_MI308_cursor/block_histogram_20260305_...) 
+ logger: Logger instance + subdir_name: Optional subdirectory name; if None, extracted from repo_url + + Returns: + Path to the cloned repository subdirectory + """ + if subdir_name is None: + subdir_name = _extract_repo_name(repo_url) + + repo_dir = workspace_path / subdir_name + + # Skip if already cloned (shouldn't happen for fresh workspace, but be safe) + if (repo_dir / ".git").exists(): + logger.info(f"Repository already exists at {repo_dir}, skipping clone") + return repo_dir + + logger.info(f"Cloning {repo_url} into {repo_dir}") + try: + subprocess.run( + ["git", "clone", repo_url, str(repo_dir)], + check=True, + capture_output=True, + text=True, + ) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"git clone failed: {(e.stderr or '').strip()}") from e + + return repo_dir + + +def setup_repo_from_config( + task_config_dir: str, workspace_path: Path, logger: logging.Logger +) -> Optional[Path]: + """Return workspace repo path if task has repo_url, else None.""" + with open(task_config_dir, "r") as f: + task_config = yaml.safe_load(f) or {} + repo_url = task_config.get("repo_url") + if not repo_url: + return None + subdir_name = task_config.get("repo_subdir") or _extract_repo_name(repo_url) + repo_dir = workspace_path / subdir_name + return repo_dir if (repo_dir / ".git").exists() else None + + def setup_workspace(task_config_dir: str, workspace_directory: str, timestamp: str, logger: logging.Logger) -> Path: """ Setup workspace for agent execution by duplicating task directory. + For tasks with repo_url: + 1. Copy task files (config.yaml, scripts/, etc.) to workspace + 2. Clone repo into workspace subdirectory (e.g. workspace/rocPRIM/) + + This keeps tasks/ directory clean and gives each run a fresh repo clone. 
+ Args: task_config_dir: Path to task's config.yaml workspace_directory: Base workspace directory @@ -82,6 +156,10 @@ def setup_workspace(task_config_dir: str, workspace_directory: str, timestamp: s task_folder = task_config_path.parent task_folder_name = task_folder.name + # Load task config + with open(task_config_path, "r") as f: + task_config = yaml.safe_load(f) or {} + # 2. Create new directory with timestamp suffix under workspace_dir new_folder_name = f"{task_folder_name}_{timestamp}" workspace_path = Path(workspace_directory) / new_folder_name @@ -89,8 +167,15 @@ def setup_workspace(task_config_dir: str, workspace_directory: str, timestamp: s logger.info(f"Created workspace directory: {workspace_path}") - # 3. Duplicate all content under task_folder to the new workspace folder + # 3. Copy task folder content to workspace (excluding any previously cloned repos) + repo_subdir = None + if repo_url := task_config.get("repo_url"): + repo_subdir = task_config.get("repo_subdir") or _extract_repo_name(repo_url) + for item in task_folder.iterdir(): + # Skip repo subdirectory if it exists in task folder (legacy cleanup) + if repo_subdir and item.name == repo_subdir: + continue src = item dst = workspace_path / item.name if item.is_dir(): @@ -100,4 +185,8 @@ def setup_workspace(task_config_dir: str, workspace_directory: str, timestamp: s logger.info(f"Copied task folder content from {task_folder} to {workspace_path}") + # 4. 
Clone repo into workspace subdirectory (not tasks/ folder) + if repo_url: + _clone_repo_to_workspace(repo_url, workspace_path, logger, repo_subdir) + return workspace_path diff --git a/tasks/repository/rocprim/block_histogram/config.yaml b/tasks/repository/rocprim/block_histogram/config.yaml new file mode 100644 index 00000000..97e32dec --- /dev/null +++ b/tasks/repository/rocprim/block_histogram/config.yaml @@ -0,0 +1,10 @@ +repo_url: https://github.com/ROCm/rocPRIM.git +compile_command: + - python3 scripts/task_runner.py compile +correctness_command: + - python3 scripts/task_runner.py correctness +performance_command: + - python3 scripts/task_runner.py performance +prompt: + cheatsheet: null + instructions: "Optimize block_histogram" \ No newline at end of file diff --git a/tasks/repository/rocprim/block_histogram/scripts/task_runner.py b/tasks/repository/rocprim/block_histogram/scripts/task_runner.py new file mode 100644 index 00000000..0786a125 --- /dev/null +++ b/tasks/repository/rocprim/block_histogram/scripts/task_runner.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved. +""" +Task runner for repository/rocprim/block_histogram. + +This script provides a stable interface for AgentKernelArena's evaluator: + - `compile` : configure & build rocPRIM benchmark/test targets + - `correctness` : run `test_block_histogram` + - `performance` : run `benchmark_block_histogram` and emit `build/performance_report.json` + +All reports are written under `/build/` so the centralized evaluator can parse them. 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import Optional, Tuple + + +TASK_NAME = "repository/rocprim/block_histogram" +BENCH_TARGET = "benchmark_block_histogram" +TEST_TARGET = "test_block_histogram" +REPO_SUBDIR = "rocPRIM" # Cloned repo lives under workspace/rocPRIM/ + + +def _workspace_root() -> Path: + # scripts/task_runner.py -> scripts/ -> workspace root + return Path(__file__).resolve().parents[1] + + +def _source_root(workspace: Path) -> Path: + """CMake source directory (cloned rocPRIM repo).""" + return workspace / REPO_SUBDIR + + +def _cmake_build_root(workspace: Path) -> Path: + """CMake build root inside the cloned repo (workspace/rocPRIM/build/).""" + return _source_root(workspace) / "build" + + +def _cmake_build_dir(workspace: Path) -> Path: + """CMake build directory (workspace/rocPRIM/build/Release/).""" + return _cmake_build_root(workspace) / "Release" + + +def _report_root(workspace: Path) -> Path: + """Report directory for evaluator (workspace/build/). Separate from CMake build.""" + return workspace / "build" + + +def _detect_arch() -> Optional[str]: + # Main framework sets PYTORCH_ROCM_ARCH from target_gpu_model; reuse it for rocPRIM CMake. 
+ arch = os.environ.get("AMDGPU_TARGETS") or os.environ.get("PYTORCH_ROCM_ARCH") + if not arch: + return None + return arch.strip() or None + + +def _run(cmd: list[str], cwd: Path, timeout_s: int, env: dict[str, str]) -> Tuple[bool, str]: + try: + proc = subprocess.run( + cmd, + cwd=str(cwd), + env=env, + capture_output=True, + text=True, + timeout=timeout_s, + ) + out = (proc.stdout or "") + (proc.stderr or "") + return proc.returncode == 0, out + except subprocess.TimeoutExpired as e: + out = (getattr(e, "stdout", "") or "") + (getattr(e, "stderr", "") or "") + return False, f"TIMEOUT after {timeout_s}s\n{out}" + except Exception as e: + return False, str(e) + + +def _ensure_configured(source_dir: Path, build_dir: Path) -> Tuple[bool, Optional[str]]: + """ + Run CMake configure. + + Args: + source_dir: CMake source directory (cloned repo, e.g. workspace/rocPRIM/) + build_dir: CMake build directory (e.g. workspace/build/Release/) + """ + build_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmake_args = [ + "cmake", + "-S", + str(source_dir), + "-B", + str(build_dir), + "-DCMAKE_BUILD_TYPE=Release", + "-DBUILD_BENCHMARK=ON", + "-DBUILD_TEST=ON", + ] + + arch = _detect_arch() + if arch: + cmake_args.append(f"-DAMDGPU_TARGETS={arch}") + + ok, out = _run(cmake_args, cwd=source_dir, timeout_s=600, env=env) + if not ok: + return False, f"CMake configure failed.\nCommand: {' '.join(cmake_args)}\nOutput:\n{out}" + return True, None + + +def _cmake_build(source_dir: Path, build_dir: Path, target: str) -> Tuple[bool, Optional[str]]: + """ + Run CMake build. 
+ + Args: + source_dir: CMake source directory (for cwd) + build_dir: CMake build directory + target: Build target name + """ + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmd = ["cmake", "--build", str(build_dir), "--target", target, "-j"] + ok, out = _run(cmd, cwd=source_dir, timeout_s=1800, env=env) + if not ok: + return False, f"Build failed for target '{target}'.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + return True, None + + +def _maybe_build_target( + source_dir: Path, + build_dir: Path, + target: str, + binary_path: Path, +) -> Tuple[bool, Optional[str]]: + """ + Avoid redundant builds when the binary already exists. + + Arena runs compile -> correctness -> performance sequentially, so correctness/perf + should not rebuild unless the required binary is missing. + """ + if binary_path.is_file(): + return True, None + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + return _cmake_build(source_dir, build_dir, target) + + +def _test_binary_path(build_dir: Path) -> Path: + return build_dir / "test" / "rocprim" / TEST_TARGET + + +def _bench_binary_path(build_dir: Path) -> Path: + return build_dir / "benchmark" / BENCH_TARGET + + + + +def _parse_time_ms(output: str) -> Optional[float]: + # Try to find a reasonable "average/mean" latency number in common units. 
+ patterns = [ + r"avg(?:erage)?(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"mean(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"median(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"Perf(?:ormance)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"([\d.]+)\s*(ns|us|ms|s)\s*/\s*(?:trial|iter(?:ation)?|launch)\b", + ] + unit_mul = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1000.0} + for pat in patterns: + m = re.search(pat, output, re.IGNORECASE) + if not m: + continue + val = float(m.group(1)) + unit = m.group(2).lower() + if unit in unit_mul: + return val * unit_mul[unit] + return None + + +def run_compile(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + if not source_dir.is_dir(): + return False, f"Source directory not found: {source_dir}. Repo may not have been cloned." + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + + # Build both correctness and benchmark targets during compile phase. + ok, err = _cmake_build(source_dir, build_dir, TEST_TARGET) + if not ok: + return False, err + ok, err = _cmake_build(source_dir, build_dir, BENCH_TARGET) + if not ok: + return False, err + + # Sanity-check binaries exist. 
+ test_bin = _test_binary_path(build_dir) + bench_bin = _bench_binary_path(build_dir) + if not test_bin.is_file(): + return False, f"Test binary not found: {test_bin}" + if not bench_bin.is_file(): + return False, f"Benchmark binary not found: {bench_bin}" + + return True, None + + +def run_correctness(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + test_bin = _test_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, TEST_TARGET, test_bin) + if not ok: + return False, err + if not test_bin.is_file(): + return False, f"Test binary not found after build attempt: {test_bin}" + + env = os.environ.copy() + ok, out = _run([str(test_bin)], cwd=workspace, timeout_s=1800, env=env) + if not ok: + return False, f"Correctness test failed.\nCommand: {test_bin}\nOutput:\n{out}" + return True, None + + +def run_performance(workspace: Path, trials: int) -> Tuple[float, str]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + bench_bin = _bench_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, BENCH_TARGET, bench_bin) + if not ok: + return -1.0, err or "build failed" + if not bench_bin.is_file(): + return -1.0, f"Benchmark binary not found after build attempt: {bench_bin}" + + env = os.environ.copy() + cmd = [str(bench_bin), "--trials", str(trials)] + t0 = time.perf_counter() + ok, out = _run(cmd, cwd=workspace, timeout_s=3600, env=env) + elapsed_ms_total = (time.perf_counter() - t0) * 1000.0 + + if not ok: + return -1.0, f"Benchmark failed.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + + parsed_ms = _parse_time_ms(out) + if parsed_ms is not None and parsed_ms > 0: + return float(parsed_ms), "" + + # Fallback: approximate average per trial from wall-clock runtime. 
+ if trials > 0: + return float(elapsed_ms_total / trials), "" + return float(elapsed_ms_total), "" + + +def main() -> None: + workspace = _workspace_root() + os.chdir(workspace) + report_root = _report_root(workspace) + report_root.mkdir(parents=True, exist_ok=True) + + parser = argparse.ArgumentParser(description=f"Task runner for {TASK_NAME}") + parser.add_argument("mode", choices=["compile", "correctness", "performance"]) + parser.add_argument("--trials", type=int, default=20) + args = parser.parse_args() + + if args.mode == "compile": + ok, err = run_compile(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + "arch": _detect_arch(), + "source_dir": str(_source_root(workspace)), + "build_dir": str(_cmake_build_dir(workspace)), + } + (report_root / "compile_report.json").write_text(json.dumps(report, indent=2)) + print(f"Compilation: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "correctness": + ok, err = run_correctness(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + } + (report_root / "correctness_report.json").write_text(json.dumps(report, indent=2)) + print(f"Correctness: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "performance": + exec_ms, err = run_performance(workspace, trials=args.trials) + report = [ + { + "test_case_id": "test_case_0", + "execution_time_ms": exec_ms, + "params": {"trials": args.trials}, + } + ] + (report_root / "performance_report.json").write_text(json.dumps(report, indent=2)) + # Also print a recognizable line for stdout parsing fallback. 
+        print(f"Performance: {exec_ms:.4f} ms")
+        if err:
+            print(err)
+        sys.exit(0 if exec_ms != -1.0 else 1)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/tasks/repository/rocprim/block_radix_rank/config.yaml b/tasks/repository/rocprim/block_radix_rank/config.yaml
new file mode 100644
index 00000000..97e32dec
--- /dev/null
+++ b/tasks/repository/rocprim/block_radix_rank/config.yaml
@@ -0,0 +1,10 @@
+repo_url: https://github.com/ROCm/rocPRIM.git
+compile_command:
+  - python3 scripts/task_runner.py compile
+correctness_command:
+  - python3 scripts/task_runner.py correctness
+performance_command:
+  - python3 scripts/task_runner.py performance
+prompt:
+  cheatsheet: null
+  instructions: "Optimize block_radix_rank"
\ No newline at end of file
diff --git a/tasks/repository/rocprim/block_radix_rank/scripts/task_runner.py b/tasks/repository/rocprim/block_radix_rank/scripts/task_runner.py
new file mode 100644
index 00000000..0786a125
--- /dev/null
+++ b/tasks/repository/rocprim/block_radix_rank/scripts/task_runner.py
@@ -0,0 +1,326 @@
+#!/usr/bin/env python3
+# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved.
+"""
+Task runner for repository/rocprim/block_radix_rank.
+
+This script provides a stable interface for AgentKernelArena's evaluator:
+  - `compile` : configure & build rocPRIM benchmark/test targets
+  - `correctness` : run `test_block_radix_rank`
+  - `performance` : run `benchmark_block_radix_rank` and emit `build/performance_report.json`
+
+All reports are written under `/build/` so the centralized evaluator can parse them.
+""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import Optional, Tuple + + +TASK_NAME = "repository/rocprim/block_histogram" +BENCH_TARGET = "benchmark_block_histogram" +TEST_TARGET = "test_block_histogram" +REPO_SUBDIR = "rocPRIM" # Cloned repo lives under workspace/rocPRIM/ + + +def _workspace_root() -> Path: + # scripts/task_runner.py -> scripts/ -> workspace root + return Path(__file__).resolve().parents[1] + + +def _source_root(workspace: Path) -> Path: + """CMake source directory (cloned rocPRIM repo).""" + return workspace / REPO_SUBDIR + + +def _cmake_build_root(workspace: Path) -> Path: + """CMake build root inside the cloned repo (workspace/rocPRIM/build/).""" + return _source_root(workspace) / "build" + + +def _cmake_build_dir(workspace: Path) -> Path: + """CMake build directory (workspace/rocPRIM/build/Release/).""" + return _cmake_build_root(workspace) / "Release" + + +def _report_root(workspace: Path) -> Path: + """Report directory for evaluator (workspace/build/). Separate from CMake build.""" + return workspace / "build" + + +def _detect_arch() -> Optional[str]: + # Main framework sets PYTORCH_ROCM_ARCH from target_gpu_model; reuse it for rocPRIM CMake. 
+ arch = os.environ.get("AMDGPU_TARGETS") or os.environ.get("PYTORCH_ROCM_ARCH") + if not arch: + return None + return arch.strip() or None + + +def _run(cmd: list[str], cwd: Path, timeout_s: int, env: dict[str, str]) -> Tuple[bool, str]: + try: + proc = subprocess.run( + cmd, + cwd=str(cwd), + env=env, + capture_output=True, + text=True, + timeout=timeout_s, + ) + out = (proc.stdout or "") + (proc.stderr or "") + return proc.returncode == 0, out + except subprocess.TimeoutExpired as e: + out = (getattr(e, "stdout", "") or "") + (getattr(e, "stderr", "") or "") + return False, f"TIMEOUT after {timeout_s}s\n{out}" + except Exception as e: + return False, str(e) + + +def _ensure_configured(source_dir: Path, build_dir: Path) -> Tuple[bool, Optional[str]]: + """ + Run CMake configure. + + Args: + source_dir: CMake source directory (cloned repo, e.g. workspace/rocPRIM/) + build_dir: CMake build directory (e.g. workspace/build/Release/) + """ + build_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmake_args = [ + "cmake", + "-S", + str(source_dir), + "-B", + str(build_dir), + "-DCMAKE_BUILD_TYPE=Release", + "-DBUILD_BENCHMARK=ON", + "-DBUILD_TEST=ON", + ] + + arch = _detect_arch() + if arch: + cmake_args.append(f"-DAMDGPU_TARGETS={arch}") + + ok, out = _run(cmake_args, cwd=source_dir, timeout_s=600, env=env) + if not ok: + return False, f"CMake configure failed.\nCommand: {' '.join(cmake_args)}\nOutput:\n{out}" + return True, None + + +def _cmake_build(source_dir: Path, build_dir: Path, target: str) -> Tuple[bool, Optional[str]]: + """ + Run CMake build. 
+ + Args: + source_dir: CMake source directory (for cwd) + build_dir: CMake build directory + target: Build target name + """ + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmd = ["cmake", "--build", str(build_dir), "--target", target, "-j"] + ok, out = _run(cmd, cwd=source_dir, timeout_s=1800, env=env) + if not ok: + return False, f"Build failed for target '{target}'.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + return True, None + + +def _maybe_build_target( + source_dir: Path, + build_dir: Path, + target: str, + binary_path: Path, +) -> Tuple[bool, Optional[str]]: + """ + Avoid redundant builds when the binary already exists. + + Arena runs compile -> correctness -> performance sequentially, so correctness/perf + should not rebuild unless the required binary is missing. + """ + if binary_path.is_file(): + return True, None + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + return _cmake_build(source_dir, build_dir, target) + + +def _test_binary_path(build_dir: Path) -> Path: + return build_dir / "test" / "rocprim" / TEST_TARGET + + +def _bench_binary_path(build_dir: Path) -> Path: + return build_dir / "benchmark" / BENCH_TARGET + + + + +def _parse_time_ms(output: str) -> Optional[float]: + # Try to find a reasonable "average/mean" latency number in common units. 
+ patterns = [ + r"avg(?:erage)?(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"mean(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"median(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"Perf(?:ormance)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"([\d.]+)\s*(ns|us|ms|s)\s*/\s*(?:trial|iter(?:ation)?|launch)\b", + ] + unit_mul = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1000.0} + for pat in patterns: + m = re.search(pat, output, re.IGNORECASE) + if not m: + continue + val = float(m.group(1)) + unit = m.group(2).lower() + if unit in unit_mul: + return val * unit_mul[unit] + return None + + +def run_compile(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + if not source_dir.is_dir(): + return False, f"Source directory not found: {source_dir}. Repo may not have been cloned." + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + + # Build both correctness and benchmark targets during compile phase. + ok, err = _cmake_build(source_dir, build_dir, TEST_TARGET) + if not ok: + return False, err + ok, err = _cmake_build(source_dir, build_dir, BENCH_TARGET) + if not ok: + return False, err + + # Sanity-check binaries exist. 
+ test_bin = _test_binary_path(build_dir) + bench_bin = _bench_binary_path(build_dir) + if not test_bin.is_file(): + return False, f"Test binary not found: {test_bin}" + if not bench_bin.is_file(): + return False, f"Benchmark binary not found: {bench_bin}" + + return True, None + + +def run_correctness(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + test_bin = _test_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, TEST_TARGET, test_bin) + if not ok: + return False, err + if not test_bin.is_file(): + return False, f"Test binary not found after build attempt: {test_bin}" + + env = os.environ.copy() + ok, out = _run([str(test_bin)], cwd=workspace, timeout_s=1800, env=env) + if not ok: + return False, f"Correctness test failed.\nCommand: {test_bin}\nOutput:\n{out}" + return True, None + + +def run_performance(workspace: Path, trials: int) -> Tuple[float, str]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + bench_bin = _bench_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, BENCH_TARGET, bench_bin) + if not ok: + return -1.0, err or "build failed" + if not bench_bin.is_file(): + return -1.0, f"Benchmark binary not found after build attempt: {bench_bin}" + + env = os.environ.copy() + cmd = [str(bench_bin), "--trials", str(trials)] + t0 = time.perf_counter() + ok, out = _run(cmd, cwd=workspace, timeout_s=3600, env=env) + elapsed_ms_total = (time.perf_counter() - t0) * 1000.0 + + if not ok: + return -1.0, f"Benchmark failed.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + + parsed_ms = _parse_time_ms(out) + if parsed_ms is not None and parsed_ms > 0: + return float(parsed_ms), "" + + # Fallback: approximate average per trial from wall-clock runtime. 
+ if trials > 0: + return float(elapsed_ms_total / trials), "" + return float(elapsed_ms_total), "" + + +def main() -> None: + workspace = _workspace_root() + os.chdir(workspace) + report_root = _report_root(workspace) + report_root.mkdir(parents=True, exist_ok=True) + + parser = argparse.ArgumentParser(description=f"Task runner for {TASK_NAME}") + parser.add_argument("mode", choices=["compile", "correctness", "performance"]) + parser.add_argument("--trials", type=int, default=20) + args = parser.parse_args() + + if args.mode == "compile": + ok, err = run_compile(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + "arch": _detect_arch(), + "source_dir": str(_source_root(workspace)), + "build_dir": str(_cmake_build_dir(workspace)), + } + (report_root / "compile_report.json").write_text(json.dumps(report, indent=2)) + print(f"Compilation: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "correctness": + ok, err = run_correctness(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + } + (report_root / "correctness_report.json").write_text(json.dumps(report, indent=2)) + print(f"Correctness: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "performance": + exec_ms, err = run_performance(workspace, trials=args.trials) + report = [ + { + "test_case_id": "test_case_0", + "execution_time_ms": exec_ms, + "params": {"trials": args.trials}, + } + ] + (report_root / "performance_report.json").write_text(json.dumps(report, indent=2)) + # Also print a recognizable line for stdout parsing fallback. 
+        print(f"Performance: {exec_ms:.4f} ms")
+        if err:
+            print(err)
+        sys.exit(0 if exec_ms != -1.0 else 1)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/tasks/repository/rocprim/block_run_length_decode/config.yaml b/tasks/repository/rocprim/block_run_length_decode/config.yaml
new file mode 100644
index 00000000..97e32dec
--- /dev/null
+++ b/tasks/repository/rocprim/block_run_length_decode/config.yaml
@@ -0,0 +1,10 @@
+repo_url: https://github.com/ROCm/rocPRIM.git
+compile_command:
+  - python3 scripts/task_runner.py compile
+correctness_command:
+  - python3 scripts/task_runner.py correctness
+performance_command:
+  - python3 scripts/task_runner.py performance
+prompt:
+  cheatsheet: null
+  instructions: "Optimize block_run_length_decode"
\ No newline at end of file
diff --git a/tasks/repository/rocprim/block_run_length_decode/scripts/task_runner.py b/tasks/repository/rocprim/block_run_length_decode/scripts/task_runner.py
new file mode 100644
index 00000000..0786a125
--- /dev/null
+++ b/tasks/repository/rocprim/block_run_length_decode/scripts/task_runner.py
@@ -0,0 +1,326 @@
+#!/usr/bin/env python3
+# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved.
+"""
+Task runner for repository/rocprim/block_run_length_decode.
+
+This script provides a stable interface for AgentKernelArena's evaluator:
+  - `compile` : configure & build rocPRIM benchmark/test targets
+  - `correctness` : run `test_block_run_length_decode`
+  - `performance` : run `benchmark_block_run_length_decode` and emit `build/performance_report.json`
+
+All reports are written under `/build/` so the centralized evaluator can parse them.
+""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import Optional, Tuple + + +TASK_NAME = "repository/rocprim/block_histogram" +BENCH_TARGET = "benchmark_block_histogram" +TEST_TARGET = "test_block_histogram" +REPO_SUBDIR = "rocPRIM" # Cloned repo lives under workspace/rocPRIM/ + + +def _workspace_root() -> Path: + # scripts/task_runner.py -> scripts/ -> workspace root + return Path(__file__).resolve().parents[1] + + +def _source_root(workspace: Path) -> Path: + """CMake source directory (cloned rocPRIM repo).""" + return workspace / REPO_SUBDIR + + +def _cmake_build_root(workspace: Path) -> Path: + """CMake build root inside the cloned repo (workspace/rocPRIM/build/).""" + return _source_root(workspace) / "build" + + +def _cmake_build_dir(workspace: Path) -> Path: + """CMake build directory (workspace/rocPRIM/build/Release/).""" + return _cmake_build_root(workspace) / "Release" + + +def _report_root(workspace: Path) -> Path: + """Report directory for evaluator (workspace/build/). Separate from CMake build.""" + return workspace / "build" + + +def _detect_arch() -> Optional[str]: + # Main framework sets PYTORCH_ROCM_ARCH from target_gpu_model; reuse it for rocPRIM CMake. 
+ arch = os.environ.get("AMDGPU_TARGETS") or os.environ.get("PYTORCH_ROCM_ARCH") + if not arch: + return None + return arch.strip() or None + + +def _run(cmd: list[str], cwd: Path, timeout_s: int, env: dict[str, str]) -> Tuple[bool, str]: + try: + proc = subprocess.run( + cmd, + cwd=str(cwd), + env=env, + capture_output=True, + text=True, + timeout=timeout_s, + ) + out = (proc.stdout or "") + (proc.stderr or "") + return proc.returncode == 0, out + except subprocess.TimeoutExpired as e: + out = (getattr(e, "stdout", "") or "") + (getattr(e, "stderr", "") or "") + return False, f"TIMEOUT after {timeout_s}s\n{out}" + except Exception as e: + return False, str(e) + + +def _ensure_configured(source_dir: Path, build_dir: Path) -> Tuple[bool, Optional[str]]: + """ + Run CMake configure. + + Args: + source_dir: CMake source directory (cloned repo, e.g. workspace/rocPRIM/) + build_dir: CMake build directory (e.g. workspace/build/Release/) + """ + build_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmake_args = [ + "cmake", + "-S", + str(source_dir), + "-B", + str(build_dir), + "-DCMAKE_BUILD_TYPE=Release", + "-DBUILD_BENCHMARK=ON", + "-DBUILD_TEST=ON", + ] + + arch = _detect_arch() + if arch: + cmake_args.append(f"-DAMDGPU_TARGETS={arch}") + + ok, out = _run(cmake_args, cwd=source_dir, timeout_s=600, env=env) + if not ok: + return False, f"CMake configure failed.\nCommand: {' '.join(cmake_args)}\nOutput:\n{out}" + return True, None + + +def _cmake_build(source_dir: Path, build_dir: Path, target: str) -> Tuple[bool, Optional[str]]: + """ + Run CMake build. 
+ + Args: + source_dir: CMake source directory (for cwd) + build_dir: CMake build directory + target: Build target name + """ + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmd = ["cmake", "--build", str(build_dir), "--target", target, "-j"] + ok, out = _run(cmd, cwd=source_dir, timeout_s=1800, env=env) + if not ok: + return False, f"Build failed for target '{target}'.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + return True, None + + +def _maybe_build_target( + source_dir: Path, + build_dir: Path, + target: str, + binary_path: Path, +) -> Tuple[bool, Optional[str]]: + """ + Avoid redundant builds when the binary already exists. + + Arena runs compile -> correctness -> performance sequentially, so correctness/perf + should not rebuild unless the required binary is missing. + """ + if binary_path.is_file(): + return True, None + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + return _cmake_build(source_dir, build_dir, target) + + +def _test_binary_path(build_dir: Path) -> Path: + return build_dir / "test" / "rocprim" / TEST_TARGET + + +def _bench_binary_path(build_dir: Path) -> Path: + return build_dir / "benchmark" / BENCH_TARGET + + + + +def _parse_time_ms(output: str) -> Optional[float]: + # Try to find a reasonable "average/mean" latency number in common units. 
+ patterns = [ + r"avg(?:erage)?(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"mean(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"median(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"Perf(?:ormance)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"([\d.]+)\s*(ns|us|ms|s)\s*/\s*(?:trial|iter(?:ation)?|launch)\b", + ] + unit_mul = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1000.0} + for pat in patterns: + m = re.search(pat, output, re.IGNORECASE) + if not m: + continue + val = float(m.group(1)) + unit = m.group(2).lower() + if unit in unit_mul: + return val * unit_mul[unit] + return None + + +def run_compile(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + if not source_dir.is_dir(): + return False, f"Source directory not found: {source_dir}. Repo may not have been cloned." + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + + # Build both correctness and benchmark targets during compile phase. + ok, err = _cmake_build(source_dir, build_dir, TEST_TARGET) + if not ok: + return False, err + ok, err = _cmake_build(source_dir, build_dir, BENCH_TARGET) + if not ok: + return False, err + + # Sanity-check binaries exist. 
+ test_bin = _test_binary_path(build_dir) + bench_bin = _bench_binary_path(build_dir) + if not test_bin.is_file(): + return False, f"Test binary not found: {test_bin}" + if not bench_bin.is_file(): + return False, f"Benchmark binary not found: {bench_bin}" + + return True, None + + +def run_correctness(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + test_bin = _test_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, TEST_TARGET, test_bin) + if not ok: + return False, err + if not test_bin.is_file(): + return False, f"Test binary not found after build attempt: {test_bin}" + + env = os.environ.copy() + ok, out = _run([str(test_bin)], cwd=workspace, timeout_s=1800, env=env) + if not ok: + return False, f"Correctness test failed.\nCommand: {test_bin}\nOutput:\n{out}" + return True, None + + +def run_performance(workspace: Path, trials: int) -> Tuple[float, str]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + bench_bin = _bench_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, BENCH_TARGET, bench_bin) + if not ok: + return -1.0, err or "build failed" + if not bench_bin.is_file(): + return -1.0, f"Benchmark binary not found after build attempt: {bench_bin}" + + env = os.environ.copy() + cmd = [str(bench_bin), "--trials", str(trials)] + t0 = time.perf_counter() + ok, out = _run(cmd, cwd=workspace, timeout_s=3600, env=env) + elapsed_ms_total = (time.perf_counter() - t0) * 1000.0 + + if not ok: + return -1.0, f"Benchmark failed.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + + parsed_ms = _parse_time_ms(out) + if parsed_ms is not None and parsed_ms > 0: + return float(parsed_ms), "" + + # Fallback: approximate average per trial from wall-clock runtime. 
+ if trials > 0: + return float(elapsed_ms_total / trials), "" + return float(elapsed_ms_total), "" + + +def main() -> None: + workspace = _workspace_root() + os.chdir(workspace) + report_root = _report_root(workspace) + report_root.mkdir(parents=True, exist_ok=True) + + parser = argparse.ArgumentParser(description=f"Task runner for {TASK_NAME}") + parser.add_argument("mode", choices=["compile", "correctness", "performance"]) + parser.add_argument("--trials", type=int, default=20) + args = parser.parse_args() + + if args.mode == "compile": + ok, err = run_compile(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + "arch": _detect_arch(), + "source_dir": str(_source_root(workspace)), + "build_dir": str(_cmake_build_dir(workspace)), + } + (report_root / "compile_report.json").write_text(json.dumps(report, indent=2)) + print(f"Compilation: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "correctness": + ok, err = run_correctness(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + } + (report_root / "correctness_report.json").write_text(json.dumps(report, indent=2)) + print(f"Correctness: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "performance": + exec_ms, err = run_performance(workspace, trials=args.trials) + report = [ + { + "test_case_id": "test_case_0", + "execution_time_ms": exec_ms, + "params": {"trials": args.trials}, + } + ] + (report_root / "performance_report.json").write_text(json.dumps(report, indent=2)) + # Also print a recognizable line for stdout parsing fallback. 
+        print(f"Performance: {exec_ms:.4f} ms")
+        if err:
+            print(err)
+        sys.exit(0 if exec_ms != -1.0 else 1)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/tasks/repository/rocprim/device_binary_search/config.yaml b/tasks/repository/rocprim/device_binary_search/config.yaml
new file mode 100644
index 00000000..97e32dec
--- /dev/null
+++ b/tasks/repository/rocprim/device_binary_search/config.yaml
@@ -0,0 +1,10 @@
+repo_url: https://github.com/ROCm/rocPRIM.git
+compile_command:
+  - python3 scripts/task_runner.py compile
+correctness_command:
+  - python3 scripts/task_runner.py correctness
+performance_command:
+  - python3 scripts/task_runner.py performance
+prompt:
+  cheatsheet: null
+  instructions: "Optimize device_binary_search"
\ No newline at end of file
diff --git a/tasks/repository/rocprim/device_binary_search/scripts/task_runner.py b/tasks/repository/rocprim/device_binary_search/scripts/task_runner.py
new file mode 100644
index 00000000..0786a125
--- /dev/null
+++ b/tasks/repository/rocprim/device_binary_search/scripts/task_runner.py
@@ -0,0 +1,326 @@
+#!/usr/bin/env python3
+# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved.
+"""
+Task runner for repository/rocprim/device_binary_search.
+
+This script provides a stable interface for AgentKernelArena's evaluator:
+  - `compile` : configure & build rocPRIM benchmark/test targets
+  - `correctness` : run `test_device_binary_search`
+  - `performance` : run `benchmark_device_binary_search` and emit `build/performance_report.json`
+
+All reports are written under `/build/` so the centralized evaluator can parse them.
+""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import Optional, Tuple + + +TASK_NAME = "repository/rocprim/block_histogram" +BENCH_TARGET = "benchmark_block_histogram" +TEST_TARGET = "test_block_histogram" +REPO_SUBDIR = "rocPRIM" # Cloned repo lives under workspace/rocPRIM/ + + +def _workspace_root() -> Path: + # scripts/task_runner.py -> scripts/ -> workspace root + return Path(__file__).resolve().parents[1] + + +def _source_root(workspace: Path) -> Path: + """CMake source directory (cloned rocPRIM repo).""" + return workspace / REPO_SUBDIR + + +def _cmake_build_root(workspace: Path) -> Path: + """CMake build root inside the cloned repo (workspace/rocPRIM/build/).""" + return _source_root(workspace) / "build" + + +def _cmake_build_dir(workspace: Path) -> Path: + """CMake build directory (workspace/rocPRIM/build/Release/).""" + return _cmake_build_root(workspace) / "Release" + + +def _report_root(workspace: Path) -> Path: + """Report directory for evaluator (workspace/build/). Separate from CMake build.""" + return workspace / "build" + + +def _detect_arch() -> Optional[str]: + # Main framework sets PYTORCH_ROCM_ARCH from target_gpu_model; reuse it for rocPRIM CMake. 
+ arch = os.environ.get("AMDGPU_TARGETS") or os.environ.get("PYTORCH_ROCM_ARCH") + if not arch: + return None + return arch.strip() or None + + +def _run(cmd: list[str], cwd: Path, timeout_s: int, env: dict[str, str]) -> Tuple[bool, str]: + try: + proc = subprocess.run( + cmd, + cwd=str(cwd), + env=env, + capture_output=True, + text=True, + timeout=timeout_s, + ) + out = (proc.stdout or "") + (proc.stderr or "") + return proc.returncode == 0, out + except subprocess.TimeoutExpired as e: + out = (getattr(e, "stdout", "") or "") + (getattr(e, "stderr", "") or "") + return False, f"TIMEOUT after {timeout_s}s\n{out}" + except Exception as e: + return False, str(e) + + +def _ensure_configured(source_dir: Path, build_dir: Path) -> Tuple[bool, Optional[str]]: + """ + Run CMake configure. + + Args: + source_dir: CMake source directory (cloned repo, e.g. workspace/rocPRIM/) + build_dir: CMake build directory (e.g. workspace/build/Release/) + """ + build_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmake_args = [ + "cmake", + "-S", + str(source_dir), + "-B", + str(build_dir), + "-DCMAKE_BUILD_TYPE=Release", + "-DBUILD_BENCHMARK=ON", + "-DBUILD_TEST=ON", + ] + + arch = _detect_arch() + if arch: + cmake_args.append(f"-DAMDGPU_TARGETS={arch}") + + ok, out = _run(cmake_args, cwd=source_dir, timeout_s=600, env=env) + if not ok: + return False, f"CMake configure failed.\nCommand: {' '.join(cmake_args)}\nOutput:\n{out}" + return True, None + + +def _cmake_build(source_dir: Path, build_dir: Path, target: str) -> Tuple[bool, Optional[str]]: + """ + Run CMake build. 
+ + Args: + source_dir: CMake source directory (for cwd) + build_dir: CMake build directory + target: Build target name + """ + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmd = ["cmake", "--build", str(build_dir), "--target", target, "-j"] + ok, out = _run(cmd, cwd=source_dir, timeout_s=1800, env=env) + if not ok: + return False, f"Build failed for target '{target}'.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + return True, None + + +def _maybe_build_target( + source_dir: Path, + build_dir: Path, + target: str, + binary_path: Path, +) -> Tuple[bool, Optional[str]]: + """ + Avoid redundant builds when the binary already exists. + + Arena runs compile -> correctness -> performance sequentially, so correctness/perf + should not rebuild unless the required binary is missing. + """ + if binary_path.is_file(): + return True, None + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + return _cmake_build(source_dir, build_dir, target) + + +def _test_binary_path(build_dir: Path) -> Path: + return build_dir / "test" / "rocprim" / TEST_TARGET + + +def _bench_binary_path(build_dir: Path) -> Path: + return build_dir / "benchmark" / BENCH_TARGET + + + + +def _parse_time_ms(output: str) -> Optional[float]: + # Try to find a reasonable "average/mean" latency number in common units. 
+ patterns = [ + r"avg(?:erage)?(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"mean(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"median(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"Perf(?:ormance)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"([\d.]+)\s*(ns|us|ms|s)\s*/\s*(?:trial|iter(?:ation)?|launch)\b", + ] + unit_mul = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1000.0} + for pat in patterns: + m = re.search(pat, output, re.IGNORECASE) + if not m: + continue + val = float(m.group(1)) + unit = m.group(2).lower() + if unit in unit_mul: + return val * unit_mul[unit] + return None + + +def run_compile(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + if not source_dir.is_dir(): + return False, f"Source directory not found: {source_dir}. Repo may not have been cloned." + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + + # Build both correctness and benchmark targets during compile phase. + ok, err = _cmake_build(source_dir, build_dir, TEST_TARGET) + if not ok: + return False, err + ok, err = _cmake_build(source_dir, build_dir, BENCH_TARGET) + if not ok: + return False, err + + # Sanity-check binaries exist. 
+ test_bin = _test_binary_path(build_dir) + bench_bin = _bench_binary_path(build_dir) + if not test_bin.is_file(): + return False, f"Test binary not found: {test_bin}" + if not bench_bin.is_file(): + return False, f"Benchmark binary not found: {bench_bin}" + + return True, None + + +def run_correctness(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + test_bin = _test_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, TEST_TARGET, test_bin) + if not ok: + return False, err + if not test_bin.is_file(): + return False, f"Test binary not found after build attempt: {test_bin}" + + env = os.environ.copy() + ok, out = _run([str(test_bin)], cwd=workspace, timeout_s=1800, env=env) + if not ok: + return False, f"Correctness test failed.\nCommand: {test_bin}\nOutput:\n{out}" + return True, None + + +def run_performance(workspace: Path, trials: int) -> Tuple[float, str]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + bench_bin = _bench_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, BENCH_TARGET, bench_bin) + if not ok: + return -1.0, err or "build failed" + if not bench_bin.is_file(): + return -1.0, f"Benchmark binary not found after build attempt: {bench_bin}" + + env = os.environ.copy() + cmd = [str(bench_bin), "--trials", str(trials)] + t0 = time.perf_counter() + ok, out = _run(cmd, cwd=workspace, timeout_s=3600, env=env) + elapsed_ms_total = (time.perf_counter() - t0) * 1000.0 + + if not ok: + return -1.0, f"Benchmark failed.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + + parsed_ms = _parse_time_ms(out) + if parsed_ms is not None and parsed_ms > 0: + return float(parsed_ms), "" + + # Fallback: approximate average per trial from wall-clock runtime. 
+ if trials > 0: + return float(elapsed_ms_total / trials), "" + return float(elapsed_ms_total), "" + + +def main() -> None: + workspace = _workspace_root() + os.chdir(workspace) + report_root = _report_root(workspace) + report_root.mkdir(parents=True, exist_ok=True) + + parser = argparse.ArgumentParser(description=f"Task runner for {TASK_NAME}") + parser.add_argument("mode", choices=["compile", "correctness", "performance"]) + parser.add_argument("--trials", type=int, default=20) + args = parser.parse_args() + + if args.mode == "compile": + ok, err = run_compile(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + "arch": _detect_arch(), + "source_dir": str(_source_root(workspace)), + "build_dir": str(_cmake_build_dir(workspace)), + } + (report_root / "compile_report.json").write_text(json.dumps(report, indent=2)) + print(f"Compilation: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "correctness": + ok, err = run_correctness(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + } + (report_root / "correctness_report.json").write_text(json.dumps(report, indent=2)) + print(f"Correctness: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "performance": + exec_ms, err = run_performance(workspace, trials=args.trials) + report = [ + { + "test_case_id": "test_case_0", + "execution_time_ms": exec_ms, + "params": {"trials": args.trials}, + } + ] + (report_root / "performance_report.json").write_text(json.dumps(report, indent=2)) + # Also print a recognizable line for stdout parsing fallback. 
+        print(f"Performance: {exec_ms:.4f} ms")
+        if err:
+            print(err)
+        sys.exit(0 if exec_ms != -1.0 else 1)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/tasks/repository/rocprim/device_merge_sort/config.yaml b/tasks/repository/rocprim/device_merge_sort/config.yaml
new file mode 100644
index 00000000..97e32dec
--- /dev/null
+++ b/tasks/repository/rocprim/device_merge_sort/config.yaml
@@ -0,0 +1,10 @@
+repo_url: https://github.com/ROCm/rocPRIM.git
+compile_command:
+  - python3 scripts/task_runner.py compile
+correctness_command:
+  - python3 scripts/task_runner.py correctness
+performance_command:
+  - python3 scripts/task_runner.py performance
+prompt:
+  cheatsheet: null
+  instructions: "Optimize device_merge_sort"
\ No newline at end of file
diff --git a/tasks/repository/rocprim/device_merge_sort/scripts/task_runner.py b/tasks/repository/rocprim/device_merge_sort/scripts/task_runner.py
new file mode 100644
index 00000000..0786a125
--- /dev/null
+++ b/tasks/repository/rocprim/device_merge_sort/scripts/task_runner.py
@@ -0,0 +1,326 @@
+#!/usr/bin/env python3
+# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved.
+"""
+Task runner for repository/rocprim/device_merge_sort.
+
+This script provides a stable interface for AgentKernelArena's evaluator:
+  - `compile`     : configure & build rocPRIM benchmark/test targets
+  - `correctness` : run `test_device_merge_sort`
+  - `performance` : run `benchmark_device_merge_sort` and emit `build/performance_report.json`
+
+All reports are written under `/build/` so the centralized evaluator can parse them.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import Optional, Tuple
+
+
+TASK_NAME = "repository/rocprim/device_merge_sort"
+BENCH_TARGET = "benchmark_device_merge_sort"
+TEST_TARGET = "test_device_merge_sort"
+REPO_SUBDIR = "rocPRIM"  # Cloned repo lives under workspace/rocPRIM/
+
+
+def _workspace_root() -> Path:
+    # scripts/task_runner.py -> scripts/ -> workspace root
+    return Path(__file__).resolve().parents[1]
+
+
+def _source_root(workspace: Path) -> Path:
+    """CMake source directory (cloned rocPRIM repo)."""
+    return workspace / REPO_SUBDIR
+
+
+def _cmake_build_root(workspace: Path) -> Path:
+    """CMake build root inside the cloned repo (workspace/rocPRIM/build/)."""
+    return _source_root(workspace) / "build"
+
+
+def _cmake_build_dir(workspace: Path) -> Path:
+    """CMake build directory (workspace/rocPRIM/build/Release/)."""
+    return _cmake_build_root(workspace) / "Release"
+
+
+def _report_root(workspace: Path) -> Path:
+    """Report directory for evaluator (workspace/build/). Separate from CMake build."""
+    return workspace / "build"
+
+
+def _detect_arch() -> Optional[str]:
+    # Main framework sets PYTORCH_ROCM_ARCH from target_gpu_model; reuse it for rocPRIM CMake.
+ arch = os.environ.get("AMDGPU_TARGETS") or os.environ.get("PYTORCH_ROCM_ARCH") + if not arch: + return None + return arch.strip() or None + + +def _run(cmd: list[str], cwd: Path, timeout_s: int, env: dict[str, str]) -> Tuple[bool, str]: + try: + proc = subprocess.run( + cmd, + cwd=str(cwd), + env=env, + capture_output=True, + text=True, + timeout=timeout_s, + ) + out = (proc.stdout or "") + (proc.stderr or "") + return proc.returncode == 0, out + except subprocess.TimeoutExpired as e: + out = (getattr(e, "stdout", "") or "") + (getattr(e, "stderr", "") or "") + return False, f"TIMEOUT after {timeout_s}s\n{out}" + except Exception as e: + return False, str(e) + + +def _ensure_configured(source_dir: Path, build_dir: Path) -> Tuple[bool, Optional[str]]: + """ + Run CMake configure. + + Args: + source_dir: CMake source directory (cloned repo, e.g. workspace/rocPRIM/) + build_dir: CMake build directory (e.g. workspace/build/Release/) + """ + build_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmake_args = [ + "cmake", + "-S", + str(source_dir), + "-B", + str(build_dir), + "-DCMAKE_BUILD_TYPE=Release", + "-DBUILD_BENCHMARK=ON", + "-DBUILD_TEST=ON", + ] + + arch = _detect_arch() + if arch: + cmake_args.append(f"-DAMDGPU_TARGETS={arch}") + + ok, out = _run(cmake_args, cwd=source_dir, timeout_s=600, env=env) + if not ok: + return False, f"CMake configure failed.\nCommand: {' '.join(cmake_args)}\nOutput:\n{out}" + return True, None + + +def _cmake_build(source_dir: Path, build_dir: Path, target: str) -> Tuple[bool, Optional[str]]: + """ + Run CMake build. 
+ + Args: + source_dir: CMake source directory (for cwd) + build_dir: CMake build directory + target: Build target name + """ + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmd = ["cmake", "--build", str(build_dir), "--target", target, "-j"] + ok, out = _run(cmd, cwd=source_dir, timeout_s=1800, env=env) + if not ok: + return False, f"Build failed for target '{target}'.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + return True, None + + +def _maybe_build_target( + source_dir: Path, + build_dir: Path, + target: str, + binary_path: Path, +) -> Tuple[bool, Optional[str]]: + """ + Avoid redundant builds when the binary already exists. + + Arena runs compile -> correctness -> performance sequentially, so correctness/perf + should not rebuild unless the required binary is missing. + """ + if binary_path.is_file(): + return True, None + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + return _cmake_build(source_dir, build_dir, target) + + +def _test_binary_path(build_dir: Path) -> Path: + return build_dir / "test" / "rocprim" / TEST_TARGET + + +def _bench_binary_path(build_dir: Path) -> Path: + return build_dir / "benchmark" / BENCH_TARGET + + + + +def _parse_time_ms(output: str) -> Optional[float]: + # Try to find a reasonable "average/mean" latency number in common units. 
+ patterns = [ + r"avg(?:erage)?(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"mean(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"median(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"Perf(?:ormance)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"([\d.]+)\s*(ns|us|ms|s)\s*/\s*(?:trial|iter(?:ation)?|launch)\b", + ] + unit_mul = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1000.0} + for pat in patterns: + m = re.search(pat, output, re.IGNORECASE) + if not m: + continue + val = float(m.group(1)) + unit = m.group(2).lower() + if unit in unit_mul: + return val * unit_mul[unit] + return None + + +def run_compile(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + if not source_dir.is_dir(): + return False, f"Source directory not found: {source_dir}. Repo may not have been cloned." + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + + # Build both correctness and benchmark targets during compile phase. + ok, err = _cmake_build(source_dir, build_dir, TEST_TARGET) + if not ok: + return False, err + ok, err = _cmake_build(source_dir, build_dir, BENCH_TARGET) + if not ok: + return False, err + + # Sanity-check binaries exist. 
+ test_bin = _test_binary_path(build_dir) + bench_bin = _bench_binary_path(build_dir) + if not test_bin.is_file(): + return False, f"Test binary not found: {test_bin}" + if not bench_bin.is_file(): + return False, f"Benchmark binary not found: {bench_bin}" + + return True, None + + +def run_correctness(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + test_bin = _test_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, TEST_TARGET, test_bin) + if not ok: + return False, err + if not test_bin.is_file(): + return False, f"Test binary not found after build attempt: {test_bin}" + + env = os.environ.copy() + ok, out = _run([str(test_bin)], cwd=workspace, timeout_s=1800, env=env) + if not ok: + return False, f"Correctness test failed.\nCommand: {test_bin}\nOutput:\n{out}" + return True, None + + +def run_performance(workspace: Path, trials: int) -> Tuple[float, str]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + bench_bin = _bench_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, BENCH_TARGET, bench_bin) + if not ok: + return -1.0, err or "build failed" + if not bench_bin.is_file(): + return -1.0, f"Benchmark binary not found after build attempt: {bench_bin}" + + env = os.environ.copy() + cmd = [str(bench_bin), "--trials", str(trials)] + t0 = time.perf_counter() + ok, out = _run(cmd, cwd=workspace, timeout_s=3600, env=env) + elapsed_ms_total = (time.perf_counter() - t0) * 1000.0 + + if not ok: + return -1.0, f"Benchmark failed.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + + parsed_ms = _parse_time_ms(out) + if parsed_ms is not None and parsed_ms > 0: + return float(parsed_ms), "" + + # Fallback: approximate average per trial from wall-clock runtime. 
+ if trials > 0: + return float(elapsed_ms_total / trials), "" + return float(elapsed_ms_total), "" + + +def main() -> None: + workspace = _workspace_root() + os.chdir(workspace) + report_root = _report_root(workspace) + report_root.mkdir(parents=True, exist_ok=True) + + parser = argparse.ArgumentParser(description=f"Task runner for {TASK_NAME}") + parser.add_argument("mode", choices=["compile", "correctness", "performance"]) + parser.add_argument("--trials", type=int, default=20) + args = parser.parse_args() + + if args.mode == "compile": + ok, err = run_compile(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + "arch": _detect_arch(), + "source_dir": str(_source_root(workspace)), + "build_dir": str(_cmake_build_dir(workspace)), + } + (report_root / "compile_report.json").write_text(json.dumps(report, indent=2)) + print(f"Compilation: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "correctness": + ok, err = run_correctness(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + } + (report_root / "correctness_report.json").write_text(json.dumps(report, indent=2)) + print(f"Correctness: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "performance": + exec_ms, err = run_performance(workspace, trials=args.trials) + report = [ + { + "test_case_id": "test_case_0", + "execution_time_ms": exec_ms, + "params": {"trials": args.trials}, + } + ] + (report_root / "performance_report.json").write_text(json.dumps(report, indent=2)) + # Also print a recognizable line for stdout parsing fallback. 
+ print(f"Performance: {exec_ms:.4f} ms") + if err: + print(err) + sys.exit(0 if exec_ms != -1.0 else 1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tasks/repository/rocprim/device_nth_element/config.yaml b/tasks/repository/rocprim/device_nth_element/config.yaml new file mode 100644 index 00000000..97e32dec --- /dev/null +++ b/tasks/repository/rocprim/device_nth_element/config.yaml @@ -0,0 +1,10 @@ +repo_url: https://github.com/ROCm/rocPRIM.git +compile_command: + - python3 scripts/task_runner.py compile +correctness_command: + - python3 scripts/task_runner.py correctness +performance_command: + - python3 scripts/task_runner.py performance +prompt: + cheatsheet: null + instructions: "Optimize block_histogram" \ No newline at end of file diff --git a/tasks/repository/rocprim/device_nth_element/scripts/task_runner.py b/tasks/repository/rocprim/device_nth_element/scripts/task_runner.py new file mode 100644 index 00000000..0786a125 --- /dev/null +++ b/tasks/repository/rocprim/device_nth_element/scripts/task_runner.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved. +""" +Task runner for repository/rocprim/block_histogram. + +This script provides a stable interface for AgentKernelArena's evaluator: + - `compile` : configure & build rocPRIM benchmark/test targets + - `correctness` : run `test_block_histogram` + - `performance` : run `benchmark_block_histogram` and emit `build/performance_report.json` + +All reports are written under `/build/` so the centralized evaluator can parse them. 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import Optional, Tuple + + +TASK_NAME = "repository/rocprim/block_histogram" +BENCH_TARGET = "benchmark_block_histogram" +TEST_TARGET = "test_block_histogram" +REPO_SUBDIR = "rocPRIM" # Cloned repo lives under workspace/rocPRIM/ + + +def _workspace_root() -> Path: + # scripts/task_runner.py -> scripts/ -> workspace root + return Path(__file__).resolve().parents[1] + + +def _source_root(workspace: Path) -> Path: + """CMake source directory (cloned rocPRIM repo).""" + return workspace / REPO_SUBDIR + + +def _cmake_build_root(workspace: Path) -> Path: + """CMake build root inside the cloned repo (workspace/rocPRIM/build/).""" + return _source_root(workspace) / "build" + + +def _cmake_build_dir(workspace: Path) -> Path: + """CMake build directory (workspace/rocPRIM/build/Release/).""" + return _cmake_build_root(workspace) / "Release" + + +def _report_root(workspace: Path) -> Path: + """Report directory for evaluator (workspace/build/). Separate from CMake build.""" + return workspace / "build" + + +def _detect_arch() -> Optional[str]: + # Main framework sets PYTORCH_ROCM_ARCH from target_gpu_model; reuse it for rocPRIM CMake. 
+ arch = os.environ.get("AMDGPU_TARGETS") or os.environ.get("PYTORCH_ROCM_ARCH") + if not arch: + return None + return arch.strip() or None + + +def _run(cmd: list[str], cwd: Path, timeout_s: int, env: dict[str, str]) -> Tuple[bool, str]: + try: + proc = subprocess.run( + cmd, + cwd=str(cwd), + env=env, + capture_output=True, + text=True, + timeout=timeout_s, + ) + out = (proc.stdout or "") + (proc.stderr or "") + return proc.returncode == 0, out + except subprocess.TimeoutExpired as e: + out = (getattr(e, "stdout", "") or "") + (getattr(e, "stderr", "") or "") + return False, f"TIMEOUT after {timeout_s}s\n{out}" + except Exception as e: + return False, str(e) + + +def _ensure_configured(source_dir: Path, build_dir: Path) -> Tuple[bool, Optional[str]]: + """ + Run CMake configure. + + Args: + source_dir: CMake source directory (cloned repo, e.g. workspace/rocPRIM/) + build_dir: CMake build directory (e.g. workspace/build/Release/) + """ + build_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmake_args = [ + "cmake", + "-S", + str(source_dir), + "-B", + str(build_dir), + "-DCMAKE_BUILD_TYPE=Release", + "-DBUILD_BENCHMARK=ON", + "-DBUILD_TEST=ON", + ] + + arch = _detect_arch() + if arch: + cmake_args.append(f"-DAMDGPU_TARGETS={arch}") + + ok, out = _run(cmake_args, cwd=source_dir, timeout_s=600, env=env) + if not ok: + return False, f"CMake configure failed.\nCommand: {' '.join(cmake_args)}\nOutput:\n{out}" + return True, None + + +def _cmake_build(source_dir: Path, build_dir: Path, target: str) -> Tuple[bool, Optional[str]]: + """ + Run CMake build. 
+ + Args: + source_dir: CMake source directory (for cwd) + build_dir: CMake build directory + target: Build target name + """ + env = os.environ.copy() + env.setdefault("ROCM_PATH", "/opt/rocm") + env.setdefault("CXX", "hipcc") + + cmd = ["cmake", "--build", str(build_dir), "--target", target, "-j"] + ok, out = _run(cmd, cwd=source_dir, timeout_s=1800, env=env) + if not ok: + return False, f"Build failed for target '{target}'.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + return True, None + + +def _maybe_build_target( + source_dir: Path, + build_dir: Path, + target: str, + binary_path: Path, +) -> Tuple[bool, Optional[str]]: + """ + Avoid redundant builds when the binary already exists. + + Arena runs compile -> correctness -> performance sequentially, so correctness/perf + should not rebuild unless the required binary is missing. + """ + if binary_path.is_file(): + return True, None + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + return _cmake_build(source_dir, build_dir, target) + + +def _test_binary_path(build_dir: Path) -> Path: + return build_dir / "test" / "rocprim" / TEST_TARGET + + +def _bench_binary_path(build_dir: Path) -> Path: + return build_dir / "benchmark" / BENCH_TARGET + + + + +def _parse_time_ms(output: str) -> Optional[float]: + # Try to find a reasonable "average/mean" latency number in common units. 
+ patterns = [ + r"avg(?:erage)?(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"mean(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"median(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"Perf(?:ormance)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", + r"([\d.]+)\s*(ns|us|ms|s)\s*/\s*(?:trial|iter(?:ation)?|launch)\b", + ] + unit_mul = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1000.0} + for pat in patterns: + m = re.search(pat, output, re.IGNORECASE) + if not m: + continue + val = float(m.group(1)) + unit = m.group(2).lower() + if unit in unit_mul: + return val * unit_mul[unit] + return None + + +def run_compile(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + if not source_dir.is_dir(): + return False, f"Source directory not found: {source_dir}. Repo may not have been cloned." + + ok, err = _ensure_configured(source_dir, build_dir) + if not ok: + return False, err + + # Build both correctness and benchmark targets during compile phase. + ok, err = _cmake_build(source_dir, build_dir, TEST_TARGET) + if not ok: + return False, err + ok, err = _cmake_build(source_dir, build_dir, BENCH_TARGET) + if not ok: + return False, err + + # Sanity-check binaries exist. 
+ test_bin = _test_binary_path(build_dir) + bench_bin = _bench_binary_path(build_dir) + if not test_bin.is_file(): + return False, f"Test binary not found: {test_bin}" + if not bench_bin.is_file(): + return False, f"Benchmark binary not found: {bench_bin}" + + return True, None + + +def run_correctness(workspace: Path) -> Tuple[bool, Optional[str]]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + test_bin = _test_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, TEST_TARGET, test_bin) + if not ok: + return False, err + if not test_bin.is_file(): + return False, f"Test binary not found after build attempt: {test_bin}" + + env = os.environ.copy() + ok, out = _run([str(test_bin)], cwd=workspace, timeout_s=1800, env=env) + if not ok: + return False, f"Correctness test failed.\nCommand: {test_bin}\nOutput:\n{out}" + return True, None + + +def run_performance(workspace: Path, trials: int) -> Tuple[float, str]: + build_dir = _cmake_build_dir(workspace) + source_dir = _source_root(workspace) + + bench_bin = _bench_binary_path(build_dir) + ok, err = _maybe_build_target(source_dir, build_dir, BENCH_TARGET, bench_bin) + if not ok: + return -1.0, err or "build failed" + if not bench_bin.is_file(): + return -1.0, f"Benchmark binary not found after build attempt: {bench_bin}" + + env = os.environ.copy() + cmd = [str(bench_bin), "--trials", str(trials)] + t0 = time.perf_counter() + ok, out = _run(cmd, cwd=workspace, timeout_s=3600, env=env) + elapsed_ms_total = (time.perf_counter() - t0) * 1000.0 + + if not ok: + return -1.0, f"Benchmark failed.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" + + parsed_ms = _parse_time_ms(out) + if parsed_ms is not None and parsed_ms > 0: + return float(parsed_ms), "" + + # Fallback: approximate average per trial from wall-clock runtime. 
+ if trials > 0: + return float(elapsed_ms_total / trials), "" + return float(elapsed_ms_total), "" + + +def main() -> None: + workspace = _workspace_root() + os.chdir(workspace) + report_root = _report_root(workspace) + report_root.mkdir(parents=True, exist_ok=True) + + parser = argparse.ArgumentParser(description=f"Task runner for {TASK_NAME}") + parser.add_argument("mode", choices=["compile", "correctness", "performance"]) + parser.add_argument("--trials", type=int, default=20) + args = parser.parse_args() + + if args.mode == "compile": + ok, err = run_compile(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + "arch": _detect_arch(), + "source_dir": str(_source_root(workspace)), + "build_dir": str(_cmake_build_dir(workspace)), + } + (report_root / "compile_report.json").write_text(json.dumps(report, indent=2)) + print(f"Compilation: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "correctness": + ok, err = run_correctness(workspace) + report = { + "status": "ok" if ok else "fail", + "error": err, + } + (report_root / "correctness_report.json").write_text(json.dumps(report, indent=2)) + print(f"Correctness: {'PASS' if ok else 'FAIL'}") + if err: + print(err) + sys.exit(0 if ok else 1) + + if args.mode == "performance": + exec_ms, err = run_performance(workspace, trials=args.trials) + report = [ + { + "test_case_id": "test_case_0", + "execution_time_ms": exec_ms, + "params": {"trials": args.trials}, + } + ] + (report_root / "performance_report.json").write_text(json.dumps(report, indent=2)) + # Also print a recognizable line for stdout parsing fallback. 
+ print(f"Performance: {exec_ms:.4f} ms") + if err: + print(err) + sys.exit(0 if exec_ms != -1.0 else 1) + + +if __name__ == "__main__": + main() \ No newline at end of file From 3b492799c0f7bdf2fc56458422e22a0a12ac0b56 Mon Sep 17 00:00:00 2001 From: yueliu14 Date: Mon, 9 Mar 2026 11:18:59 +0800 Subject: [PATCH 02/84] modify task_runner --- .../block_run_length_decode/config.yaml | 10 - .../scripts/task_runner.py | 326 ------------------ .../rocprim/device_nth_element/config.yaml | 10 - .../device_nth_element/scripts/task_runner.py | 326 ------------------ .../config.yaml | 0 .../scripts/task_runner.py | 114 +++--- 6 files changed, 65 insertions(+), 721 deletions(-) delete mode 100644 tasks/repository/rocprim/block_run_length_decode/config.yaml delete mode 100644 tasks/repository/rocprim/block_run_length_decode/scripts/task_runner.py delete mode 100644 tasks/repository/rocprim/device_nth_element/config.yaml delete mode 100644 tasks/repository/rocprim/device_nth_element/scripts/task_runner.py rename tasks/repository/rocprim/{block_histogram => device_search_n}/config.yaml (100%) rename tasks/repository/rocprim/{block_histogram => device_search_n}/scripts/task_runner.py (76%) diff --git a/tasks/repository/rocprim/block_run_length_decode/config.yaml b/tasks/repository/rocprim/block_run_length_decode/config.yaml deleted file mode 100644 index 97e32dec..00000000 --- a/tasks/repository/rocprim/block_run_length_decode/config.yaml +++ /dev/null @@ -1,10 +0,0 @@ -repo_url: https://github.com/ROCm/rocPRIM.git -compile_command: - - python3 scripts/task_runner.py compile -correctness_command: - - python3 scripts/task_runner.py correctness -performance_command: - - python3 scripts/task_runner.py performance -prompt: - cheatsheet: null - instructions: "Optimize block_histogram" \ No newline at end of file diff --git a/tasks/repository/rocprim/block_run_length_decode/scripts/task_runner.py b/tasks/repository/rocprim/block_run_length_decode/scripts/task_runner.py deleted file mode 
100644 index 0786a125..00000000 --- a/tasks/repository/rocprim/block_run_length_decode/scripts/task_runner.py +++ /dev/null @@ -1,326 +0,0 @@ -#!/usr/bin/env python3 -# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved. -""" -Task runner for repository/rocprim/block_histogram. - -This script provides a stable interface for AgentKernelArena's evaluator: - - `compile` : configure & build rocPRIM benchmark/test targets - - `correctness` : run `test_block_histogram` - - `performance` : run `benchmark_block_histogram` and emit `build/performance_report.json` - -All reports are written under `/build/` so the centralized evaluator can parse them. -""" - -from __future__ import annotations - -import argparse -import json -import os -import re -import subprocess -import sys -import time -from pathlib import Path -from typing import Optional, Tuple - - -TASK_NAME = "repository/rocprim/block_histogram" -BENCH_TARGET = "benchmark_block_histogram" -TEST_TARGET = "test_block_histogram" -REPO_SUBDIR = "rocPRIM" # Cloned repo lives under workspace/rocPRIM/ - - -def _workspace_root() -> Path: - # scripts/task_runner.py -> scripts/ -> workspace root - return Path(__file__).resolve().parents[1] - - -def _source_root(workspace: Path) -> Path: - """CMake source directory (cloned rocPRIM repo).""" - return workspace / REPO_SUBDIR - - -def _cmake_build_root(workspace: Path) -> Path: - """CMake build root inside the cloned repo (workspace/rocPRIM/build/).""" - return _source_root(workspace) / "build" - - -def _cmake_build_dir(workspace: Path) -> Path: - """CMake build directory (workspace/rocPRIM/build/Release/).""" - return _cmake_build_root(workspace) / "Release" - - -def _report_root(workspace: Path) -> Path: - """Report directory for evaluator (workspace/build/). Separate from CMake build.""" - return workspace / "build" - - -def _detect_arch() -> Optional[str]: - # Main framework sets PYTORCH_ROCM_ARCH from target_gpu_model; reuse it for rocPRIM CMake. 
- arch = os.environ.get("AMDGPU_TARGETS") or os.environ.get("PYTORCH_ROCM_ARCH") - if not arch: - return None - return arch.strip() or None - - -def _run(cmd: list[str], cwd: Path, timeout_s: int, env: dict[str, str]) -> Tuple[bool, str]: - try: - proc = subprocess.run( - cmd, - cwd=str(cwd), - env=env, - capture_output=True, - text=True, - timeout=timeout_s, - ) - out = (proc.stdout or "") + (proc.stderr or "") - return proc.returncode == 0, out - except subprocess.TimeoutExpired as e: - out = (getattr(e, "stdout", "") or "") + (getattr(e, "stderr", "") or "") - return False, f"TIMEOUT after {timeout_s}s\n{out}" - except Exception as e: - return False, str(e) - - -def _ensure_configured(source_dir: Path, build_dir: Path) -> Tuple[bool, Optional[str]]: - """ - Run CMake configure. - - Args: - source_dir: CMake source directory (cloned repo, e.g. workspace/rocPRIM/) - build_dir: CMake build directory (e.g. workspace/build/Release/) - """ - build_dir.mkdir(parents=True, exist_ok=True) - - env = os.environ.copy() - env.setdefault("ROCM_PATH", "/opt/rocm") - env.setdefault("CXX", "hipcc") - - cmake_args = [ - "cmake", - "-S", - str(source_dir), - "-B", - str(build_dir), - "-DCMAKE_BUILD_TYPE=Release", - "-DBUILD_BENCHMARK=ON", - "-DBUILD_TEST=ON", - ] - - arch = _detect_arch() - if arch: - cmake_args.append(f"-DAMDGPU_TARGETS={arch}") - - ok, out = _run(cmake_args, cwd=source_dir, timeout_s=600, env=env) - if not ok: - return False, f"CMake configure failed.\nCommand: {' '.join(cmake_args)}\nOutput:\n{out}" - return True, None - - -def _cmake_build(source_dir: Path, build_dir: Path, target: str) -> Tuple[bool, Optional[str]]: - """ - Run CMake build. 
- - Args: - source_dir: CMake source directory (for cwd) - build_dir: CMake build directory - target: Build target name - """ - env = os.environ.copy() - env.setdefault("ROCM_PATH", "/opt/rocm") - env.setdefault("CXX", "hipcc") - - cmd = ["cmake", "--build", str(build_dir), "--target", target, "-j"] - ok, out = _run(cmd, cwd=source_dir, timeout_s=1800, env=env) - if not ok: - return False, f"Build failed for target '{target}'.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" - return True, None - - -def _maybe_build_target( - source_dir: Path, - build_dir: Path, - target: str, - binary_path: Path, -) -> Tuple[bool, Optional[str]]: - """ - Avoid redundant builds when the binary already exists. - - Arena runs compile -> correctness -> performance sequentially, so correctness/perf - should not rebuild unless the required binary is missing. - """ - if binary_path.is_file(): - return True, None - - ok, err = _ensure_configured(source_dir, build_dir) - if not ok: - return False, err - return _cmake_build(source_dir, build_dir, target) - - -def _test_binary_path(build_dir: Path) -> Path: - return build_dir / "test" / "rocprim" / TEST_TARGET - - -def _bench_binary_path(build_dir: Path) -> Path: - return build_dir / "benchmark" / BENCH_TARGET - - - - -def _parse_time_ms(output: str) -> Optional[float]: - # Try to find a reasonable "average/mean" latency number in common units. 
- patterns = [ - r"avg(?:erage)?(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"mean(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"median(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"Perf(?:ormance)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"([\d.]+)\s*(ns|us|ms|s)\s*/\s*(?:trial|iter(?:ation)?|launch)\b", - ] - unit_mul = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1000.0} - for pat in patterns: - m = re.search(pat, output, re.IGNORECASE) - if not m: - continue - val = float(m.group(1)) - unit = m.group(2).lower() - if unit in unit_mul: - return val * unit_mul[unit] - return None - - -def run_compile(workspace: Path) -> Tuple[bool, Optional[str]]: - build_dir = _cmake_build_dir(workspace) - source_dir = _source_root(workspace) - - if not source_dir.is_dir(): - return False, f"Source directory not found: {source_dir}. Repo may not have been cloned." - - ok, err = _ensure_configured(source_dir, build_dir) - if not ok: - return False, err - - # Build both correctness and benchmark targets during compile phase. - ok, err = _cmake_build(source_dir, build_dir, TEST_TARGET) - if not ok: - return False, err - ok, err = _cmake_build(source_dir, build_dir, BENCH_TARGET) - if not ok: - return False, err - - # Sanity-check binaries exist. 
- test_bin = _test_binary_path(build_dir) - bench_bin = _bench_binary_path(build_dir) - if not test_bin.is_file(): - return False, f"Test binary not found: {test_bin}" - if not bench_bin.is_file(): - return False, f"Benchmark binary not found: {bench_bin}" - - return True, None - - -def run_correctness(workspace: Path) -> Tuple[bool, Optional[str]]: - build_dir = _cmake_build_dir(workspace) - source_dir = _source_root(workspace) - - test_bin = _test_binary_path(build_dir) - ok, err = _maybe_build_target(source_dir, build_dir, TEST_TARGET, test_bin) - if not ok: - return False, err - if not test_bin.is_file(): - return False, f"Test binary not found after build attempt: {test_bin}" - - env = os.environ.copy() - ok, out = _run([str(test_bin)], cwd=workspace, timeout_s=1800, env=env) - if not ok: - return False, f"Correctness test failed.\nCommand: {test_bin}\nOutput:\n{out}" - return True, None - - -def run_performance(workspace: Path, trials: int) -> Tuple[float, str]: - build_dir = _cmake_build_dir(workspace) - source_dir = _source_root(workspace) - - bench_bin = _bench_binary_path(build_dir) - ok, err = _maybe_build_target(source_dir, build_dir, BENCH_TARGET, bench_bin) - if not ok: - return -1.0, err or "build failed" - if not bench_bin.is_file(): - return -1.0, f"Benchmark binary not found after build attempt: {bench_bin}" - - env = os.environ.copy() - cmd = [str(bench_bin), "--trials", str(trials)] - t0 = time.perf_counter() - ok, out = _run(cmd, cwd=workspace, timeout_s=3600, env=env) - elapsed_ms_total = (time.perf_counter() - t0) * 1000.0 - - if not ok: - return -1.0, f"Benchmark failed.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" - - parsed_ms = _parse_time_ms(out) - if parsed_ms is not None and parsed_ms > 0: - return float(parsed_ms), "" - - # Fallback: approximate average per trial from wall-clock runtime. 
- if trials > 0: - return float(elapsed_ms_total / trials), "" - return float(elapsed_ms_total), "" - - -def main() -> None: - workspace = _workspace_root() - os.chdir(workspace) - report_root = _report_root(workspace) - report_root.mkdir(parents=True, exist_ok=True) - - parser = argparse.ArgumentParser(description=f"Task runner for {TASK_NAME}") - parser.add_argument("mode", choices=["compile", "correctness", "performance"]) - parser.add_argument("--trials", type=int, default=20) - args = parser.parse_args() - - if args.mode == "compile": - ok, err = run_compile(workspace) - report = { - "status": "ok" if ok else "fail", - "error": err, - "arch": _detect_arch(), - "source_dir": str(_source_root(workspace)), - "build_dir": str(_cmake_build_dir(workspace)), - } - (report_root / "compile_report.json").write_text(json.dumps(report, indent=2)) - print(f"Compilation: {'PASS' if ok else 'FAIL'}") - if err: - print(err) - sys.exit(0 if ok else 1) - - if args.mode == "correctness": - ok, err = run_correctness(workspace) - report = { - "status": "ok" if ok else "fail", - "error": err, - } - (report_root / "correctness_report.json").write_text(json.dumps(report, indent=2)) - print(f"Correctness: {'PASS' if ok else 'FAIL'}") - if err: - print(err) - sys.exit(0 if ok else 1) - - if args.mode == "performance": - exec_ms, err = run_performance(workspace, trials=args.trials) - report = [ - { - "test_case_id": "test_case_0", - "execution_time_ms": exec_ms, - "params": {"trials": args.trials}, - } - ] - (report_root / "performance_report.json").write_text(json.dumps(report, indent=2)) - # Also print a recognizable line for stdout parsing fallback. 
- print(f"Performance: {exec_ms:.4f} ms") - if err: - print(err) - sys.exit(0 if exec_ms != -1.0 else 1) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/tasks/repository/rocprim/device_nth_element/config.yaml b/tasks/repository/rocprim/device_nth_element/config.yaml deleted file mode 100644 index 97e32dec..00000000 --- a/tasks/repository/rocprim/device_nth_element/config.yaml +++ /dev/null @@ -1,10 +0,0 @@ -repo_url: https://github.com/ROCm/rocPRIM.git -compile_command: - - python3 scripts/task_runner.py compile -correctness_command: - - python3 scripts/task_runner.py correctness -performance_command: - - python3 scripts/task_runner.py performance -prompt: - cheatsheet: null - instructions: "Optimize block_histogram" \ No newline at end of file diff --git a/tasks/repository/rocprim/device_nth_element/scripts/task_runner.py b/tasks/repository/rocprim/device_nth_element/scripts/task_runner.py deleted file mode 100644 index 0786a125..00000000 --- a/tasks/repository/rocprim/device_nth_element/scripts/task_runner.py +++ /dev/null @@ -1,326 +0,0 @@ -#!/usr/bin/env python3 -# Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved. -""" -Task runner for repository/rocprim/block_histogram. - -This script provides a stable interface for AgentKernelArena's evaluator: - - `compile` : configure & build rocPRIM benchmark/test targets - - `correctness` : run `test_block_histogram` - - `performance` : run `benchmark_block_histogram` and emit `build/performance_report.json` - -All reports are written under `/build/` so the centralized evaluator can parse them. 
-""" - -from __future__ import annotations - -import argparse -import json -import os -import re -import subprocess -import sys -import time -from pathlib import Path -from typing import Optional, Tuple - - -TASK_NAME = "repository/rocprim/block_histogram" -BENCH_TARGET = "benchmark_block_histogram" -TEST_TARGET = "test_block_histogram" -REPO_SUBDIR = "rocPRIM" # Cloned repo lives under workspace/rocPRIM/ - - -def _workspace_root() -> Path: - # scripts/task_runner.py -> scripts/ -> workspace root - return Path(__file__).resolve().parents[1] - - -def _source_root(workspace: Path) -> Path: - """CMake source directory (cloned rocPRIM repo).""" - return workspace / REPO_SUBDIR - - -def _cmake_build_root(workspace: Path) -> Path: - """CMake build root inside the cloned repo (workspace/rocPRIM/build/).""" - return _source_root(workspace) / "build" - - -def _cmake_build_dir(workspace: Path) -> Path: - """CMake build directory (workspace/rocPRIM/build/Release/).""" - return _cmake_build_root(workspace) / "Release" - - -def _report_root(workspace: Path) -> Path: - """Report directory for evaluator (workspace/build/). Separate from CMake build.""" - return workspace / "build" - - -def _detect_arch() -> Optional[str]: - # Main framework sets PYTORCH_ROCM_ARCH from target_gpu_model; reuse it for rocPRIM CMake. 
- arch = os.environ.get("AMDGPU_TARGETS") or os.environ.get("PYTORCH_ROCM_ARCH") - if not arch: - return None - return arch.strip() or None - - -def _run(cmd: list[str], cwd: Path, timeout_s: int, env: dict[str, str]) -> Tuple[bool, str]: - try: - proc = subprocess.run( - cmd, - cwd=str(cwd), - env=env, - capture_output=True, - text=True, - timeout=timeout_s, - ) - out = (proc.stdout or "") + (proc.stderr or "") - return proc.returncode == 0, out - except subprocess.TimeoutExpired as e: - out = (getattr(e, "stdout", "") or "") + (getattr(e, "stderr", "") or "") - return False, f"TIMEOUT after {timeout_s}s\n{out}" - except Exception as e: - return False, str(e) - - -def _ensure_configured(source_dir: Path, build_dir: Path) -> Tuple[bool, Optional[str]]: - """ - Run CMake configure. - - Args: - source_dir: CMake source directory (cloned repo, e.g. workspace/rocPRIM/) - build_dir: CMake build directory (e.g. workspace/build/Release/) - """ - build_dir.mkdir(parents=True, exist_ok=True) - - env = os.environ.copy() - env.setdefault("ROCM_PATH", "/opt/rocm") - env.setdefault("CXX", "hipcc") - - cmake_args = [ - "cmake", - "-S", - str(source_dir), - "-B", - str(build_dir), - "-DCMAKE_BUILD_TYPE=Release", - "-DBUILD_BENCHMARK=ON", - "-DBUILD_TEST=ON", - ] - - arch = _detect_arch() - if arch: - cmake_args.append(f"-DAMDGPU_TARGETS={arch}") - - ok, out = _run(cmake_args, cwd=source_dir, timeout_s=600, env=env) - if not ok: - return False, f"CMake configure failed.\nCommand: {' '.join(cmake_args)}\nOutput:\n{out}" - return True, None - - -def _cmake_build(source_dir: Path, build_dir: Path, target: str) -> Tuple[bool, Optional[str]]: - """ - Run CMake build. 
- - Args: - source_dir: CMake source directory (for cwd) - build_dir: CMake build directory - target: Build target name - """ - env = os.environ.copy() - env.setdefault("ROCM_PATH", "/opt/rocm") - env.setdefault("CXX", "hipcc") - - cmd = ["cmake", "--build", str(build_dir), "--target", target, "-j"] - ok, out = _run(cmd, cwd=source_dir, timeout_s=1800, env=env) - if not ok: - return False, f"Build failed for target '{target}'.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" - return True, None - - -def _maybe_build_target( - source_dir: Path, - build_dir: Path, - target: str, - binary_path: Path, -) -> Tuple[bool, Optional[str]]: - """ - Avoid redundant builds when the binary already exists. - - Arena runs compile -> correctness -> performance sequentially, so correctness/perf - should not rebuild unless the required binary is missing. - """ - if binary_path.is_file(): - return True, None - - ok, err = _ensure_configured(source_dir, build_dir) - if not ok: - return False, err - return _cmake_build(source_dir, build_dir, target) - - -def _test_binary_path(build_dir: Path) -> Path: - return build_dir / "test" / "rocprim" / TEST_TARGET - - -def _bench_binary_path(build_dir: Path) -> Path: - return build_dir / "benchmark" / BENCH_TARGET - - - - -def _parse_time_ms(output: str) -> Optional[float]: - # Try to find a reasonable "average/mean" latency number in common units. 
- patterns = [ - r"avg(?:erage)?(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"mean(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"median(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"Perf(?:ormance)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"([\d.]+)\s*(ns|us|ms|s)\s*/\s*(?:trial|iter(?:ation)?|launch)\b", - ] - unit_mul = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1000.0} - for pat in patterns: - m = re.search(pat, output, re.IGNORECASE) - if not m: - continue - val = float(m.group(1)) - unit = m.group(2).lower() - if unit in unit_mul: - return val * unit_mul[unit] - return None - - -def run_compile(workspace: Path) -> Tuple[bool, Optional[str]]: - build_dir = _cmake_build_dir(workspace) - source_dir = _source_root(workspace) - - if not source_dir.is_dir(): - return False, f"Source directory not found: {source_dir}. Repo may not have been cloned." - - ok, err = _ensure_configured(source_dir, build_dir) - if not ok: - return False, err - - # Build both correctness and benchmark targets during compile phase. - ok, err = _cmake_build(source_dir, build_dir, TEST_TARGET) - if not ok: - return False, err - ok, err = _cmake_build(source_dir, build_dir, BENCH_TARGET) - if not ok: - return False, err - - # Sanity-check binaries exist. 
- test_bin = _test_binary_path(build_dir) - bench_bin = _bench_binary_path(build_dir) - if not test_bin.is_file(): - return False, f"Test binary not found: {test_bin}" - if not bench_bin.is_file(): - return False, f"Benchmark binary not found: {bench_bin}" - - return True, None - - -def run_correctness(workspace: Path) -> Tuple[bool, Optional[str]]: - build_dir = _cmake_build_dir(workspace) - source_dir = _source_root(workspace) - - test_bin = _test_binary_path(build_dir) - ok, err = _maybe_build_target(source_dir, build_dir, TEST_TARGET, test_bin) - if not ok: - return False, err - if not test_bin.is_file(): - return False, f"Test binary not found after build attempt: {test_bin}" - - env = os.environ.copy() - ok, out = _run([str(test_bin)], cwd=workspace, timeout_s=1800, env=env) - if not ok: - return False, f"Correctness test failed.\nCommand: {test_bin}\nOutput:\n{out}" - return True, None - - -def run_performance(workspace: Path, trials: int) -> Tuple[float, str]: - build_dir = _cmake_build_dir(workspace) - source_dir = _source_root(workspace) - - bench_bin = _bench_binary_path(build_dir) - ok, err = _maybe_build_target(source_dir, build_dir, BENCH_TARGET, bench_bin) - if not ok: - return -1.0, err or "build failed" - if not bench_bin.is_file(): - return -1.0, f"Benchmark binary not found after build attempt: {bench_bin}" - - env = os.environ.copy() - cmd = [str(bench_bin), "--trials", str(trials)] - t0 = time.perf_counter() - ok, out = _run(cmd, cwd=workspace, timeout_s=3600, env=env) - elapsed_ms_total = (time.perf_counter() - t0) * 1000.0 - - if not ok: - return -1.0, f"Benchmark failed.\nCommand: {' '.join(cmd)}\nOutput:\n{out}" - - parsed_ms = _parse_time_ms(out) - if parsed_ms is not None and parsed_ms > 0: - return float(parsed_ms), "" - - # Fallback: approximate average per trial from wall-clock runtime. 
- if trials > 0: - return float(elapsed_ms_total / trials), "" - return float(elapsed_ms_total), "" - - -def main() -> None: - workspace = _workspace_root() - os.chdir(workspace) - report_root = _report_root(workspace) - report_root.mkdir(parents=True, exist_ok=True) - - parser = argparse.ArgumentParser(description=f"Task runner for {TASK_NAME}") - parser.add_argument("mode", choices=["compile", "correctness", "performance"]) - parser.add_argument("--trials", type=int, default=20) - args = parser.parse_args() - - if args.mode == "compile": - ok, err = run_compile(workspace) - report = { - "status": "ok" if ok else "fail", - "error": err, - "arch": _detect_arch(), - "source_dir": str(_source_root(workspace)), - "build_dir": str(_cmake_build_dir(workspace)), - } - (report_root / "compile_report.json").write_text(json.dumps(report, indent=2)) - print(f"Compilation: {'PASS' if ok else 'FAIL'}") - if err: - print(err) - sys.exit(0 if ok else 1) - - if args.mode == "correctness": - ok, err = run_correctness(workspace) - report = { - "status": "ok" if ok else "fail", - "error": err, - } - (report_root / "correctness_report.json").write_text(json.dumps(report, indent=2)) - print(f"Correctness: {'PASS' if ok else 'FAIL'}") - if err: - print(err) - sys.exit(0 if ok else 1) - - if args.mode == "performance": - exec_ms, err = run_performance(workspace, trials=args.trials) - report = [ - { - "test_case_id": "test_case_0", - "execution_time_ms": exec_ms, - "params": {"trials": args.trials}, - } - ] - (report_root / "performance_report.json").write_text(json.dumps(report, indent=2)) - # Also print a recognizable line for stdout parsing fallback. 
- print(f"Performance: {exec_ms:.4f} ms") - if err: - print(err) - sys.exit(0 if exec_ms != -1.0 else 1) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/tasks/repository/rocprim/block_histogram/config.yaml b/tasks/repository/rocprim/device_search_n/config.yaml similarity index 100% rename from tasks/repository/rocprim/block_histogram/config.yaml rename to tasks/repository/rocprim/device_search_n/config.yaml diff --git a/tasks/repository/rocprim/block_histogram/scripts/task_runner.py b/tasks/repository/rocprim/device_search_n/scripts/task_runner.py similarity index 76% rename from tasks/repository/rocprim/block_histogram/scripts/task_runner.py rename to tasks/repository/rocprim/device_search_n/scripts/task_runner.py index 0786a125..2b2ee28b 100644 --- a/tasks/repository/rocprim/block_histogram/scripts/task_runner.py +++ b/tasks/repository/rocprim/device_search_n/scripts/task_runner.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 # Copyright(C) [2026] Advanced Micro Devices, Inc. All rights reserved. """ -Task runner for repository/rocprim/block_histogram. +Task runner for repository/rocprim/device_search_n. This script provides a stable interface for AgentKernelArena's evaluator: - `compile` : configure & build rocPRIM benchmark/test targets - - `correctness` : run `test_block_histogram` - - `performance` : run `benchmark_block_histogram` and emit `build/performance_report.json` + - `correctness` : run `test_device_search_n` + - `performance` : run `benchmark_device_search_n` and emit `build/performance_report.json` All reports are written under `/build/` so the centralized evaluator can parse them. 
""" @@ -24,9 +24,9 @@ from typing import Optional, Tuple -TASK_NAME = "repository/rocprim/block_histogram" -BENCH_TARGET = "benchmark_block_histogram" -TEST_TARGET = "test_block_histogram" +TASK_NAME = "repository/rocprim/device_search_n" +BENCH_TARGET = "benchmark_device_search_n" +TEST_TARGET = "test_device_search_n" REPO_SUBDIR = "rocPRIM" # Cloned repo lives under workspace/rocPRIM/ @@ -168,25 +168,34 @@ def _bench_binary_path(build_dir: Path) -> Path: -def _parse_time_ms(output: str) -> Optional[float]: - # Try to find a reasonable "average/mean" latency number in common units. - patterns = [ - r"avg(?:erage)?(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"mean(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"median(?:\s+time)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"Perf(?:ormance)?\s*[:=]\s*([\d.]+)\s*(ns|us|ms|s)\b", - r"([\d.]+)\s*(ns|us|ms|s)\s*/\s*(?:trial|iter(?:ation)?|launch)\b", - ] - unit_mul = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1000.0} - for pat in patterns: - m = re.search(pat, output, re.IGNORECASE) - if not m: - continue - val = float(m.group(1)) - unit = m.group(2).lower() - if unit in unit_mul: - return val * unit_mul[unit] - return None +def _parse_benchmark_results(output: str) -> list[dict]: + """ + Parse rocPRIM benchmark output for all test cases. + Returns list of dicts with test_case_id and bytes_per_second_gs for each test case. + """ + pattern = re.compile( + r"^(?P.+?)/manual_time\s+" + r"(?P