From a71952aaf4fe9e19466746aebec5e482c4fe2d40 Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Mon, 1 Jun 2026 11:03:46 -0700 Subject: [PATCH 01/12] Avoid saying triton in the sycl project instructions. Signed-off-by: Weilin Xu --- src/xe_forge/claude/generator.py | 2 +- src/xe_forge/claude/templates/CLAUDE.md.j2 | 14 ++++++++------ src/xe_forge/claude/templates/tool-runner.md.j2 | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py index 86eadb7..1a3494d 100644 --- a/src/xe_forge/claude/generator.py +++ b/src/xe_forge/claude/generator.py @@ -70,7 +70,7 @@ def generate_workspace( agent_dir = workspace / ".claude" / "agents" agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2")) + (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl)) _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path) _symlink_knowledge_base(workspace) diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2 index 7131c6a..3f7f7a4 100644 --- a/src/xe_forge/claude/templates/CLAUDE.md.j2 +++ b/src/xe_forge/claude/templates/CLAUDE.md.j2 @@ -33,12 +33,12 @@ All runtime behavior below is gated by these values; re-read `config.yaml` if an | **Benchmark** | `xe-forge-skill benchmark --spec [--baseline-us ]` | | **Init trials** | `xe-forge-skill trial init ` | | **Save trial** | `xe-forge-skill trial save [--parent ] [--strategy "..."]` | -| **Record result** | `xe-forge-skill trial result --correctness --speedup --baseline-us --triton-us ` | +| **Record result** | `xe-forge-skill trial result --correctness --speedup --baseline-us --{{ dsl }}-us ` | | **Check status** | `xe-forge-skill trial status ` | | **Best trial** | `xe-forge-skill trial best ` | | **Baseline time** | `xe-forge-skill trial baseline-us ` | | **Finalize** | `xe-forge-skill trial finalize ` | -| **Profile** (only when `vtune_enabled: true`) | `xe-forge-skill profile --spec ` | +| **Profile** (only when `vtune_enabled: true`) | `xe-forge-skill profile <{{ dsl }}_file> --spec ` | ## WORKFLOW — Follow these steps in order @@ -58,10 +58,10 @@ For each trial: 2. **Validate** — `xe-forge-skill validate --dsl {{ dsl }}` (fix until passing). 3. **Save** — `xe-forge-skill trial save {{ kernel_name }} --parent --strategy "description"`. 4. **Benchmark** (MANDATORY every trial): - - **Trial t0:** `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py --spec test_kernels/{{ kernel_name }}.yaml` - - **Trials t1+:** Get cached baseline via `xe-forge-skill trial baseline-us {{ kernel_name }}`, then `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py --spec test_kernels/{{ kernel_name }}.yaml --baseline-us ` -5. **Record** — `xe-forge-skill trial result {{ kernel_name }} --correctness --speedup --baseline-us --triton-us ` -6. **Profile** — if `vtune_enabled: true` in `config.yaml` and this is trial t1 or later, run `xe-forge-skill profile --spec ` and use the output to guide the next trial. If `vtune_enabled: false`, skip this step. + - **Trial t0:** `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <{{ dsl }}_file> --spec test_kernels/{{ kernel_name }}.yaml` + - **Trials t1+:** Get cached baseline via `xe-forge-skill trial baseline-us {{ kernel_name }}`, then `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <{{ dsl }}_file> --spec test_kernels/{{ kernel_name }}.yaml --baseline-us ` +5. **Record** — `xe-forge-skill trial result {{ kernel_name }} --correctness --speedup --baseline-us --{{ dsl }}-us ` +6. **Profile** — if `vtune_enabled: true` in `config.yaml` and this is trial t1 or later, run `xe-forge-skill profile <{{ dsl }}_file> --spec ` and use the output to guide the next trial. If `vtune_enabled: false`, skip this step. 7. **Decide next action**: - Speedup > 5x -> stop, finalize - Speedup improved -> continue on this branch @@ -76,7 +76,9 @@ xe-forge-skill trial finalize {{ kernel_name }} output/{{ kernel_name }}_optimiz ## CRITICAL CORRECTNESS CONSTRAINTS +{% if dsl == "triton" %} - NO default values for `@triton.autotune` meta-parameters in kernel signature +{% endif %} - Use 1D grid when applying tile swizzling (GROUP_SIZE_M) - `boundary_check` uses dimension indices (0, 1), not booleans - Cast batch indices to `int64` before stride multiplication diff --git a/src/xe_forge/claude/templates/tool-runner.md.j2 b/src/xe_forge/claude/templates/tool-runner.md.j2 index 400aa43..adabf86 100644 --- a/src/xe_forge/claude/templates/tool-runner.md.j2 +++ b/src/xe_forge/claude/templates/tool-runner.md.j2 @@ -31,7 +31,7 @@ Safe to parallelize: analyze, validate, trial (CPU-only). ## Output Rules For benchmark: -- Extract ONLY: Correctness (PASSED/FAILED), Performance (baseline_us, triton_us, speedup), Errors. +- Extract ONLY: Correctness (PASSED/FAILED), Performance (baseline_us, {{ dsl }}_us, speedup), Errors. - Do NOT include configuration header or decorative separators. For profile: From af3d02eab560542f9277bc3482146e85a7e2c764 Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Mon, 1 Jun 2026 11:04:16 -0700 Subject: [PATCH 02/12] Avoid access to knowledge base of other DSLs. Signed-off-by: Weilin Xu --- src/xe_forge/claude/generator.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py index 1a3494d..3ec53b1 100644 --- a/src/xe_forge/claude/generator.py +++ b/src/xe_forge/claude/generator.py @@ -73,7 +73,7 @@ def generate_workspace( (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl)) _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path) - _symlink_knowledge_base(workspace) + _symlink_knowledge_base(workspace, dsl) if config.engine.git_init: _git_init(workspace) @@ -96,11 +96,10 @@ def _write_kernel_files( shutil.copy2(spec_path, tk_dir / f"{kernel_name}.yaml") -def _symlink_knowledge_base(workspace: Path) -> None: - """Create a symlink to the installed knowledge_base directory.""" - kb_link = workspace / "knowledge_base" - if kb_link.exists() or kb_link.is_symlink(): - return +def _symlink_knowledge_base(workspace: Path, dsl: str) -> None: + """Symlink common and DSL-specific knowledge base subdirectories.""" + kb_root = workspace / "knowledge_base" + kb_root.mkdir(parents=True, exist_ok=True) import xe_forge @@ -112,7 +111,17 @@ def _symlink_knowledge_base(workspace: Path) -> None: ] for candidate in candidates: if candidate.is_dir(): - kb_link.symlink_to(candidate.resolve()) + src_root = candidate.resolve() + + common_src = src_root / "common" + dsl_src = src_root / dsl + common_dst = kb_root / "common" + dsl_dst = kb_root / dsl + + if common_src.is_dir() and not (common_dst.exists() or common_dst.is_symlink()): + common_dst.symlink_to(common_src) + if dsl_src.is_dir() and not (dsl_dst.exists() or dsl_dst.is_symlink()): + dsl_dst.symlink_to(dsl_src) return From 17f4d2b86d2505715f2844d95affcdb9814ccd99 Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Mon, 1 Jun 2026 14:22:55 -0700 Subject: [PATCH 03/12] Only link to knowledge base of the same DSL and Device. Signed-off-by: Weilin Xu --- src/xe_forge/claude/generator.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py index 3ec53b1..aff369e 100644 --- a/src/xe_forge/claude/generator.py +++ b/src/xe_forge/claude/generator.py @@ -73,7 +73,7 @@ def generate_workspace( (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl)) _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path) - _symlink_knowledge_base(workspace, dsl) + _symlink_knowledge_base(workspace, dsl, device) if config.engine.git_init: _git_init(workspace) @@ -96,8 +96,8 @@ def _write_kernel_files( shutil.copy2(spec_path, tk_dir / f"{kernel_name}.yaml") -def _symlink_knowledge_base(workspace: Path, dsl: str) -> None: - """Symlink common and DSL-specific knowledge base subdirectories.""" +def _symlink_knowledge_base(workspace: Path, dsl: str, device: str) -> None: + """Symlink common and DSL/device-specific knowledge base subdirectories.""" kb_root = workspace / "knowledge_base" kb_root.mkdir(parents=True, exist_ok=True) @@ -114,14 +114,17 @@ def _symlink_knowledge_base(workspace: Path, dsl: str) -> None: src_root = candidate.resolve() common_src = src_root / "common" - dsl_src = src_root / dsl + dsl_device_src = src_root / dsl / device common_dst = kb_root / "common" - dsl_dst = kb_root / dsl + dsl_device_dst = kb_root / dsl / device if common_src.is_dir() and not (common_dst.exists() or common_dst.is_symlink()): common_dst.symlink_to(common_src) - if dsl_src.is_dir() and not (dsl_dst.exists() or dsl_dst.is_symlink()): - dsl_dst.symlink_to(dsl_src) + dsl_device_dst.parent.mkdir(parents=True, exist_ok=True) + if dsl_device_src.is_dir() and not ( + dsl_device_dst.exists() or dsl_device_dst.is_symlink() + ): + dsl_device_dst.symlink_to(dsl_device_src) return From 1b749a4eb50b37a4291d818051a703130e104e36 Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Mon, 1 Jun 2026 14:52:38 -0700 Subject: [PATCH 04/12] Copy the knowledge base, because agents may not find contents in symlinked folders Signed-off-by: Weilin Xu --- src/xe_forge/claude/generator.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py index aff369e..9695d4c 100644 --- a/src/xe_forge/claude/generator.py +++ b/src/xe_forge/claude/generator.py @@ -73,7 +73,7 @@ def generate_workspace( (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl)) _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path) - _symlink_knowledge_base(workspace, dsl, device) + _copy_knowledge_base(workspace, dsl, device) if config.engine.git_init: _git_init(workspace) @@ -96,8 +96,8 @@ def _write_kernel_files( shutil.copy2(spec_path, tk_dir / f"{kernel_name}.yaml") -def _symlink_knowledge_base(workspace: Path, dsl: str, device: str) -> None: - """Symlink common and DSL/device-specific knowledge base subdirectories.""" +def _copy_knowledge_base(workspace: Path, dsl: str, device: str) -> None: + """Copy common and DSL/device-specific knowledge base subdirectories.""" kb_root = workspace / "knowledge_base" kb_root.mkdir(parents=True, exist_ok=True) @@ -118,13 +118,16 @@ def _symlink_knowledge_base(workspace: Path, dsl: str, device: str) -> None: common_dst = kb_root / "common" dsl_device_dst = kb_root / dsl / device - if common_src.is_dir() and not (common_dst.exists() or common_dst.is_symlink()): - common_dst.symlink_to(common_src) + if common_dst.is_symlink(): + common_dst.unlink() + if common_src.is_dir(): + shutil.copytree(common_src, common_dst, dirs_exist_ok=True) + dsl_device_dst.parent.mkdir(parents=True, exist_ok=True) - if dsl_device_src.is_dir() and not ( - dsl_device_dst.exists() or dsl_device_dst.is_symlink() - ): - dsl_device_dst.symlink_to(dsl_device_src) + if dsl_device_dst.is_symlink(): + dsl_device_dst.unlink() + if dsl_device_src.is_dir(): + shutil.copytree(dsl_device_src, dsl_device_dst, dirs_exist_ok=True) return From e01330a450758103500f8f9dcdf5a5dab5422320 Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Tue, 2 Jun 2026 10:11:49 -0700 Subject: [PATCH 05/12] Allow all xe-forge-skill by default. Signed-off-by: Weilin Xu --- src/xe_forge/claude/generator.py | 8 +++++--- src/xe_forge/claude/templates/settings.local.json.j2 | 11 +++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 src/xe_forge/claude/templates/settings.local.json.j2 diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py index 9695d4c..0d7c1a0 100644 --- a/src/xe_forge/claude/generator.py +++ b/src/xe_forge/claude/generator.py @@ -1,8 +1,9 @@ """Generate a Claude Code workspace for kernel optimization. -Creates CLAUDE.md, config.yaml, .claude/commands/, .claude/agents/, -and copies kernel files into the workspace. All text artifacts are -rendered from Jinja templates under ``templates/``. +Creates CLAUDE.md, config.yaml, .claude/settings.local.json, +.claude/commands/, .claude/agents/, and copies kernel files into +the workspace. All text artifacts are rendered from Jinja templates +under ``templates/``. """ from __future__ import annotations @@ -71,6 +72,7 @@ def generate_workspace( agent_dir = workspace / ".claude" / "agents" agent_dir.mkdir(parents=True, exist_ok=True) (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl)) + (workspace / ".claude" / "settings.local.json").write_text(_render("settings.local.json.j2")) _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path) _copy_knowledge_base(workspace, dsl, device) diff --git a/src/xe_forge/claude/templates/settings.local.json.j2 b/src/xe_forge/claude/templates/settings.local.json.j2 new file mode 100644 index 0000000..b0f0e0b --- /dev/null +++ b/src/xe_forge/claude/templates/settings.local.json.j2 @@ -0,0 +1,11 @@ +{ + "permissions": { + "allow": [ + "Bash(xe-forge-skill analyze *)", + "Bash(xe-forge-skill trial *)", + "Bash(xe-forge-skill validate *)", + "Bash(xe-forge-skill benchmark *)", + "Bash(xe-forge-skill profile *)" + ] + } +} \ No newline at end of file From b46b7adb2b41ae94ad9b68de7c30fd662d8f0f4f Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Tue, 2 Jun 2026 10:51:54 -0700 Subject: [PATCH 06/12] Allow writing to the workspace by default. Signed-off-by: Weilin Xu --- src/xe_forge/claude/templates/settings.local.json.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xe_forge/claude/templates/settings.local.json.j2 b/src/xe_forge/claude/templates/settings.local.json.j2 index b0f0e0b..c2ea245 100644 --- a/src/xe_forge/claude/templates/settings.local.json.j2 +++ b/src/xe_forge/claude/templates/settings.local.json.j2 @@ -1,5 +1,6 @@ { "permissions": { + "defaultMode": "acceptEdits", "allow": [ "Bash(xe-forge-skill analyze *)", "Bash(xe-forge-skill trial *)", From 8dcbb4efe33e9ef02e473fb76f1dd5ad342890c6 Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Tue, 2 Jun 2026 13:19:29 -0700 Subject: [PATCH 07/12] Change triton_us to custom_us, because we also generate custom sycl kernels. Signed-off-by: Weilin Xu --- src/xe_forge/claude/templates/CLAUDE.md.j2 | 2 +- src/xe_forge/claude/templates/tool-runner.md.j2 | 2 +- src/xe_forge/core/trial_manager.py | 13 +++++++++---- src/xe_forge/pipeline.py | 2 +- src/xe_forge/skills/__init__.py | 2 +- src/xe_forge/skills/benchmark.py | 6 +++--- src/xe_forge/skills/trial.py | 2 +- 7 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2 index 3f7f7a4..1a54918 100644 --- a/src/xe_forge/claude/templates/CLAUDE.md.j2 +++ b/src/xe_forge/claude/templates/CLAUDE.md.j2 @@ -33,7 +33,7 @@ All runtime behavior below is gated by these values; re-read `config.yaml` if an | **Benchmark** | `xe-forge-skill benchmark --spec [--baseline-us ]` | | **Init trials** | `xe-forge-skill trial init ` | | **Save trial** | `xe-forge-skill trial save [--parent ] [--strategy "..."]` | -| **Record result** | `xe-forge-skill trial result --correctness --speedup --baseline-us --{{ dsl }}-us ` | +| **Record result** | `xe-forge-skill trial result --correctness --speedup --baseline-us --custom-us ` | | **Check status** | `xe-forge-skill trial status ` | | **Best trial** | `xe-forge-skill trial best ` | | **Baseline time** | `xe-forge-skill trial baseline-us ` | diff --git a/src/xe_forge/claude/templates/tool-runner.md.j2 b/src/xe_forge/claude/templates/tool-runner.md.j2 index adabf86..a490cd9 100644 --- a/src/xe_forge/claude/templates/tool-runner.md.j2 +++ b/src/xe_forge/claude/templates/tool-runner.md.j2 @@ -31,7 +31,7 @@ Safe to parallelize: analyze, validate, trial (CPU-only). ## Output Rules For benchmark: -- Extract ONLY: Correctness (PASSED/FAILED), Performance (baseline_us, {{ dsl }}_us, speedup), Errors. +- Extract ONLY: Correctness (PASSED/FAILED), Performance (baseline_us, custom_us, speedup), Errors. - Do NOT include configuration header or decorative separators. For profile: diff --git a/src/xe_forge/core/trial_manager.py b/src/xe_forge/core/trial_manager.py index 4009d86..e782f0e 100644 --- a/src/xe_forge/core/trial_manager.py +++ b/src/xe_forge/core/trial_manager.py @@ -40,6 +40,8 @@ def _load_state(self, kernel_name: str) -> dict: for trial in state.get("trials", {}).values(): if "pytorch_us" in trial and "baseline_us" not in trial: trial["baseline_us"] = trial.pop("pytorch_us") + if "triton_us" in trial and "custom_us" not in trial: + trial["custom_us"] = trial.pop("triton_us") return state def _save_state(self, kernel_name: str, state: dict) -> None: @@ -118,7 +120,7 @@ def save_trial( "correctness": None, "speedup": None, "baseline_us": None, - "triton_us": None, + "custom_us": None, "status": "saved", } self._save_state(kernel_name, state) @@ -134,6 +136,7 @@ def record_result( correctness: str | None = None, speedup: float | None = None, baseline_us: float | None = None, + custom_us: float | None = None, triton_us: float | None = None, ) -> dict: """Record benchmark results for a trial. Returns the trial dict.""" @@ -153,8 +156,10 @@ def record_result( trial["speedup"] = speedup if baseline_us is not None: trial["baseline_us"] = baseline_us + if custom_us is not None: + trial["custom_us"] = custom_us if triton_us is not None: - trial["triton_us"] = triton_us + trial["custom_us"] = triton_us if baseline_us is not None and state.get("baseline_us") is None: state["baseline_us"] = [baseline_us] @@ -222,8 +227,8 @@ def _render(tid: str, prefix: str = "", is_last: bool = True) -> None: icon = status_icon.get(trial["status"], "?") speedup_str = f"{trial['speedup']:.2f}x" if trial["speedup"] is not None else "---" runtime = "" - if trial.get("baseline_us") is not None and trial.get("triton_us") is not None: - runtime = f" (bl={trial['baseline_us']:.0f}us, tr={trial['triton_us']:.0f}us)" + if trial.get("baseline_us") is not None and trial.get("custom_us") is not None: + runtime = f" (bl={trial['baseline_us']:.0f}us, opt={trial['custom_us']:.0f}us)" best_marker = " <<<< BEST" if tid == state["best_trial"] else "" strategy_short = (trial["strategy"] or "")[:60] lines.append( diff --git a/src/xe_forge/pipeline.py b/src/xe_forge/pipeline.py index 8e29caf..8eb0648 100644 --- a/src/xe_forge/pipeline.py +++ b/src/xe_forge/pipeline.py @@ -443,7 +443,7 @@ def optimize( correctness="pass" if stage_result.success else "fail", speedup=speedup, baseline_us=(val_orig_ms or 0) * 1000, - triton_us=(current_ms or 0) * 1000, + custom_us=(current_ms or 0) * 1000, ) if stage_result.success: last_trial_id = trial_id diff --git a/src/xe_forge/skills/__init__.py b/src/xe_forge/skills/__init__.py index 0d1ac02..43d5cdb 100644 --- a/src/xe_forge/skills/__init__.py +++ b/src/xe_forge/skills/__init__.py @@ -66,7 +66,7 @@ def main(): t_result.add_argument("--correctness", choices=["pass", "fail"]) t_result.add_argument("--speedup", type=float) t_result.add_argument("--baseline-us", type=float) - t_result.add_argument("--triton-us", type=float) + t_result.add_argument("--custom-us", "--triton-us", dest="custom_us", type=float) t_result.add_argument("--trials-dir", default="./trials") t_status = trial_sub.add_parser("status") diff --git a/src/xe_forge/skills/benchmark.py b/src/xe_forge/skills/benchmark.py index d416d51..8486d0d 100644 --- a/src/xe_forge/skills/benchmark.py +++ b/src/xe_forge/skills/benchmark.py @@ -32,14 +32,14 @@ def run(args): init_args=init_args, input_dtypes=input_dtypes, ) - if optimized_result.success: + if optimized_result.success and optimized_result.execution_time_ms is not None: baseline_ms = sum(baseline_us) / len(baseline_us) / 1000.0 opt_ms = optimized_result.execution_time_ms speedup = baseline_ms / opt_ms if opt_ms > 0 else 0 print(f"Correctness: {'PASSED' if optimized_result.success else 'FAILED'}") print( f"Performance: baseline_us={baseline_ms * 1000:.2f}, " - f"triton_us={opt_ms * 1000:.2f}, speedup={speedup:.2f}x" + f"custom_us={opt_ms * 1000:.2f}, speedup={speedup:.2f}x" ) else: print("Correctness: FAILED") @@ -58,7 +58,7 @@ def run(args): if result.original_time_us and result.optimized_time_us: print( f"Performance: baseline_us={result.original_time_us:.2f}, " - f"triton_us={result.optimized_time_us:.2f}, speedup={result.speedup:.2f}x" + f"custom_us={result.optimized_time_us:.2f}, speedup={result.speedup:.2f}x" ) if result.feedback_message: print(f"Feedback: {result.feedback_message}") diff --git a/src/xe_forge/skills/trial.py b/src/xe_forge/skills/trial.py index 2792770..e3c0179 100644 --- a/src/xe_forge/skills/trial.py +++ b/src/xe_forge/skills/trial.py @@ -30,7 +30,7 @@ def run(args): correctness=args.correctness, speedup=args.speedup, baseline_us=args.baseline_us, - triton_us=args.triton_us, + custom_us=getattr(args, "custom_us", getattr(args, "triton_us", None)), ) status_icon = {"completed": "+", "failed": "X", "partial": "~", "saved": "?"} icon = status_icon.get(trial["status"], "?") From de564ae90465f50b25a1864bd3c86c28fb6163f5 Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Mon, 8 Jun 2026 12:33:38 -0700 Subject: [PATCH 08/12] settings.local.json --> settings.json Signed-off-by: Weilin Xu --- src/xe_forge/claude/generator.py | 2 +- .../templates/{settings.local.json.j2 => settings.json.j2} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/xe_forge/claude/templates/{settings.local.json.j2 => settings.json.j2} (100%) diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py index 0d7c1a0..c83782e 100644 --- a/src/xe_forge/claude/generator.py +++ b/src/xe_forge/claude/generator.py @@ -72,7 +72,7 @@ def generate_workspace( agent_dir = workspace / ".claude" / "agents" agent_dir.mkdir(parents=True, exist_ok=True) (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl)) - (workspace / ".claude" / "settings.local.json").write_text(_render("settings.local.json.j2")) + (workspace / ".claude" / "settings.json").write_text(_render("settings.json.j2")) _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path) _copy_knowledge_base(workspace, dsl, device) diff --git a/src/xe_forge/claude/templates/settings.local.json.j2 b/src/xe_forge/claude/templates/settings.json.j2 similarity index 100% rename from src/xe_forge/claude/templates/settings.local.json.j2 rename to src/xe_forge/claude/templates/settings.json.j2 From e5294aeb9bb111ea55f4e688492d3e8eac3373ed Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Mon, 8 Jun 2026 12:51:34 -0700 Subject: [PATCH 09/12] Add more context for SYCL*TLA generation. Signed-off-by: Weilin Xu --- knowledge_base/sycl/xpu/SYCL_lessons.md | 171 ++++++ ...1_Square_matrix_multiplication_solution.py | 510 ++++++++++++++++++ src/xe_forge/claude/templates/CLAUDE.md.j2 | 12 + 3 files changed, 693 insertions(+) create mode 100644 knowledge_base/sycl/xpu/SYCL_lessons.md create mode 100644 knowledge_base/sycl/xpu/examples/1_Square_matrix_multiplication_solution.py diff --git a/knowledge_base/sycl/xpu/SYCL_lessons.md b/knowledge_base/sycl/xpu/SYCL_lessons.md new file mode 100644 index 0000000..d80866d --- /dev/null +++ b/knowledge_base/sycl/xpu/SYCL_lessons.md @@ -0,0 +1,171 @@ +# SYCL Lessons + +## Lesson: Named Kernels for `parallel_for` + +### Symptom + +Compilation can fail with an error similar to: + +```text +unnamed type '(lambda ...)' is invalid; provide a kernel name, or use '-fsycl-unnamed-lambda' +``` + +### Cause + +Some SYCL toolchains reject unnamed kernel lambdas in `parallel_for`. + +### Fix + +Prefer an explicitly named kernel: + +```cpp +h.parallel_for( + sycl::nd_range<1>(global_size, WG_SIZE), + [=](sycl::nd_item<1> item) { + // kernel body + } +); +``` + +### Fallback + +If a code path cannot be easily updated to named kernels, add this compile flag: + +```text +-fsycl-unnamed-lambda +``` + +### Practical Note + +For portability and future compiler compatibility, named kernels are the safer default. + +--- + +## Lesson: Sub-group Coalesced Memory Access for Intel Xe GPUs + +### Symptom + +A kernel performs element-wise operations on large tensors but runs at only 0.66× PyTorch speed, despite using vectorized loads and a reasonable work-group size. + +### Root Cause + +**Poor memory coalescing**. If each work-item processes a contiguous block of N elements independently, lanes within a sub-group access memory N elements apart. With N=32 and sub-group size=16, consecutive lanes load from offsets 0, 32, 64, ... — hitting 16 separate cache lines per SIMD instruction, wasting 75% of fetched data. + +### Fix: Sub-group Aware Layout + +Reorganize the computation so **consecutive lanes access consecutive memory addresses**. For bf16 pairs, treat them as `uint32` and distribute work by lane: + +```cpp +enum { WG = 512, ITERS = 128, SG_SZ = 16, SGS_PER_WG = WG / SG_SZ }; +enum { PAIRS_PER_SG = SG_SZ * ITERS }; // 2048 pairs per sub-group + +q->submit([&](sycl::handler& h) { + h.parallel_for( + sycl::nd_range<1>(global, WG), + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(16)]] { + auto sg = item.get_sub_group(); + int lane = sg.get_local_linear_id(); + int64_t sg_id = item.get_group(0) * SGS_PER_WG + sg.get_group_linear_id(); + int64_t sg_base = sg_id * PAIRS_PER_SG; + + #pragma unroll + for (int i = 0; i < ITERS; i++) { + int64_t idx = sg_base + i * SG_SZ + lane; + // Lane 0 reads idx=sg_base+0 + // Lane 1 reads idx=sg_base+1 + // ... Lane 15 reads idx=sg_base+15 + // All lanes read from same cache line! + uint32_t data = load_u32(idx); + process_and_store(data, idx); + } + }); +}); +``` + +**Key points:** +1. All 16 lanes in a sub-group load from consecutive offsets: `[sg_base, sg_base+1, ..., sg_base+15]`. +2. Together they fill exactly **one 64-byte cache line** per iteration. +3. Use `[[sycl::reqd_sub_group_size(16)]]` to guarantee the lane mapping. +4. Use `enum` instead of `constexpr` for compile-time constants to avoid DPC++ host/device capture mismatch. + +### Expected Impact + +- **Memory bandwidth utilization**: ~100% (vs. 25% with poor coalescing). +- **Performance gain**: 0.66× → 0.97× (47% improvement toward parity with optimized reference). + +### Reference (oneAPI GPU Optimization Guide) + +- Thread Mapping and GPU Occupancy (Section 6) +- Memory Hierarchy and Access Patterns (Section 7) +- Sub-group sizes: 16 or 32 for most Xe GPUs; use `get_sub_group()` API to query and adapt at runtime. + +--- + +## Lesson: Speeding Up Row-Scale Kernels (diag(A) * B) + +### Symptom + +The row-wise broadcast multiply kernel `C[i,j] = A[i] * B[i,j]` is correct but only about 0.62x as fast as `torch.compile()`. + +### Root Causes + +- Per-element integer division (`row = idx / M`) in the hot loop +- Scalar bf16 loads/stores and scalar conversion overhead +- Weak memory coalescing pattern across lanes + +### Fix Pattern + +1. Map one work-group to one row +- Use `row = item.get_group(0)`. +- Remove integer division from the inner loop entirely. + +2. Process bf16 pairs as `uint32` +- Load/store two bf16 values at once. +- This doubles useful bytes per memory op with naturally aligned 32-bit accesses. + +3. Use coalesced lane mapping along columns +- Let consecutive local IDs handle consecutive pair indices. +- Stride by work-group size in the column loop. + +4. Use explicit sub-group size on Xe +- Add `[[sycl::reqd_sub_group_size(16)]]` for predictable lane behavior. + +5. Convert bf16 manually with bit operations +- `bf16 -> f32`: left-shift by 16 and bit-cast. +- `f32 -> bf16`: round-to-nearest-even using bias and sticky-bit logic. + +### Minimal Kernel Structure + +```cpp +const int pairs_per_row = M / 2; +enum { WG = 256 }; + +q.submit([&](sycl::handler& h) { + h.parallel_for( + sycl::nd_range<1>((size_t)N * WG, WG), + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(16)]] { + int row = item.get_group(0); + int lid = item.get_local_id(0); + + float a_val = bf16_to_f32(a[row]); + const uint32_t* b_row = b32 + row * pairs_per_row; + uint32_t* c_row = c32 + row * pairs_per_row; + + for (int pi = lid; pi < pairs_per_row; pi += WG) { + uint32_t p = b_row[pi]; + c_row[pi] = mul_pair_bf16(a_val, p); + } + }); +}); +``` + +### Measured Impact + +- Versus eager PyTorch: improved to about 1.46x +- Versus `torch.compile()`: improved from about 0.62x to about 0.93x + +### oneAPI Guide Alignment + +- Section 6: Thread mapping and occupancy (work-group decomposition) +- Section 7: Memory hierarchy and coalesced access +- Section 11: Prefer wide/coalesced accesses and remove expensive operations from inner loops diff --git a/knowledge_base/sycl/xpu/examples/1_Square_matrix_multiplication_solution.py b/knowledge_base/sycl/xpu/examples/1_Square_matrix_multiplication_solution.py new file mode 100644 index 0000000..0ea71ef --- /dev/null +++ b/knowledge_base/sycl/xpu/examples/1_Square_matrix_multiplication_solution.py @@ -0,0 +1,510 @@ +import os +import re +import sys +from pathlib import Path + +import torch +import torch.nn as nn + +# --------------------------------------------------------------------------- +# IGC workaround: raise vector alias analysis threshold for large CUTLASS kernels +# --------------------------------------------------------------------------- +os.environ.setdefault("IGC_VectorAliasBBThreshold", "100000000000") + +# --------------------------------------------------------------------------- +# Tuning knobs — override these to match your problem size. +# Run onednn_gemm_tuning.py to find optimal values. +# --------------------------------------------------------------------------- +TILE_M = 256 +TILE_N = 256 +TILE_K = 32 +GRF_COUNT = 256 +SCHEDULER_TYPE = 1 # 0=default, 1=Persistent, 2=StreamK +EPILOGUE_TYPE = 0 # 0=LinComb, 1=ReLU, 2=GELU + +# --------------------------------------------------------------------------- +# Inline SYCL kernel source — sycl_tla_gemm_template.cpp +# All tile/GRF/scheduler/epilogue knobs are set via -D flags at compile time. +# --------------------------------------------------------------------------- +_KERNEL_SYCL = r""" +#include +#include +#include + +#include "cute/tensor.hpp" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/fusion/operations.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/kernel/tile_scheduler.hpp" +#include "cutlass/util/packed_stride.hpp" + +using namespace cute; + +// ---- Tunable parameters (set via -D flags from oneDNN tuning output) ---- + +#ifndef TILE_M +#define TILE_M 256 +#endif +#ifndef TILE_N +#define TILE_N 256 +#endif +#ifndef TILE_K +#define TILE_K 32 +#endif +#ifndef GRF_COUNT +#define GRF_COUNT 256 +#endif +static_assert(GRF_COUNT == 128 || GRF_COUNT == 256, + "GRF_COUNT must be 128 or 256."); +#ifndef SCHEDULER_TYPE +#define SCHEDULER_TYPE 1 +#endif +#ifndef ELEMENT_INPUT +#define ELEMENT_INPUT cutlass::bfloat16_t +#endif +#ifndef ELEMENT_ACC +#define ELEMENT_ACC float +#endif +#ifndef ELEMENT_OUTPUT +#define ELEMENT_OUTPUT cutlass::bfloat16_t +#endif +#ifndef TORCH_OUTPUT_DTYPE +#define TORCH_OUTPUT_DTYPE torch::kBFloat16 +#endif +#ifndef TORCH_OUTPUT_CTYPE +#define TORCH_OUTPUT_CTYPE at::BFloat16 +#endif +#ifndef ALIGNMENT +#define ALIGNMENT 8 +#endif +#ifndef LAYOUT_A +#define LAYOUT_A 0 +#endif +#ifndef LAYOUT_B +#define LAYOUT_B 0 +#endif +#ifndef EPILOGUE_TYPE +#define EPILOGUE_TYPE 0 +#endif + +#define CUTLASS_CHECK(status) \ + { \ + cutlass::Status _err = (status); \ + if (_err != cutlass::Status::kSuccess) { \ + TORCH_CHECK(false, "[CUTLASS] ", cutlassGetStatusString(_err), \ + " at ", __FILE__, ":", __LINE__); \ + } \ + } + +// ---- GEMM kernel type definitions ---- + +using TileShape = cute::Shape, cute::Int, cute::Int>; +using ClusterShape = cute::Shape; + +#if LAYOUT_A == 1 +using LayoutA = cutlass::layout::ColumnMajor; +#else +using LayoutA = cutlass::layout::RowMajor; +#endif +#if LAYOUT_B == 1 +using LayoutB = cutlass::layout::ColumnMajor; +#else +using LayoutB = cutlass::layout::RowMajor; +#endif + +using StageCount = cutlass::gemm::collective::StageCountAuto; + +#if SCHEDULER_TYPE == 2 +using KernelSchedule = cutlass::gemm::KernelXeCooperative; +#else +using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; +#endif + +// Epilogue schedule must match kernel schedule for cooperative mode +#if SCHEDULER_TYPE == 2 +using EpilogueSchedule = cutlass::epilogue::XeCooperative; +#else +using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; +#endif + +#if EPILOGUE_TYPE == 1 +using EpilogueFusionOp = cutlass::epilogue::fusion::LinCombEltAct< + cutlass::epilogue::thread::ReLu, ELEMENT_OUTPUT, ELEMENT_ACC, ELEMENT_OUTPUT, ELEMENT_ACC>; +#elif EPILOGUE_TYPE == 2 +using EpilogueFusionOp = cutlass::epilogue::fusion::LinCombEltAct< + cutlass::epilogue::thread::GELU, ELEMENT_OUTPUT, ELEMENT_ACC, ELEMENT_OUTPUT, ELEMENT_ACC>; +#else +using EpilogueFusionOp = cutlass::epilogue::fusion::LinearCombination< + ELEMENT_OUTPUT, ELEMENT_ACC, ELEMENT_OUTPUT, ELEMENT_ACC>; +#endif + +using gemm_epilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp, + TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ELEMENT_ACC, ELEMENT_ACC, + ELEMENT_OUTPUT, cutlass::layout::RowMajor, ALIGNMENT, + ELEMENT_OUTPUT, cutlass::layout::RowMajor, ALIGNMENT, + EpilogueSchedule, + EpilogueFusionOp + >::CollectiveOp; + +using gemm_mainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp, + ELEMENT_INPUT, LayoutA, ALIGNMENT, + ELEMENT_INPUT, LayoutB, ALIGNMENT, + ELEMENT_ACC, + TileShape, ClusterShape, + StageCount, KernelSchedule + >::CollectiveOp; + +#if SCHEDULER_TYPE == 2 +using TileScheduler = cutlass::gemm::StreamKScheduler; +#elif SCHEDULER_TYPE == 1 +using TileScheduler = cutlass::gemm::PersistentScheduler; +#else +using TileScheduler = void; +#endif + +using gemm_kernel_base = cutlass::gemm::kernel::GemmUniversal< + cute::Shape, + gemm_mainloop, gemm_epilogue, TileScheduler>; +struct gemm_kernel : public gemm_kernel_base {}; +using gemm_device = cutlass::gemm::device::GemmUniversalAdapter; + +// ---- PyTorch binding: gemm_forward(A, B, out?, alpha, beta) ---- +// Accepts 2-D or 3-D (batched) inputs. Forward-only. + +torch::Tensor gemm_forward(torch::Tensor A, torch::Tensor B, + std::optional out = std::nullopt, + float alpha = 1.0f, float beta = 0.0f) { + TORCH_CHECK(A.device().is_xpu(), "A must be an XPU tensor"); + TORCH_CHECK(B.device().is_xpu(), "B must be an XPU tensor"); + TORCH_CHECK(A.dtype() == TORCH_OUTPUT_DTYPE, "A dtype mismatch"); + TORCH_CHECK(B.dtype() == TORCH_OUTPUT_DTYPE, "B dtype mismatch"); + TORCH_CHECK(A.dim() == 2 || A.dim() == 3, "A must be 2-D or 3-D"); + TORCH_CHECK(B.dim() == 2 || B.dim() == 3, "B must be 2-D or 3-D"); + TORCH_CHECK(A.dim() == B.dim(), "A and B must have same number of dimensions"); + + const bool batched = (A.dim() == 3); + if (batched) { + TORCH_CHECK(A.size(0) == B.size(0), "Batch size mismatch"); + } + // Dimension check: K must match between A and B +#if LAYOUT_A == 1 + const int64_t A_K = A.size(-2); +#else + const int64_t A_K = A.size(-1); +#endif +#if LAYOUT_B == 1 + const int64_t B_K = B.size(-1); +#else + const int64_t B_K = B.size(-2); +#endif + TORCH_CHECK(A_K == B_K, "Dimension mismatch: A's K=", A_K, " != B's K=", B_K); + + A = A.contiguous(); + B = B.contiguous(); + + const int L = batched ? static_cast(A.size(0)) : 1; +#if LAYOUT_A == 1 + const int M = static_cast(A.size(-1)); + const int K = static_cast(A.size(-2)); +#else + const int M = static_cast(A.size(-2)); + const int K = static_cast(A.size(-1)); +#endif +#if LAYOUT_B == 1 + const int N = static_cast(B.size(-2)); +#else + const int N = static_cast(B.size(-1)); +#endif + const int64_t batch_stride_A = batched ? int64_t(M) * K : int64_t(0); + const int64_t batch_stride_B = batched ? int64_t(K) * N : int64_t(0); + const int64_t batch_stride_D = batched ? int64_t(M) * N : int64_t(0); + + torch::Tensor D; + if (out.has_value()) { + D = out.value(); + TORCH_CHECK(D.device() == A.device(), "out must be on same device as A"); + TORCH_CHECK(D.dtype() == TORCH_OUTPUT_DTYPE, "out dtype mismatch"); + if (batched) { + TORCH_CHECK(D.dim() == 3 && D.size(0) == L && D.size(1) == M && D.size(2) == N, + "out shape mismatch"); + } else { + TORCH_CHECK(D.dim() == 2 && D.size(0) == M && D.size(1) == N, + "out shape mismatch"); + } + D = D.contiguous(); + } else { + if (batched) { + D = torch::empty({L, M, N}, A.options()); + } else { + D = torch::empty({M, N}, A.options()); + } + } + + using coord_t = cutlass::gemm::GemmCoord::Index; + const int device_idx = A.device().index(); + cutlass::KernelHardwareInfo hw_info; + hw_info.sm_count = + cutlass::KernelHardwareInfo::query_device_multiprocessor_count(device_idx); + + gemm_device::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {static_cast(M), static_cast(N), + static_cast(K), static_cast(L)}, + { + reinterpret_cast(A.data_ptr()), +#if LAYOUT_A == 1 + {cute::Int<1>{}, int64_t(M), batch_stride_A}, +#else + {int64_t(K), cute::Int<1>{}, batch_stride_A}, +#endif + reinterpret_cast(B.data_ptr()), +#if LAYOUT_B == 1 + {int64_t(K), cute::Int<1>{}, batch_stride_B}, +#else + {cute::Int<1>{}, int64_t(N), batch_stride_B}, +#endif + }, + { + {alpha, beta}, + (beta != 0.f) ? reinterpret_cast(D.data_ptr()) : nullptr, + {int64_t(N), cute::Int<1>{}, batch_stride_D}, + reinterpret_cast(D.data_ptr()), + {int64_t(N), cute::Int<1>{}, batch_stride_D}, + }, + hw_info + }; + + gemm_device gemm_op; + size_t ws_bytes = gemm_op.get_workspace_size(arguments); + auto workspace = torch::empty( + {static_cast(ws_bytes)}, + torch::TensorOptions().dtype(torch::kByte).device(A.device())); + + auto stream = c10::xpu::getCurrentXPUStream(A.device().index()); + sycl::queue* queue = &stream.queue(); + +#ifndef CUTLASS_BACKEND_DISABLE_CHECKS + CUTLASS_CHECK(gemm_op.can_implement(arguments)); +#endif + CUTLASS_CHECK(gemm_op.initialize( + arguments, static_cast(workspace.data_ptr()), queue)); + CUTLASS_CHECK(gemm_op(queue)); + return D; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("gemm_forward", [](torch::Tensor A, torch::Tensor B, py::object out, + float alpha, float beta) -> torch::Tensor { + std::optional opt_out; + if (!out.is_none()) opt_out = out.cast(); + return gemm_forward(A, B, opt_out, alpha, beta); + }, + "Forward-only GEMM (no autograd backward). " + "Accepts 2-D or 3-D (batched) inputs.", + py::arg("A"), py::arg("B"), py::arg("out") = py::none(), + py::arg("alpha") = 1.0f, py::arg("beta") = 0.0f); +} +""" + + +# --------------------------------------------------------------------------- +# JIT compilation +# --------------------------------------------------------------------------- + +def _get_cutlass_version(cutlass_root: Path) -> str: + version_h = cutlass_root / "include" / "cutlass" / "version.h" + if not version_h.exists(): + return "unknown" + text = version_h.read_text() + major = re.search(r"#define\s+CUTLASS_MAJOR\s+(\d+)", text) + minor = re.search(r"#define\s+CUTLASS_MINOR\s+(\d+)", text) + patch = re.search(r"#define\s+CUTLASS_PATCH\s+(\d+)", text) + if major and minor and patch: + return f"{major.group(1)}.{minor.group(1)}.{patch.group(1)}" + return "unknown" + + +def _detect_sycl_target() -> str: + name = torch.xpu.get_device_name(0).lower() + if any(x in name for x in ("b580", "b570", "b770", "b50", "b60", "bmg-g21")): + return "bmg-g21" + if any(x in name for x in ("b70", "bmg-g31")): + return "bmg-g31" + if "bmg" in name or "battlemage" in name: + return "bmg-g21" + if any(x in name for x in ("a770", "a750", "a580", "a380", "a310", "dg2")): + return "acm-g10" + if "max" in name or "pvc" in name: + return "pvc" + if "lunar" in name or "lnl" in name: + return "lnl-m" + return "bmg-g21" + + +def _load_extension(sycl_target: str = "bmg-g21"): + import torch.utils.cpp_extension as _cpp_ext + from torch.utils.cpp_extension import load_inline + import shutil + + _cutlass_path = os.environ.get("CUTLASS_PATH") + if not _cutlass_path: + sys.exit("ERROR: CUTLASS_PATH environment variable is not set.") + cutlass_root = Path(_cutlass_path) + + _cutlass_ver = _get_cutlass_version(cutlass_root) + _ext_name = ( + f"sycl_tla_gemm_{TILE_M}x{TILE_N}x{TILE_K}_" + f"grf{GRF_COUNT}_sched{SCHEDULER_TYPE}_epi{EPILOGUE_TYPE}_" + f"{_cutlass_ver}_{sycl_target}" + ).replace(".", "_").replace("-", "_") + + extra_include_paths = [ + str(cutlass_root / "include"), + str(cutlass_root / "tools" / "util" / "include"), + ] + + _cutlass_defines = [ + "-DCUTLASS_ENABLE_SYCL", + "-DSYCL_INTEL_TARGET", + "-DCUTLASS_VERSIONS_GENERATED", + "-DCUTLASS_BACKEND_DISABLE_CHECKS", + "-DNDEBUG", + ] + + _tuning_defines = [ + f"-DTILE_M={TILE_M}", + f"-DTILE_N={TILE_N}", + f"-DTILE_K={TILE_K}", + f"-DGRF_COUNT={GRF_COUNT}", + f"-DSCHEDULER_TYPE={SCHEDULER_TYPE}", + f"-DEPILOGUE_TYPE={EPILOGUE_TYPE}", + ] + + extra_sycl_cflags = [ + "-fno-sycl-instrument-device-code", + "-fsycl-targets=spir64_gen", + *_cutlass_defines, + *_tuning_defines, + "-O3", + "-Wno-unused-variable", + "-Wno-unused-local-typedef", + "-Wno-unused-but-set-variable", + "-Wno-uninitialized", + "-Wno-reorder-ctor", + "-Wno-logical-op-parentheses", + "-Wno-unused-function", + "-Wno-unknown-pragmas", + ] + + _dlink_extra_flags = [ + "-Xspirv-translator", + "-spirv-ext=+SPV_INTEL_split_barrier," + "+SPV_INTEL_2d_block_io," + "+SPV_INTEL_subgroup_matrix_multiply_accumulate", + ] + if GRF_COUNT == 256: + _dlink_extra_flags += [ + "-Xs", + '"-options -cl-intel-256-GRF-per-thread"', + ] + for flag in _dlink_extra_flags: + if flag not in _cpp_ext._SYCL_DLINK_FLAGS: + _cpp_ext._SYCL_DLINK_FLAGS.append(flag) + + extra_ldflags = [] + _mklroot = Path(os.environ["MKLROOT"]) if "MKLROOT" in os.environ else None + if _mklroot and (_mklroot / "lib").exists(): + _mkl_lib = _mklroot / "lib" + extra_ldflags += [ + f"-Wl,-rpath,{_mkl_lib}", f"-L{_mkl_lib}", + "-lmkl_intel_ilp64", + "-lmkl_intel_thread", + "-lmkl_core", + ] + + icpx = shutil.which("icpx") + _compiler_lib = None + if icpx: + _icpx_lib = Path(icpx).resolve().parent.parent / "lib" + if _icpx_lib.exists(): + _compiler_lib = _icpx_lib + if _compiler_lib is None: + _cmplr_root = ( + Path(os.environ["CMPLR_ROOT"]) if "CMPLR_ROOT" in os.environ + else None + ) + if _cmplr_root and (_cmplr_root / "lib").exists(): + _compiler_lib = _cmplr_root / "lib" + if _compiler_lib: + extra_ldflags += [ + f"-Wl,-rpath,{_compiler_lib}", f"-L{_compiler_lib}", + "-liomp5", + ] + + os.environ["TORCH_XPU_ARCH_LIST"] = sycl_target + + module = load_inline( + name=_ext_name, + cpp_sources=[""], + sycl_sources=[_KERNEL_SYCL], + extra_cflags=_cutlass_defines + _tuning_defines, + extra_sycl_cflags=extra_sycl_cflags, + extra_include_paths=extra_include_paths, + extra_ldflags=extra_ldflags, + with_sycl=True, + verbose=False, + no_implicit_headers=True, + ) + return module + + +# --------------------------------------------------------------------------- +# Compile the extension at module load time (cached by PyTorch) +# --------------------------------------------------------------------------- +SYCL_TARGET = os.environ.get("SYCL_TLA_TARGET") or _detect_sycl_target() +_module = _load_extension(sycl_target=SYCL_TARGET) + + +# --------------------------------------------------------------------------- +# Model class — drop-in replacement for the reference +# --------------------------------------------------------------------------- + +class Model(nn.Module): + """ + Model that performs square matrix multiplication (C = A * B) + using sycl_tla_gemm_template with tuning knobs set via -D flags. + """ + def __init__(self): + super(Model, self).__init__() + + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + return _module.gemm_forward(A, B) + + +N = 2048 * 2 + + +def get_inputs(): + A = torch.rand(N, N) + B = torch.rand(N, N) + return [A, B] + + +def get_init_inputs(): + return [] diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2 index 1a54918..57bda13 100644 --- a/src/xe_forge/claude/templates/CLAUDE.md.j2 +++ b/src/xe_forge/claude/templates/CLAUDE.md.j2 @@ -2,6 +2,18 @@ Optimize kernels into high-performance {{ dsl | upper }} implementations for {{ device | upper }}. +{% if dsl == "sycl" %} +If the reference is PyTorch code, write custom SYCL C++ kernels that implement the same +computation. The kernel must be wrapped in a Python .py file using torch.utils.cpp_extension +(inline_extension or load_inline) to compile and call the SYCL kernel from Python. The Model +class in the .py wrapper must be compatible with the reference model in terms of the state_dict, +and the __init__() and forward() methods. + +Do NOT simply call existing PyTorch operators (torch.mm, F.linear, torch.compile, etc.). +The goal is to write hand-optimized SYCL C++ code using SYCL*TLA (CUTLASS), DPAS/XMX instructions, +shared local memory (SLM), and tiling patterns from the knowledge_base. +{% endif %} + ## CONFIGURATION — Read `config.yaml` first At the start of every session, read `config.yaml` in the workspace root. It controls: From 010d8cae19311abf607d52fbe4144ff328558eaf Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Tue, 9 Jun 2026 07:15:20 -0700 Subject: [PATCH 10/12] Use consistent --custom-us Signed-off-by: Weilin Xu --- src/xe_forge/claude/templates/CLAUDE.md.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2 index 57bda13..b9a38a0 100644 --- a/src/xe_forge/claude/templates/CLAUDE.md.j2 +++ b/src/xe_forge/claude/templates/CLAUDE.md.j2 @@ -72,7 +72,7 @@ For each trial: 4. **Benchmark** (MANDATORY every trial): - **Trial t0:** `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <{{ dsl }}_file> --spec test_kernels/{{ kernel_name }}.yaml` - **Trials t1+:** Get cached baseline via `xe-forge-skill trial baseline-us {{ kernel_name }}`, then `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <{{ dsl }}_file> --spec test_kernels/{{ kernel_name }}.yaml --baseline-us ` -5. **Record** — `xe-forge-skill trial result {{ kernel_name }} --correctness --speedup --baseline-us --{{ dsl }}-us ` +5. **Record** — `xe-forge-skill trial result {{ kernel_name }} --correctness --speedup --baseline-us --custom-us ` 6. **Profile** — if `vtune_enabled: true` in `config.yaml` and this is trial t1 or later, run `xe-forge-skill profile <{{ dsl }}_file> --spec ` and use the output to guide the next trial. If `vtune_enabled: false`, skip this step. 7. **Decide next action**: - Speedup > 5x -> stop, finalize From fa8a9e0128e774625c35cc94c88fe09eca1293b4 Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Tue, 9 Jun 2026 10:25:00 -0700 Subject: [PATCH 11/12] Instruct Claude Code to use the knowledge base in the workspace. Signed-off-by: Weilin Xu --- src/xe_forge/claude/templates/CLAUDE.md.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2 index b9a38a0..0ca8466 100644 --- a/src/xe_forge/claude/templates/CLAUDE.md.j2 +++ b/src/xe_forge/claude/templates/CLAUDE.md.j2 @@ -57,7 +57,7 @@ All runtime behavior below is gated by these values; re-read `config.yaml` if an ### Step 1: Analyze - Read the baseline source file `test_kernels/{{ kernel_name }}.py`. Identify shapes, dtypes, operations. - Run `xe-forge-skill analyze test_kernels/{{ kernel_name }}.py` to get AST-based analysis. -- Read relevant knowledge_base/ files for optimization patterns. +- Read relevant knowledge_base/ files in the workspace for optimization patterns. ### Step 2: Initialize ```bash From 14ee963ccb32b975812259851c934f0a98b02dfe Mon Sep 17 00:00:00 2001 From: Weilin Xu Date: Tue, 9 Jun 2026 10:25:19 -0700 Subject: [PATCH 12/12] Update SYCL lessons. Signed-off-by: Weilin Xu --- knowledge_base/sycl/xpu/SYCL_lessons.md | 112 +++++++++++++++--------- 1 file changed, 71 insertions(+), 41 deletions(-) diff --git a/knowledge_base/sycl/xpu/SYCL_lessons.md b/knowledge_base/sycl/xpu/SYCL_lessons.md index d80866d..a9d70dd 100644 --- a/knowledge_base/sycl/xpu/SYCL_lessons.md +++ b/knowledge_base/sycl/xpu/SYCL_lessons.md @@ -1,46 +1,5 @@ # SYCL Lessons -## Lesson: Named Kernels for `parallel_for` - -### Symptom - -Compilation can fail with an error similar to: - -```text -unnamed type '(lambda ...)' is invalid; provide a kernel name, or use '-fsycl-unnamed-lambda' -``` - -### Cause - -Some SYCL toolchains reject unnamed kernel lambdas in `parallel_for`. - -### Fix - -Prefer an explicitly named kernel: - -```cpp -h.parallel_for( - sycl::nd_range<1>(global_size, WG_SIZE), - [=](sycl::nd_item<1> item) { - // kernel body - } -); -``` - -### Fallback - -If a code path cannot be easily updated to named kernels, add this compile flag: - -```text --fsycl-unnamed-lambda -``` - -### Practical Note - -For portability and future compiler compatibility, named kernels are the safer default. - ---- - ## Lesson: Sub-group Coalesced Memory Access for Intel Xe GPUs ### Symptom @@ -169,3 +128,74 @@ q.submit([&](sycl::handler& h) { - Section 6: Thread mapping and occupancy (work-group decomposition) - Section 7: Memory hierarchy and coalesced access - Section 11: Prefer wide/coalesced accesses and remove expensive operations from inner loops + +--- + +## Lesson: Never Use `-fsycl-unnamed-lambda` + +### Symptom + +Compilation fails with: + +```text +icpx: error: cannot specify '-fsycl-unnamed-lambda' along with '-fsycl-host-compiler' +``` + +### Cause + +`-fsycl-unnamed-lambda` conflicts with the `-fsycl-host-compiler` flag that the +PyTorch `load_inline` build system injects automatically. The two flags cannot +coexist. + +### Fix + +Always use **named kernel classes** in `parallel_for`: + +```cpp +h.parallel_for( + sycl::nd_range<1>(global_size, WG_SIZE), + [=](sycl::nd_item<1> item) { + // kernel body + } +); +``` + +Never add `-fsycl-unnamed-lambda` to `extra_sycl_cflags` in `load_inline`. + +### Additional: `constexpr` Captures Cause Kernel Size Mismatch + +Capturing `constexpr` float variables from the enclosing host scope causes a +static assertion failure at compile time: + +```text +error: static assertion failed: Unexpected kernel lambda size. + This can be caused by an external host compiler producing a lambda with an + unexpected layout. In many cases the difference is related to capturing + constexpr variables. +``` + +**Fix**: define float constants **inside** the kernel lambda, not outside it: + +```cpp +// WRONG — constexpr captured from host scope +constexpr float S2PI = 0.7978845608028654f; +q->submit([&](sycl::handler& h) { + h.parallel_for(range, [=](sycl::nd_item<1> item) { + float x = S2PI * val; // S2PI captured — size mismatch! + }); +}); + +// CORRECT — constant defined inside the lambda +q->submit([&](sycl::handler& h) { + h.parallel_for(range, [=](sycl::nd_item<1> item) { + const float S2PI = 0.7978845608028654f; + float x = S2PI * val; // no capture, no mismatch + }); +}); +``` + +Use `enum` for integer compile-time constants (safe to define in host scope): + +```cpp +enum { WG = 256, ITERS = 16, SG_SZ = 16 }; // safe — integer enums, not floats +```