From a71952aaf4fe9e19466746aebec5e482c4fe2d40 Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Mon, 1 Jun 2026 11:03:46 -0700
Subject: [PATCH 01/12] Avoid saying triton in the sycl project instructions.

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/generator.py                |  2 +-
 src/xe_forge/claude/templates/CLAUDE.md.j2      | 14 ++++++++------
 src/xe_forge/claude/templates/tool-runner.md.j2 |  2 +-
 3 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py
index 86eadb7..1a3494d 100644
--- a/src/xe_forge/claude/generator.py
+++ b/src/xe_forge/claude/generator.py
@@ -70,7 +70,7 @@ def generate_workspace(
 
     agent_dir = workspace / ".claude" / "agents"
     agent_dir.mkdir(parents=True, exist_ok=True)
-    (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2"))
+    (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl))
 
     _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path)
     _symlink_knowledge_base(workspace)
diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2
index 7131c6a..3f7f7a4 100644
--- a/src/xe_forge/claude/templates/CLAUDE.md.j2
+++ b/src/xe_forge/claude/templates/CLAUDE.md.j2
@@ -33,12 +33,12 @@ All runtime behavior below is gated by these values; re-read `config.yaml` if an
 | **Benchmark** | `xe-forge-skill benchmark <baseline> <optimized> --spec <spec.yaml> [--baseline-us <cached>]` |
 | **Init trials** | `xe-forge-skill trial init <kernel_name> <baseline_file>` |
 | **Save trial** | `xe-forge-skill trial save <kernel_name> <file> [--parent <id>] [--strategy "..."]` |
-| **Record result** | `xe-forge-skill trial result <kernel_name> <trial_id> --correctness <pass\|fail> --speedup <float> --baseline-us <float> --triton-us <float>` |
+| **Record result** | `xe-forge-skill trial result <kernel_name> <trial_id> --correctness <pass\|fail> --speedup <float> --baseline-us <float> --{{ dsl }}-us <float>` |
 | **Check status** | `xe-forge-skill trial status <kernel_name>` |
 | **Best trial** | `xe-forge-skill trial best <kernel_name>` |
 | **Baseline time** | `xe-forge-skill trial baseline-us <kernel_name>` |
 | **Finalize** | `xe-forge-skill trial finalize <kernel_name> <output_file>` |
-| **Profile** (only when `vtune_enabled: true`) | `xe-forge-skill profile <triton_file> --spec <spec.yaml>` |
+| **Profile** (only when `vtune_enabled: true`) | `xe-forge-skill profile <{{ dsl }}_file> --spec <spec.yaml>` |
 
 ## WORKFLOW — Follow these steps in order
 
@@ -58,10 +58,10 @@ For each trial:
 2. **Validate** — `xe-forge-skill validate <file> --dsl {{ dsl }}` (fix until passing).
 3. **Save** — `xe-forge-skill trial save {{ kernel_name }} <file> --parent <id> --strategy "description"`.
 4. **Benchmark** (MANDATORY every trial):
-   - **Trial t0:** `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <triton_file> --spec test_kernels/{{ kernel_name }}.yaml`
-   - **Trials t1+:** Get cached baseline via `xe-forge-skill trial baseline-us {{ kernel_name }}`, then `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <triton_file> --spec test_kernels/{{ kernel_name }}.yaml --baseline-us <cached>`
-5. **Record** — `xe-forge-skill trial result {{ kernel_name }} <trial_id> --correctness <pass|fail> --speedup <float> --baseline-us <float> --triton-us <float>`
-6. **Profile** — if `vtune_enabled: true` in `config.yaml` and this is trial t1 or later, run `xe-forge-skill profile <triton_file> --spec <spec.yaml>` and use the output to guide the next trial. If `vtune_enabled: false`, skip this step.
+   - **Trial t0:** `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <{{ dsl }}_file> --spec test_kernels/{{ kernel_name }}.yaml`
+   - **Trials t1+:** Get cached baseline via `xe-forge-skill trial baseline-us {{ kernel_name }}`, then `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <{{ dsl }}_file> --spec test_kernels/{{ kernel_name }}.yaml --baseline-us <cached>`
+5. **Record** — `xe-forge-skill trial result {{ kernel_name }} <trial_id> --correctness <pass|fail> --speedup <float> --baseline-us <float> --{{ dsl }}-us <float>`
+6. **Profile** — if `vtune_enabled: true` in `config.yaml` and this is trial t1 or later, run `xe-forge-skill profile <{{ dsl }}_file> --spec <spec.yaml>` and use the output to guide the next trial. If `vtune_enabled: false`, skip this step.
 7. **Decide next action**:
    - Speedup > 5x -> stop, finalize
    - Speedup improved -> continue on this branch
@@ -76,7 +76,9 @@ xe-forge-skill trial finalize {{ kernel_name }} output/{{ kernel_name }}_optimiz
 
 ## CRITICAL CORRECTNESS CONSTRAINTS
 
+{% if dsl == "triton" %}
 - NO default values for `@triton.autotune` meta-parameters in kernel signature
+{% endif %}
 - Use 1D grid when applying tile swizzling (GROUP_SIZE_M)
 - `boundary_check` uses dimension indices (0, 1), not booleans
 - Cast batch indices to `int64` before stride multiplication
diff --git a/src/xe_forge/claude/templates/tool-runner.md.j2 b/src/xe_forge/claude/templates/tool-runner.md.j2
index 400aa43..adabf86 100644
--- a/src/xe_forge/claude/templates/tool-runner.md.j2
+++ b/src/xe_forge/claude/templates/tool-runner.md.j2
@@ -31,7 +31,7 @@ Safe to parallelize: analyze, validate, trial (CPU-only).
 ## Output Rules
 
 For benchmark:
-- Extract ONLY: Correctness (PASSED/FAILED), Performance (baseline_us, triton_us, speedup), Errors.
+- Extract ONLY: Correctness (PASSED/FAILED), Performance (baseline_us, {{ dsl }}_us, speedup), Errors.
 - Do NOT include configuration header or decorative separators.
 
 For profile:

From af3d02eab560542f9277bc3482146e85a7e2c764 Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Mon, 1 Jun 2026 11:04:16 -0700
Subject: [PATCH 02/12] Avoid access to knowledge base of other DSLs.

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/generator.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py
index 1a3494d..3ec53b1 100644
--- a/src/xe_forge/claude/generator.py
+++ b/src/xe_forge/claude/generator.py
@@ -73,7 +73,7 @@ def generate_workspace(
     (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl))
 
     _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path)
-    _symlink_knowledge_base(workspace)
+    _symlink_knowledge_base(workspace, dsl)
 
     if config.engine.git_init:
         _git_init(workspace)
@@ -96,11 +96,10 @@ def _write_kernel_files(
         shutil.copy2(spec_path, tk_dir / f"{kernel_name}.yaml")
 
 
-def _symlink_knowledge_base(workspace: Path) -> None:
-    """Create a symlink to the installed knowledge_base directory."""
-    kb_link = workspace / "knowledge_base"
-    if kb_link.exists() or kb_link.is_symlink():
-        return
+def _symlink_knowledge_base(workspace: Path, dsl: str) -> None:
+    """Symlink common and DSL-specific knowledge base subdirectories."""
+    kb_root = workspace / "knowledge_base"
+    kb_root.mkdir(parents=True, exist_ok=True)
 
     import xe_forge
 
@@ -112,7 +111,17 @@ def _symlink_knowledge_base(workspace: Path) -> None:
     ]
     for candidate in candidates:
         if candidate.is_dir():
-            kb_link.symlink_to(candidate.resolve())
+            src_root = candidate.resolve()
+
+            common_src = src_root / "common"
+            dsl_src = src_root / dsl
+            common_dst = kb_root / "common"
+            dsl_dst = kb_root / dsl
+
+            if common_src.is_dir() and not (common_dst.exists() or common_dst.is_symlink()):
+                common_dst.symlink_to(common_src)
+            if dsl_src.is_dir() and not (dsl_dst.exists() or dsl_dst.is_symlink()):
+                dsl_dst.symlink_to(dsl_src)
             return
 
 

From 17f4d2b86d2505715f2844d95affcdb9814ccd99 Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Mon, 1 Jun 2026 14:22:55 -0700
Subject: [PATCH 03/12] Only link to knowledge base of the same DSL and Device.

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/generator.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py
index 3ec53b1..aff369e 100644
--- a/src/xe_forge/claude/generator.py
+++ b/src/xe_forge/claude/generator.py
@@ -73,7 +73,7 @@ def generate_workspace(
     (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl))
 
     _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path)
-    _symlink_knowledge_base(workspace, dsl)
+    _symlink_knowledge_base(workspace, dsl, device)
 
     if config.engine.git_init:
         _git_init(workspace)
@@ -96,8 +96,8 @@ def _write_kernel_files(
         shutil.copy2(spec_path, tk_dir / f"{kernel_name}.yaml")
 
 
-def _symlink_knowledge_base(workspace: Path, dsl: str) -> None:
-    """Symlink common and DSL-specific knowledge base subdirectories."""
+def _symlink_knowledge_base(workspace: Path, dsl: str, device: str) -> None:
+    """Symlink common and DSL/device-specific knowledge base subdirectories."""
     kb_root = workspace / "knowledge_base"
     kb_root.mkdir(parents=True, exist_ok=True)
 
@@ -114,14 +114,17 @@ def _symlink_knowledge_base(workspace: Path, dsl: str) -> None:
             src_root = candidate.resolve()
 
             common_src = src_root / "common"
-            dsl_src = src_root / dsl
+            dsl_device_src = src_root / dsl / device
             common_dst = kb_root / "common"
-            dsl_dst = kb_root / dsl
+            dsl_device_dst = kb_root / dsl / device
 
             if common_src.is_dir() and not (common_dst.exists() or common_dst.is_symlink()):
                 common_dst.symlink_to(common_src)
-            if dsl_src.is_dir() and not (dsl_dst.exists() or dsl_dst.is_symlink()):
-                dsl_dst.symlink_to(dsl_src)
+            dsl_device_dst.parent.mkdir(parents=True, exist_ok=True)
+            if dsl_device_src.is_dir() and not (
+                dsl_device_dst.exists() or dsl_device_dst.is_symlink()
+            ):
+                dsl_device_dst.symlink_to(dsl_device_src)
             return
 
 

From 1b749a4eb50b37a4291d818051a703130e104e36 Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Mon, 1 Jun 2026 14:52:38 -0700
Subject: [PATCH 04/12] Copy the knowledge base, because agents may not find
 contents in symlinked folders

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/generator.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py
index aff369e..9695d4c 100644
--- a/src/xe_forge/claude/generator.py
+++ b/src/xe_forge/claude/generator.py
@@ -73,7 +73,7 @@ def generate_workspace(
     (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl))
 
     _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path)
-    _symlink_knowledge_base(workspace, dsl, device)
+    _copy_knowledge_base(workspace, dsl, device)
 
     if config.engine.git_init:
         _git_init(workspace)
@@ -96,8 +96,8 @@ def _write_kernel_files(
         shutil.copy2(spec_path, tk_dir / f"{kernel_name}.yaml")
 
 
-def _symlink_knowledge_base(workspace: Path, dsl: str, device: str) -> None:
-    """Symlink common and DSL/device-specific knowledge base subdirectories."""
+def _copy_knowledge_base(workspace: Path, dsl: str, device: str) -> None:
+    """Copy common and DSL/device-specific knowledge base subdirectories."""
     kb_root = workspace / "knowledge_base"
     kb_root.mkdir(parents=True, exist_ok=True)
 
@@ -118,13 +118,16 @@ def _symlink_knowledge_base(workspace: Path, dsl: str, device: str) -> None:
             common_dst = kb_root / "common"
             dsl_device_dst = kb_root / dsl / device
 
-            if common_src.is_dir() and not (common_dst.exists() or common_dst.is_symlink()):
-                common_dst.symlink_to(common_src)
+            if common_dst.is_symlink():
+                common_dst.unlink()
+            if common_src.is_dir():
+                shutil.copytree(common_src, common_dst, dirs_exist_ok=True)
+
             dsl_device_dst.parent.mkdir(parents=True, exist_ok=True)
-            if dsl_device_src.is_dir() and not (
-                dsl_device_dst.exists() or dsl_device_dst.is_symlink()
-            ):
-                dsl_device_dst.symlink_to(dsl_device_src)
+            if dsl_device_dst.is_symlink():
+                dsl_device_dst.unlink()
+            if dsl_device_src.is_dir():
+                shutil.copytree(dsl_device_src, dsl_device_dst, dirs_exist_ok=True)
             return
 
 

From e01330a450758103500f8f9dcdf5a5dab5422320 Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Tue, 2 Jun 2026 10:11:49 -0700
Subject: [PATCH 05/12] Allow all xe-forge-skill by default.

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/generator.py                     |  8 +++++---
 src/xe_forge/claude/templates/settings.local.json.j2 | 11 +++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)
 create mode 100644 src/xe_forge/claude/templates/settings.local.json.j2

diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py
index 9695d4c..0d7c1a0 100644
--- a/src/xe_forge/claude/generator.py
+++ b/src/xe_forge/claude/generator.py
@@ -1,8 +1,9 @@
 """Generate a Claude Code workspace for kernel optimization.
 
-Creates CLAUDE.md, config.yaml, .claude/commands/, .claude/agents/,
-and copies kernel files into the workspace. All text artifacts are
-rendered from Jinja templates under ``templates/``.
+Creates CLAUDE.md, config.yaml, .claude/settings.local.json,
+.claude/commands/, .claude/agents/, and copies kernel files into
+the workspace. All text artifacts are rendered from Jinja templates
+under ``templates/``.
 """
 
 from __future__ import annotations
@@ -71,6 +72,7 @@ def generate_workspace(
     agent_dir = workspace / ".claude" / "agents"
     agent_dir.mkdir(parents=True, exist_ok=True)
     (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl))
+    (workspace / ".claude" / "settings.local.json").write_text(_render("settings.local.json.j2"))
 
     _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path)
     _copy_knowledge_base(workspace, dsl, device)
diff --git a/src/xe_forge/claude/templates/settings.local.json.j2 b/src/xe_forge/claude/templates/settings.local.json.j2
new file mode 100644
index 0000000..b0f0e0b
--- /dev/null
+++ b/src/xe_forge/claude/templates/settings.local.json.j2
@@ -0,0 +1,11 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(xe-forge-skill analyze *)",
+      "Bash(xe-forge-skill trial *)",
+      "Bash(xe-forge-skill validate *)",
+      "Bash(xe-forge-skill benchmark *)",
+      "Bash(xe-forge-skill profile *)"
+    ]
+  }
+}
\ No newline at end of file

From b46b7adb2b41ae94ad9b68de7c30fd662d8f0f4f Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Tue, 2 Jun 2026 10:51:54 -0700
Subject: [PATCH 06/12] Allow writing to the workspace by default.

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/templates/settings.local.json.j2 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/xe_forge/claude/templates/settings.local.json.j2 b/src/xe_forge/claude/templates/settings.local.json.j2
index b0f0e0b..c2ea245 100644
--- a/src/xe_forge/claude/templates/settings.local.json.j2
+++ b/src/xe_forge/claude/templates/settings.local.json.j2
@@ -1,5 +1,6 @@
 {
   "permissions": {
+    "defaultMode": "acceptEdits",
     "allow": [
       "Bash(xe-forge-skill analyze *)",
       "Bash(xe-forge-skill trial *)",

From 8dcbb4efe33e9ef02e473fb76f1dd5ad342890c6 Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Tue, 2 Jun 2026 13:19:29 -0700
Subject: [PATCH 07/12] Change triton_us to custom_us, because we also generate
 custom sycl kernels.

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/templates/CLAUDE.md.j2      |  2 +-
 src/xe_forge/claude/templates/tool-runner.md.j2 |  2 +-
 src/xe_forge/core/trial_manager.py              | 13 +++++++++----
 src/xe_forge/pipeline.py                        |  2 +-
 src/xe_forge/skills/__init__.py                 |  2 +-
 src/xe_forge/skills/benchmark.py                |  6 +++---
 src/xe_forge/skills/trial.py                    |  2 +-
 7 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2
index 3f7f7a4..1a54918 100644
--- a/src/xe_forge/claude/templates/CLAUDE.md.j2
+++ b/src/xe_forge/claude/templates/CLAUDE.md.j2
@@ -33,7 +33,7 @@ All runtime behavior below is gated by these values; re-read `config.yaml` if an
 | **Benchmark** | `xe-forge-skill benchmark <baseline> <optimized> --spec <spec.yaml> [--baseline-us <cached>]` |
 | **Init trials** | `xe-forge-skill trial init <kernel_name> <baseline_file>` |
 | **Save trial** | `xe-forge-skill trial save <kernel_name> <file> [--parent <id>] [--strategy "..."]` |
-| **Record result** | `xe-forge-skill trial result <kernel_name> <trial_id> --correctness <pass\|fail> --speedup <float> --baseline-us <float> --{{ dsl }}-us <float>` |
+| **Record result** | `xe-forge-skill trial result <kernel_name> <trial_id> --correctness <pass\|fail> --speedup <float> --baseline-us <float> --custom-us <float>` |
 | **Check status** | `xe-forge-skill trial status <kernel_name>` |
 | **Best trial** | `xe-forge-skill trial best <kernel_name>` |
 | **Baseline time** | `xe-forge-skill trial baseline-us <kernel_name>` |
diff --git a/src/xe_forge/claude/templates/tool-runner.md.j2 b/src/xe_forge/claude/templates/tool-runner.md.j2
index adabf86..a490cd9 100644
--- a/src/xe_forge/claude/templates/tool-runner.md.j2
+++ b/src/xe_forge/claude/templates/tool-runner.md.j2
@@ -31,7 +31,7 @@ Safe to parallelize: analyze, validate, trial (CPU-only).
 ## Output Rules
 
 For benchmark:
-- Extract ONLY: Correctness (PASSED/FAILED), Performance (baseline_us, {{ dsl }}_us, speedup), Errors.
+- Extract ONLY: Correctness (PASSED/FAILED), Performance (baseline_us, custom_us, speedup), Errors.
 - Do NOT include configuration header or decorative separators.
 
 For profile:
diff --git a/src/xe_forge/core/trial_manager.py b/src/xe_forge/core/trial_manager.py
index 4009d86..e782f0e 100644
--- a/src/xe_forge/core/trial_manager.py
+++ b/src/xe_forge/core/trial_manager.py
@@ -40,6 +40,8 @@ def _load_state(self, kernel_name: str) -> dict:
         for trial in state.get("trials", {}).values():
             if "pytorch_us" in trial and "baseline_us" not in trial:
                 trial["baseline_us"] = trial.pop("pytorch_us")
+            if "triton_us" in trial and "custom_us" not in trial:
+                trial["custom_us"] = trial.pop("triton_us")
         return state
 
     def _save_state(self, kernel_name: str, state: dict) -> None:
@@ -118,7 +120,7 @@ def save_trial(
             "correctness": None,
             "speedup": None,
             "baseline_us": None,
-            "triton_us": None,
+            "custom_us": None,
             "status": "saved",
         }
         self._save_state(kernel_name, state)
@@ -134,6 +136,7 @@ def record_result(
         correctness: str | None = None,
         speedup: float | None = None,
         baseline_us: float | None = None,
+        custom_us: float | None = None,
         triton_us: float | None = None,
     ) -> dict:
         """Record benchmark results for a trial. Returns the trial dict."""
@@ -153,8 +156,10 @@ def record_result(
             trial["speedup"] = speedup
         if baseline_us is not None:
             trial["baseline_us"] = baseline_us
+        if custom_us is not None:
+            trial["custom_us"] = custom_us
         if triton_us is not None:
-            trial["triton_us"] = triton_us
+            trial["custom_us"] = triton_us
 
         if baseline_us is not None and state.get("baseline_us") is None:
             state["baseline_us"] = [baseline_us]
@@ -222,8 +227,8 @@ def _render(tid: str, prefix: str = "", is_last: bool = True) -> None:
             icon = status_icon.get(trial["status"], "?")
             speedup_str = f"{trial['speedup']:.2f}x" if trial["speedup"] is not None else "---"
             runtime = ""
-            if trial.get("baseline_us") is not None and trial.get("triton_us") is not None:
-                runtime = f" (bl={trial['baseline_us']:.0f}us, tr={trial['triton_us']:.0f}us)"
+            if trial.get("baseline_us") is not None and trial.get("custom_us") is not None:
+                runtime = f" (bl={trial['baseline_us']:.0f}us, opt={trial['custom_us']:.0f}us)"
             best_marker = " <<<< BEST" if tid == state["best_trial"] else ""
             strategy_short = (trial["strategy"] or "")[:60]
             lines.append(
diff --git a/src/xe_forge/pipeline.py b/src/xe_forge/pipeline.py
index 8e29caf..8eb0648 100644
--- a/src/xe_forge/pipeline.py
+++ b/src/xe_forge/pipeline.py
@@ -443,7 +443,7 @@ def optimize(
                             correctness="pass" if stage_result.success else "fail",
                             speedup=speedup,
                             baseline_us=(val_orig_ms or 0) * 1000,
-                            triton_us=(current_ms or 0) * 1000,
+                            custom_us=(current_ms or 0) * 1000,
                         )
                         if stage_result.success:
                             last_trial_id = trial_id
diff --git a/src/xe_forge/skills/__init__.py b/src/xe_forge/skills/__init__.py
index 0d1ac02..43d5cdb 100644
--- a/src/xe_forge/skills/__init__.py
+++ b/src/xe_forge/skills/__init__.py
@@ -66,7 +66,7 @@ def main():
     t_result.add_argument("--correctness", choices=["pass", "fail"])
     t_result.add_argument("--speedup", type=float)
     t_result.add_argument("--baseline-us", type=float)
-    t_result.add_argument("--triton-us", type=float)
+    t_result.add_argument("--custom-us", "--triton-us", dest="custom_us", type=float)
     t_result.add_argument("--trials-dir", default="./trials")
 
     t_status = trial_sub.add_parser("status")
diff --git a/src/xe_forge/skills/benchmark.py b/src/xe_forge/skills/benchmark.py
index d416d51..8486d0d 100644
--- a/src/xe_forge/skills/benchmark.py
+++ b/src/xe_forge/skills/benchmark.py
@@ -32,14 +32,14 @@ def run(args):
             init_args=init_args,
             input_dtypes=input_dtypes,
         )
-        if optimized_result.success:
+        if optimized_result.success and optimized_result.execution_time_ms is not None:
             baseline_ms = sum(baseline_us) / len(baseline_us) / 1000.0
             opt_ms = optimized_result.execution_time_ms
             speedup = baseline_ms / opt_ms if opt_ms > 0 else 0
             print(f"Correctness: {'PASSED' if optimized_result.success else 'FAILED'}")
             print(
                 f"Performance: baseline_us={baseline_ms * 1000:.2f}, "
-                f"triton_us={opt_ms * 1000:.2f}, speedup={speedup:.2f}x"
+                f"custom_us={opt_ms * 1000:.2f}, speedup={speedup:.2f}x"
             )
         else:
             print("Correctness: FAILED")
@@ -58,7 +58,7 @@ def run(args):
         if result.original_time_us and result.optimized_time_us:
             print(
                 f"Performance: baseline_us={result.original_time_us:.2f}, "
-                f"triton_us={result.optimized_time_us:.2f}, speedup={result.speedup:.2f}x"
+                f"custom_us={result.optimized_time_us:.2f}, speedup={result.speedup:.2f}x"
             )
         if result.feedback_message:
             print(f"Feedback: {result.feedback_message}")
diff --git a/src/xe_forge/skills/trial.py b/src/xe_forge/skills/trial.py
index 2792770..e3c0179 100644
--- a/src/xe_forge/skills/trial.py
+++ b/src/xe_forge/skills/trial.py
@@ -30,7 +30,7 @@ def run(args):
                 correctness=args.correctness,
                 speedup=args.speedup,
                 baseline_us=args.baseline_us,
-                triton_us=args.triton_us,
+                custom_us=getattr(args, "custom_us", getattr(args, "triton_us", None)),
             )
             status_icon = {"completed": "+", "failed": "X", "partial": "~", "saved": "?"}
             icon = status_icon.get(trial["status"], "?")

From de564ae90465f50b25a1864bd3c86c28fb6163f5 Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Mon, 8 Jun 2026 12:33:38 -0700
Subject: [PATCH 08/12] settings.local.json --> settings.json

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/generator.py                                | 2 +-
 .../templates/{settings.local.json.j2 => settings.json.j2}      | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename src/xe_forge/claude/templates/{settings.local.json.j2 => settings.json.j2} (100%)

diff --git a/src/xe_forge/claude/generator.py b/src/xe_forge/claude/generator.py
index 0d7c1a0..c83782e 100644
--- a/src/xe_forge/claude/generator.py
+++ b/src/xe_forge/claude/generator.py
@@ -72,7 +72,7 @@ def generate_workspace(
     agent_dir = workspace / ".claude" / "agents"
     agent_dir.mkdir(parents=True, exist_ok=True)
     (agent_dir / "tool-runner.md").write_text(_render("tool-runner.md.j2", dsl=dsl))
-    (workspace / ".claude" / "settings.local.json").write_text(_render("settings.local.json.j2"))
+    (workspace / ".claude" / "settings.json").write_text(_render("settings.json.j2"))
 
     _write_kernel_files(workspace, kernel_name, kernel_code, reference_code, spec_path)
     _copy_knowledge_base(workspace, dsl, device)
diff --git a/src/xe_forge/claude/templates/settings.local.json.j2 b/src/xe_forge/claude/templates/settings.json.j2
similarity index 100%
rename from src/xe_forge/claude/templates/settings.local.json.j2
rename to src/xe_forge/claude/templates/settings.json.j2

From e5294aeb9bb111ea55f4e688492d3e8eac3373ed Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Mon, 8 Jun 2026 12:51:34 -0700
Subject: [PATCH 09/12] Add more context for SYCL*TLA generation.

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 knowledge_base/sycl/xpu/SYCL_lessons.md       | 171 ++++++
 ...1_Square_matrix_multiplication_solution.py | 510 ++++++++++++++++++
 src/xe_forge/claude/templates/CLAUDE.md.j2    |  12 +
 3 files changed, 693 insertions(+)
 create mode 100644 knowledge_base/sycl/xpu/SYCL_lessons.md
 create mode 100644 knowledge_base/sycl/xpu/examples/1_Square_matrix_multiplication_solution.py

diff --git a/knowledge_base/sycl/xpu/SYCL_lessons.md b/knowledge_base/sycl/xpu/SYCL_lessons.md
new file mode 100644
index 0000000..d80866d
--- /dev/null
+++ b/knowledge_base/sycl/xpu/SYCL_lessons.md
@@ -0,0 +1,171 @@
+# SYCL Lessons
+
+## Lesson: Named Kernels for `parallel_for`
+
+### Symptom
+
+Compilation can fail with an error similar to:
+
+```text
+unnamed type '(lambda ...)' is invalid; provide a kernel name, or use '-fsycl-unnamed-lambda'
+```
+
+### Cause
+
+Some SYCL toolchains reject unnamed kernel lambdas in `parallel_for`.
+
+### Fix
+
+Prefer an explicitly named kernel:
+
+```cpp
+h.parallel_for<class ScalarMulKernel>(
+    sycl::nd_range<1>(global_size, WG_SIZE),
+    [=](sycl::nd_item<1> item) {
+        // kernel body
+    }
+);
+```
+
+### Fallback
+
+If a code path cannot be easily updated to named kernels, add this compile flag:
+
+```text
+-fsycl-unnamed-lambda
+```
+
+### Practical Note
+
+For portability and future compiler compatibility, named kernels are the safer default.
+
+---
+
+## Lesson: Sub-group Coalesced Memory Access for Intel Xe GPUs
+
+### Symptom
+
+A kernel performs element-wise operations on large tensors but runs at only 0.66× PyTorch speed, despite using vectorized loads and a reasonable work-group size.
+
+### Root Cause
+
+**Poor memory coalescing**. If each work-item processes a contiguous block of N elements independently, lanes within a sub-group access memory N elements apart. With N=32 and sub-group size=16, consecutive lanes load from offsets 0, 32, 64, ... — hitting 16 separate cache lines per SIMD instruction, wasting 75% of fetched data.
+
+### Fix: Sub-group Aware Layout
+
+Reorganize the computation so **consecutive lanes access consecutive memory addresses**. For bf16 pairs, treat them as `uint32` and distribute work by lane:
+
+```cpp
+enum { WG = 512, ITERS = 128, SG_SZ = 16, SGS_PER_WG = WG / SG_SZ };
+enum { PAIRS_PER_SG = SG_SZ * ITERS };  // 2048 pairs per sub-group
+
+q->submit([&](sycl::handler& h) {
+    h.parallel_for<class MyKernel>(
+        sycl::nd_range<1>(global, WG),
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(16)]] {
+            auto sg = item.get_sub_group();
+            int lane = sg.get_local_linear_id();
+            int64_t sg_id = item.get_group(0) * SGS_PER_WG + sg.get_group_linear_id();
+            int64_t sg_base = sg_id * PAIRS_PER_SG;
+
+            #pragma unroll
+            for (int i = 0; i < ITERS; i++) {
+                int64_t idx = sg_base + i * SG_SZ + lane;
+                // Lane 0 reads idx=sg_base+0
+                // Lane 1 reads idx=sg_base+1
+                // ... Lane 15 reads idx=sg_base+15
+                // All lanes read from same cache line!
+                uint32_t data = load_u32(idx);
+                process_and_store(data, idx);
+            }
+        });
+});
+```
+
+**Key points:**
+1. All 16 lanes in a sub-group load from consecutive offsets: `[sg_base, sg_base+1, ..., sg_base+15]`.
+2. Together they fill exactly **one 64-byte cache line** per iteration.
+3. Use `[[sycl::reqd_sub_group_size(16)]]` to guarantee the lane mapping.
+4. Use `enum` instead of `constexpr` for compile-time constants to avoid DPC++ host/device capture mismatch.
+
+### Expected Impact
+
+- **Memory bandwidth utilization**: ~100% (vs. 25% with poor coalescing).
+- **Performance gain**: 0.66× → 0.97× (47% improvement toward parity with optimized reference).
+
+### Reference (oneAPI GPU Optimization Guide)
+
+- Thread Mapping and GPU Occupancy (Section 6)
+- Memory Hierarchy and Access Patterns (Section 7)
+- Sub-group sizes: 16 or 32 for most Xe GPUs; use `get_sub_group()` API to query and adapt at runtime.
+
+---
+
+## Lesson: Speeding Up Row-Scale Kernels (diag(A) * B)
+
+### Symptom
+
+The row-wise broadcast multiply kernel `C[i,j] = A[i] * B[i,j]` is correct but only about 0.62x as fast as `torch.compile()`.
+
+### Root Causes
+
+- Per-element integer division (`row = idx / M`) in the hot loop
+- Scalar bf16 loads/stores and scalar conversion overhead
+- Weak memory coalescing pattern across lanes
+
+### Fix Pattern
+
+1. Map one work-group to one row
+- Use `row = item.get_group(0)`.
+- Remove integer division from the inner loop entirely.
+
+2. Process bf16 pairs as `uint32`
+- Load/store two bf16 values at once.
+- This doubles useful bytes per memory op with naturally aligned 32-bit accesses.
+
+3. Use coalesced lane mapping along columns
+- Let consecutive local IDs handle consecutive pair indices.
+- Stride by work-group size in the column loop.
+
+4. Use explicit sub-group size on Xe
+- Add `[[sycl::reqd_sub_group_size(16)]]` for predictable lane behavior.
+
+5. Convert bf16 manually with bit operations
+- `bf16 -> f32`: left-shift by 16 and bit-cast.
+- `f32 -> bf16`: round-to-nearest-even using bias and sticky-bit logic.
+
+### Minimal Kernel Structure
+
+```cpp
+const int pairs_per_row = M / 2;
+enum { WG = 256 };
+
+q.submit([&](sycl::handler& h) {
+    h.parallel_for<class RowScaleKernel>(
+        sycl::nd_range<1>((size_t)N * WG, WG),
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(16)]] {
+            int row = item.get_group(0);
+            int lid = item.get_local_id(0);
+
+            float a_val = bf16_to_f32(a[row]);
+            const uint32_t* b_row = b32 + row * pairs_per_row;
+            uint32_t* c_row = c32 + row * pairs_per_row;
+
+            for (int pi = lid; pi < pairs_per_row; pi += WG) {
+                uint32_t p = b_row[pi];
+                c_row[pi] = mul_pair_bf16(a_val, p);
+            }
+        });
+});
+```
+
+### Measured Impact
+
+- Versus eager PyTorch: improved to about 1.46x
+- Versus `torch.compile()`: improved from about 0.62x to about 0.93x
+
+### oneAPI Guide Alignment
+
+- Section 6: Thread mapping and occupancy (work-group decomposition)
+- Section 7: Memory hierarchy and coalesced access
+- Section 11: Prefer wide/coalesced accesses and remove expensive operations from inner loops
diff --git a/knowledge_base/sycl/xpu/examples/1_Square_matrix_multiplication_solution.py b/knowledge_base/sycl/xpu/examples/1_Square_matrix_multiplication_solution.py
new file mode 100644
index 0000000..0ea71ef
--- /dev/null
+++ b/knowledge_base/sycl/xpu/examples/1_Square_matrix_multiplication_solution.py
@@ -0,0 +1,510 @@
+import os
+import re
+import sys
+from pathlib import Path
+
+import torch
+import torch.nn as nn
+
+# ---------------------------------------------------------------------------
+# IGC workaround: raise vector alias analysis threshold for large CUTLASS kernels
+# ---------------------------------------------------------------------------
+os.environ.setdefault("IGC_VectorAliasBBThreshold", "100000000000")
+
+# ---------------------------------------------------------------------------
+# Tuning knobs — override these to match your problem size.
+# Run onednn_gemm_tuning.py to find optimal values.
+# ---------------------------------------------------------------------------
+TILE_M = 256
+TILE_N = 256
+TILE_K = 32
+GRF_COUNT = 256
+SCHEDULER_TYPE = 1  # 0=default, 1=Persistent, 2=StreamK
+EPILOGUE_TYPE = 0   # 0=LinComb, 1=ReLU, 2=GELU
+
+# ---------------------------------------------------------------------------
+# Inline SYCL kernel source — sycl_tla_gemm_template.cpp
+# All tile/GRF/scheduler/epilogue knobs are set via -D flags at compile time.
+# ---------------------------------------------------------------------------
+_KERNEL_SYCL = r"""
+#include <torch/extension.h>
+#include <c10/xpu/XPUStream.h>
+#include <optional>
+
+#include "cute/tensor.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/fusion/operations.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler.hpp"
+#include "cutlass/util/packed_stride.hpp"
+
+using namespace cute;
+
+// ---- Tunable parameters (set via -D flags from oneDNN tuning output) ----
+
+#ifndef TILE_M
+#define TILE_M 256
+#endif
+#ifndef TILE_N
+#define TILE_N 256
+#endif
+#ifndef TILE_K
+#define TILE_K 32
+#endif
+#ifndef GRF_COUNT
+#define GRF_COUNT 256
+#endif
+static_assert(GRF_COUNT == 128 || GRF_COUNT == 256,
+              "GRF_COUNT must be 128 or 256.");
+#ifndef SCHEDULER_TYPE
+#define SCHEDULER_TYPE 1
+#endif
+#ifndef ELEMENT_INPUT
+#define ELEMENT_INPUT cutlass::bfloat16_t
+#endif
+#ifndef ELEMENT_ACC
+#define ELEMENT_ACC float
+#endif
+#ifndef ELEMENT_OUTPUT
+#define ELEMENT_OUTPUT cutlass::bfloat16_t
+#endif
+#ifndef TORCH_OUTPUT_DTYPE
+#define TORCH_OUTPUT_DTYPE torch::kBFloat16
+#endif
+#ifndef TORCH_OUTPUT_CTYPE
+#define TORCH_OUTPUT_CTYPE at::BFloat16
+#endif
+#ifndef ALIGNMENT
+#define ALIGNMENT 8
+#endif
+#ifndef LAYOUT_A
+#define LAYOUT_A 0
+#endif
+#ifndef LAYOUT_B
+#define LAYOUT_B 0
+#endif
+#ifndef EPILOGUE_TYPE
+#define EPILOGUE_TYPE 0
+#endif
+
+#define CUTLASS_CHECK(status)                                                   \
+  {                                                                             \
+    cutlass::Status _err = (status);                                            \
+    if (_err != cutlass::Status::kSuccess) {                                    \
+      TORCH_CHECK(false, "[CUTLASS] ", cutlassGetStatusString(_err),            \
+                  " at ", __FILE__, ":", __LINE__);                             \
+    }                                                                           \
+  }
+
+// ---- GEMM kernel type definitions ----
+
+using TileShape   = cute::Shape<cute::Int<TILE_M>, cute::Int<TILE_N>, cute::Int<TILE_K>>;
+using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
+
+#if LAYOUT_A == 1
+using LayoutA = cutlass::layout::ColumnMajor;
+#else
+using LayoutA = cutlass::layout::RowMajor;
+#endif
+#if LAYOUT_B == 1
+using LayoutB = cutlass::layout::ColumnMajor;
+#else
+using LayoutB = cutlass::layout::RowMajor;
+#endif
+
+using StageCount = cutlass::gemm::collective::StageCountAuto;
+
+#if SCHEDULER_TYPE == 2
+using KernelSchedule = cutlass::gemm::KernelXeCooperative;
+#else
+using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+#endif
+
+// Epilogue schedule must match kernel schedule for cooperative mode
+#if SCHEDULER_TYPE == 2
+using EpilogueSchedule = cutlass::epilogue::XeCooperative;
+#else
+using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+#endif
+
+#if EPILOGUE_TYPE == 1
+using EpilogueFusionOp = cutlass::epilogue::fusion::LinCombEltAct<
+    cutlass::epilogue::thread::ReLu, ELEMENT_OUTPUT, ELEMENT_ACC, ELEMENT_OUTPUT, ELEMENT_ACC>;
+#elif EPILOGUE_TYPE == 2
+using EpilogueFusionOp = cutlass::epilogue::fusion::LinCombEltAct<
+    cutlass::epilogue::thread::GELU, ELEMENT_OUTPUT, ELEMENT_ACC, ELEMENT_OUTPUT, ELEMENT_ACC>;
+#else
+using EpilogueFusionOp = cutlass::epilogue::fusion::LinearCombination<
+    ELEMENT_OUTPUT, ELEMENT_ACC, ELEMENT_OUTPUT, ELEMENT_ACC>;
+#endif
+
+using gemm_epilogue =
+  typename cutlass::epilogue::collective::CollectiveBuilder<
+    cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp,
+    TileShape, ClusterShape,
+    cutlass::epilogue::collective::EpilogueTileAuto,
+    ELEMENT_ACC, ELEMENT_ACC,
+    ELEMENT_OUTPUT, cutlass::layout::RowMajor, ALIGNMENT,
+    ELEMENT_OUTPUT, cutlass::layout::RowMajor, ALIGNMENT,
+    EpilogueSchedule,
+    EpilogueFusionOp
+  >::CollectiveOp;
+
+using gemm_mainloop =
+  typename cutlass::gemm::collective::CollectiveBuilder<
+    cutlass::arch::Xe20, cutlass::arch::OpClassTensorOp,
+    ELEMENT_INPUT, LayoutA, ALIGNMENT,
+    ELEMENT_INPUT, LayoutB, ALIGNMENT,
+    ELEMENT_ACC,
+    TileShape, ClusterShape,
+    StageCount, KernelSchedule
+  >::CollectiveOp;
+
+#if SCHEDULER_TYPE == 2
+using TileScheduler = cutlass::gemm::StreamKScheduler;
+#elif SCHEDULER_TYPE == 1
+using TileScheduler = cutlass::gemm::PersistentScheduler;
+#else
+using TileScheduler = void;
+#endif
+
+using gemm_kernel_base = cutlass::gemm::kernel::GemmUniversal<
+    cute::Shape<int, int, int, int>,
+    gemm_mainloop, gemm_epilogue, TileScheduler>;
+struct gemm_kernel : public gemm_kernel_base {};
+using gemm_device = cutlass::gemm::device::GemmUniversalAdapter<gemm_kernel>;
+
+// ---- PyTorch binding: gemm_forward(A, B, out?, alpha, beta) ----
+// Accepts 2-D or 3-D (batched) inputs. Forward-only.
+
+torch::Tensor gemm_forward(torch::Tensor A, torch::Tensor B,
+                            std::optional<torch::Tensor> out = std::nullopt,
+                            float alpha = 1.0f, float beta = 0.0f) {
+  TORCH_CHECK(A.device().is_xpu(), "A must be an XPU tensor");
+  TORCH_CHECK(B.device().is_xpu(), "B must be an XPU tensor");
+  TORCH_CHECK(A.dtype() == TORCH_OUTPUT_DTYPE, "A dtype mismatch");
+  TORCH_CHECK(B.dtype() == TORCH_OUTPUT_DTYPE, "B dtype mismatch");
+  TORCH_CHECK(A.dim() == 2 || A.dim() == 3, "A must be 2-D or 3-D");
+  TORCH_CHECK(B.dim() == 2 || B.dim() == 3, "B must be 2-D or 3-D");
+  TORCH_CHECK(A.dim() == B.dim(), "A and B must have same number of dimensions");
+
+  const bool batched = (A.dim() == 3);
+  if (batched) {
+    TORCH_CHECK(A.size(0) == B.size(0), "Batch size mismatch");
+  }
+  // Dimension check: K must match between A and B
+#if LAYOUT_A == 1
+  const int64_t A_K = A.size(-2);
+#else
+  const int64_t A_K = A.size(-1);
+#endif
+#if LAYOUT_B == 1
+  const int64_t B_K = B.size(-1);
+#else
+  const int64_t B_K = B.size(-2);
+#endif
+  TORCH_CHECK(A_K == B_K, "Dimension mismatch: A's K=", A_K, " != B's K=", B_K);
+
+  A = A.contiguous();
+  B = B.contiguous();
+
+  const int L = batched ? static_cast<int>(A.size(0)) : 1;
+#if LAYOUT_A == 1
+  const int M = static_cast<int>(A.size(-1));
+  const int K = static_cast<int>(A.size(-2));
+#else
+  const int M = static_cast<int>(A.size(-2));
+  const int K = static_cast<int>(A.size(-1));
+#endif
+#if LAYOUT_B == 1
+  const int N = static_cast<int>(B.size(-2));
+#else
+  const int N = static_cast<int>(B.size(-1));
+#endif
+  const int64_t batch_stride_A = batched ? int64_t(M) * K : int64_t(0);
+  const int64_t batch_stride_B = batched ? int64_t(K) * N : int64_t(0);
+  const int64_t batch_stride_D = batched ? int64_t(M) * N : int64_t(0);
+
+  torch::Tensor D;
+  if (out.has_value()) {
+    D = out.value();
+    TORCH_CHECK(D.device() == A.device(), "out must be on same device as A");
+    TORCH_CHECK(D.dtype() == TORCH_OUTPUT_DTYPE, "out dtype mismatch");
+    if (batched) {
+      TORCH_CHECK(D.dim() == 3 && D.size(0) == L && D.size(1) == M && D.size(2) == N,
+                  "out shape mismatch");
+    } else {
+      TORCH_CHECK(D.dim() == 2 && D.size(0) == M && D.size(1) == N,
+                  "out shape mismatch");
+    }
+    D = D.contiguous();
+  } else {
+    if (batched) {
+      D = torch::empty({L, M, N}, A.options());
+    } else {
+      D = torch::empty({M, N}, A.options());
+    }
+  }
+
+  using coord_t = cutlass::gemm::GemmCoord::Index;
+  const int device_idx = A.device().index();
+  cutlass::KernelHardwareInfo hw_info;
+  hw_info.sm_count =
+      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(device_idx);
+
+  gemm_device::Arguments arguments{
+    cutlass::gemm::GemmUniversalMode::kGemm,
+    {static_cast<coord_t>(M), static_cast<coord_t>(N),
+     static_cast<coord_t>(K), static_cast<coord_t>(L)},
+    {
+      reinterpret_cast<ELEMENT_INPUT*>(A.data_ptr<TORCH_OUTPUT_CTYPE>()),
+#if LAYOUT_A == 1
+      {cute::Int<1>{}, int64_t(M), batch_stride_A},
+#else
+      {int64_t(K), cute::Int<1>{}, batch_stride_A},
+#endif
+      reinterpret_cast<ELEMENT_INPUT*>(B.data_ptr<TORCH_OUTPUT_CTYPE>()),
+#if LAYOUT_B == 1
+      {int64_t(K), cute::Int<1>{}, batch_stride_B},
+#else
+      {cute::Int<1>{}, int64_t(N), batch_stride_B},
+#endif
+    },
+    {
+      {alpha, beta},
+      (beta != 0.f) ? reinterpret_cast<ELEMENT_OUTPUT*>(D.data_ptr<TORCH_OUTPUT_CTYPE>()) : nullptr,
+      {int64_t(N), cute::Int<1>{}, batch_stride_D},
+      reinterpret_cast<ELEMENT_OUTPUT*>(D.data_ptr<TORCH_OUTPUT_CTYPE>()),
+      {int64_t(N), cute::Int<1>{}, batch_stride_D},
+    },
+    hw_info
+  };
+
+  gemm_device gemm_op;
+  size_t ws_bytes = gemm_op.get_workspace_size(arguments);
+  auto workspace = torch::empty(
+      {static_cast<int64_t>(ws_bytes)},
+      torch::TensorOptions().dtype(torch::kByte).device(A.device()));
+
+  auto stream = c10::xpu::getCurrentXPUStream(A.device().index());
+  sycl::queue* queue = &stream.queue();
+
+#ifndef CUTLASS_BACKEND_DISABLE_CHECKS
+  CUTLASS_CHECK(gemm_op.can_implement(arguments));
+#endif
+  CUTLASS_CHECK(gemm_op.initialize(
+      arguments, static_cast<uint8_t*>(workspace.data_ptr()), queue));
+  CUTLASS_CHECK(gemm_op(queue));
+  return D;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("gemm_forward", [](torch::Tensor A, torch::Tensor B, py::object out,
+                           float alpha, float beta) -> torch::Tensor {
+    std::optional<torch::Tensor> opt_out;
+    if (!out.is_none()) opt_out = out.cast<torch::Tensor>();
+    return gemm_forward(A, B, opt_out, alpha, beta);
+  },
+        "Forward-only GEMM (no autograd backward). "
+        "Accepts 2-D or 3-D (batched) inputs.",
+        py::arg("A"), py::arg("B"), py::arg("out") = py::none(),
+        py::arg("alpha") = 1.0f, py::arg("beta") = 0.0f);
+}
+"""
+
+
+# ---------------------------------------------------------------------------
+# JIT compilation
+# ---------------------------------------------------------------------------
+
+def _get_cutlass_version(cutlass_root: Path) -> str:
+    version_h = cutlass_root / "include" / "cutlass" / "version.h"
+    if not version_h.exists():
+        return "unknown"
+    text = version_h.read_text()
+    major = re.search(r"#define\s+CUTLASS_MAJOR\s+(\d+)", text)
+    minor = re.search(r"#define\s+CUTLASS_MINOR\s+(\d+)", text)
+    patch = re.search(r"#define\s+CUTLASS_PATCH\s+(\d+)", text)
+    if major and minor and patch:
+        return f"{major.group(1)}.{minor.group(1)}.{patch.group(1)}"
+    return "unknown"
+
+
+def _detect_sycl_target() -> str:
+    name = torch.xpu.get_device_name(0).lower()
+    if any(x in name for x in ("b580", "b570", "b770", "b50", "b60", "bmg-g21")):
+        return "bmg-g21"
+    if any(x in name for x in ("b70", "bmg-g31")):
+        return "bmg-g31"
+    if "bmg" in name or "battlemage" in name:
+        return "bmg-g21"
+    if any(x in name for x in ("a770", "a750", "a580", "a380", "a310", "dg2")):
+        return "acm-g10"
+    if "max" in name or "pvc" in name:
+        return "pvc"
+    if "lunar" in name or "lnl" in name:
+        return "lnl-m"
+    return "bmg-g21"
+
+
+def _load_extension(sycl_target: str = "bmg-g21"):
+    import torch.utils.cpp_extension as _cpp_ext
+    from torch.utils.cpp_extension import load_inline
+    import shutil
+
+    _cutlass_path = os.environ.get("CUTLASS_PATH")
+    if not _cutlass_path:
+        sys.exit("ERROR: CUTLASS_PATH environment variable is not set.")
+    cutlass_root = Path(_cutlass_path)
+
+    _cutlass_ver = _get_cutlass_version(cutlass_root)
+    _ext_name = (
+        f"sycl_tla_gemm_{TILE_M}x{TILE_N}x{TILE_K}_"
+        f"grf{GRF_COUNT}_sched{SCHEDULER_TYPE}_epi{EPILOGUE_TYPE}_"
+        f"{_cutlass_ver}_{sycl_target}"
+    ).replace(".", "_").replace("-", "_")
+
+    extra_include_paths = [
+        str(cutlass_root / "include"),
+        str(cutlass_root / "tools" / "util" / "include"),
+    ]
+
+    _cutlass_defines = [
+        "-DCUTLASS_ENABLE_SYCL",
+        "-DSYCL_INTEL_TARGET",
+        "-DCUTLASS_VERSIONS_GENERATED",
+        "-DCUTLASS_BACKEND_DISABLE_CHECKS",
+        "-DNDEBUG",
+    ]
+
+    _tuning_defines = [
+        f"-DTILE_M={TILE_M}",
+        f"-DTILE_N={TILE_N}",
+        f"-DTILE_K={TILE_K}",
+        f"-DGRF_COUNT={GRF_COUNT}",
+        f"-DSCHEDULER_TYPE={SCHEDULER_TYPE}",
+        f"-DEPILOGUE_TYPE={EPILOGUE_TYPE}",
+    ]
+
+    extra_sycl_cflags = [
+        "-fno-sycl-instrument-device-code",
+        "-fsycl-targets=spir64_gen",
+        *_cutlass_defines,
+        *_tuning_defines,
+        "-O3",
+        "-Wno-unused-variable",
+        "-Wno-unused-local-typedef",
+        "-Wno-unused-but-set-variable",
+        "-Wno-uninitialized",
+        "-Wno-reorder-ctor",
+        "-Wno-logical-op-parentheses",
+        "-Wno-unused-function",
+        "-Wno-unknown-pragmas",
+    ]
+
+    _dlink_extra_flags = [
+        "-Xspirv-translator",
+        "-spirv-ext=+SPV_INTEL_split_barrier,"
+        "+SPV_INTEL_2d_block_io,"
+        "+SPV_INTEL_subgroup_matrix_multiply_accumulate",
+    ]
+    if GRF_COUNT == 256:
+        _dlink_extra_flags += [
+            "-Xs",
+            '"-options -cl-intel-256-GRF-per-thread"',
+        ]
+    for flag in _dlink_extra_flags:
+        if flag not in _cpp_ext._SYCL_DLINK_FLAGS:
+            _cpp_ext._SYCL_DLINK_FLAGS.append(flag)
+
+    extra_ldflags = []
+    _mklroot = Path(os.environ["MKLROOT"]) if "MKLROOT" in os.environ else None
+    if _mklroot and (_mklroot / "lib").exists():
+        _mkl_lib = _mklroot / "lib"
+        extra_ldflags += [
+            f"-Wl,-rpath,{_mkl_lib}", f"-L{_mkl_lib}",
+            "-lmkl_intel_ilp64",
+            "-lmkl_intel_thread",
+            "-lmkl_core",
+        ]
+
+    icpx = shutil.which("icpx")
+    _compiler_lib = None
+    if icpx:
+        _icpx_lib = Path(icpx).resolve().parent.parent / "lib"
+        if _icpx_lib.exists():
+            _compiler_lib = _icpx_lib
+    if _compiler_lib is None:
+        _cmplr_root = (
+            Path(os.environ["CMPLR_ROOT"]) if "CMPLR_ROOT" in os.environ
+            else None
+        )
+        if _cmplr_root and (_cmplr_root / "lib").exists():
+            _compiler_lib = _cmplr_root / "lib"
+    if _compiler_lib:
+        extra_ldflags += [
+            f"-Wl,-rpath,{_compiler_lib}", f"-L{_compiler_lib}",
+            "-liomp5",
+        ]
+
+    os.environ["TORCH_XPU_ARCH_LIST"] = sycl_target
+
+    module = load_inline(
+        name=_ext_name,
+        cpp_sources=[""],
+        sycl_sources=[_KERNEL_SYCL],
+        extra_cflags=_cutlass_defines + _tuning_defines,
+        extra_sycl_cflags=extra_sycl_cflags,
+        extra_include_paths=extra_include_paths,
+        extra_ldflags=extra_ldflags,
+        with_sycl=True,
+        verbose=False,
+        no_implicit_headers=True,
+    )
+    return module
+
+
+# ---------------------------------------------------------------------------
+# Compile the extension at module load time (cached by PyTorch)
+# ---------------------------------------------------------------------------
+SYCL_TARGET = os.environ.get("SYCL_TLA_TARGET") or _detect_sycl_target()
+_module = _load_extension(sycl_target=SYCL_TARGET)
+
+
+# ---------------------------------------------------------------------------
+# Model class — drop-in replacement for the reference
+# ---------------------------------------------------------------------------
+
+class Model(nn.Module):
+    """
+    Model that performs square matrix multiplication (C = A * B)
+    using sycl_tla_gemm_template with tuning knobs set via -D flags.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        return _module.gemm_forward(A, B)
+
+
+N = 2048 * 2
+
+
+def get_inputs():
+    A = torch.rand(N, N)
+    B = torch.rand(N, N)
+    return [A, B]
+
+
+def get_init_inputs():
+    return []
diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2
index 1a54918..57bda13 100644
--- a/src/xe_forge/claude/templates/CLAUDE.md.j2
+++ b/src/xe_forge/claude/templates/CLAUDE.md.j2
@@ -2,6 +2,18 @@
 
 Optimize kernels into high-performance {{ dsl | upper }} implementations for {{ device | upper }}.
 
+{% if dsl == "sycl" %}
+If the reference is PyTorch code, write custom SYCL C++ kernels that implement the same
+computation. The kernel must be wrapped in a Python .py file using torch.utils.cpp_extension
+(inline_extension or load_inline) to compile and call the SYCL kernel from Python. The Model
+class in the .py wrapper must be compatible with the reference model in terms of the state_dict,
+and the __init__() and forward() methods.
+
+Do NOT simply call existing PyTorch operators (torch.mm, F.linear, torch.compile, etc.).
+The goal is to write hand-optimized SYCL C++ code using SYCL*TLA (CUTLASS), DPAS/XMX instructions,
+shared local memory (SLM), and tiling patterns from the knowledge_base.
+{% endif %}
+
 ## CONFIGURATION — Read `config.yaml` first
 
 At the start of every session, read `config.yaml` in the workspace root. It controls:

From 010d8cae19311abf607d52fbe4144ff328558eaf Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Tue, 9 Jun 2026 07:15:20 -0700
Subject: [PATCH 10/12] Use consistent --custom-us

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/templates/CLAUDE.md.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2
index 57bda13..b9a38a0 100644
--- a/src/xe_forge/claude/templates/CLAUDE.md.j2
+++ b/src/xe_forge/claude/templates/CLAUDE.md.j2
@@ -72,7 +72,7 @@ For each trial:
 4. **Benchmark** (MANDATORY every trial):
    - **Trial t0:** `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <{{ dsl }}_file> --spec test_kernels/{{ kernel_name }}.yaml`
    - **Trials t1+:** Get cached baseline via `xe-forge-skill trial baseline-us {{ kernel_name }}`, then `xe-forge-skill benchmark test_kernels/{{ kernel_name }}.py <{{ dsl }}_file> --spec test_kernels/{{ kernel_name }}.yaml --baseline-us <cached>`
-5. **Record** — `xe-forge-skill trial result {{ kernel_name }} <trial_id> --correctness <pass|fail> --speedup <float> --baseline-us <float> --{{ dsl }}-us <float>`
+5. **Record** — `xe-forge-skill trial result {{ kernel_name }} <trial_id> --correctness <pass|fail> --speedup <float> --baseline-us <float> --custom-us <float>`
 6. **Profile** — if `vtune_enabled: true` in `config.yaml` and this is trial t1 or later, run `xe-forge-skill profile <{{ dsl }}_file> --spec <spec.yaml>` and use the output to guide the next trial. If `vtune_enabled: false`, skip this step.
 7. **Decide next action**:
    - Speedup > 5x -> stop, finalize

From fa8a9e0128e774625c35cc94c88fe09eca1293b4 Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Tue, 9 Jun 2026 10:25:00 -0700
Subject: [PATCH 11/12] Instruct Claude Code to use the knowledge base in the
 workspace.

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 src/xe_forge/claude/templates/CLAUDE.md.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xe_forge/claude/templates/CLAUDE.md.j2 b/src/xe_forge/claude/templates/CLAUDE.md.j2
index b9a38a0..0ca8466 100644
--- a/src/xe_forge/claude/templates/CLAUDE.md.j2
+++ b/src/xe_forge/claude/templates/CLAUDE.md.j2
@@ -57,7 +57,7 @@ All runtime behavior below is gated by these values; re-read `config.yaml` if an
 ### Step 1: Analyze
 - Read the baseline source file `test_kernels/{{ kernel_name }}.py`. Identify shapes, dtypes, operations.
 - Run `xe-forge-skill analyze test_kernels/{{ kernel_name }}.py` to get AST-based analysis.
-- Read relevant knowledge_base/ files for optimization patterns.
+- Read relevant knowledge_base/ files in the workspace for optimization patterns.
 
 ### Step 2: Initialize
 ```bash

From 14ee963ccb32b975812259851c934f0a98b02dfe Mon Sep 17 00:00:00 2001
From: Weilin Xu <weilin.xu@intel.com>
Date: Tue, 9 Jun 2026 10:25:19 -0700
Subject: [PATCH 12/12] Update SYCL lessons.

Signed-off-by: Weilin Xu <weilin.xu@intel.com>
---
 knowledge_base/sycl/xpu/SYCL_lessons.md | 112 +++++++++++++++---------
 1 file changed, 71 insertions(+), 41 deletions(-)

diff --git a/knowledge_base/sycl/xpu/SYCL_lessons.md b/knowledge_base/sycl/xpu/SYCL_lessons.md
index d80866d..a9d70dd 100644
--- a/knowledge_base/sycl/xpu/SYCL_lessons.md
+++ b/knowledge_base/sycl/xpu/SYCL_lessons.md
@@ -1,46 +1,5 @@
 # SYCL Lessons
 
-## Lesson: Named Kernels for `parallel_for`
-
-### Symptom
-
-Compilation can fail with an error similar to:
-
-```text
-unnamed type '(lambda ...)' is invalid; provide a kernel name, or use '-fsycl-unnamed-lambda'
-```
-
-### Cause
-
-Some SYCL toolchains reject unnamed kernel lambdas in `parallel_for`.
-
-### Fix
-
-Prefer an explicitly named kernel:
-
-```cpp
-h.parallel_for<class ScalarMulKernel>(
-    sycl::nd_range<1>(global_size, WG_SIZE),
-    [=](sycl::nd_item<1> item) {
-        // kernel body
-    }
-);
-```
-
-### Fallback
-
-If a code path cannot be easily updated to named kernels, add this compile flag:
-
-```text
--fsycl-unnamed-lambda
-```
-
-### Practical Note
-
-For portability and future compiler compatibility, named kernels are the safer default.
-
----
-
 ## Lesson: Sub-group Coalesced Memory Access for Intel Xe GPUs
 
 ### Symptom
@@ -169,3 +128,74 @@ q.submit([&](sycl::handler& h) {
 - Section 6: Thread mapping and occupancy (work-group decomposition)
 - Section 7: Memory hierarchy and coalesced access
 - Section 11: Prefer wide/coalesced accesses and remove expensive operations from inner loops
+
+---
+
+## Lesson: Never Use `-fsycl-unnamed-lambda`
+
+### Symptom
+
+Compilation fails with:
+
+```text
+icpx: error: cannot specify '-fsycl-unnamed-lambda' along with '-fsycl-host-compiler'
+```
+
+### Cause
+
+`-fsycl-unnamed-lambda` conflicts with the `-fsycl-host-compiler` flag that the
+PyTorch `load_inline` build system injects automatically. The two flags cannot
+coexist.
+
+### Fix
+
+Always use **named kernel classes** in `parallel_for`:
+
+```cpp
+h.parallel_for<class MyKernelName>(
+    sycl::nd_range<1>(global_size, WG_SIZE),
+    [=](sycl::nd_item<1> item) {
+        // kernel body
+    }
+);
+```
+
+Never add `-fsycl-unnamed-lambda` to `extra_sycl_cflags` in `load_inline`.
+
+### Additional: `constexpr` Captures Cause Kernel Size Mismatch
+
+Capturing `constexpr` float variables from the enclosing host scope causes a
+static assertion failure at compile time:
+
+```text
+error: static assertion failed: Unexpected kernel lambda size.
+  This can be caused by an external host compiler producing a lambda with an
+  unexpected layout. In many cases the difference is related to capturing
+  constexpr variables.
+```
+
+**Fix**: define float constants **inside** the kernel lambda, not outside it:
+
+```cpp
+// WRONG — constexpr captured from host scope
+constexpr float S2PI = 0.7978845608028654f;
+q->submit([&](sycl::handler& h) {
+    h.parallel_for<class K>(range, [=](sycl::nd_item<1> item) {
+        float x = S2PI * val;  // S2PI captured — size mismatch!
+    });
+});
+
+// CORRECT — constant defined inside the lambda
+q->submit([&](sycl::handler& h) {
+    h.parallel_for<class K>(range, [=](sycl::nd_item<1> item) {
+        const float S2PI = 0.7978845608028654f;
+        float x = S2PI * val;  // no capture, no mismatch
+    });
+});
+```
+
+Use `enum` for integer compile-time constants (safe to define in host scope):
+
+```cpp
+enum { WG = 256, ITERS = 16, SG_SZ = 16 };  // safe — integer enums, not floats
+```