multikernel
diff --git a/BUILTINS.md b/BUILTINS.md
Lines changed: 48 additions & 3 deletions
diff --git a/README.md b/README.md
Lines changed: 7 additions & 4 deletions
diff --git a/SPEC.md b/SPEC.md
Lines changed: 19 additions & 5 deletions
diff --git a/examples/perf_cache_miss.ks b/examples/perf_cache_miss.ks
Lines changed: 5 additions & 3 deletions
diff --git a/src/stdlib.ml b/src/stdlib.ml
Lines changed: 71 additions & 9 deletions
@@ -98,7 +98,7 @@ fn main() -> i32 {
     - `flags`: Attachment flags (context-dependent)
 - Perf event form:
     - `handle`: Program handle returned from `load()`
-    - `opts`: `perf_options` value — only `perf_type` and `perf_config` are required; all other fields have defaults, including `group_fd=-1`
+    - `opts`: `perf_options` value — only `perf_type` and `perf_config` are required; all other fields have defaults, including no group (`group` defaults to an invalid attachment and `group_fd` defaults to `-1`)
     - `flags`: Must be `0` for perf attaches; nonzero values are rejected
 
 **Return Value:**
@@ -126,12 +126,14 @@ var cache = attach(perf_prog, perf_options { perf_type: perf_type_hardware, perf
 var branch = attach(perf_prog, perf_options {
     perf_type: perf_type_hardware,
     perf_config: branch_misses,
-    group_fd: cache.perf_fd,
+    group: cache,
 }, 0)
 detach(branch)
 detach(cache)
 ```
 
+Grouped events are scheduled as one atomic PMU unit. Separate events and separate groups may be multiplexed, but members inside one group cannot be independently multiplexed. Static groups that exceed the target PMU counter limit are rejected at compile time; override the detected/default limit with `KERNELSCRIPT_PERF_GROUP_MAX_EVENTS` when compiling for a different target.
+
 **Context-specific implementations:**
 - **eBPF:** Not available
 - **Userspace:** Uses `attach_bpf_program_by_fd` for standard targets and `ks_attach_perf_event` for perf events
@@ -183,7 +185,50 @@ detach(prog)  // Clean up
 - Returns a scaled value when `time_running < time_enabled`
 - Returns `-1` on invalid/stale attachment or read failure
 - Reads use the attachment's `perf_fd` directly; the internal token detects copied handles used after detach.
-- Group snapshot reads are not supported yet; read grouped attachments individually.
+- Use `read_group(leader)` when you need a same-time group snapshot.
+
+---
+
+#### `read_raw(handle)`
+**Signature:** `read_raw(handle: PerfAttachment) -> i64`
+**Variadic:** No
+**Context:** Userspace only
+
+**Description:** Read the unscaled raw hardware/software counter value from a perf attachment.
+
+**Return Value:**
+- Returns the raw counter value
+- Returns `-1` on invalid/stale attachment or read failure
+
+---
+
+#### `read_details(handle)`
+**Signature:** `read_details(handle: PerfAttachment) -> PerfReadDetails`
+**Variadic:** No
+**Context:** Userspace only
+
+**Description:** Read raw, scaled, `time_enabled`, and `time_running` details for a perf attachment.
+
+**Return Value:**
+- `raw`: unscaled counter value
+- `scaled`: multiplex-corrected value, or `-1` on timing/read error
+- `time_enabled`: perf enabled time
+- `time_running`: perf running time
+
+---
+
+#### `read_group(leader)`
+**Signature:** `read_group(leader: PerfAttachment) -> PerfGroupRead`
+**Variadic:** No
+**Context:** Userspace only
+
+**Description:** Read a same-time snapshot from a perf event group leader. This enables `PERF_FORMAT_GROUP | PERF_FORMAT_ID` in generated perf events.
+
+**Return Value:**
+- `count`: number of entries returned, capped at 16
+- `values`: multiplex-scaled values from the snapshot
+- `ids`: perf event IDs for the returned values
+- `time_enabled` / `time_running`: timing fields used for scaling
 
 ---
 
 
@@ -306,7 +306,7 @@ fn on_branch_miss(ctx: *bpf_perf_event_data) -> i32 {
 fn main() -> i32 {
     var prog = load(on_branch_miss)
 
-    // Minimal form — defaults: pid=-1 (all procs), cpu=0, group_fd=-1,
+    // Minimal form — defaults: pid=-1 (all procs), cpu=0, no group,
     // period=1_000_000, wakeup=1; perf attach flags must be 0
     var att = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: branch_misses }, 0)
     var count = read(att)
@@ -318,18 +318,21 @@ fn main() -> i32 {
 }
 ```
 
-Perf events can share a kernel scheduling group by passing the leader attachment's `perf_fd` as `group_fd`:
+Perf events can share a kernel scheduling group by passing the leader attachment directly with `group`, as in the example below.
+The lower-level `group_fd: cache.perf_fd` form is still supported for compatibility.
 
 ```kernelscript
 var cache = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: cache_misses }, 0)
 var branch = attach(prog, perf_options {
     perf_type: perf_type_hardware,
     perf_config: branch_misses,
-    group_fd: cache.perf_fd,
+    group: cache,
 }, 0)
 ```
 
-Adding a member restarts the whole group from zero. Detach members before detaching their leader. `read(att)` still reads one attachment at a time; it returns a multiplex-scaled count when the kernel reports `time_running < time_enabled`. Group snapshot reads are not part of this first-stage API.
+Adding a member restarts the whole group from zero. Detaching a leader cascades to any live members. A group competes for PMU counters as one atomic unit: different groups can be multiplexed over time, but members inside one group are not independently multiplexed. For statically visible groups, the compiler rejects groups that need more PMU counter slots than the target limit. The limit is read from known sysfs PMU caps when available, defaults to 4, and can be overridden with `KERNELSCRIPT_PERF_GROUP_MAX_EVENTS`.
+
+`read(att)` returns a multiplex-scaled count when the kernel reports `time_running < time_enabled`. Use `read_raw(att)` for the raw value, `read_details(att)` for raw/scaled/timing details, and `read_group(leader)` for a same-time group snapshot.
 
 **Available `perf_type` values:**
 
 
@@ -461,7 +461,7 @@ fn main() -> i32 {
     var prog = load(my_handler)
 
     // Only perf_type + perf_config are required; all other fields use language-level defaults:
-    // pid=-1, cpu=0, group_fd=-1, period=1_000_000, wakeup=1, inherit/exclude_*=false
+    // pid=-1, cpu=0, no group, period=1_000_000, wakeup=1, inherit/exclude_*=false
     var misses = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: branch_misses }, 0)
 
     // Override specific fields as needed:
@@ -474,14 +474,16 @@ fn main() -> i32 {
     }, 0)
 
     // Put branch misses in cache's perf event group. Adding a member restarts
-    // the whole group from zero.
+    // the whole group from zero. The lower-level group_fd: cache.perf_fd form
+    // is still accepted.
     var branch = attach(prog, perf_options {
         perf_type: perf_type_hardware,
         perf_config: branch_misses,
-        group_fd: cache.perf_fd,
+        group: cache,
     }, 0)
 
     print("misses=%lld cache=%lld branch=%lld", read(misses), read(cache), read(branch))
+    var snapshot = read_group(cache)
 
     detach(branch)
     detach(cache)  // IOC_DISABLE → bpf_link__destroy → close(perf_fd)
@@ -500,6 +502,7 @@ fn main() -> i32 {
 | `pid` | `i32` | `-1` | -1 = all processes; ≥0 = specific PID |
 | `cpu` | `i32` | `0` | ≥0 = specific CPU; -1 = any CPU (pid must be ≥0) |
 | `group_fd` | `i32` | `-1` | -1 = standalone event; ≥0 = perf group leader fd |
+| `group` | `PerfAttachment` | invalid attachment | Preferred high-level group leader attachment |
 | `period` | `u64` | `1000000` | Sample after this many events |
 | `wakeup` | `u32` | `1` | Wake userspace after N samples |
 | `inherit` | `bool` | `false` | Inherit to forked children |
@@ -550,6 +553,9 @@ For event families with a richer config space, such as `perf_type_hw_cache`, pro
 | `ks_attach_perf_event` | `PerfAttachment (int prog_fd, ks_perf_options, int flags)` | Full open-reset-attach-enable lifecycle |
 | `ks_read_perf_count` | `int64_t (int perf_fd)` | Reads current counter and applies multiplex scaling when needed |
 | `ks_perf_attachment_read` | `int64_t (PerfAttachment)` | Direct fd read through the attachment value with stale-handle detection |
+| `ks_perf_attachment_read_raw` | `int64_t (PerfAttachment)` | Direct raw counter read with stale-handle detection |
+| `ks_perf_attachment_read_details` | `PerfReadDetails (PerfAttachment)` | Returns raw, scaled, `time_enabled`, and `time_running` |
+| `ks_perf_attachment_read_group` | `PerfGroupRead (PerfAttachment)` | Reads a same-time group snapshot from a leader attachment |
 
 **Attach sequence for standalone events (compiler-generated, inside `ks_attach_perf_event`):**
 1. `ks_attr.attr.disabled = 1` — open counter without starting it  
@@ -559,17 +565,24 @@ For event families with a richer config space, such as `perf_type_hw_cache`, pro
 5. `ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0)` — **start counting**  
 
 **Perf event groups:**
+- `group: leader_attachment` is the preferred way to join a perf group.
 - `group_fd >= 0` opens the new event as a member of that leader fd.
 - Group members are opened disabled, linked to the BPF program, then the leader is disabled, reset, and enabled with `PERF_IOC_FLAG_GROUP`.
 - Adding a member to an already running group restarts the whole group from zero.
-- Detaching a member is allowed. Detaching a leader while live members reference it is rejected; detach members first.
-- Group snapshot reads are not implemented yet; read each `PerfAttachment` separately.
+- A group is scheduled as an atomic PMU unit. Separate events and separate groups may be multiplexed; members inside one group are not independently multiplexed. If a statically visible group needs more PMU counter slots than the target limit, compilation fails.
+- The compile-time group limit uses known sysfs PMU caps when available, falls back to `4`, and can be overridden with `KERNELSCRIPT_PERF_GROUP_MAX_EVENTS`.
+- `perf_type_software` and `perf_type_tracepoint` events do not consume PMU counter slots for this check; each static hardware/raw/cache/breakpoint event consumes one slot, and each event with a dynamic `perf_type` value is conservatively counted as one slot.
+- Detaching a member is allowed. Detaching a leader cascades to any live members.
+- `read_group(leader)` enables `PERF_FORMAT_GROUP | PERF_FORMAT_ID` and returns up to 16 same-time group values plus perf IDs and timing fields.
 
 **Counter reads:**
 - Generated perf events request `PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING`.
 - `read(att)` returns the raw value when `time_enabled == time_running`.
 - If multiplexing occurred, `read(att)` returns `value * time_enabled / time_running` using a 128-bit intermediate.
 - If `time_running == 0`, `read(att)` reports an error and returns `-1`.
+- `read_raw(att)` returns the unscaled raw counter.
+- `read_details(att)` returns raw, scaled, `time_enabled`, and `time_running`.
+- `read_group(leader)` returns a snapshot struct; group `values[]` are scaled using the snapshot timing fields.
 
 **Detach sequence (compiler-generated):**
 1. `ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0)` — stop counting  
@@ -583,6 +596,7 @@ For event families with a richer config space, such as `perf_type_hw_cache`, pro
 - `PerfAttachment` carries `perf_fd` plus an internal generation token; `read(attachment)` avoids global attachment-list scans and rejects copied handles after detach
 - Exposes omitted `perf_options` fields as language-level defaults (partial struct literal)
 - Validates `pid ≥ -1`, `cpu ≥ -1`, `group_fd ≥ -1`, and rejects `pid == -1 && cpu == -1` at runtime
+- Treats `group` as valid only when it carries a live `PerfAttachment` generation token; otherwise `group_fd` controls grouping
 - Emits `PERF_FLAG_FD_CLOEXEC` for safe fd inheritance
 - BPF program section is `SEC("perf_event")`
 
 
@@ -11,17 +11,19 @@ fn on_cache_miss(ctx: *bpf_perf_event_data) -> i32 {
 fn main() -> i32 {
     var prog = load(on_cache_miss)
 
-    // Only perf_type + perf_config are required; pid, cpu, group_fd, period, wakeup and flag fields
+    // Only perf_type + perf_config are required; pid, cpu, group/group_fd, period, wakeup and flag fields
     // default to: pid=-1 (all procs), cpu=0, period=1_000_000, wakeup=1,
-    // group_fd=-1, inherit/exclude_kernel/exclude_user=false.
+    // no group, inherit/exclude_kernel/exclude_user=false.
     var cache = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: cache_misses, period: 10000000, inherit: true }, 0)
     // branch joins cache's perf event group. Adding a member restarts the whole group from zero.
-    var branch = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: branch_misses, period: 10000000, inherit: true, group_fd: cache.perf_fd }, 0)
+    var branch = attach(prog, perf_options { perf_type: perf_type_hardware, perf_config: branch_misses, period: 10000000, inherit: true, group: cache }, 0)
     print("Cache-miss and branch-miss perf_event demo attached")
     var cache_count = read(cache)
     print("Cache-miss count: %lld", cache_count)
     var branch_count = read(branch)
     print("Branch-miss count: %lld", branch_count)
+    var snapshot = read_group(cache)
+    print("Grouped snapshot entries: %u", snapshot.count)
 
     detach(branch)
     detach(cache)
 
@@ -129,6 +129,13 @@ let validate_read_function arg_types _ast_context _pos =
   | _ ->
       (false, Some "read() currently requires a PerfAttachment")
 
+let validate_read_group_function arg_types _ast_context _pos =
+  match arg_types with
+  | [Struct "PerfAttachment"] | [UserType "PerfAttachment"] ->
+      (true, None)
+  | _ ->
+      (false, Some "read_group() requires a PerfAttachment group leader")
+
 (** Validation function for detach() - accepts program handles and perf attachments *)
 let validate_detach_function arg_types _ast_context _pos =
   match arg_types with
@@ -244,13 +251,46 @@ let builtin_functions = [
     name = "read";
     param_types = []; (* Custom validation handles attachment-aware overloads *)
     return_type = I64; (* Raw counter value, or -1 on error *)
-    description = "Read the current hardware/software counter value for a perf attachment";
+    description = "Read the multiplex-scaled hardware/software counter value for a perf attachment";
     is_variadic = false;
     ebpf_impl = ""; (* Not available in eBPF context *)
     userspace_impl = "ks_perf_attachment_read";
     kernel_impl = "";
     validate = Some validate_read_function;
   };
+  {
+    name = "read_raw";
+    param_types = [];
+    return_type = I64;
+    description = "Read the raw hardware/software counter value for a perf attachment";
+    is_variadic = false;
+    ebpf_impl = "";
+    userspace_impl = "ks_perf_attachment_read_raw";
+    kernel_impl = "";
+    validate = Some validate_read_function;
+  };
+  {
+    name = "read_details";
+    param_types = [];
+    return_type = Struct "PerfReadDetails";
+    description = "Read raw, scaled, time_enabled, and time_running for a perf attachment";
+    is_variadic = false;
+    ebpf_impl = "";
+    userspace_impl = "ks_perf_attachment_read_details";
+    kernel_impl = "";
+    validate = Some validate_read_function;
+  };
+  {
+    name = "read_group";
+    param_types = [];
+    return_type = Struct "PerfGroupRead";
+    description = "Read a same-time snapshot from a perf event group leader";
+    is_variadic = false;
+    ebpf_impl = "";
+    userspace_impl = "ks_perf_attachment_read_group";
+    kernel_impl = "";
+    validate = Some validate_read_group_function;
+  };
 ]
 
 (** Get built-in function definition by name *)
@@ -350,6 +390,7 @@ let builtin_types = [
     ("pid",            I32);
     ("cpu",            I32);
     ("group_fd",       I32);
+    ("group",          Struct "PerfAttachment");
     ("period",         U64);
     ("wakeup",         U32);
     ("inherit",        Bool);
@@ -364,6 +405,21 @@ let builtin_types = [
     ("prog_fd", I32);
     ("generation", U64);
   ], builtin_pos));
+
+  TypeDef (StructDef ("PerfReadDetails", [
+    ("raw", I64);
+    ("scaled", I64);
+    ("time_enabled", U64);
+    ("time_running", U64);
+  ], builtin_pos));
+
+  TypeDef (StructDef ("PerfGroupRead", [
+    ("count", U32);
+    ("values", Array (I64, 16));
+    ("ids", Array (U64, 16));
+    ("time_enabled", U64);
+    ("time_running", U64);
+  ], builtin_pos));
 ]
 
 (** Default field values for structs that support partial initialisation.
@@ -373,14 +429,20 @@ let builtin_types = [
 let get_struct_field_defaults = function
   | "perf_options" ->
       Some [
-        ("pid",            IntLit (Signed64 (-1L),      None));
-        ("cpu",            IntLit (Signed64 0L,         None));
-        ("group_fd",       IntLit (Signed64 (-1L),      None));
-        ("period",         IntLit (Unsigned64 1000000L, None));
-        ("wakeup",         IntLit (Unsigned64 1L,       None));
-        ("inherit",        BoolLit false);
-        ("exclude_kernel", BoolLit false);
-        ("exclude_user",   BoolLit false);
+        ("pid",            Literal (IntLit (Signed64 (-1L),      None)));
+        ("cpu",            Literal (IntLit (Signed64 0L,         None)));
+        ("group_fd",       Literal (IntLit (Signed64 (-1L),      None)));
+        ("group",          StructLiteral ("PerfAttachment", [
+          ("perf_fd",    make_expr (Literal (IntLit (Signed64 (-1L), None))) builtin_pos);
+          ("link_id",    make_expr (Literal (IntLit (Signed64 (-1L), None))) builtin_pos);
+          ("prog_fd",    make_expr (Literal (IntLit (Signed64 (-1L), None))) builtin_pos);
+          ("generation", make_expr (Literal (IntLit (Unsigned64 0L,  None))) builtin_pos);
+        ]));
+        ("period",         Literal (IntLit (Unsigned64 1000000L, None)));
+        ("wakeup",         Literal (IntLit (Unsigned64 1L,       None)));
+        ("inherit",        Literal (BoolLit false));
+        ("exclude_kernel", Literal (BoolLit false));
+        ("exclude_user",   Literal (BoolLit false));
       ]
   | _ -> None