From f0406908627706f765786fd3b4e26a8c3b96ace7 Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Thu, 26 Mar 2026 08:20:16 -0700
Subject: [PATCH 01/11] fix(track): update logging format for failed job states
 to use f-strings for consistency

---
 configs/kueue/kueuer/src/kueuer/benchmarks/track.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/track.py b/configs/kueue/kueuer/src/kueuer/benchmarks/track.py
index 56981e39..0fd8348a 100644
--- a/configs/kueue/kueuer/src/kueuer/benchmarks/track.py
+++ b/configs/kueue/kueuer/src/kueuer/benchmarks/track.py
@@ -290,7 +290,7 @@ def jobs(  # noqa: C901
         elif item.metadata.name in pending and job_reached_state(item, "Failed"):
             failed_count += 1
             pending[item.metadata.name] = False
-            logfire.warning("%s reached terminal state Failed.", item.metadata.name)
+            logfire.warning(f"{item.metadata.name} reached terminal state Failed.")
 
     logfire.info(f"{len(pending)} jobs need to be tracked.")
     logfire.info(f"Starting to track jobs to state {to_state}...")
@@ -323,7 +323,7 @@ def jobs(  # noqa: C901
             elif job_reached_state(item, "Failed"):
                 failed_count += 1
                 pending[name] = False
-                logfire.warning("%s reached terminal state Failed.", name)
+                logfire.warning(f"{name} reached terminal state Failed.")
 
             logfire.debug(f"Pending Jobs Left: {sum(pending.values())}")
 
@@ -338,8 +338,6 @@ def jobs(  # noqa: C901
                 break
     if failed_count:
         logfire.warning(
-            "%s jobs reached Failed state while tracking '%s'.",
-            failed_count,
-            prefix,
+            f"{failed_count} jobs reached Failed state while tracking '{prefix}'."
         )
     return done

From 78ecd990692dd317902f55d4d5cf82742a6db0e5 Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Thu, 26 Mar 2026 13:51:21 -0700
Subject: [PATCH 02/11] feat(benchmark): add vm_memory_fraction parameter to
 experiment and benchmark functions for improved resource management

---
 .../kueuer/src/kueuer/benchmarks/benchmark.py   | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py b/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py
index 8a671032..55b75349 100644
--- a/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py
+++ b/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py
@@ -261,6 +261,7 @@ def experiment(
     apply_chunk_size: int = 25,
     apply_retries: int = 2,
     apply_backoff: float = 2.0,
+    vm_memory_fraction: float = k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION,
 ) -> Dict[str, Any]:
     """Run a single experiment with the specified configuration.
 
@@ -314,6 +315,7 @@ def experiment(
         apply_chunk_size=apply_chunk_size,
         apply_retries=apply_retries,
         apply_backoff=apply_backoff,
+        vm_memory_fraction=vm_memory_fraction,
     )
 
     # Track jobs to completion and get timing statistics
@@ -395,6 +397,7 @@ def benchmark(
     apply_chunk_size: int,
     apply_retries: int,
     apply_backoff: float,
+    vm_memory_fraction: float,
 ) -> List[Dict[str, Any]]:
     """
     Run a complete benchmark comparing direct Kubernetes jobs vs Kueue jobs.
@@ -434,6 +437,7 @@ def benchmark(
             apply_chunk_size=apply_chunk_size,
             apply_retries=apply_retries,
             apply_backoff=apply_backoff,
+            vm_memory_fraction=vm_memory_fraction,
         )
         results.append(result)
 
@@ -459,6 +463,7 @@ def benchmark(
             apply_chunk_size=apply_chunk_size,
             apply_retries=apply_retries,
             apply_backoff=apply_backoff,
+            vm_memory_fraction=vm_memory_fraction,
         )
         results.append(kueue_result)
 
@@ -566,6 +571,16 @@ def performance(
         "--apply-backoff",
         help="Backoff base (seconds) between apply retries.",
     ),
+    vm_memory_fraction: float = typer.Option(
+        k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION,
+        "--vm-memory-fraction",
+        min=0.1,
+        max=0.95,
+        help=(
+            "Fraction of per-job memory assigned to stress-ng --vm-bytes. "
+            "Lower values reduce OOM risk."
+        ),
+    ),
 ):
     """Compare native K8s job scheduling vs. Kueue."""
     profile = _normalize_profile_name(profile)
@@ -625,6 +640,7 @@ def performance(
         apply_retries,
         apply_backoff,
     )
+    logger.info("VM Frac  : %s", vm_memory_fraction)
 
     if not k8s.check(namespace, kueue, priority):
         logger.error("Please check your Kueue configuration.")
@@ -645,6 +661,7 @@ def performance(
         apply_chunk_size=apply_chunk_size,
         apply_retries=apply_retries,
         apply_backoff=apply_backoff,
+        vm_memory_fraction=vm_memory_fraction,
     )
     logger.info("Benchmark completed successfully.")
     logger.info("Results saved to %s", output)

From 9c79392baa6a18e0742ead9f62583c48f5f924eb Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Thu, 26 Mar 2026 13:51:46 -0700
Subject: [PATCH 03/11] feat(benchmark): enhance memory management with
 vm_memory_fraction calculations and OOM risk assessment

---
 .../kueue/kueuer/src/kueuer/benchmarks/k8s.py | 54 ++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py
index 91c553f5..cba93875 100644
--- a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py
+++ b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py
@@ -18,6 +18,7 @@
 from kueuer.utils.logging import logger
 
 app = typer.Typer(help="Launch K8s Jobs")
+DEFAULT_STRESS_VM_MEMORY_FRACTION = 0.33
 
 
 def check(namespace: str, kueue: str, priority: str) -> bool:
@@ -131,6 +132,33 @@ def stress_cpu_workers(cores: float) -> int:
     return max(int(math.ceil(cores)), 1)
 
 
+def stress_vm_bytes_mb(ram_gb: float, vm_memory_fraction: float) -> float:
+    """Return stress-ng vm-bytes value in megabytes.
+
+    Args:
+        ram_gb: Pod memory limit/request in GiB.
+        vm_memory_fraction: Fraction of pod memory allocated to stress-ng vm worker.
+
+    Returns:
+        float: vm-bytes value in MB.
+    """
+    if not 0.0 < vm_memory_fraction < 1.0:
+        raise ValueError("vm_memory_fraction must be between 0 and 1 (exclusive)")
+    ram_mb = ram_gb * 1024.0
+    return ram_mb * vm_memory_fraction
+
+
+def is_high_oom_risk(ram_gb: float, vm_memory_fraction: float) -> bool:
+    """Heuristic for tight memory headroom likely to trigger OOM.
+
+    This warns when stress-ng memory pressure leaves too little room for process/runtime
+    overhead under pod cgroup limits.
+    """
+    vm_bytes_mb = stress_vm_bytes_mb(ram_gb=ram_gb, vm_memory_fraction=vm_memory_fraction)
+    # For small pods, keep additional safety headroom to avoid allocator/runtime spikes.
+    return vm_memory_fraction >= 0.75 or (ram_gb <= 1.0 and vm_bytes_mb >= 600.0)
+
+
 def _format_cpu_quantity(cores: float) -> str:
     """Format CPU cores into Kubernetes CPU quantity syntax."""
     if float(cores).is_integer():
@@ -383,10 +411,26 @@ def run(
             help="Backoff base (seconds) used between apply retries.",
         )
     ),
+    vm_memory_fraction: float = (
+        typer.Option(
+            DEFAULT_STRESS_VM_MEMORY_FRACTION,
+            "--vm-memory-fraction",
+            min=0.1,
+            max=0.95,
+            help=(
+                "Fraction of pod memory assigned to stress-ng --vm-bytes. "
+                "Lower values leave more headroom and reduce OOM risk."
+            ),
+        )
+    ),
 ) -> Dict[str, Any]:
     """Run jobs to stress k8s cluster."""
     ram_mb: float = ram * 1024.0
     cpu_workers = stress_cpu_workers(cores)
+    vm_bytes_mb = stress_vm_bytes_mb(
+        ram_gb=ram,
+        vm_memory_fraction=vm_memory_fraction,
+    )
     cpu_quantity = _format_cpu_quantity(cores)
     args: List[str] = [
         "--cpu",
@@ -396,13 +440,21 @@ def run(
         "--vm",
         "1",
         "--vm-bytes",
-        f"{ram_mb * 0.8}M",
+        f"{vm_bytes_mb}M",
         "--temp-path",
         "/tmp",
         "--timeout",
         f"{duration}",
         "--metrics-brief",
     ]
+    if is_high_oom_risk(ram_gb=ram, vm_memory_fraction=vm_memory_fraction):
+        logger.warning(
+            "High OOM risk: ram=%sGi, vm-memory-fraction=%s (vm-bytes=%sM). "
+            "Consider reducing --vm-memory-fraction.",
+            ram,
+            vm_memory_fraction,
+            f"{vm_bytes_mb:.1f}",
+        )
     job = io.read_yaml(filepath)
 
     # Write common job parameters

From c2609be47db85b7664a23ce29eb7018514584bca Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Thu, 26 Mar 2026 13:52:04 -0700
Subject: [PATCH 04/11] feat(lifecycle): add warnings to preflight report and
 enhance access checks output

---
 .../kueuer/src/kueuer/lifecycle/commands.py      | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py b/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py
index 76a7087d..7ec1d99d 100644
--- a/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py
+++ b/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py
@@ -128,6 +128,9 @@ def run_cluster_preflight(
         *kueue_report.get("errors", []),
         *queue_report.get("errors", []),
     ]
+    warnings = [
+        *access_report.get("warnings", []),
+    ]
     remediation = [
         *kueue_report.get("remediation", []),
         *queue_report.get("manual_commands", []),
@@ -135,6 +138,7 @@ def run_cluster_preflight(
     return {
         "ok": access_report["ok"] and kueue_report["ok"] and queue_report["ok"],
         "errors": errors,
+        "warnings": warnings,
         "remediation": remediation,
         "checks": {
             "access": access_report.get("checks", {}),
@@ -190,10 +194,16 @@ def print_preflight_report(report: Dict[str, Any]) -> None:
     typer.echo("Access checks:")
     typer.echo(f"  - kubectl installed: {'yes' if access.get('binary:kubectl') else 'no'}")
     typer.echo(f"  - current context readable: {'yes' if access.get('context') else 'no'}")
-    typer.echo(f"  - cluster reachable: {'yes' if access.get('cluster-info') else 'no'}")
+    typer.echo(
+        f"  - workload namespace exists: {'yes' if access.get('namespace-exists') else 'no'}"
+    )
     typer.echo(
         f"  - can create jobs: {'yes' if access.get('can-create-jobs') else 'no'}"
     )
+    typer.echo(
+        "  - cluster-info (kubectl -n <workload-ns>): "
+        f"{'yes' if access.get('cluster-info') else 'no'}"
+    )
 
     kueue = report.get("checks", {}).get("kueue", {})
     typer.echo("Kueue health:")
@@ -214,6 +224,10 @@ def print_preflight_report(report: Dict[str, Any]) -> None:
     _echo_list("LocalQueues", list(inventory.get("localqueues", [])))
     _echo_list("PriorityClasses", list(inventory.get("workloadpriorityclasses", [])))
 
+    if report.get("warnings"):
+        typer.echo("Warnings:")
+        for item in report["warnings"]:
+            typer.echo(f"  - {item}")
     if report.get("errors"):
         typer.echo("Errors:")
         for error in report["errors"]:

From d2a068a66f65a17ea453b3efbbd098e34c01e997 Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Thu, 26 Mar 2026 13:52:30 -0700
Subject: [PATCH 05/11] feat(lifecycle): enhance preflight checks with
 namespace validation and warning handling

---
 .../kueuer/src/kueuer/lifecycle/preflight.py  | 86 ++++++++++++++-----
 1 file changed, 66 insertions(+), 20 deletions(-)

diff --git a/configs/kueue/kueuer/src/kueuer/lifecycle/preflight.py b/configs/kueue/kueuer/src/kueuer/lifecycle/preflight.py
index 42b670d7..6b6f34d1 100644
--- a/configs/kueue/kueuer/src/kueuer/lifecycle/preflight.py
+++ b/configs/kueue/kueuer/src/kueuer/lifecycle/preflight.py
@@ -12,7 +12,16 @@ def run_preflight(
     command_exists_fn: Callable[[str], bool] = command_exists,
     run_cmd: Callable[[List[str]], Any] = run_command,
 ) -> Dict[str, Any]:
+    """Verify kubectl, workload namespace, Job RBAC, and API connectivity.
+
+    ``kubectl cluster-info`` is not namespace-scoped; we run
+    ``kubectl -n <namespace> cluster-info`` so the workload namespace is set on
+    the kubectl invocation. If that command fails but the namespace exists and
+    ``can-i create jobs`` succeeds, preflight still passes with a warning so
+    benchmarks can run when ``cluster-info`` flakes or is restricted.
+    """
     errors: List[str] = []
+    warnings: List[str] = []
     checks: Dict[str, bool] = {}
 
     for binary in ("kubectl",):
@@ -22,31 +31,68 @@ def run_preflight(
             errors.append(f"Required binary missing: {binary}")
 
     context = ""
-    if not errors:
-        context_result = run_cmd(["kubectl", "config", "current-context"])
-        checks["context"] = context_result.returncode == 0
-        if context_result.returncode != 0:
-            errors.append("kubectl config current-context failed")
-        else:
-            context = context_result.stdout.strip()
-
-        cluster_result = run_cmd(["kubectl", "cluster-info"])
-        checks["cluster-info"] = cluster_result.returncode == 0
-        if cluster_result.returncode != 0:
-            errors.append("kubectl cluster-info failed")
-
-        can_i_result = run_cmd(
-            ["kubectl", "auth", "can-i", "create", "jobs", "-n", namespace]
+    if errors:
+        return {
+            "ok": False,
+            "context": context,
+            "checks": checks,
+            "errors": errors,
+            "warnings": warnings,
+        }
+
+    context_result = run_cmd(["kubectl", "config", "current-context"])
+    checks["context"] = context_result.returncode == 0
+    if context_result.returncode != 0:
+        errors.append("kubectl config current-context failed")
+    else:
+        context = context_result.stdout.strip()
+
+    if errors:
+        return {
+            "ok": False,
+            "context": context,
+            "checks": checks,
+            "errors": errors,
+            "warnings": warnings,
+        }
+
+    ns_result = run_cmd(["kubectl", "get", "namespace", namespace])
+    checks["namespace-exists"] = ns_result.returncode == 0
+    if not checks["namespace-exists"]:
+        errors.append(
+            f"Workload namespace {namespace!r} not found or not accessible."
         )
-        checks["can-create-jobs"] = can_i_result.returncode == 0 and (
-            "yes" in can_i_result.stdout.lower()
+
+    can_i_result = run_cmd(
+        ["kubectl", "auth", "can-i", "create", "jobs", "-n", namespace]
+    )
+    checks["can-create-jobs"] = can_i_result.returncode == 0 and (
+        "yes" in can_i_result.stdout.lower()
+    )
+    if not checks["can-create-jobs"]:
+        errors.append("kubectl auth can-i create jobs failed")
+
+    if errors:
+        return {
+            "ok": False,
+            "context": context,
+            "checks": checks,
+            "errors": errors,
+            "warnings": warnings,
+        }
+
+    cluster_result = run_cmd(["kubectl", "-n", namespace, "cluster-info"])
+    checks["cluster-info"] = cluster_result.returncode == 0
+    if not checks["cluster-info"]:
+        warnings.append(
+            f"kubectl -n {namespace!r} cluster-info failed; continuing because "
+            "the workload namespace exists and Job creation is allowed."
         )
-        if not checks["can-create-jobs"]:
-            errors.append("kubectl auth can-i create jobs failed")
 
     return {
-        "ok": not errors,
+        "ok": True,
         "context": context,
         "checks": checks,
         "errors": errors,
+        "warnings": warnings,
     }

From 3f5433057c19e43de7ef99e5d08447169b836cbf Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Thu, 26 Mar 2026 13:52:56 -0700
Subject: [PATCH 06/11] feat(benchmark): add stress VM memory fraction tests
 and OOM risk assessment

---
 .../tests/test_k8s_phase2_resilience.py       | 22 ++++++++
 .../kueuer/tests/test_lifecycle_preflight.py  | 53 +++++++++++++------
 2 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py b/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py
index e48f7dae..064a9268 100644
--- a/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py
+++ b/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py
@@ -1,5 +1,7 @@
 from types import SimpleNamespace
 
+import pytest
+
 from kueuer.benchmarks import k8s
 
 
@@ -52,3 +54,23 @@ def list_namespaced_pod(self, namespace):
     monkeypatch.setattr(k8s.client, "CoreV1Api", FakeCoreV1Api)
 
     assert k8s.kueue_controller_restarts() == 0
+
+
+def test_stress_vm_bytes_mb_uses_safer_default_fraction() -> None:
+    assert k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION == 0.33  # value defined in k8s.py
+    assert k8s.stress_vm_bytes_mb(
+        ram_gb=1.0,
+        vm_memory_fraction=k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION,
+    ) == pytest.approx(337.92)  # 1.0 GiB * 1024 * 0.33
+
+
+def test_stress_vm_bytes_mb_validates_fraction_bounds() -> None:
+    with pytest.raises(ValueError):
+        k8s.stress_vm_bytes_mb(ram_gb=1.0, vm_memory_fraction=0.0)
+    with pytest.raises(ValueError):
+        k8s.stress_vm_bytes_mb(ram_gb=1.0, vm_memory_fraction=1.0)
+
+
+def test_is_high_oom_risk_flags_tight_memory_headroom() -> None:
+    assert k8s.is_high_oom_risk(ram_gb=1.0, vm_memory_fraction=0.8) is True
+    assert k8s.is_high_oom_risk(ram_gb=1.0, vm_memory_fraction=0.55) is False
diff --git a/configs/kueue/kueuer/tests/test_lifecycle_preflight.py b/configs/kueue/kueuer/tests/test_lifecycle_preflight.py
index 69205256..ffd0845f 100644
--- a/configs/kueue/kueuer/tests/test_lifecycle_preflight.py
+++ b/configs/kueue/kueuer/tests/test_lifecycle_preflight.py
@@ -25,14 +25,17 @@ def _run(command: List[str]) -> FakeResult:
 runner = CliRunner()
 
 
+def _access_success_responses() -> dict[str, FakeResult]:
+    return {
+        "kubectl config current-context": FakeResult(0, "minikube\n"),
+        "kubectl get namespace skaha-workload": FakeResult(0, "Active\n"),
+        "kubectl auth can-i create jobs -n skaha-workload": FakeResult(0, "yes\n"),
+        "kubectl -n skaha-workload cluster-info": FakeResult(0, "ok\n"),
+    }
+
+
 def test_run_preflight_success() -> None:
-    run = _runner(
-        {
-            "kubectl config current-context": FakeResult(0, "minikube\n"),
-            "kubectl cluster-info": FakeResult(0, "ok\n"),
-            "kubectl auth can-i create jobs -n skaha-workload": FakeResult(0, "yes\n"),
-        }
-    )
+    run = _runner(_access_success_responses())
     report = preflight.run_preflight(
         namespace="skaha-workload",
         command_exists_fn=lambda cmd: True,
@@ -43,13 +46,7 @@ def test_run_preflight_success() -> None:
 
 
 def test_run_preflight_success_without_helm() -> None:
-    run = _runner(
-        {
-            "kubectl config current-context": FakeResult(0, "minikube\n"),
-            "kubectl cluster-info": FakeResult(0, "ok\n"),
-            "kubectl auth can-i create jobs -n skaha-workload": FakeResult(0, "yes\n"),
-        }
-    )
+    run = _runner(_access_success_responses())
     report = preflight.run_preflight(
         namespace="skaha-workload",
         command_exists_fn=lambda cmd: cmd == "kubectl",
@@ -68,11 +65,32 @@ def test_run_preflight_fails_when_kubectl_missing() -> None:
     assert "kubectl" in " ".join(report["errors"]).lower()
 
 
-def test_run_preflight_fails_on_cluster_unreachable() -> None:
+def test_run_preflight_warns_when_cluster_info_fails_but_namespace_ok() -> None:
+    """cluster-info is non-fatal; namespace + can-i determine success."""
+    run = _runner(
+        {
+            "kubectl config current-context": FakeResult(0, "minikube\n"),
+            "kubectl get namespace skaha-workload": FakeResult(0, "Active\n"),
+            "kubectl auth can-i create jobs -n skaha-workload": FakeResult(0, "yes\n"),
+            "kubectl -n skaha-workload cluster-info": FakeResult(1, "", "connection refused"),
+        }
+    )
+    report = preflight.run_preflight(
+        namespace="skaha-workload",
+        command_exists_fn=lambda cmd: True,
+        run_cmd=run,
+    )
+    assert report["ok"] is True
+    assert report["checks"]["cluster-info"] is False
+    assert report["warnings"]
+    assert not report["errors"]
+
+
+def test_run_preflight_fails_when_namespace_missing() -> None:
     run = _runner(
         {
             "kubectl config current-context": FakeResult(0, "minikube\n"),
-            "kubectl cluster-info": FakeResult(1, "", "connection refused"),
+            "kubectl get namespace skaha-workload": FakeResult(1, "", "NotFound"),
         }
     )
     report = preflight.run_preflight(
@@ -81,7 +99,7 @@ def test_run_preflight_fails_on_cluster_unreachable() -> None:
         run_cmd=run,
     )
     assert report["ok"] is False
-    assert "cluster-info" in " ".join(report["errors"]).lower()
+    assert "namespace" in " ".join(report["errors"]).lower()
 
 
 def test_preflight_command_prints_verbose_inventory(monkeypatch, tmp_path) -> None:
@@ -97,6 +115,7 @@ def test_preflight_command_prints_verbose_inventory(monkeypatch, tmp_path) -> No
                 "access": {
                     "binary:kubectl": True,
                     "context": True,
+                    "namespace-exists": True,
                     "cluster-info": True,
                     "can-create-jobs": True,
                 },

From 8883ebf8430df60c2d24b046b0e1ac66a2d5248b Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Thu, 26 Mar 2026 14:57:02 -0700
Subject: [PATCH 07/11] feat(benchmark): introduce spawn mechanism parameter
 for job submission methods

---
 .../kueuer/src/kueuer/benchmarks/benchmark.py |  25 ++
 .../kueue/kueuer/src/kueuer/benchmarks/k8s.py | 231 ++++++++++++++++--
 .../kueuer/src/kueuer/lifecycle/commands.py   |   2 +
 .../kueuer/src/kueuer/lifecycle/suite.py      |   7 +
 .../kueuer/tests/test_spawn_mechanism_api.py  | 137 +++++++++++
 5 files changed, 388 insertions(+), 14 deletions(-)
 create mode 100644 configs/kueue/kueuer/tests/test_spawn_mechanism_api.py

diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py b/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py
index 55b75349..27886636 100644
--- a/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py
+++ b/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py
@@ -262,6 +262,7 @@ def experiment(
     apply_retries: int = 2,
     apply_backoff: float = 2.0,
     vm_memory_fraction: float = k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION,
+    spawn_mechanism: str = "kubectl",
 ) -> Dict[str, Any]:
     """Run a single experiment with the specified configuration.
 
@@ -316,6 +317,7 @@ def experiment(
         apply_retries=apply_retries,
         apply_backoff=apply_backoff,
         vm_memory_fraction=vm_memory_fraction,
+        spawn_mechanism=spawn_mechanism,
     )
 
     # Track jobs to completion and get timing statistics
@@ -398,6 +400,7 @@ def benchmark(
     apply_retries: int,
     apply_backoff: float,
     vm_memory_fraction: float,
+    spawn_mechanism: str,
 ) -> List[Dict[str, Any]]:
     """
     Run a complete benchmark comparing direct Kubernetes jobs vs Kueue jobs.
@@ -438,6 +441,7 @@ def benchmark(
             apply_retries=apply_retries,
             apply_backoff=apply_backoff,
             vm_memory_fraction=vm_memory_fraction,
+            spawn_mechanism=spawn_mechanism,
         )
         results.append(result)
 
@@ -464,6 +468,7 @@ def benchmark(
             apply_retries=apply_retries,
             apply_backoff=apply_backoff,
             vm_memory_fraction=vm_memory_fraction,
+            spawn_mechanism=spawn_mechanism,
         )
         results.append(kueue_result)
 
@@ -571,6 +576,11 @@ def performance(
         "--apply-backoff",
         help="Backoff base (seconds) between apply retries.",
     ),
+    spawn_mechanism: str = typer.Option(
+        "kubectl",
+        "--spawn-mechanism",
+        help="Job spawn mechanism to use: kubectl (apply) or api (client create).",
+    ),
     vm_memory_fraction: float = typer.Option(
         k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION,
         "--vm-memory-fraction",
@@ -640,6 +650,7 @@ def performance(
         apply_retries,
         apply_backoff,
     )
+    logger.info("Spawn   : %s", spawn_mechanism)
     logger.info("VM Frac  : %s", vm_memory_fraction)
 
     if not k8s.check(namespace, kueue, priority):
@@ -662,6 +673,7 @@ def performance(
         apply_retries=apply_retries,
         apply_backoff=apply_backoff,
         vm_memory_fraction=vm_memory_fraction,
+        spawn_mechanism=spawn_mechanism,
     )
     logger.info("Benchmark completed successfully.")
     logger.info("Results saved to %s", output)
@@ -756,6 +768,11 @@ def eviction(
         "--apply-backoff",
         help="Backoff base (seconds) between apply retries.",
     ),
+    spawn_mechanism: str = typer.Option(
+        "kubectl",
+        "--spawn-mechanism",
+        help="Job spawn mechanism to use: kubectl (apply) or api (client create).",
+    ),
 ):
     """Run a benchmark to test eviction behavior of Kueue in a packed cluster queue."""
     profile = _normalize_profile_name(profile)
@@ -809,6 +826,7 @@ def eviction(
         apply_retries,
         apply_backoff,
     )
+    logger.info("Spawn       : %s", spawn_mechanism)
     logger.info("K8s Resource : %s", resource_id)
 
     for priority in priorities:
@@ -851,6 +869,7 @@ def eviction(
             apply_chunk_size=apply_chunk_size,
             apply_retries=apply_retries,
             apply_backoff=apply_backoff,
+            spawn_mechanism=spawn_mechanism,
         )
 
     logger.info("All jobs launched successfully.")
@@ -1005,6 +1024,11 @@ def e2e(
         "--keep-artifacts/--no-keep-artifacts",
         help="Keep generated artifacts.",
     ),
+    spawn_mechanism: str = typer.Option(
+        "kubectl",
+        "--spawn-mechanism",
+        help="Job spawn mechanism to use: kubectl (apply) or api (client create).",
+    ),
 ) -> None:
     """Run the full benchmark workflow with automatic post-processing."""
     from kueuer.lifecycle import commands as lifecycle_commands
@@ -1041,6 +1065,7 @@ def e2e(
         observe_output_subdir=observe_output_subdir,
         skip_queue_apply=skip_queue_apply,
         skip_teardown=skip_teardown,
+        spawn_mechanism=spawn_mechanism,
     )
 
     typer.echo(f"e2e {'ok' if report['ok'] else 'failed'} for run {report['run_id']}")
diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py
index cba93875..f7260dfb 100644
--- a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py
+++ b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py
@@ -3,8 +3,9 @@
 import asyncio
 import copy
 import math
+import random
 from time import time
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, cast
 
 import aiofiles
 import aiofiles.os
@@ -19,6 +20,10 @@
 
 app = typer.Typer(help="Launch K8s Jobs")
 DEFAULT_STRESS_VM_MEMORY_FRACTION = 0.33
+DEFAULT_SPAWN_MECHANISM = "kubectl"
+
+# Cache API connectivity mode per process.
+_API_CLIENT_READY: bool = False
 
 
 def check(namespace: str, kueue: str, priority: str) -> bool:
@@ -284,6 +289,7 @@ async def apply(
         "apply_retries": 0,
         "manifest_apply_seconds": 0.0,
         "last_error": "",
+        "spawn_mechanism": "kubectl",
     }
     for start, end in chunk_ranges(count, chunk_size):
         report["chunks_total"] += 1
@@ -291,12 +297,7 @@ async def apply(
         async with aiofiles.tempfile.NamedTemporaryFile(
             delete=False, mode="w", suffix=".yaml"
         ) as temp:
-            for num in range(start, end):
-                manifest = copy.deepcopy(data)
-                name: str = f"{prefix}-{num}"
-                manifest["metadata"]["name"] = name
-                for container in manifest["spec"]["template"]["spec"]["containers"]:
-                    container["name"] = name
+            for manifest in render_job_manifests(data, prefix, start, end):
                 await temp.write(yaml.dump(manifest))
                 await temp.write("\n---\n")
         logger.debug("Applying %s", temp.name)
@@ -347,6 +348,199 @@ async def apply(
     return report
 
 
+def render_job_manifest(
+    template: Dict[Any, Any],
+    name: str,
+) -> Dict[Any, Any]:
+    """Return a deep-copied job manifest with name and container names set."""
+    manifest = copy.deepcopy(template)
+    manifest["metadata"]["name"] = name
+    for container in manifest["spec"]["template"]["spec"]["containers"]:
+        container["name"] = name
+    return manifest
+
+
+def render_job_manifests(
+    template: Dict[Any, Any],
+    prefix: str,
+    start: int,
+    end: int,
+) -> List[Dict[Any, Any]]:
+    """Render job manifests for indices in [start, end)."""
+    return [render_job_manifest(template, f"{prefix}-{num}") for num in range(start, end)]
+
+
+def _is_transient_api_error(error: ApiException) -> bool:
+    try:
+        status = int(getattr(error, "status", 0) or 0)
+    except Exception:  # noqa: BLE001
+        return False
+    return status in {429, 500, 502, 503, 504}
+
+
+def _api_preflight_check(namespace: str) -> None:
+    """Perform a lightweight namespace-scoped API check."""
+    v1 = client.CoreV1Api()
+    # Prefer a namespaced call to ensure auth scopes match workload permissions.
+    v1.list_namespaced_pod(namespace=namespace, limit=1)  # type: ignore[arg-type]
+
+
+def ensure_api_client_ready(namespace: str) -> None:
+    """Ensure Kubernetes Python client is configured (kubeconfig or incluster)."""
+    global _API_CLIENT_READY
+    if _API_CLIENT_READY:
+        return
+
+    errors: List[str] = []
+    try:
+        config.load_kube_config()
+        _api_preflight_check(namespace)
+        _API_CLIENT_READY = True
+        logger.info("Kubernetes API client configured using kubeconfig.")
+        return
+    except Exception as error:  # noqa: BLE001
+        errors.append(f"kubeconfig: {error}")
+
+    try:
+        config.load_incluster_config()
+        _api_preflight_check(namespace)
+        _API_CLIENT_READY = True
+        logger.info("Kubernetes API client configured using in-cluster service account.")
+        return
+    except Exception as error:  # noqa: BLE001
+        errors.append(f"incluster: {error}")
+
+    msg = "Unable to configure Kubernetes API client. " + "; ".join(errors)
+    raise RuntimeError(msg)
+
+
+async def _create_job_with_retries(
+    batch: client.BatchV1Api,
+    namespace: str,
+    manifest: Dict[Any, Any],
+    retries: int,
+    backoff_seconds: float,
+) -> Tuple[bool, str]:
+    """Create a job with transient retries. Returns (ok, error_message)."""
+    name = str(manifest.get("metadata", {}).get("name", ""))
+    for attempt in range(1, retries + 2):
+        try:
+            batch.create_namespaced_job(namespace=namespace, body=manifest)  # type: ignore[arg-type]
+            return True, ""
+        except ApiException as error:
+            if getattr(error, "status", None) == 409:
+                # Create-only semantics: treat existing job as a failure.
+                return False, f"{name}: already exists (409)"
+            if not _is_transient_api_error(error) or attempt > retries:
+                return False, f"{name}: {error}"
+            sleep_s = backoff_seconds * attempt
+            # Add jitter to avoid thundering herd.
+            sleep_s *= 0.8 + (0.4 * random.random())
+            await asyncio.sleep(sleep_s)
+        except Exception as error:  # noqa: BLE001
+            if attempt > retries:
+                return False, f"{name}: {error}"
+            sleep_s = backoff_seconds * attempt
+            sleep_s *= 0.8 + (0.4 * random.random())
+            await asyncio.sleep(sleep_s)
+    return False, f"{name}: unknown failure"
+
+
+async def apply_api(
+    data: Dict[Any, Any],
+    prefix: str,
+    count: int,
+    chunk_size: int = 25,
+    retries: int = 2,
+    backoff_seconds: float = 2.0,
+    api_concurrency: int = 10,
+    namespace: str = "default",
+) -> Dict[str, Any]:
+    """Create Kubernetes Jobs using the Python API client."""
+    ensure_api_client_ready(namespace=namespace)
+    batch = client.BatchV1Api()
+    semaphore = asyncio.Semaphore(max(int(api_concurrency), 1))
+
+    now = time()
+    report: Dict[str, Any] = {
+        "requested_jobs": count,
+        "chunk_size": chunk_size,
+        "chunks_total": 0,
+        "chunks_succeeded": 0,
+        "chunks_failed": 0,
+        "jobs_applied": 0,
+        "jobs_failed_to_apply": 0,
+        "apply_attempts": 0,
+        "apply_retries": 0,
+        "manifest_apply_seconds": 0.0,
+        "last_error": "",
+        "spawn_mechanism": "api",
+        "api_concurrency": api_concurrency,
+    }
+
+    async def _guarded_create(manifest: Dict[Any, Any]) -> Tuple[bool, str]:
+        async with semaphore:
+            ok, err = await _create_job_with_retries(
+                batch=batch,
+                namespace=namespace,
+                manifest=manifest,
+                retries=retries,
+                backoff_seconds=backoff_seconds,
+            )
+            report["apply_attempts"] += 1
+            if not ok and err:
+                report["last_error"] = err
+            return ok, err
+
+    for start, end in chunk_ranges(count, chunk_size):
+        report["chunks_total"] += 1
+        manifests = render_job_manifests(data, prefix, start, end)
+        results = await asyncio.gather(*[_guarded_create(m) for m in manifests])
+        failures = [err for ok, err in results if not ok]
+        if failures:
+            report["chunks_failed"] += 1
+            report["jobs_failed_to_apply"] += len(failures)
+            report["last_error"] = failures[0]
+        else:
+            report["chunks_succeeded"] += 1
+            report["jobs_applied"] += len(results)
+
+    report["manifest_apply_seconds"] = time() - now
+    logger.info("Took %ss to submit jobs via API", report["manifest_apply_seconds"])
+    return report
+
+
+async def submit_jobs(
+    template: Dict[Any, Any],
+    prefix: str,
+    jobs: int,
+    spawn_mechanism: str,
+    namespace: str,
+    apply_chunk_size: int,
+    apply_retries: int,
+    apply_backoff: float,
+) -> Dict[str, Any]:
+    """Submit rendered jobs using the selected spawn mechanism."""
+    if spawn_mechanism == "api":
+        return await apply_api(
+            template,
+            prefix,
+            jobs,
+            chunk_size=apply_chunk_size,
+            retries=apply_retries,
+            backoff_seconds=apply_backoff,
+            namespace=namespace,
+        )
+    return await apply(
+        template,
+        prefix,
+        jobs,
+        chunk_size=apply_chunk_size,
+        retries=apply_retries,
+        backoff_seconds=apply_backoff,
+    )
+
+
 @app.command("run")
 def run(
     filepath: str = (
@@ -423,6 +617,13 @@ def run(
             ),
         )
     ),
+    spawn_mechanism: str = (
+        typer.Option(
+            DEFAULT_SPAWN_MECHANISM,
+            "--spawn-mechanism",
+            help="Job spawn mechanism to use: kubectl (apply) or api (client create).",
+        )
+    ),
 ) -> Dict[str, Any]:
     """Run jobs to stress k8s cluster."""
     ram_mb: float = ram * 1024.0
@@ -480,13 +681,15 @@ def run(
     loop = asyncio.get_event_loop()
     asyncio.set_event_loop(loop)
     result = loop.run_until_complete(
-        apply(
-            job,
-            prefix,
-            jobs,
-            chunk_size=apply_chunk_size,
-            retries=apply_retries,
-            backoff_seconds=apply_backoff,
+        submit_jobs(
+            template=job,
+            prefix=prefix,
+            jobs=jobs,
+            spawn_mechanism=spawn_mechanism,
+            namespace=namespace,
+            apply_chunk_size=apply_chunk_size,
+            apply_retries=apply_retries,
+            apply_backoff=apply_backoff,
         )
     )
     return result
diff --git a/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py b/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py
index 7ec1d99d..e45feed0 100644
--- a/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py
+++ b/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py
@@ -423,6 +423,7 @@ def run_benchmark_e2e(
     observe_output_subdir: str = DEFAULT_OBSERVATION_SUBDIR,
     skip_queue_apply: bool = False,
     skip_teardown: bool = False,
+    spawn_mechanism: str = "kubectl",
 ) -> Dict[str, Any]:
     """Run the internal benchmark end-to-end workflow and persist its manifest."""
     effective = run_id or default_run_id()
@@ -452,6 +453,7 @@ def run_benchmark_e2e(
             observe=observe,
             observe_interval_seconds=observe_interval_seconds,
             observe_output_subdir=observe_output_subdir,
+            spawn_mechanism=spawn_mechanism,
         ),
         collect_fn=lambda: collect_outputs(
             performance_csv=_resolve_suite_path(
diff --git a/configs/kueue/kueuer/src/kueuer/lifecycle/suite.py b/configs/kueue/kueuer/src/kueuer/lifecycle/suite.py
index 3d1383a9..397b8d5c 100644
--- a/configs/kueue/kueuer/src/kueuer/lifecycle/suite.py
+++ b/configs/kueue/kueuer/src/kueuer/lifecycle/suite.py
@@ -19,6 +19,7 @@ def _suite_commands(
     localqueue: str,
     priority: str,
     artifacts_dir: str,
+    spawn_mechanism: str = "kubectl",
 ) -> List[str]:
     """Build the command transcript recorded in lifecycle suite reports."""
     return [
@@ -31,6 +32,7 @@ def _suite_commands(
             f"--ram {performance_options['ram']} "
             f"--storage {performance_options['storage']} "
             f"--wait {performance_options['wait']} "
+            f"--spawn-mechanism {spawn_mechanism} "
             f"-n {namespace} -k {localqueue} -p {priority} "
             f"-o {artifacts_dir}"
         ),
@@ -42,6 +44,7 @@ def _suite_commands(
             f"--cores {eviction_options['cores']} "
             f"--ram {eviction_options['ram']} "
             f"--storage {eviction_options['storage']} "
+            f"--spawn-mechanism {spawn_mechanism} "
             f"-n {namespace} -k {localqueue} "
             "-p low -p medium -p high "
             f"-o {artifacts_dir}"
@@ -61,6 +64,7 @@ def run_benchmark_suite(
     observe: bool = False,
     observe_interval_seconds: float = 5.0,
     observe_output_subdir: str = "observe",
+    spawn_mechanism: str = "kubectl",
     collector_factory: Callable[..., Any] = ObservationCollector,
     scenario_apply_fn: Callable[..., Dict[str, Any]] = apply_scenario,
     scenario_restore_fn: Callable[..., Dict[str, Any]] = restore_scenario,
@@ -87,6 +91,7 @@ def run_benchmark_suite(
         localqueue=localqueue,
         priority=priority,
         artifacts_dir=artifacts_dir,
+        spawn_mechanism=spawn_mechanism,
     )
     with tempfile.TemporaryDirectory(prefix="kueuer-scenario-") as scenario_tmp:
         scenario_context = scenario_apply_fn(
@@ -145,6 +150,7 @@ def run_benchmark_suite(
                 apply_chunk_size=25,
                 apply_retries=2,
                 apply_backoff=2.0,
+                spawn_mechanism=spawn_mechanism,
             )
             eviction_runner(
                 filepath=DEFAULT_JOBSPEC_FILEPATH,
@@ -162,6 +168,7 @@ def run_benchmark_suite(
                 apply_chunk_size=25,
                 apply_retries=2,
                 apply_backoff=2.0,
+                spawn_mechanism=spawn_mechanism,
             )
         finally:
             if collector is not None:
diff --git a/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py b/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py
new file mode 100644
index 00000000..0ef05a8c
--- /dev/null
+++ b/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py
@@ -0,0 +1,137 @@
+import asyncio
+
+import pytest
+from kubernetes.client.rest import ApiException
+
+from kueuer.benchmarks import k8s
+
+
+def _template() -> dict:
+    return {
+        "apiVersion": "batch/v1",
+        "kind": "Job",
+        "metadata": {"name": "template", "namespace": "ns"},
+        "spec": {
+            "template": {
+                "spec": {
+                    "containers": [{"name": "template", "image": "busybox"}],
+                    "restartPolicy": "Never",
+                }
+            }
+        },
+    }
+
+
+def test_render_job_manifests_sets_names_and_container_names() -> None:
+    manifests = k8s.render_job_manifests(_template(), prefix="pfx", start=0, end=3)
+    assert [m["metadata"]["name"] for m in manifests] == ["pfx-0", "pfx-1", "pfx-2"]
+    assert [
+        m["spec"]["template"]["spec"]["containers"][0]["name"] for m in manifests
+    ] == ["pfx-0", "pfx-1", "pfx-2"]
+
+
+def test_is_transient_api_error_matches_expected_statuses() -> None:
+    assert k8s._is_transient_api_error(ApiException(status=502)) is True
+    assert k8s._is_transient_api_error(ApiException(status=500)) is True
+    assert k8s._is_transient_api_error(ApiException(status=429)) is True
+    assert k8s._is_transient_api_error(ApiException(status=403)) is False
+    assert k8s._is_transient_api_error(ApiException(status=409)) is False
+
+
+def test_ensure_api_client_ready_falls_back_to_incluster(monkeypatch) -> None:
+    k8s._API_CLIENT_READY = False
+
+    def fail_kubeconfig() -> None:
+        raise RuntimeError("no kubeconfig")
+
+    called = {"incluster": 0, "check": 0}
+
+    def ok_incluster() -> None:
+        called["incluster"] += 1
+
+    def ok_check(namespace: str) -> None:
+        called["check"] += 1
+
+    monkeypatch.setattr(k8s.config, "load_kube_config", fail_kubeconfig)
+    monkeypatch.setattr(k8s.config, "load_incluster_config", ok_incluster)
+    monkeypatch.setattr(k8s, "_api_preflight_check", ok_check)
+
+    k8s.ensure_api_client_ready(namespace="ns")
+    assert called["incluster"] == 1
+    assert called["check"] == 1
+
+
+def test_submit_jobs_dispatches_to_kubectl(monkeypatch) -> None:
+    async def fake_apply(*args, **kwargs):
+        return {"spawn_mechanism": "kubectl"}
+
+    async def fake_apply_api(*args, **kwargs):
+        return {"spawn_mechanism": "api"}
+
+    monkeypatch.setattr(k8s, "apply", fake_apply)
+    monkeypatch.setattr(k8s, "apply_api", fake_apply_api)
+
+    result = asyncio.run(
+        k8s.submit_jobs(
+            template=_template(),
+            prefix="pfx",
+            jobs=1,
+            spawn_mechanism="kubectl",
+            namespace="ns",
+            apply_chunk_size=1,
+            apply_retries=0,
+            apply_backoff=0.0,
+        )
+    )
+    assert result["spawn_mechanism"] == "kubectl"
+
+
+def test_submit_jobs_dispatches_to_api(monkeypatch) -> None:
+    async def fake_apply(*args, **kwargs):
+        return {"spawn_mechanism": "kubectl"}
+
+    async def fake_apply_api(*args, **kwargs):
+        return {"spawn_mechanism": "api"}
+
+    monkeypatch.setattr(k8s, "apply", fake_apply)
+    monkeypatch.setattr(k8s, "apply_api", fake_apply_api)
+
+    result = asyncio.run(
+        k8s.submit_jobs(
+            template=_template(),
+            prefix="pfx",
+            jobs=1,
+            spawn_mechanism="api",
+            namespace="ns",
+            apply_chunk_size=1,
+            apply_retries=0,
+            apply_backoff=0.0,
+        )
+    )
+    assert result["spawn_mechanism"] == "api"
+
+
+@pytest.mark.parametrize("status", [500, 502, 503])
+def test_create_job_with_retries_retries_transient_errors(monkeypatch, status: int) -> None:
+    attempts = {"count": 0}
+
+    class FakeBatch:
+        def create_namespaced_job(self, namespace, body):
+            attempts["count"] += 1
+            if attempts["count"] < 2:
+                raise ApiException(status=status)
+            return object()
+
+    ok, err = asyncio.run(
+        k8s._create_job_with_retries(
+            batch=FakeBatch(),
+            namespace="ns",
+            manifest={"metadata": {"name": "job1"}},
+            retries=3,
+            backoff_seconds=0.0,
+        )
+    )
+    assert ok is True
+    assert err == ""
+    assert attempts["count"] == 2
+

From d42fea0330490783d1754f24a86387b1fd5a95aa Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Mon, 30 Mar 2026 07:39:53 -0700
Subject: [PATCH 08/11] feat(resources): enhance GPU resource handling with
 detailed kind resolution and updated return structure

---
 configs/kueue/kueuer/src/kueuer/resources.py | 134 ++++++++++++++++---
 1 file changed, 114 insertions(+), 20 deletions(-)

diff --git a/configs/kueue/kueuer/src/kueuer/resources.py b/configs/kueue/kueuer/src/kueuer/resources.py
index bae896c2..de4a78a0 100644
--- a/configs/kueue/kueuer/src/kueuer/resources.py
+++ b/configs/kueue/kueuer/src/kueuer/resources.py
@@ -11,8 +11,12 @@
 Totals cluster resources across Kubernetes nodes filtered by name regex.
 
 - Deduplicates nodes by UID (so overlapping regex lists don't double count).
-- By default totals from node .status.capacity; use --field allocatable to sum .status.allocatable instead.
-- Returns a mapping: dict[str, dict[str, str]] with { "value": <string>, "unit": <string> }.
+- By default totals from node .status.capacity; use --field allocatable to sum
+  .status.allocatable instead.
+- Returns a mapping: ``cpu`` uses ``{ "value", "unit": "cores" }``; ``memory`` and
+  ``ephemeral-storage`` use values in **Gi** (1024³ bytes) with ``unit: "Gi"``.
+- For ``nvidia.com/gpu`` and ``amd.com/gpu``: ``{ "kind", "value", "unit": "count" }``
+  where ``kind`` comes from node labels (e.g. ``nvidia.com/gpu.product``).
 - If a resource does not exist on any matched node, it is **omitted**.
 
 Examples:
@@ -27,10 +31,9 @@
 import sys
 from dataclasses import dataclass
 from decimal import Decimal, getcontext
-from typing import Annotated, Dict, Iterable, List, Optional, Sequence
+from typing import Annotated, Any, Dict, Iterable, List, Optional, Sequence, Union, cast
 
 import typer
-from kubernetes import client, config
 from kubernetes.client import CoreV1Api, V1Node
 from kubernetes.utils.quantity import parse_quantity
 from pydantic import BaseModel, Field, RootModel, ValidationError, field_validator
@@ -60,7 +63,21 @@ class ResourceItem(BaseModel):
     )
 
 
-class ResourceMap(RootModel[Dict[str, ResourceItem]]):
+class GpuResourceItem(BaseModel):
+    """Cluster totals for a GPU resource."""
+
+    kind: str = Field(
+        ...,
+        description=(
+            "Product name if the cluster uses a single model; empty if unknown; "
+            "'mixed' if multiple models."
+        ),
+    )
+    value: str = Field(..., description="Total GPU count.")
+    unit: Literal["count"] = "count"
+
+
+class ResourceMap(RootModel[Dict[str, Union[ResourceItem, GpuResourceItem]]]):
     """Dynamic resource map so unavailable resources can be omitted."""
 
 
@@ -92,6 +109,8 @@ class TotalsAcc:
     ephemeral_bytes: Optional[int]
     nvidia_gpu: Optional[int]
     amd_gpu: Optional[int]
+    nvidia_by_kind: Optional[Dict[str, int]]
+    amd_by_kind: Optional[Dict[str, int]]
 
 
 # =========================
@@ -132,6 +151,53 @@ def _collect_nodes(v1: CoreV1Api, patterns: Optional[Sequence[str]]) -> List[V1N
     return list(dedup.values())
 
 
+def _nvidia_gpu_kind_from_labels(labels: Dict[str, str]) -> str:
+    """Resolve GPU product name from common NVIDIA node labels."""
+    for key in (
+        "nvidia.com/gpu.product",
+        "nvidia.com/gfd.gpu.product",
+    ):
+        v = labels.get(key)
+        if v:
+            return str(v).strip()
+    return ""
+
+
+def _amd_gpu_kind_from_labels(labels: Dict[str, str]) -> str:
+    """Resolve GPU product name from common AMD node labels."""
+    for key in (
+        "amd.com/gpu.product",
+        "amd.com/gpu.family",
+    ):
+        v = labels.get(key)
+        if v:
+            return str(v).strip()
+    return ""
+
+
+def _summary_gpu_kind(by_kind: Dict[str, int]) -> str:
+    """Single model name, empty if unknown, or 'mixed' if multiple distinct models."""
+    active = {k: v for k, v in by_kind.items() if v > 0}
+    if not active:
+        return ""
+    distinct_nonempty = {k for k in active if k}
+    if not distinct_nonempty:
+        return ""
+    if len(distinct_nonempty) == 1:
+        return next(iter(distinct_nonempty))
+    return "mixed"
+
+
+def _bytes_to_gi_str(total_bytes: int) -> str:
+    """Convert a byte total to a decimal string in Gi (1 Gi = 1024³ bytes)."""
+    # Use integer 1024**3 so the divisor is exact (Decimal(1024)**3 can round).
+    gi = Decimal(total_bytes) / Decimal(1024**3)
+    s = format(gi, "f")
+    if "." in s:
+        s = s.rstrip("0").rstrip(".")
+    return s
+
+
 def _get_field_map(node: V1Node, field: str) -> Dict[str, str]:
     """
     Extract either .status.capacity or .status.allocatable as a plain dict[str, str].
@@ -193,11 +259,14 @@ def _sum_resources(nodes: List[V1Node], field: str) -> TotalsAcc:
     eph_vals: List[str] = []
     nvidia_vals: List[str] = []
     amd_vals: List[str] = []
+    nvidia_by_kind: Dict[str, int] = {}
+    amd_by_kind: Dict[str, int] = {}
 
     for n in nodes:
         m = _get_field_map(n, field)
         if not m:
             continue
+        labels = (n.metadata.labels or {}) if n.metadata else {}
         if "cpu" in m:
             cpu_vals.append(m["cpu"])
         if "memory" in m:
@@ -206,8 +275,16 @@ def _sum_resources(nodes: List[V1Node], field: str) -> TotalsAcc:
             eph_vals.append(m["ephemeral-storage"])
         if "nvidia.com/gpu" in m:
             nvidia_vals.append(m["nvidia.com/gpu"])
+            nk = _nvidia_gpu_kind_from_labels(labels)
+            nvidia_by_kind[nk] = nvidia_by_kind.get(nk, 0) + int(
+                parse_quantity(m["nvidia.com/gpu"])
+            )
         if "amd.com/gpu" in m:
             amd_vals.append(m["amd.com/gpu"])
+            ak = _amd_gpu_kind_from_labels(labels)
+            amd_by_kind[ak] = amd_by_kind.get(ak, 0) + int(
+                parse_quantity(m["amd.com/gpu"])
+            )
 
     # Convert lists → optional totals (None means "omit key")
     def _try_sum(dec_sum_fn, vals):
@@ -228,6 +305,8 @@ def _try_sum(dec_sum_fn, vals):
         ephemeral_bytes=int(eph_total) if eph_total is not None else None,
         nvidia_gpu=nvidia_total,
         amd_gpu=amd_total,
+        nvidia_by_kind=nvidia_by_kind if nvidia_total is not None else None,
+        amd_by_kind=amd_by_kind if amd_total is not None else None,
     )
 
 
@@ -238,21 +317,27 @@ def _try_sum(dec_sum_fn, vals):
 
 def total(
     patterns: Optional[List[str]] = None, field: str = "capacity"
-) -> Dict[str, Dict[str, str]]:
+) -> Dict[str, Any]:
     """
-    Calculate total cluster resources across nodes matching any of the given regex patterns.
+    Calculate total cluster resources across nodes matching regex patterns.
 
     Args:
-        patterns: List of regex strings to match node names. If None or empty, includes all nodes.
-        field:    Which field to sum: "capacity" (default) or "allocatable".
+        patterns: Regex strings for node names. If None or empty, includes all nodes.
+        field: Which field to sum: "capacity" (default) or "allocatable".
 
     Returns:
-        dict[str, dict[str, str]] mapping resource name -> {"value": <str>, "unit": <str>}
-        Only includes resources that exist on at least one matched node.
+        Mapping of resource name to detail dicts. Memory and ephemeral-storage are
+        reported in Gi with ``unit`` ``"Gi"``. GPU entries include ``kind``, ``value``,
+        and ``unit`` ``"count"``. Only includes resources present on at least one node.
     """
     # Validate inputs with Pydantic
+    if field not in ("capacity", "allocatable"):
+        raise ValueError('field must be "capacity" or "allocatable"')
     try:
-        cfg = Settings(patterns=patterns, field=field)
+        cfg = Settings(
+            patterns=patterns,
+            field=cast(Literal["capacity", "allocatable"], field),
+        )
     except ValidationError as e:
         raise ValueError(str(e)) from e
 
@@ -261,20 +346,30 @@ def total(
     acc = _sum_resources(nodes, cfg.field)
 
     # Build a dynamic map (omit unavailable resources)
-    result: Dict[str, ResourceItem] = {}
+    result: Dict[str, Union[ResourceItem, GpuResourceItem]] = {}
 
     if acc.cpu_cores is not None:
         result["cpu"] = ResourceItem(value=f"{acc.cpu_cores}", unit="cores")
     if acc.memory_bytes is not None:
-        result["memory"] = ResourceItem(value=f"{acc.memory_bytes}", unit="bytes")
+        result["memory"] = ResourceItem(
+            value=_bytes_to_gi_str(acc.memory_bytes),
+            unit="Gi",
+        )
     if acc.ephemeral_bytes is not None:
         result["ephemeral-storage"] = ResourceItem(
-            value=f"{acc.ephemeral_bytes}", unit="bytes"
+            value=_bytes_to_gi_str(acc.ephemeral_bytes),
+            unit="Gi",
+        )
+    if acc.nvidia_gpu is not None and acc.nvidia_by_kind is not None:
+        result["nvidia.com/gpu"] = GpuResourceItem(
+            kind=_summary_gpu_kind(acc.nvidia_by_kind),
+            value=str(acc.nvidia_gpu),
+        )
+    if acc.amd_gpu is not None and acc.amd_by_kind is not None:
+        result["amd.com/gpu"] = GpuResourceItem(
+            kind=_summary_gpu_kind(acc.amd_by_kind),
+            value=str(acc.amd_gpu),
         )
-    if acc.nvidia_gpu is not None:
-        result["nvidia.com/gpu"] = ResourceItem(value=f"{acc.nvidia_gpu}", unit="count")
-    if acc.amd_gpu is not None:
-        result["amd.com/gpu"] = ResourceItem(value=f"{acc.amd_gpu}", unit="count")
 
     # Validate and dump with Pydantic
     return ResourceMap(result).model_dump()
@@ -327,7 +422,6 @@ def resources(
         if scale != 1.0:
             console.print(f"Scaling by {scale * 100}%...")
             for _k, v in result.items():
-                # Limit to Decimal precision to 3 decimal places
                 v["value"] = str(Decimal(v["value"]) * Decimal(scale))
             console.print(result, width=120)
     except Exception as e:

From 94c6c51cecd3df4e0512ae091e2b17a57ae77146 Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Mon, 30 Mar 2026 07:40:42 -0700
Subject: [PATCH 09/11] feat(benchmark): improve Kubernetes client
 configuration handling and enhance error logging for pod and job outcome
 collection

---
 .../kueue/kueuer/src/kueuer/benchmarks/k8s.py |  73 +++++++--
 .../kueuer/src/kueuer/benchmarks/plot.py      | 133 ++++++++++++++-
 configs/kueue/kueuer/src/kueuer/utils/io.py   |   3 +
 .../tests/test_k8s_phase2_resilience.py       |  37 ++++-
 .../tests/test_plot_filters_and_semantics.py  |  17 ++
 configs/kueue/kueuer/tests/test_resources.py  | 154 ++++++++++++++++++
 .../kueuer/tests/test_spawn_mechanism_api.py  |   1 -
 7 files changed, 397 insertions(+), 21 deletions(-)
 create mode 100644 configs/kueue/kueuer/tests/test_resources.py

diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py
index f7260dfb..b75c9cdc 100644
--- a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py
+++ b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py
@@ -205,11 +205,38 @@ def summarize_pod_statuses(pods: List[Any]) -> Dict[str, int]:
     return summary
 
 
+def _load_client_config_best_effort() -> bool:
+    """Try kubeconfig first, then in-cluster config."""
+    try:
+        config.load_kube_config()
+        return True
+    except Exception:  # noqa: BLE001
+        try:
+            config.load_incluster_config()
+            return True
+        except Exception:  # noqa: BLE001
+            return False
+
+
 def collect_pod_outcomes(namespace: str, prefix: str) -> Dict[str, int]:
     """Collect pod outcome summary for jobs with the given name prefix."""
-    config.load_kube_config()
+    empty = summarize_pod_statuses([])
+    if not _load_client_config_best_effort():
+        logger.warning("Unable to configure Kubernetes client while collecting pod outcomes.")
+        return empty
+
     v1 = client.CoreV1Api()
-    pods = v1.list_namespaced_pod(namespace=namespace).items
+    try:
+        pod_list = v1.list_namespaced_pod(namespace=namespace)
+        pods = list(getattr(pod_list, "items", None) or [])
+    except Exception as error:  # noqa: BLE001
+        logger.warning(
+            "Unable to collect pod outcomes in namespace %s: %s",
+            namespace,
+            error,
+        )
+        return empty
+
     selected = [
         pod
         for pod in pods
@@ -220,20 +247,34 @@ def collect_pod_outcomes(namespace: str, prefix: str) -> Dict[str, int]:
 
 def collect_job_outcomes(namespace: str, prefix: str) -> Dict[str, int]:
     """Collect job outcome counters for jobs with the given name prefix."""
-    config.load_kube_config()
+    empty: Dict[str, int] = {
+        "jobs_total": 0,
+        "jobs_succeeded": 0,
+        "jobs_failed": 0,
+        "jobs_active": 0,
+    }
+    if not _load_client_config_best_effort():
+        logger.warning("Unable to configure Kubernetes client while collecting job outcomes.")
+        return empty
+
     batch_v1 = client.BatchV1Api()
-    jobs = batch_v1.list_namespaced_job(namespace=namespace).items
+    try:
+        job_list = batch_v1.list_namespaced_job(namespace=namespace)
+        jobs = list(getattr(job_list, "items", None) or [])
+    except Exception as error:  # noqa: BLE001
+        logger.warning(
+            "Unable to collect job outcomes in namespace %s: %s",
+            namespace,
+            error,
+        )
+        return empty
+
     selected = [
         job
         for job in jobs
         if (job.metadata and job.metadata.name and job.metadata.name.startswith(prefix))
     ]
-    outcomes: Dict[str, int] = {
-        "jobs_total": len(selected),
-        "jobs_succeeded": 0,
-        "jobs_failed": 0,
-        "jobs_active": 0,
-    }
+    outcomes = {**empty, "jobs_total": len(selected)}
     for job in selected:
         outcomes["jobs_succeeded"] += int(getattr(job.status, "succeeded", 0) or 0)
         outcomes["jobs_failed"] += int(getattr(job.status, "failed", 0) or 0)
@@ -244,9 +285,11 @@ def collect_job_outcomes(namespace: str, prefix: str) -> Dict[str, int]:
 def kueue_controller_restarts(namespace: str = "kueue-system") -> int:
     """Return aggregate restart count of kueue-system pods."""
     try:
-        config.load_kube_config()
+        if not _load_client_config_best_effort():
+            raise RuntimeError("unable to configure kubernetes client")
         v1 = client.CoreV1Api()
-        pods = v1.list_namespaced_pod(namespace=namespace).items
+        pod_list = v1.list_namespaced_pod(namespace=namespace)
+        pods = list(getattr(pod_list, "items", None) or [])
         total = 0
         for pod in pods:
             for status in getattr(pod.status, "container_statuses", None) or []:
@@ -288,10 +331,12 @@ async def apply(
         "apply_attempts": 0,
         "apply_retries": 0,
         "manifest_apply_seconds": 0.0,
+        "chunk_spawn_seconds": [],
         "last_error": "",
         "spawn_mechanism": "kubectl",
     }
     for start, end in chunk_ranges(count, chunk_size):
+        chunk_start = time()
         report["chunks_total"] += 1
         chunk_jobs = end - start
         async with aiofiles.tempfile.NamedTemporaryFile(
@@ -343,6 +388,7 @@ async def apply(
             await temp.close()
             await aiofiles.os.remove(str(temp.name))
             logger.debug("Deleted %s", temp.name)
+        report["chunk_spawn_seconds"].append(time() - chunk_start)
     report["manifest_apply_seconds"] = time() - now
     logger.info("Took %ss to apply k8s manifest", report["manifest_apply_seconds"])
     return report
@@ -473,6 +519,7 @@ async def apply_api(
         "apply_attempts": 0,
         "apply_retries": 0,
         "manifest_apply_seconds": 0.0,
+        "chunk_spawn_seconds": [],
         "last_error": "",
         "spawn_mechanism": "api",
         "api_concurrency": api_concurrency,
@@ -493,6 +540,7 @@ async def _guarded_create(manifest: Dict[Any, Any]) -> Tuple[bool, str]:
             return ok, err
 
     for start, end in chunk_ranges(count, chunk_size):
+        chunk_start = time()
         report["chunks_total"] += 1
         manifests = render_job_manifests(data, prefix, start, end)
         results = await asyncio.gather(*[_guarded_create(m) for m in manifests])
@@ -504,6 +552,7 @@ async def _guarded_create(manifest: Dict[Any, Any]) -> Tuple[bool, str]:
         else:
             report["chunks_succeeded"] += 1
             report["jobs_applied"] += len(results)
+        report["chunk_spawn_seconds"].append(time() - chunk_start)
 
     report["manifest_apply_seconds"] = time() - now
     logger.info("Took %ss to submit jobs via API", report["manifest_apply_seconds"])
diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/plot.py b/configs/kueue/kueuer/src/kueuer/benchmarks/plot.py
index db1732c2..ceae2d1d 100644
--- a/configs/kueue/kueuer/src/kueuer/benchmarks/plot.py
+++ b/configs/kueue/kueuer/src/kueuer/benchmarks/plot.py
@@ -1,7 +1,9 @@
+import json
 from pathlib import Path
-from typing import Dict, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
 import seaborn as sns
 import typer
@@ -69,6 +71,28 @@ def _finalize_plot(
     plt.close(fig)
 
 
+def _parse_chunk_spawn_seconds(raw: Any) -> List[float]:
+    """Parse submission_chunk_spawn_seconds from a CSV cell (JSON string or list).
+
+    Anything unusable (None, NaN, blank/invalid JSON, non-list JSON, or a list
+    holding non-numeric entries) yields ``[]`` instead of raising.
+    """
+    if isinstance(raw, str):
+        text = raw.strip()
+        if not text:
+            return []
+        try:
+            raw = json.loads(text)
+        except json.JSONDecodeError:
+            return []
+    if not isinstance(raw, list):
+        return []
+    try:
+        return [float(x) for x in raw]
+    except (TypeError, ValueError):  # e.g. [1, "x"] or [null]
+        return []
+
+
 def _annotate_empty(ax: plt.Axes, message: str) -> None:
     ax.text(
         0.5,
@@ -93,14 +117,49 @@ def _style_axis(ax: plt.Axes, title: str, ylabel: str) -> None:
         ax.spines[spine].set_visible(False)
 
 
+def _maybe_rotate_job_count_xlabels(ax: plt.Axes, n_distinct: int) -> None:
+    """Tilt and right-align X tick labels when distinct job counts would overlap."""
+    if n_distinct <= 8:
+        return
+
+    # Steeper tilt and smaller font once more than 12 labels share the axis.
+    angle, font_size = (42, 9) if n_distinct > 12 else (22, 10)
+    ax.tick_params(axis="x", labelrotation=angle, labelsize=font_size)
+    for tick_label in ax.get_xticklabels():
+        tick_label.set_horizontalalignment("right")
+
+
+def _format_job_count_k(value: float, _pos: Optional[int] = None) -> str:
+    """Compact linear-scale labels: 500, 1k, 2k, 3.5k, 10k (``_pos`` is unused)."""
+    if not np.isfinite(value):  # NaN/inf ticks get an empty label
+        return ""
+    v = float(value)
+    if abs(v) >= 1000:
+        k = v / 1000.0
+        if abs(k - round(k)) < 1e-9:  # whole thousands: "2k", not "2.0k"
+            return f"{int(round(k))}k"
+        text = f"{k:.2f}".rstrip("0").rstrip(".")
+        return f"{text}k"
+    return f"{int(round(v))}"
+
+
 def _job_count_ticks(ax: plt.Axes, values: pd.Series) -> None:
+    """Linear X axis for job counts: bounded tick count and compact k-style labels."""
     unique = sorted({int(value) for value in values.dropna().tolist()})
     if not unique:
         return
-    if len(unique) >= 4 and max(unique) / min(unique) >= 4:
-        ax.set_xscale("log")
-    ax.set_xticks(unique)
-    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter("{x:,.0f}"))
+    lo_f, hi_f = float(min(unique)), float(max(unique))
+    span = hi_f - lo_f if hi_f > lo_f else max(hi_f * 0.05, 1.0)
+    pad = max(span * 0.02, 1.0)
+    ax.set_xlim(lo_f - pad, hi_f + pad)
+
+    ax.set_xscale("linear")
+    ax.xaxis.set_major_locator(
+        ticker.MaxNLocator(nbins=7, min_n_ticks=4, integer=True, prune="both")
+    )
+    ax.xaxis.set_major_formatter(ticker.FuncFormatter(_format_job_count_k))
+
+    _maybe_rotate_job_count_xlabels(ax, len(unique))
 
 
 def _summary_by_count(df: pd.DataFrame, metric: str) -> pd.DataFrame:
@@ -232,7 +291,8 @@ def _plot_performance_overview(
     output_dir: Optional[str],
     show: bool,
 ) -> None:
-    fig, axes = plt.subplots(2, 2, figsize=(15, 10), sharex=False)
+    n_counts = len({int(x) for x in df["job_count"].dropna().unique().tolist()})
+    fig, axes = plt.subplots(2, 2, figsize=(16, 10.2), sharex=False)
     _overview_panel(
         axes[0, 0],
         df,
@@ -278,9 +338,69 @@ def _plot_performance_overview(
         fontweight="bold",
     )
     fig.tight_layout()
+    if n_counts > 8:
+        fig.subplots_adjust(bottom=0.12, hspace=0.28, wspace=0.22)
     _finalize_plot(fig, "performance_overview.png", output_dir, show)
 
 
+def _plot_spawn_time_by_job_count(
+    df: pd.DataFrame,
+    output_dir: Optional[str],
+    show: bool,
+) -> None:
+    """Single chart: job count on X, spawn time on Y, grouped bars for Direct vs Kueue."""
+    col_total = "submission_manifest_apply_seconds"
+    if col_total not in df.columns:
+        return  # column absent: no chart is produced
+
+    counts = sorted({int(x) for x in df["job_count"].dropna().unique().tolist()})
+    if not counts:
+        return
+
+    fig_w = max(9.0, 0.85 * len(counts) + 5.0)  # widens with the number of groups, min 9
+    fig, ax = plt.subplots(figsize=(fig_w, 5.2))
+    x = np.arange(len(counts), dtype=float)  # evenly spaced slots, one per job count
+    width = min(0.36, 0.8 / 2.2)  # always 0.36 (0.8 / 2.2 is ~0.364)
+
+    for use_kueue in (False, True):
+        heights: List[float] = []
+        for jc in counts:
+            sub = df[(df["job_count"] == jc) & (df["use_kueue"] == use_kueue)][col_total].dropna()
+            if sub.empty:
+                heights.append(float("nan"))  # no rows for this mode/count
+            else:
+                heights.append(float(sub.astype(float).median()))  # median of matching rows
+        offset = -width / 2 if use_kueue is False else width / 2  # False left, True right
+        ax.bar(
+            x + offset,
+            heights,
+            width,
+            label=MODE_LABELS[use_kueue],
+            color=PALETTE[use_kueue],
+            edgecolor="white",
+            linewidth=0.6,
+            zorder=2,
+        )
+
+    ax.set_xticks(x)
+    ax.set_xticklabels([_format_job_count_k(float(c)) for c in counts])
+    _maybe_rotate_job_count_xlabels(ax, len(counts))
+    _style_axis(
+        ax,
+        title="Spawn time by requested job count",
+        ylabel="Time to spawn all jobs (s)",
+    )
+    ax.set_xlabel("Requested job count")
+    ax.legend(loc="upper left", frameon=True)
+    ax.grid(axis="y", alpha=0.45)
+    ax.grid(axis="x", alpha=0.12)
+    for spine in ("top", "right"):
+        ax.spines[spine].set_visible(False)
+    fig.tight_layout()
+    fig.subplots_adjust(bottom=0.22 if len(counts) > 8 else 0.14)  # extra room for tilted labels
+    _finalize_plot(fig, "spawn_time_by_job_count.png", output_dir, show)
+
+
 def _plot_eviction_pressure(
     df: pd.DataFrame,
     output_dir: Optional[str],
@@ -543,6 +663,7 @@ def render_performance_plots(
     comparative_df = compute_comparative_metrics(df)
 
     _plot_performance_overview(df, output_dir, show)
+    _plot_spawn_time_by_job_count(df, output_dir, show)
     _plot_metric_trend(
         df,
         "throughput",
diff --git a/configs/kueue/kueuer/src/kueuer/utils/io.py b/configs/kueue/kueuer/src/kueuer/utils/io.py
index 1833c0c0..9e9192dd 100644
--- a/configs/kueue/kueuer/src/kueuer/utils/io.py
+++ b/configs/kueue/kueuer/src/kueuer/utils/io.py
@@ -1,6 +1,7 @@
 """Input/Output utilities for reading and writing files."""
 
 import csv
+import json
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Set
@@ -51,6 +52,8 @@ def save_performance_to_csv(results: List[Dict[str, Any]], filename: str) -> Non
             for key, value in result.items():
                 if isinstance(value, datetime):
                     row_data[key] = value.isoformat()
+                elif isinstance(value, (list, dict)):
+                    row_data[key] = json.dumps(value)
                 else:
                     row_data[key] = value
             writer.writerow(row_data)  # type: ignore
diff --git a/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py b/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py
index 064a9268..a8df7684 100644
--- a/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py
+++ b/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py
@@ -56,12 +56,45 @@ def list_namespaced_pod(self, namespace):
     assert k8s.kueue_controller_restarts() == 0
 
 
+def test_collect_pod_outcomes_handles_malformed_pod_list(monkeypatch) -> None:
+    monkeypatch.setattr(k8s.config, "load_kube_config", lambda: None)
+
+    class FakeCoreV1Api:
+        def list_namespaced_pod(self, namespace):
+            raise ValueError("Invalid value for `items`, must not be `None`")
+
+    monkeypatch.setattr(k8s.client, "CoreV1Api", FakeCoreV1Api)
+
+    summary = k8s.collect_pod_outcomes(namespace="default", prefix="bench")
+    assert summary["pods_total"] == 0
+    assert summary["pods_failed"] == 0
+    assert summary["pods_oomkilled"] == 0
+
+
+def test_collect_job_outcomes_handles_malformed_job_list(monkeypatch) -> None:
+    monkeypatch.setattr(k8s.config, "load_kube_config", lambda: None)
+
+    class FakeBatchV1Api:
+        def list_namespaced_job(self, namespace):
+            raise ValueError("Invalid value for `items`, must not be `None`")
+
+    monkeypatch.setattr(k8s.client, "BatchV1Api", FakeBatchV1Api)
+
+    outcomes = k8s.collect_job_outcomes(namespace="default", prefix="bench")
+    assert outcomes == {
+        "jobs_total": 0,
+        "jobs_succeeded": 0,
+        "jobs_failed": 0,
+        "jobs_active": 0,
+    }
+
+
 def test_stress_vm_bytes_mb_uses_safer_default_fraction() -> None:
-    assert k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION == 0.4
+    assert k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION == 0.33
     assert k8s.stress_vm_bytes_mb(
         ram_gb=1.0,
         vm_memory_fraction=k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION,
-    ) == pytest.approx(409.6)
+    ) == pytest.approx(337.92)
 
 
 def test_stress_vm_bytes_mb_validates_fraction_bounds() -> None:
diff --git a/configs/kueue/kueuer/tests/test_plot_filters_and_semantics.py b/configs/kueue/kueuer/tests/test_plot_filters_and_semantics.py
index 3fa6eb07..c7ae99f8 100644
--- a/configs/kueue/kueuer/tests/test_plot_filters_and_semantics.py
+++ b/configs/kueue/kueuer/tests/test_plot_filters_and_semantics.py
@@ -24,6 +24,20 @@ def test_compute_latency_adds_turnaround_columns() -> None:
     assert out["completion_latency"].iloc[0] == 5.0
 
 
+def test_format_job_count_k_compact_labels() -> None:
+    assert plot_module._format_job_count_k(500) == "500"
+    assert plot_module._format_job_count_k(1000) == "1k"
+    assert plot_module._format_job_count_k(3500) == "3.5k"
+    assert plot_module._format_job_count_k(10000) == "10k"
+
+
+def test_parse_chunk_spawn_seconds_accepts_json_and_list() -> None:
+    assert plot_module._parse_chunk_spawn_seconds("[1.0, 2.5]") == [1.0, 2.5]
+    assert plot_module._parse_chunk_spawn_seconds([3.0, 4.0]) == [3.0, 4.0]
+    assert plot_module._parse_chunk_spawn_seconds("") == []
+    assert plot_module._parse_chunk_spawn_seconds(None) == []
+
+
 def test_compute_completion_ratio_defaults_to_one_without_tracking_columns() -> None:
     df = pd.DataFrame(
         {
@@ -52,6 +66,8 @@ def test_performance_command_writes_plot_files(tmp_path, monkeypatch) -> None:
             "median_time_from_creation_completion": [3.0, 2.5],
             "std_dev_time_from_creation_completion": [0.2, 0.3],
             "job_duration": [1, 1],
+            "submission_manifest_apply_seconds": [1.5, 1.7],
+            "submission_chunk_spawn_seconds": ["[0.7, 0.8]", "[0.85, 0.85]"],
         }
     ).to_csv(csv_path, index=False)
 
@@ -64,6 +80,7 @@ def test_performance_command_writes_plot_files(tmp_path, monkeypatch) -> None:
 
     expected = [
         "performance_overview.png",
+        "spawn_time_by_job_count.png",
         "throughput_by_job_count.png",
         "completion_ratio_by_job_count.png",
         "tail_turnaround_by_job_count.png",
diff --git a/configs/kueue/kueuer/tests/test_resources.py b/configs/kueue/kueuer/tests/test_resources.py
new file mode 100644
index 00000000..9c36be1c
--- /dev/null
+++ b/configs/kueue/kueuer/tests/test_resources.py
@@ -0,0 +1,154 @@
+"""Tests for cluster resource aggregation."""
+
+from __future__ import annotations
+
+from kubernetes.client import V1Node, V1NodeStatus, V1ObjectMeta
+
+from kueuer.resources import _bytes_to_gi_str, total
+
+
+def _node(
+    name: str,
+    capacity: dict[str, str],
+    labels: dict[str, str] | None = None,
+    uid: str | None = None,
+) -> V1Node:
+    return V1Node(
+        metadata=V1ObjectMeta(name=name, uid=uid or f"uid-{name}", labels=labels),
+        status=V1NodeStatus(capacity=capacity, allocatable=capacity),
+    )
+
+
+def test_bytes_to_gi_str_examples() -> None:
+    gi = 1024**3
+    assert _bytes_to_gi_str(20550 * gi) == "20550"
+    assert _bytes_to_gi_str(1073741824 // 2) == "0.5"
+    assert _bytes_to_gi_str(gi) == "1"
+
+
+def test_total_memory_ephemeral_gi(monkeypatch) -> None:
+    nodes = [
+        _node(
+            "n1",
+            {"cpu": "8", "memory": "16Gi", "ephemeral-storage": "100Gi"},
+        ),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(None, field="capacity")
+    assert out["memory"] == {"value": "16", "unit": "Gi"}
+    assert out["ephemeral-storage"] == {"value": "100", "unit": "Gi"}
+
+
+def test_total_nvidia_gpu_single_model(monkeypatch) -> None:
+    nodes = [
+        _node(
+            "g1",
+            {"nvidia.com/gpu": "4"},
+            {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"},
+        ),
+        _node(
+            "g2",
+            {"nvidia.com/gpu": "8"},
+            {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"},
+        ),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(None, field="capacity")
+    assert out["nvidia.com/gpu"] == {
+        "kind": "NVIDIA-A100-SXM4-40GB",
+        "value": "12",
+        "unit": "count",
+    }
+
+
+def test_total_nvidia_gpu_mixed_kind(monkeypatch) -> None:
+    nodes = [
+        _node("a", {"nvidia.com/gpu": "2"}, {"nvidia.com/gpu.product": "NVIDIA-T4"}),
+        _node("b", {"nvidia.com/gpu": "4"}, {"nvidia.com/gpu.product": "NVIDIA-A100"}),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(None, field="capacity")
+    assert out["nvidia.com/gpu"] == {
+        "kind": "mixed",
+        "value": "6",
+        "unit": "count",
+    }
+
+
+def test_total_amd_gpu(monkeypatch) -> None:
+    nodes = [
+        _node(
+            "w1",
+            {"amd.com/gpu": "8"},
+            {"amd.com/gpu.product": "AMD-INSTINCT-MI250X"},
+        ),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(None, field="capacity")
+    assert out["amd.com/gpu"] == {
+        "kind": "AMD-INSTINCT-MI250X",
+        "value": "8",
+        "unit": "count",
+    }
+
+
+def test_total_nvidia_unknown_kind(monkeypatch) -> None:
+    nodes = [_node("x", {"nvidia.com/gpu": "3"}, {})]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(None, field="capacity")
+    assert out["nvidia.com/gpu"] == {"kind": "", "value": "3", "unit": "count"}
diff --git a/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py b/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py
index 0ef05a8c..8b1fe0a0 100644
--- a/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py
+++ b/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py
@@ -134,4 +134,3 @@ def create_namespaced_job(self, namespace, body):
     assert ok is True
     assert err == ""
     assert attempts["count"] == 2
-

From abd850b3860013016fc95f6643c9a446db479b9c Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Thu, 9 Apr 2026 14:08:12 -0700
Subject: [PATCH 10/11] feat(kueuer): added command kr cluster resources to
 provide node label scoped values and also provide weights scaled to vCPUs

---
 configs/kueue/kueuer/src/kueuer/resources.py | 451 +++++++++++++++----
 configs/kueue/kueuer/tests/test_resources.py | 290 ++++++++++--
 2 files changed, 623 insertions(+), 118 deletions(-)

diff --git a/configs/kueue/kueuer/src/kueuer/resources.py b/configs/kueue/kueuer/src/kueuer/resources.py
index de4a78a0..b0ea73d5 100644
--- a/configs/kueue/kueuer/src/kueuer/resources.py
+++ b/configs/kueue/kueuer/src/kueuer/resources.py
@@ -13,11 +13,20 @@
 - Deduplicates nodes by UID (so overlapping regex lists don't double count).
 - By default totals from node .status.capacity; use --field allocatable to sum
   .status.allocatable instead.
-- Returns a mapping: ``cpu`` uses ``{ "value", "unit": "cores" }``; ``memory`` and
-  ``ephemeral-storage`` use values in **Gi** (1024³ bytes) with ``unit: "Gi"``.
-- For ``nvidia.com/gpu`` and ``amd.com/gpu``: ``{ "kind", "value", "unit": "count" }``
-  where ``kind`` comes from node labels (e.g. ``nvidia.com/gpu.product``).
-- If a resource does not exist on any matched node, it is **omitted**.
+- Results are grouped by a configurable node label (see CLI ``--node-label-key``;
+  ``total()`` requires ``node_label_key`` with no default in code). Nodes without
+  the label are grouped under ``""``. Each group has ``count`` (nodes in group),
+  ``cpu``, ``memory``, ``ephemeral-storage`` (binary **GiB**, 1024³; values up to
+  3 decimal places), per-bucket **weights** (same 3 decimal places; pool CPU
+  cores per GiB / per GPU kind—see ``ResourceWeights``), and GPU lists.
+- ``nvidia.com/gpu`` is a list of ``{ "kind", "value", "unit": "count" }`` per
+  distinct ``nvidia.com/gpu.product`` label, summed across nodes. When
+  capacity/allocatable reports 0 or omits ``nvidia.com/gpu`` but the NVIDIA
+  Device Plugin exposes counts on labels (e.g. ``nvidia.com/gpu.count``), those
+  label values are used with ``kind`` from ``nvidia.com/gpu.product``.
+- ``amd.com/gpu`` uses the same list shape, summed by ``amd.com/gpu.product``.
+- If a resource does not exist for nodes in a group, it is **omitted** for that
+  group.
 
 Examples:
   uv run resources.py
@@ -30,13 +39,13 @@
 import re
 import sys
 from dataclasses import dataclass
-from decimal import Decimal, getcontext
-from typing import Annotated, Any, Dict, Iterable, List, Optional, Sequence, Union, cast
+from decimal import ROUND_HALF_UP, Decimal, getcontext, localcontext
+from typing import Annotated, Any, Dict, Iterable, List, Optional, Sequence, cast
 
 import typer
 from kubernetes.client import CoreV1Api, V1Node
 from kubernetes.utils.quantity import parse_quantity
-from pydantic import BaseModel, Field, RootModel, ValidationError, field_validator
+from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator
 from rich.console import Console
 from typing_extensions import Literal
 
@@ -47,6 +56,12 @@
 # Rationale: See DECIMAL_PRECISION in utils/constants.py
 getcontext().prec = DECIMAL_PRECISION
 
+# Reported fractional precision for CPU, GiB display quantities, and weight ratios.
+REPORT_MAX_DECIMAL_PLACES = 3
+
+# Intermediate precision for weight ratio division before rounding to ``REPORT_MAX_DECIMAL_PLACES``.
+_WEIGHT_RATIO_DIV_PREC = max(80, DECIMAL_PRECISION)
+
 app = typer.Typer(help="Cluster utilities")
 
 # =========================
@@ -56,29 +71,93 @@
 
 class ResourceItem(BaseModel):
     value: str = Field(
-        ..., description="Numeric value as a string, max precision retained."
+        ...,
+        description="Numeric string with at most three fractional decimal places.",
     )
     unit: str = Field(
-        ..., description="Unit for the value, e.g., 'cores', 'bytes', 'count'."
+        ...,
+        description="Binary GiB for memory/ephemeral totals, 'cores', or 'count'.",
     )
 
 
 class GpuResourceItem(BaseModel):
-    """Cluster totals for a GPU resource."""
+    """Per-model GPU totals within a node-type group."""
 
     kind: str = Field(
         ...,
+        description="Product name from node labels (e.g. nvidia.com/gpu.product).",
+    )
+    value: str = Field(..., description="Total GPU count for this kind.")
+    unit: Literal["count"] = "count"
+
+
+class ResourceWeights(BaseModel):
+    """Pool-level ratios vs CPU cores (dimensionless); see ``_compute_resource_weights``."""
+
+    model_config = ConfigDict(populate_by_name=True)
+
+    cpu: str = Field(
+        default="1",
+        description="Baseline; other weights are pool CPU per unit of that resource.",
+    )
+    memory: Optional[str] = Field(
+        None,
+        description="Pool CPU cores divided by total memory in binary GiB.",
+    )
+    ephemeral_storage: Optional[str] = Field(
+        None,
+        serialization_alias="ephemeral-storage",  # key used when dumping by alias
+        description="Pool CPU cores divided by total ephemeral storage in binary GiB.",
+    )
+    nvidia_gpu: Optional[Dict[str, str]] = Field(  # NOTE(review): no amd.com/gpu weights field
+        None,
+        serialization_alias="nvidia.com/gpu",  # key used when dumping by alias
+        description="Per GPU product: pool CPU cores divided by count of that kind.",
+    )
+
+
+class NodeTypeResources(BaseModel):
+    """Resource totals for one value of the grouping node label."""
+
+    model_config = ConfigDict(populate_by_name=True)
+
+    count: int = Field(
+        ...,
+        ge=0,
+        description="Number of nodes in this group (unique nodes after pattern filter).",
+    )
+    cpu: Optional[ResourceItem] = None
+    memory: Optional[ResourceItem] = None
+    ephemeral_storage: Optional[ResourceItem] = Field(
+        default=None,
+        serialization_alias="ephemeral-storage",
+    )
+    nvidia_gpu: Optional[List[GpuResourceItem]] = Field(
+        default=None,
+        serialization_alias="nvidia.com/gpu",
+    )
+    amd_gpu: Optional[List[GpuResourceItem]] = Field(
+        default=None,
+        serialization_alias="amd.com/gpu",
+    )
+    weights: Optional[ResourceWeights] = Field(
+        None,
         description=(
-            "Product name if the cluster uses a single model; empty if unknown; "
-            "'mixed' if multiple models."
+            "CPU-normalized pool composition weights (decimal strings, same precision "
+            "as other reported quantities). "
+            "Omitted if the pool has no CPU total to divide by."
         ),
     )
-    value: str = Field(..., description="Total GPU count.")
-    unit: Literal["count"] = "count"
 
 
-class ResourceMap(RootModel[Dict[str, Union[ResourceItem, GpuResourceItem]]]):
-    """Dynamic resource map so unavailable resources can be omitted."""
+class ClusterResourcesResult(BaseModel):
+    """Cluster resources grouped by ``node_label_key`` label values."""
+
+    node_label_key: str = Field(
+        ...,
+        description="Kubernetes node label key used to form each group.",
+    )
+    by_label_value: Dict[str, NodeTypeResources]
 
 
 class Settings(BaseModel):
@@ -107,8 +186,6 @@ class TotalsAcc:
     cpu_cores: Optional[Decimal]  # None -> omit key
     memory_bytes: Optional[int]
     ephemeral_bytes: Optional[int]
-    nvidia_gpu: Optional[int]
-    amd_gpu: Optional[int]
     nvidia_by_kind: Optional[Dict[str, int]]
     amd_by_kind: Optional[Dict[str, int]]
 
@@ -175,29 +252,134 @@ def _amd_gpu_kind_from_labels(labels: Dict[str, str]) -> str:
     return ""
 
 
-def _summary_gpu_kind(by_kind: Dict[str, int]) -> str:
-    """Single model name, empty if unknown, or 'mixed' if multiple distinct models."""
-    active = {k: v for k, v in by_kind.items() if v > 0}
-    if not active:
-        return ""
-    distinct_nonempty = {k for k in active if k}
-    if not distinct_nonempty:
-        return ""
-    if len(distinct_nonempty) == 1:
-        return next(iter(distinct_nonempty))
-    return "mixed"
-
-
-def _bytes_to_gi_str(total_bytes: int) -> str:
-    """Convert a byte total to a decimal string in Gi (1 Gi = 1024³ bytes)."""
-    # Use integer 1024**3 so the divisor is exact (Decimal(1024)**3 can round).
-    gi = Decimal(total_bytes) / Decimal(1024**3)
-    s = format(gi, "f")
+def _node_nvidia_gpu_contrib(
+    labels: Dict[str, str], m: Dict[str, str]
+) -> Optional[tuple[int, str]]:
+    """
+    NVIDIA GPUs advertised for this node: count and product kind.
+
+    Prefer ``.status.capacity``/``allocatable`` when ``nvidia.com/gpu`` is
+    positive. If it is zero or absent, fall back to ``nvidia.com/gpu.count`` so
+    MIG / device-plugin-only reporting still aggregates correctly. The label is
+    parsed only on that fallback path; returns ``None`` when neither is positive.
+    """
+    kind = _nvidia_gpu_kind_from_labels(labels)
+    cap_s = m.get("nvidia.com/gpu")
+    cap_n = int(parse_quantity(cap_s)) if cap_s else 0
+    if cap_n > 0:
+        return (cap_n, kind)
+
+    # Fallback only: a malformed label must not break nodes with real capacity.
+    lc_raw = str(labels.get("nvidia.com/gpu.count") or "").strip()
+    if not lc_raw:
+        return None
+    label_n = int(parse_quantity(lc_raw))
+    return (label_n, kind) if label_n > 0 else None
+
+
+def _gpu_kind_totals_to_list(by_kind: Optional[Dict[str, int]]) -> Optional[List[GpuResourceItem]]:
+    """Positive per-kind counts as a list sorted by kind, empty kind last; None if nothing."""
+    if not by_kind:
+        return None
+    ordered = sorted(by_kind.items(), key=lambda pair: (pair[0] == "", pair[0]))
+    result: List[GpuResourceItem] = []
+    for product, gpu_count in ordered:
+        if gpu_count > 0:
+            result.append(GpuResourceItem(kind=product, value=str(gpu_count)))
+    return result or None
+
+
+def _format_decimal_report(value: Decimal) -> str:
+    """Stringify a non-negative Decimal with at most ``REPORT_MAX_DECIMAL_PLACES`` places."""
+    if value < 0:
+        raise ValueError("value must be non-negative")
+    q = Decimal("1").scaleb(-REPORT_MAX_DECIMAL_PLACES)
+    rounded = value.quantize(q, rounding=ROUND_HALF_UP)
+    s = format(rounded, "f")
     if "." in s:
         s = s.rstrip("0").rstrip(".")
     return s
 
 
+# Binary gibibyte (Kubernetes-style): 1 GiB = 1024³ bytes.
+_GIB_BYTES = Decimal(1024**3)
+
+
+def _bytes_to_binary_gib_decimal(total_bytes: int) -> Decimal:
+    """Convert byte totals to binary GiB (full ``Decimal``, unrounded)."""
+    if total_bytes < 0:
+        raise ValueError("byte total must be non-negative")
+    return Decimal(total_bytes) / _GIB_BYTES
+
+
+def _gib_resource_item(total_bytes: int) -> ResourceItem:
+    """Memory / ephemeral totals: always reported in GiB with limited display precision."""
+    if total_bytes == 0:
+        return ResourceItem(value="0", unit="GiB")
+    v = _bytes_to_binary_gib_decimal(total_bytes)
+    return ResourceItem(value=_format_decimal_report(v), unit="GiB")
+
+
+def _gib_display_to_bytes(value: Decimal) -> Decimal:
+    """Interpret a displayed GiB quantity as bytes."""
+    return value * _GIB_BYTES
+
+
+def _decimal_ratio_string(numerator: Decimal, denominator: Decimal) -> str:
+    """``numerator / denominator`` rounded to ``REPORT_MAX_DECIMAL_PLACES`` (half-up)."""
+    if denominator <= 0:
+        raise ValueError("denominator must be positive")
+    with localcontext() as ctx:
+        ctx.prec = _WEIGHT_RATIO_DIV_PREC
+        ratio = numerator / denominator
+    return _format_decimal_report(ratio)
+
+
+def _compute_resource_weights(acc: TotalsAcc) -> Optional[ResourceWeights]:
+    """
+    Weights normalize pool totals to a per-CPU baseline: ``cpu`` is 1; other
+    fields are ``TOTAL_CPU / TOTAL_QUANTITY`` in compatible units (GiB for
+    memory and ephemeral; per-GPU-kind counts for NVIDIA).
+
+    **Interpretation (heuristic):** For a node pool with totals ``(C, M, E, …)``,
+    weights map ``(c, m, e, …)`` requests to a linear ``c·1 + m·w_mem + …`` style
+    score *if* you treat the pool's aggregate ratio as a fixed substitution rate
+    between CPU and other resources. That is a **comparative** normalization, not a
+    guarantee of schedulability, pricing, or optimal packing—heterogeneous nodes,
+    fragmentation, and priorities are not captured.
+    """
+    cpu = acc.cpu_cores
+    if cpu is None or cpu <= 0:
+        return None
+
+    mem_w: Optional[str] = None
+    if acc.memory_bytes is not None and acc.memory_bytes > 0:
+        mem_w = _decimal_ratio_string(cpu, _bytes_to_binary_gib_decimal(acc.memory_bytes))
+
+    eph_w: Optional[str] = None
+    if acc.ephemeral_bytes is not None and acc.ephemeral_bytes > 0:
+        eph_w = _decimal_ratio_string(cpu, _bytes_to_binary_gib_decimal(acc.ephemeral_bytes))
+
+    nv_map: Optional[Dict[str, str]] = None
+    if acc.nvidia_by_kind:
+        entries: Dict[str, str] = {}
+        for kind, cnt in sorted(
+            acc.nvidia_by_kind.items(),
+            key=lambda kv: (kv[0] == "", kv[0]),
+        ):
+            if cnt <= 0:
+                continue
+            entries[kind] = _decimal_ratio_string(cpu, Decimal(cnt))
+        nv_map = entries or None
+
+    return ResourceWeights(
+        cpu="1",
+        memory=mem_w,
+        ephemeral_storage=eph_w,
+        nvidia_gpu=nv_map,
+    )
+
+
 def _get_field_map(node: V1Node, field: str) -> Dict[str, str]:
     """
     Extract either .status.capacity or .status.allocatable as a plain dict[str, str].
@@ -257,28 +439,25 @@ def _sum_resources(nodes: List[V1Node], field: str) -> TotalsAcc:
     cpu_vals: List[str] = []
     mem_vals: List[str] = []
     eph_vals: List[str] = []
-    nvidia_vals: List[str] = []
     amd_vals: List[str] = []
     nvidia_by_kind: Dict[str, int] = {}
     amd_by_kind: Dict[str, int] = {}
 
     for n in nodes:
         m = _get_field_map(n, field)
+        labels = (n.metadata.labels or {}) if n.metadata else {}
+        nvidia_c = _node_nvidia_gpu_contrib(labels, m)
+        if nvidia_c is not None:
+            cnt, nk = nvidia_c
+            nvidia_by_kind[nk] = nvidia_by_kind.get(nk, 0) + cnt
         if not m:
             continue
-        labels = (n.metadata.labels or {}) if n.metadata else {}
         if "cpu" in m:
             cpu_vals.append(m["cpu"])
         if "memory" in m:
             mem_vals.append(m["memory"])
         if "ephemeral-storage" in m:
             eph_vals.append(m["ephemeral-storage"])
-        if "nvidia.com/gpu" in m:
-            nvidia_vals.append(m["nvidia.com/gpu"])
-            nk = _nvidia_gpu_kind_from_labels(labels)
-            nvidia_by_kind[nk] = nvidia_by_kind.get(nk, 0) + int(
-                parse_quantity(m["nvidia.com/gpu"])
-            )
         if "amd.com/gpu" in m:
             amd_vals.append(m["amd.com/gpu"])
             ak = _amd_gpu_kind_from_labels(labels)
@@ -296,27 +475,55 @@ def _try_sum(dec_sum_fn, vals):
     cpu_total = _try_sum(_sum_quantity, cpu_vals)
     mem_total = _try_sum(_sum_quantity, mem_vals)
     eph_total = _try_sum(_sum_quantity, eph_vals)
-    nvidia_total = _try_sum(_sum_int_quantity, nvidia_vals)
     amd_total = _try_sum(_sum_int_quantity, amd_vals)
 
     return TotalsAcc(
         cpu_cores=cpu_total,
         memory_bytes=int(mem_total) if mem_total is not None else None,
         ephemeral_bytes=int(eph_total) if eph_total is not None else None,
-        nvidia_gpu=nvidia_total,
-        amd_gpu=amd_total,
-        nvidia_by_kind=nvidia_by_kind if nvidia_total is not None else None,
+        nvidia_by_kind=nvidia_by_kind if nvidia_by_kind else None,
         amd_by_kind=amd_by_kind if amd_total is not None else None,
     )
 
 
+def _totals_acc_to_node_type_resources(acc: TotalsAcc, node_count: int) -> NodeTypeResources:
+    """Build one NodeTypeResources from aggregated totals."""
+    return NodeTypeResources(
+        count=node_count,
+        cpu=(
+            ResourceItem(
+                value=_format_decimal_report(acc.cpu_cores),
+                unit="cores",
+            )
+            if acc.cpu_cores is not None
+            else None
+        ),
+        memory=(
+            _gib_resource_item(acc.memory_bytes)
+            if acc.memory_bytes is not None
+            else None
+        ),
+        ephemeral_storage=(
+            _gib_resource_item(acc.ephemeral_bytes)
+            if acc.ephemeral_bytes is not None
+            else None
+        ),
+        nvidia_gpu=_gpu_kind_totals_to_list(acc.nvidia_by_kind),
+        amd_gpu=_gpu_kind_totals_to_list(acc.amd_by_kind),
+        weights=_compute_resource_weights(acc),
+    )
+
+
 # =========================
 # Public API
 # =========================
 
 
 def total(
-    patterns: Optional[List[str]] = None, field: str = "capacity"
+    patterns: Optional[List[str]] = None,
+    field: str = "capacity",
+    *,
+    node_label_key: str,
 ) -> Dict[str, Any]:
     """
     Calculate total cluster resources across nodes matching regex patterns.
@@ -324,12 +531,17 @@ def total(
     Args:
         patterns: Regex strings for node names. If None or empty, includes all nodes.
         field: Which field to sum: "capacity" (default) or "allocatable".
+        node_label_key: Kubernetes node label key used to group results (callers
+            such as the CLI supply the default; this function does not default it).
 
     Returns:
-        Mapping of resource name to detail dicts. Memory and ephemeral-storage use
-        Gi and ``unit`` ``\"Gi\"``. GPU entries include ``kind``, ``value``, and
-        ``unit`` ``\"count\"``. Only includes resources present on at least one node.
+        A dict with ``node_label_key``, ``by_label_value`` (each key is a label
+        value, or ``\"\"`` if unset), and per-group ``count`` plus resource maps.
     """
+    label_key = node_label_key.strip()
+    if not label_key:
+        raise ValueError("node_label_key must be a non-empty string")
+
     # Validate inputs with Pydantic
     if field not in ("capacity", "allocatable"):
         raise ValueError('field must be "capacity" or "allocatable"')
@@ -343,36 +555,74 @@ def total(
 
     v1 = _load_kube()
     nodes = _collect_nodes(v1, cfg.patterns)
-    acc = _sum_resources(nodes, cfg.field)
+    by_nt: Dict[str, List[V1Node]] = {}
+    for n in nodes:
+        labels = (n.metadata.labels or {}) if n.metadata else {}
+        raw_nt = labels.get(label_key)
+        nt_key = "" if raw_nt is None else str(raw_nt)
+        by_nt.setdefault(nt_key, []).append(n)
+
+    groups: Dict[str, NodeTypeResources] = {}
+    for nt_key in sorted(by_nt.keys(), key=lambda s: (s == "", s)):
+        bucket = by_nt[nt_key]
+        acc = _sum_resources(bucket, cfg.field)
+        groups[nt_key] = _totals_acc_to_node_type_resources(acc, len(bucket))
+
+    return ClusterResourcesResult(
+        node_label_key=label_key,
+        by_label_value=groups,
+    ).model_dump(
+        by_alias=True,
+        exclude_none=True,
+    )
 
-    # Build a dynamic map (omit unavailable resources)
-    result: Dict[str, Union[ResourceItem, GpuResourceItem]] = {}
 
-    if acc.cpu_cores is not None:
-        result["cpu"] = ResourceItem(value=f"{acc.cpu_cores}", unit="cores")
-    if acc.memory_bytes is not None:
-        result["memory"] = ResourceItem(
-            value=_bytes_to_gi_str(acc.memory_bytes),
-            unit="Gi",
-        )
-    if acc.ephemeral_bytes is not None:
-        result["ephemeral-storage"] = ResourceItem(
-            value=_bytes_to_gi_str(acc.ephemeral_bytes),
-            unit="Gi",
-        )
-    if acc.nvidia_gpu is not None and acc.nvidia_by_kind is not None:
-        result["nvidia.com/gpu"] = GpuResourceItem(
-            kind=_summary_gpu_kind(acc.nvidia_by_kind),
-            value=str(acc.nvidia_gpu),
-        )
-    if acc.amd_gpu is not None and acc.amd_by_kind is not None:
-        result["amd.com/gpu"] = GpuResourceItem(
-            kind=_summary_gpu_kind(acc.amd_by_kind),
-            value=str(acc.amd_gpu),
-        )
-
-    # Validate and dump with Pydantic
-    return ResourceMap(result).model_dump()
+def _scale_resource_item_inplace(item: Dict[str, Any], scale: Decimal) -> None:
+    """Apply ``--scale`` to one resource dict with ``value`` and ``unit``."""
+    unit = str(item.get("unit", ""))
+    v = Decimal(str(item["value"]))
+    if unit == "cores":
+        item["value"] = _format_decimal_report(v * scale)
+    elif unit == "GiB":
+        scaled_bytes = _gib_display_to_bytes(v) * scale
+        int_bytes = max(0, int(scaled_bytes.to_integral_value(rounding=ROUND_HALF_UP)))
+        out = _gib_resource_item(int_bytes)
+        item["value"] = out.value
+        item["unit"] = out.unit
+    elif unit == "count":
+        item["value"] = _format_decimal_report(v * scale)
+    else:
+        item["value"] = _format_decimal_report(v * scale)
+
+
+def _scale_cluster_resources_payload(result: Dict[str, Any], scale: Decimal) -> None:
+    """Multiply numeric ``value`` fields in-place (CLI ``--scale``). Leaves ``weights`` unchanged."""
+    inner = result.get("by_label_value")
+    if not isinstance(inner, dict):
+        return
+    for block in inner.values():
+        if not isinstance(block, dict):
+            continue
+        for res_key in ("cpu", "memory", "ephemeral-storage"):
+            item = block.get(res_key)
+            if isinstance(item, dict) and "value" in item and "unit" in item:
+                _scale_resource_item_inplace(item, scale)
+        for gpu_key in ("nvidia.com/gpu", "amd.com/gpu"):
+            lst = block.get(gpu_key)
+            if isinstance(lst, list):
+                for g in lst:
+                    if isinstance(g, dict) and "value" in g and "unit" in g:
+                        _scale_resource_item_inplace(g, scale)
+
+
+def list_resource_quotas(namespace: str) -> Dict[str, Any]:
+    """List ResourceQuota objects in a namespace via the Kubernetes Python client."""
+    k8s = get_k8s_config()
+    quota_list = k8s.core_v1.list_namespaced_resource_quota(namespace=namespace)
+    payload = k8s.api_client.sanitize_for_serialization(quota_list)
+    if isinstance(payload, dict):
+        return payload
+    return {"items": payload or []}
 
 
 # =========================
@@ -410,6 +660,16 @@ def resources(
             help="Scale resources by this percentage.",
         ),
     ] = 1.0,
+    node_label_key: Annotated[
+        str,
+        typer.Option(
+            "--node-label-key",
+            help=(
+                "Node label key used to group totals by label value "
+                '(default only applies to this CLI, not to total()).'
+            ),
+        ),
+    ] = "skaha.opencadc.org/node-type",
 ):
     """
     Sum resources across nodes matching any of the provided regex patterns.
@@ -417,17 +677,40 @@ def resources(
     assert field in ["capacity", "allocatable"]
     assert scale > 0.0 and scale <= 1.0, "Percentage must be in (0, 1]"
     try:
-        result = total(patterns or None, field=field)
+        result = total(
+            patterns or None,
+            field=field,
+            node_label_key=node_label_key,
+        )
         console.print(result, width=120)
         if scale != 1.0:
             console.print(f"Scaling by {scale * 100}%...")
-            for _k, v in result.items():
-                v["value"] = str(Decimal(v["value"]) * Decimal(scale))
+            _scale_cluster_resources_payload(result, Decimal(str(scale)))
             console.print(result, width=120)
     except Exception as e:
         print(f"Error: {e}", file=sys.stderr)
         raise SystemExit(1)
 
 
+@app.command("resourcequota")
+def resourcequota(
+    namespace: Annotated[
+        str,
+        typer.Option(
+            "-n",
+            "--namespace",
+            help="Namespace to query for ResourceQuota objects.",
+        ),
+    ],
+):
+    """List namespace ResourceQuota objects using the Kubernetes Python client."""
+    try:
+        response = list_resource_quotas(namespace)
+        console.print({"response": response, "resource_quotas": response.get("items", [])})
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        raise SystemExit(1)
+
+
 if __name__ == "__main__":
     app()
diff --git a/configs/kueue/kueuer/tests/test_resources.py b/configs/kueue/kueuer/tests/test_resources.py
index 9c36be1c..c40f2e00 100644
--- a/configs/kueue/kueuer/tests/test_resources.py
+++ b/configs/kueue/kueuer/tests/test_resources.py
@@ -2,9 +2,24 @@
 
 from __future__ import annotations
 
+from decimal import Decimal
+
+import pytest
+from typer.testing import CliRunner
 from kubernetes.client import V1Node, V1NodeStatus, V1ObjectMeta
 
-from kueuer.resources import _bytes_to_gi_str, total
+from kueuer.cli import app
+from kueuer.resources import (
+    _bytes_to_binary_gib_decimal,
+    _format_decimal_report,
+    list_resource_quotas,
+    total,
+)
+
+runner = CliRunner()
+
+# Library API has no default for node_label_key; tests use the same key as the CLI default.
+NODE_LABEL_KEY = "skaha.opencadc.org/node-type"
 
 
 def _node(
@@ -19,11 +34,17 @@ def _node(
     )
 
 
-def test_bytes_to_gi_str_examples() -> None:
+def test_binary_gib_conversion() -> None:
     gi = 1024**3
-    assert _bytes_to_gi_str(20550 * gi) == "20550"
-    assert _bytes_to_gi_str(1073741824 // 2) == "0.5"
-    assert _bytes_to_gi_str(gi) == "1"
+    assert _bytes_to_binary_gib_decimal(20550 * gi) == Decimal("20550")
+    assert _bytes_to_binary_gib_decimal(gi // 2) == Decimal("0.5")
+    assert _bytes_to_binary_gib_decimal(gi) == Decimal("1")
+
+
+def test_format_decimal_report_three_places() -> None:
+    assert _format_decimal_report(Decimal("8")) == "8"
+    assert _format_decimal_report(Decimal("1.23456789")) == "1.235"
+    assert _format_decimal_report(Decimal("0")) == "0"
 
 
 def test_total_memory_ephemeral_gi(monkeypatch) -> None:
@@ -45,21 +66,29 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity")
-    assert out["memory"] == {"value": "16", "unit": "Gi"}
-    assert out["ephemeral-storage"] == {"value": "100", "unit": "Gi"}
+    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    assert out["node_label_key"] == NODE_LABEL_KEY
+    bucket = out["by_label_value"][""]
+    assert bucket["count"] == 1
+    assert bucket["memory"] == {"value": "16", "unit": "GiB"}
+    assert bucket["ephemeral-storage"] == {"value": "100", "unit": "GiB"}
+    w = bucket["weights"]
+    assert w["cpu"] == "1"
+    assert w["memory"] == "0.5"
+    assert w["ephemeral-storage"] == "0.08"
+    assert "nvidia.com/gpu" not in w
 
 
 def test_total_nvidia_gpu_single_model(monkeypatch) -> None:
     nodes = [
         _node(
             "g1",
-            {"nvidia.com/gpu": "4"},
+            {"cpu": "4", "nvidia.com/gpu": "4"},
             {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"},
         ),
         _node(
             "g2",
-            {"nvidia.com/gpu": "8"},
+            {"cpu": "8", "nvidia.com/gpu": "8"},
             {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"},
         ),
     ]
@@ -75,18 +104,27 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity")
-    assert out["nvidia.com/gpu"] == {
-        "kind": "NVIDIA-A100-SXM4-40GB",
-        "value": "12",
-        "unit": "count",
-    }
+    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    b = out["by_label_value"][""]
+    assert b["count"] == 2
+    assert b["nvidia.com/gpu"] == [
+        {"kind": "NVIDIA-A100-SXM4-40GB", "value": "12", "unit": "count"},
+    ]
+    assert b["weights"]["nvidia.com/gpu"]["NVIDIA-A100-SXM4-40GB"] == "1"
 
 
 def test_total_nvidia_gpu_mixed_kind(monkeypatch) -> None:
     nodes = [
-        _node("a", {"nvidia.com/gpu": "2"}, {"nvidia.com/gpu.product": "NVIDIA-T4"}),
-        _node("b", {"nvidia.com/gpu": "4"}, {"nvidia.com/gpu.product": "NVIDIA-A100"}),
+        _node(
+            "a",
+            {"cpu": "4", "nvidia.com/gpu": "2"},
+            {"nvidia.com/gpu.product": "NVIDIA-T4"},
+        ),
+        _node(
+            "b",
+            {"cpu": "4", "nvidia.com/gpu": "4"},
+            {"nvidia.com/gpu.product": "NVIDIA-A100"},
+        ),
     ]
 
     def fake_list_node(*_a, **_k):
@@ -100,19 +138,23 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity")
-    assert out["nvidia.com/gpu"] == {
-        "kind": "mixed",
-        "value": "6",
-        "unit": "count",
-    }
+    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    b = out["by_label_value"][""]
+    assert b["count"] == 2
+    assert b["nvidia.com/gpu"] == [
+        {"kind": "NVIDIA-A100", "value": "4", "unit": "count"},
+        {"kind": "NVIDIA-T4", "value": "2", "unit": "count"},
+    ]
+    wg = b["weights"]["nvidia.com/gpu"]
+    assert wg["NVIDIA-A100"] == "2"
+    assert wg["NVIDIA-T4"] == "4"
 
 
 def test_total_amd_gpu(monkeypatch) -> None:
     nodes = [
         _node(
             "w1",
-            {"amd.com/gpu": "8"},
+            {"cpu": "8", "amd.com/gpu": "8"},
             {"amd.com/gpu.product": "AMD-INSTINCT-MI250X"},
         ),
     ]
@@ -128,16 +170,16 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity")
-    assert out["amd.com/gpu"] == {
-        "kind": "AMD-INSTINCT-MI250X",
-        "value": "8",
-        "unit": "count",
-    }
+    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    b = out["by_label_value"][""]
+    assert b["count"] == 1
+    assert b["amd.com/gpu"] == [
+        {"kind": "AMD-INSTINCT-MI250X", "value": "8", "unit": "count"},
+    ]
 
 
 def test_total_nvidia_unknown_kind(monkeypatch) -> None:
-    nodes = [_node("x", {"nvidia.com/gpu": "3"}, {})]
+    nodes = [_node("x", {"cpu": "3", "nvidia.com/gpu": "3"}, {})]
 
     def fake_list_node(*_a, **_k):
         class R:
@@ -150,5 +192,185 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity")
-    assert out["nvidia.com/gpu"] == {"kind": "", "value": "3", "unit": "count"}
+    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    b = out["by_label_value"][""]
+    assert b["count"] == 1
+    assert b["nvidia.com/gpu"] == [
+        {"kind": "", "value": "3", "unit": "count"},
+    ]
+    assert b["weights"]["nvidia.com/gpu"][""] == "1"
+
+
+def test_total_groups_by_custom_node_label_key(monkeypatch) -> None:
+    nodes = [
+        _node("a", {"cpu": "2"}, {"pool": "east"}),
+        _node("b", {"cpu": "2"}, {"pool": "east"}),
+        _node("c", {"cpu": "4"}, {"pool": "west"}),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(None, field="capacity", node_label_key="pool")
+    assert out["node_label_key"] == "pool"
+    assert out["by_label_value"]["east"]["count"] == 2
+    assert out["by_label_value"]["west"]["count"] == 1
+
+
+def test_total_rejects_blank_node_label_key() -> None:
+    with pytest.raises(ValueError, match="non-empty"):
+        total(None, node_label_key="   ")
+
+
+def test_total_split_by_skaha_node_type(monkeypatch) -> None:
+    nodes = [
+        _node(
+            "cpu1",
+            {"cpu": "4", "memory": "8Gi"},
+            {"skaha.opencadc.org/node-type": "cpu-node"},
+        ),
+        _node(
+            "gpu1",
+            {"cpu": "8", "nvidia.com/gpu": "2"},
+            {
+                "nvidia.com/gpu.product": "NVIDIA-T4",
+                "skaha.opencadc.org/node-type": "gpu-node",
+            },
+        ),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    by_t = out["by_label_value"]
+    assert by_t["cpu-node"]["count"] == 1
+    assert by_t["gpu-node"]["count"] == 1
+    assert by_t["cpu-node"]["cpu"] == {"value": "4", "unit": "cores"}
+    assert by_t["gpu-node"]["nvidia.com/gpu"] == [
+        {"kind": "NVIDIA-T4", "value": "2", "unit": "count"},
+    ]
+    assert by_t["cpu-node"]["weights"]["memory"] == "0.5"
+    assert by_t["gpu-node"]["weights"]["nvidia.com/gpu"]["NVIDIA-T4"] == "4"
+
+
+def test_total_nvidia_gpu_from_labels_when_capacity_zero(monkeypatch) -> None:
+    """MIG-style nodes may advertise GPUs via labels while capacity nvidia.com/gpu is 0."""
+    nodes = [
+        _node(
+            "g1",
+            {
+                "cpu": "96",
+                "memory": "1000Gi",
+                "ephemeral-storage": "500Gi",
+                "nvidia.com/gpu": "0",
+            },
+            {
+                "nvidia.com/gpu.count": "12",
+                "nvidia.com/gpu.product": "NVIDIA-H100-NVL-MIG-2g.24gb",
+                "skaha.opencadc.org/node-type": "gpu-node",
+            },
+        ),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    g = out["by_label_value"]["gpu-node"]
+    assert g["count"] == 1
+    assert g["nvidia.com/gpu"] == [
+        {"kind": "NVIDIA-H100-NVL-MIG-2g.24gb", "value": "12", "unit": "count"},
+    ]
+    gw = g["weights"]
+    assert gw["memory"] == "0.096"
+    assert gw["ephemeral-storage"] == "0.192"
+    assert gw["nvidia.com/gpu"]["NVIDIA-H100-NVL-MIG-2g.24gb"] == "8"
+
+
+def test_list_resource_quotas_returns_serialized_payload(monkeypatch) -> None:
+    payload = {
+        "apiVersion": "v1",
+        "kind": "ResourceQuotaList",
+        "items": [
+            {
+                "metadata": {"name": "compute-quota", "namespace": "canfar-kueue-testing"},
+                "spec": {"hard": {"requests.cpu": "8", "requests.memory": "32Gi"}},
+            }
+        ],
+    }
+
+    class FakeCoreV1:
+        def list_namespaced_resource_quota(self, namespace: str):
+            assert namespace == "canfar-kueue-testing"
+            return object()
+
+    class FakeApiClient:
+        def sanitize_for_serialization(self, value):
+            assert value is not None
+            return payload
+
+    fake_k8s = type(
+        "FakeK8s",
+        (),
+        {"core_v1": FakeCoreV1(), "api_client": FakeApiClient()},
+    )()
+    monkeypatch.setattr("kueuer.resources.get_k8s_config", lambda: fake_k8s)
+
+    result = list_resource_quotas("canfar-kueue-testing")
+
+    assert result == payload
+
+
+def test_resources_cli_includes_node_label_key_option() -> None:
+    result = runner.invoke(app, ["cluster", "resources", "--help"])
+    assert result.exit_code == 0
+    assert "--node-label-key" in result.stdout
+
+
+def test_resourcequota_cli_prints_response_and_objects(monkeypatch) -> None:
+    payload = {
+        "apiVersion": "v1",
+        "kind": "ResourceQuotaList",
+        "items": [
+            {
+                "metadata": {"name": "compute-quota", "namespace": "canfar-kueue-testing"},
+                "status": {"hard": {"requests.cpu": "8"}},
+            }
+        ],
+    }
+    monkeypatch.setattr("kueuer.resources.list_resource_quotas", lambda namespace: payload)
+
+    result = runner.invoke(
+        app,
+        ["cluster", "resourcequota", "--namespace", "canfar-kueue-testing"],
+    )
+
+    assert result.exit_code == 0
+    assert "response" in result.stdout
+    assert "resource_quotas" in result.stdout
+    assert "compute-quota" in result.stdout

From 8e3f7c3d5b069126e5eba4d97592d6e5485c8a5b Mon Sep 17 00:00:00 2001
From: Shiny Brar <shiny.brar@nrc-cnrc.gc.ca>
Date: Thu, 23 Apr 2026 13:20:56 -0700
Subject: [PATCH 11/11] refactor(wip): kueue

---
 .../kueue/docs/adrs/ADR-001-kueue-adoption.md |  35 ++
 .../kueue/docs/adrs/ADR-002-workload-apis.md  |  36 ++
 .../ADR-003-shared-workloads-namespace.md     |  36 ++
 .../ADR-004-standalone-control-service.md     |  40 ++
 ...DR-005-fairness-priority-and-preemption.md |  43 ++
 .../ADR-006-posix-group-project-mapping.md    |  50 ++
 .../adrs/ADR-007-resource-flavor-taxonomy.md  |  45 ++
 ...ueue-enforcement-and-managed-namespaces.md |  35 ++
 .../adrs/ADR-009-visibility-and-ui-scope.md   |  39 ++
 ...010-single-cluster-now-multikueue-later.md |  36 ++
 ...ed-persistent-and-interactive-workloads.md |  40 ++
 ...-scale-testing-and-operational-evidence.md |  41 ++
 configs/kueue/docs/adrs/README.md             |  28 +
 configs/kueue/docs/architecture.md            | 545 ++++++++++++++++++
 configs/kueue/docs/operations.md              | 258 +++++++++
 configs/kueue/docs/reportstyle.markdown       | 303 ++++++++++
 configs/kueue/docs/roadmap.md                 | 277 +++++++++
 configs/kueue/docs/ui-spec.md                 | 222 +++++++
 configs/kueue/kueuer/AGENTS.md                |  13 +
 configs/kueue/kueuer/src/kueuer/resources.py  | 468 ++++++++++-----
 configs/kueue/kueuer/tests/test_resources.py  | 212 +++++--
 21 files changed, 2616 insertions(+), 186 deletions(-)
 create mode 100644 configs/kueue/docs/adrs/ADR-001-kueue-adoption.md
 create mode 100644 configs/kueue/docs/adrs/ADR-002-workload-apis.md
 create mode 100644 configs/kueue/docs/adrs/ADR-003-shared-workloads-namespace.md
 create mode 100644 configs/kueue/docs/adrs/ADR-004-standalone-control-service.md
 create mode 100644 configs/kueue/docs/adrs/ADR-005-fairness-priority-and-preemption.md
 create mode 100644 configs/kueue/docs/adrs/ADR-006-posix-group-project-mapping.md
 create mode 100644 configs/kueue/docs/adrs/ADR-007-resource-flavor-taxonomy.md
 create mode 100644 configs/kueue/docs/adrs/ADR-008-queue-enforcement-and-managed-namespaces.md
 create mode 100644 configs/kueue/docs/adrs/ADR-009-visibility-and-ui-scope.md
 create mode 100644 configs/kueue/docs/adrs/ADR-010-single-cluster-now-multikueue-later.md
 create mode 100644 configs/kueue/docs/adrs/ADR-011-protected-persistent-and-interactive-workloads.md
 create mode 100644 configs/kueue/docs/adrs/ADR-012-scale-testing-and-operational-evidence.md
 create mode 100644 configs/kueue/docs/adrs/README.md
 create mode 100644 configs/kueue/docs/architecture.md
 create mode 100644 configs/kueue/docs/operations.md
 create mode 100644 configs/kueue/docs/reportstyle.markdown
 create mode 100644 configs/kueue/docs/roadmap.md
 create mode 100644 configs/kueue/docs/ui-spec.md
 create mode 100644 configs/kueue/kueuer/AGENTS.md

diff --git a/configs/kueue/docs/adrs/ADR-001-kueue-adoption.md b/configs/kueue/docs/adrs/ADR-001-kueue-adoption.md
new file mode 100644
index 00000000..6b01400d
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-001-kueue-adoption.md
@@ -0,0 +1,35 @@
+# ADR-001: Kueue Adoption
+
+- Status: Accepted
+- Date: January 2025
+
+## Context
+
+CANFAR needs a Kubernetes-native way to control admission, queueing, quotas,
+borrowing, reclaim, and visibility for a mix of interactive, persistent, and
+batch science workloads. The platform must handle very large pending backlogs
+without treating direct Kubernetes scheduling as the tenant policy layer.
+
+## Decision
+
+CANFAR adopts Kueue as the admission and quota orchestration layer for the
+Science Platform. Kubernetes remains the runtime scheduler and execution plane.
+`skaha` remains the main user submission entry point.
+
+## Consequences
+
+Kueue provides the needed queue, quota, priority, cohort, and visibility
+primitives. It also creates a clean path to future topology-aware scheduling and
+MultiKueue.
+
+CANFAR must still solve identity, project mapping, and accounting outside
+Kueue. Kueue is not the tenant system of record.
+
+## Alternatives considered
+
+- Continue with direct Kubernetes scheduling and custom ad hoc controls
+- Build a custom scheduling layer or scheduler plugin stack
+- Treat the backlog problem as only a `skaha` rate-limiting problem
+
+These alternatives either move too much policy into custom code or fail to give
+native cohort, quota, and admission control semantics.
diff --git a/configs/kueue/docs/adrs/ADR-002-workload-apis.md b/configs/kueue/docs/adrs/ADR-002-workload-apis.md
new file mode 100644
index 00000000..2c019f32
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-002-workload-apis.md
@@ -0,0 +1,36 @@
+# ADR-002: Supported Workload APIs
+
+- Status: Accepted
+- Date: Spring 2025
+
+## Context
+
+The target architecture must support a broad workload taxonomy, but the current
+repository baseline and the need for safe operational rollout make it unwise to
+treat every Kueue integration as a production commitment.
+
+## Decision
+
+Production support centers on `batch/v1.Job`, including Indexed Job
+usage patterns for large independent fan-out work. Protected interactive and
+persistent workloads may be brought under Kueue using mature controller
+patterns, but only where the team can verify the operational behavior.
+
+`JobSet`, MPI, Ray, and other advanced or distributed controllers remain part of
+the target taxonomy and future roadmap, not the initial production commitment.
+
+## Consequences
+
+The platform gets a safe first production lane for large-scale batch admission
+without blocking future support for more advanced workload types.
+
+The package still documents the full workload taxonomy so later phases do not
+need to invent a new fairness or queue model.
+
+## Alternatives considered
+
+- Promise full support for every Kueue integration in the initial production commitment
+- Delay all interactive or persistent integration until after batch-only rollout
+
+The first option creates avoidable operational risk. The second option breaks
+the desired unified scheduling model too early.
diff --git a/configs/kueue/docs/adrs/ADR-003-shared-workloads-namespace.md b/configs/kueue/docs/adrs/ADR-003-shared-workloads-namespace.md
new file mode 100644
index 00000000..b2dc3f78
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-003-shared-workloads-namespace.md
@@ -0,0 +1,36 @@
+# ADR-003: Shared `workloads` namespace now, namespace evolution later
+
+- Status: Accepted
+- Date: March 12, 2026
+
+## Context
+
+The current Kueue repository baseline uses multiple managed namespaces, but the
+target architecture wants one shared Kueue-managed namespace at first so queue
+governance, RBAC, and visibility can be kept simple while the new tenant model
+is introduced.
+
+## Decision
+
+Use one shared `workloads` namespace for Kueue-managed user workloads in the
+target single-cluster design. Create project-scoped `LocalQueue` objects in that
+shared namespace on demand.
+
+Future supported namespace layouts include one namespace per community, one namespace per workload class, or a
+hybrid namespace layout.
+
+## Consequences
+
+This keeps the initial rollout simpler and reduces the number of moving parts
+while project-based fairness and community ownership are introduced.
+
+Future namespace splits remain possible without changing the core
+community-project-cohort model.
+
+## Alternatives considered
+
+- Start immediately with one namespace per community
+- Start immediately with one namespace per workload class
+
+Both alternatives increase governance and visibility complexity too early for
+the initial production commitment.
diff --git a/configs/kueue/docs/adrs/ADR-004-standalone-control-service.md b/configs/kueue/docs/adrs/ADR-004-standalone-control-service.md
new file mode 100644
index 00000000..c282e707
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-004-standalone-control-service.md
@@ -0,0 +1,40 @@
+# ADR-004: Standalone accounting and control service
+
+- Status: Accepted
+- Date: March 12, 2026
+
+## Context
+
+Kueue cannot serve as the system of record for communities, projects, POSIX
+group mapping, delegated project administration, or accounting relationships.
+Those concerns are fundamental to CANFAR's policy and visibility.
+
+## Decision
+
+Define a new standalone accounting and control service as a required future
+dependency of the platform. The service remains out of scope for implementation
+in this package, but it is in scope for architecture and requirements.
+
+The service must support:
+
+- community creation and management by cluster admins
+- project creation inside a community by delegated project admins
+- project-to-group mapping and later user or group resolution
+- override request workflows for temporary fair-share changes
+- exposure of tenant metadata to `skaha` and the future visibility UI
+
+## Consequences
+
+The scheduler design stays clean. Kueue owns admission and quota behavior while
+the control service owns tenant and policy metadata.
+
+The rollout now has an explicit dependency that must be addressed in later
+phases rather than hidden behind manual configuration.
+
+## Alternatives considered
+
+- Extend an existing service implicitly without naming a new component
+- Keep project metadata as static Kubernetes configuration only
+
+Both alternatives hide ownership and make future admin workflows difficult to
+design and operate.
diff --git a/configs/kueue/docs/adrs/ADR-005-fairness-priority-and-preemption.md b/configs/kueue/docs/adrs/ADR-005-fairness-priority-and-preemption.md
new file mode 100644
index 00000000..9b182c84
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-005-fairness-priority-and-preemption.md
@@ -0,0 +1,43 @@
+# ADR-005: Fairness, workload priority, and preemption model
+
+- Status: Accepted
+- Date: March 12, 2026
+
+## Context
+
+CANFAR needs fair competition between projects, community ownership of
+resources, borrowing of idle capacity, and a workload-ordering model that keeps
+interactive work ahead of batch work inside each project.
+
+## Decision
+
+Use the following split model:
+
+- Community = `ClusterQueue`
+- Project = `LocalQueue`
+- Multiple communities sharing capacity = `Cohort`
+- Project competition inside one community = Admission Fair Sharing with
+  adjustable `LocalQueue` weights
+- Workload ordering inside one project = `WorkloadPriorityClass`
+
+Use cohort borrowing and reclaim for community-level resource ownership. Use
+project-local workload priority to select interactive work before lower-priority
+batch work inside the chosen project queue.
+
+## Consequences
+
+This preserves community ownership while still maximizing idle cluster use. It
+also avoids pretending that project fair-share and workload priority are the
+same thing.
+
+Cross-community competition remains community-scoped rather than global
+project-scoped. That is intentional.
+
+## Alternatives considered
+
+- One global project fair-share plane across all communities
+- Priority-only scheduling without project fair-share weights
+- Community-only fairness with no project-level balancing
+
+These alternatives either ignore community ownership or fail to give projects a
+meaningful fairness model inside a community.
diff --git a/configs/kueue/docs/adrs/ADR-006-posix-group-project-mapping.md b/configs/kueue/docs/adrs/ADR-006-posix-group-project-mapping.md
new file mode 100644
index 00000000..7aef8242
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-006-posix-group-project-mapping.md
@@ -0,0 +1,50 @@
+# ADR-006: POSIX group to project mapping options
+
+- Status: Proposed
+- Date: March 12, 2026
+
+## Context
+
+Projects may contain multiple POSIX groups and communities may contain multiple
+projects. The open question is whether a POSIX group may belong to more than one
+project.
+
+This decision changes the submission experience because ambiguous group mapping
+may force the API layer to require an explicit project field.
+
+## Options
+
+### Option A: One group maps to exactly one project
+
+Under this option, a POSIX group may not belong to multiple projects.
+
+#### Benefits
+
+- `Skaha` can often infer project and community from group context
+- submission stays simpler for users
+- visibility and accounting reasoning stay easier to explain
+
+#### Costs
+
+- the identity model is stricter
+- some administrative use cases may need new group structures
+
+### Option B: A group may map to multiple projects
+
+Under this option, a POSIX group may belong to more than one project.
+
+#### Benefits
+
+- the identity model is more flexible
+- administrators can reuse groups across projects
+
+#### Costs
+
+- the submission path must require explicit project selection in ambiguous cases
+- user experience becomes more complex
+- the control service and UI must explain ambiguity clearly
+
+## Current direction
+
+Leave the decision open. The architecture and UI must support both models until
+the tenant administration workflow is finalized.
diff --git a/configs/kueue/docs/adrs/ADR-007-resource-flavor-taxonomy.md b/configs/kueue/docs/adrs/ADR-007-resource-flavor-taxonomy.md
new file mode 100644
index 00000000..6956a2d3
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-007-resource-flavor-taxonomy.md
@@ -0,0 +1,45 @@
+# ADR-007: ResourceFlavor taxonomy and topology model
+
+- Status: Accepted
+- Date: March 12, 2026
+
+## Context
+
+CANFAR needs a flavor model that captures resource identity across cluster,
+zone, accelerator type, storage class, and later topology-aware scheduling
+domains. The model must stay readable to operators and extensible to future
+MultiKueue deployments.
+
+## Decision
+
+Use `ResourceFlavor` as the canonical scheduler-facing identity for placement and
+hardware classes. Standardize flavor naming around stable placement and hardware
+dimensions rather than workload class.
+
+Adopt the following naming pattern:
+
+`rf-<cluster>-<zone>-<resource-class>[-<accelerator-class>]`
+
+Examples:
+
+- `rf-ca-west-01-cpu-standard`
+- `rf-ca-west-01-cpu-highmem`
+- `rf-ca-west-01-gpu-a100`
+
+Treat topology-aware scheduling as a future phase. When topology becomes active,
+use `Topology` objects and flavor association rather than encoding full topology
+hierarchy into the flavor name itself.
+
+## Consequences
+
+Operators get a stable taxonomy that works in both single-cluster and future
+manager-worker designs. Users and admins can also read flavor identity in a
+predictable way.
+
+## Alternatives considered
+
+- Opaque flavor names with documentation-only meaning
+- One flavor per workload class
+- Encoding every topology dimension directly in the flavor name
+
+These alternatives either hide meaning or create unnecessary flavor sprawl.
diff --git a/configs/kueue/docs/adrs/ADR-008-queue-enforcement-and-managed-namespaces.md b/configs/kueue/docs/adrs/ADR-008-queue-enforcement-and-managed-namespaces.md
new file mode 100644
index 00000000..abf8191d
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-008-queue-enforcement-and-managed-namespaces.md
@@ -0,0 +1,35 @@
+# ADR-008: Queue enforcement and managed namespace model
+
+- Status: Accepted
+- Date: March 12, 2026
+
+## Context
+
+Kueue policy only works predictably when managed workloads land in managed
+namespaces and carry valid queue information. CANFAR requires users to
+submit through `Skaha`, not through raw Kubernetes APIs without platform policy.
+
+## Decision
+
+Use explicitly managed namespaces for Kueue-managed user work. In the target
+state this is one shared `workloads` namespace. The submission path must resolve
+and apply a `LocalQueue` explicitly.
+
+Keep `manageJobsWithoutQueueName` disabled and reject malformed or unqueued
+submissions in managed namespaces through admission policy and service-side
+validation.
+
+## Consequences
+
+The scheduler does not need to guess tenant identity. Platform policy remains
+explicit, and visibility stays consistent with actual queue assignment.
+
+Future namespace evolution remains possible as long as the same enforcement
+principles are preserved.
+
+## Alternatives considered
+
+- Allow silent default queue assignment everywhere
+- Allow users to create unmanaged work in the same namespaces as managed work
+
+These alternatives make fairness and explanation harder to trust.
diff --git a/configs/kueue/docs/adrs/ADR-009-visibility-and-ui-scope.md b/configs/kueue/docs/adrs/ADR-009-visibility-and-ui-scope.md
new file mode 100644
index 00000000..c3e1750b
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-009-visibility-and-ui-scope.md
@@ -0,0 +1,39 @@
+# ADR-009: Visibility and UI scope
+
+- Status: Accepted
+- Date: March 12, 2026
+
+## Context
+
+Fair scheduling without understandable visibility will be perceived as arbitrary.
+CANFAR's users, project admins, and cluster admins all need different levels of
+insight into ownership, pending reasons, and current queue position.
+
+## Decision
+
+Treat visibility as a first-class architectural concern. The initial production
+commitment relies on `kubectl`, Grafana, Kueue metrics, and the pending-workloads
+visibility API. Later phases add a read-only queue UI and then guided admin workflows.
+
+The UI must explain scheduling outcomes in terms of:
+
+- fair-share position
+- workload priority
+- quota exhaustion
+- insufficient resource availability
+- policy rejection
+
+## Consequences
+
+The architecture gains a clear product surface instead of assuming that raw
+conditions or controller logs are enough.
+
+This also creates a requirement for the control service to expose tenant and
+override metadata to the UI.
+
+## Alternatives considered
+
+- Delay visibility until after scheduling is complete
+- Rely only on Kubernetes-native object inspection
+
+These alternatives make correct policy look opaque to most users.
diff --git a/configs/kueue/docs/adrs/ADR-010-single-cluster-now-multikueue-later.md b/configs/kueue/docs/adrs/ADR-010-single-cluster-now-multikueue-later.md
new file mode 100644
index 00000000..c4e2cc0d
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-010-single-cluster-now-multikueue-later.md
@@ -0,0 +1,36 @@
+# ADR-010: Single-cluster now, MultiKueue-ready later
+
+- Status: Accepted
+- Date: March 12, 2026
+
+## Context
+
+CANFAR wants a design that can grow into a multi-cluster model, but the team
+also needs to prove the policy model, control-plane behavior, and backlog
+health in a single cluster before adding federation complexity.
+
+## Decision
+
+Build for a single-cluster production target first. Preserve a vocabulary and
+resource model that remain compatible with future MultiKueue manager and worker
+clusters.
+
+Do not make MultiKueue a phase 1 requirement. Treat it as a later phase that is
+triggered by proven need, such as API-server pressure, site-level separation, or
+distinct hardware pools that require worker-cluster sharding.
+
+## Consequences
+
+The rollout remains practical while still avoiding a dead-end tenancy model.
+
+The architecture package must still document manager-worker deployment patterns,
+future quota mapping, and operational risks so later federation does not become
+a redesign exercise.
+
+## Alternatives considered
+
+- Design only for one cluster with no future federation path
+- Make MultiKueue a near-term mandatory target
+
+The first option creates future migration pain. The second option adds too much
+operational complexity before the single-cluster model is proven.
diff --git a/configs/kueue/docs/adrs/ADR-011-protected-persistent-and-interactive-workloads.md b/configs/kueue/docs/adrs/ADR-011-protected-persistent-and-interactive-workloads.md
new file mode 100644
index 00000000..a6ad5bc8
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-011-protected-persistent-and-interactive-workloads.md
@@ -0,0 +1,40 @@
+# ADR-011: Protected persistent and interactive workloads
+
+- Status: Accepted
+- Date: March 12, 2026
+
+## Context
+
+CANFAR does not run only batch jobs. The platform also runs interactive sessions
+and persistent user-facing services that users experience directly.
+
+## Decision
+
+Keep persistent and interactive work in the same overall Kueue-managed system,
+but treat them as protected workload classes rather than ordinary best-effort
+batch.
+
+Inside each project:
+
+- interactive work has higher workload priority than batch work
+- persistent workloads are protected and must not be treated as cheap
+  preemption targets
+
+The exact controller integrations may mature in phases, but the policy model is
+fixed now.
+
+## Consequences
+
+The platform retains a unified tenant and fairness model while still protecting
+the user experience for long-lived or interactive work.
+
+Operators must still watch for cases where the single-plane model causes too
+much contention and then adjust protection or namespace boundaries later.
+
+## Alternatives considered
+
+- Keep all persistent and interactive work outside Kueue
+- Treat persistent and interactive work exactly like best-effort batch
+
+The first option breaks the unified policy model. The second option creates
+avoidable user pain and preemption risk.
diff --git a/configs/kueue/docs/adrs/ADR-012-scale-testing-and-operational-evidence.md b/configs/kueue/docs/adrs/ADR-012-scale-testing-and-operational-evidence.md
new file mode 100644
index 00000000..3bc076b2
--- /dev/null
+++ b/configs/kueue/docs/adrs/ADR-012-scale-testing-and-operational-evidence.md
@@ -0,0 +1,41 @@
+# ADR-012: Scale-testing and operational evidence
+
+- Status: Accepted
+- Date: March 12, 2026
+
+## Context
+
+The target system must support large pending backlog without relying on guesses
+about safe operating thresholds. Kueue behavior, Kubernetes API behavior, and
+the submission path must be measured together.
+
+## Decision
+
+Use benchmark-driven operational evidence as a release gate. `kueuer`,
+Prometheus, Grafana, and controlled workload storms are required parts of the
+platform rollout, not optional afterthoughts.
+
+The evidence model must include:
+
+- scale gates at increasing backlog levels
+- workload admission timing and throughput
+- controller health and restart behavior
+- API server latency and saturation
+- visibility API behavior under backlog
+- user-path latency through `Skaha`
+
+## Consequences
+
+The platform gets explicit stop-or-go thresholds for production changes rather
+than relying on anecdotal experience.
+
+This also means roadmap phases need objective exit criteria and repeatable test
+artifacts.
+
+## Alternatives considered
+
+- Tune Kueue and trust best-effort observations
+- Measure only Kueue controller metrics
+
+These alternatives fail to prove end-to-end system behavior under realistic
+submission pressure.
diff --git a/configs/kueue/docs/adrs/README.md b/configs/kueue/docs/adrs/README.md
new file mode 100644
index 00000000..b09e1e4c
--- /dev/null
+++ b/configs/kueue/docs/adrs/README.md
@@ -0,0 +1,28 @@
+# Architecture Decision Records
+
+This directory contains the architecture decision records for the CANFAR Science Platform Kueue
+design.
+
+## ADR index
+
+| ADR | Title | Status |
+| --- | ----- | ------ |
+| [ADR-001](./ADR-001-kueue-adoption.md) | Why CANFAR adopts Kueue | Accepted |
+| [ADR-002](./ADR-002-phase-1-workload-apis.md) | Supported workload APIs in phase 1 | Accepted |
+| [ADR-003](./ADR-003-shared-workloads-namespace.md) | Shared `workloads` namespace now, namespace evolution later | Accepted |
+| [ADR-004](./ADR-004-standalone-control-service.md) | Standalone accounting and control service | Accepted |
+| [ADR-005](./ADR-005-fairness-priority-and-preemption.md) | Fairness, workload priority, and preemption model | Accepted |
+| [ADR-006](./ADR-006-posix-group-project-mapping.md) | POSIX group to project mapping options | Proposed |
+| [ADR-007](./ADR-007-resource-flavor-taxonomy.md) | ResourceFlavor taxonomy and topology model | Accepted |
+| [ADR-008](./ADR-008-queue-enforcement-and-managed-namespaces.md) | Queue enforcement and managed namespace model | Accepted |
+| [ADR-009](./ADR-009-visibility-and-ui-scope.md) | Visibility and UI scope | Accepted |
+| [ADR-010](./ADR-010-single-cluster-now-multikueue-later.md) | Single-cluster now, MultiKueue-ready later | Accepted |
+| [ADR-011](./ADR-011-protected-persistent-and-interactive-workloads.md) | Protected persistent and interactive workloads | Accepted |
+| [ADR-012](./ADR-012-scale-testing-and-operational-evidence.md) | Scale-testing and operational evidence | Accepted |
+
+## How to use the ADR set
+
+Read the ADRs in numerical order the first time. After that, use the table
+above to jump to the specific decision you need. Only ADR-006 remains open in
+this package. All other ADRs define the current architectural intent for the
+target rollout.
diff --git a/configs/kueue/docs/architecture.md b/configs/kueue/docs/architecture.md
new file mode 100644
index 00000000..a600e8bb
--- /dev/null
+++ b/configs/kueue/docs/architecture.md
@@ -0,0 +1,545 @@
+# CANFAR Science Platform Kueue Architecture
+
+This document is the primary architecture reference for the Kueue rollout on the CANFAR Science
+Platform. It captures the current baseline, the target
+single-cluster architecture, and the future-ready extension points for
+MultiKueue, topology-aware scheduling, and richer tenant control.
+
+Use this document together with [roadmap.md](./roadmap.md), [operations.md](./operations.md),
+[ui-spec.md](./ui-spec.md), and [the ADR index](./adrs/README.md). The existing
+deep-dive reference remains useful background material, but this document is the
+system design source of truth.
+
+## 1. Introduction
+
+This section defines the problem, the user groups, and the success criteria for
+the Kueue architecture on the CANFAR Science Platform.
+
+### 1.1 Problem statement
+
+The CANFAR Science Platform needs a batch and tenant-admission layer for Kubernetes
+that can absorb very large pending backlogs while still giving users predictable
+submission behavior and clear explanations for delay. The platform must support
+interactive sessions, persistent user-facing services, MPI and distributed jobs,
+and very large numbers of batch jobs in the same managed environment.
+
+The design target is not only to admit work fairly. It must also preserve
+control-plane health when the backlog grows to `100,000`, `200,000`, or more
+pending jobs or workload objects, while the active execution footprint remains
+much smaller.
+
+### 1.2 Users and stakeholders
+
+The architecture serves the following groups:
+
+- Science users who launch notebooks, interactive tools, batch jobs, and
+  distributed workloads through `Skaha`
+- Project administrators who manage projects and group membership inside a
+  community
+- Cluster system administrators who own infrastructure, Kueue policy, and
+  emergency controls
+- Platform operators who need strong observability, clear runbooks, and safe
+  rollout procedures
+- Future control-service operators who will manage community ownership, project
+  metadata, and accounting relationships
+
+### 1.3 Success criteria
+
+The design is successful when the platform can:
+
+- Admit and manage work fairly across communities and projects
+- Prioritize interactive work over batch work inside each project
+- Preserve community ownership of resources while still borrowing idle capacity
+- Explain pending or rejected work in terms users and admins can act on
+- Support at least one shared production workload namespace now
+- Stay compatible with future namespace splits, MultiKueue, and topology-aware
+  scheduling
+- Produce benchmark evidence and operational thresholds rather than relying on
+  informal tuning
+
+### 1.4 Quality goals
+
+The top quality goals for this system are:
+
+- Fairness: projects compete fairly within a community, and communities reclaim
+  owned resources within a cohort
+- Transparency: users and admins can see why work is pending, admitted,
+  preempted, or rejected
+- Reliability: the Kueue control plane, Kubernetes API, and visibility surfaces
+  remain healthy under heavy pending backlog
+- Operability: cluster admins can pause, drain, observe, and roll back changes
+  safely
+- Scalability: the design can grow from a single-cluster deployment into a
+  manager and worker model without rewriting the tenancy model
+
+## 2. Constraints
+
+This section captures the hard realities that shape the design.
+
+### 2.1 Repository and deployment baseline
+
+The current repository baseline still reflects an older Kueue deployment:
+
+- Current deployment documents `0.11.6` as the installed release
+- Current deployment uses `config.kueue.x-k8s.io/v1beta1`
+- Current deployment uses `batch/job` only
+- Current deployment references `skaha-workload` and `canfar-b-workload` rather than a single shared `workloads` namespace
+
+This architecture therefore distinguishes three states:
+
+- Current state: the repository and deployed baseline today
+- Target state: the single-cluster architecture described in this package
+- Future state: capabilities that remain out of scope for phase 1 but must stay
+  compatible with the design
+
+### 2.2 Identity and governance constraints
+
+The current identity hierarchy is:
+
+`Community -> Project -> POSIX Group -> User`
+
+This means the scheduler cannot be the system of record for tenant structure.
+Kueue needs an external control service to provide authoritative definitions for
+communities, projects, project-to-group mappings, and later accounting policy.
+
+### 2.3 Namespace constraint
+
+The target design uses one shared `workloads` namespace for Kueue-managed user
+work now. `LocalQueue` objects are still project-scoped, but they exist in that
+shared namespace and are created on demand.
+
+The architecture must also stay compatible with future namespace evolution, such
+as:
+
+- one namespace per community
+- one namespace per workload class, such as `gpu-workloads` or
+  `batch-workloads`
+
+Those namespace changes are future roadmap items, not phase 1 requirements.
+
+### 2.4 Operational constraints
+
+The system must work in an environment where:
+
+- the cluster is managed by CADC
+- users mostly submit via `Skaha`, not by direct raw Kubernetes access
+- batch backlog can exceed active capacity by two or more orders of magnitude
+- interactive and persistent work cannot be treated as disposable noise
+- the real bottleneck may be Kueue, the Kubernetes API server, etcd, or the
+  submission path, not just one controller setting
+
+## 3. Context and scope
+
+This section defines the system boundary and the key external actors.
+
+### 3.1 System scope
+
+The architecture covers:
+
+- `Skaha` submission and workload resolution behavior
+- Kueue queue, quota, flavor, and priority objects
+- the future standalone accounting and control service
+- user, project admin, and cluster admin visibility surfaces
+- benchmark and operations evidence for scale and correctness
+
+The architecture does not implement:
+
+- the control service itself
+- the accounting penalty model for temporary fair-share overrides
+- a production user interface implementation
+- a MultiKueue production rollout in phase 1
+
+### 3.2 Communities and cohorts
+
+Resources such as CPU, memory, storage, GPUs, network bandwidth, and I/O are
+owned at the community level. Communities are implemented as `ClusterQueue`
+objects. Multiple communities together form a cohort and may lend or borrow
+capacity from one another.
+
+The initial named communities used in examples and diagrams are:
+
+- `cadc`
+- `ska`
+- `chimefrb`
+
+### 3.3 Projects and local queues
+
+Projects exist inside a community and are implemented as `LocalQueue` objects.
+`LocalQueue` creation is project-scoped and on demand. The `LocalQueue`
+therefore represents the scheduler-facing identity of a project inside a shared
+community resource pool.
+
+This has one important consequence: project fair sharing is intra-community.
+Projects compete fairly with other projects that target the same community
+`ClusterQueue`. Cross-community competition is governed by `ClusterQueue`
+policies and the cohort, not by one global project fair-share plane.
+
+## 4. Solution strategy
+
+This section states the high-level design strategy and the main trade-offs.
+
+### 4.1 Scheduler model
+
+Kueue acts as the admission and quota orchestration layer. Kubernetes still
+performs pod placement and runtime scheduling. Kueue decides when work can enter
+the active scheduling plane and under what quota, flavor, and priority rules.
+
+### 4.2 Tenancy model
+
+The target tenancy mapping is:
+
+- Community = `ClusterQueue`
+- Project = `LocalQueue`
+- Multiple communities = one or more `Cohort` relationships
+
+This model gives CANFAR the following properties:
+
+- community-owned resources remain a first-class concept
+- communities can lend and borrow unused capacity
+- projects compete fairly inside their community
+- workload class ordering stays project-local through priority
+- the control service remains the source of truth for project and group mapping
+
+### 4.3 Fairness and priority model
+
+The scheduling and fairness model is split deliberately:
+
+- Within a cohort of communities, communities borrow and reclaim through
+  `ClusterQueue` and cohort policy.
+- Within a single community, projects compete using Admission Fair Sharing,
+  driven by `LocalQueue` historical usage and adjustable weights.
+- Within a project, workloads are ordered by `WorkloadPriorityClass`.
+
+This preserves an important operational rule: interactive work wins inside a
+project, but not at the cost of pretending that project fair-share history does
+not exist. A project that has consumed a great deal of recent capacity can still
+feel the effect of fair-share decay, even if its next workload is interactive.
+
+### 4.4 Control service strategy
+
+Kueue is not the tenant authority. A future standalone accounting and control
+service is required to manage:
+
+- community definitions and resource ownership
+- project creation inside a community
+- project-to-group mappings
+- quota and usage relationships outside raw Kueue quota semantics
+- temporary fair-share override workflow for project admins and cluster admins
+
+This service is out of scope for phase 1 implementation, but it is in scope for
+architecture and requirements.
+
+## 5. Building block view
+
+This section describes the main building blocks of the target system.
+
+### 5.1 Submission plane
+
+The submission plane consists of:
+
+- `Skaha` as the main user-facing submission path
+- a queue and policy resolution layer inside `Skaha`
+- the future control service for project, group, and ownership lookup
+
+`Skaha` accepts a user request, resolves the effective project and community,
+selects the correct `LocalQueue`, attaches the required labels, and creates the
+workload object in the shared `workloads` namespace.
+
+### 5.2 Queueing plane
+
+The queueing plane consists of Kueue CRDs and controller behavior:
+
+- `ResourceFlavor` for cluster, zone, hardware, and future topology identity
+- `ClusterQueue` for community-owned quota and preemption rules
+- `LocalQueue` for project-level competition within a community
+- `WorkloadPriorityClass` for workload-class ordering within a project
+- `Cohort` for sharing and reclaim between communities
+
+### 5.3 Execution plane
+
+The execution plane remains native Kubernetes:
+
+- Kubernetes API server and etcd store all workload state
+- the default scheduler places admitted pods
+- workload-specific controllers, such as those for `Job`, `JobSet`, `MPIJob`,
+  or Ray workloads, drive runtime behavior after admission
+
+### 5.4 Visibility plane
+
+The visibility plane combines several sources:
+
+- Kueue Prometheus metrics
+- the Kueue pending-workloads visibility API
+- Kubernetes and apiserver metrics
+- a future UI for user and admin-facing queue explanations
+
+The architecture uses these sources to explain whether a workload is delayed
+because of fair-share position, workload priority, resource shortage, quota
+exhaustion, or policy rejection.
+
+### 5.5 Control service requirements
+
+The future standalone accounting and control service must support:
+
+- cluster admins creating and managing communities
+- delegated project admins creating projects within a community
+- attaching one or more POSIX groups to a project
+- resolving a user or group to the correct project at submission time, or
+  requiring explicit project selection when the mapping model is ambiguous
+- tracking usage and quota relationships that are out of scope for Kueue alone
+- handling temporary fair-share override requests
+
+The service must expose enough state for both `Skaha` and the user visibility
+surface. It does not need to become the workload execution system.
+
+## 6. Runtime view
+
+This section describes the key runtime behaviors of the target system. Detailed
+sequence diagrams are provided in [diagrams.md](./diagrams.md).
+
+### 6.1 Submit and admit flow
+
+The normal runtime path is:
+
+1. A user submits work through `Skaha`.
+2. `Skaha` resolves the effective project and community through internal policy
+   today and, in later phases, through the control service. If the mapping model
+   is ambiguous, the submission path requires explicit project selection.
+3. `Skaha` selects a `LocalQueue`, applies the required labels, and creates the
+   Kubernetes workload object in `workloads`.
+4. Kueue creates or updates the corresponding `Workload`.
+5. Kueue evaluates quota, flavors, priority, and admission checks.
+6. Once quota is reserved and admission is satisfied, Kueue admits the
+   workload.
+7. Kubernetes schedules the pods and the native controller runs them.
+
+### 6.2 Community borrowing and reclaim
+
+When one community is idle and another is busy, the busy community can borrow
+from the cohort. When the idle community becomes active again, Kueue reclaims
+nominal quota according to cohort preemption policy.
+
+This behavior is community-scoped. It is not project-scoped reclaim. Project
+competition inside a community is governed separately through `LocalQueue`
+fairness and workload priority.
+
+### 6.3 Priority and fair-share interaction
+
+Project fair-sharing decides which project gets the next chance to consume
+community quota. Workload priority decides which workload from that project is
+selected first.
+
+This means:
+
+- higher-priority interactive work can win against lower-priority batch work
+  inside the same project
+- a project with high recent usage can still wait behind a project with lower
+  recent usage
+- temporary fair-share weight adjustments change the project's relative share,
+  not just the priority of one workload
+
+### 6.4 Pending-state explanation
+
+The platform must expose actionable pending reasons. The user-facing explanation
+model uses the following categories:
+
+- waiting behind higher fair-share demand from other projects in the same
+  community
+- waiting behind higher-priority workloads in the same project
+- blocked because owned or borrowed resources are not currently available
+- blocked because the project or submission policy rejected the request
+- blocked because the control plane is degraded and admission is not keeping up
+
+## 7. Deployment view
+
+This section describes the target single-cluster deployment and the future
+expansion path.
+
+### 7.1 Current target deployment
+
+The target single-cluster deployment includes:
+
+- one Kueue control plane in `kueue-system`
+- one shared `workloads` namespace for Kueue-managed user workloads
+- one `ClusterQueue` per community, such as `cadc`, `ska`, and `chimefrb`
+- one or more project `LocalQueue` objects per community, created on demand
+- `ResourceFlavor` objects that capture cluster, zone, GPU class, and later
+  topology-aware placement
+- one monitoring stack with Prometheus and Grafana
+
+### 7.2 Namespace evolution
+
+The architecture preserves compatibility with future namespace splits. Likely
+future directions are:
+
+- namespace per community
+- namespace per workload class
+- mixed namespace policy for highly specialized resources
+
+The scheduler model and control service model remain valid under any of these
+future namespace layouts. The main affected areas are `LocalQueue` ownership,
+RBAC, and visibility scope.
+
+### 7.3 Multi-cluster future
+
+The future-state expansion path is MultiKueue, with one manager cluster and one
+or more worker clusters. The manager and workers must preserve the same
+community, project, and flavor vocabulary so the single-cluster tenant model can
+evolve rather than be replaced.
+
+## 8. Cross-cutting concepts
+
+This section captures concepts that shape multiple parts of the design.
+
+### 8.1 Workload taxonomy
+
+The architecture recognizes the following workload classes:
+
+- Interactive sessions such as notebooks and user-facing interactive tools
+- Persistent user-facing services or deployments
+- Standard batch `Job`
+- Indexed `Job` for large independent fan-out workloads
+- `JobSet` and MPI-style grouped or distributed jobs
+- `RayJob`, `RayCluster`, and related distributed compute workloads
+- Cron-triggered batch work
+- Plain Pod or exception paths only when a better controller does not exist
+
+Phase 1 does not need to productionize every Kueue integration, but the
+taxonomy must be documented now because it affects priority, fairness, and
+observability semantics.
+
+### 8.2 Resource ownership and flavors
+
+Communities own resources. Kueue expresses that ownership through `ClusterQueue`
+quota and `ResourceFlavor` taxonomy. The flavor model must support:
+
+- CPU and memory
+- storage classes and local storage distinctions when relevant
+- GPU model and accelerator pool identity
+- cluster and zone placement
+- future topology domains such as rack, block, or GPU island
+
+### 8.3 Governance and enforcement
+
+Managed workloads must set a queue explicitly or be assigned one by the
+submission service. The target governance model is:
+
+- users submit through `Skaha`
+- `Skaha` resolves `LocalQueue` assignment
+- managed namespaces reject malformed or unqueued submissions
+- cluster admins retain emergency controls for stopping or draining queues
+
+### 8.4 Protected workloads
+
+Interactive and persistent workloads remain part of the same Kueue-managed
+system, but they are not treated the same way as best-effort batch. Persistent
+workloads are a protected class. Interactive work holds higher workload priority
+than batch inside each project.
+
+The architecture deliberately leaves room for stricter protection later if
+evidence shows that the single-plane model causes avoidable user pain.
+
+### 8.5 Temporary fair-share overrides
+
+Project admins may request temporary fair-share overrides through a future UI
+and control-service workflow. Only cluster admins can approve or apply those
+changes.
+
+The downstream accounting model for the quota or usage cost of those overrides
+is explicitly out of scope here. This architecture only records the requirement
+that the accounting model must exist and must remain visible to admins.
+
+## 9. Quality requirements
+
+This section turns the earlier goals into concrete architecture obligations.
+
+### 9.1 Reliability
+
+The design must keep Kueue and the Kubernetes API healthy under load. The
+architecture therefore requires:
+
+- benchmark-driven thresholds
+- explicit rollout and rollback procedures
+- visibility into queue backlog and controller saturation
+
+### 9.2 Transparency
+
+The design must expose queue state in ways users can understand without reading
+controller logs or raw conditions. The UI and observability model therefore must
+present effective fair-share state, workload priority, ownership boundaries, and
+pending reasons clearly.
+
+### 9.3 Scalability
+
+The design must support large pending backlogs without assuming that listing all
+raw objects remains cheap. This is why the architecture treats visibility APIs,
+metrics, and summary views as primary interfaces rather than optional extras.
+
+### 9.4 Operability
+
+Cluster admins must be able to:
+
+- inspect community and project usage quickly
+- pause or drain queues safely
+- identify whether bottlenecks are in Kueue, the Kubernetes API, or the
+  submission path
+- run repeatable benchmark suites before changing policy or scale assumptions
+
+## 10. Risks and technical debt
+
+This section identifies the major known risks.
+
+- The current repository baseline is behind the target capability set and will
+  require careful Kueue upgrade work before all target features are usable.
+- Fairness is easy to misinterpret. Users may perceive a correct fair-share
+  result as unfair if the UI does not explain recent usage and priority clearly.
+- The single shared namespace is operationally simple now, but it may become a
+  pressure point for RBAC and visibility policy as more workload types are
+  onboarded.
+- Flexible workloads with changing resource shape can challenge request-based
+  admission semantics and will need careful validation before broader use.
+- The control service is required by the architecture but remains out of scope
+  for immediate implementation, so integration points must stay explicit.
+- MultiKueue and topology-aware scheduling remain future capabilities with
+  meaningful operational constraints and must not be promised as phase 1
+  features.
+
+## 11. Decisions summary
+
+This section summarizes the architectural decisions captured in detail by the
+ADR set.
+
+- Use Kueue as the admission and quota orchestration system.
+- Model communities as `ClusterQueue` objects and projects as `LocalQueue`
+  objects.
+- Keep one shared `workloads` namespace now and treat namespace evolution as a
+  future roadmap item.
+- Use a standalone accounting and control service as the future system of
+  record.
+- Use fair-share weights for project competition and workload priority for
+  project-internal ordering.
+- Keep persistent workloads protected and interactive workloads higher priority
+  than batch workloads.
+- Preserve a MultiKueue-ready vocabulary even while deploying single-cluster
+  first.
+
+See [the ADR index](./adrs/README.md) for the full decision log.
+
+## 12. Glossary
+
+This glossary standardizes the key terms used across the package.
+
+- Community: the top-level resource owner in CANFAR; implemented as a
+  `ClusterQueue`
+- Cohort: a set of `ClusterQueue` objects that may lend or borrow capacity
+- Project: the scheduler-facing tenant inside a community; implemented as a
+  `LocalQueue`
+- POSIX group: an identity grouping used for project membership and submission
+  resolution
+- Workload class: a platform-level category such as interactive, persistent, or
+  batch
+- Fair-share weight: the configurable project weighting used for community-local
+  project competition
+- Workload priority: the ordering signal used inside a project's queue
+- Control service: the future standalone service that stores communities,
+  projects, mappings, and accounting relationships
diff --git a/configs/kueue/docs/operations.md b/configs/kueue/docs/operations.md
new file mode 100644
index 00000000..0595f6ba
--- /dev/null
+++ b/configs/kueue/docs/operations.md
@@ -0,0 +1,258 @@
+# CANFAR Kueue operations appendix
+
+This document defines the operational model for the CANFAR Kueue architecture.
+It covers SLOs, observability, alerting, runbooks, rollout, and rollback. Use
+it together with [architecture.md](./architecture.md), [roadmap.md](./roadmap.md),
+[ui-spec.md](./ui-spec.md), and [the ADR index](./adrs/README.md).
+
+## 1. Operational objectives
+
+The Kueue platform must remain understandable and recoverable under pressure.
+Operators need to know whether the problem is in Kueue, the Kubernetes API, the
+submission path, or the workload mix itself.
+
+This appendix therefore focuses on:
+
+- service-level objectives
+- the metrics and dashboards that support those objectives
+- incident runbooks
+- safe rollout and rollback mechanics
+
+## 2. Service Level Indicators (SLIs) and Objectives (SLOs)
+
+The following SLOs form the initial operating contract. They are subject to
+refinement after benchmark evidence is gathered.
+
+### 2.1 Submission and admission SLOs
+
+| SLI | Target |
+| --- | ------ |
+| `Skaha` successful submission response rate | `>= 99.9%` over 30 days |
+| `Skaha` P95 create-session latency under nominal load | `<= 2s` |
+| Kueue P95 admission wait for interactive workloads under nominal conditions | `<= 30s` |
+| Kueue P95 admission wait for standard batch under nominal backlog | `<= 10m` |
+| Visibility API P95 latency for paged pending queries | `<= 2s` |
+
+### 2.2 Control-plane health SLOs
+
+| SLI | Target |
+| --- | ------ |
+| Kueue controller availability | `>= 99.95%` over 30 days |
+| Unplanned controller crash loops | `0` |
+| Kubernetes API P99 write latency for workload creation under nominal load | `<= 1s` |
+| Kubernetes API P99 read latency for queue visibility queries | `<= 1s` |
+
+### 2.3 Fairness and stability SLOs
+
+| SLI | Target |
+| --- | ------ |
+| Unexplained pending-state responses in UI | `0` |
+| Preemptions without user-visible reason category | `0` |
+| Queue-policy changes without rollback path | `0` |
+| Benchmark-backed backlog ceiling published and current | Required |
+
+## 3. Metrics and dashboards
+
+The platform needs dashboards that show both tenant policy and control-plane
+health. Kueue metrics alone are not enough.
+
+### 3.1 Required metrics
+
+Use Kueue, Kubernetes, and platform metrics together. Important Kueue metrics
+include:
+
+- `kueue_pending_workloads`
+- `kueue_admission_wait_time_seconds`
+- `kueue_cluster_queue_resource_usage`
+- `kueue_cluster_queue_nominal_quota`
+- `kueue_evicted_workloads_total`
+- `kueue_local_queue_evicted_workloads_total`
+- `kueue_admitted_workloads_total`
+- `kueue_finished_workloads_total`
+
+Important Kubernetes and control-plane metrics include:
+
+- apiserver request latency and error metrics
+- etcd latency and saturation indicators
+- controller pod restart count and RSS memory
+- scheduler latency for admitted workloads
+- `Skaha` request latency and error rate
+
+### 3.2 Required dashboards
+
+Create the following dashboards at minimum:
+
+- Community ownership and quota view
+- Project fair-share and queue-position view
+- Pending workload health by class and queue
+- Admission latency and throughput
+- Preemption and eviction reason view
+- Controller and API-server health
+- End-to-end submission latency
+
+### 3.3 Ownership
+
+Dashboard ownership must be explicit:
+
+- Platform team owns Kueue, Kubernetes API, and rollout health dashboards
+- `Skaha` owners own submission-path latency and error dashboards
+- Future control-service owners own tenant and override workflow dashboards
+
+## 4. Alerts
+
+Alerts must drive action, not noise. Each alert needs a linked runbook.
+
+### 4.1 High-severity alerts
+
+- Kueue controller unavailable or crash looping
+- Kubernetes API write latency above threshold for sustained periods
+- Visibility API unavailable or returning sustained errors
+- Preemption storm above agreed threshold
+- Shared `workloads` namespace submission failures above threshold
+
+### 4.2 Medium-severity alerts
+
+- Kueue controller memory growth beyond normal envelope
+- Pending backlog growing while admission throughput collapses
+- Community reclaim behavior not restoring owned capacity in time
+- Fair-share override still active past approved window
+
+### 4.3 Low-severity alerts
+
+- Project or community configuration drift
+- Dashboard ingestion gaps
+- Non-critical benchmark regression signals
+
+## 5. Runbooks
+
+These runbooks define the minimum operational response set.
+
+### 5.1 API slowness or admission collapse
+
+Symptoms:
+
+- `Skaha` create requests slow down
+- Kueue admission wait time rises across multiple workload classes
+- Kubernetes API latency or error rate rises
+
+Actions:
+
+1. Confirm whether the bottleneck is `Skaha`, Kueue, apiserver, or etcd.
+2. Check Kueue controller restarts, memory, and work-queue saturation.
+3. Check apiserver latency and request load by verb and resource.
+4. Reduce new batch pressure if the submission path is part of the problem.
+5. Capture the incident state for later benchmark comparison.
+
+### 5.2 Kueue memory pressure
+
+Symptoms:
+
+- controller RSS grows rapidly
+- controller pod restarts or is OOM-killed
+
+Actions:
+
+1. Inspect backlog size, active queue count, and recent submission bursts.
+2. Check whether API latency is causing request buildup.
+3. Reduce submission pressure if required.
+4. Apply emergency queue hold or drain policy only if the system cannot recover
+   safely.
+5. Record controller configuration and backlog context for follow-up analysis.
+
+### 5.3 Queue stall or unfairness complaint
+
+Symptoms:
+
+- users report that workloads are not moving
+- one project appears to dominate or starve unexpectedly
+
+Actions:
+
+1. Inspect the affected `LocalQueue` and community `ClusterQueue`.
+2. Confirm project fair-share weights and recent usage history.
+3. Confirm workload priorities inside the project.
+4. Confirm whether the issue is community reclaim, project fair share, or lack
+   of physical resources.
+5. Communicate the cause using the standard pending explanation categories.
+
+### 5.4 Preemption storm
+
+Symptoms:
+
+- rapid increase in preempted or evicted workloads
+- user complaints about unstable interactive or persistent work
+
+Actions:
+
+1. Confirm whether the preemptions are due to community reclaim, project-local
+   priority, or protection policy misconfiguration.
+2. Check recent weight overrides and policy changes.
+3. Pause further policy rollout until the behavior is understood.
+4. Revert the triggering policy if needed.
+
+### 5.5 Visibility failure
+
+Symptoms:
+
+- UI cannot explain pending reasons
+- pending-workloads visibility API times out or returns errors
+
+Actions:
+
+1. Confirm whether the failure is in the UI, metrics layer, or visibility API.
+2. Fall back to `kubectl` and Grafana-based diagnosis.
+3. Restore the visibility service before resuming major policy changes.
+
+## 6. Rollout
+
+Every Kueue upgrade or policy change must follow a controlled rollout.
+
+### 6.1 Rollout steps
+
+1. Validate the change in a non-production environment.
+2. Capture pre-change benchmarks and key health metrics.
+3. Apply the change through the source-of-truth deployment method.
+4. Watch controller health, admission timing, and API latency.
+5. Verify queue visibility and pending explanation behavior.
+6. Run a defined smoke test for interactive and batch submission.
+7. Mark the rollout complete only after the health window closes cleanly.
+
+### 6.2 Rollout guardrails
+
+- Do not change fairness and preemption policy during an unresolved incident.
+- Do not combine Kueue upgrades with unrelated tenant policy changes unless the
+  rollback method covers both.
+- Do not announce a new backlog ceiling without fresh benchmark evidence.
+
+## 7. Rollback
+
+Rollback must be designed before rollout begins.
+
+### 7.1 Rollback triggers
+
+- repeated controller restarts
+- sustained admission collapse
+- unacceptable API latency regression
+- broken queue visibility
+- unexpected preemption of protected workload classes
+
+### 7.2 Rollback steps
+
+1. Stop further policy changes.
+2. Revert the deployment or configuration to the last known-good state.
+3. Confirm controller stability and API recovery.
+4. Verify that queue visibility works after the revert.
+5. Record the incident details and the exact rollback trigger.
+
+## 8. Evidence and reporting
+
+Operational claims must be backed by evidence. Keep the following artifacts for
+major changes:
+
+- benchmark results and plots
+- controller and API latency dashboards
+- admission wait time summaries by workload class
+- preemption and eviction summaries
+- incident notes and rollback evidence when applicable
+
+These artifacts are release inputs, not optional attachments.
diff --git a/configs/kueue/docs/reportstyle.markdown b/configs/kueue/docs/reportstyle.markdown
new file mode 100644
index 00000000..6604129e
--- /dev/null
+++ b/configs/kueue/docs/reportstyle.markdown
@@ -0,0 +1,303 @@
+# CANFAR architecture report style
+
+This document captures the report-writing preferences inferred from the current
+CANFAR Kueue architecture package. It is based on the edited state of
+`architecture.md`, `roadmap.md`, `operations.md`, `ui-spec.md`, and the ADR
+set.
+
+The goal is not to define a generic technical writing style. The goal is to
+capture the specific way these CANFAR architecture and planning reports are
+meant to read.
+
+## 1. Core writing motivation
+
+The core motivation behind this report style is clarity for decision-makers,
+operators, and technical reviewers.
+
+These reports are not meant to be literary, conversational, or highly abstract.
+They are meant to:
+
+- explain a system design clearly
+- support architectural review and technical decision-making
+- preserve operational intent
+- make scope and ownership boundaries obvious
+- connect policy, implementation, and future roadmap in one narrative
+
+The writing therefore favors directness, structure, and explicit reasoning over
+personality, flourish, or overly soft phrasing.
+
+## 2. High-level style traits
+
+The current edits consistently show the following preferences.
+
+### 2.1 Formal report framing
+
+Document titles are framed as formal report titles, not lightweight notes. For
+example:
+
+- `CANFAR Science Platform Kueue Architecture`
+- `CANFAR Kueue Roadmap`
+- `Architecture Decision Records`
+
+This style treats each document as part of a reviewable architecture package.
+
+### 2.2 Direct, declarative tone
+
+The preferred tone is factual and assertive. Statements are written as design
+claims, requirements, or observations. The writing avoids unnecessary hedging
+and avoids casual filler.
+
+Preferred pattern:
+
+- "The current repository baseline still reflects an older Kueue deployment."
+- "The architecture serves the following groups."
+- "This phase focuses on making scheduling behavior understandable to users and
+  admins."
+
+Avoid:
+
+- conversational asides
+- rhetorical questions
+- motivational language
+- vague claims without operational meaning
+
+### 2.3 Platform-first wording
+
+The reports should speak about the CANFAR Science Platform as a real operating
+environment, not as a generic software project. The preferred writing makes the
+platform identity explicit and keeps the narrative centered on actual tenant,
+operator, and workload needs.
+
+Preferred pattern:
+
+- "The CANFAR Science Platform needs..."
+- "This document is the primary architecture reference for Kueue rollout on the
+  CANFAR Science Platform."
+
+## 3. Structural preferences
+
+The current edits show a strong preference for highly structured reports.
+
+### 3.1 Numbered major sections
+
+Use numbered top-level sections and numbered subsections. This makes the
+reports easy to review, annotate, and discuss in meetings.
+
+Preferred pattern:
+
+- `## 1. Introduction`
+- `### 1.1 Problem statement`
+- `### 1.2 Users and stakeholders`
+
+### 3.2 Title Case section headings
+
+The current edits move headings toward Title Case for important report
+subsections.
+
+Preferred pattern:
+
+- `Success Criteria`
+- `Quality Goals`
+- `Repository and Deployment Baseline`
+- `Service Level Indicators (SLIs) and Objectives (SLOs)`
+
+This indicates a preference for a formal report look over sentence-case prose
+headings.
+
+### 3.3 Short overview paragraph before lists
+
+Each section begins with a short framing paragraph before bullets or numbered
+items. This is important. The reports are not lists with headings attached. They
+are structured narratives that then break into lists.
+
+### 3.4 Parallel list structure
+
+Bullets are short, parallel, and information-dense. Lists are used to enumerate
+requirements, deliverables, activities, or consequences, not for decorative
+formatting.
+
+Preferred pattern:
+
+- "community-owned resources remain a first-class concept"
+- "communities can lend and borrow unused capacity"
+- "projects compete fairly inside their community"
+
+## 4. Content preferences
+
+The edits reveal several consistent content choices.
+
+### 4.1 Remove repo noise from narrative sections
+
+When discussing the current baseline, the preferred report style summarizes the
+state directly instead of embedding too many file-path references inside the
+main prose.
+
+Preferred pattern:
+
+- "Current deployment documents `0.11.6` as the installed release"
+- "Current deployment uses `batch/job` only"
+
+This keeps the main report readable. File-level evidence can still exist, but
+the report itself should read like an architecture document, not a code review.
+
+### 4.2 Keep implementation awareness without dropping into code detail
+
+The preferred style is technically specific, but not source-code heavy. It names
+real systems, CRDs, workloads, and policies, but it does not drown the report
+in low-level manifest detail unless the detail matters for a decision.
+
+### 4.3 Emphasize scope boundaries
+
+The reports should say clearly what is in scope, what is out of scope, and what
+is future work.
+
+Preferred pattern:
+
+- "This service is out of scope for phase 1 implementation, but it is in scope
+  for architecture and requirements."
+- "Those namespace changes are future roadmap items, not phase 1 requirements."
+
+This is one of the strongest recurring preferences in the edits.
+
+### 4.4 Make ambiguity explicit
+
+The report style does not hide open issues. It records them directly and
+operationally.
+
+Preferred pattern:
+
+- "The submission path can resolve project and community deterministically for
+  the selected mapping model, or require explicit project selection when the
+  mapping model is ambiguous."
+
+This suggests a strong preference for showing the real operational implication of
+an open decision rather than just saying that a question remains open.
+
+## 5. Communication style preferences
+
+The communication style is best described as formal, practical, and reviewable.
+
+### 5.1 Write for architects, operators, and reviewers
+
+The reports should read as if they are written for:
+
+- architecture reviewers
+- platform operators
+- technical leads
+- future implementers
+
+They should not read like marketing material or general onboarding content.
+
+### 5.2 Prefer precise nouns over expressive prose
+
+The edits favor terms like:
+
+- baseline
+- target state
+- future state
+- deliverables
+- dependencies
+- acceptance criteria
+- ownership
+- scope
+- consequences
+
+This reflects a preference for decision and execution vocabulary.
+
+### 5.3 Prefer explicit operational language
+
+When possible, use operationally meaningful wording:
+
+- "rollback"
+- "admission collapse"
+- "preemption storm"
+- "safe operating region"
+- "measured thresholds"
+
+This style grounds the report in real system behavior rather than abstract
+architecture theory.
+
+## 6. Writing rules to preserve this style
+
+Use these rules when writing future CANFAR architecture reports.
+
+### 6.1 Titles and headings
+
+- Use strong document titles that name the platform and the document purpose.
+- Use numbered sections.
+- Use Title Case for major subsection headings when the document is formal and
+  report-like.
+
+### 6.2 Paragraph style
+
+- Start sections with a short framing paragraph.
+- Keep paragraphs compact and high-signal.
+- Use direct statements.
+- Avoid unnecessary hedging unless uncertainty itself is the point.
+
+### 6.3 Lists
+
+- Use bullets for requirements, traits, risks, and deliverables.
+- Use numbered lists for flows, steps, or ordered explanations.
+- Keep list items parallel and concise.
+
+### 6.4 Scope and decisions
+
+- State what is current, what is target, and what is future.
+- Mark out-of-scope items clearly.
+- Record open decisions in a way that shows operational impact.
+- Tie every recommendation to either architecture, operations, or roadmap
+  intent.
+
+### 6.5 References and evidence
+
+- Keep the main report readable first.
+- Use file paths, manifests, and implementation references sparingly in the main
+  narrative.
+- Prefer summarized statements in the report body and detailed references in
+  supporting material.
+
+## 7. Preferred report voice
+
+The best voice for these reports is:
+
+- formal
+- technical
+- direct
+- practical
+- audit-friendly
+- operator-aware
+
+The voice is not:
+
+- casual
+- academic in an abstract sense
+- promotional
+- speculative without labeling the speculation
+
+## 8. Template for future reports
+
+Use the following pattern for future architecture or roadmap reports:
+
+1. Start with a direct statement of purpose.
+2. State the current baseline or current problem plainly.
+3. Define the target design or target operating model.
+4. Separate current, target, and future state clearly.
+5. Use lists for requirements, activities, risks, and acceptance criteria.
+6. Keep open decisions visible and explain their impact.
+7. End sections with operational implications, not just abstract conclusions.
+
+## 9. Summary
+
+This report-writing preference is not simply "formal technical writing." It is a
+specific style optimized for platform architecture review:
+
+- strong document framing
+- direct declarative language
+- explicit scope boundaries
+- operationally meaningful wording
+- high-structure sectioning
+- open decisions recorded with consequences
+- readable narrative without excessive repo-level clutter
+
+That combination is what gives the package its current voice.
diff --git a/configs/kueue/docs/roadmap.md b/configs/kueue/docs/roadmap.md
new file mode 100644
index 00000000..f683b259
--- /dev/null
+++ b/configs/kueue/docs/roadmap.md
@@ -0,0 +1,277 @@
+# CANFAR Kueue Roadmap
+
+This roadmap turns the architecture into an execution plan with measurable phase
+exits. Use it together with [architecture.md](./architecture.md), [operations.md](./operations.md),
+[ui-spec.md](./ui-spec.md), and [the ADR set](./adrs/README.md).
+
+## Roadmap principles
+
+This roadmap follows four principles:
+
+- Prove policy and control-plane behavior before claiming scale
+- Keep one shared `workloads` namespace first, but preserve future namespace and
+  MultiKueue compatibility
+- Separate what Kueue owns from what the future control service owns
+- Use evidence-based phase exits instead of subjective readiness claims
+
+## Phase 0: Architecture closure and upgrade prerequisites
+
+This phase closes the design, records the decisions, and prepares the current,
+older repository baseline for a safer Kueue upgrade.
+
+### Deliverables
+
+- Approved architecture package in `configs/kueue/docs`
+- Confirmed target vocabulary for community, project, cohort, flavor, and
+  workload class
+- Inventory of current repository gaps between the deployed baseline and the
+  target feature set
+- Upgrade preflight checklist for current Kueue controller configuration and
+  CRDs
+
+### Key activities
+
+- Validate the current Kueue deployment and CRD baseline against target
+  features such as Admission Fair Sharing, visibility APIs, and later topology
+  support
+- Confirm the initial communities and example queue structure for `cadc`, `ska`,
+  and `chimefrb`
+- Define the target `workloads` namespace and identify migration tasks from the
+  current namespace configuration
+- Confirm the initial workload-class vocabulary: interactive, persistent,
+  batch, and advanced distributed
+
+### Dependencies
+
+- Architecture approval
+- Access to current cluster configuration and deployment history
+
+### Acceptance criteria
+
+- The architecture package is merged and accepted as the design baseline
+- The upgrade preflight checklist exists and identifies all current config
+  mismatches
+- No unresolved architectural blocker remains except ADR-006
+
+## Phase 1: Core Kueue platform hardening
+
+This phase upgrades and hardens the Kueue control plane to support the target
+tenant model and observability baseline.
+
+### Deliverables
+
+- Updated Kueue installation aligned with the target feature baseline
+- Shared `workloads` namespace policy
+- Initial `ClusterQueue` objects for `cadc`, `ska`, and `chimefrb`
+- Initial `ResourceFlavor` taxonomy for cluster, zone, CPU, memory, and GPU
+- Prometheus and Grafana coverage for queue, controller, and API health
+
+### Key activities
+
+- Upgrade Kueue safely from the current, older release line
+- Enable the feature gates and controller settings needed for the target model
+- Move managed namespace scope to the shared `workloads` namespace
+- Introduce community `ClusterQueue` objects and a shared cohort
+- Standardize flavor naming and resource coverage
+- Turn on the visibility and metrics surfaces needed for later phase evidence
+
+### Dependencies
+
+- Phase 0 complete
+- Confirmed upgrade window and rollback method
+
+### Acceptance criteria
+
+- Kueue runs stably on the target baseline with no repeated controller crashes
+- Community `ClusterQueue` objects exist and report metrics cleanly
+- The visibility API and core queue metrics are reachable
+- Rollback to the previous deployment has been exercised in a non-production
+  environment
+
+## Phase 2: Tenancy and control-service integration points
+
+This phase establishes the community and project model operationally, even if
+the full control service is not implemented yet.
+
+### Deliverables
+
+- Project `LocalQueue` creation model in the shared namespace
+- Project fair-share weight policy and administrative workflow
+- Submission resolution rules from user and group context to project and
+  community
+- Requirements contract for the future standalone control service
+
+### Key activities
+
+- Define the project naming convention for `LocalQueue` objects
+- Establish how `Skaha` resolves the effective project today and later through
+  the control service
+- Introduce project fair-share weights with cluster-admin-only approval
+- Record the future control-service API and data needs
+- Decide how temporary override requests are surfaced and approved
+
+### Dependencies
+
+- Phase 1 complete
+- Administrative agreement on tenant naming and ownership boundaries
+
+### Acceptance criteria
+
+- New projects can be represented as `LocalQueue` objects on demand
+- Project weights are visible and adjustable by cluster admins
+- The submission path can resolve project and community deterministically for
+  the selected mapping model, or require explicit project selection when the
+  mapping model is ambiguous
+- The control-service requirements are specific enough to hand to a separate
+  implementation effort
+
+## Phase 3: Visibility UX and policy-aware diagnostics
+
+This phase focuses on making scheduling behavior understandable to users and
+admins.
+
+### Deliverables
+
+- Read-only queue explorer backed by Kueue visibility and metrics surfaces
+- Pending-state explanation model implemented in the UI or CLI layer
+- Resource ownership view by community and project
+- Temporary fair-share override request workflow design
+
+### Key activities
+
+- Expose `LocalQueue` and `ClusterQueue` visibility in user-facing terms
+- Show current effective project fair-share position and weight
+- Show workload priority and protected workload-class policy
+- Explain delays as one of the approved explanation categories
+- Provide cluster-admin visibility into community reclaim and borrowing state
+
+### Dependencies
+
+- Phase 2 complete
+- Stable visibility API and metrics from phase 1
+
+### Acceptance criteria
+
+- A user can inspect a workload and receive an actionable pending explanation
+- A project admin can see project weight and current community position
+- A cluster admin can inspect cohort borrowing and community reclaim behavior
+- The UI language matches the vocabulary defined in `ui-spec.md`
+
+## Phase 4: Scale, benchmark, and operational proof
+
+This phase proves backlog scale and control-plane behavior with repeatable test
+artifacts.
+
+### Deliverables
+
+- `kueuer` benchmark suites for raw Kueue and end-to-end `Skaha` pressure
+- Measured thresholds for backlog growth and control-plane degradation
+- Evidence pack for `10k`, `50k`, `100k`, and `200k` backlog gates
+- Clear stop or go criteria for larger backlog claims
+
+### Key activities
+
+- Extend benchmark coverage to user-path submission and visibility stress
+- Record admission timing, controller health, API server latency, and pending
+  backlog behavior
+- Measure user-facing interactive behavior during large batch backlog
+- Capture failure modes such as memory pressure, API slowness, and visibility
+  degradation
+
+### Dependencies
+
+- Phases 1 through 3 complete
+- Benchmark environments and monitoring available
+
+### Acceptance criteria
+
+- Benchmark suites run repeatably and produce comparable artifacts
+- The team can state a measured safe operating region for backlog size
+- The evidence distinguishes Kueue bottlenecks from API-server or etcd
+  bottlenecks
+- Interactive and protected workloads retain acceptable service behavior during
+  backlog tests
+
+## Phase 5: Future capability evaluation
+
+This phase evaluates advanced Kueue capabilities without making them phase 1
+production promises.
+
+### Deliverables
+
+- Evaluation of topology-aware scheduling for MPI, `JobSet`, Ray, and GPU work
+- Evaluation of advanced controller integrations such as `JobSet`, MPI, and Ray
+- Evaluation of elastic workload applicability for CANFAR batch patterns
+- Decision updates or new ADRs for supported future capabilities
+
+### Key activities
+
+- Test topology-aware scheduling against real node and fabric labels
+- Test distributed workload semantics with `waitForPodsReady` where needed
+- Evaluate whether elastic workloads help high-parallelism batch patterns
+- Evaluate whether protected persistent or interactive workloads need stronger
+  isolation than the initial single-plane model
+
+### Dependencies
+
+- Phase 4 evidence complete
+- Access to representative GPU, network, and multi-pod job environments
+
+### Acceptance criteria
+
+- Each advanced capability has a documented fit, risk, and recommendation
+- New workload classes are not promoted to production without measured evidence
+- Any capability that remains unsuitable is documented clearly rather than left
+  ambiguous
+
+## Phase 6: Optional MultiKueue federation
+
+This phase introduces manager and worker cluster federation only if the evidence
+supports the need.
+
+### Deliverables
+
+- MultiKueue proof of concept
+- Manager and worker flavor and queue mapping design
+- Operational model for manager and worker observability and failure handling
+- Migration criteria for deciding when to move from one cluster to many
+
+### Key activities
+
+- Define manager and worker queue vocabulary and synchronization rules
+- Decide how worker clusters map to community ownership and specialized hardware
+- Test manager visibility and worker execution state consistency
+- Define site-placement and failure-domain policy
+
+### Dependencies
+
+- Phases 1 through 5 complete
+- Evidence that single-cluster operation is insufficient or too risky
+
+### Acceptance criteria
+
+- The team has a documented reason to federate, not just a theoretical interest
+- A MultiKueue proof of concept validates the community and project model
+- Manager and worker failure handling is documented and testable
+
+## Cross-phase risks
+
+These risks apply across the whole roadmap:
+
+- Users may interpret fair-share behavior as arbitrary if visibility lags behind
+  policy rollout
+- The control service may become the gating dependency for later phases if its
+  requirements stay vague
+- Namespace evolution may be forced earlier if workload-class policies diverge
+  faster than expected
+- Advanced features such as topology-aware scheduling or elastic workloads may
+  reveal integration limits that require new ADRs
+
+## Exit criteria for the package
+
+The roadmap is complete when each phase can answer four questions:
+
+- What is being delivered?
+- What evidence proves it works?
+- What dependencies and risks apply?
+- What operator or user behavior changes when the phase lands?
diff --git a/configs/kueue/docs/ui-spec.md b/configs/kueue/docs/ui-spec.md
new file mode 100644
index 00000000..a2efe283
--- /dev/null
+++ b/configs/kueue/docs/ui-spec.md
@@ -0,0 +1,222 @@
+# CANFAR Kueue visibility and UI specification
+
+This document defines the user-facing and admin-facing product surface for the
+CANFAR Kueue architecture. It does not describe a finished implementation. It
+defines the information model, workflows, and explanation language that the UI
+must support.
+
+Use this document together with [architecture.md](./architecture.md), [operations.md](./operations.md),
+[roadmap.md](./roadmap.md), and [the ADR index](./adrs/README.md).
+
+## 1. Product goals
+
+The UI exists to make queue and ownership behavior understandable. It must not
+become a generic Kubernetes portal. It must explain why work is pending, what
+resources a community owns, and what position a project holds inside its
+community.
+
+The UI must support:
+
+- science users who need workload status and explanation
+- project admins who need project-level fairness and ownership visibility
+- cluster admins who need policy, override, and incident visibility
+
+## 2. Personas
+
+This section defines the primary personas and what each one needs.
+
+### 2.1 Science user
+
+The science user needs to:
+
+- submit work through `Skaha`
+- see current workload state
+- understand why work is pending
+- distinguish priority delays from quota or capacity delays
+
+The user does not need raw CRD editing or full Kubernetes visibility.
+
+### 2.2 Project administrator
+
+The project administrator needs to:
+
+- see which projects exist in a community
+- see which POSIX groups attach to a project
+- understand project fair-share position and weight
+- request temporary fair-share overrides through the control-service workflow
+
+The project administrator does not approve overrides directly.
+
+### 2.3 Cluster administrator
+
+The cluster administrator needs to:
+
+- inspect community ownership and borrow or reclaim behavior
+- see and adjust project fair-share weights
+- approve or reject temporary override requests
+- diagnose fairness, preemption, and visibility problems
+
+## 3. Primary views
+
+This section defines the minimum view set the product must expose.
+
+### 3.1 Queue explorer
+
+The queue explorer is the main entry point for queue visibility. It must show:
+
+- community
+- project
+- workload class
+- current queue position where available
+- effective fair-share state
+- workload priority
+- pending reason summary
+
+### 3.2 Resource ownership view
+
+The resource ownership view must show:
+
+- which resources each community owns
+- current usage against owned quota
+- borrowing or lending state
+- key `ResourceFlavor` breakdowns such as CPU, memory, GPU, cluster, and zone
+
+This view is important for both project admins and cluster admins.
+
+### 3.3 Project fairness view
+
+The project fairness view must show:
+
+- current fair-share weight
+- recent effective usage
+- relative position inside the community
+- whether a temporary override is active
+- who approved the override and when it expires
+
+### 3.4 Workload detail view
+
+The workload detail view must show:
+
+- workload class
+- priority class
+- selected queue
+- community and project identity
+- active, pending, admitted, running, finished, or preempted state
+- pending or preemption explanation using the standard reason language
+
+## 4. Explanation language
+
+The UI must use standard explanation categories so users see consistent,
+actionable reasons rather than raw controller text.
+
+### 4.1 Approved pending reasons
+
+Use the following pending reason categories:
+
+- Waiting behind other projects with better current fair-share position
+- Waiting behind higher-priority work in the same project
+- Waiting for community-owned or borrowed resources to become available
+- Waiting because the project or policy does not currently allow admission
+- Waiting because the platform control plane is degraded
+
+### 4.2 Approved rejection or policy reasons
+
+Use the following rejection categories:
+
+- Project or community could not be resolved
+- Submission is missing required queue or policy metadata
+- Workload requests do not satisfy platform requirements
+- External control-service policy denied the request
+
+### 4.3 Approved preemption reasons
+
+Use the following preemption categories:
+
+- Preempted because the owning community reclaimed its quota
+- Preempted by higher-priority work inside the same community
+- Evicted because platform recovery policy triggered
+- Evicted because queue stop or drain policy was applied
+
+## 5. Workflows
+
+This section defines the primary workflows the UI must support.
+
+### 5.1 User workload inspection
+
+The user inspects a workload and sees:
+
+1. Which community and project it belongs to
+2. Which queue it targets
+3. Which workload class and priority apply
+4. Why it is waiting or what preempted it
+5. What the next likely action is
+
+### 5.2 Project admin fairness inspection
+
+The project admin inspects a project and sees:
+
+1. Current fair-share weight
+2. Current community-relative position
+3. Current active and pending work counts
+4. Whether an override exists or has expired
+5. Which POSIX groups attach to the project
+
+### 5.3 Temporary override request
+
+The request flow is:
+
+1. Project admin selects a project
+2. Project admin proposes a temporary weight increase and business reason
+3. Control-service workflow records the request
+4. Cluster admin approves, rejects, or modifies the request
+5. UI displays approval state and expiry time
+
+The downstream accounting or quota-cost model for the override is out of scope
+for this document, but the UI must display that such a cost model exists.
+
+## 6. Phased product surface
+
+The UI strategy is phased to match the roadmap.
+
+### Phase 1
+
+Use `kubectl`, Grafana, and the Kueue visibility API. No custom UI is required
+yet beyond operator tooling.
+
+### Phase 2
+
+Introduce a read-only queue visibility UI with workload detail, project fairness
+view, and community ownership view.
+
+### Phase 3
+
+Introduce admin workflows such as temporary override requests and richer project
+and mapping inspection.
+
+### Phase 4
+
+Introduce guided submission hints, recommended queue or flavor explanations, and
+self-service debugging aids if the earlier phases prove stable.
+
+## 7. Data requirements
+
+The UI needs data from multiple sources:
+
+- Kueue visibility API for queue position and pending workloads
+- Kueue metrics for quota, admission latency, and queue health
+- Kubernetes workload state for current execution status
+- Control-service metadata for projects, groups, ownership, and overrides
+
+The UI must not depend on full raw workload listing for basic queue views when
+the backlog is large.
+
+## 8. Non-goals
+
+This UI specification does not require:
+
+- raw Kubernetes object editing
+- a full generic cluster dashboard
+- per-user accounting implementation
+- automated override approval logic
+
+The goal is clarity and policy transparency, not a complete platform portal.
diff --git a/configs/kueue/kueuer/AGENTS.md b/configs/kueue/kueuer/AGENTS.md
new file mode 100644
index 00000000..c41251ec
--- /dev/null
+++ b/configs/kueue/kueuer/AGENTS.md
@@ -0,0 +1,13 @@
+## Learned User Preferences
+- Prefer testing workflows that continue under restricted production RBAC by treating control-plane visibility checks as optional and collecting partial metrics.
+- Prefer practical workarounds for production constraints over cluster-level changes that require broader platform modifications.
+- Prefer inspecting node or cluster-wide inventory via the Kubernetes API (or other narrowly scoped API calls) instead of broad `kubectl get nodes` when that matches environment constraints.
+- Prefer `kr cluster resources` output that uses IEC binary byte units (B, KiB, …, PiB) and at most three decimal places for displayed CPU and byte quantities.
+
+## Learned Workspace Facts
+- Benchmarking and preflight work for this area is centered in `configs/kueue/kueuer`.
+- The production context used for tests has workload namespace access (for example, `canfar-kueue-testing`) but limited visibility into `kueue-system`, which restricts control-plane metric collection.
+- Default stress VM memory fraction for benchmark jobs is `0.33` (`DEFAULT_STRESS_VM_MEMORY_FRACTION`); override with `--vm-memory-fraction` when needed.
+- Benchmark job submission supports `--spawn-mechanism kubectl` (default, chunked `kubectl apply`) or `api` (Python client create-only with bounded concurrency and retries).
+- `kr cluster resources` groups totals by `--node-label` (CLI default `skaha.opencadc.org/node-type`); `kueuer.resources.total()` requires `node_label` explicitly and does not default it in library code.
+- Grouped results include per-bucket `count` (nodes) and per-product GPU lists; when `nvidia.com/gpu` capacity is zero or missing, counts may come from `nvidia.com/gpu.count` with kind from `nvidia.com/gpu.product` (e.g. MIG-style nodes).
diff --git a/configs/kueue/kueuer/src/kueuer/resources.py b/configs/kueue/kueuer/src/kueuer/resources.py
index b0ea73d5..7f5ff25c 100644
--- a/configs/kueue/kueuer/src/kueuer/resources.py
+++ b/configs/kueue/kueuer/src/kueuer/resources.py
@@ -8,17 +8,19 @@
 # ]
 # ///
 """
-Totals cluster resources across Kubernetes nodes filtered by name regex.
+Totals cluster resources across Kubernetes nodes.
 
-- Deduplicates nodes by UID (so overlapping regex lists don't double count).
+- Deduplicates nodes by UID when listing.
 - By default totals from node .status.capacity; use --field allocatable to sum
   .status.allocatable instead.
-- Results are grouped by a configurable node label (see CLI ``--node-label-key``;
-  ``total()`` requires ``node_label_key`` with no default in code). Nodes without
+- Results are grouped by a configurable node label (see CLI ``--node-label``;
+  ``total()`` requires ``node_label`` with no default in code). Nodes without
   the label are grouped under ``""``. Each group has ``count`` (nodes in group),
-  ``cpu``, ``memory``, ``ephemeral-storage`` (binary **GiB**, 1024³; values up to
-  3 decimal places), per-bucket **weights** (same 3 decimal places; pool CPU
-  cores per GiB / per GPU kind—see ``ResourceWeights``), and GPU lists.
+  ``cpu``, ``memory``, ``ephemeral-storage`` (binary units per ``--units``;
+  values up to 3 decimal places), per-bucket **weights** (IEEE-754 binary64,
+  shortest round-trip decimal strings; normalized to ``--baseline`` (``-b``),
+  computed in the same byte scale as ``--units``—see ``ResourceWeights``), and
+  GPU lists.
 - ``nvidia.com/gpu`` is a list of ``{ "kind", "value", "unit": "count" }`` per
   distinct ``nvidia.com/gpu.product`` label, summed across nodes. When
   capacity/allocatable reports 0 or omits ``nvidia.com/gpu`` but the NVIDIA
@@ -30,22 +32,21 @@
 
 Examples:
   uv run resources.py
-  uv run resources.py 'gpu-.*' 'node-1[0-9]'
-  uv run resources.py --field allocatable --pretty 'worker-.*'
+  uv run resources.py --field allocatable
+  uv run resources.py --units Mi --baseline cpu
 """
 
 from __future__ import annotations
 
-import re
 import sys
 from dataclasses import dataclass
 from decimal import ROUND_HALF_UP, Decimal, getcontext, localcontext
-from typing import Annotated, Any, Dict, Iterable, List, Optional, Sequence, cast
+from typing import Annotated, Any, Dict, Iterable, List, Optional, cast
 
 import typer
 from kubernetes.client import CoreV1Api, V1Node
 from kubernetes.utils.quantity import parse_quantity
-from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator
+from pydantic import BaseModel, ConfigDict, Field, ValidationError
 from rich.console import Console
 from typing_extensions import Literal
 
@@ -59,9 +60,18 @@
 # Reported fractional precision for CPU, GiB display quantities, and weight ratios.
 REPORT_MAX_DECIMAL_PLACES = 3
 
-# Intermediate precision for weight ratio division before rounding to ``REPORT_MAX_DECIMAL_PLACES``.
+# Intermediate ``Decimal`` precision for weight ratio division before the result is
+# converted to IEEE-754 binary64 (``float``) and stringified with ``repr``.
 _WEIGHT_RATIO_DIV_PREC = max(80, DECIMAL_PRECISION)
 
+# Baseline weight ``1`` as a binary64 round-trip string (matches computed weights).
+_WEIGHT_BASELINE_ONE_STR = repr(1.0)
+
+# ``total()`` / ``--baseline`` accepted resource names.
+_ALLOWED_BASELINES: frozenset[str] = frozenset(
+    ("cpu", "memory", "ephemeral-storage", "nvidia.com/gpu")
+)
+
 app = typer.Typer(help="Cluster utilities")
 
 # =========================
@@ -76,7 +86,9 @@ class ResourceItem(BaseModel):
     )
     unit: str = Field(
         ...,
-        description="Binary GiB for memory/ephemeral totals, 'cores', or 'count'.",
+        description=(
+            "Binary unit (B, Ki, Mi, …) for memory/ephemeral, 'cores', or 'count'."
+        ),
     )
 
 
@@ -92,27 +104,27 @@ class GpuResourceItem(BaseModel):
 
 
 class ResourceWeights(BaseModel):
-    """Pool-level ratios vs CPU cores (dimensionless); see module docstring for interpretation."""
+    """Pool composition weights vs ``--baseline``; see module docstring."""
 
     model_config = ConfigDict(populate_by_name=True)
 
-    cpu: str = Field(
-        default="1",
-        description="Baseline; other weights are pool CPU per unit of that resource.",
+    cpu: Optional[str] = Field(
+        None,
+        description="IEEE-754 binary64 string; baseline CPU weight is ``1.0``.",
     )
     memory: Optional[str] = Field(
         None,
-        description="Pool CPU cores divided by total memory in binary GiB.",
+        description="IEEE-754 binary64 string vs baseline in ``--units``.",
     )
     ephemeral_storage: Optional[str] = Field(
         None,
         serialization_alias="ephemeral-storage",
-        description="Pool CPU cores divided by total ephemeral storage in binary GiB.",
+        description="IEEE-754 binary64 string vs baseline in ``--units``.",
     )
     nvidia_gpu: Optional[Dict[str, str]] = Field(
         None,
         serialization_alias="nvidia.com/gpu",
-        description="Per GPU product: pool CPU cores divided by count of that kind.",
+        description="Per GPU product: IEEE-754 binary64 weight vs baseline.",
     )
 
 
@@ -124,7 +136,7 @@ class NodeTypeResources(BaseModel):
     count: int = Field(
         ...,
         ge=0,
-        description="Number of nodes in this group (unique nodes after pattern filter).",
+        description="Number of nodes in this group (unique nodes).",
     )
     cpu: Optional[ResourceItem] = None
     memory: Optional[ResourceItem] = None
@@ -143,17 +155,17 @@ class NodeTypeResources(BaseModel):
     weights: Optional[ResourceWeights] = Field(
         None,
         description=(
-            "CPU-normalized pool composition weights (decimal strings, same precision "
-            "as other reported quantities). "
-            "Omitted if the pool has no CPU total to divide by."
+            "Pool composition weights (IEEE-754 binary64, shortest round-trip "
+            "strings). Omitted when the baseline resource total is missing or "
+            "non-positive."
         ),
     )
 
 
 class ClusterResourcesResult(BaseModel):
-    """Cluster resources grouped by ``node_label_key`` label values."""
+    """Cluster resources grouped by ``node_label`` label values."""
 
-    node_label_key: str = Field(
+    node_label: str = Field(
         ...,
         description="Kubernetes node label key used to form each group.",
     )
@@ -161,20 +173,9 @@ class ClusterResourcesResult(BaseModel):
 
 
 class Settings(BaseModel):
-    patterns: Optional[List[str]] = Field(
-        default=None, description="Regex patterns for node names."
-    )
     field: Literal["capacity", "allocatable"] = "capacity"
     pretty: bool = False
 
-    @field_validator("patterns")
-    @classmethod
-    def validate_patterns(cls, v: Optional[List[str]]) -> Optional[List[str]]:
-        if v is None:
-            return None
-        cleaned = [p for p in (s.strip() for s in v) if p]
-        return cleaned or None
-
 
 # =========================
 # Internal Calculation Types
@@ -203,28 +204,13 @@ def _load_kube() -> CoreV1Api:
     return k8s.core_v1
 
 
-def _compile_patterns(patterns: Optional[Sequence[str]]) -> Optional[List[re.Pattern]]:
-    if not patterns:
-        return None
-    return [re.compile(p) for p in patterns]
-
-
-def _node_matches(name: str, compiled: Optional[List[re.Pattern]]) -> bool:
-    if compiled is None:
-        return True
-    return any(p.search(name) for p in compiled)
-
-
-def _collect_nodes(v1: CoreV1Api, patterns: Optional[Sequence[str]]) -> List[V1Node]:
-    compiled = _compile_patterns(patterns)
+def _collect_nodes(v1: CoreV1Api) -> List[V1Node]:
     all_nodes = v1.list_node().items
-    # Deduplicate by UID so overlapping regex patterns don't double count
     dedup: Dict[str, V1Node] = {}
     for n in all_nodes:
         name = n.metadata.name or ""
-        if _node_matches(name, compiled):
-            uid = n.metadata.uid or name  # Fallback to name if UID missing
-            dedup[uid] = n
+        uid = n.metadata.uid or name  # Fallback to name if UID missing
+        dedup[uid] = n
     return list(dedup.values())
 
 
@@ -277,7 +263,9 @@ def _node_nvidia_gpu_contrib(
     return None
 
 
-def _gpu_kind_totals_to_list(by_kind: Optional[Dict[str, int]]) -> Optional[List[GpuResourceItem]]:
+def _gpu_kind_totals_to_list(
+    by_kind: Optional[Dict[str, int]],
+) -> Optional[List[GpuResourceItem]]:
     """Convert per-kind counts to a stable list for JSON output."""
     if not by_kind:
         return None
@@ -290,7 +278,7 @@ def _gpu_kind_totals_to_list(by_kind: Optional[Dict[str, int]]) -> Optional[List
 
 
 def _format_decimal_report(value: Decimal) -> str:
-    """Stringify a non-negative Decimal with at most ``REPORT_MAX_DECIMAL_PLACES`` places."""
+    """Stringify a non-negative Decimal with at most ``REPORT_MAX_DECIMAL_PLACES``."""
     if value < 0:
         raise ValueError("value must be non-negative")
     q = Decimal("1").scaleb(-REPORT_MAX_DECIMAL_PLACES)
@@ -301,85 +289,221 @@ def _format_decimal_report(value: Decimal) -> str:
     return s
 
 
-# Binary gibibyte (Kubernetes-style): 1 GiB = 1024³ bytes.
-_GIB_BYTES = Decimal(1024**3)
+# Binary IEC factors: unit string -> bytes per one unit (B, Ki, Mi, Gi, Ti, Pi).
+_BINARY_UNIT_BYTES: Dict[str, int] = {
+    "B": 1,
+    "Ki": 1024,
+    "Mi": 1024**2,
+    "Gi": 1024**3,
+    "Ti": 1024**4,
+    "Pi": 1024**5,
+}
+
 
+def normalize_binary_unit(units: str) -> str:
+    """Return canonical binary unit key or raise ``ValueError``."""
+    u = units.strip()
+    if u not in _BINARY_UNIT_BYTES:
+        allowed = ", ".join(sorted(_BINARY_UNIT_BYTES))
+        raise ValueError(f'units must be one of: {allowed} (got "{units}")')
+    return u
 
-def _bytes_to_binary_gib_decimal(total_bytes: int) -> Decimal:
-    """Convert byte totals to binary GiB (full ``Decimal``, unrounded)."""
+
+def _bytes_to_qty_decimal(total_bytes: int, unit: str) -> Decimal:
+    """Convert byte totals to the selected binary unit (full ``Decimal``, unrounded)."""
     if total_bytes < 0:
         raise ValueError("byte total must be non-negative")
-    return Decimal(total_bytes) / _GIB_BYTES
+    factor = Decimal(_BINARY_UNIT_BYTES[unit])
+    return Decimal(total_bytes) / factor
 
 
-def _gib_resource_item(total_bytes: int) -> ResourceItem:
-    """Memory / ephemeral totals: always reported in GiB with limited display precision."""
+def _bytes_to_resource_item(total_bytes: int, unit: str) -> ResourceItem:
+    """Memory / ephemeral totals in ``unit`` with limited display precision."""
     if total_bytes == 0:
-        return ResourceItem(value="0", unit="GiB")
-    v = _bytes_to_binary_gib_decimal(total_bytes)
-    return ResourceItem(value=_format_decimal_report(v), unit="GiB")
+        return ResourceItem(value="0", unit=unit)
+    v = _bytes_to_qty_decimal(total_bytes, unit)
+    return ResourceItem(value=_format_decimal_report(v), unit=unit)
 
 
-def _gib_display_to_bytes(value: Decimal) -> Decimal:
-    """Interpret a displayed GiB quantity as bytes."""
-    return value * _GIB_BYTES
+def _display_qty_to_bytes(value: Decimal, unit: str) -> Decimal:
+    """Interpret a displayed quantity in ``unit`` as bytes."""
+    return value * Decimal(_BINARY_UNIT_BYTES[unit])
 
 
 def _decimal_ratio_string(numerator: Decimal, denominator: Decimal) -> str:
-    """``numerator / denominator`` rounded to ``REPORT_MAX_DECIMAL_PLACES`` (half-up)."""
+    """Weight ratio as IEEE-754 binary64 (``float``), shortest round-trip ``repr``."""
     if denominator <= 0:
         raise ValueError("denominator must be positive")
     with localcontext() as ctx:
         ctx.prec = _WEIGHT_RATIO_DIV_PREC
         ratio = numerator / denominator
-    return _format_decimal_report(ratio)
+    return repr(float(ratio))
 
 
-def _compute_resource_weights(acc: TotalsAcc) -> Optional[ResourceWeights]:
-    """
-    Weights normalize pool totals to a per-CPU baseline: ``cpu`` is 1; other
-    fields are ``TOTAL_CPU / TOTAL_QUANTITY`` in compatible units (GiB for
-    memory and ephemeral; per-GPU-kind counts for NVIDIA).
-
-    **Interpretation (heuristic):** For a node pool with totals ``(C, M, E, …)``,
-    weights map ``(c, m, e, …)`` requests to a linear ``c·1 + m·w_mem + …`` style
-    score *if* you treat the pool's aggregate ratio as a fixed substitution rate
-    between CPU and other resources. That is a **comparative** normalization, not a
-    guarantee of schedulability, pricing, or optimal packing—heterogeneous nodes,
-    fragmentation, and priorities are not captured.
-    """
-    cpu = acc.cpu_cores
-    if cpu is None or cpu <= 0:
+def _nvidia_weights_from_numerator(
+    by_kind: Optional[Dict[str, int]],
+    numer: Decimal,
+) -> Optional[Dict[str, str]]:
+    if not by_kind:
         return None
+    entries: Dict[str, str] = {}
+    for kind, cnt in sorted(by_kind.items(), key=lambda kv: (kv[0] == "", kv[0])):
+        if cnt <= 0:
+            continue
+        entries[kind] = _decimal_ratio_string(numer, Decimal(cnt))
+    return entries or None
 
-    mem_w: Optional[str] = None
-    if acc.memory_bytes is not None and acc.memory_bytes > 0:
-        mem_w = _decimal_ratio_string(cpu, _bytes_to_binary_gib_decimal(acc.memory_bytes))
-
-    eph_w: Optional[str] = None
-    if acc.ephemeral_bytes is not None and acc.ephemeral_bytes > 0:
-        eph_w = _decimal_ratio_string(cpu, _bytes_to_binary_gib_decimal(acc.ephemeral_bytes))
-
-    nv_map: Optional[Dict[str, str]] = None
-    if acc.nvidia_by_kind:
-        entries: Dict[str, str] = {}
-        for kind, cnt in sorted(
-            acc.nvidia_by_kind.items(),
-            key=lambda kv: (kv[0] == "", kv[0]),
-        ):
-            if cnt <= 0:
-                continue
-            entries[kind] = _decimal_ratio_string(cpu, Decimal(cnt))
-        nv_map = entries or None
 
+def _nvidia_pool_total(by_kind: Optional[Dict[str, int]]) -> Optional[Decimal]:
+    """Sum of NVIDIA GPU counts across kinds (pool-wide GPU total)."""
+    if not by_kind:
+        return None
+    t = sum(c for c in by_kind.values() if c > 0)
+    return Decimal(t) if t > 0 else None
+
+
+def _weight_quantities(
+    acc: TotalsAcc,
+    units: str,
+) -> tuple[Optional[Decimal], Optional[Decimal], Optional[Decimal]]:
+    mem_b = acc.memory_bytes
+    eph_b = acc.ephemeral_bytes
+    qty_mem = (
+        _bytes_to_qty_decimal(mem_b, units) if mem_b is not None and mem_b > 0 else None
+    )
+    qty_eph = (
+        _bytes_to_qty_decimal(eph_b, units) if eph_b is not None and eph_b > 0 else None
+    )
+    return (acc.cpu_cores, qty_mem, qty_eph)
+
+
+def _weights_baseline_cpu(
+    acc: TotalsAcc,
+    qty_cpu: Decimal,
+    qty_mem: Optional[Decimal],
+    qty_eph: Optional[Decimal],
+) -> Optional[ResourceWeights]:
+    w_mem = _decimal_ratio_string(qty_cpu, qty_mem) if qty_mem is not None else None
+    w_eph = _decimal_ratio_string(qty_cpu, qty_eph) if qty_eph is not None else None
+    w_nv = _nvidia_weights_from_numerator(acc.nvidia_by_kind, qty_cpu)
+    return ResourceWeights(
+        cpu=_WEIGHT_BASELINE_ONE_STR,
+        memory=w_mem,
+        ephemeral_storage=w_eph,
+        nvidia_gpu=w_nv,
+    )
+
+
+def _weights_baseline_memory(
+    acc: TotalsAcc,
+    qty_cpu: Optional[Decimal],
+    qty_mem: Decimal,
+    qty_eph: Optional[Decimal],
+) -> Optional[ResourceWeights]:
+    w_cpu = (
+        _decimal_ratio_string(qty_mem, qty_cpu)
+        if qty_cpu is not None and qty_cpu > 0
+        else None
+    )
+    w_eph = _decimal_ratio_string(qty_mem, qty_eph) if qty_eph is not None else None
+    w_nv = _nvidia_weights_from_numerator(acc.nvidia_by_kind, qty_mem)
     return ResourceWeights(
-        cpu="1",
-        memory=mem_w,
-        ephemeral_storage=eph_w,
-        nvidia_gpu=nv_map,
+        cpu=w_cpu,
+        memory=_WEIGHT_BASELINE_ONE_STR,
+        ephemeral_storage=w_eph,
+        nvidia_gpu=w_nv,
     )
 
 
+def _weights_baseline_ephemeral(
+    acc: TotalsAcc,
+    qty_cpu: Optional[Decimal],
+    qty_mem: Optional[Decimal],
+    qty_eph: Decimal,
+) -> Optional[ResourceWeights]:
+    w_cpu = (
+        _decimal_ratio_string(qty_eph, qty_cpu)
+        if qty_cpu is not None and qty_cpu > 0
+        else None
+    )
+    w_mem = _decimal_ratio_string(qty_eph, qty_mem) if qty_mem is not None else None
+    w_nv = _nvidia_weights_from_numerator(acc.nvidia_by_kind, qty_eph)
+    return ResourceWeights(
+        cpu=w_cpu,
+        memory=w_mem,
+        ephemeral_storage=_WEIGHT_BASELINE_ONE_STR,
+        nvidia_gpu=w_nv,
+    )
+
+
+def _weights_baseline_nvidia(
+    acc: TotalsAcc,
+    qty_nv_total: Decimal,
+    qty_cpu: Optional[Decimal],
+    qty_mem: Optional[Decimal],
+    qty_eph: Optional[Decimal],
+) -> ResourceWeights:
+    """Baseline is total NVIDIA GPU count; per-kind GPU weights use ``qty_nv_total``."""
+    w_cpu = (
+        _decimal_ratio_string(qty_nv_total, qty_cpu)
+        if qty_cpu is not None and qty_cpu > 0
+        else None
+    )
+    w_mem = (
+        _decimal_ratio_string(qty_nv_total, qty_mem) if qty_mem is not None else None
+    )
+    w_eph = (
+        _decimal_ratio_string(qty_nv_total, qty_eph) if qty_eph is not None else None
+    )
+    w_nv = _nvidia_weights_from_numerator(acc.nvidia_by_kind, qty_nv_total)
+    return ResourceWeights(
+        cpu=w_cpu,
+        memory=w_mem,
+        ephemeral_storage=w_eph,
+        nvidia_gpu=w_nv,
+    )
+
+
+def _compute_resource_weights(
+    acc: TotalsAcc,
+    *,
+    baseline: str,
+    units: str,
+) -> Optional[ResourceWeights]:
+    """
+    Weights normalize pool totals to ``baseline``: that resource is ``1``; other
+    fields are ratios in the same byte scale as ``units`` (for memory and
+    ephemeral) or in GPU counts (for NVIDIA). For ``nvidia.com/gpu``, the
+    baseline quantity is the **sum** of all NVIDIA GPU counts in the group.
+
+    **Interpretation (heuristic):** comparative normalization, not schedulability.
+    """
+    qty_cpu, qty_mem, qty_eph = _weight_quantities(acc, units)
+
+    if baseline == "cpu":
+        if qty_cpu is None or qty_cpu <= 0:
+            return None
+        return _weights_baseline_cpu(acc, qty_cpu, qty_mem, qty_eph)
+
+    if baseline == "memory":
+        if qty_mem is None or qty_mem <= 0:
+            return None
+        return _weights_baseline_memory(acc, qty_cpu, qty_mem, qty_eph)
+
+    if baseline == "ephemeral-storage":
+        if qty_eph is None or qty_eph <= 0:
+            return None
+        return _weights_baseline_ephemeral(acc, qty_cpu, qty_mem, qty_eph)
+
+    if baseline == "nvidia.com/gpu":
+        qty_nv = _nvidia_pool_total(acc.nvidia_by_kind)
+        if qty_nv is None or qty_nv <= 0:
+            return None
+        return _weights_baseline_nvidia(acc, qty_nv, qty_cpu, qty_mem, qty_eph)
+
+    raise ValueError(f'unknown baseline: "{baseline}"')
+
+
 def _get_field_map(node: V1Node, field: str) -> Dict[str, str]:
     """
     Extract either .status.capacity or .status.allocatable as a plain dict[str, str].
@@ -486,7 +610,13 @@ def _try_sum(dec_sum_fn, vals):
     )
 
 
-def _totals_acc_to_node_type_resources(acc: TotalsAcc, node_count: int) -> NodeTypeResources:
+def _totals_acc_to_node_type_resources(
+    acc: TotalsAcc,
+    node_count: int,
+    *,
+    units: str,
+    baseline: str,
+) -> NodeTypeResources:
     """Build one NodeTypeResources from aggregated totals."""
     return NodeTypeResources(
         count=node_count,
@@ -499,18 +629,22 @@ def _totals_acc_to_node_type_resources(acc: TotalsAcc, node_count: int) -> NodeT
             else None
         ),
         memory=(
-            _gib_resource_item(acc.memory_bytes)
+            _bytes_to_resource_item(acc.memory_bytes, units)
             if acc.memory_bytes is not None
             else None
         ),
         ephemeral_storage=(
-            _gib_resource_item(acc.ephemeral_bytes)
+            _bytes_to_resource_item(acc.ephemeral_bytes, units)
             if acc.ephemeral_bytes is not None
             else None
         ),
         nvidia_gpu=_gpu_kind_totals_to_list(acc.nvidia_by_kind),
         amd_gpu=_gpu_kind_totals_to_list(acc.amd_by_kind),
-        weights=_compute_resource_weights(acc),
+        weights=_compute_resource_weights(
+            acc,
+            baseline=baseline,
+            units=units,
+        ),
     )
 
 
@@ -520,41 +654,50 @@ def _totals_acc_to_node_type_resources(acc: TotalsAcc, node_count: int) -> NodeT
 
 
 def total(
-    patterns: Optional[List[str]] = None,
     field: str = "capacity",
     *,
-    node_label_key: str,
+    node_label: str,
+    units: str = "Gi",
+    baseline: str = "cpu",
 ) -> Dict[str, Any]:
     """
-    Calculate total cluster resources across nodes matching regex patterns.
+    Calculate total cluster resources across all nodes.
 
     Args:
-        patterns: Regex strings for node names. If None or empty, includes all nodes.
         field: Which field to sum: "capacity" (default) or "allocatable".
-        node_label_key: Kubernetes node label key used to group results (callers
+        node_label: Kubernetes node label key used to group results (callers
             such as the CLI supply the default; this function does not default it).
+        units: Binary byte unit for memory and ephemeral totals (``B``, ``Ki``,
+            ``Mi``, ``Gi``, ``Ti``, ``Pi``).
+        baseline: Resource with weight ``1``; others are expressed per this
+            baseline in ``units`` for memory/ephemeral. One of ``cpu``,
+            ``memory``, ``ephemeral-storage``, or ``nvidia.com/gpu`` (total
+            NVIDIA GPU count).
 
     Returns:
-        A dict with ``node_label_key``, ``by_label_value`` (each key is a label
+        A dict with ``node_label``, ``by_label_value`` (each key is a label
         value, or ``\"\"`` if unset), and per-group ``count`` plus resource maps.
     """
-    label_key = node_label_key.strip()
+    label_key = node_label.strip()
     if not label_key:
-        raise ValueError("node_label_key must be a non-empty string")
+        raise ValueError("node_label must be a non-empty string")
+
+    unit_key = normalize_binary_unit(units)
+    br = baseline.strip()
+    if br not in _ALLOWED_BASELINES:
+        allowed = ", ".join(sorted(_ALLOWED_BASELINES))
+        raise ValueError(f'baseline must be one of: {allowed} (got "{baseline}")')
 
     # Validate inputs with Pydantic
     if field not in ("capacity", "allocatable"):
         raise ValueError('field must be "capacity" or "allocatable"')
     try:
-        cfg = Settings(
-            patterns=patterns,
-            field=cast(Literal["capacity", "allocatable"], field),
-        )
+        cfg = Settings(field=cast(Literal["capacity", "allocatable"], field))
     except ValidationError as e:
         raise ValueError(str(e)) from e
 
     v1 = _load_kube()
-    nodes = _collect_nodes(v1, cfg.patterns)
+    nodes = _collect_nodes(v1)
     by_nt: Dict[str, List[V1Node]] = {}
     for n in nodes:
         labels = (n.metadata.labels or {}) if n.metadata else {}
@@ -566,10 +709,15 @@ def total(
     for nt_key in sorted(by_nt.keys(), key=lambda s: (s == "", s)):
         bucket = by_nt[nt_key]
         acc = _sum_resources(bucket, cfg.field)
-        groups[nt_key] = _totals_acc_to_node_type_resources(acc, len(bucket))
+        groups[nt_key] = _totals_acc_to_node_type_resources(
+            acc,
+            len(bucket),
+            units=unit_key,
+            baseline=br,
+        )
 
     return ClusterResourcesResult(
-        node_label_key=label_key,
+        node_label=label_key,
         by_label_value=groups,
     ).model_dump(
         by_alias=True,
@@ -583,10 +731,10 @@ def _scale_resource_item_inplace(item: Dict[str, Any], scale: Decimal) -> None:
     v = Decimal(str(item["value"]))
     if unit == "cores":
         item["value"] = _format_decimal_report(v * scale)
-    elif unit == "GiB":
-        scaled_bytes = _gib_display_to_bytes(v) * scale
+    elif unit in _BINARY_UNIT_BYTES:
+        scaled_bytes = _display_qty_to_bytes(v, unit) * scale
         int_bytes = max(0, int(scaled_bytes.to_integral_value(rounding=ROUND_HALF_UP)))
-        out = _gib_resource_item(int_bytes)
+        out = _bytes_to_resource_item(int_bytes, unit)
         item["value"] = out.value
         item["unit"] = out.unit
     elif unit == "count":
@@ -596,7 +744,10 @@ def _scale_resource_item_inplace(item: Dict[str, Any], scale: Decimal) -> None:
 
 
 def _scale_cluster_resources_payload(result: Dict[str, Any], scale: Decimal) -> None:
-    """Multiply numeric ``value`` fields in-place (CLI ``--scale``). Leaves ``weights`` unchanged."""
+    """Multiply numeric ``value`` fields in-place (CLI ``--scale``).
+
+    Leaves ``weights`` unchanged.
+    """
     inner = result.get("by_label_value")
     if not isinstance(inner, dict):
         return
@@ -635,15 +786,6 @@ def list_resource_quotas(namespace: str) -> Dict[str, Any]:
 
 @app.command("resources")
 def resources(
-    patterns: Annotated[
-        Optional[List[str]],
-        typer.Option(
-            "-p",
-            "--pattern",
-            metavar="PATTERN",
-            help="Regex pattern for node names. Can be specified multiple times.",
-        ),
-    ] = None,
     field: Annotated[
         str,
         typer.Option(
@@ -660,27 +802,51 @@ def resources(
             help="Scale resources by this percentage.",
         ),
     ] = 1.0,
-    node_label_key: Annotated[
+    units: Annotated[
         str,
         typer.Option(
-            "--node-label-key",
+            "-u",
+            "--units",
+            help=(
+                "Binary byte unit for memory and ephemeral totals: "
+                '"B", "Ki", "Mi", "Gi", "Ti", "Pi".'
+            ),
+        ),
+    ] = "Gi",
+    baseline: Annotated[
+        str,
+        typer.Option(
+            "-b",
+            "--baseline",
+            help=(
+                "Resource with weight 1 for pool weights: "
+                '"cpu", "memory", "ephemeral-storage", or "nvidia.com/gpu".'
+            ),
+        ),
+    ] = "cpu",
+    node_label: Annotated[
+        str,
+        typer.Option(
+            "-n",
+            "--node-label",
             help=(
                 "Node label key used to group totals by label value "
-                '(default only applies to this CLI, not to total()).'
+                "(default only applies to this CLI, not to total())."
             ),
         ),
     ] = "skaha.opencadc.org/node-type",
 ):
     """
-    Sum resources across nodes matching any of the provided regex patterns.
+    Sum resources across all nodes, grouped by a node label.
     """
     assert field in ["capacity", "allocatable"]
     assert scale > 0.0 and scale <= 1.0, "Percentage must be in (0, 1]"
     try:
         result = total(
-            patterns or None,
             field=field,
-            node_label_key=node_label_key,
+            node_label=node_label,
+            units=units,
+            baseline=baseline,
         )
         console.print(result, width=120)
         if scale != 1.0:
@@ -706,7 +872,9 @@ def resourcequota(
     """List namespace ResourceQuota objects using the Kubernetes Python client."""
     try:
         response = list_resource_quotas(namespace)
-        console.print({"response": response, "resource_quotas": response.get("items", [])})
+        console.print(
+            {"response": response, "resource_quotas": response.get("items", [])}
+        )
     except Exception as e:
         print(f"Error: {e}", file=sys.stderr)
         raise SystemExit(1)
diff --git a/configs/kueue/kueuer/tests/test_resources.py b/configs/kueue/kueuer/tests/test_resources.py
index c40f2e00..cf64b1a9 100644
--- a/configs/kueue/kueuer/tests/test_resources.py
+++ b/configs/kueue/kueuer/tests/test_resources.py
@@ -5,21 +5,22 @@
 from decimal import Decimal
 
 import pytest
-from typer.testing import CliRunner
 from kubernetes.client import V1Node, V1NodeStatus, V1ObjectMeta
+from typer.testing import CliRunner
 
 from kueuer.cli import app
 from kueuer.resources import (
-    _bytes_to_binary_gib_decimal,
+    _bytes_to_qty_decimal,
     _format_decimal_report,
     list_resource_quotas,
+    normalize_binary_unit,
     total,
 )
 
 runner = CliRunner()
 
-# Library API has no default for node_label_key; tests use the same key as the CLI default.
-NODE_LABEL_KEY = "skaha.opencadc.org/node-type"
+# Library API has no default for node_label; tests use the same key as the CLI default.
+NODE_LABEL = "skaha.opencadc.org/node-type"
 
 
 def _node(
@@ -34,11 +35,16 @@ def _node(
     )
 
 
-def test_binary_gib_conversion() -> None:
+def test_binary_unit_conversion() -> None:
     gi = 1024**3
-    assert _bytes_to_binary_gib_decimal(20550 * gi) == Decimal("20550")
-    assert _bytes_to_binary_gib_decimal(gi // 2) == Decimal("0.5")
-    assert _bytes_to_binary_gib_decimal(gi) == Decimal("1")
+    assert _bytes_to_qty_decimal(20550 * gi, "Gi") == Decimal("20550")
+    assert _bytes_to_qty_decimal(gi // 2, "Gi") == Decimal("0.5")
+    assert _bytes_to_qty_decimal(gi, "Gi") == Decimal("1")
+
+
+def test_normalize_binary_unit_rejects_unknown() -> None:
+    with pytest.raises(ValueError, match="units must be one of"):
+        normalize_binary_unit("GB")
 
 
 def test_format_decimal_report_three_places() -> None:
@@ -66,14 +72,14 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
-    assert out["node_label_key"] == NODE_LABEL_KEY
+    out = total(field="capacity", node_label=NODE_LABEL)
+    assert out["node_label"] == NODE_LABEL
     bucket = out["by_label_value"][""]
     assert bucket["count"] == 1
-    assert bucket["memory"] == {"value": "16", "unit": "GiB"}
-    assert bucket["ephemeral-storage"] == {"value": "100", "unit": "GiB"}
+    assert bucket["memory"] == {"value": "16", "unit": "Gi"}
+    assert bucket["ephemeral-storage"] == {"value": "100", "unit": "Gi"}
     w = bucket["weights"]
-    assert w["cpu"] == "1"
+    assert w["cpu"] == "1.0"
     assert w["memory"] == "0.5"
     assert w["ephemeral-storage"] == "0.08"
     assert "nvidia.com/gpu" not in w
@@ -104,13 +110,13 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    out = total(field="capacity", node_label=NODE_LABEL)
     b = out["by_label_value"][""]
     assert b["count"] == 2
     assert b["nvidia.com/gpu"] == [
         {"kind": "NVIDIA-A100-SXM4-40GB", "value": "12", "unit": "count"},
     ]
-    assert b["weights"]["nvidia.com/gpu"]["NVIDIA-A100-SXM4-40GB"] == "1"
+    assert b["weights"]["nvidia.com/gpu"]["NVIDIA-A100-SXM4-40GB"] == "1.0"
 
 
 def test_total_nvidia_gpu_mixed_kind(monkeypatch) -> None:
@@ -138,7 +144,7 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    out = total(field="capacity", node_label=NODE_LABEL)
     b = out["by_label_value"][""]
     assert b["count"] == 2
     assert b["nvidia.com/gpu"] == [
@@ -146,8 +152,8 @@ class R:
         {"kind": "NVIDIA-T4", "value": "2", "unit": "count"},
     ]
     wg = b["weights"]["nvidia.com/gpu"]
-    assert wg["NVIDIA-A100"] == "2"
-    assert wg["NVIDIA-T4"] == "4"
+    assert wg["NVIDIA-A100"] == "2.0"
+    assert wg["NVIDIA-T4"] == "4.0"
 
 
 def test_total_amd_gpu(monkeypatch) -> None:
@@ -170,7 +176,7 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    out = total(field="capacity", node_label=NODE_LABEL)
     b = out["by_label_value"][""]
     assert b["count"] == 1
     assert b["amd.com/gpu"] == [
@@ -192,16 +198,16 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    out = total(field="capacity", node_label=NODE_LABEL)
     b = out["by_label_value"][""]
     assert b["count"] == 1
     assert b["nvidia.com/gpu"] == [
         {"kind": "", "value": "3", "unit": "count"},
     ]
-    assert b["weights"]["nvidia.com/gpu"][""] == "1"
+    assert b["weights"]["nvidia.com/gpu"][""] == "1.0"
 
 
-def test_total_groups_by_custom_node_label_key(monkeypatch) -> None:
+def test_total_groups_by_custom_node_label(monkeypatch) -> None:
     nodes = [
         _node("a", {"cpu": "2"}, {"pool": "east"}),
         _node("b", {"cpu": "2"}, {"pool": "east"}),
@@ -219,15 +225,15 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity", node_label_key="pool")
-    assert out["node_label_key"] == "pool"
+    out = total(field="capacity", node_label="pool")
+    assert out["node_label"] == "pool"
     assert out["by_label_value"]["east"]["count"] == 2
     assert out["by_label_value"]["west"]["count"] == 1
 
 
-def test_total_rejects_blank_node_label_key() -> None:
+def test_total_rejects_blank_node_label() -> None:
     with pytest.raises(ValueError, match="non-empty"):
-        total(None, node_label_key="   ")
+        total(node_label="   ")
 
 
 def test_total_split_by_skaha_node_type(monkeypatch) -> None:
@@ -258,7 +264,7 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    out = total(field="capacity", node_label=NODE_LABEL)
     by_t = out["by_label_value"]
     assert by_t["cpu-node"]["count"] == 1
     assert by_t["gpu-node"]["count"] == 1
@@ -267,11 +273,11 @@ class R:
         {"kind": "NVIDIA-T4", "value": "2", "unit": "count"},
     ]
     assert by_t["cpu-node"]["weights"]["memory"] == "0.5"
-    assert by_t["gpu-node"]["weights"]["nvidia.com/gpu"]["NVIDIA-T4"] == "4"
+    assert by_t["gpu-node"]["weights"]["nvidia.com/gpu"]["NVIDIA-T4"] == "4.0"
 
 
 def test_total_nvidia_gpu_from_labels_when_capacity_zero(monkeypatch) -> None:
-    """MIG-style nodes may advertise GPUs via labels while capacity nvidia.com/gpu is 0."""
+    """MIG nodes may expose GPUs via labels when capacity nvidia.com/gpu is 0."""
     nodes = [
         _node(
             "g1",
@@ -300,7 +306,7 @@ class R:
         lambda: type("X", (), {"list_node": fake_list_node})(),
     )
 
-    out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY)
+    out = total(field="capacity", node_label=NODE_LABEL)
     g = out["by_label_value"]["gpu-node"]
     assert g["count"] == 1
     assert g["nvidia.com/gpu"] == [
@@ -309,7 +315,125 @@ class R:
     gw = g["weights"]
     assert gw["memory"] == "0.096"
     assert gw["ephemeral-storage"] == "0.192"
-    assert gw["nvidia.com/gpu"]["NVIDIA-H100-NVL-MIG-2g.24gb"] == "8"
+    assert gw["nvidia.com/gpu"]["NVIDIA-H100-NVL-MIG-2g.24gb"] == "8.0"
+
+
+def test_total_memory_reported_in_mi_and_weights_use_same_scale(monkeypatch) -> None:
+    nodes = [
+        _node("n1", {"cpu": "8", "memory": "8Mi"}),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(field="capacity", node_label=NODE_LABEL, units="Mi")
+    bucket = out["by_label_value"][""]
+    assert bucket["memory"] == {"value": "8", "unit": "Mi"}
+    assert bucket["weights"]["cpu"] == "1.0"
+    assert bucket["weights"]["memory"] == "1.0"
+
+
+def test_total_weights_baseline_memory(monkeypatch) -> None:
+    nodes = [
+        _node("n1", {"cpu": "8", "memory": "16Gi", "ephemeral-storage": "32Gi"}),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(
+        field="capacity",
+        node_label=NODE_LABEL,
+        baseline="memory",
+    )
+    w = out["by_label_value"][""]["weights"]
+    assert w["memory"] == "1.0"
+    assert w["cpu"] == "2.0"
+    assert w["ephemeral-storage"] == "0.5"
+
+
+def test_total_weights_baseline_nvidia_single_kind(monkeypatch) -> None:
+    nodes = [
+        _node(
+            "g1",
+            {"cpu": "4", "nvidia.com/gpu": "4"},
+            {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"},
+        ),
+        _node(
+            "g2",
+            {"cpu": "8", "nvidia.com/gpu": "8"},
+            {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"},
+        ),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(field="capacity", node_label=NODE_LABEL, baseline="nvidia.com/gpu")
+    w = out["by_label_value"][""]["weights"]
+    assert w["cpu"] == "1.0"
+    assert w["nvidia.com/gpu"]["NVIDIA-A100-SXM4-40GB"] == "1.0"
+
+
+def test_total_weights_baseline_nvidia_mixed_kind(monkeypatch) -> None:
+    nodes = [
+        _node(
+            "a",
+            {"cpu": "4", "nvidia.com/gpu": "2"},
+            {"nvidia.com/gpu.product": "NVIDIA-T4"},
+        ),
+        _node(
+            "b",
+            {"cpu": "4", "nvidia.com/gpu": "4"},
+            {"nvidia.com/gpu.product": "NVIDIA-A100"},
+        ),
+    ]
+
+    def fake_list_node(*_a, **_k):
+        class R:
+            items = nodes
+
+        return R()
+
+    monkeypatch.setattr(
+        "kueuer.resources._load_kube",
+        lambda: type("X", (), {"list_node": fake_list_node})(),
+    )
+
+    out = total(field="capacity", node_label=NODE_LABEL, baseline="nvidia.com/gpu")
+    w = out["by_label_value"][""]["weights"]
+    assert w["cpu"] == "0.75"
+    assert w["nvidia.com/gpu"]["NVIDIA-A100"] == "1.5"
+    assert w["nvidia.com/gpu"]["NVIDIA-T4"] == "3.0"
+
+
+def test_total_rejects_unknown_baseline() -> None:
+    with pytest.raises(ValueError, match="baseline must be one of"):
+        total(node_label=NODE_LABEL, baseline="amd.com/gpu")
 
 
 def test_list_resource_quotas_returns_serialized_payload(monkeypatch) -> None:
@@ -318,7 +442,10 @@ def test_list_resource_quotas_returns_serialized_payload(monkeypatch) -> None:
         "kind": "ResourceQuotaList",
         "items": [
             {
-                "metadata": {"name": "compute-quota", "namespace": "canfar-kueue-testing"},
+                "metadata": {
+                    "name": "compute-quota",
+                    "namespace": "canfar-kueue-testing",
+                },
                 "spec": {"hard": {"requests.cpu": "8", "requests.memory": "32Gi"}},
             }
         ],
@@ -346,10 +473,17 @@ def sanitize_for_serialization(self, value):
     assert result == payload
 
 
-def test_resources_cli_includes_node_label_key_option() -> None:
+def test_resources_cli_includes_resource_options() -> None:
     result = runner.invoke(app, ["cluster", "resources", "--help"])
     assert result.exit_code == 0
-    assert "--node-label-key" in result.stdout
+    assert "--node-label" in result.stdout
+    assert "-n" in result.stdout.replace("--node-label", "")  # not a substring hit
+    assert "--units" in result.stdout
+    assert "-u" in result.stdout.replace("--units", "")  # not a substring hit
+    assert "--baseline" in result.stdout
+    assert "-b" in result.stdout.replace("--baseline", "")  # not a substring hit
+    assert "--baseline-resource" not in result.stdout
+    assert "--pattern" not in result.stdout
 
 
 def test_resourcequota_cli_prints_response_and_objects(monkeypatch) -> None:
@@ -358,12 +492,18 @@ def test_resourcequota_cli_prints_response_and_objects(monkeypatch) -> None:
         "kind": "ResourceQuotaList",
         "items": [
             {
-                "metadata": {"name": "compute-quota", "namespace": "canfar-kueue-testing"},
+                "metadata": {
+                    "name": "compute-quota",
+                    "namespace": "canfar-kueue-testing",
+                },
                 "status": {"hard": {"requests.cpu": "8"}},
             }
         ],
     }
-    monkeypatch.setattr("kueuer.resources.list_resource_quotas", lambda namespace: payload)
+    monkeypatch.setattr(
+        "kueuer.resources.list_resource_quotas",
+        lambda namespace: payload,
+    )
 
     result = runner.invoke(
         app,