From f0406908627706f765786fd3b4e26a8c3b96ace7 Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Thu, 26 Mar 2026 08:20:16 -0700 Subject: [PATCH 01/11] fix(track): update logging format for failed job states to use f-strings for consistency --- configs/kueue/kueuer/src/kueuer/benchmarks/track.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/track.py b/configs/kueue/kueuer/src/kueuer/benchmarks/track.py index 56981e39..0fd8348a 100644 --- a/configs/kueue/kueuer/src/kueuer/benchmarks/track.py +++ b/configs/kueue/kueuer/src/kueuer/benchmarks/track.py @@ -290,7 +290,7 @@ def jobs( # noqa: C901 elif item.metadata.name in pending and job_reached_state(item, "Failed"): failed_count += 1 pending[item.metadata.name] = False - logfire.warning("%s reached terminal state Failed.", item.metadata.name) + logfire.warning(f"{item.metadata.name} reached terminal state Failed.") logfire.info(f"{len(pending)} jobs need to be tracked.") logfire.info(f"Starting to track jobs to state {to_state}...") @@ -323,7 +323,7 @@ def jobs( # noqa: C901 elif job_reached_state(item, "Failed"): failed_count += 1 pending[name] = False - logfire.warning("%s reached terminal state Failed.", name) + logfire.warning(f"{name} reached terminal state Failed.") logfire.debug(f"Pending Jobs Left: {sum(pending.values())}") @@ -338,8 +338,6 @@ def jobs( # noqa: C901 break if failed_count: logfire.warning( - "%s jobs reached Failed state while tracking '%s'.", - failed_count, - prefix, + f"{failed_count} jobs reached Failed state while tracking '{prefix}'." 
) return done From 78ecd990692dd317902f55d4d5cf82742a6db0e5 Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Thu, 26 Mar 2026 13:51:21 -0700 Subject: [PATCH 02/11] feat(benchmark): add vm_memory_fraction parameter to experiment and benchmark functions for improved resource management --- .../kueuer/src/kueuer/benchmarks/benchmark.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py b/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py index 8a671032..55b75349 100644 --- a/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py +++ b/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py @@ -261,6 +261,7 @@ def experiment( apply_chunk_size: int = 25, apply_retries: int = 2, apply_backoff: float = 2.0, + vm_memory_fraction: float = k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION, ) -> Dict[str, Any]: """Run a single experiment with the specified configuration. @@ -314,6 +315,7 @@ def experiment( apply_chunk_size=apply_chunk_size, apply_retries=apply_retries, apply_backoff=apply_backoff, + vm_memory_fraction=vm_memory_fraction, ) # Track jobs to completion and get timing statistics @@ -395,6 +397,7 @@ def benchmark( apply_chunk_size: int, apply_retries: int, apply_backoff: float, + vm_memory_fraction: float, ) -> List[Dict[str, Any]]: """ Run a complete benchmark comparing direct Kubernetes jobs vs Kueue jobs. 
@@ -434,6 +437,7 @@ def benchmark( apply_chunk_size=apply_chunk_size, apply_retries=apply_retries, apply_backoff=apply_backoff, + vm_memory_fraction=vm_memory_fraction, ) results.append(result) @@ -459,6 +463,7 @@ def benchmark( apply_chunk_size=apply_chunk_size, apply_retries=apply_retries, apply_backoff=apply_backoff, + vm_memory_fraction=vm_memory_fraction, ) results.append(kueue_result) @@ -566,6 +571,16 @@ def performance( "--apply-backoff", help="Backoff base (seconds) between apply retries.", ), + vm_memory_fraction: float = typer.Option( + k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION, + "--vm-memory-fraction", + min=0.1, + max=0.95, + help=( + "Fraction of per-job memory assigned to stress-ng --vm-bytes. " + "Lower values reduce OOM risk." + ), + ), ): """Compare native K8s job scheduling vs. Kueue.""" profile = _normalize_profile_name(profile) @@ -625,6 +640,7 @@ def performance( apply_retries, apply_backoff, ) + logger.info("VM Frac : %s", vm_memory_fraction) if not k8s.check(namespace, kueue, priority): logger.error("Please check your Kueue configuration.") @@ -645,6 +661,7 @@ def performance( apply_chunk_size=apply_chunk_size, apply_retries=apply_retries, apply_backoff=apply_backoff, + vm_memory_fraction=vm_memory_fraction, ) logger.info("Benchmark completed successfully.") logger.info("Results saved to %s", output) From 9c79392baa6a18e0742ead9f62583c48f5f924eb Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Thu, 26 Mar 2026 13:51:46 -0700 Subject: [PATCH 03/11] feat(benchmark): enhance memory management with vm_memory_fraction calculations and OOM risk assessment --- .../kueue/kueuer/src/kueuer/benchmarks/k8s.py | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py index 91c553f5..cba93875 100644 --- a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py +++ b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py @@ -18,6 +18,7 @@ 
from kueuer.utils.logging import logger app = typer.Typer(help="Launch K8s Jobs") +DEFAULT_STRESS_VM_MEMORY_FRACTION = 0.33 def check(namespace: str, kueue: str, priority: str) -> bool: @@ -131,6 +132,33 @@ def stress_cpu_workers(cores: float) -> int: return max(int(math.ceil(cores)), 1) +def stress_vm_bytes_mb(ram_gb: float, vm_memory_fraction: float) -> float: + """Return stress-ng vm-bytes value in megabytes. + + Args: + ram_gb: Pod memory limit/request in GiB. + vm_memory_fraction: Fraction of pod memory allocated to stress-ng vm worker. + + Returns: + float: vm-bytes value in MB. + """ + if not 0.0 < vm_memory_fraction < 1.0: + raise ValueError("vm_memory_fraction must be between 0 and 1 (exclusive)") + ram_mb = ram_gb * 1024.0 + return ram_mb * vm_memory_fraction + + +def is_high_oom_risk(ram_gb: float, vm_memory_fraction: float) -> bool: + """Heuristic for tight memory headroom likely to trigger OOM. + + This warns when stress-ng memory pressure leaves too little room for process/runtime + overhead under pod cgroup limits. + """ + vm_bytes_mb = stress_vm_bytes_mb(ram_gb=ram_gb, vm_memory_fraction=vm_memory_fraction) + # For small pods, keep additional safety headroom to avoid allocator/runtime spikes. + return vm_memory_fraction >= 0.75 or (ram_gb <= 1.0 and vm_bytes_mb >= 600.0) + + def _format_cpu_quantity(cores: float) -> str: """Format CPU cores into Kubernetes CPU quantity syntax.""" if float(cores).is_integer(): @@ -383,10 +411,26 @@ def run( help="Backoff base (seconds) used between apply retries.", ) ), + vm_memory_fraction: float = ( + typer.Option( + DEFAULT_STRESS_VM_MEMORY_FRACTION, + "--vm-memory-fraction", + min=0.1, + max=0.95, + help=( + "Fraction of pod memory assigned to stress-ng --vm-bytes. " + "Lower values leave more headroom and reduce OOM risk." 
+ ), + ) + ), ) -> Dict[str, Any]: """Run jobs to stress k8s cluster.""" ram_mb: float = ram * 1024.0 cpu_workers = stress_cpu_workers(cores) + vm_bytes_mb = stress_vm_bytes_mb( + ram_gb=ram, + vm_memory_fraction=vm_memory_fraction, + ) cpu_quantity = _format_cpu_quantity(cores) args: List[str] = [ "--cpu", @@ -396,13 +440,21 @@ def run( "--vm", "1", "--vm-bytes", - f"{ram_mb * 0.8}M", + f"{vm_bytes_mb}M", "--temp-path", "/tmp", "--timeout", f"{duration}", "--metrics-brief", ] + if is_high_oom_risk(ram_gb=ram, vm_memory_fraction=vm_memory_fraction): + logger.warning( + "High OOM risk: ram=%sGi, vm-memory-fraction=%s (vm-bytes=%sM). " + "Consider reducing --vm-memory-fraction.", + ram, + vm_memory_fraction, + f"{vm_bytes_mb:.1f}", + ) job = io.read_yaml(filepath) # Write common job parameters From c2609be47db85b7664a23ce29eb7018514584bca Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Thu, 26 Mar 2026 13:52:04 -0700 Subject: [PATCH 04/11] feat(lifecycle): add warnings to preflight report and enhance access checks output --- .../kueuer/src/kueuer/lifecycle/commands.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py b/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py index 76a7087d..7ec1d99d 100644 --- a/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py +++ b/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py @@ -128,6 +128,9 @@ def run_cluster_preflight( *kueue_report.get("errors", []), *queue_report.get("errors", []), ] + warnings = [ + *access_report.get("warnings", []), + ] remediation = [ *kueue_report.get("remediation", []), *queue_report.get("manual_commands", []), @@ -135,6 +138,7 @@ def run_cluster_preflight( return { "ok": access_report["ok"] and kueue_report["ok"] and queue_report["ok"], "errors": errors, + "warnings": warnings, "remediation": remediation, "checks": { "access": access_report.get("checks", {}), @@ -190,10 +194,16 @@ def 
print_preflight_report(report: Dict[str, Any]) -> None: typer.echo("Access checks:") typer.echo(f" - kubectl installed: {'yes' if access.get('binary:kubectl') else 'no'}") typer.echo(f" - current context readable: {'yes' if access.get('context') else 'no'}") - typer.echo(f" - cluster reachable: {'yes' if access.get('cluster-info') else 'no'}") + typer.echo( + f" - workload namespace exists: {'yes' if access.get('namespace-exists') else 'no'}" + ) typer.echo( f" - can create jobs: {'yes' if access.get('can-create-jobs') else 'no'}" ) + typer.echo( + " - cluster-info (kubectl -n ): " + f"{'yes' if access.get('cluster-info') else 'no'}" + ) kueue = report.get("checks", {}).get("kueue", {}) typer.echo("Kueue health:") @@ -214,6 +224,10 @@ def print_preflight_report(report: Dict[str, Any]) -> None: _echo_list("LocalQueues", list(inventory.get("localqueues", []))) _echo_list("PriorityClasses", list(inventory.get("workloadpriorityclasses", []))) + if report.get("warnings"): + typer.echo("Warnings:") + for item in report["warnings"]: + typer.echo(f" - {item}") if report.get("errors"): typer.echo("Errors:") for error in report["errors"]: From d2a068a66f65a17ea453b3efbbd098e34c01e997 Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Thu, 26 Mar 2026 13:52:30 -0700 Subject: [PATCH 05/11] feat(lifecycle): enhance preflight checks with namespace validation and warning handling --- .../kueuer/src/kueuer/lifecycle/preflight.py | 86 ++++++++++++++----- 1 file changed, 66 insertions(+), 20 deletions(-) diff --git a/configs/kueue/kueuer/src/kueuer/lifecycle/preflight.py b/configs/kueue/kueuer/src/kueuer/lifecycle/preflight.py index 42b670d7..6b6f34d1 100644 --- a/configs/kueue/kueuer/src/kueuer/lifecycle/preflight.py +++ b/configs/kueue/kueuer/src/kueuer/lifecycle/preflight.py @@ -12,7 +12,16 @@ def run_preflight( command_exists_fn: Callable[[str], bool] = command_exists, run_cmd: Callable[[List[str]], Any] = run_command, ) -> Dict[str, Any]: + """Verify kubectl, workload namespace, 
Job RBAC, and API connectivity. + + ``kubectl cluster-info`` is not namespace-scoped; we run + ``kubectl -n <namespace> cluster-info`` so the workload namespace is set on + the kubectl invocation. If that command fails but the namespace exists and + ``can-i create jobs`` succeeds, preflight still passes with a warning so + benchmarks can run when ``cluster-info`` flakes or is restricted. + """ errors: List[str] = [] + warnings: List[str] = [] checks: Dict[str, bool] = {} for binary in ("kubectl",): @@ -22,31 +31,68 @@ def run_preflight( errors.append(f"Required binary missing: {binary}") context = "" - if not errors: - context_result = run_cmd(["kubectl", "config", "current-context"]) - checks["context"] = context_result.returncode == 0 - if context_result.returncode != 0: - errors.append("kubectl config current-context failed") - else: - context = context_result.stdout.strip() - - cluster_result = run_cmd(["kubectl", "cluster-info"]) - checks["cluster-info"] = cluster_result.returncode == 0 - if cluster_result.returncode != 0: - errors.append("kubectl cluster-info failed") - - can_i_result = run_cmd( - ["kubectl", "auth", "can-i", "create", "jobs", "-n", namespace] + if errors: + return { + "ok": False, + "context": context, + "checks": checks, + "errors": errors, + "warnings": warnings, + } + + context_result = run_cmd(["kubectl", "config", "current-context"]) + checks["context"] = context_result.returncode == 0 + if context_result.returncode != 0: + errors.append("kubectl config current-context failed") + else: + context = context_result.stdout.strip() + + if errors: + return { + "ok": False, + "context": context, + "checks": checks, + "errors": errors, + "warnings": warnings, + } + + ns_result = run_cmd(["kubectl", "get", "namespace", namespace]) + checks["namespace-exists"] = ns_result.returncode == 0 + if not checks["namespace-exists"]: + errors.append( + f"Workload namespace {namespace!r} not found or not accessible." 
) - checks["can-create-jobs"] = can_i_result.returncode == 0 and ( - "yes" in can_i_result.stdout.lower() + + can_i_result = run_cmd( + ["kubectl", "auth", "can-i", "create", "jobs", "-n", namespace] + ) + checks["can-create-jobs"] = can_i_result.returncode == 0 and ( + "yes" in can_i_result.stdout.lower() + ) + if not checks["can-create-jobs"]: + errors.append("kubectl auth can-i create jobs failed") + + if errors: + return { + "ok": False, + "context": context, + "checks": checks, + "errors": errors, + "warnings": warnings, + } + + cluster_result = run_cmd(["kubectl", "-n", namespace, "cluster-info"]) + checks["cluster-info"] = cluster_result.returncode == 0 + if not checks["cluster-info"]: + warnings.append( + f"kubectl -n {namespace!r} cluster-info failed; continuing because " + "the workload namespace exists and Job creation is allowed." ) - if not checks["can-create-jobs"]: - errors.append("kubectl auth can-i create jobs failed") return { - "ok": not errors, + "ok": True, "context": context, "checks": checks, "errors": errors, + "warnings": warnings, } From 3f5433057c19e43de7ef99e5d08447169b836cbf Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Thu, 26 Mar 2026 13:52:56 -0700 Subject: [PATCH 06/11] feat(benchmark): add stress VM memory fraction tests and OOM risk assessment --- .../tests/test_k8s_phase2_resilience.py | 22 ++++++++ .../kueuer/tests/test_lifecycle_preflight.py | 53 +++++++++++++------ 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py b/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py index e48f7dae..064a9268 100644 --- a/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py +++ b/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py @@ -1,5 +1,7 @@ from types import SimpleNamespace +import pytest + from kueuer.benchmarks import k8s @@ -52,3 +54,23 @@ def list_namespaced_pod(self, namespace): monkeypatch.setattr(k8s.client, "CoreV1Api", FakeCoreV1Api) assert 
k8s.kueue_controller_restarts() == 0 + + +def test_stress_vm_bytes_mb_uses_safer_default_fraction() -> None: + assert k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION == 0.4 + assert k8s.stress_vm_bytes_mb( + ram_gb=1.0, + vm_memory_fraction=k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION, + ) == pytest.approx(409.6) + + +def test_stress_vm_bytes_mb_validates_fraction_bounds() -> None: + with pytest.raises(ValueError): + k8s.stress_vm_bytes_mb(ram_gb=1.0, vm_memory_fraction=0.0) + with pytest.raises(ValueError): + k8s.stress_vm_bytes_mb(ram_gb=1.0, vm_memory_fraction=1.0) + + +def test_is_high_oom_risk_flags_tight_memory_headroom() -> None: + assert k8s.is_high_oom_risk(ram_gb=1.0, vm_memory_fraction=0.8) is True + assert k8s.is_high_oom_risk(ram_gb=1.0, vm_memory_fraction=0.55) is False diff --git a/configs/kueue/kueuer/tests/test_lifecycle_preflight.py b/configs/kueue/kueuer/tests/test_lifecycle_preflight.py index 69205256..ffd0845f 100644 --- a/configs/kueue/kueuer/tests/test_lifecycle_preflight.py +++ b/configs/kueue/kueuer/tests/test_lifecycle_preflight.py @@ -25,14 +25,17 @@ def _run(command: List[str]) -> FakeResult: runner = CliRunner() +def _access_success_responses() -> dict[str, FakeResult]: + return { + "kubectl config current-context": FakeResult(0, "minikube\n"), + "kubectl get namespace skaha-workload": FakeResult(0, "Active\n"), + "kubectl auth can-i create jobs -n skaha-workload": FakeResult(0, "yes\n"), + "kubectl -n skaha-workload cluster-info": FakeResult(0, "ok\n"), + } + + def test_run_preflight_success() -> None: - run = _runner( - { - "kubectl config current-context": FakeResult(0, "minikube\n"), - "kubectl cluster-info": FakeResult(0, "ok\n"), - "kubectl auth can-i create jobs -n skaha-workload": FakeResult(0, "yes\n"), - } - ) + run = _runner(_access_success_responses()) report = preflight.run_preflight( namespace="skaha-workload", command_exists_fn=lambda cmd: True, @@ -43,13 +46,7 @@ def test_run_preflight_success() -> None: def 
test_run_preflight_success_without_helm() -> None: - run = _runner( - { - "kubectl config current-context": FakeResult(0, "minikube\n"), - "kubectl cluster-info": FakeResult(0, "ok\n"), - "kubectl auth can-i create jobs -n skaha-workload": FakeResult(0, "yes\n"), - } - ) + run = _runner(_access_success_responses()) report = preflight.run_preflight( namespace="skaha-workload", command_exists_fn=lambda cmd: cmd == "kubectl", @@ -68,11 +65,32 @@ def test_run_preflight_fails_when_kubectl_missing() -> None: assert "kubectl" in " ".join(report["errors"]).lower() -def test_run_preflight_fails_on_cluster_unreachable() -> None: +def test_run_preflight_warns_when_cluster_info_fails_but_namespace_ok() -> None: + """cluster-info is non-fatal; namespace + can-i determine success.""" + run = _runner( + { + "kubectl config current-context": FakeResult(0, "minikube\n"), + "kubectl get namespace skaha-workload": FakeResult(0, "Active\n"), + "kubectl auth can-i create jobs -n skaha-workload": FakeResult(0, "yes\n"), + "kubectl -n skaha-workload cluster-info": FakeResult(1, "", "connection refused"), + } + ) + report = preflight.run_preflight( + namespace="skaha-workload", + command_exists_fn=lambda cmd: True, + run_cmd=run, + ) + assert report["ok"] is True + assert report["checks"]["cluster-info"] is False + assert report["warnings"] + assert not report["errors"] + + +def test_run_preflight_fails_when_namespace_missing() -> None: run = _runner( { "kubectl config current-context": FakeResult(0, "minikube\n"), - "kubectl cluster-info": FakeResult(1, "", "connection refused"), + "kubectl get namespace skaha-workload": FakeResult(1, "", "NotFound"), } ) report = preflight.run_preflight( @@ -81,7 +99,7 @@ def test_run_preflight_fails_on_cluster_unreachable() -> None: run_cmd=run, ) assert report["ok"] is False - assert "cluster-info" in " ".join(report["errors"]).lower() + assert "namespace" in " ".join(report["errors"]).lower() def 
test_preflight_command_prints_verbose_inventory(monkeypatch, tmp_path) -> None: @@ -97,6 +115,7 @@ def test_preflight_command_prints_verbose_inventory(monkeypatch, tmp_path) -> No "access": { "binary:kubectl": True, "context": True, + "namespace-exists": True, "cluster-info": True, "can-create-jobs": True, }, From 8883ebf8430df60c2d24b046b0e1ac66a2d5248b Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Thu, 26 Mar 2026 14:57:02 -0700 Subject: [PATCH 07/11] feat(benchmark): introduce spawn mechanism parameter for job submission methods --- .../kueuer/src/kueuer/benchmarks/benchmark.py | 25 ++ .../kueue/kueuer/src/kueuer/benchmarks/k8s.py | 231 ++++++++++++++++-- .../kueuer/src/kueuer/lifecycle/commands.py | 2 + .../kueuer/src/kueuer/lifecycle/suite.py | 7 + .../kueuer/tests/test_spawn_mechanism_api.py | 137 +++++++++++ 5 files changed, 388 insertions(+), 14 deletions(-) create mode 100644 configs/kueue/kueuer/tests/test_spawn_mechanism_api.py diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py b/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py index 55b75349..27886636 100644 --- a/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py +++ b/configs/kueue/kueuer/src/kueuer/benchmarks/benchmark.py @@ -262,6 +262,7 @@ def experiment( apply_retries: int = 2, apply_backoff: float = 2.0, vm_memory_fraction: float = k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION, + spawn_mechanism: str = "kubectl", ) -> Dict[str, Any]: """Run a single experiment with the specified configuration. @@ -316,6 +317,7 @@ def experiment( apply_retries=apply_retries, apply_backoff=apply_backoff, vm_memory_fraction=vm_memory_fraction, + spawn_mechanism=spawn_mechanism, ) # Track jobs to completion and get timing statistics @@ -398,6 +400,7 @@ def benchmark( apply_retries: int, apply_backoff: float, vm_memory_fraction: float, + spawn_mechanism: str, ) -> List[Dict[str, Any]]: """ Run a complete benchmark comparing direct Kubernetes jobs vs Kueue jobs. 
@@ -438,6 +441,7 @@ def benchmark( apply_retries=apply_retries, apply_backoff=apply_backoff, vm_memory_fraction=vm_memory_fraction, + spawn_mechanism=spawn_mechanism, ) results.append(result) @@ -464,6 +468,7 @@ def benchmark( apply_retries=apply_retries, apply_backoff=apply_backoff, vm_memory_fraction=vm_memory_fraction, + spawn_mechanism=spawn_mechanism, ) results.append(kueue_result) @@ -571,6 +576,11 @@ def performance( "--apply-backoff", help="Backoff base (seconds) between apply retries.", ), + spawn_mechanism: str = typer.Option( + "kubectl", + "--spawn-mechanism", + help="Job spawn mechanism to use: kubectl (apply) or api (client create).", + ), vm_memory_fraction: float = typer.Option( k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION, "--vm-memory-fraction", @@ -640,6 +650,7 @@ def performance( apply_retries, apply_backoff, ) + logger.info("Spawn : %s", spawn_mechanism) logger.info("VM Frac : %s", vm_memory_fraction) if not k8s.check(namespace, kueue, priority): @@ -662,6 +673,7 @@ def performance( apply_retries=apply_retries, apply_backoff=apply_backoff, vm_memory_fraction=vm_memory_fraction, + spawn_mechanism=spawn_mechanism, ) logger.info("Benchmark completed successfully.") logger.info("Results saved to %s", output) @@ -756,6 +768,11 @@ def eviction( "--apply-backoff", help="Backoff base (seconds) between apply retries.", ), + spawn_mechanism: str = typer.Option( + "kubectl", + "--spawn-mechanism", + help="Job spawn mechanism to use: kubectl (apply) or api (client create).", + ), ): """Run a benchmark to test eviction behavior of Kueue in a packed cluster queue.""" profile = _normalize_profile_name(profile) @@ -809,6 +826,7 @@ def eviction( apply_retries, apply_backoff, ) + logger.info("Spawn : %s", spawn_mechanism) logger.info("K8s Resource : %s", resource_id) for priority in priorities: @@ -851,6 +869,7 @@ def eviction( apply_chunk_size=apply_chunk_size, apply_retries=apply_retries, apply_backoff=apply_backoff, + spawn_mechanism=spawn_mechanism, ) 
logger.info("All jobs launched successfully.") @@ -1005,6 +1024,11 @@ def e2e( "--keep-artifacts/--no-keep-artifacts", help="Keep generated artifacts.", ), + spawn_mechanism: str = typer.Option( + "kubectl", + "--spawn-mechanism", + help="Job spawn mechanism to use: kubectl (apply) or api (client create).", + ), ) -> None: """Run the full benchmark workflow with automatic post-processing.""" from kueuer.lifecycle import commands as lifecycle_commands @@ -1041,6 +1065,7 @@ def e2e( observe_output_subdir=observe_output_subdir, skip_queue_apply=skip_queue_apply, skip_teardown=skip_teardown, + spawn_mechanism=spawn_mechanism, ) typer.echo(f"e2e {'ok' if report['ok'] else 'failed'} for run {report['run_id']}") diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py index cba93875..f7260dfb 100644 --- a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py +++ b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py @@ -3,8 +3,9 @@ import asyncio import copy import math +import random from time import time -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import aiofiles import aiofiles.os @@ -19,6 +20,10 @@ app = typer.Typer(help="Launch K8s Jobs") DEFAULT_STRESS_VM_MEMORY_FRACTION = 0.33 +DEFAULT_SPAWN_MECHANISM = "kubectl" + +# Cache API connectivity mode per process. 
+_API_CLIENT_READY: bool = False def check(namespace: str, kueue: str, priority: str) -> bool: @@ -284,6 +289,7 @@ async def apply( "apply_retries": 0, "manifest_apply_seconds": 0.0, "last_error": "", + "spawn_mechanism": "kubectl", } for start, end in chunk_ranges(count, chunk_size): report["chunks_total"] += 1 @@ -291,12 +297,7 @@ async def apply( async with aiofiles.tempfile.NamedTemporaryFile( delete=False, mode="w", suffix=".yaml" ) as temp: - for num in range(start, end): - manifest = copy.deepcopy(data) - name: str = f"{prefix}-{num}" - manifest["metadata"]["name"] = name - for container in manifest["spec"]["template"]["spec"]["containers"]: - container["name"] = name + for manifest in render_job_manifests(data, prefix, start, end): await temp.write(yaml.dump(manifest)) await temp.write("\n---\n") logger.debug("Applying %s", temp.name) @@ -347,6 +348,199 @@ async def apply( return report +def render_job_manifest( + template: Dict[Any, Any], + name: str, +) -> Dict[Any, Any]: + """Return a deep-copied job manifest with name and container names set.""" + manifest = copy.deepcopy(template) + manifest["metadata"]["name"] = name + for container in manifest["spec"]["template"]["spec"]["containers"]: + container["name"] = name + return manifest + + +def render_job_manifests( + template: Dict[Any, Any], + prefix: str, + start: int, + end: int, +) -> List[Dict[Any, Any]]: + """Render job manifests for indices in [start, end).""" + return [render_job_manifest(template, f"{prefix}-{num}") for num in range(start, end)] + + +def _is_transient_api_error(error: ApiException) -> bool: + try: + status = int(getattr(error, "status", 0) or 0) + except Exception: # noqa: BLE001 + return False + return status in {429, 500, 502, 503, 504} + + +def _api_preflight_check(namespace: str) -> None: + """Perform a lightweight namespace-scoped API check.""" + v1 = client.CoreV1Api() + # Prefer a namespaced call to ensure auth scopes match workload permissions. 
+ v1.list_namespaced_pod(namespace=namespace, limit=1) # type: ignore[arg-type] + + +def ensure_api_client_ready(namespace: str) -> None: + """Ensure Kubernetes Python client is configured (kubeconfig or incluster).""" + global _API_CLIENT_READY + if _API_CLIENT_READY: + return + + errors: List[str] = [] + try: + config.load_kube_config() + _api_preflight_check(namespace) + _API_CLIENT_READY = True + logger.info("Kubernetes API client configured using kubeconfig.") + return + except Exception as error: # noqa: BLE001 + errors.append(f"kubeconfig: {error}") + + try: + config.load_incluster_config() + _api_preflight_check(namespace) + _API_CLIENT_READY = True + logger.info("Kubernetes API client configured using in-cluster service account.") + return + except Exception as error: # noqa: BLE001 + errors.append(f"incluster: {error}") + + msg = "Unable to configure Kubernetes API client. " + "; ".join(errors) + raise RuntimeError(msg) + + +async def _create_job_with_retries( + batch: client.BatchV1Api, + namespace: str, + manifest: Dict[Any, Any], + retries: int, + backoff_seconds: float, +) -> Tuple[bool, str]: + """Create a job with transient retries. Returns (ok, error_message).""" + name = str(manifest.get("metadata", {}).get("name", "")) + for attempt in range(1, retries + 2): + try: + batch.create_namespaced_job(namespace=namespace, body=manifest) # type: ignore[arg-type] + return True, "" + except ApiException as error: + if getattr(error, "status", None) == 409: + # Create-only semantics: treat existing job as a failure. + return False, f"{name}: already exists (409)" + if not _is_transient_api_error(error) or attempt > retries: + return False, f"{name}: {error}" + sleep_s = backoff_seconds * attempt + # Add jitter to avoid thundering herd. 
+ sleep_s *= 0.8 + (0.4 * random.random()) + await asyncio.sleep(sleep_s) + except Exception as error: # noqa: BLE001 + if attempt > retries: + return False, f"{name}: {error}" + sleep_s = backoff_seconds * attempt + sleep_s *= 0.8 + (0.4 * random.random()) + await asyncio.sleep(sleep_s) + return False, f"{name}: unknown failure" + + +async def apply_api( + data: Dict[Any, Any], + prefix: str, + count: int, + chunk_size: int = 25, + retries: int = 2, + backoff_seconds: float = 2.0, + api_concurrency: int = 10, + namespace: str = "default", +) -> Dict[str, Any]: + """Create Kubernetes Jobs using the Python API client.""" + ensure_api_client_ready(namespace=namespace) + batch = client.BatchV1Api() + semaphore = asyncio.Semaphore(max(int(api_concurrency), 1)) + + now = time() + report: Dict[str, Any] = { + "requested_jobs": count, + "chunk_size": chunk_size, + "chunks_total": 0, + "chunks_succeeded": 0, + "chunks_failed": 0, + "jobs_applied": 0, + "jobs_failed_to_apply": 0, + "apply_attempts": 0, + "apply_retries": 0, + "manifest_apply_seconds": 0.0, + "last_error": "", + "spawn_mechanism": "api", + "api_concurrency": api_concurrency, + } + + async def _guarded_create(manifest: Dict[Any, Any]) -> Tuple[bool, str]: + async with semaphore: + ok, err = await _create_job_with_retries( + batch=batch, + namespace=namespace, + manifest=manifest, + retries=retries, + backoff_seconds=backoff_seconds, + ) + report["apply_attempts"] += 1 + if not ok and err: + report["last_error"] = err + return ok, err + + for start, end in chunk_ranges(count, chunk_size): + report["chunks_total"] += 1 + manifests = render_job_manifests(data, prefix, start, end) + results = await asyncio.gather(*[_guarded_create(m) for m in manifests]) + failures = [err for ok, err in results if not ok] + if failures: + report["chunks_failed"] += 1 + report["jobs_failed_to_apply"] += len(failures) + report["last_error"] = failures[0] + else: + report["chunks_succeeded"] += 1 + report["jobs_applied"] += 
len(results) + + report["manifest_apply_seconds"] = time() - now + logger.info("Took %ss to submit jobs via API", report["manifest_apply_seconds"]) + return report + + +async def submit_jobs( + template: Dict[Any, Any], + prefix: str, + jobs: int, + spawn_mechanism: str, + namespace: str, + apply_chunk_size: int, + apply_retries: int, + apply_backoff: float, +) -> Dict[str, Any]: + """Submit rendered jobs using the selected spawn mechanism.""" + if spawn_mechanism == "api": + return await apply_api( + template, + prefix, + jobs, + chunk_size=apply_chunk_size, + retries=apply_retries, + backoff_seconds=apply_backoff, + namespace=namespace, + ) + return await apply( + template, + prefix, + jobs, + chunk_size=apply_chunk_size, + retries=apply_retries, + backoff_seconds=apply_backoff, + ) + + @app.command("run") def run( filepath: str = ( @@ -423,6 +617,13 @@ def run( ), ) ), + spawn_mechanism: str = ( + typer.Option( + DEFAULT_SPAWN_MECHANISM, + "--spawn-mechanism", + help="Job spawn mechanism to use: kubectl (apply) or api (client create).", + ) + ), ) -> Dict[str, Any]: """Run jobs to stress k8s cluster.""" ram_mb: float = ram * 1024.0 @@ -480,13 +681,15 @@ def run( loop = asyncio.get_event_loop() asyncio.set_event_loop(loop) result = loop.run_until_complete( - apply( - job, - prefix, - jobs, - chunk_size=apply_chunk_size, - retries=apply_retries, - backoff_seconds=apply_backoff, + submit_jobs( + template=job, + prefix=prefix, + jobs=jobs, + spawn_mechanism=spawn_mechanism, + namespace=namespace, + apply_chunk_size=apply_chunk_size, + apply_retries=apply_retries, + apply_backoff=apply_backoff, ) ) return result diff --git a/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py b/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py index 7ec1d99d..e45feed0 100644 --- a/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py +++ b/configs/kueue/kueuer/src/kueuer/lifecycle/commands.py @@ -423,6 +423,7 @@ def run_benchmark_e2e( observe_output_subdir: str = 
DEFAULT_OBSERVATION_SUBDIR, skip_queue_apply: bool = False, skip_teardown: bool = False, + spawn_mechanism: str = "kubectl", ) -> Dict[str, Any]: """Run the internal benchmark end-to-end workflow and persist its manifest.""" effective = run_id or default_run_id() @@ -452,6 +453,7 @@ def run_benchmark_e2e( observe=observe, observe_interval_seconds=observe_interval_seconds, observe_output_subdir=observe_output_subdir, + spawn_mechanism=spawn_mechanism, ), collect_fn=lambda: collect_outputs( performance_csv=_resolve_suite_path( diff --git a/configs/kueue/kueuer/src/kueuer/lifecycle/suite.py b/configs/kueue/kueuer/src/kueuer/lifecycle/suite.py index 3d1383a9..397b8d5c 100644 --- a/configs/kueue/kueuer/src/kueuer/lifecycle/suite.py +++ b/configs/kueue/kueuer/src/kueuer/lifecycle/suite.py @@ -19,6 +19,7 @@ def _suite_commands( localqueue: str, priority: str, artifacts_dir: str, + spawn_mechanism: str = "kubectl", ) -> List[str]: """Build the command transcript recorded in lifecycle suite reports.""" return [ @@ -31,6 +32,7 @@ def _suite_commands( f"--ram {performance_options['ram']} " f"--storage {performance_options['storage']} " f"--wait {performance_options['wait']} " + f"--spawn-mechanism {spawn_mechanism} " f"-n {namespace} -k {localqueue} -p {priority} " f"-o {artifacts_dir}" ), @@ -42,6 +44,7 @@ def _suite_commands( f"--cores {eviction_options['cores']} " f"--ram {eviction_options['ram']} " f"--storage {eviction_options['storage']} " + f"--spawn-mechanism {spawn_mechanism} " f"-n {namespace} -k {localqueue} " "-p low -p medium -p high " f"-o {artifacts_dir}" @@ -61,6 +64,7 @@ def run_benchmark_suite( observe: bool = False, observe_interval_seconds: float = 5.0, observe_output_subdir: str = "observe", + spawn_mechanism: str = "kubectl", collector_factory: Callable[..., Any] = ObservationCollector, scenario_apply_fn: Callable[..., Dict[str, Any]] = apply_scenario, scenario_restore_fn: Callable[..., Dict[str, Any]] = restore_scenario, @@ -87,6 +91,7 @@ def 
run_benchmark_suite( localqueue=localqueue, priority=priority, artifacts_dir=artifacts_dir, + spawn_mechanism=spawn_mechanism, ) with tempfile.TemporaryDirectory(prefix="kueuer-scenario-") as scenario_tmp: scenario_context = scenario_apply_fn( @@ -145,6 +150,7 @@ def run_benchmark_suite( apply_chunk_size=25, apply_retries=2, apply_backoff=2.0, + spawn_mechanism=spawn_mechanism, ) eviction_runner( filepath=DEFAULT_JOBSPEC_FILEPATH, @@ -162,6 +168,7 @@ def run_benchmark_suite( apply_chunk_size=25, apply_retries=2, apply_backoff=2.0, + spawn_mechanism=spawn_mechanism, ) finally: if collector is not None: diff --git a/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py b/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py new file mode 100644 index 00000000..0ef05a8c --- /dev/null +++ b/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py @@ -0,0 +1,137 @@ +import asyncio + +import pytest +from kubernetes.client.rest import ApiException + +from kueuer.benchmarks import k8s + + +def _template() -> dict: + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": {"name": "template", "namespace": "ns"}, + "spec": { + "template": { + "spec": { + "containers": [{"name": "template", "image": "busybox"}], + "restartPolicy": "Never", + } + } + }, + } + + +def test_render_job_manifests_sets_names_and_container_names() -> None: + manifests = k8s.render_job_manifests(_template(), prefix="pfx", start=0, end=3) + assert [m["metadata"]["name"] for m in manifests] == ["pfx-0", "pfx-1", "pfx-2"] + assert [ + m["spec"]["template"]["spec"]["containers"][0]["name"] for m in manifests + ] == ["pfx-0", "pfx-1", "pfx-2"] + + +def test_is_transient_api_error_matches_expected_statuses() -> None: + assert k8s._is_transient_api_error(ApiException(status=502)) is True + assert k8s._is_transient_api_error(ApiException(status=500)) is True + assert k8s._is_transient_api_error(ApiException(status=429)) is True + assert k8s._is_transient_api_error(ApiException(status=403)) is 
False + assert k8s._is_transient_api_error(ApiException(status=409)) is False + + +def test_ensure_api_client_ready_falls_back_to_incluster(monkeypatch) -> None: + k8s._API_CLIENT_READY = False + + def fail_kubeconfig() -> None: + raise RuntimeError("no kubeconfig") + + called = {"incluster": 0, "check": 0} + + def ok_incluster() -> None: + called["incluster"] += 1 + + def ok_check(namespace: str) -> None: + called["check"] += 1 + + monkeypatch.setattr(k8s.config, "load_kube_config", fail_kubeconfig) + monkeypatch.setattr(k8s.config, "load_incluster_config", ok_incluster) + monkeypatch.setattr(k8s, "_api_preflight_check", ok_check) + + k8s.ensure_api_client_ready(namespace="ns") + assert called["incluster"] == 1 + assert called["check"] == 1 + + +def test_submit_jobs_dispatches_to_kubectl(monkeypatch) -> None: + async def fake_apply(*args, **kwargs): + return {"spawn_mechanism": "kubectl"} + + async def fake_apply_api(*args, **kwargs): + return {"spawn_mechanism": "api"} + + monkeypatch.setattr(k8s, "apply", fake_apply) + monkeypatch.setattr(k8s, "apply_api", fake_apply_api) + + result = asyncio.run( + k8s.submit_jobs( + template=_template(), + prefix="pfx", + jobs=1, + spawn_mechanism="kubectl", + namespace="ns", + apply_chunk_size=1, + apply_retries=0, + apply_backoff=0.0, + ) + ) + assert result["spawn_mechanism"] == "kubectl" + + +def test_submit_jobs_dispatches_to_api(monkeypatch) -> None: + async def fake_apply(*args, **kwargs): + return {"spawn_mechanism": "kubectl"} + + async def fake_apply_api(*args, **kwargs): + return {"spawn_mechanism": "api"} + + monkeypatch.setattr(k8s, "apply", fake_apply) + monkeypatch.setattr(k8s, "apply_api", fake_apply_api) + + result = asyncio.run( + k8s.submit_jobs( + template=_template(), + prefix="pfx", + jobs=1, + spawn_mechanism="api", + namespace="ns", + apply_chunk_size=1, + apply_retries=0, + apply_backoff=0.0, + ) + ) + assert result["spawn_mechanism"] == "api" + + +@pytest.mark.parametrize("status", [500, 502, 503]) 
+def test_create_job_with_retries_retries_transient_errors(monkeypatch, status: int) -> None: + attempts = {"count": 0} + + class FakeBatch: + def create_namespaced_job(self, namespace, body): + attempts["count"] += 1 + if attempts["count"] < 2: + raise ApiException(status=status) + return object() + + ok, err = asyncio.run( + k8s._create_job_with_retries( + batch=FakeBatch(), + namespace="ns", + manifest={"metadata": {"name": "job1"}}, + retries=3, + backoff_seconds=0.0, + ) + ) + assert ok is True + assert err == "" + assert attempts["count"] == 2 + From d42fea0330490783d1754f24a86387b1fd5a95aa Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Mon, 30 Mar 2026 07:39:53 -0700 Subject: [PATCH 08/11] feat(resources): enhance GPU resource handling with detailed kind resolution and updated return structure --- configs/kueue/kueuer/src/kueuer/resources.py | 134 ++++++++++++++++--- 1 file changed, 114 insertions(+), 20 deletions(-) diff --git a/configs/kueue/kueuer/src/kueuer/resources.py b/configs/kueue/kueuer/src/kueuer/resources.py index bae896c2..de4a78a0 100644 --- a/configs/kueue/kueuer/src/kueuer/resources.py +++ b/configs/kueue/kueuer/src/kueuer/resources.py @@ -11,8 +11,12 @@ Totals cluster resources across Kubernetes nodes filtered by name regex. - Deduplicates nodes by UID (so overlapping regex lists don't double count). -- By default totals from node .status.capacity; use --field allocatable to sum .status.allocatable instead. -- Returns a mapping: dict[str, dict[str, str]] with { "value": , "unit": }. +- By default totals from node .status.capacity; use --field allocatable to sum + .status.allocatable instead. +- Returns a mapping: ``cpu`` uses ``{ "value", "unit": "cores" }``; ``memory`` and + ``ephemeral-storage`` use values in **Gi** (1024³ bytes) with ``unit: "Gi"``. +- For ``nvidia.com/gpu`` and ``amd.com/gpu``: ``{ "kind", "value", "unit": "count" }`` + where ``kind`` comes from node labels (e.g. ``nvidia.com/gpu.product``). 
- If a resource does not exist on any matched node, it is **omitted**. Examples: @@ -27,10 +31,9 @@ import sys from dataclasses import dataclass from decimal import Decimal, getcontext -from typing import Annotated, Dict, Iterable, List, Optional, Sequence +from typing import Annotated, Any, Dict, Iterable, List, Optional, Sequence, Union, cast import typer -from kubernetes import client, config from kubernetes.client import CoreV1Api, V1Node from kubernetes.utils.quantity import parse_quantity from pydantic import BaseModel, Field, RootModel, ValidationError, field_validator @@ -60,7 +63,21 @@ class ResourceItem(BaseModel): ) -class ResourceMap(RootModel[Dict[str, ResourceItem]]): +class GpuResourceItem(BaseModel): + """Cluster totals for a GPU resource.""" + + kind: str = Field( + ..., + description=( + "Product name if the cluster uses a single model; empty if unknown; " + "'mixed' if multiple models." + ), + ) + value: str = Field(..., description="Total GPU count.") + unit: Literal["count"] = "count" + + +class ResourceMap(RootModel[Dict[str, Union[ResourceItem, GpuResourceItem]]]): """Dynamic resource map so unavailable resources can be omitted.""" @@ -92,6 +109,8 @@ class TotalsAcc: ephemeral_bytes: Optional[int] nvidia_gpu: Optional[int] amd_gpu: Optional[int] + nvidia_by_kind: Optional[Dict[str, int]] + amd_by_kind: Optional[Dict[str, int]] # ========================= @@ -132,6 +151,53 @@ def _collect_nodes(v1: CoreV1Api, patterns: Optional[Sequence[str]]) -> List[V1N return list(dedup.values()) +def _nvidia_gpu_kind_from_labels(labels: Dict[str, str]) -> str: + """Resolve GPU product name from common NVIDIA node labels.""" + for key in ( + "nvidia.com/gpu.product", + "nvidia.com/gfd.gpu.product", + ): + v = labels.get(key) + if v: + return str(v).strip() + return "" + + +def _amd_gpu_kind_from_labels(labels: Dict[str, str]) -> str: + """Resolve GPU product name from common AMD node labels.""" + for key in ( + "amd.com/gpu.product", + "amd.com/gpu.family", 
+ ): + v = labels.get(key) + if v: + return str(v).strip() + return "" + + +def _summary_gpu_kind(by_kind: Dict[str, int]) -> str: + """Single model name, empty if unknown, or 'mixed' if multiple distinct models.""" + active = {k: v for k, v in by_kind.items() if v > 0} + if not active: + return "" + distinct_nonempty = {k for k in active if k} + if not distinct_nonempty: + return "" + if len(distinct_nonempty) == 1: + return next(iter(distinct_nonempty)) + return "mixed" + + +def _bytes_to_gi_str(total_bytes: int) -> str: + """Convert a byte total to a decimal string in Gi (1 Gi = 1024³ bytes).""" + # Use integer 1024**3 so the divisor is exact (Decimal(1024)**3 can round). + gi = Decimal(total_bytes) / Decimal(1024**3) + s = format(gi, "f") + if "." in s: + s = s.rstrip("0").rstrip(".") + return s + + def _get_field_map(node: V1Node, field: str) -> Dict[str, str]: """ Extract either .status.capacity or .status.allocatable as a plain dict[str, str]. @@ -193,11 +259,14 @@ def _sum_resources(nodes: List[V1Node], field: str) -> TotalsAcc: eph_vals: List[str] = [] nvidia_vals: List[str] = [] amd_vals: List[str] = [] + nvidia_by_kind: Dict[str, int] = {} + amd_by_kind: Dict[str, int] = {} for n in nodes: m = _get_field_map(n, field) if not m: continue + labels = (n.metadata.labels or {}) if n.metadata else {} if "cpu" in m: cpu_vals.append(m["cpu"]) if "memory" in m: @@ -206,8 +275,16 @@ def _sum_resources(nodes: List[V1Node], field: str) -> TotalsAcc: eph_vals.append(m["ephemeral-storage"]) if "nvidia.com/gpu" in m: nvidia_vals.append(m["nvidia.com/gpu"]) + nk = _nvidia_gpu_kind_from_labels(labels) + nvidia_by_kind[nk] = nvidia_by_kind.get(nk, 0) + int( + parse_quantity(m["nvidia.com/gpu"]) + ) if "amd.com/gpu" in m: amd_vals.append(m["amd.com/gpu"]) + ak = _amd_gpu_kind_from_labels(labels) + amd_by_kind[ak] = amd_by_kind.get(ak, 0) + int( + parse_quantity(m["amd.com/gpu"]) + ) # Convert lists → optional totals (None means "omit key") def _try_sum(dec_sum_fn, vals): 
@@ -228,6 +305,8 @@ def _try_sum(dec_sum_fn, vals): ephemeral_bytes=int(eph_total) if eph_total is not None else None, nvidia_gpu=nvidia_total, amd_gpu=amd_total, + nvidia_by_kind=nvidia_by_kind if nvidia_total is not None else None, + amd_by_kind=amd_by_kind if amd_total is not None else None, ) @@ -238,21 +317,27 @@ def _try_sum(dec_sum_fn, vals): def total( patterns: Optional[List[str]] = None, field: str = "capacity" -) -> Dict[str, Dict[str, str]]: +) -> Dict[str, Any]: """ - Calculate total cluster resources across nodes matching any of the given regex patterns. + Calculate total cluster resources across nodes matching regex patterns. Args: - patterns: List of regex strings to match node names. If None or empty, includes all nodes. - field: Which field to sum: "capacity" (default) or "allocatable". + patterns: Regex strings for node names. If None or empty, includes all nodes. + field: Which field to sum: "capacity" (default) or "allocatable". Returns: - dict[str, dict[str, str]] mapping resource name -> {"value": , "unit": } - Only includes resources that exist on at least one matched node. + Mapping of resource name to detail dicts. Memory and ephemeral-storage use + Gi and ``unit`` ``\"Gi\"``. GPU entries include ``kind``, ``value``, and + ``unit`` ``\"count\"``. Only includes resources present on at least one node. 
""" # Validate inputs with Pydantic + if field not in ("capacity", "allocatable"): + raise ValueError('field must be "capacity" or "allocatable"') try: - cfg = Settings(patterns=patterns, field=field) + cfg = Settings( + patterns=patterns, + field=cast(Literal["capacity", "allocatable"], field), + ) except ValidationError as e: raise ValueError(str(e)) from e @@ -261,20 +346,30 @@ def total( acc = _sum_resources(nodes, cfg.field) # Build a dynamic map (omit unavailable resources) - result: Dict[str, ResourceItem] = {} + result: Dict[str, Union[ResourceItem, GpuResourceItem]] = {} if acc.cpu_cores is not None: result["cpu"] = ResourceItem(value=f"{acc.cpu_cores}", unit="cores") if acc.memory_bytes is not None: - result["memory"] = ResourceItem(value=f"{acc.memory_bytes}", unit="bytes") + result["memory"] = ResourceItem( + value=_bytes_to_gi_str(acc.memory_bytes), + unit="Gi", + ) if acc.ephemeral_bytes is not None: result["ephemeral-storage"] = ResourceItem( - value=f"{acc.ephemeral_bytes}", unit="bytes" + value=_bytes_to_gi_str(acc.ephemeral_bytes), + unit="Gi", + ) + if acc.nvidia_gpu is not None and acc.nvidia_by_kind is not None: + result["nvidia.com/gpu"] = GpuResourceItem( + kind=_summary_gpu_kind(acc.nvidia_by_kind), + value=str(acc.nvidia_gpu), + ) + if acc.amd_gpu is not None and acc.amd_by_kind is not None: + result["amd.com/gpu"] = GpuResourceItem( + kind=_summary_gpu_kind(acc.amd_by_kind), + value=str(acc.amd_gpu), ) - if acc.nvidia_gpu is not None: - result["nvidia.com/gpu"] = ResourceItem(value=f"{acc.nvidia_gpu}", unit="count") - if acc.amd_gpu is not None: - result["amd.com/gpu"] = ResourceItem(value=f"{acc.amd_gpu}", unit="count") # Validate and dump with Pydantic return ResourceMap(result).model_dump() @@ -327,7 +422,6 @@ def resources( if scale != 1.0: console.print(f"Scaling by {scale * 100}%...") for _k, v in result.items(): - # Limit to Decimal precision to 3 decimal places v["value"] = str(Decimal(v["value"]) * Decimal(scale)) 
console.print(result, width=120) except Exception as e: From 94c6c51cecd3df4e0512ae091e2b17a57ae77146 Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Mon, 30 Mar 2026 07:40:42 -0700 Subject: [PATCH 09/11] feat(benchmark): improve Kubernetes client configuration handling and enhance error logging for pod and job outcome collection --- .../kueue/kueuer/src/kueuer/benchmarks/k8s.py | 73 +++++++-- .../kueuer/src/kueuer/benchmarks/plot.py | 133 ++++++++++++++- configs/kueue/kueuer/src/kueuer/utils/io.py | 3 + .../tests/test_k8s_phase2_resilience.py | 37 ++++- .../tests/test_plot_filters_and_semantics.py | 17 ++ configs/kueue/kueuer/tests/test_resources.py | 154 ++++++++++++++++++ .../kueuer/tests/test_spawn_mechanism_api.py | 1 - 7 files changed, 397 insertions(+), 21 deletions(-) create mode 100644 configs/kueue/kueuer/tests/test_resources.py diff --git a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py index f7260dfb..b75c9cdc 100644 --- a/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py +++ b/configs/kueue/kueuer/src/kueuer/benchmarks/k8s.py @@ -205,11 +205,38 @@ def summarize_pod_statuses(pods: List[Any]) -> Dict[str, int]: return summary +def _load_client_config_best_effort() -> bool: + """Try kubeconfig first, then in-cluster config.""" + try: + config.load_kube_config() + return True + except Exception: # noqa: BLE001 + try: + config.load_incluster_config() + return True + except Exception: # noqa: BLE001 + return False + + def collect_pod_outcomes(namespace: str, prefix: str) -> Dict[str, int]: """Collect pod outcome summary for jobs with the given name prefix.""" - config.load_kube_config() + empty = summarize_pod_statuses([]) + if not _load_client_config_best_effort(): + logger.warning("Unable to configure Kubernetes client while collecting pod outcomes.") + return empty + v1 = client.CoreV1Api() - pods = v1.list_namespaced_pod(namespace=namespace).items + try: + pod_list = 
v1.list_namespaced_pod(namespace=namespace) + pods = list(getattr(pod_list, "items", None) or []) + except Exception as error: # noqa: BLE001 + logger.warning( + "Unable to collect pod outcomes in namespace %s: %s", + namespace, + error, + ) + return empty + selected = [ pod for pod in pods @@ -220,20 +247,34 @@ def collect_pod_outcomes(namespace: str, prefix: str) -> Dict[str, int]: def collect_job_outcomes(namespace: str, prefix: str) -> Dict[str, int]: """Collect job outcome counters for jobs with the given name prefix.""" - config.load_kube_config() + empty: Dict[str, int] = { + "jobs_total": 0, + "jobs_succeeded": 0, + "jobs_failed": 0, + "jobs_active": 0, + } + if not _load_client_config_best_effort(): + logger.warning("Unable to configure Kubernetes client while collecting job outcomes.") + return empty + batch_v1 = client.BatchV1Api() - jobs = batch_v1.list_namespaced_job(namespace=namespace).items + try: + job_list = batch_v1.list_namespaced_job(namespace=namespace) + jobs = list(getattr(job_list, "items", None) or []) + except Exception as error: # noqa: BLE001 + logger.warning( + "Unable to collect job outcomes in namespace %s: %s", + namespace, + error, + ) + return empty + selected = [ job for job in jobs if (job.metadata and job.metadata.name and job.metadata.name.startswith(prefix)) ] - outcomes: Dict[str, int] = { - "jobs_total": len(selected), - "jobs_succeeded": 0, - "jobs_failed": 0, - "jobs_active": 0, - } + outcomes = {**empty, "jobs_total": len(selected)} for job in selected: outcomes["jobs_succeeded"] += int(getattr(job.status, "succeeded", 0) or 0) outcomes["jobs_failed"] += int(getattr(job.status, "failed", 0) or 0) @@ -244,9 +285,11 @@ def collect_job_outcomes(namespace: str, prefix: str) -> Dict[str, int]: def kueue_controller_restarts(namespace: str = "kueue-system") -> int: """Return aggregate restart count of kueue-system pods.""" try: - config.load_kube_config() + if not _load_client_config_best_effort(): + raise RuntimeError("unable 
to configure kubernetes client") v1 = client.CoreV1Api() - pods = v1.list_namespaced_pod(namespace=namespace).items + pod_list = v1.list_namespaced_pod(namespace=namespace) + pods = list(getattr(pod_list, "items", None) or []) total = 0 for pod in pods: for status in getattr(pod.status, "container_statuses", None) or []: @@ -288,10 +331,12 @@ async def apply( "apply_attempts": 0, "apply_retries": 0, "manifest_apply_seconds": 0.0, + "chunk_spawn_seconds": [], "last_error": "", "spawn_mechanism": "kubectl", } for start, end in chunk_ranges(count, chunk_size): + chunk_start = time() report["chunks_total"] += 1 chunk_jobs = end - start async with aiofiles.tempfile.NamedTemporaryFile( @@ -343,6 +388,7 @@ async def apply( await temp.close() await aiofiles.os.remove(str(temp.name)) logger.debug("Deleted %s", temp.name) + report["chunk_spawn_seconds"].append(time() - chunk_start) report["manifest_apply_seconds"] = time() - now logger.info("Took %ss to apply k8s manifest", report["manifest_apply_seconds"]) return report @@ -473,6 +519,7 @@ async def apply_api( "apply_attempts": 0, "apply_retries": 0, "manifest_apply_seconds": 0.0, + "chunk_spawn_seconds": [], "last_error": "", "spawn_mechanism": "api", "api_concurrency": api_concurrency, @@ -493,6 +540,7 @@ async def _guarded_create(manifest: Dict[Any, Any]) -> Tuple[bool, str]: return ok, err for start, end in chunk_ranges(count, chunk_size): + chunk_start = time() report["chunks_total"] += 1 manifests = render_job_manifests(data, prefix, start, end) results = await asyncio.gather(*[_guarded_create(m) for m in manifests]) @@ -504,6 +552,7 @@ async def _guarded_create(manifest: Dict[Any, Any]) -> Tuple[bool, str]: else: report["chunks_succeeded"] += 1 report["jobs_applied"] += len(results) + report["chunk_spawn_seconds"].append(time() - chunk_start) report["manifest_apply_seconds"] = time() - now logger.info("Took %ss to submit jobs via API", report["manifest_apply_seconds"]) diff --git 
a/configs/kueue/kueuer/src/kueuer/benchmarks/plot.py b/configs/kueue/kueuer/src/kueuer/benchmarks/plot.py index db1732c2..ceae2d1d 100644 --- a/configs/kueue/kueuer/src/kueuer/benchmarks/plot.py +++ b/configs/kueue/kueuer/src/kueuer/benchmarks/plot.py @@ -1,7 +1,9 @@ +import json from pathlib import Path -from typing import Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import matplotlib.pyplot as plt +import numpy as np import pandas as pd import seaborn as sns import typer @@ -69,6 +71,28 @@ def _finalize_plot( plt.close(fig) +def _parse_chunk_spawn_seconds(raw: Any) -> List[float]: + """Parse submission_chunk_spawn_seconds from CSV (JSON string or list).""" + if raw is None: + return [] + if isinstance(raw, float) and pd.isna(raw): + return [] + if isinstance(raw, list): + return [float(x) for x in raw] + if isinstance(raw, str): + text = raw.strip() + if not text: + return [] + try: + parsed = json.loads(text) + except json.JSONDecodeError: + return [] + if isinstance(parsed, list): + return [float(x) for x in parsed] + return [] + return [] + + def _annotate_empty(ax: plt.Axes, message: str) -> None: ax.text( 0.5, @@ -93,14 +117,49 @@ def _style_axis(ax: plt.Axes, title: str, ylabel: str) -> None: ax.spines[spine].set_visible(False) +def _maybe_rotate_job_count_xlabels(ax: plt.Axes, n_distinct: int) -> None: + """Tilt labels when many distinct job counts would overlap on the X axis.""" + if n_distinct > 12: + ax.tick_params(axis="x", labelrotation=42, labelsize=9) + for lbl in ax.get_xticklabels(): + lbl.set_horizontalalignment("right") + elif n_distinct > 8: + ax.tick_params(axis="x", labelrotation=22, labelsize=10) + for lbl in ax.get_xticklabels(): + lbl.set_horizontalalignment("right") + + +def _format_job_count_k(value: float, _pos: int | None = None) -> str: + """Compact linear-scale labels: 500, 1k, 2k, 3.5k, 10k.""" + if not np.isfinite(value): + return "" + v = float(value) + if abs(v) >= 1000: + k = v / 1000.0 + if abs(k - 
round(k)) < 1e-9: + return f"{int(round(k))}k" + text = f"{k:.2f}".rstrip("0").rstrip(".") + return f"{text}k" + return f"{int(round(v))}" + + def _job_count_ticks(ax: plt.Axes, values: pd.Series) -> None: + """Linear X axis for job counts: bounded tick count and compact k-style labels.""" unique = sorted({int(value) for value in values.dropna().tolist()}) if not unique: return - if len(unique) >= 4 and max(unique) / min(unique) >= 4: - ax.set_xscale("log") - ax.set_xticks(unique) - ax.xaxis.set_major_formatter(ticker.StrMethodFormatter("{x:,.0f}")) + lo_f, hi_f = float(min(unique)), float(max(unique)) + span = hi_f - lo_f if hi_f > lo_f else max(hi_f * 0.05, 1.0) + pad = max(span * 0.02, 1.0) + ax.set_xlim(lo_f - pad, hi_f + pad) + + ax.set_xscale("linear") + ax.xaxis.set_major_locator( + ticker.MaxNLocator(nbins=7, min_n_ticks=4, integer=True, prune="both") + ) + ax.xaxis.set_major_formatter(ticker.FuncFormatter(_format_job_count_k)) + + _maybe_rotate_job_count_xlabels(ax, len(unique)) def _summary_by_count(df: pd.DataFrame, metric: str) -> pd.DataFrame: @@ -232,7 +291,8 @@ def _plot_performance_overview( output_dir: Optional[str], show: bool, ) -> None: - fig, axes = plt.subplots(2, 2, figsize=(15, 10), sharex=False) + n_counts = len({int(x) for x in df["job_count"].dropna().unique().tolist()}) + fig, axes = plt.subplots(2, 2, figsize=(16, 10.2), sharex=False) _overview_panel( axes[0, 0], df, @@ -278,9 +338,69 @@ def _plot_performance_overview( fontweight="bold", ) fig.tight_layout() + if n_counts > 8: + fig.subplots_adjust(bottom=0.12, hspace=0.28, wspace=0.22) _finalize_plot(fig, "performance_overview.png", output_dir, show) +def _plot_spawn_time_by_job_count( + df: pd.DataFrame, + output_dir: Optional[str], + show: bool, +) -> None: + """Single chart: job count on X, spawn time on Y, grouped bars for Direct vs Kueue.""" + col_total = "submission_manifest_apply_seconds" + if col_total not in df.columns: + return + + counts = sorted({int(x) for x in 
df["job_count"].dropna().unique().tolist()}) + if not counts: + return + + fig_w = max(9.0, 0.85 * len(counts) + 5.0) + fig, ax = plt.subplots(figsize=(fig_w, 5.2)) + x = np.arange(len(counts), dtype=float) + width = min(0.36, 0.8 / 2.2) + + for use_kueue in (False, True): + heights: List[float] = [] + for jc in counts: + sub = df[(df["job_count"] == jc) & (df["use_kueue"] == use_kueue)][col_total].dropna() + if sub.empty: + heights.append(float("nan")) + else: + heights.append(float(sub.astype(float).median())) + offset = -width / 2 if use_kueue is False else width / 2 + ax.bar( + x + offset, + heights, + width, + label=MODE_LABELS[use_kueue], + color=PALETTE[use_kueue], + edgecolor="white", + linewidth=0.6, + zorder=2, + ) + + ax.set_xticks(x) + ax.set_xticklabels([_format_job_count_k(float(c)) for c in counts]) + _maybe_rotate_job_count_xlabels(ax, len(counts)) + _style_axis( + ax, + title="Spawn time by requested job count", + ylabel="Time to spawn all jobs (s)", + ) + ax.set_xlabel("Requested job count") + ax.legend(loc="upper left", frameon=True) + ax.grid(axis="y", alpha=0.45) + ax.grid(axis="x", alpha=0.12) + for spine in ("top", "right"): + ax.spines[spine].set_visible(False) + fig.tight_layout() + fig.subplots_adjust(bottom=0.22 if len(counts) > 8 else 0.14) + _finalize_plot(fig, "spawn_time_by_job_count.png", output_dir, show) + + def _plot_eviction_pressure( df: pd.DataFrame, output_dir: Optional[str], @@ -543,6 +663,7 @@ def render_performance_plots( comparative_df = compute_comparative_metrics(df) _plot_performance_overview(df, output_dir, show) + _plot_spawn_time_by_job_count(df, output_dir, show) _plot_metric_trend( df, "throughput", diff --git a/configs/kueue/kueuer/src/kueuer/utils/io.py b/configs/kueue/kueuer/src/kueuer/utils/io.py index 1833c0c0..9e9192dd 100644 --- a/configs/kueue/kueuer/src/kueuer/utils/io.py +++ b/configs/kueue/kueuer/src/kueuer/utils/io.py @@ -1,6 +1,7 @@ """Input/Output utilities for reading and writing files.""" import csv 
+import json from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Set @@ -51,6 +52,8 @@ def save_performance_to_csv(results: List[Dict[str, Any]], filename: str) -> Non for key, value in result.items(): if isinstance(value, datetime): row_data[key] = value.isoformat() + elif isinstance(value, (list, dict)): + row_data[key] = json.dumps(value) else: row_data[key] = value writer.writerow(row_data) # type: ignore diff --git a/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py b/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py index 064a9268..a8df7684 100644 --- a/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py +++ b/configs/kueue/kueuer/tests/test_k8s_phase2_resilience.py @@ -56,12 +56,45 @@ def list_namespaced_pod(self, namespace): assert k8s.kueue_controller_restarts() == 0 +def test_collect_pod_outcomes_handles_malformed_pod_list(monkeypatch) -> None: + monkeypatch.setattr(k8s.config, "load_kube_config", lambda: None) + + class FakeCoreV1Api: + def list_namespaced_pod(self, namespace): + raise ValueError("Invalid value for `items`, must not be `None`") + + monkeypatch.setattr(k8s.client, "CoreV1Api", FakeCoreV1Api) + + summary = k8s.collect_pod_outcomes(namespace="default", prefix="bench") + assert summary["pods_total"] == 0 + assert summary["pods_failed"] == 0 + assert summary["pods_oomkilled"] == 0 + + +def test_collect_job_outcomes_handles_malformed_job_list(monkeypatch) -> None: + monkeypatch.setattr(k8s.config, "load_kube_config", lambda: None) + + class FakeBatchV1Api: + def list_namespaced_job(self, namespace): + raise ValueError("Invalid value for `items`, must not be `None`") + + monkeypatch.setattr(k8s.client, "BatchV1Api", FakeBatchV1Api) + + outcomes = k8s.collect_job_outcomes(namespace="default", prefix="bench") + assert outcomes == { + "jobs_total": 0, + "jobs_succeeded": 0, + "jobs_failed": 0, + "jobs_active": 0, + } + + def test_stress_vm_bytes_mb_uses_safer_default_fraction() -> None: - 
assert k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION == 0.4 + assert k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION == 0.33 assert k8s.stress_vm_bytes_mb( ram_gb=1.0, vm_memory_fraction=k8s.DEFAULT_STRESS_VM_MEMORY_FRACTION, - ) == pytest.approx(409.6) + ) == pytest.approx(337.92) def test_stress_vm_bytes_mb_validates_fraction_bounds() -> None: diff --git a/configs/kueue/kueuer/tests/test_plot_filters_and_semantics.py b/configs/kueue/kueuer/tests/test_plot_filters_and_semantics.py index 3fa6eb07..c7ae99f8 100644 --- a/configs/kueue/kueuer/tests/test_plot_filters_and_semantics.py +++ b/configs/kueue/kueuer/tests/test_plot_filters_and_semantics.py @@ -24,6 +24,20 @@ def test_compute_latency_adds_turnaround_columns() -> None: assert out["completion_latency"].iloc[0] == 5.0 +def test_format_job_count_k_compact_labels() -> None: + assert plot_module._format_job_count_k(500) == "500" + assert plot_module._format_job_count_k(1000) == "1k" + assert plot_module._format_job_count_k(3500) == "3.5k" + assert plot_module._format_job_count_k(10000) == "10k" + + +def test_parse_chunk_spawn_seconds_accepts_json_and_list() -> None: + assert plot_module._parse_chunk_spawn_seconds("[1.0, 2.5]") == [1.0, 2.5] + assert plot_module._parse_chunk_spawn_seconds([3.0, 4.0]) == [3.0, 4.0] + assert plot_module._parse_chunk_spawn_seconds("") == [] + assert plot_module._parse_chunk_spawn_seconds(None) == [] + + def test_compute_completion_ratio_defaults_to_one_without_tracking_columns() -> None: df = pd.DataFrame( { @@ -52,6 +66,8 @@ def test_performance_command_writes_plot_files(tmp_path, monkeypatch) -> None: "median_time_from_creation_completion": [3.0, 2.5], "std_dev_time_from_creation_completion": [0.2, 0.3], "job_duration": [1, 1], + "submission_manifest_apply_seconds": [1.5, 1.7], + "submission_chunk_spawn_seconds": ["[0.7, 0.8]", "[0.85, 0.85]"], } ).to_csv(csv_path, index=False) @@ -64,6 +80,7 @@ def test_performance_command_writes_plot_files(tmp_path, monkeypatch) -> None: expected = [ 
"performance_overview.png", + "spawn_time_by_job_count.png", "throughput_by_job_count.png", "completion_ratio_by_job_count.png", "tail_turnaround_by_job_count.png", diff --git a/configs/kueue/kueuer/tests/test_resources.py b/configs/kueue/kueuer/tests/test_resources.py new file mode 100644 index 00000000..9c36be1c --- /dev/null +++ b/configs/kueue/kueuer/tests/test_resources.py @@ -0,0 +1,154 @@ +"""Tests for cluster resource aggregation.""" + +from __future__ import annotations + +from kubernetes.client import V1Node, V1NodeStatus, V1ObjectMeta + +from kueuer.resources import _bytes_to_gi_str, total + + +def _node( + name: str, + capacity: dict[str, str], + labels: dict[str, str] | None = None, + uid: str | None = None, +) -> V1Node: + return V1Node( + metadata=V1ObjectMeta(name=name, uid=uid or f"uid-{name}", labels=labels), + status=V1NodeStatus(capacity=capacity, allocatable=capacity), + ) + + +def test_bytes_to_gi_str_examples() -> None: + gi = 1024**3 + assert _bytes_to_gi_str(20550 * gi) == "20550" + assert _bytes_to_gi_str(1073741824 // 2) == "0.5" + assert _bytes_to_gi_str(gi) == "1" + + +def test_total_memory_ephemeral_gi(monkeypatch) -> None: + nodes = [ + _node( + "n1", + {"cpu": "8", "memory": "16Gi", "ephemeral-storage": "100Gi"}, + ), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(None, field="capacity") + assert out["memory"] == {"value": "16", "unit": "Gi"} + assert out["ephemeral-storage"] == {"value": "100", "unit": "Gi"} + + +def test_total_nvidia_gpu_single_model(monkeypatch) -> None: + nodes = [ + _node( + "g1", + {"nvidia.com/gpu": "4"}, + {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"}, + ), + _node( + "g2", + {"nvidia.com/gpu": "8"}, + {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"}, + ), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return 
R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(None, field="capacity") + assert out["nvidia.com/gpu"] == { + "kind": "NVIDIA-A100-SXM4-40GB", + "value": "12", + "unit": "count", + } + + +def test_total_nvidia_gpu_mixed_kind(monkeypatch) -> None: + nodes = [ + _node("a", {"nvidia.com/gpu": "2"}, {"nvidia.com/gpu.product": "NVIDIA-T4"}), + _node("b", {"nvidia.com/gpu": "4"}, {"nvidia.com/gpu.product": "NVIDIA-A100"}), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(None, field="capacity") + assert out["nvidia.com/gpu"] == { + "kind": "mixed", + "value": "6", + "unit": "count", + } + + +def test_total_amd_gpu(monkeypatch) -> None: + nodes = [ + _node( + "w1", + {"amd.com/gpu": "8"}, + {"amd.com/gpu.product": "AMD-INSTINCT-MI250X"}, + ), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(None, field="capacity") + assert out["amd.com/gpu"] == { + "kind": "AMD-INSTINCT-MI250X", + "value": "8", + "unit": "count", + } + + +def test_total_nvidia_unknown_kind(monkeypatch) -> None: + nodes = [_node("x", {"nvidia.com/gpu": "3"}, {})] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(None, field="capacity") + assert out["nvidia.com/gpu"] == {"kind": "", "value": "3", "unit": "count"} diff --git a/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py b/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py index 0ef05a8c..8b1fe0a0 100644 --- a/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py 
+++ b/configs/kueue/kueuer/tests/test_spawn_mechanism_api.py @@ -134,4 +134,3 @@ def create_namespaced_job(self, namespace, body): assert ok is True assert err == "" assert attempts["count"] == 2 - From abd850b3860013016fc95f6643c9a446db479b9c Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Thu, 9 Apr 2026 14:08:12 -0700 Subject: [PATCH 10/11] feat(kueuer): added command kr cluster resources to provide node label scoped values and also provide weights scaled to vCPUs --- configs/kueue/kueuer/src/kueuer/resources.py | 451 +++++++++++++++---- configs/kueue/kueuer/tests/test_resources.py | 290 ++++++++++-- 2 files changed, 623 insertions(+), 118 deletions(-) diff --git a/configs/kueue/kueuer/src/kueuer/resources.py b/configs/kueue/kueuer/src/kueuer/resources.py index de4a78a0..b0ea73d5 100644 --- a/configs/kueue/kueuer/src/kueuer/resources.py +++ b/configs/kueue/kueuer/src/kueuer/resources.py @@ -13,11 +13,20 @@ - Deduplicates nodes by UID (so overlapping regex lists don't double count). - By default totals from node .status.capacity; use --field allocatable to sum .status.allocatable instead. -- Returns a mapping: ``cpu`` uses ``{ "value", "unit": "cores" }``; ``memory`` and - ``ephemeral-storage`` use values in **Gi** (1024³ bytes) with ``unit: "Gi"``. -- For ``nvidia.com/gpu`` and ``amd.com/gpu``: ``{ "kind", "value", "unit": "count" }`` - where ``kind`` comes from node labels (e.g. ``nvidia.com/gpu.product``). -- If a resource does not exist on any matched node, it is **omitted**. +- Results are grouped by a configurable node label (see CLI ``--node-label-key``; + ``total()`` requires ``node_label_key`` with no default in code). Nodes without + the label are grouped under ``""``. Each group has ``count`` (nodes in group), + ``cpu``, ``memory``, ``ephemeral-storage`` (binary **GiB**, 1024³; values up to + 3 decimal places), per-bucket **weights** (same 3 decimal places; pool CPU + cores per GiB / per GPU kind—see ``ResourceWeights``), and GPU lists. 
+- ``nvidia.com/gpu`` is a list of ``{ "kind", "value", "unit": "count" }`` per + distinct ``nvidia.com/gpu.product`` label, summed across nodes. When + capacity/allocatable reports 0 or omits ``nvidia.com/gpu`` but the NVIDIA + Device Plugin exposes counts on labels (e.g. ``nvidia.com/gpu.count``), those + label values are used with ``kind`` from ``nvidia.com/gpu.product``. +- ``amd.com/gpu`` uses the same list shape, summed by ``amd.com/gpu.product``. +- If a resource does not exist for nodes in a group, it is **omitted** for that + group. Examples: uv run resources.py @@ -30,13 +39,13 @@ import re import sys from dataclasses import dataclass -from decimal import Decimal, getcontext -from typing import Annotated, Any, Dict, Iterable, List, Optional, Sequence, Union, cast +from decimal import ROUND_HALF_UP, Decimal, getcontext, localcontext +from typing import Annotated, Any, Dict, Iterable, List, Optional, Sequence, cast import typer from kubernetes.client import CoreV1Api, V1Node from kubernetes.utils.quantity import parse_quantity -from pydantic import BaseModel, Field, RootModel, ValidationError, field_validator +from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator from rich.console import Console from typing_extensions import Literal @@ -47,6 +56,12 @@ # Rationale: See DECIMAL_PRECISION in utils/constants.py getcontext().prec = DECIMAL_PRECISION +# Reported fractional precision for CPU, GiB display quantities, and weight ratios. +REPORT_MAX_DECIMAL_PLACES = 3 + +# Intermediate precision for weight ratio division before rounding to ``REPORT_MAX_DECIMAL_PLACES``. +_WEIGHT_RATIO_DIV_PREC = max(80, DECIMAL_PRECISION) + app = typer.Typer(help="Cluster utilities") # ========================= @@ -56,29 +71,93 @@ class ResourceItem(BaseModel): value: str = Field( - ..., description="Numeric value as a string, max precision retained." 
+ ..., + description="Numeric string with at most three fractional decimal places.", ) unit: str = Field( - ..., description="Unit for the value, e.g., 'cores', 'bytes', 'count'." + ..., + description="Binary GiB for memory/ephemeral totals, 'cores', or 'count'.", ) class GpuResourceItem(BaseModel): - """Cluster totals for a GPU resource.""" + """Per-model GPU totals within a node-type group.""" kind: str = Field( ..., + description="Product name from node labels (e.g. nvidia.com/gpu.product).", + ) + value: str = Field(..., description="Total GPU count for this kind.") + unit: Literal["count"] = "count" + + +class ResourceWeights(BaseModel): + """Pool-level ratios vs CPU cores (dimensionless); see module docstring for interpretation.""" + + model_config = ConfigDict(populate_by_name=True) + + cpu: str = Field( + default="1", + description="Baseline; other weights are pool CPU per unit of that resource.", + ) + memory: Optional[str] = Field( + None, + description="Pool CPU cores divided by total memory in binary GiB.", + ) + ephemeral_storage: Optional[str] = Field( + None, + serialization_alias="ephemeral-storage", + description="Pool CPU cores divided by total ephemeral storage in binary GiB.", + ) + nvidia_gpu: Optional[Dict[str, str]] = Field( + None, + serialization_alias="nvidia.com/gpu", + description="Per GPU product: pool CPU cores divided by count of that kind.", + ) + + +class NodeTypeResources(BaseModel): + """Resource totals for one value of the grouping node label.""" + + model_config = ConfigDict(populate_by_name=True) + + count: int = Field( + ..., + ge=0, + description="Number of nodes in this group (unique nodes after pattern filter).", + ) + cpu: Optional[ResourceItem] = None + memory: Optional[ResourceItem] = None + ephemeral_storage: Optional[ResourceItem] = Field( + default=None, + serialization_alias="ephemeral-storage", + ) + nvidia_gpu: Optional[List[GpuResourceItem]] = Field( + default=None, + serialization_alias="nvidia.com/gpu", + ) + 
amd_gpu: Optional[List[GpuResourceItem]] = Field( + default=None, + serialization_alias="amd.com/gpu", + ) + weights: Optional[ResourceWeights] = Field( + None, description=( - "Product name if the cluster uses a single model; empty if unknown; " - "'mixed' if multiple models." + "CPU-normalized pool composition weights (decimal strings, same precision " + "as other reported quantities). " + "Omitted if the pool has no CPU total to divide by." ), ) - value: str = Field(..., description="Total GPU count.") - unit: Literal["count"] = "count" -class ResourceMap(RootModel[Dict[str, Union[ResourceItem, GpuResourceItem]]]): - """Dynamic resource map so unavailable resources can be omitted.""" +class ClusterResourcesResult(BaseModel): + """Cluster resources grouped by ``node_label_key`` label values.""" + + node_label_key: str = Field( + ..., + description="Kubernetes node label key used to form each group.", + ) + by_label_value: Dict[str, NodeTypeResources] class Settings(BaseModel): @@ -107,8 +186,6 @@ class TotalsAcc: cpu_cores: Optional[Decimal] # None -> omit key memory_bytes: Optional[int] ephemeral_bytes: Optional[int] - nvidia_gpu: Optional[int] - amd_gpu: Optional[int] nvidia_by_kind: Optional[Dict[str, int]] amd_by_kind: Optional[Dict[str, int]] @@ -175,29 +252,134 @@ def _amd_gpu_kind_from_labels(labels: Dict[str, str]) -> str: return "" -def _summary_gpu_kind(by_kind: Dict[str, int]) -> str: - """Single model name, empty if unknown, or 'mixed' if multiple distinct models.""" - active = {k: v for k, v in by_kind.items() if v > 0} - if not active: - return "" - distinct_nonempty = {k for k in active if k} - if not distinct_nonempty: - return "" - if len(distinct_nonempty) == 1: - return next(iter(distinct_nonempty)) - return "mixed" - - -def _bytes_to_gi_str(total_bytes: int) -> str: - """Convert a byte total to a decimal string in Gi (1 Gi = 1024³ bytes).""" - # Use integer 1024**3 so the divisor is exact (Decimal(1024)**3 can round). 
- gi = Decimal(total_bytes) / Decimal(1024**3) - s = format(gi, "f") +def _node_nvidia_gpu_contrib( + labels: Dict[str, str], m: Dict[str, str] +) -> Optional[tuple[int, str]]: + """ + NVIDIA GPUs advertised for this node: count and product kind. + + Prefer ``.status.capacity``/``allocatable`` when ``nvidia.com/gpu`` is + positive. If it is zero or absent, fall back to ``nvidia.com/gpu.count`` so + MIG / device-plugin-only reporting still aggregates correctly. + """ + kind = _nvidia_gpu_kind_from_labels(labels) + cap_s = m.get("nvidia.com/gpu") + cap_n = int(parse_quantity(cap_s)) if cap_s else 0 + lc_raw = labels.get("nvidia.com/gpu.count") + label_n: Optional[int] = None + if lc_raw is not None and str(lc_raw).strip() != "": + label_n = int(parse_quantity(str(lc_raw).strip())) + + if cap_n > 0: + return (cap_n, kind) + if label_n is not None and label_n > 0: + return (label_n, kind) + return None + + +def _gpu_kind_totals_to_list(by_kind: Optional[Dict[str, int]]) -> Optional[List[GpuResourceItem]]: + """Convert per-kind counts to a stable list for JSON output.""" + if not by_kind: + return None + items = [ + GpuResourceItem(kind=k, value=str(v)) + for k, v in sorted(by_kind.items(), key=lambda kv: (kv[0] == "", kv[0])) + if v > 0 + ] + return items or None + + +def _format_decimal_report(value: Decimal) -> str: + """Stringify a non-negative Decimal with at most ``REPORT_MAX_DECIMAL_PLACES`` places.""" + if value < 0: + raise ValueError("value must be non-negative") + q = Decimal("1").scaleb(-REPORT_MAX_DECIMAL_PLACES) + rounded = value.quantize(q, rounding=ROUND_HALF_UP) + s = format(rounded, "f") if "." in s: s = s.rstrip("0").rstrip(".") return s +# Binary gibibyte (Kubernetes-style): 1 GiB = 1024³ bytes. 
+_GIB_BYTES = Decimal(1024**3) + + +def _bytes_to_binary_gib_decimal(total_bytes: int) -> Decimal: + """Convert byte totals to binary GiB (full ``Decimal``, unrounded).""" + if total_bytes < 0: + raise ValueError("byte total must be non-negative") + return Decimal(total_bytes) / _GIB_BYTES + + +def _gib_resource_item(total_bytes: int) -> ResourceItem: + """Memory / ephemeral totals: always reported in GiB with limited display precision.""" + if total_bytes == 0: + return ResourceItem(value="0", unit="GiB") + v = _bytes_to_binary_gib_decimal(total_bytes) + return ResourceItem(value=_format_decimal_report(v), unit="GiB") + + +def _gib_display_to_bytes(value: Decimal) -> Decimal: + """Interpret a displayed GiB quantity as bytes.""" + return value * _GIB_BYTES + + +def _decimal_ratio_string(numerator: Decimal, denominator: Decimal) -> str: + """``numerator / denominator`` rounded to ``REPORT_MAX_DECIMAL_PLACES`` (half-up).""" + if denominator <= 0: + raise ValueError("denominator must be positive") + with localcontext() as ctx: + ctx.prec = _WEIGHT_RATIO_DIV_PREC + ratio = numerator / denominator + return _format_decimal_report(ratio) + + +def _compute_resource_weights(acc: TotalsAcc) -> Optional[ResourceWeights]: + """ + Weights normalize pool totals to a per-CPU baseline: ``cpu`` is 1; other + fields are ``TOTAL_CPU / TOTAL_QUANTITY`` in compatible units (GiB for + memory and ephemeral; per-GPU-kind counts for NVIDIA). + + **Interpretation (heuristic):** For a node pool with totals ``(C, M, E, …)``, + weights map ``(c, m, e, …)`` requests to a linear ``c·1 + m·w_mem + …`` style + score *if* you treat the pool's aggregate ratio as a fixed substitution rate + between CPU and other resources. That is a **comparative** normalization, not a + guarantee of schedulability, pricing, or optimal packing—heterogeneous nodes, + fragmentation, and priorities are not captured. 
+ """ + cpu = acc.cpu_cores + if cpu is None or cpu <= 0: + return None + + mem_w: Optional[str] = None + if acc.memory_bytes is not None and acc.memory_bytes > 0: + mem_w = _decimal_ratio_string(cpu, _bytes_to_binary_gib_decimal(acc.memory_bytes)) + + eph_w: Optional[str] = None + if acc.ephemeral_bytes is not None and acc.ephemeral_bytes > 0: + eph_w = _decimal_ratio_string(cpu, _bytes_to_binary_gib_decimal(acc.ephemeral_bytes)) + + nv_map: Optional[Dict[str, str]] = None + if acc.nvidia_by_kind: + entries: Dict[str, str] = {} + for kind, cnt in sorted( + acc.nvidia_by_kind.items(), + key=lambda kv: (kv[0] == "", kv[0]), + ): + if cnt <= 0: + continue + entries[kind] = _decimal_ratio_string(cpu, Decimal(cnt)) + nv_map = entries or None + + return ResourceWeights( + cpu="1", + memory=mem_w, + ephemeral_storage=eph_w, + nvidia_gpu=nv_map, + ) + + def _get_field_map(node: V1Node, field: str) -> Dict[str, str]: """ Extract either .status.capacity or .status.allocatable as a plain dict[str, str]. 
@@ -257,28 +439,25 @@ def _sum_resources(nodes: List[V1Node], field: str) -> TotalsAcc: cpu_vals: List[str] = [] mem_vals: List[str] = [] eph_vals: List[str] = [] - nvidia_vals: List[str] = [] amd_vals: List[str] = [] nvidia_by_kind: Dict[str, int] = {} amd_by_kind: Dict[str, int] = {} for n in nodes: m = _get_field_map(n, field) + labels = (n.metadata.labels or {}) if n.metadata else {} + nvidia_c = _node_nvidia_gpu_contrib(labels, m) + if nvidia_c is not None: + cnt, nk = nvidia_c + nvidia_by_kind[nk] = nvidia_by_kind.get(nk, 0) + cnt if not m: continue - labels = (n.metadata.labels or {}) if n.metadata else {} if "cpu" in m: cpu_vals.append(m["cpu"]) if "memory" in m: mem_vals.append(m["memory"]) if "ephemeral-storage" in m: eph_vals.append(m["ephemeral-storage"]) - if "nvidia.com/gpu" in m: - nvidia_vals.append(m["nvidia.com/gpu"]) - nk = _nvidia_gpu_kind_from_labels(labels) - nvidia_by_kind[nk] = nvidia_by_kind.get(nk, 0) + int( - parse_quantity(m["nvidia.com/gpu"]) - ) if "amd.com/gpu" in m: amd_vals.append(m["amd.com/gpu"]) ak = _amd_gpu_kind_from_labels(labels) @@ -296,27 +475,55 @@ def _try_sum(dec_sum_fn, vals): cpu_total = _try_sum(_sum_quantity, cpu_vals) mem_total = _try_sum(_sum_quantity, mem_vals) eph_total = _try_sum(_sum_quantity, eph_vals) - nvidia_total = _try_sum(_sum_int_quantity, nvidia_vals) amd_total = _try_sum(_sum_int_quantity, amd_vals) return TotalsAcc( cpu_cores=cpu_total, memory_bytes=int(mem_total) if mem_total is not None else None, ephemeral_bytes=int(eph_total) if eph_total is not None else None, - nvidia_gpu=nvidia_total, - amd_gpu=amd_total, - nvidia_by_kind=nvidia_by_kind if nvidia_total is not None else None, + nvidia_by_kind=nvidia_by_kind if nvidia_by_kind else None, amd_by_kind=amd_by_kind if amd_total is not None else None, ) +def _totals_acc_to_node_type_resources(acc: TotalsAcc, node_count: int) -> NodeTypeResources: + """Build one NodeTypeResources from aggregated totals.""" + return NodeTypeResources( + 
count=node_count, + cpu=( + ResourceItem( + value=_format_decimal_report(acc.cpu_cores), + unit="cores", + ) + if acc.cpu_cores is not None + else None + ), + memory=( + _gib_resource_item(acc.memory_bytes) + if acc.memory_bytes is not None + else None + ), + ephemeral_storage=( + _gib_resource_item(acc.ephemeral_bytes) + if acc.ephemeral_bytes is not None + else None + ), + nvidia_gpu=_gpu_kind_totals_to_list(acc.nvidia_by_kind), + amd_gpu=_gpu_kind_totals_to_list(acc.amd_by_kind), + weights=_compute_resource_weights(acc), + ) + + # ========================= # Public API # ========================= def total( - patterns: Optional[List[str]] = None, field: str = "capacity" + patterns: Optional[List[str]] = None, + field: str = "capacity", + *, + node_label_key: str, ) -> Dict[str, Any]: """ Calculate total cluster resources across nodes matching regex patterns. @@ -324,12 +531,17 @@ def total( Args: patterns: Regex strings for node names. If None or empty, includes all nodes. field: Which field to sum: "capacity" (default) or "allocatable". + node_label_key: Kubernetes node label key used to group results (callers + such as the CLI supply the default; this function does not default it). Returns: - Mapping of resource name to detail dicts. Memory and ephemeral-storage use - Gi and ``unit`` ``\"Gi\"``. GPU entries include ``kind``, ``value``, and - ``unit`` ``\"count\"``. Only includes resources present on at least one node. + A dict with ``node_label_key``, ``by_label_value`` (each key is a label + value, or ``\"\"`` if unset), and per-group ``count`` plus resource maps. 
""" + label_key = node_label_key.strip() + if not label_key: + raise ValueError("node_label_key must be a non-empty string") + # Validate inputs with Pydantic if field not in ("capacity", "allocatable"): raise ValueError('field must be "capacity" or "allocatable"') @@ -343,36 +555,74 @@ def total( v1 = _load_kube() nodes = _collect_nodes(v1, cfg.patterns) - acc = _sum_resources(nodes, cfg.field) + by_nt: Dict[str, List[V1Node]] = {} + for n in nodes: + labels = (n.metadata.labels or {}) if n.metadata else {} + raw_nt = labels.get(label_key) + nt_key = "" if raw_nt is None else str(raw_nt) + by_nt.setdefault(nt_key, []).append(n) + + groups: Dict[str, NodeTypeResources] = {} + for nt_key in sorted(by_nt.keys(), key=lambda s: (s == "", s)): + bucket = by_nt[nt_key] + acc = _sum_resources(bucket, cfg.field) + groups[nt_key] = _totals_acc_to_node_type_resources(acc, len(bucket)) + + return ClusterResourcesResult( + node_label_key=label_key, + by_label_value=groups, + ).model_dump( + by_alias=True, + exclude_none=True, + ) - # Build a dynamic map (omit unavailable resources) - result: Dict[str, Union[ResourceItem, GpuResourceItem]] = {} - if acc.cpu_cores is not None: - result["cpu"] = ResourceItem(value=f"{acc.cpu_cores}", unit="cores") - if acc.memory_bytes is not None: - result["memory"] = ResourceItem( - value=_bytes_to_gi_str(acc.memory_bytes), - unit="Gi", - ) - if acc.ephemeral_bytes is not None: - result["ephemeral-storage"] = ResourceItem( - value=_bytes_to_gi_str(acc.ephemeral_bytes), - unit="Gi", - ) - if acc.nvidia_gpu is not None and acc.nvidia_by_kind is not None: - result["nvidia.com/gpu"] = GpuResourceItem( - kind=_summary_gpu_kind(acc.nvidia_by_kind), - value=str(acc.nvidia_gpu), - ) - if acc.amd_gpu is not None and acc.amd_by_kind is not None: - result["amd.com/gpu"] = GpuResourceItem( - kind=_summary_gpu_kind(acc.amd_by_kind), - value=str(acc.amd_gpu), - ) - - # Validate and dump with Pydantic - return ResourceMap(result).model_dump() +def 
_scale_resource_item_inplace(item: Dict[str, Any], scale: Decimal) -> None: + """Apply ``--scale`` to one resource dict with ``value`` and ``unit``.""" + unit = str(item.get("unit", "")) + v = Decimal(str(item["value"])) + if unit == "cores": + item["value"] = _format_decimal_report(v * scale) + elif unit == "GiB": + scaled_bytes = _gib_display_to_bytes(v) * scale + int_bytes = max(0, int(scaled_bytes.to_integral_value(rounding=ROUND_HALF_UP))) + out = _gib_resource_item(int_bytes) + item["value"] = out.value + item["unit"] = out.unit + elif unit == "count": + item["value"] = _format_decimal_report(v * scale) + else: + item["value"] = _format_decimal_report(v * scale) + + +def _scale_cluster_resources_payload(result: Dict[str, Any], scale: Decimal) -> None: + """Multiply numeric ``value`` fields in-place (CLI ``--scale``). Leaves ``weights`` unchanged.""" + inner = result.get("by_label_value") + if not isinstance(inner, dict): + return + for block in inner.values(): + if not isinstance(block, dict): + continue + for res_key in ("cpu", "memory", "ephemeral-storage"): + item = block.get(res_key) + if isinstance(item, dict) and "value" in item and "unit" in item: + _scale_resource_item_inplace(item, scale) + for gpu_key in ("nvidia.com/gpu", "amd.com/gpu"): + lst = block.get(gpu_key) + if isinstance(lst, list): + for g in lst: + if isinstance(g, dict) and "value" in g and "unit" in g: + _scale_resource_item_inplace(g, scale) + + +def list_resource_quotas(namespace: str) -> Dict[str, Any]: + """List ResourceQuota objects in a namespace via the Kubernetes Python client.""" + k8s = get_k8s_config() + quota_list = k8s.core_v1.list_namespaced_resource_quota(namespace=namespace) + payload = k8s.api_client.sanitize_for_serialization(quota_list) + if isinstance(payload, dict): + return payload + return {"items": payload or []} # ========================= @@ -410,6 +660,16 @@ def resources( help="Scale resources by this percentage.", ), ] = 1.0, + node_label_key: Annotated[ + 
str, + typer.Option( + "--node-label-key", + help=( + "Node label key used to group totals by label value " + '(default only applies to this CLI, not to total()).' + ), + ), + ] = "skaha.opencadc.org/node-type", ): """ Sum resources across nodes matching any of the provided regex patterns. @@ -417,17 +677,40 @@ def resources( assert field in ["capacity", "allocatable"] assert scale > 0.0 and scale <= 1.0, "Percentage must be in (0, 1]" try: - result = total(patterns or None, field=field) + result = total( + patterns or None, + field=field, + node_label_key=node_label_key, + ) console.print(result, width=120) if scale != 1.0: console.print(f"Scaling by {scale * 100}%...") - for _k, v in result.items(): - v["value"] = str(Decimal(v["value"]) * Decimal(scale)) + _scale_cluster_resources_payload(result, Decimal(str(scale))) console.print(result, width=120) except Exception as e: print(f"Error: {e}", file=sys.stderr) raise SystemExit(1) +@app.command("resourcequota") +def resourcequota( + namespace: Annotated[ + str, + typer.Option( + "-n", + "--namespace", + help="Namespace to query for ResourceQuota objects.", + ), + ], +): + """List namespace ResourceQuota objects using the Kubernetes Python client.""" + try: + response = list_resource_quotas(namespace) + console.print({"response": response, "resource_quotas": response.get("items", [])}) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + raise SystemExit(1) + + if __name__ == "__main__": app() diff --git a/configs/kueue/kueuer/tests/test_resources.py b/configs/kueue/kueuer/tests/test_resources.py index 9c36be1c..c40f2e00 100644 --- a/configs/kueue/kueuer/tests/test_resources.py +++ b/configs/kueue/kueuer/tests/test_resources.py @@ -2,9 +2,24 @@ from __future__ import annotations +from decimal import Decimal + +import pytest +from typer.testing import CliRunner from kubernetes.client import V1Node, V1NodeStatus, V1ObjectMeta -from kueuer.resources import _bytes_to_gi_str, total +from kueuer.cli import 
app +from kueuer.resources import ( + _bytes_to_binary_gib_decimal, + _format_decimal_report, + list_resource_quotas, + total, +) + +runner = CliRunner() + +# Library API has no default for node_label_key; tests use the same key as the CLI default. +NODE_LABEL_KEY = "skaha.opencadc.org/node-type" def _node( @@ -19,11 +34,17 @@ def _node( ) -def test_bytes_to_gi_str_examples() -> None: +def test_binary_gib_conversion() -> None: gi = 1024**3 - assert _bytes_to_gi_str(20550 * gi) == "20550" - assert _bytes_to_gi_str(1073741824 // 2) == "0.5" - assert _bytes_to_gi_str(gi) == "1" + assert _bytes_to_binary_gib_decimal(20550 * gi) == Decimal("20550") + assert _bytes_to_binary_gib_decimal(gi // 2) == Decimal("0.5") + assert _bytes_to_binary_gib_decimal(gi) == Decimal("1") + + +def test_format_decimal_report_three_places() -> None: + assert _format_decimal_report(Decimal("8")) == "8" + assert _format_decimal_report(Decimal("1.23456789")) == "1.235" + assert _format_decimal_report(Decimal("0")) == "0" def test_total_memory_ephemeral_gi(monkeypatch) -> None: @@ -45,21 +66,29 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity") - assert out["memory"] == {"value": "16", "unit": "Gi"} - assert out["ephemeral-storage"] == {"value": "100", "unit": "Gi"} + out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + assert out["node_label_key"] == NODE_LABEL_KEY + bucket = out["by_label_value"][""] + assert bucket["count"] == 1 + assert bucket["memory"] == {"value": "16", "unit": "GiB"} + assert bucket["ephemeral-storage"] == {"value": "100", "unit": "GiB"} + w = bucket["weights"] + assert w["cpu"] == "1" + assert w["memory"] == "0.5" + assert w["ephemeral-storage"] == "0.08" + assert "nvidia.com/gpu" not in w def test_total_nvidia_gpu_single_model(monkeypatch) -> None: nodes = [ _node( "g1", - {"nvidia.com/gpu": "4"}, + {"cpu": "4", "nvidia.com/gpu": "4"}, {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"}, ), _node( 
"g2", - {"nvidia.com/gpu": "8"}, + {"cpu": "8", "nvidia.com/gpu": "8"}, {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"}, ), ] @@ -75,18 +104,27 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity") - assert out["nvidia.com/gpu"] == { - "kind": "NVIDIA-A100-SXM4-40GB", - "value": "12", - "unit": "count", - } + out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + b = out["by_label_value"][""] + assert b["count"] == 2 + assert b["nvidia.com/gpu"] == [ + {"kind": "NVIDIA-A100-SXM4-40GB", "value": "12", "unit": "count"}, + ] + assert b["weights"]["nvidia.com/gpu"]["NVIDIA-A100-SXM4-40GB"] == "1" def test_total_nvidia_gpu_mixed_kind(monkeypatch) -> None: nodes = [ - _node("a", {"nvidia.com/gpu": "2"}, {"nvidia.com/gpu.product": "NVIDIA-T4"}), - _node("b", {"nvidia.com/gpu": "4"}, {"nvidia.com/gpu.product": "NVIDIA-A100"}), + _node( + "a", + {"cpu": "4", "nvidia.com/gpu": "2"}, + {"nvidia.com/gpu.product": "NVIDIA-T4"}, + ), + _node( + "b", + {"cpu": "4", "nvidia.com/gpu": "4"}, + {"nvidia.com/gpu.product": "NVIDIA-A100"}, + ), ] def fake_list_node(*_a, **_k): @@ -100,19 +138,23 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity") - assert out["nvidia.com/gpu"] == { - "kind": "mixed", - "value": "6", - "unit": "count", - } + out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + b = out["by_label_value"][""] + assert b["count"] == 2 + assert b["nvidia.com/gpu"] == [ + {"kind": "NVIDIA-A100", "value": "4", "unit": "count"}, + {"kind": "NVIDIA-T4", "value": "2", "unit": "count"}, + ] + wg = b["weights"]["nvidia.com/gpu"] + assert wg["NVIDIA-A100"] == "2" + assert wg["NVIDIA-T4"] == "4" def test_total_amd_gpu(monkeypatch) -> None: nodes = [ _node( "w1", - {"amd.com/gpu": "8"}, + {"cpu": "8", "amd.com/gpu": "8"}, {"amd.com/gpu.product": "AMD-INSTINCT-MI250X"}, ), ] @@ -128,16 +170,16 @@ class R: lambda: type("X", (), {"list_node": 
fake_list_node})(), ) - out = total(None, field="capacity") - assert out["amd.com/gpu"] == { - "kind": "AMD-INSTINCT-MI250X", - "value": "8", - "unit": "count", - } + out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + b = out["by_label_value"][""] + assert b["count"] == 1 + assert b["amd.com/gpu"] == [ + {"kind": "AMD-INSTINCT-MI250X", "value": "8", "unit": "count"}, + ] def test_total_nvidia_unknown_kind(monkeypatch) -> None: - nodes = [_node("x", {"nvidia.com/gpu": "3"}, {})] + nodes = [_node("x", {"cpu": "3", "nvidia.com/gpu": "3"}, {})] def fake_list_node(*_a, **_k): class R: @@ -150,5 +192,185 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity") - assert out["nvidia.com/gpu"] == {"kind": "", "value": "3", "unit": "count"} + out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + b = out["by_label_value"][""] + assert b["count"] == 1 + assert b["nvidia.com/gpu"] == [ + {"kind": "", "value": "3", "unit": "count"}, + ] + assert b["weights"]["nvidia.com/gpu"][""] == "1" + + +def test_total_groups_by_custom_node_label_key(monkeypatch) -> None: + nodes = [ + _node("a", {"cpu": "2"}, {"pool": "east"}), + _node("b", {"cpu": "2"}, {"pool": "east"}), + _node("c", {"cpu": "4"}, {"pool": "west"}), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(None, field="capacity", node_label_key="pool") + assert out["node_label_key"] == "pool" + assert out["by_label_value"]["east"]["count"] == 2 + assert out["by_label_value"]["west"]["count"] == 1 + + +def test_total_rejects_blank_node_label_key() -> None: + with pytest.raises(ValueError, match="non-empty"): + total(None, node_label_key=" ") + + +def test_total_split_by_skaha_node_type(monkeypatch) -> None: + nodes = [ + _node( + "cpu1", + {"cpu": "4", "memory": "8Gi"}, + 
{"skaha.opencadc.org/node-type": "cpu-node"}, + ), + _node( + "gpu1", + {"cpu": "8", "nvidia.com/gpu": "2"}, + { + "nvidia.com/gpu.product": "NVIDIA-T4", + "skaha.opencadc.org/node-type": "gpu-node", + }, + ), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + by_t = out["by_label_value"] + assert by_t["cpu-node"]["count"] == 1 + assert by_t["gpu-node"]["count"] == 1 + assert by_t["cpu-node"]["cpu"] == {"value": "4", "unit": "cores"} + assert by_t["gpu-node"]["nvidia.com/gpu"] == [ + {"kind": "NVIDIA-T4", "value": "2", "unit": "count"}, + ] + assert by_t["cpu-node"]["weights"]["memory"] == "0.5" + assert by_t["gpu-node"]["weights"]["nvidia.com/gpu"]["NVIDIA-T4"] == "4" + + +def test_total_nvidia_gpu_from_labels_when_capacity_zero(monkeypatch) -> None: + """MIG-style nodes may advertise GPUs via labels while capacity nvidia.com/gpu is 0.""" + nodes = [ + _node( + "g1", + { + "cpu": "96", + "memory": "1000Gi", + "ephemeral-storage": "500Gi", + "nvidia.com/gpu": "0", + }, + { + "nvidia.com/gpu.count": "12", + "nvidia.com/gpu.product": "NVIDIA-H100-NVL-MIG-2g.24gb", + "skaha.opencadc.org/node-type": "gpu-node", + }, + ), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + g = out["by_label_value"]["gpu-node"] + assert g["count"] == 1 + assert g["nvidia.com/gpu"] == [ + {"kind": "NVIDIA-H100-NVL-MIG-2g.24gb", "value": "12", "unit": "count"}, + ] + gw = g["weights"] + assert gw["memory"] == "0.096" + assert gw["ephemeral-storage"] == "0.192" + assert gw["nvidia.com/gpu"]["NVIDIA-H100-NVL-MIG-2g.24gb"] == "8" + + +def 
test_list_resource_quotas_returns_serialized_payload(monkeypatch) -> None: + payload = { + "apiVersion": "v1", + "kind": "ResourceQuotaList", + "items": [ + { + "metadata": {"name": "compute-quota", "namespace": "canfar-kueue-testing"}, + "spec": {"hard": {"requests.cpu": "8", "requests.memory": "32Gi"}}, + } + ], + } + + class FakeCoreV1: + def list_namespaced_resource_quota(self, namespace: str): + assert namespace == "canfar-kueue-testing" + return object() + + class FakeApiClient: + def sanitize_for_serialization(self, value): + assert value is not None + return payload + + fake_k8s = type( + "FakeK8s", + (), + {"core_v1": FakeCoreV1(), "api_client": FakeApiClient()}, + )() + monkeypatch.setattr("kueuer.resources.get_k8s_config", lambda: fake_k8s) + + result = list_resource_quotas("canfar-kueue-testing") + + assert result == payload + + +def test_resources_cli_includes_node_label_key_option() -> None: + result = runner.invoke(app, ["cluster", "resources", "--help"]) + assert result.exit_code == 0 + assert "--node-label-key" in result.stdout + + +def test_resourcequota_cli_prints_response_and_objects(monkeypatch) -> None: + payload = { + "apiVersion": "v1", + "kind": "ResourceQuotaList", + "items": [ + { + "metadata": {"name": "compute-quota", "namespace": "canfar-kueue-testing"}, + "status": {"hard": {"requests.cpu": "8"}}, + } + ], + } + monkeypatch.setattr("kueuer.resources.list_resource_quotas", lambda namespace: payload) + + result = runner.invoke( + app, + ["cluster", "resourcequota", "--namespace", "canfar-kueue-testing"], + ) + + assert result.exit_code == 0 + assert "response" in result.stdout + assert "resource_quotas" in result.stdout + assert "compute-quota" in result.stdout From 8e3f7c3d5b069126e5eba4d97592d6e5485c8a5b Mon Sep 17 00:00:00 2001 From: Shiny Brar Date: Thu, 23 Apr 2026 13:20:56 -0700 Subject: [PATCH 11/11] refactor(wip): kueue --- .../kueue/docs/adrs/ADR-001-kueue-adoption.md | 35 ++ .../kueue/docs/adrs/ADR-002-workload-apis.md | 36 ++ 
.../ADR-003-shared-workloads-namespace.md | 36 ++ .../ADR-004-standalone-control-service.md | 40 ++ ...DR-005-fairness-priority-and-preemption.md | 43 ++ .../ADR-006-posix-group-project-mapping.md | 50 ++ .../adrs/ADR-007-resource-flavor-taxonomy.md | 45 ++ ...ueue-enforcement-and-managed-namespaces.md | 35 ++ .../adrs/ADR-009-visibility-and-ui-scope.md | 39 ++ ...010-single-cluster-now-multikueue-later.md | 36 ++ ...ed-persistent-and-interactive-workloads.md | 40 ++ ...-scale-testing-and-operational-evidence.md | 41 ++ configs/kueue/docs/adrs/README.md | 28 + configs/kueue/docs/architecture.md | 545 ++++++++++++++++++ configs/kueue/docs/operations.md | 258 +++++++++ configs/kueue/docs/reportstyle.markdown | 303 ++++++++++ configs/kueue/docs/roadmap.md | 277 +++++++++ configs/kueue/docs/ui-spec.md | 222 +++++++ configs/kueue/kueuer/AGENTS.md | 13 + configs/kueue/kueuer/src/kueuer/resources.py | 468 ++++++++++----- configs/kueue/kueuer/tests/test_resources.py | 212 +++++-- 21 files changed, 2616 insertions(+), 186 deletions(-) create mode 100644 configs/kueue/docs/adrs/ADR-001-kueue-adoption.md create mode 100644 configs/kueue/docs/adrs/ADR-002-workload-apis.md create mode 100644 configs/kueue/docs/adrs/ADR-003-shared-workloads-namespace.md create mode 100644 configs/kueue/docs/adrs/ADR-004-standalone-control-service.md create mode 100644 configs/kueue/docs/adrs/ADR-005-fairness-priority-and-preemption.md create mode 100644 configs/kueue/docs/adrs/ADR-006-posix-group-project-mapping.md create mode 100644 configs/kueue/docs/adrs/ADR-007-resource-flavor-taxonomy.md create mode 100644 configs/kueue/docs/adrs/ADR-008-queue-enforcement-and-managed-namespaces.md create mode 100644 configs/kueue/docs/adrs/ADR-009-visibility-and-ui-scope.md create mode 100644 configs/kueue/docs/adrs/ADR-010-single-cluster-now-multikueue-later.md create mode 100644 configs/kueue/docs/adrs/ADR-011-protected-persistent-and-interactive-workloads.md create mode 100644 
configs/kueue/docs/adrs/ADR-012-scale-testing-and-operational-evidence.md create mode 100644 configs/kueue/docs/adrs/README.md create mode 100644 configs/kueue/docs/architecture.md create mode 100644 configs/kueue/docs/operations.md create mode 100644 configs/kueue/docs/reportstyle.markdown create mode 100644 configs/kueue/docs/roadmap.md create mode 100644 configs/kueue/docs/ui-spec.md create mode 100644 configs/kueue/kueuer/AGENTS.md diff --git a/configs/kueue/docs/adrs/ADR-001-kueue-adoption.md b/configs/kueue/docs/adrs/ADR-001-kueue-adoption.md new file mode 100644 index 00000000..6b01400d --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-001-kueue-adoption.md @@ -0,0 +1,35 @@ +# ADR-001: Kueue Adoption + +- Status: Accepted +- Date: January 2025 + +## Context + +CANFAR needs a Kubernetes-native way to control admission, queueing, quotas, +borrowing, reclaim, and visibility for a mix of interactive, persistent, and +batch science workloads. The platform must handle very large pending backlogs +without treating direct Kubernetes scheduling as the tenant policy layer. + +## Decision + +CANFAR adopts Kueue as the admission and quota orchestration layer for the +Science Platform. Kubernetes remains the runtime scheduler and execution plane. +`skaha` remains the main user submission entry point. + +## Consequences + +Kueue provides the needed queue, quota, priority, cohort, and visibility +primitives. It also creates a clean path to future topology-aware scheduling and +MultiKueue. + +CANFAR must still solve identity, project mapping, and accounting outside +Kueue. Kueue is not the tenant system of record. 
+ +## Alternatives considered + +- Continue with direct Kubernetes scheduling and custom ad hoc controls +- Build a custom scheduling layer or scheduler plugin stack +- Treat the backlog problem as only a `skaha` rate-limiting problem + +These alternatives either move too much policy into custom code or fail to give +native cohort, quota, and admission control semantics. diff --git a/configs/kueue/docs/adrs/ADR-002-workload-apis.md b/configs/kueue/docs/adrs/ADR-002-workload-apis.md new file mode 100644 index 00000000..2c019f32 --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-002-workload-apis.md @@ -0,0 +1,36 @@ +# ADR-002: Supported Workload APIs + +- Status: Accepted +- Date: Spring 2025 + +## Context + +The target architecture must support a broad workload taxonomy, but the current +repository baseline and the need for safe operational rollout make it unwise to +treat every Kueue integration as a production commitment. + +## Decision + +Production support centers on `batch/v1.Job`, including Indexed Job +usage patterns for large independent fan-out work. Protected interactive and +persistent workloads may be brought under Kueue using mature controller +patterns, but only where the team can verify the operational behavior. + +`JobSet`, MPI, Ray, and other advanced or distributed controllers remain part of +the target taxonomy and future roadmap, not the initial production commitment. + +## Consequences + +The platform gets a safe first production lane for large-scale batch admission +without blocking future support for more advanced workload types. + +The package still documents the full workload taxonomy so later phases do not +need to invent a new fairness or queue model. + +## Alternatives considered + +- Promise full support for every Kueue integration in the initial production commitment +- Delay all interactive or persistent integration until after batch-only rollout + +The first option creates avoidable operational risk. 
The second option breaks +the desired unified scheduling model too early. diff --git a/configs/kueue/docs/adrs/ADR-003-shared-workloads-namespace.md b/configs/kueue/docs/adrs/ADR-003-shared-workloads-namespace.md new file mode 100644 index 00000000..b2dc3f78 --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-003-shared-workloads-namespace.md @@ -0,0 +1,36 @@ +# ADR-003: Shared `workloads` namespace now, namespace evolution later + +- Status: Accepted +- Date: March 12, 2026 + +## Context + +The current Kueue repository baseline uses multiple managed namespaces, but the +target architecture wants one shared Kueue-managed namespace at first so queue +governance, RBAC, and visibility can be kept simple while the new tenant model +is introduced. + +## Decision + +Use one shared `workloads` namespace for Kueue-managed user workloads in the +target single-cluster design. Create project-scoped `LocalQueue` objects in that +shared namespace on demand. + +Future supported namespace layouts include one namespace per community, one namespace per workload class, or a +hybrid namespace layout. + +## Consequences + +This keeps the initial rollout simpler and reduces the number of moving parts +while project-based fairness and community ownership are introduced. + +Future namespace splits remain possible without changing the core +community-project-cohort model. + +## Alternatives considered + +- Start immediately with one namespace per community +- Start immediately with one namespace per workload class + +Both alternatives increase governance and visibility complexity too early for +the initial production commitment. 
diff --git a/configs/kueue/docs/adrs/ADR-004-standalone-control-service.md b/configs/kueue/docs/adrs/ADR-004-standalone-control-service.md new file mode 100644 index 00000000..c282e707 --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-004-standalone-control-service.md @@ -0,0 +1,40 @@ +# ADR-004: Standalone accounting and control service + +- Status: Accepted +- Date: March 12, 2026 + +## Context + +Kueue cannot serve as the system of record for communities, projects, POSIX +group mapping, delegated project administration, or accounting relationships. +Those concerns are fundamental to CANFAR's policy and visibility. + +## Decision + +Define a new standalone accounting and control service as a required future +dependency of the platform. The service remains out of scope for implementation +in this package, but it is in scope for architecture and requirements. + +The service must support: + +- community creation and management by cluster admins +- project creation inside a community by delegated project admins +- project-to-group mapping and later user or group resolution +- override request workflows for temporary fair-share changes +- exposure of tenant metadata to `skaha` and the future visibility UI + +## Consequences + +The scheduler design stays clean. Kueue owns admission and quota behavior while +the control service owns tenant and policy metadata. + +The rollout now has an explicit dependency that must be addressed in later +phases rather than hidden behind manual configuration. + +## Alternatives considered + +- Extend an existing service implicitly without naming a new component +- Keep project metadata as static Kubernetes configuration only + +Both alternatives hide ownership and make future admin workflows difficult to +design and operate. 
diff --git a/configs/kueue/docs/adrs/ADR-005-fairness-priority-and-preemption.md b/configs/kueue/docs/adrs/ADR-005-fairness-priority-and-preemption.md new file mode 100644 index 00000000..9b182c84 --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-005-fairness-priority-and-preemption.md @@ -0,0 +1,43 @@ +# ADR-005: Fairness, workload priority, and preemption model + +- Status: Accepted +- Date: March 12, 2026 + +## Context + +CANFAR needs fair competition between projects, community ownership of +resources, borrowing of idle capacity, and a workload-ordering model that keeps +interactive work ahead of batch work inside each project. + +## Decision + +Use the following split model: + +- Community = `ClusterQueue` +- Project = `LocalQueue` +- Multiple communities sharing capacity = `Cohort` +- Project competition inside one community = Admission Fair Sharing with + adjustable `LocalQueue` weights +- Workload ordering inside one project = `WorkloadPriorityClass` + +Use cohort borrowing and reclaim for community-level resource ownership. Use +project-local workload priority to select interactive work before lower-priority +batch work inside the chosen project queue. + +## Consequences + +This preserves community ownership while still maximizing idle cluster use. It +also avoids pretending that project fair-share and workload priority are the +same thing. + +Cross-community competition remains community-scoped rather than global +project-scoped. That is intentional. + +## Alternatives considered + +- One global project fair-share plane across all communities +- Priority-only scheduling without project fair-share weights +- Community-only fairness with no project-level balancing + +These alternatives either ignore community ownership or fail to give projects a +meaningful fairness model inside a community. 
diff --git a/configs/kueue/docs/adrs/ADR-006-posix-group-project-mapping.md b/configs/kueue/docs/adrs/ADR-006-posix-group-project-mapping.md new file mode 100644 index 00000000..7aef8242 --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-006-posix-group-project-mapping.md @@ -0,0 +1,50 @@ +# ADR-006: POSIX group to project mapping options + +- Status: Proposed +- Date: March 12, 2026 + +## Context + +Projects may contain multiple POSIX groups and communities may contain multiple +projects. The open question is whether a POSIX group may belong to more than one +project. + +This decision changes the submission experience because ambiguous group mapping +may force the API layer to require an explicit project field. + +## Options + +### Option A: One group maps to exactly one project + +Under this option, a POSIX group may not belong to multiple projects. + +#### Benefits + +- `Skaha` can often infer project and community from group context +- submission stays simpler for users +- visibility and accounting reasoning stay easier to explain + +#### Costs + +- the identity model is stricter +- some administrative use cases may need new group structures + +### Option B: A group may map to multiple projects + +Under this option, a POSIX group may belong to more than one project. + +#### Benefits + +- the identity model is more flexible +- administrators can reuse groups across projects + +#### Costs + +- the submission path must require explicit project selection in ambiguous cases +- user experience becomes more complex +- the control service and UI must explain ambiguity clearly + +## Current direction + +Leave the decision open. The architecture and UI must support both models until +the tenant administration workflow is finalized. 
diff --git a/configs/kueue/docs/adrs/ADR-007-resource-flavor-taxonomy.md b/configs/kueue/docs/adrs/ADR-007-resource-flavor-taxonomy.md new file mode 100644 index 00000000..6956a2d3 --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-007-resource-flavor-taxonomy.md @@ -0,0 +1,45 @@ +# ADR-007: ResourceFlavor taxonomy and topology model + +- Status: Accepted +- Date: March 12, 2026 + +## Context + +CANFAR needs a flavor model that captures resource identity across cluster, +zone, accelerator type, storage class, and later topology-aware scheduling +domains. The model must stay readable to operators and extensible to future +MultiKueue deployments. + +## Decision + +Use `ResourceFlavor` as the canonical scheduler-facing identity for placement and +hardware classes. Standardize flavor naming around stable placement and hardware +dimensions rather than workload class. + +Adopt the following naming pattern: + +`rf-<zone>-<cluster>-<resource>[-<class>]` + +Examples: + +- `rf-ca-west-01-cpu-standard` +- `rf-ca-west-01-cpu-highmem` +- `rf-ca-west-01-gpu-a100` + +Treat topology-aware scheduling as a future phase. When topology becomes active, +use `Topology` objects and flavor association rather than encoding full topology +hierarchy into the flavor name itself. + +## Consequences + +Operators get a stable taxonomy that works in both single-cluster and future +manager-worker designs. Users and admins can also read flavor identity in a +predictable way. + +## Alternatives considered + +- Opaque flavor names with documentation-only meaning +- One flavor per workload class +- Encoding every topology dimension directly in the flavor name + +These alternatives either hide meaning or create unnecessary flavor sprawl. 
diff --git a/configs/kueue/docs/adrs/ADR-008-queue-enforcement-and-managed-namespaces.md b/configs/kueue/docs/adrs/ADR-008-queue-enforcement-and-managed-namespaces.md new file mode 100644 index 00000000..abf8191d --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-008-queue-enforcement-and-managed-namespaces.md @@ -0,0 +1,35 @@ +# ADR-008: Queue enforcement and managed namespace model + +- Status: Accepted +- Date: March 12, 2026 + +## Context + +Kueue policy only works predictably when managed workloads land in managed +namespaces and carry valid queue information. CANFAR requires users to +submit through `skaha`, not through raw Kubernetes APIs without platform policy. + +## Decision + +Use explicitly managed namespaces for Kueue-managed user work. In the target +state this is one shared `workloads` namespace. The submission path must resolve +and apply a `LocalQueue` explicitly. + +Keep `manageJobsWithoutQueueName` disabled and reject malformed or unqueued +submissions in managed namespaces through admission policy and service-side +validation. + +## Consequences + +The scheduler does not need to guess tenant identity. Platform policy remains +explicit, and visibility stays consistent with actual queue assignment. + +Future namespace evolution remains possible as long as the same enforcement +principles are preserved. + +## Alternatives considered + +- Allow silent default queue assignment everywhere +- Allow users to create unmanaged work in the same namespaces as managed work + +These alternatives make fairness and explanation harder to trust. 
diff --git a/configs/kueue/docs/adrs/ADR-009-visibility-and-ui-scope.md b/configs/kueue/docs/adrs/ADR-009-visibility-and-ui-scope.md new file mode 100644 index 00000000..c3e1750b --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-009-visibility-and-ui-scope.md @@ -0,0 +1,39 @@ +# ADR-009: Visibility and UI scope + +- Status: Accepted +- Date: March 12, 2026 + +## Context + +Fair scheduling without understandable visibility will be perceived as arbitrary. +CANFAR's users, project admins, and cluster admins all need different levels of +insight into ownership, pending reasons, and current queue position. + +## Decision + +Treat visibility as a first-class architectural concern. The initial production commitment relies on +`kubectl`, Grafana, Kueue metrics, and the pending-workloads visibility API. +Later phases add a read-only queue UI and then guided admin workflows. + +The UI must explain scheduling outcomes in terms of: + +- fair-share position +- workload priority +- quota exhaustion +- insufficient resource availability +- policy rejection + +## Consequences + +The architecture gains a clear product surface instead of assuming that raw +conditions or controller logs are enough. + +This also creates a requirement for the control service to expose tenant and +override metadata to the UI. + +## Alternatives considered + +- Delay visibility until after scheduling is complete +- Rely only on Kubernetes-native object inspection + +These alternatives make correct policy look opaque to most users. 
diff --git a/configs/kueue/docs/adrs/ADR-010-single-cluster-now-multikueue-later.md b/configs/kueue/docs/adrs/ADR-010-single-cluster-now-multikueue-later.md new file mode 100644 index 00000000..c4e2cc0d --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-010-single-cluster-now-multikueue-later.md @@ -0,0 +1,36 @@ +# ADR-010: Single-cluster now, MultiKueue-ready later + +- Status: Accepted +- Date: March 12, 2026 + +## Context + +CANFAR wants a design that can grow into a multi-cluster model, but the team +also needs to prove the policy model, control-plane behavior, and backlog +health in a single cluster before adding federation complexity. + +## Decision + +Build for a single-cluster production target first. Preserve a vocabulary and +resource model that remain compatible with future MultiKueue manager and worker +clusters. + +Do not make MultiKueue a phase 1 requirement. Treat it as a later phase that is +triggered by proven need, such as API-server pressure, site-level separation, or +distinct hardware pools that require worker-cluster sharding. + +## Consequences + +The rollout remains practical while still avoiding a dead-end tenancy model. + +The architecture package must still document manager-worker deployment patterns, +future quota mapping, and operational risks so later federation does not become +a redesign exercise. + +## Alternatives considered + +- Design only for one cluster with no future federation path +- Make MultiKueue a near-term mandatory target + +The first option creates future migration pain. The second option adds too much +operational complexity before the single-cluster model is proven. 
diff --git a/configs/kueue/docs/adrs/ADR-011-protected-persistent-and-interactive-workloads.md b/configs/kueue/docs/adrs/ADR-011-protected-persistent-and-interactive-workloads.md new file mode 100644 index 00000000..a6ad5bc8 --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-011-protected-persistent-and-interactive-workloads.md @@ -0,0 +1,40 @@ +# ADR-011: Protected persistent and interactive workloads + +- Status: Accepted +- Date: March 12, 2026 + +## Context + +CANFAR does not run only batch jobs. The platform also runs interactive sessions +and persistent user-facing services that users experience directly. + +## Decision + +Keep persistent and interactive work in the same overall Kueue-managed system, +but treat them as protected workload classes rather than ordinary best-effort +batch. + +Inside each project: + +- interactive work has higher workload priority than batch work +- persistent workloads are protected and must not be treated as cheap + preemption targets + +The exact controller integrations may mature in phases, but the policy model is +fixed now. + +## Consequences + +The platform retains a unified tenant and fairness model while still protecting +the user experience for long-lived or interactive work. + +Operators must still watch for cases where the single-plane model causes too +much contention and then adjust protection or namespace boundaries later. + +## Alternatives considered + +- Keep all persistent and interactive work outside Kueue +- Treat persistent and interactive work exactly like best-effort batch + +The first option breaks the unified policy model. The second option creates +avoidable user pain and preemption risk. 
diff --git a/configs/kueue/docs/adrs/ADR-012-scale-testing-and-operational-evidence.md b/configs/kueue/docs/adrs/ADR-012-scale-testing-and-operational-evidence.md new file mode 100644 index 00000000..3bc076b2 --- /dev/null +++ b/configs/kueue/docs/adrs/ADR-012-scale-testing-and-operational-evidence.md @@ -0,0 +1,41 @@ +# ADR-012: Scale-testing and operational evidence + +- Status: Accepted +- Date: March 12, 2026 + +## Context + +The target system must support large pending backlog without relying on guesses +about safe operating thresholds. Kueue behavior, Kubernetes API behavior, and +the submission path must be measured together. + +## Decision + +Use benchmark-driven operational evidence as a release gate. `kueuer`, +Prometheus, Grafana, and controlled workload storms are required parts of the +platform rollout, not optional afterthoughts. + +The evidence model must include: + +- scale gates at increasing backlog levels +- workload admission timing and throughput +- controller health and restart behavior +- API server latency and saturation +- visibility API behavior under backlog +- user-path latency through `skaha` + +## Consequences + +The platform gets explicit stop or go thresholds for production change rather +than relying on anecdotal experience. + +This also means roadmap phases need objective exit criteria and repeatable test +artifacts. + +## Alternatives considered + +- Tune Kueue and trust best effort observations +- Measure only Kueue controller metrics + +These alternatives fail to prove end-to-end system behavior under realistic +submission pressure. diff --git a/configs/kueue/docs/adrs/README.md b/configs/kueue/docs/adrs/README.md new file mode 100644 index 00000000..b09e1e4c --- /dev/null +++ b/configs/kueue/docs/adrs/README.md @@ -0,0 +1,28 @@ +# Architecture Decision Records + +This directory contains the architecture decision records for the CANFAR Science Platform Kueue +design. 
+ +## ADR index + +| ADR | Title | Status | +| --- | ----- | ------ | +| [ADR-001](./ADR-001-kueue-adoption.md) | Why CANFAR adopts Kueue | Accepted | +| [ADR-002](./ADR-002-workload-apis.md) | Supported workload APIs | Accepted | +| [ADR-003](./ADR-003-shared-workloads-namespace.md) | Shared `workloads` namespace now, namespace evolution later | Accepted | +| [ADR-004](./ADR-004-standalone-control-service.md) | Standalone accounting and control service | Accepted | +| [ADR-005](./ADR-005-fairness-priority-and-preemption.md) | Fairness, workload priority, and preemption model | Accepted | +| [ADR-006](./ADR-006-posix-group-project-mapping.md) | POSIX group to project mapping options | Proposed | +| [ADR-007](./ADR-007-resource-flavor-taxonomy.md) | ResourceFlavor taxonomy and topology model | Accepted | +| [ADR-008](./ADR-008-queue-enforcement-and-managed-namespaces.md) | Queue enforcement and managed namespace model | Accepted | +| [ADR-009](./ADR-009-visibility-and-ui-scope.md) | Visibility and UI scope | Accepted | +| [ADR-010](./ADR-010-single-cluster-now-multikueue-later.md) | Single-cluster now, MultiKueue-ready later | Accepted | +| [ADR-011](./ADR-011-protected-persistent-and-interactive-workloads.md) | Protected persistent and interactive workloads | Accepted | +| [ADR-012](./ADR-012-scale-testing-and-operational-evidence.md) | Scale-testing and operational evidence | Accepted | + +## How to use the ADR set + +Read the ADRs in numerical order the first time. After that, use the table +above to jump to the specific decision you need. Only ADR-006 remains open in +this package. All other ADRs define the current architectural intent for the +target rollout. 
diff --git a/configs/kueue/docs/architecture.md b/configs/kueue/docs/architecture.md new file mode 100644 index 00000000..a600e8bb --- /dev/null +++ b/configs/kueue/docs/architecture.md @@ -0,0 +1,545 @@ +# CANFAR Science Platform Kueue Architecture + +This document is the primary architecture reference for Kueue rollout on the CANFAR Science +Platform. It captures the current baseline, the target +single-cluster architecture, and the future-ready extension points for +MultiKueue, topology-aware scheduling, and richer tenant control. + +Use this document together with [roadmap.md](./roadmap.md), [operations.md](./operations.md), +[ui-spec.md](./ui-spec.md), and [the ADR index](./adrs/README.md). The existing +deep-dive reference remains useful background material, but this document is the +system design source of truth. + +## 1. Introduction + +This section defines the problem, the user groups, and the success criteria for +the Kueue architecture on the CANFAR Science Platform. + +### 1.1 Problem statement + +The CANFAR Science Platform needs a batch and tenant-admission layer for Kubernetes +that can absorb very large pending backlogs while still giving users predictable +submission behavior and clear explanations for delay. The platform must support +interactive sessions, persistent user-facing services, MPI and distributed jobs, +and very large numbers of batch jobs in the same managed environment. + +The design target is not only to admit work fairly. It must also preserve +control-plane health when the backlog grows to `100,000`, `200,000`, or more +pending jobs or workload objects, while the active execution footprint remains +much smaller. 
+ +### 1.2 Users and stakeholders + +The architecture serves the following groups: + +- Science users who launch notebooks, interactive tools, batch jobs, and + distributed workloads through `Skaha` +- Project administrators who manage projects and group membership inside a + community +- Cluster system administrators who own infrastructure, Kueue policy, and + emergency controls +- Platform operators who need strong observability, clear runbooks, and safe + rollout procedures +- Future control-service operators who will manage community ownership, project + metadata, and accounting relationships + +### 1.3 Success Criteria + +The design is successful when the platform can: + +- Admit and manage work fairly across communities and projects +- Prioritize interactive work over batch work inside each project +- Preserve community ownership of resources while still borrowing idle capacity +- Explain pending or rejected work in terms users and admins can act on +- Support at least one shared production workload namespace now +- Stay compatible with future namespace splits, MultiKueue, and topology-aware + scheduling +- Produce benchmark evidence and operational thresholds rather than relying on + informal tuning + +### 1.4 Quality Goals + +The top quality goals for this system are: + +- Fairness: projects compete fairly within a community, and communities reclaim + owned resources within a cohort +- Transparency: users and admins can see why work is pending, admitted, + preempted, or rejected +- Reliability: the Kueue control plane, Kubernetes API, and visibility surfaces + remain healthy under heavy pending backlog +- Operability: cluster admins can pause, drain, observe, and roll back changes + safely +- Scalability: the design can grow from a single-cluster deployment into a + manager and worker model without rewriting the tenancy model + +## 2. Constraints + +This section captures the hard realities that shape the design. 
+ +### 2.1 Repository and Deployment Baseline + +The current repository baseline still reflects an older Kueue deployment: + +- Current deployment documents `0.11.6` as the installed release +- Current deployment uses `config.kueue.x-k8s.io/v1beta1` +- Current deployment uses `batch/job` only +- Current deployment references `skaha-workload` and `canfar-b-workload` rather than a single shared `workloads` namespace + +This architecture therefore distinguishes three states: + +- Current state: the repository and deployed baseline today +- Target state: the single-cluster architecture described in this package +- Future state: capabilities that remain out of scope for phase 1 but must stay + compatible with the design + +### 2.2 Identity and governance constraints + +The current identity hierarchy is: + +`Community -> Project -> POSIX Group -> User` + +This means the scheduler cannot be the system of record for tenant structure. +Kueue needs an external control service to provide authoritative definitions for +communities, projects, project-to-group mappings, and later accounting policy. + +### 2.3 Namespace constraint + +The target design uses one shared `workloads` namespace for Kueue-managed user +work now. `LocalQueue` objects are still project-scoped, but they exist in that +shared namespace and are created on demand. + +The architecture must also stay compatible with future namespace evolution, such +as: + +- one namespace per community +- one namespace per workload class, such as `gpu-workloads` or + `batch-workloads` + +Those namespace changes are future roadmap items, not phase 1 requirements. 
+ +### 2.4 Operational constraints + +The system must work in an environment where: + +- the cluster is managed by CADC +- users mostly submit via `Skaha`, not by direct raw Kubernetes access +- batch backlog can exceed active capacity by two or more orders of magnitude +- interactive and persistent work cannot be treated as disposable noise +- the real bottleneck may be Kueue, the Kubernetes API server, etcd, or the + submission path, not just one controller setting + +## 3. Context and scope + +This section defines the system boundary and the key external actors. + +### 3.1 System scope + +The architecture covers: + +- `Skaha` submission and workload resolution behavior +- Kueue queue, quota, flavor, and priority objects +- the future standalone accounting and control service +- user, project admin, and cluster admin visibility surfaces +- benchmark and operations evidence for scale and correctness + +The architecture does not implement: + +- the control service itself +- the accounting penalty model for temporary fair-share overrides +- a production user interface implementation +- a MultiKueue production rollout in phase 1 + +### 3.2 Communities and cohorts + +Resources such as CPU, memory, storage, GPUs, network bandwidth, and I/O are +owned at the community level. Communities are implemented as `ClusterQueue` +objects. Multiple communities together form a cohort and may lend or borrow +capacity from one another. + +The initial named communities used in examples and diagrams are: + +- `cadc` +- `ska` +- `chimefrb` + +### 3.3 Projects and local queues + +Projects exist inside a community and are implemented as `LocalQueue` objects. +`LocalQueue` creation is project-scoped and on demand. The `LocalQueue` +therefore represents the scheduler-facing identity of a project inside a shared +community resource pool. + +This has one important consequence: project fair sharing is intra-community. 
+Projects compete fairly with other projects that target the same community +`ClusterQueue`. Cross-community competition is governed by `ClusterQueue` +policies and the cohort, not by one global project fair-share plane. + +## 4. Solution strategy + +This section states the high-level design strategy and the main trade-offs. + +### 4.1 Scheduler model + +Kueue acts as the admission and quota orchestration layer. Kubernetes still +performs pod placement and runtime scheduling. Kueue decides when work can enter +the active scheduling plane and under what quota, flavor, and priority rules. + +### 4.2 Tenancy model + +The target tenancy mapping is: + +- Community = `ClusterQueue` +- Project = `LocalQueue` +- Multiple communities = one or more `Cohort` relationships + +This model gives CANFAR the following properties: + +- community-owned resources remain a first-class concept +- communities can lend and borrow unused capacity +- projects compete fairly inside their community +- workload class ordering stays project-local through priority +- the control service remains the source of truth for project and group mapping + +### 4.3 Fairness and priority model + +The scheduling and fairness model is split deliberately: + +- Within a cohort of communities, communities borrow and reclaim through + `ClusterQueue` and cohort policy. +- Within a single community, projects compete using Admission Fair Sharing, + driven by `LocalQueue` historical usage and adjustable weights. +- Within a project, workloads are ordered by `WorkloadPriorityClass`. + +This preserves an important operational rule: interactive work wins inside a +project, but not at the cost of pretending that project fair-share history does +not exist. A project that has consumed a great deal of recent capacity can still +feel the effect of fair-share decay, even if its next workload is interactive. + +### 4.4 Control service strategy + +Kueue is not the tenant authority. 
A future standalone accounting and control +service is required to manage: + +- community definitions and resource ownership +- project creation inside a community +- project-to-group mappings +- quota and usage relationships outside raw Kueue quota semantics +- temporary fair-share override workflow for project admins and cluster admins + +This service is out of scope for phase 1 implementation, but it is in scope for +architecture and requirements. + +## 5. Building block view + +This section describes the main building blocks of the target system. + +### 5.1 Submission plane + +The submission plane consists of: + +- `Skaha` as the main user-facing submission path +- a queue and policy resolution layer inside `Skaha` +- the future control service for project, group, and ownership lookup + +`Skaha` accepts a user request, resolves the effective project and community, +selects the correct `LocalQueue`, attaches the required labels, and creates the +workload object in the shared `workloads` namespace. 
+ +### 5.2 Queueing plane + +The queueing plane consists of Kueue CRDs and controller behavior: + +- `ResourceFlavor` for cluster, zone, hardware, and future topology identity +- `ClusterQueue` for community-owned quota and preemption rules +- `LocalQueue` for project-level competition within a community +- `WorkloadPriorityClass` for workload-class ordering within a project +- `Cohort` for sharing and reclaim between communities + +### 5.3 Execution plane + +The execution plane remains native Kubernetes: + +- Kubernetes API server and etcd store all workload state +- the default scheduler places admitted pods +- workload-specific controllers such as `Job`, `JobSet`, `MPIJob`, or `Ray` + drive runtime behavior after admission + +### 5.4 Visibility plane + +The visibility plane combines several sources: + +- Kueue Prometheus metrics +- the Kueue pending-workloads visibility API +- Kubernetes and apiserver metrics +- a future UI for user and admin-facing queue explanations + +The architecture uses these sources to explain whether a workload is delayed +because of fair-share position, workload priority, resource shortage, quota +exhaustion, or policy rejection. + +### 5.5 Control service requirements + +The future standalone accounting and control service must support: + +- cluster admins creating and managing communities +- delegated project admins creating projects within a community +- attaching one or more POSIX groups to a project +- resolving a user or group to the correct project at submission time, or + requiring explicit project selection when the mapping model is ambiguous +- tracking usage and quota relationships that are out of scope for Kueue alone +- handling temporary fair-share override requests + +The service must expose enough state for both `Skaha` and the user visibility +surface. It does not need to become the workload execution system. + +## 6. Runtime view + +This section describes the key runtime behaviors of the target system. 
Detailed +sequence diagrams are provided in [diagrams.md](./diagrams.md). + +### 6.1 Submit and admit flow + +The normal runtime path is: + +1. A user submits work through `Skaha`. +2. `Skaha` resolves the effective project and community through internal policy + today and through the control service once it exists. If the mapping model is + ambiguous, the submission path requires explicit project selection. +3. `Skaha` selects a `LocalQueue`, applies the required labels, and creates the + Kubernetes workload object in `workloads`. +4. Kueue creates or updates the corresponding `Workload`. +5. Kueue evaluates quota, flavors, priority, and admission checks. +6. Once quota is reserved and admission is satisfied, Kueue admits the + workload. +7. Kubernetes schedules the pods and the native controller runs them. + +### 6.2 Community borrowing and reclaim + +When one community is idle and another is busy, the busy community can borrow +from the cohort. When the idle community becomes active again, Kueue reclaims +nominal quota according to cohort preemption policy. + +This behavior is community-scoped. It is not project-scoped reclaim. Project +competition inside a community is governed separately through `LocalQueue` +fairness and workload priority. + +### 6.3 Priority and fair-share interaction + +Project fair-sharing decides which project gets the next chance to consume +community quota. Workload priority decides which workload from that project is +selected first. + +This means: + +- higher-priority interactive work can win against lower-priority batch work + inside the same project +- a project with a high recent usage history can still wait behind a project + with lower recent usage +- temporary fair-share weight adjustments change the project's relative share, + not just the priority of one workload + +### 6.4 Pending-state explanation + +The platform must expose actionable pending reasons.
The user-facing explanation +model uses the following categories: + +- waiting behind higher fair-share demand from other projects in the same + community +- waiting behind higher-priority workloads in the same project +- blocked because owned or borrowed resources are not currently available +- blocked because the project or submission policy rejected the request +- blocked because the control plane is degraded and admission is not keeping up + +## 7. Deployment view + +This section describes the target single-cluster deployment and the future +expansion path. + +### 7.1 Current target deployment + +The target single-cluster deployment includes: + +- one Kueue control plane in `kueue-system` +- one shared `workloads` namespace for Kueue-managed user workloads +- one `ClusterQueue` per community, such as `cadc`, `ska`, and `chimefrb` +- one or more project `LocalQueue` objects per community, created on demand +- `ResourceFlavor` objects that capture cluster, zone, GPU class, and later + topology-aware placement +- one monitoring stack with Prometheus and Grafana + +### 7.2 Namespace evolution + +The architecture preserves compatibility with future namespace splits. Likely +future directions are: + +- namespace per community +- namespace per workload class +- mixed namespace policy for highly specialized resources + +The scheduler model and control service model remain valid under any of these +future namespace layouts. The main affected areas are `LocalQueue` ownership, +RBAC, and visibility scope. + +### 7.3 Multi-cluster future + +The future-state expansion path is MultiKueue, with one manager cluster and one +or more worker clusters. The manager and workers must preserve the same +community, project, and flavor vocabulary so the single-cluster tenant model can +evolve rather than be replaced. + +## 8. Cross-cutting concepts + +This section captures concepts that shape multiple parts of the design. 
+ +### 8.1 Workload taxonomy + +The architecture recognizes the following workload classes: + +- Interactive sessions such as notebooks and user-facing interactive tools +- Persistent user-facing services or deployments +- Standard batch `Job` +- Indexed `Job` for large independent fan-out workloads +- `JobSet` and MPI-style grouped or distributed jobs +- `RayJob`, `RayCluster`, and related distributed compute workloads +- Cron-triggered batch work +- Plain Pod or exception paths only when a better controller does not exist + +Phase 1 does not need to productionize every Kueue integration, but the +taxonomy must be documented now because it affects priority, fairness, and +observability semantics. + +### 8.2 Resource ownership and flavors + +Communities own resources. Kueue expresses that ownership through `ClusterQueue` +quota and `ResourceFlavor` taxonomy. The flavor model must support: + +- CPU and memory +- storage classes and local storage distinctions when relevant +- GPU model and accelerator pool identity +- cluster and zone placement +- future topology domains such as rack, block, or GPU island + +### 8.3 Governance and enforcement + +Managed workloads must set a queue explicitly or be assigned one by the +submission service. The target governance model is: + +- users submit through `Skaha` +- `Skaha` resolves `LocalQueue` assignment +- managed namespaces reject malformed or unqueued submissions +- cluster admins retain emergency controls for stopping or draining queues + +### 8.4 Protected workloads + +Interactive and persistent workloads remain part of the same Kueue-managed +system, but they are not treated the same way as best-effort batch. Persistent +workloads are a protected class. Interactive work holds higher workload priority +than batch inside each project. + +The architecture deliberately leaves room for stricter protection later if +evidence shows that the single-plane model causes avoidable user pain. 
+ +### 8.5 Temporary fair-share overrides + +Project admins may request temporary fair-share overrides through a future UI +and control-service workflow. Only cluster admins can approve or apply those +changes. + +The downstream accounting model for the quota or usage cost of those overrides +is explicitly out of scope here. This architecture only records the requirement +that the accounting model must exist and must remain visible to admins. + +## 9. Quality requirements + +This section turns the earlier goals into concrete architecture obligations. + +### 9.1 Reliability + +The design must keep Kueue and the Kubernetes API healthy under load. The +architecture therefore requires: + +- benchmark-driven thresholds +- explicit rollout and rollback procedures +- visibility into queue backlog and controller saturation + +### 9.2 Transparency + +The design must expose queue state in ways users can understand without reading +controller logs or raw conditions. The UI and observability model therefore must +present effective fair-share state, workload priority, ownership boundaries, and +pending reasons clearly. + +### 9.3 Scalability + +The design must support large pending backlogs without assuming that listing all +raw objects remains cheap. This is why the architecture treats visibility APIs, +metrics, and summary views as primary interfaces rather than optional extras. + +### 9.4 Operability + +Cluster admins must be able to: + +- inspect community and project usage quickly +- pause or drain queues safely +- identify whether bottlenecks are in Kueue, the Kubernetes API, or the + submission path +- run repeatable benchmark suites before changing policy or scale assumptions + +## 10. Risks and technical debt + +This section identifies the major known risks. + +- The current repository baseline is behind the target capability set and will + require careful Kueue upgrade work before all target features are usable. +- Fairness is easy to misinterpret. 
Users may perceive a correct fair-share + result as unfair if the UI does not explain recent usage and priority clearly. +- The one shared namespace is operationally simple now, but it may become a + pressure point for RBAC and visibility policy as more workload types are + onboarded. +- Flexible workloads with changing resource shape can challenge request-based + admission semantics and will need careful validation before broader use. +- The control service is required by the architecture but remains out of scope + for immediate implementation, so integration points must stay explicit. +- MultiKueue and topology-aware scheduling remain future capabilities with + meaningful operational constraints and must not be promised as phase 1 + features. + +## 11. Decisions summary + +This section summarizes the architectural decisions captured in detail by the +ADR set. + +- Use Kueue as the admission and quota orchestration system. +- Model communities as `ClusterQueue` objects and projects as `LocalQueue` + objects. +- Keep one shared `workloads` namespace now and treat namespace evolution as a + future roadmap item. +- Use a standalone accounting and control service as the future system of + record. +- Use fair-share weights for project competition and workload priority for + project-internal ordering. +- Keep persistent workloads protected and interactive workloads higher priority + than batch workloads. +- Preserve a MultiKueue-ready vocabulary even while deploying single-cluster + first. + +See [the ADR index](./adrs/README.md) for the full decision log. + +## 12. Glossary + +This glossary standardizes the key terms used across the package. 
+ +- Community: the top-level resource owner in CANFAR; implemented as a + `ClusterQueue` +- Cohort: a set of `ClusterQueue` objects that may lend or borrow capacity +- Project: the scheduler-facing tenant inside a community; implemented as a + `LocalQueue` +- POSIX group: an identity grouping used for project membership and submission + resolution +- Workload class: a platform-level category such as interactive, persistent, or + batch +- Fair-share weight: the configurable project weighting used for community-local + project competition +- Workload priority: the ordering signal used inside a project's queue +- Control service: the future standalone service that stores communities, + projects, mappings, and accounting relationships diff --git a/configs/kueue/docs/operations.md b/configs/kueue/docs/operations.md new file mode 100644 index 00000000..0595f6ba --- /dev/null +++ b/configs/kueue/docs/operations.md @@ -0,0 +1,258 @@ +# CANFAR Kueue operations appendix + +This document defines the operational model for the CANFAR Kueue architecture. +It covers SLOs, observability, alerting, runbooks, rollout, and rollback. Use +it together with [architecture.md](./architecture.md), [roadmap.md](./roadmap.md), +[ui-spec.md](./ui-spec.md), and [the ADR index](./adrs/README.md). + +## 1. Operational objectives + +The Kueue platform must remain understandable and recoverable under pressure. +Operators need to know whether the problem is in Kueue, the Kubernetes API, the +submission path, or the workload mix itself. + +This appendix therefore focuses on: + +- service-level objectives +- the metrics and dashboards that support those objectives +- incident runbooks +- safe rollout and rollback mechanics + +## 2. Service Level Indicators (SLIs) and Objectives (SLOs) + +The following SLOs form the initial operating contract. They are subject to +refinement after benchmark evidence is gathered. 
+ +### 2.1 Submission and admission SLOs + +| SLI | Target | +| --- | ------ | +| `Skaha` successful submission response rate | `>= 99.9%` over 30 days | +| `Skaha` P95 create-session latency under nominal load | `<= 2s` | +| Kueue P95 admission wait for interactive workloads under nominal conditions | `<= 30s` | +| Kueue P95 admission wait for standard batch under nominal backlog | `<= 10m` | +| Visibility API P95 latency for paged pending queries | `<= 2s` | + +### 2.2 Control-plane health SLOs + +| SLI | Target | +| --- | ------ | +| Kueue controller availability | `>= 99.95%` over 30 days | +| Zero unplanned controller crash loops | Required | +| Kubernetes API P99 write latency for workload creation under nominal load | `<= 1s` | +| Kubernetes API P99 read latency for queue visibility queries | `<= 1s` | + +### 2.3 Fairness and stability SLOs + +| SLI | Target | +| --- | ------ | +| Unexplained pending-state responses in UI | `0` | +| Preemptions without user-visible reason category | `0` | +| Queue-policy changes without rollback path | `0` | +| Benchmark-backed backlog ceiling published and current | Required | + +## 3. Metrics and dashboards + +The platform needs dashboards that show both tenant policy and control-plane +health. Kueue metrics alone are not enough. + +### 3.1 Required metrics + +Use Kueue, Kubernetes, and platform metrics together.
Important Kueue metrics +include: + +- `kueue_pending_workloads` +- `kueue_admission_wait_time_seconds` +- `kueue_cluster_queue_resource_usage` +- `kueue_cluster_queue_nominal_quota` +- `kueue_evicted_workloads_total` +- `kueue_local_queue_evicted_workloads_total` +- `kueue_admitted_workloads_total` +- `kueue_finished_workloads_total` + +Important Kubernetes and control-plane metrics include: + +- apiserver request latency and error metrics +- etcd latency and saturation indicators +- controller pod restart count and RSS memory +- scheduler latency for admitted workloads +- `Skaha` request latency and error rate + +### 3.2 Required dashboards + +Create the following dashboards at minimum: + +- Community ownership and quota view +- Project fair-share and queue-position view +- Pending workload health by class and queue +- Admission latency and throughput +- Preemption and eviction reason view +- Controller and API-server health +- End-to-end submission latency + +### 3.3 Ownership + +Dashboard ownership must be explicit: + +- Platform team owns Kueue, Kubernetes API, and rollout health dashboards +- `Skaha` owners own submission-path latency and error dashboards +- Future control-service owners own tenant and override workflow dashboards + +## 4. Alerts + +Alerts must drive action, not noise. Each alert needs a linked runbook. 
+ +### 4.1 High-severity alerts + +- Kueue controller unavailable or crash looping +- Kubernetes API write latency above threshold for sustained periods +- Visibility API unavailable or returning sustained errors +- Preemption storm above agreed threshold +- Shared `workloads` namespace submission failures above threshold + +### 4.2 Medium-severity alerts + +- Kueue controller memory growth beyond normal envelope +- Pending backlog growing while admission throughput collapses +- Community reclaim behavior not restoring owned capacity in time +- Fair-share override still active past approved window + +### 4.3 Low-severity alerts + +- Project or community configuration drift +- Dashboard ingestion gaps +- Non-critical benchmark regression signals + +## 5. Runbooks + +These runbooks define the minimum operational response set. + +### 5.1 API slowness or admission collapse + +Symptoms: + +- `Skaha` create requests slow down +- Kueue admission wait time rises across multiple workload classes +- Kubernetes API latency or error rate rises + +Actions: + +1. Confirm whether the bottleneck is `Skaha`, Kueue, apiserver, or etcd. +2. Check Kueue controller restarts, memory, and work-queue saturation. +3. Check apiserver latency and request load by verb and resource. +4. Reduce new batch pressure if the submission path is part of the problem. +5. Capture the incident state for later benchmark comparison. + +### 5.2 Kueue memory pressure + +Symptoms: + +- controller RSS grows rapidly +- controller pod restarts or is OOM-killed + +Actions: + +1. Inspect backlog size, active queue count, and recent submission burst. +2. Check whether API latency is causing request buildup. +3. Reduce submission pressure if required. +4. Apply emergency queue hold or drain policy only if the system cannot recover + safely. +5. Record controller configuration and backlog context for follow-up analysis. 
+ +### 5.3 Queue stall or unfairness complaint + +Symptoms: + +- users report that workloads are not moving +- one project appears to dominate or starve unexpectedly + +Actions: + +1. Inspect the affected `LocalQueue` and community `ClusterQueue`. +2. Confirm project fair-share weights and recent usage history. +3. Confirm workload priorities inside the project. +4. Confirm whether the issue is community reclaim, project fair share, or lack + of physical resources. +5. Communicate the cause using the standard pending explanation categories. + +### 5.4 Preemption storm + +Symptoms: + +- rapid increase in preempted or evicted workloads +- user complaints about unstable interactive or persistent work + +Actions: + +1. Confirm whether the preemptions are due to community reclaim, project-local + priority, or protection policy misconfiguration. +2. Check recent weight overrides and policy changes. +3. Pause further policy rollout until the behavior is understood. +4. Revert the triggering policy if needed. + +### 5.5 Visibility failure + +Symptoms: + +- UI cannot explain pending reasons +- pending-workloads visibility API times out or returns errors + +Actions: + +1. Confirm whether the failure is in the UI, metrics layer, or visibility API. +2. Fall back to `kubectl` and Grafana-based diagnosis. +3. Restore the visibility service before resuming major policy changes. + +## 6. Rollout + +Every Kueue upgrade or policy change must follow a controlled rollout. + +### 6.1 Rollout steps + +1. Validate the change in a non-production environment. +2. Capture pre-change benchmarks and key health metrics. +3. Apply the change through the source-of-truth deployment method. +4. Watch controller health, admission timing, and API latency. +5. Verify queue visibility and pending explanation behavior. +6. Run a defined smoke test for interactive and batch submission. +7. Mark the rollout complete only after the health window closes cleanly. 
+ +### 6.2 Rollout guardrails + +- Do not change fairness and preemption policy during an unresolved incident. +- Do not combine Kueue upgrades with unrelated tenant policy changes unless the + rollback method covers both. +- Do not announce a new backlog ceiling without fresh benchmark evidence. + +## 7. Rollback + +Rollback must be designed before rollout begins. + +### 7.1 Rollback triggers + +- repeated controller restarts +- sustained admission collapse +- unacceptable API latency regression +- broken queue visibility +- unexpected preemption of protected workload classes + +### 7.2 Rollback steps + +1. Stop further policy changes. +2. Revert the deployment or configuration to the last known-good state. +3. Confirm controller stability and API recovery. +4. Verify that queue visibility still works. +5. Record the incident details and the exact rollback trigger. + +## 8. Evidence and reporting + +Operational claims must be backed by evidence. Keep the following artifacts for +major changes: + +- benchmark results and plots +- controller and API latency dashboards +- admission wait time summaries by workload class +- preemption and eviction summaries +- incident notes and rollback evidence when applicable + +These artifacts are release inputs, not optional attachments. diff --git a/configs/kueue/docs/reportstyle.markdown b/configs/kueue/docs/reportstyle.markdown new file mode 100644 index 00000000..6604129e --- /dev/null +++ b/configs/kueue/docs/reportstyle.markdown @@ -0,0 +1,303 @@ +# CANFAR architecture report style + +This document captures the report-writing preferences inferred from the current +CANFAR Kueue architecture package. It is based on the edited state of +`architecture.md`, `roadmap.md`, `operations.md`, `ui-spec.md`, and the ADR +set. + +The goal is not to define a generic technical writing style. The goal is to +capture the specific way these CANFAR architecture and planning reports are +meant to read. + +## 1. 
Core writing motivation + +The core motivation behind this report style is clarity for decision-makers, +operators, and technical reviewers. + +These reports are not meant to be literary, conversational, or highly abstract. +They are meant to: + +- explain a system design clearly +- support architectural review and technical decision-making +- preserve operational intent +- make scope and ownership boundaries obvious +- connect policy, implementation, and future roadmap in one narrative + +The writing therefore favors directness, structure, and explicit reasoning over +personality, flourish, or overly soft phrasing. + +## 2. High-level style traits + +The current edits consistently show the following preferences. + +### 2.1 Formal report framing + +Document titles are framed as formal report titles, not lightweight notes. For +example: + +- `CANFAR Science Platform Kueue Architecture` +- `CANFAR Kueue Roadmap` +- `Architecture Decision Records` + +This style treats each document as part of a reviewable architecture package. + +### 2.2 Direct, declarative tone + +The preferred tone is factual and assertive. Statements are written as design +claims, requirements, or observations. The writing avoids unnecessary hedging +and avoids casual filler. + +Preferred pattern: + +- "The current repository baseline still reflects an older Kueue deployment." +- "The architecture serves the following groups." +- "This phase focuses on making scheduling behavior understandable to users and + admins." + +Avoid: + +- conversational asides +- rhetorical questions +- motivational language +- vague claims without operational meaning + +### 2.3 Platform-first wording + +The reports should speak about the CANFAR Science Platform as a real operating +environment, not as a generic software project. The preferred writing makes the +platform identity explicit and keeps the narrative centered on actual tenant, +operator, and workload needs. 
+ +Preferred pattern: + +- "The CANFAR Science Platform needs..." +- "This document is the primary architecture reference for Kueue rollout on the + CANFAR Science Platform." + +## 3. Structural preferences + +The current edits show a strong preference for highly structured reports. + +### 3.1 Numbered major sections + +Use numbered top-level sections and numbered subsections. This makes the reports +easy to review, annotate, and discuss in meetings. + +Preferred pattern: + +- `## 1. Introduction` +- `### 1.1 Problem statement` +- `### 1.2 Users and stakeholders` + +### 3.2 Title Case section headings + +The current edits move headings toward Title Case for important report +subsections. + +Preferred pattern: + +- `Success Criteria` +- `Quality Goals` +- `Repository and Deployment Baseline` +- `Service Level Indicators (SLIs) and Objectives (SLOs)` + +This indicates a preference for a formal report look over sentence-case prose +headings. + +### 3.3 Short overview paragraph before lists + +Each section begins with a short framing paragraph before bullets or numbered +items. This is important. The reports are not lists with headings attached. They +are structured narratives that then break into lists. + +### 3.4 Parallel list structure + +Bullets are short, parallel, and information-dense. Lists are used to enumerate +requirements, deliverables, activities, or consequences, not for decorative +formatting. + +Preferred pattern: + +- "community-owned resources remain a first-class concept" +- "communities can lend and borrow unused capacity" +- "projects compete fairly inside their community" + +## 4. Content preferences + +The edits reveal several consistent content choices. + +### 4.1 Remove repo noise from narrative sections + +When discussing the current baseline, the preferred report style summarizes the +state directly instead of embedding too many file-path references inside the +main prose.
+ +Preferred pattern: + +- "Current deployment documents `0.11.6` as the installed release" +- "Current deployment uses `batch/job` only" + +This keeps the main report readable. File-level evidence can still exist, but +the report itself should read like an architecture document, not a code review. + +### 4.2 Keep implementation awareness without dropping into code detail + +The preferred style is technically specific, but not source-code heavy. It names +real systems, CRDs, workloads, and policies, but it does not drown the report +in low-level manifest detail unless the detail matters for a decision. + +### 4.3 Emphasize scope boundaries + +The reports should say clearly what is in scope, what is out of scope, and what +is future work. + +Preferred pattern: + +- "This service is out of scope for phase 1 implementation, but it is in scope + for architecture and requirements." +- "Those namespace changes are future roadmap items, not phase 1 requirements." + +This is one of the strongest recurring preferences in the edits. + +### 4.4 Make ambiguity explicit + +The report style does not hide open issues. It records them directly and +operationally. + +Preferred pattern: + +- "The submission path can resolve project and community deterministically for + the selected mapping model, or require explicit project selection when the + mapping model is ambiguous." + +This suggests a strong preference for showing the real operational implication of +an open decision rather than just saying that a question remains open. + +## 5. Communication style preferences + +The communication style is best described as formal, practical, and reviewable. + +### 5.1 Write for architects, operators, and reviewers + +The reports should read as if they are written for: + +- architecture reviewers +- platform operators +- technical leads +- future implementers + +They should not read like marketing material or general onboarding content. 
+ +### 5.2 Prefer precise nouns over expressive prose + +The edits favor terms like: + +- baseline +- target state +- future state +- deliverables +- dependencies +- acceptance criteria +- ownership +- scope +- consequences + +This reflects a preference for decision and execution vocabulary. + +### 5.3 Prefer explicit operational language + +When possible, use operationally meaningful wording: + +- "rollback" +- "admission collapse" +- "preemption storm" +- "safe operating region" +- "measured thresholds" + +This style grounds the report in real system behavior rather than abstract +architecture theory. + +## 6. Writing rules to preserve this style + +Use these rules when writing future CANFAR architecture reports. + +### 6.1 Titles and headings + +- Use strong document titles that name the platform and the document purpose. +- Use numbered sections. +- Use Title Case for major subsection headings when the document is formal and + report-like. + +### 6.2 Paragraph style + +- Start sections with a short framing paragraph. +- Keep paragraphs compact and high-signal. +- Use direct statements. +- Avoid unnecessary hedging unless uncertainty itself is the point. + +### 6.3 Lists + +- Use bullets for requirements, traits, risks, and deliverables. +- Use numbered lists for flows, steps, or ordered explanations. +- Keep list items parallel and concise. + +### 6.4 Scope and decisions + +- State what is current, what is target, and what is future. +- Mark out-of-scope items clearly. +- Record open decisions in a way that shows operational impact. +- Tie every recommendation to either architecture, operations, or roadmap + intent. + +### 6.5 References and evidence + +- Keep the main report readable first. +- Use file paths, manifests, and implementation references sparingly in the main + narrative. +- Prefer summarized statements in the report body and detailed references in + supporting material. + +## 7. 
Preferred report voice + +The best voice for these reports is: + +- formal +- technical +- direct +- practical +- audit-friendly +- operator-aware + +The voice is not: + +- casual +- academic in an abstract sense +- promotional +- speculative without labeling the speculation + +## 8. Template for future reports + +Use the following pattern for future architecture or roadmap reports: + +1. Start with a direct statement of purpose. +2. State the current baseline or current problem plainly. +3. Define the target design or target operating model. +4. Separate current, target, and future state clearly. +5. Use lists for requirements, activities, risks, and acceptance criteria. +6. Keep open decisions visible and explain their impact. +7. End sections with operational implications, not just abstract conclusions. + +## 9. Summary + +Your report-writing preference is not simply "formal technical writing." It is a +specific style optimized for platform architecture review: + +- strong document framing +- direct declarative language +- explicit scope boundaries +- operationally meaningful wording +- high-structure sectioning +- open decisions recorded with consequences +- readable narrative without excessive repo-level clutter + +That combination is what gives the package its current voice. diff --git a/configs/kueue/docs/roadmap.md b/configs/kueue/docs/roadmap.md new file mode 100644 index 00000000..f683b259 --- /dev/null +++ b/configs/kueue/docs/roadmap.md @@ -0,0 +1,277 @@ +# CANFAR Kueue Roadmap + +This roadmap turns the architecture into an execution plan with measurable phase +exits. Use it together with [architecture.md](./architecture.md), [operations.md](./operations.md), +[ui-spec.md](./ui-spec.md), and [the ADR set](./adrs/README.md). 
+ +## Roadmap principles + +This roadmap follows four principles: + +- Prove policy and control-plane behavior before claiming scale +- Keep one shared `workloads` namespace first, but preserve future namespace and + MultiKueue compatibility +- Separate what Kueue owns from what the future control service owns +- Use evidence-based phase exits instead of subjective readiness claims + +## Phase 0: Architecture closure and upgrade prerequisites + +This phase closes the design, records the decisions, and prepares the current +older repository baseline for a safer Kueue upgrade. + +### Deliverables + +- Approved architecture package in `configs/kueue/docs` +- Confirmed target vocabulary for community, project, cohort, flavor, and + workload class +- Inventory of current repository gaps between the deployed baseline and the + target feature set +- Upgrade preflight checklist for current Kueue controller configuration and + CRDs + +### Key activities + +- Validate the current Kueue deployment and CRD baseline against target + features such as Admission Fair Sharing, visibility APIs, and later topology + support +- Confirm the initial communities and example queue structure for `cadc`, `ska`, + and `chimefrb` +- Define the target `workloads` namespace and identify migration tasks from the + current namespace configuration +- Confirm the initial workload-class vocabulary: interactive, persistent, + batch, and advanced distributed + +### Dependencies + +- Architecture approval +- Access to current cluster configuration and deployment history + +### Acceptance criteria + +- The architecture package is merged and accepted as the design baseline +- The upgrade preflight checklist exists and identifies all current config + mismatches +- No unresolved architectural blocker remains except ADR-006 + +## Phase 1: Core Kueue platform hardening + +This phase upgrades and hardens the Kueue control plane to support the target +tenant model and observability baseline. 
+ +### Deliverables + +- Updated Kueue installation aligned with the target feature baseline +- Shared `workloads` namespace policy +- Initial `ClusterQueue` objects for `cadc`, `ska`, and `chimefrb` +- Initial `ResourceFlavor` taxonomy for cluster, zone, CPU, memory, and GPU +- Prometheus and Grafana coverage for queue, controller, and API health + +### Key activities + +- Upgrade Kueue safely from the current older release line +- Enable the feature gates and controller settings needed for the target model +- Move managed namespace scope to the shared `workloads` namespace +- Introduce community `ClusterQueue` objects and a shared cohort +- Standardize flavor naming and resource coverage +- Turn on the visibility and metrics surfaces needed for later phase evidence + +### Dependencies + +- Phase 0 complete +- Confirmed upgrade window and rollback method + +### Acceptance criteria + +- Kueue runs stably on the target baseline with no repeated controller crashes +- Community `ClusterQueue` objects exist and report metrics cleanly +- The visibility API and core queue metrics are reachable +- Rollback to the previous deployment has been exercised in a non-production + environment + +## Phase 2: Tenancy and control-service integration points + +This phase establishes the community and project model operationally, even if +the full control service is not implemented yet. 
+ +### Deliverables + +- Project `LocalQueue` creation model in the shared namespace +- Project fair-share weight policy and administrative workflow +- Submission resolution rules from user and group context to project and + community +- Requirements contract for the future standalone control service + +### Key activities + +- Define the project naming convention for `LocalQueue` objects +- Establish how `Skaha` resolves the effective project today and later through + the control service +- Introduce project fair-share weights with cluster-admin-only approval +- Record the future control-service API and data needs +- Decide how temporary override requests are surfaced and approved + +### Dependencies + +- Phase 1 complete +- Administrative agreement on tenant naming and ownership boundaries + +### Acceptance criteria + +- New projects can be represented as `LocalQueue` objects on demand +- Project weights are visible and adjustable by cluster admins +- The submission path can resolve project and community deterministically for + the selected mapping model, or require explicit project selection when the + mapping model is ambiguous +- The control-service requirements are specific enough to hand to a separate + implementation effort + +## Phase 3: Visibility UX and policy-aware diagnostics + +This phase focuses on making scheduling behavior understandable to users and +admins. 
+ +### Deliverables + +- Read-only queue explorer backed by Kueue visibility and metrics surfaces +- Pending-state explanation model implemented in the UI or CLI layer +- Resource ownership view by community and project +- Temporary fair-share override request workflow design + +### Key activities + +- Expose `LocalQueue` and `ClusterQueue` visibility in user-facing terms +- Show current effective project fair-share position and weight +- Show workload priority and protected workload-class policy +- Explain delays as one of the approved explanation categories +- Provide cluster-admin visibility into community reclaim and borrowing state + +### Dependencies + +- Phase 2 complete +- Stable visibility API and metrics from Phase 1 + +### Acceptance criteria + +- A user can inspect a workload and receive an actionable pending explanation +- A project admin can see project weight and current community position +- A cluster admin can inspect cohort borrowing and community reclaim behavior +- The UI language matches the vocabulary defined in `ui-spec.md` + +## Phase 4: Scale, benchmark, and operational proof + +This phase proves backlog scale and control-plane behavior with repeatable test +artifacts. 
+ +### Deliverables + +- `kueuer` benchmark suites for raw Kueue and end-to-end `Skaha` pressure +- Measured thresholds for backlog growth and control-plane degradation +- Evidence pack for `10k`, `50k`, `100k`, and `200k` backlog gates +- Clear stop or go criteria for larger backlog claims + +### Key activities + +- Extend benchmark coverage to user-path submission and visibility stress +- Record admission timing, controller health, API server latency, and pending + backlog behavior +- Measure user-facing interactive behavior during large batch backlog +- Capture failure modes such as memory pressure, API slowness, and visibility + degradation + +### Dependencies + +- Phases 1 through 3 complete +- Bench environments and monitoring available + +### Acceptance criteria + +- Benchmark suites run repeatably and produce comparable artifacts +- The team can state a measured safe operating region for backlog size +- The evidence distinguishes Kueue bottlenecks from API-server or etcd + bottlenecks +- Interactive and protected workloads retain acceptable service behavior during + backlog tests + +## Phase 5: Future capability evaluation + +This phase evaluates advanced Kueue capabilities without making them phase 1 +production promises. 
+ +### Deliverables + +- Evaluation of topology-aware scheduling for MPI, JobSet, Ray, and GPU work +- Evaluation of advanced controller integrations such as `JobSet`, MPI, and Ray +- Evaluation of elastic workload applicability for CANFAR batch patterns +- Decision updates or new ADRs for supported future capabilities + +### Key activities + +- Test topology-aware scheduling against real node and fabric labels +- Test distributed workload semantics with `waitForPodsReady` where needed +- Evaluate whether elastic workloads help high-parallelism batch patterns +- Evaluate whether protected persistent or interactive workloads need stronger + isolation than the initial single-plane model + +### Dependencies + +- Phase 4 evidence complete +- Access to representative GPU, network, and multi-pod job environments + +### Acceptance criteria + +- Each advanced capability has a documented fit, risk, and recommendation +- New workload classes are not promoted to production without measured evidence +- Any capability that remains unsuitable is documented clearly rather than left + ambiguous + +## Phase 6: Optional MultiKueue federation + +This phase introduces manager and worker cluster federation only if the evidence +supports the need. 
+ +### Deliverables + +- MultiKueue proof of concept +- Manager and worker flavor and queue mapping design +- Operational model for manager and worker observability and failure handling +- Migration criteria for deciding when to move from one cluster to many + +### Key activities + +- Define manager and worker queue vocabulary and synchronization rules +- Decide how worker clusters map to community ownership and specialized hardware +- Test manager visibility and worker execution state consistency +- Define site-placement and failure-domain policy + +### Dependencies + +- Phases 1 through 5 complete +- Evidence that single-cluster operation is insufficient or too risky + +### Acceptance criteria + +- The team has a documented reason to federate, not just a theoretical interest +- A MultiKueue proof of concept validates the community and project model +- Manager and worker failure handling is documented and testable + +## Cross-phase risks + +These risks apply across the whole roadmap: + +- Users may interpret fair-share behavior as arbitrary if visibility lags behind + policy rollout +- The control service may become the gating dependency for later phases if its + requirements stay vague +- Namespace evolution may be forced earlier if workload-class policies diverge + faster than expected +- Advanced features such as topology-aware scheduling or elastic workloads may + reveal integration limits that require new ADRs + +## Exit criteria for the package + +The roadmap is complete when each phase can answer four questions: + +- What is being delivered? +- What evidence proves it works? +- What dependencies and risks apply? +- What operator or user behavior changes when the phase lands? 
diff --git a/configs/kueue/docs/ui-spec.md b/configs/kueue/docs/ui-spec.md new file mode 100644 index 00000000..a2efe283 --- /dev/null +++ b/configs/kueue/docs/ui-spec.md @@ -0,0 +1,222 @@ +# CANFAR Kueue visibility and UI specification + +This document defines the user-facing and admin-facing product surface for the +CANFAR Kueue architecture. It does not describe a finished implementation. It +defines the information model, workflows, and explanation language that the UI +must support. + +Use this document together with [architecture.md](./architecture.md), [operations.md](./operations.md), +[roadmap.md](./roadmap.md), and [the ADR index](./adrs/README.md). + +## 1. Product goals + +The UI exists to make queue and ownership behavior understandable. It must not +become a generic Kubernetes portal. It must explain why work is pending, what +resources a community owns, and what position a project holds inside its +community. + +The UI must support: + +- science users who need workload status and explanation +- project admins who need project-level fairness and ownership visibility +- cluster admins who need policy, override, and incident visibility + +## 2. Personas + +This section defines the primary personas and what each one needs. + +### 2.1 Science user + +The science user needs to: + +- submit work through `Skaha` +- see current workload state +- understand why work is pending +- distinguish priority delays from quota or capacity delays + +The user does not need raw CRD editing or full Kubernetes visibility. + +### 2.2 Project administrator + +The project administrator needs to: + +- see which projects exist in a community +- see which POSIX groups attach to a project +- understand project fair-share position and weight +- request temporary fair-share overrides through the control-service workflow + +The project administrator does not approve overrides directly. 
+ +### 2.3 Cluster administrator + +The cluster administrator needs to: + +- inspect community ownership and borrow or reclaim behavior +- see and adjust project fair-share weights +- approve or reject temporary override requests +- diagnose fairness, preemption, and visibility problems + +## 3. Primary views + +This section defines the minimum view set the product must expose. + +### 3.1 Queue explorer + +The queue explorer is the main entry point for queue visibility. It must show: + +- community +- project +- workload class +- current queue position where available +- effective fair-share state +- workload priority +- pending reason summary + +### 3.2 Resource ownership view + +The resource ownership view must show: + +- which resources each community owns +- current usage against owned quota +- borrowing or lending state +- key `ResourceFlavor` breakdowns such as CPU, memory, GPU, cluster, and zone + +This view is important for both project admins and cluster admins. + +### 3.3 Project fairness view + +The project fairness view must show: + +- current fair-share weight +- recent effective usage +- relative position inside the community +- whether a temporary override is active +- who approved the override and when it expires + +### 3.4 Workload detail view + +The workload detail view must show: + +- workload class +- priority class +- selected queue +- community and project identity +- active, pending, admitted, running, finished, or preempted state +- pending or preemption explanation using the standard reason language + +## 4. Explanation language + +The UI must use standard explanation categories so users see consistent, +actionable reasons rather than raw controller text. 
+ +### 4.1 Approved pending reasons + +Use the following pending reason categories: + +- Waiting behind other projects with better current fair-share position +- Waiting behind higher-priority work in the same project +- Waiting for community-owned or borrowed resources to become available +- Waiting because the project or policy does not currently allow admission +- Waiting because the platform control plane is degraded + +### 4.2 Approved rejection or policy reasons + +Use the following rejection categories: + +- Project or community could not be resolved +- Submission is missing required queue or policy metadata +- Workload requests do not satisfy platform requirements +- External control-service policy denied the request + +### 4.3 Approved preemption reasons + +Use the following preemption categories: + +- Preempted because the owning community reclaimed its quota +- Preempted by higher-priority work inside the same community +- Evicted because platform recovery policy triggered +- Evicted because queue stop or drain policy was applied + +## 5. Workflows + +This section defines the primary workflows the UI must support. + +### 5.1 User workload inspection + +The user inspects a workload and sees: + +1. Which community and project it belongs to +2. Which queue it targets +3. Which workload class and priority apply +4. Why it is waiting or what preempted it +5. What the next likely action is + +### 5.2 Project admin fairness inspection + +The project admin inspects a project and sees: + +1. Current fair-share weight +2. Current community-relative position +3. Current active and pending work counts +4. Whether an override exists or has expired +5. Which POSIX groups attach to the project + +### 5.3 Temporary override request + +The request flow is: + +1. Project admin selects a project +2. Project admin proposes a temporary weight increase and business reason +3. Control-service workflow records the request +4. 
Cluster admin approves, rejects, or modifies the request +5. UI displays approval state and expiry time + +The downstream accounting or quota-cost model for the override is out of scope +for this document, but the UI must display that such a cost model exists. + +## 6. Phased product surface + +The UI is phased to follow the roadmap; phase numbers here are UI-specific. + +### Phase 1 + +Use `kubectl`, Grafana, and the Kueue visibility API. No custom UI is required +yet beyond operator tooling. + +### Phase 2 + +Introduce a read-only queue visibility UI with workload detail, project fairness +view, and community ownership view. + +### Phase 3 + +Introduce admin workflows such as temporary override requests and richer project +and mapping inspection. + +### Phase 4 + +Introduce guided submission hints, recommended queue or flavor explanations, and +self-service debugging aids if the earlier phases prove stable. + +## 7. Data requirements + +The UI needs data from multiple sources: + +- Kueue visibility API for queue position and pending workloads +- Kueue metrics for quota, admission latency, and queue health +- Kubernetes workload state for current execution status +- Control-service metadata for projects, groups, ownership, and overrides + +The UI must not depend on full raw workload listing for basic queue views when +the backlog is large. + +## 8. Non-goals + +This UI specification does not require: + +- raw Kubernetes object editing +- a full generic cluster dashboard +- per-user accounting implementation +- automated override approval logic + +The goal is clarity and policy transparency, not a complete platform portal. 
diff --git a/configs/kueue/kueuer/AGENTS.md b/configs/kueue/kueuer/AGENTS.md new file mode 100644 index 00000000..c41251ec --- /dev/null +++ b/configs/kueue/kueuer/AGENTS.md @@ -0,0 +1,13 @@ +## Learned User Preferences +- Prefer testing workflows that continue under restricted production RBAC by treating control-plane visibility checks as optional and collecting partial metrics. +- Prefer practical workarounds for production constraints over cluster-level changes that require broader platform modifications. +- Prefer inspecting node or cluster-wide inventory via the Kubernetes API (or other narrowly scoped API calls) instead of broad `kubectl get nodes` when that matches environment constraints. +- Prefer `kr cluster resources` output that uses IEC binary byte units (B, KiB, …, PiB) and at most three decimal places for displayed CPU and byte quantities. + +## Learned Workspace Facts +- Benchmarking and preflight work for this area is centered in `configs/kueue/kueuer`. +- The production context used for tests has workload namespace access (for example, `canfar-kueue-testing`) but limited visibility into `kueue-system`, which restricts control-plane metric collection. +- Default stress VM memory fraction for benchmark jobs is `0.33` (`DEFAULT_STRESS_VM_MEMORY_FRACTION`); override with `--vm-memory-fraction` when needed. +- Benchmark job submission supports `--spawn-mechanism kubectl` (default, chunked `kubectl apply`) or `api` (Python client create-only with bounded concurrency and retries). +- `kr cluster resources` groups totals by `--node-label` (CLI default `skaha.opencadc.org/node-type`); `kueuer.resources.total()` requires `node_label` explicitly and does not default it in library code. +- Grouped results include per-bucket `count` (nodes) and per-product GPU lists; when `nvidia.com/gpu` capacity is zero or missing, counts may come from `nvidia.com/gpu.count` with kind from `nvidia.com/gpu.product` (e.g. MIG-style nodes). 
diff --git a/configs/kueue/kueuer/src/kueuer/resources.py b/configs/kueue/kueuer/src/kueuer/resources.py index b0ea73d5..7f5ff25c 100644 --- a/configs/kueue/kueuer/src/kueuer/resources.py +++ b/configs/kueue/kueuer/src/kueuer/resources.py @@ -8,17 +8,19 @@ # ] # /// """ -Totals cluster resources across Kubernetes nodes filtered by name regex. +Totals cluster resources across Kubernetes nodes. -- Deduplicates nodes by UID (so overlapping regex lists don't double count). +- Deduplicates nodes by UID when listing. - By default totals from node .status.capacity; use --field allocatable to sum .status.allocatable instead. -- Results are grouped by a configurable node label (see CLI ``--node-label-key``; - ``total()`` requires ``node_label_key`` with no default in code). Nodes without +- Results are grouped by a configurable node label (see CLI ``--node-label``; + ``total()`` requires ``node_label`` with no default in code). Nodes without the label are grouped under ``""``. Each group has ``count`` (nodes in group), - ``cpu``, ``memory``, ``ephemeral-storage`` (binary **GiB**, 1024³; values up to - 3 decimal places), per-bucket **weights** (same 3 decimal places; pool CPU - cores per GiB / per GPU kind—see ``ResourceWeights``), and GPU lists. + ``cpu``, ``memory``, ``ephemeral-storage`` (binary units per ``--units``; + values up to 3 decimal places), per-bucket **weights** (IEEE-754 binary64, + shortest round-trip decimal strings; normalized to ``--baseline`` (``-b``), + computed in the same byte scale as ``--units``—see ``ResourceWeights``), and + GPU lists. - ``nvidia.com/gpu`` is a list of ``{ "kind", "value", "unit": "count" }`` per distinct ``nvidia.com/gpu.product`` label, summed across nodes. 
When capacity/allocatable reports 0 or omits ``nvidia.com/gpu`` but the NVIDIA @@ -30,22 +32,21 @@ Examples: uv run resources.py - uv run resources.py 'gpu-.*' 'node-1[0-9]' - uv run resources.py --field allocatable --pretty 'worker-.*' + uv run resources.py --field allocatable + uv run resources.py --units Mi --baseline cpu """ from __future__ import annotations -import re import sys from dataclasses import dataclass from decimal import ROUND_HALF_UP, Decimal, getcontext, localcontext -from typing import Annotated, Any, Dict, Iterable, List, Optional, Sequence, cast +from typing import Annotated, Any, Dict, Iterable, List, Optional, cast import typer from kubernetes.client import CoreV1Api, V1Node from kubernetes.utils.quantity import parse_quantity -from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator +from pydantic import BaseModel, ConfigDict, Field, ValidationError from rich.console import Console from typing_extensions import Literal @@ -59,9 +60,18 @@ # Reported fractional precision for CPU, GiB display quantities, and weight ratios. REPORT_MAX_DECIMAL_PLACES = 3 -# Intermediate precision for weight ratio division before rounding to ``REPORT_MAX_DECIMAL_PLACES``. +# Intermediate precision for weight ratio division before rounding to +# ``REPORT_MAX_DECIMAL_PLACES``. _WEIGHT_RATIO_DIV_PREC = max(80, DECIMAL_PRECISION) +# Baseline weight ``1`` as a binary64 round-trip string (matches computed weights). +_WEIGHT_BASELINE_ONE_STR = repr(1.0) + +# ``total()`` / ``--baseline`` accepted resource names. +_ALLOWED_BASELINES: frozenset[str] = frozenset( + ("cpu", "memory", "ephemeral-storage", "nvidia.com/gpu") +) + app = typer.Typer(help="Cluster utilities") # ========================= @@ -76,7 +86,9 @@ class ResourceItem(BaseModel): ) unit: str = Field( ..., - description="Binary GiB for memory/ephemeral totals, 'cores', or 'count'.", + description=( + "Binary unit (B, Ki, Mi, …) for memory/ephemeral, 'cores', or 'count'." 
+ ), ) @@ -92,27 +104,27 @@ class GpuResourceItem(BaseModel): class ResourceWeights(BaseModel): - """Pool-level ratios vs CPU cores (dimensionless); see module docstring for interpretation.""" + """Pool composition weights vs ``--baseline``; see module docstring.""" model_config = ConfigDict(populate_by_name=True) - cpu: str = Field( - default="1", - description="Baseline; other weights are pool CPU per unit of that resource.", + cpu: Optional[str] = Field( + None, + description="IEEE-754 binary64 string; baseline CPU weight is ``1.0``.", ) memory: Optional[str] = Field( None, - description="Pool CPU cores divided by total memory in binary GiB.", + description="IEEE-754 binary64 string vs baseline in ``--units``.", ) ephemeral_storage: Optional[str] = Field( None, serialization_alias="ephemeral-storage", - description="Pool CPU cores divided by total ephemeral storage in binary GiB.", + description="IEEE-754 binary64 string vs baseline in ``--units``.", ) nvidia_gpu: Optional[Dict[str, str]] = Field( None, serialization_alias="nvidia.com/gpu", - description="Per GPU product: pool CPU cores divided by count of that kind.", + description="Per GPU product: IEEE-754 binary64 weight vs baseline.", ) @@ -124,7 +136,7 @@ class NodeTypeResources(BaseModel): count: int = Field( ..., ge=0, - description="Number of nodes in this group (unique nodes after pattern filter).", + description="Number of nodes in this group (unique nodes).", ) cpu: Optional[ResourceItem] = None memory: Optional[ResourceItem] = None @@ -143,17 +155,17 @@ class NodeTypeResources(BaseModel): weights: Optional[ResourceWeights] = Field( None, description=( - "CPU-normalized pool composition weights (decimal strings, same precision " - "as other reported quantities). " - "Omitted if the pool has no CPU total to divide by." + "Pool composition weights (IEEE-754 binary64, shortest round-trip " + "strings). Omitted when the baseline resource total is missing or " + "non-positive." 
), ) class ClusterResourcesResult(BaseModel): - """Cluster resources grouped by ``node_label_key`` label values.""" + """Cluster resources grouped by ``node_label`` label values.""" - node_label_key: str = Field( + node_label: str = Field( ..., description="Kubernetes node label key used to form each group.", ) @@ -161,20 +173,9 @@ class ClusterResourcesResult(BaseModel): class Settings(BaseModel): - patterns: Optional[List[str]] = Field( - default=None, description="Regex patterns for node names." - ) field: Literal["capacity", "allocatable"] = "capacity" pretty: bool = False - @field_validator("patterns") - @classmethod - def validate_patterns(cls, v: Optional[List[str]]) -> Optional[List[str]]: - if v is None: - return None - cleaned = [p for p in (s.strip() for s in v) if p] - return cleaned or None - # ========================= # Internal Calculation Types @@ -203,28 +204,13 @@ def _load_kube() -> CoreV1Api: return k8s.core_v1 -def _compile_patterns(patterns: Optional[Sequence[str]]) -> Optional[List[re.Pattern]]: - if not patterns: - return None - return [re.compile(p) for p in patterns] - - -def _node_matches(name: str, compiled: Optional[List[re.Pattern]]) -> bool: - if compiled is None: - return True - return any(p.search(name) for p in compiled) - - -def _collect_nodes(v1: CoreV1Api, patterns: Optional[Sequence[str]]) -> List[V1Node]: - compiled = _compile_patterns(patterns) +def _collect_nodes(v1: CoreV1Api) -> List[V1Node]: all_nodes = v1.list_node().items - # Deduplicate by UID so overlapping regex patterns don't double count dedup: Dict[str, V1Node] = {} for n in all_nodes: name = n.metadata.name or "" - if _node_matches(name, compiled): - uid = n.metadata.uid or name # Fallback to name if UID missing - dedup[uid] = n + uid = n.metadata.uid or name # Fallback to name if UID missing + dedup[uid] = n return list(dedup.values()) @@ -277,7 +263,9 @@ def _node_nvidia_gpu_contrib( return None -def _gpu_kind_totals_to_list(by_kind: Optional[Dict[str, int]]) 
-> Optional[List[GpuResourceItem]]: +def _gpu_kind_totals_to_list( + by_kind: Optional[Dict[str, int]], +) -> Optional[List[GpuResourceItem]]: """Convert per-kind counts to a stable list for JSON output.""" if not by_kind: return None @@ -290,7 +278,7 @@ def _gpu_kind_totals_to_list(by_kind: Optional[Dict[str, int]]) -> Optional[List def _format_decimal_report(value: Decimal) -> str: - """Stringify a non-negative Decimal with at most ``REPORT_MAX_DECIMAL_PLACES`` places.""" + """Stringify a non-negative Decimal with at most ``REPORT_MAX_DECIMAL_PLACES``.""" if value < 0: raise ValueError("value must be non-negative") q = Decimal("1").scaleb(-REPORT_MAX_DECIMAL_PLACES) @@ -301,85 +289,221 @@ def _format_decimal_report(value: Decimal) -> str: return s -# Binary gibibyte (Kubernetes-style): 1 GiB = 1024³ bytes. -_GIB_BYTES = Decimal(1024**3) +# Binary IEC factors: unit string -> bytes per one unit (B, Ki, Mi, Gi, Ti, Pi). +_BINARY_UNIT_BYTES: Dict[str, int] = { + "B": 1, + "Ki": 1024, + "Mi": 1024**2, + "Gi": 1024**3, + "Ti": 1024**4, + "Pi": 1024**5, +} + +def normalize_binary_unit(units: str) -> str: + """Return canonical binary unit key or raise ``ValueError``.""" + u = units.strip() + if u not in _BINARY_UNIT_BYTES: + allowed = ", ".join(sorted(_BINARY_UNIT_BYTES)) + raise ValueError(f'units must be one of: {allowed} (got "{units}")') + return u -def _bytes_to_binary_gib_decimal(total_bytes: int) -> Decimal: - """Convert byte totals to binary GiB (full ``Decimal``, unrounded).""" + +def _bytes_to_qty_decimal(total_bytes: int, unit: str) -> Decimal: + """Convert byte totals to the selected binary unit (full ``Decimal``, unrounded).""" if total_bytes < 0: raise ValueError("byte total must be non-negative") - return Decimal(total_bytes) / _GIB_BYTES + factor = Decimal(_BINARY_UNIT_BYTES[unit]) + return Decimal(total_bytes) / factor -def _gib_resource_item(total_bytes: int) -> ResourceItem: - """Memory / ephemeral totals: always reported in GiB with limited display 
precision.""" +def _bytes_to_resource_item(total_bytes: int, unit: str) -> ResourceItem: + """Memory / ephemeral totals in ``unit`` with limited display precision.""" if total_bytes == 0: - return ResourceItem(value="0", unit="GiB") - v = _bytes_to_binary_gib_decimal(total_bytes) - return ResourceItem(value=_format_decimal_report(v), unit="GiB") + return ResourceItem(value="0", unit=unit) + v = _bytes_to_qty_decimal(total_bytes, unit) + return ResourceItem(value=_format_decimal_report(v), unit=unit) -def _gib_display_to_bytes(value: Decimal) -> Decimal: - """Interpret a displayed GiB quantity as bytes.""" - return value * _GIB_BYTES +def _display_qty_to_bytes(value: Decimal, unit: str) -> Decimal: + """Interpret a displayed quantity in ``unit`` as bytes.""" + return value * Decimal(_BINARY_UNIT_BYTES[unit]) def _decimal_ratio_string(numerator: Decimal, denominator: Decimal) -> str: - """``numerator / denominator`` rounded to ``REPORT_MAX_DECIMAL_PLACES`` (half-up).""" + """Weight ratio as IEEE-754 binary64 (``float``), shortest round-trip ``repr``.""" if denominator <= 0: raise ValueError("denominator must be positive") with localcontext() as ctx: ctx.prec = _WEIGHT_RATIO_DIV_PREC ratio = numerator / denominator - return _format_decimal_report(ratio) + return repr(float(ratio)) -def _compute_resource_weights(acc: TotalsAcc) -> Optional[ResourceWeights]: - """ - Weights normalize pool totals to a per-CPU baseline: ``cpu`` is 1; other - fields are ``TOTAL_CPU / TOTAL_QUANTITY`` in compatible units (GiB for - memory and ephemeral; per-GPU-kind counts for NVIDIA). - - **Interpretation (heuristic):** For a node pool with totals ``(C, M, E, …)``, - weights map ``(c, m, e, …)`` requests to a linear ``c·1 + m·w_mem + …`` style - score *if* you treat the pool's aggregate ratio as a fixed substitution rate - between CPU and other resources. 
That is a **comparative** normalization, not a - guarantee of schedulability, pricing, or optimal packing—heterogeneous nodes, - fragmentation, and priorities are not captured. - """ - cpu = acc.cpu_cores - if cpu is None or cpu <= 0: +def _nvidia_weights_from_numerator( + by_kind: Optional[Dict[str, int]], + numer: Decimal, +) -> Optional[Dict[str, str]]: + if not by_kind: return None + entries: Dict[str, str] = {} + for kind, cnt in sorted(by_kind.items(), key=lambda kv: (kv[0] == "", kv[0])): + if cnt <= 0: + continue + entries[kind] = _decimal_ratio_string(numer, Decimal(cnt)) + return entries or None - mem_w: Optional[str] = None - if acc.memory_bytes is not None and acc.memory_bytes > 0: - mem_w = _decimal_ratio_string(cpu, _bytes_to_binary_gib_decimal(acc.memory_bytes)) - - eph_w: Optional[str] = None - if acc.ephemeral_bytes is not None and acc.ephemeral_bytes > 0: - eph_w = _decimal_ratio_string(cpu, _bytes_to_binary_gib_decimal(acc.ephemeral_bytes)) - - nv_map: Optional[Dict[str, str]] = None - if acc.nvidia_by_kind: - entries: Dict[str, str] = {} - for kind, cnt in sorted( - acc.nvidia_by_kind.items(), - key=lambda kv: (kv[0] == "", kv[0]), - ): - if cnt <= 0: - continue - entries[kind] = _decimal_ratio_string(cpu, Decimal(cnt)) - nv_map = entries or None +def _nvidia_pool_total(by_kind: Optional[Dict[str, int]]) -> Optional[Decimal]: + """Sum of NVIDIA GPU counts across kinds (pool-wide GPU total).""" + if not by_kind: + return None + t = sum(c for c in by_kind.values() if c > 0) + return Decimal(t) if t > 0 else None + + +def _weight_quantities( + acc: TotalsAcc, + units: str, +) -> tuple[Optional[Decimal], Optional[Decimal], Optional[Decimal]]: + mem_b = acc.memory_bytes + eph_b = acc.ephemeral_bytes + qty_mem = ( + _bytes_to_qty_decimal(mem_b, units) if mem_b is not None and mem_b > 0 else None + ) + qty_eph = ( + _bytes_to_qty_decimal(eph_b, units) if eph_b is not None and eph_b > 0 else None + ) + return (acc.cpu_cores, qty_mem, qty_eph) + + +def 
_weights_baseline_cpu( + acc: TotalsAcc, + qty_cpu: Decimal, + qty_mem: Optional[Decimal], + qty_eph: Optional[Decimal], +) -> Optional[ResourceWeights]: + w_mem = _decimal_ratio_string(qty_cpu, qty_mem) if qty_mem is not None else None + w_eph = _decimal_ratio_string(qty_cpu, qty_eph) if qty_eph is not None else None + w_nv = _nvidia_weights_from_numerator(acc.nvidia_by_kind, qty_cpu) + return ResourceWeights( + cpu=_WEIGHT_BASELINE_ONE_STR, + memory=w_mem, + ephemeral_storage=w_eph, + nvidia_gpu=w_nv, + ) + + +def _weights_baseline_memory( + acc: TotalsAcc, + qty_cpu: Optional[Decimal], + qty_mem: Decimal, + qty_eph: Optional[Decimal], +) -> Optional[ResourceWeights]: + w_cpu = ( + _decimal_ratio_string(qty_mem, qty_cpu) + if qty_cpu is not None and qty_cpu > 0 + else None + ) + w_eph = _decimal_ratio_string(qty_mem, qty_eph) if qty_eph is not None else None + w_nv = _nvidia_weights_from_numerator(acc.nvidia_by_kind, qty_mem) return ResourceWeights( - cpu="1", - memory=mem_w, - ephemeral_storage=eph_w, - nvidia_gpu=nv_map, + cpu=w_cpu, + memory=_WEIGHT_BASELINE_ONE_STR, + ephemeral_storage=w_eph, + nvidia_gpu=w_nv, ) +def _weights_baseline_ephemeral( + acc: TotalsAcc, + qty_cpu: Optional[Decimal], + qty_mem: Optional[Decimal], + qty_eph: Decimal, +) -> Optional[ResourceWeights]: + w_cpu = ( + _decimal_ratio_string(qty_eph, qty_cpu) + if qty_cpu is not None and qty_cpu > 0 + else None + ) + w_mem = _decimal_ratio_string(qty_eph, qty_mem) if qty_mem is not None else None + w_nv = _nvidia_weights_from_numerator(acc.nvidia_by_kind, qty_eph) + return ResourceWeights( + cpu=w_cpu, + memory=w_mem, + ephemeral_storage=_WEIGHT_BASELINE_ONE_STR, + nvidia_gpu=w_nv, + ) + + +def _weights_baseline_nvidia( + acc: TotalsAcc, + qty_nv_total: Decimal, + qty_cpu: Optional[Decimal], + qty_mem: Optional[Decimal], + qty_eph: Optional[Decimal], +) -> ResourceWeights: + """Baseline is total NVIDIA GPU count; per-kind GPU weights use ``qty_nv_total``.""" + w_cpu = ( + 
_decimal_ratio_string(qty_nv_total, qty_cpu) + if qty_cpu is not None and qty_cpu > 0 + else None + ) + w_mem = ( + _decimal_ratio_string(qty_nv_total, qty_mem) if qty_mem is not None else None + ) + w_eph = ( + _decimal_ratio_string(qty_nv_total, qty_eph) if qty_eph is not None else None + ) + w_nv = _nvidia_weights_from_numerator(acc.nvidia_by_kind, qty_nv_total) + return ResourceWeights( + cpu=w_cpu, + memory=w_mem, + ephemeral_storage=w_eph, + nvidia_gpu=w_nv, + ) + + +def _compute_resource_weights( + acc: TotalsAcc, + *, + baseline: str, + units: str, +) -> Optional[ResourceWeights]: + """ + Weights normalize pool totals to ``baseline``: that resource is ``1``; other + fields are ratios in the same byte scale as ``units`` (for memory and + ephemeral) or in GPU counts (for NVIDIA). For ``nvidia.com/gpu``, the + baseline quantity is the **sum** of all NVIDIA GPU counts in the group. + + **Interpretation (heuristic):** comparative normalization, not schedulability. + """ + qty_cpu, qty_mem, qty_eph = _weight_quantities(acc, units) + + if baseline == "cpu": + if qty_cpu is None or qty_cpu <= 0: + return None + return _weights_baseline_cpu(acc, qty_cpu, qty_mem, qty_eph) + + if baseline == "memory": + if qty_mem is None or qty_mem <= 0: + return None + return _weights_baseline_memory(acc, qty_cpu, qty_mem, qty_eph) + + if baseline == "ephemeral-storage": + if qty_eph is None or qty_eph <= 0: + return None + return _weights_baseline_ephemeral(acc, qty_cpu, qty_mem, qty_eph) + + if baseline == "nvidia.com/gpu": + qty_nv = _nvidia_pool_total(acc.nvidia_by_kind) + if qty_nv is None or qty_nv <= 0: + return None + return _weights_baseline_nvidia(acc, qty_nv, qty_cpu, qty_mem, qty_eph) + + raise ValueError(f'unknown baseline: "{baseline}"') + + def _get_field_map(node: V1Node, field: str) -> Dict[str, str]: """ Extract either .status.capacity or .status.allocatable as a plain dict[str, str]. 
@@ -486,7 +610,13 @@ def _try_sum(dec_sum_fn, vals): ) -def _totals_acc_to_node_type_resources(acc: TotalsAcc, node_count: int) -> NodeTypeResources: +def _totals_acc_to_node_type_resources( + acc: TotalsAcc, + node_count: int, + *, + units: str, + baseline: str, +) -> NodeTypeResources: """Build one NodeTypeResources from aggregated totals.""" return NodeTypeResources( count=node_count, @@ -499,18 +629,22 @@ def _totals_acc_to_node_type_resources(acc: TotalsAcc, node_count: int) -> NodeT else None ), memory=( - _gib_resource_item(acc.memory_bytes) + _bytes_to_resource_item(acc.memory_bytes, units) if acc.memory_bytes is not None else None ), ephemeral_storage=( - _gib_resource_item(acc.ephemeral_bytes) + _bytes_to_resource_item(acc.ephemeral_bytes, units) if acc.ephemeral_bytes is not None else None ), nvidia_gpu=_gpu_kind_totals_to_list(acc.nvidia_by_kind), amd_gpu=_gpu_kind_totals_to_list(acc.amd_by_kind), - weights=_compute_resource_weights(acc), + weights=_compute_resource_weights( + acc, + baseline=baseline, + units=units, + ), ) @@ -520,41 +654,50 @@ def _totals_acc_to_node_type_resources(acc: TotalsAcc, node_count: int) -> NodeT def total( - patterns: Optional[List[str]] = None, field: str = "capacity", *, - node_label_key: str, + node_label: str, + units: str = "Gi", + baseline: str = "cpu", ) -> Dict[str, Any]: """ - Calculate total cluster resources across nodes matching regex patterns. + Calculate total cluster resources across all nodes. Args: - patterns: Regex strings for node names. If None or empty, includes all nodes. field: Which field to sum: "capacity" (default) or "allocatable". - node_label_key: Kubernetes node label key used to group results (callers + node_label: Kubernetes node label key used to group results (callers such as the CLI supply the default; this function does not default it). + units: Binary byte unit for memory and ephemeral totals (``B``, ``Ki``, + ``Mi``, ``Gi``, ``Ti``, ``Pi``). 
+ baseline: Resource with weight ``1``; others are expressed per this + baseline in ``units`` for memory/ephemeral. One of ``cpu``, + ``memory``, ``ephemeral-storage``, or ``nvidia.com/gpu`` (total + NVIDIA GPU count). Returns: - A dict with ``node_label_key``, ``by_label_value`` (each key is a label + A dict with ``node_label``, ``by_label_value`` (each key is a label value, or ``\"\"`` if unset), and per-group ``count`` plus resource maps. """ - label_key = node_label_key.strip() + label_key = node_label.strip() if not label_key: - raise ValueError("node_label_key must be a non-empty string") + raise ValueError("node_label must be a non-empty string") + + unit_key = normalize_binary_unit(units) + br = baseline.strip() + if br not in _ALLOWED_BASELINES: + allowed = ", ".join(sorted(_ALLOWED_BASELINES)) + raise ValueError(f'baseline must be one of: {allowed} (got "{baseline}")') # Validate inputs with Pydantic if field not in ("capacity", "allocatable"): raise ValueError('field must be "capacity" or "allocatable"') try: - cfg = Settings( - patterns=patterns, - field=cast(Literal["capacity", "allocatable"], field), - ) + cfg = Settings(field=cast(Literal["capacity", "allocatable"], field)) except ValidationError as e: raise ValueError(str(e)) from e v1 = _load_kube() - nodes = _collect_nodes(v1, cfg.patterns) + nodes = _collect_nodes(v1) by_nt: Dict[str, List[V1Node]] = {} for n in nodes: labels = (n.metadata.labels or {}) if n.metadata else {} @@ -566,10 +709,15 @@ def total( for nt_key in sorted(by_nt.keys(), key=lambda s: (s == "", s)): bucket = by_nt[nt_key] acc = _sum_resources(bucket, cfg.field) - groups[nt_key] = _totals_acc_to_node_type_resources(acc, len(bucket)) + groups[nt_key] = _totals_acc_to_node_type_resources( + acc, + len(bucket), + units=unit_key, + baseline=br, + ) return ClusterResourcesResult( - node_label_key=label_key, + node_label=label_key, by_label_value=groups, ).model_dump( by_alias=True, @@ -583,10 +731,10 @@ def 
_scale_resource_item_inplace(item: Dict[str, Any], scale: Decimal) -> None: v = Decimal(str(item["value"])) if unit == "cores": item["value"] = _format_decimal_report(v * scale) - elif unit == "GiB": - scaled_bytes = _gib_display_to_bytes(v) * scale + elif unit in _BINARY_UNIT_BYTES: + scaled_bytes = _display_qty_to_bytes(v, unit) * scale int_bytes = max(0, int(scaled_bytes.to_integral_value(rounding=ROUND_HALF_UP))) - out = _gib_resource_item(int_bytes) + out = _bytes_to_resource_item(int_bytes, unit) item["value"] = out.value item["unit"] = out.unit elif unit == "count": @@ -596,7 +744,10 @@ def _scale_resource_item_inplace(item: Dict[str, Any], scale: Decimal) -> None: def _scale_cluster_resources_payload(result: Dict[str, Any], scale: Decimal) -> None: - """Multiply numeric ``value`` fields in-place (CLI ``--scale``). Leaves ``weights`` unchanged.""" + """Multiply numeric ``value`` fields in-place (CLI ``--scale``). + + Leaves ``weights`` unchanged. + """ inner = result.get("by_label_value") if not isinstance(inner, dict): return @@ -635,15 +786,6 @@ def list_resource_quotas(namespace: str) -> Dict[str, Any]: @app.command("resources") def resources( - patterns: Annotated[ - Optional[List[str]], - typer.Option( - "-p", - "--pattern", - metavar="PATTERN", - help="Regex pattern for node names. Can be specified multiple times.", - ), - ] = None, field: Annotated[ str, typer.Option( @@ -660,27 +802,51 @@ def resources( help="Scale resources by this percentage.", ), ] = 1.0, - node_label_key: Annotated[ + units: Annotated[ str, typer.Option( - "--node-label-key", + "-u", + "--units", + help=( + "Binary byte unit for memory and ephemeral totals: " + '"B", "Ki", "Mi", "Gi", "Ti", "Pi".' + ), + ), + ] = "Gi", + baseline: Annotated[ + str, + typer.Option( + "-b", + "--baseline", + help=( + "Resource with weight 1 for pool weights: " + '"cpu", "memory", "ephemeral-storage", or "nvidia.com/gpu".' 
+ ), + ), + ] = "cpu", + node_label: Annotated[ + str, + typer.Option( + "-n", + "--node-label", help=( "Node label key used to group totals by label value " - '(default only applies to this CLI, not to total()).' + "(default only applies to this CLI, not to total())." ), ), ] = "skaha.opencadc.org/node-type", ): """ - Sum resources across nodes matching any of the provided regex patterns. + Sum resources across all nodes, grouped by a node label. """ assert field in ["capacity", "allocatable"] assert scale > 0.0 and scale <= 1.0, "Percentage must be in (0, 1]" try: result = total( - patterns or None, field=field, - node_label_key=node_label_key, + node_label=node_label, + units=units, + baseline=baseline, ) console.print(result, width=120) if scale != 1.0: @@ -706,7 +872,9 @@ def resourcequota( """List namespace ResourceQuota objects using the Kubernetes Python client.""" try: response = list_resource_quotas(namespace) - console.print({"response": response, "resource_quotas": response.get("items", [])}) + console.print( + {"response": response, "resource_quotas": response.get("items", [])} + ) except Exception as e: print(f"Error: {e}", file=sys.stderr) raise SystemExit(1) diff --git a/configs/kueue/kueuer/tests/test_resources.py b/configs/kueue/kueuer/tests/test_resources.py index c40f2e00..cf64b1a9 100644 --- a/configs/kueue/kueuer/tests/test_resources.py +++ b/configs/kueue/kueuer/tests/test_resources.py @@ -5,21 +5,22 @@ from decimal import Decimal import pytest -from typer.testing import CliRunner from kubernetes.client import V1Node, V1NodeStatus, V1ObjectMeta +from typer.testing import CliRunner from kueuer.cli import app from kueuer.resources import ( - _bytes_to_binary_gib_decimal, + _bytes_to_qty_decimal, _format_decimal_report, list_resource_quotas, + normalize_binary_unit, total, ) runner = CliRunner() -# Library API has no default for node_label_key; tests use the same key as the CLI default. 
-NODE_LABEL_KEY = "skaha.opencadc.org/node-type" +# Library API has no default for node_label; tests use the same key as the CLI default. +NODE_LABEL = "skaha.opencadc.org/node-type" def _node( @@ -34,11 +35,16 @@ def _node( ) -def test_binary_gib_conversion() -> None: +def test_binary_unit_conversion() -> None: gi = 1024**3 - assert _bytes_to_binary_gib_decimal(20550 * gi) == Decimal("20550") - assert _bytes_to_binary_gib_decimal(gi // 2) == Decimal("0.5") - assert _bytes_to_binary_gib_decimal(gi) == Decimal("1") + assert _bytes_to_qty_decimal(20550 * gi, "Gi") == Decimal("20550") + assert _bytes_to_qty_decimal(gi // 2, "Gi") == Decimal("0.5") + assert _bytes_to_qty_decimal(gi, "Gi") == Decimal("1") + + +def test_normalize_binary_unit_rejects_unknown() -> None: + with pytest.raises(ValueError, match="units must be one of"): + normalize_binary_unit("GB") def test_format_decimal_report_three_places() -> None: @@ -66,14 +72,14 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) - assert out["node_label_key"] == NODE_LABEL_KEY + out = total(field="capacity", node_label=NODE_LABEL) + assert out["node_label"] == NODE_LABEL bucket = out["by_label_value"][""] assert bucket["count"] == 1 - assert bucket["memory"] == {"value": "16", "unit": "GiB"} - assert bucket["ephemeral-storage"] == {"value": "100", "unit": "GiB"} + assert bucket["memory"] == {"value": "16", "unit": "Gi"} + assert bucket["ephemeral-storage"] == {"value": "100", "unit": "Gi"} w = bucket["weights"] - assert w["cpu"] == "1" + assert w["cpu"] == "1.0" assert w["memory"] == "0.5" assert w["ephemeral-storage"] == "0.08" assert "nvidia.com/gpu" not in w @@ -104,13 +110,13 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + out = total(field="capacity", node_label=NODE_LABEL) b = out["by_label_value"][""] assert b["count"] == 2 assert 
b["nvidia.com/gpu"] == [ {"kind": "NVIDIA-A100-SXM4-40GB", "value": "12", "unit": "count"}, ] - assert b["weights"]["nvidia.com/gpu"]["NVIDIA-A100-SXM4-40GB"] == "1" + assert b["weights"]["nvidia.com/gpu"]["NVIDIA-A100-SXM4-40GB"] == "1.0" def test_total_nvidia_gpu_mixed_kind(monkeypatch) -> None: @@ -138,7 +144,7 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + out = total(field="capacity", node_label=NODE_LABEL) b = out["by_label_value"][""] assert b["count"] == 2 assert b["nvidia.com/gpu"] == [ @@ -146,8 +152,8 @@ class R: {"kind": "NVIDIA-T4", "value": "2", "unit": "count"}, ] wg = b["weights"]["nvidia.com/gpu"] - assert wg["NVIDIA-A100"] == "2" - assert wg["NVIDIA-T4"] == "4" + assert wg["NVIDIA-A100"] == "2.0" + assert wg["NVIDIA-T4"] == "4.0" def test_total_amd_gpu(monkeypatch) -> None: @@ -170,7 +176,7 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + out = total(field="capacity", node_label=NODE_LABEL) b = out["by_label_value"][""] assert b["count"] == 1 assert b["amd.com/gpu"] == [ @@ -192,16 +198,16 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + out = total(field="capacity", node_label=NODE_LABEL) b = out["by_label_value"][""] assert b["count"] == 1 assert b["nvidia.com/gpu"] == [ {"kind": "", "value": "3", "unit": "count"}, ] - assert b["weights"]["nvidia.com/gpu"][""] == "1" + assert b["weights"]["nvidia.com/gpu"][""] == "1.0" -def test_total_groups_by_custom_node_label_key(monkeypatch) -> None: +def test_total_groups_by_custom_node_label(monkeypatch) -> None: nodes = [ _node("a", {"cpu": "2"}, {"pool": "east"}), _node("b", {"cpu": "2"}, {"pool": "east"}), @@ -219,15 +225,15 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity", 
node_label_key="pool") - assert out["node_label_key"] == "pool" + out = total(field="capacity", node_label="pool") + assert out["node_label"] == "pool" assert out["by_label_value"]["east"]["count"] == 2 assert out["by_label_value"]["west"]["count"] == 1 -def test_total_rejects_blank_node_label_key() -> None: +def test_total_rejects_blank_node_label() -> None: with pytest.raises(ValueError, match="non-empty"): - total(None, node_label_key=" ") + total(node_label=" ") def test_total_split_by_skaha_node_type(monkeypatch) -> None: @@ -258,7 +264,7 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + out = total(field="capacity", node_label=NODE_LABEL) by_t = out["by_label_value"] assert by_t["cpu-node"]["count"] == 1 assert by_t["gpu-node"]["count"] == 1 @@ -267,11 +273,11 @@ class R: {"kind": "NVIDIA-T4", "value": "2", "unit": "count"}, ] assert by_t["cpu-node"]["weights"]["memory"] == "0.5" - assert by_t["gpu-node"]["weights"]["nvidia.com/gpu"]["NVIDIA-T4"] == "4" + assert by_t["gpu-node"]["weights"]["nvidia.com/gpu"]["NVIDIA-T4"] == "4.0" def test_total_nvidia_gpu_from_labels_when_capacity_zero(monkeypatch) -> None: - """MIG-style nodes may advertise GPUs via labels while capacity nvidia.com/gpu is 0.""" + """MIG nodes may expose GPUs via labels when capacity nvidia.com/gpu is 0.""" nodes = [ _node( "g1", @@ -300,7 +306,7 @@ class R: lambda: type("X", (), {"list_node": fake_list_node})(), ) - out = total(None, field="capacity", node_label_key=NODE_LABEL_KEY) + out = total(field="capacity", node_label=NODE_LABEL) g = out["by_label_value"]["gpu-node"] assert g["count"] == 1 assert g["nvidia.com/gpu"] == [ @@ -309,7 +315,125 @@ class R: gw = g["weights"] assert gw["memory"] == "0.096" assert gw["ephemeral-storage"] == "0.192" - assert gw["nvidia.com/gpu"]["NVIDIA-H100-NVL-MIG-2g.24gb"] == "8" + assert gw["nvidia.com/gpu"]["NVIDIA-H100-NVL-MIG-2g.24gb"] == "8.0" + + +def 
test_total_memory_reported_in_mi_and_weights_use_same_scale(monkeypatch) -> None: + nodes = [ + _node("n1", {"cpu": "8", "memory": "8Mi"}), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(field="capacity", node_label=NODE_LABEL, units="Mi") + bucket = out["by_label_value"][""] + assert bucket["memory"] == {"value": "8", "unit": "Mi"} + assert bucket["weights"]["cpu"] == "1.0" + assert bucket["weights"]["memory"] == "1.0" + + +def test_total_weights_baseline_memory(monkeypatch) -> None: + nodes = [ + _node("n1", {"cpu": "8", "memory": "16Gi", "ephemeral-storage": "32Gi"}), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total( + field="capacity", + node_label=NODE_LABEL, + baseline="memory", + ) + w = out["by_label_value"][""]["weights"] + assert w["memory"] == "1.0" + assert w["cpu"] == "2.0" + assert w["ephemeral-storage"] == "0.5" + + +def test_total_weights_baseline_nvidia_single_kind(monkeypatch) -> None: + nodes = [ + _node( + "g1", + {"cpu": "4", "nvidia.com/gpu": "4"}, + {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"}, + ), + _node( + "g2", + {"cpu": "8", "nvidia.com/gpu": "8"}, + {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"}, + ), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(field="capacity", node_label=NODE_LABEL, baseline="nvidia.com/gpu") + w = out["by_label_value"][""]["weights"] + assert w["cpu"] == "1.0" + assert w["nvidia.com/gpu"]["NVIDIA-A100-SXM4-40GB"] == "1.0" + + +def test_total_weights_baseline_nvidia_mixed_kind(monkeypatch) -> None: + 
nodes = [ + _node( + "a", + {"cpu": "4", "nvidia.com/gpu": "2"}, + {"nvidia.com/gpu.product": "NVIDIA-T4"}, + ), + _node( + "b", + {"cpu": "4", "nvidia.com/gpu": "4"}, + {"nvidia.com/gpu.product": "NVIDIA-A100"}, + ), + ] + + def fake_list_node(*_a, **_k): + class R: + items = nodes + + return R() + + monkeypatch.setattr( + "kueuer.resources._load_kube", + lambda: type("X", (), {"list_node": fake_list_node})(), + ) + + out = total(field="capacity", node_label=NODE_LABEL, baseline="nvidia.com/gpu") + w = out["by_label_value"][""]["weights"] + assert w["cpu"] == "0.75" + assert w["nvidia.com/gpu"]["NVIDIA-A100"] == "1.5" + assert w["nvidia.com/gpu"]["NVIDIA-T4"] == "3.0" + + +def test_total_rejects_unknown_baseline() -> None: + with pytest.raises(ValueError, match="baseline must be one of"): + total(node_label=NODE_LABEL, baseline="amd.com/gpu") def test_list_resource_quotas_returns_serialized_payload(monkeypatch) -> None: @@ -318,7 +442,10 @@ def test_list_resource_quotas_returns_serialized_payload(monkeypatch) -> None: "kind": "ResourceQuotaList", "items": [ { - "metadata": {"name": "compute-quota", "namespace": "canfar-kueue-testing"}, + "metadata": { + "name": "compute-quota", + "namespace": "canfar-kueue-testing", + }, "spec": {"hard": {"requests.cpu": "8", "requests.memory": "32Gi"}}, } ], @@ -346,10 +473,17 @@ def sanitize_for_serialization(self, value): assert result == payload -def test_resources_cli_includes_node_label_key_option() -> None: +def test_resources_cli_includes_resource_options() -> None: result = runner.invoke(app, ["cluster", "resources", "--help"]) assert result.exit_code == 0 - assert "--node-label-key" in result.stdout + assert "--node-label" in result.stdout + assert "-n" in result.stdout + assert "--units" in result.stdout + assert "-u" in result.stdout + assert "--baseline" in result.stdout + assert "-b" in result.stdout + assert "--baseline-resource" not in result.stdout + assert "--pattern" not in result.stdout def 
test_resourcequota_cli_prints_response_and_objects(monkeypatch) -> None: @@ -358,12 +492,18 @@ def test_resourcequota_cli_prints_response_and_objects(monkeypatch) -> None: "kind": "ResourceQuotaList", "items": [ { - "metadata": {"name": "compute-quota", "namespace": "canfar-kueue-testing"}, + "metadata": { + "name": "compute-quota", + "namespace": "canfar-kueue-testing", + }, "status": {"hard": {"requests.cpu": "8"}}, } ], } - monkeypatch.setattr("kueuer.resources.list_resource_quotas", lambda namespace: payload) + monkeypatch.setattr( + "kueuer.resources.list_resource_quotas", + lambda namespace: payload, + ) result = runner.invoke( app,