Slurm accepts the submission and tries to execute the job, which fails with:
---------------------------------------------------------------------------
GatewayClusterError Traceback (most recent call last)
Cell In[176], line 1
----> 1 cluster = gateway.new_cluster(options)
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:641, in Gateway.new_cluster(self, cluster_options, shutdown_on_close, **kwargs)
618 def new_cluster(self, cluster_options=None, shutdown_on_close=True, **kwargs):
619 """Submit a new cluster to the gateway, and wait for it to be started.
620
621 Same as calling ``submit`` and ``connect`` in one go.
(...)
639 cluster : GatewayCluster
640 """
--> 641 return GatewayCluster(
642 address=self.address,
643 proxy_address=self.proxy_address,
644 public_address=self._public_address,
645 auth=self.auth,
646 asynchronous=self.asynchronous,
647 loop=self.loop,
648 cluster_options=cluster_options,
649 shutdown_on_close=shutdown_on_close,
650 **kwargs,
651 )
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:816, in GatewayCluster.__init__(self, address, proxy_address, public_address, auth, cluster_options, shutdown_on_close, asynchronous, loop, **kwargs)
804 def __init__(
805 self,
806 address=None,
(...)
814 **kwargs,
815 ):
--> 816 self._init_internal(
817 address=address,
818 proxy_address=proxy_address,
819 public_address=public_address,
820 auth=auth,
821 cluster_options=cluster_options,
822 cluster_kwargs=kwargs,
823 shutdown_on_close=shutdown_on_close,
824 asynchronous=asynchronous,
825 loop=loop,
826 )
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:921, in GatewayCluster._init_internal(self, address, proxy_address, public_address, auth, cluster_options, cluster_kwargs, shutdown_on_close, asynchronous, loop, name)
919 self.status = "starting"
920 if not self.asynchronous:
--> 921 self.gateway.sync(self._start_internal)
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:344, in Gateway.sync(self, func, *args, **kwargs)
340 future = asyncio.run_coroutine_threadsafe(
341 func(*args, **kwargs), self.loop.asyncio_loop
342 )
343 try:
--> 344 return future.result()
345 except BaseException:
346 future.cancel()
File /opt/conda/lib/python3.10/concurrent/futures/_base.py:458, in Future.result(self, timeout)
456 raise CancelledError()
457 elif self._state == FINISHED:
--> 458 return self.__get_result()
459 else:
460 raise TimeoutError()
File /opt/conda/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self)
401 if self._exception:
402 try:
--> 403 raise self._exception
404 finally:
405 # Break a reference cycle with the exception in self._exception
406 self = None
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:935, in GatewayCluster._start_internal(self)
933 self._start_task = asyncio.ensure_future(self._start_async())
934 try:
--> 935 await self._start_task
936 except BaseException:
937 # On exception, cleanup
938 await self._stop_internal()
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:953, in GatewayCluster._start_async(self)
951 # Connect to cluster
952 try:
--> 953 report = await self.gateway._wait_for_start(self.name)
954 except GatewayClusterError:
955 raise
File /opt/conda/lib/python3.10/site-packages/dask_gateway/client.py:581, in Gateway._wait_for_start(self, cluster_name)
579 return report
580 elif report.status is ClusterStatus.FAILED:
--> 581 raise GatewayClusterError(
582 "Cluster %r failed to start, see logs for "
583 "more information" % cluster_name
584 )
585 elif report.status is ClusterStatus.STOPPED:
586 raise GatewayClusterError(
587 "Cluster %r is already stopped" % cluster_name
588 )
GatewayClusterError: Cluster '904e0c326cd5484cacf9d343bd1c5226' failed to start, see logs for more information
/opt/slurm/slurmd/job00068/slurm_script: 3: source: not found
/opt/slurm/slurmd/job00068/slurm_script: 5: dask-scheduler: not found
Describe the issue:
I try to spawn a cluster via dask-gateway with the following config:
Slurm accepts the submission and tries to execute the job, which fails with:
The logs from that submission contain:
And if I check how the script is put together by the gateway with
`/bin/sh` does not know the command `source`, and changing it to `.` results in `Unrecognized Shell`, which I think means conda is not happy with `sh`. So the example on https://gateway.dask.org/install-jobqueue.html is not working for me.
Minimal Complete Verifiable Example:
Anything else we need to know?:
The suggested change is to expose a config option for the Slurm script shebang, as is already done for dask_jobqueue.SLURMCluster.
Affected lines are:
dask-gateway/dask-gateway-server/dask_gateway_server/backends/jobqueue/slurm.py
Line 72 in bf04d65
dask-gateway/dask-gateway-server/dask_gateway_server/backends/jobqueue/slurm.py
Line 84 in bf04d65
Environment: