Error: Train.sh #16
Description
Error executing job with overrides: []
[rank1]: Traceback (most recent call last):
[rank1]: File "/kargobot-vepfs-zone-c/common_rw/yanchaowei1_v/Flow-Planner/flow_planner/run_script/../trainer.py", line 177, in <module>
[rank1]: trainer()
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/main.py", line 94, in decorated_main
[rank1]: _run_hydra(
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
[rank1]: _run_app(
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 457, in _run_app
[rank1]: run_and_report(
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
[rank1]: raise ex
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
[rank1]: return func()
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
[rank1]: lambda: hydra.run(
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run
[rank1]: _ = ret.return_value
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value
[rank1]: raise self._return_value
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job
[rank1]: ret.return_value = task_function(task_cfg)
[rank1]: File "/kargobot-vepfs-zone-c/common_rw/yanchaowei1_v/Flow-Planner/flow_planner/run_script/../trainer.py", line 122, in trainer
[rank1]: for k, data in enumerate(trainloader):
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
[rank1]: data = self._next_data()
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
[rank1]: return self._process_data(data)
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
[rank1]: data.reraise()
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/_utils.py", line 705, in reraise
[rank1]: raise exception
[rank1]: KeyError: Caught KeyError in DataLoader worker process 0.
[rank1]: Original Traceback (most recent call last):
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
[rank1]: data = fetcher.fetch(index) # type: ignore[possibly-undefined]
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
[rank1]: data = [self.dataset[idx] for idx in possibly_batched_index]
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
[rank1]: data = [self.dataset[idx] for idx in possibly_batched_index]
[rank1]: File "/kargobot-vepfs-zone-c/common_rw/yanchaowei1_v/Flow-Planner/flow_planner/data/dataset/nuplan.py", line 245, in __getitem__
[rank1]: ego_agent_past = torch.from_numpy(data['ego_agent_past']) # missing
[rank1]: File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/numpy/lib/npyio.py", line 251, in __getitem__
[rank1]: raise KeyError("%s is not a file in the archive" % key)
[rank1]: KeyError: 'ego_agent_past is not a file in the archive'
E0313 08:21:21.853535 140390896924480 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 7100) of binary: /root/miniconda3/envs/flow_planner/bin/python
Traceback (most recent call last):
File "/root/miniconda3/envs/flow_planner/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/miniconda3/envs/flow_planner/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in <module>
main()
File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/run.py", line 879, in main
run(args)
File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/run.py", line 870, in run
elastic_launch(
File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
../trainer.py FAILED
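
The root cause appears to be the worker-side KeyError: the cached .npz sample that nuplan.py loads in __getitem__ contains no 'ego_agent_past' array. A minimal sketch for checking one cached sample directly (the path below is a placeholder; point it at any .npz file from your feature cache):

    import numpy as np

    # Placeholder path: point this at one cached sample the dataset reads.
    sample_path = "/path/to/cache/sample.npz"

    # np.load returns an NpzFile for .npz archives; .files lists the
    # names of the arrays stored in the archive.
    with np.load(sample_path, allow_pickle=True) as data:
        print(data.files)
        print("ego_agent_past" in data.files)

If 'ego_agent_past' is missing from every sample, the cache was most likely produced by a different preprocessing run and needs to be regenerated.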
Has anyone else encountered a similar problem? I did not make any modifications to the data processing script.
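
If only some samples are affected, a guard in the dataset's __getitem__ would at least name the broken cache file. A hypothetical helper sketch (load_required_array, npz_path, and key are my names, not the repo's code):

    import numpy as np
    import torch

    def load_required_array(npz_path: str, key: str) -> torch.Tensor:
        # Load one array from a cached .npz sample; fail with the file path
        # when the array is absent, instead of the opaque archive KeyError.
        with np.load(npz_path, allow_pickle=True) as data:
            if key not in data.files:
                raise KeyError(f"'{key}' missing from cached sample {npz_path}")
            return torch.from_numpy(data[key])

    # e.g. in __getitem__:
    # ego_agent_past = load_required_array(path, "ego_agent_past")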