Skip to content

Error: Train.sh #16

@charriry

Description

@charriry
Error executing job with overrides: []
[rank1]: Traceback (most recent call last):
[rank1]:   File "/kargobot-vepfs-zone-c/common_rw/yanchaowei1_v/Flow-Planner/flow_planner/run_script/../trainer.py", line 177, in <module>
[rank1]:     trainer()
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/main.py", line 94, in decorated_main
[rank1]:     _run_hydra(
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
[rank1]:     _run_app(
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 457, in _run_app
[rank1]:     run_and_report(
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
[rank1]:     raise ex
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
[rank1]:     return func()
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
[rank1]:     lambda: hydra.run(
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run
[rank1]:     _ = ret.return_value
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value
[rank1]:     raise self._return_value
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job
[rank1]:     ret.return_value = task_function(task_cfg)
[rank1]:   File "/kargobot-vepfs-zone-c/common_rw/yanchaowei1_v/Flow-Planner/flow_planner/run_script/../trainer.py", line 122, in trainer
[rank1]:     for k, data in enumerate(trainloader):
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
[rank1]:     data = self._next_data()
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
[rank1]:     return self._process_data(data)
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
[rank1]:     data.reraise()
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/_utils.py", line 705, in reraise
[rank1]:     raise exception
[rank1]: KeyError: Caught KeyError in DataLoader worker process 0.
[rank1]: Original Traceback (most recent call last):
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
[rank1]:     data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
[rank1]:     data = [self.dataset[idx] for idx in possibly_batched_index]
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
[rank1]:     data = [self.dataset[idx] for idx in possibly_batched_index]
[rank1]:   File "/kargobot-vepfs-zone-c/common_rw/yanchaowei1_v/Flow-Planner/flow_planner/data/dataset/nuplan.py", line 245, in __getitem__
[rank1]:     ego_agent_past = torch.from_numpy(data['ego_agent_past']) #没有
[rank1]:   File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/numpy/lib/npyio.py", line 251, in __getitem__
[rank1]:     raise KeyError("%s is not a file in the archive" % key)
[rank1]: KeyError: 'ego_agent_past is not a file in the archive'

E0313 08:21:21.853535 140390896924480 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 7100) of binary: /root/miniconda3/envs/flow_planner/bin/python
Traceback (most recent call last):
  File "/root/miniconda3/envs/flow_planner/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/root/miniconda3/envs/flow_planner/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/run.py", line 883, in <module>
    main()
  File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
    return f(*args, **kwargs)
  File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/run.py", line 879, in main
    run(args)
  File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/run.py", line 870, in run
    elastic_launch(
  File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/miniconda3/envs/flow_planner/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
../trainer.py FAILED

Has anyone else encountered a similar problem? I did not make any modifications to the data processing script.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions