Getting an error when training:
{'data_dir': '/root/private/dataset/CREMA-D/processed', 'audio_emb_dir': '/root/private/dataset/CREMA-D/processed/audio_emb', 'identity_frame': 'random', 'n_test_actors': 8, 'img_resize': 128, 'n_motion_frames': 2, 'n_audio_motion_embs': 2, 'grayscale_motion': True, 'motion_blur': False, 'n_timesteps': 1000, 'model_channels': 256, 'out_channels': 6, 'num_res_blocks': 2, 'attention_resolutions': [32], 'dropout': 0.1, 'channel_mult': [1, 2, 3], 'num_heads': 4, 'num_head_channels': 64, 'resblock_updown': True, 'id_condition_type': 'frame', 'audio_condition_type': 'double_pre_gn', 'checkpoint': None, 'precision': 32, 'n_nodes': 1, 'n_workers': 0, 'bsz': 2, 'n_epochs': 5000, 'lr': 5e-05, 'swa_lr': None, 'swa_epoch_start': 800, 'vlb_weight': 0.0001, 'lip_weight': 0.2, 'val_every_n_epochs': 5000, 'n_frames_per_sample': 4, 'log_dir': '/root/private/code/Audio2Video/Diffusion/diffused-heads-train/log/train/crema', 'debug': False}
Global seed set to 2137
"attention_resolutions": [32]
"audio_condition_type": double_pre_gn
"audio_rate": 16000
"bsz": 2
"channel_mult": [1, 2, 3]
"dropout": 0.1
"grayscale_motion": True
"id_condition_type": frame
"image_size": 128
"in_channels": 3
"lip_weight": 0.2
"log_dir": /root/private/code/Audio2Video/Diffusion/diffused-heads-train/log/train/crema
"lr": 5e-05
"model_channels": 256
"motion_transforms": Compose(
Grayscale(num_output_channels=1)
)
"n_audio_motion_embs": 2
"n_epochs": 5000
"n_motion_frames": 2
"n_timesteps": 1000
"num_head_channels": 64
"num_heads": 4
"num_res_blocks": 2
"out_channels": 6
"precision": 32
"resblock_updown": True
"video_rate": 25
"vlb_weight": 0.0001
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Global seed set to 2137
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1
[2025-03-22 21:50:55,462][torch.distributed.distributed_c10d][INFO] - Added key: store_based_barrier_key:1 to store for rank: 0
[2025-03-22 21:50:55,462][torch.distributed.distributed_c10d][INFO] - Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 1 processes
----------------------------------------------------------------------------------------------------
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Scheduler 277
| Name | Type | Params
----------------------------------------
0 | unet | UNet | 215 M
1 | diffusion | Diffusion | 215 M
----------------------------------------
215 M Trainable params
0 Non-trainable params
215 M Total params
862.958 Total estimated model params size (MB)
/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:229: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
category=PossibleUserWarning,
Epoch 0: 0%| | 0/3308 [00:00<?, ?it/s]
Error executing job with overrides: []
Traceback (most recent call last):
File "/root/private/code/Audio2Video/temp/diffused-heads-train/train.py", line 115, in main
trainer.fit(model, data_module)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 609, in fit
self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 88, in launch
return function(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1112, in _run
results = self._run_stage()
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1191, in _run_stage
self._run_train()
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1214, in _run_train
self.fit_loop.run()
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 213, in advance
batch_output = self.batch_loop.run(kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(optimizers, kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 202, in advance
result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 249, in _run_optimization
self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 379, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1356, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/core/module.py", line 1754, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 169, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/strategies/ddp.py", line 280, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 235, in optimizer_step
optimizer, model=model, optimizer_idx=opt_idx, closure=closure, **kwargs
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 119, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/torch/optim/optimizer.py", line 140, in wrapper
out = func(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/torch/optim/optimizer.py", line 23, in _use_grad
ret = func(self, *args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/torch/optim/adam.py", line 183, in step
loss = closure()
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 105, in _wrap_closure
closure_result = closure()
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 149, in __call__
self._result = self.closure(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 135, in closure
step_output = self._step_fn()
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 419, in _training_step
training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1494, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/strategies/ddp.py", line 351, in training_step
return self.model(*args, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0])
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/pytorch_lightning/overrides/base.py", line 98, in forward
output = self._forward_module.training_step(*inputs, **kwargs)
File "/root/private/code/Audio2Video/temp/diffused-heads-train/lightning_modules/model.py", line 56, in training_step
losses = self.diffusion(x, x_cond, motion_frames=motion_frames, audio_emb=audio_emb, landmarks=landmarks)
File "/root/miniconda3/envs/talking-faces/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/private/code/Audio2Video/temp/diffused-heads-train/models/diffusion.py", line 45, in forward
losses['vlb'] = self.vlb_loss(x0, xt, timesteps, nn_out_frozen).mean()
File "/root/private/code/Audio2Video/temp/diffused-heads-train/models/diffusion.py", line 128, in vlb_loss
p_mean, p_logvar = self.get_p_params(xt, timesteps, nn_out)
File "/root/private/code/Audio2Video/temp/diffused-heads-train/models/diffusion.py", line 105, in get_p_params
p_logvar = nu * self.broadcast(torch.log(self.beta[timesteps])) + (1 - nu) * self.broadcast(self.log_beta_tilde_clipped[timesteps])
RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
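For context, the mismatch is easy to reproduce outside the repo (a minimal sketch; the tensor names are illustrative, not from the codebase): indexing a CPU tensor with CUDA indices raises exactly this error on recent PyTorch.

import torch

# CPU schedule tensor, as created by a module that never moved it to the GPU
beta = torch.linspace(1e-4, 2e-2, 1000)

# CUDA timestep indices, as sampled during training
timesteps = torch.randint(0, 1000, (4,), device='cuda')

# Raises: RuntimeError: indices should be either on cpu or on the same
# device as the indexed tensor (cpu)
# beta[timesteps]

# Works: move the schedule to the indices' device before indexing
values = beta.to(timesteps.device)[timesteps]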
There is a problem with the training code. I found that

@property
def device(self):
    return next(self.nn_backbone.parameters()).device

returns cpu, but x0 is on cuda. Can anyone fix this problem? Thank you!
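One way to fix it (a sketch, under the assumption that beta and log_beta_tilde_clipped are assigned as plain tensor attributes in Diffusion.__init__, which the traceback suggests): register them as buffers so they move to the GPU together with the rest of the module instead of staying on the CPU.

import torch
import torch.nn as nn

class Diffusion(nn.Module):
    def __init__(self, beta: torch.Tensor, log_beta_tilde_clipped: torch.Tensor):
        super().__init__()
        # Buffers follow the module through .to(device) / .cuda(), so
        # self.beta[timesteps] indexes a CUDA tensor with CUDA indices
        # instead of failing on a CPU tensor. (Constructor signature is
        # hypothetical; adapt to however the repo builds its schedule.)
        self.register_buffer('beta', beta)
        self.register_buffer('log_beta_tilde_clipped', log_beta_tilde_clipped)

A lighter-touch workaround is to patch get_p_params to reconcile devices at the lookup, e.g. self.beta.to(timesteps.device)[timesteps], but registering buffers keeps every schedule lookup consistent without per-call copies.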