Is it possible to train RetinaNet with a SpineNet-190 backbone on a single GPU using the classic mmdetection command python tools/train.py ${CONFIG_FILE}?
I tried it and got this error:
Traceback (most recent call last):
  File "tools/train.py", line 142, in <module>
    main()
  File "tools/train.py", line 138, in main
    meta=meta)
  File "/content/mmdetection_b/mmdet/apis/train.py", line 111, in train_detector
    meta=meta)
  File "/content/mmdetection_b/mmdet/apis/train.py", line 305, in _non_dist_train
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/runner/runner.py", line 384, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/runner/runner.py", line 283, in train
    self.model, data_batch, train_mode=True, **kwargs)
  File "/content/mmdetection_b/mmdet/apis/train.py", line 75, in batch_processor
    losses = model(**data)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/data_parallel.py", line 150, in forward
    return self.module(*inputs[0], **kwargs[0])
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection_b/mmdet/core/fp16/decorators.py", line 49, in new_func
    return old_func(*args, **kwargs)
  File "/content/mmdetection_b/mmdet/models/detectors/base.py", line 137, in forward
    return self.forward_train(img, img_meta, **kwargs)
  File "/content/mmdetection_b/mmdet/models/detectors/single_stage.py", line 67, in forward_train
    x = self.extract_feat(img)
  File "/content/mmdetection_b/mmdet/models/detectors/single_stage.py", line 47, in extract_feat
    x = self.backbone(img)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection_b/mmdet/models/backbones/spinenet.py", line 251, in forward
    feat = self.maxpool(self.conv1(input))
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/cnn/bricks/conv_module.py", line 181, in forward
    x = self.norm(x)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/batchnorm.py", line 458, in forward
    world_size = torch.distributed.get_world_size(process_group)
  File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 586, in get_world_size
    return _get_group_size(group)
  File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 202, in _get_group_size
    _check_default_pg()
  File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 193, in _check_default_pg
    "Default process group is not initialized"
AssertionError: Default process group is not initialized
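Reading the traceback, the failure happens when torch.nn.modules.batchnorm calls torch.distributed.get_world_size, so the backbone's norm layers appear to be SyncBatchNorm, which only works after a distributed process group has been initialized; the non-distributed tools/train.py path never calls torch.distributed.init_process_group. One workaround I am considering is switching the config's norm_cfg from SyncBN to plain BN, roughly as in the sketch below (the field names are my assumption based on the usual mmdetection config layout, so the actual SpineNet-190 config may differ):

# Hypothetical config fragment, assuming the standard mmdetection
# norm_cfg convention; adjust names to match the real SpineNet-190 config.
norm_cfg = dict(type='BN', requires_grad=True)  # was: dict(type='SyncBN', requires_grad=True)

model = dict(
    type='RetinaNet',
    backbone=dict(
        type='SpineNet',
        arch='190',          # assumed key for the SpineNet-190 variant
        norm_cfg=norm_cfg))  # plain BN needs no process group

Alternatively, launching through the distributed wrapper even with a single GPU (./tools/dist_train.sh ${CONFIG_FILE} 1) should initialize the process group and keep SyncBN usable. Is either of these the recommended approach, or does the single-GPU command support SyncBN some other way?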
Thanks for your help.