You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
{{ message }}
This repository has been archived by the owner on Nov 1, 2024. It is now read-only.
I got the following traceback:
THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1595629401015/work/torch/csrc/cuda/Module.cpp line=59 error=10 : invalid device ordinal
Process Process-2:
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
Traceback (most recent call last):
File "tools/test_net.py", line 24, in <module>
Process Process-3:
main()
File "tools/test_net.py", line 20, in main
dist.multi_proc_run(num_proc=cfg.NUM_GPUS, fun=trainer.test_model)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 164, in multi_proc_run
p.join()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 120, in signal_handler
raise ChildException(self.error_queue.get())
pycls.core.distributed.ChildException: Traceback (most recent call last):
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 127, in run
init_process_group(proc_rank, world_size, port)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 38, in init_process_group
torch.cuda.set_device(proc_rank)
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/cuda/__init__.py", line 281, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at /opt/conda/conda-bld/pytorch_1595629401015/work/torch/csrc/cuda/Module.cpp:59
Traceback (most recent call last):
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 127, in run
init_process_group(proc_rank, world_size, port)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 38, in init_process_group
torch.cuda.set_device(proc_rank)
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/cuda/__init__.py", line 281, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at /opt/conda/conda-bld/pytorch_1595629401015/work/torch/csrc/cuda/Module.cpp:59
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 127, in run
init_process_group(proc_rank, world_size, port)
KeyboardInterrupt
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Process Process-6:
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Process Process-5:
Process Process-4:
Traceback (most recent call last):
Process Process-7:
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Process Process-8:
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
How should I solve it?
The text was updated successfully, but these errors were encountered:
While running:
python tools/test_net.py \
--cfg configs/dds_baselines/regnetx/RegNetX-400MF_dds_8gpu.yaml \
TEST.WEIGHTS https://dl.fbaipublicfiles.com/pycls/dds_baselines/160905967/RegNetX-400MF_dds_8gpu.pyth \
OUT_DIR /tmp
I got the following traceback:
THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1595629401015/work/torch/csrc/cuda/Module.cpp line=59 error=10 : invalid device ordinal
Process Process-2:
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
Traceback (most recent call last):
File "tools/test_net.py", line 24, in <module>
Process Process-3:
main()
File "tools/test_net.py", line 20, in main
dist.multi_proc_run(num_proc=cfg.NUM_GPUS, fun=trainer.test_model)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 164, in multi_proc_run
p.join()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/popen_fork.py", line 47, in wait
return self.poll(os.WNOHANG if timeout == 0.0 else 0)
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll
pid, sts = os.waitpid(self.pid, flag)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 120, in signal_handler
raise ChildException(self.error_queue.get())
pycls.core.distributed.ChildException: Traceback (most recent call last):
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 127, in run
init_process_group(proc_rank, world_size, port)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 38, in init_process_group
torch.cuda.set_device(proc_rank)
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/cuda/__init__.py", line 281, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at /opt/conda/conda-bld/pytorch_1595629401015/work/torch/csrc/cuda/Module.cpp:59
Traceback (most recent call last):
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 127, in run
init_process_group(proc_rank, world_size, port)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 38, in init_process_group
torch.cuda.set_device(proc_rank)
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/cuda/__init__.py", line 281, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: cuda runtime error (10) : invalid device ordinal at /opt/conda/conda-bld/pytorch_1595629401015/work/torch/csrc/cuda/Module.cpp:59
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 127, in run
init_process_group(proc_rank, world_size, port)
KeyboardInterrupt
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Process Process-6:
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Process Process-5:
Process Process-4:
Traceback (most recent call last):
Process Process-7:
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
Process Process-8:
Traceback (most recent call last):
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 138, in run
destroy_process_group()
File "/home/nskgpu/pycls/pycls/core/distributed.py", line 50, in destroy_process_group
torch.distributed.destroy_process_group()
File "/home/nskgpu/anaconda3/envs/PyclsEnv1/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 555, in destroy_process_group
raise RuntimeError("Invalid process group specified")
RuntimeError: Invalid process group specified
How should I solve it?
The text was updated successfully, but these errors were encountered: