Assistant:<unk>
Traceback (most recent call last):
  File "/tmp/CCT/src/train_ppo.py", line 82, in <module>
    main()
  File "/tmp/CCT/src/train_ppo.py", line 55, in main
    ppo_trainer = PPOPeftTrainer(
  File "/tmp/CCT/src/utils/ppo.py", line 72, in __init__
    PPOTrainer.__init__(self, **kwargs)
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/trl/trainer/ppo_trainer.py", line 290, in __init__
    ) = self.accelerator.prepare(
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/accelerate/accelerator.py", line 1182, in prepare
    result = tuple(
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/accelerate/accelerator.py", line 1183, in <genexpr>
    self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/accelerate/accelerator.py", line 1022, in _prepare_one
    return self.prepare_model(obj, device_placement=device_placement)
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/accelerate/accelerator.py", line 1275, in prepare_model
    model = torch.nn.parallel.DistributedDataParallel(
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 676, in __init__
    _sync_module_states(
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/torch/distributed/utils.py", line 142, in _sync_module_states
    _sync_params_and_buffers(
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/torch/distributed/utils.py", line 160, in _sync_params_and_buffers
    dist._broadcast_coalesced(
RuntimeError: Tensors must be CUDA and dense
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 897529) of binary: /root/miniconda3/envs/llama_etuning/bin/python
Traceback (most recent call last):
  File "/root/miniconda3/envs/llama_etuning/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
    args.func(args)
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/accelerate/commands/launch.py", line 932, in launch_command
    multi_gpu_launcher(args)
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/accelerate/commands/launch.py", line 627, in multi_gpu_launcher
    distrib_run.run(args)
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/miniconda3/envs/llama_etuning/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
src/train_ppo.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2023-06-09_10:00:51
host : mpudgx202302-DGX-Station-A100-920-23487-2531-000
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 897530)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2023-06-09_10:00:51
host : mpudgx202302-DGX-Station-A100-920-23487-2531-000
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 897531)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2023-06-09_10:00:51
host : mpudgx202302-DGX-Station-A100-920-23487-2531-000
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 897532)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-06-09_10:00:51
host : mpudgx202302-DGX-Station-A100-920-23487-2531-000
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 897529)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
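For context: the RuntimeError is raised inside dist._broadcast_coalesced while DistributedDataParallel syncs module states, which happens when some parameters or buffers are not dense CUDA tensors at the moment accelerator.prepare wraps the model in DDP (for example, parts of the model still on CPU or placed via an offloading device_map). Below is a minimal pre-flight check one could run before accelerator.prepare to see which tensor triggers this; the model path, variable names, and loading code are placeholders for illustration, not the actual code in src/utils/ppo.py.

import os
import torch
from transformers import AutoModelForCausalLM

# Under `accelerate launch`, each rank should hold a full copy of the model on its
# own GPU before DDP wraps it; device_map-based offloading does not mix with DDP.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
model = AutoModelForCausalLM.from_pretrained("path/to/base_model")  # placeholder path
model = model.to(torch.device("cuda", local_rank))

# Every parameter and buffer must be a dense CUDA tensor, otherwise
# _sync_module_states -> dist._broadcast_coalesced fails with
# "Tensors must be CUDA and dense".
for name, tensor in list(model.named_parameters()) + list(model.named_buffers()):
    if not tensor.is_cuda or tensor.is_sparse:
        raise RuntimeError(f"{name} is on {tensor.device} (sparse={tensor.is_sparse})")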