You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
训练发生如下报错:
`
Traceback (most recent call last):
File "/root/miniconda3/envs/gyj_colossal/bin/torchrun", line 8, in <module>
sys.exit(main())
^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 237, in launch_agent
result = agent.run()
^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run
result = self._invoke_run(role)
^^^^^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 844, in _invoke_run
self._initialize_workers(self._worker_group)
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 681, in _initialize_workers
worker_ids = self._start_workers(worker_group)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 271, in _start_workers
self._pcontext = start_processes(
^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/__init__.py", line 207, in start_processes
redirs = to_map(redirects, nprocs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 162, in to_map
map[i] = val_or_map.get(i, Std.NONE)
^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get'
Error: failed to run torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=172.17.120.11 --master_port=3325 train.py --pretrained /mnt/nas/gyj/Chinese-LLaMA-Alpaca-2-main/scripts/training/output_dir3/merge_llama2_13b_step3 --dataset /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00000 /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00001 /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00002 --plugin zero2 --save_interval 400 --save_dir output_checkpoint_dir1step4-2023-10-11-20-29-16 --tensorboard_dir output_tensorboard_dir1step4-2023-10-11-20-29-16 --config_file output_config_dir1step4-2023-10-11-20-29-16.json --num_epochs 1 --micro_batch_size 8 --lr 1e-4 --mixed_precision bf16 --grad_clip 1.0 --weight_decay 0.01 --warmup_steps 100 --use_grad_checkpoint --use_flash_attn on 172.17.120.11, is localhost: False, exception: Encountered a bad command exit code!
🐛 Describe the bug
源码安装的colossal-0.3.3,按照https://github.com/hpcaitech/ColossalAI/blob/main/examples/language/llama2/README.md中安装了其他的包。
启动命令:
` #!/bin/bash
# NCCL IB environment variables
export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=enp95s0f1
export NCCL_IB_GID_INDEX=3
export NCCL_IB_TIMEOUT=23
export NCCL_IB_RETRY_CNT=7
export OMP_NUM_THREADS=8
export NCCL_DEBUG=INFO
PROJECT_NAME="step4"
PARENT_SAVE_DIR="output_checkpoint_dir1"
PARENT_TENSORBOARD_DIR="output_tensorboard_dir1"
PARENT_CONFIG_FILE="output_config_dir1"
PRETRAINED_MODEL_PATH="/mnt/nas/gyj/Chinese-LLaMA-Alpaca-2-main/scripts/training/output_dir3/merge_llama2_13b_step3"
declare -a dataset=(
"/mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00000"
"/mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00001"
"/mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00002"
)
TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
TENSORBOARD_DIR="${PARENT_TENSORBOARD_DIR}${FULL_PROJECT_NAME}"
CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
echo "使用colossalai命令行"
colossalai run --nproc_per_node 8 --hostfile gyj_hostfile --master_port 3325 train.py \
    --pretrained $PRETRAINED_MODEL_PATH \
    --dataset ${dataset[@]} \
    --plugin "zero2" \
    --save_interval 400 \
    --save_dir $SAVE_DIR \
    --tensorboard_dir $TENSORBOARD_DIR \
    --config_file $CONFIG_FILE \
    --num_epochs 1 \
    --micro_batch_size 8 \
    --lr 1e-4 \
    --mixed_precision "bf16" \
    --grad_clip 1.0 \
    --weight_decay 0.01 \
    --warmup_steps 100 \
    --use_grad_checkpoint \
    --use_flash_attn
`
训练发生如下报错:
`
Traceback (most recent call last):
File "/root/miniconda3/envs/gyj_colossal/bin/torchrun", line 8, in <module>
sys.exit(main())
^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 237, in launch_agent
result = agent.run()
^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run
result = self._invoke_run(role)
^^^^^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 844, in _invoke_run
self._initialize_workers(self._worker_group)
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 681, in _initialize_workers
worker_ids = self._start_workers(worker_group)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 271, in _start_workers
self._pcontext = start_processes(
^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/__init__.py", line 207, in start_processes
redirs = to_map(redirects, nprocs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/miniconda3/envs/gyj_colossal/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 162, in to_map
map[i] = val_or_map.get(i, Std.NONE)
^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get'
Error: failed to run torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=172.17.120.11 --master_port=3325 train.py --pretrained /mnt/nas/gyj/Chinese-LLaMA-Alpaca-2-main/scripts/training/output_dir3/merge_llama2_13b_step3 --dataset /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00000 /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00001 /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00002 --plugin zero2 --save_interval 400 --save_dir output_checkpoint_dir1step4-2023-10-11-20-29-16 --tensorboard_dir output_tensorboard_dir1step4-2023-10-11-20-29-16 --config_file output_config_dir1step4-2023-10-11-20-29-16.json --num_epochs 1 --micro_batch_size 8 --lr 1e-4 --mixed_precision bf16 --grad_clip 1.0 --weight_decay 0.01 --warmup_steps 100 --use_grad_checkpoint --use_flash_attn on 172.17.120.11, is localhost: False, exception: Encountered a bad command exit code!
Command: 'cd /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2 && export LC_PAPER="zh_CN.UTF-8" MANPATH="/opt/rh/devtoolset-8/root/usr/share/man:" LC_ADDRESS="zh_CN.UTF-8" XDG_SESSION_ID="32" LC_MONETARY="zh_CN.UTF-8" HOSTNAME="localhost.localdomain" SHELL="/bin/bash" TERM="xterm-256color" NCCL_SOCKET_IFNAME="enp95s0f1" HISTSIZE="1000" SSH_CLIENT="172.17.120.207 53840 22" PERL5LIB="/opt/rh/devtoolset-8/root//usr/lib64/perl5/vendor_perl:/opt/rh/devtoolset-8/root/usr/lib/perl5:/opt/rh/devtoolset-8/root//usr/share/perl5/vendor_perl" CONDA_SHLVL="2" CONDA_PROMPT_MODIFIER="(gyj_colossal) " LC_NUMERIC="zh_CN.UTF-8" SSH_TTY="/dev/pts/7" USER="root" PCP_DIR="/opt/rh/devtoolset-8/root" CUDA_HOME="/usr/local/cuda-11.8" NCCL_IB_TIMEOUT="23" NCCL_IB_GID_INDEX="3" LC_TELEPHONE="zh_CN.UTF-8" LS_COLORS="rs=0:di=38;5;27:ln=38;5;51:mh=44;38;5;15:pi=40;38;5;11:so=38;5;13:do=38;5;5:bd=48;5;232;38;5;11:cd=48;5;232;38;5;3:or=48;5;232;38;5;9:mi=05;48;5;232;38;5;15:su=48;5;196;38;5;15:sg=48;5;11;38;5;16:ca=48;5;196;38;5;226:tw=48;5;10;38;5;16:ow=48;5;10;38;5;21:st=48;5;21;38;5;15:ex=38;5;34:.tar=38;5;9:.tgz=38;5;9:.arc=38;5;9:.arj=38;5;9:.taz=38;5;9:.lha=38;5;9:.lz4=38;5;9:.lzh=38;5;9:.lzma=38;5;9:.tlz=38;5;9:.txz=38;5;9:.tzo=38;5;9:.t7z=38;5;9:.zip=38;5;9:.z=38;5;9:.Z=38;5;9:.dz=38;5;9:.gz=38;5;9:.lrz=38;5;9:.lz=38;5;9:.lzo=38;5;9:.xz=38;5;9:.bz2=38;5;9:.bz=38;5;9:.tbz=38;5;9:.tbz2=38;5;9:.tz=38;5;9:.deb=38;5;9:.rpm=38;5;9:.jar=38;5;9:.war=38;5;9:.ear=38;5;9:.sar=38;5;9:.rar=38;5;9:.alz=38;5;9:.ace=38;5;9:.zoo=38;5;9:.cpio=38;5;9:.7z=38;5;9:.rz=38;5;9:.cab=38;5;9:.jpg=38;5;13:.jpeg=38;5;13:.gif=38;5;13:.bmp=38;5;13:.pbm=38;5;13:.pgm=38;5;13:.ppm=38;5;13:.tga=38;5;13:.xbm=38;5;13:.xpm=38;5;13:.tif=38;5;13:.tiff=38;5;13:.png=38;5;13:.svg=38;5;13:.svgz=38;5;13:.mng=38;5;13:.pcx=38;5;13:.mov=38;5;13:.mpg=38;5;13:.mpeg=38;5;13:.m2v=38;5;13:.mkv=38;5;13:.webm=38;5;13:.ogm=38;5;13:.mp4=38;5;13:.m4v=38;5;13:.mp4v=38;5;13:.vob=38;5;13:.qt=38;5;13:.nuv=38;5;13:.wmv=38;5;13:.asf=38;5;13:.r
m=38;5;13:.rmvb=38;5;13:.flc=38;5;13:.avi=38;5;13:.fli=38;5;13:.flv=38;5;13:.gl=38;5;13:.dl=38;5;13:.xcf=38;5;13:.xwd=38;5;13:.yuv=38;5;13:.cgm=38;5;13:.emf=38;5;13:.axv=38;5;13:.anx=38;5;13:.ogv=38;5;13:.ogx=38;5;13:.aac=38;5;45:.au=38;5;45:.flac=38;5;45:.mid=38;5;45:.midi=38;5;45:.mka=38;5;45:.mp3=38;5;45:.mpc=38;5;45:.ogg=38;5;45:.ra=38;5;45:.wav=38;5;45:.axa=38;5;45:.oga=38;5;45:.spx=38;5;45:*.xspf=38;5;45:" LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:/usr/local/cuda-11.8/lib64" CONDA_EXE="/root/miniconda3/bin/conda" CONDA_PREFIX_1="/root/miniconda3" PATH="/usr/local/mpi/bin:/root/miniconda3/envs/gyj_colossal/bin:/root/miniconda3/condabin:/usr/local/cuda-11.8/bin:/opt/rh/devtoolset-8/root/usr/bin:/usr/local/pdsh/bin:/usr/local/pdsh/bin:/usr/local/cuda-11.8/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin" MAIL="/var/spool/mail/root" LC_IDENTIFICATION="zh_CN.UTF-8" CONDA_PREFIX="/root/miniconda3/envs/gyj_colossal" PWD="/mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2" LANG="zh_CN.UTF-8" LC_MEASUREMENT="zh_CN.UTF-8" HISTCONTROL="ignoredups" HOME="/root" SHLVL="2" NCCL_IB_DISABLE="0" NCCL_IB_HCA="mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1" LOGNAME="root" PYTHONPATH="/opt/rh/devtoolset-8/root/usr/lib64/python2.7/site-packages:/opt/rh/devtoolset-8/root/usr/lib/python2.7/site-packages" CONDA_PYTHON_EXE="/root/miniconda3/bin/python" NCCL_IB_RETRY_CNT="7" SSH_CONNECTION="172.17.120.207 53840 172.17.120.11 22" OMP_NUM_THREADS="8" LESSOPEN="||/usr/bin/lesspipe.sh %s" PKG_CONFIG_PATH="/opt/rh/devtoolset-8/root/usr/lib64/pkgconfig" CONDA_DEFAULT_ENV="gyj_colossal" INFOPATH="/opt/rh/devtoolset-8/root/usr/share/info" XDG_RUNTIME_DIR="/run/user/0" NCCL_DEBUG="INFO" LC_TIME="zh_CN.UTF-8" LC_NAME="zh_CN.UTF-8" _="/root/miniconda3/envs/gyj_colossal/bin/colossalai" && torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=172.17.120.11 --master_port=3325 train.py --pretrained 
/mnt/nas/gyj/Chinese-LLaMA-Alpaca-2-main/scripts/training/output_dir3/merge_llama2_13b_step3 --dataset /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00000 /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00001 /mnt/nas/gyj/ColossalAI-main/applications/Colossal-LLaMA-2/data/out_arrows/part-00002 --plugin zero2 --save_interval 400 --save_dir output_checkpoint_dir1step4-2023-10-11-20-29-16 --tensorboard_dir output_tensorboard_dir1step4-2023-10-11-20-29-16 --config_file output_config_dir1step4-2023-10-11-20-29-16.json --num_epochs 1 --micro_batch_size 8 --lr 1e-4 --mixed_precision bf16 --grad_clip 1.0 --weight_decay 0.01 --warmup_steps 100 --use_grad_checkpoint --use_flash_attn
'
Exit code: 1
Stdout: already printed
Stderr: already printed`
Environment
cuda:11.8
cudnn:8.5.0.96
pytorch:1.13.1
python:3.11
The text was updated successfully, but these errors were encountered: