Add precision option in PP inference examples (#11440)
plusbang authored Jun 27, 2024
1 parent e9e8f9b commit 508c364
Showing 13 changed files with 31 additions and 27 deletions.
python/llm/example/GPU/Pipeline-Parallel-Inference/README.md (2 additions, 0 deletions)

@@ -52,6 +52,8 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.

For optimal performance, it is recommended to set several environment variables. We provide example usages as follows:

+> Note: INT4 optimization is applied to the model by default. You could specify other low-bit optimizations (such as 'fp8' and 'fp6') through `--low-bit`.
</details>

<details>
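With the note added above, a precision other than the INT4 default can be selected at launch time. As a minimal sketch (assuming the Llama-2 run script shown later in this commit; the model path and GPU count are placeholders, and 'fp8' is one of the values the note mentions):

# Hypothetical launch selecting FP8 instead of the default sym_int4
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2 --low-bit 'fp8'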
python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py (4 additions, 2 deletions)

@@ -34,24 +34,26 @@
                    help='Prompt to infer')
parser.add_argument('--n-predict', type=int, default=32,
                    help='Max tokens to predict')
+parser.add_argument('--low-bit', type=str, default='sym_int4', help='The quantization type the model will convert to.')
parser.add_argument('--gpu-num', type=int, default=2, help='GPU number to use')

args = parser.parse_args()
model_path = args.repo_id_or_model_path
+low_bit = args.low_bit

# Load model in 4 bit,
# which converts the relevant layers in the model into INT4 format
try:
    model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                load_in_4bit=True,
+                                                load_in_low_bit=low_bit,
                                                 optimize_model=True,
                                                 trust_remote_code=True,
                                                 use_cache=True,
                                                 torch_dtype=torch.float16,
                                                 pipeline_parallel_stages=args.gpu_num)
except:
    model = AutoModel.from_pretrained(model_path,
-                                     load_in_4bit=True,
+                                     load_in_low_bit=low_bit,
                                      optimize_model=True,
                                      trust_remote_code=True,
                                      use_cache=True,
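Since the new `--low-bit` argument defaults to 'sym_int4', omitting the flag preserves the previous INT4 behavior, so the explicit `--low-bit 'sym_int4'` added to the run scripts below does not change what they do. A minimal sketch (the model path is a placeholder):

# These two invocations load the model with the same sym_int4 precision
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node 2 \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num 2 --low-bit 'sym_int4'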
Baichuan2 run script:

@@ -29,8 +29,8 @@ NUM_GPUS=2 # number of used GPU

# To run Baichuan2-7B-Chat
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# # To run Baichuan2-13B-Chat
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
chatglm3 run script:

@@ -28,4 +28,4 @@ export TORCH_LLM_ALLREDUCE=0
NUM_GPUS=2 # number of used GPU
# To run chatglm3-6b
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
CodeLlama run script:

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU

# To run CodeLlama-7b-Instruct-hf
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# To run CodeLlama-13b-Instruct-hf
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# To run CodeLlama-34b-Instruct-hf
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
Llama-2/Llama-3 run script:

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU

# To run Llama-2-7b-chat-hf
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# # To run Llama-2-13b-chat-hf
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# # To run Meta-Llama-3-8B-Instruct
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
Mistral/Mixtral run script:

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU

# To run Mistral-7B-v0.1
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# To run Mixtral-8x7B-Instruct-v0.1
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
Phi-3 run script:

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU

# To run Phi-3-medium-4k-instruct
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# # To run Phi-3-mini-4k-instruct
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
Qwen1.5 run script:

@@ -29,20 +29,20 @@ NUM_GPUS=2 # number of used GPU

# To run Qwen1.5-7B-Chat
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# # To run Qwen1.5-14B-Chat
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# # To run Qwen1.5-32B-Chat
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# # To run Qwen1.5-MoE-A2.7B-Chat
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# # To run CodeQwen1.5-7B-Chat
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
Qwen2 run script:

@@ -29,4 +29,4 @@ NUM_GPUS=2 # number of used GPU

# To run Qwen2-7B-Instruct
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
SOLAR run script:

@@ -30,4 +30,4 @@ NUM_GPUS=2 # number of used GPU

# To run SOLAR-10.7B-Instruct-v1.0
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
Vicuna run script:

@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU

# To run vicuna-7b-v1.3
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# To run vicuna-13b-v1.3
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# To run vicuna-33b-v1.3
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
Yi run script:

@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU

# To run Yi-6B-Chat
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'

# To run Yi-34B-Chat
# CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-# generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS
+# generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
