diff --git a/.ci/scripts/validate.sh b/.ci/scripts/validate.sh index d4bc75da5..971965305 100644 --- a/.ci/scripts/validate.sh +++ b/.ci/scripts/validate.sh @@ -14,7 +14,7 @@ function generate_eager_model_output() { local TARGET_DEVICE="${2:-cpu}" local MODEL_DIR="${CHECKPOINT_PATH%/*}" local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//') - echo "Run inference with eager model for $MODEL_NAME" + echo "Run inference with eager model" python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1 cat "$MODEL_DIR/output_eager" } @@ -24,9 +24,70 @@ function generate_compiled_model_output() { local TARGET_DEVICE="${2:-cpu}" local MODEL_DIR="${CHECKPOINT_PATH%/*}" local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//') - echo ""############### Run inference with torch.compile for $MODEL_NAME "###############" + + echo ""############### Run inference with torch.compile "###############" + echo "" + echo "******************************************" + echo "************** non-quantized *************" + echo "******************************************" python -W ignore generate.py --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1 cat "$MODEL_DIR/output_compiled" + + echo "******************************************" + echo "******* Emb: channel-wise quantized ******" + echo "******************************************" + python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1 + cat "$MODEL_DIR/output_eager" + python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1 + cat "$MODEL_DIR/output_compiled" + + echo "******************************************" + echo "******** Emb: group-wise quantized *******" + echo "******************************************" + python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1 + cat "$MODEL_DIR/output_eager" + python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1 + cat "$MODEL_DIR/output_compiled" + + echo "***********************************************" + echo "******* Emb: 4bit channel-wise quantized ******" + echo "***********************************************" + python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1 + cat "$MODEL_DIR/output_eager" + python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1 + cat "$MODEL_DIR/output_compiled" + + echo "***********************************************" + echo "******** Emb: 4bit group-wise quantized *******" + echo "***********************************************" + python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1 + cat "$MODEL_DIR/output_eager" + python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1 + cat "$MODEL_DIR/output_compiled" + + echo "******************************************" + echo "******* INT8 channel-wise quantized ******" + echo "******************************************" + python -W ignore generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1 + cat "$MODEL_DIR/output_eager" + python -W ignore generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1 + cat "$MODEL_DIR/output_compiled" + + echo "******************************************" + echo "******** INT8 group-wise quantized *******" + echo "******************************************" + python -W ignore generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1 + cat "$MODEL_DIR/output_eager" + python -W ignore generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1 + cat "$MODEL_DIR/output_compiled" + + echo "******************************************" + echo "******** INT4 group-wise quantized *******" + echo "******************************************" + python -W ignore generate.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1 + cat "$MODEL_DIR/output_eager" + python -W ignore generate.py --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1 + cat "$MODEL_DIR/output_compiled" } function generate_aoti_model_output() { @@ -34,10 +95,64 @@ function generate_aoti_model_output() { local TARGET_DEVICE="${2:-cpu}" local MODEL_DIR="${CHECKPOINT_PATH%/*}" local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//') - echo ""############### Run inference with AOTInductor for $MODEL_NAME "###############" - python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" + + echo ""############### Run inference with AOT Inductor "###############" + echo "" + echo "******************************************" + echo "************** non-quantized *************" + echo "******************************************" + python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1 python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1 cat "$MODEL_DIR/output_aoti" + + echo "******************************************" + echo "******* Emb: channel-wise quantized ******" + echo "******************************************" + python -W ignore export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1 + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1 + cat "$MODEL_DIR/output_aoti" + + echo "******************************************" + echo "******** Emb: group-wise quantized *******" + echo "******************************************" + python -W ignore export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1 + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1 + cat "$MODEL_DIR/output_aoti" + + echo "***********************************************" + echo "******* Emb: 4bit channel-wise quantized ******" + echo "***********************************************" + python -W ignore export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1 + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1 + cat "$MODEL_DIR/output_aoti" + + echo "***********************************************" + echo "******** Emb: 4bit group-wise quantized *******" + echo "***********************************************" + python -W ignore export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1 + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1 + cat "$MODEL_DIR/output_aoti" + + echo "******************************************" + echo "******* INT8 channel-wise quantized ******" + echo "******************************************" + python -W ignore export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1 + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1 + cat "$MODEL_DIR/output_aoti" + + echo "******************************************" + echo "******** INT8 group-wise quantized *******" + echo "******************************************" + python -W ignore export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1 + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1 + cat "$MODEL_DIR/output_aoti" + + echo "******************************************" + echo "******** INT4 group-wise quantized *******" + echo "******************************************" + python -W ignore export.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1 + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1 + cat "$MODEL_DIR/output_aoti" } function generate_executorch_model_output() { @@ -45,7 +160,7 @@ function generate_executorch_model_output() { local TARGET_DEVICE="${2:-cpu}" local MODEL_DIR="${CHECKPOINT_PATH%/*}" local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//') - echo ""############### Run inference with ExecuTorch using XNNPACK for $MODEL_NAME "###############" + echo ""############### Run inference with ExecuTorch using XNNPACK "###############" python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-pte-path "$MODEL_DIR/${MODEL_NAME}.pte" -d "fp32" || exit 1 python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" --pte-path "$MODEL_DIR/${MODEL_NAME}.pte" > "$MODEL_DIR/output_et" || exit 1 cat "$MODEL_DIR/output_et" diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml deleted file mode 100644 index a50d378f5..000000000 --- a/.github/workflows/compile.yml +++ /dev/null @@ -1,139 +0,0 @@ -name: Compile main - -on: - push: - branches: - - main - pull_request: - workflow_dispatch: - -jobs: - run-tinystories: - strategy: - matrix: - runner: [ubuntu-latest, macos-14] - runs-on: ${{matrix.runner}} - steps: - - name: Checkout repo - uses: actions/checkout@v2 - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.11 - - name: Print machine info - run: | - uname -a - if [ $(uname -s) == Darwin ]; then - sysctl machdep.cpu.brand_string - sysctl machdep.cpu.core_count - fi - - name: Install requirements - run: | - pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu - pip install -r requirements.txt - - name: Download checkpoints - run: | - mkdir -p checkpoints/stories15M - pushd checkpoints/stories15M - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt - wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model - popd - - name: Run inference - run: | - export MODEL_PATH=checkpoints/stories15M/stories15M.pt - export MODEL_NAME=stories15M - export MODEL_DIR=/tmp - python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******* Emb: channel-wise quantized ******" - echo "******************************************" - python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******** Emb: group-wise quantized *******" - echo "******************************************" - python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "***********************************************" - echo "******* Emb: 4bit channel-wise quantized ******" - echo "***********************************************" - python generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "***********************************************" - echo "******** Emb: 4bit group-wise quantized *******" - echo "***********************************************" - python generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******* INT8 channel-wise quantized ******" - echo "******************************************" - python generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******** INT8 group-wise quantized *******" - echo "******************************************" - python generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******** INT4 group-wise quantized *******" - echo "******************************************" - python generate.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "tests complete" - echo "******************************************" - # echo "********* EAGER vs TORCH.COMPILE *********" - # echo "******************************************" - # diff output_eager output_compiled - # echo "******************************************" - # echo "********* EAGER vs AOT INDUCTOR *********" - # echo "******************************************" - # diff output_eager output_aoti diff --git a/.github/workflows/compile_t4.yml b/.github/workflows/compile_t4.yml deleted file mode 100644 index 9815f46a1..000000000 --- a/.github/workflows/compile_t4.yml +++ /dev/null @@ -1,110 +0,0 @@ -name: Run compile tests - -on: - pull_request: - push: - branches: - - main - workflow_dispatch: - -jobs: - test-cuda: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - with: - runner: linux.g5.4xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.1" - script: | - echo "::group::Print machine info" - uname -a - if [ $(uname -s) == Darwin ]; then - sysctl machdep.cpu.brand_string - sysctl machdep.cpu.core_count - fi - echo "::endgroup::" - - echo "::group::Download checkpoints" - # Install requirements - pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 - pip install -r requirements.txt - echo "::endgroup::" - - echo "::group::Download checkpoints" - mkdir -p checkpoints/stories15M - pushd checkpoints/stories15M - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt - wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model - popd - echo "::endgroup::" - - echo "::group::Run inference" - export MODEL_PATH=checkpoints/stories15M/stories15M.pt - export MODEL_NAME=stories15M - export MODEL_DIR=/tmp - python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --device cuda --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --device cuda --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******* Emb: channel-wise quantized ******" - echo "******************************************" - python generate.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******** Emb: group-wise quantized *******" - echo "******************************************" - python generate.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******* INT8 channel-wise quantized ******" - echo "******************************************" - python generate.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******** INT8 group-wise quantized *******" - echo "******************************************" - python generate.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "******************************************" - echo "******** INT4 group-wise quantized *******" - echo "******************************************" - python generate.py --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager - cat ./output_eager - python generate.py --device cuda --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled - cat ./output_compiled - python export.py --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so - python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti - cat ./output_aoti - - echo "tests complete" - echo "******************************************" - echo "::endgroup::" -