diff --git a/.ci/scripts/validate.sh b/.ci/scripts/validate.sh
index d4bc75da5..971965305 100644
--- a/.ci/scripts/validate.sh
+++ b/.ci/scripts/validate.sh
@@ -14,7 +14,7 @@ function generate_eager_model_output() {
     local TARGET_DEVICE="${2:-cpu}"
     local MODEL_DIR="${CHECKPOINT_PATH%/*}"
     local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
-    echo "Run inference with eager model for $MODEL_NAME"
+    echo "Run inference with eager model"
     python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
     cat "$MODEL_DIR/output_eager"
 }
@@ -24,9 +24,70 @@ function generate_compiled_model_output() {
     local TARGET_DEVICE="${2:-cpu}"
     local MODEL_DIR="${CHECKPOINT_PATH%/*}"
     local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
-    echo ""############### Run inference with torch.compile for $MODEL_NAME "###############"
+
+    echo "############### Run inference with torch.compile ###############"
+    echo ""
+    echo "******************************************"
+    echo "************** non-quantized *************"
+    echo "******************************************"
     python -W ignore generate.py --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
     cat "$MODEL_DIR/output_compiled"
+
+    echo "******************************************"
+    echo "******* Emb: channel-wise quantized ******"
+    echo "******************************************"
+    python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+    cat "$MODEL_DIR/output_eager"
+    python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+    cat "$MODEL_DIR/output_compiled"
+
+    echo "******************************************"
+    echo "******** Emb: group-wise quantized *******"
+    echo "******************************************"
+    python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+    cat "$MODEL_DIR/output_eager"
+    python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+    cat "$MODEL_DIR/output_compiled"
+
+    echo "***********************************************"
+    echo "******* Emb: 4bit channel-wise quantized ******"
+    echo "***********************************************"
+    python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+    cat "$MODEL_DIR/output_eager"
+    python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+    cat "$MODEL_DIR/output_compiled"
+
+    echo "***********************************************"
+    echo "******** Emb: 4bit group-wise quantized *******"
+    echo "***********************************************"
+    python -W ignore generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+    cat "$MODEL_DIR/output_eager"
+    python -W ignore generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+    cat "$MODEL_DIR/output_compiled"
+
+    echo "******************************************"
+    echo "******* INT8 channel-wise quantized ******"
+    echo "******************************************"
+    python -W ignore generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+    cat "$MODEL_DIR/output_eager"
+    python -W ignore generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+    cat "$MODEL_DIR/output_compiled"
+
+    echo "******************************************"
+    echo "******** INT8 group-wise quantized *******"
+    echo "******************************************"
+    python -W ignore generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+    cat "$MODEL_DIR/output_eager"
+    python -W ignore generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+    cat "$MODEL_DIR/output_compiled"
+
+    echo "******************************************"
+    echo "******** INT4 group-wise quantized *******"
+    echo "******************************************"
+    python -W ignore generate.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
+    cat "$MODEL_DIR/output_eager"
+    python -W ignore generate.py --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
+    cat "$MODEL_DIR/output_compiled"
 }
 
 function generate_aoti_model_output() {
@@ -34,10 +95,64 @@ function generate_aoti_model_output() {
     local TARGET_DEVICE="${2:-cpu}"
     local MODEL_DIR="${CHECKPOINT_PATH%/*}"
     local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
-    echo ""############### Run inference with AOTInductor for $MODEL_NAME "###############"
-    python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE"
+
+    echo "############### Run inference with AOT Inductor ###############"
+    echo ""
+    echo "******************************************"
+    echo "************** non-quantized *************"
+    echo "******************************************"
+    python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
     python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
     cat "$MODEL_DIR/output_aoti"
+
+    echo "******************************************"
+    echo "******* Emb: channel-wise quantized ******"
+    echo "******************************************"
+    python -W ignore export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+    cat "$MODEL_DIR/output_aoti"
+
+    echo "******************************************"
+    echo "******** Emb: group-wise quantized *******"
+    echo "******************************************"
+    python -W ignore export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+    cat "$MODEL_DIR/output_aoti"
+
+    echo "***********************************************"
+    echo "******* Emb: 4bit channel-wise quantized ******"
+    echo "***********************************************"
+    python -W ignore export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+    cat "$MODEL_DIR/output_aoti"
+
+    echo "***********************************************"
+    echo "******** Emb: 4bit group-wise quantized *******"
+    echo "***********************************************"
+    python -W ignore export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+    cat "$MODEL_DIR/output_aoti"
+
+    echo "******************************************"
+    echo "******* INT8 channel-wise quantized ******"
+    echo "******************************************"
+    python -W ignore export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+    cat "$MODEL_DIR/output_aoti"
+
+    echo "******************************************"
+    echo "******** INT8 group-wise quantized *******"
+    echo "******************************************"
+    python -W ignore export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+    cat "$MODEL_DIR/output_aoti"
+
+    echo "******************************************"
+    echo "******** INT4 group-wise quantized *******"
+    echo "******************************************"
+    python -W ignore export.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+    cat "$MODEL_DIR/output_aoti"
 }
 
 function generate_executorch_model_output() {
@@ -45,7 +160,7 @@ function generate_executorch_model_output() {
     local TARGET_DEVICE="${2:-cpu}"
     local MODEL_DIR="${CHECKPOINT_PATH%/*}"
     local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
-    echo ""############### Run inference with ExecuTorch using XNNPACK for $MODEL_NAME "###############"
+    echo "############### Run inference with ExecuTorch using XNNPACK ###############"
     python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-pte-path "$MODEL_DIR/${MODEL_NAME}.pte" -d "fp32" || exit 1
     python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" --pte-path "$MODEL_DIR/${MODEL_NAME}.pte" > "$MODEL_DIR/output_et" || exit 1
     cat "$MODEL_DIR/output_et"
diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
deleted file mode 100644
index a50d378f5..000000000
--- a/.github/workflows/compile.yml
+++ /dev/null
@@ -1,139 +0,0 @@
-name: Compile main
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-  workflow_dispatch:
-
-jobs:
-  run-tinystories:
-    strategy:
-      matrix:
-        runner: [ubuntu-latest, macos-14]
-    runs-on: ${{matrix.runner}}
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v2
-      - name: Setup Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.11
-      - name: Print machine info
-        run: |
-          uname -a
-          if [ $(uname -s) == Darwin ]; then
-            sysctl machdep.cpu.brand_string
-            sysctl machdep.cpu.core_count
-          fi
-      - name: Install requirements
-        run: |
-          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
-          pip install -r requirements.txt
-      - name: Download checkpoints
-        run: |
-          mkdir -p checkpoints/stories15M
-          pushd checkpoints/stories15M
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
-          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-          popd
-      - name: Run inference
-        run: |          
-          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
-          export MODEL_NAME=stories15M
-          export MODEL_DIR=/tmp
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-          cat ./output_eager
-          python generate.py --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-          cat ./output_compiled
-          python export.py --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-          cat ./output_aoti
-
-          echo "******************************************"
-          echo "******* Emb: channel-wise quantized ******"
-          echo "******************************************"
-          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-          cat ./output_eager
-          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-          cat ./output_compiled
-          python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-          cat ./output_aoti
-
-          echo "******************************************"
-          echo "******** Emb: group-wise quantized *******"
-          echo "******************************************"
-          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-          cat ./output_eager
-          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-          cat ./output_compiled
-          python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-          cat ./output_aoti
-
-          echo "***********************************************"
-          echo "******* Emb: 4bit channel-wise quantized ******"
-          echo "***********************************************"
-          python generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-          cat ./output_eager
-          python generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-          cat ./output_compiled
-          python export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-          cat ./output_aoti
-
-          echo "***********************************************"
-          echo "******** Emb: 4bit group-wise quantized *******"
-          echo "***********************************************"
-          python generate.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-          cat ./output_eager
-          python generate.py --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-          cat ./output_compiled
-          python export.py --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-          cat ./output_aoti
-
-          echo "******************************************"
-          echo "******* INT8 channel-wise quantized ******"
-          echo "******************************************"
-          python generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-          cat ./output_eager
-          python generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-          cat ./output_compiled
-          python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-          cat ./output_aoti
-
-          echo "******************************************"
-          echo "******** INT8 group-wise quantized *******"
-          echo "******************************************"
-          python generate.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-          cat ./output_eager
-          python generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-          cat ./output_compiled
-          python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-          cat ./output_aoti
-
-          echo "******************************************"
-          echo "******** INT4 group-wise quantized *******"
-          echo "******************************************"
-          python generate.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-          cat ./output_eager
-          python generate.py --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-          cat ./output_compiled
-          python export.py --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-          cat ./output_aoti
-
-          echo "tests complete"
-          echo "******************************************"
-          # echo "********* EAGER vs TORCH.COMPILE *********"
-          # echo "******************************************"
-          # diff output_eager output_compiled
-          # echo "******************************************"
-          # echo "********* EAGER vs AOT INDUCTOR  *********"
-          # echo "******************************************"
-          # diff output_eager output_aoti
diff --git a/.github/workflows/compile_t4.yml b/.github/workflows/compile_t4.yml
deleted file mode 100644
index 9815f46a1..000000000
--- a/.github/workflows/compile_t4.yml
+++ /dev/null
@@ -1,110 +0,0 @@
-name: Run compile tests
-
-on:
-  pull_request:
-  push:
-    branches:
-      - main
-  workflow_dispatch:
-
-jobs:
-  test-cuda:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.g5.4xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.1"
-      script: |
-        echo "::group::Print machine info"
-        uname -a
-        if [ $(uname -s) == Darwin ]; then
-          sysctl machdep.cpu.brand_string
-          sysctl machdep.cpu.core_count
-        fi
-        echo "::endgroup::"
-
-        echo "::group::Download checkpoints"
-        # Install requirements
-        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
-        pip install -r requirements.txt
-        echo "::endgroup::"
-
-        echo "::group::Download checkpoints"
-        mkdir -p checkpoints/stories15M
-        pushd checkpoints/stories15M
-        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
-        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-        popd
-        echo "::endgroup::"
-
-        echo "::group::Run inference"
-        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
-        export MODEL_NAME=stories15M
-        export MODEL_DIR=/tmp
-        python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        cat ./output_eager
-        python generate.py --device cuda --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-        cat ./output_compiled
-        python export.py --device cuda --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-        python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-        cat ./output_aoti
-
-        echo "******************************************"
-        echo "******* Emb: channel-wise quantized ******"
-        echo "******************************************"
-        python generate.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        cat ./output_eager
-        python generate.py --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-        cat ./output_compiled
-        python export.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-        python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-        cat ./output_aoti
-
-        echo "******************************************"
-        echo "******** Emb: group-wise quantized *******"
-        echo "******************************************"
-        python generate.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        cat ./output_eager
-        python generate.py --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-        cat ./output_compiled
-        python export.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-        python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-        cat ./output_aoti
-
-        echo "******************************************"
-        echo "******* INT8 channel-wise quantized ******"
-        echo "******************************************"
-        python generate.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        cat ./output_eager
-        python generate.py --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-        cat ./output_compiled
-        python export.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-        python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-        cat ./output_aoti
-
-        echo "******************************************"
-        echo "******** INT8 group-wise quantized *******"
-        echo "******************************************"
-        python generate.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        cat ./output_eager
-        python generate.py --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-        cat ./output_compiled
-        python export.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-        python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-        cat ./output_aoti
-
-        echo "******************************************"
-        echo "******** INT4 group-wise quantized *******"
-        echo "******************************************"
-        python generate.py --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        cat ./output_eager
-        python generate.py --device cuda --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
-        cat ./output_compiled
-        python export.py --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-        python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so  > ./output_aoti
-        cat ./output_aoti
-
-        echo "tests complete"
-        echo "******************************************"
-        echo "::endgroup::"
-