diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index c93b011cf..6f5ee46c7 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -92,7 +92,7 @@ fi
 if [ "$1" == "multimodal" ]; then
 
   # Expecting that this might fail this test as-is, because
-  # it's the first on-pr test depending on githib secrets for access with HF token access
+  # it's the first on-pr test depending on github secrets for access with HF token access
 
   echo "::group::Create script to run multimodal"
   python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh
@@ -108,3 +108,20 @@ if [ "$1" == "multimodal" ]; then
   bash -x ./run-multimodal.sh
   echo "::endgroup::"
 fi
+
+if [ "$1" == "native" ]; then
+
+  echo "::group::Create script to run native-execution"
+  python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh
+  # for good measure, if something happened to updown processor,
+  # and it did not error out, fail with an exit 1
+  echo "exit 1" >> ./run-native.sh
+  echo "::endgroup::"
+
+  echo "::group::Run native-execution"
+  echo "*******************************************"
+  cat ./run-native.sh
+  echo "*******************************************"
+  bash -x ./run-native.sh
+  echo "::endgroup::"
+fi
diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml
index 718d5cf9e..3e90265f5 100644
--- a/.github/workflows/run-readme-pr-mps.yml
+++ b/.github/workflows/run-readme-pr-mps.yml
@@ -10,6 +10,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
       runner: macos-m1-14
+      timeout-minutes: 50
       script: |
         conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp
         conda activate test-readme-mps-macos
diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml
index 4e5e6d014..1dc2942ef 100644
--- a/.github/workflows/run-readme-pr.yml
+++ b/.github/workflows/run-readme-pr.yml
@@ -287,3 +287,46 @@ jobs:
         echo "::endgroup::"
 
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal
+
+  test-native-any:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        .ci/scripts/run-docs native
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"
+
+  test-native-cpu:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native
diff --git a/README.md b/README.md
index 4b910e575..3c37edf09 100644
--- a/README.md
+++ b/README.md
@@ -231,6 +231,8 @@ python3 torchchat.py server llama3.1
 ```
 [skip default]: end
+[shell default]: python3 torchchat.py server llama3.1 & server_pid=$!
+
 In another terminal, query the server using `curl`.
 Depending on the model configuration, this query might take a few minutes to respond.
 
 > [!NOTE]
@@ -244,8 +246,6 @@ Setting `stream` to "true" in the request emits a response in chunks. If `stream
 
 **Example Input + Output**
 
-[skip default]: begin
-
 ```
 curl http://127.0.0.1:5000/v1/chat/completions \
   -H "Content-Type: application/json" \
@@ -265,12 +265,14 @@ curl http://127.0.0.1:5000/v1/chat/completions \
     ]
   }'
 ```
+[skip default]: begin
 
 ```
 {"response":" I'm a software developer with a passion for building innovative and user-friendly applications. I have experience in developing web and mobile applications using various technologies such as Java, Python, and JavaScript. I'm always looking for new challenges and opportunities to learn and grow as a developer.\n\nIn my free time, I enjoy reading books on computer science and programming, as well as experimenting with new technologies and techniques. I'm also interested in machine learning and artificial intelligence, and I'm always looking for ways to apply these concepts to real-world problems.\n\nI'm excited to be a part of the developer community and to have the opportunity to share my knowledge and experience with others. I'm always happy to help with any questions or problems you may have, and I'm looking forward to learning from you as well.\n\nThank you for visiting my profile! I hope you find my information helpful and interesting. If you have any questions or would like to discuss any topics, please feel free to reach out to me. I"}
 ```
 
 [skip default]: end
+[shell default]: kill ${server_pid}
 
 
 
@@ -664,6 +666,6 @@ awesome libraries and tools you've built around local LLM inference.
 
 torchchat is released under the [BSD 3 license](LICENSE). (Additional code in this
 distribution is covered by the MIT and Apache Open Source
-licenses.) However you may have other legal obligations that govern
+licenses.) However, you may have other legal obligations that govern
 your use of content, such as the terms of service for third-party models.
 
diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md
index 8f66b8a29..b996bf202 100644
--- a/docs/ADVANCED-USERS.md
+++ b/docs/ADVANCED-USERS.md
@@ -251,6 +251,8 @@ To improve performance, you can compile the model with `--compile`
 trading off the time to first token processed with time per token.
 To improve performance further, you may also compile the prefill with
 `--compile-prefill`. This will increase further compilation times though.
+For CPU, you can use `--max-autotune` to further improve the performance
+with `--compile` and `--compile-prefill`. See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html).
 
 Parallel prefill is not yet supported by exported models, and may be
 supported in a future release.
diff --git a/docs/model_customization.md b/docs/model_customization.md
index 3c076fa71..7108b4ce2 100644
--- a/docs/model_customization.md
+++ b/docs/model_customization.md
@@ -34,6 +34,9 @@ prefill with `--compile_prefill`.
 
 To learn more about compilation, check out: https://pytorch.org/get-started/pytorch-2.0/
 
+For CPU, you can use `--max-autotune` to further improve the performance with `--compile` and `--compile_prefill`.
+
+See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html).
 
 
 ## Model Precision
diff --git a/install/install_requirements.sh b/install/install_requirements.sh
index a39c55cc8..3e1f9a655 100755
--- a/install/install_requirements.sh
+++ b/install/install_requirements.sh
@@ -62,13 +62,13 @@ echo "Using pip executable: $PIP_EXECUTABLE"
 
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-PYTORCH_NIGHTLY_VERSION=dev20241010
+PYTORCH_NIGHTLY_VERSION=dev20241013
 
 # Nightly version for torchvision
-VISION_NIGHTLY_VERSION=dev20241010
+VISION_NIGHTLY_VERSION=dev20241013
 
 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20241010
+TUNE_NIGHTLY_VERSION=dev20241013
 
 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same (