From e904a85d98773f24bb572f6b187b53fe0e85d474 Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Tue, 8 Aug 2023 09:34:41 -0500 Subject: [PATCH 001/212] Set FORCE_RPATH for ROCm (#1468) --- manywheel/build_rocm.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/manywheel/build_rocm.sh b/manywheel/build_rocm.sh index 0c1650f9be..fbbf7d3a65 100755 --- a/manywheel/build_rocm.sh +++ b/manywheel/build_rocm.sh @@ -14,6 +14,8 @@ export USE_STATIC_NCCL=1 export ATEN_STATIC_CUDA=1 export USE_CUDA_STATIC_LINK=1 export INSTALL_TEST=0 # dont install test binaries into site-packages +# Set RPATH instead of RUNPATH when using patchelf to avoid LD_LIBRARY_PATH override +export FORCE_RPATH="--force-rpath" # Keep an array of cmake variables to add to if [[ -z "$CMAKE_ARGS" ]]; then From 3c467094834426cfcd8320a5264f19a539e190c7 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 8 Aug 2023 13:46:29 -0400 Subject: [PATCH 002/212] Decouple aarch64 ci setup and build (#1470) --- aarch64_linux/aarch64_ci_build.sh | 43 ++---------------------------- aarch64_linux/aarch64_ci_setup.sh | 44 +++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 41 deletions(-) mode change 100755 => 100644 aarch64_linux/aarch64_ci_build.sh create mode 100644 aarch64_linux/aarch64_ci_setup.sh diff --git a/aarch64_linux/aarch64_ci_build.sh b/aarch64_linux/aarch64_ci_build.sh old mode 100755 new mode 100644 index 41843cead4..c374359c27 --- a/aarch64_linux/aarch64_ci_build.sh +++ b/aarch64_linux/aarch64_ci_build.sh @@ -1,47 +1,8 @@ #!/bin/bash set -eux -o pipefail -# This script is used to prepare the Docker container for aarch64_ci_wheel_build.py python script -# as we need to install conda and setup the python version for the build. 
- -CONDA_PYTHON_EXE=/opt/conda/bin/python -CONDA_EXE=/opt/conda/bin/conda -PATH=/opt/conda/bin:$PATH -LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH - -############################################################################### -# Install OS dependent packages -############################################################################### -yum -y install epel-release -yum -y install less zstd libgomp - -############################################################################### -# Install conda -# disable SSL_verify due to getting "Could not find a suitable TLS CA certificate bundle, invalid path" -# when using Python version, less than the conda latest -############################################################################### -echo 'Installing conda-forge' -curl -L -o /mambaforge.sh https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh -chmod +x /mambaforge.sh -/mambaforge.sh -b -p /opt/conda -rm /mambaforge.sh -/opt/conda/bin/conda config --set ssl_verify False -/opt/conda/bin/conda install -y -c conda-forge python=${DESIRED_PYTHON} numpy pyyaml setuptools patchelf pygit2 openblas -python --version -conda --version - -############################################################################### -# Exec libglfortran.a hack -# -# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC. -# This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. 
To solve, get -# ubuntu's libgfortran.a which is compiled with -fPIC -############################################################################### -cd ~/ -curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb -ar x ~/libgfortran-10-dev.deb -tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ -cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/ +SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" +source $SCRIPTPATH/aarch64_ci_setup.sh ############################################################################### # Run aarch64 builder python diff --git a/aarch64_linux/aarch64_ci_setup.sh b/aarch64_linux/aarch64_ci_setup.sh new file mode 100644 index 0000000000..c7065056a2 --- /dev/null +++ b/aarch64_linux/aarch64_ci_setup.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -eux -o pipefail + +# This script is used to prepare the Docker container for aarch64_ci_wheel_build.py python script +# as we need to install conda and setup the python version for the build. 
+ +CONDA_PYTHON_EXE=/opt/conda/bin/python +CONDA_EXE=/opt/conda/bin/conda +PATH=/opt/conda/bin:$PATH +LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + +############################################################################### +# Install OS dependent packages +############################################################################### +yum -y install epel-release +yum -y install less zstd libgomp + +############################################################################### +# Install conda +# disable SSL_verify due to getting "Could not find a suitable TLS CA certificate bundle, invalid path" +# when using Python version, less than the conda latest +############################################################################### +echo 'Installing conda-forge' +curl -L -o /mambaforge.sh https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh +chmod +x /mambaforge.sh +/mambaforge.sh -b -p /opt/conda +rm /mambaforge.sh +/opt/conda/bin/conda config --set ssl_verify False +/opt/conda/bin/conda install -y -c conda-forge python=${DESIRED_PYTHON} numpy pyyaml setuptools patchelf pygit2 openblas +python --version +conda --version + +############################################################################### +# Exec libglfortran.a hack +# +# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC. +# This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. 
To solve, get +# ubuntu's libgfortran.a which is compiled with -fPIC +############################################################################### +cd ~/ +curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb +ar x ~/libgfortran-10-dev.deb +tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ +cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/ From 14851d94f784d04534433288cb6dbc448a8f8193 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 8 Aug 2023 14:06:58 -0400 Subject: [PATCH 003/212] Run git update-index --chmod=+x aarch64_ci_setup.sh (#1471) --- aarch64_linux/aarch64_ci_setup.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 aarch64_linux/aarch64_ci_setup.sh diff --git a/aarch64_linux/aarch64_ci_setup.sh b/aarch64_linux/aarch64_ci_setup.sh old mode 100644 new mode 100755 From bb821d4afed1b7ce9a2fd2ee043d173cde2dfd5c Mon Sep 17 00:00:00 2001 From: Mike Schneider <104035434+xncqr@users.noreply.github.com> Date: Wed, 9 Aug 2023 11:02:34 -0700 Subject: [PATCH 004/212] [aarch64][CICD]Add aarch64 docker image build. 
(#1472) * Add aarch64 docker image build * removing ulimit for PT workflow * set aarch64 worker for docker build --- .github/workflows/build-manywheel-images.yml | 17 ++++ aarch64_linux/aarch64_ci_setup.sh | 21 +---- aarch64_linux/aarch64_wheel_ci_build.py | 67 ++++++++------- manywheel/Dockerfile_aarch64 | 86 ++++++++++++++++++++ manywheel/build_all_docker.sh | 2 + manywheel/build_docker.sh | 8 ++ 6 files changed, 147 insertions(+), 54 deletions(-) create mode 100644 manywheel/Dockerfile_aarch64 diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 69af67803f..df890f0389 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -12,6 +12,7 @@ on: paths: - .github/workflows/build-manywheel-images.yml - manywheel/Dockerfile + - manywheel/Dockerfile_aarch64 - manywheel/Dockerfile_cxx11-abi - manywheel/build_docker.sh - 'common/*' @@ -19,6 +20,7 @@ on: paths: - .github/workflows/build-manywheel-images.yml - manywheel/Dockerfile + - manywheel/Dockerfile_aarch64 - manywheel/Dockerfile_cxx11-abi - 'common/*' - manywheel/build_docker.sh @@ -82,6 +84,21 @@ jobs: - name: Build Docker Image run: | manywheel/build_docker.sh + build-docker-cpu-aarch64: + runs-on: linux.t4g.2xlarge + env: + GPU_ARCH_TYPE: cpu-aarch64 + steps: + - name: Checkout PyTorch + uses: actions/checkout@v3 + - name: Authenticate if WITH_PUSH + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + run: | + manywheel/build_docker.sh build-docker-cpu-cxx11-abi: runs-on: ubuntu-22.04 env: diff --git a/aarch64_linux/aarch64_ci_setup.sh b/aarch64_linux/aarch64_ci_setup.sh index c7065056a2..6d2d780fe8 100755 --- a/aarch64_linux/aarch64_ci_setup.sh +++ b/aarch64_linux/aarch64_ci_setup.sh @@ -9,12 +9,6 @@ CONDA_EXE=/opt/conda/bin/conda PATH=/opt/conda/bin:$PATH 
LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH -############################################################################### -# Install OS dependent packages -############################################################################### -yum -y install epel-release -yum -y install less zstd libgomp - ############################################################################### # Install conda # disable SSL_verify due to getting "Could not find a suitable TLS CA certificate bundle, invalid path" @@ -26,19 +20,6 @@ chmod +x /mambaforge.sh /mambaforge.sh -b -p /opt/conda rm /mambaforge.sh /opt/conda/bin/conda config --set ssl_verify False -/opt/conda/bin/conda install -y -c conda-forge python=${DESIRED_PYTHON} numpy pyyaml setuptools patchelf pygit2 openblas +/opt/conda/bin/conda install -y -c conda-forge python=${DESIRED_PYTHON} numpy pyyaml setuptools patchelf pygit2 openblas ninja scons python --version conda --version - -############################################################################### -# Exec libglfortran.a hack -# -# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC. -# This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. 
To solve, get -# ubuntu's libgfortran.a which is compiled with -fPIC -############################################################################### -cd ~/ -curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb -ar x ~/libgfortran-10-dev.deb -tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ -cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/ diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 125cfe9fd8..5d80a95e4e 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# encoding: UTF-8 import os import subprocess @@ -6,36 +7,36 @@ from typing import List -'''' -Helper for getting paths for Python -''' def list_dir(path: str) -> List[str]: - return subprocess.check_output(["ls", "-1", path]).decode().split("\n") + '''' + Helper for getting paths for Python + ''' + return subprocess.check_output(["ls", "-1", path]).decode().split("\n") -''' -Using ArmComputeLibrary for aarch64 PyTorch -''' def build_ArmComputeLibrary(git_clone_flags: str = "") -> None: + ''' + Using ArmComputeLibrary for aarch64 PyTorch + ''' print('Building Arm Compute Library') os.system("cd / && mkdir /acl") os.system(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.05.1 {git_clone_flags}") os.system('sed -i -e \'s/"armv8.2-a"/"armv8-a"/g\' ComputeLibrary/SConscript; ' 'sed -i -e \'s/-march=armv8.2-a+fp16/-march=armv8-a/g\' ComputeLibrary/SConstruct; ' 'sed -i -e \'s/"-march=armv8.2-a"/"-march=armv8-a"/g\' ComputeLibrary/filedefs.json') - os.system(f"cd ComputeLibrary; export acl_install_dir=/acl; " \ - f"scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8.2-a multi_isa=1 build=native build_dir=$acl_install_dir/build; " \ - f"cp -r arm_compute 
$acl_install_dir; " \ - f"cp -r include $acl_install_dir; " \ - f"cp -r utils $acl_install_dir; " \ - f"cp -r support $acl_install_dir; " \ - f"cp -r src $acl_install_dir; cd /") - - -''' -Complete wheel build and put in artifact location -''' + os.system("cd ComputeLibrary; export acl_install_dir=/acl; " + "scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8.2-a multi_isa=1 build=native build_dir=$acl_install_dir/build; " + "cp -r arm_compute $acl_install_dir; " + "cp -r include $acl_install_dir; " + "cp -r utils $acl_install_dir; " + "cp -r support $acl_install_dir; " + "cp -r src $acl_install_dir; cd /") + + def complete_wheel(folder: str): + ''' + Complete wheel build and put in artifact location + ''' wheel_name = list_dir(f"/{folder}/dist")[0] if "pytorch" in folder: @@ -54,10 +55,10 @@ def complete_wheel(folder: str): return repaired_wheel_name -''' -Parse inline arguments -''' def parse_arguments(): + ''' + Parse inline arguments + ''' from argparse import ArgumentParser parser = ArgumentParser("AARCH64 wheels python CD") parser.add_argument("--debug", action="store_true") @@ -67,11 +68,10 @@ def parse_arguments(): return parser.parse_args() -''' -Entry Point -''' if __name__ == '__main__': - + ''' + Entry Point + ''' args = parse_arguments() enable_mkldnn = args.enable_mkldnn repo = Repository('/pytorch') @@ -80,15 +80,14 @@ def parse_arguments(): branch = 'master' git_clone_flags = " --depth 1 --shallow-submodules" - os.system(f"conda install -y ninja scons") print('Building PyTorch wheel') build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " - os.system(f"python setup.py clean") + os.system("python setup.py clean") if branch == 'nightly' or branch == 'master': - build_date = subprocess.check_output(['git','log','--pretty=format:%cs','-1'], cwd='/pytorch').decode().replace('-','') - version = subprocess.check_output(['cat','version.txt'], cwd='/pytorch').decode().strip()[:-2] + build_date = 
subprocess.check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '') + version = subprocess.check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2] build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 " if branch.startswith("v1.") or branch.startswith("v2."): build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " @@ -96,10 +95,10 @@ def parse_arguments(): build_ArmComputeLibrary(git_clone_flags) print("build pytorch with mkldnn+acl backend") build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " \ - "ACL_ROOT_DIR=/acl " \ - "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH " \ - "ACL_INCLUDE_DIR=/acl/build " \ - "ACL_LIBRARY=/acl/build " + "ACL_ROOT_DIR=/acl " \ + "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH " \ + "ACL_INCLUDE_DIR=/acl/build " \ + "ACL_LIBRARY=/acl/build " else: print("build pytorch without mkldnn backend") diff --git a/manywheel/Dockerfile_aarch64 b/manywheel/Dockerfile_aarch64 new file mode 100644 index 0000000000..abfc2fd844 --- /dev/null +++ b/manywheel/Dockerfile_aarch64 @@ -0,0 +1,86 @@ +FROM quay.io/pypa/manylinux2014_aarch64 as base + + +# Graviton needs GCC 10 for the build +ARG DEVTOOLSET_VERSION=10 + +# Language variabes +ENV LC_ALL=en_US.UTF-8 +ENV LANG=en_US.UTF-8 +ENV LANGUAGE=en_US.UTF-8 + +# Installed needed OS packages. 
This is to support all +# the binary builds (torch, vision, audio, text, data) +RUN yum -y install epel-release +RUN yum -y update +RUN yum install -y \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + yasm \ + less \ + zstd \ + libgomp \ + devtoolset-${DEVTOOLSET_VERSION}-gcc \ + devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \ + devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ + devtoolset-${DEVTOOLSET_VERSION}-binutils + +# Ensure the expected devtoolset is used +ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH + + +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + + +############################################################################### +# libglfortran.a hack +# +# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC. +# This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. 
To solve, get +# ubuntu's libgfortran.a which is compiled with -fPIC +# NOTE: Need a better way to get this library as Ubuntu's package can be removed by the vender, or changed +############################################################################### +RUN cd ~/ \ + && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb \ + && ar x ~/libgfortran-10-dev.deb \ + && tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \ + && cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/ + +# install cmake +RUN yum install -y cmake3 && \ + ln -s /usr/bin/cmake3 /usr/bin/cmake + +FROM base as openssl +# Install openssl (this must precede `build python` step) +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) +ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh +ENV SSL_CERT_FILE=/opt/_internal/certs.pem + +FROM openssl as final +# remove unncessary python versions +RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 +RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 +RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 +RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 diff --git a/manywheel/build_all_docker.sh b/manywheel/build_all_docker.sh index 97c1f89ab5..2bd720f2f6 100644 --- a/manywheel/build_all_docker.sh +++ b/manywheel/build_all_docker.sh @@ -7,6 +7,8 @@ TOPDIR=$(git rev-parse --show-toplevel) GPU_ARCH_TYPE=cpu "${TOPDIR}/manywheel/build_docker.sh" MANYLINUX_VERSION=2014 GPU_ARCH_TYPE=cpu "${TOPDIR}/manywheel/build_docker.sh" +GPU_ARCH_TYPE=cpu-aarch64 "${TOPDIR}/manywheel/build_docker.sh" + GPU_ARCH_TYPE=cpu-cxx11-abi "${TOPDIR}/manywheel/build_docker.sh" for cuda_version in 
12.1 11.8; do diff --git a/manywheel/build_docker.sh b/manywheel/build_docker.sh index 38d043ff43..e547b42757 100755 --- a/manywheel/build_docker.sh +++ b/manywheel/build_docker.sh @@ -20,6 +20,14 @@ case ${GPU_ARCH_TYPE} in GPU_IMAGE=centos:7 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" ;; + cpu-aarch64) + TARGET=final + DOCKER_TAG=cpu-aarch64 + LEGACY_DOCKER_IMAGE=${DOCKER_REGISTRY}/pytorch/manylinux-cpu-aarch64 + GPU_IMAGE=arm64v8/centos:7 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10" + MANY_LINUX_VERSION="aarch64" + ;; cpu-cxx11-abi) TARGET=final DOCKER_TAG=cpu-cxx11-abi From 8715349b3ffbb67a9cf7054491da6f3296d62e75 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 12 Aug 2023 00:37:59 +0000 Subject: [PATCH 005/212] Fix `install_conda.sh` By pinning conda version to 23.5.2 as latest(23.7.2 at this time) does not have a compatible version of `git` packages Fixes https://github.com/pytorch/builder/issues/1473 --- common/install_conda.sh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/common/install_conda.sh b/common/install_conda.sh index bd06075257..6ae978f058 100644 --- a/common/install_conda.sh +++ b/common/install_conda.sh @@ -3,13 +3,18 @@ set -ex # Anaconda -wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -chmod +x Miniconda3-latest-Linux-x86_64.sh +# Latest anaconda is using openssl-3 which is incompatible with all currently published versions of git +# Which are using openssl-1.1.1, see https://anaconda.org/anaconda/git/files?version=2.40.1 for example +MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Linux-x86_64.sh +wget -q $MINICONDA_URL # NB: Manually invoke bash per https://github.com/conda/conda/issues/10431 -bash ./Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda -rm Miniconda3-latest-Linux-x86_64.sh +bash $(basename "$MINICONDA_URL") -b -p /opt/conda +rm $(basename "$MINICONDA_URL") export PATH=/opt/conda/bin:$PATH +# See 
https://github.com/pytorch/builder/issues/1473 +# Pin conda to 23.5.2 as it's the last one compatible with openssl-1.1.1 +conda install -y conda=23.5.2 conda-build anaconda-client git ninja # The cmake version here needs to match with the minimum version of cmake # supported by PyTorch (3.18). There is only 3.18.2 on anaconda -conda install -y conda-build anaconda-client git ninja cmake=3.18.2 +/opt/conda/bin/pip3 install cmake==3.18.2 conda remove -y --force patchelf From 912957ae6690d7dd5af7261b5c4ec8e237a35e63 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 12 Aug 2023 01:26:09 +0000 Subject: [PATCH 006/212] Remove explicit `conda install cmake` As it's already done as part of `common/install_conda.sh` script --- libtorch/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/libtorch/Dockerfile b/libtorch/Dockerfile index a71d785940..c01c6416e4 100644 --- a/libtorch/Dockerfile +++ b/libtorch/Dockerfile @@ -44,7 +44,6 @@ ENV CUDA_HOME /usr/local/cuda FROM base as conda ADD ./common/install_conda.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh -RUN /opt/conda/bin/conda install -y cmake=3.18 FROM cuda as cuda11.8 RUN bash ./install_cuda.sh 11.8 From 941be28cb5c686dc41b7ea8681701e64192c3002 Mon Sep 17 00:00:00 2001 From: ptrblck Date: Fri, 11 Aug 2023 21:20:54 -0700 Subject: [PATCH 007/212] update to CUDA 12.1U1 (#1476) Should fix pytorch/pytorch#94772 in wheel builds --- common/install_cuda.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/install_cuda.sh b/common/install_cuda.sh index 6972c61b14..e087a44c1c 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -37,10 +37,10 @@ function install_121 { echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.18.1" rm -rf /usr/local/cuda-12.1 /usr/local/cuda # install CUDA 12.1.0 in the same container - wget -q https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run - chmod +x 
cuda_12.1.0_530.30.02_linux.run - ./cuda_12.1.0_530.30.02_linux.run --toolkit --silent - rm -f cuda_12.1.0_530.30.02_linux.run + wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run + chmod +x cuda_12.1.1_530.30.02_linux.run + ./cuda_12.1.1_530.30.02_linux.run --toolkit --silent + rm -f cuda_12.1.1_530.30.02_linux.run rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.1 /usr/local/cuda # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement From 19007fecd4f1990f7392fb50d66f5fb9faadcf33 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 14 Aug 2023 11:28:26 -0400 Subject: [PATCH 008/212] Use conda version 23.5.2 for conda pytorch build (#1477) --- conda/build_pytorch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 19c7c5b63d..748d2604e5 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -337,7 +337,7 @@ for py_ver in "${DESIRED_PYTHON[@]}"; do # Build the package echo "Build $build_folder for Python version $py_ver" conda config --set anaconda_upload no - conda install -y conda-package-handling conda==22.9.0 + conda install -y conda-package-handling conda==23.5.2 if [[ "$OSTYPE" == "msys" ]]; then # Don't run tests on windows (they were ignored mostly anyways) From 5585c052357c87026e75160cfa2c35911e51d4f0 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 14 Aug 2023 13:48:14 -0400 Subject: [PATCH 009/212] Use py311 miniconda install (#1479) --- conda/build_pytorch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 748d2604e5..c25cd4f6a1 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -201,7 +201,7 @@ if [[ "$(uname)" == 'Darwin' ]]; then miniconda_sh="${MAC_PACKAGE_WORK_DIR}/miniconda.sh" rm -rf "$tmp_conda" rm -f "$miniconda_sh" - retry curl -sS 
https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-x86_64.sh -o "$miniconda_sh" + retry curl -sS https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-MacOSX-x86_64.sh -o "$miniconda_sh" chmod +x "$miniconda_sh" && \ "$miniconda_sh" -b -p "$tmp_conda" && \ rm "$miniconda_sh" @@ -212,7 +212,7 @@ elif [[ "$OSTYPE" == "msys" ]]; then export miniconda_exe="${WIN_PACKAGE_WORK_DIR}\\miniconda.exe" rm -rf "$tmp_conda" rm -f "$miniconda_exe" - curl -sSk https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Windows-x86_64.exe -o "$miniconda_exe" + curl -sSk https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Windows-x86_64.exe -o "$miniconda_exe" "$SOURCE_DIR/install_conda.bat" && rm "$miniconda_exe" pushd $tmp_conda export PATH="$(pwd):$(pwd)/Library/usr/bin:$(pwd)/Library/bin:$(pwd)/Scripts:$(pwd)/bin:$PATH" From 963129206eeef51062aa2c881f0c57e39282e72c Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 14 Aug 2023 14:09:42 -0400 Subject: [PATCH 010/212] Windows conda build fix (#1480) --- conda/build_pytorch.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index c25cd4f6a1..5501a25a2a 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -337,12 +337,14 @@ for py_ver in "${DESIRED_PYTHON[@]}"; do # Build the package echo "Build $build_folder for Python version $py_ver" conda config --set anaconda_upload no - conda install -y conda-package-handling conda==23.5.2 if [[ "$OSTYPE" == "msys" ]]; then # Don't run tests on windows (they were ignored mostly anyways) NO_TEST="--no-test" + # Fow windows need to keep older conda version + conda install -y conda-package-handling conda==22.9.0 else + conda install -y conda-package-handling conda==23.5.2 # NS: To be removed after conda docker images are updated conda update -y conda-build fi From 9f0c8ebdc77d5a1e0445839379cb3e69a6de29f0 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 14 Aug 2023 
14:18:03 -0400 Subject: [PATCH 011/212] Revert "Use py311 miniconda install (#1479)" (#1481) This reverts commit 5585c052357c87026e75160cfa2c35911e51d4f0. --- conda/build_pytorch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 5501a25a2a..ef630e155d 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -201,7 +201,7 @@ if [[ "$(uname)" == 'Darwin' ]]; then miniconda_sh="${MAC_PACKAGE_WORK_DIR}/miniconda.sh" rm -rf "$tmp_conda" rm -f "$miniconda_sh" - retry curl -sS https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-MacOSX-x86_64.sh -o "$miniconda_sh" + retry curl -sS https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-x86_64.sh -o "$miniconda_sh" chmod +x "$miniconda_sh" && \ "$miniconda_sh" -b -p "$tmp_conda" && \ rm "$miniconda_sh" @@ -212,7 +212,7 @@ elif [[ "$OSTYPE" == "msys" ]]; then export miniconda_exe="${WIN_PACKAGE_WORK_DIR}\\miniconda.exe" rm -rf "$tmp_conda" rm -f "$miniconda_exe" - curl -sSk https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Windows-x86_64.exe -o "$miniconda_exe" + curl -sSk https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Windows-x86_64.exe -o "$miniconda_exe" "$SOURCE_DIR/install_conda.bat" && rm "$miniconda_exe" pushd $tmp_conda export PATH="$(pwd):$(pwd)/Library/usr/bin:$(pwd)/Library/bin:$(pwd)/Scripts:$(pwd)/bin:$PATH" From 82aef70b417124d92783b3376abc1f6519d181ce Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 14 Aug 2023 17:17:22 -0400 Subject: [PATCH 012/212] Remove c/cb folder on windows (#1482) --- conda/build_pytorch.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index ef630e155d..88626a34ab 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -286,9 +286,7 @@ fi # Some tricks for sccache with conda builds on Windows if [[ "$OSTYPE" == "msys" && "$USE_SCCACHE" == "1" ]]; then - if [[ ! 
-d "/c/cb" ]]; then - rm -rf /c/cb - fi + rm -rf /c/cb mkdir -p /c/cb/pytorch_1000000000000 export CONDA_BLD_PATH="C:\\cb" export CONDA_BUILD_EXTRA_ARGS="--dirty" From 64187393523109aefd1c3ec2f2193bc40d50d410 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 15 Aug 2023 09:36:33 -0400 Subject: [PATCH 013/212] Add numpy install - fix windows smoke tests (#1483) * Add numpy install * Add numpy install --- windows/internal/smoke_test.bat | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/windows/internal/smoke_test.bat b/windows/internal/smoke_test.bat index decb2cfb1a..ad276b9928 100644 --- a/windows/internal/smoke_test.bat +++ b/windows/internal/smoke_test.bat @@ -91,7 +91,9 @@ call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 :: do conda install to make sure all the dependencies are installed -call conda install -yq pytorch %CONDA_EXTRA_ARGS% +:: Install numpy see: https://github.com/pytorch/pytorch/issues/107228 +:: todo: Remove numpy install once the issue above is resolved +call conda install -yq numpy pytorch %CONDA_EXTRA_ARGS% if ERRORLEVEL 1 exit /b 1 set /a CUDA_VER=%CUDA_VERSION% From 3f44ffe2a901fe78ed0b5161d5509a80d9a7fe83 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 15 Aug 2023 10:20:24 -0400 Subject: [PATCH 014/212] Add hostedtoolcache purge step (#1484) * Add hostedtoolcache purge step * Change step name --- .github/workflows/build-manywheel-images.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index df890f0389..3bf2dbe70e 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -42,6 +42,8 @@ jobs: GPU_ARCH_TYPE: cuda GPU_ARCH_VERSION: ${{ matrix.cuda_version }} steps: + - name: Purge tools folder (free space for build) + run: rm -rf /opt/hostedtoolcache - name: Checkout PyTorch builder uses: actions/checkout@v3 - name: Authenticate if 
WITH_PUSH From 2db96763ce9f9eae959982cb1272cfc56a72eefc Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 15 Aug 2023 10:33:32 -0400 Subject: [PATCH 015/212] Update CUDA_UPGRADE_GUIDE.MD --- CUDA_UPGRADE_GUIDE.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CUDA_UPGRADE_GUIDE.MD b/CUDA_UPGRADE_GUIDE.MD index bee03ecd1a..ca8687cbd4 100644 --- a/CUDA_UPGRADE_GUIDE.MD +++ b/CUDA_UPGRADE_GUIDE.MD @@ -52,7 +52,7 @@ There are three types of Docker containers we maintain in order to build Linux b ## 3. Update Magma for Linux Build Magma for Linux. Our Linux CUDA jobs use conda, so we need to build magma-cuda116 and push it to anaconda: -1. Follow this [PR 997](https://github.com/pytorch/builder/pull/997) for all steps in this section +1. Follow this [PR 1368](https://github.com/pytorch/builder/pull/1368) for all steps in this section 2. Currently, this is mainly copy-paste in [`magma/Makefile`](magma/Makefile) if there are no major code API changes/deprecations to the CUDA version. Previously, we've needed to add patches to MAGMA, so this may be something to check with NVIDIA about. 3. To push the package, please update build-magma-linux workflow [PR 897](https://github.com/pytorch/builder/pull/897). 4. NOTE: This step relies on the conda-builder image (changes to `.github/workflows/build-conda-images.yml`), so make sure you have pushed the new conda-builder prior. 
Validate this step by logging into anaconda.org and seeing your package deployed for example [here](https://anaconda.org/pytorch/magma-cuda115) From 294487f161d49c7128dae12188a9b70f93303c5f Mon Sep 17 00:00:00 2001 From: ptrblck Date: Tue, 15 Aug 2023 08:40:18 -0700 Subject: [PATCH 016/212] update CUDA to 12.1U1 for Windows (#1485) --- windows/internal/cuda_install.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/windows/internal/cuda_install.bat b/windows/internal/cuda_install.bat index fd20541ee0..acd457a170 100644 --- a/windows/internal/cuda_install.bat +++ b/windows/internal/cuda_install.bat @@ -54,7 +54,7 @@ goto cuda_common :cuda121 -set CUDA_INSTALL_EXE=cuda_12.1.0_531.14_windows.exe +set CUDA_INSTALL_EXE=cuda_12.1.1_531.14_windows.exe if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" if errorlevel 1 exit /b 1 From c8a03796b91d6e436ba3ed3c2c0c69af7785462d Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 16 Aug 2023 09:27:32 -0400 Subject: [PATCH 017/212] Small improvements in build pytorch script (#1486) --- conda/build_pytorch.sh | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 88626a34ab..12c0fb8272 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -98,13 +98,11 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then # These are passed to tools/build_pytorch_libs.sh::build_caffe2() EXTRA_CAFFE2_CMAKE_FLAGS=() fi + if [[ -z "$DESIRED_PYTHON" ]]; then - if [[ "$OSTYPE" == "msys" ]]; then - DESIRED_PYTHON=('3.5' '3.6' '3.7') - else - DESIRED_PYTHON=('2.7' '3.5' '3.6' '3.7' '3.8') - fi + DESIRED_PYTHON=('3.8') fi + if [[ "$OSTYPE" == "darwin"* ]]; then DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer fi @@ -366,7 +364,7 @@ for py_ver in "${DESIRED_PYTHON[@]}"; do # TODO these reqs are 
hardcoded for pytorch-nightly test_env="env_$folder_tag" retry conda create -yn "$test_env" python="$py_ver" - source activate "$test_env" + conda activate "$test_env" # Extract the package for testing ls -lah "$output_folder" @@ -410,14 +408,19 @@ for py_ver in "${DESIRED_PYTHON[@]}"; do fi # Clean up test folder - source deactivate + conda deactivate conda env remove -yn "$test_env" rm -rf "$output_folder" done # Cleanup the tricks for sccache with conda builds on Windows if [[ "$OSTYPE" == "msys" ]]; then + # Please note sometimes we get Device or resource busy during + # this cleanup step. We don't want to fail the build because of this + # hence adding +e, -e around the cleanup step + set +e rm -rf /c/cb/pytorch_1000000000000 + set -e unset CONDA_BLD_PATH fi unset CONDA_BUILD_EXTRA_ARGS From 3e7a8560d3979e079d21b2700fb5776eb1c6bbf8 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 16 Aug 2023 13:34:10 -0400 Subject: [PATCH 018/212] Undo using conda activate (#1487) --- conda/build_pytorch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 12c0fb8272..6f8eaf502a 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -364,7 +364,7 @@ for py_ver in "${DESIRED_PYTHON[@]}"; do # TODO these reqs are hardcoded for pytorch-nightly test_env="env_$folder_tag" retry conda create -yn "$test_env" python="$py_ver" - conda activate "$test_env" + source activate "$test_env" # Extract the package for testing ls -lah "$output_folder" @@ -408,7 +408,7 @@ for py_ver in "${DESIRED_PYTHON[@]}"; do fi # Clean up test folder - conda deactivate + source deactivate conda env remove -yn "$test_env" rm -rf "$output_folder" done From a5aa27fd0254b4f389293ae1a721b82e6fa24030 Mon Sep 17 00:00:00 2001 From: JYX Date: Sat, 19 Aug 2023 04:30:50 +0800 Subject: [PATCH 019/212] Update meta.yaml (#1389) --- conda/pytorch-nightly/meta.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 52db9ac0ee..5cab9f53d6 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -42,6 +42,9 @@ requirements: {% endif %} - libuv # [win] - intel-openmp # [win] + # llvm-openmp 16 leads to wrong processor affinity for fork child, see #99625. + # Before a decent fix, force llvm-openmp version <16. + - llvm-openmp <16 # [linux] - typing_extensions - sympy - filelock From eba456f07dca6de6fee488c163a40d97f9e4db9c Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Tue, 22 Aug 2023 18:37:16 -0500 Subject: [PATCH 020/212] Add pytorch-triton-rocm as an install dependency for ROCm (#1463) * Add pytorch-triton-rocm as an install dependency for ROCm * Update build_rocm.sh --- manywheel/build_rocm.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/manywheel/build_rocm.sh b/manywheel/build_rocm.sh index fbbf7d3a65..80ebde6f41 100755 --- a/manywheel/build_rocm.sh +++ b/manywheel/build_rocm.sh @@ -214,6 +214,18 @@ elif [[ $ROCM_INT -ge 50600 ]]; then DEPS_AUX_DSTLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_DST/}) fi +# Add triton install dependency +if [[ $(uname) == "Linux" ]]; then + TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt) + TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) + + if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}" + else + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}" + fi +fi + echo "PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH}" From 331b031b85e8a90cdb06e3db2b73e6b4e08c742a Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 23 Aug 2023 11:23:28 -0400 Subject: [PATCH 021/212] Add aarch64 to validation framework (#1474) --- 
.github/scripts/validate_binaries.sh | 2 + .../validate-aarch64-linux-binaries.yml | 61 +++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 .github/workflows/validate-aarch64-linux-binaries.yml diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 56668ab774..cdcbea30b0 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -27,6 +27,8 @@ else if [[ ${TARGET_OS} == 'windows' ]]; then python ./test/smoke_test/smoke_test.py + elif [[ ${TARGET_OS} == 'aarch64-linux' ]]; then + python3 ./test/smoke_test/smoke_test.py --package=torchonly else python3 ./test/smoke_test/smoke_test.py fi diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml new file mode 100644 index 0000000000..3ffefa52ab --- /dev/null +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -0,0 +1,61 @@ +name: Validate Aarch64 linux binaries + +on: + workflow_call: + inputs: + channel: + description: "Channel to use (nightly, test, release, all)" + required: true + type: string + ref: + description: 'Reference to checkout, defaults to empty' + default: "" + required: false + type: string + workflow_dispatch: + inputs: + channel: + description: "Channel to use (nightly, test, release, all)" + required: true + type: choice + options: + - release + - nightly + - test + - all + ref: + description: 'Reference to checkout, defaults to empty' + default: "" + required: false + type: string + +jobs: + generate-aarch64-linux-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux-aarch64 + channel: ${{ inputs.channel }} + with-cuda: disable + + linux: + needs: generate-aarch64-linux-matrix + strategy: + matrix: ${{ fromJson(needs.generate-linux-matrix.outputs.matrix) }} + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + name: ${{ 
matrix.build_name }} + with: + runner: ${{ matrix.validation_runner }} + repository: "pytorch/builder" + ref: ${{ inputs.ref || github.ref }} + job-name: ${{ matrix.build_name }} + binary-matrix: ${{ toJSON(matrix) }} + script: | + set -ex + export ENV_NAME="conda-env-${{ github.run_id }}" + export TARGET_OS="aarch64-linux" + eval "$(conda shell.bash hook)" + + # Standart case: Validate binaries + source ./.github/scripts/validate_binaries.sh From 3f1e42f065de54b3044fe39f273aeaa5452162e2 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 23 Aug 2023 11:40:14 -0400 Subject: [PATCH 022/212] Add aarch64 to validation framework (#1489) --- .github/scripts/validate_binaries.sh | 2 +- .github/workflows/validate-aarch64-linux-binaries.yml | 2 +- .github/workflows/validate-binaries.yml | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index cdcbea30b0..e1d0ec75bf 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -27,7 +27,7 @@ else if [[ ${TARGET_OS} == 'windows' ]]; then python ./test/smoke_test/smoke_test.py - elif [[ ${TARGET_OS} == 'aarch64-linux' ]]; then + elif [[ ${TARGET_OS} == 'linux-aarch64' ]]; then python3 ./test/smoke_test/smoke_test.py --package=torchonly else python3 ./test/smoke_test/smoke_test.py diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index 3ffefa52ab..a174e57bee 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -54,7 +54,7 @@ jobs: script: | set -ex export ENV_NAME="conda-env-${{ github.run_id }}" - export TARGET_OS="aarch64-linux" + export TARGET_OS="linux-aarch64" eval "$(conda shell.bash hook)" # Standart case: Validate binaries diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index 
2a6106a206..080352f905 100644 --- a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -32,6 +32,7 @@ on: options: - windows - linux + - linux-aarch64 - macos - all channel: @@ -65,6 +66,13 @@ jobs: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} + linux: + if: inputs.os == 'linux-aarch64' + uses: ./.github/workflows/validate-aarch64-linux-binaries.yml + with: + channel: ${{ inputs.channel }} + ref: ${{ inputs.ref || github.ref }} + mac: if: inputs.os == 'macos' || inputs.os == 'all' uses: ./.github/workflows/validate-macos-binaries.yml From 1656d22c7f4ca6a230cf4656fd62406dc80772b4 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 23 Aug 2023 11:47:08 -0400 Subject: [PATCH 023/212] Add aarch64 to validation framework (#1490) * Add aarch64 to validation framework * Add aarch64 to validation framework --- .github/workflows/validate-binaries.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index 080352f905..7a679a267b 100644 --- a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -66,7 +66,7 @@ jobs: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} - linux: + linux-aarch64: if: inputs.os == 'linux-aarch64' uses: ./.github/workflows/validate-aarch64-linux-binaries.yml with: From 0ceb5a98f8f1fe762167b1d16de7028c83de3066 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 23 Aug 2023 11:58:29 -0400 Subject: [PATCH 024/212] Add aarch64 to validation framework (#1491) * Add aarch64 to validation framework * Add aarch64 to validation framework * Add aarch64 to validation framework --- .github/workflows/validate-aarch64-linux-binaries.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index a174e57bee..4b1f06720e 100644 
--- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -38,10 +38,10 @@ jobs: channel: ${{ inputs.channel }} with-cuda: disable - linux: + linux-aarch64: needs: generate-aarch64-linux-matrix strategy: - matrix: ${{ fromJson(needs.generate-linux-matrix.outputs.matrix) }} + matrix: ${{ fromJson(needs.generate-aarch64-linux-matrix.outputs.matrix) }} fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job.yml@main name: ${{ matrix.build_name }} From 897b1dfd70acbe85df155c0ba3e7eb731b2dca47 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 23 Aug 2023 12:45:03 -0400 Subject: [PATCH 025/212] Temporary disable poetry test (#1492) --- .github/workflows/validate-linux-binaries.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index e33d89fb60..13d980488d 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -61,7 +61,8 @@ jobs: ([[ ${MATRIX_GPU_ARCH_VERSION} == "12.1" && ${MATRIX_CHANNEL} != "release" ]] || \ [[ ${MATRIX_GPU_ARCH_VERSION} == "11.7" && ${MATRIX_CHANNEL} == "release" ]]); then source ./.github/scripts/validate_pipy.sh --runtime-error-check disabled - source ./.github/scripts/validate_poetry.sh --runtime-error-check disabled + # temporary disable poetry check + # source ./.github/scripts/validate_poetry.sh --runtime-error-check disabled fi # Standart case: Validate binaries From 63e0dab0723b6f733b551d2747326630450335e0 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 23 Aug 2023 17:00:34 -0400 Subject: [PATCH 026/212] Add torchonly option to validation workflows (#1494) * Add torchonly option to validation workflows * fix typo --- .github/scripts/validate_binaries.sh | 9 +++++++-- .../workflows/validate-aarch64-linux-binaries.yml | 11 +++++++++++ .github/workflows/validate-binaries.yml | 15 
+++++++++++++++ .github/workflows/validate-linux-binaries.yml | 11 +++++++++++ .../workflows/validate-macos-arm64-binaries.yml | 11 +++++++++++ .github/workflows/validate-macos-binaries.yml | 11 +++++++++++ .github/workflows/validate-windows-binaries.yml | 11 +++++++++++ 7 files changed, 77 insertions(+), 2 deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index e1d0ec75bf..6b4bccd6ba 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -6,6 +6,11 @@ else conda create -y -n ${ENV_NAME} python=${MATRIX_PYTHON_VERSION} numpy ffmpeg conda activate ${ENV_NAME} INSTALLATION=${MATRIX_INSTALLATION/"conda install"/"conda install -y"} + TEST_SUFFIX="" + if [[ ${TORCH_ONLY} == 'true' ]]; then + INSTALLATION=${INSTALLATION/"torchvision torchaudio"/""} + TEST_SUFFIX=" --package torchonly" + fi export OLD_PATH=${PATH} # Workaround macos-arm64 runners. Issue: https://github.com/pytorch/test-infra/issues/4342 @@ -26,11 +31,11 @@ else fi if [[ ${TARGET_OS} == 'windows' ]]; then - python ./test/smoke_test/smoke_test.py + python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} elif [[ ${TARGET_OS} == 'linux-aarch64' ]]; then python3 ./test/smoke_test/smoke_test.py --package=torchonly else - python3 ./test/smoke_test/smoke_test.py + python3 ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} fi if [[ ${TARGET_OS} == 'macos-arm64' ]]; then diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index 4b1f06720e..92b5e48ea8 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -12,6 +12,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -28,6 +33,11 @@ on: default: "" required: false type: string + torchonly: + 
description: 'Validate torchonly' + default: false + required: false + type: boolean jobs: generate-aarch64-linux-matrix: @@ -55,6 +65,7 @@ jobs: set -ex export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="linux-aarch64" + export TORCH_ONLY=${{ inputs.torchonly }} eval "$(conda shell.bash hook)" # Standart case: Validate binaries diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index 7a679a267b..9c877f4edd 100644 --- a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -22,6 +22,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean workflow_dispatch: inputs: os: @@ -50,6 +55,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean jobs: win: @@ -58,6 +68,7 @@ jobs: with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} + torchonly: ${{ inputs.torchonly }} linux: if: inputs.os == 'linux' || inputs.os == 'all' @@ -65,6 +76,7 @@ jobs: with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} + torchonly: ${{ inputs.torchonly }} linux-aarch64: if: inputs.os == 'linux-aarch64' @@ -72,6 +84,7 @@ jobs: with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} + torchonly: ${{ inputs.torchonly }} mac: if: inputs.os == 'macos' || inputs.os == 'all' @@ -79,6 +92,7 @@ jobs: with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} + torchonly: ${{ inputs.torchonly }} mac-arm64: if: inputs.os == 'macos' || inputs.os == 'all' @@ -86,3 +100,4 @@ jobs: with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} + torchonly: ${{ inputs.torchonly }} diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 13d980488d..937f0e95f2 100644 --- 
a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -12,6 +12,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -28,6 +33,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean jobs: generate-linux-matrix: @@ -53,6 +63,7 @@ jobs: script: | set -ex export ENV_NAME="conda-env-${{ github.run_id }}" + export TORCH_ONLY=${{ inputs.torchonly }} export TARGET_OS="linux" eval "$(conda shell.bash hook)" diff --git a/.github/workflows/validate-macos-arm64-binaries.yml b/.github/workflows/validate-macos-arm64-binaries.yml index f321022d42..f23dec3f6d 100644 --- a/.github/workflows/validate-macos-arm64-binaries.yml +++ b/.github/workflows/validate-macos-arm64-binaries.yml @@ -12,6 +12,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -28,6 +33,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean jobs: generate-macos-arm64-matrix: @@ -53,4 +63,5 @@ jobs: set -ex export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="macos-arm64" + export TORCH_ONLY=${{ inputs.torchonly }} source ./.github/scripts/validate_binaries.sh diff --git a/.github/workflows/validate-macos-binaries.yml b/.github/workflows/validate-macos-binaries.yml index 0e3f38ff86..0926dbe933 100644 --- a/.github/workflows/validate-macos-binaries.yml +++ b/.github/workflows/validate-macos-binaries.yml @@ -12,6 +12,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean workflow_dispatch: inputs: 
channel: @@ -28,6 +33,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean jobs: generate-macos-matrix: @@ -53,4 +63,5 @@ jobs: set -ex export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="macos" + export TORCH_ONLY=${{ inputs.torchonly }} source ./.github/scripts/validate_binaries.sh diff --git a/.github/workflows/validate-windows-binaries.yml b/.github/workflows/validate-windows-binaries.yml index 463626c5a6..96d2b281ee 100644 --- a/.github/workflows/validate-windows-binaries.yml +++ b/.github/workflows/validate-windows-binaries.yml @@ -12,6 +12,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -28,6 +33,11 @@ on: default: "" required: false type: string + torchonly: + description: 'Validate torchonly' + default: false + required: false + type: boolean jobs: generate-windows-matrix: @@ -55,6 +65,7 @@ jobs: set -ex export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="windows" + export TORCH_ONLY=${{ inputs.torchonly }} source /c/Jenkins/Miniconda3/etc/profile.d/conda.sh if [[ ${MATRIX_GPU_ARCH_VERSION} == "12.1" ]]; then ./windows/internal/driver_update.bat From dbc20b68b9f8432a90fa7961d437ace2b3386cf2 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 25 Aug 2023 14:04:05 -0400 Subject: [PATCH 027/212] Remove pipy validation temporarily (#1495) --- .github/workflows/validate-linux-binaries.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 937f0e95f2..088e4dec88 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -71,7 +71,7 @@ jobs: if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" ]] && \ ([[ ${MATRIX_GPU_ARCH_VERSION} == 
"12.1" && ${MATRIX_CHANNEL} != "release" ]] || \ [[ ${MATRIX_GPU_ARCH_VERSION} == "11.7" && ${MATRIX_CHANNEL} == "release" ]]); then - source ./.github/scripts/validate_pipy.sh --runtime-error-check disabled + # source ./.github/scripts/validate_pipy.sh --runtime-error-check disabled # temporary disable poetry check # source ./.github/scripts/validate_poetry.sh --runtime-error-check disabled fi From 7ce4bc75ae1729bc696edc9359d9c7df8a45d4b7 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 25 Aug 2023 15:25:23 -0400 Subject: [PATCH 028/212] Remove pipy validation temporarily (#1496) --- .github/workflows/validate-linux-binaries.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 088e4dec88..6d135e3a38 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -67,14 +67,5 @@ jobs: export TARGET_OS="linux" eval "$(conda shell.bash hook)" - # Special case PyPi installation package. 
And Install of PyPi package via poetry - if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" ]] && \ - ([[ ${MATRIX_GPU_ARCH_VERSION} == "12.1" && ${MATRIX_CHANNEL} != "release" ]] || \ - [[ ${MATRIX_GPU_ARCH_VERSION} == "11.7" && ${MATRIX_CHANNEL} == "release" ]]); then - # source ./.github/scripts/validate_pipy.sh --runtime-error-check disabled - # temporary disable poetry check - # source ./.github/scripts/validate_poetry.sh --runtime-error-check disabled - fi - # Standart case: Validate binaries source ./.github/scripts/validate_binaries.sh From 99f34d63f2cd577f2619c38468fc16e77660c5fe Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 12:14:23 -0400 Subject: [PATCH 029/212] Add no-sudo to linux-aarch64 tests (#1499) --- .github/workflows/validate-aarch64-linux-binaries.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index 92b5e48ea8..f496498cfa 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -61,6 +61,7 @@ jobs: ref: ${{ inputs.ref || github.ref }} job-name: ${{ matrix.build_name }} binary-matrix: ${{ toJSON(matrix) }} + no-sudo: true script: | set -ex export ENV_NAME="conda-env-${{ github.run_id }}" From 6cbaf70f2e4e71ebc8605de35326487344f23d2a Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 12:32:01 -0400 Subject: [PATCH 030/212] Pass container image to aarch64 test jobs (#1500) --- .github/workflows/validate-aarch64-linux-binaries.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index f496498cfa..57a37da05d 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -60,6 +60,7 @@ jobs: repository: "pytorch/builder" ref: ${{ inputs.ref || github.ref }} 
job-name: ${{ matrix.build_name }} + docker-image: ${{ matrix.container_image }} binary-matrix: ${{ toJSON(matrix) }} no-sudo: true script: | From 912bb2aac5ae5f905f2636f84f84db4768f93a11 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 12:43:01 -0400 Subject: [PATCH 031/212] Add setup aarch64 builds for aarch64 testing (#1501) --- .github/workflows/validate-aarch64-linux-binaries.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index 57a37da05d..b067f4821e 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -65,6 +65,9 @@ jobs: no-sudo: true script: | set -ex + source ./aarch64_linux/aarch64_ci_setup.sh + echo "/opt/conda/bin" >> $GITHUB_PATH + export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="linux-aarch64" export TORCH_ONLY=${{ inputs.torchonly }} From 0c3634c34e036c7467c6c5d548b4057e46c2ed5c Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 12:50:32 -0400 Subject: [PATCH 032/212] Fix DESIRED_PYTHON setting for aarch64 validations (#1502) --- .github/workflows/validate-aarch64-linux-binaries.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index b067f4821e..d3e57fd5c7 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -65,6 +65,7 @@ jobs: no-sudo: true script: | set -ex + export DESIRED_PYTHON=${{ matrix.python_version }} source ./aarch64_linux/aarch64_ci_setup.sh echo "/opt/conda/bin" >> $GITHUB_PATH From f50bd85685bbe6662cbb51658648f4fa35dd3a47 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 15:29:19 -0400 Subject: [PATCH 033/212] Use extra-index-url for aarch64 builds (#1503) --- 
.github/workflows/validate-aarch64-linux-binaries.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index d3e57fd5c7..b66f0f1a39 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -69,6 +69,8 @@ jobs: source ./aarch64_linux/aarch64_ci_setup.sh echo "/opt/conda/bin" >> $GITHUB_PATH + MATRIX_INSTALLATION=${MATRIX_INSTALLATION/"index-url"/"extra-index-url"} + export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="linux-aarch64" export TORCH_ONLY=${{ inputs.torchonly }} From 8b39596d58c2232c2c3a6aebc850eaced1c5b1ed Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 16:25:00 -0400 Subject: [PATCH 034/212] Pypi validation enable (#1504) --- .github/workflows/validate-linux-binaries.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 6d135e3a38..937f0e95f2 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -67,5 +67,14 @@ jobs: export TARGET_OS="linux" eval "$(conda shell.bash hook)" + # Special case PyPi installation package. 
And Install of PyPi package via poetry + if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" ]] && \ + ([[ ${MATRIX_GPU_ARCH_VERSION} == "12.1" && ${MATRIX_CHANNEL} != "release" ]] || \ + [[ ${MATRIX_GPU_ARCH_VERSION} == "11.7" && ${MATRIX_CHANNEL} == "release" ]]); then + source ./.github/scripts/validate_pipy.sh --runtime-error-check disabled + # temporary disable poetry check + # source ./.github/scripts/validate_poetry.sh --runtime-error-check disabled + fi + # Standart case: Validate binaries source ./.github/scripts/validate_binaries.sh From 3c2d43c0c7572019e53be34d6a0f0a38392c2c63 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 16:46:43 -0400 Subject: [PATCH 035/212] Validation pypi torchonly (#1505) --- .github/scripts/validate_pipy.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/scripts/validate_pipy.sh b/.github/scripts/validate_pipy.sh index 85b788cf0d..d840fe0194 100644 --- a/.github/scripts/validate_pipy.sh +++ b/.github/scripts/validate_pipy.sh @@ -1,12 +1,18 @@ conda create -yp ${ENV_NAME}_pypi python=${MATRIX_PYTHON_VERSION} numpy ffmpeg -if [[ ${MATRIX_CHANNEL} != "release" ]]; then +TEST_SUFFIX="" +if [[ ${TORCH_ONLY} == 'true' ]]; then + TEST_SUFFIX=" --package torchonly" conda run -p ${ENV_NAME}_pypi pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" - conda run -p ${ENV_NAME}_pypi pip3 install --pre torchvision torchaudio --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" else - conda run -p ${ENV_NAME}_pypi pip3 install torch torchvision torchaudio + if [[ ${MATRIX_CHANNEL} != "release" ]]; then + conda run -p ${ENV_NAME}_pypi pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + conda run -p ${ENV_NAME}_pypi pip3 install --pre torchvision torchaudio --index-url 
"https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" + else + conda run -p ${ENV_NAME}_pypi pip3 install torch torchvision torchaudio + fi fi -conda run -p ${ENV_NAME}_pypi python ./test/smoke_test/smoke_test.py +conda run -p ${ENV_NAME}_pypi python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled conda deactivate conda env remove -p ${ENV_NAME}_pypi From 3145dfade0628a82c02cc850962f7bd670f58525 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 18:07:38 -0400 Subject: [PATCH 036/212] Pipy validation workflow (#1506) --- .github/scripts/validate_pipy.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/scripts/validate_pipy.sh b/.github/scripts/validate_pipy.sh index d840fe0194..c8c87b277f 100644 --- a/.github/scripts/validate_pipy.sh +++ b/.github/scripts/validate_pipy.sh @@ -1,18 +1,20 @@ conda create -yp ${ENV_NAME}_pypi python=${MATRIX_PYTHON_VERSION} numpy ffmpeg +conda activate ${ENV_NAME}_pypi TEST_SUFFIX="" if [[ ${TORCH_ONLY} == 'true' ]]; then TEST_SUFFIX=" --package torchonly" - conda run -p ${ENV_NAME}_pypi pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" else if [[ ${MATRIX_CHANNEL} != "release" ]]; then - conda run -p ${ENV_NAME}_pypi pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" - conda run -p ${ENV_NAME}_pypi pip3 install --pre torchvision torchaudio --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" + pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + pip3 install --pre torchvision torchaudio --index-url 
"https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" else - conda run -p ${ENV_NAME}_pypi pip3 install torch torchvision torchaudio + pip3 install torch torchvision torchaudio fi fi -conda run -p ${ENV_NAME}_pypi python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled +python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled + conda deactivate conda env remove -p ${ENV_NAME}_pypi From c7f4331998edd1f6915dcaab39d25bb50bac679c Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 18:39:51 -0400 Subject: [PATCH 037/212] Pipy validation workflow (#1507) --- .github/scripts/validate_pipy.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/scripts/validate_pipy.sh b/.github/scripts/validate_pipy.sh index c8c87b277f..91f8d692f9 100644 --- a/.github/scripts/validate_pipy.sh +++ b/.github/scripts/validate_pipy.sh @@ -1,20 +1,18 @@ conda create -yp ${ENV_NAME}_pypi python=${MATRIX_PYTHON_VERSION} numpy ffmpeg -conda activate ${ENV_NAME}_pypi TEST_SUFFIX="" if [[ ${TORCH_ONLY} == 'true' ]]; then TEST_SUFFIX=" --package torchonly" - pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + conda run -p ${ENV_NAME}_pypi pip install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" else if [[ ${MATRIX_CHANNEL} != "release" ]]; then - pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" - pip3 install --pre torchvision torchaudio --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" + conda run -p ${ENV_NAME}_pypi pip install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + conda run -p ${ENV_NAME}_pypi pip install --pre torchvision torchaudio --index-url 
"https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" else - pip3 install torch torchvision torchaudio + conda run -p ${ENV_NAME}_pypi pip install torch torchvision torchaudio fi fi -python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled - +conda run -p ${ENV_NAME}_pypi python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled conda deactivate conda env remove -p ${ENV_NAME}_pypi From ccbfd40227ce8c6789bc50b3df0676fd0a9a1028 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 19:04:34 -0400 Subject: [PATCH 038/212] Pipy validation workflow (#1508) --- .github/scripts/validate_pipy.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/scripts/validate_pipy.sh b/.github/scripts/validate_pipy.sh index 91f8d692f9..578e2c8d42 100644 --- a/.github/scripts/validate_pipy.sh +++ b/.github/scripts/validate_pipy.sh @@ -1,18 +1,19 @@ -conda create -yp ${ENV_NAME}_pypi python=${MATRIX_PYTHON_VERSION} numpy ffmpeg +conda create -yn ${ENV_NAME}_pypi python=${MATRIX_PYTHON_VERSION} numpy ffmpeg +conda activate ${ENV_NAME}_pypi TEST_SUFFIX="" if [[ ${TORCH_ONLY} == 'true' ]]; then TEST_SUFFIX=" --package torchonly" - conda run -p ${ENV_NAME}_pypi pip install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" else if [[ ${MATRIX_CHANNEL} != "release" ]]; then - conda run -p ${ENV_NAME}_pypi pip install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" - conda run -p ${ENV_NAME}_pypi pip install --pre torchvision torchaudio --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" + pip3 install --pre torch --index-url 
"https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + pip3 install --pre torchvision torchaudio --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" else - conda run -p ${ENV_NAME}_pypi pip install torch torchvision torchaudio + pip3 install torch torchvision torchaudio fi fi -conda run -p ${ENV_NAME}_pypi python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled +python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled conda deactivate conda env remove -p ${ENV_NAME}_pypi From fe02df78f44c6c61b396410766a3333dc87b3c31 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 29 Aug 2023 19:22:14 -0400 Subject: [PATCH 039/212] Pipy validation workflow (#1509) --- .github/scripts/validate_pipy.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/validate_pipy.sh b/.github/scripts/validate_pipy.sh index 578e2c8d42..ed79150799 100644 --- a/.github/scripts/validate_pipy.sh +++ b/.github/scripts/validate_pipy.sh @@ -4,11 +4,11 @@ conda activate ${ENV_NAME}_pypi TEST_SUFFIX="" if [[ ${TORCH_ONLY} == 'true' ]]; then TEST_SUFFIX=" --package torchonly" - pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + pip3 install --pre torch --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" else if [[ ${MATRIX_CHANNEL} != "release" ]]; then - pip3 install --pre torch --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" - pip3 install --pre torchvision torchaudio --index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" + pip3 install --pre torch --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + pip3 install --pre torchvision torchaudio --extra-index-url 
"https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" else pip3 install torch torchvision torchaudio fi From 10f5379c4e2744b9800248c95f248c61ce1e7e06 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 30 Aug 2023 07:54:24 -0400 Subject: [PATCH 040/212] Validate poetry workflow (#1511) --- .github/scripts/validate_poetry.sh | 20 +++++++++++++++---- .github/workflows/validate-linux-binaries.yml | 4 ++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/scripts/validate_poetry.sh b/.github/scripts/validate_poetry.sh index 65540bb358..85101a409e 100644 --- a/.github/scripts/validate_poetry.sh +++ b/.github/scripts/validate_poetry.sh @@ -8,20 +8,32 @@ poetry --version poetry new test_poetry cd test_poetry +TEST_SUFFIX="" +if [[ ${TORCH_ONLY} == 'true' ]]; then + TEST_SUFFIX=" --package torchonly" +else + if [[ ${MATRIX_CHANNEL} != "release" ]]; then # Installing poetry from our custom repo. We need to configure it before use and disable authentication export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring poetry source add --priority=explicit domains "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" - poetry source add --priority=supplemental pytorch-nightly "https://download.pytorch.org/whl/${MATRIX_CHANNEL}" + poetry source add --priority=supplemental pytorch-channel "https://download.pytorch.org/whl/${MATRIX_CHANNEL}" poetry source add --priority=supplemental pytorch "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" poetry --quiet add --source pytorch torch - poetry --quiet add --source domains torchvision torchaudio + + if [[ ${TORCH_ONLY} != 'true' ]]; then + poetry --quiet add --source domains torchvision torchaudio + fi else export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring - poetry --quiet add torch torchaudio torchvision + if [[ ${TORCH_ONLY} == 'true' ]]; then + poetry --quiet add torch + else + poetry --quiet add torch torchaudio 
torchvision + fi fi -python ../test/smoke_test/smoke_test.py +python ../test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled conda deactivate conda env remove -p ${ENV_NAME}_poetry cd .. diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 937f0e95f2..12335e8436 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -71,9 +71,9 @@ jobs: if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" ]] && \ ([[ ${MATRIX_GPU_ARCH_VERSION} == "12.1" && ${MATRIX_CHANNEL} != "release" ]] || \ [[ ${MATRIX_GPU_ARCH_VERSION} == "11.7" && ${MATRIX_CHANNEL} == "release" ]]); then - source ./.github/scripts/validate_pipy.sh --runtime-error-check disabled + source ./.github/scripts/validate_pipy.sh # temporary disable poetry check - # source ./.github/scripts/validate_poetry.sh --runtime-error-check disabled + source ./.github/scripts/validate_poetry.sh fi # Standart case: Validate binaries From d172580f54e4ba2dd0f3ae64e51b413605a65c00 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 30 Aug 2023 09:14:30 -0400 Subject: [PATCH 041/212] Validate poetry workflow (#1512) --- .github/scripts/validate_poetry.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/validate_poetry.sh b/.github/scripts/validate_poetry.sh index 85101a409e..c4e4fd1549 100644 --- a/.github/scripts/validate_poetry.sh +++ b/.github/scripts/validate_poetry.sh @@ -11,7 +11,7 @@ cd test_poetry TEST_SUFFIX="" if [[ ${TORCH_ONLY} == 'true' ]]; then TEST_SUFFIX=" --package torchonly" -else +fi if [[ ${MATRIX_CHANNEL} != "release" ]]; then # Installing poetry from our custom repo. 
We need to configure it before use and disable authentication From 894baacd1214ae7c58456b00f6b4f90efdbb6fc5 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 30 Aug 2023 11:22:18 -0400 Subject: [PATCH 042/212] Remove linux-aarch64 installation workaround (#1513) --- .github/workflows/validate-aarch64-linux-binaries.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index b66f0f1a39..d3e57fd5c7 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -69,8 +69,6 @@ jobs: source ./aarch64_linux/aarch64_ci_setup.sh echo "/opt/conda/bin" >> $GITHUB_PATH - MATRIX_INSTALLATION=${MATRIX_INSTALLATION/"index-url"/"extra-index-url"} - export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="linux-aarch64" export TORCH_ONLY=${{ inputs.torchonly }} From 1e281befc6ca16e16d826965b3dd6b784be20f77 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 31 Aug 2023 15:13:17 -0400 Subject: [PATCH 043/212] Temporary change test aarch64 builds (#1514) --- .github/workflows/validate-aarch64-linux-binaries.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index d3e57fd5c7..4d8b493342 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -69,6 +69,9 @@ jobs: source ./aarch64_linux/aarch64_ci_setup.sh echo "/opt/conda/bin" >> $GITHUB_PATH + # todo: Remove after aarch64 filename is fixed + export MATRIX_INSTALLATION=${MATRIX_INSTALLATION/"pip3 install"/"pip3 install --pre"} + export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="linux-aarch64" export TORCH_ONLY=${{ inputs.torchonly }} From 39740d922a9c8dc0bca4ddb8d242ae9d43c67e0f Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 31 
Aug 2023 18:12:08 -0400 Subject: [PATCH 044/212] Remove torchonly restictions from aarch64 builds (#1517) --- .github/scripts/validate_binaries.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 6b4bccd6ba..0c01dbca22 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -32,8 +32,6 @@ else if [[ ${TARGET_OS} == 'windows' ]]; then python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} - elif [[ ${TARGET_OS} == 'linux-aarch64' ]]; then - python3 ./test/smoke_test/smoke_test.py --package=torchonly else python3 ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} fi From 57ee59abdb99440428aa5821dd077620937f4b93 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 1 Sep 2023 11:09:57 -0400 Subject: [PATCH 045/212] Fix aarch64 nightly/release version override (#1518) * Aarch64 fix overrdie passing from CI to build * Aarch64 fix overrdie passing from CI to build * Aarch64 fix overrdie passing from CI to build --- aarch64_linux/aarch64_wheel_ci_build.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 5d80a95e4e..4ad620ba2e 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -48,7 +48,7 @@ def complete_wheel(folder: str): os.system(f"mv /{folder}/wheelhouse/{repaired_wheel_name} /{folder}/dist/") else: repaired_wheel_name = wheel_name - + print(f"Copying {repaired_wheel_name} to artfacts") os.system(f"mv /{folder}/dist/{repaired_wheel_name} /artifacts/") @@ -85,12 +85,18 @@ def parse_arguments(): build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("python setup.py clean") - if branch == 'nightly' or branch == 'master': - build_date = subprocess.check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '') - 
version = subprocess.check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2] - build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 " - if branch.startswith("v1.") or branch.startswith("v2."): - build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " + override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") + if override_package_version is not None: + version = override_package_version + build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 " + else: + if branch == 'nightly' or branch == 'master': + build_date = subprocess.check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '') + version = subprocess.check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2] + build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 " + if branch.startswith("v1.") or branch.startswith("v2."): + build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " + if enable_mkldnn: build_ArmComputeLibrary(git_clone_flags) print("build pytorch with mkldnn+acl backend") From 20173e029e07767d5a4bbf5653548aa48c207a47 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 5 Sep 2023 10:44:59 -0400 Subject: [PATCH 046/212] Revert "Temporary change test aarch64 builds (#1514)" (#1521) This reverts commit 1e281befc6ca16e16d826965b3dd6b784be20f77. 
--- .github/workflows/validate-aarch64-linux-binaries.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index 4d8b493342..d3e57fd5c7 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -69,9 +69,6 @@ jobs: source ./aarch64_linux/aarch64_ci_setup.sh echo "/opt/conda/bin" >> $GITHUB_PATH - # todo: Remove after aarch64 filename is fixed - export MATRIX_INSTALLATION=${MATRIX_INSTALLATION/"pip3 install"/"pip3 install --pre"} - export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="linux-aarch64" export TORCH_ONLY=${{ inputs.torchonly }} From b4814f5322fc4d1f22a6a7568d5a169a65e616a6 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 6 Sep 2023 08:04:44 -0400 Subject: [PATCH 047/212] Changes related to OVERRIDE_PACKAGE_VERSION in aarch64 builds (#1520) (#1523) --- aarch64_linux/aarch64_ci_build.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/aarch64_linux/aarch64_ci_build.sh b/aarch64_linux/aarch64_ci_build.sh index c374359c27..321287ff51 100644 --- a/aarch64_linux/aarch64_ci_build.sh +++ b/aarch64_linux/aarch64_ci_build.sh @@ -4,6 +4,19 @@ set -eux -o pipefail SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" source $SCRIPTPATH/aarch64_ci_setup.sh +tagged_version() { + GIT_DESCRIBE="git --git-dir /pytorch/.git describe --tags --match v[0-9]*.[0-9]*.[0-9]*" + if ${GIT_DESCRIBE} --exact >/dev/null; then + ${GIT_DESCRIBE} + else + return 1 + fi +} + +if tagged_version >/dev/null; then + export OVERRIDE_PACKAGE_VERSION="$(tagged_version | sed -e 's/^v//' -e 's/-.*$//')" +fi + ############################################################################### # Run aarch64 builder python ############################################################################### From 18c2797a34714b2bb748d21754915cab8a741d93 Mon Sep 17 00:00:00 2001 
From: Omkar Salpekar Date: Wed, 6 Sep 2023 14:30:23 -0400 Subject: [PATCH 048/212] Torchmetrics in S3 Index (#1522) We will need the stable torchmetrics wheel in the S3 index, since torchrec depends on it. This is similar to how pytorch depends on numpy, etc. and these binaries need to be hosted in our index when uses try to pip install from download.pytorch.org. --- s3_management/manage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/s3_management/manage.py b/s3_management/manage.py index 655f7de40e..4b35ecab2c 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -85,6 +85,7 @@ "torchcsprng", "torchdata", "torchdistx", + "torchmetrics", "torchrec", "torchtext", "torchvision", From 3026f248be97304bcca1c348de276958a9d5eefc Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Thu, 7 Sep 2023 08:55:09 -0500 Subject: [PATCH 049/212] [aarch64] update ACL version to v23.05.1 and OpenBLAS to v0.3.20 (#1488) --- aarch64_linux/build_aarch64_wheel.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index 0bab3126a9..5595dc94a7 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -219,7 +219,7 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None: def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None: print('Building OpenBLAS') - host.run_cmd(f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.19 {git_clone_flags}") + host.run_cmd(f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.20 {git_clone_flags}") make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8" host.run_cmd(f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS") @@ -227,10 +227,7 @@ def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None: def 
build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None: print('Building Arm Compute Library') acl_build_flags="debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 build=native" - host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v22.11 {git_clone_flags}") - host.run_cmd(['sed -i -e \'s/"armv8.2-a"/"armv8-a"/g\' ComputeLibrary/SConscript']) - host.run_cmd(['sed -i -e \'s/-march=armv8.2-a+fp16/-march=armv8-a/g\' ComputeLibrary/SConstruct']) - host.run_cmd(['sed -i -e \'s/"-march=armv8.2-a"/"-march=armv8-a"/g\' ComputeLibrary/filedefs.json']) + host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.05.1 {git_clone_flags}") host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") From 195148266541a9789074265141cb7dc19dc14c54 Mon Sep 17 00:00:00 2001 From: Danylo Baibak Date: Mon, 11 Sep 2023 09:18:23 +0200 Subject: [PATCH 050/212] Changed runner for linux arm64 (#1525) --- .github/workflows/build-manywheel-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 3bf2dbe70e..c7dbe22488 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -87,7 +87,7 @@ jobs: run: | manywheel/build_docker.sh build-docker-cpu-aarch64: - runs-on: linux.t4g.2xlarge + runs-on: linux.arm64.2xlarge env: GPU_ARCH_TYPE: cpu-aarch64 steps: From 0c1d107c4e9f1ed5f09e6d4e1e740353f00e6a09 Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Mon, 11 Sep 2023 16:59:46 -0400 Subject: [PATCH 051/212] Add torch-tensorrt to S3 PyPI Index (#1529) As pytorch/tensorrt moves off of CCI onto Nova, we must to host their nightlies on our S3 index. This change allows the indexing to occur correctly for this package. 
--- s3_management/manage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/s3_management/manage.py b/s3_management/manage.py index 4b35ecab2c..719923d478 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -80,6 +80,7 @@ "requests", "sympy", "torch", + "torch_tensorrt", "torcharrow", "torchaudio", "torchcsprng", From 12a6ea560790246cb3230c9fb02e795b32fa74b7 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 14 Sep 2023 14:49:46 -0400 Subject: [PATCH 052/212] Enable torch compile for python 3.11 smoke tests (#1534) * Enable torch compile for python 3.11 smoke tests * Make sure release is covered * Fix typo --- test/smoke_test/smoke_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 3ae4d9421f..539f1ba1bc 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -129,7 +129,9 @@ def smoke_test_cuda(package: str, runtime_error_check: str) -> None: print(f"cuDNN enabled? 
{torch.backends.cudnn.enabled}") # torch.compile is available only on Linux and python 3.8-3.10 - if (sys.platform == "linux" or sys.platform == "linux2") and sys.version_info < (3, 11, 0): + if (sys.platform == "linux" or sys.platform == "linux2") and sys.version_info < (3, 11, 0) and channel == "release": + smoke_test_compile() + elif (sys.platform == "linux" or sys.platform == "linux2") and channel != "release": smoke_test_compile() if(runtime_error_check == "enabled"): From 224a4c536e5efdf8d55ea25e6cd7a5dbb474cc43 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 14 Sep 2023 16:21:10 -0400 Subject: [PATCH 053/212] add jinja2 (#1536) --- .github/scripts/validate_binaries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 0c01dbca22..e9b780057c 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -3,7 +3,7 @@ if [[ ${MATRIX_PACKAGE_TYPE} == "libtorch" ]]; then unzip libtorch.zip else # Please note ffmpeg is required for torchaudio, see https://github.com/pytorch/pytorch/issues/96159 - conda create -y -n ${ENV_NAME} python=${MATRIX_PYTHON_VERSION} numpy ffmpeg + conda create -y -n ${ENV_NAME} python=${MATRIX_PYTHON_VERSION} numpy ffmpeg jinja2 conda activate ${ENV_NAME} INSTALLATION=${MATRIX_INSTALLATION/"conda install"/"conda install -y"} TEST_SUFFIX="" From d76e1bbd533d6dfbb8b1d3004fa2022b14a4b8a8 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 14 Sep 2023 16:37:27 -0400 Subject: [PATCH 054/212] Remove restriction on 3.11 (#1537) --- conda/pytorch-nightly/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 5cab9f53d6..79025dc8c1 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -49,7 +49,7 @@ requirements: - sympy - filelock - networkx - - jinja2 # [py <= 310] + - jinja2 - 
pyyaml {% if cross_compile_arm64 == 0 %} - blas * mkl From e9e31edc19196d08fb32d39ca22dfc98301cbfed Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 14 Sep 2023 17:08:41 -0400 Subject: [PATCH 055/212] Revert "add jinja2 (#1536)" (#1538) This reverts commit 224a4c536e5efdf8d55ea25e6cd7a5dbb474cc43. --- .github/scripts/validate_binaries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index e9b780057c..0c01dbca22 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -3,7 +3,7 @@ if [[ ${MATRIX_PACKAGE_TYPE} == "libtorch" ]]; then unzip libtorch.zip else # Please note ffmpeg is required for torchaudio, see https://github.com/pytorch/pytorch/issues/96159 - conda create -y -n ${ENV_NAME} python=${MATRIX_PYTHON_VERSION} numpy ffmpeg jinja2 + conda create -y -n ${ENV_NAME} python=${MATRIX_PYTHON_VERSION} numpy ffmpeg conda activate ${ENV_NAME} INSTALLATION=${MATRIX_INSTALLATION/"conda install"/"conda install -y"} TEST_SUFFIX="" From 9f365204ffce6d37c5447e07f8a5db291120ec16 Mon Sep 17 00:00:00 2001 From: Omkar Salpekar Date: Thu, 14 Sep 2023 17:19:53 -0400 Subject: [PATCH 056/212] S3 Management Job Outside Docker (#1531) * S3 Management Job Outside Docker * job name * remove failfast * no matrix * inherit secrets * spacing? 
* random nits * add back secrets * add back matrix * export env vars correctlty * Update update-s3-html.yml --- .github/workflows/update-s3-html.yml | 35 ++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/update-s3-html.yml diff --git a/.github/workflows/update-s3-html.yml b/.github/workflows/update-s3-html.yml new file mode 100644 index 0000000000..7c285418ef --- /dev/null +++ b/.github/workflows/update-s3-html.yml @@ -0,0 +1,35 @@ +name: Update S3 HTML indices for download.pytorch.org + +on: + schedule: + # Update the indices every 30 minutes + - cron: "*/30 * * * *" + workflow_dispatch: + +jobs: + update: + strategy: + matrix: + prefix: ["whl", "whl/test", "whl/nightly", "whl/lts/1.8"] + fail-fast: False + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + secrets: inherit + with: + repository: pytorch/builder + timeout: 60 + secrets-env: AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY + script: | + set -ex + + # Create Conda Environment + git config --global --add safe.directory /__w/builder/builder + conda create --quiet -y --prefix run_env python="3.8" + conda activate ./run_env + + # Set Envs + export AWS_ACCESS_KEY_ID="${SECRET_AWS_ACCESS_KEY_ID}" + export AWS_SECRET_ACCESS_KEY="${SECRET_AWS_SECRET_ACCESS_KEY}" + + # Install requirements + pip install -r s3_management/requirements.txt + python s3_management/manage.py --generate-pep503 ${{ matrix.prefix }} From 22f0903d597a9d6cdbcf8790d5d6e18c94bd2afe Mon Sep 17 00:00:00 2001 From: Supadchaya <138070207+spcyppt@users.noreply.github.com> Date: Fri, 15 Sep 2023 11:09:59 -0700 Subject: [PATCH 057/212] Add fbgemm-gpu to S3 Index (#1539) --- s3_management/manage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/s3_management/manage.py b/s3_management/manage.py index 719923d478..ef7ae74fb4 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -40,6 +40,7 @@ "charset_normalizer", "cmake", "colorama", + "fbgemm_gpu", "filelock", "fsspec", 
"idna", From 17ea05e4536e78c9c0be8641952b1c5850a298cb Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Mon, 18 Sep 2023 20:45:13 -0500 Subject: [PATCH 058/212] Update builder images to ROCm5.7 (#1541) * Update docker build images for rocm5.7 * Fix erroneous logic that was skipping msccl files even for ROCm5.6; update msccl path for ROCm5.7 (cherry picked from commit 36c10cc3be475780aa7d76a7ccdbe3f8731042c9) * missing bzip2 package install for miopen * Revert "missing bzip2 package install for miopen" This reverts commit 8ef5fc956508e34315866059431ca015f485f77d. * ROCm 5.7 MIOpen does not need any patches, do not build from source --------- Co-authored-by: Jeff Daily --- .github/workflows/build-libtorch-images.yml | 2 +- .github/workflows/build-manywheel-images.yml | 2 +- common/install_miopen.sh | 5 ++++- manywheel/build_rocm.sh | 13 ++++++++++--- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index b2b50d2504..d620300473 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -52,7 +52,7 @@ jobs: runs-on: linux.12xlarge strategy: matrix: - rocm_version: ["5.5", "5.6"] + rocm_version: ["5.6", "5.7"] env: GPU_ARCH_TYPE: rocm GPU_ARCH_VERSION: ${{ matrix.rocm_version }} diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index c7dbe22488..bbac707dd9 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -58,7 +58,7 @@ jobs: runs-on: linux.12xlarge strategy: matrix: - rocm_version: ["5.5", "5.6"] + rocm_version: ["5.6", "5.7"] env: GPU_ARCH_TYPE: rocm GPU_ARCH_VERSION: ${{ matrix.rocm_version }} diff --git a/common/install_miopen.sh b/common/install_miopen.sh index 696a91905f..c015179278 100644 --- a/common/install_miopen.sh +++ 
b/common/install_miopen.sh @@ -58,7 +58,10 @@ MIOPEN_CMAKE_COMMON_FLAGS=" -DMIOPEN_BUILD_DRIVER=OFF " # Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version -if [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then +if [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 50800 ]]; then + echo "ROCm 5.7 MIOpen does not need any patches, do not build from source" + exit 0 +elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then MIOPEN_BRANCH="release/rocm-rel-5.6-staging" elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11" diff --git a/manywheel/build_rocm.sh b/manywheel/build_rocm.sh index 80ebde6f41..0fed5970b9 100755 --- a/manywheel/build_rocm.sh +++ b/manywheel/build_rocm.sh @@ -204,10 +204,17 @@ if [[ $ROCM_INT -ge 50500 ]]; then DEPS_AUX_SRCLIST+=(${MIOPEN_SHARE_FILES[@]/#/$MIOPEN_SHARE_SRC/}) DEPS_AUX_DSTLIST+=(${MIOPEN_SHARE_FILES[@]/#/$MIOPEN_SHARE_DST/}) -elif [[ $ROCM_INT -ge 50600 ]]; then +fi + +if [[ $ROCM_INT -ge 50600 ]]; then # RCCL library files - RCCL_SHARE_SRC=$ROCM_HOME/lib/msccl-algorithms - RCCL_SHARE_DST=lib/msccl-algorithms + if [[ $ROCM_INT -ge 50700 ]]; then + RCCL_SHARE_SRC=$ROCM_HOME/share/rccl/msccl-algorithms + RCCL_SHARE_DST=share/rccl/msccl-algorithms + else + RCCL_SHARE_SRC=$ROCM_HOME/lib/msccl-algorithms + RCCL_SHARE_DST=lib/msccl-algorithms + fi RCCL_SHARE_FILES=($(ls $RCCL_SHARE_SRC)) DEPS_AUX_SRCLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_SRC/}) From cbc95ff6ee0483cb5bf42949a0d1e78ef0571ce1 Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Mon, 18 Sep 2023 20:56:13 -0500 Subject: [PATCH 059/212] Update docker build convenience scripts to ROCm5.7 (#1543) --- libtorch/build_all_docker.sh | 2 +- manywheel/build_all_docker.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libtorch/build_all_docker.sh b/libtorch/build_all_docker.sh index e73e713b32..fb6bd975be 100755 --- 
a/libtorch/build_all_docker.sh +++ b/libtorch/build_all_docker.sh @@ -8,6 +8,6 @@ for cuda_version in 12.1 11.8; do GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION="${cuda_version}" "${TOPDIR}/libtorch/build_docker.sh" done -for rocm_version in 5.5 5.6; do +for rocm_version in 5.6 5.7; do GPU_ARCH_TYPE=rocm GPU_ARCH_VERSION="${rocm_version}" "${TOPDIR}/libtorch/build_docker.sh" done diff --git a/manywheel/build_all_docker.sh b/manywheel/build_all_docker.sh index 2bd720f2f6..2995e3be76 100644 --- a/manywheel/build_all_docker.sh +++ b/manywheel/build_all_docker.sh @@ -16,7 +16,7 @@ for cuda_version in 12.1 11.8; do MANYLINUX_VERSION=2014 GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION="${cuda_version}" "${TOPDIR}/manywheel/build_docker.sh" done -for rocm_version in 5.5 5.6; do +for rocm_version in 5.6 5.7; do GPU_ARCH_TYPE=rocm GPU_ARCH_VERSION="${rocm_version}" "${TOPDIR}/manywheel/build_docker.sh" MANYLINUX_VERSION=2014 GPU_ARCH_TYPE=rocm GPU_ARCH_VERSION="${rocm_version}" "${TOPDIR}/manywheel/build_docker.sh" done From 59a2f92aa12c3c0cb11622b05fe77de8312f6d00 Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Tue, 19 Sep 2023 12:09:56 -0500 Subject: [PATCH 060/212] Do not uninstall MIOpen if skipping build-from-source (#1544) --- common/install_miopen.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/install_miopen.sh b/common/install_miopen.sh index c015179278..779bc755d4 100644 --- a/common/install_miopen.sh +++ b/common/install_miopen.sh @@ -33,8 +33,6 @@ if [[ $ROCM_INT -lt 40001 ]]; then exit 0 fi -yum remove -y miopen-hip - # Function to retry functions that sometimes timeout or have flaky failures retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) @@ -85,6 +83,8 @@ else exit 1 fi +yum remove -y miopen-hip + git clone https://github.com/ROCmSoftwarePlatform/MIOpen -b ${MIOPEN_BRANCH} pushd MIOpen # remove .git to save disk space since CI runner was running out From 
553b4dff742ccead6d349993cc588f6cc3a8e98d Mon Sep 17 00:00:00 2001 From: cyy Date: Sat, 23 Sep 2023 03:45:39 +0800 Subject: [PATCH 061/212] Install nvtx3 on Windows (#1547) --- windows/internal/cuda_install.bat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/windows/internal/cuda_install.bat b/windows/internal/cuda_install.bat index acd457a170..18a64b6a23 100644 --- a/windows/internal/cuda_install.bat +++ b/windows/internal/cuda_install.bat @@ -32,7 +32,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" if errorlevel 1 exit /b 1 set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=cuda_profiler_api_11.8 thrust_11.8 nvcc_11.8 cuobjdump_11.8 nvprune_11.8 nvprof_11.8 cupti_11.8 cublas_11.8 cublas_dev_11.8 cudart_11.8 cufft_11.8 cufft_dev_11.8 curand_11.8 curand_dev_11.8 cusolver_11.8 cusolver_dev_11.8 cusparse_11.8 cusparse_dev_11.8 npp_11.8 npp_dev_11.8 nvrtc_11.8 nvrtc_dev_11.8 nvml_dev_11.8" + set "ARGS=cuda_profiler_api_11.8 thrust_11.8 nvcc_11.8 cuobjdump_11.8 nvprune_11.8 nvprof_11.8 cupti_11.8 cublas_11.8 cublas_dev_11.8 cudart_11.8 cufft_11.8 cufft_dev_11.8 curand_11.8 curand_dev_11.8 cusolver_11.8 cusolver_dev_11.8 cusparse_11.8 cusparse_dev_11.8 npp_11.8 npp_dev_11.8 nvrtc_11.8 nvrtc_dev_11.8 nvml_dev_11.8 nvtx_11.8" ) set CUDNN_FOLDER=cudnn-windows-x86_64-8.7.0.84_cuda11-archive @@ -59,7 +59,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" if errorlevel 1 exit /b 1 set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=cuda_profiler_api_12.1 thrust_12.1 nvcc_12.1 cuobjdump_12.1 nvprune_12.1 nvprof_12.1 cupti_12.1 cublas_12.1 cublas_dev_12.1 cudart_12.1 cufft_12.1 cufft_dev_12.1 curand_12.1 curand_dev_12.1 cusolver_12.1 
cusolver_dev_12.1 cusparse_12.1 cusparse_dev_12.1 npp_12.1 npp_dev_12.1 nvrtc_12.1 nvrtc_dev_12.1 nvml_dev_12.1 nvjitlink_12.1" + set "ARGS=cuda_profiler_api_12.1 thrust_12.1 nvcc_12.1 cuobjdump_12.1 nvprune_12.1 nvprof_12.1 cupti_12.1 cublas_12.1 cublas_dev_12.1 cudart_12.1 cufft_12.1 cufft_dev_12.1 curand_12.1 curand_dev_12.1 cusolver_12.1 cusolver_dev_12.1 cusparse_12.1 cusparse_dev_12.1 npp_12.1 npp_dev_12.1 nvrtc_12.1 nvrtc_dev_12.1 nvml_dev_12.1 nvjitlink_12.1 nvtx_12.1" ) set CUDNN_FOLDER=cudnn-windows-x86_64-8.9.2.26_cuda12-archive From dbad8b7e78bd30bb79ee5f2b2c04b9b7024282c0 Mon Sep 17 00:00:00 2001 From: Matt Davis Date: Fri, 22 Sep 2023 20:00:27 -0400 Subject: [PATCH 062/212] Provide file hashes in the URLs to avoid unnecessary file downloads (bandwidth saver) (#1433) Supply sha256 query parameters using boto3 to avoid hundreds of extra Gigabytes of downloads each day during pipenv and poetry resolution lock cycles. Fixes point 1 in https://github.com/pytorch/pytorch/issues/76557 Fixes #1347 --- s3_management/manage.py | 59 ++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index ef7ae74fb4..51fede7612 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -1,12 +1,15 @@ #!/usr/bin/env python import argparse +import base64 +import dataclasses +import functools import time from os import path, makedirs from datetime import datetime from collections import defaultdict -from typing import Iterator, List, Type, Dict, Set, TypeVar, Optional +from typing import Iterable, List, Type, Dict, Set, TypeVar, Optional from re import sub, match, search from packaging.version import parse @@ -14,7 +17,6 @@ S3 = boto3.resource('s3') -CLIENT = boto3.client('s3') BUCKET = S3.Bucket('pytorch') ACCEPTED_FILE_EXTENSIONS = ("whl", "zip", "tar.gz") @@ -107,6 +109,23 @@ S3IndexType = TypeVar('S3IndexType', bound='S3Index') + 
+@dataclasses.dataclass(frozen=True) +@functools.total_ordering +class S3Object: + key: str + checksum: str | None + + def __str__(self): + return self.key + + def __eq__(self, other): + return self.key == other.key + + def __lt__(self, other): + return self.key < other.key + + def extract_package_build_time(full_package_name: str) -> datetime: result = search(PACKAGE_DATE_REGEX, full_package_name) if result is not None: @@ -124,7 +143,7 @@ def between_bad_dates(package_build_time: datetime): class S3Index: - def __init__(self: S3IndexType, objects: List[str], prefix: str) -> None: + def __init__(self: S3IndexType, objects: List[S3Object], prefix: str) -> None: self.objects = objects self.prefix = prefix.rstrip("/") self.html_name = PREFIXES_WITH_HTML[self.prefix] @@ -134,7 +153,7 @@ def __init__(self: S3IndexType, objects: List[str], prefix: str) -> None: path.dirname(obj) for obj in objects if path.dirname != prefix } - def nightly_packages_to_show(self: S3IndexType) -> Set[str]: + def nightly_packages_to_show(self: S3IndexType) -> Set[S3Object]: """Finding packages to show based on a threshold we specify Basically takes our S3 packages, normalizes the version for easier @@ -174,8 +193,8 @@ def nightly_packages_to_show(self: S3IndexType) -> Set[str]: if self.normalize_package_version(obj) in to_hide }) - def is_obj_at_root(self, obj:str) -> bool: - return path.dirname(obj) == self.prefix + def is_obj_at_root(self, obj: S3Object) -> bool: + return path.dirname(str(obj)) == self.prefix def _resolve_subdir(self, subdir: Optional[str] = None) -> str: if not subdir: @@ -187,7 +206,7 @@ def gen_file_list( self, subdir: Optional[str]=None, package_name: Optional[str] = None - ) -> Iterator[str]: + ) -> Iterable[S3Object]: objects = ( self.nightly_packages_to_show() if self.prefix == 'whl/nightly' else self.objects @@ -197,23 +216,23 @@ def gen_file_list( if package_name is not None: if self.obj_to_package_name(obj) != package_name: continue - if self.is_obj_at_root(obj) 
or obj.startswith(subdir): + if self.is_obj_at_root(obj) or str(obj).startswith(subdir): yield obj def get_package_names(self, subdir: Optional[str] = None) -> List[str]: return sorted(set(self.obj_to_package_name(obj) for obj in self.gen_file_list(subdir))) - def normalize_package_version(self: S3IndexType, obj: str) -> str: + def normalize_package_version(self: S3IndexType, obj: S3Object) -> str: # removes the GPU specifier from the package name as well as # unnecessary things like the file extension, architecture name, etc. return sub( r"%2B.*", "", - "-".join(path.basename(obj).split("-")[:2]) + "-".join(path.basename(str(obj)).split("-")[:2]) ) - def obj_to_package_name(self, obj: str) -> str: - return path.basename(obj).split('-', 1)[0] + def obj_to_package_name(self, obj: S3Object) -> str: + return path.basename(str(obj)).split('-', 1)[0] def to_legacy_html( self, @@ -258,7 +277,8 @@ def to_simple_package_html( out.append(' ') out.append('

Links for {}

'.format(package_name.lower().replace("_","-"))) for obj in sorted(self.gen_file_list(subdir, package_name)): - out.append(f' {path.basename(obj).replace("%2B","+")}
') + maybe_fragment = f"#sha256={obj.checksum}" if obj.checksum else "" + out.append(f' {path.basename(obj).replace("%2B","+")}
') # Adding html footer out.append(' ') out.append('') @@ -319,7 +339,6 @@ def upload_pep503_htmls(self) -> None: Body=self.to_simple_package_html(subdir=subdir, package_name=pkg_name) ) - def save_legacy_html(self) -> None: for subdir in self.subdirs: print(f"INFO Saving {subdir}/{self.html_name}") @@ -351,10 +370,18 @@ def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: for pattern in ACCEPTED_SUBDIR_PATTERNS ]) and obj.key.endswith(ACCEPTED_FILE_EXTENSIONS) if is_acceptable: + # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible. + response = obj.meta.client.head_object(Bucket=BUCKET.name, Key=obj.key, ChecksumMode="ENABLED") + sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex() sanitized_key = obj.key.replace("+", "%2B") - objects.append(sanitized_key) + s3_object = S3Object( + key=sanitized_key, + checksum=sha256, + ) + objects.append(s3_object) return cls(objects, prefix) + def create_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser("Manage S3 HTML indices for PyTorch") parser.add_argument( @@ -366,6 +393,7 @@ def create_parser() -> argparse.ArgumentParser: parser.add_argument("--generate-pep503", action="store_true") return parser + def main(): parser = create_parser() args = parser.parse_args() @@ -390,5 +418,6 @@ def main(): if args.generate_pep503: idx.upload_pep503_htmls() + if __name__ == "__main__": main() From 1327c0b2d6050832d135fe708118225bedcaad1e Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 22 Sep 2023 17:02:16 -0700 Subject: [PATCH 063/212] Workaround for older files --- s3_management/manage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/s3_management/manage.py b/s3_management/manage.py index 51fede7612..03950af55b 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -373,6 +373,9 @@ def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: # Add PEP 503-compatible hashes to URLs to allow 
clients to avoid spurious downloads, if possible. response = obj.meta.client.head_object(Bucket=BUCKET.name, Key=obj.key, ChecksumMode="ENABLED") sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex() + # For older files, rely on checksumsha256 metadata that can be added to the file later + if sha256 is None: + sha256 = response.get("Metadata", {}).get("checksumsha256") sanitized_key = obj.key.replace("+", "%2B") s3_object = S3Object( key=sanitized_key, From dc0a5791766f5f8dc1612993854b6e5d5056cd12 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 22 Sep 2023 17:42:16 -0700 Subject: [PATCH 064/212] Bugfixes introduced by https://github.com/pytorch/builder/pull/1433 Replace `obj` with `obj.key` in few places Dismantle pyramid of doom while iterating over objects Test plan: Run `python manage.py whl/test --generate-pep503` --- s3_management/manage.py | 45 +++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 03950af55b..a46a1d9da9 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -114,7 +114,7 @@ @functools.total_ordering class S3Object: key: str - checksum: str | None + checksum: Optional[str] def __str__(self): return self.key @@ -150,7 +150,7 @@ def __init__(self: S3IndexType, objects: List[S3Object], prefix: str) -> None: # should dynamically grab subdirectories like whl/test/cu101 # so we don't need to add them manually anymore self.subdirs = { - path.dirname(obj) for obj in objects if path.dirname != prefix + path.dirname(obj.key) for obj in objects if path.dirname != prefix } def nightly_packages_to_show(self: S3IndexType) -> Set[S3Object]: @@ -194,7 +194,7 @@ def nightly_packages_to_show(self: S3IndexType) -> Set[S3Object]: }) def is_obj_at_root(self, obj: S3Object) -> bool: - return path.dirname(str(obj)) == self.prefix + return path.dirname(obj.key) == self.prefix def _resolve_subdir(self, subdir: 
Optional[str] = None) -> str: if not subdir: @@ -216,7 +216,7 @@ def gen_file_list( if package_name is not None: if self.obj_to_package_name(obj) != package_name: continue - if self.is_obj_at_root(obj) or str(obj).startswith(subdir): + if self.is_obj_at_root(obj) or obj.key.startswith(subdir): yield obj def get_package_names(self, subdir: Optional[str] = None) -> List[str]: @@ -228,11 +228,11 @@ def normalize_package_version(self: S3IndexType, obj: S3Object) -> str: return sub( r"%2B.*", "", - "-".join(path.basename(str(obj)).split("-")[:2]) + "-".join(path.basename(obj.key).split("-")[:2]) ) def obj_to_package_name(self, obj: S3Object) -> str: - return path.basename(str(obj)).split('-', 1)[0] + return path.basename(obj.key).split('-', 1)[0] def to_legacy_html( self, @@ -250,7 +250,7 @@ def to_legacy_html( is_root = subdir == self.prefix for obj in self.gen_file_list(subdir): # Strip our prefix - sanitized_obj = obj.replace(subdir, "", 1) + sanitized_obj = obj.key.replace(subdir, "", 1) if sanitized_obj.startswith('/'): sanitized_obj = sanitized_obj.lstrip("/") # we include objects at our root prefix so that users can still @@ -258,7 +258,7 @@ def to_legacy_html( # to install a specific GPU arch of torch / torchvision if not is_root and self.is_obj_at_root(obj): # strip root prefix - sanitized_obj = obj.replace(self.prefix, "", 1).lstrip("/") + sanitized_obj = obj.key.replace(self.prefix, "", 1).lstrip("/") sanitized_obj = f"../{sanitized_obj}" out.append(f'{sanitized_obj}
') return "\n".join(sorted(out)) @@ -278,7 +278,7 @@ def to_simple_package_html( out.append('

Links for {}

'.format(package_name.lower().replace("_","-"))) for obj in sorted(self.gen_file_list(subdir, package_name)): maybe_fragment = f"#sha256={obj.checksum}" if obj.checksum else "" - out.append(f' {path.basename(obj).replace("%2B","+")}
') + out.append(f' {path.basename(obj.key).replace("%2B","+")}
') # Adding html footer out.append(' ') out.append('') @@ -369,19 +369,20 @@ def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: ) for pattern in ACCEPTED_SUBDIR_PATTERNS ]) and obj.key.endswith(ACCEPTED_FILE_EXTENSIONS) - if is_acceptable: - # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible. - response = obj.meta.client.head_object(Bucket=BUCKET.name, Key=obj.key, ChecksumMode="ENABLED") - sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex() - # For older files, rely on checksumsha256 metadata that can be added to the file later - if sha256 is None: - sha256 = response.get("Metadata", {}).get("checksumsha256") - sanitized_key = obj.key.replace("+", "%2B") - s3_object = S3Object( - key=sanitized_key, - checksum=sha256, - ) - objects.append(s3_object) + if not is_acceptable: + continue + # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible. + response = obj.meta.client.head_object(Bucket=BUCKET.name, Key=obj.key, ChecksumMode="ENABLED") + sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex() + # For older files, rely on checksum-sha256 metadata that can be added to the file later + if sha256 is None: + sha256 = response.get("Metadata", {}).get("checksum-sha256") + sanitized_key = obj.key.replace("+", "%2B") + s3_object = S3Object( + key=sanitized_key, + checksum=sha256, + ) + objects.append(s3_object) return cls(objects, prefix) From 21ffba158c9c79c6d13ac85d04008d1828d8d139 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 25 Sep 2023 10:20:51 -0700 Subject: [PATCH 065/212] [S3_management] Update boto3 to 1.28.53 --- s3_management/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/s3_management/requirements.txt b/s3_management/requirements.txt index d9fe7f1f00..fa23e39b1b 100644 --- a/s3_management/requirements.txt +++ b/s3_management/requirements.txt @@ -1,2 +1,2 @@ 
-boto3==1.12.7 +boto3==1.28.53 packaging==21.3 From 410ec8ea9298a3a1b11c426e4dfed151df71456a Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 25 Sep 2023 17:19:05 -0700 Subject: [PATCH 066/212] [manage_s3] Download objects metadata concurrently Using `concurrent.futures.ThreadPoolExecutor` This speeds up rebuilding `whl/test` index from 300 sec to 90 sec on my laptop --- s3_management/manage.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index a46a1d9da9..0df513d32a 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -2,6 +2,7 @@ import argparse import base64 +import concurrent.futures import dataclasses import functools import time @@ -17,6 +18,7 @@ S3 = boto3.resource('s3') +CLIENT = boto3.client('s3') BUCKET = S3.Bucket('pytorch') ACCEPTED_FILE_EXTENSIONS = ("whl", "zip", "tar.gz") @@ -359,8 +361,8 @@ def save_pep503_htmls(self) -> None: @classmethod def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: - objects = [] prefix = prefix.rstrip("/") + obj_names = [] for obj in BUCKET.objects.filter(Prefix=prefix): is_acceptable = any([path.dirname(obj.key) == prefix] + [ match( @@ -371,18 +373,25 @@ def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: ]) and obj.key.endswith(ACCEPTED_FILE_EXTENSIONS) if not is_acceptable: continue + obj_names.append(obj.key) + objects = [] + def fetch_metadata(key: str) : + return CLIENT.head_object(Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled") + + with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor: # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible. 
- response = obj.meta.client.head_object(Bucket=BUCKET.name, Key=obj.key, ChecksumMode="ENABLED") - sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex() - # For older files, rely on checksum-sha256 metadata that can be added to the file later - if sha256 is None: - sha256 = response.get("Metadata", {}).get("checksum-sha256") - sanitized_key = obj.key.replace("+", "%2B") - s3_object = S3Object( - key=sanitized_key, - checksum=sha256, - ) - objects.append(s3_object) + for obj_key, future in {key: executor.submit(fetch_metadata, key) for key in obj_names}.items(): + response = future.result() + sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex() + # For older files, rely on checksum-sha256 metadata that can be added to the file later + if sha256 is None: + sha256 = response.get("Metadata", {}).get("checksum-sha256") + sanitized_key = obj_key.replace("+", "%2B") + s3_object = S3Object( + key=sanitized_key, + checksum=sha256, + ) + objects.append(s3_object) return cls(objects, prefix) From ef74cedc67d27d73df4b3586c804b0591bfdf7b8 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 26 Sep 2023 10:31:09 -0700 Subject: [PATCH 067/212] Make smoke-test runnable without envvars --- test/smoke_test/smoke_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 539f1ba1bc..375ff45be3 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -57,11 +57,13 @@ def check_version(package: str) -> None: # only makes sense to check nightly package where dates are known if channel == "nightly": check_nightly_binaries_date(package) - else: + elif stable_version is not None: if not torch.__version__.startswith(stable_version): raise RuntimeError( f"Torch version mismatch, expected {stable_version} for channel {channel}. 
But its {torch.__version__}" ) + else: + print(f"Skip version check for channel {channel} as stable version is None") def check_nightly_binaries_date(package: str) -> None: from datetime import datetime, timedelta From f6d12ba88ad82bfa29618c56f88a6f9168ceb757 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 28 Sep 2023 18:47:50 -0400 Subject: [PATCH 068/212] [aarch64] set acl_build_flags arch=armv8a, remove editing build flags (#1550) Looking at this PR: https://github.com/pytorch/builder/pull/1370/ this line: https://github.com/pytorch/builder/pull/1370/files#diff-54480d0a69ca27f54fb0736a9762caa8b03bd4736dcd77190d99ec3033c9bd2fR229 That fixed the issue: https://github.com/pytorch/pytorch/issues/97226 One of the changes is to set ``` arch=armv8a ``` We are experiencing the same issue now: https://github.com/pytorch/pytorch/issues/109312 Hence this fix. --- aarch64_linux/aarch64_wheel_ci_build.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 4ad620ba2e..f6797ce1c0 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -21,11 +21,8 @@ def build_ArmComputeLibrary(git_clone_flags: str = "") -> None: print('Building Arm Compute Library') os.system("cd / && mkdir /acl") os.system(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.05.1 {git_clone_flags}") - os.system('sed -i -e \'s/"armv8.2-a"/"armv8-a"/g\' ComputeLibrary/SConscript; ' - 'sed -i -e \'s/-march=armv8.2-a+fp16/-march=armv8-a/g\' ComputeLibrary/SConstruct; ' - 'sed -i -e \'s/"-march=armv8.2-a"/"-march=armv8-a"/g\' ComputeLibrary/filedefs.json') os.system("cd ComputeLibrary; export acl_install_dir=/acl; " - "scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8.2-a multi_isa=1 build=native build_dir=$acl_install_dir/build; " + "scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux openmp=1 
cppthreads=0 arch=armv8a multi_isa=1 build=native build_dir=$acl_install_dir/build; " "cp -r arm_compute $acl_install_dir; " "cp -r include $acl_install_dir; " "cp -r utils $acl_install_dir; " From 8465dbe657a814b95b91cc3243b071fc21c37519 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Fri, 29 Sep 2023 16:26:17 -0700 Subject: [PATCH 069/212] [BE] Fix all flake8 violations in `smoke_test.py` (#1553) Namely: - `if(x):` -> `if x:` - `"dev\d+"` -> `"dev\\d+"` - Keep 2 newlines between functions - Add `assert foo is not None` to suppress "variable assigned but not used" warning --- test/smoke_test/smoke_test.py | 40 +++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 375ff45be3..ca44b0369b 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -38,6 +38,7 @@ }, ] + class Net(nn.Module): def __init__(self): super(Net, self).__init__() @@ -53,6 +54,7 @@ def forward(self, x): output = self.fc1(x) return output + def check_version(package: str) -> None: # only makes sense to check nightly package where dates are known if channel == "nightly": @@ -65,23 +67,23 @@ def check_version(package: str) -> None: else: print(f"Skip version check for channel {channel} as stable version is None") + def check_nightly_binaries_date(package: str) -> None: from datetime import datetime, timedelta format_dt = '%Y%m%d' - torch_str = torch.__version__ - date_t_str = re.findall("dev\d+", torch.__version__) + date_t_str = re.findall("dev\\d+", torch.__version__) date_t_delta = datetime.now() - datetime.strptime(date_t_str[0][3:], format_dt) if date_t_delta.days >= NIGHTLY_ALLOWED_DELTA: raise RuntimeError( f"the binaries are from {date_t_str} and are more than {NIGHTLY_ALLOWED_DELTA} days old!" 
) - if(package == "all"): + if package == "all": for module in MODULES: imported_module = importlib.import_module(module["name"]) module_version = imported_module.__version__ - date_m_str = re.findall("dev\d+", module_version) + date_m_str = re.findall("dev\\d+", module_version) date_m_delta = datetime.now() - datetime.strptime(date_m_str[0][3:], format_dt) print(f"Nightly date check for {module['name']} version {module_version}") if date_m_delta.days > NIGHTLY_ALLOWED_DELTA: @@ -89,8 +91,9 @@ def check_nightly_binaries_date(package: str) -> None: f"Expected {module['name']} to be less then {NIGHTLY_ALLOWED_DELTA} days. But its {date_m_delta}" ) + def test_cuda_runtime_errors_captured() -> None: - cuda_exception_missed=True + cuda_exception_missed = True try: print("Testing test_cuda_runtime_errors_captured") torch._assert_async(torch.tensor(0, device="cuda")) @@ -101,14 +104,15 @@ def test_cuda_runtime_errors_captured() -> None: cuda_exception_missed = False else: raise e - if(cuda_exception_missed): - raise RuntimeError( f"Expected CUDA RuntimeError but have not received!") + if cuda_exception_missed: + raise RuntimeError("Expected CUDA RuntimeError but have not received!") + def smoke_test_cuda(package: str, runtime_error_check: str) -> None: if not torch.cuda.is_available() and is_cuda_system: raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.") - if(package == 'all' and is_cuda_system): + if package == 'all' and is_cuda_system: for module in MODULES: imported_module = importlib.import_module(module["name"]) # TBD for vision move extension module to private so it will @@ -131,12 +135,10 @@ def smoke_test_cuda(package: str, runtime_error_check: str) -> None: print(f"cuDNN enabled? 
{torch.backends.cudnn.enabled}") # torch.compile is available only on Linux and python 3.8-3.10 - if (sys.platform == "linux" or sys.platform == "linux2") and sys.version_info < (3, 11, 0) and channel == "release": - smoke_test_compile() - elif (sys.platform == "linux" or sys.platform == "linux2") and channel != "release": + if sys.platform in ["linux", "linux2"] and (sys.version_info < (3, 11, 0) or channel != "release"): smoke_test_compile() - if(runtime_error_check == "enabled"): + if runtime_error_check == "enabled": test_cuda_runtime_errors_captured() @@ -148,6 +150,7 @@ def smoke_test_conv2d() -> None: m = nn.Conv2d(16, 33, 3, stride=2) # non-square kernels and unequal stride and with padding m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) + assert m is not None # non-square kernels and unequal stride and with padding and dilation basic_conv = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) input = torch.randn(20, 16, 50, 100) @@ -156,9 +159,10 @@ def smoke_test_conv2d() -> None: if is_cuda_system: print("Testing smoke_test_conv2d with cuda") conv = nn.Conv2d(3, 3, 3).cuda() - x = torch.randn(1, 3, 24, 24).cuda() + x = torch.randn(1, 3, 24, 24, device="cuda") with torch.cuda.amp.autocast(): out = conv(x) + assert out is not None supported_dtypes = [torch.float16, torch.float32, torch.float64] for dtype in supported_dtypes: @@ -166,6 +170,8 @@ def smoke_test_conv2d() -> None: conv = basic_conv.to(dtype).cuda() input = torch.randn(20, 16, 50, 100, device="cuda").type(dtype) output = conv(input) + assert output is not None + def smoke_test_linalg() -> None: print("Testing smoke_test_linalg") @@ -189,10 +195,13 @@ def smoke_test_linalg() -> None: A = torch.randn(20, 16, 50, 100, device="cuda").type(dtype) torch.linalg.svd(A) + def smoke_test_compile() -> None: supported_dtypes = [torch.float16, torch.float32, torch.float64] + def foo(x: torch.Tensor) -> torch.Tensor: return torch.sin(x) + torch.cos(x) + for dtype in 
supported_dtypes: print(f"Testing smoke_test_compile for {dtype}") x = torch.rand(3, 3, device="cuda").type(dtype) @@ -209,6 +218,7 @@ def foo(x: torch.Tensor) -> torch.Tensor: model = Net().to(device="cuda") x_pt2 = torch.compile(model, mode="max-autotune")(x) + def smoke_test_modules(): cwd = os.getcwd() for module in MODULES: @@ -224,9 +234,7 @@ def smoke_test_modules(): smoke_test_command, stderr=subprocess.STDOUT, shell=True, universal_newlines=True) except subprocess.CalledProcessError as exc: - raise RuntimeError( - f"Module {module['name']} FAIL: {exc.returncode} Output: {exc.output}" - ) + raise RuntimeError(f"Module {module['name']} FAIL: {exc.returncode} Output: {exc.output}") else: print("Output: \n{}\n".format(output)) From d0fc085ddd11a8b98c0a1809273be87c65eef68f Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Fri, 29 Sep 2023 22:34:39 -0500 Subject: [PATCH 070/212] [aarch64] patch mkl-dnn to use 'march=armv8-a' as the default build (#1554) --- ...4-fix-default-build-flags-to-armv8-a.patch | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch diff --git a/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch b/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch new file mode 100644 index 0000000000..f6e91010ab --- /dev/null +++ b/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch @@ -0,0 +1,29 @@ +--- + cmake/platform.cmake | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/cmake/platform.cmake b/cmake/platform.cmake +index 8630460ce..602eafe8e 100644 +--- a/cmake/platform.cmake ++++ b/cmake/platform.cmake +@@ -198,7 +198,7 @@ elseif(UNIX OR MINGW) + endif() + # For native compilation tune for the host processor + if (CMAKE_SYSTEM_PROCESSOR STREQUAL CMAKE_HOST_SYSTEM_PROCESSOR) +- append(DEF_ARCH_OPT_FLAGS "-mcpu=native") ++ append(DEF_ARCH_OPT_FLAGS "-march=armv8-a") + endif() + 
elseif(DNNL_TARGET_ARCH STREQUAL "PPC64") + if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") +@@ -295,7 +295,7 @@ elseif(UNIX OR MINGW) + endif() + # For native compilation tune for the host processor + if (CMAKE_SYSTEM_PROCESSOR STREQUAL CMAKE_HOST_SYSTEM_PROCESSOR) +- append(DEF_ARCH_OPT_FLAGS "-mcpu=native") ++ append(DEF_ARCH_OPT_FLAGS "-march=armv8-a") + endif() + elseif(DNNL_TARGET_ARCH STREQUAL "PPC64") + if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") +-- +2.34.1 + From 6021651b99e8bacdc7fba4f6f60f0034bc053190 Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Fri, 29 Sep 2023 22:35:25 -0500 Subject: [PATCH 071/212] [aarch64] patch pytorch 2.1 for mkl-dnn fix (#1555) --- aarch64_linux/build_aarch64_wheel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index 5595dc94a7..dd43bf2188 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -554,7 +554,9 @@ def start_build(host: RemoteHost, *, build_ArmComputeLibrary(host, git_clone_flags) print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" - host.run_cmd(f"cd pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}") + host.run_cmd(f"cd $HOME && git clone https://github.com/pytorch/builder.git") + host.run_cmd(f"cd $HOME/pytorch/third_party/ideep/mkl-dnn && patch -p1 < $HOME/builder/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch") + host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}") print('Repair the wheel') pytorch_wheel_name = host.list_dir("pytorch/dist")[0] host.run_cmd(f"export LD_LIBRARY_PATH=$HOME/acl/build:$HOME/pytorch/build/lib && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}") From 71772d14b84960eb3a0c2a8b6f91b86efd1593be Mon Sep 17 
00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Fri, 29 Sep 2023 23:07:27 -0500 Subject: [PATCH 072/212] patch ci script with mkldnn fix (#1556) --- aarch64_linux/aarch64_wheel_ci_build.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index f6797ce1c0..da789a2314 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -105,6 +105,9 @@ def parse_arguments(): else: print("build pytorch without mkldnn backend") + # work around to fix Raspberry pie crash + os.system(f"cd $HOME && git clone https://github.com/pytorch/builder.git") + os.system(f"cd $HOME/pytorch/third_party/ideep/mkl-dnn && patch -p1 < $HOME/builder/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch") os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") pytorch_wheel_name = complete_wheel("pytorch") print(f"Build Compelete. Created {pytorch_wheel_name}..") From 685a807f08e46e0479c3ccdd175cb3333da4d72f Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Sat, 30 Sep 2023 13:57:38 -0700 Subject: [PATCH 073/212] [BE] Add lint workflow (#1557) And format `smoke_test.py` with `ruff` Invoke/configure `ruff` using `lintrunner` Copy lint runner adapters from https://github.com/pytorch/pytorch/tree/main/tools/linter/adapters --- .github/workflows/lint.yml | 35 ++ .lintrunner.toml | 20 ++ pyproject.toml | 23 ++ test/smoke_test/smoke_test.py | 14 +- tools/linter/adapters/pip_init.py | 83 +++++ tools/linter/adapters/ruff_linter.py | 462 +++++++++++++++++++++++++++ 6 files changed, 629 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/lint.yml create mode 100644 .lintrunner.toml create mode 100644 pyproject.toml create mode 100644 tools/linter/adapters/pip_init.py create mode 100644 tools/linter/adapters/ruff_linter.py diff --git a/.github/workflows/lint.yml 
b/.github/workflows/lint.yml new file mode 100644 index 0000000000..c8bd056134 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,35 @@ +name: Lint + +on: + push: + branches: + - main + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + lintrunner: + name: lintrunner + + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: Install Lintrunner + run: | + pip install lintrunner + lintrunner init + - name: Run lintrunner on all files - Linux + run: | + set +e + if ! lintrunner -v --force-color --all-files --tee-json=lint.json; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m main\`.\e[0m" + exit 1 + fi diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 0000000000..c551cb732d --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,20 @@ +merge_base_with = "origin/main" + +[[linter]] +code = 'RUFF' +include_patterns = ['test/smoke_test/*.py'] +command = [ + 'python3', + 'tools/linter/adapters/ruff_linter.py', + '--config=pyproject.toml', + '--show-disable', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'ruff==0.0.290', +] +is_formatter = true diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..efa884a07f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[tool.ruff] +target-version = "py38" +line-length = 120 +select = [ + "B", + "C4", + "G", + "E", + "F", + "SIM1", + "W", + # Not included in flake8 + "UP", + "PERF", + "PGH004", + "PIE807", + "PIE810", + "PLE", + "PLR1722", # use sys exit + "PLW3301", # nested min max + "RUF017", + "TRY302", +] diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 
ca44b0369b..8ae1d1c512 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -1,10 +1,8 @@ import os import re import sys -from pathlib import Path import argparse import torch -import platform import importlib import subprocess import torch._dynamo @@ -41,7 +39,7 @@ class Net(nn.Module): def __init__(self): - super(Net, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(1, 32, 3, 1) self.conv2 = nn.Conv2d(32, 64, 3, 1) self.fc1 = nn.Linear(9216, 1) @@ -69,7 +67,7 @@ def check_version(package: str) -> None: def check_nightly_binaries_date(package: str) -> None: - from datetime import datetime, timedelta + from datetime import datetime format_dt = '%Y%m%d' date_t_str = re.findall("dev\\d+", torch.__version__) @@ -177,11 +175,11 @@ def smoke_test_linalg() -> None: print("Testing smoke_test_linalg") A = torch.randn(5, 3) U, S, Vh = torch.linalg.svd(A, full_matrices=False) - U.shape, S.shape, Vh.shape + assert U.shape == A.shape and S.shape == torch.Size([3]) and Vh.shape == torch.Size([3, 3]) torch.dist(A, U @ torch.diag(S) @ Vh) U, S, Vh = torch.linalg.svd(A) - U.shape, S.shape, Vh.shape + assert U.shape == A.shape and S.shape == torch.Size([3]) and Vh.shape == torch.Size([3, 3]) torch.dist(A, U[:, :3] @ torch.diag(S) @ Vh) A = torch.randn(7, 5, 3) @@ -234,9 +232,9 @@ def smoke_test_modules(): smoke_test_command, stderr=subprocess.STDOUT, shell=True, universal_newlines=True) except subprocess.CalledProcessError as exc: - raise RuntimeError(f"Module {module['name']} FAIL: {exc.returncode} Output: {exc.output}") + raise RuntimeError(f"Module {module['name']} FAIL: {exc.returncode} Output: {exc.output}") from exc else: - print("Output: \n{}\n".format(output)) + print(f"Output: \n{output}\n") def main() -> None: diff --git a/tools/linter/adapters/pip_init.py b/tools/linter/adapters/pip_init.py new file mode 100644 index 0000000000..f177a920d0 --- /dev/null +++ b/tools/linter/adapters/pip_init.py @@ -0,0 +1,83 @@ +""" +Initializer 
script that installs stuff to pip. +""" +import argparse +import logging +import os +import subprocess +import sys +import time + +from typing import List + + +def run_command(args: List[str]) -> "subprocess.CompletedProcess[bytes]": + logging.debug("$ %s", " ".join(args)) + start_time = time.monotonic() + try: + return subprocess.run(args, check=True) + finally: + end_time = time.monotonic() + logging.debug("took %dms", (end_time - start_time) * 1000) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="pip initializer") + parser.add_argument( + "packages", + nargs="+", + help="pip packages to install", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose logging", + ) + parser.add_argument( + "--dry-run", help="do not install anything, just print what would be done." + ) + parser.add_argument( + "--no-black-binary", + help="do not use pre-compiled binaries from pip for black.", + action="store_true", + ) + + args = parser.parse_args() + + logging.basicConfig( + format="<%(threadName)s:%(levelname)s> %(message)s", + level=logging.NOTSET if args.verbose else logging.DEBUG, + stream=sys.stderr, + ) + + pip_args = ["pip3", "install"] + + # If we are in a global install, use `--user` to install so that you do not + # need root access in order to initialize linters. + # + # However, `pip install --user` interacts poorly with virtualenvs (see: + # https://bit.ly/3vD4kvl) and conda (see: https://bit.ly/3KG7ZfU). So in + # these cases perform a regular installation. + in_conda = os.environ.get("CONDA_PREFIX") is not None + in_virtualenv = os.environ.get("VIRTUAL_ENV") is not None + if not in_conda and not in_virtualenv: + pip_args.append("--user") + + pip_args.extend(args.packages) + + for package in args.packages: + package_name, _, version = package.partition("=") + if version == "": + raise RuntimeError( + "Package {package_name} did not have a version specified. 
" + "Please specify a version to produce a consistent linting experience." + ) + if args.no_black_binary and "black" in package_name: + pip_args.append(f"--no-binary={package_name}") + + dry_run = args.dry_run == "1" + if dry_run: + print(f"Would have run: {pip_args}") + sys.exit(0) + + run_command(pip_args) diff --git a/tools/linter/adapters/ruff_linter.py b/tools/linter/adapters/ruff_linter.py new file mode 100644 index 0000000000..451834aa7c --- /dev/null +++ b/tools/linter/adapters/ruff_linter.py @@ -0,0 +1,462 @@ +"""Adapter for https://github.com/charliermarsh/ruff.""" + +from __future__ import annotations + +import argparse +import concurrent.futures +import dataclasses +import enum +import json +import logging +import os +import subprocess +import sys +import time +from typing import Any, BinaryIO + +LINTER_CODE = "RUFF" +IS_WINDOWS: bool = os.name == "nt" + + +def eprint(*args: Any, **kwargs: Any) -> None: + """Print to stderr.""" + print(*args, file=sys.stderr, flush=True, **kwargs) + + +class LintSeverity(str, enum.Enum): + """Severity of a lint message.""" + + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +@dataclasses.dataclass(frozen=True) +class LintMessage: + """A lint message defined by https://docs.rs/lintrunner/latest/lintrunner/lint_message/struct.LintMessage.html.""" + + path: str | None + line: int | None + char: int | None + code: str + severity: LintSeverity + name: str + original: str | None + replacement: str | None + description: str | None + + def asdict(self) -> dict[str, Any]: + return dataclasses.asdict(self) + + def display(self) -> None: + """Print to stdout for lintrunner to consume.""" + print(json.dumps(self.asdict()), flush=True) + + +def as_posix(name: str) -> str: + return name.replace("\\", "/") if IS_WINDOWS else name + + +def _run_command( + args: list[str], + *, + timeout: int | None, + stdin: BinaryIO | None, + input: bytes | None, + check: bool, + cwd: os.PathLike[Any] | None, +) 
-> subprocess.CompletedProcess[bytes]: + logging.debug("$ %s", " ".join(args)) + start_time = time.monotonic() + try: + if input is not None: + return subprocess.run( + args, + capture_output=True, + shell=False, + input=input, + timeout=timeout, + check=check, + cwd=cwd, + ) + + return subprocess.run( + args, + stdin=stdin, + capture_output=True, + shell=False, + timeout=timeout, + check=check, + cwd=cwd, + ) + finally: + end_time = time.monotonic() + logging.debug("took %dms", (end_time - start_time) * 1000) + + +def run_command( + args: list[str], + *, + retries: int = 0, + timeout: int | None = None, + stdin: BinaryIO | None = None, + input: bytes | None = None, + check: bool = False, + cwd: os.PathLike[Any] | None = None, +) -> subprocess.CompletedProcess[bytes]: + remaining_retries = retries + while True: + try: + return _run_command( + args, timeout=timeout, stdin=stdin, input=input, check=check, cwd=cwd + ) + except subprocess.TimeoutExpired as err: + if remaining_retries == 0: + raise err + remaining_retries -= 1 + logging.warning( + "(%s/%s) Retrying because command failed with: %r", + retries - remaining_retries, + retries, + err, + ) + time.sleep(1) + + +def add_default_options(parser: argparse.ArgumentParser) -> None: + """Add default options to a parser. + + This should be called the last in the chain of add_argument calls. 
+ """ + parser.add_argument( + "--retries", + type=int, + default=3, + help="number of times to retry if the linter times out.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose logging", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + + +def explain_rule(code: str) -> str: + proc = run_command( + ["ruff", "rule", "--format=json", code], + check=True, + ) + rule = json.loads(str(proc.stdout, "utf-8").strip()) + return f"\n{rule['linter']}: {rule['summary']}" + + +def get_issue_severity(code: str) -> LintSeverity: + # "B901": `return x` inside a generator + # "B902": Invalid first argument to a method + # "B903": __slots__ efficiency + # "B950": Line too long + # "C4": Flake8 Comprehensions + # "C9": Cyclomatic complexity + # "E2": PEP8 horizontal whitespace "errors" + # "E3": PEP8 blank line "errors" + # "E5": PEP8 line length "errors" + # "T400": type checking Notes + # "T49": internal type checker errors or unmatched messages + if any( + code.startswith(x) + for x in ( + "B9", + "C4", + "C9", + "E2", + "E3", + "E5", + "T400", + "T49", + "PLC", + "PLR", + ) + ): + return LintSeverity.ADVICE + + # "F821": Undefined name + # "E999": syntax error + if any(code.startswith(x) for x in ("F821", "E999", "PLE")): + return LintSeverity.ERROR + + # "F": PyFlakes Error + # "B": flake8-bugbear Error + # "E": PEP8 "Error" + # "W": PEP8 Warning + # possibly other plugins... 
+ return LintSeverity.WARNING + + +def format_lint_message( + message: str, code: str, rules: dict[str, str], show_disable: bool +) -> str: + if rules: + message += f".\n{rules.get(code) or ''}" + message += ".\nSee https://beta.ruff.rs/docs/rules/" + if show_disable: + message += f".\n\nTo disable, use ` # noqa: {code}`" + return message + + +def check_files( + filenames: list[str], + severities: dict[str, LintSeverity], + *, + config: str | None, + retries: int, + timeout: int, + explain: bool, + show_disable: bool, +) -> list[LintMessage]: + try: + proc = run_command( + [ + sys.executable, + "-m", + "ruff", + "--exit-zero", + "--quiet", + "--format=json", + *([f"--config={config}"] if config else []), + *filenames, + ], + retries=retries, + timeout=timeout, + check=True, + ) + except (OSError, subprocess.CalledProcessError) as err: + return [ + LintMessage( + path=None, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=( + f"Failed due to {err.__class__.__name__}:\n{err}" + if not isinstance(err, subprocess.CalledProcessError) + else ( + f"COMMAND (exit code {err.returncode})\n" + f"{' '.join(as_posix(x) for x in err.cmd)}\n\n" + f"STDERR\n{err.stderr.decode('utf-8').strip() or '(empty)'}\n\n" + f"STDOUT\n{err.stdout.decode('utf-8').strip() or '(empty)'}" + ) + ), + ) + ] + + stdout = str(proc.stdout, "utf-8").strip() + vulnerabilities = json.loads(stdout) + + if explain: + all_codes = {v["code"] for v in vulnerabilities} + rules = {code: explain_rule(code) for code in all_codes} + else: + rules = {} + + return [ + LintMessage( + path=vuln["filename"], + name=vuln["code"], + description=( + format_lint_message( + vuln["message"], + vuln["code"], + rules, + show_disable, + ) + ), + line=int(vuln["location"]["row"]), + char=int(vuln["location"]["column"]), + code=LINTER_CODE, + severity=severities.get(vuln["code"], get_issue_severity(vuln["code"])), + 
original=None, + replacement=None, + ) + for vuln in vulnerabilities + ] + + +def check_file_for_fixes( + filename: str, + *, + config: str | None, + retries: int, + timeout: int, +) -> list[LintMessage]: + try: + with open(filename, "rb") as f: + original = f.read() + with open(filename, "rb") as f: + proc_fix = run_command( + [ + sys.executable, + "-m", + "ruff", + "--fix-only", + "--exit-zero", + *([f"--config={config}"] if config else []), + "--stdin-filename", + filename, + "-", + ], + stdin=f, + retries=retries, + timeout=timeout, + check=True, + ) + except (OSError, subprocess.CalledProcessError) as err: + return [ + LintMessage( + path=None, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=( + f"Failed due to {err.__class__.__name__}:\n{err}" + if not isinstance(err, subprocess.CalledProcessError) + else ( + f"COMMAND (exit code {err.returncode})\n" + f"{' '.join(as_posix(x) for x in err.cmd)}\n\n" + f"STDERR\n{err.stderr.decode('utf-8').strip() or '(empty)'}\n\n" + f"STDOUT\n{err.stdout.decode('utf-8').strip() or '(empty)'}" + ) + ), + ) + ] + + replacement = proc_fix.stdout + if original == replacement: + return [] + + return [ + LintMessage( + path=filename, + name="format", + description="Run `lintrunner -a` to apply this patch.", + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.WARNING, + original=original.decode("utf-8"), + replacement=replacement.decode("utf-8"), + ) + ] + + +def main() -> None: + parser = argparse.ArgumentParser( + description=f"Ruff linter. Linter code: {LINTER_CODE}. 
Use with RUFF-FIX to auto-fix issues.", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "--config", + default=None, + help="Path to the `pyproject.toml` or `ruff.toml` file to use for configuration", + ) + parser.add_argument( + "--explain", + action="store_true", + help="Explain a rule", + ) + parser.add_argument( + "--show-disable", + action="store_true", + help="Show how to disable a lint message", + ) + parser.add_argument( + "--timeout", + default=90, + type=int, + help="Seconds to wait for ruff", + ) + parser.add_argument( + "--severity", + action="append", + help="map code to severity (e.g. `F401:advice`). This option can be used multiple times.", + ) + parser.add_argument( + "--no-fix", + action="store_true", + help="Do not suggest fixes", + ) + add_default_options(parser) + args = parser.parse_args() + + logging.basicConfig( + format="<%(threadName)s:%(levelname)s> %(message)s", + level=logging.NOTSET + if args.verbose + else logging.DEBUG + if len(args.filenames) < 1000 + else logging.INFO, + stream=sys.stderr, + ) + + severities: dict[str, LintSeverity] = {} + if args.severity: + for severity in args.severity: + parts = severity.split(":", 1) + assert len(parts) == 2, f"invalid severity `{severity}`" + severities[parts[0]] = LintSeverity(parts[1]) + + lint_messages = check_files( + args.filenames, + severities=severities, + config=args.config, + retries=args.retries, + timeout=args.timeout, + explain=args.explain, + show_disable=args.show_disable, + ) + for lint_message in lint_messages: + lint_message.display() + + if args.no_fix or not lint_messages: + # If we're not fixing, we can exit early + return + + files_with_lints = {lint.path for lint in lint_messages if lint.path is not None} + with concurrent.futures.ThreadPoolExecutor( + max_workers=os.cpu_count(), + thread_name_prefix="Thread", + ) as executor: + futures = { + executor.submit( + check_file_for_fixes, + path, + config=args.config, + retries=args.retries, + timeout=args.timeout, + 
): path + for path in files_with_lints + } + for future in concurrent.futures.as_completed(futures): + try: + for lint_message in future.result(): + lint_message.display() + except Exception: # Catch all exceptions for lintrunner + logging.critical('Failed at "%s".', futures[future]) + raise + + +if __name__ == "__main__": + main() From b2b429b7e4abd84955d91e76c288faae7ec9d19e Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Sat, 30 Sep 2023 14:08:43 -0700 Subject: [PATCH 074/212] [BE] Add `s3_management` to the linted folders (#1558) Add `PERF401` to list of ignored suggestions, fix the rest. --- .lintrunner.toml | 2 +- pyproject.toml | 3 +++ s3_management/backup_conda.py | 14 ++++++++------ s3_management/manage.py | 17 +++++++---------- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index c551cb732d..5c63a6c5d8 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -2,7 +2,7 @@ merge_base_with = "origin/main" [[linter]] code = 'RUFF' -include_patterns = ['test/smoke_test/*.py'] +include_patterns = ['test/smoke_test/*.py', 's3_management/*.py'] command = [ 'python3', 'tools/linter/adapters/ruff_linter.py', diff --git a/pyproject.toml b/pyproject.toml index efa884a07f..b167148522 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,9 @@ [tool.ruff] target-version = "py38" line-length = 120 +ignore = [ + "PERF401", +] select = [ "B", "C4", diff --git a/s3_management/backup_conda.py b/s3_management/backup_conda.py index 06926589d3..7dafa32b46 100644 --- a/s3_management/backup_conda.py +++ b/s3_management/backup_conda.py @@ -4,9 +4,9 @@ # Do not use unless you know what you are doing # Usage: python backup_conda.py --version 1.6.0 -import conda.api import boto3 from typing import List, Optional +import conda.api import urllib import os import hashlib @@ -22,8 +22,11 @@ def compute_md5(path:str) -> str: return hashlib.md5(f.read()).hexdigest() -def 
download_conda_package(package:str, version:Optional[str] = None, depends:Optional[str] = None, channel:Optional[str] = None) -> List[str]: - packages = conda.api.SubdirData.query_all(package, channels = [channel] if channel is not None else None, subdirs = _known_subdirs) +def download_conda_package(package:str, version:Optional[str] = None, + depends:Optional[str] = None, channel:Optional[str] = None) -> List[str]: + packages = conda.api.SubdirData.query_all(package, + channels = [channel] if channel is not None else None, + subdirs = _known_subdirs) rc = [] for pkg in packages: @@ -36,9 +39,8 @@ def download_conda_package(package:str, version:Optional[str] = None, depends:Op os.makedirs(pkg.subdir, exist_ok = True) fname = f"{pkg.subdir}/{pkg.fn}" if not os.path.exists(fname): - with open(fname, "wb") as f: - with urllib.request.urlopen(pkg.url) as url: - f.write(url.read()) + with open(fname, "wb") as f, urllib.request.urlopen(pkg.url) as url: + f.write(url.read()) if compute_md5(fname) != pkg.md5: print(f"md5 of {fname} is {compute_md5(fname)} does not match {pkg.md5}") continue diff --git a/s3_management/manage.py b/s3_management/manage.py index 0df513d32a..57af9259b8 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -184,9 +184,7 @@ def nightly_packages_to_show(self: S3IndexType) -> Set[S3Object]: if package_name not in PACKAGE_ALLOW_LIST: to_hide.add(obj) continue - if packages[package_name] >= KEEP_THRESHOLD: - to_hide.add(obj) - elif between_bad_dates(package_build_time): + if packages[package_name] >= KEEP_THRESHOLD or between_bad_dates(package_build_time): to_hide.add(obj) else: packages[package_name] += 1 @@ -215,14 +213,13 @@ def gen_file_list( ) subdir = self._resolve_subdir(subdir) + '/' for obj in objects: - if package_name is not None: - if self.obj_to_package_name(obj) != package_name: - continue + if package_name is not None and self.obj_to_package_name(obj) != package_name: + continue if self.is_obj_at_root(obj) or 
obj.key.startswith(subdir): yield obj def get_package_names(self, subdir: Optional[str] = None) -> List[str]: - return sorted(set(self.obj_to_package_name(obj) for obj in self.gen_file_list(subdir))) + return sorted({self.obj_to_package_name(obj) for obj in self.gen_file_list(subdir)}) def normalize_package_version(self: S3IndexType, obj: S3Object) -> str: # removes the GPU specifier from the package name as well as @@ -284,7 +281,7 @@ def to_simple_package_html( # Adding html footer out.append(' ') out.append('') - out.append(''.format(int(time.time()))) + out.append(f'') return '\n'.join(out) def to_simple_packages_html( @@ -303,7 +300,7 @@ def to_simple_packages_html( # Adding html footer out.append(' ') out.append('') - out.append(''.format(int(time.time()))) + out.append(f'') return '\n'.join(out) def upload_legacy_html(self) -> None: @@ -412,7 +409,7 @@ def main(): args = parser.parse_args() action = "Saving" if args.do_not_upload else "Uploading" if args.prefix == 'all': - for prefix in PREFIXES_WITH_HTML.keys(): + for prefix in PREFIXES_WITH_HTML: print(f"INFO: {action} indices for '{prefix}'") idx = S3Index.from_S3(prefix=prefix) if args.do_not_upload: From aa6df6fc264ee6a414652d37f2022d5634e42174 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 2 Oct 2023 10:47:29 -0400 Subject: [PATCH 075/212] Fix path issue when building aarch64 wheels (#1560) --- aarch64_linux/aarch64_wheel_ci_build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index da789a2314..bdc6717ef2 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -106,8 +106,8 @@ def parse_arguments(): print("build pytorch without mkldnn backend") # work around to fix Raspberry pie crash - os.system(f"cd $HOME && git clone https://github.com/pytorch/builder.git") - os.system(f"cd $HOME/pytorch/third_party/ideep/mkl-dnn && patch -p1 < 
$HOME/builder/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch") + print("Applying mkl-dnn patch to fix Raspberry pie crash") + os.system(f"cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch") os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") pytorch_wheel_name = complete_wheel("pytorch") print(f"Build Compelete. Created {pytorch_wheel_name}..") From ab2443d2a8e666bfc589bc625990d4d9b092a6d4 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 2 Oct 2023 15:50:34 -0400 Subject: [PATCH 076/212] Fix linalg smoke tests (#1563) --- test/smoke_test/smoke_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 8ae1d1c512..3d1b6af64b 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -179,7 +179,7 @@ def smoke_test_linalg() -> None: torch.dist(A, U @ torch.diag(S) @ Vh) U, S, Vh = torch.linalg.svd(A) - assert U.shape == A.shape and S.shape == torch.Size([3]) and Vh.shape == torch.Size([3, 3]) + assert U.shape == torch.Size([5, 5]) and S.shape == torch.Size([3]) and Vh.shape == torch.Size([3, 3]) torch.dist(A, U[:, :3] @ torch.diag(S) @ Vh) A = torch.randn(7, 5, 3) From 50a6e91f972158529a7c9f06c4c6fd4510196e32 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 2 Oct 2023 21:02:59 -0700 Subject: [PATCH 077/212] Towards enabling M1 wheel builds Do not try to install MKL on Apple Silicon --- wheel/build_wheel.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/wheel/build_wheel.sh b/wheel/build_wheel.sh index 99d251977a..a45522bf8a 100755 --- a/wheel/build_wheel.sh +++ b/wheel/build_wheel.sh @@ -168,7 +168,11 @@ if [[ "$desired_python" == "3.11" ]]; then else retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq "numpy${NUMPY_PINNED_VERSION}" nomkl "setuptools${SETUPTOOLS_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" 
typing_extensions requests fi -retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq cmake ninja mkl-include==2022.2.1 mkl-static==2022.2.1 -c intel +if [[ "$(uname -m)" == "arm64" ]]; then + retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq cmake ninja +else + retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq cmake ninja mkl-include==2022.2.1 mkl-static==2022.2.1 -c intel +fi retry pip install -qr "${pytorch_rootdir}/requirements.txt" || true # For USE_DISTRIBUTED=1 on macOS, need libuv and pkg-config to find libuv. From bbb29b0467ecbf4fa1f14e51f30f68a0d7c9dc23 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 2 Oct 2023 21:12:50 -0700 Subject: [PATCH 078/212] And only install llvm-9 on x86 systems --- wheel/build_wheel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wheel/build_wheel.sh b/wheel/build_wheel.sh index a45522bf8a..5ac52f4d48 100755 --- a/wheel/build_wheel.sh +++ b/wheel/build_wheel.sh @@ -184,7 +184,7 @@ if [[ -n "$CROSS_COMPILE_ARM64" ]]; then export USE_MKLDNN=OFF export USE_QNNPACK=OFF export BUILD_TEST=OFF -else +elif [[ "$(uname -m)" == "x86_64" ]]; then retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq llvmdev=9 export USE_LLVM="${CONDA_PREFIX}" fi From cc4f1f9055b9e32eb4c81d8525f10a537c913274 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 2 Oct 2023 22:23:09 -0700 Subject: [PATCH 079/212] Do not build tests when building natively on M1 --- wheel/build_wheel.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/wheel/build_wheel.sh b/wheel/build_wheel.sh index 5ac52f4d48..d93e16effe 100755 --- a/wheel/build_wheel.sh +++ b/wheel/build_wheel.sh @@ -179,8 +179,10 @@ retry pip install -qr "${pytorch_rootdir}/requirements.txt" || true export USE_DISTRIBUTED=1 retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq libuv pkg-config -if [[ -n "$CROSS_COMPILE_ARM64" ]]; then - export CMAKE_OSX_ARCHITECTURES=arm64 +if [[ -n "$CROSS_COMPILE_ARM64" || "$(uname -m)" == "arm64" ]]; then + if [[ 
-n "$CROSS_COMPILE_ARM64" ]]; then + export CMAKE_OSX_ARCHITECTURES=arm64 + fi export USE_MKLDNN=OFF export USE_QNNPACK=OFF export BUILD_TEST=OFF From 8bcc83dbb1e947f7e4c546ea19959d775e7fa7d6 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 2 Oct 2023 22:24:37 -0700 Subject: [PATCH 080/212] And fix Python-3.8 native compilation on M1 There are no numpy=3.17 for M1 --- wheel/build_wheel.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/wheel/build_wheel.sh b/wheel/build_wheel.sh index d93e16effe..75138fc094 100755 --- a/wheel/build_wheel.sh +++ b/wheel/build_wheel.sh @@ -151,7 +151,13 @@ case ${desired_python} in NUMPY_PINNED_VERSION="=1.19" ;; 3.8) - NUMPY_PINNED_VERSION="=1.17" + if [[ "$(uname -m)" == "arm64" ]]; then + SETUPTOOLS_PINNED_VERSION=">=46.0.0" + PYYAML_PINNED_VERSION=">=5.3" + NUMPY_PINNED_VERSION="=1.19" + else + NUMPY_PINNED_VERSION="=1.17" + fi ;; *) NUMPY_PINNED_VERSION="=1.11.3" From b39cccf546f54c95f15001d0cc5f2ce222d35fdd Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 3 Oct 2023 14:08:11 -0400 Subject: [PATCH 081/212] Release 2.1 update promotion scripts (#1564) --- release/promote.sh | 14 +++++++------- release/pypi/promote_pypi_to_staging.sh | 4 ++-- release/release_versions.sh | 10 +++++----- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/release/promote.sh b/release/promote.sh index 1147dc0c98..b3656dda6c 100644 --- a/release/promote.sh +++ b/release/promote.sh @@ -6,11 +6,11 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" source "${DIR}/release_versions.sh" # Make sure to update these versions when doing a release first -PYTORCH_VERSION=${PYTORCH_VERSION:-2.0.0} -TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.15.0} -TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.0.0} -TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.15.0} -TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.6.0} +PYTORCH_VERSION=${PYTORCH_VERSION:-2.1.0} +TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.0} 
+TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.0} +TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.0} +TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.0} DRY_RUN=${DRY_RUN:-enabled} @@ -104,9 +104,9 @@ promote_pypi() { # promote_s3 torchdata whl "${TORCHDATA_VERSION}" # promote_s3 "libtorch-*" libtorch "${PYTORCH_VERSION}" -# promote_conda torchtriton conda "2.0.0" -# promote_conda pytorch-cuda conda "11.7" +# promote_conda torchtriton conda "2.1.0" # promote_conda pytorch-cuda conda "11.8" +# promote_conda pytorch-cuda conda "12.1" # promote_conda pytorch conda "${PYTORCH_VERSION}" # promote_conda torchvision conda "${TORCHVISION_VERSION}" diff --git a/release/pypi/promote_pypi_to_staging.sh b/release/pypi/promote_pypi_to_staging.sh index 74f139680e..678d9dd034 100644 --- a/release/pypi/promote_pypi_to_staging.sh +++ b/release/pypi/promote_pypi_to_staging.sh @@ -21,8 +21,8 @@ upload_pypi_to_staging() { } # Uncomment these to promote to pypi -PYTORCH_LINUX_VERSION_SUFFIX="%2Bcu117.with.pypi.cudnn" -LINUX_VERSION_SUFFIX="%2Bcu117" +PYTORCH_LINUX_VERSION_SUFFIX="%2Bcu121.with.pypi.cudnn" +LINUX_VERSION_SUFFIX="%2Bcu121" WIN_VERSION_SUFFIX="%2Bcpu" MACOS_X86_64="macosx_.*_x86_64" MACOS_ARM64="macosx_.*_arm64" diff --git a/release/release_versions.sh b/release/release_versions.sh index f0db2a0895..ab35075b6f 100644 --- a/release/release_versions.sh +++ b/release/release_versions.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash # Make sure to update these versions when doing a release first -PYTORCH_VERSION=${PYTORCH_VERSION:-2.0.0} -TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.15.0} -TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.0.0} -TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.15.0} -TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.6.0} +PYTORCH_VERSION=${PYTORCH_VERSION:-2.1.0} +TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.0} +TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.0} +TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.0} +TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.0} From 
8a2aacef9a0a29ad6694a325b2f6162b862dd2d3 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 4 Oct 2023 08:52:08 -0700 Subject: [PATCH 082/212] [BE] Small code cleanup Fold multiple inidices and single index generation into one loop As loop body is the same anyway... --- s3_management/manage.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 57af9259b8..f698deb872 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -404,21 +404,14 @@ def create_parser() -> argparse.ArgumentParser: return parser -def main(): +def main() -> None: parser = create_parser() args = parser.parse_args() action = "Saving" if args.do_not_upload else "Uploading" - if args.prefix == 'all': - for prefix in PREFIXES_WITH_HTML: - print(f"INFO: {action} indices for '{prefix}'") - idx = S3Index.from_S3(prefix=prefix) - if args.do_not_upload: - idx.save_legacy_html() - else: - idx.upload_legacy_html() - else: - print(f"INFO: {action} indices for '{args.prefix}'") - idx = S3Index.from_S3(prefix=args.prefix) + prefixes = PREFIXES_WITH_HTML if args.prefix == 'all' else [args.prefix] + for prefix in prefixes: + print(f"INFO: {action} indices for '{prefix}'") + idx = S3Index.from_S3(prefix=prefix) if args.do_not_upload: idx.save_legacy_html() if args.generate_pep503: From dddbbf7c9bd61b05447e32f35a1a77e7cf6213e3 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 4 Oct 2023 10:36:10 -0700 Subject: [PATCH 083/212] S3_management: Add option to compute sha256 That will be used later to generate sha256 indexes in PEP503 --- s3_management/manage.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index f698deb872..3b2e736fca 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -6,6 +6,7 @@ import dataclasses import functools import time +import hashlib from os import path, 
makedirs from datetime import datetime @@ -116,7 +117,9 @@ @functools.total_ordering class S3Object: key: str + orig_key: str checksum: Optional[str] + size: Optional[int] def __str__(self): return self.key @@ -356,6 +359,20 @@ def save_pep503_htmls(self) -> None: with open(path.join(subdir, pkg_name, "index.html"), mode="w", encoding="utf-8") as f: f.write(self.to_simple_package_html(subdir=subdir, package_name=pkg_name)) + def compute_sha256(self) -> None: + for obj in self.objects: + if obj.checksum is not None: + continue + print(f"Computing sha256 for {obj.orig_key} of size {obj.size}") + sha256_sum = hashlib.sha256() + s3_obj = BUCKET.Object(key=obj.orig_key) + sha256_sum.update(s3_obj.get()["Body"].read()) + digest = sha256_sum.hexdigest() + s3_obj.metadata.update({"checksum-sha256": digest}) + s3_obj.copy_from(CopySource={"Bucket": BUCKET.name, "Key": obj.orig_key}, + Metadata=s3_obj.metadata, MetadataDirective="REPLACE") + + @classmethod def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: prefix = prefix.rstrip("/") @@ -383,10 +400,15 @@ def fetch_metadata(key: str) : # For older files, rely on checksum-sha256 metadata that can be added to the file later if sha256 is None: sha256 = response.get("Metadata", {}).get("checksum-sha256") + if sha256 is not None: + print(f"Find metadata for {obj_key}") sanitized_key = obj_key.replace("+", "%2B") + size = response.get("ContentLength") s3_object = S3Object( key=sanitized_key, + orig_key=obj_key, checksum=sha256, + size=int(size) if size else size, ) objects.append(s3_object) return cls(objects, prefix) @@ -401,18 +423,24 @@ def create_parser() -> argparse.ArgumentParser: ) parser.add_argument("--do-not-upload", action="store_true") parser.add_argument("--generate-pep503", action="store_true") + parser.add_argument("--compute-sha256", action="store_true") return parser def main() -> None: parser = create_parser() args = parser.parse_args() - action = "Saving" if args.do_not_upload else "Uploading" + 
action = "Saving indices" if args.do_not_upload else "Uploading indices" + if args.compute_sha256: + action = "Computing checksums" + prefixes = PREFIXES_WITH_HTML if args.prefix == 'all' else [args.prefix] for prefix in prefixes: - print(f"INFO: {action} indices for '{prefix}'") + print(f"INFO: {action} for '{prefix}'") idx = S3Index.from_S3(prefix=prefix) - if args.do_not_upload: + if args.compute_sha256: + idx.compute_sha256() + elif args.do_not_upload: idx.save_legacy_html() if args.generate_pep503: idx.save_pep503_htmls() From 312e0519507b9601ad980f4ad7c111dc44df934e Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 4 Oct 2023 10:42:01 -0700 Subject: [PATCH 084/212] Remove debug print --- s3_management/manage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 3b2e736fca..38cb9e8bc0 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -400,8 +400,6 @@ def fetch_metadata(key: str) : # For older files, rely on checksum-sha256 metadata that can be added to the file later if sha256 is None: sha256 = response.get("Metadata", {}).get("checksum-sha256") - if sha256 is not None: - print(f"Find metadata for {obj_key}") sanitized_key = obj_key.replace("+", "%2B") size = response.get("ContentLength") s3_object = S3Object( From 2a7b17df319c184d060cb21d376e07cf3fe546a8 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 4 Oct 2023 12:28:46 -0700 Subject: [PATCH 085/212] [S3_management] Minor improvements - Refactor `fetch_obj_names` into class method - Make sure that object remains public when ACL is computed - Add `has_public_read` and `grant_public_read` class methods --- s3_management/manage.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 38cb9e8bc0..3412d1692f 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -370,12 +370,28 @@ def compute_sha256(self) -> None: 
digest = sha256_sum.hexdigest() s3_obj.metadata.update({"checksum-sha256": digest}) s3_obj.copy_from(CopySource={"Bucket": BUCKET.name, "Key": obj.orig_key}, - Metadata=s3_obj.metadata, MetadataDirective="REPLACE") + Metadata=s3_obj.metadata, MetadataDirective="REPLACE", + ACL="public-read", + ChecksumAlgorithm="SHA256") @classmethod - def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: - prefix = prefix.rstrip("/") + def has_public_read(cls:Type[S3IndexType], key: str) -> bool: + def is_all_users_group(o) -> bool: + return o.get("Grantee",{}).get("URI") == "http://acs.amazonaws.com/groups/global/AllUsers" + + def can_read(o) -> bool: + return o.get("Permission") in ["READ", "FULL_CONTROL"] + + acl_grants = CLIENT.get_object_acl(Bucket=BUCKET.name, Key=key)["Grants"] + return any(is_all_users_group(x) and can_read(x) for x in acl_grants) + + @classmethod + def grant_public_read(cls: Type[S3IndexType], key: str) -> None: + CLIENT.put_object_acl(Bucket=BUCKET.name, Key=key, ACL="public-read") + + @classmethod + def fetch_object_names(cls: Type[S3IndexType], prefix: str) -> List[str]: obj_names = [] for obj in BUCKET.objects.filter(Prefix=prefix): is_acceptable = any([path.dirname(obj.key) == prefix] + [ @@ -388,6 +404,12 @@ def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: if not is_acceptable: continue obj_names.append(obj.key) + return obj_names + + @classmethod + def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: + prefix = prefix.rstrip("/") + obj_names = cls.fetch_object_names(prefix) objects = [] def fetch_metadata(key: str) : return CLIENT.head_object(Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled") From 3e2d4a1e65f144897d4dec37bc57519b9d078396 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 4 Oct 2023 12:51:27 -0700 Subject: [PATCH 086/212] s3_management: compute checksum in cloud I.e. 
file never gets downloaded on the client, which is a nice thing --- s3_management/manage.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 3412d1692f..a2c79f2117 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -6,7 +6,6 @@ import dataclasses import functools import time -import hashlib from os import path, makedirs from datetime import datetime @@ -363,12 +362,8 @@ def compute_sha256(self) -> None: for obj in self.objects: if obj.checksum is not None: continue - print(f"Computing sha256 for {obj.orig_key} of size {obj.size}") - sha256_sum = hashlib.sha256() + print(f"Updating {obj.orig_key} of size {obj.size} with SHA256 checksum") s3_obj = BUCKET.Object(key=obj.orig_key) - sha256_sum.update(s3_obj.get()["Body"].read()) - digest = sha256_sum.hexdigest() - s3_obj.metadata.update({"checksum-sha256": digest}) s3_obj.copy_from(CopySource={"Bucket": BUCKET.name, "Key": obj.orig_key}, Metadata=s3_obj.metadata, MetadataDirective="REPLACE", ACL="public-read", From 07efc44334baf7f900bf7ecbc56d1ab54010d7de Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 5 Oct 2023 07:30:32 -0700 Subject: [PATCH 087/212] [S3Management] Add `undelete_prefix` method That can be used to recover object in a versioned bucket --- s3_management/manage.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/s3_management/manage.py b/s3_management/manage.py index a2c79f2117..f8674bc191 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -428,6 +428,18 @@ def fetch_metadata(key: str) : objects.append(s3_object) return cls(objects, prefix) + @classmethod + def undelete_prefix(cls: Type[S3IndexType], prefix: str) -> None: + paginator = CLIENT.get_paginator("list_object_versions") + for page in paginator.paginate(Bucket=BUCKET.name, Prefix=prefix): + for obj in page.get("DeleteMarkers", []): + if not obj.get("IsLatest"): + continue + obj_key, obj_version_id = 
obj["Key"], obj["VersionId"] + obj_ver = S3.ObjectVersion(BUCKET.name, obj_key, obj_version_id) + print(f"Undeleting {obj_key} deleted on {obj['LastModified']}") + obj_ver.delete() + def create_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser("Manage S3 HTML indices for PyTorch") From eebd2ce10fe8472ae56ddfcfbb8809d4bf1be01d Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 6 Oct 2023 12:51:34 -0400 Subject: [PATCH 088/212] Validate poetry for release (#1567) * Validate poetry for release * test * test * fixtypo --- .github/scripts/validate_poetry.sh | 3 ++- .github/workflows/validate-linux-binaries.yml | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/scripts/validate_poetry.sh b/.github/scripts/validate_poetry.sh index c4e4fd1549..3c41d5b452 100644 --- a/.github/scripts/validate_poetry.sh +++ b/.github/scripts/validate_poetry.sh @@ -26,10 +26,11 @@ if [[ ${MATRIX_CHANNEL} != "release" ]]; then fi else export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring + poetry source add --priority=explicit pytorch "https://download.pytorch.org/whl/${MATRIX_DESIRED_CUDA}" if [[ ${TORCH_ONLY} == 'true' ]]; then poetry --quiet add torch else - poetry --quiet add torch torchaudio torchvision + poetry --quiet add --source pytorch torch torchaudio torchvision fi fi diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 12335e8436..3c5aac2ebd 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -68,11 +68,8 @@ jobs: eval "$(conda shell.bash hook)" # Special case PyPi installation package. 
And Install of PyPi package via poetry - if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" ]] && \ - ([[ ${MATRIX_GPU_ARCH_VERSION} == "12.1" && ${MATRIX_CHANNEL} != "release" ]] || \ - [[ ${MATRIX_GPU_ARCH_VERSION} == "11.7" && ${MATRIX_CHANNEL} == "release" ]]); then + if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" && ${MATRIX_GPU_ARCH_VERSION} == "12.1" ]]; then source ./.github/scripts/validate_pipy.sh - # temporary disable poetry check source ./.github/scripts/validate_poetry.sh fi From a5f2068c2597a6be55159fb5aaa3bf361ccea621 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Sun, 8 Oct 2023 22:11:41 -0700 Subject: [PATCH 089/212] Use released version of 3.12 (#1568) As it was released on Oct 6 2023: https://www.python.org/downloads/release/python-3120/ --- common/install_cpython.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/common/install_cpython.sh b/common/install_cpython.sh index 8347822b60..67bffe36f9 100755 --- a/common/install_cpython.sh +++ b/common/install_cpython.sh @@ -64,10 +64,6 @@ function build_cpython { check_var $py_ver check_var $PYTHON_DOWNLOAD_URL local py_ver_folder=$py_ver - # Only b2 version of 3.12 is available right now - if [ "$py_ver" = "3.12.0" ]; then - py_ver=$py_ver"b2" - fi wget -q $PYTHON_DOWNLOAD_URL/$py_ver_folder/Python-$py_ver.tgz do_cpython_build $py_ver none rm -f Python-$py_ver.tgz From 0481289201f161445ae3ff34aa7fd762d50bd07b Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Tue, 10 Oct 2023 21:51:08 -0700 Subject: [PATCH 090/212] Move manywheel builds to `linux.12xlarge.ephemeral` (#1569) Should be faster(<20 min vs 40+ min) and as secure as using GH ones --- .github/workflows/build-manywheel-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index bbac707dd9..d717416f63 100644 --- 
a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -34,7 +34,7 @@ env: jobs: build-docker-cuda: - runs-on: ubuntu-22.04 + runs-on: linux.12xlarge.ephemeral strategy: matrix: cuda_version: ["12.1", "11.8"] From 00841b69a7f7d0aed526fe42444e57e60d813630 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 11 Oct 2023 05:09:54 +0000 Subject: [PATCH 091/212] Add cuSparseLt-0.5.0 to manywheel images --- common/install_cuda.sh | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/common/install_cuda.sh b/common/install_cuda.sh index e087a44c1c..8907284e01 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -3,7 +3,7 @@ set -ex function install_118 { - echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15" + echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15 and cuSparseLt-0.5.0" rm -rf /usr/local/cuda-11.8 /usr/local/cuda # install CUDA 11.8.0 in the same container wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run @@ -31,10 +31,20 @@ function install_118 { cd .. 
rm -rf tmp_nccl ldconfig + + # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html + mkdir tmp_cusparselt && pushd tmp_cusparselt + wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.0.1-archive.tar.xz + tar xf libcusparse_lt-linux-x86_64-0.5.0.1-archive.tar.xz + cp -a libcusparse_lt-linux-x86_64-0.5.0.1-archive/include/* /usr/local/cuda/include/ + cp -a libcusparse_lt-linux-x86_64-0.5.0.1-archive/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_custparselt + ldconfig } function install_121 { - echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.18.1" + echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.18.1 and cuSparseLt-0.5.0" rm -rf /usr/local/cuda-12.1 /usr/local/cuda # install CUDA 12.1.0 in the same container wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run @@ -62,6 +72,16 @@ function install_121 { cd .. rm -rf tmp_nccl ldconfig + + # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html + mkdir tmp_cusparselt && pushd tmp_cusparselt + wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.0.1-archive.tar.xz + tar xf libcusparse_lt-linux-x86_64-0.5.0.1-archive.tar.xz + cp -a libcusparse_lt-linux-x86_64-0.5.0.1-archive/include/* /usr/local/cuda/include/ + cp -a libcusparse_lt-linux-x86_64-0.5.0.1-archive/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_custparselt + ldconfig } function prune_118 { From 321ab64ca5552da731fea5e26e3a2829b96b3b20 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Wed, 11 Oct 2023 08:35:36 -0700 Subject: [PATCH 092/212] Use `linux.12xlarge.ephemeral` for conda docker builds (#1570) As `ubuntu.20.04` often OOM/failed to fetch data from RHEL repo --- .github/workflows/build-conda-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/.github/workflows/build-conda-images.yml b/.github/workflows/build-conda-images.yml index 4f4f9f43d7..9e5290cef5 100644 --- a/.github/workflows/build-conda-images.yml +++ b/.github/workflows/build-conda-images.yml @@ -28,7 +28,7 @@ env: jobs: build-docker: - runs-on: ubuntu-22.04 + runs-on: linux.12xlarge.ephemeral strategy: matrix: cuda_version: ["11.8", "12.1", "cpu"] From 4e68b6d6a3437b50f3774486bcab9e000940a421 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 11 Oct 2023 08:48:36 -0700 Subject: [PATCH 093/212] Revert "Add cuSparseLt-0.5.0 to manywheel images" This reverts commit 00841b69a7f7d0aed526fe42444e57e60d813630 as cuSparseLT is not compatible with CentOS 7 --- common/install_cuda.sh | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/common/install_cuda.sh b/common/install_cuda.sh index 8907284e01..e087a44c1c 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -3,7 +3,7 @@ set -ex function install_118 { - echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15 and cuSparseLt-0.5.0" + echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15" rm -rf /usr/local/cuda-11.8 /usr/local/cuda # install CUDA 11.8.0 in the same container wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run @@ -31,20 +31,10 @@ function install_118 { cd .. 
rm -rf tmp_nccl ldconfig - - # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html - mkdir tmp_cusparselt && pushd tmp_cusparselt - wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.0.1-archive.tar.xz - tar xf libcusparse_lt-linux-x86_64-0.5.0.1-archive.tar.xz - cp -a libcusparse_lt-linux-x86_64-0.5.0.1-archive/include/* /usr/local/cuda/include/ - cp -a libcusparse_lt-linux-x86_64-0.5.0.1-archive/lib/* /usr/local/cuda/lib64/ - popd - rm -rf tmp_custparselt - ldconfig } function install_121 { - echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.18.1 and cuSparseLt-0.5.0" + echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.18.1" rm -rf /usr/local/cuda-12.1 /usr/local/cuda # install CUDA 12.1.0 in the same container wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run @@ -72,16 +62,6 @@ function install_121 { cd .. rm -rf tmp_nccl ldconfig - - # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html - mkdir tmp_cusparselt && pushd tmp_cusparselt - wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.0.1-archive.tar.xz - tar xf libcusparse_lt-linux-x86_64-0.5.0.1-archive.tar.xz - cp -a libcusparse_lt-linux-x86_64-0.5.0.1-archive/include/* /usr/local/cuda/include/ - cp -a libcusparse_lt-linux-x86_64-0.5.0.1-archive/lib/* /usr/local/cuda/lib64/ - popd - rm -rf tmp_custparselt - ldconfig } function prune_118 { From 4395f498f4e1f6624cb1a4f7b1c83ea912fff7e4 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Wed, 11 Oct 2023 09:24:06 -0700 Subject: [PATCH 094/212] Move libtorch docker builder to `linux.12xlarge.ephemeral` (#1571) As running it on `ubutu22.04` often results in flay infra failures/running out of disk space, for example, from 
https://github.com/pytorch/builder/actions/runs/6484948230/job/17609933012 ``` cat: write error: No space left on device ``` --- .github/workflows/build-libtorch-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index d620300473..7968bbb26d 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -30,7 +30,7 @@ env: jobs: build-docker-cuda: - runs-on: ubuntu-22.04 + runs-on: linux.12xlarge.ephemeral strategy: matrix: cuda_version: ["12.1", "11.8"] From ce1c649a3a778726cb57129c57dbfe2c8fa19713 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 11 Oct 2023 05:09:54 +0000 Subject: [PATCH 095/212] Add cuSparseLt-0.4.0 to manywheel images But set USE_CUSPARSELT to 0 by default --- common/install_cuda.sh | 23 +++++++++++++++++++---- manywheel/build_cuda.sh | 1 + 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/common/install_cuda.sh b/common/install_cuda.sh index e087a44c1c..f09666e643 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -2,8 +2,19 @@ set -ex +function install_cusparselt_040 { + # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html + mkdir tmp_cusparselt && pushd tmp_cusparselt + wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz + tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz + cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/ + cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_custparselt +} + function install_118 { - echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15" + echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15 and cuSparseLt-0.5.0" rm -rf /usr/local/cuda-11.8 /usr/local/cuda # install CUDA 11.8.0 in the same container 
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run @@ -20,7 +31,6 @@ function install_118 { cp -a cudnn-linux-x86_64-8.7.0.84_cuda11-archive/lib/* /usr/local/cuda/lib64/ cd .. rm -rf tmp_cudnn - ldconfig # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses mkdir tmp_nccl && cd tmp_nccl @@ -30,11 +40,14 @@ function install_118 { cp -a nccl_2.15.5-1+cuda11.8_x86_64/lib/* /usr/local/cuda/lib64/ cd .. rm -rf tmp_nccl + + install_cusparselt_040 + ldconfig } function install_121 { - echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.18.1" + echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.18.1 and cuSparseLt-0.5.0" rm -rf /usr/local/cuda-12.1 /usr/local/cuda # install CUDA 12.1.0 in the same container wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run @@ -51,7 +64,6 @@ function install_121 { cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/ cd .. rm -rf tmp_cudnn - ldconfig # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses mkdir tmp_nccl && cd tmp_nccl @@ -61,6 +73,9 @@ function install_121 { cp -a nccl_2.18.1-1+cuda12.1_x86_64/lib/* /usr/local/cuda/lib64/ cd .. 
rm -rf tmp_nccl + + install_cusparselt_040 + ldconfig } diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index afb4aa8113..2689dbd482 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -13,6 +13,7 @@ export ATEN_STATIC_CUDA=1 export USE_CUDA_STATIC_LINK=1 export INSTALL_TEST=0 # dont install test binaries into site-packages export USE_CUPTI_SO=0 +export USE_CUSPARSELT=0 # disable for now # Keep an array of cmake variables to add to if [[ -z "$CMAKE_ARGS" ]]; then From 94b198b7eae9f1ad40331feca26634d43dfc81da Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Fri, 13 Oct 2023 08:58:28 -0700 Subject: [PATCH 096/212] Add xformers to the list of indexable packages --- s3_management/manage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/s3_management/manage.py b/s3_management/manage.py index f8674bc191..a8bda72e29 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -99,6 +99,7 @@ "tqdm", "typing_extensions", "urllib3", + "xformers", } # Should match torch-2.0.0.dev20221221+cu118-cp310-cp310-linux_x86_64.whl as: From f01d7105b19b417802731fa2092df5fa9f911bf2 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 13 Oct 2023 14:45:01 -0700 Subject: [PATCH 097/212] Build wheels with cuSparseLt Build libtorch without cuSparseLt so far Factor out `DEPS_LIST` to top level and add cuSparseLt of `USE_CUSPARSELT` is set to 1 Tested in https://github.com/pytorch/pytorch/pull/111245 --- libtorch/build.sh | 2 +- manywheel/build_cuda.sh | 30 +++++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/libtorch/build.sh b/libtorch/build.sh index 88c8c6f9a8..d9f78fd66e 100644 --- a/libtorch/build.sh +++ b/libtorch/build.sh @@ -7,4 +7,4 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.8" ${SCRIPTPATH}/../manywheel/build.sh +USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 
DESIRED_PYTHON="3.8" ${SCRIPTPATH}/../manywheel/build.sh diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index 2689dbd482..c59cbca163 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -13,7 +13,7 @@ export ATEN_STATIC_CUDA=1 export USE_CUDA_STATIC_LINK=1 export INSTALL_TEST=0 # dont install test binaries into site-packages export USE_CUPTI_SO=0 -export USE_CUSPARSELT=0 # disable for now +export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build # Keep an array of cmake variables to add to if [[ -z "$CMAKE_ARGS" ]]; then @@ -113,16 +113,26 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1" fi +DEPS_LIST=( + "$LIBGOMP_PATH" +) +DEPS_SONAME=( + "libgomp.so.1" +) + +if [[ $USE_CUSPARSELT == "1" ]]; then + DEPS_SONAME+=( + "libcusparseLt.so.0" + ) + DEPS_LIST+=( + "/usr/local/cuda/lib64/libcusparseLt.so.0" + ) +fi + if [[ $CUDA_VERSION == "12.1" ]]; then export USE_STATIC_CUDNN=0 # Try parallelizing nvcc as well export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" - DEPS_LIST=( - "$LIBGOMP_PATH" - ) - DEPS_SONAME=( - "libgomp.so.1" - ) if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." @@ -187,12 +197,6 @@ elif [[ $CUDA_VERSION == "11.8" ]]; then export USE_STATIC_CUDNN=0 # Try parallelizing nvcc as well export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" - DEPS_LIST=( - "$LIBGOMP_PATH" - ) - DEPS_SONAME=( - "libgomp.so.1" - ) if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." 
From d41bcbfe52fc38bf323a3f2a8165289abdcabd23 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 16 Oct 2023 16:29:53 -0700 Subject: [PATCH 098/212] Do not build conda with CuSparseLT --- conda/build_pytorch.sh | 1 + conda/pytorch-nightly/meta.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 6f8eaf502a..029372303b 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -351,6 +351,7 @@ for py_ver in "${DESIRED_PYTHON[@]}"; do PYTORCH_GITHUB_ROOT_DIR="$pytorch_rootdir" \ PYTORCH_BUILD_STRING="$build_string" \ PYTORCH_MAGMA_CUDA_VERSION="$cuda_nodot" \ + USE_CUSPARSELT=0 \ conda build -c "$ANACONDA_USER" \ ${NO_TEST:-} \ --no-anaconda-upload \ diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 79025dc8c1..59efc729bf 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -94,6 +94,7 @@ build: - USE_COREML_DELEGATE # [osx] - _GLIBCXX_USE_CXX11_ABI # [unix] - OVERRIDE_TORCH_CUDA_ARCH_LIST + - USE_CUSPARSELT test: imports: From 63cb272d4e556bc535a579f1cc077acf0001ef01 Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Wed, 18 Oct 2023 02:45:43 -0500 Subject: [PATCH 099/212] Add ROCM_PATH env var to Dockerfile for ROCm5.7 issue with finding HIP (#1572) --- libtorch/Dockerfile | 4 ++++ manywheel/Dockerfile | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/libtorch/Dockerfile b/libtorch/Dockerfile index c01c6416e4..5d5b707af8 100644 --- a/libtorch/Dockerfile +++ b/libtorch/Dockerfile @@ -59,6 +59,10 @@ FROM cpu as rocm ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ENV MKLROOT /opt/intel +# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0) +# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above. +# Remove below when ROCm5.7 is not in support matrix anymore. 
+ENV ROCM_PATH /opt/rocm # No need to install ROCm as base docker image should have full ROCm install #ADD ./common/install_rocm.sh install_rocm.sh ADD ./common/install_rocm_drm.sh install_rocm_drm.sh diff --git a/manywheel/Dockerfile b/manywheel/Dockerfile index d3e9ad2ef9..4edaef1932 100644 --- a/manywheel/Dockerfile +++ b/manywheel/Dockerfile @@ -159,6 +159,10 @@ FROM cpu_final as rocm_final ARG ROCM_VERSION=3.7 ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} +# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0) +# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above. +# Remove below when ROCm5.7 is not in support matrix anymore. +ENV ROCM_PATH /opt/rocm # No need to install ROCm as base docker image should have full ROCm install #ADD ./common/install_rocm.sh install_rocm.sh #RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh From 85df81032b2c3508585f72ca05c1b312804b0266 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 23 Oct 2023 20:48:31 -0700 Subject: [PATCH 100/212] [aarch64_wheel] Minor typing improvements --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index bdc6717ef2..0c413b07e8 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -30,7 +30,7 @@ def build_ArmComputeLibrary(git_clone_flags: str = "") -> None: "cp -r src $acl_install_dir; cd /") -def complete_wheel(folder: str): +def complete_wheel(folder: str) -> str: ''' Complete wheel build and put in artifact location ''' From a414219beff5961ee40af86fe1db31c54982c240 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 23 Oct 2023 20:49:50 -0700 Subject: [PATCH 101/212] [aarch64_wheel] Flake8 fix --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 0c413b07e8..9cc4ac4d8a 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -107,7 +107,7 @@ def parse_arguments(): # work around to fix Raspberry pie crash print("Applying mkl-dnn patch to fix Raspberry pie crash") - os.system(f"cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch") + os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch") os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") pytorch_wheel_name = complete_wheel("pytorch") print(f"Build Compelete. Created {pytorch_wheel_name}..") From e11155aa6bae87e16a7b3e96eb235354d6405c01 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 23 Oct 2023 20:50:42 -0700 Subject: [PATCH 102/212] [aarch64_wheel] Cosmetic changes --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 9cc4ac4d8a..4a8fcc8751 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -87,7 +87,7 @@ def parse_arguments(): version = override_package_version build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 " else: - if branch == 'nightly' or branch == 'master': + if branch in ['nightly', 'master']: build_date = subprocess.check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '') version = subprocess.check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2] build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 " From c5e331c0858e37fedc047707466161dfe0cadff6 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 23 Oct 
2023 20:52:12 -0700 Subject: [PATCH 103/212] [aarch64_wheel] Fix readdir crash Probably fixes https://github.com/pytorch/pytorch/issues/111695 --- aarch64_linux/aarch64_wheel_ci_build.py | 2 ++ mkldnn_fix/aarch64-fix-readdir-crash.patch | 14 ++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 mkldnn_fix/aarch64-fix-readdir-crash.patch diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 4a8fcc8751..3fc86053c7 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -108,6 +108,8 @@ def parse_arguments(): # work around to fix Raspberry pie crash print("Applying mkl-dnn patch to fix Raspberry pie crash") os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch") + print("Applying mkl-dnn patch to fix readdir crash") + os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-readdir-crash.patch") os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") pytorch_wheel_name = complete_wheel("pytorch") print(f"Build Compelete. 
Created {pytorch_wheel_name}..") diff --git a/mkldnn_fix/aarch64-fix-readdir-crash.patch b/mkldnn_fix/aarch64-fix-readdir-crash.patch new file mode 100644 index 0000000000..81d46d4065 --- /dev/null +++ b/mkldnn_fix/aarch64-fix-readdir-crash.patch @@ -0,0 +1,14 @@ +diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp +index cb800b2509..5516373b90 100644 +--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp ++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp +@@ -170,6 +170,8 @@ int Cpu::getFilePathMaxTailNumPlus1(const char *path) { + fflush(stdout); + + DIR *dir = opendir(dir_path); ++ if (dir == NULL) ++ return 0; + struct dirent *dp; + + dp = readdir(dir); + From 13c071f2783ec4100fae94c74d2285ccedf5acb7 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 30 Oct 2023 08:48:30 -0700 Subject: [PATCH 104/212] [S3_management] generate libtorch index.html --- s3_management/manage.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index a8bda72e29..f17526f6a9 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -23,8 +23,8 @@ ACCEPTED_FILE_EXTENSIONS = ("whl", "zip", "tar.gz") ACCEPTED_SUBDIR_PATTERNS = [ - r"cu[0-9]+", # for cuda - r"rocm[0-9]+\.[0-9]+", # for rocm + r"cu[0-9]+", # for cuda + r"rocm[0-9]+\.[0-9]+", # for rocm "cpu", ] PREFIXES_WITH_HTML = { @@ -32,6 +32,8 @@ "whl/lts/1.8": "torch_lts.html", "whl/nightly": "torch_nightly.html", "whl/test": "torch_test.html", + "libtorch": "index.html", + "libtorch/nightly": "index.html", } # NOTE: This refers to the name on the wheels themselves and not the name of @@ -141,6 +143,7 @@ def extract_package_build_time(full_package_name: str) -> datetime: pass return datetime.now() + def between_bad_dates(package_build_time: datetime): start_bad = datetime(year=2022, month=8, day=17) end_bad = datetime(year=2022, month=12, day=30) @@ 
-207,7 +210,7 @@ def _resolve_subdir(self, subdir: Optional[str] = None) -> str: def gen_file_list( self, - subdir: Optional[str]=None, + subdir: Optional[str] = None, package_name: Optional[str] = None ) -> Iterable[S3Object]: objects = ( @@ -238,7 +241,7 @@ def obj_to_package_name(self, obj: S3Object) -> str: def to_legacy_html( self, - subdir: Optional[str]=None + subdir: Optional[str] = None ) -> str: """Generates a string that can be used as the HTML index @@ -277,7 +280,7 @@ def to_simple_package_html( out.append('') out.append('') out.append(' ') - out.append('

Links for {}

'.format(package_name.lower().replace("_","-"))) + out.append('

Links for {}

'.format(package_name.lower().replace("_", "-"))) for obj in sorted(self.gen_file_list(subdir, package_name)): maybe_fragment = f"#sha256={obj.checksum}" if obj.checksum else "" out.append(f' {path.basename(obj.key).replace("%2B","+")}
') @@ -370,11 +373,10 @@ def compute_sha256(self) -> None: ACL="public-read", ChecksumAlgorithm="SHA256") - @classmethod - def has_public_read(cls:Type[S3IndexType], key: str) -> bool: + def has_public_read(cls: Type[S3IndexType], key: str) -> bool: def is_all_users_group(o) -> bool: - return o.get("Grantee",{}).get("URI") == "http://acs.amazonaws.com/groups/global/AllUsers" + return o.get("Grantee", {}).get("URI") == "http://acs.amazonaws.com/groups/global/AllUsers" def can_read(o) -> bool: return o.get("Permission") in ["READ", "FULL_CONTROL"] @@ -403,13 +405,23 @@ def fetch_object_names(cls: Type[S3IndexType], prefix: str) -> List[str]: return obj_names @classmethod - def from_S3(cls: Type[S3IndexType], prefix: str) -> S3IndexType: + def from_S3(cls: Type[S3IndexType], prefix: str, with_metadata: bool = True) -> S3IndexType: prefix = prefix.rstrip("/") obj_names = cls.fetch_object_names(prefix) objects = [] - def fetch_metadata(key: str) : + + def fetch_metadata(key: str): return CLIENT.head_object(Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled") + def sanitize_key(key: str) -> str: + return key.replace("+", "%2B") + + if not with_metadata: + return cls([S3Object(key=sanitize_key(key), + orig_key=key, + checksum=None, + size=None) for key in obj_names], prefix) + with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor: # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible. 
for obj_key, future in {key: executor.submit(fetch_metadata, key) for key in obj_names}.items(): @@ -418,10 +430,9 @@ def fetch_metadata(key: str) : # For older files, rely on checksum-sha256 metadata that can be added to the file later if sha256 is None: sha256 = response.get("Metadata", {}).get("checksum-sha256") - sanitized_key = obj_key.replace("+", "%2B") size = response.get("ContentLength") s3_object = S3Object( - key=sanitized_key, + key=sanitize_key(obj_key), orig_key=obj_key, checksum=sha256, size=int(size) if size else size, @@ -465,7 +476,7 @@ def main() -> None: prefixes = PREFIXES_WITH_HTML if args.prefix == 'all' else [args.prefix] for prefix in prefixes: print(f"INFO: {action} for '{prefix}'") - idx = S3Index.from_S3(prefix=prefix) + idx = S3Index.from_S3(prefix=prefix, with_metadata=args.generate_pep503 or args.compute_sha256) if args.compute_sha256: idx.compute_sha256() elif args.do_not_upload: From 3229f7f7026dbba9565dbea3f8d6328b82eeedad Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 30 Oct 2023 13:44:14 -0700 Subject: [PATCH 105/212] [CI] Update ruff to 0.1.1 To keep it in sync with pytorch --- .lintrunner.toml | 2 +- tools/linter/adapters/ruff_linter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index 5c63a6c5d8..b7375092ae 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -15,6 +15,6 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.0.290', + 'ruff==0.1.1', ] is_formatter = true diff --git a/tools/linter/adapters/ruff_linter.py b/tools/linter/adapters/ruff_linter.py index 451834aa7c..1fb14aa59e 100644 --- a/tools/linter/adapters/ruff_linter.py +++ b/tools/linter/adapters/ruff_linter.py @@ -227,7 +227,7 @@ def check_files( "ruff", "--exit-zero", "--quiet", - "--format=json", + "--output-format=json", *([f"--config={config}"] if config else []), *filenames, ], From e9ce243cf5174035a68ec23d70156be7399ebd94 Mon Sep 17 
00:00:00 2001 From: Huy Do Date: Mon, 30 Oct 2023 17:23:15 -0700 Subject: [PATCH 106/212] Get rid of http://repo.okay.com.mx (#1575) --- manywheel/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/manywheel/Dockerfile b/manywheel/Dockerfile index 4edaef1932..1adc4ba226 100644 --- a/manywheel/Dockerfile +++ b/manywheel/Dockerfile @@ -145,7 +145,6 @@ RUN yum install -y python3-pip && \ ln -s /usr/local/bin/cmake /usr/bin/cmake # ninja -RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-5.el7.noarch.rpm RUN yum install -y ninja-build FROM cpu_final as cuda_final From d12d1f204151caa889a4df35a3f6e8c0992cf809 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 31 Oct 2023 00:48:12 +0000 Subject: [PATCH 107/212] [S3_management] Print time it takes to fetch index --- s3_management/manage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/s3_management/manage.py b/s3_management/manage.py index f17526f6a9..2b83d51744 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -476,7 +476,10 @@ def main() -> None: prefixes = PREFIXES_WITH_HTML if args.prefix == 'all' else [args.prefix] for prefix in prefixes: print(f"INFO: {action} for '{prefix}'") + stime = time.time() idx = S3Index.from_S3(prefix=prefix, with_metadata=args.generate_pep503 or args.compute_sha256) + etime = time.time() + print(f"DEBUG: Fetched {len(idx.objects)} objects for '{prefix}' in {etime-stime:.2f} seconds") if args.compute_sha256: idx.compute_sha256() elif args.do_not_upload: From 96cbf68ff44368977cd6e847ee8d1c85e3ec8b10 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 31 Oct 2023 01:00:22 +0000 Subject: [PATCH 108/212] [S3_manage] Handle invalid versions --- s3_management/manage.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 2b83d51744..412db197f9 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -12,7 +12,7 @@ from 
collections import defaultdict from typing import Iterable, List, Type, Dict, Set, TypeVar, Optional from re import sub, match, search -from packaging.version import parse +from packaging.version import parse as _parse_version, Version, InvalidVersion import boto3 @@ -150,6 +150,12 @@ def between_bad_dates(package_build_time: datetime): return start_bad <= package_build_time <= end_bad +def safe_parse_version(ver_str: str) -> Version: + try: + return _parse_version(ver_str) + except InvalidVersion: + return Version(0, 0, 0) + class S3Index: def __init__(self: S3IndexType, objects: List[S3Object], prefix: str) -> None: self.objects = objects @@ -177,7 +183,7 @@ def nightly_packages_to_show(self: S3IndexType) -> Set[S3Object]: # sorting, sorts in reverse to put the most recent versions first all_sorted_packages = sorted( {self.normalize_package_version(obj) for obj in self.objects}, - key=lambda name_ver: parse(name_ver.split('-', 1)[-1]), + key=lambda name_ver: safe_parse_version(name_ver.split('-', 1)[-1]), reverse=True, ) packages: Dict[str, int] = defaultdict(int) From ad9cc665c45b7c5069791aebcc51ca743f97bd77 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 31 Oct 2023 04:27:17 +0000 Subject: [PATCH 109/212] [S3_management] Fix Version on error And fix flake8 lint violation --- s3_management/manage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 412db197f9..11c454003f 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -154,7 +154,8 @@ def safe_parse_version(ver_str: str) -> Version: try: return _parse_version(ver_str) except InvalidVersion: - return Version(0, 0, 0) + return Version("0.0.0") + class S3Index: def __init__(self: S3IndexType, objects: List[S3Object], prefix: str) -> None: From 838c550302632d4d55cef4d8272a961312441aba Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 31 Oct 2023 04:10:44 +0000 Subject: [PATCH 110/212] [S3_Management] 
Refactor `from_S3` Move `fetch_metadata` into its own method, which could be called later on Make S3Object non-frozen and introduce implicit __hash__ method --- s3_management/manage.py | 62 ++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 11c454003f..90eee79077 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -115,7 +115,7 @@ S3IndexType = TypeVar('S3IndexType', bound='S3Index') -@dataclasses.dataclass(frozen=True) +@dataclasses.dataclass(frozen=False) @functools.total_ordering class S3Object: key: str @@ -123,6 +123,9 @@ class S3Object: checksum: Optional[str] size: Optional[int] + def __hash__(self): + return hash(self.key) + def __str__(self): return self.key @@ -157,6 +160,7 @@ def safe_parse_version(ver_str: str) -> Version: return Version("0.0.0") + class S3Index: def __init__(self: S3IndexType, objects: List[S3Object], prefix: str) -> None: self.objects = objects @@ -411,41 +415,43 @@ def fetch_object_names(cls: Type[S3IndexType], prefix: str) -> List[str]: obj_names.append(obj.key) return obj_names + def fetch_metadata(self: S3IndexType) -> None: + # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible. 
+ with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor: + for idx, future in { + idx: executor.submit( + lambda key: CLIENT.head_object( + Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled" + ), + obj.orig_key, + ) + for (idx, obj) in enumerate(self.objects) + if obj.size is None + }.items(): + response = future.result() + sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex() + # For older files, rely on checksum-sha256 metadata that can be added to the file later + if sha256 is None: + sha256 = response.get("Metadata", {}).get("checksum-sha256") + self.objects[idx].checksum = sha256 + if size := response.get("ContentLength"): + self.objects[idx].size = int(size) + @classmethod def from_S3(cls: Type[S3IndexType], prefix: str, with_metadata: bool = True) -> S3IndexType: prefix = prefix.rstrip("/") obj_names = cls.fetch_object_names(prefix) - objects = [] - - def fetch_metadata(key: str): - return CLIENT.head_object(Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled") def sanitize_key(key: str) -> str: return key.replace("+", "%2B") - if not with_metadata: - return cls([S3Object(key=sanitize_key(key), - orig_key=key, - checksum=None, - size=None) for key in obj_names], prefix) - - with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor: - # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible. 
- for obj_key, future in {key: executor.submit(fetch_metadata, key) for key in obj_names}.items(): - response = future.result() - sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex() - # For older files, rely on checksum-sha256 metadata that can be added to the file later - if sha256 is None: - sha256 = response.get("Metadata", {}).get("checksum-sha256") - size = response.get("ContentLength") - s3_object = S3Object( - key=sanitize_key(obj_key), - orig_key=obj_key, - checksum=sha256, - size=int(size) if size else size, - ) - objects.append(s3_object) - return cls(objects, prefix) + rc = cls([S3Object(key=sanitize_key(key), + orig_key=key, + checksum=None, + size=None) for key in obj_names], prefix) + if with_metadata: + rc.fetch_metadata() + return rc @classmethod def undelete_prefix(cls: Type[S3IndexType], prefix: str) -> None: From 7b1a100d1b18ccdc8a955f78b0c91b8f67986ea9 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 31 Oct 2023 04:27:56 +0000 Subject: [PATCH 111/212] [S3_Management] Filter nighly before `fetch_metadata` This reduces time to call `from_S3Index` from 600 to 80 sec --- s3_management/manage.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 90eee79077..83a383ff2f 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -172,7 +172,7 @@ def __init__(self: S3IndexType, objects: List[S3Object], prefix: str) -> None: path.dirname(obj.key) for obj in objects if path.dirname != prefix } - def nightly_packages_to_show(self: S3IndexType) -> Set[S3Object]: + def nightly_packages_to_show(self: S3IndexType) -> List[S3Object]: """Finding packages to show based on a threshold we specify Basically takes our S3 packages, normalizes the version for easier @@ -205,10 +205,10 @@ def nightly_packages_to_show(self: S3IndexType) -> Set[S3Object]: to_hide.add(obj) else: packages[package_name] += 1 - return 
set(self.objects).difference({ + return list(set(self.objects).difference({ obj for obj in self.objects if self.normalize_package_version(obj) in to_hide - }) + })) def is_obj_at_root(self, obj: S3Object) -> bool: return path.dirname(obj.key) == self.prefix @@ -224,10 +224,7 @@ def gen_file_list( subdir: Optional[str] = None, package_name: Optional[str] = None ) -> Iterable[S3Object]: - objects = ( - self.nightly_packages_to_show() if self.prefix == 'whl/nightly' - else self.objects - ) + objects = self.objects subdir = self._resolve_subdir(subdir) + '/' for obj in objects: if package_name is not None and self.obj_to_package_name(obj) != package_name: @@ -449,6 +446,8 @@ def sanitize_key(key: str) -> str: orig_key=key, checksum=None, size=None) for key in obj_names], prefix) + if prefix == "whl/nightly": + rc.objects = rc.nightly_packages_to_show() if with_metadata: rc.fetch_metadata() return rc From 02c06296b09f6792cfd0109a95ddcffdf7e173fb Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 30 Oct 2023 21:56:49 -0700 Subject: [PATCH 112/212] Add option to build -arm64- libtorch binaries --- wheel/build_wheel.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/wheel/build_wheel.sh b/wheel/build_wheel.sh index 75138fc094..e2dc089dd7 100755 --- a/wheel/build_wheel.sh +++ b/wheel/build_wheel.sh @@ -99,8 +99,10 @@ mkdir -p "$whl_tmp_dir" if [[ -n "$CROSS_COMPILE_ARM64" || $(uname -m) == "arm64" ]]; then mac_version='macosx_11_0_arm64' + libtorch_arch='-arm64-' else mac_version='macosx_10_9_x86_64' + libtorch_arch='' fi # Create a consistent wheel package name to rename the wheel to @@ -264,17 +266,19 @@ else cp -r "$(pwd)/any_wheel/torch/lib/include" "$(pwd)/libtorch/" fi cp -r "$(pwd)/any_wheel/torch/share/cmake" "$(pwd)/libtorch/share/" - if [[ -x "$(pwd)/any_wheel/torch/.dylibs/libiomp5.dylib" ]]; then - cp -r "$(pwd)/any_wheel/torch/.dylibs/libiomp5.dylib" "$(pwd)/libtorch/lib/" - else - cp -r 
"$(pwd)/any_wheel/torch/lib/libiomp5.dylib" "$(pwd)/libtorch/lib/" + if [[ "${libtorch_arch}" != "-arm64-" ]]; then + if [[ -x "$(pwd)/any_wheel/torch/.dylibs/libiomp5.dylib" ]]; then + cp -r "$(pwd)/any_wheel/torch/.dylibs/libiomp5.dylib" "$(pwd)/libtorch/lib/" + else + cp -r "$(pwd)/any_wheel/torch/lib/libiomp5.dylib" "$(pwd)/libtorch/lib/" + fi fi rm -rf "$(pwd)/any_wheel" echo $PYTORCH_BUILD_VERSION > libtorch/build-version echo "$(pushd $pytorch_rootdir && git rev-parse HEAD)" > libtorch/build-hash - zip -rq "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos-$PYTORCH_BUILD_VERSION.zip" libtorch - cp "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos-$PYTORCH_BUILD_VERSION.zip" \ - "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos-latest.zip" + zip -rq "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos${libtorch_arch}-$PYTORCH_BUILD_VERSION.zip" libtorch + cp "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos${libtorch_arch}-$PYTORCH_BUILD_VERSION.zip" \ + "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos${libtorch_arch}-latest.zip" fi From 617327ece91dc0308cb4447657ddd1e718237a56 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 31 Oct 2023 07:57:24 -0700 Subject: [PATCH 113/212] [Docker] Remove trailing whitespace And cause docker rebuild, to overwrite docker build from release/2.1 branch artifacts --- manywheel/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manywheel/Dockerfile b/manywheel/Dockerfile index 1adc4ba226..df8f5acd80 100644 --- a/manywheel/Dockerfile +++ b/manywheel/Dockerfile @@ -158,7 +158,7 @@ FROM cpu_final as rocm_final ARG ROCM_VERSION=3.7 ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} -# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0) +# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0) # find HIP works for ROCm5.7. Not needed for ROCm6.0 and above. # Remove below when ROCm5.7 is not in support matrix anymore. 
ENV ROCM_PATH /opt/rocm From 9467b4ea1d5456dc6d42bc4420792063406649c5 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 1 Nov 2023 14:55:58 -0700 Subject: [PATCH 114/212] [MacOS] Small changes to libtorch naming Intel x86 libtorch builds will have `x86_64` suffix and Apple Silicon ones will have `arm64` ones, but latest will point to Intel ones for now. --- wheel/build_wheel.sh | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/wheel/build_wheel.sh b/wheel/build_wheel.sh index e2dc089dd7..c3c45cf421 100755 --- a/wheel/build_wheel.sh +++ b/wheel/build_wheel.sh @@ -99,10 +99,10 @@ mkdir -p "$whl_tmp_dir" if [[ -n "$CROSS_COMPILE_ARM64" || $(uname -m) == "arm64" ]]; then mac_version='macosx_11_0_arm64' - libtorch_arch='-arm64-' + libtorch_arch='arm64' else mac_version='macosx_10_9_x86_64' - libtorch_arch='' + libtorch_arch='x86_64' fi # Create a consistent wheel package name to rename the wheel to @@ -266,7 +266,7 @@ else cp -r "$(pwd)/any_wheel/torch/lib/include" "$(pwd)/libtorch/" fi cp -r "$(pwd)/any_wheel/torch/share/cmake" "$(pwd)/libtorch/share/" - if [[ "${libtorch_arch}" != "-arm64-" ]]; then + if [[ "${libtorch_arch}" == "x86_64" ]]; then if [[ -x "$(pwd)/any_wheel/torch/.dylibs/libiomp5.dylib" ]]; then cp -r "$(pwd)/any_wheel/torch/.dylibs/libiomp5.dylib" "$(pwd)/libtorch/lib/" else @@ -278,7 +278,12 @@ else echo $PYTORCH_BUILD_VERSION > libtorch/build-version echo "$(pushd $pytorch_rootdir && git rev-parse HEAD)" > libtorch/build-hash - zip -rq "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos${libtorch_arch}-$PYTORCH_BUILD_VERSION.zip" libtorch - cp "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos${libtorch_arch}-$PYTORCH_BUILD_VERSION.zip" \ - "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos${libtorch_arch}-latest.zip" + zip -rq "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos-${libtorch_arch}-$PYTORCH_BUILD_VERSION.zip" libtorch + cp "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos-${libtorch_arch}-$PYTORCH_BUILD_VERSION.zip" \ + 
"$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos-${libtorch_arch}-latest.zip" + if [[ "${libtorch_arch}" == "x86_64" ]]; then + # For backward compatibility make unarched latest to point to x86_64 + cp "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos-${libtorch_arch}-$PYTORCH_BUILD_VERSION.zip" \ + "$PYTORCH_FINAL_PACKAGE_DIR/libtorch-macos-latest.zip" + fi fi From df5f7c57855603dd629eb2568c8accce6d18daf5 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Thu, 2 Nov 2023 18:03:18 -0700 Subject: [PATCH 115/212] Update libtorch/Dockerfile to use Ubuntu-20.04 (#1578) As 18.04 EOLed --- libtorch/Dockerfile | 2 +- libtorch/build_docker.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libtorch/Dockerfile b/libtorch/Dockerfile index 5d5b707af8..8d69ac9444 100644 --- a/libtorch/Dockerfile +++ b/libtorch/Dockerfile @@ -1,5 +1,5 @@ ARG BASE_TARGET=base -ARG GPU_IMAGE=ubuntu:18.04 +ARG GPU_IMAGE=ubuntu:20.04 FROM ${GPU_IMAGE} as base ENV DEBIAN_FRONTEND=noninteractive diff --git a/libtorch/build_docker.sh b/libtorch/build_docker.sh index c799bb1788..8997f69cfe 100755 --- a/libtorch/build_docker.sh +++ b/libtorch/build_docker.sh @@ -15,13 +15,13 @@ case ${GPU_ARCH_TYPE} in cpu) BASE_TARGET=cpu DOCKER_TAG=cpu - GPU_IMAGE=ubuntu:18.04 + GPU_IMAGE=ubuntu:20.04 DOCKER_GPU_BUILD_ARG="" ;; cuda) BASE_TARGET=cuda${GPU_ARCH_VERSION} DOCKER_TAG=cuda${GPU_ARCH_VERSION} - GPU_IMAGE=ubuntu:18.04 + GPU_IMAGE=ubuntu:20.04 DOCKER_GPU_BUILD_ARG="" ;; rocm) From 16b77c7152c62ef0c31ad4cadc27a1f620f40917 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Fri, 3 Nov 2023 19:04:03 -0700 Subject: [PATCH 116/212] Conda builds should respect `MAX_JOBS` May be this help with OOMs --- conda/pytorch-nightly/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 59efc729bf..4a03c3d058 100644 --- a/conda/pytorch-nightly/meta.yaml +++ 
b/conda/pytorch-nightly/meta.yaml @@ -93,6 +93,7 @@ build: - USE_PYTORCH_METAL_EXPORT # [osx] - USE_COREML_DELEGATE # [osx] - _GLIBCXX_USE_CXX11_ABI # [unix] + - MAX_JOBS # [unix] - OVERRIDE_TORCH_CUDA_ARCH_LIST - USE_CUSPARSELT From ca0040fa24f8bf1929718c255afd17a4f962a6a3 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 10 Nov 2023 13:59:17 -0800 Subject: [PATCH 117/212] [S3_management] Fix subpackage urls Make them `lower()` --- s3_management/manage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 83a383ff2f..47c151f087 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -310,7 +310,7 @@ def to_simple_packages_html( out.append('') out.append(' ') for pkg_name in sorted(self.get_package_names(subdir)): - out.append(f' {pkg_name.replace("_","-")}
') + out.append(f' {pkg_name.replace("_","-")}
') # Adding html footer out.append(' ') out.append('') From 4cfde0044e9f3b1454029c2533189743f7452d72 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 14 Nov 2023 10:52:36 -0500 Subject: [PATCH 118/212] Advance versions for release 2.1.1 (#1583) --- release/promote.sh | 10 +++++----- release/release_versions.sh | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/release/promote.sh b/release/promote.sh index b3656dda6c..a7f273bc10 100644 --- a/release/promote.sh +++ b/release/promote.sh @@ -6,11 +6,11 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" source "${DIR}/release_versions.sh" # Make sure to update these versions when doing a release first -PYTORCH_VERSION=${PYTORCH_VERSION:-2.1.0} -TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.0} -TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.0} -TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.0} -TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.0} +PYTORCH_VERSION=${PYTORCH_VERSION:-2.1.1} +TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.1} +TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.1} +TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.1} +TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} DRY_RUN=${DRY_RUN:-enabled} diff --git a/release/release_versions.sh b/release/release_versions.sh index ab35075b6f..d362cb1ca3 100644 --- a/release/release_versions.sh +++ b/release/release_versions.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash # Make sure to update these versions when doing a release first -PYTORCH_VERSION=${PYTORCH_VERSION:-2.1.0} -TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.0} -TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.0} -TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.0} -TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.0} +PYTORCH_VERSION=${PYTORCH_VERSION:-2.1.1} +TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.1} +TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.1} +TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.1} +TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} From 
75a877030d102fb04b4c3dbd08be2cefaaadd56c Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 14 Nov 2023 14:19:48 -0500 Subject: [PATCH 119/212] [aarch64] Release pypi prep script change for aarch64 builds (#1585) --- release/pypi/promote_pypi_to_staging.sh | 4 ++-- release/pypi/upload_pypi_to_staging.sh | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/release/pypi/promote_pypi_to_staging.sh b/release/pypi/promote_pypi_to_staging.sh index 678d9dd034..a9dbe535b7 100644 --- a/release/pypi/promote_pypi_to_staging.sh +++ b/release/pypi/promote_pypi_to_staging.sh @@ -34,13 +34,13 @@ PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torch "${PYTORCH_VERSION}" # m1 mac PLATFORM="linux_x86_64" VERSION_SUFFIX="${LINUX_VERSION_SUFFIX}" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" -PLATFORM="manylinux2014_aarch64" VERSION_SUFFIX="" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" +PLATFORM="linux_aarch64" VERSION_SUFFIX="" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" PLATFORM="win_amd64" VERSION_SUFFIX="${WIN_VERSION_SUFFIX}" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" PLATFORM="linux_x86_64" VERSION_SUFFIX="${LINUX_VERSION_SUFFIX}" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" -PLATFORM="manylinux2014_aarch64" VERSION_SUFFIX="" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" +PLATFORM="linux_aarch64" VERSION_SUFFIX="" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" PLATFORM="win_amd64" VERSION_SUFFIX="${WIN_VERSION_SUFFIX}" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" 
PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" diff --git a/release/pypi/upload_pypi_to_staging.sh b/release/pypi/upload_pypi_to_staging.sh index a9573eac2a..250b231f91 100644 --- a/release/pypi/upload_pypi_to_staging.sh +++ b/release/pypi/upload_pypi_to_staging.sh @@ -42,10 +42,16 @@ fi for pkg in ${pkgs_to_promote}; do pkg_basename="$(basename "${pkg}")" - # Don't attempt to change if manylinux2014 - if [[ "${pkg}" != *manylinux2014* ]]; then + + if [[ "${pkg}" != *aarch64* ]]; then # sub out linux for manylinux1 pkg_basename="$(basename "${pkg//linux/manylinux1}")" + elif [[ "${pkg}" == *manylinux_2_17_aarch64* ]]; then + # strip manylinux_2_17 from core filename + pkg_basename="$(basename "${pkg//manylinux_2_17_aarch64./}")" + elif [[ "${pkg}" == *linux_aarch64* ]]; then + # domains change linux_aarch64 to manylinux2014_aarch64 + pkg_basename="$(basename "${pkg//linux_aarch64/manylinux2014_aarch64}")" fi orig_pkg="${tmp_dir}/${pkg_basename}" ( From 8aa71bd9f28ed682443f1d3f35721f1ae3b52863 Mon Sep 17 00:00:00 2001 From: albanD Date: Wed, 15 Nov 2023 09:40:48 -0500 Subject: [PATCH 120/212] Changes needed for core enablement of 3.12 binary wheels (#1586) --- aarch64_linux/aarch64_ci_setup.sh | 7 +++++-- conda/pytorch-nightly/meta.yaml | 9 ++++++--- manywheel/build_common.sh | 3 +++ manywheel/build_cuda.sh | 4 +++- manywheel/build_rocm.sh | 4 +++- wheel/build_wheel.sh | 13 ++++++++++++- windows/condaenv.bat | 1 + windows/internal/smoke_test.bat | 1 + 8 files changed, 34 insertions(+), 8 deletions(-) diff --git a/aarch64_linux/aarch64_ci_setup.sh b/aarch64_linux/aarch64_ci_setup.sh index 6d2d780fe8..ace6a85fb7 100755 --- a/aarch64_linux/aarch64_ci_setup.sh +++ b/aarch64_linux/aarch64_ci_setup.sh @@ -19,7 +19,10 @@ curl -L -o /mambaforge.sh https://github.com/conda-forge/miniforge/releases/late chmod +x /mambaforge.sh /mambaforge.sh -b -p /opt/conda rm /mambaforge.sh -/opt/conda/bin/conda config --set ssl_verify 
False -/opt/conda/bin/conda install -y -c conda-forge python=${DESIRED_PYTHON} numpy pyyaml setuptools patchelf pygit2 openblas ninja scons +source /opt/conda/etc/profile.d/conda.sh +conda config --set ssl_verify False +conda create -y -c conda-forge -n aarch64_env python=${DESIRED_PYTHON} +conda activate aarch64_env +conda install -y -c conda-forge numpy==1.26.0 pyyaml==6.0.1 patchelf==0.17.2 pygit2==1.13.2 openblas==0.3.24 ninja==1.11.1 scons==4.5.2 python --version conda --version diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 4a03c3d058..882f13da09 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -21,7 +21,8 @@ requirements: - pyyaml {% if cross_compile_arm64 == 0 %} - mkl-include # [x86_64] - - mkl=2020.2 # [x86_64 and not win] + - mkl=2020.2 # [py <= 311 and x86_64 and not win] + - mkl=2023.1 # [py >= 312 and x86_64 and not win] - mkl=2021.4 # [x86_64 and win] {% endif %} - typing_extensions @@ -29,9 +30,11 @@ requirements: - libuv # [win] - numpy=1.19 # [py <= 39] - numpy=1.21.5 # [py == 310] - - numpy=1.23.5 # [py >= 311] + - numpy=1.23.5 # [py == 311] + - numpy=1.26.1 # [py >= 312] - openssl=1.1.1l # [py >= 38 and py <= 310 and linux] - - openssl=1.1.1s # [py >= 311 and linux] + - openssl=1.1.1s # [py == 311 and linux] + - openssl=3.1.4 # [py >= 312 and linux] {{ environ.get('PYTORCH_LLVM_PACKAGE', ' - llvmdev=9') }} {{ environ.get('MAGMA_PACKAGE', '') }} diff --git a/manywheel/build_common.sh b/manywheel/build_common.sh index f4794b7190..31f188d946 100644 --- a/manywheel/build_common.sh +++ b/manywheel/build_common.sh @@ -131,6 +131,9 @@ case ${DESIRED_PYTHON} in cp311*) retry pip install -q numpy==1.23.1 ;; + cp312*) + retry pip install -q numpy==1.26.1 + ;; # Should catch 3.9+ *) retry pip install -q numpy==1.19.4 diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index c59cbca163..31f4e263b0 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -263,7 
+263,9 @@ else fi # TODO: Remove me when Triton has a proper release channel -if [[ $(uname) == "Linux" ]]; then +# No triton dependency for now on 3.12 since we don't have binaries for it +# and torch.compile doesn't work. +if [[ $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.github/ci_commit_pins/triton.txt) if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then diff --git a/manywheel/build_rocm.sh b/manywheel/build_rocm.sh index 0fed5970b9..4fbca76970 100755 --- a/manywheel/build_rocm.sh +++ b/manywheel/build_rocm.sh @@ -222,7 +222,9 @@ if [[ $ROCM_INT -ge 50600 ]]; then fi # Add triton install dependency -if [[ $(uname) == "Linux" ]]; then +# No triton dependency for now on 3.12 since we don't have binaries for it +# and torch.compile doesn't work. +if [[ $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt) TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) diff --git a/wheel/build_wheel.sh b/wheel/build_wheel.sh index c3c45cf421..1186bc56ae 100755 --- a/wheel/build_wheel.sh +++ b/wheel/build_wheel.sh @@ -136,23 +136,33 @@ export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} SETUPTOOLS_PINNED_VERSION="=46.0.0" PYYAML_PINNED_VERSION="=5.3" EXTRA_CONDA_INSTALL_FLAGS="" -case ${desired_python} in +case $desired_python in + 3.12) + echo "Using 3.12 deps" + SETUPTOOLS_PINNED_VERSION=">=68.0.0" + PYYAML_PINNED_VERSION=">=6.0.1" + NUMPY_PINNED_VERSION="==1.26.0" + ;; 3.11) + echo "Using 3.11 deps" SETUPTOOLS_PINNED_VERSION=">=46.0.0" PYYAML_PINNED_VERSION=">=5.3" NUMPY_PINNED_VERSION="==1.23.5" ;; 3.10) + echo "Using 3.10 deps" SETUPTOOLS_PINNED_VERSION=">=46.0.0" PYYAML_PINNED_VERSION=">=5.3" NUMPY_PINNED_VERSION="=1.21.2" ;; 3.9) + echo "Using 3.9 deps" SETUPTOOLS_PINNED_VERSION=">=46.0.0" PYYAML_PINNED_VERSION=">=5.3" NUMPY_PINNED_VERSION="=1.19" ;; 3.8) + echo "Using 3.8 deps" 
if [[ "$(uname -m)" == "arm64" ]]; then SETUPTOOLS_PINNED_VERSION=">=46.0.0" PYYAML_PINNED_VERSION=">=5.3" @@ -162,6 +172,7 @@ case ${desired_python} in fi ;; *) + echo "Using default deps" NUMPY_PINNED_VERSION="=1.11.3" ;; esac diff --git a/windows/condaenv.bat b/windows/condaenv.bat index 464eeb01ce..cf1b2c8655 100644 --- a/windows/condaenv.bat +++ b/windows/condaenv.bat @@ -14,6 +14,7 @@ FOR %%v IN (%DESIRED_PYTHON%) DO ( if "%%v" == "3.9" call conda create -n py!PYTHON_VERSION_STR! -y -q numpy>=1.11 "mkl=2020.2" pyyaml boto3 cmake ninja typing_extensions python=%%v if "%%v" == "3.10" call conda create -n py!PYTHON_VERSION_STR! -y -q -c=conda-forge numpy=1.21.3 "mkl=2020.2" pyyaml boto3 "cmake=3.19.6" ninja typing_extensions python=%%v if "%%v" == "3.11" call conda create -n py!PYTHON_VERSION_STR! -y -q -c=conda-forge numpy=1.23.4 "mkl=2020.2" pyyaml boto3 "cmake=3.19.6" ninja typing_extensions python=%%v + if "%%v" == "3.12" call conda create -n py!PYTHON_VERSION_STR! -y -q -c=conda-forge numpy=1.26.0 "mkl=2023.1" pyyaml boto3 "cmake=3.19.6" ninja typing_extensions python=%%v if "%%v" == "3" call conda create -n py!PYTHON_VERSION_STR! 
-y -q numpy=1.11 "mkl=2020.2" pyyaml boto3 cmake ninja typing_extensions python=%%v ) endlocal diff --git a/windows/internal/smoke_test.bat b/windows/internal/smoke_test.bat index ad276b9928..1ade2cbda2 100644 --- a/windows/internal/smoke_test.bat +++ b/windows/internal/smoke_test.bat @@ -30,6 +30,7 @@ exit /b 1 echo "install wheel package" set PYTHON_INSTALLER_URL= +if "%DESIRED_PYTHON%" == "3.12" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.12.0/python-3.12.0-amd64.exe" if "%DESIRED_PYTHON%" == "3.11" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.11.0/python-3.11.0-amd64.exe" if "%DESIRED_PYTHON%" == "3.10" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.10.0/python-3.10.0-amd64.exe" if "%DESIRED_PYTHON%" == "3.9" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.9.0/python-3.9.0-amd64.exe" From c6cbe7700a9e361a6d33e6d5f40956806147da2d Mon Sep 17 00:00:00 2001 From: albanD Date: Wed, 15 Nov 2023 14:58:16 -0500 Subject: [PATCH 121/212] Fix aarch64 build on 3.8 (#1593) --- aarch64_linux/aarch64_ci_setup.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_ci_setup.sh b/aarch64_linux/aarch64_ci_setup.sh index ace6a85fb7..f10e49b402 100755 --- a/aarch64_linux/aarch64_ci_setup.sh +++ b/aarch64_linux/aarch64_ci_setup.sh @@ -23,6 +23,13 @@ source /opt/conda/etc/profile.d/conda.sh conda config --set ssl_verify False conda create -y -c conda-forge -n aarch64_env python=${DESIRED_PYTHON} conda activate aarch64_env -conda install -y -c conda-forge numpy==1.26.0 pyyaml==6.0.1 patchelf==0.17.2 pygit2==1.13.2 openblas==0.3.24 ninja==1.11.1 scons==4.5.2 + +if [[ "$DESIRED_PYTHON" == "3.8" ]]; then + NUMPY_VERSION="1.24.4" +else + NUMPY_VERSION="1.26.0" +fi +conda install -y -c conda-forge numpy==${NUMPY_VERSION} pyyaml==6.0.1 patchelf==0.17.2 pygit2==1.13.2 openblas==0.3.24 ninja==1.11.1 scons==4.5.2 + python --version conda --version From 
4c7fa069848aa32a24e47cae0e5996bc0eeeb70a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 15 Nov 2023 19:35:28 -0800 Subject: [PATCH 122/212] Add some more validation checks for torch.linalg.eigh and torch.compile (#1580) * Add some more validation checks for torch.linalg.eigh and torch.compile * Update test * Also update smoke_test.py * Fix lint --- check_binary.sh | 6 ++++++ test/smoke_test/smoke_test.py | 3 +++ test_example_code/torch_compile_smoke.py | 12 ++++++++++++ 3 files changed, 21 insertions(+) create mode 100644 test_example_code/torch_compile_smoke.py diff --git a/check_binary.sh b/check_binary.sh index 30b44b5350..9e7d03a548 100755 --- a/check_binary.sh +++ b/check_binary.sh @@ -404,6 +404,12 @@ if [[ "$DESIRED_CUDA" != 'cpu' && "$DESIRED_CUDA" != 'cpu-cxx11-abi' && "$DESIRE echo "Test that linalg works" python -c "import torch;x=torch.rand(3,3,device='cuda');print(torch.linalg.svd(torch.mm(x.t(), x)))" + echo "Test that linalg.eigh works" + python -c "import torch;x=torch.rand(3,3,device='cuda');print(torch.linalg.eigh(torch.mm(x.t(), x)))" + + echo "Checking that basic torch.compile works" + python ${TEST_CODE_DIR}/torch_compile_smoke.py + popd fi # if libtorch fi # if cuda diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 3d1b6af64b..64efc76016 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -193,6 +193,9 @@ def smoke_test_linalg() -> None: A = torch.randn(20, 16, 50, 100, device="cuda").type(dtype) torch.linalg.svd(A) + A = torch.rand(3, 3, device="cuda") + L, Q = torch.linalg.eigh(torch.mm(A.t(), A)) + def smoke_test_compile() -> None: supported_dtypes = [torch.float16, torch.float32, torch.float64] diff --git a/test_example_code/torch_compile_smoke.py b/test_example_code/torch_compile_smoke.py new file mode 100644 index 0000000000..7a12a013eb --- /dev/null +++ b/test_example_code/torch_compile_smoke.py @@ -0,0 +1,12 @@ +import torch + + +def foo(x: torch.Tensor) -> torch.Tensor: + 
return torch.sin(x) + torch.cos(x) + + +if __name__ == "__main__": + x = torch.rand(3, 3, device="cuda") + x_eager = foo(x) + x_pt2 = torch.compile(foo)(x) + print(torch.allclose(x_eager, x_pt2)) From b321562c29350cc195083913d2d561d820cb60b5 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 15 Nov 2023 21:06:00 -0800 Subject: [PATCH 123/212] Revert "Add some more validation checks for torch.linalg.eigh and torch.compile (#1580)" (#1594) This reverts commit 4c7fa069848aa32a24e47cae0e5996bc0eeeb70a. --- check_binary.sh | 6 ------ test/smoke_test/smoke_test.py | 3 --- test_example_code/torch_compile_smoke.py | 12 ------------ 3 files changed, 21 deletions(-) delete mode 100644 test_example_code/torch_compile_smoke.py diff --git a/check_binary.sh b/check_binary.sh index 9e7d03a548..30b44b5350 100755 --- a/check_binary.sh +++ b/check_binary.sh @@ -404,12 +404,6 @@ if [[ "$DESIRED_CUDA" != 'cpu' && "$DESIRED_CUDA" != 'cpu-cxx11-abi' && "$DESIRE echo "Test that linalg works" python -c "import torch;x=torch.rand(3,3,device='cuda');print(torch.linalg.svd(torch.mm(x.t(), x)))" - echo "Test that linalg.eigh works" - python -c "import torch;x=torch.rand(3,3,device='cuda');print(torch.linalg.eigh(torch.mm(x.t(), x)))" - - echo "Checking that basic torch.compile works" - python ${TEST_CODE_DIR}/torch_compile_smoke.py - popd fi # if libtorch fi # if cuda diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 64efc76016..3d1b6af64b 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -193,9 +193,6 @@ def smoke_test_linalg() -> None: A = torch.randn(20, 16, 50, 100, device="cuda").type(dtype) torch.linalg.svd(A) - A = torch.rand(3, 3, device="cuda") - L, Q = torch.linalg.eigh(torch.mm(A.t(), A)) - def smoke_test_compile() -> None: supported_dtypes = [torch.float16, torch.float32, torch.float64] diff --git a/test_example_code/torch_compile_smoke.py b/test_example_code/torch_compile_smoke.py deleted file mode 100644 index 
7a12a013eb..0000000000 --- a/test_example_code/torch_compile_smoke.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch - - -def foo(x: torch.Tensor) -> torch.Tensor: - return torch.sin(x) + torch.cos(x) - - -if __name__ == "__main__": - x = torch.rand(3, 3, device="cuda") - x_eager = foo(x) - x_pt2 = torch.compile(foo)(x) - print(torch.allclose(x_eager, x_pt2)) From 252463f16d20a060c51969ad4a289ceae55d28ad Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 16 Nov 2023 17:15:07 -0500 Subject: [PATCH 124/212] Release validations using release version matrix (#1611) * Release pypi prep change (#1587) * [aarch64] Release pypi prep script change for aarch64 builds * Release versions for testing Testing calling version (#1588) Upstream/release validations (#1589) * Testing calling version * add release matrix Upstream/release validations (#1590) * Testing calling version * add release matrix * test test (#1591) test (#1592) Release v1 (#1595) * test * test Release v1 (#1596) * test * test * test test (#1597) Test versions validations (#1598) * test * basedir Test versions validations (#1599) * test * basedir * test test (#1600) * test * test Add release versions everywhere (#1601) * test * test * test * test test (#1602) Test version validations (#1603) * test * test Test version validations (#1604) * test * test * test tests (#1605) More tests nov16 (#1606) * tests * test More tests nov16 (#1607) * tests * test * test More tests nov16 (#1608) * tests * test * test * test More tests nov16 (#1609) * tests * test * test * test * test * fix_lint --- .../validate-aarch64-linux-binaries.yml | 23 ++++++++++- .github/workflows/validate-binaries.yml | 27 ++++++++++++- .github/workflows/validate-linux-binaries.yml | 23 +++++++++++ .../validate-macos-arm64-binaries.yml | 22 +++++++++++ .github/workflows/validate-macos-binaries.yml | 22 +++++++++++ .../workflows/validate-windows-binaries.yml | 24 +++++++++++- test/smoke_test/smoke_test.py | 38 ++++++++++++++++++- 7 files changed, 
174 insertions(+), 5 deletions(-) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index d3e57fd5c7..14b7b6395f 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -17,6 +17,16 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string workflow_dispatch: inputs: channel: @@ -38,6 +48,16 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string jobs: generate-aarch64-linux-matrix: @@ -47,7 +67,6 @@ jobs: os: linux-aarch64 channel: ${{ inputs.channel }} with-cuda: disable - linux-aarch64: needs: generate-aarch64-linux-matrix strategy: @@ -72,6 +91,8 @@ jobs: export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="linux-aarch64" export TORCH_ONLY=${{ inputs.torchonly }} + export RELEASE_VERSION=${{ inputs.version }} + printf '%s\n' ${{ toJson(inputs.release-matrix) }} > release_matrix.json eval "$(conda shell.bash hook)" # Standart case: Validate binaries diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index 9c877f4edd..f6f73b0126 100644 --- a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -60,44 +60,69 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate' + default: "" + required: false + type: string + jobs: + generate-release-matrix: + uses: pytorch/test-infra/.github/workflows/generate_release_matrix.yml@main + with: + version: ${{ inputs.version }} + win: if: 
inputs.os == 'windows' || inputs.os == 'all' + needs: generate-release-matrix uses: ./.github/workflows/validate-windows-binaries.yml with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} torchonly: ${{ inputs.torchonly }} + version: ${{ inputs.version }} + release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} linux: if: inputs.os == 'linux' || inputs.os == 'all' + needs: generate-release-matrix uses: ./.github/workflows/validate-linux-binaries.yml with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} torchonly: ${{ inputs.torchonly }} + version: ${{ inputs.version }} + release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} linux-aarch64: if: inputs.os == 'linux-aarch64' + needs: generate-release-matrix uses: ./.github/workflows/validate-aarch64-linux-binaries.yml with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} torchonly: ${{ inputs.torchonly }} - + version: ${{ inputs.version }} + release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} mac: if: inputs.os == 'macos' || inputs.os == 'all' + needs: generate-release-matrix uses: ./.github/workflows/validate-macos-binaries.yml with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} torchonly: ${{ inputs.torchonly }} + version: ${{ inputs.version }} + release-matrix: ${{ needs.generate-rlease-matrix.outputs.matrix }} mac-arm64: if: inputs.os == 'macos' || inputs.os == 'all' + needs: generate-release-matrix uses: ./.github/workflows/validate-macos-arm64-binaries.yml with: channel: ${{ inputs.channel }} ref: ${{ inputs.ref || github.ref }} torchonly: ${{ inputs.torchonly }} + version: ${{ inputs.version }} + release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 3c5aac2ebd..dcbfc93a04 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ 
b/.github/workflows/validate-linux-binaries.yml @@ -17,6 +17,16 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string workflow_dispatch: inputs: channel: @@ -38,6 +48,16 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string jobs: generate-linux-matrix: @@ -64,8 +84,11 @@ jobs: set -ex export ENV_NAME="conda-env-${{ github.run_id }}" export TORCH_ONLY=${{ inputs.torchonly }} + export RELEASE_VERSION=${{ inputs.version }} export TARGET_OS="linux" eval "$(conda shell.bash hook)" + printf '%s\n' ${{ toJson(inputs.release-matrix) }} > release_matrix.json + cat release_matrix.json # Special case PyPi installation package. 
And Install of PyPi package via poetry if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" && ${MATRIX_GPU_ARCH_VERSION} == "12.1" ]]; then diff --git a/.github/workflows/validate-macos-arm64-binaries.yml b/.github/workflows/validate-macos-arm64-binaries.yml index f23dec3f6d..541183b9af 100644 --- a/.github/workflows/validate-macos-arm64-binaries.yml +++ b/.github/workflows/validate-macos-arm64-binaries.yml @@ -17,6 +17,16 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string workflow_dispatch: inputs: channel: @@ -38,6 +48,16 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string jobs: generate-macos-arm64-matrix: @@ -64,4 +84,6 @@ jobs: export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="macos-arm64" export TORCH_ONLY=${{ inputs.torchonly }} + export RELEASE_VERSION=${{ inputs.version }} + printf '%s\n' ${{ toJson(inputs.release-matrix) }} > release_matrix.json source ./.github/scripts/validate_binaries.sh diff --git a/.github/workflows/validate-macos-binaries.yml b/.github/workflows/validate-macos-binaries.yml index 0926dbe933..9610b36f70 100644 --- a/.github/workflows/validate-macos-binaries.yml +++ b/.github/workflows/validate-macos-binaries.yml @@ -17,6 +17,16 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string workflow_dispatch: inputs: channel: @@ -38,6 +48,16 @@ on: default: false required: false type: boolean + version: + 
description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string jobs: generate-macos-matrix: @@ -64,4 +84,6 @@ jobs: export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="macos" export TORCH_ONLY=${{ inputs.torchonly }} + export RELEASE_VERSION=${{ inputs.version }} + printf '%s\n' ${{ toJson(inputs.release-matrix) }} > release_matrix.json source ./.github/scripts/validate_binaries.sh diff --git a/.github/workflows/validate-windows-binaries.yml b/.github/workflows/validate-windows-binaries.yml index 96d2b281ee..1c501cfb39 100644 --- a/.github/workflows/validate-windows-binaries.yml +++ b/.github/workflows/validate-windows-binaries.yml @@ -17,6 +17,16 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string workflow_dispatch: inputs: channel: @@ -38,6 +48,16 @@ on: default: false required: false type: boolean + version: + description: 'Version to validate - optional' + default: "" + required: false + type: string + release-matrix: + description: 'Release matrix - optional' + default: "" + required: false + type: string jobs: generate-windows-matrix: @@ -46,7 +66,6 @@ jobs: package-type: all os: windows channel: ${{ inputs.channel }} - win: needs: generate-windows-matrix strategy: @@ -66,9 +85,10 @@ jobs: export ENV_NAME="conda-env-${{ github.run_id }}" export TARGET_OS="windows" export TORCH_ONLY=${{ inputs.torchonly }} + export RELEASE_VERSION=${{ inputs.version }} + printf '%s\n' ${{ toJson(inputs.release-matrix) }} > release_matrix.json source /c/Jenkins/Miniconda3/etc/profile.d/conda.sh if [[ ${MATRIX_GPU_ARCH_VERSION} == "12.1" ]]; then ./windows/internal/driver_update.bat fi - source 
./.github/scripts/validate_binaries.sh diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 3d1b6af64b..a4cd6dff26 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -3,18 +3,20 @@ import sys import argparse import torch +import json import importlib import subprocess import torch._dynamo import torch.nn as nn import torch.nn.functional as F +from pathlib import Path gpu_arch_ver = os.getenv("MATRIX_GPU_ARCH_VERSION") gpu_arch_type = os.getenv("MATRIX_GPU_ARCH_TYPE") channel = os.getenv("MATRIX_CHANNEL") -stable_version = os.getenv("MATRIX_STABLE_VERSION") package_type = os.getenv("MATRIX_PACKAGE_TYPE") target_os = os.getenv("TARGET_OS") +BASE_DIR = Path(__file__).parent.parent.parent is_cuda_system = gpu_arch_type == "cuda" NIGHTLY_ALLOWED_DELTA = 3 @@ -52,8 +54,27 @@ def forward(self, x): output = self.fc1(x) return output +def load_json_from_basedir(filename: str): + try: + with open(BASE_DIR / filename) as fptr: + return json.load(fptr) + except FileNotFoundError as exc: + raise ImportError(f"File {filename} not found error: {exc.strerror}") from exc + except json.JSONDecodeError as exc: + raise ImportError(f"Invalid JSON {filename}") from exc + +def read_release_matrix(): + return load_json_from_basedir("release_matrix.json") def check_version(package: str) -> None: + release_version = os.getenv("RELEASE_VERSION") + # if release_version is specified, use it to validate the packages + if(release_version): + release_matrix = read_release_matrix() + stable_version = release_matrix["torch"] + else: + stable_version = os.getenv("MATRIX_STABLE_VERSION") + # only makes sense to check nightly package where dates are known if channel == "nightly": check_nightly_binaries_date(package) @@ -62,6 +83,20 @@ def check_version(package: str) -> None: raise RuntimeError( f"Torch version mismatch, expected {stable_version} for channel {channel}. 
But its {torch.__version__}" ) + + if release_version and package == "all": + for module in MODULES: + imported_module = importlib.import_module(module["name"]) + module_version = imported_module.__version__ + if not module_version.startswith(release_matrix[module["name"]]): + raise RuntimeError( + f"{module['name']} version mismatch, expected: \ + {release_matrix[module['name']]} for channel {channel}. But its {module_version}" + ) + else: + print(f"{module['name']} version actual: {module_version} expected: \ + {release_matrix[module['name']]} for channel {channel}.") + else: print(f"Skip version check for channel {channel} as stable version is None") @@ -255,6 +290,7 @@ def main() -> None: ) options = parser.parse_args() print(f"torch: {torch.__version__}") + check_version(options.package) smoke_test_conv2d() smoke_test_linalg() From a91f149b0e4ee62c757d3a11995082617e6f9cbe Mon Sep 17 00:00:00 2001 From: Luo Bo <84075753+0x804d8000@users.noreply.github.com> Date: Fri, 17 Nov 2023 06:15:19 +0800 Subject: [PATCH 125/212] fix: typo (#1581) --- manywheel/build_libtorch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manywheel/build_libtorch.sh b/manywheel/build_libtorch.sh index 43a3334dbe..1dfbad0317 100644 --- a/manywheel/build_libtorch.sh +++ b/manywheel/build_libtorch.sh @@ -343,7 +343,7 @@ for pkg in /$LIBTORCH_HOUSE_DIR/libtorch*.zip; do fi # zip up the wheel back - zip -rq $(basename $pkg) $PREIX* + zip -rq $(basename $pkg) $PREFIX* # replace original wheel rm -f $pkg From 7e1f31c83cade651c04ee525da16dfdbe78e4bab Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Thu, 16 Nov 2023 14:39:03 -0800 Subject: [PATCH 126/212] desired_cuda -> DESIRED_CUDA (#1612) * desired_cuda -> DESIRED_CUDA Found with shellcheck * Update manywheel/build_cuda.sh Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> --------- Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> --- manywheel/build_cuda.sh | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index 31f4e263b0..768f19cee7 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -47,7 +47,7 @@ if [[ -n "$DESIRED_CUDA" ]]; then # There really has to be a better way to do this - eli # Possibly limiting builds to specific cuda versions be delimiting images would be a choice if [[ "$OS_NAME" == *"Ubuntu"* ]]; then - echo "Switching to CUDA version $desired_cuda" + echo "Switching to CUDA version ${DESIRED_CUDA}" /builder/conda/switch_cuda_version.sh "${DESIRED_CUDA}" fi else From e584c27214c97af9162923c93bf2ac421c0523b2 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 16 Nov 2023 18:24:43 -0500 Subject: [PATCH 127/212] [BE] Cleanup build unused code (#1613) 1. Upload Scripts are not used anymore. We use Github Action upload workflows 2. M1 Builds are now automated 3. build_all.bat run git grep in pytorch and builder - No result --- build_m1_domains.sh | 45 ----------------- conda/upload.sh | 28 ---------- manywheel/upload.sh | 44 ---------------- windows/build_all.bat | 104 -------------------------------------- windows/build_pytorch.bat | 2 - 5 files changed, 223 deletions(-) delete mode 100755 build_m1_domains.sh delete mode 100755 conda/upload.sh delete mode 100755 manywheel/upload.sh delete mode 100755 windows/build_all.bat diff --git a/build_m1_domains.sh b/build_m1_domains.sh deleted file mode 100755 index e574cb0054..0000000000 --- a/build_m1_domains.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/zsh -# Script used to build domain libraries wheels for M1 -source ~/miniconda3/etc/profile.d/conda.sh -set -ex -TORCH_VERSION=1.11.0 -TORCHVISION_VERSION=0.12.0 -TORCHAUDIO_VERSION=0.11.0 -TORCHTEXT_VERSION=0.12.0 - -for PYTHON_VERSION in 3.8 3.9 3.10; do - PY_VERSION=${PYTHON_VERSION/.} - conda create -yn whl-py${PY_VERSION}-torch-${TORCH_VERSION} python=${PYTHON_VERSION} numpy libpng openjpeg wheel pkg-config - conda activate 
whl-py${PY_VERSION}-torch-${TORCH_VERSION} - python3 -mpip install torch --extra-index-url=https://download.pytorch.org/whl/test torch==${TORCH_VERSION} - python3 -mpip install delocate - - pushd ~/git/pytorch/vision - git checkout release/${TORCHVISION_VERSION%.*} - rm -rf build - BUILD_VERSION=${TORCHVISION_VERSION} python3 setup.py bdist_wheel - WHL_NAME=torchvision-${TORCHVISION_VERSION}-cp${PY_VERSION}-cp${PY_VERSION}-macosx_11_0_arm64.whl - DYLD_FALLBACK_LIBRARY_PATH="$(dirname $(dirname $(which python)))/lib" delocate-wheel -v --ignore-missing-dependencies dist/${WHL_NAME} - python3 -mpip install dist/${WHL_NAME} - popd - - pushd ~/git/pytorch/audio - git checkout release/${TORCHAUDIO_VERSION%.*} - rm -rf build - BUILD_VERSION=${TORCHAUDIO_VERSION} python3 setup.py bdist_wheel - WHL_NAME=torchaudio-${TORCHAUDIO_VERSION}-cp${PY_VERSION}-cp${PY_VERSION}-macosx_11_0_arm64.whl - python3 -mpip install dist/${WHL_NAME} - popd - - pushd ~/git/pytorch/text - git checkout release/${TORCHTEXT_VERSION%.*} - rm -rf build - BUILD_VERSION=${TORCHTEXT_VERSION} python3 setup.py bdist_wheel - WHL_NAME=torchtext-${TORCHTEXT_VERSION}-cp${PY_VERSION}-cp${PY_VERSION}-macosx_11_0_arm64.whl - python3 -mpip install dist/${WHL_NAME} - popd - - python -c "import torch;import torchvision;print('Is torchvision useable?', all(x is not None for x in [torch.ops.image.decode_png, torch.ops.torchvision.roi_align]))" - python -c "import torch;import torchaudio;torchaudio.set_audio_backend('sox_io')" -done - diff --git a/conda/upload.sh b/conda/upload.sh deleted file mode 100755 index 404ee77e75..0000000000 --- a/conda/upload.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -set -ex - -# Upload linux conda packages (from inside the docker) -echo "Trying to upload conda packages from $HOST_PACKAGE_DIR" -if [[ -n "$HOST_PACKAGE_DIR" && -d "$HOST_PACKAGE_DIR" ]]; then - ls "$HOST_PACKAGE_DIR" | xargs -I {} anaconda upload "$HOST_PACKAGE_DIR"/{} -u pytorch --label main -else - echo "Couldn't find 
$HOST_PACKAGE_DIR" -fi - -# Upload linux conda packages (from outside the docker) -# This env variable should only be populated if this was called by cron/upload.sh -echo "Trying to upload conda packages from ${today}/conda_pkgs" -if [[ -n "$today" && -d "${today}/conda_pkgs" ]]; then - ls "${today}/conda_pkgs" | xargs -I {} anaconda upload "${today}/conda_pkgs"/{} -u pytorch --label main -else - echo "Couldn't find ${today}/conda_pkgs" -fi - -# Upload mac conda packages -echo "Trying to upload conda packages from $MAC_CONDA_FINAL_FOLDER" -if [[ -n "$MAC_CONDA_FINAL_FOLDER" && -d "$MAC_CONDA_FINAL_FOLDER" ]]; then - ls "$MAC_CONDA_FINAL_FOLDER" | xargs -I {} anaconda upload "$MAC_CONDA_FINAL_FOLDER"/{} -u pytorch --label main -else - echo "Couldn't find $MAC_CONDA_FINAL_FOLDER" -fi diff --git a/manywheel/upload.sh b/manywheel/upload.sh deleted file mode 100755 index a0c7b5b85f..0000000000 --- a/manywheel/upload.sh +++ /dev/null @@ -1,44 +0,0 @@ -set -ex - -# PIP_UPLOAD_FOLDER should end in a slash. This is to handle it being empty -# (when uploading to e.g. whl/cpu/) and also to handle nightlies (when -# uploading to e.g. /whl/nightly/cpu) - -if [[ -z "$PACKAGE_ROOT_DIR" ]]; then - PACKAGE_ROOT_DIR="$(pwd)" -fi - -# Upload for all CUDA/cpu versions if not given one to use -if [[ -z "$CUDA_VERSIONS" ]]; then - CUDA_VERSIONS=('cpu' 'cu90' 'cu100' 'cu101') -fi - -# Make sure the user specifically refers to an upload folder -if [[ -z "$PIP_UPLOAD_FOLDER" ]]; then - echo 'The upload folder is not set. We refuse to upload.' 
- echo 'Please set PIP_UPLOAD_FOLDER' - exit 1 -fi - -for cuda_ver in "${CUDA_VERSIONS[@]}"; do - s3_wheel_dir="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${cuda_ver}/" - s3_libtorch_dir="s3://pytorch/libtorch/${PIP_UPLOAD_FOLDER}${cuda_ver}/" - if [[ "$cuda_ver" == cpu ]]; then - wheel_dir="${PACKAGE_ROOT_DIR}/wheelhousecpu/" - libtorch_dir="${PACKAGE_ROOT_DIR}/libtorch_housecpu/" - else - wheel_dir="${PACKAGE_ROOT_DIR}/wheelhouse${cuda_ver:2:2}/" - libtorch_dir="${PACKAGE_ROOT_DIR}/libtorch_house${cuda_ver:2:2}/" - fi - - # Upload the wheels to s3 - if [[ -d "$wheel_dir" ]]; then - echo "Uploading all of: $(ls $wheel_dir) to $s3_wheel_dir" - ls "$wheel_dir" | xargs -I {} aws s3 cp "$wheel_dir"/{} "$s3_wheel_dir" --acl public-read - fi - - if [[ -d "$libtorch_dir" ]]; then - echo "Uploading all of: $(ls $libtorch_dir) to $s3_libtorch_dir" - ls "$libtorch_dir" | xargs -I {} aws s3 cp "$libtorch_dir"/{} "$s3_libtorch_dir" --acl public-read - fi -done diff --git a/windows/build_all.bat b/windows/build_all.bat deleted file mode 100755 index f60da8c763..0000000000 --- a/windows/build_all.bat +++ /dev/null @@ -1,104 +0,0 @@ -@echo off - -if "%~1"=="" goto arg_error -if "%~2"=="" goto arg_error -if NOT "%~3"=="" goto arg_error -goto arg_end - -:arg_error - -echo Illegal number of parameters. 
Pass pytorch version, build number -exit /b 1 - -:arg_end - -set PYTORCH_BUILD_VERSION=%~1 -set PYTORCH_BUILD_NUMBER=%~2 - -REM Install Miniconda3 -set "CONDA_HOME=%CD%\conda" -set "tmp_conda=%CONDA_HOME%" -set "miniconda_exe=%CD%\miniconda.exe" -rmdir /s /q conda -del miniconda.exe -curl -k https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe -o "%miniconda_exe%" -call ..\conda\install_conda.bat - -set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" -set "ORIG_PATH=%PATH%" - -conda remove -n py36 --all -y || rmdir %CONDA_HOME%\envs\py36 /s -conda remove -n py37 --all -y || rmdir %CONDA_HOME%\envs\py37 /s - -conda create -n py36 -y -q numpy=1.11 mkl=2018 pyyaml boto3 cmake ninja typing_extensions python=3.6 -conda create -n py37 -y -q numpy=1.11 mkl=2018 pyyaml boto3 cmake ninja typing_extensions python=3.7 - -REM Install MKL -rmdir /s /q mkl -del mkl_2018.2.185.7z -curl https://s3.amazonaws.com/ossci-windows/mkl_2018.2.185.7z -k -O -7z x -aoa mkl_2018.2.185.7z -omkl -set CMAKE_INCLUDE_PATH=%cd%\\mkl\\include -set LIB=%cd%\\mkl\\lib;%LIB% - -REM Download MAGMA Files -for %%p in ( - cuda80 - cuda90 - cuda92 - ) do ( - rmdir /s /q magma_%%p_release - del magma_%%p_release.7z - curl -k https://s3.amazonaws.com/ossci-windows/magma_%%p_release_mkl_2018.2.185.7z -o magma_%%p_release.7z - 7z x -aoa magma_%%p_release.7z -omagma_%%p_release - ) - -REM Install sccache -mkdir %CD%\\tmp_bin -curl -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %CD%\\tmp_bin\\sccache.exe -copy %CD%\\tmp_bin\\sccache.exe %CD%\\tmp_bin\\nvcc.exe - -set CUDA_NVCC_EXECUTABLE=%CD%\\tmp_bin\\nvcc -set "PATH=%CD%\\tmp_bin;%PATH%" - -set PYTORCH_BINARY_BUILD=1 -set TH_BINARY_BUILD=1 - -@setlocal EnableDelayedExpansion -for %%v in ( - py35 - py36 - py37 - ) do ( - REM Activate Python Environment - set "CONDA_LIB_PATH=%CONDA_HOME%\envs\%%v\Library\bin" - set 
"PATH=%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%" - pip install ninja - for %%c in ( - cpu - 80 - 90 - 92 - ) do ( - @setlocal - - REM Set Flags - if NOT "%%c"=="cpu" ( - if NOT "%%c"=="92" ( - set MAGMA_HOME=%cd%\\magma_!CUDA_PREFIX!_release - ) else ( - set MAGMA_HOME=%cd%\\magma_!CUDA_PREFIX!_release\magma_cuda92\magma\install - ) - set CUDA_VERSION=%%c - set CUDA_PREFIX=cuda!CUDA_VERSION! - ) else ( - set CUDA_PREFIX=cpu - ) - call !CUDA_PREFIX!.bat - @endlocal - ) - ) - -@endlocal - -set "PATH=%ORIG_PATH%" diff --git a/windows/build_pytorch.bat b/windows/build_pytorch.bat index 6351387357..d4a1249ed6 100644 --- a/windows/build_pytorch.bat +++ b/windows/build_pytorch.bat @@ -74,8 +74,6 @@ set LIB=%cd%\mkl\lib;%LIB% :: Download MAGMA Files on CUDA builds set MAGMA_VERSION=2.5.4 -if "%CUDA_VERSION%" == "92" set MAGMA_VERSION=2.5.2 -if "%CUDA_VERSION%" == "100" set MAGMA_VERSION=2.5.2 if "%DEBUG%" == "1" ( set BUILD_TYPE=debug From 5014b0f7698fffadd8edcb6011a72ee2395e3e0a Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 17 Nov 2023 09:25:02 -0500 Subject: [PATCH 128/212] Changes to pypi release promotion scripts introduced for 2.1.0 and 2.1.1 (#1614) * Changes topypi release promotion scripts introduced during 2.1.1 * typo --- release/pypi/promote_pypi_to_production.sh | 1 + release/pypi/promote_pypi_to_staging.sh | 18 +++++++++++++----- release/pypi/upload_pypi_to_staging.sh | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/release/pypi/promote_pypi_to_production.sh b/release/pypi/promote_pypi_to_production.sh index b517cbedb5..a14fbb6379 100644 --- a/release/pypi/promote_pypi_to_production.sh +++ b/release/pypi/promote_pypi_to_production.sh @@ -34,3 +34,4 @@ promote_staging_binaries torch "${PYTORCH_VERSION}" promote_staging_binaries torchvision "${TORCHVISION_VERSION}" promote_staging_binaries torchaudio "${TORCHAUDIO_VERSION}" promote_staging_binaries torchtext 
"${TORCHTEXT_VERSION}" +promote_staging_binaries torchdata "${TORCHDATA_VERSION}" diff --git a/release/pypi/promote_pypi_to_staging.sh b/release/pypi/promote_pypi_to_staging.sh index a9dbe535b7..46cd958cd4 100644 --- a/release/pypi/promote_pypi_to_staging.sh +++ b/release/pypi/promote_pypi_to_staging.sh @@ -23,26 +23,34 @@ upload_pypi_to_staging() { # Uncomment these to promote to pypi PYTORCH_LINUX_VERSION_SUFFIX="%2Bcu121.with.pypi.cudnn" LINUX_VERSION_SUFFIX="%2Bcu121" -WIN_VERSION_SUFFIX="%2Bcpu" +CPU_VERSION_SUFFIX="%2Bcpu" MACOS_X86_64="macosx_.*_x86_64" MACOS_ARM64="macosx_.*_arm64" PLATFORM="linux_x86_64" VERSION_SUFFIX="${PYTORCH_LINUX_VERSION_SUFFIX}" upload_pypi_to_staging torch "${PYTORCH_VERSION}" PLATFORM="manylinux2014_aarch64" VERSION_SUFFIX="" upload_pypi_to_staging torch "${PYTORCH_VERSION}" -PLATFORM="win_amd64" VERSION_SUFFIX="${WIN_VERSION_SUFFIX}" upload_pypi_to_staging torch "${PYTORCH_VERSION}" +PLATFORM="win_amd64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torch "${PYTORCH_VERSION}" PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torch "${PYTORCH_VERSION}" # intel mac PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torch "${PYTORCH_VERSION}" # m1 mac PLATFORM="linux_x86_64" VERSION_SUFFIX="${LINUX_VERSION_SUFFIX}" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" PLATFORM="linux_aarch64" VERSION_SUFFIX="" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" -PLATFORM="win_amd64" VERSION_SUFFIX="${WIN_VERSION_SUFFIX}" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" +PLATFORM="win_amd64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchvision "${TORCHVISION_VERSION}" PLATFORM="linux_x86_64" VERSION_SUFFIX="${LINUX_VERSION_SUFFIX}" 
upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" PLATFORM="linux_aarch64" VERSION_SUFFIX="" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" -PLATFORM="win_amd64" VERSION_SUFFIX="${WIN_VERSION_SUFFIX}" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" +PLATFORM="win_amd64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchaudio "${TORCHAUDIO_VERSION}" -upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" +PLATFORM="linux_x86" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" +PLATFORM="win_amd64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" +PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" +PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" + +PLATFORM="manylinux2014_x86_64" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" +PLATFORM="win_amd64" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" +PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" +PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" diff --git a/release/pypi/upload_pypi_to_staging.sh b/release/pypi/upload_pypi_to_staging.sh index 250b231f91..b1a7ddf6d7 100644 --- a/release/pypi/upload_pypi_to_staging.sh +++ b/release/pypi/upload_pypi_to_staging.sh @@ -43,7 +43,7 @@ fi for pkg in ${pkgs_to_promote}; do pkg_basename="$(basename "${pkg}")" - if [[ "${pkg}" != *aarch64* ]]; then + if [[ "${pkg}" != *aarch64* && "${pkg}" != *torchdata* ]]; then # sub out linux for manylinux1 pkg_basename="$(basename 
"${pkg//linux/manylinux1}")" elif [[ "${pkg}" == *manylinux_2_17_aarch64* ]]; then From c3e0f559d5eeeea3546ec1357112fffa9d5452b2 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Fri, 17 Nov 2023 09:21:36 -0800 Subject: [PATCH 129/212] Pin miniconda version for Windows To Miniconda3-py311_23.9.0-0-Windows-x86_64.exe --- windows/build_pytorch.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/windows/build_pytorch.bat b/windows/build_pytorch.bat index d4a1249ed6..37e19f9339 100644 --- a/windows/build_pytorch.bat +++ b/windows/build_pytorch.bat @@ -44,7 +44,7 @@ set "tmp_conda=%CONDA_HOME%" set "miniconda_exe=%CD%\miniconda.exe" rmdir /s /q conda del miniconda.exe -curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o "%miniconda_exe%" +curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Windows-x86_64.exe -o "%miniconda_exe%" call ..\conda\install_conda.bat if ERRORLEVEL 1 exit /b 1 set "ORIG_PATH=%PATH%" From 0cd5228907e627658a24c414b3c8d054e77fed99 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 21 Nov 2023 18:52:37 +0000 Subject: [PATCH 130/212] Fix poetry and pypi validations when version is specified (#1622) * test (#1617) Fix validations (#1618) * test * poetry_fix * test Fix validations (#1619) * test * poetry_fix * test * test * restrict --- .github/scripts/validate_pipy.sh | 12 ++++++--- .github/scripts/validate_poetry.sh | 25 ++++++------------- .github/workflows/validate-binaries.yml | 2 +- .github/workflows/validate-linux-binaries.yml | 5 +++- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/.github/scripts/validate_pipy.sh b/.github/scripts/validate_pipy.sh index ed79150799..d7a9308990 100644 --- a/.github/scripts/validate_pipy.sh +++ b/.github/scripts/validate_pipy.sh @@ -2,15 +2,21 @@ conda create -yn ${ENV_NAME}_pypi python=${MATRIX_PYTHON_VERSION} numpy ffmpeg conda activate ${ENV_NAME}_pypi 
TEST_SUFFIX="" +RELEASE_SUFFIX="" +# if RELESE version is passed as parameter - install speific version +if [[ ! -z ${RELEASE_VERSION} ]]; then + RELEASE_SUFFIX="==${RELEASE_VERSION}" +fi + if [[ ${TORCH_ONLY} == 'true' ]]; then TEST_SUFFIX=" --package torchonly" - pip3 install --pre torch --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + pip3 install --pre torch${RELEASE_SUFFIX} --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" else if [[ ${MATRIX_CHANNEL} != "release" ]]; then - pip3 install --pre torch --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + pip3 install --pre torch${RELEASE_SUFFIX} --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" pip3 install --pre torchvision torchaudio --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" else - pip3 install torch torchvision torchaudio + pip3 install torch${RELEASE_SUFFIX} torchvision torchaudio fi fi diff --git a/.github/scripts/validate_poetry.sh b/.github/scripts/validate_poetry.sh index 3c41d5b452..6b7fe2412b 100644 --- a/.github/scripts/validate_poetry.sh +++ b/.github/scripts/validate_poetry.sh @@ -13,25 +13,16 @@ if [[ ${TORCH_ONLY} == 'true' ]]; then TEST_SUFFIX=" --package torchonly" fi -if [[ ${MATRIX_CHANNEL} != "release" ]]; then - # Installing poetry from our custom repo. 
We need to configure it before use and disable authentication - export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring - poetry source add --priority=explicit domains "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" - poetry source add --priority=supplemental pytorch-channel "https://download.pytorch.org/whl/${MATRIX_CHANNEL}" - poetry source add --priority=supplemental pytorch "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" - poetry --quiet add --source pytorch torch +RELEASE_SUFFIX="" +# if RELESE version is passed as parameter - install speific version +if [[ ! -z ${RELEASE_VERSION} ]]; then + RELEASE_SUFFIX="@${RELEASE_VERSION}" +fi - if [[ ${TORCH_ONLY} != 'true' ]]; then - poetry --quiet add --source domains torchvision torchaudio - fi +if [[ ${TORCH_ONLY} == 'true' ]]; then + poetry --quiet add torch${RELEASE_SUFFIX} else - export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring - poetry source add --priority=explicit pytorch "https://download.pytorch.org/whl/${MATRIX_DESIRED_CUDA}" - if [[ ${TORCH_ONLY} == 'true' ]]; then - poetry --quiet add torch - else - poetry --quiet add --source pytorch torch torchaudio torchvision - fi + poetry --quiet add torch${RELEASE_SUFFIX} torchaudio torchvision fi python ../test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index f6f73b0126..fee16dca9d 100644 --- a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -114,7 +114,7 @@ jobs: ref: ${{ inputs.ref || github.ref }} torchonly: ${{ inputs.torchonly }} version: ${{ inputs.version }} - release-matrix: ${{ needs.generate-rlease-matrix.outputs.matrix }} + release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} mac-arm64: if: inputs.os == 'macos' || inputs.os == 'all' diff --git a/.github/workflows/validate-linux-binaries.yml 
b/.github/workflows/validate-linux-binaries.yml index dcbfc93a04..aedffeef26 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -93,7 +93,10 @@ jobs: # Special case PyPi installation package. And Install of PyPi package via poetry if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" && ${MATRIX_GPU_ARCH_VERSION} == "12.1" ]]; then source ./.github/scripts/validate_pipy.sh - source ./.github/scripts/validate_poetry.sh + + if [[ ${MATRIX_CHANNEL} == "release" ]]; then + source ./.github/scripts/validate_poetry.sh + fi fi # Standart case: Validate binaries From 4db3d68b5a4c3b87148ba394d31960ff805c3850 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 21 Nov 2023 23:42:20 +0000 Subject: [PATCH 131/212] Validate pypi build only for release (#1623) --- .github/scripts/validate_pipy.sh | 9 ++------- .github/workflows/validate-linux-binaries.yml | 7 ++----- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/.github/scripts/validate_pipy.sh b/.github/scripts/validate_pipy.sh index d7a9308990..5858e4c282 100644 --- a/.github/scripts/validate_pipy.sh +++ b/.github/scripts/validate_pipy.sh @@ -10,14 +10,9 @@ fi if [[ ${TORCH_ONLY} == 'true' ]]; then TEST_SUFFIX=" --package torchonly" - pip3 install --pre torch${RELEASE_SUFFIX} --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" + pip3 install torch${RELEASE_SUFFIX} else - if [[ ${MATRIX_CHANNEL} != "release" ]]; then - pip3 install --pre torch${RELEASE_SUFFIX} --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}_pypi_cudnn" - pip3 install --pre torchvision torchaudio --extra-index-url "https://download.pytorch.org/whl/${MATRIX_CHANNEL}/${MATRIX_DESIRED_CUDA}" - else - pip3 install torch${RELEASE_SUFFIX} torchvision torchaudio - fi + pip3 install torch${RELEASE_SUFFIX} torchvision torchaudio fi python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} 
--runtime-error-check disabled diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index aedffeef26..1b84eaa317 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -91,12 +91,9 @@ jobs: cat release_matrix.json # Special case PyPi installation package. And Install of PyPi package via poetry - if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" && ${MATRIX_GPU_ARCH_VERSION} == "12.1" ]]; then + if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" && ${MATRIX_GPU_ARCH_VERSION} == "12.1" && ${MATRIX_CHANNEL} == "release"]]; then source ./.github/scripts/validate_pipy.sh - - if [[ ${MATRIX_CHANNEL} == "release" ]]; then - source ./.github/scripts/validate_poetry.sh - fi + source ./.github/scripts/validate_poetry.sh fi # Standart case: Validate binaries From 56556d0aaca4da61c0497608b9136b058573c8d6 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 21 Nov 2023 23:52:33 +0000 Subject: [PATCH 132/212] Validate pypi build only for release (#1624) --- .github/workflows/validate-linux-binaries.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 1b84eaa317..d1c6c29bd0 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -88,10 +88,9 @@ jobs: export TARGET_OS="linux" eval "$(conda shell.bash hook)" printf '%s\n' ${{ toJson(inputs.release-matrix) }} > release_matrix.json - cat release_matrix.json # Special case PyPi installation package. 
And Install of PyPi package via poetry - if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" && ${MATRIX_GPU_ARCH_VERSION} == "12.1" && ${MATRIX_CHANNEL} == "release"]]; then + if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" && ${MATRIX_GPU_ARCH_VERSION} == "12.1" && ${MATRIX_CHANNEL} == "release" ]]; then source ./.github/scripts/validate_pipy.sh source ./.github/scripts/validate_poetry.sh fi From 12f6acd63456dafae70b6273659edf557d68c9fa Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 21 Nov 2023 17:09:20 -0800 Subject: [PATCH 133/212] [Manywheel] Do not hardcode triton version --- manywheel/build_cuda.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index 768f19cee7..b09abfea44 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -267,11 +267,12 @@ fi # and torch.compile doesn't work. if [[ $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.github/ci_commit_pins/triton.txt) + TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="pytorch-triton==2.1.0+${TRITON_SHORTHASH}" + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}" else - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | pytorch-triton==2.1.0+${TRITON_SHORTHASH}" + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}" fi fi From fb8aae9a0b7badf574b5fd2f436024f6b10d8c14 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 21 Nov 2023 17:12:46 -0800 Subject: [PATCH 134/212] [Manywheel][BE] Dedup Triton requirement spec --- manywheel/build_cuda.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index b09abfea44..4938ffef22 
100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -268,11 +268,12 @@ fi if [[ $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.github/ci_commit_pins/triton.txt) TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) + TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}" if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}" + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" else - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}" + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" fi fi From 2a750ebb877695947146a307b22cf27b6c0d0302 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 21 Nov 2023 22:43:00 -0800 Subject: [PATCH 135/212] [Manywheel] Restrict `pytorch-triton` to x86-64 Linux Partially addresses https://github.com/pytorch/pytorch/issues/114042 --- manywheel/build_cuda.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index 4938ffef22..9919247ed0 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -268,7 +268,7 @@ fi if [[ $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.github/ci_commit_pins/triton.txt) TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) - TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}" + TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}; platform_system == 'Linux' and platform_machine == 'x86_64'" if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" From 6b8c73fecb93f06f3c18364cec9d3714f99bc479 
Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:39:22 -0800 Subject: [PATCH 136/212] Tweak py312 conda requirements --- conda/pytorch-nightly/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 882f13da09..f51339d532 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -31,10 +31,10 @@ requirements: - numpy=1.19 # [py <= 39] - numpy=1.21.5 # [py == 310] - numpy=1.23.5 # [py == 311] - - numpy=1.26.1 # [py >= 312] + - numpy=1.26.0 # [py >= 312] - openssl=1.1.1l # [py >= 38 and py <= 310 and linux] - openssl=1.1.1s # [py == 311 and linux] - - openssl=3.1.4 # [py >= 312 and linux] + - openssl=3.0.12 # [py >= 312 and linux] {{ environ.get('PYTORCH_LLVM_PACKAGE', ' - llvmdev=9') }} {{ environ.get('MAGMA_PACKAGE', '') }} From 3c7404d80c24d2b59f0a15d818eff2806b19f216 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:41:02 -0800 Subject: [PATCH 137/212] Build PyTorch without TLS for 3.12 Because GLOO still expect OpenSSL-1, but 3.12 is build with OpenSSL-3 --- conda/build_pytorch.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 029372303b..77789a17b2 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -303,6 +303,11 @@ fi # Loop through all Python versions to build a package for each for py_ver in "${DESIRED_PYTHON[@]}"; do + # TODO: Enable TLS support for 3.12 builds (or disable it for the rest + if [[ "$(uname)" == 'Linux' && "${py_ver}" == '3.12' ]]; then + export USE_GLOO_WITH_OPENSSL=0 + fi + build_string="py${py_ver}_${build_string_suffix}" folder_tag="${build_string}_$(date +'%Y%m%d')" From 88457a11da457ad52c2a8847503cfb39f8071c49 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Wed, 29 Nov 
2023 20:34:48 -0800 Subject: [PATCH 138/212] [conda] Skip sympy for 3.12 As at the moment it is only available for Windows %) --- conda/pytorch-nightly/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index f51339d532..16cc1bd83f 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -49,7 +49,7 @@ requirements: # Before a decent fix, force llvm-openmp version <16. - llvm-openmp <16 # [linux] - typing_extensions - - sympy + - sympy # [py < 312] - filelock - networkx - jinja2 From ca378c16f88781e92a7005c65a3473e3ddca5be3 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Wed, 29 Nov 2023 20:37:57 -0800 Subject: [PATCH 139/212] [conda] Do not depend on triton for 3.12 yet --- conda/build_pytorch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 77789a17b2..4c2e4836b1 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -276,7 +276,7 @@ else if [[ "$OSTYPE" != "msys" ]]; then # TODO: Remove me when Triton has a proper release channel TRITON_SHORTHASH=$(cut -c1-10 $pytorch_rootdir/.github/ci_commit_pins/triton.txt) - export CONDA_TRITON_CONSTRAINT=" - torchtriton==2.1.0+${TRITON_SHORTHASH}" + export CONDA_TRITON_CONSTRAINT=" - torchtriton==2.1.0+${TRITON_SHORTHASH} # [py < 312]" fi build_string_suffix="cuda${CUDA_VERSION}_cudnn${CUDNN_VERSION}_${build_string_suffix}" From fc773dde9764b1be1a1a0e8a1b1df581582013ec Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 30 Nov 2023 14:47:10 -0800 Subject: [PATCH 140/212] Tweak mkl requirements for win+py312 --- conda/pytorch-nightly/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 16cc1bd83f..93ec533a93 100644 --- a/conda/pytorch-nightly/meta.yaml +++ 
b/conda/pytorch-nightly/meta.yaml @@ -22,8 +22,8 @@ requirements: {% if cross_compile_arm64 == 0 %} - mkl-include # [x86_64] - mkl=2020.2 # [py <= 311 and x86_64 and not win] - - mkl=2023.1 # [py >= 312 and x86_64 and not win] - - mkl=2021.4 # [x86_64 and win] + - mkl=2023.1 # [py >= 312 and x86_64] + - mkl=2021.4 # [x86_64 and win and py <= 311] {% endif %} - typing_extensions - ninja From 8e5151cb2beb7c7c8fda1b64eaf468e5ec585fb5 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 1 Dec 2023 10:23:03 -0800 Subject: [PATCH 141/212] Add aarch64 conda env lib to LD_LIBRARY_PATH (#1628) After the change on https://github.com/pytorch/builder/pull/1586, nightly aarch64 wheel fails to find `libopenblas.so` which is now installed under `/opt/conda/envs/aarch64_env/lib/` instead of the base conda `/opt/conda/lib`. Using CPU nightly wheels on aarch64 from Nov 16 then ends up with the error as described in https://github.com/pytorch/pytorch/issues/114862: `Calling torch.geqrf on a CPU tensor requires compiling PyTorch with LAPACK. Please use PyTorch built with LAPACK support`. The error can be found on night build log https://github.com/pytorch/pytorch/actions/runs/6887666324/job/18735230109#step:15:4933 Fixes https://github.com/pytorch/pytorch/issues/114862 I double check `2.1.[0-1]` and the current RC for 2.1.2, the issue is not there because https://github.com/pytorch/builder/pull/1586 only change builder main, thus impacting nightly. ### Testing Build nightly wheel manually on aarch64 runner and confirm that openblas is detected correctly: ``` -- Found a library with BLAS API (open). Full path: (/opt/conda/envs/aarch64_env/lib/libopenblas.so) ... -- USE_BLAS : 1 -- BLAS : open -- BLAS_HAS_SBGEMM : -- USE_LAPACK : 1 -- LAPACK : open ... 
``` --- aarch64_linux/aarch64_ci_setup.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/aarch64_linux/aarch64_ci_setup.sh b/aarch64_linux/aarch64_ci_setup.sh index f10e49b402..88029c9405 100755 --- a/aarch64_linux/aarch64_ci_setup.sh +++ b/aarch64_linux/aarch64_ci_setup.sh @@ -6,8 +6,9 @@ set -eux -o pipefail CONDA_PYTHON_EXE=/opt/conda/bin/python CONDA_EXE=/opt/conda/bin/conda +CONDA_ENV_NAME=aarch64_env PATH=/opt/conda/bin:$PATH -LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH +LD_LIBRARY_PATH=/opt/conda/envs/${CONDA_ENV_NAME}/lib/:/opt/conda/lib:$LD_LIBRARY_PATH ############################################################################### # Install conda @@ -21,8 +22,8 @@ chmod +x /mambaforge.sh rm /mambaforge.sh source /opt/conda/etc/profile.d/conda.sh conda config --set ssl_verify False -conda create -y -c conda-forge -n aarch64_env python=${DESIRED_PYTHON} -conda activate aarch64_env +conda create -y -c conda-forge -n "${CONDA_ENV_NAME}" python=${DESIRED_PYTHON} +conda activate "${CONDA_ENV_NAME}" if [[ "$DESIRED_PYTHON" == "3.8" ]]; then NUMPY_VERSION="1.24.4" From 6ce30be4cb65075cedd69e8460cb4529dd9c7f39 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 1 Dec 2023 18:46:47 -0800 Subject: [PATCH 142/212] Revert "[conda] Skip sympy for 3.12" This reverts commit 88457a11da457ad52c2a8847503cfb39f8071c49. As sympy has been updated to 1.12 and it now supports Python-3.12 --- conda/pytorch-nightly/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 93ec533a93..05be9c7d76 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -49,7 +49,7 @@ requirements: # Before a decent fix, force llvm-openmp version <16. 
- llvm-openmp <16 # [linux] - typing_extensions - - sympy # [py < 312] + - sympy - filelock - networkx - jinja2 From b92da8cd64c9db81effcb8a48992e69ecb4d6346 Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Fri, 1 Dec 2023 21:00:56 -0600 Subject: [PATCH 143/212] [aarch64] ACL, OpenBLAS and mkldnn updates for PyTorch 2.2 (#1627) Note# ~~This PR has a dependency on updating the oneDNN version to v3.3 (via ideep submodule to v3.3)~~ ideep submodule update is done, so, this PR can be merged anytime now. This PR is for: ACL - build with fixed format kernels OpenBLAS - upgrade the version to 0.3.25 numpy - upgrade version to 1.26.2 and mkldnn - cleanup the patches that are already upstreamed. --- aarch64_linux/aarch64_ci_setup.sh | 4 +-- aarch64_linux/aarch64_wheel_ci_build.py | 6 ++-- aarch64_linux/build_aarch64_wheel.py | 7 ++--- ...4-fix-default-build-flags-to-armv8-a.patch | 29 ------------------- 4 files changed, 7 insertions(+), 39 deletions(-) delete mode 100644 mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch diff --git a/aarch64_linux/aarch64_ci_setup.sh b/aarch64_linux/aarch64_ci_setup.sh index 88029c9405..53c8a5320c 100755 --- a/aarch64_linux/aarch64_ci_setup.sh +++ b/aarch64_linux/aarch64_ci_setup.sh @@ -28,9 +28,9 @@ conda activate "${CONDA_ENV_NAME}" if [[ "$DESIRED_PYTHON" == "3.8" ]]; then NUMPY_VERSION="1.24.4" else - NUMPY_VERSION="1.26.0" + NUMPY_VERSION="1.26.2" fi -conda install -y -c conda-forge numpy==${NUMPY_VERSION} pyyaml==6.0.1 patchelf==0.17.2 pygit2==1.13.2 openblas==0.3.24 ninja==1.11.1 scons==4.5.2 +conda install -y -c conda-forge numpy==${NUMPY_VERSION} pyyaml==6.0.1 patchelf==0.17.2 pygit2==1.13.2 openblas==0.3.25 ninja==1.11.1 scons==4.5.2 python --version conda --version diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 3fc86053c7..3b772847c5 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ 
b/aarch64_linux/aarch64_wheel_ci_build.py @@ -20,9 +20,9 @@ def build_ArmComputeLibrary(git_clone_flags: str = "") -> None: ''' print('Building Arm Compute Library') os.system("cd / && mkdir /acl") - os.system(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.05.1 {git_clone_flags}") + os.system(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}") os.system("cd ComputeLibrary; export acl_install_dir=/acl; " - "scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 build=native build_dir=$acl_install_dir/build; " + "scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native build_dir=$acl_install_dir/build; " "cp -r arm_compute $acl_install_dir; " "cp -r include $acl_install_dir; " "cp -r utils $acl_install_dir; " @@ -106,8 +106,6 @@ def parse_arguments(): print("build pytorch without mkldnn backend") # work around to fix Raspberry pie crash - print("Applying mkl-dnn patch to fix Raspberry pie crash") - os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch") print("Applying mkl-dnn patch to fix readdir crash") os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-readdir-crash.patch") os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index dd43bf2188..9efd2e6ae5 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -219,15 +219,15 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None: def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None: print('Building OpenBLAS') - host.run_cmd(f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.20 {git_clone_flags}") + host.run_cmd(f"git 
clone https://github.com/xianyi/OpenBLAS -b v0.3.25 {git_clone_flags}") make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8" host.run_cmd(f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS") def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None: print('Building Arm Compute Library') - acl_build_flags="debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 build=native" - host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.05.1 {git_clone_flags}") + acl_build_flags="debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native" + host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}") host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") @@ -555,7 +555,6 @@ def start_build(host: RemoteHost, *, print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" host.run_cmd(f"cd $HOME && git clone https://github.com/pytorch/builder.git") - host.run_cmd(f"cd $HOME/pytorch/third_party/ideep/mkl-dnn && patch -p1 < $HOME/builder/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch") host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}") print('Repair the wheel') pytorch_wheel_name = host.list_dir("pytorch/dist")[0] diff --git a/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch b/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch deleted file mode 100644 index f6e91010ab..0000000000 --- a/mkldnn_fix/aarch64-fix-default-build-flags-to-armv8-a.patch +++ /dev/null @@ -1,29 +0,0 @@ ---- - cmake/platform.cmake | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/cmake/platform.cmake b/cmake/platform.cmake -index 8630460ce..602eafe8e 100644 
---- a/cmake/platform.cmake -+++ b/cmake/platform.cmake -@@ -198,7 +198,7 @@ elseif(UNIX OR MINGW) - endif() - # For native compilation tune for the host processor - if (CMAKE_SYSTEM_PROCESSOR STREQUAL CMAKE_HOST_SYSTEM_PROCESSOR) -- append(DEF_ARCH_OPT_FLAGS "-mcpu=native") -+ append(DEF_ARCH_OPT_FLAGS "-march=armv8-a") - endif() - elseif(DNNL_TARGET_ARCH STREQUAL "PPC64") - if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") -@@ -295,7 +295,7 @@ elseif(UNIX OR MINGW) - endif() - # For native compilation tune for the host processor - if (CMAKE_SYSTEM_PROCESSOR STREQUAL CMAKE_HOST_SYSTEM_PROCESSOR) -- append(DEF_ARCH_OPT_FLAGS "-mcpu=native") -+ append(DEF_ARCH_OPT_FLAGS "-march=armv8-a") - endif() - elseif(DNNL_TARGET_ARCH STREQUAL "PPC64") - if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") --- -2.34.1 - From 3723ee76b112d63b6623103876d936e95ad98876 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 6 Dec 2023 23:03:02 +0000 Subject: [PATCH 144/212] Validation scripts, install using version (#1633) --- .github/scripts/validate_binaries.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 0c01dbca22..acdcef5e58 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -12,6 +12,13 @@ else TEST_SUFFIX=" --package torchonly" fi + # if RELESE version is passed as parameter - install speific version + if [[ ! -z ${RELEASE_VERSION} ]]; then + INSTALLATION=${INSTALLATION/"torch "/"torch==${RELEASE_VERSION} "} + INSTALLATION=${INSTALLATION/"-y pytorch "/"-y pytorch==${RELEASE_VERSION} "} + INSTALLATION=${INSTALLATION/"::pytorch "/"::pytorch==${RELEASE_VERSION} "} + fi + export OLD_PATH=${PATH} # Workaround macos-arm64 runners. 
Issue: https://github.com/pytorch/test-infra/issues/4342 if [[ ${TARGET_OS} == 'macos-arm64' ]]; then From 60169e3cdd1e3b4555b4386cc62f85b6564c3f01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ionu=C8=9B=20Man=C8=9Ba?= Date: Fri, 8 Dec 2023 07:09:30 +0200 Subject: [PATCH 145/212] Test Windows static lib (#1465) Add support for testing Windows Cuda static lib --- test_example_code/CMakeLists.txt | 26 +++++ windows/internal/static_lib_test.bat | 137 +++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 test_example_code/CMakeLists.txt create mode 100644 windows/internal/static_lib_test.bat diff --git a/test_example_code/CMakeLists.txt b/test_example_code/CMakeLists.txt new file mode 100644 index 0000000000..1724a6ed01 --- /dev/null +++ b/test_example_code/CMakeLists.txt @@ -0,0 +1,26 @@ +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +project(simple-torch-test) + +find_package(Torch REQUIRED) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") + + +add_executable(simple-torch-test simple-torch-test.cpp) +target_include_directories(simple-torch-test PRIVATE ${TORCH_INCLUDE_DIRS}) +target_link_libraries(simple-torch-test "${TORCH_LIBRARIES}") +set_property(TARGET simple-torch-test PROPERTY CXX_STANDARD 17) + +find_package(CUDAToolkit 11.8) + +target_link_libraries(simple-torch-test CUDA::cudart CUDA::cufft CUDA::cusparse CUDA::cublas CUDA::cusolver) +find_library(CUDNN_LIBRARY NAMES cudnn) +target_link_libraries(simple-torch-test ${CUDNN_LIBRARY} ) +if (MSVC) + file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll" "$ENV{NVTOOLSEXT_PATH}/bin/x64/*.dll") + message("dlls to copy " ${TORCH_DLLS}) + add_custom_command(TARGET simple-torch-test + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${TORCH_DLLS} + $<TARGET_FILE_DIR:simple-torch-test>) +endif (MSVC) diff --git a/windows/internal/static_lib_test.bat b/windows/internal/static_lib_test.bat new file mode 100644 index 0000000000..aa15dd68f3 --- /dev/null +++ b/windows/internal/static_lib_test.bat @@ -0,0
+1,137 @@ +set SRC_DIR=%~dp0 + +pushd %SRC_DIR%\.. + +if "%CUDA_VERSION%" == "cpu" call internal\driver_update.bat +if errorlevel 1 exit /b 1 + +call internal\cuda_install.bat +set LIB=%CUDA_PATH%\lib\x64;%LIB% +if errorlevel 1 exit /b 1 +set "ORIG_PATH=%PATH%" + +setlocal EnableDelayedExpansion +set NVIDIA_GPU_EXISTS=0 +for /F "delims=" %%i in ('wmic path win32_VideoController get name') do ( + set GPUS=%%i + if not "x!GPUS:NVIDIA=!" == "x!GPUS!" ( + SET NVIDIA_GPU_EXISTS=1 + goto gpu_check_end + ) +) +:gpu_check_end +endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS% + +:: Download MAGMA Files on CUDA builds +set MAGMA_VERSION=2.5.4 +set CUDA_PREFIX=cuda%CUDA_VERSION% +if "%CUDA_VERSION%" == "92" set MAGMA_VERSION=2.5.2 +if "%CUDA_VERSION%" == "100" set MAGMA_VERSION=2.5.2 + +if "%DEBUG%" == "1" ( + set BUILD_TYPE=debug +) else ( + set BUILD_TYPE=release +) + +if not "%CUDA_VERSION%" == "cpu" ( + rmdir /s /q magma_%CUDA_PREFIX%_%BUILD_TYPE% + del magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z + curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z + 7z x -aoa magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z -omagma_%CUDA_PREFIX%_%BUILD_TYPE% + set LIB=%CD%\magma_%CUDA_PREFIX%_%BUILD_TYPE%\lib;%LIB% +) + +echo "install conda package" + +:: Install Miniconda3 +set "CONDA_HOME=%CD%\conda" +set "tmp_conda=%CONDA_HOME%" +set "miniconda_exe=%CD%\miniconda.exe" + +rmdir /s /q conda +del miniconda.exe +curl -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o "%miniconda_exe%" +call ..\conda\install_conda.bat +if ERRORLEVEL 1 exit /b 1 + +set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" + +conda create -qyn testenv python=%DESIRED_PYTHON% +if errorlevel 1 exit /b 1 + +call %CONDA_HOME%\condabin\activate.bat testenv +if errorlevel 1 exit /b 1 + +call conda install -y -q -c conda-forge libuv=1.39 +call conda install -y -q intel-openmp + +echo "install and 
test libtorch" +pip install cmake +echo "installing cmake" + +curl https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z -k -O +7z x -aoa mkl_2020.2.254.7z -omkl +set LIB=%CD%\mkl\lib;%LIB% + + +if "%VC_YEAR%" == "2019" powershell internal\vs2019_install.ps1 +if "%VC_YEAR%" == "2022" powershell internal\vs2022_install.ps1 + +if ERRORLEVEL 1 exit /b 1 + +for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *-latest.zip') do 7z x "%%i" -otmp +if ERRORLEVEL 1 exit /b 1 + + +pushd tmp\libtorch + +set VC_VERSION_LOWER=17 +set VC_VERSION_UPPER=18 +IF "%VC_YEAR%" == "2019" ( + set VC_VERSION_LOWER=16 + set VC_VERSION_UPPER=17 +) + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( + if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( + set "VS15INSTALLDIR=%%i" + set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" + goto vswhere + ) +) + +:vswhere +IF "%VS15VCVARSALL%"=="" ( + echo Visual Studio %VC_YEAR% C++ BuildTools is required to compile PyTorch test on Windows + exit /b 1 +) +call "%VS15VCVARSALL%" x64 + +set install_root=%CD% +set INCLUDE=%INCLUDE%;%install_root%\include;%install_root%\include\torch\csrc\api\include +set LIB=%LIB%;%install_root%\lib\x64 +set PATH=%PATH%;%install_root%\lib + + +cd %BUILDER_ROOT%\test_example_code\ +mkdir build +cd build + +cmake -DCMAKE_PREFIX_PATH=%install_root% .. + +if ERRORLEVEL 1 exit /b 1 + +cmake --build . 
--config Release + +.\Release\simple-torch-test.exe +if ERRORLEVEL 1 exit /b 1 + +popd + +echo Cleaning temp files +rd /s /q "tmp" || ver > nul + +:end +set "PATH=%ORIG_PATH%" +popd From 2a4c533360a1aa2dec292baa95e69c7ec6ecd166 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 8 Dec 2023 15:55:51 +0000 Subject: [PATCH 146/212] Pin windows intel-openmp to 2023.2.0 (#1635) (#1636) --- windows/condaenv.bat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/windows/condaenv.bat b/windows/condaenv.bat index cf1b2c8655..4475b05689 100644 --- a/windows/condaenv.bat +++ b/windows/condaenv.bat @@ -12,9 +12,9 @@ FOR %%v IN (%DESIRED_PYTHON%) DO ( if "%%v" == "3.7" call conda create -n py!PYTHON_VERSION_STR! -y -q numpy=1.11 "mkl=2020.2" pyyaml boto3 cmake ninja typing_extensions python=%%v if "%%v" == "3.8" call conda create -n py!PYTHON_VERSION_STR! -y -q numpy=1.11 "mkl=2020.2" pyyaml boto3 cmake ninja typing_extensions python=%%v if "%%v" == "3.9" call conda create -n py!PYTHON_VERSION_STR! -y -q numpy>=1.11 "mkl=2020.2" pyyaml boto3 cmake ninja typing_extensions python=%%v - if "%%v" == "3.10" call conda create -n py!PYTHON_VERSION_STR! -y -q -c=conda-forge numpy=1.21.3 "mkl=2020.2" pyyaml boto3 "cmake=3.19.6" ninja typing_extensions python=%%v - if "%%v" == "3.11" call conda create -n py!PYTHON_VERSION_STR! -y -q -c=conda-forge numpy=1.23.4 "mkl=2020.2" pyyaml boto3 "cmake=3.19.6" ninja typing_extensions python=%%v - if "%%v" == "3.12" call conda create -n py!PYTHON_VERSION_STR! -y -q -c=conda-forge numpy=1.26.0 "mkl=2023.1" pyyaml boto3 "cmake=3.19.6" ninja typing_extensions python=%%v + if "%%v" == "3.10" call conda create -n py!PYTHON_VERSION_STR! -y -q -c=conda-forge numpy=1.21.3 "mkl=2020.2" intel-openmp=2023.2.0 pyyaml boto3 "cmake=3.19.6" ninja typing_extensions python=%%v + if "%%v" == "3.11" call conda create -n py!PYTHON_VERSION_STR! 
-y -q -c=conda-forge numpy=1.23.4 "mkl=2020.2" intel-openmp=2023.2.0 pyyaml boto3 "cmake=3.19.6" ninja typing_extensions python=%%v + if "%%v" == "3.12" call conda create -n py!PYTHON_VERSION_STR! -y -q -c=conda-forge numpy=1.26.0 "mkl=2023.1" intel-openmp=2023.2.0 pyyaml boto3 "cmake=3.19.6" ninja typing_extensions python=%%v if "%%v" == "3" call conda create -n py!PYTHON_VERSION_STR! -y -q numpy=1.11 "mkl=2020.2" pyyaml boto3 cmake ninja typing_extensions python=%%v ) endlocal From c10e2547f19d6cf985d72a6c05389a63705321a1 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Sat, 9 Dec 2023 15:56:17 +0000 Subject: [PATCH 147/212] Torch compile test for python 3.8-3.11 linux only (#1629) This should fix failure on with Python 3.12 validations: https://github.com/pytorch/builder/actions/runs/7064433251/job/19232483984#step:11:4859 --- test/smoke_test/smoke_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index a4cd6dff26..3b5b18c35c 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -167,8 +167,8 @@ def smoke_test_cuda(package: str, runtime_error_check: str) -> None: print(f"torch cudnn: {torch.backends.cudnn.version()}") print(f"cuDNN enabled? 
{torch.backends.cudnn.enabled}") - # torch.compile is available only on Linux and python 3.8-3.10 - if sys.platform in ["linux", "linux2"] and (sys.version_info < (3, 11, 0) or channel != "release"): + # torch.compile is available only on Linux and python 3.8-3.11 + if (sys.platform in ["linux", "linux2"]) and sys.version_info < (3, 12, 0): smoke_test_compile() if runtime_error_check == "enabled": From f4144554d893b0bf62694617b9d5dc8977ca72ec Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Sat, 9 Dec 2023 10:20:23 -0600 Subject: [PATCH 148/212] [aarch64] cleanup mkldnn patching (#1630) pytorch is moved to oneDNN v3.3.2 and some of the old patches are not applicable any more. --- aarch64_linux/aarch64_wheel_ci_build.py | 3 --- aarch64_linux/build_aarch64_wheel.py | 1 - mkldnn_fix/aarch64-fix-readdir-crash.patch | 14 -------------- 3 files changed, 18 deletions(-) delete mode 100644 mkldnn_fix/aarch64-fix-readdir-crash.patch diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 3b772847c5..d3910f227c 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -105,9 +105,6 @@ def parse_arguments(): else: print("build pytorch without mkldnn backend") - # work around to fix Raspberry pie crash - print("Applying mkl-dnn patch to fix readdir crash") - os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/aarch64-fix-readdir-crash.patch") os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") pytorch_wheel_name = complete_wheel("pytorch") print(f"Build Compelete. 
Created {pytorch_wheel_name}..") diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index 9efd2e6ae5..d4fa6f8ad1 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -554,7 +554,6 @@ def start_build(host: RemoteHost, *, build_ArmComputeLibrary(host, git_clone_flags) print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" - host.run_cmd(f"cd $HOME && git clone https://github.com/pytorch/builder.git") host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}") print('Repair the wheel') pytorch_wheel_name = host.list_dir("pytorch/dist")[0] diff --git a/mkldnn_fix/aarch64-fix-readdir-crash.patch b/mkldnn_fix/aarch64-fix-readdir-crash.patch deleted file mode 100644 index 81d46d4065..0000000000 --- a/mkldnn_fix/aarch64-fix-readdir-crash.patch +++ /dev/null @@ -1,14 +0,0 @@ -diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp -index cb800b2509..5516373b90 100644 ---- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp -+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp -@@ -170,6 +170,8 @@ int Cpu::getFilePathMaxTailNumPlus1(const char *path) { - fflush(stdout); - - DIR *dir = opendir(dir_path); -+ if (dir == NULL) -+ return 0; - struct dirent *dp; - - dp = readdir(dir); - From bafda5103ffe512e914adf6e2fb75b1a9ca943be Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 9 Dec 2023 08:37:00 -0800 Subject: [PATCH 149/212] Add `aarch64_linux` to the list of linted files --- .lintrunner.toml | 2 +- aarch64_linux/build_aarch64_wheel.py | 2 +- aarch64_linux/embed_library.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index b7375092ae..7d48258bc7 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -2,7 +2,7 @@ merge_base_with = "origin/main" [[linter]] code = 
'RUFF' -include_patterns = ['test/smoke_test/*.py', 's3_management/*.py'] +include_patterns = ['test/smoke_test/*.py', 's3_management/*.py', 'aarch64_linux/*.py'] command = [ 'python3', 'tools/linter/adapters/ruff_linter.py', diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index d4fa6f8ad1..333b8b910f 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -301,7 +301,7 @@ def build_torchvision(host: RemoteHost, *, # Remove .so files to force static linking host.run_cmd("rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so") # And patch setup.py to include libz dependency for libpng - host.run_cmd(['sed -i -e \'s/image_link_flags\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py']) + host.run_cmd(['sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py']) build_vars = "" if branch == "nightly": diff --git a/aarch64_linux/embed_library.py b/aarch64_linux/embed_library.py index 978970d45f..1a31148237 100644 --- a/aarch64_linux/embed_library.py +++ b/aarch64_linux/embed_library.py @@ -13,7 +13,7 @@ def replace_tag(filename): - with open(filename, 'r') as f: + with open(filename) as f: lines = f.read().split("\\n") for i,line in enumerate(lines): if not line.startswith("Tag: "): From bb9b32c00cdb3ddbb0ed7754adeb7220d44d031f Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 9 Dec 2023 09:00:18 -0800 Subject: [PATCH 150/212] Actually fix lint this type --- aarch64_linux/aarch64_wheel_ci_build.py | 21 ++++++++--------- aarch64_linux/build_aarch64_wheel.py | 30 +++++++++++++------------ aarch64_linux/embed_library.py | 2 +- 3 files changed, 28 insertions(+), 25 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index d3910f227c..d24b6f2fd8 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -2,7 
+2,7 @@ # encoding: UTF-8 import os -import subprocess +from subprocess import check_output from pygit2 import Repository from typing import List @@ -11,7 +11,7 @@ def list_dir(path: str) -> List[str]: '''' Helper for getting paths for Python ''' - return subprocess.check_output(["ls", "-1", path]).decode().split("\n") + return check_output(["ls", "-1", path]).decode().split("\n") def build_ArmComputeLibrary(git_clone_flags: str = "") -> None: @@ -19,10 +19,12 @@ def build_ArmComputeLibrary(git_clone_flags: str = "") -> None: Using ArmComputeLibrary for aarch64 PyTorch ''' print('Building Arm Compute Library') + acl_build_flags=" ".join(["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0", + "arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"]) os.system("cd / && mkdir /acl") os.system(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}") os.system("cd ComputeLibrary; export acl_install_dir=/acl; " - "scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native build_dir=$acl_install_dir/build; " + f"scons Werror=1 -j8 {acl_build_flags} build_dir=$acl_install_dir/build; " "cp -r arm_compute $acl_install_dir; " "cp -r include $acl_install_dir; " "cp -r utils $acl_install_dir; " @@ -86,13 +88,12 @@ def parse_arguments(): if override_package_version is not None: version = override_package_version build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 " - else: - if branch in ['nightly', 'master']: - build_date = subprocess.check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '') - version = subprocess.check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2] - build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 " - if branch.startswith("v1.") or branch.startswith("v2."): - build_vars 
+= f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " + elif branch in ['nightly', 'master']: + build_date = check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '') + version = check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2] + build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 " + elif branch.startswith(("v1.", "v2.")): + build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " if enable_mkldnn: build_ArmComputeLibrary(git_clone_flags) diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index 333b8b910f..0ff286ad2d 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -2,9 +2,10 @@ # This script is for building AARCH64 wheels using AWS EC2 instances. # To generate binaries for the release follow these steps: -# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this: "v1.11.0": ("0.11.0", "rc1"), -# 2. Run script with following arguments for each of the supported python versions and specify required RC tag for example: v1.11.0-rc3: -# build_aarch64_wheel.py --key-name --use-docker --python 3.8 --branch +# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this: +# "v1.11.0": ("0.11.0", "rc1"), +# 2. 
Run script with following arguments for each of the supported python versions and required tag, for example: +# build_aarch64_wheel.py --key-name --use-docker --python 3.8 --branch v1.11.0-rc3 import boto3 @@ -177,7 +178,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): try: with socket.create_connection((addr, port), timeout=timeout): return - except (ConnectionRefusedError, socket.timeout): + except (ConnectionRefusedError, socket.timeout): # noqa: PERF203 if i == attempt_cnt - 1: raise time.sleep(timeout) @@ -203,7 +204,7 @@ def install_condaforge(host: RemoteHost, if host.using_docker(): host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc") else: - host.run_cmd(['sed', '-i', '\'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH\'', '.bashrc']) + host.run_cmd(['sed', '-i', '\'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH\'', '.bashrc']) # noqa: E501 def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None: @@ -221,12 +222,13 @@ def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None: print('Building OpenBLAS') host.run_cmd(f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.25 {git_clone_flags}") make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8" - host.run_cmd(f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS") + host.run_cmd(f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS") # noqa: E501 def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None: print('Building Arm Compute Library') - acl_build_flags="debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native" + acl_build_flags=" ".join(["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0", + "arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"]) 
host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}") host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") @@ -301,7 +303,7 @@ def build_torchvision(host: RemoteHost, *, # Remove .so files to force static linking host.run_cmd("rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so") # And patch setup.py to include libz dependency for libpng - host.run_cmd(['sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py']) + host.run_cmd(['sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py']) # noqa: E501 build_vars = "" if branch == "nightly": @@ -525,7 +527,7 @@ def start_build(host: RemoteHost, *, if host.using_docker(): print("Move libgfortant.a into a standard location") # HACK: pypa gforntran.a is compiled without PIC, which leads to the following error - # libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' + # libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501 # Workaround by copying gfortran library from the host host.run_ssh_cmd("sudo apt-get install -y gfortran-8") host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8") @@ -543,10 +545,10 @@ def start_build(host: RemoteHost, *, # Breakpad build fails on aarch64 build_vars = "USE_BREAKPAD=0 " if branch == 'nightly': - build_date = host.check_output("cd pytorch && git log --pretty=format:%s -1").strip().split()[0].replace("-", "") + build_date = host.check_output("cd pytorch && git log --pretty=format:%s -1").strip().split()[0].replace("-", "") # noqa: E501 version = host.check_output("cat pytorch/version.txt").strip()[:-2] build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} 
PYTORCH_BUILD_NUMBER=1" - if branch.startswith("v1.") or branch.startswith("v2."): + if branch.startswith(("v1.", "v2.")): build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" @@ -554,10 +556,10 @@ def start_build(host: RemoteHost, *, build_ArmComputeLibrary(host, git_clone_flags) print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" - host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}") + host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}") # noqa: E501 print('Repair the wheel') pytorch_wheel_name = host.list_dir("pytorch/dist")[0] - host.run_cmd(f"export LD_LIBRARY_PATH=$HOME/acl/build:$HOME/pytorch/build/lib && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}") + host.run_cmd(f"export LD_LIBRARY_PATH=$HOME/acl/build:$HOME/pytorch/build/lib && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}") # noqa: E501 print('replace the original wheel with the repaired one') pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0] host.run_cmd(f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}") @@ -705,7 +707,7 @@ def parse_arguments(): parser.add_argument("--build-only", action="store_true") parser.add_argument("--test-only", type=str) parser.add_argument("--os", type=str, choices=list(os_amis.keys()), default='ubuntu20_04') - parser.add_argument("--python-version", type=str, choices=['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'], default=None) + parser.add_argument("--python-version", type=str, choices=[f'3.{d}' for d in range(6, 12)], default=None) parser.add_argument("--alloc-instance", action="store_true") parser.add_argument("--list-instances", 
action="store_true") parser.add_argument("--pytorch-only", action="store_true") diff --git a/aarch64_linux/embed_library.py b/aarch64_linux/embed_library.py index 1a31148237..be6bb048f3 100644 --- a/aarch64_linux/embed_library.py +++ b/aarch64_linux/embed_library.py @@ -42,7 +42,7 @@ def embed_library(whl_path, lib_soname, update_tag=False): torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib') ctx.out_wheel=tmp_whl_name new_lib_path, new_lib_soname = None, None - for filename, elf in elf_file_filter(ctx.iter_files()): + for filename, _ in elf_file_filter(ctx.iter_files()): if not filename.startswith('torch/lib'): continue libtree = lddtree(filename) From 4f298cbaff97caeeaa1b278caf9874d6c367a750 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 9 Dec 2023 14:28:00 -0800 Subject: [PATCH 151/212] Extend test_linalg from smoke_test.py To take device as an argument and run tests on both cpu and cuda --- test/smoke_test/smoke_test.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 3b5b18c35c..f4c06150ee 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -206,9 +206,9 @@ def smoke_test_conv2d() -> None: assert output is not None -def smoke_test_linalg() -> None: - print("Testing smoke_test_linalg") - A = torch.randn(5, 3) +def test_linalg(device="cpu") -> None: + print(f"Testing smoke_test_linalg on {device}") + A = torch.randn(5, 3, device=device) U, S, Vh = torch.linalg.svd(A, full_matrices=False) assert U.shape == A.shape and S.shape == torch.Size([3]) and Vh.shape == torch.Size([3, 3]) torch.dist(A, U @ torch.diag(S) @ Vh) @@ -217,15 +217,15 @@ def smoke_test_linalg() -> None: assert U.shape == torch.Size([5, 5]) and S.shape == torch.Size([3]) and Vh.shape == torch.Size([3, 3]) torch.dist(A, U[:, :3] @ torch.diag(S) @ Vh) - A = torch.randn(7, 5, 3) + A = torch.randn(7, 5, 3, device=device) U, S, Vh = torch.linalg.svd(A, 
full_matrices=False) torch.dist(A, U @ torch.diag_embed(S) @ Vh) - if is_cuda_system: + if device == "cuda": supported_dtypes = [torch.float32, torch.float64] for dtype in supported_dtypes: print(f"Testing smoke_test_linalg with cuda for {dtype}") - A = torch.randn(20, 16, 50, 100, device="cuda").type(dtype) + A = torch.randn(20, 16, 50, 100, device=device, dtype=dtype) torch.linalg.svd(A) @@ -293,7 +293,9 @@ def main() -> None: check_version(options.package) smoke_test_conv2d() - smoke_test_linalg() + test_linalg() + if is_cuda_system: + test_linalg("cuda") if options.package == "all": smoke_test_modules() From 70d5c5f7b5dd8005c36843beab7a087d1296066e Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 9 Dec 2023 16:39:26 -0800 Subject: [PATCH 152/212] Run smoke_test_linalg during check_binary This is a regression test for https://github.com/pytorch/pytorch/issues/114862 --- check_binary.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/check_binary.sh b/check_binary.sh index 30b44b5350..e9fde2bc5e 100755 --- a/check_binary.sh +++ b/check_binary.sh @@ -408,6 +408,18 @@ if [[ "$DESIRED_CUDA" != 'cpu' && "$DESIRED_CUDA" != 'cpu-cxx11-abi' && "$DESIRE fi # if libtorch fi # if cuda +########################## +# Run parts of smoke tests +########################## +if [[ "$PACKAGE_TYPE" != 'libtorch' ]]; then + pushd test/smoke_test + python -c "from smoke_test import test_linalg; test_linalg()" + if [[ "$DESIRED_CUDA" == *cuda* ]]; then + python -c "from smoke_test import test_linalg; test_linalg('cuda')" + fi + popd +fi + ############################################################################### # Check PyTorch supports TCP_TLS gloo transport ############################################################################### From 5d7e8e1f49f0842a5c6d55be5212390c7bd3b875 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 9 Dec 2023 17:08:36 -0800 Subject: [PATCH 153/212] Fix linalg testing --- check_binary.sh | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/check_binary.sh b/check_binary.sh index e9fde2bc5e..2b5f228d6c 100755 --- a/check_binary.sh +++ b/check_binary.sh @@ -412,7 +412,7 @@ fi # if cuda # Run parts of smoke tests ########################## if [[ "$PACKAGE_TYPE" != 'libtorch' ]]; then - pushd test/smoke_test + pushd "$(dirname ${BASH_SOURCE[0]})/test/smoke_test" python -c "from smoke_test import test_linalg; test_linalg()" if [[ "$DESIRED_CUDA" == *cuda* ]]; then python -c "from smoke_test import test_linalg; test_linalg('cuda')" From 2b17d68ca300a603ba03444129171fdb60c5754e Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Sun, 10 Dec 2023 13:38:19 -0800 Subject: [PATCH 154/212] [BE] Add CI for check_binary.sh changes (#1637) Make sure latest nightly passes the testing for: - Linux Wheel CPU - Linux Wheel CUDA Tweak script a bit to work correctly with relative path to executable --- .github/workflows/test-check-binary.yml | 37 +++++++++++++++++++++++++ check_binary.sh | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/test-check-binary.yml diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml new file mode 100644 index 0000000000..e29ee7ba94 --- /dev/null +++ b/.github/workflows/test-check-binary.yml @@ -0,0 +1,37 @@ +name: Test check_binary + +on: + push: + branches: + - main + pull_request: + paths: + - .github/workflows/test-check-binary.yml + - check_binary.sh + - test/smoke_test/smoke_test.py + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + check_binary_linux_cpu: + name: Test check_binary.sh for Linux CPU + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: "pytorch/builder" + docker-image: python:3.11 + script: | + pip install --pre torch --index-url 
https://download.pytorch.org/whl/nightly/cpu + DESIRED_PYTHON=3.11 DESIRED_CUDA=cpu PACKAGE_TYPE=manywheel ./check_binary.sh + + check_binary_linux_cuda: + name: Test check_binary.sh for Linux CUDA + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: "pytorch/builder" + runner: linux.4xlarge.nvidia.gpu + docker-image: python:3.11 + script: | + pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121 + DESIRED_PYTHON=3.11 DESIRED_CUDA=cu121 PACKAGE_TYPE=manywheel ./check_binary.sh diff --git a/check_binary.sh b/check_binary.sh index 2b5f228d6c..9a2cf065b4 100755 --- a/check_binary.sh +++ b/check_binary.sh @@ -261,7 +261,7 @@ setup_link_flags () { fi } -TEST_CODE_DIR="$(dirname ${BASH_SOURCE[0]})/test_example_code" +TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code" build_and_run_example_cpp () { if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then GLIBCXX_USE_CXX11_ABI=1 From 6f3cb2ba1753c5ecade8b800c29fc18d18d149db Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 14 Dec 2023 11:00:07 -0800 Subject: [PATCH 155/212] Keep nightly 20231010 for ExecuTorch alpha 0.1 for now (#1642) --- s3_management/manage.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 47c151f087..851a848c66 100644 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -112,6 +112,12 @@ # How many packages should we keep of a specific package? KEEP_THRESHOLD = 60 +# TODO (huydhn): Clean this up once ExecuTorch has a new stable release that +# match PyTorch stable release cadence. This nightly version is currently +# referred to publicly in ExecuTorch alpha 0.1 release. 
So we want to keep +# nightly binaries around for now +KEEP_NIGHTLY_PACKAGES_FOR_EXECUTORCH = {datetime(2023, 10, 10, 0, 0)} + S3IndexType = TypeVar('S3IndexType', bound='S3Index') @@ -201,7 +207,10 @@ def nightly_packages_to_show(self: S3IndexType) -> List[S3Object]: if package_name not in PACKAGE_ALLOW_LIST: to_hide.add(obj) continue - if packages[package_name] >= KEEP_THRESHOLD or between_bad_dates(package_build_time): + if package_build_time not in KEEP_NIGHTLY_PACKAGES_FOR_EXECUTORCH and ( + packages[package_name] >= KEEP_THRESHOLD + or between_bad_dates(package_build_time) + ): to_hide.add(obj) else: packages[package_name] += 1 From 3daf3bf2cfbe1ddf85fe1726dfc80e36599f37a5 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 14 Dec 2023 20:23:41 +0000 Subject: [PATCH 156/212] [Validations] do conda update before starting validations (#1643) --- .github/scripts/validate_binaries.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index acdcef5e58..22fc9d1090 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -2,6 +2,8 @@ if [[ ${MATRIX_PACKAGE_TYPE} == "libtorch" ]]; then curl ${MATRIX_INSTALLATION} -o libtorch.zip unzip libtorch.zip else + + conda update -y -n base -c defaults conda # Please note ffmpeg is required for torchaudio, see https://github.com/pytorch/pytorch/issues/96159 conda create -y -n ${ENV_NAME} python=${MATRIX_PYTHON_VERSION} numpy ffmpeg conda activate ${ENV_NAME} From df2d4e8fb381c81030ce921378aaafe4c177645c Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 15 Dec 2023 16:54:07 +0000 Subject: [PATCH 157/212] [Validations] Validate aarch64 if all is slected (#1644) --- .github/workflows/validate-binaries.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index fee16dca9d..8dbe5f27c2 100644 --- 
a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -96,7 +96,7 @@ jobs: release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} linux-aarch64: - if: inputs.os == 'linux-aarch64' + if: inputs.os == 'linux-aarch64' || inputs.os == 'all' needs: generate-release-matrix uses: ./.github/workflows/validate-aarch64-linux-binaries.yml with: @@ -105,6 +105,7 @@ jobs: torchonly: ${{ inputs.torchonly }} version: ${{ inputs.version }} release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} + mac: if: inputs.os == 'macos' || inputs.os == 'all' needs: generate-release-matrix From cd257e911b55a8001c3ddb10b078c9122c575984 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 15 Dec 2023 16:02:53 -0800 Subject: [PATCH 158/212] Fix validation workflow on aarch64 with conda 23.11.0 and GLIBC_2.25 (#1645) * Debug aarch64 clone * Debug * Fix validation workflow with conda 23.11.0 and GLIBC_2.25 * Gate the change on linux-aarch64 and keep the old LD_LIBRARY_PATH * Try to unset LD_LIBRARY_PATH in the workflow instead * Fix copy/paste typo --- .github/workflows/validate-aarch64-linux-binaries.yml | 8 +++++++- test/smoke_test/smoke_test.py | 11 ++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index 14b7b6395f..6b1a60d7cc 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -95,5 +95,11 @@ jobs: printf '%s\n' ${{ toJson(inputs.release-matrix) }} > release_matrix.json eval "$(conda shell.bash hook)" - # Standart case: Validate binaries + # NB: The latest conda 23.11.0 pulls in some dependencies of conda-libmamba-solver that + # require GLIBC_2.25, which is not available in the current aarch64 image causing the + # subsequence git command to fail. 
Basically, they don't work with CentOS 7 which AML 2 + # is based on https://github.com/ContinuumIO/anaconda-issues/issues/12822 + unset LD_LIBRARY_PATH + + # Standard case: Validate binaries source ./.github/scripts/validate_binaries.sh diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index f4c06150ee..14e04d366c 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -258,7 +258,16 @@ def smoke_test_modules(): if module["repo"]: if not os.path.exists(f"{cwd}/{module['repo_name']}"): print(f"Path does not exist: {cwd}/{module['repo_name']}") - subprocess.check_output(f"git clone --depth 1 {module['repo']}", stderr=subprocess.STDOUT, shell=True) + try: + subprocess.check_output( + f"git clone --depth 1 {module['repo']}", + stderr=subprocess.STDOUT, + shell=True, + ) + except subprocess.CalledProcessError as exc: + raise RuntimeError( + f"Cloning {module['repo']} FAIL: {exc.returncode} Output: {exc.output}" + ) from exc try: smoke_test_command = f"python3 {module['smoke_test']}" if target_os == 'windows': From 22d7be551d404455b49caf9df1ebf1b80c18fa04 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 18 Dec 2023 11:50:12 -0800 Subject: [PATCH 159/212] Do not hardcode triton version in builder code (#1646) * Do not hardcode triton version in builder code * Minor tweak to use pytorch_rootdir --- conda/build_pytorch.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 4c2e4836b1..cf8b2f4631 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -275,8 +275,9 @@ else fi if [[ "$OSTYPE" != "msys" ]]; then # TODO: Remove me when Triton has a proper release channel + TRITON_VERSION=$(cat $pytorch_rootdir/.ci/docker/triton_version.txt) TRITON_SHORTHASH=$(cut -c1-10 $pytorch_rootdir/.github/ci_commit_pins/triton.txt) - export CONDA_TRITON_CONSTRAINT=" - torchtriton==2.1.0+${TRITON_SHORTHASH} # [py < 312]" + export CONDA_TRITON_CONSTRAINT=" - 
torchtriton==${TRITON_VERSION}+${TRITON_SHORTHASH} # [py < 312]" fi build_string_suffix="cuda${CUDA_VERSION}_cudnn${CUDNN_VERSION}_${build_string_suffix}" From 1b30e261d1d103264076a6f636801abc1905cbca Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 19 Dec 2023 20:12:40 -0800 Subject: [PATCH 160/212] [Lint] Prohibit tabs in shell scripts Fix current violations --- .lintrunner.toml | 23 +++ common/install_cuda.sh | 8 +- conda/build_pytorch.sh | 4 +- conda/pytorch-nightly/build.sh | 22 +-- ffmpeg/recipe/build.sh | 2 +- manywheel/build_rocm.sh | 8 +- manywheel/conda_build.sh | 8 +- tools/linter/adapters/grep_linter.py | 272 +++++++++++++++++++++++++++ 8 files changed, 321 insertions(+), 26 deletions(-) create mode 100644 tools/linter/adapters/grep_linter.py diff --git a/.lintrunner.toml b/.lintrunner.toml index 7d48258bc7..fdfca4cef4 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -18,3 +18,26 @@ init_command = [ 'ruff==0.1.1', ] is_formatter = true + +[[linter]] +code = 'TABS' +include_patterns = ['**/*.sh'] +exclude_patterns = [ + '**/*Makefile', + 'common/install_rocm_drm.sh', + '.lintrunner.toml', +] +command = [ + 'python3', + 'tools/linter/adapters/grep_linter.py', + # @lint-ignore TXT2 + '--pattern= ', + '--linter-name=TABS', + '--error-name=saw some tabs', + '--replace-pattern=s/\t/ /', + """--error-description=\ + This line has tabs; please replace them with spaces.\ + """, + '--', + '@{{PATHSFILE}}' +] diff --git a/common/install_cuda.sh b/common/install_cuda.sh index f09666e643..35babf576e 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -146,11 +146,11 @@ while test $# -gt 0 do case "$1" in 11.8) install_118; prune_118 - ;; + ;; 12.1) install_121; prune_121 - ;; - *) echo "bad argument $1"; exit 1 - ;; + ;; + *) echo "bad argument $1"; exit 1 + ;; esac shift done diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index cf8b2f4631..844d77f323 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ 
-264,8 +264,8 @@ else # TODO, simplify after anaconda fixes their cudatoolkit versioning inconsistency. # see: https://github.com/conda-forge/conda-forge.github.io/issues/687#issuecomment-460086164 if [[ "$desired_cuda" == "12.1" ]]; then - export CONDA_CUDATOOLKIT_CONSTRAINT=" - pytorch-cuda >=12.1,<12.2 # [not osx]" - export MAGMA_PACKAGE=" - magma-cuda121 # [not osx and not win]" + export CONDA_CUDATOOLKIT_CONSTRAINT=" - pytorch-cuda >=12.1,<12.2 # [not osx]" + export MAGMA_PACKAGE=" - magma-cuda121 # [not osx and not win]" elif [[ "$desired_cuda" == "11.8" ]]; then export CONDA_CUDATOOLKIT_CONSTRAINT=" - pytorch-cuda >=11.8,<11.9 # [not osx]" export MAGMA_PACKAGE=" - magma-cuda118 # [not osx and not win]" diff --git a/conda/pytorch-nightly/build.sh b/conda/pytorch-nightly/build.sh index c649d32510..d9ccb708b0 100755 --- a/conda/pytorch-nightly/build.sh +++ b/conda/pytorch-nightly/build.sh @@ -56,18 +56,18 @@ if [[ -n "$build_with_cuda" ]]; then export USE_STATIC_CUDNN=1 # links cudnn statically (driven by tools/setup_helpers/cudnn.py) if [[ $CUDA_VERSION == 11.8* ]]; then - TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST;3.7+PTX;9.0" - #for cuda 11.8 we use cudnn 8.7 - #which does not have single static libcudnn_static.a deliverable to link with - export USE_STATIC_CUDNN=0 - #for cuda 11.8 include all dynamic loading libraries - DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 /usr/local/cuda-11.8/extras/CUPTI/lib64/libcupti.so.11.8) + TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST;3.7+PTX;9.0" + #for cuda 11.8 we use cudnn 8.7 + #which does not have single static libcudnn_static.a deliverable to link with + export USE_STATIC_CUDNN=0 + #for cuda 11.8 include all dynamic loading libraries + DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 /usr/local/cuda-11.8/extras/CUPTI/lib64/libcupti.so.11.8) elif [[ $CUDA_VERSION == 12.1* ]]; then - # cuda 12 does not support sm_3x - TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST;9.0" - # for cuda 12.1 we use cudnn 8.8 and include all 
dynamic loading libraries - export USE_STATIC_CUDNN=0 - DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 /usr/local/cuda-12.1/extras/CUPTI/lib64/libcupti.so.12) + # cuda 12 does not support sm_3x + TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST;9.0" + # for cuda 12.1 we use cudnn 8.8 and include all dynamic loading libraries + export USE_STATIC_CUDNN=0 + DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 /usr/local/cuda-12.1/extras/CUPTI/lib64/libcupti.so.12) fi if [[ -n "$OVERRIDE_TORCH_CUDA_ARCH_LIST" ]]; then TORCH_CUDA_ARCH_LIST="$OVERRIDE_TORCH_CUDA_ARCH_LIST" diff --git a/ffmpeg/recipe/build.sh b/ffmpeg/recipe/build.sh index b3c5b5967a..9b82c6553d 100644 --- a/ffmpeg/recipe/build.sh +++ b/ffmpeg/recipe/build.sh @@ -23,7 +23,7 @@ fi --disable-static \ --enable-version3 \ --enable-zlib \ - --enable-libmp3lame + --enable-libmp3lame make -j${CPU_COUNT} make install -j${CPU_COUNT} diff --git a/manywheel/build_rocm.sh b/manywheel/build_rocm.sh index 4fbca76970..a44d6212f6 100755 --- a/manywheel/build_rocm.sh +++ b/manywheel/build_rocm.sh @@ -88,7 +88,7 @@ ROCM_SO_FILES=( "librccl.so" "librocblas.so" "librocfft.so" - "librocm_smi64.so" + "librocm_smi64.so" "librocrand.so" "librocsolver.so" "librocsparse.so" @@ -128,7 +128,7 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then LIBTINFO_PATH="/lib/x86_64-linux-gnu/libtinfo.so.6" else LIBTINFO_PATH="/lib/x86_64-linux-gnu/libtinfo.so.5" - fi + fi LIBDRM_PATH="/usr/lib/x86_64-linux-gnu/libdrm.so.2" LIBDRM_AMDGPU_PATH="/usr/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1" MAYBE_LIB64=lib @@ -170,8 +170,8 @@ do file_path=($(find $ROCM_HOME/ -name "$lib")) # Then search in ROCM_HOME fi if [[ -z $file_path ]]; then - echo "Error: Library file $lib is not found." >&2 - exit 1 + echo "Error: Library file $lib is not found." 
>&2 + exit 1 fi ROCM_SO_PATHS[${#ROCM_SO_PATHS[@]}]="$file_path" # Append lib to array done diff --git a/manywheel/conda_build.sh b/manywheel/conda_build.sh index 99a28768d8..407b4e4d4d 100755 --- a/manywheel/conda_build.sh +++ b/manywheel/conda_build.sh @@ -8,7 +8,7 @@ if ! ls /usr/local/cuda-7.5 then echo "Downloading CUDA 7.5" wget -c http://developer.download.nvidia.com/compute/cuda/7.5/Prod/local_installers/cuda_7.5.18_linux.run \ - -O /remote/cuda_7.5.18_linux.run + -O /remote/cuda_7.5.18_linux.run echo "Installing CUDA 7.5" chmod +x /remote/cuda_7.5.18_linux.run @@ -23,7 +23,7 @@ if ! ls /usr/local/cuda-8.0 then echo "Downloading CUDA 8.0" wget -c https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run \ - -O /remote/cuda_8.0.61_linux-run + -O /remote/cuda_8.0.61_linux-run echo "Installing CUDA 8.0" chmod +x /remote/cuda_8.0.61_linux-run @@ -39,7 +39,7 @@ if ! ls /usr/local/cuda-7.5/lib64/libcudnn.so.6.0.21 then rm -rf /tmp/cuda wget -c http://developer.download.nvidia.com/compute/redist/cudnn/v6.0/cudnn-7.5-linux-x64-v6.0.tgz \ - -O /remote/cudnn-7.5-linux-x64-v6.0.tgz + -O /remote/cudnn-7.5-linux-x64-v6.0.tgz pushd /tmp tar -xvf /remote/cudnn-7.5-linux-x64-v6.0.tgz cp -P /tmp/cuda/include/* /usr/local/cuda-7.5/include/ @@ -51,7 +51,7 @@ if ! ls /usr/local/cuda-8.0/lib64/libcudnn.so.6.0.21 then rm -rf /tmp/cuda wget -c http://developer.download.nvidia.com/compute/redist/cudnn/v6.0/cudnn-8.0-linux-x64-v6.0.tgz \ - -O /remote/cudnn-8.0-linux-x64-v6.0.tgz + -O /remote/cudnn-8.0-linux-x64-v6.0.tgz pushd /tmp tar -xvf /remote/cudnn-8.0-linux-x64-v6.0.tgz cp -P /tmp/cuda/include/* /usr/local/cuda-8.0/include/ diff --git a/tools/linter/adapters/grep_linter.py b/tools/linter/adapters/grep_linter.py new file mode 100644 index 0000000000..168800eb44 --- /dev/null +++ b/tools/linter/adapters/grep_linter.py @@ -0,0 +1,272 @@ +""" +Generic linter that greps for a pattern and optionally suggests replacements. 
+""" + +import argparse +import json +import logging +import os +import subprocess +import sys +import time +from enum import Enum +from typing import Any, List, NamedTuple, Optional + + +IS_WINDOWS: bool = os.name == "nt" + + +def eprint(*args: Any, **kwargs: Any) -> None: + print(*args, file=sys.stderr, flush=True, **kwargs) + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +def as_posix(name: str) -> str: + return name.replace("\\", "/") if IS_WINDOWS else name + + +def run_command( + args: List[str], +) -> "subprocess.CompletedProcess[bytes]": + logging.debug("$ %s", " ".join(args)) + start_time = time.monotonic() + try: + return subprocess.run( + args, + capture_output=True, + ) + finally: + end_time = time.monotonic() + logging.debug("took %dms", (end_time - start_time) * 1000) + + +def lint_file( + matching_line: str, + allowlist_pattern: str, + replace_pattern: str, + linter_name: str, + error_name: str, + error_description: str, +) -> Optional[LintMessage]: + # matching_line looks like: + # tools/linter/clangtidy_linter.py:13:import foo.bar.baz + split = matching_line.split(":") + filename = split[0] + + if allowlist_pattern: + try: + proc = run_command(["grep", "-nEHI", allowlist_pattern, filename]) + except Exception as err: + return LintMessage( + path=None, + line=None, + char=None, + code=linter_name, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=( + f"Failed due to {err.__class__.__name__}:\n{err}" + if not isinstance(err, subprocess.CalledProcessError) + else ( + "COMMAND (exit code {returncode})\n" + "{command}\n\n" + "STDERR\n{stderr}\n\n" + "STDOUT\n{stdout}" + ).format( + 
returncode=err.returncode, + command=" ".join(as_posix(x) for x in err.cmd), + stderr=err.stderr.decode("utf-8").strip() or "(empty)", + stdout=err.stdout.decode("utf-8").strip() or "(empty)", + ) + ), + ) + + # allowlist pattern was found, abort lint + if proc.returncode == 0: + return None + + original = None + replacement = None + if replace_pattern: + with open(filename) as f: + original = f.read() + + try: + proc = run_command(["sed", "-r", replace_pattern, filename]) + replacement = proc.stdout.decode("utf-8") + except Exception as err: + return LintMessage( + path=None, + line=None, + char=None, + code=linter_name, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=( + f"Failed due to {err.__class__.__name__}:\n{err}" + if not isinstance(err, subprocess.CalledProcessError) + else ( + "COMMAND (exit code {returncode})\n" + "{command}\n\n" + "STDERR\n{stderr}\n\n" + "STDOUT\n{stdout}" + ).format( + returncode=err.returncode, + command=" ".join(as_posix(x) for x in err.cmd), + stderr=err.stderr.decode("utf-8").strip() or "(empty)", + stdout=err.stdout.decode("utf-8").strip() or "(empty)", + ) + ), + ) + + return LintMessage( + path=split[0], + line=int(split[1]) if len(split) > 1 else None, + char=None, + code=linter_name, + severity=LintSeverity.ERROR, + name=error_name, + original=original, + replacement=replacement, + description=error_description, + ) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="grep wrapper linter.", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "--pattern", + required=True, + help="pattern to grep for", + ) + parser.add_argument( + "--allowlist-pattern", + help="if this pattern is true in the file, we don't grep for pattern", + ) + parser.add_argument( + "--linter-name", + required=True, + help="name of the linter", + ) + parser.add_argument( + "--match-first-only", + action="store_true", + help="only match the first hit in the file", + ) + 
parser.add_argument( + "--error-name", + required=True, + help="human-readable description of what the error is", + ) + parser.add_argument( + "--error-description", + required=True, + help="message to display when the pattern is found", + ) + parser.add_argument( + "--replace-pattern", + help=( + "the form of a pattern passed to `sed -r`. " + "If specified, this will become proposed replacement text." + ), + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose logging", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + args = parser.parse_args() + + logging.basicConfig( + format="<%(threadName)s:%(levelname)s> %(message)s", + level=logging.NOTSET + if args.verbose + else logging.DEBUG + if len(args.filenames) < 1000 + else logging.INFO, + stream=sys.stderr, + ) + + files_with_matches = [] + if args.match_first_only: + files_with_matches = ["--files-with-matches"] + + try: + proc = run_command( + ["grep", "-nEHI", *files_with_matches, args.pattern, *args.filenames] + ) + except Exception as err: + err_msg = LintMessage( + path=None, + line=None, + char=None, + code=args.linter_name, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=( + f"Failed due to {err.__class__.__name__}:\n{err}" + if not isinstance(err, subprocess.CalledProcessError) + else ( + "COMMAND (exit code {returncode})\n" + "{command}\n\n" + "STDERR\n{stderr}\n\n" + "STDOUT\n{stdout}" + ).format( + returncode=err.returncode, + command=" ".join(as_posix(x) for x in err.cmd), + stderr=err.stderr.decode("utf-8").strip() or "(empty)", + stdout=err.stdout.decode("utf-8").strip() or "(empty)", + ) + ), + ) + print(json.dumps(err_msg._asdict()), flush=True) + sys.exit(0) + + lines = proc.stdout.decode().splitlines() + for line in lines: + lint_message = lint_file( + line, + args.allowlist_pattern, + args.replace_pattern, + args.linter_name, + args.error_name, + args.error_description, + ) + 
if lint_message is not None: + print(json.dumps(lint_message._asdict()), flush=True) + + +if __name__ == "__main__": + main() From c55c58b8720868a6d6e82720a37f97a649eda2f5 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 19 Dec 2023 20:35:07 -0800 Subject: [PATCH 161/212] Link conda packages with cusparselt Fixes https://github.com/pytorch/pytorch/issues/115085 --- conda/build_pytorch.sh | 1 - conda/pytorch-nightly/build.sh | 11 ++++------- conda/pytorch-nightly/meta.yaml | 1 - 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 844d77f323..06e72da7cc 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -357,7 +357,6 @@ for py_ver in "${DESIRED_PYTHON[@]}"; do PYTORCH_GITHUB_ROOT_DIR="$pytorch_rootdir" \ PYTORCH_BUILD_STRING="$build_string" \ PYTORCH_MAGMA_CUDA_VERSION="$cuda_nodot" \ - USE_CUSPARSELT=0 \ conda build -c "$ANACONDA_USER" \ ${NO_TEST:-} \ --no-anaconda-upload \ diff --git a/conda/pytorch-nightly/build.sh b/conda/pytorch-nightly/build.sh index d9ccb708b0..db2b7b246f 100755 --- a/conda/pytorch-nightly/build.sh +++ b/conda/pytorch-nightly/build.sh @@ -53,21 +53,18 @@ fi if [[ -n "$build_with_cuda" ]]; then export TORCH_NVCC_FLAGS="-Xfatbin -compress-all" TORCH_CUDA_ARCH_LIST="5.0;6.0;6.1;7.0;7.5;8.0;8.6" - export USE_STATIC_CUDNN=1 # links cudnn statically (driven by tools/setup_helpers/cudnn.py) + export USE_STATIC_CUDNN=0 # link with cudnn dynamically + export USE_CUSPARSELT=1 # link with cusparselt if [[ $CUDA_VERSION == 11.8* ]]; then TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST;3.7+PTX;9.0" - #for cuda 11.8 we use cudnn 8.7 - #which does not have single static libcudnn_static.a deliverable to link with - export USE_STATIC_CUDNN=0 #for cuda 11.8 include all dynamic loading libraries - DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 /usr/local/cuda-11.8/extras/CUPTI/lib64/libcupti.so.11.8) + DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 
/usr/local/cuda-11.8/extras/CUPTI/lib64/libcupti.so.11.8 /usr/local/cuda/lib64/libcusparseLt.so.0) elif [[ $CUDA_VERSION == 12.1* ]]; then # cuda 12 does not support sm_3x TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST;9.0" # for cuda 12.1 we use cudnn 8.8 and include all dynamic loading libraries - export USE_STATIC_CUDNN=0 - DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 /usr/local/cuda-12.1/extras/CUPTI/lib64/libcupti.so.12) + DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 /usr/local/cuda-12.1/extras/CUPTI/lib64/libcupti.so.12 /usr/local/cuda/lib64/libcusparseLt.so.0) fi if [[ -n "$OVERRIDE_TORCH_CUDA_ARCH_LIST" ]]; then TORCH_CUDA_ARCH_LIST="$OVERRIDE_TORCH_CUDA_ARCH_LIST" diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 05be9c7d76..e56fe7f683 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -98,7 +98,6 @@ build: - _GLIBCXX_USE_CXX11_ABI # [unix] - MAX_JOBS # [unix] - OVERRIDE_TORCH_CUDA_ARCH_LIST - - USE_CUSPARSELT test: imports: From 524a0272df7c6ab2bf004fe74b1fe09c5da6b667 Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Thu, 21 Dec 2023 13:09:05 -0600 Subject: [PATCH 162/212] aarch64: patch mkl-dnn for xbyak crashes due to /sys not accessible (#1648) There are platforms with /sys not mounted. skip handling HW caps for such platforms. 
cherry-pick of: oneapi-src/oneDNN#1773 This fixes the issue# pytorch/pytorch#115482 --- aarch64_linux/aarch64_wheel_ci_build.py | 4 ++ aarch64_linux/build_aarch64_wheel.py | 2 + mkldnn_fix/fix-xbyak-failure.patch | 96 +++++++++++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 mkldnn_fix/fix-xbyak-failure.patch diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index d24b6f2fd8..a57dab5458 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -106,6 +106,10 @@ def parse_arguments(): else: print("build pytorch without mkldnn backend") + # patch mkldnn to fix aarch64 mac and aws lambda crash + print("Applying mkl-dnn patch to fix crash due to /sys not accesible") + os.system("cd /pytorch/third_party/ideep/mkl-dnn && patch -p1 < /builder/mkldnn_fix/fix-xbyak-failure.patch") + os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") pytorch_wheel_name = complete_wheel("pytorch") print(f"Build Compelete. 
Created {pytorch_wheel_name}..") diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index 0ff286ad2d..1615c78a6a 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -556,6 +556,8 @@ def start_build(host: RemoteHost, *, build_ArmComputeLibrary(host, git_clone_flags) print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" + host.run_cmd("cd $HOME && git clone https://github.com/pytorch/builder.git") + host.run_cmd("cd $HOME/pytorch/third_party/ideep/mkl-dnn && patch -p1 < $HOME/builder/mkldnn_fix/fix-xbyak-failure.patch") # noqa: E501 host.run_cmd(f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}") # noqa: E501 print('Repair the wheel') pytorch_wheel_name = host.list_dir("pytorch/dist")[0] diff --git a/mkldnn_fix/fix-xbyak-failure.patch b/mkldnn_fix/fix-xbyak-failure.patch new file mode 100644 index 0000000000..2ad278f0b6 --- /dev/null +++ b/mkldnn_fix/fix-xbyak-failure.patch @@ -0,0 +1,96 @@ +cpu: aarch64: fix xbyak functions for /sys access failures + +There are platforms with /sys not mounted. skip handling HW caps +for such platforms. 
+ +This fixes the issue# pytorch/pytorch#115482 +--- + .../xbyak_aarch64/src/util_impl_linux.h | 24 ++++++++++++++----- + .../aarch64/xbyak_aarch64/src/util_impl_mac.h | 9 ++++--- + 2 files changed, 24 insertions(+), 9 deletions(-) + +diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h +index 2c7b28e58b..860a05700f 100644 +--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h ++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h +@@ -144,8 +144,13 @@ private: + regex_t regexBuf; + regmatch_t match[1]; + +- if (regcomp(®exBuf, regex, REG_EXTENDED) != 0) +- throw ERR_INTERNAL; ++ if (regcomp(®exBuf, regex, REG_EXTENDED) != 0) { ++ /* There are platforms with /sys not mounted. return empty buffers ++ * in these scenarios ++ */ ++ buf[0] = '\0'; ++ return 0; ++ } + + const int retVal = regexec(®exBuf, path, 1, match, 0); + regfree(®exBuf); +@@ -187,8 +192,12 @@ private: + regex_t regexBuf; + regmatch_t match[2]; + +- if (regcomp(®exBuf, "index[0-9]*$", REG_EXTENDED) != 0) +- throw ERR_INTERNAL; ++ if (regcomp(®exBuf, "index[0-9]*$", REG_EXTENDED) != 0) { ++ /* There are platforms with /sys not mounted. return gracefully ++ * in these scenarios ++ */ ++ goto init_and_return_false; ++ } + + if (regexec(®exBuf, dp->d_name, 1, match, 0) == 0) { // Found index[1-9][0-9]. directory + char *dir_name = buf0; +@@ -438,12 +447,15 @@ private: + + FILE *file = fopen(path_midr_el1, "r"); + if (file == nullptr) { +- throw Error(ERR_INTERNAL); ++ /* There are platforms with /sys not mounted. 
return empty buffer ++ * in these scenarios ++ */ ++ cacheInfo_.midr_el1 = 0xFE << 24; + return; + } + + if (fread(buf, sizeof(char), 64, file) == 0) { +- throw Error(ERR_INTERNAL); ++ cacheInfo_.midr_el1 = 0xFE << 24; + return; + } + +diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h +index ebd6dba7c0..93bdae1d7a 100644 +--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h ++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h +@@ -102,18 +102,21 @@ private: + size_t val = 0; + size_t len = sizeof(val); + ++ /* There are platforms with /sys not mounted. skip ++ * handling HW caps for such platforms. ++ */ + if (sysctlbyname(hw_opt_atomics, &val, &len, NULL, 0) != 0) +- throw Error(ERR_INTERNAL); ++ type_ = 0; + else + type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_ATOMIC : 0; + + if (sysctlbyname(hw_opt_fp, &val, &len, NULL, 0) != 0) +- throw Error(ERR_INTERNAL); ++ type_ = 0; + else + type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_FP : 0; + + if (sysctlbyname(hw_opt_neon, &val, &len, NULL, 0) != 0) +- throw Error(ERR_INTERNAL); ++ type_ = 0; + else + type_ |= (val == 1) ? 
(Type)XBYAK_AARCH64_HWCAP_ADVSIMD : 0; + } +-- +2.34.1 + From b9d2b93c08bd256c42f7b1900a07bd958b3ab8ee Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Fri, 22 Dec 2023 22:03:36 -0600 Subject: [PATCH 163/212] Update builder images to ROCm6.0 (#1647) * Update ROCm versions for docker images * Don't build MIOpen from source for ROCm6.0 * Temporarily use magma fork with ROCm6.0 patch * Update ROCm versions for docker images * Add gfx942 * Update MIOpen repo * Magma PR 42 is merged, so use upstream repo master branch now * gfx942 target only fully supported for ROCm6.0 and above --- .github/workflows/build-libtorch-images.yml | 2 +- .github/workflows/build-manywheel-images.yml | 2 +- common/install_miopen.sh | 7 +++++-- common/install_rocm_magma.sh | 2 +- libtorch/build_all_docker.sh | 2 +- libtorch/build_docker.sh | 3 +++ manywheel/build_all_docker.sh | 2 +- manywheel/build_docker.sh | 3 +++ 8 files changed, 16 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index 7968bbb26d..7c8e59f363 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -52,7 +52,7 @@ jobs: runs-on: linux.12xlarge strategy: matrix: - rocm_version: ["5.6", "5.7"] + rocm_version: ["5.7", "6.0"] env: GPU_ARCH_TYPE: rocm GPU_ARCH_VERSION: ${{ matrix.rocm_version }} diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index d717416f63..46056ba141 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -58,7 +58,7 @@ jobs: runs-on: linux.12xlarge strategy: matrix: - rocm_version: ["5.6", "5.7"] + rocm_version: ["5.7", "6.0"] env: GPU_ARCH_TYPE: rocm GPU_ARCH_VERSION: ${{ matrix.rocm_version }} diff --git a/common/install_miopen.sh b/common/install_miopen.sh index 779bc755d4..09ab251b7c 100644 --- 
a/common/install_miopen.sh +++ b/common/install_miopen.sh @@ -56,7 +56,10 @@ MIOPEN_CMAKE_COMMON_FLAGS=" -DMIOPEN_BUILD_DRIVER=OFF " # Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version -if [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 50800 ]]; then +if [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then + echo "ROCm 6.0 MIOpen does not need any patches, do not build from source" + exit 0 +elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then echo "ROCm 5.7 MIOpen does not need any patches, do not build from source" exit 0 elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then @@ -85,7 +88,7 @@ fi yum remove -y miopen-hip -git clone https://github.com/ROCmSoftwarePlatform/MIOpen -b ${MIOPEN_BRANCH} +git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH} pushd MIOpen # remove .git to save disk space since CI runner was running out rm -rf .git diff --git a/common/install_rocm_magma.sh b/common/install_rocm_magma.sh index c37c1e30ab..c8e43f675b 100644 --- a/common/install_rocm_magma.sh +++ b/common/install_rocm_magma.sh @@ -15,7 +15,7 @@ pushd magma if [[ $PYTORCH_BRANCH == "release/1.10.1" ]]; then git checkout magma_ctrl_launch_bounds else - git checkout 28592a7170e4b3707ed92644bf4a689ed600c27f + git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6 fi cp make.inc-examples/make.inc.hip-gcc-mkl make.inc echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc diff --git a/libtorch/build_all_docker.sh b/libtorch/build_all_docker.sh index fb6bd975be..1a3a90d5a9 100755 --- a/libtorch/build_all_docker.sh +++ b/libtorch/build_all_docker.sh @@ -8,6 +8,6 @@ for cuda_version in 12.1 11.8; do GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION="${cuda_version}" "${TOPDIR}/libtorch/build_docker.sh" done -for rocm_version in 5.6 5.7; do +for rocm_version in 5.7 6.0; do GPU_ARCH_TYPE=rocm GPU_ARCH_VERSION="${rocm_version}" "${TOPDIR}/libtorch/build_docker.sh" done diff --git a/libtorch/build_docker.sh b/libtorch/build_docker.sh index 
8997f69cfe..b7ebdd36ec 100755 --- a/libtorch/build_docker.sh +++ b/libtorch/build_docker.sh @@ -36,6 +36,9 @@ case ${GPU_ARCH_TYPE} in echo "ERROR: rocm regex failed" exit 1 fi + if [[ $ROCM_VERSION_INT -ge 60000 ]]; then + PYTORCH_ROCM_ARCH+=";gfx942" + fi DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" ;; *) diff --git a/manywheel/build_all_docker.sh b/manywheel/build_all_docker.sh index 2995e3be76..8a02361cb7 100644 --- a/manywheel/build_all_docker.sh +++ b/manywheel/build_all_docker.sh @@ -16,7 +16,7 @@ for cuda_version in 12.1 11.8; do MANYLINUX_VERSION=2014 GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION="${cuda_version}" "${TOPDIR}/manywheel/build_docker.sh" done -for rocm_version in 5.6 5.7; do +for rocm_version in 5.7 6.0; do GPU_ARCH_TYPE=rocm GPU_ARCH_VERSION="${rocm_version}" "${TOPDIR}/manywheel/build_docker.sh" MANYLINUX_VERSION=2014 GPU_ARCH_TYPE=rocm GPU_ARCH_VERSION="${rocm_version}" "${TOPDIR}/manywheel/build_docker.sh" done diff --git a/manywheel/build_docker.sh b/manywheel/build_docker.sh index e547b42757..63b8e0c3db 100755 --- a/manywheel/build_docker.sh +++ b/manywheel/build_docker.sh @@ -57,6 +57,9 @@ case ${GPU_ARCH_TYPE} in echo "ERROR: rocm regex failed" exit 1 fi + if [[ $ROCM_VERSION_INT -ge 60000 ]]; then + PYTORCH_ROCM_ARCH+=";gfx942" + fi DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9" ;; *) From 770c8275d6ad237fdc637bb97e5f84018baee08b Mon Sep 17 00:00:00 2001 From: cyyever Date: Wed, 27 Dec 2023 23:35:10 +0800 Subject: [PATCH 164/212] Avoid finding out std::basic_string_view (#1528) As pytorch moving to C++17, the binary can contain both "std::basic_string_view" and "std::__cxx11::basic_string<", change the pattern to avoid finding out std::basic_string_view, causing false positives. 
--- check_binary.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_binary.sh b/check_binary.sh index 9a2cf065b4..98a5267ebb 100755 --- a/check_binary.sh +++ b/check_binary.sh @@ -116,7 +116,7 @@ if [[ "$(uname)" != 'Darwin' ]]; then # # To check whether it is using cxx11 ABI, check non-existence of symbol: PRE_CXX11_SYMBOLS=( - "std::basic_string" + "std::basic_string<" "std::list" ) # To check whether it is using pre-cxx11 ABI, check non-existence of symbol: From 27c47026e4108f8e2f02892ab5062a42f28a3160 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 28 Dec 2023 17:56:03 +0000 Subject: [PATCH 165/212] Add test ops validation for validation workflows (#1650) * Add test ops validation * include workflows --- .github/scripts/validate_test_ops.sh | 16 ++++++++++++++++ .github/workflows/validate-binaries.yml | 11 +++++++++++ .github/workflows/validate-linux-binaries.yml | 15 +++++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 .github/scripts/validate_test_ops.sh diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh new file mode 100644 index 0000000000..7c80b601ef --- /dev/null +++ b/.github/scripts/validate_test_ops.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -eux -o pipefail + +retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) +} + +# Clone the Pytorch branch +retry git clone --depth 1 https://github.com/pytorch/pytorch.git +retry git submodule update --init --recursive +pushd pytorch + +# Run test_ops validation +export CUDA_LAUNCH_BLOCKING=1 +python3 test/test_ops.py diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index 8dbe5f27c2..558be8e566 100644 --- a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -27,6 +27,11 @@ on: default: false required: false type: boolean + include-test-ops: + description: 'Include Test Ops tests (only Linux)' + default: false + 
required: false + type: boolean workflow_dispatch: inputs: os: @@ -65,6 +70,11 @@ on: default: "" required: false type: string + include-test-ops: + description: 'Include Test Ops tests (only Linux)' + default: false + required: false + type: boolean jobs: @@ -94,6 +104,7 @@ jobs: torchonly: ${{ inputs.torchonly }} version: ${{ inputs.version }} release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} + include-test-ops: ${{ inputs.include-test-ops }} linux-aarch64: if: inputs.os == 'linux-aarch64' || inputs.os == 'all' diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index d1c6c29bd0..3f652eff8d 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -27,6 +27,11 @@ on: default: "" required: false type: string + include-test-ops: + description: 'Include Test Ops tests (only Linux)' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -58,6 +63,11 @@ on: default: "" required: false type: string + include-test-ops: + description: 'Include Test Ops tests (only Linux)' + default: false + required: false + type: boolean jobs: generate-linux-matrix: @@ -84,6 +94,7 @@ jobs: set -ex export ENV_NAME="conda-env-${{ github.run_id }}" export TORCH_ONLY=${{ inputs.torchonly }} + export INCLUDE_TEST_OPS=${{ inputs.include-test-ops }} export RELEASE_VERSION=${{ inputs.version }} export TARGET_OS="linux" eval "$(conda shell.bash hook)" @@ -97,3 +108,7 @@ jobs: # Standart case: Validate binaries source ./.github/scripts/validate_binaries.sh + + if [[ ${INCLUDE_TEST_OPS} == 'true' ]]; then + source ./.github/scripts/validate_test_ops.sh + fi From b16ac1fa2e64e98f26afaba4e300fe5843156349 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 28 Dec 2023 19:18:21 +0000 Subject: [PATCH 166/212] Add test ops validation for validation workflows (#1651) --- .github/scripts/validate_binaries.sh | 4 ++++ 
.github/workflows/validate-linux-binaries.yml | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 22fc9d1090..916f04f1aa 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -49,6 +49,10 @@ else export PATH=${OLD_PATH} fi + if [[ ${INCLUDE_TEST_OPS} == 'true' ]]; then + source ./.github/scripts/validate_test_ops.sh + fi + conda deactivate conda env remove -n ${ENV_NAME} fi diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 3f652eff8d..74271e3d1c 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -108,7 +108,3 @@ jobs: # Standart case: Validate binaries source ./.github/scripts/validate_binaries.sh - - if [[ ${INCLUDE_TEST_OPS} == 'true' ]]; then - source ./.github/scripts/validate_test_ops.sh - fi From 9c8a8dcce8ede569cada3f05adef755449547ce6 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 28 Dec 2023 20:33:11 +0000 Subject: [PATCH 167/212] Add test ops validation for validation workflows (#1652) --- .github/scripts/validate_test_ops.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index 7c80b601ef..d8031c0712 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -11,6 +11,8 @@ retry git clone --depth 1 https://github.com/pytorch/pytorch.git retry git submodule update --init --recursive pushd pytorch +pip install expecttest pyyaml jinja2 + # Run test_ops validation export CUDA_LAUNCH_BLOCKING=1 python3 test/test_ops.py From 50cb7c5c54556cb1376b5f706de10ea2d8c1ac71 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 28 Dec 2023 21:24:21 +0000 Subject: [PATCH 168/212] Add test ops validation for validation workflows (#1653) --- .github/scripts/validate_test_ops.sh | 11 
+++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index d8031c0712..00bca8d794 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -6,12 +6,19 @@ retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } +BRANCH = "@main" +if [[ ${MATRIX_CHANNEL} == "test" ]] + SHORT_VERSION=${MATRIX_STABLE_VERSION%.*} + BRANCH="@release/${SHORT_VERSION}" +fi + + # Clone the Pytorch branch -retry git clone --depth 1 https://github.com/pytorch/pytorch.git +retry git clone --depth 1 https://github.com/pytorch/pytorch.git${BRANCH} retry git submodule update --init --recursive pushd pytorch -pip install expecttest pyyaml jinja2 +pip install expecttest pyyaml jinja2 packaging # Run test_ops validation export CUDA_LAUNCH_BLOCKING=1 From 3b47169c71361c7b1fe02b349bfc416031f6ba0d Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 28 Dec 2023 21:39:01 +0000 Subject: [PATCH 169/212] Add test ops validation for validation workflows (#1654) --- .github/scripts/validate_test_ops.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index 00bca8d794..547b7340ba 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -6,8 +6,8 @@ retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } -BRANCH = "@main" -if [[ ${MATRIX_CHANNEL} == "test" ]] +BRANCH="@main" +if [[ ${MATRIX_CHANNEL} == "test" ]]; then SHORT_VERSION=${MATRIX_STABLE_VERSION%.*} BRANCH="@release/${SHORT_VERSION}" fi From f2b22ada2f5e028e1683ee18766fa5d3751ad271 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 28 Dec 2023 22:00:04 +0000 Subject: [PATCH 170/212] Add test ops validation for validation workflows (#1655) --- .github/scripts/validate_test_ops.sh | 6 +++--- 1 file changed, 
3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index 547b7340ba..bd724394b0 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -6,15 +6,15 @@ retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } -BRANCH="@main" +BRANCH="" if [[ ${MATRIX_CHANNEL} == "test" ]]; then SHORT_VERSION=${MATRIX_STABLE_VERSION%.*} - BRANCH="@release/${SHORT_VERSION}" + BRANCH="--branch release/${SHORT_VERSION}" fi # Clone the Pytorch branch -retry git clone --depth 1 https://github.com/pytorch/pytorch.git${BRANCH} +retry git clone ${BRANCH} --depth 1 https://github.com/pytorch/pytorch.git retry git submodule update --init --recursive pushd pytorch From b91724c654c6618883823fcef3c1841637fb3fbc Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Thu, 28 Dec 2023 22:27:59 +0000 Subject: [PATCH 171/212] [validations] Add missing required packages (#1656) --- .github/scripts/validate_test_ops.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index bd724394b0..91ef3ffde3 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -18,7 +18,7 @@ retry git clone ${BRANCH} --depth 1 https://github.com/pytorch/pytorch.git retry git submodule update --init --recursive pushd pytorch -pip install expecttest pyyaml jinja2 packaging +pip install expecttest numpy pyyaml jinja2 packaging xmlrunner hypothesis unittest-xml-reporting # Run test_ops validation export CUDA_LAUNCH_BLOCKING=1 From f1e19a7ecdd72804e02e7129f3893a87db68bb77 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 29 Dec 2023 15:15:20 +0000 Subject: [PATCH 172/212] [validations] Perform test_ops only on CUDA binaries (#1657) --- .github/scripts/validate_binaries.sh | 2 +- .github/scripts/validate_test_ops.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 
deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 916f04f1aa..23a411d19c 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -49,7 +49,7 @@ else export PATH=${OLD_PATH} fi - if [[ ${INCLUDE_TEST_OPS} == 'true' ]]; then + if [[ ${INCLUDE_TEST_OPS} == 'true' && ${MATRIX_GPU_ARCH_TYPE} == 'cuda' ]]; then source ./.github/scripts/validate_test_ops.sh fi diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index 91ef3ffde3..60686ee097 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -18,8 +18,8 @@ retry git clone ${BRANCH} --depth 1 https://github.com/pytorch/pytorch.git retry git submodule update --init --recursive pushd pytorch -pip install expecttest numpy pyyaml jinja2 packaging xmlrunner hypothesis unittest-xml-reporting +pip install expecttest numpy pyyaml jinja2 packaging hypothesis unittest-xml-reporting # Run test_ops validation export CUDA_LAUNCH_BLOCKING=1 -python3 test/test_ops.py +python3 test/test_ops.py TestCommonCUDA From 52259ba524d1b5b915b057525b1f360dd996cb95 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 29 Dec 2023 15:50:43 +0000 Subject: [PATCH 173/212] [validations] Adjust timeout for linux jobs (#1658) --- .github/workflows/validate-linux-binaries.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 74271e3d1c..f674afa04e 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -90,6 +90,7 @@ jobs: ref: ${{ inputs.ref || github.ref }} job-name: ${{ matrix.build_name }} binary-matrix: ${{ toJSON(matrix) }} + timeout: 120 script: | set -ex export ENV_NAME="conda-env-${{ github.run_id }}" From 58b0295915e4f1ffe73a1305b06edeadc48e1230 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 29 Dec 2023 
17:11:02 +0000 Subject: [PATCH 174/212] [validations] Restrict testing for python 3.8-3.11 (#1659) --- .github/scripts/validate_binaries.sh | 3 ++- .github/scripts/validate_test_ops.sh | 2 +- .github/workflows/validate-linux-binaries.yml | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 23a411d19c..5039475565 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -49,7 +49,8 @@ else export PATH=${OLD_PATH} fi - if [[ ${INCLUDE_TEST_OPS} == 'true' && ${MATRIX_GPU_ARCH_TYPE} == 'cuda' ]]; then + # We are only interested in CUDA tests and Python 3.8-3.11. Not all requirement libraries are available for 3.12 yet. + if [[ ${INCLUDE_TEST_OPS} == 'true' && ${MATRIX_GPU_ARCH_TYPE} == 'cuda' && ${MATRIX_PYTHON_VERSION} != "3.12" ]]; then source ./.github/scripts/validate_test_ops.sh fi diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index 60686ee097..e93f289180 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -18,7 +18,7 @@ retry git clone ${BRANCH} --depth 1 https://github.com/pytorch/pytorch.git retry git submodule update --init --recursive pushd pytorch -pip install expecttest numpy pyyaml jinja2 packaging hypothesis unittest-xml-reporting +pip install expecttest numpy pyyaml jinja2 packaging hypothesis unittest-xml-reporting scipy # Run test_ops validation export CUDA_LAUNCH_BLOCKING=1 diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index f674afa04e..de5bda9998 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -90,7 +90,7 @@ jobs: ref: ${{ inputs.ref || github.ref }} job-name: ${{ matrix.build_name }} binary-matrix: ${{ toJSON(matrix) }} - timeout: 120 + timeout: 180 script: | set -ex export ENV_NAME="conda-env-${{ 
github.run_id }}" From f2efe21571d9c83b891be121c4f9d50cd5ecdc38 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 29 Dec 2023 20:20:59 +0000 Subject: [PATCH 175/212] [validations] Fix use case if INCLUDE_TEST_OPS is not set (#1660) --- .github/scripts/validate_binaries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 5039475565..042b227679 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -50,7 +50,7 @@ else fi # We are only interested in CUDA tests and Python 3.8-3.11. Not all requirement libraries are available for 3.12 yet. - if [[ ${INCLUDE_TEST_OPS} == 'true' && ${MATRIX_GPU_ARCH_TYPE} == 'cuda' && ${MATRIX_PYTHON_VERSION} != "3.12" ]]; then + if [[ ${INCLUDE_TEST_OPS:-} == 'true' && ${MATRIX_GPU_ARCH_TYPE} == 'cuda' && ${MATRIX_PYTHON_VERSION} != "3.12" ]]; then source ./.github/scripts/validate_test_ops.sh fi From ca784208feabb8a3ed3a50a1d66afa49ee855186 Mon Sep 17 00:00:00 2001 From: Wei Wang <143543872+nWEIdia@users.noreply.github.com> Date: Fri, 5 Jan 2024 06:24:11 -0800 Subject: [PATCH 176/212] Add unit tests and one line reproducers to detect bad pytorch cuda wheels (#1663) * Add one line reproducers and unit tests that would fail when bad wheels were generated by the compiler(s). nextafter reproducer thanks to @malfet! 
* cosmetic fixes * fix comments --- .github/scripts/validate_test_ops.sh | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index e93f289180..e874c75a80 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -20,6 +20,16 @@ pushd pytorch pip install expecttest numpy pyyaml jinja2 packaging hypothesis unittest-xml-reporting scipy -# Run test_ops validation -export CUDA_LAUNCH_BLOCKING=1 -python3 test/test_ops.py TestCommonCUDA +# Run pytorch cuda wheels validation +# Detect ReduceLogicKernel (ReduceOp and kernel) IMA +python test/test_ops.py -k test_dtypes_all_cuda +# Detect BinaryMulKernel (elementwise binary functor internal mul) IMA +python test/test_torch.py -k test_index_reduce_reduce_prod_cuda_int32 +# Detect BinaryBitwiseOpsKernels (at::native::BitwiseAndFunctor) IMA +python test/test_binary_ufuncs.py -k test_contig_vs_every_other___rand___cuda_int32 +# Detect MaxMinElementwiseKernel (maximum) IMA +python test/test_schema_check.py -k test_schema_correctness_clamp_cuda_int8 +# Detect StepKernel (nextafter) IMA +python -c "import torch; print(torch.nextafter(torch.tensor([-4.5149, -5.9053, -0.9516, -2.3615, 1.5591], device='cuda:0'), torch.tensor(3.8075, device='cuda:0')))" +# Detect BinaryGeometricKernels (atan2) IMA +python -c "import torch; x = (torch.randn((2,1,1), dtype=torch.float, device="cuda")*5).to(torch.float32); y=(torch.randn((), dtype=torch.float, device="cuda")*5).to(torch.float32); print(torch.atan2(x,y))" From fe83c2156946c7f7f2b37b358de55ae8b01b1b6e Mon Sep 17 00:00:00 2001 From: Wei Wang <143543872+nWEIdia@users.noreply.github.com> Date: Fri, 5 Jan 2024 16:34:58 -0800 Subject: [PATCH 177/212] Fix quotation issues when migrating from python file to one line format (#1664) Sorry, looks like the last line had an issue while porting it from multi-line python file to one-line. 
Side question: when does this file get used? Is it only used during release binary generation/testing? --- .github/scripts/validate_test_ops.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index e874c75a80..0578d5ed96 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -32,4 +32,4 @@ python test/test_schema_check.py -k test_schema_correctness_clamp_cuda_int8 # Detect StepKernel (nextafter) IMA python -c "import torch; print(torch.nextafter(torch.tensor([-4.5149, -5.9053, -0.9516, -2.3615, 1.5591], device='cuda:0'), torch.tensor(3.8075, device='cuda:0')))" # Detect BinaryGeometricKernels (atan2) IMA -python -c "import torch; x = (torch.randn((2,1,1), dtype=torch.float, device="cuda")*5).to(torch.float32); y=(torch.randn((), dtype=torch.float, device="cuda")*5).to(torch.float32); print(torch.atan2(x,y))" +python -c "import torch; x = (torch.randn((2,1,1), dtype=torch.float, device='cuda')*5).to(torch.float32); y=(torch.randn((), dtype=torch.float, device='cuda')*5).to(torch.float32); print(torch.atan2(x,y))" From 04ef1bf031cc8b75d5d7f8a5e7a0832d5a0a46da Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 8 Jan 2024 16:53:56 +0000 Subject: [PATCH 178/212] Add nccl version print for cuda related smoke test (#1667) --- test/smoke_test/smoke_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index 14e04d366c..c518f15c8f 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -163,6 +163,7 @@ def smoke_test_cuda(package: str, runtime_error_check: str) -> None: f"Wrong CUDA version. 
Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}" ) print(f"torch cuda: {torch.version.cuda}") + print(f"torch nccl version: {torch.cuda.nccl.version()}" ) # todo add cudnn version validation print(f"torch cudnn: {torch.backends.cudnn.version()}") print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") From 588ab91d6b344fab54c83f1c34e0021e8772c6fc Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 8 Jan 2024 21:37:08 +0000 Subject: [PATCH 179/212] Apply nccl test to linux only (#1669) --- test/smoke_test/smoke_test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/smoke_test/smoke_test.py b/test/smoke_test/smoke_test.py index c518f15c8f..cf78eb9e13 100644 --- a/test/smoke_test/smoke_test.py +++ b/test/smoke_test/smoke_test.py @@ -163,14 +163,17 @@ def smoke_test_cuda(package: str, runtime_error_check: str) -> None: f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}" ) print(f"torch cuda: {torch.version.cuda}") - print(f"torch nccl version: {torch.cuda.nccl.version()}" ) # todo add cudnn version validation print(f"torch cudnn: {torch.backends.cudnn.version()}") print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") # torch.compile is available only on Linux and python 3.8-3.11 - if (sys.platform in ["linux", "linux2"]) and sys.version_info < (3, 12, 0): - smoke_test_compile() + # nccl is availbale only on Linux + if (sys.platform in ["linux", "linux2"]): + print(f"torch nccl version: {torch.cuda.nccl.version()}") + + if(sys.version_info < (3, 12, 0)): + smoke_test_compile() if runtime_error_check == "enabled": test_cuda_runtime_errors_captured() From 4c758b39db39b80c4b0ab43a83e3e519bdce4565 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 9 Jan 2024 01:54:08 +0000 Subject: [PATCH 180/212] Build nccl after installing cuda (#1670) Fix: https://github.com/pytorch/pytorch/issues/116977 Nccl 2.19.3 don't exist for cuda 11.8 and cuda 12.1. 
Refer to https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-19-3.html#rel_2-19-3 CUDA 12.0, 12.2, 12.3 are supported. Hence we do a manual build. Follow this build process: https://github.com/NVIDIA/nccl/tree/v2.19.3-1?tab=readme-ov-file#build We want the nccl version to be exactly the same as the one installed here: https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py#L45 --- common/install_cuda.sh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/common/install_cuda.sh b/common/install_cuda.sh index 35babf576e..01ed13def7 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -33,13 +33,13 @@ function install_118 { rm -rf tmp_cudnn # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses - mkdir tmp_nccl && cd tmp_nccl - wget -q https://developer.download.nvidia.com/compute/redist/nccl/v2.15.5/nccl_2.15.5-1+cuda11.8_x86_64.txz - tar xf nccl_2.15.5-1+cuda11.8_x86_64.txz - cp -a nccl_2.15.5-1+cuda11.8_x86_64/include/* /usr/local/cuda/include/ - cp -a nccl_2.15.5-1+cuda11.8_x86_64/lib/* /usr/local/cuda/lib64/ + # Follow build: https://github.com/NVIDIA/nccl/tree/v2.19.3-1?tab=readme-ov-file#build + git clone -b v2.19.3-1 --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl && make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ cd .. 
- rm -rf tmp_nccl + rm -rf nccl install_cusparselt_040 @@ -66,13 +66,13 @@ function install_121 { rm -rf tmp_cudnn # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses - mkdir tmp_nccl && cd tmp_nccl - wget -q https://developer.download.nvidia.com/compute/redist/nccl/v2.18.1/nccl_2.18.1-1+cuda12.1_x86_64.txz - tar xf nccl_2.18.1-1+cuda12.1_x86_64.txz - cp -a nccl_2.18.1-1+cuda12.1_x86_64/include/* /usr/local/cuda/include/ - cp -a nccl_2.18.1-1+cuda12.1_x86_64/lib/* /usr/local/cuda/lib64/ + # Follow build: https://github.com/NVIDIA/nccl/tree/v2.19.3-1?tab=readme-ov-file#build + git clone -b v2.19.3-1 --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl && make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ cd .. - rm -rf tmp_nccl + rm -rf nccl install_cusparselt_040 From 53b5b02311b3a736e30e17a656c63de9c78c45c5 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 9 Jan 2024 18:06:23 -0500 Subject: [PATCH 181/212] Update cusparselt to v0.5.2 (#1672) This PR adds in support for cuSPARSELt v0.5.2 and updates the cuda 12.1 build step to use it instead of 0.4.0 Also fixes a typo when deleting the cusparselt folder after installing. 
--- common/install_cuda.sh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/common/install_cuda.sh b/common/install_cuda.sh index 01ed13def7..a62831f7d0 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -10,11 +10,22 @@ function install_cusparselt_040 { cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/ cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/ popd - rm -rf tmp_custparselt + rm -rf tmp_cusparselt +} + +function install_cusparselt_052 { + # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html + mkdir tmp_cusparselt && pushd tmp_cusparselt + wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz + tar xf libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz + cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/include/* /usr/local/cuda/include/ + cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_cusparselt } function install_118 { - echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15 and cuSparseLt-0.5.0" + echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15 and cuSparseLt-0.4.0" rm -rf /usr/local/cuda-11.8 /usr/local/cuda # install CUDA 11.8.0 in the same container wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run @@ -47,7 +58,7 @@ function install_118 { } function install_121 { - echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.18.1 and cuSparseLt-0.5.0" + echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.18.1 and cuSparseLt-0.5.2" rm -rf /usr/local/cuda-12.1 /usr/local/cuda # install CUDA 12.1.0 in the same container wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run @@ -74,7 +85,7 @@ function install_121 { cd .. 
rm -rf nccl - install_cusparselt_040 + install_cusparselt_052 ldconfig } From 1d1f352b852253ae880c129ce2d019904f113bf6 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 12 Jan 2024 21:11:39 +0000 Subject: [PATCH 182/212] Run test ops tests from outside of pytorch root folder (#1676) --- .github/scripts/validate_test_ops.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index 0578d5ed96..12963f289b 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -20,7 +20,7 @@ pushd pytorch pip install expecttest numpy pyyaml jinja2 packaging hypothesis unittest-xml-reporting scipy -# Run pytorch cuda wheels validation +# Run pytorch cuda wheels validation # Detect ReduceLogicKernel (ReduceOp and kernel) IMA python test/test_ops.py -k test_dtypes_all_cuda # Detect BinaryMulKernel (elementwise binary functor internal mul) IMA @@ -29,7 +29,10 @@ python test/test_torch.py -k test_index_reduce_reduce_prod_cuda_int32 python test/test_binary_ufuncs.py -k test_contig_vs_every_other___rand___cuda_int32 # Detect MaxMinElementwiseKernel (maximum) IMA python test/test_schema_check.py -k test_schema_correctness_clamp_cuda_int8 + +pushd /tmp # Detect StepKernel (nextafter) IMA python -c "import torch; print(torch.nextafter(torch.tensor([-4.5149, -5.9053, -0.9516, -2.3615, 1.5591], device='cuda:0'), torch.tensor(3.8075, device='cuda:0')))" # Detect BinaryGeometricKernels (atan2) IMA python -c "import torch; x = (torch.randn((2,1,1), dtype=torch.float, device='cuda')*5).to(torch.float32); y=(torch.randn((), dtype=torch.float, device='cuda')*5).to(torch.float32); print(torch.atan2(x,y))" +popd From 9870b250419df4fc8267ded3746cf03fcfa62674 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 15 Jan 2024 21:50:38 +0000 Subject: [PATCH 183/212] Remove s3 update html job and scripts (#1677) --- .github/workflows/update-s3-html.yml | 35 -- 
.lintrunner.toml | 2 +- s3_management/Dockerfile | 6 - s3_management/Makefile | 9 - s3_management/README.md | 21 -- s3_management/backup_conda.py | 73 ---- s3_management/manage.py | 517 --------------------------- s3_management/requirements.txt | 2 - 8 files changed, 1 insertion(+), 664 deletions(-) delete mode 100644 .github/workflows/update-s3-html.yml delete mode 100644 s3_management/Dockerfile delete mode 100644 s3_management/Makefile delete mode 100644 s3_management/README.md delete mode 100644 s3_management/backup_conda.py delete mode 100644 s3_management/manage.py delete mode 100644 s3_management/requirements.txt diff --git a/.github/workflows/update-s3-html.yml b/.github/workflows/update-s3-html.yml deleted file mode 100644 index 7c285418ef..0000000000 --- a/.github/workflows/update-s3-html.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Update S3 HTML indices for download.pytorch.org - -on: - schedule: - # Update the indices every 30 minutes - - cron: "*/30 * * * *" - workflow_dispatch: - -jobs: - update: - strategy: - matrix: - prefix: ["whl", "whl/test", "whl/nightly", "whl/lts/1.8"] - fail-fast: False - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - secrets: inherit - with: - repository: pytorch/builder - timeout: 60 - secrets-env: AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY - script: | - set -ex - - # Create Conda Environment - git config --global --add safe.directory /__w/builder/builder - conda create --quiet -y --prefix run_env python="3.8" - conda activate ./run_env - - # Set Envs - export AWS_ACCESS_KEY_ID="${SECRET_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${SECRET_AWS_SECRET_ACCESS_KEY}" - - # Install requirements - pip install -r s3_management/requirements.txt - python s3_management/manage.py --generate-pep503 ${{ matrix.prefix }} diff --git a/.lintrunner.toml b/.lintrunner.toml index fdfca4cef4..b229976833 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -2,7 +2,7 @@ merge_base_with = "origin/main" [[linter]] code = 
'RUFF' -include_patterns = ['test/smoke_test/*.py', 's3_management/*.py', 'aarch64_linux/*.py'] +include_patterns = ['test/smoke_test/*.py', 'aarch64_linux/*.py'] command = [ 'python3', 'tools/linter/adapters/ruff_linter.py', diff --git a/s3_management/Dockerfile b/s3_management/Dockerfile deleted file mode 100644 index def716f75e..0000000000 --- a/s3_management/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM python:3.8 -WORKDIR /work -ADD requirements.txt . -RUN pip install -r requirements.txt -ADD manage.py . -ENTRYPOINT ["python", "/work/manage.py"] diff --git a/s3_management/Makefile b/s3_management/Makefile deleted file mode 100644 index e9e4699d12..0000000000 --- a/s3_management/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -DIR:=$(strip $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))) - -.PHONY: build-image -build-image: requirements.txt manage.py - docker build -t pytorch/manage_s3_html "$(DIR)" - -.PHONY: push-image -push-image: build-image - docker push pytorch/manage_s3_html diff --git a/s3_management/README.md b/s3_management/README.md deleted file mode 100644 index edc87691bd..0000000000 --- a/s3_management/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# s3_management - -This directory houses scripts to maintain the s3 HTML indices for https://download.pytorch.org/whl - -## Building the image - -``` -make build-image -``` - -## Pushing the image - -``` -make push-image -``` - -## Running the image - -``` -docker run --rm -it -e AWS_SECRET_ACCESS_KEY -e AWS_ACCESS_KEY_ID pytorch/manage_s3_html all -``` diff --git a/s3_management/backup_conda.py b/s3_management/backup_conda.py deleted file mode 100644 index 7dafa32b46..0000000000 --- a/s3_management/backup_conda.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -# Downloads domain pytorch and library packages from channel -# And backs them up to S3 -# Do not use unless you know what you are doing -# Usage: python backup_conda.py --version 1.6.0 - -import boto3 -from typing import List, Optional 
-import conda.api -import urllib -import os -import hashlib -import argparse - -S3 = boto3.resource('s3') -BUCKET = S3.Bucket('pytorch-backup') -_known_subdirs = ["linux-64", "osx-64", "osx-arm64", "win-64"] - - -def compute_md5(path:str) -> str: - with open(path, "rb") as f: - return hashlib.md5(f.read()).hexdigest() - - -def download_conda_package(package:str, version:Optional[str] = None, - depends:Optional[str] = None, channel:Optional[str] = None) -> List[str]: - packages = conda.api.SubdirData.query_all(package, - channels = [channel] if channel is not None else None, - subdirs = _known_subdirs) - rc = [] - - for pkg in packages: - if version is not None and pkg.version != version: - continue - if depends is not None and depends not in pkg.depends: - continue - - print(f"Downloading {pkg.url}...") - os.makedirs(pkg.subdir, exist_ok = True) - fname = f"{pkg.subdir}/{pkg.fn}" - if not os.path.exists(fname): - with open(fname, "wb") as f, urllib.request.urlopen(pkg.url) as url: - f.write(url.read()) - if compute_md5(fname) != pkg.md5: - print(f"md5 of {fname} is {compute_md5(fname)} does not match {pkg.md5}") - continue - rc.append(fname) - - return rc - -def upload_to_s3(prefix: str, fnames: List[str]) -> None: - for fname in fnames: - BUCKET.upload_file(fname, f"{prefix}/{fname}") - print(fname) - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--version", - help="PyTorch Version to backup", - type=str, - required = True - ) - options = parser.parse_args() - rc = download_conda_package("pytorch", channel = "pytorch", version = options.version) - upload_to_s3(f"v{options.version}/conda", rc) - - for libname in ["torchvision", "torchaudio", "torchtext"]: - print(f"processing {libname}") - rc = download_conda_package(libname, channel = "pytorch", depends = f"pytorch {options.version}") - upload_to_s3(f"v{options.version}/conda", rc) diff --git a/s3_management/manage.py b/s3_management/manage.py deleted file mode 
100644 index 851a848c66..0000000000 --- a/s3_management/manage.py +++ /dev/null @@ -1,517 +0,0 @@ -#!/usr/bin/env python - -import argparse -import base64 -import concurrent.futures -import dataclasses -import functools -import time - -from os import path, makedirs -from datetime import datetime -from collections import defaultdict -from typing import Iterable, List, Type, Dict, Set, TypeVar, Optional -from re import sub, match, search -from packaging.version import parse as _parse_version, Version, InvalidVersion - -import boto3 - - -S3 = boto3.resource('s3') -CLIENT = boto3.client('s3') -BUCKET = S3.Bucket('pytorch') - -ACCEPTED_FILE_EXTENSIONS = ("whl", "zip", "tar.gz") -ACCEPTED_SUBDIR_PATTERNS = [ - r"cu[0-9]+", # for cuda - r"rocm[0-9]+\.[0-9]+", # for rocm - "cpu", -] -PREFIXES_WITH_HTML = { - "whl": "torch_stable.html", - "whl/lts/1.8": "torch_lts.html", - "whl/nightly": "torch_nightly.html", - "whl/test": "torch_test.html", - "libtorch": "index.html", - "libtorch/nightly": "index.html", -} - -# NOTE: This refers to the name on the wheels themselves and not the name of -# package as specified by setuptools, for packages with "-" (hyphens) in their -# names you need to convert them to "_" (underscores) in order for them to be -# allowed here since the name of the wheels is compared here -PACKAGE_ALLOW_LIST = { - "Pillow", - "certifi", - "charset_normalizer", - "cmake", - "colorama", - "fbgemm_gpu", - "filelock", - "fsspec", - "idna", - "Jinja2", - "lit", - "MarkupSafe", - "mpmath", - "nestedtensor", - "networkx", - "numpy", - "nvidia_cublas_cu11", - "nvidia_cuda_cupti_cu11", - "nvidia_cuda_nvrtc_cu11", - "nvidia_cuda_runtime_cu11", - "nvidia_cudnn_cu11", - "nvidia_cufft_cu11", - "nvidia_curand_cu11", - "nvidia_cusolver_cu11", - "nvidia_cusparse_cu11", - "nvidia_nccl_cu11", - "nvidia_nvtx_cu11", - "nvidia_cublas_cu12", - "nvidia_cuda_cupti_cu12", - "nvidia_cuda_nvrtc_cu12", - "nvidia_cuda_runtime_cu12", - "nvidia_cudnn_cu12", - "nvidia_cufft_cu12", - 
"nvidia_curand_cu12", - "nvidia_cusolver_cu12", - "nvidia_cusparse_cu12", - "nvidia_nccl_cu12", - "nvidia_nvtx_cu12", - "nvidia_nvjitlink_cu12", - "packaging", - "portalocker", - "pytorch_triton", - "pytorch_triton_rocm", - "requests", - "sympy", - "torch", - "torch_tensorrt", - "torcharrow", - "torchaudio", - "torchcsprng", - "torchdata", - "torchdistx", - "torchmetrics", - "torchrec", - "torchtext", - "torchvision", - "triton", - "tqdm", - "typing_extensions", - "urllib3", - "xformers", -} - -# Should match torch-2.0.0.dev20221221+cu118-cp310-cp310-linux_x86_64.whl as: -# Group 1: torch-2.0.0.dev -# Group 2: 20221221 -PACKAGE_DATE_REGEX = r"([a-zA-z]*-[0-9.]*.dev)([0-9]*)" - -# How many packages should we keep of a specific package? -KEEP_THRESHOLD = 60 - -# TODO (huydhn): Clean this up once ExecuTorch has a new stable release that -# match PyTorch stable release cadence. This nightly version is currently -# referred to publicly in ExecuTorch alpha 0.1 release. So we want to keep -# nightly binaries around for now -KEEP_NIGHTLY_PACKAGES_FOR_EXECUTORCH = {datetime(2023, 10, 10, 0, 0)} - -S3IndexType = TypeVar('S3IndexType', bound='S3Index') - - -@dataclasses.dataclass(frozen=False) -@functools.total_ordering -class S3Object: - key: str - orig_key: str - checksum: Optional[str] - size: Optional[int] - - def __hash__(self): - return hash(self.key) - - def __str__(self): - return self.key - - def __eq__(self, other): - return self.key == other.key - - def __lt__(self, other): - return self.key < other.key - - -def extract_package_build_time(full_package_name: str) -> datetime: - result = search(PACKAGE_DATE_REGEX, full_package_name) - if result is not None: - try: - return datetime.strptime(result.group(2), "%Y%m%d") - except ValueError: - # Ignore any value errors since they probably shouldn't be hidden anyways - pass - return datetime.now() - - -def between_bad_dates(package_build_time: datetime): - start_bad = datetime(year=2022, month=8, day=17) - end_bad = 
datetime(year=2022, month=12, day=30) - return start_bad <= package_build_time <= end_bad - - -def safe_parse_version(ver_str: str) -> Version: - try: - return _parse_version(ver_str) - except InvalidVersion: - return Version("0.0.0") - - - -class S3Index: - def __init__(self: S3IndexType, objects: List[S3Object], prefix: str) -> None: - self.objects = objects - self.prefix = prefix.rstrip("/") - self.html_name = PREFIXES_WITH_HTML[self.prefix] - # should dynamically grab subdirectories like whl/test/cu101 - # so we don't need to add them manually anymore - self.subdirs = { - path.dirname(obj.key) for obj in objects if path.dirname != prefix - } - - def nightly_packages_to_show(self: S3IndexType) -> List[S3Object]: - """Finding packages to show based on a threshold we specify - - Basically takes our S3 packages, normalizes the version for easier - comparisons, then iterates over normalized versions until we reach a - threshold and then starts adding package to delete after that threshold - has been reached - - After figuring out what versions we'd like to hide we iterate over - our original object list again and pick out the full paths to the - packages that are included in the list of versions to delete - """ - # also includes versions without GPU specifier (i.e. 
cu102) for easier - # sorting, sorts in reverse to put the most recent versions first - all_sorted_packages = sorted( - {self.normalize_package_version(obj) for obj in self.objects}, - key=lambda name_ver: safe_parse_version(name_ver.split('-', 1)[-1]), - reverse=True, - ) - packages: Dict[str, int] = defaultdict(int) - to_hide: Set[str] = set() - for obj in all_sorted_packages: - full_package_name = path.basename(obj) - package_name = full_package_name.split('-')[0] - package_build_time = extract_package_build_time(full_package_name) - # Hard pass on packages that are included in our allow list - if package_name not in PACKAGE_ALLOW_LIST: - to_hide.add(obj) - continue - if package_build_time not in KEEP_NIGHTLY_PACKAGES_FOR_EXECUTORCH and ( - packages[package_name] >= KEEP_THRESHOLD - or between_bad_dates(package_build_time) - ): - to_hide.add(obj) - else: - packages[package_name] += 1 - return list(set(self.objects).difference({ - obj for obj in self.objects - if self.normalize_package_version(obj) in to_hide - })) - - def is_obj_at_root(self, obj: S3Object) -> bool: - return path.dirname(obj.key) == self.prefix - - def _resolve_subdir(self, subdir: Optional[str] = None) -> str: - if not subdir: - subdir = self.prefix - # make sure we strip any trailing slashes - return subdir.rstrip("/") - - def gen_file_list( - self, - subdir: Optional[str] = None, - package_name: Optional[str] = None - ) -> Iterable[S3Object]: - objects = self.objects - subdir = self._resolve_subdir(subdir) + '/' - for obj in objects: - if package_name is not None and self.obj_to_package_name(obj) != package_name: - continue - if self.is_obj_at_root(obj) or obj.key.startswith(subdir): - yield obj - - def get_package_names(self, subdir: Optional[str] = None) -> List[str]: - return sorted({self.obj_to_package_name(obj) for obj in self.gen_file_list(subdir)}) - - def normalize_package_version(self: S3IndexType, obj: S3Object) -> str: - # removes the GPU specifier from the package name as well as 
- # unnecessary things like the file extension, architecture name, etc. - return sub( - r"%2B.*", - "", - "-".join(path.basename(obj.key).split("-")[:2]) - ) - - def obj_to_package_name(self, obj: S3Object) -> str: - return path.basename(obj.key).split('-', 1)[0] - - def to_legacy_html( - self, - subdir: Optional[str] = None - ) -> str: - """Generates a string that can be used as the HTML index - - Takes our objects and transforms them into HTML that have historically - been used by pip for installing pytorch. - - NOTE: These are not PEP 503 compliant but are here for legacy purposes - """ - out: List[str] = [] - subdir = self._resolve_subdir(subdir) - is_root = subdir == self.prefix - for obj in self.gen_file_list(subdir): - # Strip our prefix - sanitized_obj = obj.key.replace(subdir, "", 1) - if sanitized_obj.startswith('/'): - sanitized_obj = sanitized_obj.lstrip("/") - # we include objects at our root prefix so that users can still - # install packages like torchaudio / torchtext even if they want - # to install a specific GPU arch of torch / torchvision - if not is_root and self.is_obj_at_root(obj): - # strip root prefix - sanitized_obj = obj.key.replace(self.prefix, "", 1).lstrip("/") - sanitized_obj = f"../{sanitized_obj}" - out.append(f'{sanitized_obj}
') - return "\n".join(sorted(out)) - - def to_simple_package_html( - self, - subdir: Optional[str], - package_name: str - ) -> str: - """Generates a string that can be used as the package simple HTML index - """ - out: List[str] = [] - # Adding html header - out.append('') - out.append('') - out.append(' ') - out.append('

Links for {}

'.format(package_name.lower().replace("_", "-"))) - for obj in sorted(self.gen_file_list(subdir, package_name)): - maybe_fragment = f"#sha256={obj.checksum}" if obj.checksum else "" - out.append(f' {path.basename(obj.key).replace("%2B","+")}
') - # Adding html footer - out.append(' ') - out.append('') - out.append(f'') - return '\n'.join(out) - - def to_simple_packages_html( - self, - subdir: Optional[str], - ) -> str: - """Generates a string that can be used as the simple HTML index - """ - out: List[str] = [] - # Adding html header - out.append('') - out.append('') - out.append(' ') - for pkg_name in sorted(self.get_package_names(subdir)): - out.append(f' {pkg_name.replace("_","-")}
') - # Adding html footer - out.append(' ') - out.append('') - out.append(f'') - return '\n'.join(out) - - def upload_legacy_html(self) -> None: - for subdir in self.subdirs: - print(f"INFO Uploading {subdir}/{self.html_name}") - BUCKET.Object( - key=f"{subdir}/{self.html_name}" - ).put( - ACL='public-read', - CacheControl='no-cache,no-store,must-revalidate', - ContentType='text/html', - Body=self.to_legacy_html(subdir=subdir) - ) - - def upload_pep503_htmls(self) -> None: - for subdir in self.subdirs: - print(f"INFO Uploading {subdir}/index.html") - BUCKET.Object( - key=f"{subdir}/index.html" - ).put( - ACL='public-read', - CacheControl='no-cache,no-store,must-revalidate', - ContentType='text/html', - Body=self.to_simple_packages_html(subdir=subdir) - ) - for pkg_name in self.get_package_names(subdir=subdir): - compat_pkg_name = pkg_name.lower().replace("_", "-") - print(f"INFO Uploading {subdir}/{compat_pkg_name}/index.html") - BUCKET.Object( - key=f"{subdir}/{compat_pkg_name}/index.html" - ).put( - ACL='public-read', - CacheControl='no-cache,no-store,must-revalidate', - ContentType='text/html', - Body=self.to_simple_package_html(subdir=subdir, package_name=pkg_name) - ) - - def save_legacy_html(self) -> None: - for subdir in self.subdirs: - print(f"INFO Saving {subdir}/{self.html_name}") - makedirs(subdir, exist_ok=True) - with open(path.join(subdir, self.html_name), mode="w", encoding="utf-8") as f: - f.write(self.to_legacy_html(subdir=subdir)) - - def save_pep503_htmls(self) -> None: - for subdir in self.subdirs: - print(f"INFO Saving {subdir}/index.html") - makedirs(subdir, exist_ok=True) - with open(path.join(subdir, "index.html"), mode="w", encoding="utf-8") as f: - f.write(self.to_simple_packages_html(subdir=subdir)) - for pkg_name in self.get_package_names(subdir=subdir): - makedirs(path.join(subdir, pkg_name), exist_ok=True) - with open(path.join(subdir, pkg_name, "index.html"), mode="w", encoding="utf-8") as f: - 
f.write(self.to_simple_package_html(subdir=subdir, package_name=pkg_name)) - - def compute_sha256(self) -> None: - for obj in self.objects: - if obj.checksum is not None: - continue - print(f"Updating {obj.orig_key} of size {obj.size} with SHA256 checksum") - s3_obj = BUCKET.Object(key=obj.orig_key) - s3_obj.copy_from(CopySource={"Bucket": BUCKET.name, "Key": obj.orig_key}, - Metadata=s3_obj.metadata, MetadataDirective="REPLACE", - ACL="public-read", - ChecksumAlgorithm="SHA256") - - @classmethod - def has_public_read(cls: Type[S3IndexType], key: str) -> bool: - def is_all_users_group(o) -> bool: - return o.get("Grantee", {}).get("URI") == "http://acs.amazonaws.com/groups/global/AllUsers" - - def can_read(o) -> bool: - return o.get("Permission") in ["READ", "FULL_CONTROL"] - - acl_grants = CLIENT.get_object_acl(Bucket=BUCKET.name, Key=key)["Grants"] - return any(is_all_users_group(x) and can_read(x) for x in acl_grants) - - @classmethod - def grant_public_read(cls: Type[S3IndexType], key: str) -> None: - CLIENT.put_object_acl(Bucket=BUCKET.name, Key=key, ACL="public-read") - - @classmethod - def fetch_object_names(cls: Type[S3IndexType], prefix: str) -> List[str]: - obj_names = [] - for obj in BUCKET.objects.filter(Prefix=prefix): - is_acceptable = any([path.dirname(obj.key) == prefix] + [ - match( - f"{prefix}/{pattern}", - path.dirname(obj.key) - ) - for pattern in ACCEPTED_SUBDIR_PATTERNS - ]) and obj.key.endswith(ACCEPTED_FILE_EXTENSIONS) - if not is_acceptable: - continue - obj_names.append(obj.key) - return obj_names - - def fetch_metadata(self: S3IndexType) -> None: - # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible. 
- with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor: - for idx, future in { - idx: executor.submit( - lambda key: CLIENT.head_object( - Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled" - ), - obj.orig_key, - ) - for (idx, obj) in enumerate(self.objects) - if obj.size is None - }.items(): - response = future.result() - sha256 = (_b64 := response.get("ChecksumSHA256")) and base64.b64decode(_b64).hex() - # For older files, rely on checksum-sha256 metadata that can be added to the file later - if sha256 is None: - sha256 = response.get("Metadata", {}).get("checksum-sha256") - self.objects[idx].checksum = sha256 - if size := response.get("ContentLength"): - self.objects[idx].size = int(size) - - @classmethod - def from_S3(cls: Type[S3IndexType], prefix: str, with_metadata: bool = True) -> S3IndexType: - prefix = prefix.rstrip("/") - obj_names = cls.fetch_object_names(prefix) - - def sanitize_key(key: str) -> str: - return key.replace("+", "%2B") - - rc = cls([S3Object(key=sanitize_key(key), - orig_key=key, - checksum=None, - size=None) for key in obj_names], prefix) - if prefix == "whl/nightly": - rc.objects = rc.nightly_packages_to_show() - if with_metadata: - rc.fetch_metadata() - return rc - - @classmethod - def undelete_prefix(cls: Type[S3IndexType], prefix: str) -> None: - paginator = CLIENT.get_paginator("list_object_versions") - for page in paginator.paginate(Bucket=BUCKET.name, Prefix=prefix): - for obj in page.get("DeleteMarkers", []): - if not obj.get("IsLatest"): - continue - obj_key, obj_version_id = obj["Key"], obj["VersionId"] - obj_ver = S3.ObjectVersion(BUCKET.name, obj_key, obj_version_id) - print(f"Undeleting {obj_key} deleted on {obj['LastModified']}") - obj_ver.delete() - - -def create_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser("Manage S3 HTML indices for PyTorch") - parser.add_argument( - "prefix", - type=str, - choices=list(PREFIXES_WITH_HTML.keys()) + ["all"] - ) - 
parser.add_argument("--do-not-upload", action="store_true") - parser.add_argument("--generate-pep503", action="store_true") - parser.add_argument("--compute-sha256", action="store_true") - return parser - - -def main() -> None: - parser = create_parser() - args = parser.parse_args() - action = "Saving indices" if args.do_not_upload else "Uploading indices" - if args.compute_sha256: - action = "Computing checksums" - - prefixes = PREFIXES_WITH_HTML if args.prefix == 'all' else [args.prefix] - for prefix in prefixes: - print(f"INFO: {action} for '{prefix}'") - stime = time.time() - idx = S3Index.from_S3(prefix=prefix, with_metadata=args.generate_pep503 or args.compute_sha256) - etime = time.time() - print(f"DEBUG: Fetched {len(idx.objects)} objects for '{prefix}' in {etime-stime:.2f} seconds") - if args.compute_sha256: - idx.compute_sha256() - elif args.do_not_upload: - idx.save_legacy_html() - if args.generate_pep503: - idx.save_pep503_htmls() - else: - idx.upload_legacy_html() - if args.generate_pep503: - idx.upload_pep503_htmls() - - -if __name__ == "__main__": - main() diff --git a/s3_management/requirements.txt b/s3_management/requirements.txt deleted file mode 100644 index fa23e39b1b..0000000000 --- a/s3_management/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -boto3==1.28.53 -packaging==21.3 From f7d8ebd106818e4c36368204700edb8c8d15e42f Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 15 Jan 2024 22:44:03 +0000 Subject: [PATCH 184/212] [BE] Remove unused nightly_defaults.bat (#1678) --- windows/internal/auth.bat | 46 ------ windows/internal/nightly_defaults.bat | 201 -------------------------- windows/internal/publish.bat | 97 ------------- windows/internal/upload.bat | 96 ------------ 4 files changed, 440 deletions(-) delete mode 100644 windows/internal/auth.bat delete mode 100644 windows/internal/nightly_defaults.bat delete mode 100644 windows/internal/publish.bat delete mode 100644 windows/internal/upload.bat diff --git a/windows/internal/auth.bat 
b/windows/internal/auth.bat deleted file mode 100644 index c874bce493..0000000000 --- a/windows/internal/auth.bat +++ /dev/null @@ -1,46 +0,0 @@ -@echo off - -: From the following doc, the build won't be triggered if the users don't sign in daily. -: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?tabs=yaml&view=vsts#my-build-didnt-run-what-happened -: To avoid this problem, we can just go through the sign in process using the following command. - -:auth_start - -if "%RETRY_TIMES%" == "" ( - set /a RETRY_TIMES=10 - set /a SLEEP_TIME=2 -) else ( - set /a RETRY_TIMES=%RETRY_TIMES%-1 - set /a SLEEP_TIME=%SLEEP_TIME%*2 -) - -for /f "usebackq tokens=*" %%i in (`curl -so NUL -w "%%{http_code}" -u %VSTS_AUTH% https://dev.azure.com/pytorch`) do ( - set STATUS_CODE=%%i -) - -IF NOT "%STATUS_CODE%" == "200" ( - echo Auth retry times remaining: %RETRY_TIMES% - echo Sleep time: %SLEEP_TIME% seconds - IF %RETRY_TIMES% EQU 0 ( - echo Auth failed - goto err - ) - waitfor SomethingThatIsNeverHappening /t %SLEEP_TIME% 2>nul || ver >nul - goto auth_start -) ELSE ( - echo Login Attempt Succeeded - goto auth_end -) - -:err - -: Throw a warning if it fails -powershell -c "Write-Warning 'Login Attempt Failed'" - -:auth_end - -set RETRY_TIMES= -set SLEEP_TIME= -set STATUS_CODE= - -exit /b 0 diff --git a/windows/internal/nightly_defaults.bat b/windows/internal/nightly_defaults.bat deleted file mode 100644 index e74d55e0bb..0000000000 --- a/windows/internal/nightly_defaults.bat +++ /dev/null @@ -1,201 +0,0 @@ -@echo off - -if "%~1"=="" goto arg_error -if NOT "%~2"=="" goto arg_error -goto arg_end - -:arg_error - -echo Illegal number of parameters. Pass packge type `Conda` or `Wheels`. -exit /b 1 - -:arg_end - -echo "nightly_defaults.bat at %CD% starting at %DATE%" - -set SRC_DIR=%~dp0\.. - -:: NIGHTLIES_FOLDER -:: N.B. 
this is also defined in cron_start.sh -:: An arbitrary root folder to store all nightlies folders, each of which is a -:: parent level date folder with separate subdirs for logs, wheels, conda -:: packages, etc. This should be kept the same across all scripts called in a -:: cron job, so it only has a default value in the top-most script -:: build_cron.sh to avoid the default values from diverging. -if "%NIGHTLIES_FOLDER%" == "" set "NIGHTLIES_FOLDER=%SRC_DIR%" - -:: NIGHTLIES_DATE -:: N.B. this is also defined in cron_start.sh -:: The date in YYYY_mm_dd format that we are building for. If this is not -:: already set, then this will first try to find the date of the nightlies -:: folder that this builder repo exists in; e.g. if this script exists in -:: some_dir/2019_09_04/builder/cron/ then this will be set to 2019_09_04 (must -:: match YYYY_mm_dd). This is for convenience when debugging/uploading past -:: dates, so that you don't have to set NIGHTLIES_DATE yourself. If a date -:: folder cannot be found in that exact location, then this will default to -:: the current date. 
- - -if NOT "%NIGHTLIES_DATE%" == "" goto date_end - -:date_start - -set "DATE_CMD=Get-Date ([System.TimeZoneInfo]::ConvertTimeFromUtc((Get-Date).ToUniversalTime(), [System.TimeZoneInfo]::FindSystemTimeZoneById('Pacific Standard Time'))) -f 'yyyy_MM_dd'" -set "DATE_COMPACT_CMD=Get-Date ([System.TimeZoneInfo]::ConvertTimeFromUtc((Get-Date).ToUniversalTime(), [System.TimeZoneInfo]::FindSystemTimeZoneById('Pacific Standard Time'))) -f 'yyyyMMdd'" - -FOR /F "delims=" %%i IN ('powershell -c "%DATE_CMD%"') DO set NIGHTLIES_DATE=%%i -FOR /F "delims=" %%i IN ('powershell -c "%DATE_COMPACT_CMD%"') DO set NIGHTLIES_DATE_COMPACT=%%i - -:date_end - -if "%NIGHTLIES_DATE_COMPACT%" == "" set NIGHTLIES_DATE_COMPACT=%NIGHTLIES_DATE:~0,4%%NIGHTLIES_DATE:~5,2%%NIGHTLIES_DATE:~8,2% - -:: Used in lots of places as the root dir to store all conda/wheel/manywheel -:: packages as well as logs for the day -set today=%NIGHTLIES_FOLDER%\%NIGHTLIES_DATE% -mkdir "%today%" || ver >nul - - -::############################################################################# -:: Add new configuration variables below this line. 'today' should always be -:: defined ASAP to avoid weird errors -::############################################################################# - - -:: List of people to email when things go wrong. This is passed directly to -:: `mail -t` -:: TODO: Not supported yet -if "%NIGHTLIES_EMAIL_LIST%" == "" set NIGHTLIES_EMAIL_LIST=peterghost86@gmail.com - -:: PYTORCH_CREDENTIALS_FILE -:: A bash file that exports credentials needed to upload to aws and anaconda. -:: Needed variables are PYTORCH_ANACONDA_USERNAME, PYTORCH_ANACONDA_PASSWORD, -:: AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY. Or it can just export the AWS -:: keys and then prepend a logged-in conda installation to the path. 
-:: TODO: Not supported yet -if "%PYTORCH_CREDENTIALS_FILE%" == "" set PYTORCH_CREDENTIALS_FILE=/c/Users/administrator/nightlies/credentials.sh - -:: Location of the temporary miniconda that is downloaded to install conda-build -:: and aws to upload finished packages TODO this is messy to install this in -:: upload.sh and later use it in upload_logs.sh -if "%CONDA_UPLOADER_INSTALLATION%" == "" set "CONDA_UPLOADER_INSTALLATION=%today%\miniconda" - -:: N.B. BUILDER_REPO and BUILDER_BRANCH are both set in cron_start.sh, as that -:: is the script that actually clones the builder repo that /this/ script is -:: running from. -pushd "%SRC_DIR%\.." -set NIGHTLIES_BUILDER_ROOT=%CD% -popd - -:: The shared pytorch repo to be used by all builds -if "%NIGHTLIES_PYTORCH_ROOT%" == "" set "NIGHTLIES_PYTORCH_ROOT=%today%\pytorch" - -:: PYTORCH_REPO -:: The Github org/user whose fork of Pytorch to check out (git clone -:: https://github.com//pytorch.git). This will always be cloned -:: fresh to build with. Default is 'pytorch' -if "%PYTORCH_REPO%" == "" set PYTORCH_REPO=pytorch - -:: PYTORCH_BRANCH -:: The branch of Pytorch to checkout for building (git checkout ). -:: This can either be the name of the branch (e.g. git checkout -:: my_branch_name) or can be a git commit (git checkout 4b2674n...). 
Default -:: is 'latest', which is a special term that signals to pull the last commit -:: before 0:00 midnight on the NIGHTLIES_DATE -if "%PYTORCH_BRANCH%" == "" set PYTORCH_BRANCH=nightly - -:: Clone the requested pytorch checkout -if exist "%NIGHTLIES_PYTORCH_ROOT%" goto clone_end - -:clone_start - -git clone --recursive "https://github.com/%PYTORCH_REPO%/pytorch.git" "%NIGHTLIES_PYTORCH_ROOT%" -pushd "%NIGHTLIES_PYTORCH_ROOT%" - -if NOT "%PYTORCH_BRANCH%" == "latest" goto latest_end - -:latest_start - -:: Switch to the latest commit by 11:59 yesterday -echo PYTORCH_BRANCH is set to latest so I will find the last commit -echo before 0:00 midnight on %NIGHTLIES_DATE% -set git_date=%NIGHTLIES_DATE:_=-% -FOR /F "delims=" %%i IN ('git log --before %git_date% -n 1 "--pretty=%%H"') DO set last_commit=%%i -echo Setting PYTORCH_BRANCH to %last_commit% since that was the last -echo commit before %NIGHTLIES_DATE% -set PYTORCH_BRANCH=%last_commit% - -:latest_end - -git checkout "%PYTORCH_BRANCH%" -git submodule update -popd - -:clone_end - -if "%CUDA_VERSION%" == "cpu" ( - set _DESIRED_CUDA=cpu -) else ( - set _DESIRED_CUDA=cu%CUDA_VERSION% -) - -:: PYTORCH_BUILD_VERSION -:: The actual version string. Used in conda like -:: pytorch-nightly==1.0.0.dev20180908 -:: or in manylinux like -:: torch_nightly-1.0.0.dev20180908-cp27-cp27m-linux_x86_64.whl -if "%PYTORCH_BUILD_VERSION%" == "" set PYTORCH_BUILD_VERSION=1.5.0.dev%NIGHTLIES_DATE_COMPACT% - -if "%~1" == "Wheels" ( - if "%BUILD_PYTHONLESS%" == "" ( - if not "%CUDA_VERSION%" == "102" ( - set PYTORCH_BUILD_VERSION=%PYTORCH_BUILD_VERSION%+%_DESIRED_CUDA% - ) - ) -) - -:: PYTORCH_BUILD_NUMBER -:: This is usually the number 1. If more than one build is uploaded for the -:: same version/date, then this can be incremented to 2,3 etc in which case -:: '.post2' will be appended to the version string of the package. 
This can -:: be set to '0' only if OVERRIDE_PACKAGE_VERSION is being used to bypass -:: all the version string logic in downstream scripts. Since we use the -:: override below, exporting this shouldn't actually matter. -if "%PYTORCH_BUILD_NUMBER%" == "" set /a PYTORCH_BUILD_NUMBER=1 -if %PYTORCH_BUILD_NUMBER% GTR 1 set PYTORCH_BUILD_VERSION=%PYTORCH_BUILD_VERSION%%PYTORCH_BUILD_NUMBER% - -:: The nightly builds use their own versioning logic, so we override whatever -:: logic is in setup.py or other scripts -:: TODO: Not supported yet -set OVERRIDE_PACKAGE_VERSION=%PYTORCH_BUILD_VERSION% - -:: Build folder for conda builds to use -if "%TORCH_CONDA_BUILD_FOLDER%" == "" set TORCH_CONDA_BUILD_FOLDER=pytorch-nightly - -:: TORCH_PACKAGE_NAME -:: The name of the package to upload. This should probably be pytorch or -:: pytorch-nightly. N.B. that pip will change all '-' to '_' but conda will -:: not. This is dealt with in downstream scripts. -:: TODO: Not supported yet -if "%TORCH_PACKAGE_NAME%" == "" set TORCH_PACKAGE_NAME=torch - -:: PIP_UPLOAD_FOLDER should end in a slash. This is to handle it being empty -:: (when uploading to e.g. whl/cpu/) and also to handle nightlies (when -:: uploading to e.g. /whl/nightly/cpu) -:: TODO: Not supported yet -if "%PIP_UPLOAD_FOLDER%" == "" set "PIP_UPLOAD_FOLDER=nightly\" - -:: The location of the binary_sizes dir in s3 is hardcoded into -:: upload_binary_sizes.sh - -:: DAYS_TO_KEEP -:: How many days to keep around for clean.sh. Build folders older than this -:: will be purged at the end of cron jobs. '1' means to keep only the current -:: day. Values less than 1 are not allowed. The default is 5. -:: TODO: Not supported yet -if "%DAYS_TO_KEEP%" == "" set /a DAYS_TO_KEEP=5 -if %DAYS_TO_KEEP% LSS 1 ( - echo DAYS_TO_KEEP cannot be less than 1. 
- echo A value of 1 means to only keep the build for today - exit /b 1 -) diff --git a/windows/internal/publish.bat b/windows/internal/publish.bat deleted file mode 100644 index 765fb39fed..0000000000 --- a/windows/internal/publish.bat +++ /dev/null @@ -1,97 +0,0 @@ -@echo off - -set SRC_DIR=%~dp0 -pushd %SRC_DIR% - -if not "%CUDA_VERSION%" == "cpu" ( - set PACKAGE_SUFFIX=_cuda%CUDA_VERSION% -) else ( - set PACKAGE_SUFFIX= -) - -if "%PACKAGEFULLNAME%" == "Conda" ( - set PACKAGE=conda -) else ( - set PACKAGE=wheels -) - -if "%DEBUG%" == "1" ( - if not defined PACKAGE_SUFFIX ( - set PACKAGE_SUFFIX=_debug - ) else ( - set PACKAGE_SUFFIX=%PACKAGE_SUFFIX%_debug - ) -) - -if not defined PACKAGE_SUFFIX ( - set PUBLISH_BRANCH=%PACKAGE%_%DESIRED_PYTHON% -) else ( - set PUBLISH_BRANCH=%PACKAGE%_%DESIRED_PYTHON%%PACKAGE_SUFFIX% -) - -git clone %ARTIFACT_REPO_URL% -b %PUBLISH_BRANCH% --single-branch >nul 2>&1 - -IF ERRORLEVEL 1 ( - echo Branch %PUBLISH_BRANCH% not exist, falling back to master - set NO_BRANCH=1 - git clone %ARTIFACT_REPO_URL% -b master --single-branch >nul 2>&1 -) - -IF ERRORLEVEL 1 ( - echo Clone failed - goto err -) - -cd pytorch_builder -attrib -s -h -r . /s /d - -:: Empty repo -rd /s /q . || ver >nul - -IF NOT EXIST %PACKAGE% mkdir %PACKAGE% - -xcopy /S /E /Y ..\..\output\*.* %PACKAGE%\ - -git config --global user.name "Azure DevOps" -git config --global user.email peterghost86@gmail.com -git init -git checkout --orphan %PUBLISH_BRANCH% -git remote add origin %ARTIFACT_REPO_URL% -git add . 
-git commit -m "Update artifacts" - -:push - -if "%RETRY_TIMES%" == "" ( - set /a RETRY_TIMES=10 - set /a SLEEP_TIME=2 -) else ( - set /a RETRY_TIMES=%RETRY_TIMES%-1 - set /a SLEEP_TIME=%SLEEP_TIME%*2 -) - -git push origin %PUBLISH_BRANCH% -f > nul 2>&1 - -IF ERRORLEVEL 1 ( - echo Git push retry times remaining: %RETRY_TIMES% - echo Sleep time: %SLEEP_TIME% seconds - IF %RETRY_TIMES% EQU 0 ( - echo Push failed - goto err - ) - waitfor SomethingThatIsNeverHappening /t %SLEEP_TIME% 2>nul || ver >nul - goto push -) ELSE ( - set RETRY_TIMES= - set SLEEP_TIME= -) - -popd - -exit /b 0 - -:err - -popd - -exit /b 1 diff --git a/windows/internal/upload.bat b/windows/internal/upload.bat deleted file mode 100644 index 8be04d841d..0000000000 --- a/windows/internal/upload.bat +++ /dev/null @@ -1,96 +0,0 @@ -@echo off - -IF "%CONDA_UPLOADER_INSTALLATION%" == "" goto precheck_fail -IF "%PYTORCH_FINAL_PACKAGE_DIR%" == "" goto precheck_fail -IF "%today%" == "" goto precheck_fail -IF "%PYTORCH_ANACONDA_USERNAME%" == "" goto precheck_fail -IF "%PYTORCH_ANACONDA_PASSWORD%" == "" goto precheck_fail - -goto precheck_pass - -:precheck_fail - -echo Please run nightly_defaults.bat first. 
-echo And remember to set `PYTORCH_FINAL_PACKAGE_DIR` -echo Finally, don't forget to set anaconda tokens -exit /b 1 - -:precheck_pass - -pushd %today% - -:: Install anaconda client -set "CONDA_HOME=%CONDA_UPLOADER_INSTALLATION%" -set "tmp_conda=%CONDA_HOME%" -set "miniconda_exe=%CD%\miniconda.exe" -rmdir /s /q "%CONDA_HOME%" -del miniconda.exe -curl -k https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe -o "%miniconda_exe%" -popd - -IF ERRORLEVEL 1 ( - echo Conda download failed - exit /b 1 -) - -call %~dp0\..\..\conda\install_conda.bat - -IF ERRORLEVEL 1 ( - echo Conda installation failed - exit /b 1 -) - -set "ORIG_PATH=%PATH%" -set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" - -REM conda install -y anaconda-client -pip install git+https://github.com/peterjc123/anaconda-client.git@log_more_meaningfull_errors -IF ERRORLEVEL 1 ( - echo Anaconda client installation failed - exit /b 1 -) - -REM bash -c "yes | anaconda login --username "%PYTORCH_ANACONDA_USERNAME%" --password "%PYTORCH_ANACONDA_PASSWORD%"" -anaconda login --username "%PYTORCH_ANACONDA_USERNAME%" --password "%PYTORCH_ANACONDA_PASSWORD%" -IF ERRORLEVEL 1 ( - echo Anaconda client login failed - exit /b 1 -) - -set PYTORCH_FINAL_PACKAGE= -:: Upload all the packages under `PYTORCH_FINAL_PACKAGE_DIR` -FOR /F "delims=" %%i IN ('where /R %PYTORCH_FINAL_PACKAGE_DIR% *pytorch*.tar.bz2') DO ( - set "PYTORCH_FINAL_PACKAGE=%%i" -) - -IF "%PYTORCH_FINAL_PACKAGE%" == "" ( - echo No package to upload - exit /b 0 -) - -:upload - -if "%RETRY_TIMES%" == "" ( - set /a RETRY_TIMES=10 - set /a SLEEP_TIME=2 -) else ( - set /a RETRY_TIMES=%RETRY_TIMES%-1 - set /a SLEEP_TIME=%SLEEP_TIME%*2 -) - -echo Uploading %PYTORCH_FINAL_PACKAGE% to Anaconda Cloud -anaconda upload "%PYTORCH_FINAL_PACKAGE%" -u pytorch-nightly --label main --force --no-progress - -IF ERRORLEVEL 1 ( - echo Anaconda upload retry times remaining: %RETRY_TIMES% - echo Sleep time: %SLEEP_TIME% seconds - IF 
%RETRY_TIMES% EQU 0 ( - echo Upload failed - exit /b 1 - ) - waitfor SomethingThatIsNeverHappening /t %SLEEP_TIME% 2>nul || ver >nul - goto upload -) ELSE ( - set RETRY_TIMES= - set SLEEP_TIME= -) From 8b67d32929b950c4851066800f5ef57c7646994c Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 18 Jan 2024 16:15:06 -0800 Subject: [PATCH 185/212] [Conda] Mark `blas * mkl` as x86 only dependency --- conda/pytorch-nightly/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index e56fe7f683..9e8f90a947 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -55,7 +55,7 @@ requirements: - jinja2 - pyyaml {% if cross_compile_arm64 == 0 %} - - blas * mkl + - blas * mkl # [x86_64] {% endif %} - pytorch-mutex 1.0 {{ build_variant }} # [not osx ] {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }} From eb78393f1e4bd68134d87e4059b9b25194af7dbb Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 18 Jan 2024 16:31:04 -0800 Subject: [PATCH 186/212] [Conda] Download arch appropriate Miniconda By using `$(uname -m)` as suffix, which is arm64 on Apple Silicon and x86 on Intel Macs --- conda/build_pytorch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 06e72da7cc..40b4a64a72 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -199,7 +199,7 @@ if [[ "$(uname)" == 'Darwin' ]]; then miniconda_sh="${MAC_PACKAGE_WORK_DIR}/miniconda.sh" rm -rf "$tmp_conda" rm -f "$miniconda_sh" - retry curl -sS https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-x86_64.sh -o "$miniconda_sh" + retry curl -sS https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh -o "$miniconda_sh" chmod +x "$miniconda_sh" && \ "$miniconda_sh" -b -p "$tmp_conda" && \ rm "$miniconda_sh" From 0d3aea4ee08e00b76fc263ce58e4c10df9f58e44 Mon Sep 17 00:00:00 2001 From: 
Nikita Shulga Date: Thu, 18 Jan 2024 16:40:39 -0800 Subject: [PATCH 187/212] [Conda] Do not depend on llvmdev-9 on ARM As earliest available for the platform is llvmdev-11 --- conda/build_pytorch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 40b4a64a72..09d4aca8ad 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -67,7 +67,7 @@ if [[ -n "$OVERRIDE_PACKAGE_VERSION" ]]; then fi # differentiate package name for cross compilation to avoid collision -if [[ -n "$CROSS_COMPILE_ARM64" ]]; then +if [[ -n "$CROSS_COMPILE_ARM64" || "$(uname -m)" == "arm64" ]]; then export PYTORCH_LLVM_PACKAGE="" fi From 6c6a33b2712bdb4be4406a10f75e3a404541ccd7 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 18 Jan 2024 17:03:46 -0800 Subject: [PATCH 188/212] [Conda] Set correct developer dir for MacOS runners --- conda/build_pytorch.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 09d4aca8ad..39aab7ee89 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -104,7 +104,11 @@ if [[ -z "$DESIRED_PYTHON" ]]; then fi if [[ "$OSTYPE" == "darwin"* ]]; then - DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer + if [[ "$(uname -m)" == "arm64" ]]; then + DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer + else + DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer + fi fi if [[ "$desired_cuda" == 'cpu' ]]; then cpu_only=1 From 74b04f302afede5c25275d8026f34a06330cc515 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 19 Jan 2024 06:18:52 -0800 Subject: [PATCH 189/212] [Conda] Add llvm-openmp dependency for ARM64 PyTorch for M1 is finally built with OpenMP, so it needs to depend on it --- conda/pytorch-nightly/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 
9e8f90a947..d29b870189 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -47,7 +47,7 @@ requirements: - intel-openmp # [win] # llvm-openmp 16 leads to wrong processor affinity for fork child, see #99625. # Before a decent fix, force llvm-openmp version <16. - - llvm-openmp <16 # [linux] + - llvm-openmp <16 # [linux or arm64] - typing_extensions - sympy - filelock From 896b6df5f0ce23431bf760ac3090f26bd6c44ee9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ionu=C8=9B=20Man=C8=9Ba?= Date: Mon, 22 Jan 2024 19:48:28 +0200 Subject: [PATCH 190/212] Use dynamic MKL on Windows (#1467) Use dynamic MKL on Windows and updated MKL to 2021.4.0 On conda python 3.12 use mkl 2023.1 --- conda/pytorch-nightly/bld.bat | 5 ----- conda/pytorch-nightly/meta.yaml | 7 +++++-- windows/build_pytorch.bat | 6 +----- windows/internal/copy.bat | 4 ++++ windows/internal/copy_cpu.bat | 5 +++++ windows/internal/smoke_test.bat | 19 +++++++++++-------- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/conda/pytorch-nightly/bld.bat b/conda/pytorch-nightly/bld.bat index 972df7e9cf..775256ea7f 100644 --- a/conda/pytorch-nightly/bld.bat +++ b/conda/pytorch-nightly/bld.bat @@ -34,11 +34,6 @@ if "%desired_cuda%" == "12.1" ( set DISTUTILS_USE_SDK=1 -curl https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z -k -O -7z x -aoa mkl_2020.2.254.7z -omkl -set CMAKE_INCLUDE_PATH=%SRC_DIR%\mkl\include -set LIB=%SRC_DIR%\mkl\lib;%LIB% - set libuv_ROOT=%PREFIX%\Library echo libuv_ROOT=%libuv_ROOT% diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index d29b870189..3da1625c40 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -23,7 +23,8 @@ requirements: - mkl-include # [x86_64] - mkl=2020.2 # [py <= 311 and x86_64 and not win] - mkl=2023.1 # [py >= 312 and x86_64] - - mkl=2021.4 # [x86_64 and win and py <= 311] + - mkl-devel=2021.4.0 # [x86_64 and win and py<=311] + - mkl-devel=2023.1 # [x86_64 and win and 
py>=312] {% endif %} - typing_extensions - ninja @@ -41,7 +42,9 @@ requirements: run: - python {% if cross_compile_arm64 == 0 %} - - mkl >=2018 # [x86_64] + - mkl >=2018 # [x86_64 and not win] + - mkl=2021.4 # [x86_64 and win and py <= 311] + - mkl=2023.1 # [x86_64 and win and py >= 312] {% endif %} - libuv # [win] - intel-openmp # [win] diff --git a/windows/build_pytorch.bat b/windows/build_pytorch.bat index 37e19f9339..750d3c5e35 100644 --- a/windows/build_pytorch.bat +++ b/windows/build_pytorch.bat @@ -67,10 +67,6 @@ exit /B 1 :: Install MKL rmdir /s /q mkl del mkl_2020.2.254.7z -curl https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z -k -O -7z x -aoa mkl_2020.2.254.7z -omkl -set CMAKE_INCLUDE_PATH=%cd%\mkl\include -set LIB=%cd%\mkl\lib;%LIB% :: Download MAGMA Files on CUDA builds set MAGMA_VERSION=2.5.4 @@ -126,7 +122,7 @@ for %%v in (%DESIRED_PYTHON_PREFIX%) do ( ) else ( set "PATH=%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%" ) - pip install ninja + pip install ninja mkl-include==2021.4.0 mkl-devel==2021.4.0 @setlocal :: Set Flags if not "%CUDA_VERSION%"=="cpu" ( diff --git a/windows/internal/copy.bat b/windows/internal/copy.bat index 490d9593a1..9893fc7c52 100755 --- a/windows/internal/copy.bat +++ b/windows/internal/copy.bat @@ -11,6 +11,10 @@ copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib +IF "%PACKAGE_TYPE%"=="libtorch" ( + copy "%CONDA_LIB_PATH%\mkl_intel_thread.1.dll" pytorch\torch\lib + copy "%CONDA_LIB_PATH%\mkl_core.1.dll" pytorch\torch\lib +) :: Should be set in build_pytorch.bat copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib diff --git a/windows/internal/copy_cpu.bat b/windows/internal/copy_cpu.bat index 2dae4613ee..0a4c0dabb2 100755 --- a/windows/internal/copy_cpu.bat +++ b/windows/internal/copy_cpu.bat @@ 
-1,3 +1,8 @@ copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib :: Should be set in build_pytorch.bat copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib + +IF "%PACKAGE_TYPE%"=="libtorch" ( + copy "%CONDA_LIB_PATH%\mkl_intel_thread.1.dll" pytorch\torch\lib + copy "%CONDA_LIB_PATH%\mkl_core.1.dll" pytorch\torch\lib +) diff --git a/windows/internal/smoke_test.bat b/windows/internal/smoke_test.bat index 1ade2cbda2..ce097f6a21 100644 --- a/windows/internal/smoke_test.bat +++ b/windows/internal/smoke_test.bat @@ -54,7 +54,7 @@ if errorlevel 1 exit /b 1 set "PATH=%CD%\Python%PYTHON_VERSION%\Scripts;%CD%\Python;%PATH%" -pip install -q numpy protobuf "mkl>=2019" +pip install -q numpy protobuf if errorlevel 1 exit /b 1 for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do pip install "%%i" @@ -87,14 +87,18 @@ set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" conda create -qyn testenv python=%DESIRED_PYTHON% if errorlevel 1 exit /b 1 - +call conda install -yq conda-build +if errorlevel 1 exit /b 1 call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 +set "NO_ARCH_PATH=%PYTORCH_FINAL_PACKAGE_DIR:/=\%\noarch" +mkdir %NO_ARCH_PATH% +for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *') do xcopy "%%i" %NO_ARCH_PATH% /Y +if ERRORLEVEL 1 exit /b 1 +call conda index %PYTORCH_FINAL_PACKAGE_DIR% +if errorlevel 1 exit /b 1 +call conda install -yq -c "file:///%PYTORCH_FINAL_PACKAGE_DIR%" pytorch==%PYTORCH_BUILD_VERSION% -c pytorch -c numba/label/dev -c nvidia -:: do conda install to make sure all the dependencies are installed -:: Install numpy see: https://github.com/pytorch/pytorch/issues/107228 -:: todo: Remove numpy install once the issue above is resolved -call conda install -yq numpy pytorch %CONDA_EXTRA_ARGS% if ERRORLEVEL 1 exit /b 1 set /a CUDA_VER=%CUDA_VERSION% @@ -103,8 +107,7 @@ set CUDA_VER_MINOR=%CUDA_VERSION:~-1,1% set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% :: 
Install package we just build -for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.tar.bz2') do call conda install -yq "%%i" --offline -if ERRORLEVEL 1 exit /b 1 + :smoke_test python -c "import torch" From 122ff0d0af5b283512e022ea92a94c272c8ce335 Mon Sep 17 00:00:00 2001 From: henrylhtsang <91030427+henrylhtsang@users.noreply.github.com> Date: Mon, 22 Jan 2024 17:55:03 -0800 Subject: [PATCH 191/212] Add torchrec to promote s3 script (#1680) * Add torchrec to promote s3 script * Add torchrec version to release_version.sh --- release/promote.sh | 2 ++ release/release_versions.sh | 1 + 2 files changed, 3 insertions(+) diff --git a/release/promote.sh b/release/promote.sh index a7f273bc10..54e21ce7e7 100644 --- a/release/promote.sh +++ b/release/promote.sh @@ -11,6 +11,7 @@ TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.1} TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.1} TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.1} TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} +TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} DRY_RUN=${DRY_RUN:-enabled} @@ -102,6 +103,7 @@ promote_pypi() { # promote_s3 torchaudio whl "${TORCHAUDIO_VERSION}" # promote_s3 torchtext whl "${TORCHTEXT_VERSION}" # promote_s3 torchdata whl "${TORCHDATA_VERSION}" +# promote_s3 torchrec whl "${TORCHREC_VERSION}" # promote_s3 "libtorch-*" libtorch "${PYTORCH_VERSION}" # promote_conda torchtriton conda "2.1.0" diff --git a/release/release_versions.sh b/release/release_versions.sh index d362cb1ca3..53dbe435b4 100644 --- a/release/release_versions.sh +++ b/release/release_versions.sh @@ -6,3 +6,4 @@ TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.1} TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.1} TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.1} TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} +TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} \ No newline at end of file From 7d704653442c5a84048bfe4b6cd7e619f157cada Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 23 Jan 2024 22:12:16 +0000 Subject: 
[PATCH 192/212] Revert "Dynamic MKL windows" (#1682) --- conda/pytorch-nightly/bld.bat | 5 +++++ conda/pytorch-nightly/meta.yaml | 7 ++----- windows/build_pytorch.bat | 6 +++++- windows/internal/copy.bat | 4 ---- windows/internal/copy_cpu.bat | 5 ----- windows/internal/smoke_test.bat | 19 ++++++++----------- 6 files changed, 20 insertions(+), 26 deletions(-) diff --git a/conda/pytorch-nightly/bld.bat b/conda/pytorch-nightly/bld.bat index 775256ea7f..972df7e9cf 100644 --- a/conda/pytorch-nightly/bld.bat +++ b/conda/pytorch-nightly/bld.bat @@ -34,6 +34,11 @@ if "%desired_cuda%" == "12.1" ( set DISTUTILS_USE_SDK=1 +curl https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z -k -O +7z x -aoa mkl_2020.2.254.7z -omkl +set CMAKE_INCLUDE_PATH=%SRC_DIR%\mkl\include +set LIB=%SRC_DIR%\mkl\lib;%LIB% + set libuv_ROOT=%PREFIX%\Library echo libuv_ROOT=%libuv_ROOT% diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index 3da1625c40..d29b870189 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -23,8 +23,7 @@ requirements: - mkl-include # [x86_64] - mkl=2020.2 # [py <= 311 and x86_64 and not win] - mkl=2023.1 # [py >= 312 and x86_64] - - mkl-devel=2021.4.0 # [x86_64 and win and py<=311] - - mkl-devel=2023.1 # [x86_64 and win and py>=312] + - mkl=2021.4 # [x86_64 and win and py <= 311] {% endif %} - typing_extensions - ninja @@ -42,9 +41,7 @@ requirements: run: - python {% if cross_compile_arm64 == 0 %} - - mkl >=2018 # [x86_64 and not win] - - mkl=2021.4 # [x86_64 and win and py <= 311] - - mkl=2023.1 # [x86_64 and win and py >= 312] + - mkl >=2018 # [x86_64] {% endif %} - libuv # [win] - intel-openmp # [win] diff --git a/windows/build_pytorch.bat b/windows/build_pytorch.bat index 750d3c5e35..37e19f9339 100644 --- a/windows/build_pytorch.bat +++ b/windows/build_pytorch.bat @@ -67,6 +67,10 @@ exit /B 1 :: Install MKL rmdir /s /q mkl del mkl_2020.2.254.7z +curl https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z 
-k -O +7z x -aoa mkl_2020.2.254.7z -omkl +set CMAKE_INCLUDE_PATH=%cd%\mkl\include +set LIB=%cd%\mkl\lib;%LIB% :: Download MAGMA Files on CUDA builds set MAGMA_VERSION=2.5.4 @@ -122,7 +126,7 @@ for %%v in (%DESIRED_PYTHON_PREFIX%) do ( ) else ( set "PATH=%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%" ) - pip install ninja mkl-include==2021.4.0 mkl-devel==2021.4.0 + pip install ninja @setlocal :: Set Flags if not "%CUDA_VERSION%"=="cpu" ( diff --git a/windows/internal/copy.bat b/windows/internal/copy.bat index 9893fc7c52..490d9593a1 100755 --- a/windows/internal/copy.bat +++ b/windows/internal/copy.bat @@ -11,10 +11,6 @@ copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib -IF "%PACKAGE_TYPE%"=="libtorch" ( - copy "%CONDA_LIB_PATH%\mkl_intel_thread.1.dll" pytorch\torch\lib - copy "%CONDA_LIB_PATH%\mkl_core.1.dll" pytorch\torch\lib -) :: Should be set in build_pytorch.bat copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib diff --git a/windows/internal/copy_cpu.bat b/windows/internal/copy_cpu.bat index 0a4c0dabb2..2dae4613ee 100755 --- a/windows/internal/copy_cpu.bat +++ b/windows/internal/copy_cpu.bat @@ -1,8 +1,3 @@ copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib :: Should be set in build_pytorch.bat copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib - -IF "%PACKAGE_TYPE%"=="libtorch" ( - copy "%CONDA_LIB_PATH%\mkl_intel_thread.1.dll" pytorch\torch\lib - copy "%CONDA_LIB_PATH%\mkl_core.1.dll" pytorch\torch\lib -) diff --git a/windows/internal/smoke_test.bat b/windows/internal/smoke_test.bat index ce097f6a21..1ade2cbda2 100644 --- a/windows/internal/smoke_test.bat +++ b/windows/internal/smoke_test.bat @@ -54,7 +54,7 @@ if errorlevel 1 exit /b 1 set "PATH=%CD%\Python%PYTHON_VERSION%\Scripts;%CD%\Python;%PATH%" -pip install -q numpy 
protobuf +pip install -q numpy protobuf "mkl>=2019" if errorlevel 1 exit /b 1 for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do pip install "%%i" @@ -87,18 +87,14 @@ set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" conda create -qyn testenv python=%DESIRED_PYTHON% if errorlevel 1 exit /b 1 -call conda install -yq conda-build -if errorlevel 1 exit /b 1 + call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 -set "NO_ARCH_PATH=%PYTORCH_FINAL_PACKAGE_DIR:/=\%\noarch" -mkdir %NO_ARCH_PATH% -for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *') do xcopy "%%i" %NO_ARCH_PATH% /Y -if ERRORLEVEL 1 exit /b 1 -call conda index %PYTORCH_FINAL_PACKAGE_DIR% -if errorlevel 1 exit /b 1 -call conda install -yq -c "file:///%PYTORCH_FINAL_PACKAGE_DIR%" pytorch==%PYTORCH_BUILD_VERSION% -c pytorch -c numba/label/dev -c nvidia +:: do conda install to make sure all the dependencies are installed +:: Install numpy see: https://github.com/pytorch/pytorch/issues/107228 +:: todo: Remove numpy install once the issue above is resolved +call conda install -yq numpy pytorch %CONDA_EXTRA_ARGS% if ERRORLEVEL 1 exit /b 1 set /a CUDA_VER=%CUDA_VERSION% @@ -107,7 +103,8 @@ set CUDA_VER_MINOR=%CUDA_VERSION:~-1,1% set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% :: Install package we just build - +for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.tar.bz2') do call conda install -yq "%%i" --offline +if ERRORLEVEL 1 exit /b 1 :smoke_test python -c "import torch" From 0816ae7d1acdbcdfd7d51418db22f72fdec8030d Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 23 Jan 2024 22:19:40 +0000 Subject: [PATCH 193/212] Revert "Revert "Dynamic MKL windows"" (#1683) --- conda/pytorch-nightly/bld.bat | 5 ----- conda/pytorch-nightly/meta.yaml | 7 +++++-- windows/build_pytorch.bat | 6 +----- windows/internal/copy.bat | 4 ++++ windows/internal/copy_cpu.bat | 5 +++++ 
windows/internal/smoke_test.bat | 19 +++++++++++-------- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/conda/pytorch-nightly/bld.bat b/conda/pytorch-nightly/bld.bat index 972df7e9cf..775256ea7f 100644 --- a/conda/pytorch-nightly/bld.bat +++ b/conda/pytorch-nightly/bld.bat @@ -34,11 +34,6 @@ if "%desired_cuda%" == "12.1" ( set DISTUTILS_USE_SDK=1 -curl https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z -k -O -7z x -aoa mkl_2020.2.254.7z -omkl -set CMAKE_INCLUDE_PATH=%SRC_DIR%\mkl\include -set LIB=%SRC_DIR%\mkl\lib;%LIB% - set libuv_ROOT=%PREFIX%\Library echo libuv_ROOT=%libuv_ROOT% diff --git a/conda/pytorch-nightly/meta.yaml b/conda/pytorch-nightly/meta.yaml index d29b870189..3da1625c40 100644 --- a/conda/pytorch-nightly/meta.yaml +++ b/conda/pytorch-nightly/meta.yaml @@ -23,7 +23,8 @@ requirements: - mkl-include # [x86_64] - mkl=2020.2 # [py <= 311 and x86_64 and not win] - mkl=2023.1 # [py >= 312 and x86_64] - - mkl=2021.4 # [x86_64 and win and py <= 311] + - mkl-devel=2021.4.0 # [x86_64 and win and py<=311] + - mkl-devel=2023.1 # [x86_64 and win and py>=312] {% endif %} - typing_extensions - ninja @@ -41,7 +42,9 @@ requirements: run: - python {% if cross_compile_arm64 == 0 %} - - mkl >=2018 # [x86_64] + - mkl >=2018 # [x86_64 and not win] + - mkl=2021.4 # [x86_64 and win and py <= 311] + - mkl=2023.1 # [x86_64 and win and py >= 312] {% endif %} - libuv # [win] - intel-openmp # [win] diff --git a/windows/build_pytorch.bat b/windows/build_pytorch.bat index 37e19f9339..750d3c5e35 100644 --- a/windows/build_pytorch.bat +++ b/windows/build_pytorch.bat @@ -67,10 +67,6 @@ exit /B 1 :: Install MKL rmdir /s /q mkl del mkl_2020.2.254.7z -curl https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z -k -O -7z x -aoa mkl_2020.2.254.7z -omkl -set CMAKE_INCLUDE_PATH=%cd%\mkl\include -set LIB=%cd%\mkl\lib;%LIB% :: Download MAGMA Files on CUDA builds set MAGMA_VERSION=2.5.4 @@ -126,7 +122,7 @@ for %%v in (%DESIRED_PYTHON_PREFIX%) do ( ) else ( set 
"PATH=%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%" ) - pip install ninja + pip install ninja mkl-include==2021.4.0 mkl-devel==2021.4.0 @setlocal :: Set Flags if not "%CUDA_VERSION%"=="cpu" ( diff --git a/windows/internal/copy.bat b/windows/internal/copy.bat index 490d9593a1..9893fc7c52 100755 --- a/windows/internal/copy.bat +++ b/windows/internal/copy.bat @@ -11,6 +11,10 @@ copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib +IF "%PACKAGE_TYPE%"=="libtorch" ( + copy "%CONDA_LIB_PATH%\mkl_intel_thread.1.dll" pytorch\torch\lib + copy "%CONDA_LIB_PATH%\mkl_core.1.dll" pytorch\torch\lib +) :: Should be set in build_pytorch.bat copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib diff --git a/windows/internal/copy_cpu.bat b/windows/internal/copy_cpu.bat index 2dae4613ee..0a4c0dabb2 100755 --- a/windows/internal/copy_cpu.bat +++ b/windows/internal/copy_cpu.bat @@ -1,3 +1,8 @@ copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib :: Should be set in build_pytorch.bat copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib + +IF "%PACKAGE_TYPE%"=="libtorch" ( + copy "%CONDA_LIB_PATH%\mkl_intel_thread.1.dll" pytorch\torch\lib + copy "%CONDA_LIB_PATH%\mkl_core.1.dll" pytorch\torch\lib +) diff --git a/windows/internal/smoke_test.bat b/windows/internal/smoke_test.bat index 1ade2cbda2..ce097f6a21 100644 --- a/windows/internal/smoke_test.bat +++ b/windows/internal/smoke_test.bat @@ -54,7 +54,7 @@ if errorlevel 1 exit /b 1 set "PATH=%CD%\Python%PYTHON_VERSION%\Scripts;%CD%\Python;%PATH%" -pip install -q numpy protobuf "mkl>=2019" +pip install -q numpy protobuf if errorlevel 1 exit /b 1 for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do pip install "%%i" @@ -87,14 +87,18 @@ set 
"PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" conda create -qyn testenv python=%DESIRED_PYTHON% if errorlevel 1 exit /b 1 - +call conda install -yq conda-build +if errorlevel 1 exit /b 1 call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 +set "NO_ARCH_PATH=%PYTORCH_FINAL_PACKAGE_DIR:/=\%\noarch" +mkdir %NO_ARCH_PATH% +for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *') do xcopy "%%i" %NO_ARCH_PATH% /Y +if ERRORLEVEL 1 exit /b 1 +call conda index %PYTORCH_FINAL_PACKAGE_DIR% +if errorlevel 1 exit /b 1 +call conda install -yq -c "file:///%PYTORCH_FINAL_PACKAGE_DIR%" pytorch==%PYTORCH_BUILD_VERSION% -c pytorch -c numba/label/dev -c nvidia -:: do conda install to make sure all the dependencies are installed -:: Install numpy see: https://github.com/pytorch/pytorch/issues/107228 -:: todo: Remove numpy install once the issue above is resolved -call conda install -yq numpy pytorch %CONDA_EXTRA_ARGS% if ERRORLEVEL 1 exit /b 1 set /a CUDA_VER=%CUDA_VERSION% @@ -103,8 +107,7 @@ set CUDA_VER_MINOR=%CUDA_VERSION:~-1,1% set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% :: Install package we just build -for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.tar.bz2') do call conda install -yq "%%i" --offline -if ERRORLEVEL 1 exit /b 1 + :smoke_test python -c "import torch" From e6c514248ad026a2c6494fcd402bb5976f461ce1 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 24 Jan 2024 12:46:10 -0500 Subject: [PATCH 194/212] Add numpy install to windows conda tests (#1684) --- windows/internal/smoke_test.bat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/windows/internal/smoke_test.bat b/windows/internal/smoke_test.bat index ce097f6a21..e1980fed31 100644 --- a/windows/internal/smoke_test.bat +++ b/windows/internal/smoke_test.bat @@ -54,7 +54,7 @@ if errorlevel 1 exit /b 1 set "PATH=%CD%\Python%PYTHON_VERSION%\Scripts;%CD%\Python;%PATH%" -pip install -q numpy protobuf 
+pip install -q numpy protobuf if errorlevel 1 exit /b 1 for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do pip install "%%i" @@ -87,7 +87,7 @@ set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" conda create -qyn testenv python=%DESIRED_PYTHON% if errorlevel 1 exit /b 1 -call conda install -yq conda-build +call conda install -yq conda-build numpy if errorlevel 1 exit /b 1 call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 From c162c7579a3b49c4b701f9d6d38f8af8cb2bdd2e Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 24 Jan 2024 14:05:55 -0500 Subject: [PATCH 195/212] Windows conda test. Install numpy in conda testenv (#1685) --- windows/internal/smoke_test.bat | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/windows/internal/smoke_test.bat b/windows/internal/smoke_test.bat index e1980fed31..8c5aed2ef6 100644 --- a/windows/internal/smoke_test.bat +++ b/windows/internal/smoke_test.bat @@ -87,7 +87,7 @@ set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" conda create -qyn testenv python=%DESIRED_PYTHON% if errorlevel 1 exit /b 1 -call conda install -yq conda-build numpy +call conda install -yq conda-build if errorlevel 1 exit /b 1 call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 @@ -98,7 +98,8 @@ if ERRORLEVEL 1 exit /b 1 call conda index %PYTORCH_FINAL_PACKAGE_DIR% if errorlevel 1 exit /b 1 call conda install -yq -c "file:///%PYTORCH_FINAL_PACKAGE_DIR%" pytorch==%PYTORCH_BUILD_VERSION% -c pytorch -c numba/label/dev -c nvidia - +if ERRORLEVEL 1 exit /b 1 +call conda install -yq numpy if ERRORLEVEL 1 exit /b 1 set /a CUDA_VER=%CUDA_VERSION% From 55b339d2a9f21ec1c5c5ead7b4762e675929bd07 Mon Sep 17 00:00:00 2001 From: Supadchaya <138070207+spcyppt@users.noreply.github.com> Date: Wed, 24 Jan 2024 14:38:06 -0800 Subject: [PATCH 196/212] Add fbgemm to promote s3 script (#1681) --- release/promote.sh | 2 ++ 
release/release_versions.sh | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/release/promote.sh b/release/promote.sh index 54e21ce7e7..5bf7fe0b5e 100644 --- a/release/promote.sh +++ b/release/promote.sh @@ -12,6 +12,7 @@ TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.1} TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.1} TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} +FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-0.6.0} DRY_RUN=${DRY_RUN:-enabled} @@ -104,6 +105,7 @@ promote_pypi() { # promote_s3 torchtext whl "${TORCHTEXT_VERSION}" # promote_s3 torchdata whl "${TORCHDATA_VERSION}" # promote_s3 torchrec whl "${TORCHREC_VERSION}" +# promote_s3 fbgemm-gpu whl "${FBGEMMGPU_VERSION}" # promote_s3 "libtorch-*" libtorch "${PYTORCH_VERSION}" # promote_conda torchtriton conda "2.1.0" diff --git a/release/release_versions.sh b/release/release_versions.sh index 53dbe435b4..981a18ea07 100644 --- a/release/release_versions.sh +++ b/release/release_versions.sh @@ -6,4 +6,5 @@ TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.1} TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.1} TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.1} TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} -TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} \ No newline at end of file +TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} +FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-0.6.0} From b8539eb5202b27eabc6ca3dc632663bf9f13f71f Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 24 Jan 2024 20:23:44 -0500 Subject: [PATCH 197/212] Release 2.2.0 pypi prep script modifications (#1686) --- .../validate-repackaged-binary-sizes.yml | 54 +++++++++---------- release/pypi/prep_binary_for_pypi.sh | 8 --- release/pypi/promote_pypi_to_staging.sh | 3 +- release/pypi/upload_pypi_to_staging.sh | 2 - release/release_versions.sh | 8 +-- 5 files changed, 31 insertions(+), 44 deletions(-) diff --git a/.github/workflows/validate-repackaged-binary-sizes.yml 
b/.github/workflows/validate-repackaged-binary-sizes.yml index 695c68d3aa..cb1a6a73e7 100644 --- a/.github/workflows/validate-repackaged-binary-sizes.yml +++ b/.github/workflows/validate-repackaged-binary-sizes.yml @@ -23,66 +23,64 @@ jobs: fail-fast: false matrix: whl: - - url: https://download.pytorch.org/whl/test/cu117_pypi_cudnn/torch-1.13.1%2Bcu117.with.pypi.cudnn-cp310-cp310-linux_x86_64.whl - python: "3.10" # python version to use for smoke tests - upload_artifact: false # upload the repackaged binary as an artifact - - url: https://download.pytorch.org/whl/test/cu117_pypi_cudnn/torch-1.13.1%2Bcu117.with.pypi.cudnn-cp37-cp37m-linux_x86_64.whl - python: "3.7" + - url: https://download.pytorch.org/whl/test/cu121/torch-2.2.0%2Bcu121-cp312-cp312-linux_x86_64.whl + python: "3.12" artifact: false - - url: https://download.pytorch.org/whl/test/cu117_pypi_cudnn/torch-1.13.1%2Bcu117.with.pypi.cudnn-cp38-cp38-linux_x86_64.whl - python: "3.8" + - url: https://download.pytorch.org/whl/test/cu121/torch-2.2.0%2Bcu121-cp311-cp311-linux_x86_64.whl + python: "3.11" # python version to use for smoke tests + upload_artifact: false # upload the repackaged binary as an artifact + - url: https://download.pytorch.org/whl/test/cu121/torch-2.2.0%2Bcu121-cp310-cp310-linux_x86_64.whl + python: "3.10" artifact: false - - url: https://download.pytorch.org/whl/test/cu117_pypi_cudnn/torch-1.13.1%2Bcu117.with.pypi.cudnn-cp39-cp39-linux_x86_64.whl + - url: https://download.pytorch.org/whl/test/cu121/torch-2.2.0%2Bcu121-cp39-cp39-linux_x86_64.whl python: "3.9" artifact: false - # - url: https://download.pytorch.org/whl/test/cu117_pypi_cudnn/torch-1.13.1%2Bcu117.with.pypi.cudnn-cp311-cp311-linux_x86_64.whl - # python: "3.11" - # artifact: false + - url: https://download.pytorch.org/whl/test/cu121/torch-2.2.0%2Bcu121-cp38-cp38-linux_x86_64.whl + python: "3.8" + artifact: false uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: - runner: linux.4xlarge.nvidia.gpu + runner: 
linux.g5.4xlarge.nvidia.gpu job-name: "Validate binary size" upload-artifact: ${{ matrix.whl.upload_artifact == 'true' && 'repackaged-binary' || '' }} script: | set -ex export ENV_NAME="conda-env-${{ github.run_id }}" - export GPU_ARCH_VER="11.7" - export GPU_ARCH_TYPE="cuda" - export CUDA_VER="11.7" + export MATRIX_GPU_ARCH_VERSION="12.1" + export MATRIX_GPU_ARCH_TYPE="cuda" + export MATRIX_CUDA_VER="12.1" export DESIRED_PYTHON="${{ matrix.whl.python }}" - export DESIRED_CUDA="cu117" - export PACKAGE_TYPE="wheel" + export MATRIX_PACKAGE_TYPE="wheel" export TARGET_OS="linux" - export INSTALLATION="" - + # install zip sudo yum install zip -y - + # install patchelf chmod a+x common/install_patchelf.sh sudo common/install_patchelf.sh - + # download torch whl wget ${{ matrix.whl.url }} FILENAME=$(ls -1 *.whl | head -n 1) SIZE_BEFORE=$(du -h $FILENAME | cut -f1) - + # repackage into manywheel release/pypi/prep_binary_for_pypi.sh $FILENAME - + NEW_FILENAME=$(ls -1 *.whl | head -n 1) echo "::notice:: $FILENAME before: $SIZE_BEFORE after: $(du -h $NEW_FILENAME | cut -f1)" - + # cp to ${RUNNER_ARTIFACT_DIR} cp $NEW_FILENAME ${RUNNER_ARTIFACT_DIR}/ - + # create conda env conda create -y -n $ENV_NAME python=$DESIRED_PYTHON conda activate $ENV_NAME - + # install torch pip install numpy pillow $NEW_FILENAME - + # run smoke test - python ./test/smoke_test/smoke_test.py --package=torchonly \ No newline at end of file + python ./test/smoke_test/smoke_test.py --package=torchonly diff --git a/release/pypi/prep_binary_for_pypi.sh b/release/pypi/prep_binary_for_pypi.sh index fdd9bf4a0e..154b228527 100755 --- a/release/pypi/prep_binary_for_pypi.sh +++ b/release/pypi/prep_binary_for_pypi.sh @@ -52,14 +52,6 @@ for whl_file in "$@"; do ( set -x - # Special build with pypi cudnn remove it from version - if [[ $whl_file == *"with.pypi.cudnn"* ]]; then - rm -rf "${whl_dir}/caffe2" - rm -rf "${whl_dir}"/torch/lib/libnvrtc* - - sed -i -e "s/-with-pypi-cudnn//g" "${whl_dir}/torch/version.py" - 
fi - find "${dist_info_folder}" -type f -exec sed -i "s!${version_with_suffix}!${version_no_suffix}!" {} \; # Moves distinfo from one with a version suffix to one without # Example: torch-1.8.0+cpu.dist-info => torch-1.8.0.dist-info diff --git a/release/pypi/promote_pypi_to_staging.sh b/release/pypi/promote_pypi_to_staging.sh index 46cd958cd4..dbc00e24ad 100644 --- a/release/pypi/promote_pypi_to_staging.sh +++ b/release/pypi/promote_pypi_to_staging.sh @@ -21,13 +21,12 @@ upload_pypi_to_staging() { } # Uncomment these to promote to pypi -PYTORCH_LINUX_VERSION_SUFFIX="%2Bcu121.with.pypi.cudnn" LINUX_VERSION_SUFFIX="%2Bcu121" CPU_VERSION_SUFFIX="%2Bcpu" MACOS_X86_64="macosx_.*_x86_64" MACOS_ARM64="macosx_.*_arm64" -PLATFORM="linux_x86_64" VERSION_SUFFIX="${PYTORCH_LINUX_VERSION_SUFFIX}" upload_pypi_to_staging torch "${PYTORCH_VERSION}" +PLATFORM="linux_x86_64" VERSION_SUFFIX="${LINUX_VERSION_SUFFIX}" upload_pypi_to_staging torch "${PYTORCH_VERSION}" PLATFORM="manylinux2014_aarch64" VERSION_SUFFIX="" upload_pypi_to_staging torch "${PYTORCH_VERSION}" PLATFORM="win_amd64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torch "${PYTORCH_VERSION}" PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torch "${PYTORCH_VERSION}" # intel mac diff --git a/release/pypi/upload_pypi_to_staging.sh b/release/pypi/upload_pypi_to_staging.sh index b1a7ddf6d7..f002717159 100644 --- a/release/pypi/upload_pypi_to_staging.sh +++ b/release/pypi/upload_pypi_to_staging.sh @@ -33,10 +33,8 @@ pushd "${output_tmp_dir}" # Dry run by default DRY_RUN=${DRY_RUN:-enabled} # On dry run just echo the commands that are meant to be run -TWINE_UPLOAD="echo twine upload" DRY_RUN_FLAG="--dryrun" if [[ $DRY_RUN = "disabled" ]]; then - TWINE_UPLOAD="twine upload" DRY_RUN_FLAG="" fi diff --git a/release/release_versions.sh b/release/release_versions.sh index 981a18ea07..88b6cbcd3a 100644 --- a/release/release_versions.sh +++ b/release/release_versions.sh @@ -1,10 +1,10 @@ 
#!/usr/bin/env bash # Make sure to update these versions when doing a release first -PYTORCH_VERSION=${PYTORCH_VERSION:-2.1.1} -TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.1} -TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.1} -TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.1} +PYTORCH_VERSION=${PYTORCH_VERSION:-2.2.0} +TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.17.0} +TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.2.0} +TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.17.0} TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-0.6.0} From 42852bb99ef7b61068c541c146ce202f22692019 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 26 Jan 2024 12:10:09 -0500 Subject: [PATCH 198/212] [Analytics] add pypi staging validations, remove circleci script (#1688) --- analytics/circleci_analyze.py | 596 ------------------------ analytics/validate_pypi_staging.py | 124 +++++ release/README.md | 7 + release/pypi/promote_pypi_to_staging.sh | 9 +- 4 files changed, 136 insertions(+), 600 deletions(-) delete mode 100755 analytics/circleci_analyze.py create mode 100644 analytics/validate_pypi_staging.py diff --git a/analytics/circleci_analyze.py b/analytics/circleci_analyze.py deleted file mode 100755 index 03e8c3e9af..0000000000 --- a/analytics/circleci_analyze.py +++ /dev/null @@ -1,596 +0,0 @@ -#!/usr/bin/env python3.7 -from datetime import datetime, time -import json -import requests -import itertools -import sqlite3 -import os -import sys -from typing import Callable, Dict, Generator, List, MutableSet, Optional - - -def get_executor_price_rate(executor): - (etype, eclass) = executor['type'], executor['resource_class'] - assert etype in ['machine', 'external', 'docker', 'macos', 'runner'], f'Unexpected type {etype}:{eclass}' - if etype == 'machine': - return { - 'medium': 10, - 'large': 20, - 'xlarge': 100, - '2xlarge': 200, - 'gpu.medium': 160, - 'gpu.large': 320, - 'gpu.small': 80, - 'windows.medium': 40, - 
'windows.large': 120, - 'windows.xlarge': 210, - 'windows.2xlarge': 500, - 'windows.gpu.nvidia.medium': 500, - 'gpu.nvidia.small': 160, - 'gpu.nvidia.medium': 240, - 'gpu.nvidia.large': 1000, - }[eclass] - if etype == 'macos': - return { - 'medium': 50, - 'large': 100, - }[eclass] - if etype == 'docker': - return { - 'small': 5, - 'medium': 10, - 'medium+': 15, - 'large': 20, - 'xlarge': 40, - '2xlarge': 80, - '2xlarge+': 100, - }[eclass] - if etype == 'runner' or etype == 'external': - return { - 'pytorch/amd-gpu': 0, - }[eclass] - raise RuntimeError(f'Undefined executor {etype}:{eclass}') - - -price_per_credit = 6e-4 - - -def get_circleci_token() -> str: - token_file_path = os.path.join(os.getenv('HOME'), '.circleci_token') - token = os.getenv('CIRCLECI_TOKEN') - if token is not None: - return token - if not os.path.exists(token_file_path): - raise RuntimeError('Can not get CirclCI token' - ' neither from CIRCLECI_TOKEN environment variable,' - ' nor via ~/.circleci_token file') - with open(token_file_path) as f: - return f.read().strip() - - -def is_workflow_in_progress(workflow: Dict) -> bool: - return workflow['status'] in ['running', 'not_run', 'failing', 'on_hold'] - - -def str2date(val: str) -> datetime: - assert val is not None - return datetime.fromisoformat(val[:-1] if val.endswith('Z') else val) - - -class CircleCICache: - def __init__(self, token: Optional[str], db_name: str = 'circleci-cache.db') -> None: - file_folder = os.path.dirname(__file__) - self.url_prefix = 'https://circleci.com/api/v2' - self.session = requests.session() - self.headers = { - 'Accept': 'application/json', - 'Circle-Token': token, - } if token is not None else None - self.db = sqlite3.connect(os.path.join(file_folder, db_name)) - self.db.execute('CREATE TABLE IF NOT EXISTS jobs(slug TEXT NOT NULL, job_id INTEGER NOT NULL, json TEXT NOT NULL);') - self.db.execute('CREATE TABLE IF NOT EXISTS artifacts(slug TEXT NOT NULL, job_id INTEGER NOT NULL, json TEXT NOT NULL);') - 
self.db.execute('CREATE UNIQUE INDEX IF NOT EXISTS jobs_key on jobs(slug, job_id);') - self.db.execute('CREATE TABLE IF NOT EXISTS workflows(id TEXT NOT NULL PRIMARY KEY, json TEXT NOT NULL);') - self.db.execute('CREATE TABLE IF NOT EXISTS pipeline_workflows(id TEXT NOT NULL PRIMARY KEY, json TEXT NOT NULL);') - self.db.execute('CREATE TABLE IF NOT EXISTS pipelines(id TEXT NOT NULL PRIMARY KEY, json TEXT NOT NULL, branch TEXT, revision TEXT);') - self.db.commit() - - def is_offline(self) -> bool: - return self.headers is None - - def _get_paged_items_list(self, url: str, params: Optional[Dict] = None, item_count: Optional[int] = -1) -> List: - rc, token, run_once = [], None, False - - def _should_quit(): - nonlocal run_once, rc, token - if not run_once: - run_once = True - return False - if token is None: - return True - if item_count is None: - return True - return item_count >= 0 and len(rc) >= item_count - - if params is None: - params = {} - while not _should_quit(): - if token is not None: - params['page-token'] = token - r = self.session.get(url, params=params, headers=self.headers) - try: - j = r.json() - except json.JSONDecodeError: - print(f"Failed to decode {rc}", file=sys.stderr) - raise - if 'message' in j: - raise RuntimeError(f'Failed to get list from {url}: {j["message"]}') - token = j['next_page_token'] - rc.extend(j['items']) - return rc - - def get_pipelines(self, project: str = 'github/pytorch/pytorch', branch: Optional[str] = None, item_count: Optional[int] = None) -> List: - if self.is_offline(): - c = self.db.cursor() - cmd = "SELECT json from pipelines" - if branch is not None: - cmd += f" WHERE branch='{branch}'" - if item_count is not None and item_count > 0: - cmd += f" LIMIT {item_count}" - c.execute(cmd) - return [json.loads(val[0]) for val in c.fetchall()] - rc = self._get_paged_items_list(f'{self.url_prefix}/project/{project}/pipeline', {'branch': branch} if branch is not None else {}, item_count) - for pipeline in rc: - vcs = 
pipeline['vcs'] - pid, branch, revision, pser = pipeline['id'], vcs['branch'], vcs['revision'], json.dumps(pipeline) - self.db.execute("INSERT OR REPLACE INTO pipelines(id, branch, revision, json) VALUES (?, ?, ?, ?)", (pid, branch, revision, pser)) - self.db.commit() - return rc - - def get_pipeline_workflows(self, pipeline) -> List: - c = self.db.cursor() - c.execute("SELECT json FROM pipeline_workflows WHERE id=?", (pipeline,)) - rc = c.fetchone() - if rc is not None: - rc = json.loads(rc[0]) - if not any(is_workflow_in_progress(w) for w in rc) or self.is_offline(): - return rc - if self.is_offline(): - return [] - rc = self._get_paged_items_list(f'{self.url_prefix}/pipeline/{pipeline}/workflow') - self.db.execute("INSERT OR REPLACE INTO pipeline_workflows(id, json) VALUES (?, ?)", (pipeline, json.dumps(rc))) - self.db.commit() - return rc - - def get_workflow_jobs(self, workflow, should_cache=True) -> List: - c = self.db.cursor() - c.execute("select json from workflows where id=?", (workflow,)) - rc = c.fetchone() - if rc is not None: - return json.loads(rc[0]) - if self.is_offline(): - return [] - rc = self._get_paged_items_list(f'{self.url_prefix}/workflow/{workflow}/job') - if should_cache: - self.db.execute("INSERT INTO workflows(id, json) VALUES (?, ?)", (workflow, json.dumps(rc))) - self.db.commit() - return rc - - def get_job(self, project_slug, job_number) -> Dict: - c = self.db.cursor() - c.execute("select json from jobs where slug=? 
and job_id = ?", (project_slug, job_number)) - rc = c.fetchone() - if rc is not None: - return json.loads(rc[0]) - if self.is_offline(): - return {} - r = self.session.get(f'{self.url_prefix}/project/{project_slug}/job/{job_number}', headers=self.headers) - try: - rc = r.json() - except json.JSONDecodeError: - print(f"Failed to decode {rc}", file=sys.stderr) - raise - self.db.execute("INSERT INTO jobs(slug,job_id, json) VALUES (?, ?, ?)", (project_slug, job_number, json.dumps(rc))) - self.db.commit() - return rc - - def get_job_artifacts(self, project_slug, job_number) -> List[Dict]: - c = self.db.cursor() - c.execute("select json from artifacts where slug=? and job_id = ?", (project_slug, job_number)) - rc = c.fetchone() - if rc is not None: - return json.loads(rc[0]) - if self.is_offline(): - return [{}] - rc = self._get_paged_items_list(f"{self.url_prefix}/project/{project_slug}/{job_number}/artifacts") - self.db.execute("INSERT INTO artifacts(slug,job_id, json) VALUES (?, ?, ?)", (project_slug, job_number, json.dumps(rc))) - self.db.commit() - return rc - - def get_pipeline_jobs(self, project: str = 'github/pytorch/pytorch', branch: Optional[str] = None, item_count: Optional[int] = None) -> Generator: - for pipeline in self.get_pipelines(project, branch, item_count): - for workflow in self.get_pipeline_workflows(pipeline['id']): - in_progress = is_workflow_in_progress(workflow) - for job in self.get_workflow_jobs(workflow['id'], should_cache=not in_progress): - yield (pipeline, workflow, job) - - def get_jobs_summary(self, slug='gh/pytorch/pytorch', workflow='build') -> Dict: - items = self._get_paged_items_list(f'{self.url_prefix}/insights/{slug}/workflows/{workflow}/jobs') - return {item['name']: item for item in items} - - def get_job_timeseries(self, job_name: str, - slug: str = 'gh/pytorch/pytorch', - workflow: str = 'build', - branch: Optional[str] = None) -> List: - params = {'branch': branch} if branch is not None else {} - items = 
self._get_paged_items_list(f'{self.url_prefix}/insights/{slug}/workflows/build/jobs/{job_name}', params) - return [(str2date(x['started_at']), x['duration']) for x in items if x['status'] == 'success'] - - -def aggregate_by_day(series): - rc = {} - for (ts, val) in series: - date = datetime.combine(ts.date(), time()) - valcount = [val, 1.0] - if date not in rc: - rc[date] = valcount - else: - rc[date] = [sum(x) for x in zip(rc[date], valcount)] - return [(x, rc[x][0] / rc[x][1]) for x in sorted(rc.keys())] - - -def filter_names(names: List[str], name_filter: Optional[str] = None) -> List[str]: - import re - if name_filter is None: - return names - filters = name_filter.split(",") - return [name for name in names if any(re.match(filter, name) for filter in filters)] - - -def common_prefix(names: List[str]) -> str: - if len(names) == 0 or len(names[0]) == 0: - return '' - if len(names) == 1: - return names[0] - rc = names[0][0] - while rc != names[0] and all(name.startswith(rc) for name in names[1:]): - rc = names[0][:len(rc) + 1] - return rc[:-1] - - -def plot_graph(name_filter: Optional[str] = None, - output_file: Optional[str] = None, - branch: Optional[str] = None) -> None: - import matplotlib.pyplot as plt - import matplotlib.dates as mdates - - ci_cache = CircleCICache(token=get_circleci_token()) - summary = ci_cache.get_jobs_summary() - test_jobs = [name for name in summary.keys() if name.startswith('pytorch') and 'test' in name] - filtered_jobs = filter_names(test_jobs, name_filter) - prefix = common_prefix(filtered_jobs) - if len(filtered_jobs) == 0: - print(f'Filter "{name_filter}" does not match to any of {test_jobs}') - return - series = [] - labels = [] - styles = [f'{color}{style}' for (style, color) in itertools.product(['-', '--', '-.', ':'], ['b', 'g', 'r', 'c', 'm', 'y', 'k'])] - fig, ax = plt.subplots() - for name in test_jobs: - label = f"{name}(p95 = {int(summary[name]['metrics']['duration_metrics']['p95']/60)} min)" - if name not in 
filtered_jobs: - print(label) - continue - ts = ci_cache.get_job_timeseries(name, branch=branch) - if len(ts) == 0: - print(f'{label} time series is empty!') - continue - print(f'{label} time series has {len(ts)} elements') - labels.append(label[len(prefix):]) - series.append(ts) - x, y = zip(*aggregate_by_day(ts)) - plt.plot(x, [i / 60.0 for i in y], styles[len(labels) % len(styles)]) - plt.legend(labels, loc='upper left') - plt.title(f'{prefix} timeseries') - ax.set_ylabel("Duration (m)") - # Format date - ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d')) - # Rotate tick labels - plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor') - if output_file is not None: - plt.savefig(output_file) - else: - plt.show() - - -def print_line(line: str, padding: Optional[int] = None, newline: bool = True) -> None: - if padding is not None and len(line) < padding: - line += ' ' * (padding - len(line)) - print(line, end='\n' if newline else '\r', flush=True) - - -def fetch_status(branch=None, item_count=50): - isatty = sys.stdout.isatty() - padding = os.get_terminal_size().columns - 1 if isatty else None - ci_cache = CircleCICache(token=get_circleci_token()) - print(f"About to fetch {item_count} latest pipelines against {branch if branch is not None else 'all branches'}") - pipelines = ci_cache.get_pipelines(branch=branch, item_count=item_count) - total_price, total_master_price = 0, 0 - for pipeline_idx, pipeline in enumerate(pipelines): - revision = pipeline['vcs']['revision'] - branch = pipeline['vcs']['branch'] - workflows = ci_cache.get_pipeline_workflows(pipeline['id']) - known_job_ids = [] - for workflow in workflows: - url = f'https://app.circleci.com/pipelines/github/pytorch/pytorch/{workflow["pipeline_number"]}/workflows/{workflow["id"]}' - if is_workflow_in_progress(workflow): - print_line(f'Skipping {url} name:{workflow["name"]} status:{workflow["status"]}', - newline=not sys.stdout.isatty()) - continue - rerun = False - 
total_credits, test_credits, gpu_credits, wincpu_credits, wingpu_credits = 0, 0, 0, 0, 0 - jobs = ci_cache.get_workflow_jobs(workflow['id']) - for job in jobs: - job_name, job_status, job_number = job['name'], job['status'], job.get('job_number', None) - if job_status in ['blocked', 'canceled', 'unauthorized', 'running', 'not_run', 'failing']: - continue - if job_number is None: - print(job) - continue - if job_number in known_job_ids: - rerun = True - continue - job_info = ci_cache.get_job(job['project_slug'], job_number) - if 'executor' not in job_info: - print(f'executor not found in {job_info}') - continue - job_executor = job_info['executor'] - resource_class = job_executor['resource_class'] - if resource_class is None: - print(f'resource_class is none for {job_info}') - continue - job_on_gpu = 'gpu' in resource_class - job_on_win = 'windows' in resource_class - if job_status != 'infrastructure_fail': - duration = str2date(job_info['stopped_at']) - str2date(job_info['started_at']) - job_credits = get_executor_price_rate(job_executor) * int(job_info['duration']) * 1e-3 / 60 - else: - job_credits, duration = 0, 0 - job_cost = job_credits * price_per_credit - total_credits += job_credits - if 'test' in job_name or job_name.startswith('smoke_'): - test_credits += job_credits - elif job_on_gpu: - print(f'Running build job {job_name} on GPU!!!') - if job_on_gpu: - gpu_credits += job_credits - if job_on_win: - wingpu_credits += job_credits - if job_on_win and not job_on_gpu: - wincpu_credits += job_credits - known_job_ids.append(job_number) - print_line(f' {job_name} {job_status} {duration} ${job_cost:.2f}', - padding=padding, newline=not isatty) - # Increment totals - total_price += total_credits * price_per_credit - if branch in ['master', 'nightly', 'postnightly', 'release/1.6']: - total_master_price += total_credits * price_per_credit - # skip small jobs - if total_credits * price_per_credit < .1: - continue - workflow_status = 
f'[{pipeline_idx}/{len(pipelines)}]' - workflow_status += f' {url} {workflow["name"]} status:{workflow["status"]}' - workflow_status += f' price: ${total_credits * price_per_credit:.2f}' - workflow_status += ' (Rerun?)' if rerun else '' - workflow_status += f'\n\t\tdate: {workflow["created_at"]} branch:{branch} revision:{revision}' - workflow_status += f'\n\t\ttotal credits: {int(total_credits)}' - if test_credits != 0: - workflow_status += f' testing: {100 * test_credits / total_credits:.1f}%' - if gpu_credits != 0: - workflow_status += f' GPU testing: {100 * gpu_credits / total_credits:.1f}%' - if wingpu_credits != 0: - workflow_status += f' WINGPU/GPU: {100 * wingpu_credits / gpu_credits:.1f}%' - - if wincpu_credits != 0: - workflow_status += f' Win CPU: {100 * wincpu_credits / total_credits:.1f}%' - workflow_status += f' Total: ${total_price:.2f} master fraction: {100 * total_master_price/ total_price:.1f}%' - print_line(workflow_status, padding=padding) - - -def plot_heatmap(cov_matrix, names): - import numpy as np - import matplotlib.pyplot as plt - assert cov_matrix.shape == (len(names), len(names)) - fig, ax = plt.subplots() - ax.imshow(cov_matrix) - ax.set_xticks(np.arange(len(names))) - ax.set_yticks(np.arange(len(names))) - ax.set_xticklabels(names) - ax.set_yticklabels(names) - # Rotate tick labels - plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor') - # Annotate values - for i in range(len(names)): - for j in range(len(names)): - ax.text(j, i, f'{cov_matrix[i, j]:.2f}', ha='center', va='center', color='w') - plt.show() - - -def filter_service_jobs(name): - if name.startswith('docker'): - return True - if name.startswith('binary'): - return True - return False - - -def filter_cuda_test(name): - if filter_service_jobs(name): - return False - if 'libtorch' in name: - return False - if 'test' not in name: - return False - # Skip jit-profiling tests - if 'jit-profiling' in name: - return False - if 'cuda11' in name: - return 
False - # Skip VS2017 tests - if 'vs2017' in name: - return False - return 'cuda' in name and 'nogpu' not in name - - -def filter_cuda_build(name): - if filter_service_jobs(name): - return False - if 'libtorch' in name: - return False - return 'cuda' in name and name.endswith('build') - - -def filter_windows_test(name): - if filter_service_jobs(name): - return False - # Skip jit-profiling tests - if 'jit-profiling' in name: - return False - return 'test' in name and 'windows' in name - - -def compute_covariance(branch='master', name_filter: Optional[Callable[[str], bool]] = None): - import numpy as np - revisions: MutableSet[str] = set() - job_summary: Dict[str, Dict[str, float]] = {} - - # Extract data - print(f"Computing covariance for {branch if branch is not None else 'all branches'}") - ci_cache = CircleCICache(None) - pipelines = ci_cache.get_pipelines(branch=branch) - for pipeline in pipelines: - if pipeline['trigger']['type'] == 'schedule': - continue - revision = pipeline['vcs']['revision'] - pipeline_jobs: Dict[str, float] = {} - blocked_jobs: MutableSet[str] = set() - workflows = ci_cache.get_pipeline_workflows(pipeline['id']) - for workflow in workflows: - if is_workflow_in_progress(workflow): - continue - jobs = ci_cache.get_workflow_jobs(workflow['id']) - for job in jobs: - job_name = job['name'] - job_status = job['status'] - # Handle renames - if job_name == 'pytorch_linux_xenial_cuda10_1_cudnn7_py3_NO_AVX2_test': - job_name = 'pytorch_linux_xenial_cuda10_1_cudnn7_py3_nogpu_NO_AVX2_test' - if job_name == 'pytorch_linux_xenial_cuda10_1_cudnn7_py3_NO_AVX_NO_AVX2_test': - job_name = 'pytorch_linux_xenial_cuda10_1_cudnn7_py3_nogpu_NO_AVX_test' - if job_status in ['infrastructure_fail', 'canceled']: - continue - if callable(name_filter) and not name_filter(job_name): - continue - if job_status == 'blocked': - blocked_jobs.add(job_name) - continue - if job_name in blocked_jobs: - blocked_jobs.remove(job_name) - result = 1.0 if job_status == 'success' else 
-1.0 - pipeline_jobs[job_name] = result - # Skip build with blocked job [which usually means build failed due to the test failure] - if len(blocked_jobs) != 0: - continue - # Skip all success workflows - if all(result == 1.0 for result in pipeline_jobs.values()): - continue - revisions.add(revision) - for job_name in pipeline_jobs: - if job_name not in job_summary: - job_summary[job_name] = {} - job_summary[job_name][revision] = pipeline_jobs[job_name] - # Analyze results - job_names = sorted(job_summary.keys()) - # revisions = sorted(revisions) - job_data = np.zeros((len(job_names), len(revisions)), dtype=np.float) - print(f"Number of observations: {len(revisions)}") - for job_idx, job_name in enumerate(job_names): - job_row = job_summary[job_name] - for rev_idx, revision in enumerate(revisions): - if revision in job_row: - job_data[job_idx, rev_idx] = job_row[revision] - success_rate = job_data[job_idx, ].sum(where=job_data[job_idx, ] > 0.0) / len(job_row) - present_rate = 1.0 * len(job_row) / len(revisions) - print(f"{job_name}: missing {100.0 * (1.0 - present_rate):.2f}% success rate: {100 * success_rate:.2f}%") - cov_matrix = np.corrcoef(job_data) - plot_heatmap(cov_matrix, job_names) - - -def print_artifacts(branch, item_count, name_filter: Callable[[str], bool]) -> None: - ci_cache = CircleCICache(token=get_circleci_token()) - for pipeline, _, job in ci_cache.get_pipeline_jobs(branch=branch, item_count=item_count): - revision = pipeline['vcs']['revision'] - if not name_filter(job["name"]): - continue - job_number = job.get("job_number") - if job_number is None: - continue - artifacts = ci_cache.get_job_artifacts('gh/pytorch/pytorch', job_number) - for artifact in artifacts: - name = os.path.basename(artifact['path']) - url = artifact["url"] - print(f"{revision} {name} {url}") - - -def print_duration(branch, item_count, name_filter: Callable[[str], bool]) -> None: - ci_cache = CircleCICache(token=get_circleci_token()) - for pipeline, workflow, job in 
ci_cache.get_pipeline_jobs(branch=branch, item_count=item_count): - job_name, job_status, job_number = job['name'], job['status'], job.get("job_number") - revision = pipeline['vcs']['revision'] - if not name_filter(job_name) or job_number is None: - continue - if job_status in ['blocked', 'canceled', 'unauthorized', 'running', 'not_run', 'failing']: - continue - started_at = str2date(job['started_at']) - stopped_at = str2date(job['stopped_at']) - duration = stopped_at - started_at - print(f"{job_name} {revision} {duration} {started_at}") - - -def parse_arguments(): - from argparse import ArgumentParser - parser = ArgumentParser(description="Download and analyze circle logs") - parser.add_argument('--plot-graph', type=str, nargs='?', help="Plot job time trends", const='') - parser.add_argument('--output', type=str, help="Output file name for the graphs") - parser.add_argument('--get_artifacts', type=str) - parser.add_argument('--print-duration', type=str) - parser.add_argument('--branch', type=str) - parser.add_argument('--item_count', type=int, default=100) - parser.add_argument('--compute_covariance', choices=['cuda_test', 'cuda_build', 'windows_test']) - return parser.parse_args() - - -if __name__ == '__main__': - args = parse_arguments() - if args.get_artifacts is not None: - print_artifacts(branch=args.branch, - item_count=args.item_count, - name_filter=lambda x: args.get_artifacts in x) - sys.exit(0) - if args.print_duration is not None: - print_duration(branch=args.branch, - item_count=args.item_count, - name_filter=lambda x: args.print_duration in x) - sys.exit(0) - if args.compute_covariance is not None: - name_filter = { - 'cuda_test': filter_cuda_test, - 'cuda_build': filter_cuda_build, - 'windows_test': filter_windows_test, - }[args.compute_covariance] - compute_covariance(branch=args.branch, name_filter=name_filter) - sys.exit(0) - if args.plot_graph is not None: - plot_graph(args.plot_graph, args.output, args.branch) - sys.exit(0) - 
fetch_status(branch=args.branch, item_count=args.item_count) diff --git a/analytics/validate_pypi_staging.py b/analytics/validate_pypi_staging.py new file mode 100644 index 0000000000..a7104c3140 --- /dev/null +++ b/analytics/validate_pypi_staging.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 + +import os.path +import shutil +import tempfile +import zipfile +import boto3 +import botocore + + +PLATFORMS = [ + "manylinux1_x86_64", + "manylinux2014_aarch64", + "win_amd64", + "macosx_10_9_x86_64", + "macosx_11_0_arm64", +] +PYTHON_VERSIONS = ["cp38", "cp39", "cp310", "cp311", "cp312"] +S3_PYPI_STAGING = "pytorch-backup" +PACKAGE_RELEASES = { + "torch": "2.2.0", + "torchvision": "0.17.0", + "torchaudio": "2.2.0", + "torchtext": "0.17.0", +} + +PATTERN_V = "Version:" +PATTERN_RD = "Requires-Dist:" + +s3 = boto3.client("s3") + + +def get_size(path): + size = os.path.getsize(path) + if size < 1024: + return f"{size} bytes" + elif size < pow(1024, 2): + return f"{round(size/1024, 2)} KB" + elif size < pow(1024, 3): + return f"{round(size/(pow(1024,2)), 2)} MB" + elif size < pow(1024, 4): + return f"{round(size/(pow(1024,3)), 2)} GB" + + +def generate_expected_builds(platform: str, package: str, release: str) -> list: + builds = [] + for py_version in PYTHON_VERSIONS: + py_spec = f"{py_version}-{py_version}" + platform_spec = platform + + if package == "torchtext" and ( + platform == "manylinux2014_aarch64" or py_version == "cp312" + ): + continue + + # strange macos file nameing + if "macos" in platform: + if package == "torch": + py_spec = f"{py_version}-none" + elif "macosx_10_9_x86_64" in platform: + platform_spec = "macosx_10_13_x86_64" + + builds.append( + f"{package}-{release}-pypi-staging/{package}-{release}-{py_spec}-{platform_spec}.whl" + ) + + return builds + + +def validate_file_metadata(build: str, package: str, version: str): + temp_dir = tempfile.mkdtemp() + tmp_file = f"{temp_dir}/{os.path.basename(build)}" + s3.download_file(Bucket=S3_PYPI_STAGING, Key=build, 
Filename=tmp_file) + print(f"Downloaded: {tmp_file} {get_size(tmp_file)}") + with zipfile.ZipFile(f"{temp_dir}/{os.path.basename(build)}", "r") as zip_ref: + zip_ref.extractall(f"{temp_dir}") + + for i, line in enumerate( + open(f"{temp_dir}/{package}-{version}.dist-info/METADATA") + ): + if line.startswith(PATTERN_V): + print(f"{line}", end="") + exttracted_version = line.removeprefix(PATTERN_V).strip() + if version != exttracted_version: + print( + f"FAILURE VERSION DOES NOT MATCH expected {version} got {exttracted_version}" + ) + + elif line.startswith(PATTERN_RD): + print(f"{line}", end="") + + shutil.rmtree(temp_dir) + + +def main(): + expected_builds = dict.fromkeys(PACKAGE_RELEASES, []) + + # Iterate over platform to gather build information of available conda version. + for package in PACKAGE_RELEASES: + for platform in PLATFORMS: + expected_builds[package] = expected_builds[ + package + ] + generate_expected_builds(platform, package, PACKAGE_RELEASES[package]) + + for package in PACKAGE_RELEASES: + count = 0 + for build in expected_builds[package]: + try: + s3.head_object(Bucket=S3_PYPI_STAGING, Key=build) + print(f"Validating filename {os.path.basename(build)}") + validate_file_metadata(build, package, PACKAGE_RELEASES[package]) + count += 1 + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + print(f"FAILED 404 Error on {build}") + elif e.response["Error"]["Code"] == "403": + print(f"FAILED Unauthorized Error on {build}") + else: + print(f"Error on {build}") + print(f"Package Validated {count} for {package}") + + +if __name__ == "__main__": + main() diff --git a/release/README.md b/release/README.md index 19fd28bf80..a80144b17d 100644 --- a/release/README.md +++ b/release/README.md @@ -12,6 +12,13 @@ These are a collection of scripts that are to be used for release activities. 
* Access to upload conda packages to the [`pytorch`](https://anaconda.org/pytorch) conda channel * Access to the PyPI repositories (like [torch](https://pypi.org/project/torch)) +## Promote pypi to staging + +Following steps needed in order to promote pypi to staging: +1. Edit `release_versions.sh` and set correct version +2. Run promote script : `./pypi/promote_pypi_to_staging.sh` +3. Edit and run `../analytics/validate_pypi_staging.py` to perform initial prevalidation of binaries for pypi promotion +4. Manually inspect and spot check binaries staged for pypi promotion by logging into s3 and downloading packages ## Promote diff --git a/release/pypi/promote_pypi_to_staging.sh b/release/pypi/promote_pypi_to_staging.sh index dbc00e24ad..e05634c363 100644 --- a/release/pypi/promote_pypi_to_staging.sh +++ b/release/pypi/promote_pypi_to_staging.sh @@ -49,7 +49,8 @@ PLATFORM="win_amd64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_stagi PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" -PLATFORM="manylinux2014_x86_64" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" -PLATFORM="win_amd64" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" -PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" -PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" +# Please note torchdata is not released currently hence turning it off +#PLATFORM="manylinux2014_x86_64" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" +#PLATFORM="win_amd64" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" +#PLATFORM="${MACOS_X86_64}" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" +#PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" 
upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" From 6f3530cd25ce0b5456febf193d57ef343663a608 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 26 Jan 2024 17:42:48 -0500 Subject: [PATCH 199/212] [Analytics] Pypi validations. Add call to check-wheel-contents (#1689) --- analytics/validate_pypi_staging.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/analytics/validate_pypi_staging.py b/analytics/validate_pypi_staging.py index a7104c3140..fd5026a9ed 100644 --- a/analytics/validate_pypi_staging.py +++ b/analytics/validate_pypi_staging.py @@ -2,12 +2,13 @@ import os.path import shutil +import subprocess import tempfile import zipfile + import boto3 import botocore - PLATFORMS = [ "manylinux1_x86_64", "manylinux2014_aarch64", @@ -72,6 +73,22 @@ def validate_file_metadata(build: str, package: str, version: str): tmp_file = f"{temp_dir}/{os.path.basename(build)}" s3.download_file(Bucket=S3_PYPI_STAGING, Key=build, Filename=tmp_file) print(f"Downloaded: {tmp_file} {get_size(tmp_file)}") + + try: + check_wheels = subprocess.run( + ["check-wheel-contents", tmp_file, "--ignore", "W002,W009,W004"], + capture_output=True, + text=True, + check=True, + encoding="utf-8", + ) + print(check_wheels.stdout) + print(check_wheels.stderr) + except subprocess.CalledProcessError as e: + exit_code = e.returncode + stderror = e.stderr + print(exit_code, stderror) + with zipfile.ZipFile(f"{temp_dir}/{os.path.basename(build)}", "r") as zip_ref: zip_ref.extractall(f"{temp_dir}") From 0582b02f0ba73ca3ec5dc26943e7980d5c19d7fc Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 29 Jan 2024 13:28:30 -0500 Subject: [PATCH 200/212] Modify Validate Nightly PyPI Wheel Binary Size to pick correct binary (#1690) --- .github/workflows/validate-nightly-pypi-wheel-binary-size.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/validate-nightly-pypi-wheel-binary-size.yml 
b/.github/workflows/validate-nightly-pypi-wheel-binary-size.yml index a995ec817a..24fffc16eb 100644 --- a/.github/workflows/validate-nightly-pypi-wheel-binary-size.yml +++ b/.github/workflows/validate-nightly-pypi-wheel-binary-size.yml @@ -22,5 +22,5 @@ jobs: - name: Run validation run: | python tools/binary_size_validation/binary_size_validation.py \ - --url https://download.pytorch.org/whl/nightly/torch/ \ - --include "pypi" --only-latest-version --threshold 750 \ No newline at end of file + --url https://download.pytorch.org/whl/nightly/cu121/torch/ \ + --include "linux" --only-latest-version --threshold 750 From aad5cecd7653aa231964dedfe86227d350a0b969 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 30 Jan 2024 08:25:27 -0500 Subject: [PATCH 201/212] Fix test_ops scripts on release validation testing (#1691) --- .github/scripts/validate_test_ops.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/validate_test_ops.sh b/.github/scripts/validate_test_ops.sh index 12963f289b..5df646705c 100644 --- a/.github/scripts/validate_test_ops.sh +++ b/.github/scripts/validate_test_ops.sh @@ -7,7 +7,7 @@ retry () { } BRANCH="" -if [[ ${MATRIX_CHANNEL} == "test" ]]; then +if [[ ${MATRIX_CHANNEL} == "test" || ${MATRIX_CHANNEL} == "release" ]]; then SHORT_VERSION=${MATRIX_STABLE_VERSION%.*} BRANCH="--branch release/${SHORT_VERSION}" fi From 88adb304e04a7eaea46d572490230d09991c4bf8 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 30 Jan 2024 09:38:07 -0500 Subject: [PATCH 202/212] Add option to validate only from download.pytorch.org (#1692) --- .../workflows/validate-aarch64-linux-binaries.yml | 12 ++++++++++++ .github/workflows/validate-binaries.yml | 15 +++++++++++++++ .github/workflows/validate-linux-binaries.yml | 11 +++++++++++ .../workflows/validate-macos-arm64-binaries.yml | 12 ++++++++++++ .github/workflows/validate-macos-binaries.yml | 12 ++++++++++++ .github/workflows/validate-windows-binaries.yml | 12 ++++++++++++ 6 
files changed, 74 insertions(+) diff --git a/.github/workflows/validate-aarch64-linux-binaries.yml b/.github/workflows/validate-aarch64-linux-binaries.yml index 6b1a60d7cc..8761df4f9b 100644 --- a/.github/workflows/validate-aarch64-linux-binaries.yml +++ b/.github/workflows/validate-aarch64-linux-binaries.yml @@ -27,6 +27,11 @@ on: default: "" required: false type: string + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -58,6 +63,11 @@ on: default: "" required: false type: string + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean jobs: generate-aarch64-linux-matrix: @@ -67,6 +77,8 @@ jobs: os: linux-aarch64 channel: ${{ inputs.channel }} with-cuda: disable + use-only-dl-pytorch-org: ${{ inputs.use-only-dl-pytorch-org }} + linux-aarch64: needs: generate-aarch64-linux-matrix strategy: diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index 558be8e566..78e631ab89 100644 --- a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -32,6 +32,11 @@ on: default: false required: false type: boolean + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean workflow_dispatch: inputs: os: @@ -75,6 +80,11 @@ on: default: false required: false type: boolean + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean jobs: @@ -93,6 +103,7 @@ jobs: torchonly: ${{ inputs.torchonly }} version: ${{ inputs.version }} release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} + use-only-dl-pytorch-org: ${{ 
inputs.use-only-dl-pytorch-org }} linux: if: inputs.os == 'linux' || inputs.os == 'all' @@ -105,6 +116,7 @@ jobs: version: ${{ inputs.version }} release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} include-test-ops: ${{ inputs.include-test-ops }} + use-only-dl-pytorch-org: ${{ inputs.use-only-dl-pytorch-org }} linux-aarch64: if: inputs.os == 'linux-aarch64' || inputs.os == 'all' @@ -116,6 +128,7 @@ jobs: torchonly: ${{ inputs.torchonly }} version: ${{ inputs.version }} release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} + use-only-dl-pytorch-org: ${{ inputs.use-only-dl-pytorch-org }} mac: if: inputs.os == 'macos' || inputs.os == 'all' @@ -127,6 +140,7 @@ jobs: torchonly: ${{ inputs.torchonly }} version: ${{ inputs.version }} release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} + use-only-dl-pytorch-org: ${{ inputs.use-only-dl-pytorch-org }} mac-arm64: if: inputs.os == 'macos' || inputs.os == 'all' @@ -138,3 +152,4 @@ jobs: torchonly: ${{ inputs.torchonly }} version: ${{ inputs.version }} release-matrix: ${{ needs.generate-release-matrix.outputs.matrix }} + use-only-dl-pytorch-org: ${{ inputs.use-only-dl-pytorch-org }} diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index de5bda9998..08507f64d0 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -32,6 +32,11 @@ on: default: false required: false type: boolean + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -68,6 +73,11 @@ on: default: false required: false type: boolean + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean jobs: generate-linux-matrix: @@ -76,6 +86,7 @@ jobs: package-type: all 
os: linux channel: ${{ inputs.channel }} + use-only-dl-pytorch-org: ${{ inputs.use-only-dl-pytorch-org }} linux: needs: generate-linux-matrix diff --git a/.github/workflows/validate-macos-arm64-binaries.yml b/.github/workflows/validate-macos-arm64-binaries.yml index 541183b9af..dea76ffeb8 100644 --- a/.github/workflows/validate-macos-arm64-binaries.yml +++ b/.github/workflows/validate-macos-arm64-binaries.yml @@ -27,6 +27,11 @@ on: default: "" required: false type: string + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -58,6 +63,11 @@ on: default: "" required: false type: string + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean jobs: generate-macos-arm64-matrix: @@ -66,6 +76,8 @@ jobs: package-type: all os: macos-arm64 channel: ${{ inputs.channel }} + use-only-dl-pytorch-org: ${{ inputs.use-only-dl-pytorch-org }} + macos-arm64: needs: generate-macos-arm64-matrix strategy: diff --git a/.github/workflows/validate-macos-binaries.yml b/.github/workflows/validate-macos-binaries.yml index 9610b36f70..76035a97d2 100644 --- a/.github/workflows/validate-macos-binaries.yml +++ b/.github/workflows/validate-macos-binaries.yml @@ -27,6 +27,11 @@ on: default: "" required: false type: string + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -58,6 +63,11 @@ on: default: "" required: false type: string + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean jobs: generate-macos-matrix: @@ -66,6 +76,8 @@ jobs: package-type: all os: macos channel: ${{ inputs.channel 
}} + use-only-dl-pytorch-org: ${{ inputs.use-only-dl-pytorch-org }} + macos: needs: generate-macos-matrix strategy: diff --git a/.github/workflows/validate-windows-binaries.yml b/.github/workflows/validate-windows-binaries.yml index 1c501cfb39..9d4b3a8c43 100644 --- a/.github/workflows/validate-windows-binaries.yml +++ b/.github/workflows/validate-windows-binaries.yml @@ -27,6 +27,11 @@ on: default: "" required: false type: string + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean workflow_dispatch: inputs: channel: @@ -58,6 +63,11 @@ on: default: "" required: false type: string + use-only-dl-pytorch-org: + description: 'Use only download.pytorch.org when generating wheel install command' + default: false + required: false + type: boolean jobs: generate-windows-matrix: @@ -66,6 +76,8 @@ jobs: package-type: all os: windows channel: ${{ inputs.channel }} + use-only-dl-pytorch-org: ${{ inputs.use-only-dl-pytorch-org }} + win: needs: generate-windows-matrix strategy: From add4488dcc3504b9e58bd470a1041501a294e7d2 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 30 Jan 2024 10:27:11 -0500 Subject: [PATCH 203/212] Exclude pipy and poetry tests when USE_ONLY_DL_PYTORCH_ORG is set (#1693) --- .github/workflows/validate-linux-binaries.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/validate-linux-binaries.yml b/.github/workflows/validate-linux-binaries.yml index 08507f64d0..87c9f7cd8b 100644 --- a/.github/workflows/validate-linux-binaries.yml +++ b/.github/workflows/validate-linux-binaries.yml @@ -107,13 +107,17 @@ jobs: export ENV_NAME="conda-env-${{ github.run_id }}" export TORCH_ONLY=${{ inputs.torchonly }} export INCLUDE_TEST_OPS=${{ inputs.include-test-ops }} + export USE_ONLY_DL_PYTORCH_ORG=${{ inputs.use-only-dl-pytorch-org }} export RELEASE_VERSION=${{ inputs.version }} export TARGET_OS="linux" eval 
"$(conda shell.bash hook)" printf '%s\n' ${{ toJson(inputs.release-matrix) }} > release_matrix.json # Special case PyPi installation package. And Install of PyPi package via poetry - if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" && ${MATRIX_GPU_ARCH_VERSION} == "12.1" && ${MATRIX_CHANNEL} == "release" ]]; then + if [[ ${MATRIX_PACKAGE_TYPE} == "manywheel" && \ + ${MATRIX_GPU_ARCH_VERSION} == "12.1" && \ + ${MATRIX_CHANNEL} == "release" && \ + ${USE_ONLY_DL_PYTORCH_ORG} == "false" ]]; then source ./.github/scripts/validate_pipy.sh source ./.github/scripts/validate_poetry.sh fi From 3d302eec26e4e26ec5bf767aef8278c1b521239d Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Thu, 1 Feb 2024 13:33:45 -0800 Subject: [PATCH 204/212] [ROCm] add hipblaslt library files (#1695) With https://github.com/pytorch/pytorch/pull/114329 merged, we need to include hipblaslt library files within the ROCm wheel. --- manywheel/build_rocm.sh | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/manywheel/build_rocm.sh b/manywheel/build_rocm.sh index a44d6212f6..4e513957f5 100755 --- a/manywheel/build_rocm.sh +++ b/manywheel/build_rocm.sh @@ -133,7 +133,7 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then LIBDRM_AMDGPU_PATH="/usr/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1" MAYBE_LIB64=lib fi -OS_SO_PATHS=($LIBGOMP_PATH $LIBNUMA_PATH\ +OS_SO_PATHS=($LIBGOMP_PATH $LIBNUMA_PATH\ $LIBELF_PATH $LIBTINFO_PATH\ $LIBDRM_PATH $LIBDRM_AMDGPU_PATH) OS_SO_FILES=() @@ -147,7 +147,7 @@ done if [[ $ROCM_INT -ge 50200 ]]; then ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library ROCBLAS_LIB_DST=lib/rocblas/library -else +else ROCBLAS_LIB_SRC=$ROCM_HOME/rocblas/lib/library ROCBLAS_LIB_DST=lib/library fi @@ -156,17 +156,24 @@ ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) ROCBLAS_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES) +# hipblaslt library files +HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library 
+HIPBLASLT_LIB_DST=lib/hipblaslt/library +ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH) +OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx) +HIPBLASLT_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES) + # ROCm library files ROCM_SO_PATHS=() for lib in "${ROCM_SO_FILES[@]}" do file_path=($(find $ROCM_HOME/lib/ -name "$lib")) # First search in lib - if [[ -z $file_path ]]; then + if [[ -z $file_path ]]; then if [ -d "$ROCM_HOME/lib64/" ]; then file_path=($(find $ROCM_HOME/lib64/ -name "$lib")) # Then search in lib64 fi fi - if [[ -z $file_path ]]; then + if [[ -z $file_path ]]; then file_path=($(find $ROCM_HOME/ -name "$lib")) # Then search in ROCM_HOME fi if [[ -z $file_path ]]; then @@ -188,11 +195,13 @@ DEPS_SONAME=( DEPS_AUX_SRCLIST=( "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}" + "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}" "/opt/amdgpu/share/libdrm/amdgpu.ids" ) DEPS_AUX_DSTLIST=( "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}" + "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}" "share/libdrm/amdgpu.ids" ) From da779da84f6dd212f8262c675e29afd993f8d289 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 1 Feb 2024 22:30:58 -0800 Subject: [PATCH 205/212] Minor tweak to fbgemmgpu version to ignore RC suffix (#1694) --- release/promote.sh | 8 +++++++- release/release_versions.sh | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/release/promote.sh b/release/promote.sh index 5bf7fe0b5e..36644494e2 100644 --- a/release/promote.sh +++ b/release/promote.sh @@ -12,7 +12,13 @@ TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.1} TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.1} TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} -FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-0.6.0} + +# NB: FBGEMMGPU uses the practice of keeping rc version in the filename, i.e. +# fbgemm_gpu-0.6.0rc1+cpu-cp311-cp311. 
On the other hand, its final RC will +# be without rc suffix, fbgemm_gpu-0.6.0+cpu-cp311-cp311, and that's the one +# ready to be promoted. So, keeping a + here in the version name allows the +# promote script to find the correct binaries +FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-0.6.0+} DRY_RUN=${DRY_RUN:-enabled} diff --git a/release/release_versions.sh b/release/release_versions.sh index 88b6cbcd3a..311358bd37 100644 --- a/release/release_versions.sh +++ b/release/release_versions.sh @@ -7,4 +7,10 @@ TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.2.0} TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.17.0} TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} -FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-0.6.0} + +# NB: FBGEMMGPU uses the practice of keeping rc version in the filename, i.e. +# fbgemm_gpu-0.6.0rc1+cpu-cp311-cp311. On the other hand, its final RC will +# be without rc suffix, fbgemm_gpu-0.6.0+cpu-cp311-cp311, and that's the one +# ready to be promoted. So, keeping a + here in the version name allows the +# promote script to find the correct binaries +FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-0.6.0+} From 96bd8512cb0a04368930cdad47b6b91ff6671f5f Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 6 Feb 2024 11:03:20 -0800 Subject: [PATCH 206/212] Remove custom PyTorch build dependency logic on 3.11 (#1697) * Remove custom PyTorch build dependency logic on 3.11 * Add a smoke test for openmp --- run_tests.sh | 5 ++++- wheel/build_wheel.sh | 7 ++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/run_tests.sh b/run_tests.sh index fd66835e23..2046501caa 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -142,7 +142,7 @@ if [[ "$cuda_ver" != 'cpu' ]]; then fi fi -# Check that OpenBlas is not linked to on Macs +# Check that OpenBlas is not linked to on MacOS if [[ "$(uname)" == 'Darwin' ]]; then echo "Checking the OpenBLAS is not linked to" all_dylibs=($(find "$(python -c "import site; print(site.getsitepackages()[0])")"/torch -name 
'*.dylib')) @@ -153,6 +153,9 @@ if [[ "$(uname)" == 'Darwin' ]]; then exit 1 fi done + + echo "Checking that OpenMP is available" + python -c "import torch; exit(0 if torch.backends.openmp.is_available() else 1)" fi popd diff --git a/wheel/build_wheel.sh b/wheel/build_wheel.sh index 1186bc56ae..5e9c68041d 100755 --- a/wheel/build_wheel.sh +++ b/wheel/build_wheel.sh @@ -182,11 +182,8 @@ tmp_env_name="wheel_py$python_nodot" conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" source activate "$tmp_env_name" -if [[ "$desired_python" == "3.11" ]]; then - retry pip install -q "numpy${NUMPY_PINNED_VERSION}" "setuptools${SETUPTOOLS_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" typing_extensions requests -else - retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq "numpy${NUMPY_PINNED_VERSION}" nomkl "setuptools${SETUPTOOLS_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" typing_extensions requests -fi +retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq "numpy${NUMPY_PINNED_VERSION}" nomkl "setuptools${SETUPTOOLS_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" typing_extensions requests + if [[ "$(uname -m)" == "arm64" ]]; then retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq cmake ninja else From 196b77bc829631379dc9242797cbe3e7e03b28dc Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 6 Feb 2024 12:09:00 -0800 Subject: [PATCH 207/212] Pin conda-build to 3.28.4 (#1698) --- conda/build_pytorch.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 39aab7ee89..473d3bf278 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -208,7 +208,8 @@ if [[ "$(uname)" == 'Darwin' ]]; then "$miniconda_sh" -b -p "$tmp_conda" && \ rm "$miniconda_sh" export PATH="$tmp_conda/bin:$PATH" - retry conda install -yq conda-build + # TODO(huydhn): We can revert the pin after https://github.com/conda/conda-build/issues/5167 is resolved + retry conda install -yq 
conda-build=3.28.4 elif [[ "$OSTYPE" == "msys" ]]; then export tmp_conda="${WIN_PACKAGE_WORK_DIR}\\conda" export miniconda_exe="${WIN_PACKAGE_WORK_DIR}\\miniconda.exe" @@ -351,8 +352,6 @@ for py_ver in "${DESIRED_PYTHON[@]}"; do conda install -y conda-package-handling conda==22.9.0 else conda install -y conda-package-handling conda==23.5.2 - # NS: To be removed after conda docker images are updated - conda update -y conda-build fi echo "Calling conda-build at $(date)" From 850d28bb1503dee0d0946fcdd3e1759af2061ffb Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Tue, 6 Feb 2024 17:46:25 -0600 Subject: [PATCH 208/212] ci: aarch64 linux: fix torch performance issue with conda openblas package (#1696) changing the conda openblas package from pthread version to openmp version to match torch openmp runtime. The pthread version was conflicting with the openmp runtime and causing thread over-subscription and performance degradation. --- aarch64_linux/aarch64_ci_setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_ci_setup.sh b/aarch64_linux/aarch64_ci_setup.sh index 53c8a5320c..07a4757ff8 100755 --- a/aarch64_linux/aarch64_ci_setup.sh +++ b/aarch64_linux/aarch64_ci_setup.sh @@ -30,7 +30,7 @@ if [[ "$DESIRED_PYTHON" == "3.8" ]]; then else NUMPY_VERSION="1.26.2" fi -conda install -y -c conda-forge numpy==${NUMPY_VERSION} pyyaml==6.0.1 patchelf==0.17.2 pygit2==1.13.2 openblas==0.3.25 ninja==1.11.1 scons==4.5.2 +conda install -y -c conda-forge numpy==${NUMPY_VERSION} pyyaml==6.0.1 patchelf==0.17.2 pygit2==1.13.2 openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2 python --version conda --version From fa8b6d667be4828bb88ec2a7d9e1b90fc4c081a9 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 13 Feb 2024 13:41:10 -0500 Subject: [PATCH 209/212] Add triton version for nightly and release (#1703) --- manywheel/build_cuda.sh | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 
deletions(-) diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index 9919247ed0..318273ba41 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -262,19 +262,20 @@ else exit 1 fi -# TODO: Remove me when Triton has a proper release channel -# No triton dependency for now on 3.12 since we don't have binaries for it -# and torch.compile doesn't work. -if [[ $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then + +TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) +# Only linux Python < 3.12 are supported wheels for triton +TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.12'" +TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" +if [[ -n "$OVERRIDE_PACKAGE_VERSION" && "$OVERRIDE_PACKAGE_VERSION" =~ .*dev.* ]]; then TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.github/ci_commit_pins/triton.txt) - TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) - TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}; platform_system == 'Linux' and platform_machine == 'x86_64'" + TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" +fi - if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" - else - export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" - fi +if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" +else + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" fi # builder/test.sh requires DESIRED_CUDA to know what tests to exclude From 5c814e2527b3f5797488bf57d9d5425e63dcc1ac Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 13 Feb 2024 10:58:30 -0800 Subject: [PATCH 210/212] Bundle PTXAS into 11.8 wheel --- 
manywheel/build_cuda.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index 318273ba41..39dafe8b1b 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -197,6 +197,8 @@ elif [[ $CUDA_VERSION == "11.8" ]]; then export USE_STATIC_CUDNN=0 # Try parallelizing nvcc as well export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" + # Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750 + export BUILD_BUNDLE_PTXAS=1 if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." From f4b92598544c7e8250a208e9616e62a25e7f7f18 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 16 Feb 2024 12:55:06 -0500 Subject: [PATCH 211/212] Add tensorrt promo script, bump release version for 2.2.1 (#1706) --- release/promote.sh | 10 ++++++---- release/release_versions.sh | 9 +++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/release/promote.sh b/release/promote.sh index 36644494e2..78a43409b2 100644 --- a/release/promote.sh +++ b/release/promote.sh @@ -6,12 +6,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" source "${DIR}/release_versions.sh" # Make sure to update these versions when doing a release first -PYTORCH_VERSION=${PYTORCH_VERSION:-2.1.1} -TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.16.1} -TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.1.1} -TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.16.1} +PYTORCH_VERSION=${PYTORCH_VERSION:-2.2.1} +TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.17.1} +TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.2.1} +TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.17.1} TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} +TENSORRT_VERSION=${TENSORRT_VERSION:-2.2.0} # NB: FBGEMMGPU uses the practice of keeping rc version in the filename, i.e. # fbgemm_gpu-0.6.0rc1+cpu-cp311-cp311. 
On the other hand, its final RC will @@ -113,6 +114,7 @@ promote_pypi() { # promote_s3 torchrec whl "${TORCHREC_VERSION}" # promote_s3 fbgemm-gpu whl "${FBGEMMGPU_VERSION}" # promote_s3 "libtorch-*" libtorch "${PYTORCH_VERSION}" +# promote_s3 "torch_tensorrt" whl "${TENSORRT_VERSION}" # promote_conda torchtriton conda "2.1.0" # promote_conda pytorch-cuda conda "11.8" diff --git a/release/release_versions.sh b/release/release_versions.sh index 311358bd37..af02cf7eb8 100644 --- a/release/release_versions.sh +++ b/release/release_versions.sh @@ -1,12 +1,13 @@ #!/usr/bin/env bash # Make sure to update these versions when doing a release first -PYTORCH_VERSION=${PYTORCH_VERSION:-2.2.0} -TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.17.0} -TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.2.0} -TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.17.0} +PYTORCH_VERSION=${PYTORCH_VERSION:-2.2.1} +TORCHVISION_VERSION=${TORCHVISION_VERSION:-0.17.1} +TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION:-2.2.1} +TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.17.1} TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.7.1} TORCHREC_VERSION=${TORCHREC_VERSION:-0.6.0} +TENSORRT_VERSION=${TENSORRT_VERSION:-2.2.0} # NB: FBGEMMGPU uses the practice of keeping rc version in the filename, i.e. # fbgemm_gpu-0.6.0rc1+cpu-cp311-cp311. 
On the other hand, its final RC will From af4827c637d2f1fca7fbc52e96364ea9840508a3 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 16 Feb 2024 17:46:15 -0500 Subject: [PATCH 212/212] Pin Conda to 23.11.0 --- .github/scripts/validate_binaries.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 042b227679..f3b73e3de3 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -3,7 +3,8 @@ if [[ ${MATRIX_PACKAGE_TYPE} == "libtorch" ]]; then unzip libtorch.zip else - conda update -y -n base -c defaults conda + # Conda pinned see issue: https://github.com/ContinuumIO/anaconda-issues/issues/13350 + conda install -y conda=23.11.0 # Please note ffmpeg is required for torchaudio, see https://github.com/pytorch/pytorch/issues/96159 conda create -y -n ${ENV_NAME} python=${MATRIX_PYTHON_VERSION} numpy ffmpeg conda activate ${ENV_NAME}