diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index a2a8bff..eecde17 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -31,8 +31,7 @@ jobs:
           - "3.10"
           - "3.11"
         torch_version:
-          - "2.1.1"
-          - "2.1.2"
+          - "2.2.0"
         cuda_short_version:
           - "118"
           - "121"
diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 473e22f..58e525f 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -66,7 +66,7 @@ jobs:
       # windows does not have per version binary, it is just 'python3'
       PY: python${{ contains(inputs.os, 'ubuntu') && inputs.python || '3' }}
 
-    container: ${{ contains(inputs.os, 'ubuntu') && 'quay.io/pypa/manylinux2014_x86_64' || null }}
+    # container: ${{ contains(inputs.os, 'ubuntu') && 'quay.io/pypa/manylinux2014_x86_64' || null }}
     timeout-minutes: 360
     defaults:
       run:
@@ -117,14 +117,46 @@ jobs:
               fp.write("TORCH_CUDA_ARCH_LIST=" + arch_list + "\n")
       - run: echo "${TORCH_CUDA_ARCH_LIST}"
 
+      - if: contains(inputs.os, 'ubuntu')
+        name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@v1.3.1
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: false
+
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
+
       - if: runner.os == 'Linux'
         name: (Linux) install cuda
-        run: >
-          yum install wget git prename -y &&
-          yum clean all --verbose &&
-          wget -q "${{ steps.cuda_info.outputs.CUDA_INSTALL_SCRIPT }}" -O cuda.run &&
-          sh ./cuda.run --silent --toolkit &&
+        run: |
+          # yum install wget git prename -y
+          # yum clean all --verbose
+          sudo apt update
+          sudo apt install -y wget git rename
+          sudo apt clean -y
+          sudo apt autoremove -y
+          wget -q "${{ steps.cuda_info.outputs.CUDA_INSTALL_SCRIPT }}" -O cuda.run
+          sudo sh ./cuda.run --silent --toolkit --toolkitpath=/usr/local/cuda || cat /tmp/cuda-installer.log
           rm ./cuda.run
+          echo "CUDA_HOME=/usr/local/cuda" >> ${GITHUB_ENV}
+          echo "PATH=/usr/local/cuda/bin:$PATH" >> ${GITHUB_ENV}
+
+      - if: runner.os == 'Linux'
+        name: (Linux) install python
+        run: |
+          sudo add-apt-repository ppa:deadsnakes/ppa -y
+          sudo apt update
+          sudo apt install -y python${{ inputs.python }} python${{ inputs.python }}-dev python${{ inputs.python }}-venv
+          sudo apt clean -y
+          sudo apt autoremove -y
 
       - name: Recursive checkout
         uses: actions/checkout@v3
@@ -182,6 +214,7 @@ jobs:
         run: |
           cudnn_next_version_major=$((${CUDNN_VERSION_MAJOR} + 1))
           cudnn_package_name="${CUDNN_PYPI_PACKAGE}>=${CUDNN_VERSION_MAJOR}.0.0.0,<${cudnn_next_version_major}.0.0.0"
+          $PY -m pip install --upgrade pip
           $PY -m pip install wheel setuptools ninja twine "torch==${{ inputs.torch_version }}" "${cudnn_package_name}" -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cuda_short_version }} --no-cache-dir
 
       - name: Build wheel
diff --git a/community/reproduce_vae_segfault.py b/community/reproduce_vae_segfault.py
new file mode 100644
index 0000000..d269115
--- /dev/null
+++ b/community/reproduce_vae_segfault.py
@@ -0,0 +1,37 @@
+import torch
+import torch.nn.functional as F
+
+from diffusers import AutoencoderKL
+
+from sfast.compilers.stable_diffusion_pipeline_compiler import (
+    compile_vae,
+    CompilationConfig,
+)
+
+device = torch.device("cuda:0")
+
+SD_2_1_DIFFUSERS_MODEL = "stabilityai/stable-diffusion-2-1"
+variant = {"variant": "fp16"}
+vae_orig = AutoencoderKL.from_pretrained(
+    SD_2_1_DIFFUSERS_MODEL,
+    subfolder="vae",
+    torch_dtype=torch.float16,
+    **variant,
+)
+
+vae_orig.to(device)
+
+sfast_config = CompilationConfig.Default()
+sfast_config.enable_xformers = False
+sfast_config.enable_triton = True
+sfast_config.enable_cuda_graph = False
+vae = compile_vae(vae_orig, sfast_config)
+
+sample_imgs = torch.randn(4, 3, 128, 128, dtype=vae.dtype, device=device)
+latents1 = torch.randn(4, 4, 16, 16, dtype=vae.dtype, device=device)
+
+latents = vae.encode(sample_imgs).latent_dist.sample()
+
+sample_imgs_dup = sample_imgs.clone().detach().requires_grad_(True)
+latents2 = vae_orig.encode(sample_imgs_dup).latent_dist.sample()
+print("Test done")
diff --git a/src/sfast/csrc/jit/python_operator.cpp b/src/sfast/csrc/jit/python_operator.cpp
index 1b739fd..d58cc4b 100644
--- a/src/sfast/csrc/jit/python_operator.cpp
+++ b/src/sfast/csrc/jit/python_operator.cpp
@@ -28,16 +28,18 @@ void RegisterCustomPythonOperator(const std::string &schema,
   auto arguments = parsed_schema.arguments();
   auto returns = parsed_schema.returns();
 
-  std::shared_ptr<const py::function> func_ptr(
+  std::shared_ptr<py::function> func_ptr(
       new py::function(py::reinterpret_borrow<py::function>(
-          py::handle(const_cast<PyObject *>(py_callable.get())))),
+          py::handle(py_callable.get()))),
       [](py::function *ptr) {
-        // Check if the current thread is holding the GIL
-        if (PyGILState_Check()) {
-          delete ptr;
-        } else {
-          py::gil_scoped_acquire gil;
-          delete ptr;
+        if (Py_IsInitialized()) {
+          // Check if the current thread is holding the GIL
+          if (PyGILState_Check()) {
+            delete ptr;
+          } else {
+            py::gil_scoped_acquire gil;
+            delete ptr;
+          }
         }
       });
 
diff --git a/src/sfast/jit/utils.py b/src/sfast/jit/utils.py
index 5d287b5..ffab1a3 100644
--- a/src/sfast/jit/utils.py
+++ b/src/sfast/jit/utils.py
@@ -1,9 +1,12 @@
+import logging
 import inspect
 import functools
 
 import torch
 
 import sfast
 from .overrides import TracingMode
 
+logger = logging.getLogger()
+
 class ScriptModuleClearHook:
@@ -13,8 +16,8 @@ def __init__(self, script_module_c):
     def __del__(self):
         try:
             sfast._C._jit_clear_class_type_registration(self.class_type)
-        except Exception:
-            pass
+        except Exception as e:
+            logger.warning(f'Failed to clear class type registration: {e}')
 
 
 def attach_script_module_clear_hook(