diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index a2a8bff..eecde17 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -31,8 +31,7 @@ jobs:
           - "3.10"
           - "3.11"
         torch_version:
-          - "2.1.1"
-          - "2.1.2"
+          - "2.2.0"
         cuda_short_version:
           - "118"
           - "121"
diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 473e22f..58e525f 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -66,7 +66,7 @@ jobs:
       # windows does not have per version binary, it is just 'python3'
       PY: python${{ contains(inputs.os, 'ubuntu') && inputs.python || '3' }}
 
-    container: ${{ contains(inputs.os, 'ubuntu') && 'quay.io/pypa/manylinux2014_x86_64' || null }}
+    # container: ${{ contains(inputs.os, 'ubuntu') && 'quay.io/pypa/manylinux2014_x86_64' || null }}
     timeout-minutes: 360
     defaults:
       run:
@@ -117,14 +117,46 @@ jobs:
               fp.write("TORCH_CUDA_ARCH_LIST=" + arch_list + "\n")
       - run: echo "${TORCH_CUDA_ARCH_LIST}"
 
+      - if: contains(inputs.os, 'ubuntu')
+        name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@v1.3.1
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: false
+
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
+
       - if: runner.os == 'Linux'
         name: (Linux) install cuda
-        run: >
-          yum install wget git prename -y &&
-          yum clean all --verbose &&
-          wget -q "${{ steps.cuda_info.outputs.CUDA_INSTALL_SCRIPT }}" -O cuda.run &&
-          sh ./cuda.run --silent --toolkit &&
+        run: |
+          # yum install wget git prename -y
+          # yum clean all --verbose
+          sudo apt update
+          sudo apt install -y wget git rename
+          sudo apt clean -y
+          sudo apt autoremove -y
+          wget -q "${{ steps.cuda_info.outputs.CUDA_INSTALL_SCRIPT }}" -O cuda.run
+          sudo sh ./cuda.run --silent --toolkit --toolkitpath=/usr/local/cuda || cat /tmp/cuda-installer.log
           rm ./cuda.run
+          echo "CUDA_HOME=/usr/local/cuda" >> ${GITHUB_ENV}
+          echo "PATH=/usr/local/cuda/bin:$PATH" >> ${GITHUB_ENV}
+
+      - if: runner.os == 'Linux'
+        name: (Linux) install python
+        run: |
+          sudo add-apt-repository ppa:deadsnakes/ppa -y
+          sudo apt update
+          sudo apt install -y python${{ inputs.python }} python${{ inputs.python }}-dev python${{ inputs.python }}-venv
+          sudo apt clean -y
+          sudo apt autoremove -y
 
       - name: Recursive checkout
         uses: actions/checkout@v3
@@ -182,6 +214,7 @@ jobs:
         run: |
           cudnn_next_version_major=$((${CUDNN_VERSION_MAJOR} + 1))
           cudnn_package_name="${CUDNN_PYPI_PACKAGE}>=${CUDNN_VERSION_MAJOR}.0.0.0,<${cudnn_next_version_major}.0.0.0"
+          $PY -m pip install --upgrade pip
           $PY -m pip install wheel setuptools ninja twine "torch==${{ inputs.torch_version }}" "${cudnn_package_name}" -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cuda_short_version }} --no-cache-dir
 
       - name: Build wheel
diff --git a/community/reproduce_vae_segfault.py b/community/reproduce_vae_segfault.py
new file mode 100644
index 0000000..d269115
--- /dev/null
+++ b/community/reproduce_vae_segfault.py
@@ -0,0 +1,37 @@
+import torch
+import torch.nn.functional as F
+
+from diffusers import AutoencoderKL
+
+from sfast.compilers.stable_diffusion_pipeline_compiler import (
+    compile_vae,
+    CompilationConfig,
+)
+
+device = torch.device("cuda:0")
+
+SD_2_1_DIFFUSERS_MODEL = "stabilityai/stable-diffusion-2-1"
+variant = {"variant": "fp16"}
+vae_orig = AutoencoderKL.from_pretrained(
+    SD_2_1_DIFFUSERS_MODEL,
+    subfolder="vae",
+    torch_dtype=torch.float16,
+    **variant,
+)
+
+vae_orig.to(device)
+
+sfast_config = CompilationConfig.Default()
+sfast_config.enable_xformers = False
+sfast_config.enable_triton = True
+sfast_config.enable_cuda_graph = False
+vae = compile_vae(vae_orig, sfast_config)
+
+sample_imgs = torch.randn(4, 3, 128, 128, dtype=vae.dtype, device=device)
+latents1 = torch.randn(4, 4, 16, 16, dtype=vae.dtype, device=device)
+
+latents = vae.encode(sample_imgs).latent_dist.sample()
+
+sample_imgs_dup = sample_imgs.clone().detach().requires_grad_(True)
+latents2 = vae_orig.encode(sample_imgs_dup).latent_dist.sample()
+print("Test done")
diff --git a/src/sfast/csrc/jit/python_operator.cpp b/src/sfast/csrc/jit/python_operator.cpp
index 1b739fd..d58cc4b 100644
--- a/src/sfast/csrc/jit/python_operator.cpp
+++ b/src/sfast/csrc/jit/python_operator.cpp
@@ -28,16 +28,18 @@ void RegisterCustomPythonOperator(const std::string &schema,
   auto arguments = parsed_schema.arguments();
   auto returns = parsed_schema.returns();
 
-  std::shared_ptr<const py::function> func_ptr(
+  std::shared_ptr<py::function> func_ptr(
       new py::function(py::reinterpret_borrow<py::function>(
-          py::handle(const_cast<PyObject *>(py_callable.get())))),
+          py::handle(py_callable.get()))),
       [](py::function *ptr) {
-        // Check if the current thread is holding the GIL
-        if (PyGILState_Check()) {
-          delete ptr;
-        } else {
-          py::gil_scoped_acquire gil;
-          delete ptr;
+        if (Py_IsInitialized()) {
+          // Check if the current thread is holding the GIL
+          if (PyGILState_Check()) {
+            delete ptr;
+          } else {
+            py::gil_scoped_acquire gil;
+            delete ptr;
+          }
         }
       });
 
diff --git a/src/sfast/jit/utils.py b/src/sfast/jit/utils.py
index 5d287b5..ffab1a3 100644
--- a/src/sfast/jit/utils.py
+++ b/src/sfast/jit/utils.py
@@ -1,9 +1,12 @@
+import logging
 import inspect
 import functools
 
 import torch
 
 import sfast
 from .overrides import TracingMode
 
+logger = logging.getLogger()
+
 class ScriptModuleClearHook:
@@ -13,8 +16,8 @@ def __init__(self, script_module_c):
     def __del__(self):
         try:
             sfast._C._jit_clear_class_type_registration(self.class_type)
-        except Exception:
-            pass
+        except Exception as e:
+            logger.warning(f'Failed to clear class type registration: {e}')
 
 
 def attach_script_module_clear_hook(